{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.046646278355835305,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.332313917791765e-05,
"grad_norm": 1.6235620975494385,
"learning_rate": 5.182689816014512e-09,
"loss": 1.9275,
"step": 1
},
{
"epoch": 4.66462783558353e-05,
"grad_norm": 1.5710082054138184,
"learning_rate": 1.0365379632029025e-08,
"loss": 1.5593,
"step": 2
},
{
"epoch": 6.996941753375295e-05,
"grad_norm": 2.3231985569000244,
"learning_rate": 1.5548069448043534e-08,
"loss": 2.0021,
"step": 3
},
{
"epoch": 9.32925567116706e-05,
"grad_norm": 1.8349288702011108,
"learning_rate": 2.073075926405805e-08,
"loss": 2.1141,
"step": 4
},
{
"epoch": 0.00011661569588958826,
"grad_norm": 2.039928436279297,
"learning_rate": 2.5913449080072562e-08,
"loss": 1.9361,
"step": 5
},
{
"epoch": 0.0001399388350675059,
"grad_norm": 1.8988783359527588,
"learning_rate": 3.109613889608707e-08,
"loss": 2.2441,
"step": 6
},
{
"epoch": 0.00016326197424542356,
"grad_norm": 1.4865813255310059,
"learning_rate": 3.6278828712101586e-08,
"loss": 1.8118,
"step": 7
},
{
"epoch": 0.0001865851134233412,
"grad_norm": 1.4033368825912476,
"learning_rate": 4.14615185281161e-08,
"loss": 1.8838,
"step": 8
},
{
"epoch": 0.00020990825260125886,
"grad_norm": 1.876894235610962,
"learning_rate": 4.6644208344130604e-08,
"loss": 1.9916,
"step": 9
},
{
"epoch": 0.00023323139177917651,
"grad_norm": 2.4104366302490234,
"learning_rate": 5.1826898160145123e-08,
"loss": 1.8618,
"step": 10
},
{
"epoch": 0.0002565545309570942,
"grad_norm": 1.8457229137420654,
"learning_rate": 5.700958797615963e-08,
"loss": 1.7303,
"step": 11
},
{
"epoch": 0.0002798776701350118,
"grad_norm": 1.940317988395691,
"learning_rate": 6.219227779217413e-08,
"loss": 2.2692,
"step": 12
},
{
"epoch": 0.0003032008093129295,
"grad_norm": 2.455432891845703,
"learning_rate": 6.737496760818865e-08,
"loss": 2.3401,
"step": 13
},
{
"epoch": 0.0003265239484908471,
"grad_norm": 1.5163850784301758,
"learning_rate": 7.255765742420317e-08,
"loss": 2.1687,
"step": 14
},
{
"epoch": 0.0003498470876687648,
"grad_norm": 1.3012642860412598,
"learning_rate": 7.774034724021768e-08,
"loss": 1.8693,
"step": 15
},
{
"epoch": 0.0003731702268466824,
"grad_norm": 2.0896522998809814,
"learning_rate": 8.29230370562322e-08,
"loss": 1.7031,
"step": 16
},
{
"epoch": 0.0003964933660246001,
"grad_norm": 1.7818728685379028,
"learning_rate": 8.810572687224672e-08,
"loss": 2.0829,
"step": 17
},
{
"epoch": 0.0004198165052025177,
"grad_norm": 2.569828510284424,
"learning_rate": 9.328841668826121e-08,
"loss": 1.8998,
"step": 18
},
{
"epoch": 0.0004431396443804354,
"grad_norm": 1.4619100093841553,
"learning_rate": 9.847110650427573e-08,
"loss": 1.5964,
"step": 19
},
{
"epoch": 0.00046646278355835303,
"grad_norm": 1.9832793474197388,
"learning_rate": 1.0365379632029025e-07,
"loss": 1.9292,
"step": 20
},
{
"epoch": 0.0004897859227362707,
"grad_norm": 2.0182175636291504,
"learning_rate": 1.0883648613630475e-07,
"loss": 2.0115,
"step": 21
},
{
"epoch": 0.0005131090619141884,
"grad_norm": 1.4642307758331299,
"learning_rate": 1.1401917595231926e-07,
"loss": 2.0291,
"step": 22
},
{
"epoch": 0.000536432201092106,
"grad_norm": 2.887909173965454,
"learning_rate": 1.1920186576833378e-07,
"loss": 2.1946,
"step": 23
},
{
"epoch": 0.0005597553402700236,
"grad_norm": 1.595544457435608,
"learning_rate": 1.2438455558434827e-07,
"loss": 2.0246,
"step": 24
},
{
"epoch": 0.0005830784794479413,
"grad_norm": 1.5648566484451294,
"learning_rate": 1.295672454003628e-07,
"loss": 2.1832,
"step": 25
},
{
"epoch": 0.000606401618625859,
"grad_norm": 1.4702372550964355,
"learning_rate": 1.347499352163773e-07,
"loss": 1.6395,
"step": 26
},
{
"epoch": 0.0006297247578037766,
"grad_norm": 1.7178195714950562,
"learning_rate": 1.399326250323918e-07,
"loss": 1.6264,
"step": 27
},
{
"epoch": 0.0006530478969816942,
"grad_norm": 2.1751515865325928,
"learning_rate": 1.4511531484840635e-07,
"loss": 2.511,
"step": 28
},
{
"epoch": 0.0006763710361596119,
"grad_norm": 2.9443299770355225,
"learning_rate": 1.5029800466442085e-07,
"loss": 2.229,
"step": 29
},
{
"epoch": 0.0006996941753375296,
"grad_norm": 1.8316481113433838,
"learning_rate": 1.5548069448043536e-07,
"loss": 1.8414,
"step": 30
},
{
"epoch": 0.0007230173145154472,
"grad_norm": 1.9659239053726196,
"learning_rate": 1.6066338429644986e-07,
"loss": 2.0109,
"step": 31
},
{
"epoch": 0.0007463404536933648,
"grad_norm": 2.1653449535369873,
"learning_rate": 1.658460741124644e-07,
"loss": 2.0155,
"step": 32
},
{
"epoch": 0.0007696635928712825,
"grad_norm": 1.8755710124969482,
"learning_rate": 1.710287639284789e-07,
"loss": 2.1105,
"step": 33
},
{
"epoch": 0.0007929867320492002,
"grad_norm": 1.5989196300506592,
"learning_rate": 1.7621145374449343e-07,
"loss": 2.1583,
"step": 34
},
{
"epoch": 0.0008163098712271178,
"grad_norm": 1.865307331085205,
"learning_rate": 1.813941435605079e-07,
"loss": 2.001,
"step": 35
},
{
"epoch": 0.0008396330104050355,
"grad_norm": 1.4584789276123047,
"learning_rate": 1.8657683337652242e-07,
"loss": 1.8854,
"step": 36
},
{
"epoch": 0.0008629561495829531,
"grad_norm": 2.6818912029266357,
"learning_rate": 1.9175952319253695e-07,
"loss": 2.1888,
"step": 37
},
{
"epoch": 0.0008862792887608708,
"grad_norm": 2.17561674118042,
"learning_rate": 1.9694221300855146e-07,
"loss": 1.9616,
"step": 38
},
{
"epoch": 0.0009096024279387884,
"grad_norm": 1.252475619316101,
"learning_rate": 2.02124902824566e-07,
"loss": 1.9585,
"step": 39
},
{
"epoch": 0.0009329255671167061,
"grad_norm": 1.884366750717163,
"learning_rate": 2.073075926405805e-07,
"loss": 2.2436,
"step": 40
},
{
"epoch": 0.0009562487062946237,
"grad_norm": 1.4951350688934326,
"learning_rate": 2.1249028245659497e-07,
"loss": 1.7149,
"step": 41
},
{
"epoch": 0.0009795718454725414,
"grad_norm": 1.891728162765503,
"learning_rate": 2.176729722726095e-07,
"loss": 2.0472,
"step": 42
},
{
"epoch": 0.001002894984650459,
"grad_norm": 1.8992432355880737,
"learning_rate": 2.22855662088624e-07,
"loss": 2.1471,
"step": 43
},
{
"epoch": 0.0010262181238283768,
"grad_norm": 1.3931283950805664,
"learning_rate": 2.2803835190463852e-07,
"loss": 1.5292,
"step": 44
},
{
"epoch": 0.0010495412630062942,
"grad_norm": 1.8894548416137695,
"learning_rate": 2.3322104172065305e-07,
"loss": 1.7759,
"step": 45
},
{
"epoch": 0.001072864402184212,
"grad_norm": 1.592050552368164,
"learning_rate": 2.3840373153666755e-07,
"loss": 2.2498,
"step": 46
},
{
"epoch": 0.0010961875413621296,
"grad_norm": 1.3746178150177002,
"learning_rate": 2.4358642135268203e-07,
"loss": 1.8503,
"step": 47
},
{
"epoch": 0.0011195106805400473,
"grad_norm": 2.0268595218658447,
"learning_rate": 2.4876911116869654e-07,
"loss": 1.9358,
"step": 48
},
{
"epoch": 0.001142833819717965,
"grad_norm": 1.7836228609085083,
"learning_rate": 2.539518009847111e-07,
"loss": 1.9855,
"step": 49
},
{
"epoch": 0.0011661569588958826,
"grad_norm": 1.829447627067566,
"learning_rate": 2.591344908007256e-07,
"loss": 2.2802,
"step": 50
},
{
"epoch": 0.0011894800980738003,
"grad_norm": 2.2813496589660645,
"learning_rate": 2.643171806167401e-07,
"loss": 2.1593,
"step": 51
},
{
"epoch": 0.001212803237251718,
"grad_norm": 3.019044876098633,
"learning_rate": 2.694998704327546e-07,
"loss": 1.9534,
"step": 52
},
{
"epoch": 0.0012361263764296354,
"grad_norm": 2.011425256729126,
"learning_rate": 2.746825602487691e-07,
"loss": 2.1284,
"step": 53
},
{
"epoch": 0.0012594495156075531,
"grad_norm": 2.207106590270996,
"learning_rate": 2.798652500647836e-07,
"loss": 2.2427,
"step": 54
},
{
"epoch": 0.0012827726547854708,
"grad_norm": 1.3172473907470703,
"learning_rate": 2.8504793988079813e-07,
"loss": 1.9782,
"step": 55
},
{
"epoch": 0.0013060957939633885,
"grad_norm": 1.522895097732544,
"learning_rate": 2.902306296968127e-07,
"loss": 1.9455,
"step": 56
},
{
"epoch": 0.0013294189331413062,
"grad_norm": 2.657248020172119,
"learning_rate": 2.954133195128272e-07,
"loss": 1.959,
"step": 57
},
{
"epoch": 0.0013527420723192238,
"grad_norm": 1.9738789796829224,
"learning_rate": 3.005960093288417e-07,
"loss": 1.7878,
"step": 58
},
{
"epoch": 0.0013760652114971415,
"grad_norm": 1.5549254417419434,
"learning_rate": 3.057786991448562e-07,
"loss": 1.9405,
"step": 59
},
{
"epoch": 0.0013993883506750592,
"grad_norm": 2.9688899517059326,
"learning_rate": 3.109613889608707e-07,
"loss": 1.9969,
"step": 60
},
{
"epoch": 0.0014227114898529767,
"grad_norm": 1.4602586030960083,
"learning_rate": 3.1614407877688527e-07,
"loss": 1.9339,
"step": 61
},
{
"epoch": 0.0014460346290308943,
"grad_norm": 2.4017045497894287,
"learning_rate": 3.213267685928997e-07,
"loss": 2.0842,
"step": 62
},
{
"epoch": 0.001469357768208812,
"grad_norm": 1.7433497905731201,
"learning_rate": 3.2650945840891423e-07,
"loss": 2.0223,
"step": 63
},
{
"epoch": 0.0014926809073867297,
"grad_norm": 1.7395591735839844,
"learning_rate": 3.316921482249288e-07,
"loss": 1.9257,
"step": 64
},
{
"epoch": 0.0015160040465646474,
"grad_norm": 1.8336257934570312,
"learning_rate": 3.3687483804094324e-07,
"loss": 1.948,
"step": 65
},
{
"epoch": 0.001539327185742565,
"grad_norm": 1.6493985652923584,
"learning_rate": 3.420575278569578e-07,
"loss": 1.8672,
"step": 66
},
{
"epoch": 0.0015626503249204827,
"grad_norm": 1.5789337158203125,
"learning_rate": 3.472402176729723e-07,
"loss": 1.9446,
"step": 67
},
{
"epoch": 0.0015859734640984004,
"grad_norm": 1.3755509853363037,
"learning_rate": 3.5242290748898686e-07,
"loss": 2.1796,
"step": 68
},
{
"epoch": 0.001609296603276318,
"grad_norm": 1.7978087663650513,
"learning_rate": 3.576055973050013e-07,
"loss": 1.8974,
"step": 69
},
{
"epoch": 0.0016326197424542355,
"grad_norm": 1.8888216018676758,
"learning_rate": 3.627882871210158e-07,
"loss": 1.915,
"step": 70
},
{
"epoch": 0.0016559428816321532,
"grad_norm": 2.6150593757629395,
"learning_rate": 3.679709769370304e-07,
"loss": 2.2133,
"step": 71
},
{
"epoch": 0.001679266020810071,
"grad_norm": 1.7009005546569824,
"learning_rate": 3.7315366675304483e-07,
"loss": 2.1024,
"step": 72
},
{
"epoch": 0.0017025891599879886,
"grad_norm": 1.741734266281128,
"learning_rate": 3.783363565690594e-07,
"loss": 2.1839,
"step": 73
},
{
"epoch": 0.0017259122991659063,
"grad_norm": 2.7715041637420654,
"learning_rate": 3.835190463850739e-07,
"loss": 2.0734,
"step": 74
},
{
"epoch": 0.001749235438343824,
"grad_norm": 1.9710502624511719,
"learning_rate": 3.8870173620108835e-07,
"loss": 2.18,
"step": 75
},
{
"epoch": 0.0017725585775217416,
"grad_norm": 2.077986478805542,
"learning_rate": 3.938844260171029e-07,
"loss": 2.1482,
"step": 76
},
{
"epoch": 0.0017958817166996593,
"grad_norm": 2.583721160888672,
"learning_rate": 3.990671158331174e-07,
"loss": 2.5364,
"step": 77
},
{
"epoch": 0.0018192048558775768,
"grad_norm": 1.3425930738449097,
"learning_rate": 4.04249805649132e-07,
"loss": 1.8194,
"step": 78
},
{
"epoch": 0.0018425279950554944,
"grad_norm": 2.1111888885498047,
"learning_rate": 4.0943249546514643e-07,
"loss": 1.7878,
"step": 79
},
{
"epoch": 0.0018658511342334121,
"grad_norm": 2.0795626640319824,
"learning_rate": 4.14615185281161e-07,
"loss": 2.3006,
"step": 80
},
{
"epoch": 0.0018891742734113298,
"grad_norm": 1.273370623588562,
"learning_rate": 4.197978750971755e-07,
"loss": 1.7599,
"step": 81
},
{
"epoch": 0.0019124974125892475,
"grad_norm": 1.6202706098556519,
"learning_rate": 4.2498056491318994e-07,
"loss": 2.1727,
"step": 82
},
{
"epoch": 0.0019358205517671651,
"grad_norm": 2.4593732357025146,
"learning_rate": 4.301632547292045e-07,
"loss": 2.4588,
"step": 83
},
{
"epoch": 0.001959143690945083,
"grad_norm": 1.2617835998535156,
"learning_rate": 4.35345944545219e-07,
"loss": 1.9078,
"step": 84
},
{
"epoch": 0.0019824668301230003,
"grad_norm": 2.2640504837036133,
"learning_rate": 4.405286343612335e-07,
"loss": 1.8983,
"step": 85
},
{
"epoch": 0.002005789969300918,
"grad_norm": 1.6804454326629639,
"learning_rate": 4.45711324177248e-07,
"loss": 2.1049,
"step": 86
},
{
"epoch": 0.0020291131084788356,
"grad_norm": 2.060009717941284,
"learning_rate": 4.5089401399326253e-07,
"loss": 2.0153,
"step": 87
},
{
"epoch": 0.0020524362476567535,
"grad_norm": 1.7166160345077515,
"learning_rate": 4.5607670380927703e-07,
"loss": 2.1093,
"step": 88
},
{
"epoch": 0.002075759386834671,
"grad_norm": 1.6695979833602905,
"learning_rate": 4.6125939362529154e-07,
"loss": 1.8607,
"step": 89
},
{
"epoch": 0.0020990825260125885,
"grad_norm": 1.4339056015014648,
"learning_rate": 4.664420834413061e-07,
"loss": 2.2632,
"step": 90
},
{
"epoch": 0.0021224056651905064,
"grad_norm": 1.5228222608566284,
"learning_rate": 4.7162477325732055e-07,
"loss": 2.0851,
"step": 91
},
{
"epoch": 0.002145728804368424,
"grad_norm": 1.540848731994629,
"learning_rate": 4.768074630733351e-07,
"loss": 2.1446,
"step": 92
},
{
"epoch": 0.0021690519435463417,
"grad_norm": 1.480702519416809,
"learning_rate": 4.819901528893496e-07,
"loss": 2.0718,
"step": 93
},
{
"epoch": 0.002192375082724259,
"grad_norm": 2.23518705368042,
"learning_rate": 4.871728427053641e-07,
"loss": 1.6198,
"step": 94
},
{
"epoch": 0.002215698221902177,
"grad_norm": 1.6477755308151245,
"learning_rate": 4.923555325213786e-07,
"loss": 2.1136,
"step": 95
},
{
"epoch": 0.0022390213610800945,
"grad_norm": 1.9548614025115967,
"learning_rate": 4.975382223373931e-07,
"loss": 1.9143,
"step": 96
},
{
"epoch": 0.0022623445002580124,
"grad_norm": 1.3557407855987549,
"learning_rate": 5.027209121534076e-07,
"loss": 2.0044,
"step": 97
},
{
"epoch": 0.00228566763943593,
"grad_norm": 2.2781455516815186,
"learning_rate": 5.079036019694222e-07,
"loss": 1.7761,
"step": 98
},
{
"epoch": 0.0023089907786138474,
"grad_norm": 2.1195600032806396,
"learning_rate": 5.130862917854368e-07,
"loss": 1.8174,
"step": 99
},
{
"epoch": 0.0023323139177917653,
"grad_norm": 2.0798068046569824,
"learning_rate": 5.182689816014512e-07,
"loss": 2.1431,
"step": 100
},
{
"epoch": 0.0023556370569696827,
"grad_norm": 1.8773006200790405,
"learning_rate": 5.234516714174657e-07,
"loss": 1.5221,
"step": 101
},
{
"epoch": 0.0023789601961476006,
"grad_norm": 1.7917876243591309,
"learning_rate": 5.286343612334802e-07,
"loss": 1.9383,
"step": 102
},
{
"epoch": 0.002402283335325518,
"grad_norm": 1.4980329275131226,
"learning_rate": 5.338170510494947e-07,
"loss": 1.846,
"step": 103
},
{
"epoch": 0.002425606474503436,
"grad_norm": 2.0081095695495605,
"learning_rate": 5.389997408655092e-07,
"loss": 1.8777,
"step": 104
},
{
"epoch": 0.0024489296136813534,
"grad_norm": 1.525317907333374,
"learning_rate": 5.441824306815238e-07,
"loss": 1.971,
"step": 105
},
{
"epoch": 0.002472252752859271,
"grad_norm": 1.4131786823272705,
"learning_rate": 5.493651204975382e-07,
"loss": 2.2224,
"step": 106
},
{
"epoch": 0.002495575892037189,
"grad_norm": 1.164492130279541,
"learning_rate": 5.545478103135528e-07,
"loss": 1.8909,
"step": 107
},
{
"epoch": 0.0025188990312151062,
"grad_norm": 1.9998016357421875,
"learning_rate": 5.597305001295673e-07,
"loss": 2.1197,
"step": 108
},
{
"epoch": 0.002542222170393024,
"grad_norm": 1.6218236684799194,
"learning_rate": 5.649131899455818e-07,
"loss": 1.7799,
"step": 109
},
{
"epoch": 0.0025655453095709416,
"grad_norm": 1.535388708114624,
"learning_rate": 5.700958797615963e-07,
"loss": 1.7878,
"step": 110
},
{
"epoch": 0.0025888684487488595,
"grad_norm": 1.4929994344711304,
"learning_rate": 5.752785695776108e-07,
"loss": 2.0802,
"step": 111
},
{
"epoch": 0.002612191587926777,
"grad_norm": 2.183293104171753,
"learning_rate": 5.804612593936254e-07,
"loss": 2.0506,
"step": 112
},
{
"epoch": 0.002635514727104695,
"grad_norm": 1.6339191198349,
"learning_rate": 5.856439492096398e-07,
"loss": 1.7152,
"step": 113
},
{
"epoch": 0.0026588378662826123,
"grad_norm": 1.4886974096298218,
"learning_rate": 5.908266390256544e-07,
"loss": 1.8327,
"step": 114
},
{
"epoch": 0.0026821610054605298,
"grad_norm": 1.4198302030563354,
"learning_rate": 5.960093288416688e-07,
"loss": 1.8342,
"step": 115
},
{
"epoch": 0.0027054841446384477,
"grad_norm": 2.041900157928467,
"learning_rate": 6.011920186576834e-07,
"loss": 1.9101,
"step": 116
},
{
"epoch": 0.002728807283816365,
"grad_norm": 1.7576725482940674,
"learning_rate": 6.063747084736979e-07,
"loss": 2.3793,
"step": 117
},
{
"epoch": 0.002752130422994283,
"grad_norm": 1.620440125465393,
"learning_rate": 6.115573982897124e-07,
"loss": 1.7363,
"step": 118
},
{
"epoch": 0.0027754535621722005,
"grad_norm": 1.972102403640747,
"learning_rate": 6.16740088105727e-07,
"loss": 2.0338,
"step": 119
},
{
"epoch": 0.0027987767013501184,
"grad_norm": 1.5385342836380005,
"learning_rate": 6.219227779217414e-07,
"loss": 1.829,
"step": 120
},
{
"epoch": 0.002822099840528036,
"grad_norm": 1.4439769983291626,
"learning_rate": 6.27105467737756e-07,
"loss": 1.9893,
"step": 121
},
{
"epoch": 0.0028454229797059533,
"grad_norm": 1.5146026611328125,
"learning_rate": 6.322881575537705e-07,
"loss": 1.6563,
"step": 122
},
{
"epoch": 0.002868746118883871,
"grad_norm": 1.7177401781082153,
"learning_rate": 6.374708473697849e-07,
"loss": 1.9483,
"step": 123
},
{
"epoch": 0.0028920692580617887,
"grad_norm": 2.484865188598633,
"learning_rate": 6.426535371857994e-07,
"loss": 2.0949,
"step": 124
},
{
"epoch": 0.0029153923972397066,
"grad_norm": 1.5320651531219482,
"learning_rate": 6.47836227001814e-07,
"loss": 1.8557,
"step": 125
},
{
"epoch": 0.002938715536417624,
"grad_norm": 1.3804417848587036,
"learning_rate": 6.530189168178285e-07,
"loss": 1.8733,
"step": 126
},
{
"epoch": 0.002962038675595542,
"grad_norm": 2.0832831859588623,
"learning_rate": 6.58201606633843e-07,
"loss": 1.8556,
"step": 127
},
{
"epoch": 0.0029853618147734594,
"grad_norm": 1.2582931518554688,
"learning_rate": 6.633842964498576e-07,
"loss": 2.1239,
"step": 128
},
{
"epoch": 0.0030086849539513773,
"grad_norm": 1.6449629068374634,
"learning_rate": 6.685669862658721e-07,
"loss": 2.1635,
"step": 129
},
{
"epoch": 0.0030320080931292947,
"grad_norm": 1.3350502252578735,
"learning_rate": 6.737496760818865e-07,
"loss": 1.801,
"step": 130
},
{
"epoch": 0.003055331232307212,
"grad_norm": 1.7689651250839233,
"learning_rate": 6.78932365897901e-07,
"loss": 1.7541,
"step": 131
},
{
"epoch": 0.00307865437148513,
"grad_norm": 1.4711276292800903,
"learning_rate": 6.841150557139156e-07,
"loss": 2.3916,
"step": 132
},
{
"epoch": 0.0031019775106630476,
"grad_norm": 1.2806516885757446,
"learning_rate": 6.892977455299301e-07,
"loss": 1.8609,
"step": 133
},
{
"epoch": 0.0031253006498409655,
"grad_norm": 1.5531939268112183,
"learning_rate": 6.944804353459446e-07,
"loss": 1.7721,
"step": 134
},
{
"epoch": 0.003148623789018883,
"grad_norm": 1.6541032791137695,
"learning_rate": 6.996631251619592e-07,
"loss": 2.1091,
"step": 135
},
{
"epoch": 0.003171946928196801,
"grad_norm": 2.050734281539917,
"learning_rate": 7.048458149779737e-07,
"loss": 1.8932,
"step": 136
},
{
"epoch": 0.0031952700673747183,
"grad_norm": 1.2903157472610474,
"learning_rate": 7.100285047939881e-07,
"loss": 2.0833,
"step": 137
},
{
"epoch": 0.003218593206552636,
"grad_norm": 1.3316091299057007,
"learning_rate": 7.152111946100026e-07,
"loss": 1.9307,
"step": 138
},
{
"epoch": 0.0032419163457305536,
"grad_norm": 1.441341519355774,
"learning_rate": 7.203938844260172e-07,
"loss": 2.2529,
"step": 139
},
{
"epoch": 0.003265239484908471,
"grad_norm": 2.159276008605957,
"learning_rate": 7.255765742420316e-07,
"loss": 1.847,
"step": 140
},
{
"epoch": 0.003288562624086389,
"grad_norm": 1.8410853147506714,
"learning_rate": 7.307592640580462e-07,
"loss": 2.2465,
"step": 141
},
{
"epoch": 0.0033118857632643064,
"grad_norm": 1.8678739070892334,
"learning_rate": 7.359419538740608e-07,
"loss": 1.9261,
"step": 142
},
{
"epoch": 0.0033352089024422243,
"grad_norm": 1.2097922563552856,
"learning_rate": 7.411246436900751e-07,
"loss": 2.0205,
"step": 143
},
{
"epoch": 0.003358532041620142,
"grad_norm": 1.733077883720398,
"learning_rate": 7.463073335060897e-07,
"loss": 1.8389,
"step": 144
},
{
"epoch": 0.0033818551807980597,
"grad_norm": 1.7118474245071411,
"learning_rate": 7.514900233221042e-07,
"loss": 1.9511,
"step": 145
},
{
"epoch": 0.003405178319975977,
"grad_norm": 1.6960872411727905,
"learning_rate": 7.566727131381188e-07,
"loss": 1.8828,
"step": 146
},
{
"epoch": 0.0034285014591538946,
"grad_norm": 1.2409390211105347,
"learning_rate": 7.618554029541332e-07,
"loss": 1.6878,
"step": 147
},
{
"epoch": 0.0034518245983318125,
"grad_norm": 1.3440965414047241,
"learning_rate": 7.670380927701478e-07,
"loss": 1.64,
"step": 148
},
{
"epoch": 0.00347514773750973,
"grad_norm": 1.539393663406372,
"learning_rate": 7.722207825861624e-07,
"loss": 1.6754,
"step": 149
},
{
"epoch": 0.003498470876687648,
"grad_norm": 1.5395653247833252,
"learning_rate": 7.774034724021767e-07,
"loss": 1.9761,
"step": 150
},
{
"epoch": 0.0035217940158655653,
"grad_norm": 2.0169472694396973,
"learning_rate": 7.825861622181913e-07,
"loss": 1.6927,
"step": 151
},
{
"epoch": 0.0035451171550434832,
"grad_norm": 1.8776079416275024,
"learning_rate": 7.877688520342058e-07,
"loss": 1.9273,
"step": 152
},
{
"epoch": 0.0035684402942214007,
"grad_norm": 2.078824043273926,
"learning_rate": 7.929515418502204e-07,
"loss": 1.6756,
"step": 153
},
{
"epoch": 0.0035917634333993186,
"grad_norm": 1.407560110092163,
"learning_rate": 7.981342316662348e-07,
"loss": 1.6038,
"step": 154
},
{
"epoch": 0.003615086572577236,
"grad_norm": 1.1770573854446411,
"learning_rate": 8.033169214822494e-07,
"loss": 1.6679,
"step": 155
},
{
"epoch": 0.0036384097117551535,
"grad_norm": 1.2057602405548096,
"learning_rate": 8.08499611298264e-07,
"loss": 1.7916,
"step": 156
},
{
"epoch": 0.0036617328509330714,
"grad_norm": 1.117970585823059,
"learning_rate": 8.136823011142783e-07,
"loss": 1.7974,
"step": 157
},
{
"epoch": 0.003685055990110989,
"grad_norm": 1.5996465682983398,
"learning_rate": 8.188649909302929e-07,
"loss": 1.6053,
"step": 158
},
{
"epoch": 0.0037083791292889068,
"grad_norm": 1.4170929193496704,
"learning_rate": 8.240476807463074e-07,
"loss": 1.7155,
"step": 159
},
{
"epoch": 0.0037317022684668242,
"grad_norm": 1.8114391565322876,
"learning_rate": 8.29230370562322e-07,
"loss": 1.9192,
"step": 160
},
{
"epoch": 0.003755025407644742,
"grad_norm": 1.3462793827056885,
"learning_rate": 8.344130603783364e-07,
"loss": 1.4624,
"step": 161
},
{
"epoch": 0.0037783485468226596,
"grad_norm": 1.6305956840515137,
"learning_rate": 8.39595750194351e-07,
"loss": 1.8017,
"step": 162
},
{
"epoch": 0.003801671686000577,
"grad_norm": 1.662576675415039,
"learning_rate": 8.447784400103655e-07,
"loss": 1.733,
"step": 163
},
{
"epoch": 0.003824994825178495,
"grad_norm": 1.556788682937622,
"learning_rate": 8.499611298263799e-07,
"loss": 1.9586,
"step": 164
},
{
"epoch": 0.0038483179643564124,
"grad_norm": 1.5282272100448608,
"learning_rate": 8.551438196423944e-07,
"loss": 1.8254,
"step": 165
},
{
"epoch": 0.0038716411035343303,
"grad_norm": 1.6790592670440674,
"learning_rate": 8.60326509458409e-07,
"loss": 2.1866,
"step": 166
},
{
"epoch": 0.0038949642427122478,
"grad_norm": 1.5164263248443604,
"learning_rate": 8.655091992744236e-07,
"loss": 1.6651,
"step": 167
},
{
"epoch": 0.003918287381890166,
"grad_norm": 1.5002336502075195,
"learning_rate": 8.70691889090438e-07,
"loss": 1.9295,
"step": 168
},
{
"epoch": 0.0039416105210680836,
"grad_norm": 1.2122441530227661,
"learning_rate": 8.758745789064526e-07,
"loss": 1.761,
"step": 169
},
{
"epoch": 0.003964933660246001,
"grad_norm": 1.637898564338684,
"learning_rate": 8.81057268722467e-07,
"loss": 1.8697,
"step": 170
},
{
"epoch": 0.0039882567994239185,
"grad_norm": 0.988777220249176,
"learning_rate": 8.862399585384815e-07,
"loss": 2.1249,
"step": 171
},
{
"epoch": 0.004011579938601836,
"grad_norm": 1.8833587169647217,
"learning_rate": 8.91422648354496e-07,
"loss": 1.6915,
"step": 172
},
{
"epoch": 0.004034903077779753,
"grad_norm": 1.8418108224868774,
"learning_rate": 8.966053381705106e-07,
"loss": 2.0019,
"step": 173
},
{
"epoch": 0.004058226216957671,
"grad_norm": 1.6375901699066162,
"learning_rate": 9.017880279865251e-07,
"loss": 1.7625,
"step": 174
},
{
"epoch": 0.004081549356135589,
"grad_norm": 1.8701720237731934,
"learning_rate": 9.069707178025396e-07,
"loss": 1.801,
"step": 175
},
{
"epoch": 0.004104872495313507,
"grad_norm": 1.4488773345947266,
"learning_rate": 9.121534076185541e-07,
"loss": 1.9971,
"step": 176
},
{
"epoch": 0.004128195634491424,
"grad_norm": 0.9587986469268799,
"learning_rate": 9.173360974345686e-07,
"loss": 1.6253,
"step": 177
},
{
"epoch": 0.004151518773669342,
"grad_norm": 2.6533186435699463,
"learning_rate": 9.225187872505831e-07,
"loss": 1.572,
"step": 178
},
{
"epoch": 0.00417484191284726,
"grad_norm": 2.4528841972351074,
"learning_rate": 9.277014770665976e-07,
"loss": 1.7586,
"step": 179
},
{
"epoch": 0.004198165052025177,
"grad_norm": 1.1871824264526367,
"learning_rate": 9.328841668826122e-07,
"loss": 1.6765,
"step": 180
},
{
"epoch": 0.004221488191203095,
"grad_norm": 1.1292660236358643,
"learning_rate": 9.380668566986266e-07,
"loss": 2.0673,
"step": 181
},
{
"epoch": 0.004244811330381013,
"grad_norm": 1.3055285215377808,
"learning_rate": 9.432495465146411e-07,
"loss": 1.8103,
"step": 182
},
{
"epoch": 0.004268134469558931,
"grad_norm": 1.5225868225097656,
"learning_rate": 9.484322363306557e-07,
"loss": 2.0813,
"step": 183
},
{
"epoch": 0.004291457608736848,
"grad_norm": 1.2439767122268677,
"learning_rate": 9.536149261466702e-07,
"loss": 1.6919,
"step": 184
},
{
"epoch": 0.0043147807479147655,
"grad_norm": 1.2424002885818481,
"learning_rate": 9.587976159626847e-07,
"loss": 1.9506,
"step": 185
},
{
"epoch": 0.0043381038870926834,
"grad_norm": 0.9796323776245117,
"learning_rate": 9.639803057786992e-07,
"loss": 1.7342,
"step": 186
},
{
"epoch": 0.0043614270262706005,
"grad_norm": 1.2240192890167236,
"learning_rate": 9.691629955947138e-07,
"loss": 2.0646,
"step": 187
},
{
"epoch": 0.004384750165448518,
"grad_norm": 0.8779449462890625,
"learning_rate": 9.743456854107281e-07,
"loss": 1.4535,
"step": 188
},
{
"epoch": 0.004408073304626436,
"grad_norm": 1.3131407499313354,
"learning_rate": 9.795283752267427e-07,
"loss": 1.9817,
"step": 189
},
{
"epoch": 0.004431396443804354,
"grad_norm": 1.3259912729263306,
"learning_rate": 9.847110650427573e-07,
"loss": 1.709,
"step": 190
},
{
"epoch": 0.004454719582982271,
"grad_norm": 1.4236465692520142,
"learning_rate": 9.898937548587718e-07,
"loss": 1.7059,
"step": 191
},
{
"epoch": 0.004478042722160189,
"grad_norm": 1.2791959047317505,
"learning_rate": 9.950764446747862e-07,
"loss": 1.9633,
"step": 192
},
{
"epoch": 0.004501365861338107,
"grad_norm": 0.9857053160667419,
"learning_rate": 1.0002591344908007e-06,
"loss": 1.807,
"step": 193
},
{
"epoch": 0.004524689000516025,
"grad_norm": 1.264302372932434,
"learning_rate": 1.0054418243068153e-06,
"loss": 1.5389,
"step": 194
},
{
"epoch": 0.004548012139693942,
"grad_norm": 1.2205390930175781,
"learning_rate": 1.0106245141228298e-06,
"loss": 1.4549,
"step": 195
},
{
"epoch": 0.00457133527887186,
"grad_norm": 1.055471420288086,
"learning_rate": 1.0158072039388444e-06,
"loss": 1.6931,
"step": 196
},
{
"epoch": 0.004594658418049778,
"grad_norm": 1.0585546493530273,
"learning_rate": 1.020989893754859e-06,
"loss": 1.8054,
"step": 197
},
{
"epoch": 0.004617981557227695,
"grad_norm": 2.16025972366333,
"learning_rate": 1.0261725835708735e-06,
"loss": 2.0077,
"step": 198
},
{
"epoch": 0.004641304696405613,
"grad_norm": 2.125786781311035,
"learning_rate": 1.0313552733868879e-06,
"loss": 1.9117,
"step": 199
},
{
"epoch": 0.0046646278355835305,
"grad_norm": 1.3560391664505005,
"learning_rate": 1.0365379632029024e-06,
"loss": 1.9871,
"step": 200
},
{
"epoch": 0.004687950974761448,
"grad_norm": 1.3505181074142456,
"learning_rate": 1.041720653018917e-06,
"loss": 1.714,
"step": 201
},
{
"epoch": 0.004711274113939365,
"grad_norm": 1.1724427938461304,
"learning_rate": 1.0469033428349313e-06,
"loss": 1.7611,
"step": 202
},
{
"epoch": 0.004734597253117283,
"grad_norm": 1.1746799945831299,
"learning_rate": 1.0520860326509459e-06,
"loss": 1.867,
"step": 203
},
{
"epoch": 0.004757920392295201,
"grad_norm": 1.0976382493972778,
"learning_rate": 1.0572687224669604e-06,
"loss": 1.808,
"step": 204
},
{
"epoch": 0.004781243531473118,
"grad_norm": 1.3842298984527588,
"learning_rate": 1.062451412282975e-06,
"loss": 1.7973,
"step": 205
},
{
"epoch": 0.004804566670651036,
"grad_norm": 1.6715288162231445,
"learning_rate": 1.0676341020989893e-06,
"loss": 1.9817,
"step": 206
},
{
"epoch": 0.004827889809828954,
"grad_norm": 1.0734590291976929,
"learning_rate": 1.072816791915004e-06,
"loss": 1.4297,
"step": 207
},
{
"epoch": 0.004851212949006872,
"grad_norm": 1.0182546377182007,
"learning_rate": 1.0779994817310185e-06,
"loss": 1.713,
"step": 208
},
{
"epoch": 0.004874536088184789,
"grad_norm": 1.1884313821792603,
"learning_rate": 1.083182171547033e-06,
"loss": 1.5234,
"step": 209
},
{
"epoch": 0.004897859227362707,
"grad_norm": 1.520266056060791,
"learning_rate": 1.0883648613630476e-06,
"loss": 2.0598,
"step": 210
},
{
"epoch": 0.004921182366540625,
"grad_norm": 1.1709904670715332,
"learning_rate": 1.0935475511790621e-06,
"loss": 2.1461,
"step": 211
},
{
"epoch": 0.004944505505718542,
"grad_norm": 1.2634027004241943,
"learning_rate": 1.0987302409950765e-06,
"loss": 1.5076,
"step": 212
},
{
"epoch": 0.00496782864489646,
"grad_norm": 1.490717887878418,
"learning_rate": 1.103912930811091e-06,
"loss": 1.8628,
"step": 213
},
{
"epoch": 0.004991151784074378,
"grad_norm": 2.077373743057251,
"learning_rate": 1.1090956206271056e-06,
"loss": 1.9295,
"step": 214
},
{
"epoch": 0.0050144749232522955,
"grad_norm": 1.647877812385559,
"learning_rate": 1.1142783104431202e-06,
"loss": 1.7929,
"step": 215
},
{
"epoch": 0.0050377980624302125,
"grad_norm": 1.1937353610992432,
"learning_rate": 1.1194610002591345e-06,
"loss": 1.6509,
"step": 216
},
{
"epoch": 0.00506112120160813,
"grad_norm": 1.0805108547210693,
"learning_rate": 1.124643690075149e-06,
"loss": 1.6447,
"step": 217
},
{
"epoch": 0.005084444340786048,
"grad_norm": 1.1077872514724731,
"learning_rate": 1.1298263798911636e-06,
"loss": 1.7675,
"step": 218
},
{
"epoch": 0.005107767479963966,
"grad_norm": 0.8648241758346558,
"learning_rate": 1.135009069707178e-06,
"loss": 1.6687,
"step": 219
},
{
"epoch": 0.005131090619141883,
"grad_norm": 1.0522700548171997,
"learning_rate": 1.1401917595231925e-06,
"loss": 1.2878,
"step": 220
},
{
"epoch": 0.005154413758319801,
"grad_norm": 1.3021256923675537,
"learning_rate": 1.145374449339207e-06,
"loss": 1.8535,
"step": 221
},
{
"epoch": 0.005177736897497719,
"grad_norm": 1.2912962436676025,
"learning_rate": 1.1505571391552216e-06,
"loss": 1.865,
"step": 222
},
{
"epoch": 0.005201060036675636,
"grad_norm": 1.6733994483947754,
"learning_rate": 1.1557398289712362e-06,
"loss": 1.5748,
"step": 223
},
{
"epoch": 0.005224383175853554,
"grad_norm": 1.0865724086761475,
"learning_rate": 1.1609225187872508e-06,
"loss": 1.8159,
"step": 224
},
{
"epoch": 0.005247706315031472,
"grad_norm": 1.1498301029205322,
"learning_rate": 1.1661052086032653e-06,
"loss": 1.8579,
"step": 225
},
{
"epoch": 0.00527102945420939,
"grad_norm": 1.9360573291778564,
"learning_rate": 1.1712878984192797e-06,
"loss": 1.7366,
"step": 226
},
{
"epoch": 0.005294352593387307,
"grad_norm": 1.0133939981460571,
"learning_rate": 1.1764705882352942e-06,
"loss": 1.4571,
"step": 227
},
{
"epoch": 0.005317675732565225,
"grad_norm": 1.6443811655044556,
"learning_rate": 1.1816532780513088e-06,
"loss": 1.5312,
"step": 228
},
{
"epoch": 0.0053409988717431425,
"grad_norm": 1.1923338174819946,
"learning_rate": 1.1868359678673233e-06,
"loss": 1.6993,
"step": 229
},
{
"epoch": 0.0053643220109210596,
"grad_norm": 1.0345349311828613,
"learning_rate": 1.1920186576833377e-06,
"loss": 1.5739,
"step": 230
},
{
"epoch": 0.0053876451500989775,
"grad_norm": 0.9833806753158569,
"learning_rate": 1.1972013474993522e-06,
"loss": 1.819,
"step": 231
},
{
"epoch": 0.005410968289276895,
"grad_norm": 1.3315545320510864,
"learning_rate": 1.2023840373153668e-06,
"loss": 1.9472,
"step": 232
},
{
"epoch": 0.005434291428454813,
"grad_norm": 1.0042314529418945,
"learning_rate": 1.2075667271313812e-06,
"loss": 1.993,
"step": 233
},
{
"epoch": 0.00545761456763273,
"grad_norm": 1.2731118202209473,
"learning_rate": 1.2127494169473957e-06,
"loss": 1.6763,
"step": 234
},
{
"epoch": 0.005480937706810648,
"grad_norm": 0.9664155840873718,
"learning_rate": 1.2179321067634103e-06,
"loss": 1.3091,
"step": 235
},
{
"epoch": 0.005504260845988566,
"grad_norm": 1.6930897235870361,
"learning_rate": 1.2231147965794248e-06,
"loss": 1.6111,
"step": 236
},
{
"epoch": 0.005527583985166483,
"grad_norm": 0.9807016253471375,
"learning_rate": 1.2282974863954394e-06,
"loss": 1.6131,
"step": 237
},
{
"epoch": 0.005550907124344401,
"grad_norm": 1.321951150894165,
"learning_rate": 1.233480176211454e-06,
"loss": 1.242,
"step": 238
},
{
"epoch": 0.005574230263522319,
"grad_norm": 1.1465637683868408,
"learning_rate": 1.2386628660274685e-06,
"loss": 1.7035,
"step": 239
},
{
"epoch": 0.005597553402700237,
"grad_norm": 2.4264347553253174,
"learning_rate": 1.2438455558434829e-06,
"loss": 1.9859,
"step": 240
},
{
"epoch": 0.005620876541878154,
"grad_norm": 1.429149866104126,
"learning_rate": 1.2490282456594974e-06,
"loss": 1.8249,
"step": 241
},
{
"epoch": 0.005644199681056072,
"grad_norm": 1.1119049787521362,
"learning_rate": 1.254210935475512e-06,
"loss": 1.8005,
"step": 242
},
{
"epoch": 0.00566752282023399,
"grad_norm": 1.9002227783203125,
"learning_rate": 1.2593936252915265e-06,
"loss": 1.6951,
"step": 243
},
{
"epoch": 0.005690845959411907,
"grad_norm": 1.067659854888916,
"learning_rate": 1.264576315107541e-06,
"loss": 1.799,
"step": 244
},
{
"epoch": 0.0057141690985898245,
"grad_norm": 1.2947990894317627,
"learning_rate": 1.2697590049235552e-06,
"loss": 1.7837,
"step": 245
},
{
"epoch": 0.005737492237767742,
"grad_norm": 1.0790272951126099,
"learning_rate": 1.2749416947395698e-06,
"loss": 1.67,
"step": 246
},
{
"epoch": 0.00576081537694566,
"grad_norm": 1.3589330911636353,
"learning_rate": 1.2801243845555843e-06,
"loss": 1.9282,
"step": 247
},
{
"epoch": 0.005784138516123577,
"grad_norm": 1.4140998125076294,
"learning_rate": 1.285307074371599e-06,
"loss": 1.6708,
"step": 248
},
{
"epoch": 0.005807461655301495,
"grad_norm": 1.000994086265564,
"learning_rate": 1.2904897641876135e-06,
"loss": 1.4077,
"step": 249
},
{
"epoch": 0.005830784794479413,
"grad_norm": 1.3655062913894653,
"learning_rate": 1.295672454003628e-06,
"loss": 1.8862,
"step": 250
},
{
"epoch": 0.005854107933657331,
"grad_norm": 1.1164065599441528,
"learning_rate": 1.3008551438196426e-06,
"loss": 1.528,
"step": 251
},
{
"epoch": 0.005877431072835248,
"grad_norm": 1.1792149543762207,
"learning_rate": 1.306037833635657e-06,
"loss": 1.2879,
"step": 252
},
{
"epoch": 0.005900754212013166,
"grad_norm": 2.236320734024048,
"learning_rate": 1.3112205234516715e-06,
"loss": 1.4929,
"step": 253
},
{
"epoch": 0.005924077351191084,
"grad_norm": 1.8795088529586792,
"learning_rate": 1.316403213267686e-06,
"loss": 1.2468,
"step": 254
},
{
"epoch": 0.005947400490369001,
"grad_norm": 1.2248806953430176,
"learning_rate": 1.3215859030837006e-06,
"loss": 1.769,
"step": 255
},
{
"epoch": 0.005970723629546919,
"grad_norm": 1.252236008644104,
"learning_rate": 1.3267685928997152e-06,
"loss": 1.9014,
"step": 256
},
{
"epoch": 0.005994046768724837,
"grad_norm": 1.3926386833190918,
"learning_rate": 1.3319512827157297e-06,
"loss": 1.9599,
"step": 257
},
{
"epoch": 0.0060173699079027546,
"grad_norm": 1.5681990385055542,
"learning_rate": 1.3371339725317443e-06,
"loss": 1.8109,
"step": 258
},
{
"epoch": 0.006040693047080672,
"grad_norm": 1.6841275691986084,
"learning_rate": 1.3423166623477584e-06,
"loss": 1.4601,
"step": 259
},
{
"epoch": 0.0060640161862585895,
"grad_norm": 1.5262291431427002,
"learning_rate": 1.347499352163773e-06,
"loss": 1.6493,
"step": 260
},
{
"epoch": 0.006087339325436507,
"grad_norm": 1.0905576944351196,
"learning_rate": 1.3526820419797875e-06,
"loss": 2.0847,
"step": 261
},
{
"epoch": 0.006110662464614424,
"grad_norm": 1.4682683944702148,
"learning_rate": 1.357864731795802e-06,
"loss": 1.6889,
"step": 262
},
{
"epoch": 0.006133985603792342,
"grad_norm": 1.1054515838623047,
"learning_rate": 1.3630474216118166e-06,
"loss": 1.55,
"step": 263
},
{
"epoch": 0.00615730874297026,
"grad_norm": 1.3931388854980469,
"learning_rate": 1.3682301114278312e-06,
"loss": 1.655,
"step": 264
},
{
"epoch": 0.006180631882148178,
"grad_norm": 1.1766420602798462,
"learning_rate": 1.3734128012438458e-06,
"loss": 1.9555,
"step": 265
},
{
"epoch": 0.006203955021326095,
"grad_norm": 1.1652954816818237,
"learning_rate": 1.3785954910598601e-06,
"loss": 1.8446,
"step": 266
},
{
"epoch": 0.006227278160504013,
"grad_norm": 1.378980278968811,
"learning_rate": 1.3837781808758747e-06,
"loss": 1.4449,
"step": 267
},
{
"epoch": 0.006250601299681931,
"grad_norm": 1.2017453908920288,
"learning_rate": 1.3889608706918892e-06,
"loss": 1.6272,
"step": 268
},
{
"epoch": 0.006273924438859848,
"grad_norm": 1.2221115827560425,
"learning_rate": 1.3941435605079038e-06,
"loss": 1.7299,
"step": 269
},
{
"epoch": 0.006297247578037766,
"grad_norm": 1.189775824546814,
"learning_rate": 1.3993262503239183e-06,
"loss": 1.1664,
"step": 270
},
{
"epoch": 0.006320570717215684,
"grad_norm": 1.0103381872177124,
"learning_rate": 1.404508940139933e-06,
"loss": 1.3519,
"step": 271
},
{
"epoch": 0.006343893856393602,
"grad_norm": 1.1243481636047363,
"learning_rate": 1.4096916299559475e-06,
"loss": 1.6704,
"step": 272
},
{
"epoch": 0.006367216995571519,
"grad_norm": 1.8137811422348022,
"learning_rate": 1.4148743197719616e-06,
"loss": 1.279,
"step": 273
},
{
"epoch": 0.0063905401347494365,
"grad_norm": 1.0875202417373657,
"learning_rate": 1.4200570095879762e-06,
"loss": 1.1564,
"step": 274
},
{
"epoch": 0.0064138632739273544,
"grad_norm": 1.0839550495147705,
"learning_rate": 1.4252396994039907e-06,
"loss": 1.7263,
"step": 275
},
{
"epoch": 0.006437186413105272,
"grad_norm": 1.7203173637390137,
"learning_rate": 1.4304223892200053e-06,
"loss": 1.9309,
"step": 276
},
{
"epoch": 0.006460509552283189,
"grad_norm": 1.3320658206939697,
"learning_rate": 1.4356050790360198e-06,
"loss": 1.8276,
"step": 277
},
{
"epoch": 0.006483832691461107,
"grad_norm": 1.5260910987854004,
"learning_rate": 1.4407877688520344e-06,
"loss": 1.413,
"step": 278
},
{
"epoch": 0.006507155830639025,
"grad_norm": 1.2401058673858643,
"learning_rate": 1.445970458668049e-06,
"loss": 1.4087,
"step": 279
},
{
"epoch": 0.006530478969816942,
"grad_norm": 1.2722922563552856,
"learning_rate": 1.4511531484840633e-06,
"loss": 1.6216,
"step": 280
},
{
"epoch": 0.00655380210899486,
"grad_norm": 1.2668229341506958,
"learning_rate": 1.4563358383000779e-06,
"loss": 1.6252,
"step": 281
},
{
"epoch": 0.006577125248172778,
"grad_norm": 1.4556583166122437,
"learning_rate": 1.4615185281160924e-06,
"loss": 2.3276,
"step": 282
},
{
"epoch": 0.006600448387350696,
"grad_norm": 1.537610411643982,
"learning_rate": 1.466701217932107e-06,
"loss": 1.4319,
"step": 283
},
{
"epoch": 0.006623771526528613,
"grad_norm": 1.3130170106887817,
"learning_rate": 1.4718839077481215e-06,
"loss": 1.4978,
"step": 284
},
{
"epoch": 0.006647094665706531,
"grad_norm": 1.5020934343338013,
"learning_rate": 1.477066597564136e-06,
"loss": 1.8697,
"step": 285
},
{
"epoch": 0.006670417804884449,
"grad_norm": 1.6949779987335205,
"learning_rate": 1.4822492873801502e-06,
"loss": 1.7433,
"step": 286
},
{
"epoch": 0.006693740944062366,
"grad_norm": 1.5566325187683105,
"learning_rate": 1.4874319771961648e-06,
"loss": 1.5674,
"step": 287
},
{
"epoch": 0.006717064083240284,
"grad_norm": 1.015093445777893,
"learning_rate": 1.4926146670121793e-06,
"loss": 1.9903,
"step": 288
},
{
"epoch": 0.0067403872224182015,
"grad_norm": 2.229853868484497,
"learning_rate": 1.497797356828194e-06,
"loss": 1.1905,
"step": 289
},
{
"epoch": 0.006763710361596119,
"grad_norm": 1.5241860151290894,
"learning_rate": 1.5029800466442085e-06,
"loss": 1.958,
"step": 290
},
{
"epoch": 0.006787033500774036,
"grad_norm": 0.8666454553604126,
"learning_rate": 1.508162736460223e-06,
"loss": 1.7141,
"step": 291
},
{
"epoch": 0.006810356639951954,
"grad_norm": 1.4594520330429077,
"learning_rate": 1.5133454262762376e-06,
"loss": 1.7235,
"step": 292
},
{
"epoch": 0.006833679779129872,
"grad_norm": 1.3267074823379517,
"learning_rate": 1.518528116092252e-06,
"loss": 1.6172,
"step": 293
},
{
"epoch": 0.006857002918307789,
"grad_norm": 1.5386312007904053,
"learning_rate": 1.5237108059082665e-06,
"loss": 1.4843,
"step": 294
},
{
"epoch": 0.006880326057485707,
"grad_norm": 1.3275539875030518,
"learning_rate": 1.528893495724281e-06,
"loss": 1.5444,
"step": 295
},
{
"epoch": 0.006903649196663625,
"grad_norm": 1.1002707481384277,
"learning_rate": 1.5340761855402956e-06,
"loss": 1.717,
"step": 296
},
{
"epoch": 0.006926972335841543,
"grad_norm": 1.172974944114685,
"learning_rate": 1.5392588753563102e-06,
"loss": 1.6963,
"step": 297
},
{
"epoch": 0.00695029547501946,
"grad_norm": 1.0728440284729004,
"learning_rate": 1.5444415651723247e-06,
"loss": 1.6228,
"step": 298
},
{
"epoch": 0.006973618614197378,
"grad_norm": 1.274348258972168,
"learning_rate": 1.5496242549883393e-06,
"loss": 1.2559,
"step": 299
},
{
"epoch": 0.006996941753375296,
"grad_norm": 1.2520028352737427,
"learning_rate": 1.5548069448043534e-06,
"loss": 1.6118,
"step": 300
},
{
"epoch": 0.007020264892553213,
"grad_norm": 1.5844305753707886,
"learning_rate": 1.559989634620368e-06,
"loss": 1.5645,
"step": 301
},
{
"epoch": 0.007043588031731131,
"grad_norm": 2.285438299179077,
"learning_rate": 1.5651723244363825e-06,
"loss": 1.4541,
"step": 302
},
{
"epoch": 0.007066911170909049,
"grad_norm": 1.2873152494430542,
"learning_rate": 1.570355014252397e-06,
"loss": 1.4835,
"step": 303
},
{
"epoch": 0.0070902343100869665,
"grad_norm": 1.1332640647888184,
"learning_rate": 1.5755377040684116e-06,
"loss": 1.8279,
"step": 304
},
{
"epoch": 0.0071135574492648835,
"grad_norm": 1.6483525037765503,
"learning_rate": 1.5807203938844262e-06,
"loss": 1.2509,
"step": 305
},
{
"epoch": 0.007136880588442801,
"grad_norm": 1.0219485759735107,
"learning_rate": 1.5859030837004408e-06,
"loss": 1.8421,
"step": 306
},
{
"epoch": 0.007160203727620719,
"grad_norm": 1.2478340864181519,
"learning_rate": 1.5910857735164551e-06,
"loss": 1.9144,
"step": 307
},
{
"epoch": 0.007183526866798637,
"grad_norm": 1.4016437530517578,
"learning_rate": 1.5962684633324697e-06,
"loss": 1.5146,
"step": 308
},
{
"epoch": 0.007206850005976554,
"grad_norm": 1.1399790048599243,
"learning_rate": 1.6014511531484842e-06,
"loss": 1.6714,
"step": 309
},
{
"epoch": 0.007230173145154472,
"grad_norm": 2.047961473464966,
"learning_rate": 1.6066338429644988e-06,
"loss": 1.1777,
"step": 310
},
{
"epoch": 0.00725349628433239,
"grad_norm": 1.1410201787948608,
"learning_rate": 1.6118165327805133e-06,
"loss": 1.6783,
"step": 311
},
{
"epoch": 0.007276819423510307,
"grad_norm": 1.2840640544891357,
"learning_rate": 1.616999222596528e-06,
"loss": 1.9351,
"step": 312
},
{
"epoch": 0.007300142562688225,
"grad_norm": 0.9116181135177612,
"learning_rate": 1.6221819124125425e-06,
"loss": 1.7705,
"step": 313
},
{
"epoch": 0.007323465701866143,
"grad_norm": 1.3190463781356812,
"learning_rate": 1.6273646022285566e-06,
"loss": 1.4484,
"step": 314
},
{
"epoch": 0.007346788841044061,
"grad_norm": 0.9988270401954651,
"learning_rate": 1.6325472920445712e-06,
"loss": 1.5159,
"step": 315
},
{
"epoch": 0.007370111980221978,
"grad_norm": 0.8620725870132446,
"learning_rate": 1.6377299818605857e-06,
"loss": 1.5605,
"step": 316
},
{
"epoch": 0.007393435119399896,
"grad_norm": 1.284604549407959,
"learning_rate": 1.6429126716766003e-06,
"loss": 1.4822,
"step": 317
},
{
"epoch": 0.0074167582585778135,
"grad_norm": 1.2546097040176392,
"learning_rate": 1.6480953614926148e-06,
"loss": 1.436,
"step": 318
},
{
"epoch": 0.0074400813977557306,
"grad_norm": 0.9116978645324707,
"learning_rate": 1.6532780513086294e-06,
"loss": 1.2708,
"step": 319
},
{
"epoch": 0.0074634045369336485,
"grad_norm": 0.9910548329353333,
"learning_rate": 1.658460741124644e-06,
"loss": 1.8144,
"step": 320
},
{
"epoch": 0.007486727676111566,
"grad_norm": 1.9879093170166016,
"learning_rate": 1.6636434309406583e-06,
"loss": 1.4826,
"step": 321
},
{
"epoch": 0.007510050815289484,
"grad_norm": 1.0845030546188354,
"learning_rate": 1.6688261207566729e-06,
"loss": 1.3364,
"step": 322
},
{
"epoch": 0.007533373954467401,
"grad_norm": 1.342966079711914,
"learning_rate": 1.6740088105726874e-06,
"loss": 1.6453,
"step": 323
},
{
"epoch": 0.007556697093645319,
"grad_norm": 0.9570252895355225,
"learning_rate": 1.679191500388702e-06,
"loss": 1.5384,
"step": 324
},
{
"epoch": 0.007580020232823237,
"grad_norm": 1.531516671180725,
"learning_rate": 1.6843741902047165e-06,
"loss": 1.5775,
"step": 325
},
{
"epoch": 0.007603343372001154,
"grad_norm": 1.4623240232467651,
"learning_rate": 1.689556880020731e-06,
"loss": 1.7159,
"step": 326
},
{
"epoch": 0.007626666511179072,
"grad_norm": 1.109586238861084,
"learning_rate": 1.6947395698367454e-06,
"loss": 1.7403,
"step": 327
},
{
"epoch": 0.00764998965035699,
"grad_norm": 1.3199604749679565,
"learning_rate": 1.6999222596527598e-06,
"loss": 1.7208,
"step": 328
},
{
"epoch": 0.007673312789534908,
"grad_norm": 1.0979784727096558,
"learning_rate": 1.7051049494687743e-06,
"loss": 1.6097,
"step": 329
},
{
"epoch": 0.007696635928712825,
"grad_norm": 1.0952926874160767,
"learning_rate": 1.710287639284789e-06,
"loss": 1.8262,
"step": 330
},
{
"epoch": 0.007719959067890743,
"grad_norm": 1.1149373054504395,
"learning_rate": 1.7154703291008035e-06,
"loss": 1.5762,
"step": 331
},
{
"epoch": 0.007743282207068661,
"grad_norm": 1.2090753316879272,
"learning_rate": 1.720653018916818e-06,
"loss": 1.6161,
"step": 332
},
{
"epoch": 0.007766605346246578,
"grad_norm": 1.3476163148880005,
"learning_rate": 1.7258357087328326e-06,
"loss": 1.6854,
"step": 333
},
{
"epoch": 0.0077899284854244955,
"grad_norm": 1.3222614526748657,
"learning_rate": 1.7310183985488471e-06,
"loss": 1.5996,
"step": 334
},
{
"epoch": 0.007813251624602413,
"grad_norm": 1.2350871562957764,
"learning_rate": 1.7362010883648615e-06,
"loss": 1.5052,
"step": 335
},
{
"epoch": 0.007836574763780331,
"grad_norm": 1.4628745317459106,
"learning_rate": 1.741383778180876e-06,
"loss": 1.6268,
"step": 336
},
{
"epoch": 0.00785989790295825,
"grad_norm": 1.3481048345565796,
"learning_rate": 1.7465664679968906e-06,
"loss": 1.4308,
"step": 337
},
{
"epoch": 0.007883221042136167,
"grad_norm": 1.0008901357650757,
"learning_rate": 1.7517491578129052e-06,
"loss": 1.6487,
"step": 338
},
{
"epoch": 0.007906544181314083,
"grad_norm": 2.4258437156677246,
"learning_rate": 1.7569318476289195e-06,
"loss": 1.5327,
"step": 339
},
{
"epoch": 0.007929867320492001,
"grad_norm": 1.3444914817810059,
"learning_rate": 1.762114537444934e-06,
"loss": 1.5257,
"step": 340
},
{
"epoch": 0.007953190459669919,
"grad_norm": 2.297591209411621,
"learning_rate": 1.7672972272609486e-06,
"loss": 1.9581,
"step": 341
},
{
"epoch": 0.007976513598847837,
"grad_norm": 1.107711672782898,
"learning_rate": 1.772479917076963e-06,
"loss": 1.3486,
"step": 342
},
{
"epoch": 0.007999836738025755,
"grad_norm": 1.4064106941223145,
"learning_rate": 1.7776626068929775e-06,
"loss": 1.3169,
"step": 343
},
{
"epoch": 0.008023159877203673,
"grad_norm": 1.1236720085144043,
"learning_rate": 1.782845296708992e-06,
"loss": 2.0225,
"step": 344
},
{
"epoch": 0.00804648301638159,
"grad_norm": 1.9214081764221191,
"learning_rate": 1.7880279865250066e-06,
"loss": 1.7269,
"step": 345
},
{
"epoch": 0.008069806155559507,
"grad_norm": 1.1544204950332642,
"learning_rate": 1.7932106763410212e-06,
"loss": 1.8407,
"step": 346
},
{
"epoch": 0.008093129294737425,
"grad_norm": 1.3266545534133911,
"learning_rate": 1.7983933661570358e-06,
"loss": 1.3316,
"step": 347
},
{
"epoch": 0.008116452433915343,
"grad_norm": 1.4208300113677979,
"learning_rate": 1.8035760559730501e-06,
"loss": 1.7712,
"step": 348
},
{
"epoch": 0.00813977557309326,
"grad_norm": 1.1849939823150635,
"learning_rate": 1.8087587457890647e-06,
"loss": 1.3843,
"step": 349
},
{
"epoch": 0.008163098712271178,
"grad_norm": 0.9147690534591675,
"learning_rate": 1.8139414356050792e-06,
"loss": 1.703,
"step": 350
},
{
"epoch": 0.008186421851449096,
"grad_norm": 1.2026822566986084,
"learning_rate": 1.8191241254210938e-06,
"loss": 1.642,
"step": 351
},
{
"epoch": 0.008209744990627014,
"grad_norm": 1.6620279550552368,
"learning_rate": 1.8243068152371081e-06,
"loss": 1.2861,
"step": 352
},
{
"epoch": 0.00823306812980493,
"grad_norm": 1.20318603515625,
"learning_rate": 1.8294895050531227e-06,
"loss": 1.7781,
"step": 353
},
{
"epoch": 0.008256391268982848,
"grad_norm": 1.117148756980896,
"learning_rate": 1.8346721948691372e-06,
"loss": 1.7056,
"step": 354
},
{
"epoch": 0.008279714408160766,
"grad_norm": 1.3435394763946533,
"learning_rate": 1.8398548846851516e-06,
"loss": 1.7352,
"step": 355
},
{
"epoch": 0.008303037547338684,
"grad_norm": 1.6550534963607788,
"learning_rate": 1.8450375745011662e-06,
"loss": 1.4283,
"step": 356
},
{
"epoch": 0.008326360686516602,
"grad_norm": 1.0326530933380127,
"learning_rate": 1.8502202643171807e-06,
"loss": 1.8726,
"step": 357
},
{
"epoch": 0.00834968382569452,
"grad_norm": 1.1237214803695679,
"learning_rate": 1.8554029541331953e-06,
"loss": 1.7547,
"step": 358
},
{
"epoch": 0.008373006964872438,
"grad_norm": 1.3457711935043335,
"learning_rate": 1.8605856439492098e-06,
"loss": 1.5047,
"step": 359
},
{
"epoch": 0.008396330104050354,
"grad_norm": 1.3615081310272217,
"learning_rate": 1.8657683337652244e-06,
"loss": 1.3476,
"step": 360
},
{
"epoch": 0.008419653243228272,
"grad_norm": 1.4443084001541138,
"learning_rate": 1.870951023581239e-06,
"loss": 1.4259,
"step": 361
},
{
"epoch": 0.00844297638240619,
"grad_norm": 0.9154095649719238,
"learning_rate": 1.8761337133972533e-06,
"loss": 1.6089,
"step": 362
},
{
"epoch": 0.008466299521584108,
"grad_norm": 1.1972756385803223,
"learning_rate": 1.8813164032132679e-06,
"loss": 1.5704,
"step": 363
},
{
"epoch": 0.008489622660762025,
"grad_norm": 1.1325738430023193,
"learning_rate": 1.8864990930292822e-06,
"loss": 1.7252,
"step": 364
},
{
"epoch": 0.008512945799939943,
"grad_norm": 1.2257301807403564,
"learning_rate": 1.8916817828452968e-06,
"loss": 1.5124,
"step": 365
},
{
"epoch": 0.008536268939117861,
"grad_norm": 1.7714002132415771,
"learning_rate": 1.8968644726613113e-06,
"loss": 1.5799,
"step": 366
},
{
"epoch": 0.008559592078295777,
"grad_norm": 1.1215579509735107,
"learning_rate": 1.9020471624773259e-06,
"loss": 1.7692,
"step": 367
},
{
"epoch": 0.008582915217473695,
"grad_norm": 1.3264069557189941,
"learning_rate": 1.9072298522933404e-06,
"loss": 1.7848,
"step": 368
},
{
"epoch": 0.008606238356651613,
"grad_norm": 0.9898104667663574,
"learning_rate": 1.912412542109355e-06,
"loss": 1.945,
"step": 369
},
{
"epoch": 0.008629561495829531,
"grad_norm": 0.9507944583892822,
"learning_rate": 1.9175952319253693e-06,
"loss": 1.6469,
"step": 370
},
{
"epoch": 0.008652884635007449,
"grad_norm": 1.1940997838974,
"learning_rate": 1.9227779217413837e-06,
"loss": 1.5144,
"step": 371
},
{
"epoch": 0.008676207774185367,
"grad_norm": 1.2926305532455444,
"learning_rate": 1.9279606115573985e-06,
"loss": 1.6527,
"step": 372
},
{
"epoch": 0.008699530913363285,
"grad_norm": 0.9909786581993103,
"learning_rate": 1.933143301373413e-06,
"loss": 1.8003,
"step": 373
},
{
"epoch": 0.008722854052541201,
"grad_norm": 1.3900662660598755,
"learning_rate": 1.9383259911894276e-06,
"loss": 1.7743,
"step": 374
},
{
"epoch": 0.008746177191719119,
"grad_norm": 0.9942039251327515,
"learning_rate": 1.943508681005442e-06,
"loss": 1.5635,
"step": 375
},
{
"epoch": 0.008769500330897037,
"grad_norm": 1.3887672424316406,
"learning_rate": 1.9486913708214563e-06,
"loss": 1.744,
"step": 376
},
{
"epoch": 0.008792823470074955,
"grad_norm": 1.2873059511184692,
"learning_rate": 1.953874060637471e-06,
"loss": 1.64,
"step": 377
},
{
"epoch": 0.008816146609252873,
"grad_norm": 1.2259247303009033,
"learning_rate": 1.9590567504534854e-06,
"loss": 1.6418,
"step": 378
},
{
"epoch": 0.00883946974843079,
"grad_norm": 1.5709097385406494,
"learning_rate": 1.9642394402695e-06,
"loss": 1.4343,
"step": 379
},
{
"epoch": 0.008862792887608708,
"grad_norm": 1.016625165939331,
"learning_rate": 1.9694221300855145e-06,
"loss": 1.5838,
"step": 380
},
{
"epoch": 0.008886116026786626,
"grad_norm": 1.5763674974441528,
"learning_rate": 1.9746048199015293e-06,
"loss": 1.3391,
"step": 381
},
{
"epoch": 0.008909439165964542,
"grad_norm": 1.014722466468811,
"learning_rate": 1.9797875097175436e-06,
"loss": 1.7185,
"step": 382
},
{
"epoch": 0.00893276230514246,
"grad_norm": 1.5255705118179321,
"learning_rate": 1.984970199533558e-06,
"loss": 1.5749,
"step": 383
},
{
"epoch": 0.008956085444320378,
"grad_norm": 1.4036648273468018,
"learning_rate": 1.9901528893495723e-06,
"loss": 1.4134,
"step": 384
},
{
"epoch": 0.008979408583498296,
"grad_norm": 1.327813982963562,
"learning_rate": 1.995335579165587e-06,
"loss": 1.8475,
"step": 385
},
{
"epoch": 0.009002731722676214,
"grad_norm": 1.357269287109375,
"learning_rate": 2.0005182689816014e-06,
"loss": 1.4145,
"step": 386
},
{
"epoch": 0.009026054861854132,
"grad_norm": 1.4663738012313843,
"learning_rate": 2.005700958797616e-06,
"loss": 1.5207,
"step": 387
},
{
"epoch": 0.00904937800103205,
"grad_norm": 0.9792691469192505,
"learning_rate": 2.0108836486136305e-06,
"loss": 1.7392,
"step": 388
},
{
"epoch": 0.009072701140209966,
"grad_norm": 1.9074856042861938,
"learning_rate": 2.0160663384296453e-06,
"loss": 1.5931,
"step": 389
},
{
"epoch": 0.009096024279387884,
"grad_norm": 1.562455654144287,
"learning_rate": 2.0212490282456597e-06,
"loss": 1.3503,
"step": 390
},
{
"epoch": 0.009119347418565802,
"grad_norm": 1.6827714443206787,
"learning_rate": 2.026431718061674e-06,
"loss": 1.8409,
"step": 391
},
{
"epoch": 0.00914267055774372,
"grad_norm": 0.969691276550293,
"learning_rate": 2.0316144078776888e-06,
"loss": 1.5167,
"step": 392
},
{
"epoch": 0.009165993696921637,
"grad_norm": 1.1107996702194214,
"learning_rate": 2.036797097693703e-06,
"loss": 1.5723,
"step": 393
},
{
"epoch": 0.009189316836099555,
"grad_norm": 0.9862359762191772,
"learning_rate": 2.041979787509718e-06,
"loss": 1.1188,
"step": 394
},
{
"epoch": 0.009212639975277473,
"grad_norm": 1.4997074604034424,
"learning_rate": 2.0471624773257322e-06,
"loss": 1.6742,
"step": 395
},
{
"epoch": 0.00923596311445539,
"grad_norm": 1.1336885690689087,
"learning_rate": 2.052345167141747e-06,
"loss": 1.5602,
"step": 396
},
{
"epoch": 0.009259286253633307,
"grad_norm": 1.4929397106170654,
"learning_rate": 2.057527856957761e-06,
"loss": 1.4891,
"step": 397
},
{
"epoch": 0.009282609392811225,
"grad_norm": 1.3118637800216675,
"learning_rate": 2.0627105467737757e-06,
"loss": 1.5758,
"step": 398
},
{
"epoch": 0.009305932531989143,
"grad_norm": 1.1043623685836792,
"learning_rate": 2.06789323658979e-06,
"loss": 1.9455,
"step": 399
},
{
"epoch": 0.009329255671167061,
"grad_norm": 1.3472813367843628,
"learning_rate": 2.073075926405805e-06,
"loss": 1.4657,
"step": 400
},
{
"epoch": 0.009352578810344979,
"grad_norm": 1.5614628791809082,
"learning_rate": 2.078258616221819e-06,
"loss": 1.3351,
"step": 401
},
{
"epoch": 0.009375901949522897,
"grad_norm": 1.393477439880371,
"learning_rate": 2.083441306037834e-06,
"loss": 1.8887,
"step": 402
},
{
"epoch": 0.009399225088700813,
"grad_norm": 1.0576095581054688,
"learning_rate": 2.0886239958538483e-06,
"loss": 1.7814,
"step": 403
},
{
"epoch": 0.00942254822787873,
"grad_norm": 1.5161347389221191,
"learning_rate": 2.0938066856698626e-06,
"loss": 1.2316,
"step": 404
},
{
"epoch": 0.009445871367056649,
"grad_norm": 1.05890691280365,
"learning_rate": 2.0989893754858774e-06,
"loss": 1.5303,
"step": 405
},
{
"epoch": 0.009469194506234567,
"grad_norm": 0.801816463470459,
"learning_rate": 2.1041720653018918e-06,
"loss": 1.5165,
"step": 406
},
{
"epoch": 0.009492517645412485,
"grad_norm": 1.2811832427978516,
"learning_rate": 2.1093547551179065e-06,
"loss": 1.8638,
"step": 407
},
{
"epoch": 0.009515840784590402,
"grad_norm": 1.2984956502914429,
"learning_rate": 2.114537444933921e-06,
"loss": 1.4195,
"step": 408
},
{
"epoch": 0.00953916392376832,
"grad_norm": 2.3772926330566406,
"learning_rate": 2.1197201347499356e-06,
"loss": 1.2616,
"step": 409
},
{
"epoch": 0.009562487062946236,
"grad_norm": 1.102181315422058,
"learning_rate": 2.12490282456595e-06,
"loss": 1.6683,
"step": 410
},
{
"epoch": 0.009585810202124154,
"grad_norm": 1.4473963975906372,
"learning_rate": 2.1300855143819643e-06,
"loss": 1.6474,
"step": 411
},
{
"epoch": 0.009609133341302072,
"grad_norm": 2.3995816707611084,
"learning_rate": 2.1352682041979787e-06,
"loss": 1.6203,
"step": 412
},
{
"epoch": 0.00963245648047999,
"grad_norm": 0.9490773677825928,
"learning_rate": 2.1404508940139935e-06,
"loss": 1.8082,
"step": 413
},
{
"epoch": 0.009655779619657908,
"grad_norm": 0.9358771443367004,
"learning_rate": 2.145633583830008e-06,
"loss": 1.5929,
"step": 414
},
{
"epoch": 0.009679102758835826,
"grad_norm": 0.9875616431236267,
"learning_rate": 2.1508162736460226e-06,
"loss": 1.4312,
"step": 415
},
{
"epoch": 0.009702425898013744,
"grad_norm": 1.197416067123413,
"learning_rate": 2.155998963462037e-06,
"loss": 1.3165,
"step": 416
},
{
"epoch": 0.00972574903719166,
"grad_norm": 2.0210750102996826,
"learning_rate": 2.1611816532780513e-06,
"loss": 1.4962,
"step": 417
},
{
"epoch": 0.009749072176369578,
"grad_norm": 1.2700085639953613,
"learning_rate": 2.166364343094066e-06,
"loss": 1.6101,
"step": 418
},
{
"epoch": 0.009772395315547496,
"grad_norm": 1.124679684638977,
"learning_rate": 2.1715470329100804e-06,
"loss": 1.7477,
"step": 419
},
{
"epoch": 0.009795718454725414,
"grad_norm": 1.178290843963623,
"learning_rate": 2.176729722726095e-06,
"loss": 1.4108,
"step": 420
},
{
"epoch": 0.009819041593903332,
"grad_norm": 1.792117953300476,
"learning_rate": 2.1819124125421095e-06,
"loss": 1.5568,
"step": 421
},
{
"epoch": 0.00984236473308125,
"grad_norm": 1.7381610870361328,
"learning_rate": 2.1870951023581243e-06,
"loss": 1.3229,
"step": 422
},
{
"epoch": 0.009865687872259167,
"grad_norm": 1.023553490638733,
"learning_rate": 2.1922777921741386e-06,
"loss": 1.1633,
"step": 423
},
{
"epoch": 0.009889011011437084,
"grad_norm": 1.5537900924682617,
"learning_rate": 2.197460481990153e-06,
"loss": 1.291,
"step": 424
},
{
"epoch": 0.009912334150615001,
"grad_norm": 1.722598671913147,
"learning_rate": 2.2026431718061673e-06,
"loss": 1.5201,
"step": 425
},
{
"epoch": 0.00993565728979292,
"grad_norm": 1.546295166015625,
"learning_rate": 2.207825861622182e-06,
"loss": 1.3554,
"step": 426
},
{
"epoch": 0.009958980428970837,
"grad_norm": 1.4075593948364258,
"learning_rate": 2.2130085514381964e-06,
"loss": 1.3831,
"step": 427
},
{
"epoch": 0.009982303568148755,
"grad_norm": 1.441125512123108,
"learning_rate": 2.218191241254211e-06,
"loss": 1.4806,
"step": 428
},
{
"epoch": 0.010005626707326673,
"grad_norm": 1.4198213815689087,
"learning_rate": 2.2233739310702255e-06,
"loss": 1.6962,
"step": 429
},
{
"epoch": 0.010028949846504591,
"grad_norm": 1.1716971397399902,
"learning_rate": 2.2285566208862403e-06,
"loss": 1.0423,
"step": 430
},
{
"epoch": 0.010052272985682507,
"grad_norm": 1.1271895170211792,
"learning_rate": 2.2337393107022547e-06,
"loss": 1.4246,
"step": 431
},
{
"epoch": 0.010075596124860425,
"grad_norm": 1.2987208366394043,
"learning_rate": 2.238922000518269e-06,
"loss": 1.5946,
"step": 432
},
{
"epoch": 0.010098919264038343,
"grad_norm": 1.7283997535705566,
"learning_rate": 2.2441046903342838e-06,
"loss": 1.5761,
"step": 433
},
{
"epoch": 0.01012224240321626,
"grad_norm": 1.635098934173584,
"learning_rate": 2.249287380150298e-06,
"loss": 1.6912,
"step": 434
},
{
"epoch": 0.010145565542394179,
"grad_norm": 2.1896469593048096,
"learning_rate": 2.254470069966313e-06,
"loss": 1.2961,
"step": 435
},
{
"epoch": 0.010168888681572097,
"grad_norm": 1.1874053478240967,
"learning_rate": 2.2596527597823272e-06,
"loss": 1.4999,
"step": 436
},
{
"epoch": 0.010192211820750014,
"grad_norm": 1.2898855209350586,
"learning_rate": 2.264835449598342e-06,
"loss": 1.7152,
"step": 437
},
{
"epoch": 0.010215534959927932,
"grad_norm": 0.792107105255127,
"learning_rate": 2.270018139414356e-06,
"loss": 1.4129,
"step": 438
},
{
"epoch": 0.010238858099105849,
"grad_norm": 1.2092666625976562,
"learning_rate": 2.2752008292303707e-06,
"loss": 1.4687,
"step": 439
},
{
"epoch": 0.010262181238283766,
"grad_norm": 1.2261115312576294,
"learning_rate": 2.280383519046385e-06,
"loss": 1.5548,
"step": 440
},
{
"epoch": 0.010285504377461684,
"grad_norm": 2.0835094451904297,
"learning_rate": 2.2855662088624e-06,
"loss": 1.5925,
"step": 441
},
{
"epoch": 0.010308827516639602,
"grad_norm": 1.075907826423645,
"learning_rate": 2.290748898678414e-06,
"loss": 1.4967,
"step": 442
},
{
"epoch": 0.01033215065581752,
"grad_norm": 0.9633646011352539,
"learning_rate": 2.295931588494429e-06,
"loss": 1.6798,
"step": 443
},
{
"epoch": 0.010355473794995438,
"grad_norm": 1.6833699941635132,
"learning_rate": 2.3011142783104433e-06,
"loss": 1.3053,
"step": 444
},
{
"epoch": 0.010378796934173356,
"grad_norm": 1.1333974599838257,
"learning_rate": 2.3062969681264576e-06,
"loss": 1.3658,
"step": 445
},
{
"epoch": 0.010402120073351272,
"grad_norm": 1.3382309675216675,
"learning_rate": 2.3114796579424724e-06,
"loss": 1.6492,
"step": 446
},
{
"epoch": 0.01042544321252919,
"grad_norm": 0.7148923873901367,
"learning_rate": 2.3166623477584868e-06,
"loss": 1.6269,
"step": 447
},
{
"epoch": 0.010448766351707108,
"grad_norm": 1.084245204925537,
"learning_rate": 2.3218450375745015e-06,
"loss": 2.0708,
"step": 448
},
{
"epoch": 0.010472089490885026,
"grad_norm": 1.1463004350662231,
"learning_rate": 2.327027727390516e-06,
"loss": 2.0115,
"step": 449
},
{
"epoch": 0.010495412630062944,
"grad_norm": 1.5500133037567139,
"learning_rate": 2.3322104172065306e-06,
"loss": 1.5454,
"step": 450
},
{
"epoch": 0.010518735769240862,
"grad_norm": 1.2993839979171753,
"learning_rate": 2.337393107022545e-06,
"loss": 1.5475,
"step": 451
},
{
"epoch": 0.01054205890841878,
"grad_norm": 1.295839786529541,
"learning_rate": 2.3425757968385593e-06,
"loss": 1.2895,
"step": 452
},
{
"epoch": 0.010565382047596696,
"grad_norm": 1.045040488243103,
"learning_rate": 2.3477584866545737e-06,
"loss": 1.7306,
"step": 453
},
{
"epoch": 0.010588705186774613,
"grad_norm": 1.4592766761779785,
"learning_rate": 2.3529411764705885e-06,
"loss": 1.7795,
"step": 454
},
{
"epoch": 0.010612028325952531,
"grad_norm": 0.9432761073112488,
"learning_rate": 2.358123866286603e-06,
"loss": 1.6963,
"step": 455
},
{
"epoch": 0.01063535146513045,
"grad_norm": 1.3770086765289307,
"learning_rate": 2.3633065561026176e-06,
"loss": 1.2003,
"step": 456
},
{
"epoch": 0.010658674604308367,
"grad_norm": 1.1453793048858643,
"learning_rate": 2.368489245918632e-06,
"loss": 1.9012,
"step": 457
},
{
"epoch": 0.010681997743486285,
"grad_norm": 1.2836976051330566,
"learning_rate": 2.3736719357346467e-06,
"loss": 1.4324,
"step": 458
},
{
"epoch": 0.010705320882664203,
"grad_norm": 1.6498123407363892,
"learning_rate": 2.378854625550661e-06,
"loss": 1.6212,
"step": 459
},
{
"epoch": 0.010728644021842119,
"grad_norm": 1.3681795597076416,
"learning_rate": 2.3840373153666754e-06,
"loss": 1.6047,
"step": 460
},
{
"epoch": 0.010751967161020037,
"grad_norm": 1.4474722146987915,
"learning_rate": 2.38922000518269e-06,
"loss": 1.5279,
"step": 461
},
{
"epoch": 0.010775290300197955,
"grad_norm": 1.4832510948181152,
"learning_rate": 2.3944026949987045e-06,
"loss": 1.7073,
"step": 462
},
{
"epoch": 0.010798613439375873,
"grad_norm": 1.343935251235962,
"learning_rate": 2.3995853848147193e-06,
"loss": 1.4637,
"step": 463
},
{
"epoch": 0.01082193657855379,
"grad_norm": 1.8285539150238037,
"learning_rate": 2.4047680746307336e-06,
"loss": 1.3944,
"step": 464
},
{
"epoch": 0.010845259717731709,
"grad_norm": 1.4653230905532837,
"learning_rate": 2.4099507644467484e-06,
"loss": 1.8847,
"step": 465
},
{
"epoch": 0.010868582856909626,
"grad_norm": 1.4410351514816284,
"learning_rate": 2.4151334542627623e-06,
"loss": 1.7298,
"step": 466
},
{
"epoch": 0.010891905996087543,
"grad_norm": 1.3057256937026978,
"learning_rate": 2.420316144078777e-06,
"loss": 1.6188,
"step": 467
},
{
"epoch": 0.01091522913526546,
"grad_norm": 1.574479103088379,
"learning_rate": 2.4254988338947914e-06,
"loss": 1.585,
"step": 468
},
{
"epoch": 0.010938552274443378,
"grad_norm": 1.4391696453094482,
"learning_rate": 2.430681523710806e-06,
"loss": 1.7272,
"step": 469
},
{
"epoch": 0.010961875413621296,
"grad_norm": 2.304706335067749,
"learning_rate": 2.4358642135268205e-06,
"loss": 1.7127,
"step": 470
},
{
"epoch": 0.010985198552799214,
"grad_norm": 1.2380545139312744,
"learning_rate": 2.4410469033428353e-06,
"loss": 1.5428,
"step": 471
},
{
"epoch": 0.011008521691977132,
"grad_norm": 1.303446888923645,
"learning_rate": 2.4462295931588497e-06,
"loss": 1.609,
"step": 472
},
{
"epoch": 0.01103184483115505,
"grad_norm": 1.3888837099075317,
"learning_rate": 2.451412282974864e-06,
"loss": 1.7134,
"step": 473
},
{
"epoch": 0.011055167970332966,
"grad_norm": 0.9802701473236084,
"learning_rate": 2.4565949727908788e-06,
"loss": 1.4401,
"step": 474
},
{
"epoch": 0.011078491109510884,
"grad_norm": 1.5808403491973877,
"learning_rate": 2.461777662606893e-06,
"loss": 1.7415,
"step": 475
},
{
"epoch": 0.011101814248688802,
"grad_norm": 1.299912691116333,
"learning_rate": 2.466960352422908e-06,
"loss": 1.361,
"step": 476
},
{
"epoch": 0.01112513738786672,
"grad_norm": 0.9326110482215881,
"learning_rate": 2.4721430422389222e-06,
"loss": 1.222,
"step": 477
},
{
"epoch": 0.011148460527044638,
"grad_norm": 1.0385396480560303,
"learning_rate": 2.477325732054937e-06,
"loss": 1.4813,
"step": 478
},
{
"epoch": 0.011171783666222556,
"grad_norm": 1.1004397869110107,
"learning_rate": 2.482508421870951e-06,
"loss": 1.5064,
"step": 479
},
{
"epoch": 0.011195106805400474,
"grad_norm": 1.274898886680603,
"learning_rate": 2.4876911116869657e-06,
"loss": 1.3046,
"step": 480
},
{
"epoch": 0.01121842994457839,
"grad_norm": 1.0818660259246826,
"learning_rate": 2.49287380150298e-06,
"loss": 1.878,
"step": 481
},
{
"epoch": 0.011241753083756308,
"grad_norm": 1.2744652032852173,
"learning_rate": 2.498056491318995e-06,
"loss": 1.6394,
"step": 482
},
{
"epoch": 0.011265076222934226,
"grad_norm": 1.0467538833618164,
"learning_rate": 2.503239181135009e-06,
"loss": 1.8949,
"step": 483
},
{
"epoch": 0.011288399362112143,
"grad_norm": 1.2507177591323853,
"learning_rate": 2.508421870951024e-06,
"loss": 1.5386,
"step": 484
},
{
"epoch": 0.011311722501290061,
"grad_norm": 2.0707380771636963,
"learning_rate": 2.5136045607670383e-06,
"loss": 1.3359,
"step": 485
},
{
"epoch": 0.01133504564046798,
"grad_norm": 1.0060955286026,
"learning_rate": 2.518787250583053e-06,
"loss": 1.5551,
"step": 486
},
{
"epoch": 0.011358368779645897,
"grad_norm": 2.1019294261932373,
"learning_rate": 2.5239699403990674e-06,
"loss": 1.4009,
"step": 487
},
{
"epoch": 0.011381691918823813,
"grad_norm": 1.2085974216461182,
"learning_rate": 2.529152630215082e-06,
"loss": 1.1264,
"step": 488
},
{
"epoch": 0.011405015058001731,
"grad_norm": 1.2670215368270874,
"learning_rate": 2.5343353200310965e-06,
"loss": 1.4005,
"step": 489
},
{
"epoch": 0.011428338197179649,
"grad_norm": 0.976809024810791,
"learning_rate": 2.5395180098471104e-06,
"loss": 1.6539,
"step": 490
},
{
"epoch": 0.011451661336357567,
"grad_norm": 1.8012447357177734,
"learning_rate": 2.5447006996631252e-06,
"loss": 1.5083,
"step": 491
},
{
"epoch": 0.011474984475535485,
"grad_norm": 2.0657784938812256,
"learning_rate": 2.5498833894791396e-06,
"loss": 1.4127,
"step": 492
},
{
"epoch": 0.011498307614713403,
"grad_norm": 1.4070103168487549,
"learning_rate": 2.5550660792951543e-06,
"loss": 1.4707,
"step": 493
},
{
"epoch": 0.01152163075389132,
"grad_norm": 0.859045147895813,
"learning_rate": 2.5602487691111687e-06,
"loss": 1.6301,
"step": 494
},
{
"epoch": 0.011544953893069239,
"grad_norm": 1.5209952592849731,
"learning_rate": 2.5654314589271835e-06,
"loss": 1.8438,
"step": 495
},
{
"epoch": 0.011568277032247155,
"grad_norm": 1.1508231163024902,
"learning_rate": 2.570614148743198e-06,
"loss": 1.2495,
"step": 496
},
{
"epoch": 0.011591600171425073,
"grad_norm": 0.9130313396453857,
"learning_rate": 2.5757968385592126e-06,
"loss": 1.1848,
"step": 497
},
{
"epoch": 0.01161492331060299,
"grad_norm": 1.5925562381744385,
"learning_rate": 2.580979528375227e-06,
"loss": 1.4745,
"step": 498
},
{
"epoch": 0.011638246449780908,
"grad_norm": 2.5118539333343506,
"learning_rate": 2.5861622181912417e-06,
"loss": 1.6218,
"step": 499
},
{
"epoch": 0.011661569588958826,
"grad_norm": 1.272691249847412,
"learning_rate": 2.591344908007256e-06,
"loss": 1.2147,
"step": 500
},
{
"epoch": 0.011684892728136744,
"grad_norm": 1.1436160802841187,
"learning_rate": 2.596527597823271e-06,
"loss": 1.5556,
"step": 501
},
{
"epoch": 0.011708215867314662,
"grad_norm": 1.0195647478103638,
"learning_rate": 2.601710287639285e-06,
"loss": 1.3303,
"step": 502
},
{
"epoch": 0.011731539006492578,
"grad_norm": 1.4576568603515625,
"learning_rate": 2.6068929774553e-06,
"loss": 1.6531,
"step": 503
},
{
"epoch": 0.011754862145670496,
"grad_norm": 1.360716462135315,
"learning_rate": 2.612075667271314e-06,
"loss": 1.1761,
"step": 504
},
{
"epoch": 0.011778185284848414,
"grad_norm": 2.7770462036132812,
"learning_rate": 2.617258357087328e-06,
"loss": 1.247,
"step": 505
},
{
"epoch": 0.011801508424026332,
"grad_norm": 1.3706661462783813,
"learning_rate": 2.622441046903343e-06,
"loss": 1.5103,
"step": 506
},
{
"epoch": 0.01182483156320425,
"grad_norm": 1.5405017137527466,
"learning_rate": 2.6276237367193573e-06,
"loss": 1.6827,
"step": 507
},
{
"epoch": 0.011848154702382168,
"grad_norm": 1.1809494495391846,
"learning_rate": 2.632806426535372e-06,
"loss": 1.7162,
"step": 508
},
{
"epoch": 0.011871477841560086,
"grad_norm": 1.085557222366333,
"learning_rate": 2.6379891163513864e-06,
"loss": 1.514,
"step": 509
},
{
"epoch": 0.011894800980738002,
"grad_norm": 1.2155910730361938,
"learning_rate": 2.643171806167401e-06,
"loss": 1.4029,
"step": 510
},
{
"epoch": 0.01191812411991592,
"grad_norm": 1.240242600440979,
"learning_rate": 2.6483544959834155e-06,
"loss": 1.4336,
"step": 511
},
{
"epoch": 0.011941447259093838,
"grad_norm": 1.649802327156067,
"learning_rate": 2.6535371857994303e-06,
"loss": 1.9082,
"step": 512
},
{
"epoch": 0.011964770398271755,
"grad_norm": 1.3479831218719482,
"learning_rate": 2.6587198756154447e-06,
"loss": 1.5424,
"step": 513
},
{
"epoch": 0.011988093537449673,
"grad_norm": 1.2537102699279785,
"learning_rate": 2.6639025654314594e-06,
"loss": 1.6061,
"step": 514
},
{
"epoch": 0.012011416676627591,
"grad_norm": 1.1049939393997192,
"learning_rate": 2.6690852552474738e-06,
"loss": 1.8361,
"step": 515
},
{
"epoch": 0.012034739815805509,
"grad_norm": 2.9946062564849854,
"learning_rate": 2.6742679450634885e-06,
"loss": 1.4471,
"step": 516
},
{
"epoch": 0.012058062954983425,
"grad_norm": 0.9455610513687134,
"learning_rate": 2.6794506348795025e-06,
"loss": 1.6831,
"step": 517
},
{
"epoch": 0.012081386094161343,
"grad_norm": 1.4750438928604126,
"learning_rate": 2.684633324695517e-06,
"loss": 1.3143,
"step": 518
},
{
"epoch": 0.012104709233339261,
"grad_norm": 1.1056557893753052,
"learning_rate": 2.6898160145115316e-06,
"loss": 1.5054,
"step": 519
},
{
"epoch": 0.012128032372517179,
"grad_norm": 0.9718064069747925,
"learning_rate": 2.694998704327546e-06,
"loss": 1.3134,
"step": 520
},
{
"epoch": 0.012151355511695097,
"grad_norm": 2.2384724617004395,
"learning_rate": 2.7001813941435607e-06,
"loss": 1.4851,
"step": 521
},
{
"epoch": 0.012174678650873015,
"grad_norm": 1.2468239068984985,
"learning_rate": 2.705364083959575e-06,
"loss": 1.4873,
"step": 522
},
{
"epoch": 0.012198001790050933,
"grad_norm": 1.4248602390289307,
"learning_rate": 2.71054677377559e-06,
"loss": 1.7643,
"step": 523
},
{
"epoch": 0.012221324929228849,
"grad_norm": 1.3377385139465332,
"learning_rate": 2.715729463591604e-06,
"loss": 1.7064,
"step": 524
},
{
"epoch": 0.012244648068406767,
"grad_norm": 0.9933966994285583,
"learning_rate": 2.720912153407619e-06,
"loss": 1.7187,
"step": 525
},
{
"epoch": 0.012267971207584685,
"grad_norm": 1.018750548362732,
"learning_rate": 2.7260948432236333e-06,
"loss": 1.5915,
"step": 526
},
{
"epoch": 0.012291294346762602,
"grad_norm": 1.356325387954712,
"learning_rate": 2.731277533039648e-06,
"loss": 1.7193,
"step": 527
},
{
"epoch": 0.01231461748594052,
"grad_norm": 1.2781217098236084,
"learning_rate": 2.7364602228556624e-06,
"loss": 1.5494,
"step": 528
},
{
"epoch": 0.012337940625118438,
"grad_norm": 1.561498761177063,
"learning_rate": 2.741642912671677e-06,
"loss": 1.6972,
"step": 529
},
{
"epoch": 0.012361263764296356,
"grad_norm": 1.1695748567581177,
"learning_rate": 2.7468256024876915e-06,
"loss": 2.1633,
"step": 530
},
{
"epoch": 0.012384586903474272,
"grad_norm": 1.4304964542388916,
"learning_rate": 2.7520082923037054e-06,
"loss": 1.6321,
"step": 531
},
{
"epoch": 0.01240791004265219,
"grad_norm": 1.0513828992843628,
"learning_rate": 2.7571909821197202e-06,
"loss": 1.2897,
"step": 532
},
{
"epoch": 0.012431233181830108,
"grad_norm": 1.0206960439682007,
"learning_rate": 2.7623736719357346e-06,
"loss": 1.7842,
"step": 533
},
{
"epoch": 0.012454556321008026,
"grad_norm": 1.1440876722335815,
"learning_rate": 2.7675563617517493e-06,
"loss": 1.4399,
"step": 534
},
{
"epoch": 0.012477879460185944,
"grad_norm": 1.0837441682815552,
"learning_rate": 2.7727390515677637e-06,
"loss": 1.5155,
"step": 535
},
{
"epoch": 0.012501202599363862,
"grad_norm": 1.071378231048584,
"learning_rate": 2.7779217413837785e-06,
"loss": 1.6459,
"step": 536
},
{
"epoch": 0.01252452573854178,
"grad_norm": 1.6966552734375,
"learning_rate": 2.783104431199793e-06,
"loss": 1.6015,
"step": 537
},
{
"epoch": 0.012547848877719696,
"grad_norm": 1.2789183855056763,
"learning_rate": 2.7882871210158076e-06,
"loss": 1.2423,
"step": 538
},
{
"epoch": 0.012571172016897614,
"grad_norm": 1.2072651386260986,
"learning_rate": 2.793469810831822e-06,
"loss": 1.69,
"step": 539
},
{
"epoch": 0.012594495156075532,
"grad_norm": 1.5257117748260498,
"learning_rate": 2.7986525006478367e-06,
"loss": 1.7608,
"step": 540
},
{
"epoch": 0.01261781829525345,
"grad_norm": 1.0233759880065918,
"learning_rate": 2.803835190463851e-06,
"loss": 1.1299,
"step": 541
},
{
"epoch": 0.012641141434431367,
"grad_norm": 1.8280616998672485,
"learning_rate": 2.809017880279866e-06,
"loss": 1.3338,
"step": 542
},
{
"epoch": 0.012664464573609285,
"grad_norm": 1.6891363859176636,
"learning_rate": 2.81420057009588e-06,
"loss": 1.5505,
"step": 543
},
{
"epoch": 0.012687787712787203,
"grad_norm": 1.1501421928405762,
"learning_rate": 2.819383259911895e-06,
"loss": 1.6788,
"step": 544
},
{
"epoch": 0.01271111085196512,
"grad_norm": 1.107029914855957,
"learning_rate": 2.824565949727909e-06,
"loss": 1.3782,
"step": 545
},
{
"epoch": 0.012734433991143037,
"grad_norm": 0.9627429246902466,
"learning_rate": 2.829748639543923e-06,
"loss": 1.3155,
"step": 546
},
{
"epoch": 0.012757757130320955,
"grad_norm": 2.330007791519165,
"learning_rate": 2.834931329359938e-06,
"loss": 1.425,
"step": 547
},
{
"epoch": 0.012781080269498873,
"grad_norm": 1.4026503562927246,
"learning_rate": 2.8401140191759523e-06,
"loss": 1.5578,
"step": 548
},
{
"epoch": 0.012804403408676791,
"grad_norm": 0.9430487155914307,
"learning_rate": 2.845296708991967e-06,
"loss": 1.6075,
"step": 549
},
{
"epoch": 0.012827726547854709,
"grad_norm": 1.0779294967651367,
"learning_rate": 2.8504793988079814e-06,
"loss": 1.5169,
"step": 550
},
{
"epoch": 0.012851049687032627,
"grad_norm": 1.130324125289917,
"learning_rate": 2.855662088623996e-06,
"loss": 1.5016,
"step": 551
},
{
"epoch": 0.012874372826210545,
"grad_norm": 1.0127092599868774,
"learning_rate": 2.8608447784400105e-06,
"loss": 1.8715,
"step": 552
},
{
"epoch": 0.01289769596538846,
"grad_norm": 1.1831302642822266,
"learning_rate": 2.8660274682560253e-06,
"loss": 1.678,
"step": 553
},
{
"epoch": 0.012921019104566379,
"grad_norm": 1.3394455909729004,
"learning_rate": 2.8712101580720397e-06,
"loss": 1.4129,
"step": 554
},
{
"epoch": 0.012944342243744297,
"grad_norm": 1.2189030647277832,
"learning_rate": 2.8763928478880544e-06,
"loss": 1.7364,
"step": 555
},
{
"epoch": 0.012967665382922215,
"grad_norm": 1.2808138132095337,
"learning_rate": 2.8815755377040688e-06,
"loss": 1.6274,
"step": 556
},
{
"epoch": 0.012990988522100132,
"grad_norm": 1.0384689569473267,
"learning_rate": 2.8867582275200835e-06,
"loss": 1.5942,
"step": 557
},
{
"epoch": 0.01301431166127805,
"grad_norm": 1.8520807027816772,
"learning_rate": 2.891940917336098e-06,
"loss": 1.3067,
"step": 558
},
{
"epoch": 0.013037634800455968,
"grad_norm": 1.1817374229431152,
"learning_rate": 2.897123607152112e-06,
"loss": 1.6405,
"step": 559
},
{
"epoch": 0.013060957939633884,
"grad_norm": 1.1010823249816895,
"learning_rate": 2.9023062969681266e-06,
"loss": 1.4339,
"step": 560
},
{
"epoch": 0.013084281078811802,
"grad_norm": 1.2461942434310913,
"learning_rate": 2.907488986784141e-06,
"loss": 1.9866,
"step": 561
},
{
"epoch": 0.01310760421798972,
"grad_norm": 1.1503125429153442,
"learning_rate": 2.9126716766001557e-06,
"loss": 1.585,
"step": 562
},
{
"epoch": 0.013130927357167638,
"grad_norm": 1.542434573173523,
"learning_rate": 2.91785436641617e-06,
"loss": 1.4524,
"step": 563
},
{
"epoch": 0.013154250496345556,
"grad_norm": 1.0469673871994019,
"learning_rate": 2.923037056232185e-06,
"loss": 1.6884,
"step": 564
},
{
"epoch": 0.013177573635523474,
"grad_norm": 1.5137437582015991,
"learning_rate": 2.928219746048199e-06,
"loss": 1.5377,
"step": 565
},
{
"epoch": 0.013200896774701392,
"grad_norm": 1.1454534530639648,
"learning_rate": 2.933402435864214e-06,
"loss": 1.8508,
"step": 566
},
{
"epoch": 0.013224219913879308,
"grad_norm": 1.310381531715393,
"learning_rate": 2.9385851256802283e-06,
"loss": 1.5774,
"step": 567
},
{
"epoch": 0.013247543053057226,
"grad_norm": 1.1223838329315186,
"learning_rate": 2.943767815496243e-06,
"loss": 1.4496,
"step": 568
},
{
"epoch": 0.013270866192235144,
"grad_norm": 1.4537910223007202,
"learning_rate": 2.9489505053122574e-06,
"loss": 1.4423,
"step": 569
},
{
"epoch": 0.013294189331413062,
"grad_norm": 1.1783167123794556,
"learning_rate": 2.954133195128272e-06,
"loss": 1.9314,
"step": 570
},
{
"epoch": 0.01331751247059098,
"grad_norm": 1.211719274520874,
"learning_rate": 2.9593158849442865e-06,
"loss": 1.5366,
"step": 571
},
{
"epoch": 0.013340835609768897,
"grad_norm": 2.9552671909332275,
"learning_rate": 2.9644985747603004e-06,
"loss": 1.3431,
"step": 572
},
{
"epoch": 0.013364158748946815,
"grad_norm": 1.2814795970916748,
"learning_rate": 2.9696812645763152e-06,
"loss": 1.3879,
"step": 573
},
{
"epoch": 0.013387481888124731,
"grad_norm": 1.2598010301589966,
"learning_rate": 2.9748639543923296e-06,
"loss": 1.4775,
"step": 574
},
{
"epoch": 0.01341080502730265,
"grad_norm": 1.3874925374984741,
"learning_rate": 2.9800466442083443e-06,
"loss": 1.4012,
"step": 575
},
{
"epoch": 0.013434128166480567,
"grad_norm": 1.1846306324005127,
"learning_rate": 2.9852293340243587e-06,
"loss": 1.4491,
"step": 576
},
{
"epoch": 0.013457451305658485,
"grad_norm": 1.388150691986084,
"learning_rate": 2.9904120238403734e-06,
"loss": 1.6913,
"step": 577
},
{
"epoch": 0.013480774444836403,
"grad_norm": 1.8026880025863647,
"learning_rate": 2.995594713656388e-06,
"loss": 1.1754,
"step": 578
},
{
"epoch": 0.013504097584014321,
"grad_norm": 1.9366620779037476,
"learning_rate": 3.0007774034724026e-06,
"loss": 1.4406,
"step": 579
},
{
"epoch": 0.013527420723192239,
"grad_norm": 1.039657473564148,
"learning_rate": 3.005960093288417e-06,
"loss": 1.4823,
"step": 580
},
{
"epoch": 0.013550743862370155,
"grad_norm": 1.0928449630737305,
"learning_rate": 3.0111427831044317e-06,
"loss": 1.4502,
"step": 581
},
{
"epoch": 0.013574067001548073,
"grad_norm": 2.408292531967163,
"learning_rate": 3.016325472920446e-06,
"loss": 1.4778,
"step": 582
},
{
"epoch": 0.01359739014072599,
"grad_norm": 1.2284953594207764,
"learning_rate": 3.021508162736461e-06,
"loss": 1.5887,
"step": 583
},
{
"epoch": 0.013620713279903909,
"grad_norm": 1.3841763734817505,
"learning_rate": 3.026690852552475e-06,
"loss": 1.3778,
"step": 584
},
{
"epoch": 0.013644036419081827,
"grad_norm": 1.305172324180603,
"learning_rate": 3.03187354236849e-06,
"loss": 1.2837,
"step": 585
},
{
"epoch": 0.013667359558259744,
"grad_norm": 1.087904691696167,
"learning_rate": 3.037056232184504e-06,
"loss": 1.4361,
"step": 586
},
{
"epoch": 0.013690682697437662,
"grad_norm": 1.1818716526031494,
"learning_rate": 3.042238922000518e-06,
"loss": 1.4903,
"step": 587
},
{
"epoch": 0.013714005836615578,
"grad_norm": 0.9969412088394165,
"learning_rate": 3.047421611816533e-06,
"loss": 1.6923,
"step": 588
},
{
"epoch": 0.013737328975793496,
"grad_norm": 1.3729232549667358,
"learning_rate": 3.0526043016325473e-06,
"loss": 1.4219,
"step": 589
},
{
"epoch": 0.013760652114971414,
"grad_norm": 1.091769814491272,
"learning_rate": 3.057786991448562e-06,
"loss": 1.6978,
"step": 590
},
{
"epoch": 0.013783975254149332,
"grad_norm": 1.1668254137039185,
"learning_rate": 3.0629696812645764e-06,
"loss": 1.4609,
"step": 591
},
{
"epoch": 0.01380729839332725,
"grad_norm": 1.3739502429962158,
"learning_rate": 3.068152371080591e-06,
"loss": 1.7247,
"step": 592
},
{
"epoch": 0.013830621532505168,
"grad_norm": 1.480758547782898,
"learning_rate": 3.0733350608966055e-06,
"loss": 1.6142,
"step": 593
},
{
"epoch": 0.013853944671683086,
"grad_norm": 0.853581964969635,
"learning_rate": 3.0785177507126203e-06,
"loss": 1.5563,
"step": 594
},
{
"epoch": 0.013877267810861002,
"grad_norm": 1.144692063331604,
"learning_rate": 3.0837004405286347e-06,
"loss": 1.6145,
"step": 595
},
{
"epoch": 0.01390059095003892,
"grad_norm": 1.2413440942764282,
"learning_rate": 3.0888831303446494e-06,
"loss": 1.5762,
"step": 596
},
{
"epoch": 0.013923914089216838,
"grad_norm": 1.147834062576294,
"learning_rate": 3.0940658201606638e-06,
"loss": 1.4478,
"step": 597
},
{
"epoch": 0.013947237228394756,
"grad_norm": 1.0349398851394653,
"learning_rate": 3.0992485099766785e-06,
"loss": 1.612,
"step": 598
},
{
"epoch": 0.013970560367572674,
"grad_norm": 1.4780391454696655,
"learning_rate": 3.104431199792693e-06,
"loss": 1.5179,
"step": 599
},
{
"epoch": 0.013993883506750592,
"grad_norm": 1.1395933628082275,
"learning_rate": 3.109613889608707e-06,
"loss": 1.4845,
"step": 600
},
{
"epoch": 0.01401720664592851,
"grad_norm": 1.37168550491333,
"learning_rate": 3.1147965794247216e-06,
"loss": 1.581,
"step": 601
},
{
"epoch": 0.014040529785106426,
"grad_norm": 1.8260347843170166,
"learning_rate": 3.119979269240736e-06,
"loss": 1.1221,
"step": 602
},
{
"epoch": 0.014063852924284343,
"grad_norm": 2.5528669357299805,
"learning_rate": 3.1251619590567507e-06,
"loss": 1.255,
"step": 603
},
{
"epoch": 0.014087176063462261,
"grad_norm": 1.3272032737731934,
"learning_rate": 3.130344648872765e-06,
"loss": 1.2713,
"step": 604
},
{
"epoch": 0.01411049920264018,
"grad_norm": 1.147449254989624,
"learning_rate": 3.13552733868878e-06,
"loss": 1.3694,
"step": 605
},
{
"epoch": 0.014133822341818097,
"grad_norm": 1.173793077468872,
"learning_rate": 3.140710028504794e-06,
"loss": 1.5818,
"step": 606
},
{
"epoch": 0.014157145480996015,
"grad_norm": 1.2347713708877563,
"learning_rate": 3.145892718320809e-06,
"loss": 1.501,
"step": 607
},
{
"epoch": 0.014180468620173933,
"grad_norm": 1.3945446014404297,
"learning_rate": 3.1510754081368233e-06,
"loss": 1.8674,
"step": 608
},
{
"epoch": 0.01420379175935185,
"grad_norm": 1.239762544631958,
"learning_rate": 3.156258097952838e-06,
"loss": 1.2516,
"step": 609
},
{
"epoch": 0.014227114898529767,
"grad_norm": 1.552531361579895,
"learning_rate": 3.1614407877688524e-06,
"loss": 1.5358,
"step": 610
},
{
"epoch": 0.014250438037707685,
"grad_norm": 1.576997995376587,
"learning_rate": 3.166623477584867e-06,
"loss": 1.7601,
"step": 611
},
{
"epoch": 0.014273761176885603,
"grad_norm": 1.3251402378082275,
"learning_rate": 3.1718061674008815e-06,
"loss": 1.2758,
"step": 612
},
{
"epoch": 0.01429708431606352,
"grad_norm": 1.2837574481964111,
"learning_rate": 3.1769888572168963e-06,
"loss": 1.528,
"step": 613
},
{
"epoch": 0.014320407455241439,
"grad_norm": 0.9697505831718445,
"learning_rate": 3.1821715470329102e-06,
"loss": 1.6359,
"step": 614
},
{
"epoch": 0.014343730594419356,
"grad_norm": 1.2682685852050781,
"learning_rate": 3.1873542368489246e-06,
"loss": 1.4759,
"step": 615
},
{
"epoch": 0.014367053733597274,
"grad_norm": 0.9607746005058289,
"learning_rate": 3.1925369266649393e-06,
"loss": 1.7474,
"step": 616
},
{
"epoch": 0.01439037687277519,
"grad_norm": 1.056736946105957,
"learning_rate": 3.1977196164809537e-06,
"loss": 1.8812,
"step": 617
},
{
"epoch": 0.014413700011953108,
"grad_norm": 1.1990852355957031,
"learning_rate": 3.2029023062969684e-06,
"loss": 1.6217,
"step": 618
},
{
"epoch": 0.014437023151131026,
"grad_norm": 1.1339764595031738,
"learning_rate": 3.208084996112983e-06,
"loss": 1.3557,
"step": 619
},
{
"epoch": 0.014460346290308944,
"grad_norm": 1.0672523975372314,
"learning_rate": 3.2132676859289976e-06,
"loss": 1.8239,
"step": 620
},
{
"epoch": 0.014483669429486862,
"grad_norm": 1.4371954202651978,
"learning_rate": 3.218450375745012e-06,
"loss": 1.4571,
"step": 621
},
{
"epoch": 0.01450699256866478,
"grad_norm": 1.9893105030059814,
"learning_rate": 3.2236330655610267e-06,
"loss": 1.3716,
"step": 622
},
{
"epoch": 0.014530315707842698,
"grad_norm": 1.7084318399429321,
"learning_rate": 3.228815755377041e-06,
"loss": 1.5201,
"step": 623
},
{
"epoch": 0.014553638847020614,
"grad_norm": 1.308225154876709,
"learning_rate": 3.233998445193056e-06,
"loss": 1.9173,
"step": 624
},
{
"epoch": 0.014576961986198532,
"grad_norm": 0.9914215803146362,
"learning_rate": 3.23918113500907e-06,
"loss": 1.7351,
"step": 625
},
{
"epoch": 0.01460028512537645,
"grad_norm": 1.0292766094207764,
"learning_rate": 3.244363824825085e-06,
"loss": 1.4073,
"step": 626
},
{
"epoch": 0.014623608264554368,
"grad_norm": 1.0998982191085815,
"learning_rate": 3.2495465146410993e-06,
"loss": 1.5979,
"step": 627
},
{
"epoch": 0.014646931403732286,
"grad_norm": 1.1409685611724854,
"learning_rate": 3.254729204457113e-06,
"loss": 1.3442,
"step": 628
},
{
"epoch": 0.014670254542910204,
"grad_norm": 1.7685736417770386,
"learning_rate": 3.259911894273128e-06,
"loss": 1.251,
"step": 629
},
{
"epoch": 0.014693577682088121,
"grad_norm": 1.6536918878555298,
"learning_rate": 3.2650945840891423e-06,
"loss": 1.4698,
"step": 630
},
{
"epoch": 0.014716900821266038,
"grad_norm": 2.046391248703003,
"learning_rate": 3.270277273905157e-06,
"loss": 1.5142,
"step": 631
},
{
"epoch": 0.014740223960443955,
"grad_norm": 1.3458948135375977,
"learning_rate": 3.2754599637211714e-06,
"loss": 1.3999,
"step": 632
},
{
"epoch": 0.014763547099621873,
"grad_norm": 1.7265046834945679,
"learning_rate": 3.280642653537186e-06,
"loss": 1.2212,
"step": 633
},
{
"epoch": 0.014786870238799791,
"grad_norm": 1.3191124200820923,
"learning_rate": 3.2858253433532005e-06,
"loss": 1.4354,
"step": 634
},
{
"epoch": 0.01481019337797771,
"grad_norm": 1.2317379713058472,
"learning_rate": 3.2910080331692153e-06,
"loss": 1.5661,
"step": 635
},
{
"epoch": 0.014833516517155627,
"grad_norm": 1.400969386100769,
"learning_rate": 3.2961907229852297e-06,
"loss": 1.462,
"step": 636
},
{
"epoch": 0.014856839656333545,
"grad_norm": 2.060718059539795,
"learning_rate": 3.3013734128012444e-06,
"loss": 1.7522,
"step": 637
},
{
"epoch": 0.014880162795511461,
"grad_norm": 1.138715386390686,
"learning_rate": 3.3065561026172588e-06,
"loss": 1.4923,
"step": 638
},
{
"epoch": 0.014903485934689379,
"grad_norm": 1.1973599195480347,
"learning_rate": 3.3117387924332735e-06,
"loss": 1.4462,
"step": 639
},
{
"epoch": 0.014926809073867297,
"grad_norm": 1.266867756843567,
"learning_rate": 3.316921482249288e-06,
"loss": 1.3159,
"step": 640
},
{
"epoch": 0.014950132213045215,
"grad_norm": 3.4681708812713623,
"learning_rate": 3.322104172065302e-06,
"loss": 1.3566,
"step": 641
},
{
"epoch": 0.014973455352223133,
"grad_norm": 1.248502492904663,
"learning_rate": 3.3272868618813166e-06,
"loss": 1.6299,
"step": 642
},
{
"epoch": 0.01499677849140105,
"grad_norm": 1.561563491821289,
"learning_rate": 3.332469551697331e-06,
"loss": 1.3246,
"step": 643
},
{
"epoch": 0.015020101630578968,
"grad_norm": 1.1922053098678589,
"learning_rate": 3.3376522415133457e-06,
"loss": 1.6847,
"step": 644
},
{
"epoch": 0.015043424769756885,
"grad_norm": 1.0779014825820923,
"learning_rate": 3.34283493132936e-06,
"loss": 1.8025,
"step": 645
},
{
"epoch": 0.015066747908934803,
"grad_norm": 1.5236597061157227,
"learning_rate": 3.348017621145375e-06,
"loss": 1.3894,
"step": 646
},
{
"epoch": 0.01509007104811272,
"grad_norm": 1.2087934017181396,
"learning_rate": 3.353200310961389e-06,
"loss": 1.9119,
"step": 647
},
{
"epoch": 0.015113394187290638,
"grad_norm": 1.435085654258728,
"learning_rate": 3.358383000777404e-06,
"loss": 1.4334,
"step": 648
},
{
"epoch": 0.015136717326468556,
"grad_norm": 1.3662467002868652,
"learning_rate": 3.3635656905934183e-06,
"loss": 1.6717,
"step": 649
},
{
"epoch": 0.015160040465646474,
"grad_norm": 1.379262924194336,
"learning_rate": 3.368748380409433e-06,
"loss": 1.0914,
"step": 650
},
{
"epoch": 0.015183363604824392,
"grad_norm": 1.436503529548645,
"learning_rate": 3.3739310702254474e-06,
"loss": 1.296,
"step": 651
},
{
"epoch": 0.015206686744002308,
"grad_norm": 1.0189919471740723,
"learning_rate": 3.379113760041462e-06,
"loss": 1.5578,
"step": 652
},
{
"epoch": 0.015230009883180226,
"grad_norm": 1.3371915817260742,
"learning_rate": 3.3842964498574765e-06,
"loss": 1.3883,
"step": 653
},
{
"epoch": 0.015253333022358144,
"grad_norm": 1.152949333190918,
"learning_rate": 3.389479139673491e-06,
"loss": 1.3408,
"step": 654
},
{
"epoch": 0.015276656161536062,
"grad_norm": 0.865856945514679,
"learning_rate": 3.3946618294895052e-06,
"loss": 1.8154,
"step": 655
},
{
"epoch": 0.01529997930071398,
"grad_norm": 1.3607538938522339,
"learning_rate": 3.3998445193055196e-06,
"loss": 1.5139,
"step": 656
},
{
"epoch": 0.015323302439891898,
"grad_norm": 1.0469399690628052,
"learning_rate": 3.4050272091215343e-06,
"loss": 1.4246,
"step": 657
},
{
"epoch": 0.015346625579069816,
"grad_norm": 1.2417982816696167,
"learning_rate": 3.4102098989375487e-06,
"loss": 1.4392,
"step": 658
},
{
"epoch": 0.015369948718247732,
"grad_norm": 2.018418073654175,
"learning_rate": 3.4153925887535634e-06,
"loss": 1.5175,
"step": 659
},
{
"epoch": 0.01539327185742565,
"grad_norm": 1.2593055963516235,
"learning_rate": 3.420575278569578e-06,
"loss": 1.6338,
"step": 660
},
{
"epoch": 0.015416594996603568,
"grad_norm": 1.0297298431396484,
"learning_rate": 3.4257579683855926e-06,
"loss": 1.6309,
"step": 661
},
{
"epoch": 0.015439918135781485,
"grad_norm": 1.2963732481002808,
"learning_rate": 3.430940658201607e-06,
"loss": 1.3099,
"step": 662
},
{
"epoch": 0.015463241274959403,
"grad_norm": 1.0868266820907593,
"learning_rate": 3.4361233480176217e-06,
"loss": 1.4949,
"step": 663
},
{
"epoch": 0.015486564414137321,
"grad_norm": 1.156296968460083,
"learning_rate": 3.441306037833636e-06,
"loss": 1.7845,
"step": 664
},
{
"epoch": 0.015509887553315239,
"grad_norm": 1.412965178489685,
"learning_rate": 3.446488727649651e-06,
"loss": 1.19,
"step": 665
},
{
"epoch": 0.015533210692493155,
"grad_norm": 1.0419931411743164,
"learning_rate": 3.451671417465665e-06,
"loss": 1.7125,
"step": 666
},
{
"epoch": 0.015556533831671073,
"grad_norm": 1.035372018814087,
"learning_rate": 3.4568541072816795e-06,
"loss": 1.7003,
"step": 667
},
{
"epoch": 0.015579856970848991,
"grad_norm": 1.1559805870056152,
"learning_rate": 3.4620367970976943e-06,
"loss": 1.981,
"step": 668
},
{
"epoch": 0.015603180110026909,
"grad_norm": 0.8634515404701233,
"learning_rate": 3.467219486913708e-06,
"loss": 1.2609,
"step": 669
},
{
"epoch": 0.015626503249204827,
"grad_norm": 1.1953692436218262,
"learning_rate": 3.472402176729723e-06,
"loss": 1.3956,
"step": 670
},
{
"epoch": 0.015649826388382745,
"grad_norm": 0.9668301939964294,
"learning_rate": 3.4775848665457373e-06,
"loss": 1.0568,
"step": 671
},
{
"epoch": 0.015673149527560663,
"grad_norm": 2.4868035316467285,
"learning_rate": 3.482767556361752e-06,
"loss": 1.364,
"step": 672
},
{
"epoch": 0.01569647266673858,
"grad_norm": 1.4255839586257935,
"learning_rate": 3.4879502461777664e-06,
"loss": 1.5207,
"step": 673
},
{
"epoch": 0.0157197958059165,
"grad_norm": 1.2752389907836914,
"learning_rate": 3.493132935993781e-06,
"loss": 1.5141,
"step": 674
},
{
"epoch": 0.015743118945094416,
"grad_norm": 1.2186245918273926,
"learning_rate": 3.4983156258097955e-06,
"loss": 1.3655,
"step": 675
},
{
"epoch": 0.015766442084272334,
"grad_norm": 1.3544304370880127,
"learning_rate": 3.5034983156258103e-06,
"loss": 1.7428,
"step": 676
},
{
"epoch": 0.01578976522345025,
"grad_norm": 1.0968130826950073,
"learning_rate": 3.5086810054418247e-06,
"loss": 1.3491,
"step": 677
},
{
"epoch": 0.015813088362628167,
"grad_norm": 1.1593806743621826,
"learning_rate": 3.513863695257839e-06,
"loss": 1.6708,
"step": 678
},
{
"epoch": 0.015836411501806084,
"grad_norm": 1.0408954620361328,
"learning_rate": 3.5190463850738538e-06,
"loss": 1.6977,
"step": 679
},
{
"epoch": 0.015859734640984002,
"grad_norm": 1.196632742881775,
"learning_rate": 3.524229074889868e-06,
"loss": 1.2019,
"step": 680
},
{
"epoch": 0.01588305778016192,
"grad_norm": 1.2698166370391846,
"learning_rate": 3.529411764705883e-06,
"loss": 1.8457,
"step": 681
},
{
"epoch": 0.015906380919339838,
"grad_norm": 0.9075011014938354,
"learning_rate": 3.5345944545218972e-06,
"loss": 1.2717,
"step": 682
},
{
"epoch": 0.015929704058517756,
"grad_norm": 1.0426501035690308,
"learning_rate": 3.5397771443379116e-06,
"loss": 1.6601,
"step": 683
},
{
"epoch": 0.015953027197695674,
"grad_norm": 1.4904205799102783,
"learning_rate": 3.544959834153926e-06,
"loss": 1.6324,
"step": 684
},
{
"epoch": 0.015976350336873592,
"grad_norm": 1.0664643049240112,
"learning_rate": 3.5501425239699407e-06,
"loss": 1.4896,
"step": 685
},
{
"epoch": 0.01599967347605151,
"grad_norm": 1.3758978843688965,
"learning_rate": 3.555325213785955e-06,
"loss": 1.5457,
"step": 686
},
{
"epoch": 0.016022996615229428,
"grad_norm": 1.4759879112243652,
"learning_rate": 3.56050790360197e-06,
"loss": 1.3865,
"step": 687
},
{
"epoch": 0.016046319754407345,
"grad_norm": 1.4678733348846436,
"learning_rate": 3.565690593417984e-06,
"loss": 1.223,
"step": 688
},
{
"epoch": 0.016069642893585263,
"grad_norm": 1.2057251930236816,
"learning_rate": 3.570873283233999e-06,
"loss": 1.4864,
"step": 689
},
{
"epoch": 0.01609296603276318,
"grad_norm": 1.3976320028305054,
"learning_rate": 3.5760559730500133e-06,
"loss": 1.3371,
"step": 690
},
{
"epoch": 0.016116289171941096,
"grad_norm": 1.0588197708129883,
"learning_rate": 3.5812386628660276e-06,
"loss": 1.264,
"step": 691
},
{
"epoch": 0.016139612311119014,
"grad_norm": 0.891678512096405,
"learning_rate": 3.5864213526820424e-06,
"loss": 1.6566,
"step": 692
},
{
"epoch": 0.01616293545029693,
"grad_norm": 1.1149228811264038,
"learning_rate": 3.5916040424980567e-06,
"loss": 1.6862,
"step": 693
},
{
"epoch": 0.01618625858947485,
"grad_norm": 1.463218331336975,
"learning_rate": 3.5967867323140715e-06,
"loss": 1.5771,
"step": 694
},
{
"epoch": 0.016209581728652767,
"grad_norm": 1.291648030281067,
"learning_rate": 3.601969422130086e-06,
"loss": 1.443,
"step": 695
},
{
"epoch": 0.016232904867830685,
"grad_norm": 1.1534149646759033,
"learning_rate": 3.6071521119461002e-06,
"loss": 1.76,
"step": 696
},
{
"epoch": 0.016256228007008603,
"grad_norm": 1.3349847793579102,
"learning_rate": 3.6123348017621146e-06,
"loss": 2.0584,
"step": 697
},
{
"epoch": 0.01627955114618652,
"grad_norm": 1.665682315826416,
"learning_rate": 3.6175174915781293e-06,
"loss": 1.5989,
"step": 698
},
{
"epoch": 0.01630287428536444,
"grad_norm": 1.6486263275146484,
"learning_rate": 3.6227001813941437e-06,
"loss": 1.7698,
"step": 699
},
{
"epoch": 0.016326197424542357,
"grad_norm": 1.5153722763061523,
"learning_rate": 3.6278828712101584e-06,
"loss": 1.3312,
"step": 700
},
{
"epoch": 0.016349520563720275,
"grad_norm": 1.3090248107910156,
"learning_rate": 3.633065561026173e-06,
"loss": 1.0735,
"step": 701
},
{
"epoch": 0.016372843702898193,
"grad_norm": 1.5462753772735596,
"learning_rate": 3.6382482508421876e-06,
"loss": 1.5408,
"step": 702
},
{
"epoch": 0.01639616684207611,
"grad_norm": 1.3447730541229248,
"learning_rate": 3.643430940658202e-06,
"loss": 1.5295,
"step": 703
},
{
"epoch": 0.01641948998125403,
"grad_norm": 1.232865571975708,
"learning_rate": 3.6486136304742163e-06,
"loss": 1.8686,
"step": 704
},
{
"epoch": 0.016442813120431946,
"grad_norm": 0.9742329120635986,
"learning_rate": 3.653796320290231e-06,
"loss": 1.5951,
"step": 705
},
{
"epoch": 0.01646613625960986,
"grad_norm": 1.1572047472000122,
"learning_rate": 3.6589790101062454e-06,
"loss": 1.5068,
"step": 706
},
{
"epoch": 0.01648945939878778,
"grad_norm": 1.2024304866790771,
"learning_rate": 3.66416169992226e-06,
"loss": 1.3933,
"step": 707
},
{
"epoch": 0.016512782537965696,
"grad_norm": 2.442342758178711,
"learning_rate": 3.6693443897382745e-06,
"loss": 1.0126,
"step": 708
},
{
"epoch": 0.016536105677143614,
"grad_norm": 1.2786589860916138,
"learning_rate": 3.6745270795542893e-06,
"loss": 1.6902,
"step": 709
},
{
"epoch": 0.016559428816321532,
"grad_norm": 0.9200882315635681,
"learning_rate": 3.679709769370303e-06,
"loss": 1.3918,
"step": 710
},
{
"epoch": 0.01658275195549945,
"grad_norm": 1.3768819570541382,
"learning_rate": 3.684892459186318e-06,
"loss": 1.6518,
"step": 711
},
{
"epoch": 0.016606075094677368,
"grad_norm": 1.274484395980835,
"learning_rate": 3.6900751490023323e-06,
"loss": 1.3728,
"step": 712
},
{
"epoch": 0.016629398233855286,
"grad_norm": 1.1752501726150513,
"learning_rate": 3.695257838818347e-06,
"loss": 1.4234,
"step": 713
},
{
"epoch": 0.016652721373033204,
"grad_norm": 1.4458903074264526,
"learning_rate": 3.7004405286343614e-06,
"loss": 1.5695,
"step": 714
},
{
"epoch": 0.01667604451221112,
"grad_norm": 1.2630547285079956,
"learning_rate": 3.705623218450376e-06,
"loss": 1.5334,
"step": 715
},
{
"epoch": 0.01669936765138904,
"grad_norm": 1.3754082918167114,
"learning_rate": 3.7108059082663905e-06,
"loss": 1.4807,
"step": 716
},
{
"epoch": 0.016722690790566958,
"grad_norm": 1.4704689979553223,
"learning_rate": 3.715988598082405e-06,
"loss": 1.5409,
"step": 717
},
{
"epoch": 0.016746013929744875,
"grad_norm": 1.4692633152008057,
"learning_rate": 3.7211712878984197e-06,
"loss": 1.5922,
"step": 718
},
{
"epoch": 0.016769337068922793,
"grad_norm": 1.2148405313491821,
"learning_rate": 3.726353977714434e-06,
"loss": 1.8115,
"step": 719
},
{
"epoch": 0.016792660208100708,
"grad_norm": 1.5564905405044556,
"learning_rate": 3.7315366675304488e-06,
"loss": 1.4189,
"step": 720
},
{
"epoch": 0.016815983347278626,
"grad_norm": 1.130292296409607,
"learning_rate": 3.736719357346463e-06,
"loss": 1.4455,
"step": 721
},
{
"epoch": 0.016839306486456544,
"grad_norm": 2.0609545707702637,
"learning_rate": 3.741902047162478e-06,
"loss": 1.6052,
"step": 722
},
{
"epoch": 0.01686262962563446,
"grad_norm": 1.0422543287277222,
"learning_rate": 3.7470847369784922e-06,
"loss": 1.5889,
"step": 723
},
{
"epoch": 0.01688595276481238,
"grad_norm": 1.7926782369613647,
"learning_rate": 3.7522674267945066e-06,
"loss": 1.2304,
"step": 724
},
{
"epoch": 0.016909275903990297,
"grad_norm": 1.2486250400543213,
"learning_rate": 3.757450116610521e-06,
"loss": 1.7512,
"step": 725
},
{
"epoch": 0.016932599043168215,
"grad_norm": 1.6907048225402832,
"learning_rate": 3.7626328064265357e-06,
"loss": 1.2031,
"step": 726
},
{
"epoch": 0.016955922182346133,
"grad_norm": 1.2899296283721924,
"learning_rate": 3.76781549624255e-06,
"loss": 1.3111,
"step": 727
},
{
"epoch": 0.01697924532152405,
"grad_norm": 2.320288896560669,
"learning_rate": 3.7729981860585644e-06,
"loss": 1.2764,
"step": 728
},
{
"epoch": 0.01700256846070197,
"grad_norm": 1.4165383577346802,
"learning_rate": 3.778180875874579e-06,
"loss": 1.2847,
"step": 729
},
{
"epoch": 0.017025891599879887,
"grad_norm": 1.1537601947784424,
"learning_rate": 3.7833635656905935e-06,
"loss": 1.6002,
"step": 730
},
{
"epoch": 0.017049214739057805,
"grad_norm": 1.3128899335861206,
"learning_rate": 3.7885462555066083e-06,
"loss": 1.4159,
"step": 731
},
{
"epoch": 0.017072537878235722,
"grad_norm": 0.9494642615318298,
"learning_rate": 3.7937289453226226e-06,
"loss": 1.5425,
"step": 732
},
{
"epoch": 0.01709586101741364,
"grad_norm": 1.8949923515319824,
"learning_rate": 3.7989116351386374e-06,
"loss": 1.109,
"step": 733
},
{
"epoch": 0.017119184156591555,
"grad_norm": 1.3136776685714722,
"learning_rate": 3.8040943249546517e-06,
"loss": 1.4208,
"step": 734
},
{
"epoch": 0.017142507295769473,
"grad_norm": 1.0108048915863037,
"learning_rate": 3.8092770147706665e-06,
"loss": 1.3101,
"step": 735
},
{
"epoch": 0.01716583043494739,
"grad_norm": 1.1397989988327026,
"learning_rate": 3.814459704586681e-06,
"loss": 1.6643,
"step": 736
},
{
"epoch": 0.01718915357412531,
"grad_norm": 0.9662717580795288,
"learning_rate": 3.819642394402696e-06,
"loss": 1.5524,
"step": 737
},
{
"epoch": 0.017212476713303226,
"grad_norm": 1.5264514684677124,
"learning_rate": 3.82482508421871e-06,
"loss": 1.6702,
"step": 738
},
{
"epoch": 0.017235799852481144,
"grad_norm": 1.1797709465026855,
"learning_rate": 3.830007774034724e-06,
"loss": 1.5751,
"step": 739
},
{
"epoch": 0.017259122991659062,
"grad_norm": 1.3964486122131348,
"learning_rate": 3.835190463850739e-06,
"loss": 1.3497,
"step": 740
},
{
"epoch": 0.01728244613083698,
"grad_norm": 1.0540798902511597,
"learning_rate": 3.840373153666753e-06,
"loss": 1.623,
"step": 741
},
{
"epoch": 0.017305769270014898,
"grad_norm": 1.8619107007980347,
"learning_rate": 3.845555843482767e-06,
"loss": 1.836,
"step": 742
},
{
"epoch": 0.017329092409192816,
"grad_norm": 1.190048098564148,
"learning_rate": 3.8507385332987826e-06,
"loss": 1.6031,
"step": 743
},
{
"epoch": 0.017352415548370734,
"grad_norm": 1.32784903049469,
"learning_rate": 3.855921223114797e-06,
"loss": 1.6144,
"step": 744
},
{
"epoch": 0.01737573868754865,
"grad_norm": 1.7393810749053955,
"learning_rate": 3.861103912930811e-06,
"loss": 1.4898,
"step": 745
},
{
"epoch": 0.01739906182672657,
"grad_norm": 1.008122444152832,
"learning_rate": 3.866286602746826e-06,
"loss": 1.6506,
"step": 746
},
{
"epoch": 0.017422384965904487,
"grad_norm": 1.3282239437103271,
"learning_rate": 3.871469292562841e-06,
"loss": 1.5178,
"step": 747
},
{
"epoch": 0.017445708105082402,
"grad_norm": 1.4479358196258545,
"learning_rate": 3.876651982378855e-06,
"loss": 1.5896,
"step": 748
},
{
"epoch": 0.01746903124426032,
"grad_norm": 1.9100661277770996,
"learning_rate": 3.8818346721948695e-06,
"loss": 1.2946,
"step": 749
},
{
"epoch": 0.017492354383438238,
"grad_norm": 1.269235610961914,
"learning_rate": 3.887017362010884e-06,
"loss": 1.5707,
"step": 750
},
{
"epoch": 0.017515677522616156,
"grad_norm": 1.3187369108200073,
"learning_rate": 3.892200051826899e-06,
"loss": 1.8153,
"step": 751
},
{
"epoch": 0.017539000661794073,
"grad_norm": 1.3091131448745728,
"learning_rate": 3.8973827416429125e-06,
"loss": 1.5973,
"step": 752
},
{
"epoch": 0.01756232380097199,
"grad_norm": 1.4826890230178833,
"learning_rate": 3.902565431458927e-06,
"loss": 1.3277,
"step": 753
},
{
"epoch": 0.01758564694014991,
"grad_norm": 1.2626949548721313,
"learning_rate": 3.907748121274942e-06,
"loss": 1.5531,
"step": 754
},
{
"epoch": 0.017608970079327827,
"grad_norm": 1.1990412473678589,
"learning_rate": 3.912930811090956e-06,
"loss": 1.349,
"step": 755
},
{
"epoch": 0.017632293218505745,
"grad_norm": 1.3036906719207764,
"learning_rate": 3.918113500906971e-06,
"loss": 1.5648,
"step": 756
},
{
"epoch": 0.017655616357683663,
"grad_norm": 1.3129525184631348,
"learning_rate": 3.923296190722985e-06,
"loss": 1.7147,
"step": 757
},
{
"epoch": 0.01767893949686158,
"grad_norm": 1.4686280488967896,
"learning_rate": 3.928478880539e-06,
"loss": 1.6136,
"step": 758
},
{
"epoch": 0.0177022626360395,
"grad_norm": 1.6845604181289673,
"learning_rate": 3.933661570355015e-06,
"loss": 1.763,
"step": 759
},
{
"epoch": 0.017725585775217417,
"grad_norm": 2.019049644470215,
"learning_rate": 3.938844260171029e-06,
"loss": 1.2543,
"step": 760
},
{
"epoch": 0.017748908914395334,
"grad_norm": 1.4184072017669678,
"learning_rate": 3.944026949987043e-06,
"loss": 1.596,
"step": 761
},
{
"epoch": 0.017772232053573252,
"grad_norm": 1.127982497215271,
"learning_rate": 3.9492096398030585e-06,
"loss": 1.5485,
"step": 762
},
{
"epoch": 0.017795555192751167,
"grad_norm": 1.5097321271896362,
"learning_rate": 3.954392329619073e-06,
"loss": 1.5452,
"step": 763
},
{
"epoch": 0.017818878331929085,
"grad_norm": 1.3832807540893555,
"learning_rate": 3.959575019435087e-06,
"loss": 1.3865,
"step": 764
},
{
"epoch": 0.017842201471107003,
"grad_norm": 1.065623164176941,
"learning_rate": 3.964757709251102e-06,
"loss": 1.2218,
"step": 765
},
{
"epoch": 0.01786552461028492,
"grad_norm": 1.2190065383911133,
"learning_rate": 3.969940399067116e-06,
"loss": 1.2169,
"step": 766
},
{
"epoch": 0.01788884774946284,
"grad_norm": 1.741749882698059,
"learning_rate": 3.97512308888313e-06,
"loss": 1.7316,
"step": 767
},
{
"epoch": 0.017912170888640756,
"grad_norm": 1.2072060108184814,
"learning_rate": 3.980305778699145e-06,
"loss": 1.815,
"step": 768
},
{
"epoch": 0.017935494027818674,
"grad_norm": 1.4645625352859497,
"learning_rate": 3.98548846851516e-06,
"loss": 1.2218,
"step": 769
},
{
"epoch": 0.017958817166996592,
"grad_norm": 1.4466350078582764,
"learning_rate": 3.990671158331174e-06,
"loss": 1.7291,
"step": 770
},
{
"epoch": 0.01798214030617451,
"grad_norm": 1.364358901977539,
"learning_rate": 3.9958538481471885e-06,
"loss": 1.6527,
"step": 771
},
{
"epoch": 0.018005463445352428,
"grad_norm": 1.2262394428253174,
"learning_rate": 4.001036537963203e-06,
"loss": 1.5522,
"step": 772
},
{
"epoch": 0.018028786584530346,
"grad_norm": 1.694001317024231,
"learning_rate": 4.006219227779218e-06,
"loss": 1.5791,
"step": 773
},
{
"epoch": 0.018052109723708264,
"grad_norm": 0.7941157817840576,
"learning_rate": 4.011401917595232e-06,
"loss": 1.23,
"step": 774
},
{
"epoch": 0.01807543286288618,
"grad_norm": 1.1942747831344604,
"learning_rate": 4.016584607411247e-06,
"loss": 1.4316,
"step": 775
},
{
"epoch": 0.0180987560020641,
"grad_norm": 1.5809072256088257,
"learning_rate": 4.021767297227261e-06,
"loss": 1.7361,
"step": 776
},
{
"epoch": 0.018122079141242014,
"grad_norm": 1.2918401956558228,
"learning_rate": 4.026949987043276e-06,
"loss": 1.3285,
"step": 777
},
{
"epoch": 0.018145402280419932,
"grad_norm": 1.966123342514038,
"learning_rate": 4.032132676859291e-06,
"loss": 1.2037,
"step": 778
},
{
"epoch": 0.01816872541959785,
"grad_norm": 1.3362590074539185,
"learning_rate": 4.037315366675304e-06,
"loss": 1.3811,
"step": 779
},
{
"epoch": 0.018192048558775768,
"grad_norm": 1.0375605821609497,
"learning_rate": 4.042498056491319e-06,
"loss": 1.481,
"step": 780
},
{
"epoch": 0.018215371697953685,
"grad_norm": 2.414684295654297,
"learning_rate": 4.047680746307334e-06,
"loss": 1.773,
"step": 781
},
{
"epoch": 0.018238694837131603,
"grad_norm": 1.2252676486968994,
"learning_rate": 4.052863436123348e-06,
"loss": 1.514,
"step": 782
},
{
"epoch": 0.01826201797630952,
"grad_norm": 1.517791748046875,
"learning_rate": 4.058046125939362e-06,
"loss": 1.3442,
"step": 783
},
{
"epoch": 0.01828534111548744,
"grad_norm": 1.0303611755371094,
"learning_rate": 4.0632288157553776e-06,
"loss": 1.5593,
"step": 784
},
{
"epoch": 0.018308664254665357,
"grad_norm": 1.3615033626556396,
"learning_rate": 4.068411505571392e-06,
"loss": 1.6971,
"step": 785
},
{
"epoch": 0.018331987393843275,
"grad_norm": 1.1224147081375122,
"learning_rate": 4.073594195387406e-06,
"loss": 1.2134,
"step": 786
},
{
"epoch": 0.018355310533021193,
"grad_norm": 1.3592679500579834,
"learning_rate": 4.078776885203421e-06,
"loss": 1.7391,
"step": 787
},
{
"epoch": 0.01837863367219911,
"grad_norm": 1.6286187171936035,
"learning_rate": 4.083959575019436e-06,
"loss": 1.7279,
"step": 788
},
{
"epoch": 0.01840195681137703,
"grad_norm": 1.2597742080688477,
"learning_rate": 4.08914226483545e-06,
"loss": 1.5227,
"step": 789
},
{
"epoch": 0.018425279950554947,
"grad_norm": 1.2776849269866943,
"learning_rate": 4.0943249546514645e-06,
"loss": 1.3575,
"step": 790
},
{
"epoch": 0.01844860308973286,
"grad_norm": 1.2529163360595703,
"learning_rate": 4.099507644467479e-06,
"loss": 1.6356,
"step": 791
},
{
"epoch": 0.01847192622891078,
"grad_norm": 1.184187650680542,
"learning_rate": 4.104690334283494e-06,
"loss": 1.734,
"step": 792
},
{
"epoch": 0.018495249368088697,
"grad_norm": 1.176222562789917,
"learning_rate": 4.1098730240995075e-06,
"loss": 1.5206,
"step": 793
},
{
"epoch": 0.018518572507266615,
"grad_norm": 1.0694701671600342,
"learning_rate": 4.115055713915522e-06,
"loss": 1.1824,
"step": 794
},
{
"epoch": 0.018541895646444533,
"grad_norm": 1.5169551372528076,
"learning_rate": 4.120238403731537e-06,
"loss": 1.3817,
"step": 795
},
{
"epoch": 0.01856521878562245,
"grad_norm": 1.0996246337890625,
"learning_rate": 4.125421093547551e-06,
"loss": 1.0921,
"step": 796
},
{
"epoch": 0.01858854192480037,
"grad_norm": 1.0202140808105469,
"learning_rate": 4.130603783363566e-06,
"loss": 1.2687,
"step": 797
},
{
"epoch": 0.018611865063978286,
"grad_norm": 2.089864730834961,
"learning_rate": 4.13578647317958e-06,
"loss": 1.5417,
"step": 798
},
{
"epoch": 0.018635188203156204,
"grad_norm": 1.1465847492218018,
"learning_rate": 4.140969162995595e-06,
"loss": 1.3415,
"step": 799
},
{
"epoch": 0.018658511342334122,
"grad_norm": 1.1085565090179443,
"learning_rate": 4.14615185281161e-06,
"loss": 1.4662,
"step": 800
},
{
"epoch": 0.01868183448151204,
"grad_norm": 1.2206768989562988,
"learning_rate": 4.151334542627624e-06,
"loss": 1.4954,
"step": 801
},
{
"epoch": 0.018705157620689958,
"grad_norm": 1.1540756225585938,
"learning_rate": 4.156517232443638e-06,
"loss": 1.4953,
"step": 802
},
{
"epoch": 0.018728480759867876,
"grad_norm": 1.9667025804519653,
"learning_rate": 4.1616999222596535e-06,
"loss": 1.1834,
"step": 803
},
{
"epoch": 0.018751803899045794,
"grad_norm": 1.2202988862991333,
"learning_rate": 4.166882612075668e-06,
"loss": 1.7045,
"step": 804
},
{
"epoch": 0.018775127038223708,
"grad_norm": 1.2399123907089233,
"learning_rate": 4.172065301891682e-06,
"loss": 1.4937,
"step": 805
},
{
"epoch": 0.018798450177401626,
"grad_norm": 1.5780203342437744,
"learning_rate": 4.177247991707697e-06,
"loss": 1.6386,
"step": 806
},
{
"epoch": 0.018821773316579544,
"grad_norm": 1.524564266204834,
"learning_rate": 4.182430681523711e-06,
"loss": 1.4951,
"step": 807
},
{
"epoch": 0.01884509645575746,
"grad_norm": 1.342991590499878,
"learning_rate": 4.187613371339725e-06,
"loss": 1.3007,
"step": 808
},
{
"epoch": 0.01886841959493538,
"grad_norm": 1.320813775062561,
"learning_rate": 4.19279606115574e-06,
"loss": 1.2112,
"step": 809
},
{
"epoch": 0.018891742734113297,
"grad_norm": 1.2329927682876587,
"learning_rate": 4.197978750971755e-06,
"loss": 1.333,
"step": 810
},
{
"epoch": 0.018915065873291215,
"grad_norm": 1.3429094552993774,
"learning_rate": 4.203161440787769e-06,
"loss": 1.4805,
"step": 811
},
{
"epoch": 0.018938389012469133,
"grad_norm": 1.643641710281372,
"learning_rate": 4.2083441306037835e-06,
"loss": 1.5665,
"step": 812
},
{
"epoch": 0.01896171215164705,
"grad_norm": 1.111887812614441,
"learning_rate": 4.213526820419798e-06,
"loss": 1.6087,
"step": 813
},
{
"epoch": 0.01898503529082497,
"grad_norm": 1.3594610691070557,
"learning_rate": 4.218709510235813e-06,
"loss": 1.7666,
"step": 814
},
{
"epoch": 0.019008358430002887,
"grad_norm": 1.2298046350479126,
"learning_rate": 4.223892200051827e-06,
"loss": 1.5032,
"step": 815
},
{
"epoch": 0.019031681569180805,
"grad_norm": 1.2679171562194824,
"learning_rate": 4.229074889867842e-06,
"loss": 1.4375,
"step": 816
},
{
"epoch": 0.019055004708358723,
"grad_norm": 1.0543935298919678,
"learning_rate": 4.234257579683856e-06,
"loss": 1.6645,
"step": 817
},
{
"epoch": 0.01907832784753664,
"grad_norm": 1.2821168899536133,
"learning_rate": 4.239440269499871e-06,
"loss": 1.1945,
"step": 818
},
{
"epoch": 0.01910165098671456,
"grad_norm": 1.5575084686279297,
"learning_rate": 4.244622959315886e-06,
"loss": 1.3262,
"step": 819
},
{
"epoch": 0.019124974125892473,
"grad_norm": 1.2359989881515503,
"learning_rate": 4.2498056491319e-06,
"loss": 1.4127,
"step": 820
},
{
"epoch": 0.01914829726507039,
"grad_norm": 1.0559273958206177,
"learning_rate": 4.254988338947914e-06,
"loss": 1.4455,
"step": 821
},
{
"epoch": 0.01917162040424831,
"grad_norm": 1.3651732206344604,
"learning_rate": 4.260171028763929e-06,
"loss": 1.245,
"step": 822
},
{
"epoch": 0.019194943543426227,
"grad_norm": 1.0067932605743408,
"learning_rate": 4.265353718579943e-06,
"loss": 1.4954,
"step": 823
},
{
"epoch": 0.019218266682604145,
"grad_norm": 1.7477822303771973,
"learning_rate": 4.270536408395957e-06,
"loss": 1.8164,
"step": 824
},
{
"epoch": 0.019241589821782062,
"grad_norm": 1.1976604461669922,
"learning_rate": 4.2757190982119726e-06,
"loss": 1.4552,
"step": 825
},
{
"epoch": 0.01926491296095998,
"grad_norm": 1.306269884109497,
"learning_rate": 4.280901788027987e-06,
"loss": 1.6348,
"step": 826
},
{
"epoch": 0.019288236100137898,
"grad_norm": 1.5786314010620117,
"learning_rate": 4.286084477844001e-06,
"loss": 1.4592,
"step": 827
},
{
"epoch": 0.019311559239315816,
"grad_norm": 1.4481762647628784,
"learning_rate": 4.291267167660016e-06,
"loss": 1.3409,
"step": 828
},
{
"epoch": 0.019334882378493734,
"grad_norm": 1.1410714387893677,
"learning_rate": 4.296449857476031e-06,
"loss": 1.5746,
"step": 829
},
{
"epoch": 0.019358205517671652,
"grad_norm": 1.363434076309204,
"learning_rate": 4.301632547292045e-06,
"loss": 1.0836,
"step": 830
},
{
"epoch": 0.01938152865684957,
"grad_norm": 1.1413646936416626,
"learning_rate": 4.3068152371080595e-06,
"loss": 1.8687,
"step": 831
},
{
"epoch": 0.019404851796027488,
"grad_norm": 1.9734309911727905,
"learning_rate": 4.311997926924074e-06,
"loss": 1.3295,
"step": 832
},
{
"epoch": 0.019428174935205406,
"grad_norm": 1.5119333267211914,
"learning_rate": 4.317180616740089e-06,
"loss": 1.6817,
"step": 833
},
{
"epoch": 0.01945149807438332,
"grad_norm": 1.3933395147323608,
"learning_rate": 4.3223633065561025e-06,
"loss": 1.5288,
"step": 834
},
{
"epoch": 0.019474821213561238,
"grad_norm": 1.3713746070861816,
"learning_rate": 4.327545996372117e-06,
"loss": 1.6361,
"step": 835
},
{
"epoch": 0.019498144352739156,
"grad_norm": 1.1849229335784912,
"learning_rate": 4.332728686188132e-06,
"loss": 1.6611,
"step": 836
},
{
"epoch": 0.019521467491917074,
"grad_norm": 2.122307777404785,
"learning_rate": 4.337911376004146e-06,
"loss": 1.6258,
"step": 837
},
{
"epoch": 0.01954479063109499,
"grad_norm": 1.221781611442566,
"learning_rate": 4.343094065820161e-06,
"loss": 1.9081,
"step": 838
},
{
"epoch": 0.01956811377027291,
"grad_norm": 1.2895511388778687,
"learning_rate": 4.348276755636175e-06,
"loss": 1.2742,
"step": 839
},
{
"epoch": 0.019591436909450827,
"grad_norm": 1.1531336307525635,
"learning_rate": 4.35345944545219e-06,
"loss": 1.587,
"step": 840
},
{
"epoch": 0.019614760048628745,
"grad_norm": 1.3979135751724243,
"learning_rate": 4.358642135268205e-06,
"loss": 1.5208,
"step": 841
},
{
"epoch": 0.019638083187806663,
"grad_norm": 1.3758100271224976,
"learning_rate": 4.363824825084219e-06,
"loss": 1.246,
"step": 842
},
{
"epoch": 0.01966140632698458,
"grad_norm": 1.3759677410125732,
"learning_rate": 4.369007514900233e-06,
"loss": 1.7344,
"step": 843
},
{
"epoch": 0.0196847294661625,
"grad_norm": 1.5575461387634277,
"learning_rate": 4.3741902047162485e-06,
"loss": 1.5554,
"step": 844
},
{
"epoch": 0.019708052605340417,
"grad_norm": 1.5018088817596436,
"learning_rate": 4.379372894532263e-06,
"loss": 1.3433,
"step": 845
},
{
"epoch": 0.019731375744518335,
"grad_norm": 1.4393954277038574,
"learning_rate": 4.384555584348277e-06,
"loss": 1.7277,
"step": 846
},
{
"epoch": 0.019754698883696253,
"grad_norm": 1.0249360799789429,
"learning_rate": 4.389738274164292e-06,
"loss": 1.6538,
"step": 847
},
{
"epoch": 0.019778022022874167,
"grad_norm": 1.128587007522583,
"learning_rate": 4.394920963980306e-06,
"loss": 1.2935,
"step": 848
},
{
"epoch": 0.019801345162052085,
"grad_norm": 1.301287293434143,
"learning_rate": 4.40010365379632e-06,
"loss": 1.4193,
"step": 849
},
{
"epoch": 0.019824668301230003,
"grad_norm": 1.5180747509002686,
"learning_rate": 4.405286343612335e-06,
"loss": 1.2061,
"step": 850
},
{
"epoch": 0.01984799144040792,
"grad_norm": 0.9110321402549744,
"learning_rate": 4.41046903342835e-06,
"loss": 1.2803,
"step": 851
},
{
"epoch": 0.01987131457958584,
"grad_norm": 1.68843674659729,
"learning_rate": 4.415651723244364e-06,
"loss": 1.2037,
"step": 852
},
{
"epoch": 0.019894637718763757,
"grad_norm": 1.2198610305786133,
"learning_rate": 4.4208344130603785e-06,
"loss": 1.6652,
"step": 853
},
{
"epoch": 0.019917960857941674,
"grad_norm": 1.579087257385254,
"learning_rate": 4.426017102876393e-06,
"loss": 1.5859,
"step": 854
},
{
"epoch": 0.019941283997119592,
"grad_norm": 1.7198874950408936,
"learning_rate": 4.431199792692408e-06,
"loss": 1.4662,
"step": 855
},
{
"epoch": 0.01996460713629751,
"grad_norm": 2.817178726196289,
"learning_rate": 4.436382482508422e-06,
"loss": 1.3427,
"step": 856
},
{
"epoch": 0.019987930275475428,
"grad_norm": 1.4508287906646729,
"learning_rate": 4.441565172324437e-06,
"loss": 1.2893,
"step": 857
},
{
"epoch": 0.020011253414653346,
"grad_norm": 1.29767644405365,
"learning_rate": 4.446747862140451e-06,
"loss": 1.5759,
"step": 858
},
{
"epoch": 0.020034576553831264,
"grad_norm": 1.84248685836792,
"learning_rate": 4.451930551956466e-06,
"loss": 2.1373,
"step": 859
},
{
"epoch": 0.020057899693009182,
"grad_norm": 1.6153839826583862,
"learning_rate": 4.457113241772481e-06,
"loss": 1.3915,
"step": 860
},
{
"epoch": 0.0200812228321871,
"grad_norm": 1.3203104734420776,
"learning_rate": 4.462295931588495e-06,
"loss": 1.569,
"step": 861
},
{
"epoch": 0.020104545971365014,
"grad_norm": 1.6475995779037476,
"learning_rate": 4.467478621404509e-06,
"loss": 1.6446,
"step": 862
},
{
"epoch": 0.020127869110542932,
"grad_norm": 1.165834665298462,
"learning_rate": 4.472661311220524e-06,
"loss": 1.7323,
"step": 863
},
{
"epoch": 0.02015119224972085,
"grad_norm": 1.3182172775268555,
"learning_rate": 4.477844001036538e-06,
"loss": 1.6265,
"step": 864
},
{
"epoch": 0.020174515388898768,
"grad_norm": 1.1236745119094849,
"learning_rate": 4.483026690852552e-06,
"loss": 1.2358,
"step": 865
},
{
"epoch": 0.020197838528076686,
"grad_norm": 1.2104893922805786,
"learning_rate": 4.4882093806685676e-06,
"loss": 1.4677,
"step": 866
},
{
"epoch": 0.020221161667254604,
"grad_norm": 1.6824678182601929,
"learning_rate": 4.493392070484582e-06,
"loss": 1.5802,
"step": 867
},
{
"epoch": 0.02024448480643252,
"grad_norm": 1.0679930448532104,
"learning_rate": 4.498574760300596e-06,
"loss": 1.4105,
"step": 868
},
{
"epoch": 0.02026780794561044,
"grad_norm": 1.3705253601074219,
"learning_rate": 4.503757450116611e-06,
"loss": 1.5095,
"step": 869
},
{
"epoch": 0.020291131084788357,
"grad_norm": 1.307491660118103,
"learning_rate": 4.508940139932626e-06,
"loss": 1.3987,
"step": 870
},
{
"epoch": 0.020314454223966275,
"grad_norm": 1.4814496040344238,
"learning_rate": 4.51412282974864e-06,
"loss": 1.635,
"step": 871
},
{
"epoch": 0.020337777363144193,
"grad_norm": 0.935867190361023,
"learning_rate": 4.5193055195646545e-06,
"loss": 1.6734,
"step": 872
},
{
"epoch": 0.02036110050232211,
"grad_norm": 1.3890215158462524,
"learning_rate": 4.524488209380669e-06,
"loss": 1.4458,
"step": 873
},
{
"epoch": 0.02038442364150003,
"grad_norm": 1.628081202507019,
"learning_rate": 4.529670899196684e-06,
"loss": 1.4814,
"step": 874
},
{
"epoch": 0.020407746780677947,
"grad_norm": 1.5255577564239502,
"learning_rate": 4.534853589012698e-06,
"loss": 1.3884,
"step": 875
},
{
"epoch": 0.020431069919855865,
"grad_norm": 2.09283185005188,
"learning_rate": 4.540036278828712e-06,
"loss": 1.7396,
"step": 876
},
{
"epoch": 0.02045439305903378,
"grad_norm": 0.9901561737060547,
"learning_rate": 4.545218968644727e-06,
"loss": 1.4941,
"step": 877
},
{
"epoch": 0.020477716198211697,
"grad_norm": 1.8444923162460327,
"learning_rate": 4.550401658460741e-06,
"loss": 1.2724,
"step": 878
},
{
"epoch": 0.020501039337389615,
"grad_norm": 1.414305567741394,
"learning_rate": 4.555584348276756e-06,
"loss": 1.5781,
"step": 879
},
{
"epoch": 0.020524362476567533,
"grad_norm": 1.1960091590881348,
"learning_rate": 4.56076703809277e-06,
"loss": 1.536,
"step": 880
},
{
"epoch": 0.02054768561574545,
"grad_norm": 2.241649627685547,
"learning_rate": 4.565949727908785e-06,
"loss": 1.6636,
"step": 881
},
{
"epoch": 0.02057100875492337,
"grad_norm": 1.0672343969345093,
"learning_rate": 4.5711324177248e-06,
"loss": 1.6369,
"step": 882
},
{
"epoch": 0.020594331894101287,
"grad_norm": 1.6761622428894043,
"learning_rate": 4.576315107540814e-06,
"loss": 1.2554,
"step": 883
},
{
"epoch": 0.020617655033279204,
"grad_norm": 1.1365658044815063,
"learning_rate": 4.581497797356828e-06,
"loss": 1.6271,
"step": 884
},
{
"epoch": 0.020640978172457122,
"grad_norm": 1.0631389617919922,
"learning_rate": 4.5866804871728435e-06,
"loss": 1.6393,
"step": 885
},
{
"epoch": 0.02066430131163504,
"grad_norm": 3.27304744720459,
"learning_rate": 4.591863176988858e-06,
"loss": 1.3521,
"step": 886
},
{
"epoch": 0.020687624450812958,
"grad_norm": 1.3354477882385254,
"learning_rate": 4.597045866804872e-06,
"loss": 1.5137,
"step": 887
},
{
"epoch": 0.020710947589990876,
"grad_norm": 2.192812919616699,
"learning_rate": 4.602228556620887e-06,
"loss": 1.7294,
"step": 888
},
{
"epoch": 0.020734270729168794,
"grad_norm": 0.9716669321060181,
"learning_rate": 4.607411246436901e-06,
"loss": 1.4244,
"step": 889
},
{
"epoch": 0.020757593868346712,
"grad_norm": 1.0377227067947388,
"learning_rate": 4.612593936252915e-06,
"loss": 1.3041,
"step": 890
},
{
"epoch": 0.020780917007524626,
"grad_norm": 1.971074104309082,
"learning_rate": 4.61777662606893e-06,
"loss": 1.4917,
"step": 891
},
{
"epoch": 0.020804240146702544,
"grad_norm": 1.3108222484588623,
"learning_rate": 4.622959315884945e-06,
"loss": 1.5923,
"step": 892
},
{
"epoch": 0.020827563285880462,
"grad_norm": 1.4194189310073853,
"learning_rate": 4.628142005700959e-06,
"loss": 1.2378,
"step": 893
},
{
"epoch": 0.02085088642505838,
"grad_norm": 1.5872682332992554,
"learning_rate": 4.6333246955169735e-06,
"loss": 1.3573,
"step": 894
},
{
"epoch": 0.020874209564236298,
"grad_norm": 1.351704716682434,
"learning_rate": 4.638507385332988e-06,
"loss": 1.8374,
"step": 895
},
{
"epoch": 0.020897532703414216,
"grad_norm": 1.15986168384552,
"learning_rate": 4.643690075149003e-06,
"loss": 1.4303,
"step": 896
},
{
"epoch": 0.020920855842592134,
"grad_norm": 1.912819743156433,
"learning_rate": 4.648872764965017e-06,
"loss": 1.7733,
"step": 897
},
{
"epoch": 0.02094417898177005,
"grad_norm": 1.6582539081573486,
"learning_rate": 4.654055454781032e-06,
"loss": 1.4696,
"step": 898
},
{
"epoch": 0.02096750212094797,
"grad_norm": 1.147661805152893,
"learning_rate": 4.659238144597046e-06,
"loss": 1.5037,
"step": 899
},
{
"epoch": 0.020990825260125887,
"grad_norm": 1.1773402690887451,
"learning_rate": 4.664420834413061e-06,
"loss": 1.604,
"step": 900
},
{
"epoch": 0.021014148399303805,
"grad_norm": 1.9128248691558838,
"learning_rate": 4.669603524229076e-06,
"loss": 1.3081,
"step": 901
},
{
"epoch": 0.021037471538481723,
"grad_norm": 1.0742683410644531,
"learning_rate": 4.67478621404509e-06,
"loss": 1.5619,
"step": 902
},
{
"epoch": 0.02106079467765964,
"grad_norm": 1.19862699508667,
"learning_rate": 4.679968903861104e-06,
"loss": 1.6896,
"step": 903
},
{
"epoch": 0.02108411781683756,
"grad_norm": 1.276283860206604,
"learning_rate": 4.685151593677119e-06,
"loss": 1.65,
"step": 904
},
{
"epoch": 0.021107440956015473,
"grad_norm": 1.3582435846328735,
"learning_rate": 4.690334283493133e-06,
"loss": 1.2686,
"step": 905
},
{
"epoch": 0.02113076409519339,
"grad_norm": 1.2145341634750366,
"learning_rate": 4.695516973309147e-06,
"loss": 1.8032,
"step": 906
},
{
"epoch": 0.02115408723437131,
"grad_norm": 1.1219233274459839,
"learning_rate": 4.7006996631251626e-06,
"loss": 1.7681,
"step": 907
},
{
"epoch": 0.021177410373549227,
"grad_norm": 1.0474015474319458,
"learning_rate": 4.705882352941177e-06,
"loss": 1.4555,
"step": 908
},
{
"epoch": 0.021200733512727145,
"grad_norm": 1.6325182914733887,
"learning_rate": 4.711065042757191e-06,
"loss": 1.432,
"step": 909
},
{
"epoch": 0.021224056651905063,
"grad_norm": 1.5804178714752197,
"learning_rate": 4.716247732573206e-06,
"loss": 1.7409,
"step": 910
},
{
"epoch": 0.02124737979108298,
"grad_norm": 1.226804256439209,
"learning_rate": 4.721430422389221e-06,
"loss": 1.8077,
"step": 911
},
{
"epoch": 0.0212707029302609,
"grad_norm": 1.0747625827789307,
"learning_rate": 4.726613112205235e-06,
"loss": 1.411,
"step": 912
},
{
"epoch": 0.021294026069438816,
"grad_norm": 1.2126623392105103,
"learning_rate": 4.7317958020212495e-06,
"loss": 1.6464,
"step": 913
},
{
"epoch": 0.021317349208616734,
"grad_norm": 1.196486473083496,
"learning_rate": 4.736978491837264e-06,
"loss": 1.4365,
"step": 914
},
{
"epoch": 0.021340672347794652,
"grad_norm": 1.4727115631103516,
"learning_rate": 4.742161181653279e-06,
"loss": 1.5059,
"step": 915
},
{
"epoch": 0.02136399548697257,
"grad_norm": 1.293938159942627,
"learning_rate": 4.747343871469293e-06,
"loss": 1.5508,
"step": 916
},
{
"epoch": 0.021387318626150488,
"grad_norm": 1.3074458837509155,
"learning_rate": 4.752526561285307e-06,
"loss": 1.364,
"step": 917
},
{
"epoch": 0.021410641765328406,
"grad_norm": 1.708522081375122,
"learning_rate": 4.757709251101322e-06,
"loss": 1.2891,
"step": 918
},
{
"epoch": 0.02143396490450632,
"grad_norm": 1.2926160097122192,
"learning_rate": 4.762891940917336e-06,
"loss": 1.1779,
"step": 919
},
{
"epoch": 0.021457288043684238,
"grad_norm": 1.7751168012619019,
"learning_rate": 4.768074630733351e-06,
"loss": 1.3136,
"step": 920
},
{
"epoch": 0.021480611182862156,
"grad_norm": 1.3698194026947021,
"learning_rate": 4.773257320549365e-06,
"loss": 1.5203,
"step": 921
},
{
"epoch": 0.021503934322040074,
"grad_norm": 1.4710402488708496,
"learning_rate": 4.77844001036538e-06,
"loss": 2.0632,
"step": 922
},
{
"epoch": 0.021527257461217992,
"grad_norm": 1.3340466022491455,
"learning_rate": 4.783622700181395e-06,
"loss": 0.9449,
"step": 923
},
{
"epoch": 0.02155058060039591,
"grad_norm": 1.990078330039978,
"learning_rate": 4.788805389997409e-06,
"loss": 1.4095,
"step": 924
},
{
"epoch": 0.021573903739573828,
"grad_norm": 2.6495463848114014,
"learning_rate": 4.793988079813423e-06,
"loss": 1.5914,
"step": 925
},
{
"epoch": 0.021597226878751746,
"grad_norm": 1.368868350982666,
"learning_rate": 4.7991707696294385e-06,
"loss": 1.8007,
"step": 926
},
{
"epoch": 0.021620550017929663,
"grad_norm": 1.3946820497512817,
"learning_rate": 4.804353459445453e-06,
"loss": 1.3846,
"step": 927
},
{
"epoch": 0.02164387315710758,
"grad_norm": 1.6035547256469727,
"learning_rate": 4.809536149261467e-06,
"loss": 1.6677,
"step": 928
},
{
"epoch": 0.0216671962962855,
"grad_norm": 1.29734468460083,
"learning_rate": 4.814718839077482e-06,
"loss": 1.3697,
"step": 929
},
{
"epoch": 0.021690519435463417,
"grad_norm": 1.1746439933776855,
"learning_rate": 4.819901528893497e-06,
"loss": 1.6134,
"step": 930
},
{
"epoch": 0.021713842574641335,
"grad_norm": 1.255861759185791,
"learning_rate": 4.82508421870951e-06,
"loss": 1.6253,
"step": 931
},
{
"epoch": 0.021737165713819253,
"grad_norm": 1.5499615669250488,
"learning_rate": 4.830266908525525e-06,
"loss": 1.2794,
"step": 932
},
{
"epoch": 0.02176048885299717,
"grad_norm": 1.6138273477554321,
"learning_rate": 4.83544959834154e-06,
"loss": 1.6365,
"step": 933
},
{
"epoch": 0.021783811992175085,
"grad_norm": 1.7135401964187622,
"learning_rate": 4.840632288157554e-06,
"loss": 1.509,
"step": 934
},
{
"epoch": 0.021807135131353003,
"grad_norm": 1.4290528297424316,
"learning_rate": 4.8458149779735685e-06,
"loss": 1.3415,
"step": 935
},
{
"epoch": 0.02183045827053092,
"grad_norm": 2.034870147705078,
"learning_rate": 4.850997667789583e-06,
"loss": 1.6834,
"step": 936
},
{
"epoch": 0.02185378140970884,
"grad_norm": 1.6626250743865967,
"learning_rate": 4.856180357605598e-06,
"loss": 1.3573,
"step": 937
},
{
"epoch": 0.021877104548886757,
"grad_norm": 1.2256288528442383,
"learning_rate": 4.861363047421612e-06,
"loss": 1.5497,
"step": 938
},
{
"epoch": 0.021900427688064675,
"grad_norm": 1.218955397605896,
"learning_rate": 4.866545737237627e-06,
"loss": 1.6823,
"step": 939
},
{
"epoch": 0.021923750827242593,
"grad_norm": 1.0629289150238037,
"learning_rate": 4.871728427053641e-06,
"loss": 1.3894,
"step": 940
},
{
"epoch": 0.02194707396642051,
"grad_norm": 2.6169822216033936,
"learning_rate": 4.876911116869656e-06,
"loss": 1.4063,
"step": 941
},
{
"epoch": 0.02197039710559843,
"grad_norm": 1.1517153978347778,
"learning_rate": 4.882093806685671e-06,
"loss": 1.3838,
"step": 942
},
{
"epoch": 0.021993720244776346,
"grad_norm": 1.6320403814315796,
"learning_rate": 4.887276496501685e-06,
"loss": 1.5752,
"step": 943
},
{
"epoch": 0.022017043383954264,
"grad_norm": 1.7344862222671509,
"learning_rate": 4.892459186317699e-06,
"loss": 1.3182,
"step": 944
},
{
"epoch": 0.022040366523132182,
"grad_norm": 1.2497214078903198,
"learning_rate": 4.897641876133714e-06,
"loss": 1.2266,
"step": 945
},
{
"epoch": 0.0220636896623101,
"grad_norm": 1.996893048286438,
"learning_rate": 4.902824565949728e-06,
"loss": 1.2708,
"step": 946
},
{
"epoch": 0.022087012801488018,
"grad_norm": 1.1130571365356445,
"learning_rate": 4.908007255765742e-06,
"loss": 1.4791,
"step": 947
},
{
"epoch": 0.022110335940665932,
"grad_norm": 1.2698702812194824,
"learning_rate": 4.9131899455817576e-06,
"loss": 1.3711,
"step": 948
},
{
"epoch": 0.02213365907984385,
"grad_norm": 1.0363445281982422,
"learning_rate": 4.918372635397772e-06,
"loss": 1.4153,
"step": 949
},
{
"epoch": 0.022156982219021768,
"grad_norm": 1.1418310403823853,
"learning_rate": 4.923555325213786e-06,
"loss": 1.3377,
"step": 950
},
{
"epoch": 0.022180305358199686,
"grad_norm": 1.3740698099136353,
"learning_rate": 4.928738015029801e-06,
"loss": 1.375,
"step": 951
},
{
"epoch": 0.022203628497377604,
"grad_norm": 1.5656532049179077,
"learning_rate": 4.933920704845816e-06,
"loss": 1.651,
"step": 952
},
{
"epoch": 0.022226951636555522,
"grad_norm": 1.209380865097046,
"learning_rate": 4.93910339466183e-06,
"loss": 1.6956,
"step": 953
},
{
"epoch": 0.02225027477573344,
"grad_norm": 1.9917747974395752,
"learning_rate": 4.9442860844778445e-06,
"loss": 1.2802,
"step": 954
},
{
"epoch": 0.022273597914911358,
"grad_norm": 2.168260097503662,
"learning_rate": 4.949468774293859e-06,
"loss": 1.9773,
"step": 955
},
{
"epoch": 0.022296921054089276,
"grad_norm": 1.113978624343872,
"learning_rate": 4.954651464109874e-06,
"loss": 1.8121,
"step": 956
},
{
"epoch": 0.022320244193267193,
"grad_norm": 1.4833635091781616,
"learning_rate": 4.959834153925888e-06,
"loss": 1.694,
"step": 957
},
{
"epoch": 0.02234356733244511,
"grad_norm": 1.3287935256958008,
"learning_rate": 4.965016843741902e-06,
"loss": 1.4865,
"step": 958
},
{
"epoch": 0.02236689047162303,
"grad_norm": 1.5515238046646118,
"learning_rate": 4.970199533557917e-06,
"loss": 1.6035,
"step": 959
},
{
"epoch": 0.022390213610800947,
"grad_norm": 1.2824245691299438,
"learning_rate": 4.975382223373931e-06,
"loss": 1.5124,
"step": 960
},
{
"epoch": 0.022413536749978865,
"grad_norm": 1.2062418460845947,
"learning_rate": 4.980564913189946e-06,
"loss": 1.5982,
"step": 961
},
{
"epoch": 0.02243685988915678,
"grad_norm": 1.2790741920471191,
"learning_rate": 4.98574760300596e-06,
"loss": 1.586,
"step": 962
},
{
"epoch": 0.022460183028334697,
"grad_norm": 1.202909231185913,
"learning_rate": 4.990930292821975e-06,
"loss": 1.7387,
"step": 963
},
{
"epoch": 0.022483506167512615,
"grad_norm": 1.328963041305542,
"learning_rate": 4.99611298263799e-06,
"loss": 1.5611,
"step": 964
},
{
"epoch": 0.022506829306690533,
"grad_norm": 1.3728841543197632,
"learning_rate": 5.001295672454004e-06,
"loss": 1.6887,
"step": 965
},
{
"epoch": 0.02253015244586845,
"grad_norm": 1.2474596500396729,
"learning_rate": 5.006478362270018e-06,
"loss": 1.7337,
"step": 966
},
{
"epoch": 0.02255347558504637,
"grad_norm": 1.4526808261871338,
"learning_rate": 5.0116610520860335e-06,
"loss": 1.4009,
"step": 967
},
{
"epoch": 0.022576798724224287,
"grad_norm": 1.74959397315979,
"learning_rate": 5.016843741902048e-06,
"loss": 1.4153,
"step": 968
},
{
"epoch": 0.022600121863402205,
"grad_norm": 1.7886738777160645,
"learning_rate": 5.022026431718062e-06,
"loss": 1.3897,
"step": 969
},
{
"epoch": 0.022623445002580123,
"grad_norm": 1.3122284412384033,
"learning_rate": 5.027209121534077e-06,
"loss": 1.6551,
"step": 970
},
{
"epoch": 0.02264676814175804,
"grad_norm": 1.5374927520751953,
"learning_rate": 5.032391811350092e-06,
"loss": 1.6396,
"step": 971
},
{
"epoch": 0.02267009128093596,
"grad_norm": 1.6476905345916748,
"learning_rate": 5.037574501166106e-06,
"loss": 1.733,
"step": 972
},
{
"epoch": 0.022693414420113876,
"grad_norm": 1.3407307863235474,
"learning_rate": 5.0427571909821205e-06,
"loss": 1.4984,
"step": 973
},
{
"epoch": 0.022716737559291794,
"grad_norm": 1.5565712451934814,
"learning_rate": 5.047939880798135e-06,
"loss": 1.6524,
"step": 974
},
{
"epoch": 0.022740060698469712,
"grad_norm": 1.381903052330017,
"learning_rate": 5.053122570614149e-06,
"loss": 1.5325,
"step": 975
},
{
"epoch": 0.022763383837647626,
"grad_norm": 1.916326880455017,
"learning_rate": 5.058305260430164e-06,
"loss": 1.2326,
"step": 976
},
{
"epoch": 0.022786706976825544,
"grad_norm": 1.1621575355529785,
"learning_rate": 5.063487950246179e-06,
"loss": 1.2568,
"step": 977
},
{
"epoch": 0.022810030116003462,
"grad_norm": 1.3575561046600342,
"learning_rate": 5.068670640062193e-06,
"loss": 1.3755,
"step": 978
},
{
"epoch": 0.02283335325518138,
"grad_norm": 1.482701063156128,
"learning_rate": 5.0738533298782065e-06,
"loss": 1.598,
"step": 979
},
{
"epoch": 0.022856676394359298,
"grad_norm": 1.2530887126922607,
"learning_rate": 5.079036019694221e-06,
"loss": 1.66,
"step": 980
},
{
"epoch": 0.022879999533537216,
"grad_norm": 1.4960439205169678,
"learning_rate": 5.084218709510236e-06,
"loss": 1.5341,
"step": 981
},
{
"epoch": 0.022903322672715134,
"grad_norm": 1.507735252380371,
"learning_rate": 5.0894013993262504e-06,
"loss": 1.3987,
"step": 982
},
{
"epoch": 0.022926645811893052,
"grad_norm": 2.0131475925445557,
"learning_rate": 5.094584089142265e-06,
"loss": 1.3134,
"step": 983
},
{
"epoch": 0.02294996895107097,
"grad_norm": 1.8096015453338623,
"learning_rate": 5.099766778958279e-06,
"loss": 1.3707,
"step": 984
},
{
"epoch": 0.022973292090248888,
"grad_norm": 1.0444198846817017,
"learning_rate": 5.104949468774294e-06,
"loss": 1.4119,
"step": 985
},
{
"epoch": 0.022996615229426805,
"grad_norm": 1.3110159635543823,
"learning_rate": 5.110132158590309e-06,
"loss": 1.2187,
"step": 986
},
{
"epoch": 0.023019938368604723,
"grad_norm": 1.3191614151000977,
"learning_rate": 5.115314848406323e-06,
"loss": 1.3691,
"step": 987
},
{
"epoch": 0.02304326150778264,
"grad_norm": 1.3888386487960815,
"learning_rate": 5.120497538222337e-06,
"loss": 1.1934,
"step": 988
},
{
"epoch": 0.02306658464696056,
"grad_norm": 1.2101585865020752,
"learning_rate": 5.1256802280383526e-06,
"loss": 1.4962,
"step": 989
},
{
"epoch": 0.023089907786138477,
"grad_norm": 1.2938464879989624,
"learning_rate": 5.130862917854367e-06,
"loss": 1.4601,
"step": 990
},
{
"epoch": 0.02311323092531639,
"grad_norm": 2.072444200515747,
"learning_rate": 5.136045607670381e-06,
"loss": 1.7241,
"step": 991
},
{
"epoch": 0.02313655406449431,
"grad_norm": 1.7139407396316528,
"learning_rate": 5.141228297486396e-06,
"loss": 1.394,
"step": 992
},
{
"epoch": 0.023159877203672227,
"grad_norm": 1.5825177431106567,
"learning_rate": 5.146410987302411e-06,
"loss": 1.4218,
"step": 993
},
{
"epoch": 0.023183200342850145,
"grad_norm": 1.2233787775039673,
"learning_rate": 5.151593677118425e-06,
"loss": 1.2882,
"step": 994
},
{
"epoch": 0.023206523482028063,
"grad_norm": 1.6474647521972656,
"learning_rate": 5.1567763669344395e-06,
"loss": 1.6499,
"step": 995
},
{
"epoch": 0.02322984662120598,
"grad_norm": 1.669651985168457,
"learning_rate": 5.161959056750454e-06,
"loss": 1.1727,
"step": 996
},
{
"epoch": 0.0232531697603839,
"grad_norm": 1.4976879358291626,
"learning_rate": 5.167141746566469e-06,
"loss": 1.2149,
"step": 997
},
{
"epoch": 0.023276492899561817,
"grad_norm": 1.4033470153808594,
"learning_rate": 5.172324436382483e-06,
"loss": 1.3004,
"step": 998
},
{
"epoch": 0.023299816038739735,
"grad_norm": 1.3042150735855103,
"learning_rate": 5.177507126198498e-06,
"loss": 1.3803,
"step": 999
},
{
"epoch": 0.023323139177917653,
"grad_norm": 1.4327346086502075,
"learning_rate": 5.182689816014512e-06,
"loss": 1.7267,
"step": 1000
},
{
"epoch": 0.02334646231709557,
"grad_norm": 1.4823616743087769,
"learning_rate": 5.187872505830526e-06,
"loss": 1.6386,
"step": 1001
},
{
"epoch": 0.02336978545627349,
"grad_norm": 1.7083938121795654,
"learning_rate": 5.193055195646542e-06,
"loss": 1.3112,
"step": 1002
},
{
"epoch": 0.023393108595451406,
"grad_norm": 1.51584792137146,
"learning_rate": 5.198237885462556e-06,
"loss": 1.6169,
"step": 1003
},
{
"epoch": 0.023416431734629324,
"grad_norm": 1.0864455699920654,
"learning_rate": 5.20342057527857e-06,
"loss": 1.3013,
"step": 1004
},
{
"epoch": 0.02343975487380724,
"grad_norm": 1.9760619401931763,
"learning_rate": 5.208603265094585e-06,
"loss": 1.7865,
"step": 1005
},
{
"epoch": 0.023463078012985156,
"grad_norm": 2.5747292041778564,
"learning_rate": 5.2137859549106e-06,
"loss": 1.3345,
"step": 1006
},
{
"epoch": 0.023486401152163074,
"grad_norm": 1.689779281616211,
"learning_rate": 5.218968644726613e-06,
"loss": 1.7856,
"step": 1007
},
{
"epoch": 0.023509724291340992,
"grad_norm": 1.9847980737686157,
"learning_rate": 5.224151334542628e-06,
"loss": 1.8401,
"step": 1008
},
{
"epoch": 0.02353304743051891,
"grad_norm": 1.3654876947402954,
"learning_rate": 5.229334024358642e-06,
"loss": 1.7705,
"step": 1009
},
{
"epoch": 0.023556370569696828,
"grad_norm": 1.7249932289123535,
"learning_rate": 5.234516714174656e-06,
"loss": 1.1657,
"step": 1010
},
{
"epoch": 0.023579693708874746,
"grad_norm": 1.0710606575012207,
"learning_rate": 5.2396994039906716e-06,
"loss": 1.1676,
"step": 1011
},
{
"epoch": 0.023603016848052664,
"grad_norm": 1.213040828704834,
"learning_rate": 5.244882093806686e-06,
"loss": 1.4183,
"step": 1012
},
{
"epoch": 0.02362633998723058,
"grad_norm": 1.6341387033462524,
"learning_rate": 5.2500647836227e-06,
"loss": 1.6092,
"step": 1013
},
{
"epoch": 0.0236496631264085,
"grad_norm": 1.6445837020874023,
"learning_rate": 5.255247473438715e-06,
"loss": 1.6693,
"step": 1014
},
{
"epoch": 0.023672986265586417,
"grad_norm": 1.2804230451583862,
"learning_rate": 5.26043016325473e-06,
"loss": 1.5687,
"step": 1015
},
{
"epoch": 0.023696309404764335,
"grad_norm": 1.8683735132217407,
"learning_rate": 5.265612853070744e-06,
"loss": 1.3944,
"step": 1016
},
{
"epoch": 0.023719632543942253,
"grad_norm": 1.6504722833633423,
"learning_rate": 5.2707955428867585e-06,
"loss": 1.3018,
"step": 1017
},
{
"epoch": 0.02374295568312017,
"grad_norm": 1.71793532371521,
"learning_rate": 5.275978232702773e-06,
"loss": 1.4581,
"step": 1018
},
{
"epoch": 0.023766278822298086,
"grad_norm": 1.1414326429367065,
"learning_rate": 5.281160922518788e-06,
"loss": 1.4924,
"step": 1019
},
{
"epoch": 0.023789601961476003,
"grad_norm": 1.6553568840026855,
"learning_rate": 5.286343612334802e-06,
"loss": 1.6926,
"step": 1020
},
{
"epoch": 0.02381292510065392,
"grad_norm": 1.4217321872711182,
"learning_rate": 5.291526302150817e-06,
"loss": 1.4806,
"step": 1021
},
{
"epoch": 0.02383624823983184,
"grad_norm": 1.4322501420974731,
"learning_rate": 5.296708991966831e-06,
"loss": 1.5978,
"step": 1022
},
{
"epoch": 0.023859571379009757,
"grad_norm": 1.9824562072753906,
"learning_rate": 5.3018916817828454e-06,
"loss": 1.493,
"step": 1023
},
{
"epoch": 0.023882894518187675,
"grad_norm": 1.3815537691116333,
"learning_rate": 5.307074371598861e-06,
"loss": 1.3702,
"step": 1024
},
{
"epoch": 0.023906217657365593,
"grad_norm": 1.101647138595581,
"learning_rate": 5.312257061414875e-06,
"loss": 1.1745,
"step": 1025
},
{
"epoch": 0.02392954079654351,
"grad_norm": 1.2983593940734863,
"learning_rate": 5.317439751230889e-06,
"loss": 1.7473,
"step": 1026
},
{
"epoch": 0.02395286393572143,
"grad_norm": 1.2676076889038086,
"learning_rate": 5.322622441046904e-06,
"loss": 1.6349,
"step": 1027
},
{
"epoch": 0.023976187074899347,
"grad_norm": 1.2923870086669922,
"learning_rate": 5.327805130862919e-06,
"loss": 1.619,
"step": 1028
},
{
"epoch": 0.023999510214077265,
"grad_norm": 1.4195587635040283,
"learning_rate": 5.332987820678933e-06,
"loss": 1.4933,
"step": 1029
},
{
"epoch": 0.024022833353255182,
"grad_norm": 1.3498200178146362,
"learning_rate": 5.3381705104949476e-06,
"loss": 1.489,
"step": 1030
},
{
"epoch": 0.0240461564924331,
"grad_norm": 1.473960280418396,
"learning_rate": 5.343353200310962e-06,
"loss": 1.5181,
"step": 1031
},
{
"epoch": 0.024069479631611018,
"grad_norm": 1.2730071544647217,
"learning_rate": 5.348535890126977e-06,
"loss": 1.5796,
"step": 1032
},
{
"epoch": 0.024092802770788933,
"grad_norm": 1.2243895530700684,
"learning_rate": 5.3537185799429914e-06,
"loss": 1.4051,
"step": 1033
},
{
"epoch": 0.02411612590996685,
"grad_norm": 2.1219441890716553,
"learning_rate": 5.358901269759005e-06,
"loss": 1.4317,
"step": 1034
},
{
"epoch": 0.02413944904914477,
"grad_norm": 1.0719225406646729,
"learning_rate": 5.364083959575019e-06,
"loss": 1.3937,
"step": 1035
},
{
"epoch": 0.024162772188322686,
"grad_norm": 1.6711935997009277,
"learning_rate": 5.369266649391034e-06,
"loss": 1.5832,
"step": 1036
},
{
"epoch": 0.024186095327500604,
"grad_norm": 1.33745276927948,
"learning_rate": 5.374449339207049e-06,
"loss": 1.4582,
"step": 1037
},
{
"epoch": 0.024209418466678522,
"grad_norm": 1.4278967380523682,
"learning_rate": 5.379632029023063e-06,
"loss": 1.6069,
"step": 1038
},
{
"epoch": 0.02423274160585644,
"grad_norm": 1.2003988027572632,
"learning_rate": 5.3848147188390775e-06,
"loss": 1.4942,
"step": 1039
},
{
"epoch": 0.024256064745034358,
"grad_norm": 1.7350938320159912,
"learning_rate": 5.389997408655092e-06,
"loss": 1.637,
"step": 1040
},
{
"epoch": 0.024279387884212276,
"grad_norm": 1.6094862222671509,
"learning_rate": 5.395180098471107e-06,
"loss": 1.6944,
"step": 1041
},
{
"epoch": 0.024302711023390194,
"grad_norm": 1.369091510772705,
"learning_rate": 5.400362788287121e-06,
"loss": 1.6905,
"step": 1042
},
{
"epoch": 0.02432603416256811,
"grad_norm": 1.275787353515625,
"learning_rate": 5.405545478103136e-06,
"loss": 1.6749,
"step": 1043
},
{
"epoch": 0.02434935730174603,
"grad_norm": 1.24448823928833,
"learning_rate": 5.41072816791915e-06,
"loss": 1.4275,
"step": 1044
},
{
"epoch": 0.024372680440923947,
"grad_norm": 1.7868009805679321,
"learning_rate": 5.415910857735165e-06,
"loss": 1.5942,
"step": 1045
},
{
"epoch": 0.024396003580101865,
"grad_norm": 1.5386407375335693,
"learning_rate": 5.42109354755118e-06,
"loss": 1.6505,
"step": 1046
},
{
"epoch": 0.024419326719279783,
"grad_norm": 1.9666537046432495,
"learning_rate": 5.426276237367194e-06,
"loss": 1.7035,
"step": 1047
},
{
"epoch": 0.024442649858457698,
"grad_norm": 1.7937966585159302,
"learning_rate": 5.431458927183208e-06,
"loss": 1.7956,
"step": 1048
},
{
"epoch": 0.024465972997635616,
"grad_norm": 1.1397721767425537,
"learning_rate": 5.436641616999223e-06,
"loss": 1.3459,
"step": 1049
},
{
"epoch": 0.024489296136813533,
"grad_norm": 1.28958261013031,
"learning_rate": 5.441824306815238e-06,
"loss": 1.0963,
"step": 1050
},
{
"epoch": 0.02451261927599145,
"grad_norm": 1.3734923601150513,
"learning_rate": 5.447006996631252e-06,
"loss": 1.3196,
"step": 1051
},
{
"epoch": 0.02453594241516937,
"grad_norm": 1.8763736486434937,
"learning_rate": 5.4521896864472666e-06,
"loss": 1.7322,
"step": 1052
},
{
"epoch": 0.024559265554347287,
"grad_norm": 1.5179871320724487,
"learning_rate": 5.457372376263281e-06,
"loss": 1.2844,
"step": 1053
},
{
"epoch": 0.024582588693525205,
"grad_norm": 1.4944384098052979,
"learning_rate": 5.462555066079296e-06,
"loss": 1.442,
"step": 1054
},
{
"epoch": 0.024605911832703123,
"grad_norm": 1.499028205871582,
"learning_rate": 5.4677377558953105e-06,
"loss": 1.394,
"step": 1055
},
{
"epoch": 0.02462923497188104,
"grad_norm": 1.1869397163391113,
"learning_rate": 5.472920445711325e-06,
"loss": 1.2928,
"step": 1056
},
{
"epoch": 0.02465255811105896,
"grad_norm": 1.3456541299819946,
"learning_rate": 5.478103135527339e-06,
"loss": 1.5983,
"step": 1057
},
{
"epoch": 0.024675881250236877,
"grad_norm": 1.5931065082550049,
"learning_rate": 5.483285825343354e-06,
"loss": 1.4794,
"step": 1058
},
{
"epoch": 0.024699204389414794,
"grad_norm": 1.4096170663833618,
"learning_rate": 5.488468515159369e-06,
"loss": 1.471,
"step": 1059
},
{
"epoch": 0.024722527528592712,
"grad_norm": 1.5033949613571167,
"learning_rate": 5.493651204975383e-06,
"loss": 1.2857,
"step": 1060
},
{
"epoch": 0.02474585066777063,
"grad_norm": 1.632089614868164,
"learning_rate": 5.498833894791397e-06,
"loss": 1.5157,
"step": 1061
},
{
"epoch": 0.024769173806948545,
"grad_norm": 1.563462495803833,
"learning_rate": 5.504016584607411e-06,
"loss": 1.5072,
"step": 1062
},
{
"epoch": 0.024792496946126463,
"grad_norm": 1.4055378437042236,
"learning_rate": 5.509199274423426e-06,
"loss": 1.1545,
"step": 1063
},
{
"epoch": 0.02481582008530438,
"grad_norm": 1.3467985391616821,
"learning_rate": 5.5143819642394404e-06,
"loss": 1.4615,
"step": 1064
},
{
"epoch": 0.0248391432244823,
"grad_norm": 1.6450691223144531,
"learning_rate": 5.519564654055455e-06,
"loss": 1.8051,
"step": 1065
},
{
"epoch": 0.024862466363660216,
"grad_norm": 1.247313141822815,
"learning_rate": 5.524747343871469e-06,
"loss": 1.5971,
"step": 1066
},
{
"epoch": 0.024885789502838134,
"grad_norm": 1.7429383993148804,
"learning_rate": 5.529930033687484e-06,
"loss": 1.5401,
"step": 1067
},
{
"epoch": 0.024909112642016052,
"grad_norm": 1.7351207733154297,
"learning_rate": 5.535112723503499e-06,
"loss": 1.4898,
"step": 1068
},
{
"epoch": 0.02493243578119397,
"grad_norm": 1.5003080368041992,
"learning_rate": 5.540295413319513e-06,
"loss": 1.773,
"step": 1069
},
{
"epoch": 0.024955758920371888,
"grad_norm": 1.370918869972229,
"learning_rate": 5.545478103135527e-06,
"loss": 1.6648,
"step": 1070
},
{
"epoch": 0.024979082059549806,
"grad_norm": 1.125687837600708,
"learning_rate": 5.5506607929515426e-06,
"loss": 1.5297,
"step": 1071
},
{
"epoch": 0.025002405198727724,
"grad_norm": 1.984605073928833,
"learning_rate": 5.555843482767557e-06,
"loss": 1.4637,
"step": 1072
},
{
"epoch": 0.02502572833790564,
"grad_norm": 1.6429048776626587,
"learning_rate": 5.561026172583571e-06,
"loss": 1.2794,
"step": 1073
},
{
"epoch": 0.02504905147708356,
"grad_norm": 1.8730500936508179,
"learning_rate": 5.566208862399586e-06,
"loss": 1.4462,
"step": 1074
},
{
"epoch": 0.025072374616261477,
"grad_norm": 1.536036729812622,
"learning_rate": 5.5713915522156e-06,
"loss": 1.2484,
"step": 1075
},
{
"epoch": 0.025095697755439392,
"grad_norm": 1.2056294679641724,
"learning_rate": 5.576574242031615e-06,
"loss": 1.7819,
"step": 1076
},
{
"epoch": 0.02511902089461731,
"grad_norm": 1.4317046403884888,
"learning_rate": 5.5817569318476295e-06,
"loss": 1.5005,
"step": 1077
},
{
"epoch": 0.025142344033795228,
"grad_norm": 1.5313549041748047,
"learning_rate": 5.586939621663644e-06,
"loss": 1.6916,
"step": 1078
},
{
"epoch": 0.025165667172973145,
"grad_norm": 1.2438437938690186,
"learning_rate": 5.592122311479658e-06,
"loss": 1.4453,
"step": 1079
},
{
"epoch": 0.025188990312151063,
"grad_norm": 1.665187954902649,
"learning_rate": 5.597305001295673e-06,
"loss": 1.1324,
"step": 1080
},
{
"epoch": 0.02521231345132898,
"grad_norm": 1.910433053970337,
"learning_rate": 5.602487691111688e-06,
"loss": 2.003,
"step": 1081
},
{
"epoch": 0.0252356365905069,
"grad_norm": 1.6894274950027466,
"learning_rate": 5.607670380927702e-06,
"loss": 1.5041,
"step": 1082
},
{
"epoch": 0.025258959729684817,
"grad_norm": 1.246095061302185,
"learning_rate": 5.612853070743716e-06,
"loss": 1.7421,
"step": 1083
},
{
"epoch": 0.025282282868862735,
"grad_norm": 1.7268954515457153,
"learning_rate": 5.618035760559732e-06,
"loss": 1.4601,
"step": 1084
},
{
"epoch": 0.025305606008040653,
"grad_norm": 1.2897146940231323,
"learning_rate": 5.623218450375746e-06,
"loss": 1.4538,
"step": 1085
},
{
"epoch": 0.02532892914721857,
"grad_norm": 1.329236388206482,
"learning_rate": 5.62840114019176e-06,
"loss": 1.6763,
"step": 1086
},
{
"epoch": 0.02535225228639649,
"grad_norm": 1.4001597166061401,
"learning_rate": 5.633583830007775e-06,
"loss": 1.4887,
"step": 1087
},
{
"epoch": 0.025375575425574406,
"grad_norm": 2.036400079727173,
"learning_rate": 5.63876651982379e-06,
"loss": 1.4996,
"step": 1088
},
{
"epoch": 0.025398898564752324,
"grad_norm": 1.4963785409927368,
"learning_rate": 5.643949209639803e-06,
"loss": 1.6515,
"step": 1089
},
{
"epoch": 0.02542222170393024,
"grad_norm": 1.4221199750900269,
"learning_rate": 5.649131899455818e-06,
"loss": 1.814,
"step": 1090
},
{
"epoch": 0.025445544843108157,
"grad_norm": 1.7034932374954224,
"learning_rate": 5.654314589271832e-06,
"loss": 1.478,
"step": 1091
},
{
"epoch": 0.025468867982286075,
"grad_norm": 1.5419113636016846,
"learning_rate": 5.659497279087846e-06,
"loss": 1.8225,
"step": 1092
},
{
"epoch": 0.025492191121463992,
"grad_norm": 1.8337044715881348,
"learning_rate": 5.6646799689038616e-06,
"loss": 1.5037,
"step": 1093
},
{
"epoch": 0.02551551426064191,
"grad_norm": 1.3712172508239746,
"learning_rate": 5.669862658719876e-06,
"loss": 1.4449,
"step": 1094
},
{
"epoch": 0.02553883739981983,
"grad_norm": 1.312258005142212,
"learning_rate": 5.67504534853589e-06,
"loss": 1.5159,
"step": 1095
},
{
"epoch": 0.025562160538997746,
"grad_norm": 1.5284754037857056,
"learning_rate": 5.680228038351905e-06,
"loss": 1.4479,
"step": 1096
},
{
"epoch": 0.025585483678175664,
"grad_norm": 1.1178314685821533,
"learning_rate": 5.68541072816792e-06,
"loss": 1.4729,
"step": 1097
},
{
"epoch": 0.025608806817353582,
"grad_norm": 1.2439149618148804,
"learning_rate": 5.690593417983934e-06,
"loss": 1.436,
"step": 1098
},
{
"epoch": 0.0256321299565315,
"grad_norm": 1.580632209777832,
"learning_rate": 5.6957761077999485e-06,
"loss": 1.2718,
"step": 1099
},
{
"epoch": 0.025655453095709418,
"grad_norm": 1.6244875192642212,
"learning_rate": 5.700958797615963e-06,
"loss": 1.6024,
"step": 1100
},
{
"epoch": 0.025678776234887336,
"grad_norm": 1.2542647123336792,
"learning_rate": 5.706141487431977e-06,
"loss": 1.4344,
"step": 1101
},
{
"epoch": 0.025702099374065254,
"grad_norm": 1.227737307548523,
"learning_rate": 5.711324177247992e-06,
"loss": 1.2912,
"step": 1102
},
{
"epoch": 0.02572542251324317,
"grad_norm": 1.705132007598877,
"learning_rate": 5.716506867064007e-06,
"loss": 1.7786,
"step": 1103
},
{
"epoch": 0.02574874565242109,
"grad_norm": 1.4411309957504272,
"learning_rate": 5.721689556880021e-06,
"loss": 1.6456,
"step": 1104
},
{
"epoch": 0.025772068791599004,
"grad_norm": 1.5248507261276245,
"learning_rate": 5.7268722466960354e-06,
"loss": 1.308,
"step": 1105
},
{
"epoch": 0.02579539193077692,
"grad_norm": 1.3953535556793213,
"learning_rate": 5.732054936512051e-06,
"loss": 1.7294,
"step": 1106
},
{
"epoch": 0.02581871506995484,
"grad_norm": 2.0566859245300293,
"learning_rate": 5.737237626328065e-06,
"loss": 1.4392,
"step": 1107
},
{
"epoch": 0.025842038209132757,
"grad_norm": 1.4723169803619385,
"learning_rate": 5.742420316144079e-06,
"loss": 1.4799,
"step": 1108
},
{
"epoch": 0.025865361348310675,
"grad_norm": 1.4092565774917603,
"learning_rate": 5.747603005960094e-06,
"loss": 1.199,
"step": 1109
},
{
"epoch": 0.025888684487488593,
"grad_norm": 1.277365803718567,
"learning_rate": 5.752785695776109e-06,
"loss": 1.6108,
"step": 1110
},
{
"epoch": 0.02591200762666651,
"grad_norm": 2.465951919555664,
"learning_rate": 5.757968385592123e-06,
"loss": 1.6563,
"step": 1111
},
{
"epoch": 0.02593533076584443,
"grad_norm": 1.8686498403549194,
"learning_rate": 5.7631510754081376e-06,
"loss": 1.4241,
"step": 1112
},
{
"epoch": 0.025958653905022347,
"grad_norm": 1.6791915893554688,
"learning_rate": 5.768333765224152e-06,
"loss": 1.5922,
"step": 1113
},
{
"epoch": 0.025981977044200265,
"grad_norm": 1.7679352760314941,
"learning_rate": 5.773516455040167e-06,
"loss": 1.3589,
"step": 1114
},
{
"epoch": 0.026005300183378183,
"grad_norm": 1.535530686378479,
"learning_rate": 5.7786991448561814e-06,
"loss": 1.1027,
"step": 1115
},
{
"epoch": 0.0260286233225561,
"grad_norm": 1.5171246528625488,
"learning_rate": 5.783881834672196e-06,
"loss": 1.5711,
"step": 1116
},
{
"epoch": 0.02605194646173402,
"grad_norm": 1.101453185081482,
"learning_rate": 5.789064524488209e-06,
"loss": 1.2025,
"step": 1117
},
{
"epoch": 0.026075269600911936,
"grad_norm": 1.4143930673599243,
"learning_rate": 5.794247214304224e-06,
"loss": 1.4293,
"step": 1118
},
{
"epoch": 0.02609859274008985,
"grad_norm": 1.4917521476745605,
"learning_rate": 5.799429904120239e-06,
"loss": 1.5479,
"step": 1119
},
{
"epoch": 0.02612191587926777,
"grad_norm": 1.4023706912994385,
"learning_rate": 5.804612593936253e-06,
"loss": 1.7088,
"step": 1120
},
{
"epoch": 0.026145239018445687,
"grad_norm": 1.4056384563446045,
"learning_rate": 5.8097952837522675e-06,
"loss": 1.3657,
"step": 1121
},
{
"epoch": 0.026168562157623605,
"grad_norm": 1.3393616676330566,
"learning_rate": 5.814977973568282e-06,
"loss": 1.1497,
"step": 1122
},
{
"epoch": 0.026191885296801522,
"grad_norm": 1.6090584993362427,
"learning_rate": 5.820160663384296e-06,
"loss": 1.391,
"step": 1123
},
{
"epoch": 0.02621520843597944,
"grad_norm": 1.4391287565231323,
"learning_rate": 5.825343353200311e-06,
"loss": 1.4316,
"step": 1124
},
{
"epoch": 0.026238531575157358,
"grad_norm": 1.0588252544403076,
"learning_rate": 5.830526043016326e-06,
"loss": 1.3495,
"step": 1125
},
{
"epoch": 0.026261854714335276,
"grad_norm": 1.2646477222442627,
"learning_rate": 5.83570873283234e-06,
"loss": 1.9107,
"step": 1126
},
{
"epoch": 0.026285177853513194,
"grad_norm": 1.2594728469848633,
"learning_rate": 5.8408914226483545e-06,
"loss": 1.3878,
"step": 1127
},
{
"epoch": 0.026308500992691112,
"grad_norm": 2.413245677947998,
"learning_rate": 5.84607411246437e-06,
"loss": 1.2988,
"step": 1128
},
{
"epoch": 0.02633182413186903,
"grad_norm": 1.8143887519836426,
"learning_rate": 5.851256802280384e-06,
"loss": 1.8778,
"step": 1129
},
{
"epoch": 0.026355147271046948,
"grad_norm": 1.4549977779388428,
"learning_rate": 5.856439492096398e-06,
"loss": 1.7828,
"step": 1130
},
{
"epoch": 0.026378470410224866,
"grad_norm": 1.370773196220398,
"learning_rate": 5.861622181912413e-06,
"loss": 1.6647,
"step": 1131
},
{
"epoch": 0.026401793549402783,
"grad_norm": 1.7972664833068848,
"learning_rate": 5.866804871728428e-06,
"loss": 1.8871,
"step": 1132
},
{
"epoch": 0.026425116688580698,
"grad_norm": 1.6887913942337036,
"learning_rate": 5.871987561544442e-06,
"loss": 1.4938,
"step": 1133
},
{
"epoch": 0.026448439827758616,
"grad_norm": 1.4011859893798828,
"learning_rate": 5.8771702513604566e-06,
"loss": 1.2893,
"step": 1134
},
{
"epoch": 0.026471762966936534,
"grad_norm": 1.2820593118667603,
"learning_rate": 5.882352941176471e-06,
"loss": 1.8028,
"step": 1135
},
{
"epoch": 0.02649508610611445,
"grad_norm": 1.5501364469528198,
"learning_rate": 5.887535630992486e-06,
"loss": 1.5666,
"step": 1136
},
{
"epoch": 0.02651840924529237,
"grad_norm": 1.635021686553955,
"learning_rate": 5.8927183208085005e-06,
"loss": 1.4217,
"step": 1137
},
{
"epoch": 0.026541732384470287,
"grad_norm": 1.780432105064392,
"learning_rate": 5.897901010624515e-06,
"loss": 1.5926,
"step": 1138
},
{
"epoch": 0.026565055523648205,
"grad_norm": 1.747233271598816,
"learning_rate": 5.903083700440529e-06,
"loss": 1.7011,
"step": 1139
},
{
"epoch": 0.026588378662826123,
"grad_norm": 1.6612962484359741,
"learning_rate": 5.908266390256544e-06,
"loss": 1.1466,
"step": 1140
},
{
"epoch": 0.02661170180200404,
"grad_norm": 1.906965732574463,
"learning_rate": 5.913449080072559e-06,
"loss": 1.2679,
"step": 1141
},
{
"epoch": 0.02663502494118196,
"grad_norm": 1.3008593320846558,
"learning_rate": 5.918631769888573e-06,
"loss": 1.1242,
"step": 1142
},
{
"epoch": 0.026658348080359877,
"grad_norm": 1.2631815671920776,
"learning_rate": 5.923814459704587e-06,
"loss": 1.6476,
"step": 1143
},
{
"epoch": 0.026681671219537795,
"grad_norm": 1.3338450193405151,
"learning_rate": 5.928997149520601e-06,
"loss": 1.6404,
"step": 1144
},
{
"epoch": 0.026704994358715713,
"grad_norm": 1.4749959707260132,
"learning_rate": 5.934179839336616e-06,
"loss": 1.4754,
"step": 1145
},
{
"epoch": 0.02672831749789363,
"grad_norm": 1.399997353553772,
"learning_rate": 5.9393625291526304e-06,
"loss": 1.776,
"step": 1146
},
{
"epoch": 0.026751640637071545,
"grad_norm": 1.6688719987869263,
"learning_rate": 5.944545218968645e-06,
"loss": 1.4341,
"step": 1147
},
{
"epoch": 0.026774963776249463,
"grad_norm": 1.2055866718292236,
"learning_rate": 5.949727908784659e-06,
"loss": 1.366,
"step": 1148
},
{
"epoch": 0.02679828691542738,
"grad_norm": 1.834375262260437,
"learning_rate": 5.9549105986006735e-06,
"loss": 1.7205,
"step": 1149
},
{
"epoch": 0.0268216100546053,
"grad_norm": 1.6463091373443604,
"learning_rate": 5.960093288416689e-06,
"loss": 1.2175,
"step": 1150
},
{
"epoch": 0.026844933193783217,
"grad_norm": 1.2439314126968384,
"learning_rate": 5.965275978232703e-06,
"loss": 1.1599,
"step": 1151
},
{
"epoch": 0.026868256332961134,
"grad_norm": 1.428876519203186,
"learning_rate": 5.970458668048717e-06,
"loss": 1.7428,
"step": 1152
},
{
"epoch": 0.026891579472139052,
"grad_norm": 1.3530622720718384,
"learning_rate": 5.975641357864732e-06,
"loss": 1.4968,
"step": 1153
},
{
"epoch": 0.02691490261131697,
"grad_norm": 2.7352559566497803,
"learning_rate": 5.980824047680747e-06,
"loss": 1.5478,
"step": 1154
},
{
"epoch": 0.026938225750494888,
"grad_norm": 1.8357428312301636,
"learning_rate": 5.986006737496761e-06,
"loss": 1.5217,
"step": 1155
},
{
"epoch": 0.026961548889672806,
"grad_norm": 1.3974493741989136,
"learning_rate": 5.991189427312776e-06,
"loss": 1.6203,
"step": 1156
},
{
"epoch": 0.026984872028850724,
"grad_norm": 1.3089922666549683,
"learning_rate": 5.99637211712879e-06,
"loss": 1.7992,
"step": 1157
},
{
"epoch": 0.027008195168028642,
"grad_norm": 1.8275575637817383,
"learning_rate": 6.001554806944805e-06,
"loss": 1.4841,
"step": 1158
},
{
"epoch": 0.02703151830720656,
"grad_norm": 2.55710506439209,
"learning_rate": 6.0067374967608195e-06,
"loss": 1.3043,
"step": 1159
},
{
"epoch": 0.027054841446384478,
"grad_norm": 2.4591903686523438,
"learning_rate": 6.011920186576834e-06,
"loss": 1.3368,
"step": 1160
},
{
"epoch": 0.027078164585562395,
"grad_norm": 1.9370126724243164,
"learning_rate": 6.017102876392848e-06,
"loss": 1.4075,
"step": 1161
},
{
"epoch": 0.02710148772474031,
"grad_norm": 1.4310760498046875,
"learning_rate": 6.022285566208863e-06,
"loss": 1.5424,
"step": 1162
},
{
"epoch": 0.027124810863918228,
"grad_norm": 1.3892368078231812,
"learning_rate": 6.027468256024878e-06,
"loss": 1.6432,
"step": 1163
},
{
"epoch": 0.027148134003096146,
"grad_norm": 1.4820071458816528,
"learning_rate": 6.032650945840892e-06,
"loss": 1.409,
"step": 1164
},
{
"epoch": 0.027171457142274064,
"grad_norm": 1.1135878562927246,
"learning_rate": 6.037833635656906e-06,
"loss": 1.5977,
"step": 1165
},
{
"epoch": 0.02719478028145198,
"grad_norm": 1.6016969680786133,
"learning_rate": 6.043016325472922e-06,
"loss": 1.6486,
"step": 1166
},
{
"epoch": 0.0272181034206299,
"grad_norm": 1.5183762311935425,
"learning_rate": 6.048199015288936e-06,
"loss": 1.4068,
"step": 1167
},
{
"epoch": 0.027241426559807817,
"grad_norm": 1.4730808734893799,
"learning_rate": 6.05338170510495e-06,
"loss": 1.6202,
"step": 1168
},
{
"epoch": 0.027264749698985735,
"grad_norm": 1.4382350444793701,
"learning_rate": 6.058564394920965e-06,
"loss": 1.7055,
"step": 1169
},
{
"epoch": 0.027288072838163653,
"grad_norm": 0.9570834040641785,
"learning_rate": 6.06374708473698e-06,
"loss": 0.8602,
"step": 1170
},
{
"epoch": 0.02731139597734157,
"grad_norm": 1.2127379179000854,
"learning_rate": 6.068929774552994e-06,
"loss": 1.5333,
"step": 1171
},
{
"epoch": 0.02733471911651949,
"grad_norm": 1.5822348594665527,
"learning_rate": 6.074112464369008e-06,
"loss": 0.9605,
"step": 1172
},
{
"epoch": 0.027358042255697407,
"grad_norm": 1.3108526468276978,
"learning_rate": 6.079295154185022e-06,
"loss": 1.1987,
"step": 1173
},
{
"epoch": 0.027381365394875325,
"grad_norm": 2.005154848098755,
"learning_rate": 6.084477844001036e-06,
"loss": 1.7214,
"step": 1174
},
{
"epoch": 0.027404688534053243,
"grad_norm": 2.299222707748413,
"learning_rate": 6.089660533817051e-06,
"loss": 1.5244,
"step": 1175
},
{
"epoch": 0.027428011673231157,
"grad_norm": 1.2665340900421143,
"learning_rate": 6.094843223633066e-06,
"loss": 1.1735,
"step": 1176
},
{
"epoch": 0.027451334812409075,
"grad_norm": 1.418123483657837,
"learning_rate": 6.10002591344908e-06,
"loss": 1.6755,
"step": 1177
},
{
"epoch": 0.027474657951586993,
"grad_norm": 1.4280682802200317,
"learning_rate": 6.105208603265095e-06,
"loss": 1.6664,
"step": 1178
},
{
"epoch": 0.02749798109076491,
"grad_norm": 2.0804097652435303,
"learning_rate": 6.110391293081109e-06,
"loss": 1.4688,
"step": 1179
},
{
"epoch": 0.02752130422994283,
"grad_norm": 1.7536234855651855,
"learning_rate": 6.115573982897124e-06,
"loss": 1.5823,
"step": 1180
},
{
"epoch": 0.027544627369120746,
"grad_norm": 1.1604044437408447,
"learning_rate": 6.1207566727131385e-06,
"loss": 1.4818,
"step": 1181
},
{
"epoch": 0.027567950508298664,
"grad_norm": 1.3865594863891602,
"learning_rate": 6.125939362529153e-06,
"loss": 1.5467,
"step": 1182
},
{
"epoch": 0.027591273647476582,
"grad_norm": 1.526190996170044,
"learning_rate": 6.131122052345167e-06,
"loss": 1.3397,
"step": 1183
},
{
"epoch": 0.0276145967866545,
"grad_norm": 1.6010215282440186,
"learning_rate": 6.136304742161182e-06,
"loss": 1.5507,
"step": 1184
},
{
"epoch": 0.027637919925832418,
"grad_norm": 1.4297575950622559,
"learning_rate": 6.141487431977197e-06,
"loss": 1.397,
"step": 1185
},
{
"epoch": 0.027661243065010336,
"grad_norm": 1.380254864692688,
"learning_rate": 6.146670121793211e-06,
"loss": 1.251,
"step": 1186
},
{
"epoch": 0.027684566204188254,
"grad_norm": 1.5398340225219727,
"learning_rate": 6.1518528116092254e-06,
"loss": 1.7319,
"step": 1187
},
{
"epoch": 0.027707889343366172,
"grad_norm": 1.8836907148361206,
"learning_rate": 6.157035501425241e-06,
"loss": 1.1504,
"step": 1188
},
{
"epoch": 0.02773121248254409,
"grad_norm": 1.200628399848938,
"learning_rate": 6.162218191241255e-06,
"loss": 1.5138,
"step": 1189
},
{
"epoch": 0.027754535621722004,
"grad_norm": 1.7400058507919312,
"learning_rate": 6.167400881057269e-06,
"loss": 1.5398,
"step": 1190
},
{
"epoch": 0.027777858760899922,
"grad_norm": 1.2723171710968018,
"learning_rate": 6.172583570873284e-06,
"loss": 1.1157,
"step": 1191
},
{
"epoch": 0.02780118190007784,
"grad_norm": 1.4392553567886353,
"learning_rate": 6.177766260689299e-06,
"loss": 1.7444,
"step": 1192
},
{
"epoch": 0.027824505039255758,
"grad_norm": 1.533337950706482,
"learning_rate": 6.182948950505313e-06,
"loss": 1.4784,
"step": 1193
},
{
"epoch": 0.027847828178433676,
"grad_norm": 1.5458931922912598,
"learning_rate": 6.1881316403213276e-06,
"loss": 1.8139,
"step": 1194
},
{
"epoch": 0.027871151317611594,
"grad_norm": 1.133946180343628,
"learning_rate": 6.193314330137342e-06,
"loss": 1.5137,
"step": 1195
},
{
"epoch": 0.02789447445678951,
"grad_norm": 1.458628535270691,
"learning_rate": 6.198497019953357e-06,
"loss": 1.3172,
"step": 1196
},
{
"epoch": 0.02791779759596743,
"grad_norm": 2.2303454875946045,
"learning_rate": 6.2036797097693714e-06,
"loss": 1.2295,
"step": 1197
},
{
"epoch": 0.027941120735145347,
"grad_norm": 1.2555915117263794,
"learning_rate": 6.208862399585386e-06,
"loss": 1.5021,
"step": 1198
},
{
"epoch": 0.027964443874323265,
"grad_norm": 1.7872976064682007,
"learning_rate": 6.2140450894014e-06,
"loss": 0.9375,
"step": 1199
},
{
"epoch": 0.027987767013501183,
"grad_norm": 1.5110255479812622,
"learning_rate": 6.219227779217414e-06,
"loss": 1.871,
"step": 1200
},
{
"epoch": 0.0280110901526791,
"grad_norm": 1.5963770151138306,
"learning_rate": 6.224410469033428e-06,
"loss": 1.6184,
"step": 1201
},
{
"epoch": 0.02803441329185702,
"grad_norm": 1.7600239515304565,
"learning_rate": 6.229593158849443e-06,
"loss": 1.5337,
"step": 1202
},
{
"epoch": 0.028057736431034937,
"grad_norm": 1.3252232074737549,
"learning_rate": 6.2347758486654575e-06,
"loss": 1.4088,
"step": 1203
},
{
"epoch": 0.02808105957021285,
"grad_norm": 1.3839343786239624,
"learning_rate": 6.239958538481472e-06,
"loss": 1.305,
"step": 1204
},
{
"epoch": 0.02810438270939077,
"grad_norm": 1.6570122241973877,
"learning_rate": 6.245141228297486e-06,
"loss": 1.5596,
"step": 1205
},
{
"epoch": 0.028127705848568687,
"grad_norm": 1.4685866832733154,
"learning_rate": 6.250323918113501e-06,
"loss": 1.4931,
"step": 1206
},
{
"epoch": 0.028151028987746605,
"grad_norm": 1.263984203338623,
"learning_rate": 6.255506607929516e-06,
"loss": 1.5393,
"step": 1207
},
{
"epoch": 0.028174352126924523,
"grad_norm": 1.8634412288665771,
"learning_rate": 6.26068929774553e-06,
"loss": 1.2369,
"step": 1208
},
{
"epoch": 0.02819767526610244,
"grad_norm": 1.676034927368164,
"learning_rate": 6.2658719875615444e-06,
"loss": 1.5886,
"step": 1209
},
{
"epoch": 0.02822099840528036,
"grad_norm": 1.7271007299423218,
"learning_rate": 6.27105467737756e-06,
"loss": 1.2692,
"step": 1210
},
{
"epoch": 0.028244321544458276,
"grad_norm": 1.4238859415054321,
"learning_rate": 6.276237367193574e-06,
"loss": 1.6261,
"step": 1211
},
{
"epoch": 0.028267644683636194,
"grad_norm": 2.13999080657959,
"learning_rate": 6.281420057009588e-06,
"loss": 1.7009,
"step": 1212
},
{
"epoch": 0.028290967822814112,
"grad_norm": 2.1164069175720215,
"learning_rate": 6.286602746825603e-06,
"loss": 1.4856,
"step": 1213
},
{
"epoch": 0.02831429096199203,
"grad_norm": 1.6996465921401978,
"learning_rate": 6.291785436641618e-06,
"loss": 1.4621,
"step": 1214
},
{
"epoch": 0.028337614101169948,
"grad_norm": 1.466536045074463,
"learning_rate": 6.296968126457632e-06,
"loss": 1.5882,
"step": 1215
},
{
"epoch": 0.028360937240347866,
"grad_norm": 1.7248129844665527,
"learning_rate": 6.3021508162736466e-06,
"loss": 1.658,
"step": 1216
},
{
"epoch": 0.028384260379525784,
"grad_norm": 1.7973899841308594,
"learning_rate": 6.307333506089661e-06,
"loss": 1.4981,
"step": 1217
},
{
"epoch": 0.0284075835187037,
"grad_norm": 1.4502708911895752,
"learning_rate": 6.312516195905676e-06,
"loss": 1.8872,
"step": 1218
},
{
"epoch": 0.028430906657881616,
"grad_norm": 1.592411756515503,
"learning_rate": 6.3176988857216905e-06,
"loss": 1.4145,
"step": 1219
},
{
"epoch": 0.028454229797059534,
"grad_norm": 1.931400179862976,
"learning_rate": 6.322881575537705e-06,
"loss": 1.6221,
"step": 1220
},
{
"epoch": 0.028477552936237452,
"grad_norm": 1.5922832489013672,
"learning_rate": 6.328064265353719e-06,
"loss": 1.3897,
"step": 1221
},
{
"epoch": 0.02850087607541537,
"grad_norm": 1.4899603128433228,
"learning_rate": 6.333246955169734e-06,
"loss": 1.66,
"step": 1222
},
{
"epoch": 0.028524199214593288,
"grad_norm": 1.3820170164108276,
"learning_rate": 6.338429644985749e-06,
"loss": 1.8425,
"step": 1223
},
{
"epoch": 0.028547522353771206,
"grad_norm": 1.6127132177352905,
"learning_rate": 6.343612334801763e-06,
"loss": 1.3965,
"step": 1224
},
{
"epoch": 0.028570845492949123,
"grad_norm": 1.927259922027588,
"learning_rate": 6.348795024617777e-06,
"loss": 1.486,
"step": 1225
},
{
"epoch": 0.02859416863212704,
"grad_norm": 1.5987411737442017,
"learning_rate": 6.353977714433793e-06,
"loss": 1.4371,
"step": 1226
},
{
"epoch": 0.02861749177130496,
"grad_norm": 1.7805335521697998,
"learning_rate": 6.359160404249805e-06,
"loss": 1.56,
"step": 1227
},
{
"epoch": 0.028640814910482877,
"grad_norm": 1.7960704565048218,
"learning_rate": 6.3643430940658204e-06,
"loss": 1.5536,
"step": 1228
},
{
"epoch": 0.028664138049660795,
"grad_norm": 1.4014300107955933,
"learning_rate": 6.369525783881835e-06,
"loss": 1.4391,
"step": 1229
},
{
"epoch": 0.028687461188838713,
"grad_norm": 1.7049264907836914,
"learning_rate": 6.374708473697849e-06,
"loss": 1.9225,
"step": 1230
},
{
"epoch": 0.02871078432801663,
"grad_norm": 1.9948570728302002,
"learning_rate": 6.3798911635138635e-06,
"loss": 1.6279,
"step": 1231
},
{
"epoch": 0.02873410746719455,
"grad_norm": 2.101736068725586,
"learning_rate": 6.385073853329879e-06,
"loss": 1.5433,
"step": 1232
},
{
"epoch": 0.028757430606372463,
"grad_norm": 1.342325210571289,
"learning_rate": 6.390256543145893e-06,
"loss": 1.3606,
"step": 1233
},
{
"epoch": 0.02878075374555038,
"grad_norm": 1.5539692640304565,
"learning_rate": 6.395439232961907e-06,
"loss": 1.4339,
"step": 1234
},
{
"epoch": 0.0288040768847283,
"grad_norm": 1.6053344011306763,
"learning_rate": 6.400621922777922e-06,
"loss": 1.5735,
"step": 1235
},
{
"epoch": 0.028827400023906217,
"grad_norm": 1.1527775526046753,
"learning_rate": 6.405804612593937e-06,
"loss": 1.3265,
"step": 1236
},
{
"epoch": 0.028850723163084135,
"grad_norm": 2.401747465133667,
"learning_rate": 6.410987302409951e-06,
"loss": 1.3331,
"step": 1237
},
{
"epoch": 0.028874046302262053,
"grad_norm": 1.372536301612854,
"learning_rate": 6.416169992225966e-06,
"loss": 1.6371,
"step": 1238
},
{
"epoch": 0.02889736944143997,
"grad_norm": 1.528669834136963,
"learning_rate": 6.42135268204198e-06,
"loss": 1.4658,
"step": 1239
},
{
"epoch": 0.02892069258061789,
"grad_norm": 1.7370809316635132,
"learning_rate": 6.426535371857995e-06,
"loss": 1.4893,
"step": 1240
},
{
"epoch": 0.028944015719795806,
"grad_norm": 1.5757806301116943,
"learning_rate": 6.4317180616740095e-06,
"loss": 1.2563,
"step": 1241
},
{
"epoch": 0.028967338858973724,
"grad_norm": 1.2458890676498413,
"learning_rate": 6.436900751490024e-06,
"loss": 1.6522,
"step": 1242
},
{
"epoch": 0.028990661998151642,
"grad_norm": 1.743046760559082,
"learning_rate": 6.442083441306038e-06,
"loss": 1.6444,
"step": 1243
},
{
"epoch": 0.02901398513732956,
"grad_norm": 1.5543162822723389,
"learning_rate": 6.447266131122053e-06,
"loss": 1.6381,
"step": 1244
},
{
"epoch": 0.029037308276507478,
"grad_norm": 1.3490428924560547,
"learning_rate": 6.452448820938068e-06,
"loss": 1.4615,
"step": 1245
},
{
"epoch": 0.029060631415685396,
"grad_norm": 1.3732086420059204,
"learning_rate": 6.457631510754082e-06,
"loss": 1.4085,
"step": 1246
},
{
"epoch": 0.02908395455486331,
"grad_norm": 2.9364993572235107,
"learning_rate": 6.462814200570096e-06,
"loss": 1.4811,
"step": 1247
},
{
"epoch": 0.029107277694041228,
"grad_norm": 1.2069623470306396,
"learning_rate": 6.467996890386112e-06,
"loss": 1.3635,
"step": 1248
},
{
"epoch": 0.029130600833219146,
"grad_norm": 1.2883137464523315,
"learning_rate": 6.473179580202126e-06,
"loss": 1.4202,
"step": 1249
},
{
"epoch": 0.029153923972397064,
"grad_norm": 1.592976689338684,
"learning_rate": 6.47836227001814e-06,
"loss": 2.1116,
"step": 1250
},
{
"epoch": 0.029177247111574982,
"grad_norm": 1.394774079322815,
"learning_rate": 6.483544959834155e-06,
"loss": 1.5042,
"step": 1251
},
{
"epoch": 0.0292005702507529,
"grad_norm": 1.2127888202667236,
"learning_rate": 6.48872764965017e-06,
"loss": 1.3806,
"step": 1252
},
{
"epoch": 0.029223893389930818,
"grad_norm": 1.5445924997329712,
"learning_rate": 6.493910339466184e-06,
"loss": 1.5067,
"step": 1253
},
{
"epoch": 0.029247216529108735,
"grad_norm": 2.4520442485809326,
"learning_rate": 6.4990930292821985e-06,
"loss": 1.3649,
"step": 1254
},
{
"epoch": 0.029270539668286653,
"grad_norm": 2.032709836959839,
"learning_rate": 6.504275719098212e-06,
"loss": 1.2058,
"step": 1255
},
{
"epoch": 0.02929386280746457,
"grad_norm": 1.3742554187774658,
"learning_rate": 6.509458408914226e-06,
"loss": 1.4328,
"step": 1256
},
{
"epoch": 0.02931718594664249,
"grad_norm": 1.4859979152679443,
"learning_rate": 6.514641098730241e-06,
"loss": 1.6409,
"step": 1257
},
{
"epoch": 0.029340509085820407,
"grad_norm": 1.6881428956985474,
"learning_rate": 6.519823788546256e-06,
"loss": 1.6298,
"step": 1258
},
{
"epoch": 0.029363832224998325,
"grad_norm": 1.892412543296814,
"learning_rate": 6.52500647836227e-06,
"loss": 1.6898,
"step": 1259
},
{
"epoch": 0.029387155364176243,
"grad_norm": 1.4890961647033691,
"learning_rate": 6.530189168178285e-06,
"loss": 1.6164,
"step": 1260
},
{
"epoch": 0.029410478503354157,
"grad_norm": 1.530034065246582,
"learning_rate": 6.535371857994299e-06,
"loss": 1.4036,
"step": 1261
},
{
"epoch": 0.029433801642532075,
"grad_norm": 1.4801392555236816,
"learning_rate": 6.540554547810314e-06,
"loss": 1.5928,
"step": 1262
},
{
"epoch": 0.029457124781709993,
"grad_norm": 1.4419362545013428,
"learning_rate": 6.5457372376263285e-06,
"loss": 1.7833,
"step": 1263
},
{
"epoch": 0.02948044792088791,
"grad_norm": 1.6963889598846436,
"learning_rate": 6.550919927442343e-06,
"loss": 1.7366,
"step": 1264
},
{
"epoch": 0.02950377106006583,
"grad_norm": 1.4853816032409668,
"learning_rate": 6.556102617258357e-06,
"loss": 1.2297,
"step": 1265
},
{
"epoch": 0.029527094199243747,
"grad_norm": 1.6151559352874756,
"learning_rate": 6.561285307074372e-06,
"loss": 2.0062,
"step": 1266
},
{
"epoch": 0.029550417338421665,
"grad_norm": 1.3132925033569336,
"learning_rate": 6.566467996890387e-06,
"loss": 1.7708,
"step": 1267
},
{
"epoch": 0.029573740477599583,
"grad_norm": 1.4057172536849976,
"learning_rate": 6.571650686706401e-06,
"loss": 1.5811,
"step": 1268
},
{
"epoch": 0.0295970636167775,
"grad_norm": 1.5369668006896973,
"learning_rate": 6.5768333765224154e-06,
"loss": 1.5121,
"step": 1269
},
{
"epoch": 0.02962038675595542,
"grad_norm": 1.6567087173461914,
"learning_rate": 6.582016066338431e-06,
"loss": 1.2413,
"step": 1270
},
{
"epoch": 0.029643709895133336,
"grad_norm": 1.3374396562576294,
"learning_rate": 6.587198756154445e-06,
"loss": 1.5594,
"step": 1271
},
{
"epoch": 0.029667033034311254,
"grad_norm": 1.4892241954803467,
"learning_rate": 6.592381445970459e-06,
"loss": 1.6287,
"step": 1272
},
{
"epoch": 0.029690356173489172,
"grad_norm": 2.012141466140747,
"learning_rate": 6.597564135786474e-06,
"loss": 1.7356,
"step": 1273
},
{
"epoch": 0.02971367931266709,
"grad_norm": 2.2330586910247803,
"learning_rate": 6.602746825602489e-06,
"loss": 1.1928,
"step": 1274
},
{
"epoch": 0.029737002451845004,
"grad_norm": 1.7101742029190063,
"learning_rate": 6.607929515418503e-06,
"loss": 1.497,
"step": 1275
},
{
"epoch": 0.029760325591022922,
"grad_norm": 1.4773057699203491,
"learning_rate": 6.6131122052345175e-06,
"loss": 1.4135,
"step": 1276
},
{
"epoch": 0.02978364873020084,
"grad_norm": 1.4007784128189087,
"learning_rate": 6.618294895050532e-06,
"loss": 1.3921,
"step": 1277
},
{
"epoch": 0.029806971869378758,
"grad_norm": 1.7430599927902222,
"learning_rate": 6.623477584866547e-06,
"loss": 1.5352,
"step": 1278
},
{
"epoch": 0.029830295008556676,
"grad_norm": 2.562096118927002,
"learning_rate": 6.6286602746825614e-06,
"loss": 1.5325,
"step": 1279
},
{
"epoch": 0.029853618147734594,
"grad_norm": 1.192498803138733,
"learning_rate": 6.633842964498576e-06,
"loss": 1.1816,
"step": 1280
},
{
"epoch": 0.02987694128691251,
"grad_norm": 2.39277982711792,
"learning_rate": 6.63902565431459e-06,
"loss": 1.3732,
"step": 1281
},
{
"epoch": 0.02990026442609043,
"grad_norm": 1.3731800317764282,
"learning_rate": 6.644208344130604e-06,
"loss": 1.4175,
"step": 1282
},
{
"epoch": 0.029923587565268348,
"grad_norm": 2.297088146209717,
"learning_rate": 6.649391033946618e-06,
"loss": 1.5919,
"step": 1283
},
{
"epoch": 0.029946910704446265,
"grad_norm": 1.1062113046646118,
"learning_rate": 6.654573723762633e-06,
"loss": 1.3707,
"step": 1284
},
{
"epoch": 0.029970233843624183,
"grad_norm": 2.175673246383667,
"learning_rate": 6.6597564135786475e-06,
"loss": 1.4268,
"step": 1285
},
{
"epoch": 0.0299935569828021,
"grad_norm": 1.57578444480896,
"learning_rate": 6.664939103394662e-06,
"loss": 1.6065,
"step": 1286
},
{
"epoch": 0.03001688012198002,
"grad_norm": 1.757105827331543,
"learning_rate": 6.670121793210676e-06,
"loss": 1.5827,
"step": 1287
},
{
"epoch": 0.030040203261157937,
"grad_norm": 1.6778910160064697,
"learning_rate": 6.675304483026691e-06,
"loss": 1.4697,
"step": 1288
},
{
"epoch": 0.030063526400335855,
"grad_norm": 1.4940367937088013,
"learning_rate": 6.680487172842706e-06,
"loss": 1.2309,
"step": 1289
},
{
"epoch": 0.03008684953951377,
"grad_norm": 2.175011157989502,
"learning_rate": 6.68566986265872e-06,
"loss": 0.9675,
"step": 1290
},
{
"epoch": 0.030110172678691687,
"grad_norm": 2.0137412548065186,
"learning_rate": 6.6908525524747344e-06,
"loss": 1.6618,
"step": 1291
},
{
"epoch": 0.030133495817869605,
"grad_norm": 1.3541489839553833,
"learning_rate": 6.69603524229075e-06,
"loss": 1.2989,
"step": 1292
},
{
"epoch": 0.030156818957047523,
"grad_norm": 1.9265953302383423,
"learning_rate": 6.701217932106764e-06,
"loss": 1.3859,
"step": 1293
},
{
"epoch": 0.03018014209622544,
"grad_norm": 1.899145483970642,
"learning_rate": 6.706400621922778e-06,
"loss": 1.2468,
"step": 1294
},
{
"epoch": 0.03020346523540336,
"grad_norm": 1.6764010190963745,
"learning_rate": 6.711583311738793e-06,
"loss": 1.4796,
"step": 1295
},
{
"epoch": 0.030226788374581277,
"grad_norm": 1.502276062965393,
"learning_rate": 6.716766001554808e-06,
"loss": 1.6102,
"step": 1296
},
{
"epoch": 0.030250111513759195,
"grad_norm": 1.742180347442627,
"learning_rate": 6.721948691370822e-06,
"loss": 1.4743,
"step": 1297
},
{
"epoch": 0.030273434652937112,
"grad_norm": 1.503127098083496,
"learning_rate": 6.7271313811868366e-06,
"loss": 1.7023,
"step": 1298
},
{
"epoch": 0.03029675779211503,
"grad_norm": 1.4494696855545044,
"learning_rate": 6.732314071002851e-06,
"loss": 1.6774,
"step": 1299
},
{
"epoch": 0.030320080931292948,
"grad_norm": 1.3726390600204468,
"learning_rate": 6.737496760818866e-06,
"loss": 1.6272,
"step": 1300
},
{
"epoch": 0.030343404070470866,
"grad_norm": 1.6922540664672852,
"learning_rate": 6.7426794506348805e-06,
"loss": 1.6249,
"step": 1301
},
{
"epoch": 0.030366727209648784,
"grad_norm": 1.3822194337844849,
"learning_rate": 6.747862140450895e-06,
"loss": 1.779,
"step": 1302
},
{
"epoch": 0.030390050348826702,
"grad_norm": 1.2841784954071045,
"learning_rate": 6.753044830266909e-06,
"loss": 1.2516,
"step": 1303
},
{
"epoch": 0.030413373488004616,
"grad_norm": 2.045302152633667,
"learning_rate": 6.758227520082924e-06,
"loss": 1.4461,
"step": 1304
},
{
"epoch": 0.030436696627182534,
"grad_norm": 1.6968058347702026,
"learning_rate": 6.763410209898939e-06,
"loss": 1.545,
"step": 1305
},
{
"epoch": 0.030460019766360452,
"grad_norm": 1.6409857273101807,
"learning_rate": 6.768592899714953e-06,
"loss": 1.7205,
"step": 1306
},
{
"epoch": 0.03048334290553837,
"grad_norm": 1.2925307750701904,
"learning_rate": 6.773775589530967e-06,
"loss": 1.5889,
"step": 1307
},
{
"epoch": 0.030506666044716288,
"grad_norm": 1.4610506296157837,
"learning_rate": 6.778958279346982e-06,
"loss": 1.49,
"step": 1308
},
{
"epoch": 0.030529989183894206,
"grad_norm": 1.5941089391708374,
"learning_rate": 6.784140969162997e-06,
"loss": 1.8275,
"step": 1309
},
{
"epoch": 0.030553312323072124,
"grad_norm": 1.2063391208648682,
"learning_rate": 6.7893236589790104e-06,
"loss": 1.2659,
"step": 1310
},
{
"epoch": 0.03057663546225004,
"grad_norm": 1.512366771697998,
"learning_rate": 6.794506348795025e-06,
"loss": 1.502,
"step": 1311
},
{
"epoch": 0.03059995860142796,
"grad_norm": 2.0490636825561523,
"learning_rate": 6.799689038611039e-06,
"loss": 1.4567,
"step": 1312
},
{
"epoch": 0.030623281740605877,
"grad_norm": 2.196171522140503,
"learning_rate": 6.8048717284270535e-06,
"loss": 1.7189,
"step": 1313
},
{
"epoch": 0.030646604879783795,
"grad_norm": 1.434403419494629,
"learning_rate": 6.810054418243069e-06,
"loss": 1.4947,
"step": 1314
},
{
"epoch": 0.030669928018961713,
"grad_norm": 1.3586199283599854,
"learning_rate": 6.815237108059083e-06,
"loss": 1.5511,
"step": 1315
},
{
"epoch": 0.03069325115813963,
"grad_norm": 1.7212327718734741,
"learning_rate": 6.820419797875097e-06,
"loss": 1.625,
"step": 1316
},
{
"epoch": 0.03071657429731755,
"grad_norm": 1.7246372699737549,
"learning_rate": 6.825602487691112e-06,
"loss": 1.6043,
"step": 1317
},
{
"epoch": 0.030739897436495463,
"grad_norm": 1.401949405670166,
"learning_rate": 6.830785177507127e-06,
"loss": 0.9642,
"step": 1318
},
{
"epoch": 0.03076322057567338,
"grad_norm": 1.6501095294952393,
"learning_rate": 6.835967867323141e-06,
"loss": 1.4776,
"step": 1319
},
{
"epoch": 0.0307865437148513,
"grad_norm": 1.266641616821289,
"learning_rate": 6.841150557139156e-06,
"loss": 1.1332,
"step": 1320
},
{
"epoch": 0.030809866854029217,
"grad_norm": 1.0934447050094604,
"learning_rate": 6.84633324695517e-06,
"loss": 1.6201,
"step": 1321
},
{
"epoch": 0.030833189993207135,
"grad_norm": 1.4711166620254517,
"learning_rate": 6.851515936771185e-06,
"loss": 1.401,
"step": 1322
},
{
"epoch": 0.030856513132385053,
"grad_norm": 1.609348177909851,
"learning_rate": 6.8566986265871995e-06,
"loss": 1.5497,
"step": 1323
},
{
"epoch": 0.03087983627156297,
"grad_norm": 1.277185082435608,
"learning_rate": 6.861881316403214e-06,
"loss": 1.5056,
"step": 1324
},
{
"epoch": 0.03090315941074089,
"grad_norm": 1.4644626379013062,
"learning_rate": 6.867064006219228e-06,
"loss": 1.3443,
"step": 1325
},
{
"epoch": 0.030926482549918807,
"grad_norm": 1.4824533462524414,
"learning_rate": 6.872246696035243e-06,
"loss": 1.5054,
"step": 1326
},
{
"epoch": 0.030949805689096724,
"grad_norm": 1.4885330200195312,
"learning_rate": 6.877429385851258e-06,
"loss": 1.4403,
"step": 1327
},
{
"epoch": 0.030973128828274642,
"grad_norm": 1.639889121055603,
"learning_rate": 6.882612075667272e-06,
"loss": 1.7286,
"step": 1328
},
{
"epoch": 0.03099645196745256,
"grad_norm": 1.2644333839416504,
"learning_rate": 6.887794765483286e-06,
"loss": 1.4472,
"step": 1329
},
{
"epoch": 0.031019775106630478,
"grad_norm": 1.4533531665802002,
"learning_rate": 6.892977455299302e-06,
"loss": 1.6504,
"step": 1330
},
{
"epoch": 0.031043098245808396,
"grad_norm": 1.5860834121704102,
"learning_rate": 6.898160145115316e-06,
"loss": 1.3219,
"step": 1331
},
{
"epoch": 0.03106642138498631,
"grad_norm": 1.4244756698608398,
"learning_rate": 6.90334283493133e-06,
"loss": 1.2863,
"step": 1332
},
{
"epoch": 0.03108974452416423,
"grad_norm": 1.7279314994812012,
"learning_rate": 6.908525524747345e-06,
"loss": 1.5325,
"step": 1333
},
{
"epoch": 0.031113067663342146,
"grad_norm": 1.3759844303131104,
"learning_rate": 6.913708214563359e-06,
"loss": 1.7333,
"step": 1334
},
{
"epoch": 0.031136390802520064,
"grad_norm": 1.3596171140670776,
"learning_rate": 6.918890904379374e-06,
"loss": 1.4572,
"step": 1335
},
{
"epoch": 0.031159713941697982,
"grad_norm": 1.4598828554153442,
"learning_rate": 6.9240735941953885e-06,
"loss": 1.5375,
"step": 1336
},
{
"epoch": 0.0311830370808759,
"grad_norm": 1.7578270435333252,
"learning_rate": 6.929256284011402e-06,
"loss": 1.7456,
"step": 1337
},
{
"epoch": 0.031206360220053818,
"grad_norm": 1.8432106971740723,
"learning_rate": 6.934438973827416e-06,
"loss": 1.3632,
"step": 1338
},
{
"epoch": 0.031229683359231736,
"grad_norm": 1.3926173448562622,
"learning_rate": 6.939621663643431e-06,
"loss": 1.5246,
"step": 1339
},
{
"epoch": 0.031253006498409654,
"grad_norm": 1.639283299446106,
"learning_rate": 6.944804353459446e-06,
"loss": 1.4081,
"step": 1340
},
{
"epoch": 0.03127632963758757,
"grad_norm": 1.818247675895691,
"learning_rate": 6.94998704327546e-06,
"loss": 1.4222,
"step": 1341
},
{
"epoch": 0.03129965277676549,
"grad_norm": 1.7598317861557007,
"learning_rate": 6.955169733091475e-06,
"loss": 1.5457,
"step": 1342
},
{
"epoch": 0.03132297591594341,
"grad_norm": 1.9077101945877075,
"learning_rate": 6.960352422907489e-06,
"loss": 1.2585,
"step": 1343
},
{
"epoch": 0.031346299055121325,
"grad_norm": 1.7100765705108643,
"learning_rate": 6.965535112723504e-06,
"loss": 1.5487,
"step": 1344
},
{
"epoch": 0.03136962219429924,
"grad_norm": 1.4282541275024414,
"learning_rate": 6.9707178025395185e-06,
"loss": 1.7457,
"step": 1345
},
{
"epoch": 0.03139294533347716,
"grad_norm": 1.5989662408828735,
"learning_rate": 6.975900492355533e-06,
"loss": 1.7449,
"step": 1346
},
{
"epoch": 0.03141626847265508,
"grad_norm": 1.2489700317382812,
"learning_rate": 6.981083182171547e-06,
"loss": 1.4873,
"step": 1347
},
{
"epoch": 0.031439591611833,
"grad_norm": 1.60476815700531,
"learning_rate": 6.986265871987562e-06,
"loss": 1.4751,
"step": 1348
},
{
"epoch": 0.031462914751010915,
"grad_norm": 1.5303354263305664,
"learning_rate": 6.991448561803577e-06,
"loss": 1.5709,
"step": 1349
},
{
"epoch": 0.03148623789018883,
"grad_norm": 1.462499737739563,
"learning_rate": 6.996631251619591e-06,
"loss": 1.7366,
"step": 1350
},
{
"epoch": 0.03150956102936675,
"grad_norm": 1.4246290922164917,
"learning_rate": 7.0018139414356054e-06,
"loss": 1.1592,
"step": 1351
},
{
"epoch": 0.03153288416854467,
"grad_norm": 1.8897913694381714,
"learning_rate": 7.006996631251621e-06,
"loss": 1.1699,
"step": 1352
},
{
"epoch": 0.031556207307722586,
"grad_norm": 1.6516541242599487,
"learning_rate": 7.012179321067635e-06,
"loss": 1.4705,
"step": 1353
},
{
"epoch": 0.0315795304469005,
"grad_norm": 1.816272258758545,
"learning_rate": 7.017362010883649e-06,
"loss": 1.3166,
"step": 1354
},
{
"epoch": 0.031602853586078415,
"grad_norm": 1.631224274635315,
"learning_rate": 7.022544700699664e-06,
"loss": 1.9471,
"step": 1355
},
{
"epoch": 0.03162617672525633,
"grad_norm": 1.7657747268676758,
"learning_rate": 7.027727390515678e-06,
"loss": 1.6623,
"step": 1356
},
{
"epoch": 0.03164949986443425,
"grad_norm": 1.5499768257141113,
"learning_rate": 7.032910080331693e-06,
"loss": 1.328,
"step": 1357
},
{
"epoch": 0.03167282300361217,
"grad_norm": 1.5339092016220093,
"learning_rate": 7.0380927701477075e-06,
"loss": 1.79,
"step": 1358
},
{
"epoch": 0.03169614614279009,
"grad_norm": 2.1172358989715576,
"learning_rate": 7.043275459963722e-06,
"loss": 1.719,
"step": 1359
},
{
"epoch": 0.031719469281968005,
"grad_norm": 1.5365610122680664,
"learning_rate": 7.048458149779736e-06,
"loss": 1.2236,
"step": 1360
},
{
"epoch": 0.03174279242114592,
"grad_norm": 1.7277380228042603,
"learning_rate": 7.0536408395957514e-06,
"loss": 1.768,
"step": 1361
},
{
"epoch": 0.03176611556032384,
"grad_norm": 3.0157341957092285,
"learning_rate": 7.058823529411766e-06,
"loss": 1.023,
"step": 1362
},
{
"epoch": 0.03178943869950176,
"grad_norm": 1.682496190071106,
"learning_rate": 7.06400621922778e-06,
"loss": 1.5555,
"step": 1363
},
{
"epoch": 0.031812761838679676,
"grad_norm": 1.6679117679595947,
"learning_rate": 7.0691889090437945e-06,
"loss": 1.762,
"step": 1364
},
{
"epoch": 0.031836084977857594,
"grad_norm": 1.5026060342788696,
"learning_rate": 7.074371598859808e-06,
"loss": 1.2893,
"step": 1365
},
{
"epoch": 0.03185940811703551,
"grad_norm": 1.8401672840118408,
"learning_rate": 7.079554288675823e-06,
"loss": 1.4318,
"step": 1366
},
{
"epoch": 0.03188273125621343,
"grad_norm": 1.6953387260437012,
"learning_rate": 7.0847369784918375e-06,
"loss": 1.5304,
"step": 1367
},
{
"epoch": 0.03190605439539135,
"grad_norm": 1.7483880519866943,
"learning_rate": 7.089919668307852e-06,
"loss": 1.763,
"step": 1368
},
{
"epoch": 0.031929377534569266,
"grad_norm": 1.6970646381378174,
"learning_rate": 7.095102358123866e-06,
"loss": 1.4232,
"step": 1369
},
{
"epoch": 0.031952700673747184,
"grad_norm": 1.4489586353302002,
"learning_rate": 7.100285047939881e-06,
"loss": 1.2495,
"step": 1370
},
{
"epoch": 0.0319760238129251,
"grad_norm": 1.8368195295333862,
"learning_rate": 7.105467737755896e-06,
"loss": 1.3631,
"step": 1371
},
{
"epoch": 0.03199934695210302,
"grad_norm": 2.073723077774048,
"learning_rate": 7.11065042757191e-06,
"loss": 1.4958,
"step": 1372
},
{
"epoch": 0.03202267009128094,
"grad_norm": 1.7000291347503662,
"learning_rate": 7.1158331173879244e-06,
"loss": 1.5018,
"step": 1373
},
{
"epoch": 0.032045993230458855,
"grad_norm": 1.896183729171753,
"learning_rate": 7.12101580720394e-06,
"loss": 1.4754,
"step": 1374
},
{
"epoch": 0.03206931636963677,
"grad_norm": 1.4250632524490356,
"learning_rate": 7.126198497019954e-06,
"loss": 1.2758,
"step": 1375
},
{
"epoch": 0.03209263950881469,
"grad_norm": 1.968647837638855,
"learning_rate": 7.131381186835968e-06,
"loss": 1.5062,
"step": 1376
},
{
"epoch": 0.03211596264799261,
"grad_norm": 1.5044890642166138,
"learning_rate": 7.136563876651983e-06,
"loss": 1.7057,
"step": 1377
},
{
"epoch": 0.03213928578717053,
"grad_norm": 1.5252755880355835,
"learning_rate": 7.141746566467998e-06,
"loss": 1.4311,
"step": 1378
},
{
"epoch": 0.032162608926348445,
"grad_norm": 1.7001562118530273,
"learning_rate": 7.146929256284012e-06,
"loss": 1.5573,
"step": 1379
},
{
"epoch": 0.03218593206552636,
"grad_norm": 2.1587064266204834,
"learning_rate": 7.1521119461000266e-06,
"loss": 1.1552,
"step": 1380
},
{
"epoch": 0.03220925520470428,
"grad_norm": 1.5938003063201904,
"learning_rate": 7.157294635916041e-06,
"loss": 1.3843,
"step": 1381
},
{
"epoch": 0.03223257834388219,
"grad_norm": 1.5198419094085693,
"learning_rate": 7.162477325732055e-06,
"loss": 1.4412,
"step": 1382
},
{
"epoch": 0.03225590148306011,
"grad_norm": 1.8579787015914917,
"learning_rate": 7.1676600155480705e-06,
"loss": 1.2986,
"step": 1383
},
{
"epoch": 0.03227922462223803,
"grad_norm": 1.5341622829437256,
"learning_rate": 7.172842705364085e-06,
"loss": 1.2032,
"step": 1384
},
{
"epoch": 0.032302547761415945,
"grad_norm": 2.0681440830230713,
"learning_rate": 7.178025395180099e-06,
"loss": 1.7171,
"step": 1385
},
{
"epoch": 0.03232587090059386,
"grad_norm": 1.7611883878707886,
"learning_rate": 7.1832080849961135e-06,
"loss": 1.3376,
"step": 1386
},
{
"epoch": 0.03234919403977178,
"grad_norm": 1.6917016506195068,
"learning_rate": 7.188390774812129e-06,
"loss": 1.3909,
"step": 1387
},
{
"epoch": 0.0323725171789497,
"grad_norm": 1.1238902807235718,
"learning_rate": 7.193573464628143e-06,
"loss": 1.1826,
"step": 1388
},
{
"epoch": 0.03239584031812762,
"grad_norm": 1.5484822988510132,
"learning_rate": 7.198756154444157e-06,
"loss": 1.4476,
"step": 1389
},
{
"epoch": 0.032419163457305535,
"grad_norm": 1.703244686126709,
"learning_rate": 7.203938844260172e-06,
"loss": 1.5256,
"step": 1390
},
{
"epoch": 0.03244248659648345,
"grad_norm": 2.350940465927124,
"learning_rate": 7.209121534076187e-06,
"loss": 1.4486,
"step": 1391
},
{
"epoch": 0.03246580973566137,
"grad_norm": 1.2115894556045532,
"learning_rate": 7.2143042238922004e-06,
"loss": 1.2387,
"step": 1392
},
{
"epoch": 0.03248913287483929,
"grad_norm": 1.4883688688278198,
"learning_rate": 7.219486913708215e-06,
"loss": 1.4499,
"step": 1393
},
{
"epoch": 0.032512456014017206,
"grad_norm": 1.2324401140213013,
"learning_rate": 7.224669603524229e-06,
"loss": 1.3548,
"step": 1394
},
{
"epoch": 0.032535779153195124,
"grad_norm": 2.054262638092041,
"learning_rate": 7.2298522933402435e-06,
"loss": 1.4986,
"step": 1395
},
{
"epoch": 0.03255910229237304,
"grad_norm": 1.7639497518539429,
"learning_rate": 7.235034983156259e-06,
"loss": 1.4023,
"step": 1396
},
{
"epoch": 0.03258242543155096,
"grad_norm": 1.3556314706802368,
"learning_rate": 7.240217672972273e-06,
"loss": 1.4122,
"step": 1397
},
{
"epoch": 0.03260574857072888,
"grad_norm": 1.8941506147384644,
"learning_rate": 7.245400362788287e-06,
"loss": 1.1754,
"step": 1398
},
{
"epoch": 0.032629071709906796,
"grad_norm": 1.7958110570907593,
"learning_rate": 7.250583052604302e-06,
"loss": 1.9056,
"step": 1399
},
{
"epoch": 0.032652394849084714,
"grad_norm": 1.3702186346054077,
"learning_rate": 7.255765742420317e-06,
"loss": 1.5533,
"step": 1400
},
{
"epoch": 0.03267571798826263,
"grad_norm": 1.4540181159973145,
"learning_rate": 7.260948432236331e-06,
"loss": 1.4704,
"step": 1401
},
{
"epoch": 0.03269904112744055,
"grad_norm": 1.6024681329727173,
"learning_rate": 7.266131122052346e-06,
"loss": 1.4394,
"step": 1402
},
{
"epoch": 0.03272236426661847,
"grad_norm": 1.5546940565109253,
"learning_rate": 7.27131381186836e-06,
"loss": 1.54,
"step": 1403
},
{
"epoch": 0.032745687405796385,
"grad_norm": 1.5781769752502441,
"learning_rate": 7.276496501684375e-06,
"loss": 1.3658,
"step": 1404
},
{
"epoch": 0.0327690105449743,
"grad_norm": 1.4951281547546387,
"learning_rate": 7.2816791915003895e-06,
"loss": 1.3768,
"step": 1405
},
{
"epoch": 0.03279233368415222,
"grad_norm": 1.9413893222808838,
"learning_rate": 7.286861881316404e-06,
"loss": 1.3878,
"step": 1406
},
{
"epoch": 0.03281565682333014,
"grad_norm": 1.6263363361358643,
"learning_rate": 7.292044571132418e-06,
"loss": 1.4236,
"step": 1407
},
{
"epoch": 0.03283897996250806,
"grad_norm": 2.2151589393615723,
"learning_rate": 7.2972272609484325e-06,
"loss": 1.7296,
"step": 1408
},
{
"epoch": 0.032862303101685975,
"grad_norm": 1.3772640228271484,
"learning_rate": 7.302409950764448e-06,
"loss": 1.292,
"step": 1409
},
{
"epoch": 0.03288562624086389,
"grad_norm": 1.7607418298721313,
"learning_rate": 7.307592640580462e-06,
"loss": 1.6019,
"step": 1410
},
{
"epoch": 0.0329089493800418,
"grad_norm": 1.9470393657684326,
"learning_rate": 7.312775330396476e-06,
"loss": 1.3396,
"step": 1411
},
{
"epoch": 0.03293227251921972,
"grad_norm": 2.021190881729126,
"learning_rate": 7.317958020212491e-06,
"loss": 1.6207,
"step": 1412
},
{
"epoch": 0.03295559565839764,
"grad_norm": 1.7311667203903198,
"learning_rate": 7.323140710028506e-06,
"loss": 1.6409,
"step": 1413
},
{
"epoch": 0.03297891879757556,
"grad_norm": 1.6784627437591553,
"learning_rate": 7.32832339984452e-06,
"loss": 1.5595,
"step": 1414
},
{
"epoch": 0.033002241936753475,
"grad_norm": 1.517193078994751,
"learning_rate": 7.333506089660535e-06,
"loss": 1.574,
"step": 1415
},
{
"epoch": 0.03302556507593139,
"grad_norm": 1.4831286668777466,
"learning_rate": 7.338688779476549e-06,
"loss": 0.8727,
"step": 1416
},
{
"epoch": 0.03304888821510931,
"grad_norm": 1.6477752923965454,
"learning_rate": 7.343871469292564e-06,
"loss": 1.559,
"step": 1417
},
{
"epoch": 0.03307221135428723,
"grad_norm": 1.853326678276062,
"learning_rate": 7.3490541591085785e-06,
"loss": 1.8523,
"step": 1418
},
{
"epoch": 0.03309553449346515,
"grad_norm": 1.6894885301589966,
"learning_rate": 7.354236848924593e-06,
"loss": 1.5844,
"step": 1419
},
{
"epoch": 0.033118857632643064,
"grad_norm": 1.6442736387252808,
"learning_rate": 7.359419538740606e-06,
"loss": 1.986,
"step": 1420
},
{
"epoch": 0.03314218077182098,
"grad_norm": 1.787266731262207,
"learning_rate": 7.364602228556621e-06,
"loss": 1.4822,
"step": 1421
},
{
"epoch": 0.0331655039109989,
"grad_norm": 2.073798418045044,
"learning_rate": 7.369784918372636e-06,
"loss": 1.637,
"step": 1422
},
{
"epoch": 0.03318882705017682,
"grad_norm": 1.3428417444229126,
"learning_rate": 7.37496760818865e-06,
"loss": 1.5598,
"step": 1423
},
{
"epoch": 0.033212150189354736,
"grad_norm": 1.5737829208374023,
"learning_rate": 7.380150298004665e-06,
"loss": 1.2274,
"step": 1424
},
{
"epoch": 0.033235473328532654,
"grad_norm": 2.1165404319763184,
"learning_rate": 7.385332987820679e-06,
"loss": 1.4134,
"step": 1425
},
{
"epoch": 0.03325879646771057,
"grad_norm": 1.5476047992706299,
"learning_rate": 7.390515677636694e-06,
"loss": 1.6364,
"step": 1426
},
{
"epoch": 0.03328211960688849,
"grad_norm": 1.6927748918533325,
"learning_rate": 7.3956983674527085e-06,
"loss": 1.6977,
"step": 1427
},
{
"epoch": 0.03330544274606641,
"grad_norm": 1.4677228927612305,
"learning_rate": 7.400881057268723e-06,
"loss": 1.4168,
"step": 1428
},
{
"epoch": 0.033328765885244326,
"grad_norm": 1.5205353498458862,
"learning_rate": 7.406063747084737e-06,
"loss": 1.2843,
"step": 1429
},
{
"epoch": 0.03335208902442224,
"grad_norm": 1.5447300672531128,
"learning_rate": 7.411246436900752e-06,
"loss": 1.5689,
"step": 1430
},
{
"epoch": 0.03337541216360016,
"grad_norm": 1.63996160030365,
"learning_rate": 7.416429126716767e-06,
"loss": 1.5884,
"step": 1431
},
{
"epoch": 0.03339873530277808,
"grad_norm": 1.452081322669983,
"learning_rate": 7.421611816532781e-06,
"loss": 1.3101,
"step": 1432
},
{
"epoch": 0.033422058441956,
"grad_norm": 1.7910422086715698,
"learning_rate": 7.426794506348795e-06,
"loss": 1.4715,
"step": 1433
},
{
"epoch": 0.033445381581133915,
"grad_norm": 1.983233094215393,
"learning_rate": 7.43197719616481e-06,
"loss": 1.5482,
"step": 1434
},
{
"epoch": 0.03346870472031183,
"grad_norm": 1.767785906791687,
"learning_rate": 7.437159885980825e-06,
"loss": 1.6462,
"step": 1435
},
{
"epoch": 0.03349202785948975,
"grad_norm": 1.6161593198776245,
"learning_rate": 7.442342575796839e-06,
"loss": 1.279,
"step": 1436
},
{
"epoch": 0.03351535099866767,
"grad_norm": 1.4756333827972412,
"learning_rate": 7.447525265612854e-06,
"loss": 1.5475,
"step": 1437
},
{
"epoch": 0.03353867413784559,
"grad_norm": 1.8089308738708496,
"learning_rate": 7.452707955428868e-06,
"loss": 1.8059,
"step": 1438
},
{
"epoch": 0.0335619972770235,
"grad_norm": 1.6815400123596191,
"learning_rate": 7.457890645244883e-06,
"loss": 1.7294,
"step": 1439
},
{
"epoch": 0.033585320416201415,
"grad_norm": 2.2101638317108154,
"learning_rate": 7.4630733350608975e-06,
"loss": 1.1257,
"step": 1440
},
{
"epoch": 0.03360864355537933,
"grad_norm": 1.4447871446609497,
"learning_rate": 7.468256024876912e-06,
"loss": 1.5934,
"step": 1441
},
{
"epoch": 0.03363196669455725,
"grad_norm": 1.8209795951843262,
"learning_rate": 7.473438714692926e-06,
"loss": 1.4218,
"step": 1442
},
{
"epoch": 0.03365528983373517,
"grad_norm": 1.4553669691085815,
"learning_rate": 7.4786214045089414e-06,
"loss": 1.2059,
"step": 1443
},
{
"epoch": 0.03367861297291309,
"grad_norm": 1.7106033563613892,
"learning_rate": 7.483804094324956e-06,
"loss": 1.1671,
"step": 1444
},
{
"epoch": 0.033701936112091005,
"grad_norm": 1.3894087076187134,
"learning_rate": 7.48898678414097e-06,
"loss": 1.4522,
"step": 1445
},
{
"epoch": 0.03372525925126892,
"grad_norm": 1.1842654943466187,
"learning_rate": 7.4941694739569845e-06,
"loss": 1.4706,
"step": 1446
},
{
"epoch": 0.03374858239044684,
"grad_norm": 2.5644612312316895,
"learning_rate": 7.499352163773e-06,
"loss": 1.6062,
"step": 1447
},
{
"epoch": 0.03377190552962476,
"grad_norm": 1.5129215717315674,
"learning_rate": 7.504534853589013e-06,
"loss": 1.2178,
"step": 1448
},
{
"epoch": 0.033795228668802677,
"grad_norm": 1.7350616455078125,
"learning_rate": 7.5097175434050275e-06,
"loss": 1.7579,
"step": 1449
},
{
"epoch": 0.033818551807980594,
"grad_norm": 2.163621187210083,
"learning_rate": 7.514900233221042e-06,
"loss": 1.6504,
"step": 1450
},
{
"epoch": 0.03384187494715851,
"grad_norm": 1.946423888206482,
"learning_rate": 7.520082923037056e-06,
"loss": 1.6524,
"step": 1451
},
{
"epoch": 0.03386519808633643,
"grad_norm": 1.766641616821289,
"learning_rate": 7.525265612853071e-06,
"loss": 1.1683,
"step": 1452
},
{
"epoch": 0.03388852122551435,
"grad_norm": 1.928938627243042,
"learning_rate": 7.530448302669086e-06,
"loss": 1.4919,
"step": 1453
},
{
"epoch": 0.033911844364692266,
"grad_norm": 1.5574640035629272,
"learning_rate": 7.5356309924851e-06,
"loss": 1.3775,
"step": 1454
},
{
"epoch": 0.033935167503870184,
"grad_norm": 1.6000114679336548,
"learning_rate": 7.5408136823011144e-06,
"loss": 1.8033,
"step": 1455
},
{
"epoch": 0.0339584906430481,
"grad_norm": 1.4576321840286255,
"learning_rate": 7.545996372117129e-06,
"loss": 1.6291,
"step": 1456
},
{
"epoch": 0.03398181378222602,
"grad_norm": 1.67397940158844,
"learning_rate": 7.551179061933144e-06,
"loss": 1.501,
"step": 1457
},
{
"epoch": 0.03400513692140394,
"grad_norm": 1.6351300477981567,
"learning_rate": 7.556361751749158e-06,
"loss": 1.4177,
"step": 1458
},
{
"epoch": 0.034028460060581855,
"grad_norm": 1.806840181350708,
"learning_rate": 7.561544441565173e-06,
"loss": 1.2173,
"step": 1459
},
{
"epoch": 0.03405178319975977,
"grad_norm": 2.1059956550598145,
"learning_rate": 7.566727131381187e-06,
"loss": 1.2487,
"step": 1460
},
{
"epoch": 0.03407510633893769,
"grad_norm": 1.5448449850082397,
"learning_rate": 7.571909821197202e-06,
"loss": 1.4264,
"step": 1461
},
{
"epoch": 0.03409842947811561,
"grad_norm": 2.8610997200012207,
"learning_rate": 7.5770925110132166e-06,
"loss": 1.3305,
"step": 1462
},
{
"epoch": 0.03412175261729353,
"grad_norm": 1.7565038204193115,
"learning_rate": 7.582275200829231e-06,
"loss": 1.6971,
"step": 1463
},
{
"epoch": 0.034145075756471445,
"grad_norm": 1.5691516399383545,
"learning_rate": 7.587457890645245e-06,
"loss": 1.6759,
"step": 1464
},
{
"epoch": 0.03416839889564936,
"grad_norm": 1.4603890180587769,
"learning_rate": 7.5926405804612605e-06,
"loss": 1.6264,
"step": 1465
},
{
"epoch": 0.03419172203482728,
"grad_norm": 1.5885038375854492,
"learning_rate": 7.597823270277275e-06,
"loss": 1.124,
"step": 1466
},
{
"epoch": 0.0342150451740052,
"grad_norm": 1.4058237075805664,
"learning_rate": 7.603005960093289e-06,
"loss": 1.4257,
"step": 1467
},
{
"epoch": 0.03423836831318311,
"grad_norm": 1.552217721939087,
"learning_rate": 7.6081886499093035e-06,
"loss": 1.2563,
"step": 1468
},
{
"epoch": 0.03426169145236103,
"grad_norm": 2.235629081726074,
"learning_rate": 7.613371339725319e-06,
"loss": 1.8083,
"step": 1469
},
{
"epoch": 0.034285014591538945,
"grad_norm": 1.8639624118804932,
"learning_rate": 7.618554029541333e-06,
"loss": 1.5186,
"step": 1470
},
{
"epoch": 0.03430833773071686,
"grad_norm": 2.1537373065948486,
"learning_rate": 7.623736719357347e-06,
"loss": 1.3531,
"step": 1471
},
{
"epoch": 0.03433166086989478,
"grad_norm": 1.9041272401809692,
"learning_rate": 7.628919409173362e-06,
"loss": 1.4382,
"step": 1472
},
{
"epoch": 0.0343549840090727,
"grad_norm": 1.5207409858703613,
"learning_rate": 7.634102098989377e-06,
"loss": 1.349,
"step": 1473
},
{
"epoch": 0.03437830714825062,
"grad_norm": 1.446553349494934,
"learning_rate": 7.639284788805391e-06,
"loss": 1.1513,
"step": 1474
},
{
"epoch": 0.034401630287428535,
"grad_norm": 1.5411823987960815,
"learning_rate": 7.644467478621404e-06,
"loss": 1.2673,
"step": 1475
},
{
"epoch": 0.03442495342660645,
"grad_norm": 1.588210105895996,
"learning_rate": 7.64965016843742e-06,
"loss": 1.3414,
"step": 1476
},
{
"epoch": 0.03444827656578437,
"grad_norm": 1.4371896982192993,
"learning_rate": 7.654832858253434e-06,
"loss": 1.3386,
"step": 1477
},
{
"epoch": 0.03447159970496229,
"grad_norm": 1.2713488340377808,
"learning_rate": 7.660015548069449e-06,
"loss": 1.3674,
"step": 1478
},
{
"epoch": 0.034494922844140206,
"grad_norm": 1.9180690050125122,
"learning_rate": 7.665198237885463e-06,
"loss": 1.3761,
"step": 1479
},
{
"epoch": 0.034518245983318124,
"grad_norm": 1.7977988719940186,
"learning_rate": 7.670380927701477e-06,
"loss": 1.5416,
"step": 1480
},
{
"epoch": 0.03454156912249604,
"grad_norm": 1.6764715909957886,
"learning_rate": 7.675563617517492e-06,
"loss": 1.9225,
"step": 1481
},
{
"epoch": 0.03456489226167396,
"grad_norm": 1.8952007293701172,
"learning_rate": 7.680746307333506e-06,
"loss": 1.545,
"step": 1482
},
{
"epoch": 0.03458821540085188,
"grad_norm": 1.2648754119873047,
"learning_rate": 7.68592899714952e-06,
"loss": 1.3556,
"step": 1483
},
{
"epoch": 0.034611538540029796,
"grad_norm": 1.5882269144058228,
"learning_rate": 7.691111686965535e-06,
"loss": 1.4676,
"step": 1484
},
{
"epoch": 0.034634861679207714,
"grad_norm": 1.4746918678283691,
"learning_rate": 7.69629437678155e-06,
"loss": 1.5869,
"step": 1485
},
{
"epoch": 0.03465818481838563,
"grad_norm": 1.6212809085845947,
"learning_rate": 7.701477066597565e-06,
"loss": 1.7056,
"step": 1486
},
{
"epoch": 0.03468150795756355,
"grad_norm": 2.3746814727783203,
"learning_rate": 7.70665975641358e-06,
"loss": 1.4726,
"step": 1487
},
{
"epoch": 0.03470483109674147,
"grad_norm": 1.5706418752670288,
"learning_rate": 7.711842446229594e-06,
"loss": 1.3319,
"step": 1488
},
{
"epoch": 0.034728154235919385,
"grad_norm": 1.6811712980270386,
"learning_rate": 7.717025136045608e-06,
"loss": 1.4744,
"step": 1489
},
{
"epoch": 0.0347514773750973,
"grad_norm": 1.839852213859558,
"learning_rate": 7.722207825861623e-06,
"loss": 1.3563,
"step": 1490
},
{
"epoch": 0.03477480051427522,
"grad_norm": 1.2929447889328003,
"learning_rate": 7.727390515677637e-06,
"loss": 1.7944,
"step": 1491
},
{
"epoch": 0.03479812365345314,
"grad_norm": 1.7659885883331299,
"learning_rate": 7.732573205493651e-06,
"loss": 1.6265,
"step": 1492
},
{
"epoch": 0.03482144679263106,
"grad_norm": 1.7670022249221802,
"learning_rate": 7.737755895309667e-06,
"loss": 1.6311,
"step": 1493
},
{
"epoch": 0.034844769931808975,
"grad_norm": 1.7348347902297974,
"learning_rate": 7.742938585125682e-06,
"loss": 1.4573,
"step": 1494
},
{
"epoch": 0.03486809307098689,
"grad_norm": 1.5826637744903564,
"learning_rate": 7.748121274941696e-06,
"loss": 1.8183,
"step": 1495
},
{
"epoch": 0.034891416210164804,
"grad_norm": 1.6276066303253174,
"learning_rate": 7.75330396475771e-06,
"loss": 1.5105,
"step": 1496
},
{
"epoch": 0.03491473934934272,
"grad_norm": 1.4175602197647095,
"learning_rate": 7.758486654573725e-06,
"loss": 1.4397,
"step": 1497
},
{
"epoch": 0.03493806248852064,
"grad_norm": 1.2575039863586426,
"learning_rate": 7.763669344389739e-06,
"loss": 1.3638,
"step": 1498
},
{
"epoch": 0.03496138562769856,
"grad_norm": 1.591441035270691,
"learning_rate": 7.768852034205753e-06,
"loss": 1.2515,
"step": 1499
},
{
"epoch": 0.034984708766876475,
"grad_norm": 1.8170280456542969,
"learning_rate": 7.774034724021768e-06,
"loss": 1.6124,
"step": 1500
},
{
"epoch": 0.03500803190605439,
"grad_norm": 1.825690507888794,
"learning_rate": 7.779217413837784e-06,
"loss": 1.5076,
"step": 1501
},
{
"epoch": 0.03503135504523231,
"grad_norm": 1.61045241355896,
"learning_rate": 7.784400103653798e-06,
"loss": 1.5944,
"step": 1502
},
{
"epoch": 0.03505467818441023,
"grad_norm": 2.1213035583496094,
"learning_rate": 7.78958279346981e-06,
"loss": 1.561,
"step": 1503
},
{
"epoch": 0.03507800132358815,
"grad_norm": 1.5680464506149292,
"learning_rate": 7.794765483285825e-06,
"loss": 1.1515,
"step": 1504
},
{
"epoch": 0.035101324462766065,
"grad_norm": 1.7792956829071045,
"learning_rate": 7.79994817310184e-06,
"loss": 1.7459,
"step": 1505
},
{
"epoch": 0.03512464760194398,
"grad_norm": 1.5262699127197266,
"learning_rate": 7.805130862917854e-06,
"loss": 1.4087,
"step": 1506
},
{
"epoch": 0.0351479707411219,
"grad_norm": 1.9013603925704956,
"learning_rate": 7.81031355273387e-06,
"loss": 1.745,
"step": 1507
},
{
"epoch": 0.03517129388029982,
"grad_norm": 2.1864850521087646,
"learning_rate": 7.815496242549884e-06,
"loss": 1.6892,
"step": 1508
},
{
"epoch": 0.035194617019477736,
"grad_norm": 1.6094999313354492,
"learning_rate": 7.820678932365898e-06,
"loss": 1.1677,
"step": 1509
},
{
"epoch": 0.035217940158655654,
"grad_norm": 1.6659038066864014,
"learning_rate": 7.825861622181913e-06,
"loss": 1.3676,
"step": 1510
},
{
"epoch": 0.03524126329783357,
"grad_norm": 1.5591635704040527,
"learning_rate": 7.831044311997927e-06,
"loss": 1.3353,
"step": 1511
},
{
"epoch": 0.03526458643701149,
"grad_norm": 1.6324151754379272,
"learning_rate": 7.836227001813942e-06,
"loss": 1.5816,
"step": 1512
},
{
"epoch": 0.03528790957618941,
"grad_norm": 1.8007915019989014,
"learning_rate": 7.841409691629956e-06,
"loss": 1.9207,
"step": 1513
},
{
"epoch": 0.035311232715367326,
"grad_norm": 1.6061041355133057,
"learning_rate": 7.84659238144597e-06,
"loss": 1.6949,
"step": 1514
},
{
"epoch": 0.035334555854545244,
"grad_norm": 1.5150330066680908,
"learning_rate": 7.851775071261986e-06,
"loss": 1.7975,
"step": 1515
},
{
"epoch": 0.03535787899372316,
"grad_norm": 1.7966561317443848,
"learning_rate": 7.856957761078e-06,
"loss": 1.3558,
"step": 1516
},
{
"epoch": 0.03538120213290108,
"grad_norm": 1.6751410961151123,
"learning_rate": 7.862140450894015e-06,
"loss": 1.3387,
"step": 1517
},
{
"epoch": 0.035404525272079,
"grad_norm": 1.7746779918670654,
"learning_rate": 7.86732314071003e-06,
"loss": 1.8068,
"step": 1518
},
{
"epoch": 0.035427848411256915,
"grad_norm": 1.4943839311599731,
"learning_rate": 7.872505830526044e-06,
"loss": 1.4922,
"step": 1519
},
{
"epoch": 0.03545117155043483,
"grad_norm": 1.3683398962020874,
"learning_rate": 7.877688520342058e-06,
"loss": 1.382,
"step": 1520
},
{
"epoch": 0.03547449468961275,
"grad_norm": 1.6939599514007568,
"learning_rate": 7.882871210158072e-06,
"loss": 1.6179,
"step": 1521
},
{
"epoch": 0.03549781782879067,
"grad_norm": 1.4292916059494019,
"learning_rate": 7.888053899974087e-06,
"loss": 1.4422,
"step": 1522
},
{
"epoch": 0.03552114096796859,
"grad_norm": 1.96234929561615,
"learning_rate": 7.893236589790103e-06,
"loss": 1.1728,
"step": 1523
},
{
"epoch": 0.035544464107146505,
"grad_norm": 1.8289707899093628,
"learning_rate": 7.898419279606117e-06,
"loss": 1.4281,
"step": 1524
},
{
"epoch": 0.035567787246324416,
"grad_norm": 1.563638687133789,
"learning_rate": 7.903601969422131e-06,
"loss": 1.441,
"step": 1525
},
{
"epoch": 0.035591110385502334,
"grad_norm": 1.7753417491912842,
"learning_rate": 7.908784659238146e-06,
"loss": 1.5371,
"step": 1526
},
{
"epoch": 0.03561443352468025,
"grad_norm": 1.4442288875579834,
"learning_rate": 7.91396734905416e-06,
"loss": 1.21,
"step": 1527
},
{
"epoch": 0.03563775666385817,
"grad_norm": 1.5175955295562744,
"learning_rate": 7.919150038870174e-06,
"loss": 1.2075,
"step": 1528
},
{
"epoch": 0.03566107980303609,
"grad_norm": 1.6752229928970337,
"learning_rate": 7.924332728686189e-06,
"loss": 1.2919,
"step": 1529
},
{
"epoch": 0.035684402942214005,
"grad_norm": 1.7506253719329834,
"learning_rate": 7.929515418502203e-06,
"loss": 1.5369,
"step": 1530
},
{
"epoch": 0.03570772608139192,
"grad_norm": 1.9442663192749023,
"learning_rate": 7.934698108318218e-06,
"loss": 1.6756,
"step": 1531
},
{
"epoch": 0.03573104922056984,
"grad_norm": 1.658495545387268,
"learning_rate": 7.939880798134232e-06,
"loss": 1.2362,
"step": 1532
},
{
"epoch": 0.03575437235974776,
"grad_norm": 1.2289533615112305,
"learning_rate": 7.945063487950246e-06,
"loss": 1.5051,
"step": 1533
},
{
"epoch": 0.03577769549892568,
"grad_norm": 1.5502135753631592,
"learning_rate": 7.95024617776626e-06,
"loss": 1.3702,
"step": 1534
},
{
"epoch": 0.035801018638103595,
"grad_norm": 1.8727954626083374,
"learning_rate": 7.955428867582275e-06,
"loss": 1.417,
"step": 1535
},
{
"epoch": 0.03582434177728151,
"grad_norm": 1.1890602111816406,
"learning_rate": 7.96061155739829e-06,
"loss": 1.1737,
"step": 1536
},
{
"epoch": 0.03584766491645943,
"grad_norm": 5.72725772857666,
"learning_rate": 7.965794247214305e-06,
"loss": 1.3097,
"step": 1537
},
{
"epoch": 0.03587098805563735,
"grad_norm": 1.2847952842712402,
"learning_rate": 7.97097693703032e-06,
"loss": 1.456,
"step": 1538
},
{
"epoch": 0.035894311194815266,
"grad_norm": 2.3652467727661133,
"learning_rate": 7.976159626846334e-06,
"loss": 1.7498,
"step": 1539
},
{
"epoch": 0.035917634333993184,
"grad_norm": 2.2748360633850098,
"learning_rate": 7.981342316662348e-06,
"loss": 1.4181,
"step": 1540
},
{
"epoch": 0.0359409574731711,
"grad_norm": 1.9288114309310913,
"learning_rate": 7.986525006478363e-06,
"loss": 1.4505,
"step": 1541
},
{
"epoch": 0.03596428061234902,
"grad_norm": 1.9735311269760132,
"learning_rate": 7.991707696294377e-06,
"loss": 1.5196,
"step": 1542
},
{
"epoch": 0.03598760375152694,
"grad_norm": 1.5026898384094238,
"learning_rate": 7.996890386110391e-06,
"loss": 1.2868,
"step": 1543
},
{
"epoch": 0.036010926890704856,
"grad_norm": 1.4773675203323364,
"learning_rate": 8.002073075926406e-06,
"loss": 1.3777,
"step": 1544
},
{
"epoch": 0.036034250029882774,
"grad_norm": 1.7095143795013428,
"learning_rate": 8.007255765742422e-06,
"loss": 1.2692,
"step": 1545
},
{
"epoch": 0.03605757316906069,
"grad_norm": 1.7218233346939087,
"learning_rate": 8.012438455558436e-06,
"loss": 1.4015,
"step": 1546
},
{
"epoch": 0.03608089630823861,
"grad_norm": 1.5240681171417236,
"learning_rate": 8.01762114537445e-06,
"loss": 1.5267,
"step": 1547
},
{
"epoch": 0.03610421944741653,
"grad_norm": 1.9092682600021362,
"learning_rate": 8.022803835190465e-06,
"loss": 1.2564,
"step": 1548
},
{
"epoch": 0.036127542586594445,
"grad_norm": 1.844650149345398,
"learning_rate": 8.027986525006479e-06,
"loss": 1.5158,
"step": 1549
},
{
"epoch": 0.03615086572577236,
"grad_norm": 1.5689501762390137,
"learning_rate": 8.033169214822493e-06,
"loss": 1.5708,
"step": 1550
},
{
"epoch": 0.03617418886495028,
"grad_norm": 2.210259437561035,
"learning_rate": 8.038351904638508e-06,
"loss": 1.5915,
"step": 1551
},
{
"epoch": 0.0361975120041282,
"grad_norm": 1.4000816345214844,
"learning_rate": 8.043534594454522e-06,
"loss": 1.2189,
"step": 1552
},
{
"epoch": 0.03622083514330611,
"grad_norm": 1.4790806770324707,
"learning_rate": 8.048717284270538e-06,
"loss": 1.3637,
"step": 1553
},
{
"epoch": 0.03624415828248403,
"grad_norm": 1.9432685375213623,
"learning_rate": 8.053899974086553e-06,
"loss": 1.4459,
"step": 1554
},
{
"epoch": 0.036267481421661946,
"grad_norm": 1.9427974224090576,
"learning_rate": 8.059082663902567e-06,
"loss": 1.8405,
"step": 1555
},
{
"epoch": 0.036290804560839864,
"grad_norm": 1.6169490814208984,
"learning_rate": 8.064265353718581e-06,
"loss": 1.5894,
"step": 1556
},
{
"epoch": 0.03631412770001778,
"grad_norm": 2.189110517501831,
"learning_rate": 8.069448043534596e-06,
"loss": 1.4458,
"step": 1557
},
{
"epoch": 0.0363374508391957,
"grad_norm": 1.6950788497924805,
"learning_rate": 8.074630733350608e-06,
"loss": 1.2485,
"step": 1558
},
{
"epoch": 0.03636077397837362,
"grad_norm": 1.5580222606658936,
"learning_rate": 8.079813423166624e-06,
"loss": 1.3971,
"step": 1559
},
{
"epoch": 0.036384097117551535,
"grad_norm": 1.68899405002594,
"learning_rate": 8.084996112982639e-06,
"loss": 1.5722,
"step": 1560
},
{
"epoch": 0.03640742025672945,
"grad_norm": 1.6812056303024292,
"learning_rate": 8.090178802798653e-06,
"loss": 1.8336,
"step": 1561
},
{
"epoch": 0.03643074339590737,
"grad_norm": 2.962195634841919,
"learning_rate": 8.095361492614667e-06,
"loss": 1.7488,
"step": 1562
},
{
"epoch": 0.03645406653508529,
"grad_norm": 1.6132487058639526,
"learning_rate": 8.100544182430682e-06,
"loss": 1.9727,
"step": 1563
},
{
"epoch": 0.03647738967426321,
"grad_norm": 1.6288578510284424,
"learning_rate": 8.105726872246696e-06,
"loss": 1.6962,
"step": 1564
},
{
"epoch": 0.036500712813441125,
"grad_norm": 1.5894676446914673,
"learning_rate": 8.11090956206271e-06,
"loss": 1.7313,
"step": 1565
},
{
"epoch": 0.03652403595261904,
"grad_norm": 1.702314019203186,
"learning_rate": 8.116092251878725e-06,
"loss": 1.5682,
"step": 1566
},
{
"epoch": 0.03654735909179696,
"grad_norm": 2.3464395999908447,
"learning_rate": 8.12127494169474e-06,
"loss": 1.1367,
"step": 1567
},
{
"epoch": 0.03657068223097488,
"grad_norm": 1.3930420875549316,
"learning_rate": 8.126457631510755e-06,
"loss": 1.2127,
"step": 1568
},
{
"epoch": 0.036594005370152796,
"grad_norm": 1.964519739151001,
"learning_rate": 8.13164032132677e-06,
"loss": 1.5458,
"step": 1569
},
{
"epoch": 0.036617328509330714,
"grad_norm": 1.7511687278747559,
"learning_rate": 8.136823011142784e-06,
"loss": 1.4957,
"step": 1570
},
{
"epoch": 0.03664065164850863,
"grad_norm": 1.403041958808899,
"learning_rate": 8.142005700958798e-06,
"loss": 1.5422,
"step": 1571
},
{
"epoch": 0.03666397478768655,
"grad_norm": 2.368617534637451,
"learning_rate": 8.147188390774813e-06,
"loss": 1.2203,
"step": 1572
},
{
"epoch": 0.03668729792686447,
"grad_norm": 1.7351584434509277,
"learning_rate": 8.152371080590827e-06,
"loss": 1.5534,
"step": 1573
},
{
"epoch": 0.036710621066042386,
"grad_norm": 1.7059663534164429,
"learning_rate": 8.157553770406841e-06,
"loss": 1.5554,
"step": 1574
},
{
"epoch": 0.036733944205220304,
"grad_norm": 1.9748015403747559,
"learning_rate": 8.162736460222857e-06,
"loss": 1.3837,
"step": 1575
},
{
"epoch": 0.03675726734439822,
"grad_norm": 1.7517926692962646,
"learning_rate": 8.167919150038872e-06,
"loss": 1.5008,
"step": 1576
},
{
"epoch": 0.03678059048357614,
"grad_norm": 2.074340343475342,
"learning_rate": 8.173101839854886e-06,
"loss": 1.1944,
"step": 1577
},
{
"epoch": 0.03680391362275406,
"grad_norm": 1.7943975925445557,
"learning_rate": 8.1782845296709e-06,
"loss": 1.5017,
"step": 1578
},
{
"epoch": 0.036827236761931975,
"grad_norm": 1.7202725410461426,
"learning_rate": 8.183467219486915e-06,
"loss": 1.3468,
"step": 1579
},
{
"epoch": 0.03685055990110989,
"grad_norm": 2.03446364402771,
"learning_rate": 8.188649909302929e-06,
"loss": 1.8081,
"step": 1580
},
{
"epoch": 0.03687388304028781,
"grad_norm": 1.8767874240875244,
"learning_rate": 8.193832599118943e-06,
"loss": 1.3877,
"step": 1581
},
{
"epoch": 0.03689720617946572,
"grad_norm": 1.4143779277801514,
"learning_rate": 8.199015288934958e-06,
"loss": 1.5551,
"step": 1582
},
{
"epoch": 0.03692052931864364,
"grad_norm": 1.4130569696426392,
"learning_rate": 8.204197978750974e-06,
"loss": 1.3058,
"step": 1583
},
{
"epoch": 0.03694385245782156,
"grad_norm": 1.4558956623077393,
"learning_rate": 8.209380668566988e-06,
"loss": 1.4228,
"step": 1584
},
{
"epoch": 0.036967175596999476,
"grad_norm": 2.6582729816436768,
"learning_rate": 8.214563358383e-06,
"loss": 1.5081,
"step": 1585
},
{
"epoch": 0.036990498736177393,
"grad_norm": 1.4754345417022705,
"learning_rate": 8.219746048199015e-06,
"loss": 1.5688,
"step": 1586
},
{
"epoch": 0.03701382187535531,
"grad_norm": 1.5351654291152954,
"learning_rate": 8.22492873801503e-06,
"loss": 1.8144,
"step": 1587
},
{
"epoch": 0.03703714501453323,
"grad_norm": 1.6197818517684937,
"learning_rate": 8.230111427831044e-06,
"loss": 1.5846,
"step": 1588
},
{
"epoch": 0.03706046815371115,
"grad_norm": 1.8108611106872559,
"learning_rate": 8.23529411764706e-06,
"loss": 1.3416,
"step": 1589
},
{
"epoch": 0.037083791292889065,
"grad_norm": 1.6245759725570679,
"learning_rate": 8.240476807463074e-06,
"loss": 1.4319,
"step": 1590
},
{
"epoch": 0.03710711443206698,
"grad_norm": 1.9573677778244019,
"learning_rate": 8.245659497279088e-06,
"loss": 1.4856,
"step": 1591
},
{
"epoch": 0.0371304375712449,
"grad_norm": 2.195033550262451,
"learning_rate": 8.250842187095103e-06,
"loss": 1.4553,
"step": 1592
},
{
"epoch": 0.03715376071042282,
"grad_norm": 1.7342851161956787,
"learning_rate": 8.256024876911117e-06,
"loss": 1.4633,
"step": 1593
},
{
"epoch": 0.03717708384960074,
"grad_norm": 1.499495506286621,
"learning_rate": 8.261207566727132e-06,
"loss": 1.6055,
"step": 1594
},
{
"epoch": 0.037200406988778655,
"grad_norm": 1.4192696809768677,
"learning_rate": 8.266390256543146e-06,
"loss": 1.3659,
"step": 1595
},
{
"epoch": 0.03722373012795657,
"grad_norm": 1.8910040855407715,
"learning_rate": 8.27157294635916e-06,
"loss": 1.703,
"step": 1596
},
{
"epoch": 0.03724705326713449,
"grad_norm": 1.4962915182113647,
"learning_rate": 8.276755636175176e-06,
"loss": 1.3212,
"step": 1597
},
{
"epoch": 0.03727037640631241,
"grad_norm": 2.1940252780914307,
"learning_rate": 8.28193832599119e-06,
"loss": 1.8816,
"step": 1598
},
{
"epoch": 0.037293699545490326,
"grad_norm": 1.415831208229065,
"learning_rate": 8.287121015807205e-06,
"loss": 1.229,
"step": 1599
},
{
"epoch": 0.037317022684668244,
"grad_norm": 1.5565876960754395,
"learning_rate": 8.29230370562322e-06,
"loss": 1.7503,
"step": 1600
},
{
"epoch": 0.03734034582384616,
"grad_norm": 2.6450204849243164,
"learning_rate": 8.297486395439234e-06,
"loss": 1.3618,
"step": 1601
},
{
"epoch": 0.03736366896302408,
"grad_norm": 1.5824869871139526,
"learning_rate": 8.302669085255248e-06,
"loss": 1.3587,
"step": 1602
},
{
"epoch": 0.037386992102202,
"grad_norm": 1.6635199785232544,
"learning_rate": 8.307851775071262e-06,
"loss": 1.7323,
"step": 1603
},
{
"epoch": 0.037410315241379916,
"grad_norm": 1.5391467809677124,
"learning_rate": 8.313034464887277e-06,
"loss": 1.7489,
"step": 1604
},
{
"epoch": 0.037433638380557833,
"grad_norm": 2.136975049972534,
"learning_rate": 8.318217154703293e-06,
"loss": 1.4696,
"step": 1605
},
{
"epoch": 0.03745696151973575,
"grad_norm": 1.4561282396316528,
"learning_rate": 8.323399844519307e-06,
"loss": 1.746,
"step": 1606
},
{
"epoch": 0.03748028465891367,
"grad_norm": 1.323926329612732,
"learning_rate": 8.328582534335321e-06,
"loss": 1.3654,
"step": 1607
},
{
"epoch": 0.03750360779809159,
"grad_norm": 1.6495275497436523,
"learning_rate": 8.333765224151336e-06,
"loss": 1.6208,
"step": 1608
},
{
"epoch": 0.037526930937269505,
"grad_norm": 1.4379764795303345,
"learning_rate": 8.33894791396735e-06,
"loss": 1.4988,
"step": 1609
},
{
"epoch": 0.037550254076447416,
"grad_norm": 1.676405668258667,
"learning_rate": 8.344130603783364e-06,
"loss": 1.5563,
"step": 1610
},
{
"epoch": 0.037573577215625334,
"grad_norm": 1.0886626243591309,
"learning_rate": 8.349313293599379e-06,
"loss": 1.2886,
"step": 1611
},
{
"epoch": 0.03759690035480325,
"grad_norm": 1.5499573945999146,
"learning_rate": 8.354495983415393e-06,
"loss": 1.6758,
"step": 1612
},
{
"epoch": 0.03762022349398117,
"grad_norm": 1.2256261110305786,
"learning_rate": 8.359678673231408e-06,
"loss": 1.1831,
"step": 1613
},
{
"epoch": 0.03764354663315909,
"grad_norm": 2.0278513431549072,
"learning_rate": 8.364861363047422e-06,
"loss": 1.5379,
"step": 1614
},
{
"epoch": 0.037666869772337006,
"grad_norm": 1.6582108736038208,
"learning_rate": 8.370044052863436e-06,
"loss": 1.4499,
"step": 1615
},
{
"epoch": 0.03769019291151492,
"grad_norm": 2.517474412918091,
"learning_rate": 8.37522674267945e-06,
"loss": 1.3365,
"step": 1616
},
{
"epoch": 0.03771351605069284,
"grad_norm": 1.651391863822937,
"learning_rate": 8.380409432495465e-06,
"loss": 1.5475,
"step": 1617
},
{
"epoch": 0.03773683918987076,
"grad_norm": 1.9716179370880127,
"learning_rate": 8.38559212231148e-06,
"loss": 1.4809,
"step": 1618
},
{
"epoch": 0.03776016232904868,
"grad_norm": 2.0555307865142822,
"learning_rate": 8.390774812127495e-06,
"loss": 1.7496,
"step": 1619
},
{
"epoch": 0.037783485468226595,
"grad_norm": 1.5695487260818481,
"learning_rate": 8.39595750194351e-06,
"loss": 1.3654,
"step": 1620
},
{
"epoch": 0.03780680860740451,
"grad_norm": 1.4122220277786255,
"learning_rate": 8.401140191759524e-06,
"loss": 1.7411,
"step": 1621
},
{
"epoch": 0.03783013174658243,
"grad_norm": 1.4024474620819092,
"learning_rate": 8.406322881575538e-06,
"loss": 1.3807,
"step": 1622
},
{
"epoch": 0.03785345488576035,
"grad_norm": 1.932897686958313,
"learning_rate": 8.411505571391553e-06,
"loss": 1.6327,
"step": 1623
},
{
"epoch": 0.03787677802493827,
"grad_norm": 1.3100526332855225,
"learning_rate": 8.416688261207567e-06,
"loss": 1.0531,
"step": 1624
},
{
"epoch": 0.037900101164116184,
"grad_norm": 1.5780110359191895,
"learning_rate": 8.421870951023581e-06,
"loss": 1.3187,
"step": 1625
},
{
"epoch": 0.0379234243032941,
"grad_norm": 1.905220866203308,
"learning_rate": 8.427053640839596e-06,
"loss": 1.5322,
"step": 1626
},
{
"epoch": 0.03794674744247202,
"grad_norm": 1.4416756629943848,
"learning_rate": 8.432236330655612e-06,
"loss": 1.3633,
"step": 1627
},
{
"epoch": 0.03797007058164994,
"grad_norm": 1.720937728881836,
"learning_rate": 8.437419020471626e-06,
"loss": 1.4178,
"step": 1628
},
{
"epoch": 0.037993393720827856,
"grad_norm": 1.891658902168274,
"learning_rate": 8.44260171028764e-06,
"loss": 1.5383,
"step": 1629
},
{
"epoch": 0.038016716860005774,
"grad_norm": 2.179572105407715,
"learning_rate": 8.447784400103655e-06,
"loss": 1.3399,
"step": 1630
},
{
"epoch": 0.03804003999918369,
"grad_norm": 1.4478271007537842,
"learning_rate": 8.452967089919669e-06,
"loss": 1.4122,
"step": 1631
},
{
"epoch": 0.03806336313836161,
"grad_norm": 1.4049443006515503,
"learning_rate": 8.458149779735683e-06,
"loss": 1.5164,
"step": 1632
},
{
"epoch": 0.03808668627753953,
"grad_norm": 2.1146810054779053,
"learning_rate": 8.463332469551698e-06,
"loss": 1.5005,
"step": 1633
},
{
"epoch": 0.038110009416717446,
"grad_norm": 1.8528714179992676,
"learning_rate": 8.468515159367712e-06,
"loss": 1.4598,
"step": 1634
},
{
"epoch": 0.03813333255589536,
"grad_norm": 2.274590492248535,
"learning_rate": 8.473697849183728e-06,
"loss": 1.4343,
"step": 1635
},
{
"epoch": 0.03815665569507328,
"grad_norm": 1.837266445159912,
"learning_rate": 8.478880538999743e-06,
"loss": 1.6039,
"step": 1636
},
{
"epoch": 0.0381799788342512,
"grad_norm": 1.735687494277954,
"learning_rate": 8.484063228815757e-06,
"loss": 1.3623,
"step": 1637
},
{
"epoch": 0.03820330197342912,
"grad_norm": 1.8133695125579834,
"learning_rate": 8.489245918631771e-06,
"loss": 1.2941,
"step": 1638
},
{
"epoch": 0.03822662511260703,
"grad_norm": 1.9450503587722778,
"learning_rate": 8.494428608447786e-06,
"loss": 1.4706,
"step": 1639
},
{
"epoch": 0.038249948251784946,
"grad_norm": 1.6004278659820557,
"learning_rate": 8.4996112982638e-06,
"loss": 1.4726,
"step": 1640
},
{
"epoch": 0.038273271390962864,
"grad_norm": 1.7052674293518066,
"learning_rate": 8.504793988079814e-06,
"loss": 1.5521,
"step": 1641
},
{
"epoch": 0.03829659453014078,
"grad_norm": 1.3694720268249512,
"learning_rate": 8.509976677895829e-06,
"loss": 1.1333,
"step": 1642
},
{
"epoch": 0.0383199176693187,
"grad_norm": 1.7958831787109375,
"learning_rate": 8.515159367711843e-06,
"loss": 1.6234,
"step": 1643
},
{
"epoch": 0.03834324080849662,
"grad_norm": 1.7349238395690918,
"learning_rate": 8.520342057527857e-06,
"loss": 1.3697,
"step": 1644
},
{
"epoch": 0.038366563947674535,
"grad_norm": 1.5960413217544556,
"learning_rate": 8.525524747343872e-06,
"loss": 1.4687,
"step": 1645
},
{
"epoch": 0.03838988708685245,
"grad_norm": 2.78328800201416,
"learning_rate": 8.530707437159886e-06,
"loss": 1.462,
"step": 1646
},
{
"epoch": 0.03841321022603037,
"grad_norm": 1.310705304145813,
"learning_rate": 8.5358901269759e-06,
"loss": 1.347,
"step": 1647
},
{
"epoch": 0.03843653336520829,
"grad_norm": 1.554968237876892,
"learning_rate": 8.541072816791915e-06,
"loss": 1.41,
"step": 1648
},
{
"epoch": 0.03845985650438621,
"grad_norm": 2.0181522369384766,
"learning_rate": 8.54625550660793e-06,
"loss": 1.3945,
"step": 1649
},
{
"epoch": 0.038483179643564125,
"grad_norm": 1.816375494003296,
"learning_rate": 8.551438196423945e-06,
"loss": 1.6109,
"step": 1650
},
{
"epoch": 0.03850650278274204,
"grad_norm": 2.1661388874053955,
"learning_rate": 8.55662088623996e-06,
"loss": 1.8344,
"step": 1651
},
{
"epoch": 0.03852982592191996,
"grad_norm": 1.9306049346923828,
"learning_rate": 8.561803576055974e-06,
"loss": 1.2227,
"step": 1652
},
{
"epoch": 0.03855314906109788,
"grad_norm": 1.3145751953125,
"learning_rate": 8.566986265871988e-06,
"loss": 1.3881,
"step": 1653
},
{
"epoch": 0.038576472200275796,
"grad_norm": 1.6416202783584595,
"learning_rate": 8.572168955688003e-06,
"loss": 1.2782,
"step": 1654
},
{
"epoch": 0.038599795339453714,
"grad_norm": 1.3195691108703613,
"learning_rate": 8.577351645504017e-06,
"loss": 1.5958,
"step": 1655
},
{
"epoch": 0.03862311847863163,
"grad_norm": 1.786651372909546,
"learning_rate": 8.582534335320031e-06,
"loss": 1.6379,
"step": 1656
},
{
"epoch": 0.03864644161780955,
"grad_norm": 1.685196876525879,
"learning_rate": 8.587717025136047e-06,
"loss": 1.2548,
"step": 1657
},
{
"epoch": 0.03866976475698747,
"grad_norm": 2.0508875846862793,
"learning_rate": 8.592899714952062e-06,
"loss": 1.635,
"step": 1658
},
{
"epoch": 0.038693087896165386,
"grad_norm": 1.7226320505142212,
"learning_rate": 8.598082404768076e-06,
"loss": 1.4694,
"step": 1659
},
{
"epoch": 0.038716411035343304,
"grad_norm": 1.5333112478256226,
"learning_rate": 8.60326509458409e-06,
"loss": 1.4825,
"step": 1660
},
{
"epoch": 0.03873973417452122,
"grad_norm": 1.4121674299240112,
"learning_rate": 8.608447784400105e-06,
"loss": 1.2056,
"step": 1661
},
{
"epoch": 0.03876305731369914,
"grad_norm": 1.6394184827804565,
"learning_rate": 8.613630474216119e-06,
"loss": 1.5131,
"step": 1662
},
{
"epoch": 0.03878638045287706,
"grad_norm": 2.2525839805603027,
"learning_rate": 8.618813164032133e-06,
"loss": 1.4413,
"step": 1663
},
{
"epoch": 0.038809703592054975,
"grad_norm": 1.6599324941635132,
"learning_rate": 8.623995853848148e-06,
"loss": 1.1568,
"step": 1664
},
{
"epoch": 0.03883302673123289,
"grad_norm": 1.930284857749939,
"learning_rate": 8.629178543664162e-06,
"loss": 1.2182,
"step": 1665
},
{
"epoch": 0.03885634987041081,
"grad_norm": 1.366219401359558,
"learning_rate": 8.634361233480178e-06,
"loss": 1.6951,
"step": 1666
},
{
"epoch": 0.03887967300958872,
"grad_norm": 1.8555302619934082,
"learning_rate": 8.639543923296192e-06,
"loss": 1.4508,
"step": 1667
},
{
"epoch": 0.03890299614876664,
"grad_norm": 2.110704183578491,
"learning_rate": 8.644726613112205e-06,
"loss": 1.5057,
"step": 1668
},
{
"epoch": 0.03892631928794456,
"grad_norm": 1.4422646760940552,
"learning_rate": 8.64990930292822e-06,
"loss": 1.5628,
"step": 1669
},
{
"epoch": 0.038949642427122476,
"grad_norm": 1.8097025156021118,
"learning_rate": 8.655091992744234e-06,
"loss": 1.5336,
"step": 1670
},
{
"epoch": 0.038972965566300394,
"grad_norm": 1.5321156978607178,
"learning_rate": 8.66027468256025e-06,
"loss": 1.4985,
"step": 1671
},
{
"epoch": 0.03899628870547831,
"grad_norm": 1.715100884437561,
"learning_rate": 8.665457372376264e-06,
"loss": 1.5365,
"step": 1672
},
{
"epoch": 0.03901961184465623,
"grad_norm": 1.7432835102081299,
"learning_rate": 8.670640062192278e-06,
"loss": 1.5822,
"step": 1673
},
{
"epoch": 0.03904293498383415,
"grad_norm": 1.7451759576797485,
"learning_rate": 8.675822752008293e-06,
"loss": 1.6363,
"step": 1674
},
{
"epoch": 0.039066258123012065,
"grad_norm": 1.6405068635940552,
"learning_rate": 8.681005441824307e-06,
"loss": 1.6819,
"step": 1675
},
{
"epoch": 0.03908958126218998,
"grad_norm": 1.7980347871780396,
"learning_rate": 8.686188131640322e-06,
"loss": 1.5362,
"step": 1676
},
{
"epoch": 0.0391129044013679,
"grad_norm": 1.6365665197372437,
"learning_rate": 8.691370821456336e-06,
"loss": 2.0277,
"step": 1677
},
{
"epoch": 0.03913622754054582,
"grad_norm": 1.9490535259246826,
"learning_rate": 8.69655351127235e-06,
"loss": 1.5076,
"step": 1678
},
{
"epoch": 0.03915955067972374,
"grad_norm": 1.4164410829544067,
"learning_rate": 8.701736201088366e-06,
"loss": 1.8005,
"step": 1679
},
{
"epoch": 0.039182873818901655,
"grad_norm": 1.4707103967666626,
"learning_rate": 8.70691889090438e-06,
"loss": 1.353,
"step": 1680
},
{
"epoch": 0.03920619695807957,
"grad_norm": 1.7562110424041748,
"learning_rate": 8.712101580720395e-06,
"loss": 1.5621,
"step": 1681
},
{
"epoch": 0.03922952009725749,
"grad_norm": 2.0748794078826904,
"learning_rate": 8.71728427053641e-06,
"loss": 1.4923,
"step": 1682
},
{
"epoch": 0.03925284323643541,
"grad_norm": 2.031003475189209,
"learning_rate": 8.722466960352424e-06,
"loss": 1.1706,
"step": 1683
},
{
"epoch": 0.039276166375613326,
"grad_norm": 2.4340038299560547,
"learning_rate": 8.727649650168438e-06,
"loss": 1.3371,
"step": 1684
},
{
"epoch": 0.039299489514791244,
"grad_norm": 2.129331111907959,
"learning_rate": 8.732832339984452e-06,
"loss": 1.558,
"step": 1685
},
{
"epoch": 0.03932281265396916,
"grad_norm": 1.907139778137207,
"learning_rate": 8.738015029800467e-06,
"loss": 1.6016,
"step": 1686
},
{
"epoch": 0.03934613579314708,
"grad_norm": 1.8079878091812134,
"learning_rate": 8.743197719616483e-06,
"loss": 1.4186,
"step": 1687
},
{
"epoch": 0.039369458932325,
"grad_norm": 1.9196524620056152,
"learning_rate": 8.748380409432497e-06,
"loss": 1.6435,
"step": 1688
},
{
"epoch": 0.039392782071502916,
"grad_norm": 1.5702369213104248,
"learning_rate": 8.753563099248511e-06,
"loss": 1.6279,
"step": 1689
},
{
"epoch": 0.039416105210680834,
"grad_norm": 1.8079639673233032,
"learning_rate": 8.758745789064526e-06,
"loss": 1.4299,
"step": 1690
},
{
"epoch": 0.03943942834985875,
"grad_norm": 1.5084450244903564,
"learning_rate": 8.76392847888054e-06,
"loss": 1.6051,
"step": 1691
},
{
"epoch": 0.03946275148903667,
"grad_norm": 1.8773257732391357,
"learning_rate": 8.769111168696554e-06,
"loss": 1.2258,
"step": 1692
},
{
"epoch": 0.03948607462821459,
"grad_norm": 1.662649154663086,
"learning_rate": 8.774293858512569e-06,
"loss": 1.5057,
"step": 1693
},
{
"epoch": 0.039509397767392505,
"grad_norm": 1.7742561101913452,
"learning_rate": 8.779476548328583e-06,
"loss": 1.5083,
"step": 1694
},
{
"epoch": 0.03953272090657042,
"grad_norm": 1.6094675064086914,
"learning_rate": 8.784659238144598e-06,
"loss": 1.4416,
"step": 1695
},
{
"epoch": 0.039556044045748334,
"grad_norm": 1.7892067432403564,
"learning_rate": 8.789841927960612e-06,
"loss": 1.6939,
"step": 1696
},
{
"epoch": 0.03957936718492625,
"grad_norm": 1.4669241905212402,
"learning_rate": 8.795024617776626e-06,
"loss": 1.3218,
"step": 1697
},
{
"epoch": 0.03960269032410417,
"grad_norm": 1.6289660930633545,
"learning_rate": 8.80020730759264e-06,
"loss": 1.4026,
"step": 1698
},
{
"epoch": 0.03962601346328209,
"grad_norm": 1.4103940725326538,
"learning_rate": 8.805389997408655e-06,
"loss": 1.5594,
"step": 1699
},
{
"epoch": 0.039649336602460006,
"grad_norm": 1.8094227313995361,
"learning_rate": 8.81057268722467e-06,
"loss": 1.4749,
"step": 1700
},
{
"epoch": 0.039672659741637924,
"grad_norm": 1.9171851873397827,
"learning_rate": 8.815755377040685e-06,
"loss": 1.5853,
"step": 1701
},
{
"epoch": 0.03969598288081584,
"grad_norm": 1.7482846975326538,
"learning_rate": 8.8209380668567e-06,
"loss": 1.8572,
"step": 1702
},
{
"epoch": 0.03971930601999376,
"grad_norm": 1.494166374206543,
"learning_rate": 8.826120756672714e-06,
"loss": 1.4618,
"step": 1703
},
{
"epoch": 0.03974262915917168,
"grad_norm": 1.8293770551681519,
"learning_rate": 8.831303446488728e-06,
"loss": 1.278,
"step": 1704
},
{
"epoch": 0.039765952298349595,
"grad_norm": 1.7367064952850342,
"learning_rate": 8.836486136304743e-06,
"loss": 1.65,
"step": 1705
},
{
"epoch": 0.03978927543752751,
"grad_norm": 1.783642292022705,
"learning_rate": 8.841668826120757e-06,
"loss": 1.5325,
"step": 1706
},
{
"epoch": 0.03981259857670543,
"grad_norm": 1.5297502279281616,
"learning_rate": 8.846851515936771e-06,
"loss": 1.4132,
"step": 1707
},
{
"epoch": 0.03983592171588335,
"grad_norm": 1.9751566648483276,
"learning_rate": 8.852034205752786e-06,
"loss": 1.4317,
"step": 1708
},
{
"epoch": 0.03985924485506127,
"grad_norm": 2.1414785385131836,
"learning_rate": 8.857216895568802e-06,
"loss": 1.6401,
"step": 1709
},
{
"epoch": 0.039882567994239185,
"grad_norm": 1.4582406282424927,
"learning_rate": 8.862399585384816e-06,
"loss": 1.4949,
"step": 1710
},
{
"epoch": 0.0399058911334171,
"grad_norm": 1.3729748725891113,
"learning_rate": 8.86758227520083e-06,
"loss": 0.8325,
"step": 1711
},
{
"epoch": 0.03992921427259502,
"grad_norm": 1.5666522979736328,
"learning_rate": 8.872764965016845e-06,
"loss": 1.6165,
"step": 1712
},
{
"epoch": 0.03995253741177294,
"grad_norm": 1.8730623722076416,
"learning_rate": 8.877947654832859e-06,
"loss": 1.5912,
"step": 1713
},
{
"epoch": 0.039975860550950856,
"grad_norm": 1.3995941877365112,
"learning_rate": 8.883130344648873e-06,
"loss": 1.4624,
"step": 1714
},
{
"epoch": 0.039999183690128774,
"grad_norm": 1.6787446737289429,
"learning_rate": 8.888313034464888e-06,
"loss": 1.7264,
"step": 1715
},
{
"epoch": 0.04002250682930669,
"grad_norm": 1.6797045469284058,
"learning_rate": 8.893495724280902e-06,
"loss": 1.458,
"step": 1716
},
{
"epoch": 0.04004582996848461,
"grad_norm": 1.4562252759933472,
"learning_rate": 8.898678414096917e-06,
"loss": 1.4469,
"step": 1717
},
{
"epoch": 0.04006915310766253,
"grad_norm": 1.8270559310913086,
"learning_rate": 8.903861103912933e-06,
"loss": 1.5524,
"step": 1718
},
{
"epoch": 0.040092476246840446,
"grad_norm": 2.2723021507263184,
"learning_rate": 8.909043793728947e-06,
"loss": 1.5524,
"step": 1719
},
{
"epoch": 0.040115799386018364,
"grad_norm": 1.6696120500564575,
"learning_rate": 8.914226483544961e-06,
"loss": 1.6466,
"step": 1720
},
{
"epoch": 0.04013912252519628,
"grad_norm": 1.8067409992218018,
"learning_rate": 8.919409173360976e-06,
"loss": 1.4901,
"step": 1721
},
{
"epoch": 0.0401624456643742,
"grad_norm": 1.6212742328643799,
"learning_rate": 8.92459186317699e-06,
"loss": 1.3791,
"step": 1722
},
{
"epoch": 0.04018576880355212,
"grad_norm": 1.5557783842086792,
"learning_rate": 8.929774552993004e-06,
"loss": 1.4122,
"step": 1723
},
{
"epoch": 0.04020909194273003,
"grad_norm": 2.65142822265625,
"learning_rate": 8.934957242809019e-06,
"loss": 1.3679,
"step": 1724
},
{
"epoch": 0.040232415081907946,
"grad_norm": 1.9991352558135986,
"learning_rate": 8.940139932625033e-06,
"loss": 1.4746,
"step": 1725
},
{
"epoch": 0.040255738221085864,
"grad_norm": 2.054579257965088,
"learning_rate": 8.945322622441047e-06,
"loss": 1.5759,
"step": 1726
},
{
"epoch": 0.04027906136026378,
"grad_norm": 1.62351393699646,
"learning_rate": 8.950505312257062e-06,
"loss": 1.4139,
"step": 1727
},
{
"epoch": 0.0403023844994417,
"grad_norm": 1.754712462425232,
"learning_rate": 8.955688002073076e-06,
"loss": 1.2871,
"step": 1728
},
{
"epoch": 0.04032570763861962,
"grad_norm": 1.744728922843933,
"learning_rate": 8.96087069188909e-06,
"loss": 1.5207,
"step": 1729
},
{
"epoch": 0.040349030777797536,
"grad_norm": 1.9871348142623901,
"learning_rate": 8.966053381705105e-06,
"loss": 1.7927,
"step": 1730
},
{
"epoch": 0.040372353916975454,
"grad_norm": 1.898793339729309,
"learning_rate": 8.97123607152112e-06,
"loss": 1.5487,
"step": 1731
},
{
"epoch": 0.04039567705615337,
"grad_norm": 1.6234720945358276,
"learning_rate": 8.976418761337135e-06,
"loss": 1.3666,
"step": 1732
},
{
"epoch": 0.04041900019533129,
"grad_norm": 1.7883436679840088,
"learning_rate": 8.98160145115315e-06,
"loss": 1.59,
"step": 1733
},
{
"epoch": 0.04044232333450921,
"grad_norm": 2.030747890472412,
"learning_rate": 8.986784140969164e-06,
"loss": 1.5484,
"step": 1734
},
{
"epoch": 0.040465646473687125,
"grad_norm": 1.5323489904403687,
"learning_rate": 8.991966830785178e-06,
"loss": 1.4076,
"step": 1735
},
{
"epoch": 0.04048896961286504,
"grad_norm": 1.545076847076416,
"learning_rate": 8.997149520601193e-06,
"loss": 1.9024,
"step": 1736
},
{
"epoch": 0.04051229275204296,
"grad_norm": 1.775343656539917,
"learning_rate": 9.002332210417207e-06,
"loss": 1.6269,
"step": 1737
},
{
"epoch": 0.04053561589122088,
"grad_norm": 1.5936089754104614,
"learning_rate": 9.007514900233221e-06,
"loss": 1.388,
"step": 1738
},
{
"epoch": 0.0405589390303988,
"grad_norm": 2.0282087326049805,
"learning_rate": 9.012697590049236e-06,
"loss": 1.5258,
"step": 1739
},
{
"epoch": 0.040582262169576715,
"grad_norm": 1.769651174545288,
"learning_rate": 9.017880279865252e-06,
"loss": 1.6468,
"step": 1740
},
{
"epoch": 0.04060558530875463,
"grad_norm": 1.671475887298584,
"learning_rate": 9.023062969681266e-06,
"loss": 1.457,
"step": 1741
},
{
"epoch": 0.04062890844793255,
"grad_norm": 1.5717363357543945,
"learning_rate": 9.02824565949728e-06,
"loss": 1.0661,
"step": 1742
},
{
"epoch": 0.04065223158711047,
"grad_norm": 2.1011769771575928,
"learning_rate": 9.033428349313295e-06,
"loss": 1.8212,
"step": 1743
},
{
"epoch": 0.040675554726288386,
"grad_norm": 1.8593213558197021,
"learning_rate": 9.038611039129309e-06,
"loss": 1.2838,
"step": 1744
},
{
"epoch": 0.040698877865466304,
"grad_norm": 3.45039963722229,
"learning_rate": 9.043793728945323e-06,
"loss": 1.2977,
"step": 1745
},
{
"epoch": 0.04072220100464422,
"grad_norm": 1.5961792469024658,
"learning_rate": 9.048976418761338e-06,
"loss": 1.598,
"step": 1746
},
{
"epoch": 0.04074552414382214,
"grad_norm": 1.7901935577392578,
"learning_rate": 9.054159108577352e-06,
"loss": 1.197,
"step": 1747
},
{
"epoch": 0.04076884728300006,
"grad_norm": 1.7534990310668945,
"learning_rate": 9.059341798393368e-06,
"loss": 1.5957,
"step": 1748
},
{
"epoch": 0.040792170422177976,
"grad_norm": 2.0215656757354736,
"learning_rate": 9.064524488209382e-06,
"loss": 1.4019,
"step": 1749
},
{
"epoch": 0.040815493561355894,
"grad_norm": 1.7355159521102905,
"learning_rate": 9.069707178025397e-06,
"loss": 1.6056,
"step": 1750
},
{
"epoch": 0.04083881670053381,
"grad_norm": 2.3358545303344727,
"learning_rate": 9.07488986784141e-06,
"loss": 1.3946,
"step": 1751
},
{
"epoch": 0.04086213983971173,
"grad_norm": 2.4582395553588867,
"learning_rate": 9.080072557657424e-06,
"loss": 1.3848,
"step": 1752
},
{
"epoch": 0.04088546297888964,
"grad_norm": 1.8667892217636108,
"learning_rate": 9.08525524747344e-06,
"loss": 1.5908,
"step": 1753
},
{
"epoch": 0.04090878611806756,
"grad_norm": 2.2128000259399414,
"learning_rate": 9.090437937289454e-06,
"loss": 1.4584,
"step": 1754
},
{
"epoch": 0.040932109257245476,
"grad_norm": 1.714179277420044,
"learning_rate": 9.095620627105468e-06,
"loss": 1.3882,
"step": 1755
},
{
"epoch": 0.040955432396423394,
"grad_norm": 1.7891523838043213,
"learning_rate": 9.100803316921483e-06,
"loss": 1.6921,
"step": 1756
},
{
"epoch": 0.04097875553560131,
"grad_norm": 2.0620603561401367,
"learning_rate": 9.105986006737497e-06,
"loss": 1.4833,
"step": 1757
},
{
"epoch": 0.04100207867477923,
"grad_norm": 1.4664239883422852,
"learning_rate": 9.111168696553512e-06,
"loss": 1.5,
"step": 1758
},
{
"epoch": 0.04102540181395715,
"grad_norm": 2.151362180709839,
"learning_rate": 9.116351386369526e-06,
"loss": 1.4189,
"step": 1759
},
{
"epoch": 0.041048724953135066,
"grad_norm": 2.1404523849487305,
"learning_rate": 9.12153407618554e-06,
"loss": 1.512,
"step": 1760
},
{
"epoch": 0.041072048092312984,
"grad_norm": 1.5175687074661255,
"learning_rate": 9.126716766001556e-06,
"loss": 1.3527,
"step": 1761
},
{
"epoch": 0.0410953712314909,
"grad_norm": 1.6199604272842407,
"learning_rate": 9.13189945581757e-06,
"loss": 1.1717,
"step": 1762
},
{
"epoch": 0.04111869437066882,
"grad_norm": 1.655900001525879,
"learning_rate": 9.137082145633585e-06,
"loss": 1.4903,
"step": 1763
},
{
"epoch": 0.04114201750984674,
"grad_norm": 1.6075772047042847,
"learning_rate": 9.1422648354496e-06,
"loss": 1.3745,
"step": 1764
},
{
"epoch": 0.041165340649024655,
"grad_norm": 1.5534958839416504,
"learning_rate": 9.147447525265614e-06,
"loss": 1.4659,
"step": 1765
},
{
"epoch": 0.04118866378820257,
"grad_norm": 2.197490930557251,
"learning_rate": 9.152630215081628e-06,
"loss": 1.5412,
"step": 1766
},
{
"epoch": 0.04121198692738049,
"grad_norm": 2.1121668815612793,
"learning_rate": 9.157812904897642e-06,
"loss": 1.7137,
"step": 1767
},
{
"epoch": 0.04123531006655841,
"grad_norm": 2.2003660202026367,
"learning_rate": 9.162995594713657e-06,
"loss": 1.6095,
"step": 1768
},
{
"epoch": 0.04125863320573633,
"grad_norm": 1.617874264717102,
"learning_rate": 9.168178284529671e-06,
"loss": 1.4913,
"step": 1769
},
{
"epoch": 0.041281956344914245,
"grad_norm": 1.6809815168380737,
"learning_rate": 9.173360974345687e-06,
"loss": 1.6014,
"step": 1770
},
{
"epoch": 0.04130527948409216,
"grad_norm": 1.8234214782714844,
"learning_rate": 9.178543664161701e-06,
"loss": 1.4921,
"step": 1771
},
{
"epoch": 0.04132860262327008,
"grad_norm": 1.605371117591858,
"learning_rate": 9.183726353977716e-06,
"loss": 1.526,
"step": 1772
},
{
"epoch": 0.041351925762448,
"grad_norm": 1.7158360481262207,
"learning_rate": 9.18890904379373e-06,
"loss": 1.6063,
"step": 1773
},
{
"epoch": 0.041375248901625916,
"grad_norm": 1.8888566493988037,
"learning_rate": 9.194091733609744e-06,
"loss": 1.3013,
"step": 1774
},
{
"epoch": 0.041398572040803834,
"grad_norm": 1.8596553802490234,
"learning_rate": 9.199274423425759e-06,
"loss": 1.3611,
"step": 1775
},
{
"epoch": 0.04142189517998175,
"grad_norm": 1.770941972732544,
"learning_rate": 9.204457113241773e-06,
"loss": 1.553,
"step": 1776
},
{
"epoch": 0.04144521831915967,
"grad_norm": 1.4563987255096436,
"learning_rate": 9.209639803057788e-06,
"loss": 1.3261,
"step": 1777
},
{
"epoch": 0.04146854145833759,
"grad_norm": 1.5590494871139526,
"learning_rate": 9.214822492873802e-06,
"loss": 1.6303,
"step": 1778
},
{
"epoch": 0.041491864597515506,
"grad_norm": 1.6040290594100952,
"learning_rate": 9.220005182689816e-06,
"loss": 1.6656,
"step": 1779
},
{
"epoch": 0.041515187736693424,
"grad_norm": 1.6253089904785156,
"learning_rate": 9.22518787250583e-06,
"loss": 1.3086,
"step": 1780
},
{
"epoch": 0.041538510875871335,
"grad_norm": 2.282277822494507,
"learning_rate": 9.230370562321845e-06,
"loss": 1.3154,
"step": 1781
},
{
"epoch": 0.04156183401504925,
"grad_norm": 1.6955877542495728,
"learning_rate": 9.23555325213786e-06,
"loss": 1.4742,
"step": 1782
},
{
"epoch": 0.04158515715422717,
"grad_norm": 2.6918323040008545,
"learning_rate": 9.240735941953875e-06,
"loss": 1.4942,
"step": 1783
},
{
"epoch": 0.04160848029340509,
"grad_norm": 2.111135244369507,
"learning_rate": 9.24591863176989e-06,
"loss": 1.4501,
"step": 1784
},
{
"epoch": 0.041631803432583006,
"grad_norm": 1.6524665355682373,
"learning_rate": 9.251101321585904e-06,
"loss": 1.2801,
"step": 1785
},
{
"epoch": 0.041655126571760924,
"grad_norm": 1.812553882598877,
"learning_rate": 9.256284011401918e-06,
"loss": 1.2928,
"step": 1786
},
{
"epoch": 0.04167844971093884,
"grad_norm": 1.7474865913391113,
"learning_rate": 9.261466701217933e-06,
"loss": 1.5489,
"step": 1787
},
{
"epoch": 0.04170177285011676,
"grad_norm": 1.91874098777771,
"learning_rate": 9.266649391033947e-06,
"loss": 1.5997,
"step": 1788
},
{
"epoch": 0.04172509598929468,
"grad_norm": 1.4715979099273682,
"learning_rate": 9.271832080849961e-06,
"loss": 0.921,
"step": 1789
},
{
"epoch": 0.041748419128472596,
"grad_norm": 1.599254846572876,
"learning_rate": 9.277014770665976e-06,
"loss": 1.5168,
"step": 1790
},
{
"epoch": 0.04177174226765051,
"grad_norm": 1.8970310688018799,
"learning_rate": 9.28219746048199e-06,
"loss": 1.4821,
"step": 1791
},
{
"epoch": 0.04179506540682843,
"grad_norm": 1.5975875854492188,
"learning_rate": 9.287380150298006e-06,
"loss": 1.4889,
"step": 1792
},
{
"epoch": 0.04181838854600635,
"grad_norm": 1.7852643728256226,
"learning_rate": 9.29256284011402e-06,
"loss": 1.4124,
"step": 1793
},
{
"epoch": 0.04184171168518427,
"grad_norm": 1.8535397052764893,
"learning_rate": 9.297745529930035e-06,
"loss": 1.5964,
"step": 1794
},
{
"epoch": 0.041865034824362185,
"grad_norm": 1.532125473022461,
"learning_rate": 9.302928219746049e-06,
"loss": 1.2431,
"step": 1795
},
{
"epoch": 0.0418883579635401,
"grad_norm": 1.542386531829834,
"learning_rate": 9.308110909562063e-06,
"loss": 1.6327,
"step": 1796
},
{
"epoch": 0.04191168110271802,
"grad_norm": 1.8671448230743408,
"learning_rate": 9.313293599378078e-06,
"loss": 1.7695,
"step": 1797
},
{
"epoch": 0.04193500424189594,
"grad_norm": 1.6148124933242798,
"learning_rate": 9.318476289194092e-06,
"loss": 1.7227,
"step": 1798
},
{
"epoch": 0.04195832738107386,
"grad_norm": 1.4859371185302734,
"learning_rate": 9.323658979010107e-06,
"loss": 1.2807,
"step": 1799
},
{
"epoch": 0.041981650520251775,
"grad_norm": 3.0297629833221436,
"learning_rate": 9.328841668826123e-06,
"loss": 1.3824,
"step": 1800
},
{
"epoch": 0.04200497365942969,
"grad_norm": 1.6791976690292358,
"learning_rate": 9.334024358642137e-06,
"loss": 1.7325,
"step": 1801
},
{
"epoch": 0.04202829679860761,
"grad_norm": 1.4695453643798828,
"learning_rate": 9.339207048458151e-06,
"loss": 1.2062,
"step": 1802
},
{
"epoch": 0.04205161993778553,
"grad_norm": 1.5592173337936401,
"learning_rate": 9.344389738274166e-06,
"loss": 1.1919,
"step": 1803
},
{
"epoch": 0.042074943076963446,
"grad_norm": 1.4761253595352173,
"learning_rate": 9.34957242809018e-06,
"loss": 1.2845,
"step": 1804
},
{
"epoch": 0.042098266216141364,
"grad_norm": 1.3584182262420654,
"learning_rate": 9.354755117906194e-06,
"loss": 1.6216,
"step": 1805
},
{
"epoch": 0.04212158935531928,
"grad_norm": 2.0344326496124268,
"learning_rate": 9.359937807722209e-06,
"loss": 1.2301,
"step": 1806
},
{
"epoch": 0.0421449124944972,
"grad_norm": 1.549643874168396,
"learning_rate": 9.365120497538223e-06,
"loss": 1.446,
"step": 1807
},
{
"epoch": 0.04216823563367512,
"grad_norm": 1.6695293188095093,
"learning_rate": 9.370303187354237e-06,
"loss": 1.7588,
"step": 1808
},
{
"epoch": 0.042191558772853036,
"grad_norm": 1.817617416381836,
"learning_rate": 9.375485877170252e-06,
"loss": 1.5394,
"step": 1809
},
{
"epoch": 0.04221488191203095,
"grad_norm": 1.917152762413025,
"learning_rate": 9.380668566986266e-06,
"loss": 1.6437,
"step": 1810
},
{
"epoch": 0.042238205051208864,
"grad_norm": 0.9892622828483582,
"learning_rate": 9.38585125680228e-06,
"loss": 1.1554,
"step": 1811
},
{
"epoch": 0.04226152819038678,
"grad_norm": 1.577576994895935,
"learning_rate": 9.391033946618295e-06,
"loss": 1.4737,
"step": 1812
},
{
"epoch": 0.0422848513295647,
"grad_norm": 1.739229679107666,
"learning_rate": 9.39621663643431e-06,
"loss": 1.4077,
"step": 1813
},
{
"epoch": 0.04230817446874262,
"grad_norm": 1.6817034482955933,
"learning_rate": 9.401399326250325e-06,
"loss": 0.9329,
"step": 1814
},
{
"epoch": 0.042331497607920536,
"grad_norm": 1.6616978645324707,
"learning_rate": 9.40658201606634e-06,
"loss": 1.6185,
"step": 1815
},
{
"epoch": 0.042354820747098454,
"grad_norm": 1.379654049873352,
"learning_rate": 9.411764705882354e-06,
"loss": 1.6863,
"step": 1816
},
{
"epoch": 0.04237814388627637,
"grad_norm": 2.3998191356658936,
"learning_rate": 9.416947395698368e-06,
"loss": 1.4281,
"step": 1817
},
{
"epoch": 0.04240146702545429,
"grad_norm": 2.078322410583496,
"learning_rate": 9.422130085514383e-06,
"loss": 1.2324,
"step": 1818
},
{
"epoch": 0.04242479016463221,
"grad_norm": 1.8474605083465576,
"learning_rate": 9.427312775330397e-06,
"loss": 1.3242,
"step": 1819
},
{
"epoch": 0.042448113303810125,
"grad_norm": 1.4538230895996094,
"learning_rate": 9.432495465146411e-06,
"loss": 1.3117,
"step": 1820
},
{
"epoch": 0.04247143644298804,
"grad_norm": 2.528913974761963,
"learning_rate": 9.437678154962426e-06,
"loss": 1.3846,
"step": 1821
},
{
"epoch": 0.04249475958216596,
"grad_norm": 1.5370780229568481,
"learning_rate": 9.442860844778442e-06,
"loss": 1.4712,
"step": 1822
},
{
"epoch": 0.04251808272134388,
"grad_norm": 1.7554328441619873,
"learning_rate": 9.448043534594456e-06,
"loss": 1.5354,
"step": 1823
},
{
"epoch": 0.0425414058605218,
"grad_norm": 1.490560531616211,
"learning_rate": 9.45322622441047e-06,
"loss": 1.1725,
"step": 1824
},
{
"epoch": 0.042564728999699715,
"grad_norm": 1.55622136592865,
"learning_rate": 9.458408914226485e-06,
"loss": 1.5401,
"step": 1825
},
{
"epoch": 0.04258805213887763,
"grad_norm": 1.6288939714431763,
"learning_rate": 9.463591604042499e-06,
"loss": 1.404,
"step": 1826
},
{
"epoch": 0.04261137527805555,
"grad_norm": 1.9815454483032227,
"learning_rate": 9.468774293858513e-06,
"loss": 1.538,
"step": 1827
},
{
"epoch": 0.04263469841723347,
"grad_norm": 1.8967722654342651,
"learning_rate": 9.473956983674528e-06,
"loss": 1.4561,
"step": 1828
},
{
"epoch": 0.04265802155641139,
"grad_norm": 2.010972023010254,
"learning_rate": 9.479139673490542e-06,
"loss": 1.692,
"step": 1829
},
{
"epoch": 0.042681344695589304,
"grad_norm": 1.82353937625885,
"learning_rate": 9.484322363306558e-06,
"loss": 1.6394,
"step": 1830
},
{
"epoch": 0.04270466783476722,
"grad_norm": 1.6288769245147705,
"learning_rate": 9.489505053122572e-06,
"loss": 1.7251,
"step": 1831
},
{
"epoch": 0.04272799097394514,
"grad_norm": 2.7632317543029785,
"learning_rate": 9.494687742938587e-06,
"loss": 1.5771,
"step": 1832
},
{
"epoch": 0.04275131411312306,
"grad_norm": 1.7157068252563477,
"learning_rate": 9.499870432754601e-06,
"loss": 1.9245,
"step": 1833
},
{
"epoch": 0.042774637252300976,
"grad_norm": 1.6728345155715942,
"learning_rate": 9.505053122570614e-06,
"loss": 1.5874,
"step": 1834
},
{
"epoch": 0.042797960391478894,
"grad_norm": 1.6265268325805664,
"learning_rate": 9.51023581238663e-06,
"loss": 1.6633,
"step": 1835
},
{
"epoch": 0.04282128353065681,
"grad_norm": 1.8013489246368408,
"learning_rate": 9.515418502202644e-06,
"loss": 1.3856,
"step": 1836
},
{
"epoch": 0.04284460666983473,
"grad_norm": 1.85427987575531,
"learning_rate": 9.520601192018658e-06,
"loss": 1.2233,
"step": 1837
},
{
"epoch": 0.04286792980901264,
"grad_norm": 1.6943988800048828,
"learning_rate": 9.525783881834673e-06,
"loss": 1.3198,
"step": 1838
},
{
"epoch": 0.04289125294819056,
"grad_norm": 1.7103756666183472,
"learning_rate": 9.530966571650687e-06,
"loss": 1.4118,
"step": 1839
},
{
"epoch": 0.042914576087368476,
"grad_norm": 2.0107672214508057,
"learning_rate": 9.536149261466702e-06,
"loss": 1.3456,
"step": 1840
},
{
"epoch": 0.042937899226546394,
"grad_norm": 1.505422830581665,
"learning_rate": 9.541331951282716e-06,
"loss": 1.2676,
"step": 1841
},
{
"epoch": 0.04296122236572431,
"grad_norm": 2.090595245361328,
"learning_rate": 9.54651464109873e-06,
"loss": 1.2113,
"step": 1842
},
{
"epoch": 0.04298454550490223,
"grad_norm": 1.7776191234588623,
"learning_rate": 9.551697330914745e-06,
"loss": 1.5694,
"step": 1843
},
{
"epoch": 0.04300786864408015,
"grad_norm": 3.0254878997802734,
"learning_rate": 9.55688002073076e-06,
"loss": 1.244,
"step": 1844
},
{
"epoch": 0.043031191783258066,
"grad_norm": 1.8657838106155396,
"learning_rate": 9.562062710546775e-06,
"loss": 1.9444,
"step": 1845
},
{
"epoch": 0.043054514922435984,
"grad_norm": 2.1006710529327393,
"learning_rate": 9.56724540036279e-06,
"loss": 1.3202,
"step": 1846
},
{
"epoch": 0.0430778380616139,
"grad_norm": 1.2389309406280518,
"learning_rate": 9.572428090178804e-06,
"loss": 1.1992,
"step": 1847
},
{
"epoch": 0.04310116120079182,
"grad_norm": 2.162818193435669,
"learning_rate": 9.577610779994818e-06,
"loss": 1.5446,
"step": 1848
},
{
"epoch": 0.04312448433996974,
"grad_norm": 2.476367950439453,
"learning_rate": 9.582793469810832e-06,
"loss": 1.6178,
"step": 1849
},
{
"epoch": 0.043147807479147655,
"grad_norm": 2.1805801391601562,
"learning_rate": 9.587976159626847e-06,
"loss": 1.5745,
"step": 1850
},
{
"epoch": 0.04317113061832557,
"grad_norm": 1.7875632047653198,
"learning_rate": 9.593158849442861e-06,
"loss": 1.6798,
"step": 1851
},
{
"epoch": 0.04319445375750349,
"grad_norm": 2.506103515625,
"learning_rate": 9.598341539258877e-06,
"loss": 1.2824,
"step": 1852
},
{
"epoch": 0.04321777689668141,
"grad_norm": 2.027400016784668,
"learning_rate": 9.603524229074891e-06,
"loss": 1.7745,
"step": 1853
},
{
"epoch": 0.04324110003585933,
"grad_norm": 1.5254895687103271,
"learning_rate": 9.608706918890906e-06,
"loss": 1.6716,
"step": 1854
},
{
"epoch": 0.043264423175037245,
"grad_norm": 1.9832854270935059,
"learning_rate": 9.61388960870692e-06,
"loss": 1.2432,
"step": 1855
},
{
"epoch": 0.04328774631421516,
"grad_norm": 1.3785820007324219,
"learning_rate": 9.619072298522934e-06,
"loss": 1.3452,
"step": 1856
},
{
"epoch": 0.04331106945339308,
"grad_norm": 2.0536274909973145,
"learning_rate": 9.624254988338949e-06,
"loss": 1.9594,
"step": 1857
},
{
"epoch": 0.043334392592571,
"grad_norm": 1.8014826774597168,
"learning_rate": 9.629437678154963e-06,
"loss": 1.4811,
"step": 1858
},
{
"epoch": 0.043357715731748916,
"grad_norm": 1.5722678899765015,
"learning_rate": 9.634620367970978e-06,
"loss": 1.2694,
"step": 1859
},
{
"epoch": 0.043381038870926834,
"grad_norm": 1.849761724472046,
"learning_rate": 9.639803057786994e-06,
"loss": 1.4856,
"step": 1860
},
{
"epoch": 0.04340436201010475,
"grad_norm": 1.412558913230896,
"learning_rate": 9.644985747603006e-06,
"loss": 1.4672,
"step": 1861
},
{
"epoch": 0.04342768514928267,
"grad_norm": 2.028230667114258,
"learning_rate": 9.65016843741902e-06,
"loss": 1.5573,
"step": 1862
},
{
"epoch": 0.04345100828846059,
"grad_norm": 2.5457494258880615,
"learning_rate": 9.655351127235035e-06,
"loss": 1.3734,
"step": 1863
},
{
"epoch": 0.043474331427638506,
"grad_norm": 1.6199779510498047,
"learning_rate": 9.66053381705105e-06,
"loss": 1.6676,
"step": 1864
},
{
"epoch": 0.043497654566816424,
"grad_norm": 1.4922274351119995,
"learning_rate": 9.665716506867064e-06,
"loss": 1.5909,
"step": 1865
},
{
"epoch": 0.04352097770599434,
"grad_norm": 1.545914649963379,
"learning_rate": 9.67089919668308e-06,
"loss": 1.0683,
"step": 1866
},
{
"epoch": 0.04354430084517225,
"grad_norm": 1.4928728342056274,
"learning_rate": 9.676081886499094e-06,
"loss": 1.1975,
"step": 1867
},
{
"epoch": 0.04356762398435017,
"grad_norm": 2.042757272720337,
"learning_rate": 9.681264576315108e-06,
"loss": 1.9166,
"step": 1868
},
{
"epoch": 0.04359094712352809,
"grad_norm": 1.9415842294692993,
"learning_rate": 9.686447266131123e-06,
"loss": 1.5207,
"step": 1869
},
{
"epoch": 0.043614270262706006,
"grad_norm": 1.6906239986419678,
"learning_rate": 9.691629955947137e-06,
"loss": 1.4171,
"step": 1870
},
{
"epoch": 0.043637593401883924,
"grad_norm": 1.5644055604934692,
"learning_rate": 9.696812645763151e-06,
"loss": 1.4997,
"step": 1871
},
{
"epoch": 0.04366091654106184,
"grad_norm": 1.7778024673461914,
"learning_rate": 9.701995335579166e-06,
"loss": 1.3872,
"step": 1872
},
{
"epoch": 0.04368423968023976,
"grad_norm": 1.9999544620513916,
"learning_rate": 9.70717802539518e-06,
"loss": 1.6039,
"step": 1873
},
{
"epoch": 0.04370756281941768,
"grad_norm": 2.1065220832824707,
"learning_rate": 9.712360715211196e-06,
"loss": 1.4525,
"step": 1874
},
{
"epoch": 0.043730885958595596,
"grad_norm": 1.785739541053772,
"learning_rate": 9.71754340502721e-06,
"loss": 1.7723,
"step": 1875
},
{
"epoch": 0.043754209097773514,
"grad_norm": 1.7912609577178955,
"learning_rate": 9.722726094843225e-06,
"loss": 1.8857,
"step": 1876
},
{
"epoch": 0.04377753223695143,
"grad_norm": 2.2229981422424316,
"learning_rate": 9.727908784659239e-06,
"loss": 1.7359,
"step": 1877
},
{
"epoch": 0.04380085537612935,
"grad_norm": 1.7545627355575562,
"learning_rate": 9.733091474475253e-06,
"loss": 1.3878,
"step": 1878
},
{
"epoch": 0.04382417851530727,
"grad_norm": 1.6687484979629517,
"learning_rate": 9.738274164291268e-06,
"loss": 1.3148,
"step": 1879
},
{
"epoch": 0.043847501654485185,
"grad_norm": 1.661619782447815,
"learning_rate": 9.743456854107282e-06,
"loss": 1.5319,
"step": 1880
},
{
"epoch": 0.0438708247936631,
"grad_norm": 1.6879695653915405,
"learning_rate": 9.748639543923297e-06,
"loss": 1.2871,
"step": 1881
},
{
"epoch": 0.04389414793284102,
"grad_norm": 1.614043116569519,
"learning_rate": 9.753822233739313e-06,
"loss": 1.0429,
"step": 1882
},
{
"epoch": 0.04391747107201894,
"grad_norm": 1.310645341873169,
"learning_rate": 9.759004923555327e-06,
"loss": 1.5535,
"step": 1883
},
{
"epoch": 0.04394079421119686,
"grad_norm": 1.677807092666626,
"learning_rate": 9.764187613371341e-06,
"loss": 1.5612,
"step": 1884
},
{
"epoch": 0.043964117350374775,
"grad_norm": 2.004786252975464,
"learning_rate": 9.769370303187356e-06,
"loss": 1.1547,
"step": 1885
},
{
"epoch": 0.04398744048955269,
"grad_norm": 2.4537112712860107,
"learning_rate": 9.77455299300337e-06,
"loss": 1.6863,
"step": 1886
},
{
"epoch": 0.04401076362873061,
"grad_norm": 1.8132030963897705,
"learning_rate": 9.779735682819384e-06,
"loss": 1.2049,
"step": 1887
},
{
"epoch": 0.04403408676790853,
"grad_norm": 1.954026699066162,
"learning_rate": 9.784918372635399e-06,
"loss": 1.3946,
"step": 1888
},
{
"epoch": 0.044057409907086446,
"grad_norm": 1.742790699005127,
"learning_rate": 9.790101062451413e-06,
"loss": 1.3851,
"step": 1889
},
{
"epoch": 0.044080733046264364,
"grad_norm": 2.010481357574463,
"learning_rate": 9.795283752267427e-06,
"loss": 1.4181,
"step": 1890
},
{
"epoch": 0.04410405618544228,
"grad_norm": 1.6661536693572998,
"learning_rate": 9.800466442083442e-06,
"loss": 1.1611,
"step": 1891
},
{
"epoch": 0.0441273793246202,
"grad_norm": 1.6758571863174438,
"learning_rate": 9.805649131899456e-06,
"loss": 1.0906,
"step": 1892
},
{
"epoch": 0.04415070246379812,
"grad_norm": 1.7925001382827759,
"learning_rate": 9.81083182171547e-06,
"loss": 1.4299,
"step": 1893
},
{
"epoch": 0.044174025602976036,
"grad_norm": 1.9415634870529175,
"learning_rate": 9.816014511531485e-06,
"loss": 1.5619,
"step": 1894
},
{
"epoch": 0.04419734874215395,
"grad_norm": 1.3546884059906006,
"learning_rate": 9.821197201347499e-06,
"loss": 1.4994,
"step": 1895
},
{
"epoch": 0.044220671881331865,
"grad_norm": 2.0756897926330566,
"learning_rate": 9.826379891163515e-06,
"loss": 1.7483,
"step": 1896
},
{
"epoch": 0.04424399502050978,
"grad_norm": 1.7983125448226929,
"learning_rate": 9.83156258097953e-06,
"loss": 1.586,
"step": 1897
},
{
"epoch": 0.0442673181596877,
"grad_norm": 1.5559202432632446,
"learning_rate": 9.836745270795544e-06,
"loss": 1.5093,
"step": 1898
},
{
"epoch": 0.04429064129886562,
"grad_norm": 1.772439956665039,
"learning_rate": 9.841927960611558e-06,
"loss": 1.3449,
"step": 1899
},
{
"epoch": 0.044313964438043536,
"grad_norm": 1.9158481359481812,
"learning_rate": 9.847110650427573e-06,
"loss": 1.3239,
"step": 1900
},
{
"epoch": 0.044337287577221454,
"grad_norm": 1.801500916481018,
"learning_rate": 9.852293340243587e-06,
"loss": 1.1534,
"step": 1901
},
{
"epoch": 0.04436061071639937,
"grad_norm": 1.5766456127166748,
"learning_rate": 9.857476030059601e-06,
"loss": 1.7678,
"step": 1902
},
{
"epoch": 0.04438393385557729,
"grad_norm": 1.852655053138733,
"learning_rate": 9.862658719875616e-06,
"loss": 1.7286,
"step": 1903
},
{
"epoch": 0.04440725699475521,
"grad_norm": 1.9849982261657715,
"learning_rate": 9.867841409691632e-06,
"loss": 1.3984,
"step": 1904
},
{
"epoch": 0.044430580133933126,
"grad_norm": 1.7213250398635864,
"learning_rate": 9.873024099507646e-06,
"loss": 1.6215,
"step": 1905
},
{
"epoch": 0.044453903273111044,
"grad_norm": 1.9416676759719849,
"learning_rate": 9.87820678932366e-06,
"loss": 1.6314,
"step": 1906
},
{
"epoch": 0.04447722641228896,
"grad_norm": 1.8408985137939453,
"learning_rate": 9.883389479139675e-06,
"loss": 1.6611,
"step": 1907
},
{
"epoch": 0.04450054955146688,
"grad_norm": 1.528350591659546,
"learning_rate": 9.888572168955689e-06,
"loss": 1.7559,
"step": 1908
},
{
"epoch": 0.0445238726906448,
"grad_norm": 1.6557738780975342,
"learning_rate": 9.893754858771703e-06,
"loss": 1.5072,
"step": 1909
},
{
"epoch": 0.044547195829822715,
"grad_norm": 2.0431089401245117,
"learning_rate": 9.898937548587718e-06,
"loss": 1.2895,
"step": 1910
},
{
"epoch": 0.04457051896900063,
"grad_norm": 1.8927110433578491,
"learning_rate": 9.904120238403732e-06,
"loss": 1.4221,
"step": 1911
},
{
"epoch": 0.04459384210817855,
"grad_norm": 1.547044038772583,
"learning_rate": 9.909302928219748e-06,
"loss": 1.2597,
"step": 1912
},
{
"epoch": 0.04461716524735647,
"grad_norm": 1.81504487991333,
"learning_rate": 9.914485618035762e-06,
"loss": 1.4845,
"step": 1913
},
{
"epoch": 0.04464048838653439,
"grad_norm": 3.442282199859619,
"learning_rate": 9.919668307851777e-06,
"loss": 1.0979,
"step": 1914
},
{
"epoch": 0.044663811525712305,
"grad_norm": 1.8255623579025269,
"learning_rate": 9.924850997667791e-06,
"loss": 1.6663,
"step": 1915
},
{
"epoch": 0.04468713466489022,
"grad_norm": 1.7657500505447388,
"learning_rate": 9.930033687483804e-06,
"loss": 1.765,
"step": 1916
},
{
"epoch": 0.04471045780406814,
"grad_norm": 1.6761666536331177,
"learning_rate": 9.935216377299818e-06,
"loss": 1.3525,
"step": 1917
},
{
"epoch": 0.04473378094324606,
"grad_norm": 2.3319602012634277,
"learning_rate": 9.940399067115834e-06,
"loss": 1.1265,
"step": 1918
},
{
"epoch": 0.044757104082423976,
"grad_norm": 1.6062688827514648,
"learning_rate": 9.945581756931848e-06,
"loss": 1.6085,
"step": 1919
},
{
"epoch": 0.044780427221601894,
"grad_norm": 1.4931232929229736,
"learning_rate": 9.950764446747863e-06,
"loss": 1.6418,
"step": 1920
},
{
"epoch": 0.04480375036077981,
"grad_norm": 2.0092151165008545,
"learning_rate": 9.955947136563877e-06,
"loss": 1.2352,
"step": 1921
},
{
"epoch": 0.04482707349995773,
"grad_norm": 2.2695815563201904,
"learning_rate": 9.961129826379892e-06,
"loss": 1.3626,
"step": 1922
},
{
"epoch": 0.04485039663913565,
"grad_norm": 1.6969548463821411,
"learning_rate": 9.966312516195906e-06,
"loss": 1.6971,
"step": 1923
},
{
"epoch": 0.04487371977831356,
"grad_norm": 1.8436291217803955,
"learning_rate": 9.97149520601192e-06,
"loss": 1.7701,
"step": 1924
},
{
"epoch": 0.04489704291749148,
"grad_norm": 1.7749122381210327,
"learning_rate": 9.976677895827935e-06,
"loss": 1.3771,
"step": 1925
},
{
"epoch": 0.044920366056669395,
"grad_norm": 1.9239168167114258,
"learning_rate": 9.98186058564395e-06,
"loss": 1.7554,
"step": 1926
},
{
"epoch": 0.04494368919584731,
"grad_norm": 1.5236059427261353,
"learning_rate": 9.987043275459965e-06,
"loss": 1.337,
"step": 1927
},
{
"epoch": 0.04496701233502523,
"grad_norm": 2.0506536960601807,
"learning_rate": 9.99222596527598e-06,
"loss": 1.7227,
"step": 1928
},
{
"epoch": 0.04499033547420315,
"grad_norm": 1.4491156339645386,
"learning_rate": 9.997408655091994e-06,
"loss": 1.4032,
"step": 1929
},
{
"epoch": 0.045013658613381066,
"grad_norm": 2.175860643386841,
"learning_rate": 1.0002591344908008e-05,
"loss": 1.5994,
"step": 1930
},
{
"epoch": 0.045036981752558984,
"grad_norm": 1.9326441287994385,
"learning_rate": 1.0007774034724022e-05,
"loss": 1.1194,
"step": 1931
},
{
"epoch": 0.0450603048917369,
"grad_norm": 1.8562779426574707,
"learning_rate": 1.0012956724540037e-05,
"loss": 1.7551,
"step": 1932
},
{
"epoch": 0.04508362803091482,
"grad_norm": 1.7570141553878784,
"learning_rate": 1.0018139414356051e-05,
"loss": 0.97,
"step": 1933
},
{
"epoch": 0.04510695117009274,
"grad_norm": 1.2578299045562744,
"learning_rate": 1.0023322104172067e-05,
"loss": 1.3259,
"step": 1934
},
{
"epoch": 0.045130274309270656,
"grad_norm": 2.211773633956909,
"learning_rate": 1.0028504793988081e-05,
"loss": 1.6072,
"step": 1935
},
{
"epoch": 0.045153597448448574,
"grad_norm": 1.7696832418441772,
"learning_rate": 1.0033687483804096e-05,
"loss": 1.4227,
"step": 1936
},
{
"epoch": 0.04517692058762649,
"grad_norm": 1.940531611442566,
"learning_rate": 1.003887017362011e-05,
"loss": 1.8458,
"step": 1937
},
{
"epoch": 0.04520024372680441,
"grad_norm": 2.282905101776123,
"learning_rate": 1.0044052863436124e-05,
"loss": 1.2556,
"step": 1938
},
{
"epoch": 0.04522356686598233,
"grad_norm": 1.643122673034668,
"learning_rate": 1.0049235553252139e-05,
"loss": 1.5571,
"step": 1939
},
{
"epoch": 0.045246890005160245,
"grad_norm": 1.6886086463928223,
"learning_rate": 1.0054418243068153e-05,
"loss": 1.481,
"step": 1940
},
{
"epoch": 0.04527021314433816,
"grad_norm": 2.349867105484009,
"learning_rate": 1.0059600932884168e-05,
"loss": 1.8651,
"step": 1941
},
{
"epoch": 0.04529353628351608,
"grad_norm": 2.0965826511383057,
"learning_rate": 1.0064783622700184e-05,
"loss": 1.5702,
"step": 1942
},
{
"epoch": 0.045316859422694,
"grad_norm": 1.4684425592422485,
"learning_rate": 1.0069966312516198e-05,
"loss": 1.4283,
"step": 1943
},
{
"epoch": 0.04534018256187192,
"grad_norm": 3.0096945762634277,
"learning_rate": 1.0075149002332212e-05,
"loss": 1.4832,
"step": 1944
},
{
"epoch": 0.045363505701049835,
"grad_norm": 2.2389118671417236,
"learning_rate": 1.0080331692148227e-05,
"loss": 1.6346,
"step": 1945
},
{
"epoch": 0.04538682884022775,
"grad_norm": 1.7624162435531616,
"learning_rate": 1.0085514381964241e-05,
"loss": 1.7017,
"step": 1946
},
{
"epoch": 0.04541015197940567,
"grad_norm": 1.8136117458343506,
"learning_rate": 1.0090697071780255e-05,
"loss": 1.5987,
"step": 1947
},
{
"epoch": 0.04543347511858359,
"grad_norm": 1.678236484527588,
"learning_rate": 1.009587976159627e-05,
"loss": 1.3684,
"step": 1948
},
{
"epoch": 0.045456798257761506,
"grad_norm": 1.7862106561660767,
"learning_rate": 1.0101062451412284e-05,
"loss": 1.7998,
"step": 1949
},
{
"epoch": 0.045480121396939424,
"grad_norm": 2.0441555976867676,
"learning_rate": 1.0106245141228298e-05,
"loss": 1.2902,
"step": 1950
},
{
"epoch": 0.04550344453611734,
"grad_norm": 1.5820708274841309,
"learning_rate": 1.0111427831044314e-05,
"loss": 1.2032,
"step": 1951
},
{
"epoch": 0.04552676767529525,
"grad_norm": 1.4560632705688477,
"learning_rate": 1.0116610520860329e-05,
"loss": 1.5599,
"step": 1952
},
{
"epoch": 0.04555009081447317,
"grad_norm": 2.3671185970306396,
"learning_rate": 1.0121793210676343e-05,
"loss": 1.6144,
"step": 1953
},
{
"epoch": 0.04557341395365109,
"grad_norm": 1.7525554895401,
"learning_rate": 1.0126975900492357e-05,
"loss": 1.913,
"step": 1954
},
{
"epoch": 0.04559673709282901,
"grad_norm": 1.2725483179092407,
"learning_rate": 1.0132158590308372e-05,
"loss": 1.2048,
"step": 1955
},
{
"epoch": 0.045620060232006925,
"grad_norm": 1.8041915893554688,
"learning_rate": 1.0137341280124386e-05,
"loss": 1.1796,
"step": 1956
},
{
"epoch": 0.04564338337118484,
"grad_norm": 2.3629374504089355,
"learning_rate": 1.01425239699404e-05,
"loss": 1.8434,
"step": 1957
},
{
"epoch": 0.04566670651036276,
"grad_norm": 1.3975788354873657,
"learning_rate": 1.0147706659756413e-05,
"loss": 1.5474,
"step": 1958
},
{
"epoch": 0.04569002964954068,
"grad_norm": 1.4148329496383667,
"learning_rate": 1.0152889349572427e-05,
"loss": 1.4695,
"step": 1959
},
{
"epoch": 0.045713352788718596,
"grad_norm": 3.3544209003448486,
"learning_rate": 1.0158072039388442e-05,
"loss": 1.6851,
"step": 1960
},
{
"epoch": 0.045736675927896514,
"grad_norm": 1.795784592628479,
"learning_rate": 1.0163254729204458e-05,
"loss": 1.2823,
"step": 1961
},
{
"epoch": 0.04575999906707443,
"grad_norm": 2.3135123252868652,
"learning_rate": 1.0168437419020472e-05,
"loss": 1.7222,
"step": 1962
},
{
"epoch": 0.04578332220625235,
"grad_norm": 1.62346351146698,
"learning_rate": 1.0173620108836487e-05,
"loss": 1.3822,
"step": 1963
},
{
"epoch": 0.04580664534543027,
"grad_norm": 1.9713786840438843,
"learning_rate": 1.0178802798652501e-05,
"loss": 1.1212,
"step": 1964
},
{
"epoch": 0.045829968484608186,
"grad_norm": 1.5502241849899292,
"learning_rate": 1.0183985488468515e-05,
"loss": 1.0937,
"step": 1965
},
{
"epoch": 0.045853291623786104,
"grad_norm": 1.893622875213623,
"learning_rate": 1.018916817828453e-05,
"loss": 1.7849,
"step": 1966
},
{
"epoch": 0.04587661476296402,
"grad_norm": 1.7515870332717896,
"learning_rate": 1.0194350868100544e-05,
"loss": 1.4724,
"step": 1967
},
{
"epoch": 0.04589993790214194,
"grad_norm": 1.7589161396026611,
"learning_rate": 1.0199533557916558e-05,
"loss": 1.4281,
"step": 1968
},
{
"epoch": 0.04592326104131986,
"grad_norm": 2.377809762954712,
"learning_rate": 1.0204716247732573e-05,
"loss": 1.0402,
"step": 1969
},
{
"epoch": 0.045946584180497775,
"grad_norm": 1.6169410943984985,
"learning_rate": 1.0209898937548589e-05,
"loss": 1.2902,
"step": 1970
},
{
"epoch": 0.04596990731967569,
"grad_norm": 1.7550357580184937,
"learning_rate": 1.0215081627364603e-05,
"loss": 1.274,
"step": 1971
},
{
"epoch": 0.04599323045885361,
"grad_norm": 1.846411943435669,
"learning_rate": 1.0220264317180617e-05,
"loss": 1.2554,
"step": 1972
},
{
"epoch": 0.04601655359803153,
"grad_norm": 1.880225419998169,
"learning_rate": 1.0225447006996632e-05,
"loss": 1.3451,
"step": 1973
},
{
"epoch": 0.04603987673720945,
"grad_norm": 1.6644784212112427,
"learning_rate": 1.0230629696812646e-05,
"loss": 1.5651,
"step": 1974
},
{
"epoch": 0.046063199876387365,
"grad_norm": 1.2287671566009521,
"learning_rate": 1.023581238662866e-05,
"loss": 1.2272,
"step": 1975
},
{
"epoch": 0.04608652301556528,
"grad_norm": 5.595534801483154,
"learning_rate": 1.0240995076444675e-05,
"loss": 1.2381,
"step": 1976
},
{
"epoch": 0.0461098461547432,
"grad_norm": 1.6219606399536133,
"learning_rate": 1.0246177766260689e-05,
"loss": 1.1357,
"step": 1977
},
{
"epoch": 0.04613316929392112,
"grad_norm": 1.7713710069656372,
"learning_rate": 1.0251360456076705e-05,
"loss": 1.3458,
"step": 1978
},
{
"epoch": 0.046156492433099036,
"grad_norm": 1.6285533905029297,
"learning_rate": 1.025654314589272e-05,
"loss": 1.6516,
"step": 1979
},
{
"epoch": 0.046179815572276954,
"grad_norm": 1.479745864868164,
"learning_rate": 1.0261725835708734e-05,
"loss": 1.2629,
"step": 1980
},
{
"epoch": 0.046203138711454865,
"grad_norm": 1.6205228567123413,
"learning_rate": 1.0266908525524748e-05,
"loss": 1.6772,
"step": 1981
},
{
"epoch": 0.04622646185063278,
"grad_norm": 1.845969319343567,
"learning_rate": 1.0272091215340763e-05,
"loss": 1.7172,
"step": 1982
},
{
"epoch": 0.0462497849898107,
"grad_norm": 1.71135413646698,
"learning_rate": 1.0277273905156777e-05,
"loss": 1.3776,
"step": 1983
},
{
"epoch": 0.04627310812898862,
"grad_norm": 1.5999668836593628,
"learning_rate": 1.0282456594972791e-05,
"loss": 1.3148,
"step": 1984
},
{
"epoch": 0.04629643126816654,
"grad_norm": 2.372850179672241,
"learning_rate": 1.0287639284788806e-05,
"loss": 1.5203,
"step": 1985
},
{
"epoch": 0.046319754407344454,
"grad_norm": 1.9471055269241333,
"learning_rate": 1.0292821974604822e-05,
"loss": 1.1852,
"step": 1986
},
{
"epoch": 0.04634307754652237,
"grad_norm": 2.035149574279785,
"learning_rate": 1.0298004664420836e-05,
"loss": 1.5986,
"step": 1987
},
{
"epoch": 0.04636640068570029,
"grad_norm": 1.9274436235427856,
"learning_rate": 1.030318735423685e-05,
"loss": 1.3578,
"step": 1988
},
{
"epoch": 0.04638972382487821,
"grad_norm": 1.8304780721664429,
"learning_rate": 1.0308370044052865e-05,
"loss": 1.2624,
"step": 1989
},
{
"epoch": 0.046413046964056126,
"grad_norm": 2.2276337146759033,
"learning_rate": 1.0313552733868879e-05,
"loss": 1.5508,
"step": 1990
},
{
"epoch": 0.046436370103234044,
"grad_norm": 1.7837759256362915,
"learning_rate": 1.0318735423684893e-05,
"loss": 1.4839,
"step": 1991
},
{
"epoch": 0.04645969324241196,
"grad_norm": 1.766287088394165,
"learning_rate": 1.0323918113500908e-05,
"loss": 1.7001,
"step": 1992
},
{
"epoch": 0.04648301638158988,
"grad_norm": 1.6771559715270996,
"learning_rate": 1.0329100803316922e-05,
"loss": 1.6349,
"step": 1993
},
{
"epoch": 0.0465063395207678,
"grad_norm": 1.7568877935409546,
"learning_rate": 1.0334283493132938e-05,
"loss": 1.4524,
"step": 1994
},
{
"epoch": 0.046529662659945716,
"grad_norm": 2.070405960083008,
"learning_rate": 1.0339466182948952e-05,
"loss": 1.3437,
"step": 1995
},
{
"epoch": 0.04655298579912363,
"grad_norm": 2.852936267852783,
"learning_rate": 1.0344648872764967e-05,
"loss": 1.2623,
"step": 1996
},
{
"epoch": 0.04657630893830155,
"grad_norm": 1.3660649061203003,
"learning_rate": 1.0349831562580981e-05,
"loss": 1.3146,
"step": 1997
},
{
"epoch": 0.04659963207747947,
"grad_norm": 1.672303318977356,
"learning_rate": 1.0355014252396995e-05,
"loss": 1.3361,
"step": 1998
},
{
"epoch": 0.04662295521665739,
"grad_norm": 1.6566362380981445,
"learning_rate": 1.036019694221301e-05,
"loss": 1.4374,
"step": 1999
},
{
"epoch": 0.046646278355835305,
"grad_norm": 1.6957907676696777,
"learning_rate": 1.0365379632029024e-05,
"loss": 1.4639,
"step": 2000
}
],
"logging_steps": 1,
"max_steps": 128625,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.082516707749724e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}