MedPodReasoner / checkpoint-1000 /trainer_state.json
shuyuej's picture
Upload folder using huggingface_hub
d1f61b1 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.023323139177917653,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.332313917791765e-05,
"grad_norm": 1.6235620975494385,
"learning_rate": 5.182689816014512e-09,
"loss": 1.9275,
"step": 1
},
{
"epoch": 4.66462783558353e-05,
"grad_norm": 1.5710082054138184,
"learning_rate": 1.0365379632029025e-08,
"loss": 1.5593,
"step": 2
},
{
"epoch": 6.996941753375295e-05,
"grad_norm": 2.3231985569000244,
"learning_rate": 1.5548069448043534e-08,
"loss": 2.0021,
"step": 3
},
{
"epoch": 9.32925567116706e-05,
"grad_norm": 1.8349288702011108,
"learning_rate": 2.073075926405805e-08,
"loss": 2.1141,
"step": 4
},
{
"epoch": 0.00011661569588958826,
"grad_norm": 2.039928436279297,
"learning_rate": 2.5913449080072562e-08,
"loss": 1.9361,
"step": 5
},
{
"epoch": 0.0001399388350675059,
"grad_norm": 1.8988783359527588,
"learning_rate": 3.109613889608707e-08,
"loss": 2.2441,
"step": 6
},
{
"epoch": 0.00016326197424542356,
"grad_norm": 1.4865813255310059,
"learning_rate": 3.6278828712101586e-08,
"loss": 1.8118,
"step": 7
},
{
"epoch": 0.0001865851134233412,
"grad_norm": 1.4033368825912476,
"learning_rate": 4.14615185281161e-08,
"loss": 1.8838,
"step": 8
},
{
"epoch": 0.00020990825260125886,
"grad_norm": 1.876894235610962,
"learning_rate": 4.6644208344130604e-08,
"loss": 1.9916,
"step": 9
},
{
"epoch": 0.00023323139177917651,
"grad_norm": 2.4104366302490234,
"learning_rate": 5.1826898160145123e-08,
"loss": 1.8618,
"step": 10
},
{
"epoch": 0.0002565545309570942,
"grad_norm": 1.8457229137420654,
"learning_rate": 5.700958797615963e-08,
"loss": 1.7303,
"step": 11
},
{
"epoch": 0.0002798776701350118,
"grad_norm": 1.940317988395691,
"learning_rate": 6.219227779217413e-08,
"loss": 2.2692,
"step": 12
},
{
"epoch": 0.0003032008093129295,
"grad_norm": 2.455432891845703,
"learning_rate": 6.737496760818865e-08,
"loss": 2.3401,
"step": 13
},
{
"epoch": 0.0003265239484908471,
"grad_norm": 1.5163850784301758,
"learning_rate": 7.255765742420317e-08,
"loss": 2.1687,
"step": 14
},
{
"epoch": 0.0003498470876687648,
"grad_norm": 1.3012642860412598,
"learning_rate": 7.774034724021768e-08,
"loss": 1.8693,
"step": 15
},
{
"epoch": 0.0003731702268466824,
"grad_norm": 2.0896522998809814,
"learning_rate": 8.29230370562322e-08,
"loss": 1.7031,
"step": 16
},
{
"epoch": 0.0003964933660246001,
"grad_norm": 1.7818728685379028,
"learning_rate": 8.810572687224672e-08,
"loss": 2.0829,
"step": 17
},
{
"epoch": 0.0004198165052025177,
"grad_norm": 2.569828510284424,
"learning_rate": 9.328841668826121e-08,
"loss": 1.8998,
"step": 18
},
{
"epoch": 0.0004431396443804354,
"grad_norm": 1.4619100093841553,
"learning_rate": 9.847110650427573e-08,
"loss": 1.5964,
"step": 19
},
{
"epoch": 0.00046646278355835303,
"grad_norm": 1.9832793474197388,
"learning_rate": 1.0365379632029025e-07,
"loss": 1.9292,
"step": 20
},
{
"epoch": 0.0004897859227362707,
"grad_norm": 2.0182175636291504,
"learning_rate": 1.0883648613630475e-07,
"loss": 2.0115,
"step": 21
},
{
"epoch": 0.0005131090619141884,
"grad_norm": 1.4642307758331299,
"learning_rate": 1.1401917595231926e-07,
"loss": 2.0291,
"step": 22
},
{
"epoch": 0.000536432201092106,
"grad_norm": 2.887909173965454,
"learning_rate": 1.1920186576833378e-07,
"loss": 2.1946,
"step": 23
},
{
"epoch": 0.0005597553402700236,
"grad_norm": 1.595544457435608,
"learning_rate": 1.2438455558434827e-07,
"loss": 2.0246,
"step": 24
},
{
"epoch": 0.0005830784794479413,
"grad_norm": 1.5648566484451294,
"learning_rate": 1.295672454003628e-07,
"loss": 2.1832,
"step": 25
},
{
"epoch": 0.000606401618625859,
"grad_norm": 1.4702372550964355,
"learning_rate": 1.347499352163773e-07,
"loss": 1.6395,
"step": 26
},
{
"epoch": 0.0006297247578037766,
"grad_norm": 1.7178195714950562,
"learning_rate": 1.399326250323918e-07,
"loss": 1.6264,
"step": 27
},
{
"epoch": 0.0006530478969816942,
"grad_norm": 2.1751515865325928,
"learning_rate": 1.4511531484840635e-07,
"loss": 2.511,
"step": 28
},
{
"epoch": 0.0006763710361596119,
"grad_norm": 2.9443299770355225,
"learning_rate": 1.5029800466442085e-07,
"loss": 2.229,
"step": 29
},
{
"epoch": 0.0006996941753375296,
"grad_norm": 1.8316481113433838,
"learning_rate": 1.5548069448043536e-07,
"loss": 1.8414,
"step": 30
},
{
"epoch": 0.0007230173145154472,
"grad_norm": 1.9659239053726196,
"learning_rate": 1.6066338429644986e-07,
"loss": 2.0109,
"step": 31
},
{
"epoch": 0.0007463404536933648,
"grad_norm": 2.1653449535369873,
"learning_rate": 1.658460741124644e-07,
"loss": 2.0155,
"step": 32
},
{
"epoch": 0.0007696635928712825,
"grad_norm": 1.8755710124969482,
"learning_rate": 1.710287639284789e-07,
"loss": 2.1105,
"step": 33
},
{
"epoch": 0.0007929867320492002,
"grad_norm": 1.5989196300506592,
"learning_rate": 1.7621145374449343e-07,
"loss": 2.1583,
"step": 34
},
{
"epoch": 0.0008163098712271178,
"grad_norm": 1.865307331085205,
"learning_rate": 1.813941435605079e-07,
"loss": 2.001,
"step": 35
},
{
"epoch": 0.0008396330104050355,
"grad_norm": 1.4584789276123047,
"learning_rate": 1.8657683337652242e-07,
"loss": 1.8854,
"step": 36
},
{
"epoch": 0.0008629561495829531,
"grad_norm": 2.6818912029266357,
"learning_rate": 1.9175952319253695e-07,
"loss": 2.1888,
"step": 37
},
{
"epoch": 0.0008862792887608708,
"grad_norm": 2.17561674118042,
"learning_rate": 1.9694221300855146e-07,
"loss": 1.9616,
"step": 38
},
{
"epoch": 0.0009096024279387884,
"grad_norm": 1.252475619316101,
"learning_rate": 2.02124902824566e-07,
"loss": 1.9585,
"step": 39
},
{
"epoch": 0.0009329255671167061,
"grad_norm": 1.884366750717163,
"learning_rate": 2.073075926405805e-07,
"loss": 2.2436,
"step": 40
},
{
"epoch": 0.0009562487062946237,
"grad_norm": 1.4951350688934326,
"learning_rate": 2.1249028245659497e-07,
"loss": 1.7149,
"step": 41
},
{
"epoch": 0.0009795718454725414,
"grad_norm": 1.891728162765503,
"learning_rate": 2.176729722726095e-07,
"loss": 2.0472,
"step": 42
},
{
"epoch": 0.001002894984650459,
"grad_norm": 1.8992432355880737,
"learning_rate": 2.22855662088624e-07,
"loss": 2.1471,
"step": 43
},
{
"epoch": 0.0010262181238283768,
"grad_norm": 1.3931283950805664,
"learning_rate": 2.2803835190463852e-07,
"loss": 1.5292,
"step": 44
},
{
"epoch": 0.0010495412630062942,
"grad_norm": 1.8894548416137695,
"learning_rate": 2.3322104172065305e-07,
"loss": 1.7759,
"step": 45
},
{
"epoch": 0.001072864402184212,
"grad_norm": 1.592050552368164,
"learning_rate": 2.3840373153666755e-07,
"loss": 2.2498,
"step": 46
},
{
"epoch": 0.0010961875413621296,
"grad_norm": 1.3746178150177002,
"learning_rate": 2.4358642135268203e-07,
"loss": 1.8503,
"step": 47
},
{
"epoch": 0.0011195106805400473,
"grad_norm": 2.0268595218658447,
"learning_rate": 2.4876911116869654e-07,
"loss": 1.9358,
"step": 48
},
{
"epoch": 0.001142833819717965,
"grad_norm": 1.7836228609085083,
"learning_rate": 2.539518009847111e-07,
"loss": 1.9855,
"step": 49
},
{
"epoch": 0.0011661569588958826,
"grad_norm": 1.829447627067566,
"learning_rate": 2.591344908007256e-07,
"loss": 2.2802,
"step": 50
},
{
"epoch": 0.0011894800980738003,
"grad_norm": 2.2813496589660645,
"learning_rate": 2.643171806167401e-07,
"loss": 2.1593,
"step": 51
},
{
"epoch": 0.001212803237251718,
"grad_norm": 3.019044876098633,
"learning_rate": 2.694998704327546e-07,
"loss": 1.9534,
"step": 52
},
{
"epoch": 0.0012361263764296354,
"grad_norm": 2.011425256729126,
"learning_rate": 2.746825602487691e-07,
"loss": 2.1284,
"step": 53
},
{
"epoch": 0.0012594495156075531,
"grad_norm": 2.207106590270996,
"learning_rate": 2.798652500647836e-07,
"loss": 2.2427,
"step": 54
},
{
"epoch": 0.0012827726547854708,
"grad_norm": 1.3172473907470703,
"learning_rate": 2.8504793988079813e-07,
"loss": 1.9782,
"step": 55
},
{
"epoch": 0.0013060957939633885,
"grad_norm": 1.522895097732544,
"learning_rate": 2.902306296968127e-07,
"loss": 1.9455,
"step": 56
},
{
"epoch": 0.0013294189331413062,
"grad_norm": 2.657248020172119,
"learning_rate": 2.954133195128272e-07,
"loss": 1.959,
"step": 57
},
{
"epoch": 0.0013527420723192238,
"grad_norm": 1.9738789796829224,
"learning_rate": 3.005960093288417e-07,
"loss": 1.7878,
"step": 58
},
{
"epoch": 0.0013760652114971415,
"grad_norm": 1.5549254417419434,
"learning_rate": 3.057786991448562e-07,
"loss": 1.9405,
"step": 59
},
{
"epoch": 0.0013993883506750592,
"grad_norm": 2.9688899517059326,
"learning_rate": 3.109613889608707e-07,
"loss": 1.9969,
"step": 60
},
{
"epoch": 0.0014227114898529767,
"grad_norm": 1.4602586030960083,
"learning_rate": 3.1614407877688527e-07,
"loss": 1.9339,
"step": 61
},
{
"epoch": 0.0014460346290308943,
"grad_norm": 2.4017045497894287,
"learning_rate": 3.213267685928997e-07,
"loss": 2.0842,
"step": 62
},
{
"epoch": 0.001469357768208812,
"grad_norm": 1.7433497905731201,
"learning_rate": 3.2650945840891423e-07,
"loss": 2.0223,
"step": 63
},
{
"epoch": 0.0014926809073867297,
"grad_norm": 1.7395591735839844,
"learning_rate": 3.316921482249288e-07,
"loss": 1.9257,
"step": 64
},
{
"epoch": 0.0015160040465646474,
"grad_norm": 1.8336257934570312,
"learning_rate": 3.3687483804094324e-07,
"loss": 1.948,
"step": 65
},
{
"epoch": 0.001539327185742565,
"grad_norm": 1.6493985652923584,
"learning_rate": 3.420575278569578e-07,
"loss": 1.8672,
"step": 66
},
{
"epoch": 0.0015626503249204827,
"grad_norm": 1.5789337158203125,
"learning_rate": 3.472402176729723e-07,
"loss": 1.9446,
"step": 67
},
{
"epoch": 0.0015859734640984004,
"grad_norm": 1.3755509853363037,
"learning_rate": 3.5242290748898686e-07,
"loss": 2.1796,
"step": 68
},
{
"epoch": 0.001609296603276318,
"grad_norm": 1.7978087663650513,
"learning_rate": 3.576055973050013e-07,
"loss": 1.8974,
"step": 69
},
{
"epoch": 0.0016326197424542355,
"grad_norm": 1.8888216018676758,
"learning_rate": 3.627882871210158e-07,
"loss": 1.915,
"step": 70
},
{
"epoch": 0.0016559428816321532,
"grad_norm": 2.6150593757629395,
"learning_rate": 3.679709769370304e-07,
"loss": 2.2133,
"step": 71
},
{
"epoch": 0.001679266020810071,
"grad_norm": 1.7009005546569824,
"learning_rate": 3.7315366675304483e-07,
"loss": 2.1024,
"step": 72
},
{
"epoch": 0.0017025891599879886,
"grad_norm": 1.741734266281128,
"learning_rate": 3.783363565690594e-07,
"loss": 2.1839,
"step": 73
},
{
"epoch": 0.0017259122991659063,
"grad_norm": 2.7715041637420654,
"learning_rate": 3.835190463850739e-07,
"loss": 2.0734,
"step": 74
},
{
"epoch": 0.001749235438343824,
"grad_norm": 1.9710502624511719,
"learning_rate": 3.8870173620108835e-07,
"loss": 2.18,
"step": 75
},
{
"epoch": 0.0017725585775217416,
"grad_norm": 2.077986478805542,
"learning_rate": 3.938844260171029e-07,
"loss": 2.1482,
"step": 76
},
{
"epoch": 0.0017958817166996593,
"grad_norm": 2.583721160888672,
"learning_rate": 3.990671158331174e-07,
"loss": 2.5364,
"step": 77
},
{
"epoch": 0.0018192048558775768,
"grad_norm": 1.3425930738449097,
"learning_rate": 4.04249805649132e-07,
"loss": 1.8194,
"step": 78
},
{
"epoch": 0.0018425279950554944,
"grad_norm": 2.1111888885498047,
"learning_rate": 4.0943249546514643e-07,
"loss": 1.7878,
"step": 79
},
{
"epoch": 0.0018658511342334121,
"grad_norm": 2.0795626640319824,
"learning_rate": 4.14615185281161e-07,
"loss": 2.3006,
"step": 80
},
{
"epoch": 0.0018891742734113298,
"grad_norm": 1.273370623588562,
"learning_rate": 4.197978750971755e-07,
"loss": 1.7599,
"step": 81
},
{
"epoch": 0.0019124974125892475,
"grad_norm": 1.6202706098556519,
"learning_rate": 4.2498056491318994e-07,
"loss": 2.1727,
"step": 82
},
{
"epoch": 0.0019358205517671651,
"grad_norm": 2.4593732357025146,
"learning_rate": 4.301632547292045e-07,
"loss": 2.4588,
"step": 83
},
{
"epoch": 0.001959143690945083,
"grad_norm": 1.2617835998535156,
"learning_rate": 4.35345944545219e-07,
"loss": 1.9078,
"step": 84
},
{
"epoch": 0.0019824668301230003,
"grad_norm": 2.2640504837036133,
"learning_rate": 4.405286343612335e-07,
"loss": 1.8983,
"step": 85
},
{
"epoch": 0.002005789969300918,
"grad_norm": 1.6804454326629639,
"learning_rate": 4.45711324177248e-07,
"loss": 2.1049,
"step": 86
},
{
"epoch": 0.0020291131084788356,
"grad_norm": 2.060009717941284,
"learning_rate": 4.5089401399326253e-07,
"loss": 2.0153,
"step": 87
},
{
"epoch": 0.0020524362476567535,
"grad_norm": 1.7166160345077515,
"learning_rate": 4.5607670380927703e-07,
"loss": 2.1093,
"step": 88
},
{
"epoch": 0.002075759386834671,
"grad_norm": 1.6695979833602905,
"learning_rate": 4.6125939362529154e-07,
"loss": 1.8607,
"step": 89
},
{
"epoch": 0.0020990825260125885,
"grad_norm": 1.4339056015014648,
"learning_rate": 4.664420834413061e-07,
"loss": 2.2632,
"step": 90
},
{
"epoch": 0.0021224056651905064,
"grad_norm": 1.5228222608566284,
"learning_rate": 4.7162477325732055e-07,
"loss": 2.0851,
"step": 91
},
{
"epoch": 0.002145728804368424,
"grad_norm": 1.540848731994629,
"learning_rate": 4.768074630733351e-07,
"loss": 2.1446,
"step": 92
},
{
"epoch": 0.0021690519435463417,
"grad_norm": 1.480702519416809,
"learning_rate": 4.819901528893496e-07,
"loss": 2.0718,
"step": 93
},
{
"epoch": 0.002192375082724259,
"grad_norm": 2.23518705368042,
"learning_rate": 4.871728427053641e-07,
"loss": 1.6198,
"step": 94
},
{
"epoch": 0.002215698221902177,
"grad_norm": 1.6477755308151245,
"learning_rate": 4.923555325213786e-07,
"loss": 2.1136,
"step": 95
},
{
"epoch": 0.0022390213610800945,
"grad_norm": 1.9548614025115967,
"learning_rate": 4.975382223373931e-07,
"loss": 1.9143,
"step": 96
},
{
"epoch": 0.0022623445002580124,
"grad_norm": 1.3557407855987549,
"learning_rate": 5.027209121534076e-07,
"loss": 2.0044,
"step": 97
},
{
"epoch": 0.00228566763943593,
"grad_norm": 2.2781455516815186,
"learning_rate": 5.079036019694222e-07,
"loss": 1.7761,
"step": 98
},
{
"epoch": 0.0023089907786138474,
"grad_norm": 2.1195600032806396,
"learning_rate": 5.130862917854368e-07,
"loss": 1.8174,
"step": 99
},
{
"epoch": 0.0023323139177917653,
"grad_norm": 2.0798068046569824,
"learning_rate": 5.182689816014512e-07,
"loss": 2.1431,
"step": 100
},
{
"epoch": 0.0023556370569696827,
"grad_norm": 1.8773006200790405,
"learning_rate": 5.234516714174657e-07,
"loss": 1.5221,
"step": 101
},
{
"epoch": 0.0023789601961476006,
"grad_norm": 1.7917876243591309,
"learning_rate": 5.286343612334802e-07,
"loss": 1.9383,
"step": 102
},
{
"epoch": 0.002402283335325518,
"grad_norm": 1.4980329275131226,
"learning_rate": 5.338170510494947e-07,
"loss": 1.846,
"step": 103
},
{
"epoch": 0.002425606474503436,
"grad_norm": 2.0081095695495605,
"learning_rate": 5.389997408655092e-07,
"loss": 1.8777,
"step": 104
},
{
"epoch": 0.0024489296136813534,
"grad_norm": 1.525317907333374,
"learning_rate": 5.441824306815238e-07,
"loss": 1.971,
"step": 105
},
{
"epoch": 0.002472252752859271,
"grad_norm": 1.4131786823272705,
"learning_rate": 5.493651204975382e-07,
"loss": 2.2224,
"step": 106
},
{
"epoch": 0.002495575892037189,
"grad_norm": 1.164492130279541,
"learning_rate": 5.545478103135528e-07,
"loss": 1.8909,
"step": 107
},
{
"epoch": 0.0025188990312151062,
"grad_norm": 1.9998016357421875,
"learning_rate": 5.597305001295673e-07,
"loss": 2.1197,
"step": 108
},
{
"epoch": 0.002542222170393024,
"grad_norm": 1.6218236684799194,
"learning_rate": 5.649131899455818e-07,
"loss": 1.7799,
"step": 109
},
{
"epoch": 0.0025655453095709416,
"grad_norm": 1.535388708114624,
"learning_rate": 5.700958797615963e-07,
"loss": 1.7878,
"step": 110
},
{
"epoch": 0.0025888684487488595,
"grad_norm": 1.4929994344711304,
"learning_rate": 5.752785695776108e-07,
"loss": 2.0802,
"step": 111
},
{
"epoch": 0.002612191587926777,
"grad_norm": 2.183293104171753,
"learning_rate": 5.804612593936254e-07,
"loss": 2.0506,
"step": 112
},
{
"epoch": 0.002635514727104695,
"grad_norm": 1.6339191198349,
"learning_rate": 5.856439492096398e-07,
"loss": 1.7152,
"step": 113
},
{
"epoch": 0.0026588378662826123,
"grad_norm": 1.4886974096298218,
"learning_rate": 5.908266390256544e-07,
"loss": 1.8327,
"step": 114
},
{
"epoch": 0.0026821610054605298,
"grad_norm": 1.4198302030563354,
"learning_rate": 5.960093288416688e-07,
"loss": 1.8342,
"step": 115
},
{
"epoch": 0.0027054841446384477,
"grad_norm": 2.041900157928467,
"learning_rate": 6.011920186576834e-07,
"loss": 1.9101,
"step": 116
},
{
"epoch": 0.002728807283816365,
"grad_norm": 1.7576725482940674,
"learning_rate": 6.063747084736979e-07,
"loss": 2.3793,
"step": 117
},
{
"epoch": 0.002752130422994283,
"grad_norm": 1.620440125465393,
"learning_rate": 6.115573982897124e-07,
"loss": 1.7363,
"step": 118
},
{
"epoch": 0.0027754535621722005,
"grad_norm": 1.972102403640747,
"learning_rate": 6.16740088105727e-07,
"loss": 2.0338,
"step": 119
},
{
"epoch": 0.0027987767013501184,
"grad_norm": 1.5385342836380005,
"learning_rate": 6.219227779217414e-07,
"loss": 1.829,
"step": 120
},
{
"epoch": 0.002822099840528036,
"grad_norm": 1.4439769983291626,
"learning_rate": 6.27105467737756e-07,
"loss": 1.9893,
"step": 121
},
{
"epoch": 0.0028454229797059533,
"grad_norm": 1.5146026611328125,
"learning_rate": 6.322881575537705e-07,
"loss": 1.6563,
"step": 122
},
{
"epoch": 0.002868746118883871,
"grad_norm": 1.7177401781082153,
"learning_rate": 6.374708473697849e-07,
"loss": 1.9483,
"step": 123
},
{
"epoch": 0.0028920692580617887,
"grad_norm": 2.484865188598633,
"learning_rate": 6.426535371857994e-07,
"loss": 2.0949,
"step": 124
},
{
"epoch": 0.0029153923972397066,
"grad_norm": 1.5320651531219482,
"learning_rate": 6.47836227001814e-07,
"loss": 1.8557,
"step": 125
},
{
"epoch": 0.002938715536417624,
"grad_norm": 1.3804417848587036,
"learning_rate": 6.530189168178285e-07,
"loss": 1.8733,
"step": 126
},
{
"epoch": 0.002962038675595542,
"grad_norm": 2.0832831859588623,
"learning_rate": 6.58201606633843e-07,
"loss": 1.8556,
"step": 127
},
{
"epoch": 0.0029853618147734594,
"grad_norm": 1.2582931518554688,
"learning_rate": 6.633842964498576e-07,
"loss": 2.1239,
"step": 128
},
{
"epoch": 0.0030086849539513773,
"grad_norm": 1.6449629068374634,
"learning_rate": 6.685669862658721e-07,
"loss": 2.1635,
"step": 129
},
{
"epoch": 0.0030320080931292947,
"grad_norm": 1.3350502252578735,
"learning_rate": 6.737496760818865e-07,
"loss": 1.801,
"step": 130
},
{
"epoch": 0.003055331232307212,
"grad_norm": 1.7689651250839233,
"learning_rate": 6.78932365897901e-07,
"loss": 1.7541,
"step": 131
},
{
"epoch": 0.00307865437148513,
"grad_norm": 1.4711276292800903,
"learning_rate": 6.841150557139156e-07,
"loss": 2.3916,
"step": 132
},
{
"epoch": 0.0031019775106630476,
"grad_norm": 1.2806516885757446,
"learning_rate": 6.892977455299301e-07,
"loss": 1.8609,
"step": 133
},
{
"epoch": 0.0031253006498409655,
"grad_norm": 1.5531939268112183,
"learning_rate": 6.944804353459446e-07,
"loss": 1.7721,
"step": 134
},
{
"epoch": 0.003148623789018883,
"grad_norm": 1.6541032791137695,
"learning_rate": 6.996631251619592e-07,
"loss": 2.1091,
"step": 135
},
{
"epoch": 0.003171946928196801,
"grad_norm": 2.050734281539917,
"learning_rate": 7.048458149779737e-07,
"loss": 1.8932,
"step": 136
},
{
"epoch": 0.0031952700673747183,
"grad_norm": 1.2903157472610474,
"learning_rate": 7.100285047939881e-07,
"loss": 2.0833,
"step": 137
},
{
"epoch": 0.003218593206552636,
"grad_norm": 1.3316091299057007,
"learning_rate": 7.152111946100026e-07,
"loss": 1.9307,
"step": 138
},
{
"epoch": 0.0032419163457305536,
"grad_norm": 1.441341519355774,
"learning_rate": 7.203938844260172e-07,
"loss": 2.2529,
"step": 139
},
{
"epoch": 0.003265239484908471,
"grad_norm": 2.159276008605957,
"learning_rate": 7.255765742420316e-07,
"loss": 1.847,
"step": 140
},
{
"epoch": 0.003288562624086389,
"grad_norm": 1.8410853147506714,
"learning_rate": 7.307592640580462e-07,
"loss": 2.2465,
"step": 141
},
{
"epoch": 0.0033118857632643064,
"grad_norm": 1.8678739070892334,
"learning_rate": 7.359419538740608e-07,
"loss": 1.9261,
"step": 142
},
{
"epoch": 0.0033352089024422243,
"grad_norm": 1.2097922563552856,
"learning_rate": 7.411246436900751e-07,
"loss": 2.0205,
"step": 143
},
{
"epoch": 0.003358532041620142,
"grad_norm": 1.733077883720398,
"learning_rate": 7.463073335060897e-07,
"loss": 1.8389,
"step": 144
},
{
"epoch": 0.0033818551807980597,
"grad_norm": 1.7118474245071411,
"learning_rate": 7.514900233221042e-07,
"loss": 1.9511,
"step": 145
},
{
"epoch": 0.003405178319975977,
"grad_norm": 1.6960872411727905,
"learning_rate": 7.566727131381188e-07,
"loss": 1.8828,
"step": 146
},
{
"epoch": 0.0034285014591538946,
"grad_norm": 1.2409390211105347,
"learning_rate": 7.618554029541332e-07,
"loss": 1.6878,
"step": 147
},
{
"epoch": 0.0034518245983318125,
"grad_norm": 1.3440965414047241,
"learning_rate": 7.670380927701478e-07,
"loss": 1.64,
"step": 148
},
{
"epoch": 0.00347514773750973,
"grad_norm": 1.539393663406372,
"learning_rate": 7.722207825861624e-07,
"loss": 1.6754,
"step": 149
},
{
"epoch": 0.003498470876687648,
"grad_norm": 1.5395653247833252,
"learning_rate": 7.774034724021767e-07,
"loss": 1.9761,
"step": 150
},
{
"epoch": 0.0035217940158655653,
"grad_norm": 2.0169472694396973,
"learning_rate": 7.825861622181913e-07,
"loss": 1.6927,
"step": 151
},
{
"epoch": 0.0035451171550434832,
"grad_norm": 1.8776079416275024,
"learning_rate": 7.877688520342058e-07,
"loss": 1.9273,
"step": 152
},
{
"epoch": 0.0035684402942214007,
"grad_norm": 2.078824043273926,
"learning_rate": 7.929515418502204e-07,
"loss": 1.6756,
"step": 153
},
{
"epoch": 0.0035917634333993186,
"grad_norm": 1.407560110092163,
"learning_rate": 7.981342316662348e-07,
"loss": 1.6038,
"step": 154
},
{
"epoch": 0.003615086572577236,
"grad_norm": 1.1770573854446411,
"learning_rate": 8.033169214822494e-07,
"loss": 1.6679,
"step": 155
},
{
"epoch": 0.0036384097117551535,
"grad_norm": 1.2057602405548096,
"learning_rate": 8.08499611298264e-07,
"loss": 1.7916,
"step": 156
},
{
"epoch": 0.0036617328509330714,
"grad_norm": 1.117970585823059,
"learning_rate": 8.136823011142783e-07,
"loss": 1.7974,
"step": 157
},
{
"epoch": 0.003685055990110989,
"grad_norm": 1.5996465682983398,
"learning_rate": 8.188649909302929e-07,
"loss": 1.6053,
"step": 158
},
{
"epoch": 0.0037083791292889068,
"grad_norm": 1.4170929193496704,
"learning_rate": 8.240476807463074e-07,
"loss": 1.7155,
"step": 159
},
{
"epoch": 0.0037317022684668242,
"grad_norm": 1.8114391565322876,
"learning_rate": 8.29230370562322e-07,
"loss": 1.9192,
"step": 160
},
{
"epoch": 0.003755025407644742,
"grad_norm": 1.3462793827056885,
"learning_rate": 8.344130603783364e-07,
"loss": 1.4624,
"step": 161
},
{
"epoch": 0.0037783485468226596,
"grad_norm": 1.6305956840515137,
"learning_rate": 8.39595750194351e-07,
"loss": 1.8017,
"step": 162
},
{
"epoch": 0.003801671686000577,
"grad_norm": 1.662576675415039,
"learning_rate": 8.447784400103655e-07,
"loss": 1.733,
"step": 163
},
{
"epoch": 0.003824994825178495,
"grad_norm": 1.556788682937622,
"learning_rate": 8.499611298263799e-07,
"loss": 1.9586,
"step": 164
},
{
"epoch": 0.0038483179643564124,
"grad_norm": 1.5282272100448608,
"learning_rate": 8.551438196423944e-07,
"loss": 1.8254,
"step": 165
},
{
"epoch": 0.0038716411035343303,
"grad_norm": 1.6790592670440674,
"learning_rate": 8.60326509458409e-07,
"loss": 2.1866,
"step": 166
},
{
"epoch": 0.0038949642427122478,
"grad_norm": 1.5164263248443604,
"learning_rate": 8.655091992744236e-07,
"loss": 1.6651,
"step": 167
},
{
"epoch": 0.003918287381890166,
"grad_norm": 1.5002336502075195,
"learning_rate": 8.70691889090438e-07,
"loss": 1.9295,
"step": 168
},
{
"epoch": 0.0039416105210680836,
"grad_norm": 1.2122441530227661,
"learning_rate": 8.758745789064526e-07,
"loss": 1.761,
"step": 169
},
{
"epoch": 0.003964933660246001,
"grad_norm": 1.637898564338684,
"learning_rate": 8.81057268722467e-07,
"loss": 1.8697,
"step": 170
},
{
"epoch": 0.0039882567994239185,
"grad_norm": 0.988777220249176,
"learning_rate": 8.862399585384815e-07,
"loss": 2.1249,
"step": 171
},
{
"epoch": 0.004011579938601836,
"grad_norm": 1.8833587169647217,
"learning_rate": 8.91422648354496e-07,
"loss": 1.6915,
"step": 172
},
{
"epoch": 0.004034903077779753,
"grad_norm": 1.8418108224868774,
"learning_rate": 8.966053381705106e-07,
"loss": 2.0019,
"step": 173
},
{
"epoch": 0.004058226216957671,
"grad_norm": 1.6375901699066162,
"learning_rate": 9.017880279865251e-07,
"loss": 1.7625,
"step": 174
},
{
"epoch": 0.004081549356135589,
"grad_norm": 1.8701720237731934,
"learning_rate": 9.069707178025396e-07,
"loss": 1.801,
"step": 175
},
{
"epoch": 0.004104872495313507,
"grad_norm": 1.4488773345947266,
"learning_rate": 9.121534076185541e-07,
"loss": 1.9971,
"step": 176
},
{
"epoch": 0.004128195634491424,
"grad_norm": 0.9587986469268799,
"learning_rate": 9.173360974345686e-07,
"loss": 1.6253,
"step": 177
},
{
"epoch": 0.004151518773669342,
"grad_norm": 2.6533186435699463,
"learning_rate": 9.225187872505831e-07,
"loss": 1.572,
"step": 178
},
{
"epoch": 0.00417484191284726,
"grad_norm": 2.4528841972351074,
"learning_rate": 9.277014770665976e-07,
"loss": 1.7586,
"step": 179
},
{
"epoch": 0.004198165052025177,
"grad_norm": 1.1871824264526367,
"learning_rate": 9.328841668826122e-07,
"loss": 1.6765,
"step": 180
},
{
"epoch": 0.004221488191203095,
"grad_norm": 1.1292660236358643,
"learning_rate": 9.380668566986266e-07,
"loss": 2.0673,
"step": 181
},
{
"epoch": 0.004244811330381013,
"grad_norm": 1.3055285215377808,
"learning_rate": 9.432495465146411e-07,
"loss": 1.8103,
"step": 182
},
{
"epoch": 0.004268134469558931,
"grad_norm": 1.5225868225097656,
"learning_rate": 9.484322363306557e-07,
"loss": 2.0813,
"step": 183
},
{
"epoch": 0.004291457608736848,
"grad_norm": 1.2439767122268677,
"learning_rate": 9.536149261466702e-07,
"loss": 1.6919,
"step": 184
},
{
"epoch": 0.0043147807479147655,
"grad_norm": 1.2424002885818481,
"learning_rate": 9.587976159626847e-07,
"loss": 1.9506,
"step": 185
},
{
"epoch": 0.0043381038870926834,
"grad_norm": 0.9796323776245117,
"learning_rate": 9.639803057786992e-07,
"loss": 1.7342,
"step": 186
},
{
"epoch": 0.0043614270262706005,
"grad_norm": 1.2240192890167236,
"learning_rate": 9.691629955947138e-07,
"loss": 2.0646,
"step": 187
},
{
"epoch": 0.004384750165448518,
"grad_norm": 0.8779449462890625,
"learning_rate": 9.743456854107281e-07,
"loss": 1.4535,
"step": 188
},
{
"epoch": 0.004408073304626436,
"grad_norm": 1.3131407499313354,
"learning_rate": 9.795283752267427e-07,
"loss": 1.9817,
"step": 189
},
{
"epoch": 0.004431396443804354,
"grad_norm": 1.3259912729263306,
"learning_rate": 9.847110650427573e-07,
"loss": 1.709,
"step": 190
},
{
"epoch": 0.004454719582982271,
"grad_norm": 1.4236465692520142,
"learning_rate": 9.898937548587718e-07,
"loss": 1.7059,
"step": 191
},
{
"epoch": 0.004478042722160189,
"grad_norm": 1.2791959047317505,
"learning_rate": 9.950764446747862e-07,
"loss": 1.9633,
"step": 192
},
{
"epoch": 0.004501365861338107,
"grad_norm": 0.9857053160667419,
"learning_rate": 1.0002591344908007e-06,
"loss": 1.807,
"step": 193
},
{
"epoch": 0.004524689000516025,
"grad_norm": 1.264302372932434,
"learning_rate": 1.0054418243068153e-06,
"loss": 1.5389,
"step": 194
},
{
"epoch": 0.004548012139693942,
"grad_norm": 1.2205390930175781,
"learning_rate": 1.0106245141228298e-06,
"loss": 1.4549,
"step": 195
},
{
"epoch": 0.00457133527887186,
"grad_norm": 1.055471420288086,
"learning_rate": 1.0158072039388444e-06,
"loss": 1.6931,
"step": 196
},
{
"epoch": 0.004594658418049778,
"grad_norm": 1.0585546493530273,
"learning_rate": 1.020989893754859e-06,
"loss": 1.8054,
"step": 197
},
{
"epoch": 0.004617981557227695,
"grad_norm": 2.16025972366333,
"learning_rate": 1.0261725835708735e-06,
"loss": 2.0077,
"step": 198
},
{
"epoch": 0.004641304696405613,
"grad_norm": 2.125786781311035,
"learning_rate": 1.0313552733868879e-06,
"loss": 1.9117,
"step": 199
},
{
"epoch": 0.0046646278355835305,
"grad_norm": 1.3560391664505005,
"learning_rate": 1.0365379632029024e-06,
"loss": 1.9871,
"step": 200
},
{
"epoch": 0.004687950974761448,
"grad_norm": 1.3505181074142456,
"learning_rate": 1.041720653018917e-06,
"loss": 1.714,
"step": 201
},
{
"epoch": 0.004711274113939365,
"grad_norm": 1.1724427938461304,
"learning_rate": 1.0469033428349313e-06,
"loss": 1.7611,
"step": 202
},
{
"epoch": 0.004734597253117283,
"grad_norm": 1.1746799945831299,
"learning_rate": 1.0520860326509459e-06,
"loss": 1.867,
"step": 203
},
{
"epoch": 0.004757920392295201,
"grad_norm": 1.0976382493972778,
"learning_rate": 1.0572687224669604e-06,
"loss": 1.808,
"step": 204
},
{
"epoch": 0.004781243531473118,
"grad_norm": 1.3842298984527588,
"learning_rate": 1.062451412282975e-06,
"loss": 1.7973,
"step": 205
},
{
"epoch": 0.004804566670651036,
"grad_norm": 1.6715288162231445,
"learning_rate": 1.0676341020989893e-06,
"loss": 1.9817,
"step": 206
},
{
"epoch": 0.004827889809828954,
"grad_norm": 1.0734590291976929,
"learning_rate": 1.072816791915004e-06,
"loss": 1.4297,
"step": 207
},
{
"epoch": 0.004851212949006872,
"grad_norm": 1.0182546377182007,
"learning_rate": 1.0779994817310185e-06,
"loss": 1.713,
"step": 208
},
{
"epoch": 0.004874536088184789,
"grad_norm": 1.1884313821792603,
"learning_rate": 1.083182171547033e-06,
"loss": 1.5234,
"step": 209
},
{
"epoch": 0.004897859227362707,
"grad_norm": 1.520266056060791,
"learning_rate": 1.0883648613630476e-06,
"loss": 2.0598,
"step": 210
},
{
"epoch": 0.004921182366540625,
"grad_norm": 1.1709904670715332,
"learning_rate": 1.0935475511790621e-06,
"loss": 2.1461,
"step": 211
},
{
"epoch": 0.004944505505718542,
"grad_norm": 1.2634027004241943,
"learning_rate": 1.0987302409950765e-06,
"loss": 1.5076,
"step": 212
},
{
"epoch": 0.00496782864489646,
"grad_norm": 1.490717887878418,
"learning_rate": 1.103912930811091e-06,
"loss": 1.8628,
"step": 213
},
{
"epoch": 0.004991151784074378,
"grad_norm": 2.077373743057251,
"learning_rate": 1.1090956206271056e-06,
"loss": 1.9295,
"step": 214
},
{
"epoch": 0.0050144749232522955,
"grad_norm": 1.647877812385559,
"learning_rate": 1.1142783104431202e-06,
"loss": 1.7929,
"step": 215
},
{
"epoch": 0.0050377980624302125,
"grad_norm": 1.1937353610992432,
"learning_rate": 1.1194610002591345e-06,
"loss": 1.6509,
"step": 216
},
{
"epoch": 0.00506112120160813,
"grad_norm": 1.0805108547210693,
"learning_rate": 1.124643690075149e-06,
"loss": 1.6447,
"step": 217
},
{
"epoch": 0.005084444340786048,
"grad_norm": 1.1077872514724731,
"learning_rate": 1.1298263798911636e-06,
"loss": 1.7675,
"step": 218
},
{
"epoch": 0.005107767479963966,
"grad_norm": 0.8648241758346558,
"learning_rate": 1.135009069707178e-06,
"loss": 1.6687,
"step": 219
},
{
"epoch": 0.005131090619141883,
"grad_norm": 1.0522700548171997,
"learning_rate": 1.1401917595231925e-06,
"loss": 1.2878,
"step": 220
},
{
"epoch": 0.005154413758319801,
"grad_norm": 1.3021256923675537,
"learning_rate": 1.145374449339207e-06,
"loss": 1.8535,
"step": 221
},
{
"epoch": 0.005177736897497719,
"grad_norm": 1.2912962436676025,
"learning_rate": 1.1505571391552216e-06,
"loss": 1.865,
"step": 222
},
{
"epoch": 0.005201060036675636,
"grad_norm": 1.6733994483947754,
"learning_rate": 1.1557398289712362e-06,
"loss": 1.5748,
"step": 223
},
{
"epoch": 0.005224383175853554,
"grad_norm": 1.0865724086761475,
"learning_rate": 1.1609225187872508e-06,
"loss": 1.8159,
"step": 224
},
{
"epoch": 0.005247706315031472,
"grad_norm": 1.1498301029205322,
"learning_rate": 1.1661052086032653e-06,
"loss": 1.8579,
"step": 225
},
{
"epoch": 0.00527102945420939,
"grad_norm": 1.9360573291778564,
"learning_rate": 1.1712878984192797e-06,
"loss": 1.7366,
"step": 226
},
{
"epoch": 0.005294352593387307,
"grad_norm": 1.0133939981460571,
"learning_rate": 1.1764705882352942e-06,
"loss": 1.4571,
"step": 227
},
{
"epoch": 0.005317675732565225,
"grad_norm": 1.6443811655044556,
"learning_rate": 1.1816532780513088e-06,
"loss": 1.5312,
"step": 228
},
{
"epoch": 0.0053409988717431425,
"grad_norm": 1.1923338174819946,
"learning_rate": 1.1868359678673233e-06,
"loss": 1.6993,
"step": 229
},
{
"epoch": 0.0053643220109210596,
"grad_norm": 1.0345349311828613,
"learning_rate": 1.1920186576833377e-06,
"loss": 1.5739,
"step": 230
},
{
"epoch": 0.0053876451500989775,
"grad_norm": 0.9833806753158569,
"learning_rate": 1.1972013474993522e-06,
"loss": 1.819,
"step": 231
},
{
"epoch": 0.005410968289276895,
"grad_norm": 1.3315545320510864,
"learning_rate": 1.2023840373153668e-06,
"loss": 1.9472,
"step": 232
},
{
"epoch": 0.005434291428454813,
"grad_norm": 1.0042314529418945,
"learning_rate": 1.2075667271313812e-06,
"loss": 1.993,
"step": 233
},
{
"epoch": 0.00545761456763273,
"grad_norm": 1.2731118202209473,
"learning_rate": 1.2127494169473957e-06,
"loss": 1.6763,
"step": 234
},
{
"epoch": 0.005480937706810648,
"grad_norm": 0.9664155840873718,
"learning_rate": 1.2179321067634103e-06,
"loss": 1.3091,
"step": 235
},
{
"epoch": 0.005504260845988566,
"grad_norm": 1.6930897235870361,
"learning_rate": 1.2231147965794248e-06,
"loss": 1.6111,
"step": 236
},
{
"epoch": 0.005527583985166483,
"grad_norm": 0.9807016253471375,
"learning_rate": 1.2282974863954394e-06,
"loss": 1.6131,
"step": 237
},
{
"epoch": 0.005550907124344401,
"grad_norm": 1.321951150894165,
"learning_rate": 1.233480176211454e-06,
"loss": 1.242,
"step": 238
},
{
"epoch": 0.005574230263522319,
"grad_norm": 1.1465637683868408,
"learning_rate": 1.2386628660274685e-06,
"loss": 1.7035,
"step": 239
},
{
"epoch": 0.005597553402700237,
"grad_norm": 2.4264347553253174,
"learning_rate": 1.2438455558434829e-06,
"loss": 1.9859,
"step": 240
},
{
"epoch": 0.005620876541878154,
"grad_norm": 1.429149866104126,
"learning_rate": 1.2490282456594974e-06,
"loss": 1.8249,
"step": 241
},
{
"epoch": 0.005644199681056072,
"grad_norm": 1.1119049787521362,
"learning_rate": 1.254210935475512e-06,
"loss": 1.8005,
"step": 242
},
{
"epoch": 0.00566752282023399,
"grad_norm": 1.9002227783203125,
"learning_rate": 1.2593936252915265e-06,
"loss": 1.6951,
"step": 243
},
{
"epoch": 0.005690845959411907,
"grad_norm": 1.067659854888916,
"learning_rate": 1.264576315107541e-06,
"loss": 1.799,
"step": 244
},
{
"epoch": 0.0057141690985898245,
"grad_norm": 1.2947990894317627,
"learning_rate": 1.2697590049235552e-06,
"loss": 1.7837,
"step": 245
},
{
"epoch": 0.005737492237767742,
"grad_norm": 1.0790272951126099,
"learning_rate": 1.2749416947395698e-06,
"loss": 1.67,
"step": 246
},
{
"epoch": 0.00576081537694566,
"grad_norm": 1.3589330911636353,
"learning_rate": 1.2801243845555843e-06,
"loss": 1.9282,
"step": 247
},
{
"epoch": 0.005784138516123577,
"grad_norm": 1.4140998125076294,
"learning_rate": 1.285307074371599e-06,
"loss": 1.6708,
"step": 248
},
{
"epoch": 0.005807461655301495,
"grad_norm": 1.000994086265564,
"learning_rate": 1.2904897641876135e-06,
"loss": 1.4077,
"step": 249
},
{
"epoch": 0.005830784794479413,
"grad_norm": 1.3655062913894653,
"learning_rate": 1.295672454003628e-06,
"loss": 1.8862,
"step": 250
},
{
"epoch": 0.005854107933657331,
"grad_norm": 1.1164065599441528,
"learning_rate": 1.3008551438196426e-06,
"loss": 1.528,
"step": 251
},
{
"epoch": 0.005877431072835248,
"grad_norm": 1.1792149543762207,
"learning_rate": 1.306037833635657e-06,
"loss": 1.2879,
"step": 252
},
{
"epoch": 0.005900754212013166,
"grad_norm": 2.236320734024048,
"learning_rate": 1.3112205234516715e-06,
"loss": 1.4929,
"step": 253
},
{
"epoch": 0.005924077351191084,
"grad_norm": 1.8795088529586792,
"learning_rate": 1.316403213267686e-06,
"loss": 1.2468,
"step": 254
},
{
"epoch": 0.005947400490369001,
"grad_norm": 1.2248806953430176,
"learning_rate": 1.3215859030837006e-06,
"loss": 1.769,
"step": 255
},
{
"epoch": 0.005970723629546919,
"grad_norm": 1.252236008644104,
"learning_rate": 1.3267685928997152e-06,
"loss": 1.9014,
"step": 256
},
{
"epoch": 0.005994046768724837,
"grad_norm": 1.3926386833190918,
"learning_rate": 1.3319512827157297e-06,
"loss": 1.9599,
"step": 257
},
{
"epoch": 0.0060173699079027546,
"grad_norm": 1.5681990385055542,
"learning_rate": 1.3371339725317443e-06,
"loss": 1.8109,
"step": 258
},
{
"epoch": 0.006040693047080672,
"grad_norm": 1.6841275691986084,
"learning_rate": 1.3423166623477584e-06,
"loss": 1.4601,
"step": 259
},
{
"epoch": 0.0060640161862585895,
"grad_norm": 1.5262291431427002,
"learning_rate": 1.347499352163773e-06,
"loss": 1.6493,
"step": 260
},
{
"epoch": 0.006087339325436507,
"grad_norm": 1.0905576944351196,
"learning_rate": 1.3526820419797875e-06,
"loss": 2.0847,
"step": 261
},
{
"epoch": 0.006110662464614424,
"grad_norm": 1.4682683944702148,
"learning_rate": 1.357864731795802e-06,
"loss": 1.6889,
"step": 262
},
{
"epoch": 0.006133985603792342,
"grad_norm": 1.1054515838623047,
"learning_rate": 1.3630474216118166e-06,
"loss": 1.55,
"step": 263
},
{
"epoch": 0.00615730874297026,
"grad_norm": 1.3931388854980469,
"learning_rate": 1.3682301114278312e-06,
"loss": 1.655,
"step": 264
},
{
"epoch": 0.006180631882148178,
"grad_norm": 1.1766420602798462,
"learning_rate": 1.3734128012438458e-06,
"loss": 1.9555,
"step": 265
},
{
"epoch": 0.006203955021326095,
"grad_norm": 1.1652954816818237,
"learning_rate": 1.3785954910598601e-06,
"loss": 1.8446,
"step": 266
},
{
"epoch": 0.006227278160504013,
"grad_norm": 1.378980278968811,
"learning_rate": 1.3837781808758747e-06,
"loss": 1.4449,
"step": 267
},
{
"epoch": 0.006250601299681931,
"grad_norm": 1.2017453908920288,
"learning_rate": 1.3889608706918892e-06,
"loss": 1.6272,
"step": 268
},
{
"epoch": 0.006273924438859848,
"grad_norm": 1.2221115827560425,
"learning_rate": 1.3941435605079038e-06,
"loss": 1.7299,
"step": 269
},
{
"epoch": 0.006297247578037766,
"grad_norm": 1.189775824546814,
"learning_rate": 1.3993262503239183e-06,
"loss": 1.1664,
"step": 270
},
{
"epoch": 0.006320570717215684,
"grad_norm": 1.0103381872177124,
"learning_rate": 1.404508940139933e-06,
"loss": 1.3519,
"step": 271
},
{
"epoch": 0.006343893856393602,
"grad_norm": 1.1243481636047363,
"learning_rate": 1.4096916299559475e-06,
"loss": 1.6704,
"step": 272
},
{
"epoch": 0.006367216995571519,
"grad_norm": 1.8137811422348022,
"learning_rate": 1.4148743197719616e-06,
"loss": 1.279,
"step": 273
},
{
"epoch": 0.0063905401347494365,
"grad_norm": 1.0875202417373657,
"learning_rate": 1.4200570095879762e-06,
"loss": 1.1564,
"step": 274
},
{
"epoch": 0.0064138632739273544,
"grad_norm": 1.0839550495147705,
"learning_rate": 1.4252396994039907e-06,
"loss": 1.7263,
"step": 275
},
{
"epoch": 0.006437186413105272,
"grad_norm": 1.7203173637390137,
"learning_rate": 1.4304223892200053e-06,
"loss": 1.9309,
"step": 276
},
{
"epoch": 0.006460509552283189,
"grad_norm": 1.3320658206939697,
"learning_rate": 1.4356050790360198e-06,
"loss": 1.8276,
"step": 277
},
{
"epoch": 0.006483832691461107,
"grad_norm": 1.5260910987854004,
"learning_rate": 1.4407877688520344e-06,
"loss": 1.413,
"step": 278
},
{
"epoch": 0.006507155830639025,
"grad_norm": 1.2401058673858643,
"learning_rate": 1.445970458668049e-06,
"loss": 1.4087,
"step": 279
},
{
"epoch": 0.006530478969816942,
"grad_norm": 1.2722922563552856,
"learning_rate": 1.4511531484840633e-06,
"loss": 1.6216,
"step": 280
},
{
"epoch": 0.00655380210899486,
"grad_norm": 1.2668229341506958,
"learning_rate": 1.4563358383000779e-06,
"loss": 1.6252,
"step": 281
},
{
"epoch": 0.006577125248172778,
"grad_norm": 1.4556583166122437,
"learning_rate": 1.4615185281160924e-06,
"loss": 2.3276,
"step": 282
},
{
"epoch": 0.006600448387350696,
"grad_norm": 1.537610411643982,
"learning_rate": 1.466701217932107e-06,
"loss": 1.4319,
"step": 283
},
{
"epoch": 0.006623771526528613,
"grad_norm": 1.3130170106887817,
"learning_rate": 1.4718839077481215e-06,
"loss": 1.4978,
"step": 284
},
{
"epoch": 0.006647094665706531,
"grad_norm": 1.5020934343338013,
"learning_rate": 1.477066597564136e-06,
"loss": 1.8697,
"step": 285
},
{
"epoch": 0.006670417804884449,
"grad_norm": 1.6949779987335205,
"learning_rate": 1.4822492873801502e-06,
"loss": 1.7433,
"step": 286
},
{
"epoch": 0.006693740944062366,
"grad_norm": 1.5566325187683105,
"learning_rate": 1.4874319771961648e-06,
"loss": 1.5674,
"step": 287
},
{
"epoch": 0.006717064083240284,
"grad_norm": 1.015093445777893,
"learning_rate": 1.4926146670121793e-06,
"loss": 1.9903,
"step": 288
},
{
"epoch": 0.0067403872224182015,
"grad_norm": 2.229853868484497,
"learning_rate": 1.497797356828194e-06,
"loss": 1.1905,
"step": 289
},
{
"epoch": 0.006763710361596119,
"grad_norm": 1.5241860151290894,
"learning_rate": 1.5029800466442085e-06,
"loss": 1.958,
"step": 290
},
{
"epoch": 0.006787033500774036,
"grad_norm": 0.8666454553604126,
"learning_rate": 1.508162736460223e-06,
"loss": 1.7141,
"step": 291
},
{
"epoch": 0.006810356639951954,
"grad_norm": 1.4594520330429077,
"learning_rate": 1.5133454262762376e-06,
"loss": 1.7235,
"step": 292
},
{
"epoch": 0.006833679779129872,
"grad_norm": 1.3267074823379517,
"learning_rate": 1.518528116092252e-06,
"loss": 1.6172,
"step": 293
},
{
"epoch": 0.006857002918307789,
"grad_norm": 1.5386312007904053,
"learning_rate": 1.5237108059082665e-06,
"loss": 1.4843,
"step": 294
},
{
"epoch": 0.006880326057485707,
"grad_norm": 1.3275539875030518,
"learning_rate": 1.528893495724281e-06,
"loss": 1.5444,
"step": 295
},
{
"epoch": 0.006903649196663625,
"grad_norm": 1.1002707481384277,
"learning_rate": 1.5340761855402956e-06,
"loss": 1.717,
"step": 296
},
{
"epoch": 0.006926972335841543,
"grad_norm": 1.172974944114685,
"learning_rate": 1.5392588753563102e-06,
"loss": 1.6963,
"step": 297
},
{
"epoch": 0.00695029547501946,
"grad_norm": 1.0728440284729004,
"learning_rate": 1.5444415651723247e-06,
"loss": 1.6228,
"step": 298
},
{
"epoch": 0.006973618614197378,
"grad_norm": 1.274348258972168,
"learning_rate": 1.5496242549883393e-06,
"loss": 1.2559,
"step": 299
},
{
"epoch": 0.006996941753375296,
"grad_norm": 1.2520028352737427,
"learning_rate": 1.5548069448043534e-06,
"loss": 1.6118,
"step": 300
},
{
"epoch": 0.007020264892553213,
"grad_norm": 1.5844305753707886,
"learning_rate": 1.559989634620368e-06,
"loss": 1.5645,
"step": 301
},
{
"epoch": 0.007043588031731131,
"grad_norm": 2.285438299179077,
"learning_rate": 1.5651723244363825e-06,
"loss": 1.4541,
"step": 302
},
{
"epoch": 0.007066911170909049,
"grad_norm": 1.2873152494430542,
"learning_rate": 1.570355014252397e-06,
"loss": 1.4835,
"step": 303
},
{
"epoch": 0.0070902343100869665,
"grad_norm": 1.1332640647888184,
"learning_rate": 1.5755377040684116e-06,
"loss": 1.8279,
"step": 304
},
{
"epoch": 0.0071135574492648835,
"grad_norm": 1.6483525037765503,
"learning_rate": 1.5807203938844262e-06,
"loss": 1.2509,
"step": 305
},
{
"epoch": 0.007136880588442801,
"grad_norm": 1.0219485759735107,
"learning_rate": 1.5859030837004408e-06,
"loss": 1.8421,
"step": 306
},
{
"epoch": 0.007160203727620719,
"grad_norm": 1.2478340864181519,
"learning_rate": 1.5910857735164551e-06,
"loss": 1.9144,
"step": 307
},
{
"epoch": 0.007183526866798637,
"grad_norm": 1.4016437530517578,
"learning_rate": 1.5962684633324697e-06,
"loss": 1.5146,
"step": 308
},
{
"epoch": 0.007206850005976554,
"grad_norm": 1.1399790048599243,
"learning_rate": 1.6014511531484842e-06,
"loss": 1.6714,
"step": 309
},
{
"epoch": 0.007230173145154472,
"grad_norm": 2.047961473464966,
"learning_rate": 1.6066338429644988e-06,
"loss": 1.1777,
"step": 310
},
{
"epoch": 0.00725349628433239,
"grad_norm": 1.1410201787948608,
"learning_rate": 1.6118165327805133e-06,
"loss": 1.6783,
"step": 311
},
{
"epoch": 0.007276819423510307,
"grad_norm": 1.2840640544891357,
"learning_rate": 1.616999222596528e-06,
"loss": 1.9351,
"step": 312
},
{
"epoch": 0.007300142562688225,
"grad_norm": 0.9116181135177612,
"learning_rate": 1.6221819124125425e-06,
"loss": 1.7705,
"step": 313
},
{
"epoch": 0.007323465701866143,
"grad_norm": 1.3190463781356812,
"learning_rate": 1.6273646022285566e-06,
"loss": 1.4484,
"step": 314
},
{
"epoch": 0.007346788841044061,
"grad_norm": 0.9988270401954651,
"learning_rate": 1.6325472920445712e-06,
"loss": 1.5159,
"step": 315
},
{
"epoch": 0.007370111980221978,
"grad_norm": 0.8620725870132446,
"learning_rate": 1.6377299818605857e-06,
"loss": 1.5605,
"step": 316
},
{
"epoch": 0.007393435119399896,
"grad_norm": 1.284604549407959,
"learning_rate": 1.6429126716766003e-06,
"loss": 1.4822,
"step": 317
},
{
"epoch": 0.0074167582585778135,
"grad_norm": 1.2546097040176392,
"learning_rate": 1.6480953614926148e-06,
"loss": 1.436,
"step": 318
},
{
"epoch": 0.0074400813977557306,
"grad_norm": 0.9116978645324707,
"learning_rate": 1.6532780513086294e-06,
"loss": 1.2708,
"step": 319
},
{
"epoch": 0.0074634045369336485,
"grad_norm": 0.9910548329353333,
"learning_rate": 1.658460741124644e-06,
"loss": 1.8144,
"step": 320
},
{
"epoch": 0.007486727676111566,
"grad_norm": 1.9879093170166016,
"learning_rate": 1.6636434309406583e-06,
"loss": 1.4826,
"step": 321
},
{
"epoch": 0.007510050815289484,
"grad_norm": 1.0845030546188354,
"learning_rate": 1.6688261207566729e-06,
"loss": 1.3364,
"step": 322
},
{
"epoch": 0.007533373954467401,
"grad_norm": 1.342966079711914,
"learning_rate": 1.6740088105726874e-06,
"loss": 1.6453,
"step": 323
},
{
"epoch": 0.007556697093645319,
"grad_norm": 0.9570252895355225,
"learning_rate": 1.679191500388702e-06,
"loss": 1.5384,
"step": 324
},
{
"epoch": 0.007580020232823237,
"grad_norm": 1.531516671180725,
"learning_rate": 1.6843741902047165e-06,
"loss": 1.5775,
"step": 325
},
{
"epoch": 0.007603343372001154,
"grad_norm": 1.4623240232467651,
"learning_rate": 1.689556880020731e-06,
"loss": 1.7159,
"step": 326
},
{
"epoch": 0.007626666511179072,
"grad_norm": 1.109586238861084,
"learning_rate": 1.6947395698367454e-06,
"loss": 1.7403,
"step": 327
},
{
"epoch": 0.00764998965035699,
"grad_norm": 1.3199604749679565,
"learning_rate": 1.6999222596527598e-06,
"loss": 1.7208,
"step": 328
},
{
"epoch": 0.007673312789534908,
"grad_norm": 1.0979784727096558,
"learning_rate": 1.7051049494687743e-06,
"loss": 1.6097,
"step": 329
},
{
"epoch": 0.007696635928712825,
"grad_norm": 1.0952926874160767,
"learning_rate": 1.710287639284789e-06,
"loss": 1.8262,
"step": 330
},
{
"epoch": 0.007719959067890743,
"grad_norm": 1.1149373054504395,
"learning_rate": 1.7154703291008035e-06,
"loss": 1.5762,
"step": 331
},
{
"epoch": 0.007743282207068661,
"grad_norm": 1.2090753316879272,
"learning_rate": 1.720653018916818e-06,
"loss": 1.6161,
"step": 332
},
{
"epoch": 0.007766605346246578,
"grad_norm": 1.3476163148880005,
"learning_rate": 1.7258357087328326e-06,
"loss": 1.6854,
"step": 333
},
{
"epoch": 0.0077899284854244955,
"grad_norm": 1.3222614526748657,
"learning_rate": 1.7310183985488471e-06,
"loss": 1.5996,
"step": 334
},
{
"epoch": 0.007813251624602413,
"grad_norm": 1.2350871562957764,
"learning_rate": 1.7362010883648615e-06,
"loss": 1.5052,
"step": 335
},
{
"epoch": 0.007836574763780331,
"grad_norm": 1.4628745317459106,
"learning_rate": 1.741383778180876e-06,
"loss": 1.6268,
"step": 336
},
{
"epoch": 0.00785989790295825,
"grad_norm": 1.3481048345565796,
"learning_rate": 1.7465664679968906e-06,
"loss": 1.4308,
"step": 337
},
{
"epoch": 0.007883221042136167,
"grad_norm": 1.0008901357650757,
"learning_rate": 1.7517491578129052e-06,
"loss": 1.6487,
"step": 338
},
{
"epoch": 0.007906544181314083,
"grad_norm": 2.4258437156677246,
"learning_rate": 1.7569318476289195e-06,
"loss": 1.5327,
"step": 339
},
{
"epoch": 0.007929867320492001,
"grad_norm": 1.3444914817810059,
"learning_rate": 1.762114537444934e-06,
"loss": 1.5257,
"step": 340
},
{
"epoch": 0.007953190459669919,
"grad_norm": 2.297591209411621,
"learning_rate": 1.7672972272609486e-06,
"loss": 1.9581,
"step": 341
},
{
"epoch": 0.007976513598847837,
"grad_norm": 1.107711672782898,
"learning_rate": 1.772479917076963e-06,
"loss": 1.3486,
"step": 342
},
{
"epoch": 0.007999836738025755,
"grad_norm": 1.4064106941223145,
"learning_rate": 1.7776626068929775e-06,
"loss": 1.3169,
"step": 343
},
{
"epoch": 0.008023159877203673,
"grad_norm": 1.1236720085144043,
"learning_rate": 1.782845296708992e-06,
"loss": 2.0225,
"step": 344
},
{
"epoch": 0.00804648301638159,
"grad_norm": 1.9214081764221191,
"learning_rate": 1.7880279865250066e-06,
"loss": 1.7269,
"step": 345
},
{
"epoch": 0.008069806155559507,
"grad_norm": 1.1544204950332642,
"learning_rate": 1.7932106763410212e-06,
"loss": 1.8407,
"step": 346
},
{
"epoch": 0.008093129294737425,
"grad_norm": 1.3266545534133911,
"learning_rate": 1.7983933661570358e-06,
"loss": 1.3316,
"step": 347
},
{
"epoch": 0.008116452433915343,
"grad_norm": 1.4208300113677979,
"learning_rate": 1.8035760559730501e-06,
"loss": 1.7712,
"step": 348
},
{
"epoch": 0.00813977557309326,
"grad_norm": 1.1849939823150635,
"learning_rate": 1.8087587457890647e-06,
"loss": 1.3843,
"step": 349
},
{
"epoch": 0.008163098712271178,
"grad_norm": 0.9147690534591675,
"learning_rate": 1.8139414356050792e-06,
"loss": 1.703,
"step": 350
},
{
"epoch": 0.008186421851449096,
"grad_norm": 1.2026822566986084,
"learning_rate": 1.8191241254210938e-06,
"loss": 1.642,
"step": 351
},
{
"epoch": 0.008209744990627014,
"grad_norm": 1.6620279550552368,
"learning_rate": 1.8243068152371081e-06,
"loss": 1.2861,
"step": 352
},
{
"epoch": 0.00823306812980493,
"grad_norm": 1.20318603515625,
"learning_rate": 1.8294895050531227e-06,
"loss": 1.7781,
"step": 353
},
{
"epoch": 0.008256391268982848,
"grad_norm": 1.117148756980896,
"learning_rate": 1.8346721948691372e-06,
"loss": 1.7056,
"step": 354
},
{
"epoch": 0.008279714408160766,
"grad_norm": 1.3435394763946533,
"learning_rate": 1.8398548846851516e-06,
"loss": 1.7352,
"step": 355
},
{
"epoch": 0.008303037547338684,
"grad_norm": 1.6550534963607788,
"learning_rate": 1.8450375745011662e-06,
"loss": 1.4283,
"step": 356
},
{
"epoch": 0.008326360686516602,
"grad_norm": 1.0326530933380127,
"learning_rate": 1.8502202643171807e-06,
"loss": 1.8726,
"step": 357
},
{
"epoch": 0.00834968382569452,
"grad_norm": 1.1237214803695679,
"learning_rate": 1.8554029541331953e-06,
"loss": 1.7547,
"step": 358
},
{
"epoch": 0.008373006964872438,
"grad_norm": 1.3457711935043335,
"learning_rate": 1.8605856439492098e-06,
"loss": 1.5047,
"step": 359
},
{
"epoch": 0.008396330104050354,
"grad_norm": 1.3615081310272217,
"learning_rate": 1.8657683337652244e-06,
"loss": 1.3476,
"step": 360
},
{
"epoch": 0.008419653243228272,
"grad_norm": 1.4443084001541138,
"learning_rate": 1.870951023581239e-06,
"loss": 1.4259,
"step": 361
},
{
"epoch": 0.00844297638240619,
"grad_norm": 0.9154095649719238,
"learning_rate": 1.8761337133972533e-06,
"loss": 1.6089,
"step": 362
},
{
"epoch": 0.008466299521584108,
"grad_norm": 1.1972756385803223,
"learning_rate": 1.8813164032132679e-06,
"loss": 1.5704,
"step": 363
},
{
"epoch": 0.008489622660762025,
"grad_norm": 1.1325738430023193,
"learning_rate": 1.8864990930292822e-06,
"loss": 1.7252,
"step": 364
},
{
"epoch": 0.008512945799939943,
"grad_norm": 1.2257301807403564,
"learning_rate": 1.8916817828452968e-06,
"loss": 1.5124,
"step": 365
},
{
"epoch": 0.008536268939117861,
"grad_norm": 1.7714002132415771,
"learning_rate": 1.8968644726613113e-06,
"loss": 1.5799,
"step": 366
},
{
"epoch": 0.008559592078295777,
"grad_norm": 1.1215579509735107,
"learning_rate": 1.9020471624773259e-06,
"loss": 1.7692,
"step": 367
},
{
"epoch": 0.008582915217473695,
"grad_norm": 1.3264069557189941,
"learning_rate": 1.9072298522933404e-06,
"loss": 1.7848,
"step": 368
},
{
"epoch": 0.008606238356651613,
"grad_norm": 0.9898104667663574,
"learning_rate": 1.912412542109355e-06,
"loss": 1.945,
"step": 369
},
{
"epoch": 0.008629561495829531,
"grad_norm": 0.9507944583892822,
"learning_rate": 1.9175952319253693e-06,
"loss": 1.6469,
"step": 370
},
{
"epoch": 0.008652884635007449,
"grad_norm": 1.1940997838974,
"learning_rate": 1.9227779217413837e-06,
"loss": 1.5144,
"step": 371
},
{
"epoch": 0.008676207774185367,
"grad_norm": 1.2926305532455444,
"learning_rate": 1.9279606115573985e-06,
"loss": 1.6527,
"step": 372
},
{
"epoch": 0.008699530913363285,
"grad_norm": 0.9909786581993103,
"learning_rate": 1.933143301373413e-06,
"loss": 1.8003,
"step": 373
},
{
"epoch": 0.008722854052541201,
"grad_norm": 1.3900662660598755,
"learning_rate": 1.9383259911894276e-06,
"loss": 1.7743,
"step": 374
},
{
"epoch": 0.008746177191719119,
"grad_norm": 0.9942039251327515,
"learning_rate": 1.943508681005442e-06,
"loss": 1.5635,
"step": 375
},
{
"epoch": 0.008769500330897037,
"grad_norm": 1.3887672424316406,
"learning_rate": 1.9486913708214563e-06,
"loss": 1.744,
"step": 376
},
{
"epoch": 0.008792823470074955,
"grad_norm": 1.2873059511184692,
"learning_rate": 1.953874060637471e-06,
"loss": 1.64,
"step": 377
},
{
"epoch": 0.008816146609252873,
"grad_norm": 1.2259247303009033,
"learning_rate": 1.9590567504534854e-06,
"loss": 1.6418,
"step": 378
},
{
"epoch": 0.00883946974843079,
"grad_norm": 1.5709097385406494,
"learning_rate": 1.9642394402695e-06,
"loss": 1.4343,
"step": 379
},
{
"epoch": 0.008862792887608708,
"grad_norm": 1.016625165939331,
"learning_rate": 1.9694221300855145e-06,
"loss": 1.5838,
"step": 380
},
{
"epoch": 0.008886116026786626,
"grad_norm": 1.5763674974441528,
"learning_rate": 1.9746048199015293e-06,
"loss": 1.3391,
"step": 381
},
{
"epoch": 0.008909439165964542,
"grad_norm": 1.014722466468811,
"learning_rate": 1.9797875097175436e-06,
"loss": 1.7185,
"step": 382
},
{
"epoch": 0.00893276230514246,
"grad_norm": 1.5255705118179321,
"learning_rate": 1.984970199533558e-06,
"loss": 1.5749,
"step": 383
},
{
"epoch": 0.008956085444320378,
"grad_norm": 1.4036648273468018,
"learning_rate": 1.9901528893495723e-06,
"loss": 1.4134,
"step": 384
},
{
"epoch": 0.008979408583498296,
"grad_norm": 1.327813982963562,
"learning_rate": 1.995335579165587e-06,
"loss": 1.8475,
"step": 385
},
{
"epoch": 0.009002731722676214,
"grad_norm": 1.357269287109375,
"learning_rate": 2.0005182689816014e-06,
"loss": 1.4145,
"step": 386
},
{
"epoch": 0.009026054861854132,
"grad_norm": 1.4663738012313843,
"learning_rate": 2.005700958797616e-06,
"loss": 1.5207,
"step": 387
},
{
"epoch": 0.00904937800103205,
"grad_norm": 0.9792691469192505,
"learning_rate": 2.0108836486136305e-06,
"loss": 1.7392,
"step": 388
},
{
"epoch": 0.009072701140209966,
"grad_norm": 1.9074856042861938,
"learning_rate": 2.0160663384296453e-06,
"loss": 1.5931,
"step": 389
},
{
"epoch": 0.009096024279387884,
"grad_norm": 1.562455654144287,
"learning_rate": 2.0212490282456597e-06,
"loss": 1.3503,
"step": 390
},
{
"epoch": 0.009119347418565802,
"grad_norm": 1.6827714443206787,
"learning_rate": 2.026431718061674e-06,
"loss": 1.8409,
"step": 391
},
{
"epoch": 0.00914267055774372,
"grad_norm": 0.969691276550293,
"learning_rate": 2.0316144078776888e-06,
"loss": 1.5167,
"step": 392
},
{
"epoch": 0.009165993696921637,
"grad_norm": 1.1107996702194214,
"learning_rate": 2.036797097693703e-06,
"loss": 1.5723,
"step": 393
},
{
"epoch": 0.009189316836099555,
"grad_norm": 0.9862359762191772,
"learning_rate": 2.041979787509718e-06,
"loss": 1.1188,
"step": 394
},
{
"epoch": 0.009212639975277473,
"grad_norm": 1.4997074604034424,
"learning_rate": 2.0471624773257322e-06,
"loss": 1.6742,
"step": 395
},
{
"epoch": 0.00923596311445539,
"grad_norm": 1.1336885690689087,
"learning_rate": 2.052345167141747e-06,
"loss": 1.5602,
"step": 396
},
{
"epoch": 0.009259286253633307,
"grad_norm": 1.4929397106170654,
"learning_rate": 2.057527856957761e-06,
"loss": 1.4891,
"step": 397
},
{
"epoch": 0.009282609392811225,
"grad_norm": 1.3118637800216675,
"learning_rate": 2.0627105467737757e-06,
"loss": 1.5758,
"step": 398
},
{
"epoch": 0.009305932531989143,
"grad_norm": 1.1043623685836792,
"learning_rate": 2.06789323658979e-06,
"loss": 1.9455,
"step": 399
},
{
"epoch": 0.009329255671167061,
"grad_norm": 1.3472813367843628,
"learning_rate": 2.073075926405805e-06,
"loss": 1.4657,
"step": 400
},
{
"epoch": 0.009352578810344979,
"grad_norm": 1.5614628791809082,
"learning_rate": 2.078258616221819e-06,
"loss": 1.3351,
"step": 401
},
{
"epoch": 0.009375901949522897,
"grad_norm": 1.393477439880371,
"learning_rate": 2.083441306037834e-06,
"loss": 1.8887,
"step": 402
},
{
"epoch": 0.009399225088700813,
"grad_norm": 1.0576095581054688,
"learning_rate": 2.0886239958538483e-06,
"loss": 1.7814,
"step": 403
},
{
"epoch": 0.00942254822787873,
"grad_norm": 1.5161347389221191,
"learning_rate": 2.0938066856698626e-06,
"loss": 1.2316,
"step": 404
},
{
"epoch": 0.009445871367056649,
"grad_norm": 1.05890691280365,
"learning_rate": 2.0989893754858774e-06,
"loss": 1.5303,
"step": 405
},
{
"epoch": 0.009469194506234567,
"grad_norm": 0.801816463470459,
"learning_rate": 2.1041720653018918e-06,
"loss": 1.5165,
"step": 406
},
{
"epoch": 0.009492517645412485,
"grad_norm": 1.2811832427978516,
"learning_rate": 2.1093547551179065e-06,
"loss": 1.8638,
"step": 407
},
{
"epoch": 0.009515840784590402,
"grad_norm": 1.2984956502914429,
"learning_rate": 2.114537444933921e-06,
"loss": 1.4195,
"step": 408
},
{
"epoch": 0.00953916392376832,
"grad_norm": 2.3772926330566406,
"learning_rate": 2.1197201347499356e-06,
"loss": 1.2616,
"step": 409
},
{
"epoch": 0.009562487062946236,
"grad_norm": 1.102181315422058,
"learning_rate": 2.12490282456595e-06,
"loss": 1.6683,
"step": 410
},
{
"epoch": 0.009585810202124154,
"grad_norm": 1.4473963975906372,
"learning_rate": 2.1300855143819643e-06,
"loss": 1.6474,
"step": 411
},
{
"epoch": 0.009609133341302072,
"grad_norm": 2.3995816707611084,
"learning_rate": 2.1352682041979787e-06,
"loss": 1.6203,
"step": 412
},
{
"epoch": 0.00963245648047999,
"grad_norm": 0.9490773677825928,
"learning_rate": 2.1404508940139935e-06,
"loss": 1.8082,
"step": 413
},
{
"epoch": 0.009655779619657908,
"grad_norm": 0.9358771443367004,
"learning_rate": 2.145633583830008e-06,
"loss": 1.5929,
"step": 414
},
{
"epoch": 0.009679102758835826,
"grad_norm": 0.9875616431236267,
"learning_rate": 2.1508162736460226e-06,
"loss": 1.4312,
"step": 415
},
{
"epoch": 0.009702425898013744,
"grad_norm": 1.197416067123413,
"learning_rate": 2.155998963462037e-06,
"loss": 1.3165,
"step": 416
},
{
"epoch": 0.00972574903719166,
"grad_norm": 2.0210750102996826,
"learning_rate": 2.1611816532780513e-06,
"loss": 1.4962,
"step": 417
},
{
"epoch": 0.009749072176369578,
"grad_norm": 1.2700085639953613,
"learning_rate": 2.166364343094066e-06,
"loss": 1.6101,
"step": 418
},
{
"epoch": 0.009772395315547496,
"grad_norm": 1.124679684638977,
"learning_rate": 2.1715470329100804e-06,
"loss": 1.7477,
"step": 419
},
{
"epoch": 0.009795718454725414,
"grad_norm": 1.178290843963623,
"learning_rate": 2.176729722726095e-06,
"loss": 1.4108,
"step": 420
},
{
"epoch": 0.009819041593903332,
"grad_norm": 1.792117953300476,
"learning_rate": 2.1819124125421095e-06,
"loss": 1.5568,
"step": 421
},
{
"epoch": 0.00984236473308125,
"grad_norm": 1.7381610870361328,
"learning_rate": 2.1870951023581243e-06,
"loss": 1.3229,
"step": 422
},
{
"epoch": 0.009865687872259167,
"grad_norm": 1.023553490638733,
"learning_rate": 2.1922777921741386e-06,
"loss": 1.1633,
"step": 423
},
{
"epoch": 0.009889011011437084,
"grad_norm": 1.5537900924682617,
"learning_rate": 2.197460481990153e-06,
"loss": 1.291,
"step": 424
},
{
"epoch": 0.009912334150615001,
"grad_norm": 1.722598671913147,
"learning_rate": 2.2026431718061673e-06,
"loss": 1.5201,
"step": 425
},
{
"epoch": 0.00993565728979292,
"grad_norm": 1.546295166015625,
"learning_rate": 2.207825861622182e-06,
"loss": 1.3554,
"step": 426
},
{
"epoch": 0.009958980428970837,
"grad_norm": 1.4075593948364258,
"learning_rate": 2.2130085514381964e-06,
"loss": 1.3831,
"step": 427
},
{
"epoch": 0.009982303568148755,
"grad_norm": 1.441125512123108,
"learning_rate": 2.218191241254211e-06,
"loss": 1.4806,
"step": 428
},
{
"epoch": 0.010005626707326673,
"grad_norm": 1.4198213815689087,
"learning_rate": 2.2233739310702255e-06,
"loss": 1.6962,
"step": 429
},
{
"epoch": 0.010028949846504591,
"grad_norm": 1.1716971397399902,
"learning_rate": 2.2285566208862403e-06,
"loss": 1.0423,
"step": 430
},
{
"epoch": 0.010052272985682507,
"grad_norm": 1.1271895170211792,
"learning_rate": 2.2337393107022547e-06,
"loss": 1.4246,
"step": 431
},
{
"epoch": 0.010075596124860425,
"grad_norm": 1.2987208366394043,
"learning_rate": 2.238922000518269e-06,
"loss": 1.5946,
"step": 432
},
{
"epoch": 0.010098919264038343,
"grad_norm": 1.7283997535705566,
"learning_rate": 2.2441046903342838e-06,
"loss": 1.5761,
"step": 433
},
{
"epoch": 0.01012224240321626,
"grad_norm": 1.635098934173584,
"learning_rate": 2.249287380150298e-06,
"loss": 1.6912,
"step": 434
},
{
"epoch": 0.010145565542394179,
"grad_norm": 2.1896469593048096,
"learning_rate": 2.254470069966313e-06,
"loss": 1.2961,
"step": 435
},
{
"epoch": 0.010168888681572097,
"grad_norm": 1.1874053478240967,
"learning_rate": 2.2596527597823272e-06,
"loss": 1.4999,
"step": 436
},
{
"epoch": 0.010192211820750014,
"grad_norm": 1.2898855209350586,
"learning_rate": 2.264835449598342e-06,
"loss": 1.7152,
"step": 437
},
{
"epoch": 0.010215534959927932,
"grad_norm": 0.792107105255127,
"learning_rate": 2.270018139414356e-06,
"loss": 1.4129,
"step": 438
},
{
"epoch": 0.010238858099105849,
"grad_norm": 1.2092666625976562,
"learning_rate": 2.2752008292303707e-06,
"loss": 1.4687,
"step": 439
},
{
"epoch": 0.010262181238283766,
"grad_norm": 1.2261115312576294,
"learning_rate": 2.280383519046385e-06,
"loss": 1.5548,
"step": 440
},
{
"epoch": 0.010285504377461684,
"grad_norm": 2.0835094451904297,
"learning_rate": 2.2855662088624e-06,
"loss": 1.5925,
"step": 441
},
{
"epoch": 0.010308827516639602,
"grad_norm": 1.075907826423645,
"learning_rate": 2.290748898678414e-06,
"loss": 1.4967,
"step": 442
},
{
"epoch": 0.01033215065581752,
"grad_norm": 0.9633646011352539,
"learning_rate": 2.295931588494429e-06,
"loss": 1.6798,
"step": 443
},
{
"epoch": 0.010355473794995438,
"grad_norm": 1.6833699941635132,
"learning_rate": 2.3011142783104433e-06,
"loss": 1.3053,
"step": 444
},
{
"epoch": 0.010378796934173356,
"grad_norm": 1.1333974599838257,
"learning_rate": 2.3062969681264576e-06,
"loss": 1.3658,
"step": 445
},
{
"epoch": 0.010402120073351272,
"grad_norm": 1.3382309675216675,
"learning_rate": 2.3114796579424724e-06,
"loss": 1.6492,
"step": 446
},
{
"epoch": 0.01042544321252919,
"grad_norm": 0.7148923873901367,
"learning_rate": 2.3166623477584868e-06,
"loss": 1.6269,
"step": 447
},
{
"epoch": 0.010448766351707108,
"grad_norm": 1.084245204925537,
"learning_rate": 2.3218450375745015e-06,
"loss": 2.0708,
"step": 448
},
{
"epoch": 0.010472089490885026,
"grad_norm": 1.1463004350662231,
"learning_rate": 2.327027727390516e-06,
"loss": 2.0115,
"step": 449
},
{
"epoch": 0.010495412630062944,
"grad_norm": 1.5500133037567139,
"learning_rate": 2.3322104172065306e-06,
"loss": 1.5454,
"step": 450
},
{
"epoch": 0.010518735769240862,
"grad_norm": 1.2993839979171753,
"learning_rate": 2.337393107022545e-06,
"loss": 1.5475,
"step": 451
},
{
"epoch": 0.01054205890841878,
"grad_norm": 1.295839786529541,
"learning_rate": 2.3425757968385593e-06,
"loss": 1.2895,
"step": 452
},
{
"epoch": 0.010565382047596696,
"grad_norm": 1.045040488243103,
"learning_rate": 2.3477584866545737e-06,
"loss": 1.7306,
"step": 453
},
{
"epoch": 0.010588705186774613,
"grad_norm": 1.4592766761779785,
"learning_rate": 2.3529411764705885e-06,
"loss": 1.7795,
"step": 454
},
{
"epoch": 0.010612028325952531,
"grad_norm": 0.9432761073112488,
"learning_rate": 2.358123866286603e-06,
"loss": 1.6963,
"step": 455
},
{
"epoch": 0.01063535146513045,
"grad_norm": 1.3770086765289307,
"learning_rate": 2.3633065561026176e-06,
"loss": 1.2003,
"step": 456
},
{
"epoch": 0.010658674604308367,
"grad_norm": 1.1453793048858643,
"learning_rate": 2.368489245918632e-06,
"loss": 1.9012,
"step": 457
},
{
"epoch": 0.010681997743486285,
"grad_norm": 1.2836976051330566,
"learning_rate": 2.3736719357346467e-06,
"loss": 1.4324,
"step": 458
},
{
"epoch": 0.010705320882664203,
"grad_norm": 1.6498123407363892,
"learning_rate": 2.378854625550661e-06,
"loss": 1.6212,
"step": 459
},
{
"epoch": 0.010728644021842119,
"grad_norm": 1.3681795597076416,
"learning_rate": 2.3840373153666754e-06,
"loss": 1.6047,
"step": 460
},
{
"epoch": 0.010751967161020037,
"grad_norm": 1.4474722146987915,
"learning_rate": 2.38922000518269e-06,
"loss": 1.5279,
"step": 461
},
{
"epoch": 0.010775290300197955,
"grad_norm": 1.4832510948181152,
"learning_rate": 2.3944026949987045e-06,
"loss": 1.7073,
"step": 462
},
{
"epoch": 0.010798613439375873,
"grad_norm": 1.343935251235962,
"learning_rate": 2.3995853848147193e-06,
"loss": 1.4637,
"step": 463
},
{
"epoch": 0.01082193657855379,
"grad_norm": 1.8285539150238037,
"learning_rate": 2.4047680746307336e-06,
"loss": 1.3944,
"step": 464
},
{
"epoch": 0.010845259717731709,
"grad_norm": 1.4653230905532837,
"learning_rate": 2.4099507644467484e-06,
"loss": 1.8847,
"step": 465
},
{
"epoch": 0.010868582856909626,
"grad_norm": 1.4410351514816284,
"learning_rate": 2.4151334542627623e-06,
"loss": 1.7298,
"step": 466
},
{
"epoch": 0.010891905996087543,
"grad_norm": 1.3057256937026978,
"learning_rate": 2.420316144078777e-06,
"loss": 1.6188,
"step": 467
},
{
"epoch": 0.01091522913526546,
"grad_norm": 1.574479103088379,
"learning_rate": 2.4254988338947914e-06,
"loss": 1.585,
"step": 468
},
{
"epoch": 0.010938552274443378,
"grad_norm": 1.4391696453094482,
"learning_rate": 2.430681523710806e-06,
"loss": 1.7272,
"step": 469
},
{
"epoch": 0.010961875413621296,
"grad_norm": 2.304706335067749,
"learning_rate": 2.4358642135268205e-06,
"loss": 1.7127,
"step": 470
},
{
"epoch": 0.010985198552799214,
"grad_norm": 1.2380545139312744,
"learning_rate": 2.4410469033428353e-06,
"loss": 1.5428,
"step": 471
},
{
"epoch": 0.011008521691977132,
"grad_norm": 1.303446888923645,
"learning_rate": 2.4462295931588497e-06,
"loss": 1.609,
"step": 472
},
{
"epoch": 0.01103184483115505,
"grad_norm": 1.3888837099075317,
"learning_rate": 2.451412282974864e-06,
"loss": 1.7134,
"step": 473
},
{
"epoch": 0.011055167970332966,
"grad_norm": 0.9802701473236084,
"learning_rate": 2.4565949727908788e-06,
"loss": 1.4401,
"step": 474
},
{
"epoch": 0.011078491109510884,
"grad_norm": 1.5808403491973877,
"learning_rate": 2.461777662606893e-06,
"loss": 1.7415,
"step": 475
},
{
"epoch": 0.011101814248688802,
"grad_norm": 1.299912691116333,
"learning_rate": 2.466960352422908e-06,
"loss": 1.361,
"step": 476
},
{
"epoch": 0.01112513738786672,
"grad_norm": 0.9326110482215881,
"learning_rate": 2.4721430422389222e-06,
"loss": 1.222,
"step": 477
},
{
"epoch": 0.011148460527044638,
"grad_norm": 1.0385396480560303,
"learning_rate": 2.477325732054937e-06,
"loss": 1.4813,
"step": 478
},
{
"epoch": 0.011171783666222556,
"grad_norm": 1.1004397869110107,
"learning_rate": 2.482508421870951e-06,
"loss": 1.5064,
"step": 479
},
{
"epoch": 0.011195106805400474,
"grad_norm": 1.274898886680603,
"learning_rate": 2.4876911116869657e-06,
"loss": 1.3046,
"step": 480
},
{
"epoch": 0.01121842994457839,
"grad_norm": 1.0818660259246826,
"learning_rate": 2.49287380150298e-06,
"loss": 1.878,
"step": 481
},
{
"epoch": 0.011241753083756308,
"grad_norm": 1.2744652032852173,
"learning_rate": 2.498056491318995e-06,
"loss": 1.6394,
"step": 482
},
{
"epoch": 0.011265076222934226,
"grad_norm": 1.0467538833618164,
"learning_rate": 2.503239181135009e-06,
"loss": 1.8949,
"step": 483
},
{
"epoch": 0.011288399362112143,
"grad_norm": 1.2507177591323853,
"learning_rate": 2.508421870951024e-06,
"loss": 1.5386,
"step": 484
},
{
"epoch": 0.011311722501290061,
"grad_norm": 2.0707380771636963,
"learning_rate": 2.5136045607670383e-06,
"loss": 1.3359,
"step": 485
},
{
"epoch": 0.01133504564046798,
"grad_norm": 1.0060955286026,
"learning_rate": 2.518787250583053e-06,
"loss": 1.5551,
"step": 486
},
{
"epoch": 0.011358368779645897,
"grad_norm": 2.1019294261932373,
"learning_rate": 2.5239699403990674e-06,
"loss": 1.4009,
"step": 487
},
{
"epoch": 0.011381691918823813,
"grad_norm": 1.2085974216461182,
"learning_rate": 2.529152630215082e-06,
"loss": 1.1264,
"step": 488
},
{
"epoch": 0.011405015058001731,
"grad_norm": 1.2670215368270874,
"learning_rate": 2.5343353200310965e-06,
"loss": 1.4005,
"step": 489
},
{
"epoch": 0.011428338197179649,
"grad_norm": 0.976809024810791,
"learning_rate": 2.5395180098471104e-06,
"loss": 1.6539,
"step": 490
},
{
"epoch": 0.011451661336357567,
"grad_norm": 1.8012447357177734,
"learning_rate": 2.5447006996631252e-06,
"loss": 1.5083,
"step": 491
},
{
"epoch": 0.011474984475535485,
"grad_norm": 2.0657784938812256,
"learning_rate": 2.5498833894791396e-06,
"loss": 1.4127,
"step": 492
},
{
"epoch": 0.011498307614713403,
"grad_norm": 1.4070103168487549,
"learning_rate": 2.5550660792951543e-06,
"loss": 1.4707,
"step": 493
},
{
"epoch": 0.01152163075389132,
"grad_norm": 0.859045147895813,
"learning_rate": 2.5602487691111687e-06,
"loss": 1.6301,
"step": 494
},
{
"epoch": 0.011544953893069239,
"grad_norm": 1.5209952592849731,
"learning_rate": 2.5654314589271835e-06,
"loss": 1.8438,
"step": 495
},
{
"epoch": 0.011568277032247155,
"grad_norm": 1.1508231163024902,
"learning_rate": 2.570614148743198e-06,
"loss": 1.2495,
"step": 496
},
{
"epoch": 0.011591600171425073,
"grad_norm": 0.9130313396453857,
"learning_rate": 2.5757968385592126e-06,
"loss": 1.1848,
"step": 497
},
{
"epoch": 0.01161492331060299,
"grad_norm": 1.5925562381744385,
"learning_rate": 2.580979528375227e-06,
"loss": 1.4745,
"step": 498
},
{
"epoch": 0.011638246449780908,
"grad_norm": 2.5118539333343506,
"learning_rate": 2.5861622181912417e-06,
"loss": 1.6218,
"step": 499
},
{
"epoch": 0.011661569588958826,
"grad_norm": 1.272691249847412,
"learning_rate": 2.591344908007256e-06,
"loss": 1.2147,
"step": 500
},
{
"epoch": 0.011684892728136744,
"grad_norm": 1.1436160802841187,
"learning_rate": 2.596527597823271e-06,
"loss": 1.5556,
"step": 501
},
{
"epoch": 0.011708215867314662,
"grad_norm": 1.0195647478103638,
"learning_rate": 2.601710287639285e-06,
"loss": 1.3303,
"step": 502
},
{
"epoch": 0.011731539006492578,
"grad_norm": 1.4576568603515625,
"learning_rate": 2.6068929774553e-06,
"loss": 1.6531,
"step": 503
},
{
"epoch": 0.011754862145670496,
"grad_norm": 1.360716462135315,
"learning_rate": 2.612075667271314e-06,
"loss": 1.1761,
"step": 504
},
{
"epoch": 0.011778185284848414,
"grad_norm": 2.7770462036132812,
"learning_rate": 2.617258357087328e-06,
"loss": 1.247,
"step": 505
},
{
"epoch": 0.011801508424026332,
"grad_norm": 1.3706661462783813,
"learning_rate": 2.622441046903343e-06,
"loss": 1.5103,
"step": 506
},
{
"epoch": 0.01182483156320425,
"grad_norm": 1.5405017137527466,
"learning_rate": 2.6276237367193573e-06,
"loss": 1.6827,
"step": 507
},
{
"epoch": 0.011848154702382168,
"grad_norm": 1.1809494495391846,
"learning_rate": 2.632806426535372e-06,
"loss": 1.7162,
"step": 508
},
{
"epoch": 0.011871477841560086,
"grad_norm": 1.085557222366333,
"learning_rate": 2.6379891163513864e-06,
"loss": 1.514,
"step": 509
},
{
"epoch": 0.011894800980738002,
"grad_norm": 1.2155910730361938,
"learning_rate": 2.643171806167401e-06,
"loss": 1.4029,
"step": 510
},
{
"epoch": 0.01191812411991592,
"grad_norm": 1.240242600440979,
"learning_rate": 2.6483544959834155e-06,
"loss": 1.4336,
"step": 511
},
{
"epoch": 0.011941447259093838,
"grad_norm": 1.649802327156067,
"learning_rate": 2.6535371857994303e-06,
"loss": 1.9082,
"step": 512
},
{
"epoch": 0.011964770398271755,
"grad_norm": 1.3479831218719482,
"learning_rate": 2.6587198756154447e-06,
"loss": 1.5424,
"step": 513
},
{
"epoch": 0.011988093537449673,
"grad_norm": 1.2537102699279785,
"learning_rate": 2.6639025654314594e-06,
"loss": 1.6061,
"step": 514
},
{
"epoch": 0.012011416676627591,
"grad_norm": 1.1049939393997192,
"learning_rate": 2.6690852552474738e-06,
"loss": 1.8361,
"step": 515
},
{
"epoch": 0.012034739815805509,
"grad_norm": 2.9946062564849854,
"learning_rate": 2.6742679450634885e-06,
"loss": 1.4471,
"step": 516
},
{
"epoch": 0.012058062954983425,
"grad_norm": 0.9455610513687134,
"learning_rate": 2.6794506348795025e-06,
"loss": 1.6831,
"step": 517
},
{
"epoch": 0.012081386094161343,
"grad_norm": 1.4750438928604126,
"learning_rate": 2.684633324695517e-06,
"loss": 1.3143,
"step": 518
},
{
"epoch": 0.012104709233339261,
"grad_norm": 1.1056557893753052,
"learning_rate": 2.6898160145115316e-06,
"loss": 1.5054,
"step": 519
},
{
"epoch": 0.012128032372517179,
"grad_norm": 0.9718064069747925,
"learning_rate": 2.694998704327546e-06,
"loss": 1.3134,
"step": 520
},
{
"epoch": 0.012151355511695097,
"grad_norm": 2.2384724617004395,
"learning_rate": 2.7001813941435607e-06,
"loss": 1.4851,
"step": 521
},
{
"epoch": 0.012174678650873015,
"grad_norm": 1.2468239068984985,
"learning_rate": 2.705364083959575e-06,
"loss": 1.4873,
"step": 522
},
{
"epoch": 0.012198001790050933,
"grad_norm": 1.4248602390289307,
"learning_rate": 2.71054677377559e-06,
"loss": 1.7643,
"step": 523
},
{
"epoch": 0.012221324929228849,
"grad_norm": 1.3377385139465332,
"learning_rate": 2.715729463591604e-06,
"loss": 1.7064,
"step": 524
},
{
"epoch": 0.012244648068406767,
"grad_norm": 0.9933966994285583,
"learning_rate": 2.720912153407619e-06,
"loss": 1.7187,
"step": 525
},
{
"epoch": 0.012267971207584685,
"grad_norm": 1.018750548362732,
"learning_rate": 2.7260948432236333e-06,
"loss": 1.5915,
"step": 526
},
{
"epoch": 0.012291294346762602,
"grad_norm": 1.356325387954712,
"learning_rate": 2.731277533039648e-06,
"loss": 1.7193,
"step": 527
},
{
"epoch": 0.01231461748594052,
"grad_norm": 1.2781217098236084,
"learning_rate": 2.7364602228556624e-06,
"loss": 1.5494,
"step": 528
},
{
"epoch": 0.012337940625118438,
"grad_norm": 1.561498761177063,
"learning_rate": 2.741642912671677e-06,
"loss": 1.6972,
"step": 529
},
{
"epoch": 0.012361263764296356,
"grad_norm": 1.1695748567581177,
"learning_rate": 2.7468256024876915e-06,
"loss": 2.1633,
"step": 530
},
{
"epoch": 0.012384586903474272,
"grad_norm": 1.4304964542388916,
"learning_rate": 2.7520082923037054e-06,
"loss": 1.6321,
"step": 531
},
{
"epoch": 0.01240791004265219,
"grad_norm": 1.0513828992843628,
"learning_rate": 2.7571909821197202e-06,
"loss": 1.2897,
"step": 532
},
{
"epoch": 0.012431233181830108,
"grad_norm": 1.0206960439682007,
"learning_rate": 2.7623736719357346e-06,
"loss": 1.7842,
"step": 533
},
{
"epoch": 0.012454556321008026,
"grad_norm": 1.1440876722335815,
"learning_rate": 2.7675563617517493e-06,
"loss": 1.4399,
"step": 534
},
{
"epoch": 0.012477879460185944,
"grad_norm": 1.0837441682815552,
"learning_rate": 2.7727390515677637e-06,
"loss": 1.5155,
"step": 535
},
{
"epoch": 0.012501202599363862,
"grad_norm": 1.071378231048584,
"learning_rate": 2.7779217413837785e-06,
"loss": 1.6459,
"step": 536
},
{
"epoch": 0.01252452573854178,
"grad_norm": 1.6966552734375,
"learning_rate": 2.783104431199793e-06,
"loss": 1.6015,
"step": 537
},
{
"epoch": 0.012547848877719696,
"grad_norm": 1.2789183855056763,
"learning_rate": 2.7882871210158076e-06,
"loss": 1.2423,
"step": 538
},
{
"epoch": 0.012571172016897614,
"grad_norm": 1.2072651386260986,
"learning_rate": 2.793469810831822e-06,
"loss": 1.69,
"step": 539
},
{
"epoch": 0.012594495156075532,
"grad_norm": 1.5257117748260498,
"learning_rate": 2.7986525006478367e-06,
"loss": 1.7608,
"step": 540
},
{
"epoch": 0.01261781829525345,
"grad_norm": 1.0233759880065918,
"learning_rate": 2.803835190463851e-06,
"loss": 1.1299,
"step": 541
},
{
"epoch": 0.012641141434431367,
"grad_norm": 1.8280616998672485,
"learning_rate": 2.809017880279866e-06,
"loss": 1.3338,
"step": 542
},
{
"epoch": 0.012664464573609285,
"grad_norm": 1.6891363859176636,
"learning_rate": 2.81420057009588e-06,
"loss": 1.5505,
"step": 543
},
{
"epoch": 0.012687787712787203,
"grad_norm": 1.1501421928405762,
"learning_rate": 2.819383259911895e-06,
"loss": 1.6788,
"step": 544
},
{
"epoch": 0.01271111085196512,
"grad_norm": 1.107029914855957,
"learning_rate": 2.824565949727909e-06,
"loss": 1.3782,
"step": 545
},
{
"epoch": 0.012734433991143037,
"grad_norm": 0.9627429246902466,
"learning_rate": 2.829748639543923e-06,
"loss": 1.3155,
"step": 546
},
{
"epoch": 0.012757757130320955,
"grad_norm": 2.330007791519165,
"learning_rate": 2.834931329359938e-06,
"loss": 1.425,
"step": 547
},
{
"epoch": 0.012781080269498873,
"grad_norm": 1.4026503562927246,
"learning_rate": 2.8401140191759523e-06,
"loss": 1.5578,
"step": 548
},
{
"epoch": 0.012804403408676791,
"grad_norm": 0.9430487155914307,
"learning_rate": 2.845296708991967e-06,
"loss": 1.6075,
"step": 549
},
{
"epoch": 0.012827726547854709,
"grad_norm": 1.0779294967651367,
"learning_rate": 2.8504793988079814e-06,
"loss": 1.5169,
"step": 550
},
{
"epoch": 0.012851049687032627,
"grad_norm": 1.130324125289917,
"learning_rate": 2.855662088623996e-06,
"loss": 1.5016,
"step": 551
},
{
"epoch": 0.012874372826210545,
"grad_norm": 1.0127092599868774,
"learning_rate": 2.8608447784400105e-06,
"loss": 1.8715,
"step": 552
},
{
"epoch": 0.01289769596538846,
"grad_norm": 1.1831302642822266,
"learning_rate": 2.8660274682560253e-06,
"loss": 1.678,
"step": 553
},
{
"epoch": 0.012921019104566379,
"grad_norm": 1.3394455909729004,
"learning_rate": 2.8712101580720397e-06,
"loss": 1.4129,
"step": 554
},
{
"epoch": 0.012944342243744297,
"grad_norm": 1.2189030647277832,
"learning_rate": 2.8763928478880544e-06,
"loss": 1.7364,
"step": 555
},
{
"epoch": 0.012967665382922215,
"grad_norm": 1.2808138132095337,
"learning_rate": 2.8815755377040688e-06,
"loss": 1.6274,
"step": 556
},
{
"epoch": 0.012990988522100132,
"grad_norm": 1.0384689569473267,
"learning_rate": 2.8867582275200835e-06,
"loss": 1.5942,
"step": 557
},
{
"epoch": 0.01301431166127805,
"grad_norm": 1.8520807027816772,
"learning_rate": 2.891940917336098e-06,
"loss": 1.3067,
"step": 558
},
{
"epoch": 0.013037634800455968,
"grad_norm": 1.1817374229431152,
"learning_rate": 2.897123607152112e-06,
"loss": 1.6405,
"step": 559
},
{
"epoch": 0.013060957939633884,
"grad_norm": 1.1010823249816895,
"learning_rate": 2.9023062969681266e-06,
"loss": 1.4339,
"step": 560
},
{
"epoch": 0.013084281078811802,
"grad_norm": 1.2461942434310913,
"learning_rate": 2.907488986784141e-06,
"loss": 1.9866,
"step": 561
},
{
"epoch": 0.01310760421798972,
"grad_norm": 1.1503125429153442,
"learning_rate": 2.9126716766001557e-06,
"loss": 1.585,
"step": 562
},
{
"epoch": 0.013130927357167638,
"grad_norm": 1.542434573173523,
"learning_rate": 2.91785436641617e-06,
"loss": 1.4524,
"step": 563
},
{
"epoch": 0.013154250496345556,
"grad_norm": 1.0469673871994019,
"learning_rate": 2.923037056232185e-06,
"loss": 1.6884,
"step": 564
},
{
"epoch": 0.013177573635523474,
"grad_norm": 1.5137437582015991,
"learning_rate": 2.928219746048199e-06,
"loss": 1.5377,
"step": 565
},
{
"epoch": 0.013200896774701392,
"grad_norm": 1.1454534530639648,
"learning_rate": 2.933402435864214e-06,
"loss": 1.8508,
"step": 566
},
{
"epoch": 0.013224219913879308,
"grad_norm": 1.310381531715393,
"learning_rate": 2.9385851256802283e-06,
"loss": 1.5774,
"step": 567
},
{
"epoch": 0.013247543053057226,
"grad_norm": 1.1223838329315186,
"learning_rate": 2.943767815496243e-06,
"loss": 1.4496,
"step": 568
},
{
"epoch": 0.013270866192235144,
"grad_norm": 1.4537910223007202,
"learning_rate": 2.9489505053122574e-06,
"loss": 1.4423,
"step": 569
},
{
"epoch": 0.013294189331413062,
"grad_norm": 1.1783167123794556,
"learning_rate": 2.954133195128272e-06,
"loss": 1.9314,
"step": 570
},
{
"epoch": 0.01331751247059098,
"grad_norm": 1.211719274520874,
"learning_rate": 2.9593158849442865e-06,
"loss": 1.5366,
"step": 571
},
{
"epoch": 0.013340835609768897,
"grad_norm": 2.9552671909332275,
"learning_rate": 2.9644985747603004e-06,
"loss": 1.3431,
"step": 572
},
{
"epoch": 0.013364158748946815,
"grad_norm": 1.2814795970916748,
"learning_rate": 2.9696812645763152e-06,
"loss": 1.3879,
"step": 573
},
{
"epoch": 0.013387481888124731,
"grad_norm": 1.2598010301589966,
"learning_rate": 2.9748639543923296e-06,
"loss": 1.4775,
"step": 574
},
{
"epoch": 0.01341080502730265,
"grad_norm": 1.3874925374984741,
"learning_rate": 2.9800466442083443e-06,
"loss": 1.4012,
"step": 575
},
{
"epoch": 0.013434128166480567,
"grad_norm": 1.1846306324005127,
"learning_rate": 2.9852293340243587e-06,
"loss": 1.4491,
"step": 576
},
{
"epoch": 0.013457451305658485,
"grad_norm": 1.388150691986084,
"learning_rate": 2.9904120238403734e-06,
"loss": 1.6913,
"step": 577
},
{
"epoch": 0.013480774444836403,
"grad_norm": 1.8026880025863647,
"learning_rate": 2.995594713656388e-06,
"loss": 1.1754,
"step": 578
},
{
"epoch": 0.013504097584014321,
"grad_norm": 1.9366620779037476,
"learning_rate": 3.0007774034724026e-06,
"loss": 1.4406,
"step": 579
},
{
"epoch": 0.013527420723192239,
"grad_norm": 1.039657473564148,
"learning_rate": 3.005960093288417e-06,
"loss": 1.4823,
"step": 580
},
{
"epoch": 0.013550743862370155,
"grad_norm": 1.0928449630737305,
"learning_rate": 3.0111427831044317e-06,
"loss": 1.4502,
"step": 581
},
{
"epoch": 0.013574067001548073,
"grad_norm": 2.408292531967163,
"learning_rate": 3.016325472920446e-06,
"loss": 1.4778,
"step": 582
},
{
"epoch": 0.01359739014072599,
"grad_norm": 1.2284953594207764,
"learning_rate": 3.021508162736461e-06,
"loss": 1.5887,
"step": 583
},
{
"epoch": 0.013620713279903909,
"grad_norm": 1.3841763734817505,
"learning_rate": 3.026690852552475e-06,
"loss": 1.3778,
"step": 584
},
{
"epoch": 0.013644036419081827,
"grad_norm": 1.305172324180603,
"learning_rate": 3.03187354236849e-06,
"loss": 1.2837,
"step": 585
},
{
"epoch": 0.013667359558259744,
"grad_norm": 1.087904691696167,
"learning_rate": 3.037056232184504e-06,
"loss": 1.4361,
"step": 586
},
{
"epoch": 0.013690682697437662,
"grad_norm": 1.1818716526031494,
"learning_rate": 3.042238922000518e-06,
"loss": 1.4903,
"step": 587
},
{
"epoch": 0.013714005836615578,
"grad_norm": 0.9969412088394165,
"learning_rate": 3.047421611816533e-06,
"loss": 1.6923,
"step": 588
},
{
"epoch": 0.013737328975793496,
"grad_norm": 1.3729232549667358,
"learning_rate": 3.0526043016325473e-06,
"loss": 1.4219,
"step": 589
},
{
"epoch": 0.013760652114971414,
"grad_norm": 1.091769814491272,
"learning_rate": 3.057786991448562e-06,
"loss": 1.6978,
"step": 590
},
{
"epoch": 0.013783975254149332,
"grad_norm": 1.1668254137039185,
"learning_rate": 3.0629696812645764e-06,
"loss": 1.4609,
"step": 591
},
{
"epoch": 0.01380729839332725,
"grad_norm": 1.3739502429962158,
"learning_rate": 3.068152371080591e-06,
"loss": 1.7247,
"step": 592
},
{
"epoch": 0.013830621532505168,
"grad_norm": 1.480758547782898,
"learning_rate": 3.0733350608966055e-06,
"loss": 1.6142,
"step": 593
},
{
"epoch": 0.013853944671683086,
"grad_norm": 0.853581964969635,
"learning_rate": 3.0785177507126203e-06,
"loss": 1.5563,
"step": 594
},
{
"epoch": 0.013877267810861002,
"grad_norm": 1.144692063331604,
"learning_rate": 3.0837004405286347e-06,
"loss": 1.6145,
"step": 595
},
{
"epoch": 0.01390059095003892,
"grad_norm": 1.2413440942764282,
"learning_rate": 3.0888831303446494e-06,
"loss": 1.5762,
"step": 596
},
{
"epoch": 0.013923914089216838,
"grad_norm": 1.147834062576294,
"learning_rate": 3.0940658201606638e-06,
"loss": 1.4478,
"step": 597
},
{
"epoch": 0.013947237228394756,
"grad_norm": 1.0349398851394653,
"learning_rate": 3.0992485099766785e-06,
"loss": 1.612,
"step": 598
},
{
"epoch": 0.013970560367572674,
"grad_norm": 1.4780391454696655,
"learning_rate": 3.104431199792693e-06,
"loss": 1.5179,
"step": 599
},
{
"epoch": 0.013993883506750592,
"grad_norm": 1.1395933628082275,
"learning_rate": 3.109613889608707e-06,
"loss": 1.4845,
"step": 600
},
{
"epoch": 0.01401720664592851,
"grad_norm": 1.37168550491333,
"learning_rate": 3.1147965794247216e-06,
"loss": 1.581,
"step": 601
},
{
"epoch": 0.014040529785106426,
"grad_norm": 1.8260347843170166,
"learning_rate": 3.119979269240736e-06,
"loss": 1.1221,
"step": 602
},
{
"epoch": 0.014063852924284343,
"grad_norm": 2.5528669357299805,
"learning_rate": 3.1251619590567507e-06,
"loss": 1.255,
"step": 603
},
{
"epoch": 0.014087176063462261,
"grad_norm": 1.3272032737731934,
"learning_rate": 3.130344648872765e-06,
"loss": 1.2713,
"step": 604
},
{
"epoch": 0.01411049920264018,
"grad_norm": 1.147449254989624,
"learning_rate": 3.13552733868878e-06,
"loss": 1.3694,
"step": 605
},
{
"epoch": 0.014133822341818097,
"grad_norm": 1.173793077468872,
"learning_rate": 3.140710028504794e-06,
"loss": 1.5818,
"step": 606
},
{
"epoch": 0.014157145480996015,
"grad_norm": 1.2347713708877563,
"learning_rate": 3.145892718320809e-06,
"loss": 1.501,
"step": 607
},
{
"epoch": 0.014180468620173933,
"grad_norm": 1.3945446014404297,
"learning_rate": 3.1510754081368233e-06,
"loss": 1.8674,
"step": 608
},
{
"epoch": 0.01420379175935185,
"grad_norm": 1.239762544631958,
"learning_rate": 3.156258097952838e-06,
"loss": 1.2516,
"step": 609
},
{
"epoch": 0.014227114898529767,
"grad_norm": 1.552531361579895,
"learning_rate": 3.1614407877688524e-06,
"loss": 1.5358,
"step": 610
},
{
"epoch": 0.014250438037707685,
"grad_norm": 1.576997995376587,
"learning_rate": 3.166623477584867e-06,
"loss": 1.7601,
"step": 611
},
{
"epoch": 0.014273761176885603,
"grad_norm": 1.3251402378082275,
"learning_rate": 3.1718061674008815e-06,
"loss": 1.2758,
"step": 612
},
{
"epoch": 0.01429708431606352,
"grad_norm": 1.2837574481964111,
"learning_rate": 3.1769888572168963e-06,
"loss": 1.528,
"step": 613
},
{
"epoch": 0.014320407455241439,
"grad_norm": 0.9697505831718445,
"learning_rate": 3.1821715470329102e-06,
"loss": 1.6359,
"step": 614
},
{
"epoch": 0.014343730594419356,
"grad_norm": 1.2682685852050781,
"learning_rate": 3.1873542368489246e-06,
"loss": 1.4759,
"step": 615
},
{
"epoch": 0.014367053733597274,
"grad_norm": 0.9607746005058289,
"learning_rate": 3.1925369266649393e-06,
"loss": 1.7474,
"step": 616
},
{
"epoch": 0.01439037687277519,
"grad_norm": 1.056736946105957,
"learning_rate": 3.1977196164809537e-06,
"loss": 1.8812,
"step": 617
},
{
"epoch": 0.014413700011953108,
"grad_norm": 1.1990852355957031,
"learning_rate": 3.2029023062969684e-06,
"loss": 1.6217,
"step": 618
},
{
"epoch": 0.014437023151131026,
"grad_norm": 1.1339764595031738,
"learning_rate": 3.208084996112983e-06,
"loss": 1.3557,
"step": 619
},
{
"epoch": 0.014460346290308944,
"grad_norm": 1.0672523975372314,
"learning_rate": 3.2132676859289976e-06,
"loss": 1.8239,
"step": 620
},
{
"epoch": 0.014483669429486862,
"grad_norm": 1.4371954202651978,
"learning_rate": 3.218450375745012e-06,
"loss": 1.4571,
"step": 621
},
{
"epoch": 0.01450699256866478,
"grad_norm": 1.9893105030059814,
"learning_rate": 3.2236330655610267e-06,
"loss": 1.3716,
"step": 622
},
{
"epoch": 0.014530315707842698,
"grad_norm": 1.7084318399429321,
"learning_rate": 3.228815755377041e-06,
"loss": 1.5201,
"step": 623
},
{
"epoch": 0.014553638847020614,
"grad_norm": 1.308225154876709,
"learning_rate": 3.233998445193056e-06,
"loss": 1.9173,
"step": 624
},
{
"epoch": 0.014576961986198532,
"grad_norm": 0.9914215803146362,
"learning_rate": 3.23918113500907e-06,
"loss": 1.7351,
"step": 625
},
{
"epoch": 0.01460028512537645,
"grad_norm": 1.0292766094207764,
"learning_rate": 3.244363824825085e-06,
"loss": 1.4073,
"step": 626
},
{
"epoch": 0.014623608264554368,
"grad_norm": 1.0998982191085815,
"learning_rate": 3.2495465146410993e-06,
"loss": 1.5979,
"step": 627
},
{
"epoch": 0.014646931403732286,
"grad_norm": 1.1409685611724854,
"learning_rate": 3.254729204457113e-06,
"loss": 1.3442,
"step": 628
},
{
"epoch": 0.014670254542910204,
"grad_norm": 1.7685736417770386,
"learning_rate": 3.259911894273128e-06,
"loss": 1.251,
"step": 629
},
{
"epoch": 0.014693577682088121,
"grad_norm": 1.6536918878555298,
"learning_rate": 3.2650945840891423e-06,
"loss": 1.4698,
"step": 630
},
{
"epoch": 0.014716900821266038,
"grad_norm": 2.046391248703003,
"learning_rate": 3.270277273905157e-06,
"loss": 1.5142,
"step": 631
},
{
"epoch": 0.014740223960443955,
"grad_norm": 1.3458948135375977,
"learning_rate": 3.2754599637211714e-06,
"loss": 1.3999,
"step": 632
},
{
"epoch": 0.014763547099621873,
"grad_norm": 1.7265046834945679,
"learning_rate": 3.280642653537186e-06,
"loss": 1.2212,
"step": 633
},
{
"epoch": 0.014786870238799791,
"grad_norm": 1.3191124200820923,
"learning_rate": 3.2858253433532005e-06,
"loss": 1.4354,
"step": 634
},
{
"epoch": 0.01481019337797771,
"grad_norm": 1.2317379713058472,
"learning_rate": 3.2910080331692153e-06,
"loss": 1.5661,
"step": 635
},
{
"epoch": 0.014833516517155627,
"grad_norm": 1.400969386100769,
"learning_rate": 3.2961907229852297e-06,
"loss": 1.462,
"step": 636
},
{
"epoch": 0.014856839656333545,
"grad_norm": 2.060718059539795,
"learning_rate": 3.3013734128012444e-06,
"loss": 1.7522,
"step": 637
},
{
"epoch": 0.014880162795511461,
"grad_norm": 1.138715386390686,
"learning_rate": 3.3065561026172588e-06,
"loss": 1.4923,
"step": 638
},
{
"epoch": 0.014903485934689379,
"grad_norm": 1.1973599195480347,
"learning_rate": 3.3117387924332735e-06,
"loss": 1.4462,
"step": 639
},
{
"epoch": 0.014926809073867297,
"grad_norm": 1.266867756843567,
"learning_rate": 3.316921482249288e-06,
"loss": 1.3159,
"step": 640
},
{
"epoch": 0.014950132213045215,
"grad_norm": 3.4681708812713623,
"learning_rate": 3.322104172065302e-06,
"loss": 1.3566,
"step": 641
},
{
"epoch": 0.014973455352223133,
"grad_norm": 1.248502492904663,
"learning_rate": 3.3272868618813166e-06,
"loss": 1.6299,
"step": 642
},
{
"epoch": 0.01499677849140105,
"grad_norm": 1.561563491821289,
"learning_rate": 3.332469551697331e-06,
"loss": 1.3246,
"step": 643
},
{
"epoch": 0.015020101630578968,
"grad_norm": 1.1922053098678589,
"learning_rate": 3.3376522415133457e-06,
"loss": 1.6847,
"step": 644
},
{
"epoch": 0.015043424769756885,
"grad_norm": 1.0779014825820923,
"learning_rate": 3.34283493132936e-06,
"loss": 1.8025,
"step": 645
},
{
"epoch": 0.015066747908934803,
"grad_norm": 1.5236597061157227,
"learning_rate": 3.348017621145375e-06,
"loss": 1.3894,
"step": 646
},
{
"epoch": 0.01509007104811272,
"grad_norm": 1.2087934017181396,
"learning_rate": 3.353200310961389e-06,
"loss": 1.9119,
"step": 647
},
{
"epoch": 0.015113394187290638,
"grad_norm": 1.435085654258728,
"learning_rate": 3.358383000777404e-06,
"loss": 1.4334,
"step": 648
},
{
"epoch": 0.015136717326468556,
"grad_norm": 1.3662467002868652,
"learning_rate": 3.3635656905934183e-06,
"loss": 1.6717,
"step": 649
},
{
"epoch": 0.015160040465646474,
"grad_norm": 1.379262924194336,
"learning_rate": 3.368748380409433e-06,
"loss": 1.0914,
"step": 650
},
{
"epoch": 0.015183363604824392,
"grad_norm": 1.436503529548645,
"learning_rate": 3.3739310702254474e-06,
"loss": 1.296,
"step": 651
},
{
"epoch": 0.015206686744002308,
"grad_norm": 1.0189919471740723,
"learning_rate": 3.379113760041462e-06,
"loss": 1.5578,
"step": 652
},
{
"epoch": 0.015230009883180226,
"grad_norm": 1.3371915817260742,
"learning_rate": 3.3842964498574765e-06,
"loss": 1.3883,
"step": 653
},
{
"epoch": 0.015253333022358144,
"grad_norm": 1.152949333190918,
"learning_rate": 3.389479139673491e-06,
"loss": 1.3408,
"step": 654
},
{
"epoch": 0.015276656161536062,
"grad_norm": 0.865856945514679,
"learning_rate": 3.3946618294895052e-06,
"loss": 1.8154,
"step": 655
},
{
"epoch": 0.01529997930071398,
"grad_norm": 1.3607538938522339,
"learning_rate": 3.3998445193055196e-06,
"loss": 1.5139,
"step": 656
},
{
"epoch": 0.015323302439891898,
"grad_norm": 1.0469399690628052,
"learning_rate": 3.4050272091215343e-06,
"loss": 1.4246,
"step": 657
},
{
"epoch": 0.015346625579069816,
"grad_norm": 1.2417982816696167,
"learning_rate": 3.4102098989375487e-06,
"loss": 1.4392,
"step": 658
},
{
"epoch": 0.015369948718247732,
"grad_norm": 2.018418073654175,
"learning_rate": 3.4153925887535634e-06,
"loss": 1.5175,
"step": 659
},
{
"epoch": 0.01539327185742565,
"grad_norm": 1.2593055963516235,
"learning_rate": 3.420575278569578e-06,
"loss": 1.6338,
"step": 660
},
{
"epoch": 0.015416594996603568,
"grad_norm": 1.0297298431396484,
"learning_rate": 3.4257579683855926e-06,
"loss": 1.6309,
"step": 661
},
{
"epoch": 0.015439918135781485,
"grad_norm": 1.2963732481002808,
"learning_rate": 3.430940658201607e-06,
"loss": 1.3099,
"step": 662
},
{
"epoch": 0.015463241274959403,
"grad_norm": 1.0868266820907593,
"learning_rate": 3.4361233480176217e-06,
"loss": 1.4949,
"step": 663
},
{
"epoch": 0.015486564414137321,
"grad_norm": 1.156296968460083,
"learning_rate": 3.441306037833636e-06,
"loss": 1.7845,
"step": 664
},
{
"epoch": 0.015509887553315239,
"grad_norm": 1.412965178489685,
"learning_rate": 3.446488727649651e-06,
"loss": 1.19,
"step": 665
},
{
"epoch": 0.015533210692493155,
"grad_norm": 1.0419931411743164,
"learning_rate": 3.451671417465665e-06,
"loss": 1.7125,
"step": 666
},
{
"epoch": 0.015556533831671073,
"grad_norm": 1.035372018814087,
"learning_rate": 3.4568541072816795e-06,
"loss": 1.7003,
"step": 667
},
{
"epoch": 0.015579856970848991,
"grad_norm": 1.1559805870056152,
"learning_rate": 3.4620367970976943e-06,
"loss": 1.981,
"step": 668
},
{
"epoch": 0.015603180110026909,
"grad_norm": 0.8634515404701233,
"learning_rate": 3.467219486913708e-06,
"loss": 1.2609,
"step": 669
},
{
"epoch": 0.015626503249204827,
"grad_norm": 1.1953692436218262,
"learning_rate": 3.472402176729723e-06,
"loss": 1.3956,
"step": 670
},
{
"epoch": 0.015649826388382745,
"grad_norm": 0.9668301939964294,
"learning_rate": 3.4775848665457373e-06,
"loss": 1.0568,
"step": 671
},
{
"epoch": 0.015673149527560663,
"grad_norm": 2.4868035316467285,
"learning_rate": 3.482767556361752e-06,
"loss": 1.364,
"step": 672
},
{
"epoch": 0.01569647266673858,
"grad_norm": 1.4255839586257935,
"learning_rate": 3.4879502461777664e-06,
"loss": 1.5207,
"step": 673
},
{
"epoch": 0.0157197958059165,
"grad_norm": 1.2752389907836914,
"learning_rate": 3.493132935993781e-06,
"loss": 1.5141,
"step": 674
},
{
"epoch": 0.015743118945094416,
"grad_norm": 1.2186245918273926,
"learning_rate": 3.4983156258097955e-06,
"loss": 1.3655,
"step": 675
},
{
"epoch": 0.015766442084272334,
"grad_norm": 1.3544304370880127,
"learning_rate": 3.5034983156258103e-06,
"loss": 1.7428,
"step": 676
},
{
"epoch": 0.01578976522345025,
"grad_norm": 1.0968130826950073,
"learning_rate": 3.5086810054418247e-06,
"loss": 1.3491,
"step": 677
},
{
"epoch": 0.015813088362628167,
"grad_norm": 1.1593806743621826,
"learning_rate": 3.513863695257839e-06,
"loss": 1.6708,
"step": 678
},
{
"epoch": 0.015836411501806084,
"grad_norm": 1.0408954620361328,
"learning_rate": 3.5190463850738538e-06,
"loss": 1.6977,
"step": 679
},
{
"epoch": 0.015859734640984002,
"grad_norm": 1.196632742881775,
"learning_rate": 3.524229074889868e-06,
"loss": 1.2019,
"step": 680
},
{
"epoch": 0.01588305778016192,
"grad_norm": 1.2698166370391846,
"learning_rate": 3.529411764705883e-06,
"loss": 1.8457,
"step": 681
},
{
"epoch": 0.015906380919339838,
"grad_norm": 0.9075011014938354,
"learning_rate": 3.5345944545218972e-06,
"loss": 1.2717,
"step": 682
},
{
"epoch": 0.015929704058517756,
"grad_norm": 1.0426501035690308,
"learning_rate": 3.5397771443379116e-06,
"loss": 1.6601,
"step": 683
},
{
"epoch": 0.015953027197695674,
"grad_norm": 1.4904205799102783,
"learning_rate": 3.544959834153926e-06,
"loss": 1.6324,
"step": 684
},
{
"epoch": 0.015976350336873592,
"grad_norm": 1.0664643049240112,
"learning_rate": 3.5501425239699407e-06,
"loss": 1.4896,
"step": 685
},
{
"epoch": 0.01599967347605151,
"grad_norm": 1.3758978843688965,
"learning_rate": 3.555325213785955e-06,
"loss": 1.5457,
"step": 686
},
{
"epoch": 0.016022996615229428,
"grad_norm": 1.4759879112243652,
"learning_rate": 3.56050790360197e-06,
"loss": 1.3865,
"step": 687
},
{
"epoch": 0.016046319754407345,
"grad_norm": 1.4678733348846436,
"learning_rate": 3.565690593417984e-06,
"loss": 1.223,
"step": 688
},
{
"epoch": 0.016069642893585263,
"grad_norm": 1.2057251930236816,
"learning_rate": 3.570873283233999e-06,
"loss": 1.4864,
"step": 689
},
{
"epoch": 0.01609296603276318,
"grad_norm": 1.3976320028305054,
"learning_rate": 3.5760559730500133e-06,
"loss": 1.3371,
"step": 690
},
{
"epoch": 0.016116289171941096,
"grad_norm": 1.0588197708129883,
"learning_rate": 3.5812386628660276e-06,
"loss": 1.264,
"step": 691
},
{
"epoch": 0.016139612311119014,
"grad_norm": 0.891678512096405,
"learning_rate": 3.5864213526820424e-06,
"loss": 1.6566,
"step": 692
},
{
"epoch": 0.01616293545029693,
"grad_norm": 1.1149228811264038,
"learning_rate": 3.5916040424980567e-06,
"loss": 1.6862,
"step": 693
},
{
"epoch": 0.01618625858947485,
"grad_norm": 1.463218331336975,
"learning_rate": 3.5967867323140715e-06,
"loss": 1.5771,
"step": 694
},
{
"epoch": 0.016209581728652767,
"grad_norm": 1.291648030281067,
"learning_rate": 3.601969422130086e-06,
"loss": 1.443,
"step": 695
},
{
"epoch": 0.016232904867830685,
"grad_norm": 1.1534149646759033,
"learning_rate": 3.6071521119461002e-06,
"loss": 1.76,
"step": 696
},
{
"epoch": 0.016256228007008603,
"grad_norm": 1.3349847793579102,
"learning_rate": 3.6123348017621146e-06,
"loss": 2.0584,
"step": 697
},
{
"epoch": 0.01627955114618652,
"grad_norm": 1.665682315826416,
"learning_rate": 3.6175174915781293e-06,
"loss": 1.5989,
"step": 698
},
{
"epoch": 0.01630287428536444,
"grad_norm": 1.6486263275146484,
"learning_rate": 3.6227001813941437e-06,
"loss": 1.7698,
"step": 699
},
{
"epoch": 0.016326197424542357,
"grad_norm": 1.5153722763061523,
"learning_rate": 3.6278828712101584e-06,
"loss": 1.3312,
"step": 700
},
{
"epoch": 0.016349520563720275,
"grad_norm": 1.3090248107910156,
"learning_rate": 3.633065561026173e-06,
"loss": 1.0735,
"step": 701
},
{
"epoch": 0.016372843702898193,
"grad_norm": 1.5462753772735596,
"learning_rate": 3.6382482508421876e-06,
"loss": 1.5408,
"step": 702
},
{
"epoch": 0.01639616684207611,
"grad_norm": 1.3447730541229248,
"learning_rate": 3.643430940658202e-06,
"loss": 1.5295,
"step": 703
},
{
"epoch": 0.01641948998125403,
"grad_norm": 1.232865571975708,
"learning_rate": 3.6486136304742163e-06,
"loss": 1.8686,
"step": 704
},
{
"epoch": 0.016442813120431946,
"grad_norm": 0.9742329120635986,
"learning_rate": 3.653796320290231e-06,
"loss": 1.5951,
"step": 705
},
{
"epoch": 0.01646613625960986,
"grad_norm": 1.1572047472000122,
"learning_rate": 3.6589790101062454e-06,
"loss": 1.5068,
"step": 706
},
{
"epoch": 0.01648945939878778,
"grad_norm": 1.2024304866790771,
"learning_rate": 3.66416169992226e-06,
"loss": 1.3933,
"step": 707
},
{
"epoch": 0.016512782537965696,
"grad_norm": 2.442342758178711,
"learning_rate": 3.6693443897382745e-06,
"loss": 1.0126,
"step": 708
},
{
"epoch": 0.016536105677143614,
"grad_norm": 1.2786589860916138,
"learning_rate": 3.6745270795542893e-06,
"loss": 1.6902,
"step": 709
},
{
"epoch": 0.016559428816321532,
"grad_norm": 0.9200882315635681,
"learning_rate": 3.679709769370303e-06,
"loss": 1.3918,
"step": 710
},
{
"epoch": 0.01658275195549945,
"grad_norm": 1.3768819570541382,
"learning_rate": 3.684892459186318e-06,
"loss": 1.6518,
"step": 711
},
{
"epoch": 0.016606075094677368,
"grad_norm": 1.274484395980835,
"learning_rate": 3.6900751490023323e-06,
"loss": 1.3728,
"step": 712
},
{
"epoch": 0.016629398233855286,
"grad_norm": 1.1752501726150513,
"learning_rate": 3.695257838818347e-06,
"loss": 1.4234,
"step": 713
},
{
"epoch": 0.016652721373033204,
"grad_norm": 1.4458903074264526,
"learning_rate": 3.7004405286343614e-06,
"loss": 1.5695,
"step": 714
},
{
"epoch": 0.01667604451221112,
"grad_norm": 1.2630547285079956,
"learning_rate": 3.705623218450376e-06,
"loss": 1.5334,
"step": 715
},
{
"epoch": 0.01669936765138904,
"grad_norm": 1.3754082918167114,
"learning_rate": 3.7108059082663905e-06,
"loss": 1.4807,
"step": 716
},
{
"epoch": 0.016722690790566958,
"grad_norm": 1.4704689979553223,
"learning_rate": 3.715988598082405e-06,
"loss": 1.5409,
"step": 717
},
{
"epoch": 0.016746013929744875,
"grad_norm": 1.4692633152008057,
"learning_rate": 3.7211712878984197e-06,
"loss": 1.5922,
"step": 718
},
{
"epoch": 0.016769337068922793,
"grad_norm": 1.2148405313491821,
"learning_rate": 3.726353977714434e-06,
"loss": 1.8115,
"step": 719
},
{
"epoch": 0.016792660208100708,
"grad_norm": 1.5564905405044556,
"learning_rate": 3.7315366675304488e-06,
"loss": 1.4189,
"step": 720
},
{
"epoch": 0.016815983347278626,
"grad_norm": 1.130292296409607,
"learning_rate": 3.736719357346463e-06,
"loss": 1.4455,
"step": 721
},
{
"epoch": 0.016839306486456544,
"grad_norm": 2.0609545707702637,
"learning_rate": 3.741902047162478e-06,
"loss": 1.6052,
"step": 722
},
{
"epoch": 0.01686262962563446,
"grad_norm": 1.0422543287277222,
"learning_rate": 3.7470847369784922e-06,
"loss": 1.5889,
"step": 723
},
{
"epoch": 0.01688595276481238,
"grad_norm": 1.7926782369613647,
"learning_rate": 3.7522674267945066e-06,
"loss": 1.2304,
"step": 724
},
{
"epoch": 0.016909275903990297,
"grad_norm": 1.2486250400543213,
"learning_rate": 3.757450116610521e-06,
"loss": 1.7512,
"step": 725
},
{
"epoch": 0.016932599043168215,
"grad_norm": 1.6907048225402832,
"learning_rate": 3.7626328064265357e-06,
"loss": 1.2031,
"step": 726
},
{
"epoch": 0.016955922182346133,
"grad_norm": 1.2899296283721924,
"learning_rate": 3.76781549624255e-06,
"loss": 1.3111,
"step": 727
},
{
"epoch": 0.01697924532152405,
"grad_norm": 2.320288896560669,
"learning_rate": 3.7729981860585644e-06,
"loss": 1.2764,
"step": 728
},
{
"epoch": 0.01700256846070197,
"grad_norm": 1.4165383577346802,
"learning_rate": 3.778180875874579e-06,
"loss": 1.2847,
"step": 729
},
{
"epoch": 0.017025891599879887,
"grad_norm": 1.1537601947784424,
"learning_rate": 3.7833635656905935e-06,
"loss": 1.6002,
"step": 730
},
{
"epoch": 0.017049214739057805,
"grad_norm": 1.3128899335861206,
"learning_rate": 3.7885462555066083e-06,
"loss": 1.4159,
"step": 731
},
{
"epoch": 0.017072537878235722,
"grad_norm": 0.9494642615318298,
"learning_rate": 3.7937289453226226e-06,
"loss": 1.5425,
"step": 732
},
{
"epoch": 0.01709586101741364,
"grad_norm": 1.8949923515319824,
"learning_rate": 3.7989116351386374e-06,
"loss": 1.109,
"step": 733
},
{
"epoch": 0.017119184156591555,
"grad_norm": 1.3136776685714722,
"learning_rate": 3.8040943249546517e-06,
"loss": 1.4208,
"step": 734
},
{
"epoch": 0.017142507295769473,
"grad_norm": 1.0108048915863037,
"learning_rate": 3.8092770147706665e-06,
"loss": 1.3101,
"step": 735
},
{
"epoch": 0.01716583043494739,
"grad_norm": 1.1397989988327026,
"learning_rate": 3.814459704586681e-06,
"loss": 1.6643,
"step": 736
},
{
"epoch": 0.01718915357412531,
"grad_norm": 0.9662717580795288,
"learning_rate": 3.819642394402696e-06,
"loss": 1.5524,
"step": 737
},
{
"epoch": 0.017212476713303226,
"grad_norm": 1.5264514684677124,
"learning_rate": 3.82482508421871e-06,
"loss": 1.6702,
"step": 738
},
{
"epoch": 0.017235799852481144,
"grad_norm": 1.1797709465026855,
"learning_rate": 3.830007774034724e-06,
"loss": 1.5751,
"step": 739
},
{
"epoch": 0.017259122991659062,
"grad_norm": 1.3964486122131348,
"learning_rate": 3.835190463850739e-06,
"loss": 1.3497,
"step": 740
},
{
"epoch": 0.01728244613083698,
"grad_norm": 1.0540798902511597,
"learning_rate": 3.840373153666753e-06,
"loss": 1.623,
"step": 741
},
{
"epoch": 0.017305769270014898,
"grad_norm": 1.8619107007980347,
"learning_rate": 3.845555843482767e-06,
"loss": 1.836,
"step": 742
},
{
"epoch": 0.017329092409192816,
"grad_norm": 1.190048098564148,
"learning_rate": 3.8507385332987826e-06,
"loss": 1.6031,
"step": 743
},
{
"epoch": 0.017352415548370734,
"grad_norm": 1.32784903049469,
"learning_rate": 3.855921223114797e-06,
"loss": 1.6144,
"step": 744
},
{
"epoch": 0.01737573868754865,
"grad_norm": 1.7393810749053955,
"learning_rate": 3.861103912930811e-06,
"loss": 1.4898,
"step": 745
},
{
"epoch": 0.01739906182672657,
"grad_norm": 1.008122444152832,
"learning_rate": 3.866286602746826e-06,
"loss": 1.6506,
"step": 746
},
{
"epoch": 0.017422384965904487,
"grad_norm": 1.3282239437103271,
"learning_rate": 3.871469292562841e-06,
"loss": 1.5178,
"step": 747
},
{
"epoch": 0.017445708105082402,
"grad_norm": 1.4479358196258545,
"learning_rate": 3.876651982378855e-06,
"loss": 1.5896,
"step": 748
},
{
"epoch": 0.01746903124426032,
"grad_norm": 1.9100661277770996,
"learning_rate": 3.8818346721948695e-06,
"loss": 1.2946,
"step": 749
},
{
"epoch": 0.017492354383438238,
"grad_norm": 1.269235610961914,
"learning_rate": 3.887017362010884e-06,
"loss": 1.5707,
"step": 750
},
{
"epoch": 0.017515677522616156,
"grad_norm": 1.3187369108200073,
"learning_rate": 3.892200051826899e-06,
"loss": 1.8153,
"step": 751
},
{
"epoch": 0.017539000661794073,
"grad_norm": 1.3091131448745728,
"learning_rate": 3.8973827416429125e-06,
"loss": 1.5973,
"step": 752
},
{
"epoch": 0.01756232380097199,
"grad_norm": 1.4826890230178833,
"learning_rate": 3.902565431458927e-06,
"loss": 1.3277,
"step": 753
},
{
"epoch": 0.01758564694014991,
"grad_norm": 1.2626949548721313,
"learning_rate": 3.907748121274942e-06,
"loss": 1.5531,
"step": 754
},
{
"epoch": 0.017608970079327827,
"grad_norm": 1.1990412473678589,
"learning_rate": 3.912930811090956e-06,
"loss": 1.349,
"step": 755
},
{
"epoch": 0.017632293218505745,
"grad_norm": 1.3036906719207764,
"learning_rate": 3.918113500906971e-06,
"loss": 1.5648,
"step": 756
},
{
"epoch": 0.017655616357683663,
"grad_norm": 1.3129525184631348,
"learning_rate": 3.923296190722985e-06,
"loss": 1.7147,
"step": 757
},
{
"epoch": 0.01767893949686158,
"grad_norm": 1.4686280488967896,
"learning_rate": 3.928478880539e-06,
"loss": 1.6136,
"step": 758
},
{
"epoch": 0.0177022626360395,
"grad_norm": 1.6845604181289673,
"learning_rate": 3.933661570355015e-06,
"loss": 1.763,
"step": 759
},
{
"epoch": 0.017725585775217417,
"grad_norm": 2.019049644470215,
"learning_rate": 3.938844260171029e-06,
"loss": 1.2543,
"step": 760
},
{
"epoch": 0.017748908914395334,
"grad_norm": 1.4184072017669678,
"learning_rate": 3.944026949987043e-06,
"loss": 1.596,
"step": 761
},
{
"epoch": 0.017772232053573252,
"grad_norm": 1.127982497215271,
"learning_rate": 3.9492096398030585e-06,
"loss": 1.5485,
"step": 762
},
{
"epoch": 0.017795555192751167,
"grad_norm": 1.5097321271896362,
"learning_rate": 3.954392329619073e-06,
"loss": 1.5452,
"step": 763
},
{
"epoch": 0.017818878331929085,
"grad_norm": 1.3832807540893555,
"learning_rate": 3.959575019435087e-06,
"loss": 1.3865,
"step": 764
},
{
"epoch": 0.017842201471107003,
"grad_norm": 1.065623164176941,
"learning_rate": 3.964757709251102e-06,
"loss": 1.2218,
"step": 765
},
{
"epoch": 0.01786552461028492,
"grad_norm": 1.2190065383911133,
"learning_rate": 3.969940399067116e-06,
"loss": 1.2169,
"step": 766
},
{
"epoch": 0.01788884774946284,
"grad_norm": 1.741749882698059,
"learning_rate": 3.97512308888313e-06,
"loss": 1.7316,
"step": 767
},
{
"epoch": 0.017912170888640756,
"grad_norm": 1.2072060108184814,
"learning_rate": 3.980305778699145e-06,
"loss": 1.815,
"step": 768
},
{
"epoch": 0.017935494027818674,
"grad_norm": 1.4645625352859497,
"learning_rate": 3.98548846851516e-06,
"loss": 1.2218,
"step": 769
},
{
"epoch": 0.017958817166996592,
"grad_norm": 1.4466350078582764,
"learning_rate": 3.990671158331174e-06,
"loss": 1.7291,
"step": 770
},
{
"epoch": 0.01798214030617451,
"grad_norm": 1.364358901977539,
"learning_rate": 3.9958538481471885e-06,
"loss": 1.6527,
"step": 771
},
{
"epoch": 0.018005463445352428,
"grad_norm": 1.2262394428253174,
"learning_rate": 4.001036537963203e-06,
"loss": 1.5522,
"step": 772
},
{
"epoch": 0.018028786584530346,
"grad_norm": 1.694001317024231,
"learning_rate": 4.006219227779218e-06,
"loss": 1.5791,
"step": 773
},
{
"epoch": 0.018052109723708264,
"grad_norm": 0.7941157817840576,
"learning_rate": 4.011401917595232e-06,
"loss": 1.23,
"step": 774
},
{
"epoch": 0.01807543286288618,
"grad_norm": 1.1942747831344604,
"learning_rate": 4.016584607411247e-06,
"loss": 1.4316,
"step": 775
},
{
"epoch": 0.0180987560020641,
"grad_norm": 1.5809072256088257,
"learning_rate": 4.021767297227261e-06,
"loss": 1.7361,
"step": 776
},
{
"epoch": 0.018122079141242014,
"grad_norm": 1.2918401956558228,
"learning_rate": 4.026949987043276e-06,
"loss": 1.3285,
"step": 777
},
{
"epoch": 0.018145402280419932,
"grad_norm": 1.966123342514038,
"learning_rate": 4.032132676859291e-06,
"loss": 1.2037,
"step": 778
},
{
"epoch": 0.01816872541959785,
"grad_norm": 1.3362590074539185,
"learning_rate": 4.037315366675304e-06,
"loss": 1.3811,
"step": 779
},
{
"epoch": 0.018192048558775768,
"grad_norm": 1.0375605821609497,
"learning_rate": 4.042498056491319e-06,
"loss": 1.481,
"step": 780
},
{
"epoch": 0.018215371697953685,
"grad_norm": 2.414684295654297,
"learning_rate": 4.047680746307334e-06,
"loss": 1.773,
"step": 781
},
{
"epoch": 0.018238694837131603,
"grad_norm": 1.2252676486968994,
"learning_rate": 4.052863436123348e-06,
"loss": 1.514,
"step": 782
},
{
"epoch": 0.01826201797630952,
"grad_norm": 1.517791748046875,
"learning_rate": 4.058046125939362e-06,
"loss": 1.3442,
"step": 783
},
{
"epoch": 0.01828534111548744,
"grad_norm": 1.0303611755371094,
"learning_rate": 4.0632288157553776e-06,
"loss": 1.5593,
"step": 784
},
{
"epoch": 0.018308664254665357,
"grad_norm": 1.3615033626556396,
"learning_rate": 4.068411505571392e-06,
"loss": 1.6971,
"step": 785
},
{
"epoch": 0.018331987393843275,
"grad_norm": 1.1224147081375122,
"learning_rate": 4.073594195387406e-06,
"loss": 1.2134,
"step": 786
},
{
"epoch": 0.018355310533021193,
"grad_norm": 1.3592679500579834,
"learning_rate": 4.078776885203421e-06,
"loss": 1.7391,
"step": 787
},
{
"epoch": 0.01837863367219911,
"grad_norm": 1.6286187171936035,
"learning_rate": 4.083959575019436e-06,
"loss": 1.7279,
"step": 788
},
{
"epoch": 0.01840195681137703,
"grad_norm": 1.2597742080688477,
"learning_rate": 4.08914226483545e-06,
"loss": 1.5227,
"step": 789
},
{
"epoch": 0.018425279950554947,
"grad_norm": 1.2776849269866943,
"learning_rate": 4.0943249546514645e-06,
"loss": 1.3575,
"step": 790
},
{
"epoch": 0.01844860308973286,
"grad_norm": 1.2529163360595703,
"learning_rate": 4.099507644467479e-06,
"loss": 1.6356,
"step": 791
},
{
"epoch": 0.01847192622891078,
"grad_norm": 1.184187650680542,
"learning_rate": 4.104690334283494e-06,
"loss": 1.734,
"step": 792
},
{
"epoch": 0.018495249368088697,
"grad_norm": 1.176222562789917,
"learning_rate": 4.1098730240995075e-06,
"loss": 1.5206,
"step": 793
},
{
"epoch": 0.018518572507266615,
"grad_norm": 1.0694701671600342,
"learning_rate": 4.115055713915522e-06,
"loss": 1.1824,
"step": 794
},
{
"epoch": 0.018541895646444533,
"grad_norm": 1.5169551372528076,
"learning_rate": 4.120238403731537e-06,
"loss": 1.3817,
"step": 795
},
{
"epoch": 0.01856521878562245,
"grad_norm": 1.0996246337890625,
"learning_rate": 4.125421093547551e-06,
"loss": 1.0921,
"step": 796
},
{
"epoch": 0.01858854192480037,
"grad_norm": 1.0202140808105469,
"learning_rate": 4.130603783363566e-06,
"loss": 1.2687,
"step": 797
},
{
"epoch": 0.018611865063978286,
"grad_norm": 2.089864730834961,
"learning_rate": 4.13578647317958e-06,
"loss": 1.5417,
"step": 798
},
{
"epoch": 0.018635188203156204,
"grad_norm": 1.1465847492218018,
"learning_rate": 4.140969162995595e-06,
"loss": 1.3415,
"step": 799
},
{
"epoch": 0.018658511342334122,
"grad_norm": 1.1085565090179443,
"learning_rate": 4.14615185281161e-06,
"loss": 1.4662,
"step": 800
},
{
"epoch": 0.01868183448151204,
"grad_norm": 1.2206768989562988,
"learning_rate": 4.151334542627624e-06,
"loss": 1.4954,
"step": 801
},
{
"epoch": 0.018705157620689958,
"grad_norm": 1.1540756225585938,
"learning_rate": 4.156517232443638e-06,
"loss": 1.4953,
"step": 802
},
{
"epoch": 0.018728480759867876,
"grad_norm": 1.9667025804519653,
"learning_rate": 4.1616999222596535e-06,
"loss": 1.1834,
"step": 803
},
{
"epoch": 0.018751803899045794,
"grad_norm": 1.2202988862991333,
"learning_rate": 4.166882612075668e-06,
"loss": 1.7045,
"step": 804
},
{
"epoch": 0.018775127038223708,
"grad_norm": 1.2399123907089233,
"learning_rate": 4.172065301891682e-06,
"loss": 1.4937,
"step": 805
},
{
"epoch": 0.018798450177401626,
"grad_norm": 1.5780203342437744,
"learning_rate": 4.177247991707697e-06,
"loss": 1.6386,
"step": 806
},
{
"epoch": 0.018821773316579544,
"grad_norm": 1.524564266204834,
"learning_rate": 4.182430681523711e-06,
"loss": 1.4951,
"step": 807
},
{
"epoch": 0.01884509645575746,
"grad_norm": 1.342991590499878,
"learning_rate": 4.187613371339725e-06,
"loss": 1.3007,
"step": 808
},
{
"epoch": 0.01886841959493538,
"grad_norm": 1.320813775062561,
"learning_rate": 4.19279606115574e-06,
"loss": 1.2112,
"step": 809
},
{
"epoch": 0.018891742734113297,
"grad_norm": 1.2329927682876587,
"learning_rate": 4.197978750971755e-06,
"loss": 1.333,
"step": 810
},
{
"epoch": 0.018915065873291215,
"grad_norm": 1.3429094552993774,
"learning_rate": 4.203161440787769e-06,
"loss": 1.4805,
"step": 811
},
{
"epoch": 0.018938389012469133,
"grad_norm": 1.643641710281372,
"learning_rate": 4.2083441306037835e-06,
"loss": 1.5665,
"step": 812
},
{
"epoch": 0.01896171215164705,
"grad_norm": 1.111887812614441,
"learning_rate": 4.213526820419798e-06,
"loss": 1.6087,
"step": 813
},
{
"epoch": 0.01898503529082497,
"grad_norm": 1.3594610691070557,
"learning_rate": 4.218709510235813e-06,
"loss": 1.7666,
"step": 814
},
{
"epoch": 0.019008358430002887,
"grad_norm": 1.2298046350479126,
"learning_rate": 4.223892200051827e-06,
"loss": 1.5032,
"step": 815
},
{
"epoch": 0.019031681569180805,
"grad_norm": 1.2679171562194824,
"learning_rate": 4.229074889867842e-06,
"loss": 1.4375,
"step": 816
},
{
"epoch": 0.019055004708358723,
"grad_norm": 1.0543935298919678,
"learning_rate": 4.234257579683856e-06,
"loss": 1.6645,
"step": 817
},
{
"epoch": 0.01907832784753664,
"grad_norm": 1.2821168899536133,
"learning_rate": 4.239440269499871e-06,
"loss": 1.1945,
"step": 818
},
{
"epoch": 0.01910165098671456,
"grad_norm": 1.5575084686279297,
"learning_rate": 4.244622959315886e-06,
"loss": 1.3262,
"step": 819
},
{
"epoch": 0.019124974125892473,
"grad_norm": 1.2359989881515503,
"learning_rate": 4.2498056491319e-06,
"loss": 1.4127,
"step": 820
},
{
"epoch": 0.01914829726507039,
"grad_norm": 1.0559273958206177,
"learning_rate": 4.254988338947914e-06,
"loss": 1.4455,
"step": 821
},
{
"epoch": 0.01917162040424831,
"grad_norm": 1.3651732206344604,
"learning_rate": 4.260171028763929e-06,
"loss": 1.245,
"step": 822
},
{
"epoch": 0.019194943543426227,
"grad_norm": 1.0067932605743408,
"learning_rate": 4.265353718579943e-06,
"loss": 1.4954,
"step": 823
},
{
"epoch": 0.019218266682604145,
"grad_norm": 1.7477822303771973,
"learning_rate": 4.270536408395957e-06,
"loss": 1.8164,
"step": 824
},
{
"epoch": 0.019241589821782062,
"grad_norm": 1.1976604461669922,
"learning_rate": 4.2757190982119726e-06,
"loss": 1.4552,
"step": 825
},
{
"epoch": 0.01926491296095998,
"grad_norm": 1.306269884109497,
"learning_rate": 4.280901788027987e-06,
"loss": 1.6348,
"step": 826
},
{
"epoch": 0.019288236100137898,
"grad_norm": 1.5786314010620117,
"learning_rate": 4.286084477844001e-06,
"loss": 1.4592,
"step": 827
},
{
"epoch": 0.019311559239315816,
"grad_norm": 1.4481762647628784,
"learning_rate": 4.291267167660016e-06,
"loss": 1.3409,
"step": 828
},
{
"epoch": 0.019334882378493734,
"grad_norm": 1.1410714387893677,
"learning_rate": 4.296449857476031e-06,
"loss": 1.5746,
"step": 829
},
{
"epoch": 0.019358205517671652,
"grad_norm": 1.363434076309204,
"learning_rate": 4.301632547292045e-06,
"loss": 1.0836,
"step": 830
},
{
"epoch": 0.01938152865684957,
"grad_norm": 1.1413646936416626,
"learning_rate": 4.3068152371080595e-06,
"loss": 1.8687,
"step": 831
},
{
"epoch": 0.019404851796027488,
"grad_norm": 1.9734309911727905,
"learning_rate": 4.311997926924074e-06,
"loss": 1.3295,
"step": 832
},
{
"epoch": 0.019428174935205406,
"grad_norm": 1.5119333267211914,
"learning_rate": 4.317180616740089e-06,
"loss": 1.6817,
"step": 833
},
{
"epoch": 0.01945149807438332,
"grad_norm": 1.3933395147323608,
"learning_rate": 4.3223633065561025e-06,
"loss": 1.5288,
"step": 834
},
{
"epoch": 0.019474821213561238,
"grad_norm": 1.3713746070861816,
"learning_rate": 4.327545996372117e-06,
"loss": 1.6361,
"step": 835
},
{
"epoch": 0.019498144352739156,
"grad_norm": 1.1849229335784912,
"learning_rate": 4.332728686188132e-06,
"loss": 1.6611,
"step": 836
},
{
"epoch": 0.019521467491917074,
"grad_norm": 2.122307777404785,
"learning_rate": 4.337911376004146e-06,
"loss": 1.6258,
"step": 837
},
{
"epoch": 0.01954479063109499,
"grad_norm": 1.221781611442566,
"learning_rate": 4.343094065820161e-06,
"loss": 1.9081,
"step": 838
},
{
"epoch": 0.01956811377027291,
"grad_norm": 1.2895511388778687,
"learning_rate": 4.348276755636175e-06,
"loss": 1.2742,
"step": 839
},
{
"epoch": 0.019591436909450827,
"grad_norm": 1.1531336307525635,
"learning_rate": 4.35345944545219e-06,
"loss": 1.587,
"step": 840
},
{
"epoch": 0.019614760048628745,
"grad_norm": 1.3979135751724243,
"learning_rate": 4.358642135268205e-06,
"loss": 1.5208,
"step": 841
},
{
"epoch": 0.019638083187806663,
"grad_norm": 1.3758100271224976,
"learning_rate": 4.363824825084219e-06,
"loss": 1.246,
"step": 842
},
{
"epoch": 0.01966140632698458,
"grad_norm": 1.3759677410125732,
"learning_rate": 4.369007514900233e-06,
"loss": 1.7344,
"step": 843
},
{
"epoch": 0.0196847294661625,
"grad_norm": 1.5575461387634277,
"learning_rate": 4.3741902047162485e-06,
"loss": 1.5554,
"step": 844
},
{
"epoch": 0.019708052605340417,
"grad_norm": 1.5018088817596436,
"learning_rate": 4.379372894532263e-06,
"loss": 1.3433,
"step": 845
},
{
"epoch": 0.019731375744518335,
"grad_norm": 1.4393954277038574,
"learning_rate": 4.384555584348277e-06,
"loss": 1.7277,
"step": 846
},
{
"epoch": 0.019754698883696253,
"grad_norm": 1.0249360799789429,
"learning_rate": 4.389738274164292e-06,
"loss": 1.6538,
"step": 847
},
{
"epoch": 0.019778022022874167,
"grad_norm": 1.128587007522583,
"learning_rate": 4.394920963980306e-06,
"loss": 1.2935,
"step": 848
},
{
"epoch": 0.019801345162052085,
"grad_norm": 1.301287293434143,
"learning_rate": 4.40010365379632e-06,
"loss": 1.4193,
"step": 849
},
{
"epoch": 0.019824668301230003,
"grad_norm": 1.5180747509002686,
"learning_rate": 4.405286343612335e-06,
"loss": 1.2061,
"step": 850
},
{
"epoch": 0.01984799144040792,
"grad_norm": 0.9110321402549744,
"learning_rate": 4.41046903342835e-06,
"loss": 1.2803,
"step": 851
},
{
"epoch": 0.01987131457958584,
"grad_norm": 1.68843674659729,
"learning_rate": 4.415651723244364e-06,
"loss": 1.2037,
"step": 852
},
{
"epoch": 0.019894637718763757,
"grad_norm": 1.2198610305786133,
"learning_rate": 4.4208344130603785e-06,
"loss": 1.6652,
"step": 853
},
{
"epoch": 0.019917960857941674,
"grad_norm": 1.579087257385254,
"learning_rate": 4.426017102876393e-06,
"loss": 1.5859,
"step": 854
},
{
"epoch": 0.019941283997119592,
"grad_norm": 1.7198874950408936,
"learning_rate": 4.431199792692408e-06,
"loss": 1.4662,
"step": 855
},
{
"epoch": 0.01996460713629751,
"grad_norm": 2.817178726196289,
"learning_rate": 4.436382482508422e-06,
"loss": 1.3427,
"step": 856
},
{
"epoch": 0.019987930275475428,
"grad_norm": 1.4508287906646729,
"learning_rate": 4.441565172324437e-06,
"loss": 1.2893,
"step": 857
},
{
"epoch": 0.020011253414653346,
"grad_norm": 1.29767644405365,
"learning_rate": 4.446747862140451e-06,
"loss": 1.5759,
"step": 858
},
{
"epoch": 0.020034576553831264,
"grad_norm": 1.84248685836792,
"learning_rate": 4.451930551956466e-06,
"loss": 2.1373,
"step": 859
},
{
"epoch": 0.020057899693009182,
"grad_norm": 1.6153839826583862,
"learning_rate": 4.457113241772481e-06,
"loss": 1.3915,
"step": 860
},
{
"epoch": 0.0200812228321871,
"grad_norm": 1.3203104734420776,
"learning_rate": 4.462295931588495e-06,
"loss": 1.569,
"step": 861
},
{
"epoch": 0.020104545971365014,
"grad_norm": 1.6475995779037476,
"learning_rate": 4.467478621404509e-06,
"loss": 1.6446,
"step": 862
},
{
"epoch": 0.020127869110542932,
"grad_norm": 1.165834665298462,
"learning_rate": 4.472661311220524e-06,
"loss": 1.7323,
"step": 863
},
{
"epoch": 0.02015119224972085,
"grad_norm": 1.3182172775268555,
"learning_rate": 4.477844001036538e-06,
"loss": 1.6265,
"step": 864
},
{
"epoch": 0.020174515388898768,
"grad_norm": 1.1236745119094849,
"learning_rate": 4.483026690852552e-06,
"loss": 1.2358,
"step": 865
},
{
"epoch": 0.020197838528076686,
"grad_norm": 1.2104893922805786,
"learning_rate": 4.4882093806685676e-06,
"loss": 1.4677,
"step": 866
},
{
"epoch": 0.020221161667254604,
"grad_norm": 1.6824678182601929,
"learning_rate": 4.493392070484582e-06,
"loss": 1.5802,
"step": 867
},
{
"epoch": 0.02024448480643252,
"grad_norm": 1.0679930448532104,
"learning_rate": 4.498574760300596e-06,
"loss": 1.4105,
"step": 868
},
{
"epoch": 0.02026780794561044,
"grad_norm": 1.3705253601074219,
"learning_rate": 4.503757450116611e-06,
"loss": 1.5095,
"step": 869
},
{
"epoch": 0.020291131084788357,
"grad_norm": 1.307491660118103,
"learning_rate": 4.508940139932626e-06,
"loss": 1.3987,
"step": 870
},
{
"epoch": 0.020314454223966275,
"grad_norm": 1.4814496040344238,
"learning_rate": 4.51412282974864e-06,
"loss": 1.635,
"step": 871
},
{
"epoch": 0.020337777363144193,
"grad_norm": 0.935867190361023,
"learning_rate": 4.5193055195646545e-06,
"loss": 1.6734,
"step": 872
},
{
"epoch": 0.02036110050232211,
"grad_norm": 1.3890215158462524,
"learning_rate": 4.524488209380669e-06,
"loss": 1.4458,
"step": 873
},
{
"epoch": 0.02038442364150003,
"grad_norm": 1.628081202507019,
"learning_rate": 4.529670899196684e-06,
"loss": 1.4814,
"step": 874
},
{
"epoch": 0.020407746780677947,
"grad_norm": 1.5255577564239502,
"learning_rate": 4.534853589012698e-06,
"loss": 1.3884,
"step": 875
},
{
"epoch": 0.020431069919855865,
"grad_norm": 2.09283185005188,
"learning_rate": 4.540036278828712e-06,
"loss": 1.7396,
"step": 876
},
{
"epoch": 0.02045439305903378,
"grad_norm": 0.9901561737060547,
"learning_rate": 4.545218968644727e-06,
"loss": 1.4941,
"step": 877
},
{
"epoch": 0.020477716198211697,
"grad_norm": 1.8444923162460327,
"learning_rate": 4.550401658460741e-06,
"loss": 1.2724,
"step": 878
},
{
"epoch": 0.020501039337389615,
"grad_norm": 1.414305567741394,
"learning_rate": 4.555584348276756e-06,
"loss": 1.5781,
"step": 879
},
{
"epoch": 0.020524362476567533,
"grad_norm": 1.1960091590881348,
"learning_rate": 4.56076703809277e-06,
"loss": 1.536,
"step": 880
},
{
"epoch": 0.02054768561574545,
"grad_norm": 2.241649627685547,
"learning_rate": 4.565949727908785e-06,
"loss": 1.6636,
"step": 881
},
{
"epoch": 0.02057100875492337,
"grad_norm": 1.0672343969345093,
"learning_rate": 4.5711324177248e-06,
"loss": 1.6369,
"step": 882
},
{
"epoch": 0.020594331894101287,
"grad_norm": 1.6761622428894043,
"learning_rate": 4.576315107540814e-06,
"loss": 1.2554,
"step": 883
},
{
"epoch": 0.020617655033279204,
"grad_norm": 1.1365658044815063,
"learning_rate": 4.581497797356828e-06,
"loss": 1.6271,
"step": 884
},
{
"epoch": 0.020640978172457122,
"grad_norm": 1.0631389617919922,
"learning_rate": 4.5866804871728435e-06,
"loss": 1.6393,
"step": 885
},
{
"epoch": 0.02066430131163504,
"grad_norm": 3.27304744720459,
"learning_rate": 4.591863176988858e-06,
"loss": 1.3521,
"step": 886
},
{
"epoch": 0.020687624450812958,
"grad_norm": 1.3354477882385254,
"learning_rate": 4.597045866804872e-06,
"loss": 1.5137,
"step": 887
},
{
"epoch": 0.020710947589990876,
"grad_norm": 2.192812919616699,
"learning_rate": 4.602228556620887e-06,
"loss": 1.7294,
"step": 888
},
{
"epoch": 0.020734270729168794,
"grad_norm": 0.9716669321060181,
"learning_rate": 4.607411246436901e-06,
"loss": 1.4244,
"step": 889
},
{
"epoch": 0.020757593868346712,
"grad_norm": 1.0377227067947388,
"learning_rate": 4.612593936252915e-06,
"loss": 1.3041,
"step": 890
},
{
"epoch": 0.020780917007524626,
"grad_norm": 1.971074104309082,
"learning_rate": 4.61777662606893e-06,
"loss": 1.4917,
"step": 891
},
{
"epoch": 0.020804240146702544,
"grad_norm": 1.3108222484588623,
"learning_rate": 4.622959315884945e-06,
"loss": 1.5923,
"step": 892
},
{
"epoch": 0.020827563285880462,
"grad_norm": 1.4194189310073853,
"learning_rate": 4.628142005700959e-06,
"loss": 1.2378,
"step": 893
},
{
"epoch": 0.02085088642505838,
"grad_norm": 1.5872682332992554,
"learning_rate": 4.6333246955169735e-06,
"loss": 1.3573,
"step": 894
},
{
"epoch": 0.020874209564236298,
"grad_norm": 1.351704716682434,
"learning_rate": 4.638507385332988e-06,
"loss": 1.8374,
"step": 895
},
{
"epoch": 0.020897532703414216,
"grad_norm": 1.15986168384552,
"learning_rate": 4.643690075149003e-06,
"loss": 1.4303,
"step": 896
},
{
"epoch": 0.020920855842592134,
"grad_norm": 1.912819743156433,
"learning_rate": 4.648872764965017e-06,
"loss": 1.7733,
"step": 897
},
{
"epoch": 0.02094417898177005,
"grad_norm": 1.6582539081573486,
"learning_rate": 4.654055454781032e-06,
"loss": 1.4696,
"step": 898
},
{
"epoch": 0.02096750212094797,
"grad_norm": 1.147661805152893,
"learning_rate": 4.659238144597046e-06,
"loss": 1.5037,
"step": 899
},
{
"epoch": 0.020990825260125887,
"grad_norm": 1.1773402690887451,
"learning_rate": 4.664420834413061e-06,
"loss": 1.604,
"step": 900
},
{
"epoch": 0.021014148399303805,
"grad_norm": 1.9128248691558838,
"learning_rate": 4.669603524229076e-06,
"loss": 1.3081,
"step": 901
},
{
"epoch": 0.021037471538481723,
"grad_norm": 1.0742683410644531,
"learning_rate": 4.67478621404509e-06,
"loss": 1.5619,
"step": 902
},
{
"epoch": 0.02106079467765964,
"grad_norm": 1.19862699508667,
"learning_rate": 4.679968903861104e-06,
"loss": 1.6896,
"step": 903
},
{
"epoch": 0.02108411781683756,
"grad_norm": 1.276283860206604,
"learning_rate": 4.685151593677119e-06,
"loss": 1.65,
"step": 904
},
{
"epoch": 0.021107440956015473,
"grad_norm": 1.3582435846328735,
"learning_rate": 4.690334283493133e-06,
"loss": 1.2686,
"step": 905
},
{
"epoch": 0.02113076409519339,
"grad_norm": 1.2145341634750366,
"learning_rate": 4.695516973309147e-06,
"loss": 1.8032,
"step": 906
},
{
"epoch": 0.02115408723437131,
"grad_norm": 1.1219233274459839,
"learning_rate": 4.7006996631251626e-06,
"loss": 1.7681,
"step": 907
},
{
"epoch": 0.021177410373549227,
"grad_norm": 1.0474015474319458,
"learning_rate": 4.705882352941177e-06,
"loss": 1.4555,
"step": 908
},
{
"epoch": 0.021200733512727145,
"grad_norm": 1.6325182914733887,
"learning_rate": 4.711065042757191e-06,
"loss": 1.432,
"step": 909
},
{
"epoch": 0.021224056651905063,
"grad_norm": 1.5804178714752197,
"learning_rate": 4.716247732573206e-06,
"loss": 1.7409,
"step": 910
},
{
"epoch": 0.02124737979108298,
"grad_norm": 1.226804256439209,
"learning_rate": 4.721430422389221e-06,
"loss": 1.8077,
"step": 911
},
{
"epoch": 0.0212707029302609,
"grad_norm": 1.0747625827789307,
"learning_rate": 4.726613112205235e-06,
"loss": 1.411,
"step": 912
},
{
"epoch": 0.021294026069438816,
"grad_norm": 1.2126623392105103,
"learning_rate": 4.7317958020212495e-06,
"loss": 1.6464,
"step": 913
},
{
"epoch": 0.021317349208616734,
"grad_norm": 1.196486473083496,
"learning_rate": 4.736978491837264e-06,
"loss": 1.4365,
"step": 914
},
{
"epoch": 0.021340672347794652,
"grad_norm": 1.4727115631103516,
"learning_rate": 4.742161181653279e-06,
"loss": 1.5059,
"step": 915
},
{
"epoch": 0.02136399548697257,
"grad_norm": 1.293938159942627,
"learning_rate": 4.747343871469293e-06,
"loss": 1.5508,
"step": 916
},
{
"epoch": 0.021387318626150488,
"grad_norm": 1.3074458837509155,
"learning_rate": 4.752526561285307e-06,
"loss": 1.364,
"step": 917
},
{
"epoch": 0.021410641765328406,
"grad_norm": 1.708522081375122,
"learning_rate": 4.757709251101322e-06,
"loss": 1.2891,
"step": 918
},
{
"epoch": 0.02143396490450632,
"grad_norm": 1.2926160097122192,
"learning_rate": 4.762891940917336e-06,
"loss": 1.1779,
"step": 919
},
{
"epoch": 0.021457288043684238,
"grad_norm": 1.7751168012619019,
"learning_rate": 4.768074630733351e-06,
"loss": 1.3136,
"step": 920
},
{
"epoch": 0.021480611182862156,
"grad_norm": 1.3698194026947021,
"learning_rate": 4.773257320549365e-06,
"loss": 1.5203,
"step": 921
},
{
"epoch": 0.021503934322040074,
"grad_norm": 1.4710402488708496,
"learning_rate": 4.77844001036538e-06,
"loss": 2.0632,
"step": 922
},
{
"epoch": 0.021527257461217992,
"grad_norm": 1.3340466022491455,
"learning_rate": 4.783622700181395e-06,
"loss": 0.9449,
"step": 923
},
{
"epoch": 0.02155058060039591,
"grad_norm": 1.990078330039978,
"learning_rate": 4.788805389997409e-06,
"loss": 1.4095,
"step": 924
},
{
"epoch": 0.021573903739573828,
"grad_norm": 2.6495463848114014,
"learning_rate": 4.793988079813423e-06,
"loss": 1.5914,
"step": 925
},
{
"epoch": 0.021597226878751746,
"grad_norm": 1.368868350982666,
"learning_rate": 4.7991707696294385e-06,
"loss": 1.8007,
"step": 926
},
{
"epoch": 0.021620550017929663,
"grad_norm": 1.3946820497512817,
"learning_rate": 4.804353459445453e-06,
"loss": 1.3846,
"step": 927
},
{
"epoch": 0.02164387315710758,
"grad_norm": 1.6035547256469727,
"learning_rate": 4.809536149261467e-06,
"loss": 1.6677,
"step": 928
},
{
"epoch": 0.0216671962962855,
"grad_norm": 1.29734468460083,
"learning_rate": 4.814718839077482e-06,
"loss": 1.3697,
"step": 929
},
{
"epoch": 0.021690519435463417,
"grad_norm": 1.1746439933776855,
"learning_rate": 4.819901528893497e-06,
"loss": 1.6134,
"step": 930
},
{
"epoch": 0.021713842574641335,
"grad_norm": 1.255861759185791,
"learning_rate": 4.82508421870951e-06,
"loss": 1.6253,
"step": 931
},
{
"epoch": 0.021737165713819253,
"grad_norm": 1.5499615669250488,
"learning_rate": 4.830266908525525e-06,
"loss": 1.2794,
"step": 932
},
{
"epoch": 0.02176048885299717,
"grad_norm": 1.6138273477554321,
"learning_rate": 4.83544959834154e-06,
"loss": 1.6365,
"step": 933
},
{
"epoch": 0.021783811992175085,
"grad_norm": 1.7135401964187622,
"learning_rate": 4.840632288157554e-06,
"loss": 1.509,
"step": 934
},
{
"epoch": 0.021807135131353003,
"grad_norm": 1.4290528297424316,
"learning_rate": 4.8458149779735685e-06,
"loss": 1.3415,
"step": 935
},
{
"epoch": 0.02183045827053092,
"grad_norm": 2.034870147705078,
"learning_rate": 4.850997667789583e-06,
"loss": 1.6834,
"step": 936
},
{
"epoch": 0.02185378140970884,
"grad_norm": 1.6626250743865967,
"learning_rate": 4.856180357605598e-06,
"loss": 1.3573,
"step": 937
},
{
"epoch": 0.021877104548886757,
"grad_norm": 1.2256288528442383,
"learning_rate": 4.861363047421612e-06,
"loss": 1.5497,
"step": 938
},
{
"epoch": 0.021900427688064675,
"grad_norm": 1.218955397605896,
"learning_rate": 4.866545737237627e-06,
"loss": 1.6823,
"step": 939
},
{
"epoch": 0.021923750827242593,
"grad_norm": 1.0629289150238037,
"learning_rate": 4.871728427053641e-06,
"loss": 1.3894,
"step": 940
},
{
"epoch": 0.02194707396642051,
"grad_norm": 2.6169822216033936,
"learning_rate": 4.876911116869656e-06,
"loss": 1.4063,
"step": 941
},
{
"epoch": 0.02197039710559843,
"grad_norm": 1.1517153978347778,
"learning_rate": 4.882093806685671e-06,
"loss": 1.3838,
"step": 942
},
{
"epoch": 0.021993720244776346,
"grad_norm": 1.6320403814315796,
"learning_rate": 4.887276496501685e-06,
"loss": 1.5752,
"step": 943
},
{
"epoch": 0.022017043383954264,
"grad_norm": 1.7344862222671509,
"learning_rate": 4.892459186317699e-06,
"loss": 1.3182,
"step": 944
},
{
"epoch": 0.022040366523132182,
"grad_norm": 1.2497214078903198,
"learning_rate": 4.897641876133714e-06,
"loss": 1.2266,
"step": 945
},
{
"epoch": 0.0220636896623101,
"grad_norm": 1.996893048286438,
"learning_rate": 4.902824565949728e-06,
"loss": 1.2708,
"step": 946
},
{
"epoch": 0.022087012801488018,
"grad_norm": 1.1130571365356445,
"learning_rate": 4.908007255765742e-06,
"loss": 1.4791,
"step": 947
},
{
"epoch": 0.022110335940665932,
"grad_norm": 1.2698702812194824,
"learning_rate": 4.9131899455817576e-06,
"loss": 1.3711,
"step": 948
},
{
"epoch": 0.02213365907984385,
"grad_norm": 1.0363445281982422,
"learning_rate": 4.918372635397772e-06,
"loss": 1.4153,
"step": 949
},
{
"epoch": 0.022156982219021768,
"grad_norm": 1.1418310403823853,
"learning_rate": 4.923555325213786e-06,
"loss": 1.3377,
"step": 950
},
{
"epoch": 0.022180305358199686,
"grad_norm": 1.3740698099136353,
"learning_rate": 4.928738015029801e-06,
"loss": 1.375,
"step": 951
},
{
"epoch": 0.022203628497377604,
"grad_norm": 1.5656532049179077,
"learning_rate": 4.933920704845816e-06,
"loss": 1.651,
"step": 952
},
{
"epoch": 0.022226951636555522,
"grad_norm": 1.209380865097046,
"learning_rate": 4.93910339466183e-06,
"loss": 1.6956,
"step": 953
},
{
"epoch": 0.02225027477573344,
"grad_norm": 1.9917747974395752,
"learning_rate": 4.9442860844778445e-06,
"loss": 1.2802,
"step": 954
},
{
"epoch": 0.022273597914911358,
"grad_norm": 2.168260097503662,
"learning_rate": 4.949468774293859e-06,
"loss": 1.9773,
"step": 955
},
{
"epoch": 0.022296921054089276,
"grad_norm": 1.113978624343872,
"learning_rate": 4.954651464109874e-06,
"loss": 1.8121,
"step": 956
},
{
"epoch": 0.022320244193267193,
"grad_norm": 1.4833635091781616,
"learning_rate": 4.959834153925888e-06,
"loss": 1.694,
"step": 957
},
{
"epoch": 0.02234356733244511,
"grad_norm": 1.3287935256958008,
"learning_rate": 4.965016843741902e-06,
"loss": 1.4865,
"step": 958
},
{
"epoch": 0.02236689047162303,
"grad_norm": 1.5515238046646118,
"learning_rate": 4.970199533557917e-06,
"loss": 1.6035,
"step": 959
},
{
"epoch": 0.022390213610800947,
"grad_norm": 1.2824245691299438,
"learning_rate": 4.975382223373931e-06,
"loss": 1.5124,
"step": 960
},
{
"epoch": 0.022413536749978865,
"grad_norm": 1.2062418460845947,
"learning_rate": 4.980564913189946e-06,
"loss": 1.5982,
"step": 961
},
{
"epoch": 0.02243685988915678,
"grad_norm": 1.2790741920471191,
"learning_rate": 4.98574760300596e-06,
"loss": 1.586,
"step": 962
},
{
"epoch": 0.022460183028334697,
"grad_norm": 1.202909231185913,
"learning_rate": 4.990930292821975e-06,
"loss": 1.7387,
"step": 963
},
{
"epoch": 0.022483506167512615,
"grad_norm": 1.328963041305542,
"learning_rate": 4.99611298263799e-06,
"loss": 1.5611,
"step": 964
},
{
"epoch": 0.022506829306690533,
"grad_norm": 1.3728841543197632,
"learning_rate": 5.001295672454004e-06,
"loss": 1.6887,
"step": 965
},
{
"epoch": 0.02253015244586845,
"grad_norm": 1.2474596500396729,
"learning_rate": 5.006478362270018e-06,
"loss": 1.7337,
"step": 966
},
{
"epoch": 0.02255347558504637,
"grad_norm": 1.4526808261871338,
"learning_rate": 5.0116610520860335e-06,
"loss": 1.4009,
"step": 967
},
{
"epoch": 0.022576798724224287,
"grad_norm": 1.74959397315979,
"learning_rate": 5.016843741902048e-06,
"loss": 1.4153,
"step": 968
},
{
"epoch": 0.022600121863402205,
"grad_norm": 1.7886738777160645,
"learning_rate": 5.022026431718062e-06,
"loss": 1.3897,
"step": 969
},
{
"epoch": 0.022623445002580123,
"grad_norm": 1.3122284412384033,
"learning_rate": 5.027209121534077e-06,
"loss": 1.6551,
"step": 970
},
{
"epoch": 0.02264676814175804,
"grad_norm": 1.5374927520751953,
"learning_rate": 5.032391811350092e-06,
"loss": 1.6396,
"step": 971
},
{
"epoch": 0.02267009128093596,
"grad_norm": 1.6476905345916748,
"learning_rate": 5.037574501166106e-06,
"loss": 1.733,
"step": 972
},
{
"epoch": 0.022693414420113876,
"grad_norm": 1.3407307863235474,
"learning_rate": 5.0427571909821205e-06,
"loss": 1.4984,
"step": 973
},
{
"epoch": 0.022716737559291794,
"grad_norm": 1.5565712451934814,
"learning_rate": 5.047939880798135e-06,
"loss": 1.6524,
"step": 974
},
{
"epoch": 0.022740060698469712,
"grad_norm": 1.381903052330017,
"learning_rate": 5.053122570614149e-06,
"loss": 1.5325,
"step": 975
},
{
"epoch": 0.022763383837647626,
"grad_norm": 1.916326880455017,
"learning_rate": 5.058305260430164e-06,
"loss": 1.2326,
"step": 976
},
{
"epoch": 0.022786706976825544,
"grad_norm": 1.1621575355529785,
"learning_rate": 5.063487950246179e-06,
"loss": 1.2568,
"step": 977
},
{
"epoch": 0.022810030116003462,
"grad_norm": 1.3575561046600342,
"learning_rate": 5.068670640062193e-06,
"loss": 1.3755,
"step": 978
},
{
"epoch": 0.02283335325518138,
"grad_norm": 1.482701063156128,
"learning_rate": 5.0738533298782065e-06,
"loss": 1.598,
"step": 979
},
{
"epoch": 0.022856676394359298,
"grad_norm": 1.2530887126922607,
"learning_rate": 5.079036019694221e-06,
"loss": 1.66,
"step": 980
},
{
"epoch": 0.022879999533537216,
"grad_norm": 1.4960439205169678,
"learning_rate": 5.084218709510236e-06,
"loss": 1.5341,
"step": 981
},
{
"epoch": 0.022903322672715134,
"grad_norm": 1.507735252380371,
"learning_rate": 5.0894013993262504e-06,
"loss": 1.3987,
"step": 982
},
{
"epoch": 0.022926645811893052,
"grad_norm": 2.0131475925445557,
"learning_rate": 5.094584089142265e-06,
"loss": 1.3134,
"step": 983
},
{
"epoch": 0.02294996895107097,
"grad_norm": 1.8096015453338623,
"learning_rate": 5.099766778958279e-06,
"loss": 1.3707,
"step": 984
},
{
"epoch": 0.022973292090248888,
"grad_norm": 1.0444198846817017,
"learning_rate": 5.104949468774294e-06,
"loss": 1.4119,
"step": 985
},
{
"epoch": 0.022996615229426805,
"grad_norm": 1.3110159635543823,
"learning_rate": 5.110132158590309e-06,
"loss": 1.2187,
"step": 986
},
{
"epoch": 0.023019938368604723,
"grad_norm": 1.3191614151000977,
"learning_rate": 5.115314848406323e-06,
"loss": 1.3691,
"step": 987
},
{
"epoch": 0.02304326150778264,
"grad_norm": 1.3888386487960815,
"learning_rate": 5.120497538222337e-06,
"loss": 1.1934,
"step": 988
},
{
"epoch": 0.02306658464696056,
"grad_norm": 1.2101585865020752,
"learning_rate": 5.1256802280383526e-06,
"loss": 1.4962,
"step": 989
},
{
"epoch": 0.023089907786138477,
"grad_norm": 1.2938464879989624,
"learning_rate": 5.130862917854367e-06,
"loss": 1.4601,
"step": 990
},
{
"epoch": 0.02311323092531639,
"grad_norm": 2.072444200515747,
"learning_rate": 5.136045607670381e-06,
"loss": 1.7241,
"step": 991
},
{
"epoch": 0.02313655406449431,
"grad_norm": 1.7139407396316528,
"learning_rate": 5.141228297486396e-06,
"loss": 1.394,
"step": 992
},
{
"epoch": 0.023159877203672227,
"grad_norm": 1.5825177431106567,
"learning_rate": 5.146410987302411e-06,
"loss": 1.4218,
"step": 993
},
{
"epoch": 0.023183200342850145,
"grad_norm": 1.2233787775039673,
"learning_rate": 5.151593677118425e-06,
"loss": 1.2882,
"step": 994
},
{
"epoch": 0.023206523482028063,
"grad_norm": 1.6474647521972656,
"learning_rate": 5.1567763669344395e-06,
"loss": 1.6499,
"step": 995
},
{
"epoch": 0.02322984662120598,
"grad_norm": 1.669651985168457,
"learning_rate": 5.161959056750454e-06,
"loss": 1.1727,
"step": 996
},
{
"epoch": 0.0232531697603839,
"grad_norm": 1.4976879358291626,
"learning_rate": 5.167141746566469e-06,
"loss": 1.2149,
"step": 997
},
{
"epoch": 0.023276492899561817,
"grad_norm": 1.4033470153808594,
"learning_rate": 5.172324436382483e-06,
"loss": 1.3004,
"step": 998
},
{
"epoch": 0.023299816038739735,
"grad_norm": 1.3042150735855103,
"learning_rate": 5.177507126198498e-06,
"loss": 1.3803,
"step": 999
},
{
"epoch": 0.023323139177917653,
"grad_norm": 1.4327346086502075,
"learning_rate": 5.182689816014512e-06,
"loss": 1.7267,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 128625,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.5429008193870234e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}