{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.24997764863656682,
"eval_steps": 500,
"global_step": 1864,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00013410818059901653,
"grad_norm": 6.53010672133618,
"learning_rate": 5.999999760325567e-07,
"loss": 1.7583,
"step": 1
},
{
"epoch": 0.00026821636119803307,
"grad_norm": 3.9699106842198337,
"learning_rate": 5.999999041302309e-07,
"loss": 1.6802,
"step": 2
},
{
"epoch": 0.0004023245417970496,
"grad_norm": 1.9790778060230643,
"learning_rate": 5.999997842930357e-07,
"loss": 1.7683,
"step": 3
},
{
"epoch": 0.0005364327223960661,
"grad_norm": 4.227652802101559,
"learning_rate": 5.999996165209921e-07,
"loss": 1.7059,
"step": 4
},
{
"epoch": 0.0006705409029950827,
"grad_norm": 3.569278124536831,
"learning_rate": 5.9999940081413e-07,
"loss": 1.7249,
"step": 5
},
{
"epoch": 0.0008046490835940993,
"grad_norm": 4.901107992602518,
"learning_rate": 5.999991371724877e-07,
"loss": 1.7577,
"step": 6
},
{
"epoch": 0.0009387572641931158,
"grad_norm": 1.7387118214106754,
"learning_rate": 5.999988255961119e-07,
"loss": 1.7158,
"step": 7
},
{
"epoch": 0.0010728654447921323,
"grad_norm": 1.809495324631513,
"learning_rate": 5.99998466085058e-07,
"loss": 1.7287,
"step": 8
},
{
"epoch": 0.0012069736253911489,
"grad_norm": 2.596203473961021,
"learning_rate": 5.999980586393898e-07,
"loss": 1.7724,
"step": 9
},
{
"epoch": 0.0013410818059901655,
"grad_norm": 2.024872932500911,
"learning_rate": 5.999976032591797e-07,
"loss": 1.7405,
"step": 10
},
{
"epoch": 0.001475189986589182,
"grad_norm": 1.8852498791545222,
"learning_rate": 5.999970999445085e-07,
"loss": 1.8083,
"step": 11
},
{
"epoch": 0.0016092981671881985,
"grad_norm": 2.789618405199575,
"learning_rate": 5.999965486954655e-07,
"loss": 1.7057,
"step": 12
},
{
"epoch": 0.0017434063477872151,
"grad_norm": 5.494954113770268,
"learning_rate": 5.999959495121485e-07,
"loss": 1.7091,
"step": 13
},
{
"epoch": 0.0018775145283862315,
"grad_norm": 2.79414084165035,
"learning_rate": 5.999953023946642e-07,
"loss": 1.7631,
"step": 14
},
{
"epoch": 0.002011622708985248,
"grad_norm": 4.805471519443609,
"learning_rate": 5.999946073431272e-07,
"loss": 1.8484,
"step": 15
},
{
"epoch": 0.0021457308895842645,
"grad_norm": 1.3908444469943815,
"learning_rate": 5.99993864357661e-07,
"loss": 1.7106,
"step": 16
},
{
"epoch": 0.002279839070183281,
"grad_norm": 1.9101877569067494,
"learning_rate": 5.999930734383974e-07,
"loss": 1.7213,
"step": 17
},
{
"epoch": 0.0024139472507822978,
"grad_norm": 1.547605953954358,
"learning_rate": 5.999922345854771e-07,
"loss": 1.7222,
"step": 18
},
{
"epoch": 0.0025480554313813144,
"grad_norm": 1.4545546480798652,
"learning_rate": 5.999913477990486e-07,
"loss": 1.6248,
"step": 19
},
{
"epoch": 0.002682163611980331,
"grad_norm": 1.3664919966665414,
"learning_rate": 5.999904130792696e-07,
"loss": 1.7481,
"step": 20
},
{
"epoch": 0.002816271792579347,
"grad_norm": 1.2263286406299385,
"learning_rate": 5.999894304263061e-07,
"loss": 1.731,
"step": 21
},
{
"epoch": 0.002950379973178364,
"grad_norm": 1.4441675215823284,
"learning_rate": 5.999883998403325e-07,
"loss": 1.7489,
"step": 22
},
{
"epoch": 0.0030844881537773804,
"grad_norm": 2.076565725996637,
"learning_rate": 5.999873213215316e-07,
"loss": 1.609,
"step": 23
},
{
"epoch": 0.003218596334376397,
"grad_norm": 2.056874040951704,
"learning_rate": 5.999861948700952e-07,
"loss": 1.7387,
"step": 24
},
{
"epoch": 0.0033527045149754136,
"grad_norm": 1.196618920130671,
"learning_rate": 5.99985020486223e-07,
"loss": 1.6522,
"step": 25
},
{
"epoch": 0.0034868126955744302,
"grad_norm": 1.4295779403436433,
"learning_rate": 5.999837981701236e-07,
"loss": 1.7226,
"step": 26
},
{
"epoch": 0.0036209208761734464,
"grad_norm": 1.22926449530156,
"learning_rate": 5.99982527922014e-07,
"loss": 1.699,
"step": 27
},
{
"epoch": 0.003755029056772463,
"grad_norm": 1.324357519929758,
"learning_rate": 5.999812097421198e-07,
"loss": 1.784,
"step": 28
},
{
"epoch": 0.0038891372373714797,
"grad_norm": 1.351746272995911,
"learning_rate": 5.999798436306748e-07,
"loss": 1.7094,
"step": 29
},
{
"epoch": 0.004023245417970496,
"grad_norm": 1.3696717018122837,
"learning_rate": 5.999784295879217e-07,
"loss": 1.8113,
"step": 30
},
{
"epoch": 0.0041573535985695124,
"grad_norm": 1.2950751514861556,
"learning_rate": 5.999769676141116e-07,
"loss": 1.7043,
"step": 31
},
{
"epoch": 0.004291461779168529,
"grad_norm": 1.510791624383582,
"learning_rate": 5.99975457709504e-07,
"loss": 1.7247,
"step": 32
},
{
"epoch": 0.004425569959767546,
"grad_norm": 1.205151919537117,
"learning_rate": 5.999738998743669e-07,
"loss": 1.7102,
"step": 33
},
{
"epoch": 0.004559678140366562,
"grad_norm": 1.2313460275237813,
"learning_rate": 5.999722941089769e-07,
"loss": 1.6194,
"step": 34
},
{
"epoch": 0.004693786320965579,
"grad_norm": 1.2769810504677248,
"learning_rate": 5.999706404136191e-07,
"loss": 1.6776,
"step": 35
},
{
"epoch": 0.0048278945015645955,
"grad_norm": 1.210621301547261,
"learning_rate": 5.99968938788587e-07,
"loss": 1.658,
"step": 36
},
{
"epoch": 0.004962002682163612,
"grad_norm": 1.3309399301655989,
"learning_rate": 5.99967189234183e-07,
"loss": 1.598,
"step": 37
},
{
"epoch": 0.005096110862762629,
"grad_norm": 1.2698125491932901,
"learning_rate": 5.999653917507173e-07,
"loss": 1.6783,
"step": 38
},
{
"epoch": 0.005230219043361645,
"grad_norm": 1.269690475054205,
"learning_rate": 5.999635463385092e-07,
"loss": 1.7118,
"step": 39
},
{
"epoch": 0.005364327223960662,
"grad_norm": 1.239195449838068,
"learning_rate": 5.999616529978864e-07,
"loss": 1.7552,
"step": 40
},
{
"epoch": 0.005498435404559678,
"grad_norm": 1.213245919091097,
"learning_rate": 5.999597117291851e-07,
"loss": 1.6195,
"step": 41
},
{
"epoch": 0.005632543585158694,
"grad_norm": 1.472546911008587,
"learning_rate": 5.999577225327498e-07,
"loss": 1.7151,
"step": 42
},
{
"epoch": 0.005766651765757711,
"grad_norm": 1.1739645532291967,
"learning_rate": 5.999556854089335e-07,
"loss": 1.6848,
"step": 43
},
{
"epoch": 0.005900759946356728,
"grad_norm": 1.6603998730539062,
"learning_rate": 5.999536003580982e-07,
"loss": 1.7987,
"step": 44
},
{
"epoch": 0.006034868126955744,
"grad_norm": 1.2267822489395797,
"learning_rate": 5.999514673806138e-07,
"loss": 1.7743,
"step": 45
},
{
"epoch": 0.006168976307554761,
"grad_norm": 1.182672696849382,
"learning_rate": 5.999492864768594e-07,
"loss": 1.7007,
"step": 46
},
{
"epoch": 0.006303084488153777,
"grad_norm": 1.3725982814639008,
"learning_rate": 5.999470576472216e-07,
"loss": 1.6453,
"step": 47
},
{
"epoch": 0.006437192668752794,
"grad_norm": 1.2302523441661959,
"learning_rate": 5.999447808920965e-07,
"loss": 1.668,
"step": 48
},
{
"epoch": 0.006571300849351811,
"grad_norm": 1.5825139985036842,
"learning_rate": 5.999424562118882e-07,
"loss": 1.677,
"step": 49
},
{
"epoch": 0.006705409029950827,
"grad_norm": 1.3441000281769755,
"learning_rate": 5.999400836070092e-07,
"loss": 1.7907,
"step": 50
},
{
"epoch": 0.006839517210549844,
"grad_norm": 1.2662568205784916,
"learning_rate": 5.999376630778812e-07,
"loss": 1.7948,
"step": 51
},
{
"epoch": 0.0069736253911488605,
"grad_norm": 1.6969919156319755,
"learning_rate": 5.999351946249336e-07,
"loss": 1.704,
"step": 52
},
{
"epoch": 0.007107733571747876,
"grad_norm": 1.3702701009027687,
"learning_rate": 5.999326782486047e-07,
"loss": 1.7596,
"step": 53
},
{
"epoch": 0.007241841752346893,
"grad_norm": 1.2008226357772018,
"learning_rate": 5.999301139493413e-07,
"loss": 1.7446,
"step": 54
},
{
"epoch": 0.0073759499329459095,
"grad_norm": 1.1610594693793954,
"learning_rate": 5.999275017275985e-07,
"loss": 1.6545,
"step": 55
},
{
"epoch": 0.007510058113544926,
"grad_norm": 1.2318851837588591,
"learning_rate": 5.999248415838404e-07,
"loss": 1.6945,
"step": 56
},
{
"epoch": 0.007644166294143943,
"grad_norm": 1.3623097314650943,
"learning_rate": 5.99922133518539e-07,
"loss": 1.7576,
"step": 57
},
{
"epoch": 0.007778274474742959,
"grad_norm": 1.263711259426924,
"learning_rate": 5.999193775321749e-07,
"loss": 1.7202,
"step": 58
},
{
"epoch": 0.007912382655341976,
"grad_norm": 1.266618530800646,
"learning_rate": 5.999165736252378e-07,
"loss": 1.7277,
"step": 59
},
{
"epoch": 0.008046490835940992,
"grad_norm": 1.20263409583272,
"learning_rate": 5.999137217982253e-07,
"loss": 1.7287,
"step": 60
},
{
"epoch": 0.00818059901654001,
"grad_norm": 1.3137021476149842,
"learning_rate": 5.999108220516439e-07,
"loss": 1.7524,
"step": 61
},
{
"epoch": 0.008314707197139025,
"grad_norm": 1.2381760472328087,
"learning_rate": 5.999078743860079e-07,
"loss": 1.6713,
"step": 62
},
{
"epoch": 0.008448815377738042,
"grad_norm": 1.1488246018603008,
"learning_rate": 5.999048788018412e-07,
"loss": 1.61,
"step": 63
},
{
"epoch": 0.008582923558337058,
"grad_norm": 1.1657309327731467,
"learning_rate": 5.999018352996753e-07,
"loss": 1.7329,
"step": 64
},
{
"epoch": 0.008717031738936076,
"grad_norm": 1.4859993327682761,
"learning_rate": 5.998987438800507e-07,
"loss": 1.7751,
"step": 65
},
{
"epoch": 0.008851139919535091,
"grad_norm": 1.2336235778167894,
"learning_rate": 5.99895604543516e-07,
"loss": 1.7698,
"step": 66
},
{
"epoch": 0.008985248100134109,
"grad_norm": 1.2063484420298083,
"learning_rate": 5.998924172906287e-07,
"loss": 1.6674,
"step": 67
},
{
"epoch": 0.009119356280733125,
"grad_norm": 1.144489164232074,
"learning_rate": 5.998891821219549e-07,
"loss": 1.6727,
"step": 68
},
{
"epoch": 0.009253464461332142,
"grad_norm": 1.1661711232482204,
"learning_rate": 5.998858990380685e-07,
"loss": 1.72,
"step": 69
},
{
"epoch": 0.009387572641931158,
"grad_norm": 1.8657773898969878,
"learning_rate": 5.998825680395526e-07,
"loss": 1.67,
"step": 70
},
{
"epoch": 0.009521680822530174,
"grad_norm": 1.2765420086009807,
"learning_rate": 5.998791891269986e-07,
"loss": 1.7016,
"step": 71
},
{
"epoch": 0.009655789003129191,
"grad_norm": 1.1153772140374385,
"learning_rate": 5.998757623010063e-07,
"loss": 1.707,
"step": 72
},
{
"epoch": 0.009789897183728207,
"grad_norm": 1.1669261546443137,
"learning_rate": 5.998722875621842e-07,
"loss": 1.6859,
"step": 73
},
{
"epoch": 0.009924005364327224,
"grad_norm": 1.1777142907854627,
"learning_rate": 5.99868764911149e-07,
"loss": 1.616,
"step": 74
},
{
"epoch": 0.01005811354492624,
"grad_norm": 1.1386560612646601,
"learning_rate": 5.998651943485263e-07,
"loss": 1.7086,
"step": 75
},
{
"epoch": 0.010192221725525258,
"grad_norm": 1.1396265347862253,
"learning_rate": 5.998615758749499e-07,
"loss": 1.6094,
"step": 76
},
{
"epoch": 0.010326329906124273,
"grad_norm": 1.1418930865866173,
"learning_rate": 5.998579094910623e-07,
"loss": 1.5653,
"step": 77
},
{
"epoch": 0.01046043808672329,
"grad_norm": 1.2012675736770206,
"learning_rate": 5.998541951975143e-07,
"loss": 1.749,
"step": 78
},
{
"epoch": 0.010594546267322306,
"grad_norm": 1.1829649799589437,
"learning_rate": 5.998504329949654e-07,
"loss": 1.741,
"step": 79
},
{
"epoch": 0.010728654447921324,
"grad_norm": 1.1137771771242837,
"learning_rate": 5.998466228840834e-07,
"loss": 1.7467,
"step": 80
},
{
"epoch": 0.01086276262852034,
"grad_norm": 1.2213171478733171,
"learning_rate": 5.998427648655449e-07,
"loss": 1.7411,
"step": 81
},
{
"epoch": 0.010996870809119355,
"grad_norm": 1.2565644926554131,
"learning_rate": 5.998388589400348e-07,
"loss": 1.5334,
"step": 82
},
{
"epoch": 0.011130978989718373,
"grad_norm": 1.1677953640865506,
"learning_rate": 5.998349051082467e-07,
"loss": 1.6292,
"step": 83
},
{
"epoch": 0.011265087170317389,
"grad_norm": 1.1735267116017247,
"learning_rate": 5.998309033708821e-07,
"loss": 1.7093,
"step": 84
},
{
"epoch": 0.011399195350916406,
"grad_norm": 1.1800930162312424,
"learning_rate": 5.998268537286519e-07,
"loss": 1.6931,
"step": 85
},
{
"epoch": 0.011533303531515422,
"grad_norm": 1.267648133239451,
"learning_rate": 5.998227561822748e-07,
"loss": 1.7372,
"step": 86
},
{
"epoch": 0.01166741171211444,
"grad_norm": 1.322566507024075,
"learning_rate": 5.998186107324783e-07,
"loss": 1.729,
"step": 87
},
{
"epoch": 0.011801519892713455,
"grad_norm": 1.2115409550398644,
"learning_rate": 5.998144173799985e-07,
"loss": 1.8509,
"step": 88
},
{
"epoch": 0.011935628073312473,
"grad_norm": 1.2085609394825974,
"learning_rate": 5.998101761255799e-07,
"loss": 1.6913,
"step": 89
},
{
"epoch": 0.012069736253911488,
"grad_norm": 1.290801409771777,
"learning_rate": 5.998058869699753e-07,
"loss": 1.7102,
"step": 90
},
{
"epoch": 0.012203844434510506,
"grad_norm": 1.1367739383903264,
"learning_rate": 5.998015499139461e-07,
"loss": 1.6836,
"step": 91
},
{
"epoch": 0.012337952615109522,
"grad_norm": 1.1670495306196762,
"learning_rate": 5.997971649582626e-07,
"loss": 1.7664,
"step": 92
},
{
"epoch": 0.012472060795708539,
"grad_norm": 1.1506012664004979,
"learning_rate": 5.99792732103703e-07,
"loss": 1.6477,
"step": 93
},
{
"epoch": 0.012606168976307555,
"grad_norm": 1.1708715291743035,
"learning_rate": 5.997882513510546e-07,
"loss": 1.6524,
"step": 94
},
{
"epoch": 0.01274027715690657,
"grad_norm": 1.151933225888518,
"learning_rate": 5.997837227011127e-07,
"loss": 1.7245,
"step": 95
},
{
"epoch": 0.012874385337505588,
"grad_norm": 1.1690902149158897,
"learning_rate": 5.997791461546813e-07,
"loss": 1.7276,
"step": 96
},
{
"epoch": 0.013008493518104604,
"grad_norm": 1.2091849738704115,
"learning_rate": 5.997745217125728e-07,
"loss": 1.6816,
"step": 97
},
{
"epoch": 0.013142601698703621,
"grad_norm": 1.154532285060635,
"learning_rate": 5.997698493756085e-07,
"loss": 1.7065,
"step": 98
},
{
"epoch": 0.013276709879302637,
"grad_norm": 1.084556225295123,
"learning_rate": 5.997651291446176e-07,
"loss": 1.6972,
"step": 99
},
{
"epoch": 0.013410818059901655,
"grad_norm": 1.0844384684144817,
"learning_rate": 5.997603610204383e-07,
"loss": 1.6011,
"step": 100
},
{
"epoch": 0.01354492624050067,
"grad_norm": 1.1349833362519353,
"learning_rate": 5.997555450039173e-07,
"loss": 1.7058,
"step": 101
},
{
"epoch": 0.013679034421099688,
"grad_norm": 1.161646012371061,
"learning_rate": 5.997506810959091e-07,
"loss": 1.7284,
"step": 102
},
{
"epoch": 0.013813142601698703,
"grad_norm": 1.1931085385755509,
"learning_rate": 5.997457692972776e-07,
"loss": 1.6889,
"step": 103
},
{
"epoch": 0.013947250782297721,
"grad_norm": 1.1274496052792788,
"learning_rate": 5.997408096088949e-07,
"loss": 1.6966,
"step": 104
},
{
"epoch": 0.014081358962896737,
"grad_norm": 1.181021137421778,
"learning_rate": 5.997358020316412e-07,
"loss": 1.6328,
"step": 105
},
{
"epoch": 0.014215467143495752,
"grad_norm": 1.1775178821818613,
"learning_rate": 5.997307465664057e-07,
"loss": 1.776,
"step": 106
},
{
"epoch": 0.01434957532409477,
"grad_norm": 1.1589504262285564,
"learning_rate": 5.99725643214086e-07,
"loss": 1.7587,
"step": 107
},
{
"epoch": 0.014483683504693786,
"grad_norm": 1.0988787787594243,
"learning_rate": 5.99720491975588e-07,
"loss": 1.6803,
"step": 108
},
{
"epoch": 0.014617791685292803,
"grad_norm": 1.1461688756871193,
"learning_rate": 5.997152928518265e-07,
"loss": 1.607,
"step": 109
},
{
"epoch": 0.014751899865891819,
"grad_norm": 1.152474644239047,
"learning_rate": 5.99710045843724e-07,
"loss": 1.7633,
"step": 110
},
{
"epoch": 0.014886008046490836,
"grad_norm": 1.1059120772328972,
"learning_rate": 5.997047509522127e-07,
"loss": 1.6747,
"step": 111
},
{
"epoch": 0.015020116227089852,
"grad_norm": 1.313489457814451,
"learning_rate": 5.996994081782321e-07,
"loss": 1.7596,
"step": 112
},
{
"epoch": 0.01515422440768887,
"grad_norm": 1.111253336672023,
"learning_rate": 5.99694017522731e-07,
"loss": 1.6808,
"step": 113
},
{
"epoch": 0.015288332588287885,
"grad_norm": 1.1886992881117084,
"learning_rate": 5.996885789866662e-07,
"loss": 1.7115,
"step": 114
},
{
"epoch": 0.015422440768886903,
"grad_norm": 1.3209352003575652,
"learning_rate": 5.996830925710036e-07,
"loss": 1.6806,
"step": 115
},
{
"epoch": 0.015556548949485919,
"grad_norm": 1.1206995765571244,
"learning_rate": 5.99677558276717e-07,
"loss": 1.7454,
"step": 116
},
{
"epoch": 0.015690657130084936,
"grad_norm": 1.5425145699155092,
"learning_rate": 5.996719761047891e-07,
"loss": 1.7396,
"step": 117
},
{
"epoch": 0.015824765310683952,
"grad_norm": 1.1362376633432387,
"learning_rate": 5.996663460562107e-07,
"loss": 1.7999,
"step": 118
},
{
"epoch": 0.015958873491282968,
"grad_norm": 1.3633221428865825,
"learning_rate": 5.996606681319816e-07,
"loss": 1.7351,
"step": 119
},
{
"epoch": 0.016092981671881983,
"grad_norm": 1.3214169620385536,
"learning_rate": 5.996549423331097e-07,
"loss": 1.8187,
"step": 120
},
{
"epoch": 0.016227089852481003,
"grad_norm": 1.127623956399482,
"learning_rate": 5.996491686606115e-07,
"loss": 1.7869,
"step": 121
},
{
"epoch": 0.01636119803308002,
"grad_norm": 1.1391849506858633,
"learning_rate": 5.996433471155121e-07,
"loss": 1.6692,
"step": 122
},
{
"epoch": 0.016495306213679034,
"grad_norm": 1.2689844428227393,
"learning_rate": 5.99637477698845e-07,
"loss": 1.7503,
"step": 123
},
{
"epoch": 0.01662941439427805,
"grad_norm": 1.1304906129070134,
"learning_rate": 5.996315604116523e-07,
"loss": 1.7342,
"step": 124
},
{
"epoch": 0.01676352257487707,
"grad_norm": 1.114839424591474,
"learning_rate": 5.996255952549846e-07,
"loss": 1.7152,
"step": 125
},
{
"epoch": 0.016897630755476085,
"grad_norm": 1.1209913395354725,
"learning_rate": 5.996195822299007e-07,
"loss": 1.7016,
"step": 126
},
{
"epoch": 0.0170317389360751,
"grad_norm": 1.2030735367344376,
"learning_rate": 5.996135213374683e-07,
"loss": 1.6916,
"step": 127
},
{
"epoch": 0.017165847116674116,
"grad_norm": 1.1576104199692667,
"learning_rate": 5.996074125787635e-07,
"loss": 1.6998,
"step": 128
},
{
"epoch": 0.017299955297273132,
"grad_norm": 1.1589080789600115,
"learning_rate": 5.996012559548706e-07,
"loss": 1.7135,
"step": 129
},
{
"epoch": 0.01743406347787215,
"grad_norm": 1.1305459345535596,
"learning_rate": 5.995950514668828e-07,
"loss": 1.7388,
"step": 130
},
{
"epoch": 0.017568171658471167,
"grad_norm": 1.1845858349294451,
"learning_rate": 5.995887991159015e-07,
"loss": 1.6555,
"step": 131
},
{
"epoch": 0.017702279839070183,
"grad_norm": 1.1323032600098655,
"learning_rate": 5.99582498903037e-07,
"loss": 1.7391,
"step": 132
},
{
"epoch": 0.0178363880196692,
"grad_norm": 1.1284637339427535,
"learning_rate": 5.995761508294074e-07,
"loss": 1.7362,
"step": 133
},
{
"epoch": 0.017970496200268218,
"grad_norm": 1.1056404062639804,
"learning_rate": 5.995697548961401e-07,
"loss": 1.6097,
"step": 134
},
{
"epoch": 0.018104604380867233,
"grad_norm": 1.168355336516219,
"learning_rate": 5.995633111043703e-07,
"loss": 1.6254,
"step": 135
},
{
"epoch": 0.01823871256146625,
"grad_norm": 1.1077402595262742,
"learning_rate": 5.995568194552422e-07,
"loss": 1.6421,
"step": 136
},
{
"epoch": 0.018372820742065265,
"grad_norm": 1.1281814027534607,
"learning_rate": 5.995502799499084e-07,
"loss": 1.6564,
"step": 137
},
{
"epoch": 0.018506928922664284,
"grad_norm": 1.1954713277317879,
"learning_rate": 5.995436925895296e-07,
"loss": 1.7595,
"step": 138
},
{
"epoch": 0.0186410371032633,
"grad_norm": 1.0981557354609233,
"learning_rate": 5.995370573752754e-07,
"loss": 1.7267,
"step": 139
},
{
"epoch": 0.018775145283862316,
"grad_norm": 1.1055449164193234,
"learning_rate": 5.99530374308324e-07,
"loss": 1.7206,
"step": 140
},
{
"epoch": 0.01890925346446133,
"grad_norm": 1.1553645507386814,
"learning_rate": 5.995236433898617e-07,
"loss": 1.7575,
"step": 141
},
{
"epoch": 0.019043361645060347,
"grad_norm": 1.153673293502894,
"learning_rate": 5.995168646210836e-07,
"loss": 1.6141,
"step": 142
},
{
"epoch": 0.019177469825659366,
"grad_norm": 1.2666080786381764,
"learning_rate": 5.995100380031929e-07,
"loss": 1.6959,
"step": 143
},
{
"epoch": 0.019311578006258382,
"grad_norm": 1.1798276231576013,
"learning_rate": 5.99503163537402e-07,
"loss": 1.6898,
"step": 144
},
{
"epoch": 0.019445686186857398,
"grad_norm": 1.1774834485948251,
"learning_rate": 5.99496241224931e-07,
"loss": 1.6964,
"step": 145
},
{
"epoch": 0.019579794367456414,
"grad_norm": 1.1714422777051698,
"learning_rate": 5.994892710670092e-07,
"loss": 1.7554,
"step": 146
},
{
"epoch": 0.019713902548055433,
"grad_norm": 1.1423261455221443,
"learning_rate": 5.994822530648737e-07,
"loss": 1.6261,
"step": 147
},
{
"epoch": 0.01984801072865445,
"grad_norm": 1.2304343117411098,
"learning_rate": 5.994751872197707e-07,
"loss": 1.6867,
"step": 148
},
{
"epoch": 0.019982118909253464,
"grad_norm": 1.6067971505148326,
"learning_rate": 5.994680735329545e-07,
"loss": 1.7063,
"step": 149
},
{
"epoch": 0.02011622708985248,
"grad_norm": 1.2298413663973762,
"learning_rate": 5.994609120056881e-07,
"loss": 1.8201,
"step": 150
},
{
"epoch": 0.0202503352704515,
"grad_norm": 1.3415909829046686,
"learning_rate": 5.994537026392431e-07,
"loss": 1.7761,
"step": 151
},
{
"epoch": 0.020384443451050515,
"grad_norm": 1.092009599817307,
"learning_rate": 5.994464454348991e-07,
"loss": 1.6873,
"step": 152
},
{
"epoch": 0.02051855163164953,
"grad_norm": 1.123624476668709,
"learning_rate": 5.994391403939447e-07,
"loss": 1.6261,
"step": 153
},
{
"epoch": 0.020652659812248546,
"grad_norm": 1.194551927187871,
"learning_rate": 5.994317875176768e-07,
"loss": 1.6832,
"step": 154
},
{
"epoch": 0.020786767992847562,
"grad_norm": 1.2676660386871186,
"learning_rate": 5.99424386807401e-07,
"loss": 1.7296,
"step": 155
},
{
"epoch": 0.02092087617344658,
"grad_norm": 1.1316668463703698,
"learning_rate": 5.994169382644308e-07,
"loss": 1.5888,
"step": 156
},
{
"epoch": 0.021054984354045597,
"grad_norm": 1.1959893911689907,
"learning_rate": 5.994094418900889e-07,
"loss": 1.75,
"step": 157
},
{
"epoch": 0.021189092534644613,
"grad_norm": 1.1591910643371741,
"learning_rate": 5.994018976857061e-07,
"loss": 1.6475,
"step": 158
},
{
"epoch": 0.02132320071524363,
"grad_norm": 1.1845979417363135,
"learning_rate": 5.993943056526216e-07,
"loss": 1.6961,
"step": 159
},
{
"epoch": 0.021457308895842648,
"grad_norm": 1.1240443814893686,
"learning_rate": 5.993866657921835e-07,
"loss": 1.6806,
"step": 160
},
{
"epoch": 0.021591417076441664,
"grad_norm": 1.24112432554422,
"learning_rate": 5.99378978105748e-07,
"loss": 1.7856,
"step": 161
},
{
"epoch": 0.02172552525704068,
"grad_norm": 1.1097493699883791,
"learning_rate": 5.993712425946801e-07,
"loss": 1.6526,
"step": 162
},
{
"epoch": 0.021859633437639695,
"grad_norm": 1.1808847880790212,
"learning_rate": 5.99363459260353e-07,
"loss": 1.6635,
"step": 163
},
{
"epoch": 0.02199374161823871,
"grad_norm": 1.1260977048233447,
"learning_rate": 5.993556281041487e-07,
"loss": 1.6883,
"step": 164
},
{
"epoch": 0.02212784979883773,
"grad_norm": 1.1355214317178735,
"learning_rate": 5.993477491274572e-07,
"loss": 1.7197,
"step": 165
},
{
"epoch": 0.022261957979436746,
"grad_norm": 1.1667677632223183,
"learning_rate": 5.993398223316776e-07,
"loss": 1.652,
"step": 166
},
{
"epoch": 0.02239606616003576,
"grad_norm": 1.2054751712250278,
"learning_rate": 5.993318477182171e-07,
"loss": 1.7181,
"step": 167
},
{
"epoch": 0.022530174340634777,
"grad_norm": 1.102112367147099,
"learning_rate": 5.993238252884914e-07,
"loss": 1.7064,
"step": 168
},
{
"epoch": 0.022664282521233797,
"grad_norm": 1.1174237172322072,
"learning_rate": 5.99315755043925e-07,
"loss": 1.7088,
"step": 169
},
{
"epoch": 0.022798390701832812,
"grad_norm": 1.1526425806154745,
"learning_rate": 5.993076369859505e-07,
"loss": 1.6713,
"step": 170
},
{
"epoch": 0.022932498882431828,
"grad_norm": 1.189041831016279,
"learning_rate": 5.992994711160089e-07,
"loss": 1.796,
"step": 171
},
{
"epoch": 0.023066607063030844,
"grad_norm": 1.1020745587716836,
"learning_rate": 5.992912574355505e-07,
"loss": 1.7036,
"step": 172
},
{
"epoch": 0.023200715243629863,
"grad_norm": 1.1024337259717305,
"learning_rate": 5.992829959460332e-07,
"loss": 1.7183,
"step": 173
},
{
"epoch": 0.02333482342422888,
"grad_norm": 1.2334229037351967,
"learning_rate": 5.992746866489237e-07,
"loss": 1.7278,
"step": 174
},
{
"epoch": 0.023468931604827895,
"grad_norm": 1.1942148623032185,
"learning_rate": 5.992663295456972e-07,
"loss": 1.7127,
"step": 175
},
{
"epoch": 0.02360303978542691,
"grad_norm": 1.154550051904798,
"learning_rate": 5.992579246378375e-07,
"loss": 1.7259,
"step": 176
},
{
"epoch": 0.023737147966025926,
"grad_norm": 1.139454652919351,
"learning_rate": 5.992494719268369e-07,
"loss": 1.8202,
"step": 177
},
{
"epoch": 0.023871256146624945,
"grad_norm": 1.1334182773252635,
"learning_rate": 5.992409714141957e-07,
"loss": 1.7458,
"step": 178
},
{
"epoch": 0.02400536432722396,
"grad_norm": 1.175452351416824,
"learning_rate": 5.992324231014234e-07,
"loss": 1.7343,
"step": 179
},
{
"epoch": 0.024139472507822977,
"grad_norm": 1.15495982844933,
"learning_rate": 5.992238269900374e-07,
"loss": 1.6397,
"step": 180
},
{
"epoch": 0.024273580688421992,
"grad_norm": 1.222036602203619,
"learning_rate": 5.992151830815639e-07,
"loss": 1.6585,
"step": 181
},
{
"epoch": 0.02440768886902101,
"grad_norm": 1.1103145700032067,
"learning_rate": 5.992064913775376e-07,
"loss": 1.6729,
"step": 182
},
{
"epoch": 0.024541797049620027,
"grad_norm": 1.1627847561281206,
"learning_rate": 5.991977518795014e-07,
"loss": 1.6693,
"step": 183
},
{
"epoch": 0.024675905230219043,
"grad_norm": 1.2021941957895712,
"learning_rate": 5.991889645890071e-07,
"loss": 1.7692,
"step": 184
},
{
"epoch": 0.02481001341081806,
"grad_norm": 1.0987338031386753,
"learning_rate": 5.991801295076147e-07,
"loss": 1.7378,
"step": 185
},
{
"epoch": 0.024944121591417078,
"grad_norm": 1.1764726234102538,
"learning_rate": 5.991712466368927e-07,
"loss": 1.7519,
"step": 186
},
{
"epoch": 0.025078229772016094,
"grad_norm": 1.1211959879636015,
"learning_rate": 5.991623159784181e-07,
"loss": 1.6915,
"step": 187
},
{
"epoch": 0.02521233795261511,
"grad_norm": 1.183694924138999,
"learning_rate": 5.991533375337764e-07,
"loss": 1.6992,
"step": 188
},
{
"epoch": 0.025346446133214125,
"grad_norm": 1.1093630411034636,
"learning_rate": 5.991443113045618e-07,
"loss": 1.7517,
"step": 189
},
{
"epoch": 0.02548055431381314,
"grad_norm": 1.0851009440926755,
"learning_rate": 5.991352372923766e-07,
"loss": 1.6776,
"step": 190
},
{
"epoch": 0.02561466249441216,
"grad_norm": 1.1395885180659286,
"learning_rate": 5.99126115498832e-07,
"loss": 1.6924,
"step": 191
},
{
"epoch": 0.025748770675011176,
"grad_norm": 1.177643032023232,
"learning_rate": 5.99116945925547e-07,
"loss": 1.8049,
"step": 192
},
{
"epoch": 0.025882878855610192,
"grad_norm": 1.2055741329571488,
"learning_rate": 5.9910772857415e-07,
"loss": 1.7318,
"step": 193
},
{
"epoch": 0.026016987036209208,
"grad_norm": 1.0540261983643227,
"learning_rate": 5.990984634462772e-07,
"loss": 1.6957,
"step": 194
},
{
"epoch": 0.026151095216808227,
"grad_norm": 1.1229012489144132,
"learning_rate": 5.990891505435736e-07,
"loss": 1.6655,
"step": 195
},
{
"epoch": 0.026285203397407243,
"grad_norm": 1.244124126818224,
"learning_rate": 5.990797898676924e-07,
"loss": 1.6651,
"step": 196
},
{
"epoch": 0.02641931157800626,
"grad_norm": 1.153272959704337,
"learning_rate": 5.990703814202957e-07,
"loss": 1.614,
"step": 197
},
{
"epoch": 0.026553419758605274,
"grad_norm": 1.1097663064103196,
"learning_rate": 5.990609252030535e-07,
"loss": 1.6663,
"step": 198
},
{
"epoch": 0.02668752793920429,
"grad_norm": 1.1863665422420122,
"learning_rate": 5.990514212176451e-07,
"loss": 1.6996,
"step": 199
},
{
"epoch": 0.02682163611980331,
"grad_norm": 1.19855062119957,
"learning_rate": 5.990418694657574e-07,
"loss": 1.6788,
"step": 200
},
{
"epoch": 0.026955744300402325,
"grad_norm": 2.240646939516989,
"learning_rate": 5.990322699490864e-07,
"loss": 1.6072,
"step": 201
},
{
"epoch": 0.02708985248100134,
"grad_norm": 1.16787040451677,
"learning_rate": 5.990226226693363e-07,
"loss": 1.7495,
"step": 202
},
{
"epoch": 0.027223960661600356,
"grad_norm": 1.1801746959435724,
"learning_rate": 5.990129276282199e-07,
"loss": 1.7816,
"step": 203
},
{
"epoch": 0.027358068842199375,
"grad_norm": 1.0865741581201773,
"learning_rate": 5.990031848274582e-07,
"loss": 1.6386,
"step": 204
},
{
"epoch": 0.02749217702279839,
"grad_norm": 1.1195448058003792,
"learning_rate": 5.989933942687813e-07,
"loss": 1.7666,
"step": 205
},
{
"epoch": 0.027626285203397407,
"grad_norm": 1.1595509114049103,
"learning_rate": 5.989835559539271e-07,
"loss": 1.7783,
"step": 206
},
{
"epoch": 0.027760393383996423,
"grad_norm": 1.132633530996875,
"learning_rate": 5.989736698846422e-07,
"loss": 1.7369,
"step": 207
},
{
"epoch": 0.027894501564595442,
"grad_norm": 1.2238390397270622,
"learning_rate": 5.98963736062682e-07,
"loss": 1.77,
"step": 208
},
{
"epoch": 0.028028609745194458,
"grad_norm": 1.1148263262442593,
"learning_rate": 5.989537544898099e-07,
"loss": 1.7091,
"step": 209
},
{
"epoch": 0.028162717925793473,
"grad_norm": 1.8988797886120061,
"learning_rate": 5.989437251677981e-07,
"loss": 1.7075,
"step": 210
},
{
"epoch": 0.02829682610639249,
"grad_norm": 1.1460869915607401,
"learning_rate": 5.989336480984271e-07,
"loss": 1.7101,
"step": 211
},
{
"epoch": 0.028430934286991505,
"grad_norm": 1.1467483507445029,
"learning_rate": 5.989235232834861e-07,
"loss": 1.826,
"step": 212
},
{
"epoch": 0.028565042467590524,
"grad_norm": 1.1300279144587981,
"learning_rate": 5.989133507247724e-07,
"loss": 1.6014,
"step": 213
},
{
"epoch": 0.02869915064818954,
"grad_norm": 1.1992643920221002,
"learning_rate": 5.989031304240922e-07,
"loss": 1.7145,
"step": 214
},
{
"epoch": 0.028833258828788556,
"grad_norm": 1.1299143353929064,
"learning_rate": 5.988928623832598e-07,
"loss": 1.7769,
"step": 215
},
{
"epoch": 0.02896736700938757,
"grad_norm": 1.2042592418402756,
"learning_rate": 5.988825466040984e-07,
"loss": 1.7626,
"step": 216
},
{
"epoch": 0.02910147518998659,
"grad_norm": 1.0995902853233575,
"learning_rate": 5.988721830884392e-07,
"loss": 1.6348,
"step": 217
},
{
"epoch": 0.029235583370585606,
"grad_norm": 1.6143410051222686,
"learning_rate": 5.988617718381222e-07,
"loss": 1.6693,
"step": 218
},
{
"epoch": 0.029369691551184622,
"grad_norm": 1.1356912583442442,
"learning_rate": 5.988513128549958e-07,
"loss": 1.8413,
"step": 219
},
{
"epoch": 0.029503799731783638,
"grad_norm": 1.0893609511374684,
"learning_rate": 5.988408061409167e-07,
"loss": 1.7344,
"step": 220
},
{
"epoch": 0.029637907912382657,
"grad_norm": 1.7248790007955832,
"learning_rate": 5.988302516977504e-07,
"loss": 1.6685,
"step": 221
},
{
"epoch": 0.029772016092981673,
"grad_norm": 1.2197670257203657,
"learning_rate": 5.988196495273707e-07,
"loss": 1.7656,
"step": 222
},
{
"epoch": 0.02990612427358069,
"grad_norm": 1.0570007929897236,
"learning_rate": 5.988089996316597e-07,
"loss": 1.6939,
"step": 223
},
{
"epoch": 0.030040232454179704,
"grad_norm": 1.2787842409441683,
"learning_rate": 5.987983020125083e-07,
"loss": 1.6764,
"step": 224
},
{
"epoch": 0.03017434063477872,
"grad_norm": 1.1358825590170436,
"learning_rate": 5.987875566718158e-07,
"loss": 1.6609,
"step": 225
},
{
"epoch": 0.03030844881537774,
"grad_norm": 1.118237942922342,
"learning_rate": 5.987767636114897e-07,
"loss": 1.7554,
"step": 226
},
{
"epoch": 0.030442556995976755,
"grad_norm": 1.091737931283322,
"learning_rate": 5.987659228334462e-07,
"loss": 1.7449,
"step": 227
},
{
"epoch": 0.03057666517657577,
"grad_norm": 1.1839355406865255,
"learning_rate": 5.9875503433961e-07,
"loss": 1.5726,
"step": 228
},
{
"epoch": 0.030710773357174787,
"grad_norm": 1.1337421280370006,
"learning_rate": 5.987440981319141e-07,
"loss": 1.7921,
"step": 229
},
{
"epoch": 0.030844881537773806,
"grad_norm": 1.1412449749582727,
"learning_rate": 5.987331142123003e-07,
"loss": 1.74,
"step": 230
},
{
"epoch": 0.03097898971837282,
"grad_norm": 1.153189714483035,
"learning_rate": 5.987220825827184e-07,
"loss": 1.8381,
"step": 231
},
{
"epoch": 0.031113097898971837,
"grad_norm": 1.5918789493838401,
"learning_rate": 5.98711003245127e-07,
"loss": 1.775,
"step": 232
},
{
"epoch": 0.031247206079570853,
"grad_norm": 1.1156741804185832,
"learning_rate": 5.986998762014931e-07,
"loss": 1.7849,
"step": 233
},
{
"epoch": 0.03138131426016987,
"grad_norm": 1.3525186481687417,
"learning_rate": 5.986887014537923e-07,
"loss": 1.6405,
"step": 234
},
{
"epoch": 0.03151542244076889,
"grad_norm": 1.158420443205213,
"learning_rate": 5.986774790040083e-07,
"loss": 1.7375,
"step": 235
},
{
"epoch": 0.031649530621367904,
"grad_norm": 1.123395074640784,
"learning_rate": 5.986662088541335e-07,
"loss": 1.7682,
"step": 236
},
{
"epoch": 0.03178363880196692,
"grad_norm": 1.1675872323082288,
"learning_rate": 5.98654891006169e-07,
"loss": 1.7364,
"step": 237
},
{
"epoch": 0.031917746982565935,
"grad_norm": 1.0814715571489928,
"learning_rate": 5.986435254621239e-07,
"loss": 1.5985,
"step": 238
},
{
"epoch": 0.03205185516316495,
"grad_norm": 3.0737070295965427,
"learning_rate": 5.986321122240162e-07,
"loss": 1.7085,
"step": 239
},
{
"epoch": 0.03218596334376397,
"grad_norm": 1.1671133111581686,
"learning_rate": 5.986206512938719e-07,
"loss": 1.6533,
"step": 240
},
{
"epoch": 0.03232007152436299,
"grad_norm": 1.145018806372248,
"learning_rate": 5.98609142673726e-07,
"loss": 1.7335,
"step": 241
},
{
"epoch": 0.032454179704962005,
"grad_norm": 1.159474229307987,
"learning_rate": 5.985975863656216e-07,
"loss": 1.7531,
"step": 242
},
{
"epoch": 0.03258828788556102,
"grad_norm": 1.2078048688870913,
"learning_rate": 5.985859823716102e-07,
"loss": 1.7911,
"step": 243
},
{
"epoch": 0.03272239606616004,
"grad_norm": 1.123182359654964,
"learning_rate": 5.985743306937522e-07,
"loss": 1.7939,
"step": 244
},
{
"epoch": 0.03285650424675905,
"grad_norm": 1.2328138827190458,
"learning_rate": 5.985626313341161e-07,
"loss": 1.7224,
"step": 245
},
{
"epoch": 0.03299061242735807,
"grad_norm": 1.148111739587274,
"learning_rate": 5.98550884294779e-07,
"loss": 1.7458,
"step": 246
},
{
"epoch": 0.033124720607957084,
"grad_norm": 1.1781302748488391,
"learning_rate": 5.985390895778263e-07,
"loss": 1.7283,
"step": 247
},
{
"epoch": 0.0332588287885561,
"grad_norm": 1.1649269851093655,
"learning_rate": 5.985272471853521e-07,
"loss": 1.7535,
"step": 248
},
{
"epoch": 0.033392936969155115,
"grad_norm": 1.1003523240939477,
"learning_rate": 5.985153571194589e-07,
"loss": 1.7422,
"step": 249
},
{
"epoch": 0.03352704514975414,
"grad_norm": 1.1239095176492149,
"learning_rate": 5.985034193822575e-07,
"loss": 1.7838,
"step": 250
},
{
"epoch": 0.033661153330353154,
"grad_norm": 1.1810699355311947,
"learning_rate": 5.984914339758673e-07,
"loss": 1.6863,
"step": 251
},
{
"epoch": 0.03379526151095217,
"grad_norm": 1.1136505916452646,
"learning_rate": 5.984794009024162e-07,
"loss": 1.7424,
"step": 252
},
{
"epoch": 0.033929369691551185,
"grad_norm": 1.1748644896008424,
"learning_rate": 5.984673201640406e-07,
"loss": 1.7273,
"step": 253
},
{
"epoch": 0.0340634778721502,
"grad_norm": 1.1728309803897534,
"learning_rate": 5.98455191762885e-07,
"loss": 1.7322,
"step": 254
},
{
"epoch": 0.03419758605274922,
"grad_norm": 1.1617256887218326,
"learning_rate": 5.984430157011031e-07,
"loss": 1.6426,
"step": 255
},
{
"epoch": 0.03433169423334823,
"grad_norm": 1.0944959568956085,
"learning_rate": 5.984307919808561e-07,
"loss": 1.6643,
"step": 256
},
{
"epoch": 0.03446580241394725,
"grad_norm": 1.1692415951338644,
"learning_rate": 5.984185206043145e-07,
"loss": 1.6584,
"step": 257
},
{
"epoch": 0.034599910594546264,
"grad_norm": 4.382957589748632,
"learning_rate": 5.984062015736567e-07,
"loss": 1.7101,
"step": 258
},
{
"epoch": 0.03473401877514529,
"grad_norm": 1.1567530728762943,
"learning_rate": 5.983938348910698e-07,
"loss": 1.643,
"step": 259
},
{
"epoch": 0.0348681269557443,
"grad_norm": 1.215341418188577,
"learning_rate": 5.983814205587494e-07,
"loss": 1.7239,
"step": 260
},
{
"epoch": 0.03500223513634332,
"grad_norm": 1.0746883114524803,
"learning_rate": 5.983689585788997e-07,
"loss": 1.6076,
"step": 261
},
{
"epoch": 0.035136343316942334,
"grad_norm": 1.0844612292689275,
"learning_rate": 5.983564489537329e-07,
"loss": 1.6903,
"step": 262
},
{
"epoch": 0.03527045149754135,
"grad_norm": 1.2255887165848134,
"learning_rate": 5.983438916854698e-07,
"loss": 1.6497,
"step": 263
},
{
"epoch": 0.035404559678140365,
"grad_norm": 1.1308380556818496,
"learning_rate": 5.983312867763402e-07,
"loss": 1.7412,
"step": 264
},
{
"epoch": 0.03553866785873938,
"grad_norm": 1.1248240455028355,
"learning_rate": 5.983186342285815e-07,
"loss": 1.6542,
"step": 265
},
{
"epoch": 0.0356727760393384,
"grad_norm": 1.127913908764272,
"learning_rate": 5.983059340444401e-07,
"loss": 1.7996,
"step": 266
},
{
"epoch": 0.03580688421993742,
"grad_norm": 1.1345562808363212,
"learning_rate": 5.98293186226171e-07,
"loss": 1.7426,
"step": 267
},
{
"epoch": 0.035940992400536435,
"grad_norm": 1.1100506727991573,
"learning_rate": 5.982803907760373e-07,
"loss": 1.6947,
"step": 268
},
{
"epoch": 0.03607510058113545,
"grad_norm": 1.1397892876092324,
"learning_rate": 5.982675476963105e-07,
"loss": 1.7525,
"step": 269
},
{
"epoch": 0.03620920876173447,
"grad_norm": 1.0980888601137475,
"learning_rate": 5.982546569892707e-07,
"loss": 1.6763,
"step": 270
},
{
"epoch": 0.03634331694233348,
"grad_norm": 1.1179358157267492,
"learning_rate": 5.982417186572067e-07,
"loss": 1.8195,
"step": 271
},
{
"epoch": 0.0364774251229325,
"grad_norm": 1.15212876523653,
"learning_rate": 5.982287327024153e-07,
"loss": 1.7003,
"step": 272
},
{
"epoch": 0.036611533303531514,
"grad_norm": 1.0898032141275467,
"learning_rate": 5.982156991272021e-07,
"loss": 1.7347,
"step": 273
},
{
"epoch": 0.03674564148413053,
"grad_norm": 1.2234098091068482,
"learning_rate": 5.982026179338812e-07,
"loss": 1.71,
"step": 274
},
{
"epoch": 0.036879749664729546,
"grad_norm": 1.2077801818134501,
"learning_rate": 5.981894891247747e-07,
"loss": 1.7966,
"step": 275
},
{
"epoch": 0.03701385784532857,
"grad_norm": 1.1190450985953022,
"learning_rate": 5.981763127022135e-07,
"loss": 1.6619,
"step": 276
},
{
"epoch": 0.037147966025927584,
"grad_norm": 1.235343710444344,
"learning_rate": 5.981630886685369e-07,
"loss": 1.7484,
"step": 277
},
{
"epoch": 0.0372820742065266,
"grad_norm": 1.2266668117138695,
"learning_rate": 5.98149817026093e-07,
"loss": 1.6734,
"step": 278
},
{
"epoch": 0.037416182387125616,
"grad_norm": 1.4154140120426957,
"learning_rate": 5.981364977772374e-07,
"loss": 1.7073,
"step": 279
},
{
"epoch": 0.03755029056772463,
"grad_norm": 1.2222936436898488,
"learning_rate": 5.981231309243353e-07,
"loss": 1.7837,
"step": 280
},
{
"epoch": 0.03768439874832365,
"grad_norm": 1.1519207095634527,
"learning_rate": 5.981097164697594e-07,
"loss": 1.7349,
"step": 281
},
{
"epoch": 0.03781850692892266,
"grad_norm": 1.172450505222872,
"learning_rate": 5.980962544158915e-07,
"loss": 1.7005,
"step": 282
},
{
"epoch": 0.03795261510952168,
"grad_norm": 1.2857156876454048,
"learning_rate": 5.980827447651216e-07,
"loss": 1.561,
"step": 283
},
{
"epoch": 0.038086723290120694,
"grad_norm": 1.2389387482561154,
"learning_rate": 5.98069187519848e-07,
"loss": 1.7068,
"step": 284
},
{
"epoch": 0.03822083147071972,
"grad_norm": 1.163985598391861,
"learning_rate": 5.980555826824778e-07,
"loss": 1.7442,
"step": 285
},
{
"epoch": 0.03835493965131873,
"grad_norm": 1.1048173896847064,
"learning_rate": 5.980419302554261e-07,
"loss": 1.685,
"step": 286
},
{
"epoch": 0.03848904783191775,
"grad_norm": 1.472564099104008,
"learning_rate": 5.98028230241117e-07,
"loss": 1.6997,
"step": 287
},
{
"epoch": 0.038623156012516764,
"grad_norm": 1.287728938848147,
"learning_rate": 5.980144826419825e-07,
"loss": 1.7084,
"step": 288
},
{
"epoch": 0.03875726419311578,
"grad_norm": 1.124267938500328,
"learning_rate": 5.980006874604635e-07,
"loss": 1.7134,
"step": 289
},
{
"epoch": 0.038891372373714796,
"grad_norm": 1.1218572497983328,
"learning_rate": 5.979868446990091e-07,
"loss": 1.6841,
"step": 290
},
{
"epoch": 0.03902548055431381,
"grad_norm": 1.1011749075237598,
"learning_rate": 5.979729543600769e-07,
"loss": 1.7323,
"step": 291
},
{
"epoch": 0.03915958873491283,
"grad_norm": 1.100745780533083,
"learning_rate": 5.979590164461328e-07,
"loss": 1.6788,
"step": 292
},
{
"epoch": 0.03929369691551184,
"grad_norm": 1.1613502217053182,
"learning_rate": 5.979450309596514e-07,
"loss": 1.6776,
"step": 293
},
{
"epoch": 0.039427805096110866,
"grad_norm": 1.089657509345998,
"learning_rate": 5.979309979031158e-07,
"loss": 1.7068,
"step": 294
},
{
"epoch": 0.03956191327670988,
"grad_norm": 1.1436391576530838,
"learning_rate": 5.97916917279017e-07,
"loss": 1.7388,
"step": 295
},
{
"epoch": 0.0396960214573089,
"grad_norm": 1.1145075933124646,
"learning_rate": 5.979027890898551e-07,
"loss": 1.7004,
"step": 296
},
{
"epoch": 0.03983012963790791,
"grad_norm": 1.0907272047712597,
"learning_rate": 5.978886133381384e-07,
"loss": 1.679,
"step": 297
},
{
"epoch": 0.03996423781850693,
"grad_norm": 1.12558267559901,
"learning_rate": 5.978743900263835e-07,
"loss": 1.6608,
"step": 298
},
{
"epoch": 0.040098345999105944,
"grad_norm": 1.136659951867088,
"learning_rate": 5.978601191571155e-07,
"loss": 1.6383,
"step": 299
},
{
"epoch": 0.04023245417970496,
"grad_norm": 1.2441133556300974,
"learning_rate": 5.978458007328682e-07,
"loss": 1.7697,
"step": 300
},
{
"epoch": 0.040366562360303976,
"grad_norm": 1.216051798039534,
"learning_rate": 5.978314347561835e-07,
"loss": 1.7656,
"step": 301
},
{
"epoch": 0.040500670540903,
"grad_norm": 1.1193332609304543,
"learning_rate": 5.978170212296118e-07,
"loss": 1.7034,
"step": 302
},
{
"epoch": 0.040634778721502014,
"grad_norm": 1.1450830933525635,
"learning_rate": 5.978025601557124e-07,
"loss": 1.6769,
"step": 303
},
{
"epoch": 0.04076888690210103,
"grad_norm": 1.1570981861957024,
"learning_rate": 5.977880515370523e-07,
"loss": 1.7491,
"step": 304
},
{
"epoch": 0.040902995082700046,
"grad_norm": 1.103432713835437,
"learning_rate": 5.977734953762075e-07,
"loss": 1.6544,
"step": 305
},
{
"epoch": 0.04103710326329906,
"grad_norm": 1.134144784637958,
"learning_rate": 5.97758891675762e-07,
"loss": 1.7084,
"step": 306
},
{
"epoch": 0.04117121144389808,
"grad_norm": 1.07738843402297,
"learning_rate": 5.977442404383088e-07,
"loss": 1.7369,
"step": 307
},
{
"epoch": 0.04130531962449709,
"grad_norm": 1.1164259724731038,
"learning_rate": 5.977295416664489e-07,
"loss": 1.6785,
"step": 308
},
{
"epoch": 0.04143942780509611,
"grad_norm": 1.2001430339127754,
"learning_rate": 5.977147953627918e-07,
"loss": 1.6496,
"step": 309
},
{
"epoch": 0.041573535985695124,
"grad_norm": 1.1849867153137015,
"learning_rate": 5.977000015299557e-07,
"loss": 1.6736,
"step": 310
},
{
"epoch": 0.04170764416629415,
"grad_norm": 1.1582589308770772,
"learning_rate": 5.976851601705669e-07,
"loss": 1.6775,
"step": 311
},
{
"epoch": 0.04184175234689316,
"grad_norm": 1.1033822470615744,
"learning_rate": 5.976702712872603e-07,
"loss": 1.6598,
"step": 312
},
{
"epoch": 0.04197586052749218,
"grad_norm": 1.1682634791444901,
"learning_rate": 5.976553348826793e-07,
"loss": 1.7557,
"step": 313
},
{
"epoch": 0.042109968708091194,
"grad_norm": 1.0838004153530265,
"learning_rate": 5.976403509594756e-07,
"loss": 1.6741,
"step": 314
},
{
"epoch": 0.04224407688869021,
"grad_norm": 1.121835854661048,
"learning_rate": 5.976253195203092e-07,
"loss": 1.7262,
"step": 315
},
{
"epoch": 0.042378185069289226,
"grad_norm": 1.1243699312065234,
"learning_rate": 5.976102405678491e-07,
"loss": 1.7902,
"step": 316
},
{
"epoch": 0.04251229324988824,
"grad_norm": 1.0991499127058322,
"learning_rate": 5.975951141047721e-07,
"loss": 1.6865,
"step": 317
},
{
"epoch": 0.04264640143048726,
"grad_norm": 1.126580502499325,
"learning_rate": 5.975799401337638e-07,
"loss": 1.6798,
"step": 318
},
{
"epoch": 0.04278050961108627,
"grad_norm": 1.1221949135632994,
"learning_rate": 5.975647186575182e-07,
"loss": 1.7491,
"step": 319
},
{
"epoch": 0.042914617791685296,
"grad_norm": 1.14926550813679,
"learning_rate": 5.975494496787376e-07,
"loss": 1.6549,
"step": 320
},
{
"epoch": 0.04304872597228431,
"grad_norm": 1.12638348214928,
"learning_rate": 5.975341332001328e-07,
"loss": 1.5897,
"step": 321
},
{
"epoch": 0.04318283415288333,
"grad_norm": 1.1725295960645503,
"learning_rate": 5.97518769224423e-07,
"loss": 1.695,
"step": 322
},
{
"epoch": 0.04331694233348234,
"grad_norm": 1.0904790236385375,
"learning_rate": 5.975033577543359e-07,
"loss": 1.6841,
"step": 323
},
{
"epoch": 0.04345105051408136,
"grad_norm": 1.1090846497862015,
"learning_rate": 5.974878987926075e-07,
"loss": 1.6075,
"step": 324
},
{
"epoch": 0.043585158694680375,
"grad_norm": 1.2329654322486787,
"learning_rate": 5.974723923419827e-07,
"loss": 1.7124,
"step": 325
},
{
"epoch": 0.04371926687527939,
"grad_norm": 1.1520738825385615,
"learning_rate": 5.974568384052139e-07,
"loss": 1.7492,
"step": 326
},
{
"epoch": 0.043853375055878406,
"grad_norm": 1.107509031801798,
"learning_rate": 5.974412369850631e-07,
"loss": 1.7233,
"step": 327
},
{
"epoch": 0.04398748323647742,
"grad_norm": 1.9987713290159552,
"learning_rate": 5.974255880842995e-07,
"loss": 1.7005,
"step": 328
},
{
"epoch": 0.044121591417076444,
"grad_norm": 1.1227927295658309,
"learning_rate": 5.974098917057019e-07,
"loss": 1.8204,
"step": 329
},
{
"epoch": 0.04425569959767546,
"grad_norm": 1.1208739563830832,
"learning_rate": 5.973941478520565e-07,
"loss": 1.7393,
"step": 330
},
{
"epoch": 0.044389807778274476,
"grad_norm": 1.0722310163444908,
"learning_rate": 5.973783565261589e-07,
"loss": 1.6568,
"step": 331
},
{
"epoch": 0.04452391595887349,
"grad_norm": 1.1809997483096673,
"learning_rate": 5.973625177308124e-07,
"loss": 1.7233,
"step": 332
},
{
"epoch": 0.04465802413947251,
"grad_norm": 1.0854965350422932,
"learning_rate": 5.973466314688289e-07,
"loss": 1.5838,
"step": 333
},
{
"epoch": 0.04479213232007152,
"grad_norm": 1.0394749005048125,
"learning_rate": 5.973306977430288e-07,
"loss": 1.6982,
"step": 334
},
{
"epoch": 0.04492624050067054,
"grad_norm": 1.1372698128741796,
"learning_rate": 5.973147165562409e-07,
"loss": 1.7363,
"step": 335
},
{
"epoch": 0.045060348681269555,
"grad_norm": 1.0872018588712997,
"learning_rate": 5.972986879113027e-07,
"loss": 1.7134,
"step": 336
},
{
"epoch": 0.04519445686186858,
"grad_norm": 1.136573181976626,
"learning_rate": 5.972826118110597e-07,
"loss": 1.6747,
"step": 337
},
{
"epoch": 0.04532856504246759,
"grad_norm": 1.1438807799337474,
"learning_rate": 5.972664882583659e-07,
"loss": 1.7632,
"step": 338
},
{
"epoch": 0.04546267322306661,
"grad_norm": 1.1746151029086915,
"learning_rate": 5.97250317256084e-07,
"loss": 1.5568,
"step": 339
},
{
"epoch": 0.045596781403665625,
"grad_norm": 1.067551171735795,
"learning_rate": 5.972340988070848e-07,
"loss": 1.7722,
"step": 340
},
{
"epoch": 0.04573088958426464,
"grad_norm": 1.100004825990679,
"learning_rate": 5.972178329142476e-07,
"loss": 1.7111,
"step": 341
},
{
"epoch": 0.045864997764863656,
"grad_norm": 1.3130274389549708,
"learning_rate": 5.972015195804604e-07,
"loss": 1.7768,
"step": 342
},
{
"epoch": 0.04599910594546267,
"grad_norm": 1.1532781776242376,
"learning_rate": 5.971851588086195e-07,
"loss": 1.7096,
"step": 343
},
{
"epoch": 0.04613321412606169,
"grad_norm": 1.1087417118719138,
"learning_rate": 5.971687506016292e-07,
"loss": 1.6085,
"step": 344
},
{
"epoch": 0.0462673223066607,
"grad_norm": 1.105566689388399,
"learning_rate": 5.971522949624028e-07,
"loss": 1.6791,
"step": 345
},
{
"epoch": 0.046401430487259726,
"grad_norm": 1.090277130406352,
"learning_rate": 5.971357918938616e-07,
"loss": 1.6585,
"step": 346
},
{
"epoch": 0.04653553866785874,
"grad_norm": 1.1679080769492398,
"learning_rate": 5.971192413989357e-07,
"loss": 1.6861,
"step": 347
},
{
"epoch": 0.04666964684845776,
"grad_norm": 1.1647454348028623,
"learning_rate": 5.971026434805633e-07,
"loss": 1.7167,
"step": 348
},
{
"epoch": 0.04680375502905677,
"grad_norm": 1.1324717330275416,
"learning_rate": 5.970859981416911e-07,
"loss": 1.6656,
"step": 349
},
{
"epoch": 0.04693786320965579,
"grad_norm": 1.0895090583275637,
"learning_rate": 5.970693053852743e-07,
"loss": 1.7932,
"step": 350
},
{
"epoch": 0.047071971390254805,
"grad_norm": 1.0846672521830747,
"learning_rate": 5.970525652142767e-07,
"loss": 1.568,
"step": 351
},
{
"epoch": 0.04720607957085382,
"grad_norm": 1.0946401497383844,
"learning_rate": 5.970357776316699e-07,
"loss": 1.6717,
"step": 352
},
{
"epoch": 0.047340187751452836,
"grad_norm": 1.203590152178876,
"learning_rate": 5.970189426404346e-07,
"loss": 1.6852,
"step": 353
},
{
"epoch": 0.04747429593205185,
"grad_norm": 1.1550529538782315,
"learning_rate": 5.970020602435594e-07,
"loss": 1.7621,
"step": 354
},
{
"epoch": 0.047608404112650875,
"grad_norm": 1.096867626156823,
"learning_rate": 5.969851304440418e-07,
"loss": 1.7309,
"step": 355
},
{
"epoch": 0.04774251229324989,
"grad_norm": 1.166383772927886,
"learning_rate": 5.969681532448872e-07,
"loss": 1.7181,
"step": 356
},
{
"epoch": 0.047876620473848906,
"grad_norm": 1.1239983839028163,
"learning_rate": 5.9695112864911e-07,
"loss": 1.6855,
"step": 357
},
{
"epoch": 0.04801072865444792,
"grad_norm": 1.146063042749729,
"learning_rate": 5.969340566597323e-07,
"loss": 1.7481,
"step": 358
},
{
"epoch": 0.04814483683504694,
"grad_norm": 1.1888010033263623,
"learning_rate": 5.969169372797852e-07,
"loss": 1.7679,
"step": 359
},
{
"epoch": 0.048278945015645953,
"grad_norm": 1.1182477969412692,
"learning_rate": 5.96899770512308e-07,
"loss": 1.703,
"step": 360
},
{
"epoch": 0.04841305319624497,
"grad_norm": 1.1404473863138842,
"learning_rate": 5.968825563603486e-07,
"loss": 1.7899,
"step": 361
},
{
"epoch": 0.048547161376843985,
"grad_norm": 1.1404415220346715,
"learning_rate": 5.968652948269629e-07,
"loss": 1.6586,
"step": 362
},
{
"epoch": 0.048681269557443,
"grad_norm": 1.0188482574967557,
"learning_rate": 5.968479859152155e-07,
"loss": 1.6772,
"step": 363
},
{
"epoch": 0.04881537773804202,
"grad_norm": 1.1444032147790508,
"learning_rate": 5.968306296281794e-07,
"loss": 1.7235,
"step": 364
},
{
"epoch": 0.04894948591864104,
"grad_norm": 1.147526204803139,
"learning_rate": 5.968132259689361e-07,
"loss": 1.6656,
"step": 365
},
{
"epoch": 0.049083594099240055,
"grad_norm": 1.094173771252459,
"learning_rate": 5.967957749405751e-07,
"loss": 1.6133,
"step": 366
},
{
"epoch": 0.04921770227983907,
"grad_norm": 1.1560369729609308,
"learning_rate": 5.967782765461948e-07,
"loss": 1.7796,
"step": 367
},
{
"epoch": 0.049351810460438086,
"grad_norm": 1.1696121017752343,
"learning_rate": 5.967607307889018e-07,
"loss": 1.65,
"step": 368
},
{
"epoch": 0.0494859186410371,
"grad_norm": 1.134918792559745,
"learning_rate": 5.967431376718111e-07,
"loss": 1.717,
"step": 369
},
{
"epoch": 0.04962002682163612,
"grad_norm": 1.0765623022573645,
"learning_rate": 5.967254971980461e-07,
"loss": 1.7028,
"step": 370
},
{
"epoch": 0.049754135002235134,
"grad_norm": 1.1093533051376567,
"learning_rate": 5.967078093707387e-07,
"loss": 1.687,
"step": 371
},
{
"epoch": 0.049888243182834156,
"grad_norm": 1.0724867576763264,
"learning_rate": 5.966900741930289e-07,
"loss": 1.709,
"step": 372
},
{
"epoch": 0.05002235136343317,
"grad_norm": 1.1870703976775374,
"learning_rate": 5.966722916680656e-07,
"loss": 1.7623,
"step": 373
},
{
"epoch": 0.05015645954403219,
"grad_norm": 1.1118336624167122,
"learning_rate": 5.966544617990058e-07,
"loss": 1.713,
"step": 374
},
{
"epoch": 0.050290567724631204,
"grad_norm": 1.1147242423912,
"learning_rate": 5.966365845890149e-07,
"loss": 1.5956,
"step": 375
},
{
"epoch": 0.05042467590523022,
"grad_norm": 1.1489546583821737,
"learning_rate": 5.966186600412668e-07,
"loss": 1.7536,
"step": 376
},
{
"epoch": 0.050558784085829235,
"grad_norm": 1.0985836995809481,
"learning_rate": 5.966006881589437e-07,
"loss": 1.6415,
"step": 377
},
{
"epoch": 0.05069289226642825,
"grad_norm": 1.5210499221056473,
"learning_rate": 5.965826689452363e-07,
"loss": 1.7034,
"step": 378
},
{
"epoch": 0.05082700044702727,
"grad_norm": 1.1770747351660449,
"learning_rate": 5.965646024033437e-07,
"loss": 1.7998,
"step": 379
},
{
"epoch": 0.05096110862762628,
"grad_norm": 1.103353857870669,
"learning_rate": 5.965464885364734e-07,
"loss": 1.677,
"step": 380
},
{
"epoch": 0.051095216808225305,
"grad_norm": 1.1279052370658624,
"learning_rate": 5.965283273478411e-07,
"loss": 1.7125,
"step": 381
},
{
"epoch": 0.05122932498882432,
"grad_norm": 1.1260317026536582,
"learning_rate": 5.965101188406713e-07,
"loss": 1.713,
"step": 382
},
{
"epoch": 0.051363433169423336,
"grad_norm": 1.1217115939734228,
"learning_rate": 5.964918630181966e-07,
"loss": 1.7513,
"step": 383
},
{
"epoch": 0.05149754135002235,
"grad_norm": 1.0938140494838644,
"learning_rate": 5.964735598836581e-07,
"loss": 1.6722,
"step": 384
},
{
"epoch": 0.05163164953062137,
"grad_norm": 1.5746119243016816,
"learning_rate": 5.964552094403051e-07,
"loss": 1.7249,
"step": 385
},
{
"epoch": 0.051765757711220384,
"grad_norm": 1.1376993855927013,
"learning_rate": 5.964368116913957e-07,
"loss": 1.7292,
"step": 386
},
{
"epoch": 0.0518998658918194,
"grad_norm": 1.1288484886032422,
"learning_rate": 5.96418366640196e-07,
"loss": 1.7373,
"step": 387
},
{
"epoch": 0.052033974072418415,
"grad_norm": 1.0912837401536597,
"learning_rate": 5.963998742899809e-07,
"loss": 1.6279,
"step": 388
},
{
"epoch": 0.05216808225301743,
"grad_norm": 1.080399914264917,
"learning_rate": 5.963813346440332e-07,
"loss": 1.6828,
"step": 389
},
{
"epoch": 0.052302190433616454,
"grad_norm": 1.18296526148637,
"learning_rate": 5.963627477056445e-07,
"loss": 1.7037,
"step": 390
},
{
"epoch": 0.05243629861421547,
"grad_norm": 1.0700933148095726,
"learning_rate": 5.963441134781147e-07,
"loss": 1.6773,
"step": 391
},
{
"epoch": 0.052570406794814485,
"grad_norm": 1.5541605676471624,
"learning_rate": 5.963254319647519e-07,
"loss": 1.5786,
"step": 392
},
{
"epoch": 0.0527045149754135,
"grad_norm": 1.154992915725033,
"learning_rate": 5.96306703168873e-07,
"loss": 1.7743,
"step": 393
},
{
"epoch": 0.05283862315601252,
"grad_norm": 1.117612338423665,
"learning_rate": 5.962879270938028e-07,
"loss": 1.723,
"step": 394
},
{
"epoch": 0.05297273133661153,
"grad_norm": 1.0907791376426386,
"learning_rate": 5.96269103742875e-07,
"loss": 1.73,
"step": 395
},
{
"epoch": 0.05310683951721055,
"grad_norm": 1.1325939188472074,
"learning_rate": 5.962502331194311e-07,
"loss": 1.6756,
"step": 396
},
{
"epoch": 0.053240947697809564,
"grad_norm": 1.0925915487497773,
"learning_rate": 5.962313152268218e-07,
"loss": 1.7166,
"step": 397
},
{
"epoch": 0.05337505587840858,
"grad_norm": 1.1102789558363542,
"learning_rate": 5.96212350068405e-07,
"loss": 1.6697,
"step": 398
},
{
"epoch": 0.0535091640590076,
"grad_norm": 1.1054817006563584,
"learning_rate": 5.961933376475485e-07,
"loss": 1.7231,
"step": 399
},
{
"epoch": 0.05364327223960662,
"grad_norm": 1.307573555314525,
"learning_rate": 5.961742779676272e-07,
"loss": 1.7651,
"step": 400
},
{
"epoch": 0.053777380420205634,
"grad_norm": 1.1445042759796842,
"learning_rate": 5.961551710320251e-07,
"loss": 1.6765,
"step": 401
},
{
"epoch": 0.05391148860080465,
"grad_norm": 1.0762583158173675,
"learning_rate": 5.961360168441342e-07,
"loss": 1.6481,
"step": 402
},
{
"epoch": 0.054045596781403665,
"grad_norm": 1.1084304546525765,
"learning_rate": 5.961168154073553e-07,
"loss": 1.7338,
"step": 403
},
{
"epoch": 0.05417970496200268,
"grad_norm": 1.0982232521403124,
"learning_rate": 5.960975667250972e-07,
"loss": 1.6638,
"step": 404
},
{
"epoch": 0.0543138131426017,
"grad_norm": 1.2140530141548174,
"learning_rate": 5.960782708007773e-07,
"loss": 1.7516,
"step": 405
},
{
"epoch": 0.05444792132320071,
"grad_norm": 1.5212193377424008,
"learning_rate": 5.960589276378213e-07,
"loss": 1.7427,
"step": 406
},
{
"epoch": 0.054582029503799735,
"grad_norm": 1.11412919662803,
"learning_rate": 5.960395372396633e-07,
"loss": 1.6931,
"step": 407
},
{
"epoch": 0.05471613768439875,
"grad_norm": 1.0851895981130018,
"learning_rate": 5.960200996097458e-07,
"loss": 1.6913,
"step": 408
},
{
"epoch": 0.05485024586499777,
"grad_norm": 1.1246816244588258,
"learning_rate": 5.960006147515199e-07,
"loss": 1.7152,
"step": 409
},
{
"epoch": 0.05498435404559678,
"grad_norm": 1.0772018259030958,
"learning_rate": 5.959810826684446e-07,
"loss": 1.7227,
"step": 410
},
{
"epoch": 0.0551184622261958,
"grad_norm": 1.1172898063954977,
"learning_rate": 5.959615033639877e-07,
"loss": 1.6459,
"step": 411
},
{
"epoch": 0.055252570406794814,
"grad_norm": 1.190430020238442,
"learning_rate": 5.959418768416252e-07,
"loss": 1.7491,
"step": 412
},
{
"epoch": 0.05538667858739383,
"grad_norm": 1.0954974858449955,
"learning_rate": 5.959222031048417e-07,
"loss": 1.7136,
"step": 413
},
{
"epoch": 0.055520786767992845,
"grad_norm": 1.1287823535303052,
"learning_rate": 5.959024821571296e-07,
"loss": 1.7765,
"step": 414
},
{
"epoch": 0.05565489494859186,
"grad_norm": 1.0561812337694518,
"learning_rate": 5.958827140019905e-07,
"loss": 1.6913,
"step": 415
},
{
"epoch": 0.055789003129190884,
"grad_norm": 1.1085682708952787,
"learning_rate": 5.958628986429338e-07,
"loss": 1.7022,
"step": 416
},
{
"epoch": 0.0559231113097899,
"grad_norm": 1.145351387138441,
"learning_rate": 5.958430360834773e-07,
"loss": 1.7236,
"step": 417
},
{
"epoch": 0.056057219490388915,
"grad_norm": 1.0897443627255616,
"learning_rate": 5.958231263271476e-07,
"loss": 1.6012,
"step": 418
},
{
"epoch": 0.05619132767098793,
"grad_norm": 1.1200731868604838,
"learning_rate": 5.958031693774794e-07,
"loss": 1.7389,
"step": 419
},
{
"epoch": 0.05632543585158695,
"grad_norm": 1.1038585013517133,
"learning_rate": 5.957831652380156e-07,
"loss": 1.583,
"step": 420
},
{
"epoch": 0.05645954403218596,
"grad_norm": 1.4548045332193216,
"learning_rate": 5.95763113912308e-07,
"loss": 1.7524,
"step": 421
},
{
"epoch": 0.05659365221278498,
"grad_norm": 1.1692222790883888,
"learning_rate": 5.95743015403916e-07,
"loss": 1.6299,
"step": 422
},
{
"epoch": 0.056727760393383994,
"grad_norm": 1.1247764368969244,
"learning_rate": 5.95722869716408e-07,
"loss": 1.5839,
"step": 423
},
{
"epoch": 0.05686186857398301,
"grad_norm": 1.1555568325620067,
"learning_rate": 5.957026768533605e-07,
"loss": 1.7239,
"step": 424
},
{
"epoch": 0.05699597675458203,
"grad_norm": 1.1216899351148046,
"learning_rate": 5.956824368183589e-07,
"loss": 1.7256,
"step": 425
},
{
"epoch": 0.05713008493518105,
"grad_norm": 1.145568323616433,
"learning_rate": 5.956621496149961e-07,
"loss": 1.6824,
"step": 426
},
{
"epoch": 0.057264193115780064,
"grad_norm": 1.0986327998626733,
"learning_rate": 5.956418152468739e-07,
"loss": 1.6288,
"step": 427
},
{
"epoch": 0.05739830129637908,
"grad_norm": 1.107394613480044,
"learning_rate": 5.956214337176026e-07,
"loss": 1.7525,
"step": 428
},
{
"epoch": 0.057532409476978096,
"grad_norm": 1.1530636510188206,
"learning_rate": 5.956010050308003e-07,
"loss": 1.6703,
"step": 429
},
{
"epoch": 0.05766651765757711,
"grad_norm": 1.2684443748494443,
"learning_rate": 5.955805291900944e-07,
"loss": 1.7255,
"step": 430
},
{
"epoch": 0.05780062583817613,
"grad_norm": 1.1216850925610182,
"learning_rate": 5.955600061991196e-07,
"loss": 1.6833,
"step": 431
},
{
"epoch": 0.05793473401877514,
"grad_norm": 1.1163294449512198,
"learning_rate": 5.955394360615196e-07,
"loss": 1.6738,
"step": 432
},
{
"epoch": 0.05806884219937416,
"grad_norm": 1.0993928108999345,
"learning_rate": 5.955188187809465e-07,
"loss": 1.575,
"step": 433
},
{
"epoch": 0.05820295037997318,
"grad_norm": 1.199099074821361,
"learning_rate": 5.954981543610606e-07,
"loss": 1.7117,
"step": 434
},
{
"epoch": 0.0583370585605722,
"grad_norm": 1.1208106037393502,
"learning_rate": 5.954774428055305e-07,
"loss": 1.7093,
"step": 435
},
{
"epoch": 0.05847116674117121,
"grad_norm": 1.2627670829161222,
"learning_rate": 5.954566841180332e-07,
"loss": 1.6188,
"step": 436
},
{
"epoch": 0.05860527492177023,
"grad_norm": 1.0799814850943354,
"learning_rate": 5.954358783022543e-07,
"loss": 1.7059,
"step": 437
},
{
"epoch": 0.058739383102369244,
"grad_norm": 1.1341395954441937,
"learning_rate": 5.954150253618875e-07,
"loss": 1.5712,
"step": 438
},
{
"epoch": 0.05887349128296826,
"grad_norm": 1.1117856654912641,
"learning_rate": 5.95394125300635e-07,
"loss": 1.6777,
"step": 439
},
{
"epoch": 0.059007599463567276,
"grad_norm": 1.0923581672387388,
"learning_rate": 5.953731781222071e-07,
"loss": 1.7159,
"step": 440
},
{
"epoch": 0.05914170764416629,
"grad_norm": 1.0600443650637132,
"learning_rate": 5.953521838303231e-07,
"loss": 1.7249,
"step": 441
},
{
"epoch": 0.059275815824765314,
"grad_norm": 1.2138612225345329,
"learning_rate": 5.9533114242871e-07,
"loss": 1.7013,
"step": 442
},
{
"epoch": 0.05940992400536433,
"grad_norm": 1.0419430689297875,
"learning_rate": 5.953100539211034e-07,
"loss": 1.7552,
"step": 443
},
{
"epoch": 0.059544032185963346,
"grad_norm": 1.1237438417872123,
"learning_rate": 5.952889183112474e-07,
"loss": 1.7112,
"step": 444
},
{
"epoch": 0.05967814036656236,
"grad_norm": 1.2319625967973615,
"learning_rate": 5.952677356028943e-07,
"loss": 1.7093,
"step": 445
},
{
"epoch": 0.05981224854716138,
"grad_norm": 1.086955577183242,
"learning_rate": 5.952465057998049e-07,
"loss": 1.6358,
"step": 446
},
{
"epoch": 0.05994635672776039,
"grad_norm": 1.1264500428377913,
"learning_rate": 5.952252289057481e-07,
"loss": 1.7178,
"step": 447
},
{
"epoch": 0.06008046490835941,
"grad_norm": 1.128811841099524,
"learning_rate": 5.952039049245012e-07,
"loss": 1.7591,
"step": 448
},
{
"epoch": 0.060214573088958424,
"grad_norm": 1.1110504835526924,
"learning_rate": 5.951825338598503e-07,
"loss": 1.6403,
"step": 449
},
{
"epoch": 0.06034868126955744,
"grad_norm": 1.2271379194246814,
"learning_rate": 5.951611157155895e-07,
"loss": 1.7213,
"step": 450
},
{
"epoch": 0.06048278945015646,
"grad_norm": 1.1228932913870193,
"learning_rate": 5.951396504955212e-07,
"loss": 1.5935,
"step": 451
},
{
"epoch": 0.06061689763075548,
"grad_norm": 1.11062455626935,
"learning_rate": 5.951181382034563e-07,
"loss": 1.6998,
"step": 452
},
{
"epoch": 0.060751005811354494,
"grad_norm": 1.0990862927657152,
"learning_rate": 5.950965788432139e-07,
"loss": 1.6468,
"step": 453
},
{
"epoch": 0.06088511399195351,
"grad_norm": 1.2688756973522501,
"learning_rate": 5.950749724186219e-07,
"loss": 1.741,
"step": 454
},
{
"epoch": 0.061019222172552526,
"grad_norm": 1.2895801173515846,
"learning_rate": 5.950533189335158e-07,
"loss": 1.6955,
"step": 455
},
{
"epoch": 0.06115333035315154,
"grad_norm": 1.077512840039689,
"learning_rate": 5.950316183917403e-07,
"loss": 1.641,
"step": 456
},
{
"epoch": 0.06128743853375056,
"grad_norm": 1.0847961133378894,
"learning_rate": 5.950098707971477e-07,
"loss": 1.83,
"step": 457
},
{
"epoch": 0.06142154671434957,
"grad_norm": 1.1936301482363822,
"learning_rate": 5.949880761535992e-07,
"loss": 1.8029,
"step": 458
},
{
"epoch": 0.06155565489494859,
"grad_norm": 1.1712115230746196,
"learning_rate": 5.949662344649641e-07,
"loss": 1.7041,
"step": 459
},
{
"epoch": 0.06168976307554761,
"grad_norm": 1.1207575353150439,
"learning_rate": 5.9494434573512e-07,
"loss": 1.8268,
"step": 460
},
{
"epoch": 0.06182387125614663,
"grad_norm": 1.0875570889732413,
"learning_rate": 5.949224099679532e-07,
"loss": 1.7194,
"step": 461
},
{
"epoch": 0.06195797943674564,
"grad_norm": 1.0917010226696162,
"learning_rate": 5.949004271673578e-07,
"loss": 1.7354,
"step": 462
},
{
"epoch": 0.06209208761734466,
"grad_norm": 1.0997856156670267,
"learning_rate": 5.948783973372368e-07,
"loss": 1.7529,
"step": 463
},
{
"epoch": 0.062226195797943674,
"grad_norm": 1.0621713053596278,
"learning_rate": 5.948563204815011e-07,
"loss": 1.6898,
"step": 464
},
{
"epoch": 0.06236030397854269,
"grad_norm": 1.0614544715813865,
"learning_rate": 5.948341966040703e-07,
"loss": 1.7044,
"step": 465
},
{
"epoch": 0.062494412159141706,
"grad_norm": 1.154295913834985,
"learning_rate": 5.948120257088721e-07,
"loss": 1.739,
"step": 466
},
{
"epoch": 0.06262852033974073,
"grad_norm": 1.6321838989867514,
"learning_rate": 5.947898077998429e-07,
"loss": 1.6571,
"step": 467
},
{
"epoch": 0.06276262852033974,
"grad_norm": 1.1020818061209965,
"learning_rate": 5.947675428809268e-07,
"loss": 1.7457,
"step": 468
},
{
"epoch": 0.06289673670093876,
"grad_norm": 1.1541190378330166,
"learning_rate": 5.947452309560767e-07,
"loss": 1.7659,
"step": 469
},
{
"epoch": 0.06303084488153778,
"grad_norm": 1.084642443791217,
"learning_rate": 5.947228720292541e-07,
"loss": 1.7144,
"step": 470
},
{
"epoch": 0.06316495306213679,
"grad_norm": 1.1145594614023564,
"learning_rate": 5.947004661044283e-07,
"loss": 1.6729,
"step": 471
},
{
"epoch": 0.06329906124273581,
"grad_norm": 1.115158449397951,
"learning_rate": 5.946780131855772e-07,
"loss": 1.7349,
"step": 472
},
{
"epoch": 0.06343316942333482,
"grad_norm": 1.1366035122661107,
"learning_rate": 5.94655513276687e-07,
"loss": 1.7005,
"step": 473
},
{
"epoch": 0.06356727760393384,
"grad_norm": 1.1207240569861627,
"learning_rate": 5.946329663817522e-07,
"loss": 1.6988,
"step": 474
},
{
"epoch": 0.06370138578453285,
"grad_norm": 1.0633079931171385,
"learning_rate": 5.946103725047759e-07,
"loss": 1.6861,
"step": 475
},
{
"epoch": 0.06383549396513187,
"grad_norm": 1.148420369678469,
"learning_rate": 5.945877316497692e-07,
"loss": 1.7186,
"step": 476
},
{
"epoch": 0.06396960214573089,
"grad_norm": 1.1296345116481292,
"learning_rate": 5.945650438207517e-07,
"loss": 1.7515,
"step": 477
},
{
"epoch": 0.0641037103263299,
"grad_norm": 1.1072132368875205,
"learning_rate": 5.945423090217512e-07,
"loss": 1.7498,
"step": 478
},
{
"epoch": 0.06423781850692892,
"grad_norm": 1.0636459120097348,
"learning_rate": 5.945195272568042e-07,
"loss": 1.6705,
"step": 479
},
{
"epoch": 0.06437192668752793,
"grad_norm": 1.1184722760153458,
"learning_rate": 5.944966985299551e-07,
"loss": 1.74,
"step": 480
},
{
"epoch": 0.06450603486812695,
"grad_norm": 1.09226255206473,
"learning_rate": 5.944738228452569e-07,
"loss": 1.7125,
"step": 481
},
{
"epoch": 0.06464014304872598,
"grad_norm": 1.0980507704132523,
"learning_rate": 5.94450900206771e-07,
"loss": 1.7187,
"step": 482
},
{
"epoch": 0.064774251229325,
"grad_norm": 1.0944716620001702,
"learning_rate": 5.944279306185668e-07,
"loss": 1.5932,
"step": 483
},
{
"epoch": 0.06490835940992401,
"grad_norm": 1.1136224916178525,
"learning_rate": 5.944049140847224e-07,
"loss": 1.6976,
"step": 484
},
{
"epoch": 0.06504246759052303,
"grad_norm": 1.1013486929558047,
"learning_rate": 5.943818506093239e-07,
"loss": 1.6864,
"step": 485
},
{
"epoch": 0.06517657577112204,
"grad_norm": 1.1430455689049595,
"learning_rate": 5.943587401964661e-07,
"loss": 1.6274,
"step": 486
},
{
"epoch": 0.06531068395172106,
"grad_norm": 1.1269355413734778,
"learning_rate": 5.943355828502519e-07,
"loss": 1.7389,
"step": 487
},
{
"epoch": 0.06544479213232007,
"grad_norm": 1.1442671190598854,
"learning_rate": 5.943123785747925e-07,
"loss": 1.6724,
"step": 488
},
{
"epoch": 0.06557890031291909,
"grad_norm": 1.1006441895975216,
"learning_rate": 5.942891273742075e-07,
"loss": 1.687,
"step": 489
},
{
"epoch": 0.0657130084935181,
"grad_norm": 1.1130024103107554,
"learning_rate": 5.94265829252625e-07,
"loss": 1.6774,
"step": 490
},
{
"epoch": 0.06584711667411712,
"grad_norm": 1.10665029408129,
"learning_rate": 5.942424842141811e-07,
"loss": 1.7053,
"step": 491
},
{
"epoch": 0.06598122485471614,
"grad_norm": 1.0895398255696098,
"learning_rate": 5.942190922630204e-07,
"loss": 1.6816,
"step": 492
},
{
"epoch": 0.06611533303531515,
"grad_norm": 1.0952133118391503,
"learning_rate": 5.941956534032961e-07,
"loss": 1.58,
"step": 493
},
{
"epoch": 0.06624944121591417,
"grad_norm": 1.104962374424092,
"learning_rate": 5.941721676391691e-07,
"loss": 1.758,
"step": 494
},
{
"epoch": 0.06638354939651318,
"grad_norm": 1.1134158734370636,
"learning_rate": 5.941486349748091e-07,
"loss": 1.7508,
"step": 495
},
{
"epoch": 0.0665176575771122,
"grad_norm": 1.175784721072215,
"learning_rate": 5.94125055414394e-07,
"loss": 1.7113,
"step": 496
},
{
"epoch": 0.06665176575771121,
"grad_norm": 1.0778973456587042,
"learning_rate": 5.941014289621102e-07,
"loss": 1.7558,
"step": 497
},
{
"epoch": 0.06678587393831023,
"grad_norm": 1.11982522730228,
"learning_rate": 5.940777556221521e-07,
"loss": 1.6791,
"step": 498
},
{
"epoch": 0.06691998211890926,
"grad_norm": 1.1807400353238904,
"learning_rate": 5.940540353987225e-07,
"loss": 1.7484,
"step": 499
},
{
"epoch": 0.06705409029950828,
"grad_norm": 1.1987690536433178,
"learning_rate": 5.940302682960328e-07,
"loss": 1.59,
"step": 500
},
{
"epoch": 0.06718819848010729,
"grad_norm": 1.1093357389120035,
"learning_rate": 5.940064543183026e-07,
"loss": 1.8238,
"step": 501
},
{
"epoch": 0.06732230666070631,
"grad_norm": 1.2404864761664665,
"learning_rate": 5.939825934697594e-07,
"loss": 1.6965,
"step": 502
},
{
"epoch": 0.06745641484130532,
"grad_norm": 1.1369155507476978,
"learning_rate": 5.939586857546397e-07,
"loss": 1.7284,
"step": 503
},
{
"epoch": 0.06759052302190434,
"grad_norm": 1.0747025812432756,
"learning_rate": 5.939347311771877e-07,
"loss": 1.6029,
"step": 504
},
{
"epoch": 0.06772463120250335,
"grad_norm": 1.2065817260719833,
"learning_rate": 5.939107297416566e-07,
"loss": 1.7937,
"step": 505
},
{
"epoch": 0.06785873938310237,
"grad_norm": 1.072195510416472,
"learning_rate": 5.938866814523073e-07,
"loss": 1.6844,
"step": 506
},
{
"epoch": 0.06799284756370139,
"grad_norm": 1.0788223308291087,
"learning_rate": 5.938625863134092e-07,
"loss": 1.7651,
"step": 507
},
{
"epoch": 0.0681269557443004,
"grad_norm": 1.1125709389242076,
"learning_rate": 5.938384443292403e-07,
"loss": 1.6723,
"step": 508
},
{
"epoch": 0.06826106392489942,
"grad_norm": 1.2370173408194798,
"learning_rate": 5.938142555040863e-07,
"loss": 1.6491,
"step": 509
},
{
"epoch": 0.06839517210549843,
"grad_norm": 1.0646655039063193,
"learning_rate": 5.93790019842242e-07,
"loss": 1.7609,
"step": 510
},
{
"epoch": 0.06852928028609745,
"grad_norm": 1.137655615576816,
"learning_rate": 5.9376573734801e-07,
"loss": 1.6971,
"step": 511
},
{
"epoch": 0.06866338846669647,
"grad_norm": 1.1610648719854884,
"learning_rate": 5.937414080257011e-07,
"loss": 1.7563,
"step": 512
},
{
"epoch": 0.06879749664729548,
"grad_norm": 1.022128030652968,
"learning_rate": 5.93717031879635e-07,
"loss": 1.6585,
"step": 513
},
{
"epoch": 0.0689316048278945,
"grad_norm": 1.1094802666159138,
"learning_rate": 5.936926089141391e-07,
"loss": 1.6963,
"step": 514
},
{
"epoch": 0.06906571300849351,
"grad_norm": 1.0491463968940271,
"learning_rate": 5.936681391335494e-07,
"loss": 1.653,
"step": 515
},
{
"epoch": 0.06919982118909253,
"grad_norm": 1.1153617117594175,
"learning_rate": 5.936436225422104e-07,
"loss": 1.6738,
"step": 516
},
{
"epoch": 0.06933392936969156,
"grad_norm": 1.1150239468835819,
"learning_rate": 5.936190591444744e-07,
"loss": 1.726,
"step": 517
},
{
"epoch": 0.06946803755029057,
"grad_norm": 1.1299338290201733,
"learning_rate": 5.935944489447026e-07,
"loss": 1.6814,
"step": 518
},
{
"epoch": 0.06960214573088959,
"grad_norm": 1.0925086075502406,
"learning_rate": 5.935697919472639e-07,
"loss": 1.6141,
"step": 519
},
{
"epoch": 0.0697362539114886,
"grad_norm": 1.1136653572074133,
"learning_rate": 5.93545088156536e-07,
"loss": 1.6752,
"step": 520
},
{
"epoch": 0.06987036209208762,
"grad_norm": 1.086968726752448,
"learning_rate": 5.935203375769048e-07,
"loss": 1.6593,
"step": 521
},
{
"epoch": 0.07000447027268664,
"grad_norm": 1.0785790431427873,
"learning_rate": 5.934955402127642e-07,
"loss": 1.7806,
"step": 522
},
{
"epoch": 0.07013857845328565,
"grad_norm": 1.061202101435773,
"learning_rate": 5.934706960685168e-07,
"loss": 1.6015,
"step": 523
},
{
"epoch": 0.07027268663388467,
"grad_norm": 1.1217377555129306,
"learning_rate": 5.934458051485734e-07,
"loss": 1.6836,
"step": 524
},
{
"epoch": 0.07040679481448368,
"grad_norm": 1.1634463467399316,
"learning_rate": 5.934208674573529e-07,
"loss": 1.641,
"step": 525
},
{
"epoch": 0.0705409029950827,
"grad_norm": 1.1853874452885456,
"learning_rate": 5.933958829992828e-07,
"loss": 1.6501,
"step": 526
},
{
"epoch": 0.07067501117568172,
"grad_norm": 1.0827543649368265,
"learning_rate": 5.933708517787985e-07,
"loss": 1.6664,
"step": 527
},
{
"epoch": 0.07080911935628073,
"grad_norm": 1.1171619381364966,
"learning_rate": 5.933457738003443e-07,
"loss": 1.6758,
"step": 528
},
{
"epoch": 0.07094322753687975,
"grad_norm": 1.2171560054678998,
"learning_rate": 5.933206490683722e-07,
"loss": 1.6914,
"step": 529
},
{
"epoch": 0.07107733571747876,
"grad_norm": 1.130266539632813,
"learning_rate": 5.932954775873429e-07,
"loss": 1.6301,
"step": 530
},
{
"epoch": 0.07121144389807778,
"grad_norm": 1.1814157624655244,
"learning_rate": 5.932702593617252e-07,
"loss": 1.689,
"step": 531
},
{
"epoch": 0.0713455520786768,
"grad_norm": 1.1423293526842793,
"learning_rate": 5.932449943959963e-07,
"loss": 1.7379,
"step": 532
},
{
"epoch": 0.07147966025927581,
"grad_norm": 1.0830256450215578,
"learning_rate": 5.932196826946416e-07,
"loss": 1.6752,
"step": 533
},
{
"epoch": 0.07161376843987484,
"grad_norm": 1.2254212102036337,
"learning_rate": 5.931943242621548e-07,
"loss": 1.7602,
"step": 534
},
{
"epoch": 0.07174787662047385,
"grad_norm": 1.1254407305546181,
"learning_rate": 5.931689191030381e-07,
"loss": 1.7144,
"step": 535
},
{
"epoch": 0.07188198480107287,
"grad_norm": 1.7531628186363164,
"learning_rate": 5.931434672218018e-07,
"loss": 1.7868,
"step": 536
},
{
"epoch": 0.07201609298167189,
"grad_norm": 1.1530768773395477,
"learning_rate": 5.931179686229645e-07,
"loss": 1.7128,
"step": 537
},
{
"epoch": 0.0721502011622709,
"grad_norm": 1.0869645546426585,
"learning_rate": 5.930924233110532e-07,
"loss": 1.626,
"step": 538
},
{
"epoch": 0.07228430934286992,
"grad_norm": 1.2196040558075754,
"learning_rate": 5.930668312906031e-07,
"loss": 1.7148,
"step": 539
},
{
"epoch": 0.07241841752346893,
"grad_norm": 1.1904076173283444,
"learning_rate": 5.930411925661577e-07,
"loss": 1.6981,
"step": 540
},
{
"epoch": 0.07255252570406795,
"grad_norm": 1.5987820485565098,
"learning_rate": 5.930155071422687e-07,
"loss": 1.7351,
"step": 541
},
{
"epoch": 0.07268663388466697,
"grad_norm": 1.101070130998752,
"learning_rate": 5.929897750234963e-07,
"loss": 1.6313,
"step": 542
},
{
"epoch": 0.07282074206526598,
"grad_norm": 1.0908625387826942,
"learning_rate": 5.929639962144091e-07,
"loss": 1.5891,
"step": 543
},
{
"epoch": 0.072954850245865,
"grad_norm": 1.0986511244523132,
"learning_rate": 5.929381707195834e-07,
"loss": 1.6991,
"step": 544
},
{
"epoch": 0.07308895842646401,
"grad_norm": 1.055356610594688,
"learning_rate": 5.929122985436045e-07,
"loss": 1.7331,
"step": 545
},
{
"epoch": 0.07322306660706303,
"grad_norm": 1.035590332821026,
"learning_rate": 5.928863796910655e-07,
"loss": 1.5682,
"step": 546
},
{
"epoch": 0.07335717478766204,
"grad_norm": 1.0783361793793855,
"learning_rate": 5.928604141665679e-07,
"loss": 1.6092,
"step": 547
},
{
"epoch": 0.07349128296826106,
"grad_norm": 1.090736305001705,
"learning_rate": 5.928344019747217e-07,
"loss": 1.7072,
"step": 548
},
{
"epoch": 0.07362539114886008,
"grad_norm": 1.4276709820636466,
"learning_rate": 5.928083431201449e-07,
"loss": 1.6789,
"step": 549
},
{
"epoch": 0.07375949932945909,
"grad_norm": 1.0906054014326296,
"learning_rate": 5.927822376074639e-07,
"loss": 1.7215,
"step": 550
},
{
"epoch": 0.0738936075100581,
"grad_norm": 1.364150462787829,
"learning_rate": 5.927560854413134e-07,
"loss": 1.6841,
"step": 551
},
{
"epoch": 0.07402771569065714,
"grad_norm": 1.1159870574206099,
"learning_rate": 5.927298866263363e-07,
"loss": 1.7298,
"step": 552
},
{
"epoch": 0.07416182387125615,
"grad_norm": 1.1812983592653572,
"learning_rate": 5.92703641167184e-07,
"loss": 1.7091,
"step": 553
},
{
"epoch": 0.07429593205185517,
"grad_norm": 1.0688687878186984,
"learning_rate": 5.926773490685159e-07,
"loss": 1.8398,
"step": 554
},
{
"epoch": 0.07443004023245418,
"grad_norm": 1.2894858274000411,
"learning_rate": 5.92651010335e-07,
"loss": 1.6902,
"step": 555
},
{
"epoch": 0.0745641484130532,
"grad_norm": 1.1464943136824657,
"learning_rate": 5.926246249713121e-07,
"loss": 1.7249,
"step": 556
},
{
"epoch": 0.07469825659365222,
"grad_norm": 1.3070568856631266,
"learning_rate": 5.925981929821368e-07,
"loss": 1.6741,
"step": 557
},
{
"epoch": 0.07483236477425123,
"grad_norm": 1.1646332582267231,
"learning_rate": 5.925717143721665e-07,
"loss": 1.6975,
"step": 558
},
{
"epoch": 0.07496647295485025,
"grad_norm": 1.213733563154542,
"learning_rate": 5.925451891461026e-07,
"loss": 1.6688,
"step": 559
},
{
"epoch": 0.07510058113544926,
"grad_norm": 1.1250145434758787,
"learning_rate": 5.925186173086538e-07,
"loss": 1.7044,
"step": 560
},
{
"epoch": 0.07523468931604828,
"grad_norm": 1.0865739045197238,
"learning_rate": 5.924919988645377e-07,
"loss": 1.6663,
"step": 561
},
{
"epoch": 0.0753687974966473,
"grad_norm": 1.1159580863498637,
"learning_rate": 5.924653338184801e-07,
"loss": 1.5986,
"step": 562
},
{
"epoch": 0.07550290567724631,
"grad_norm": 1.0795350956359355,
"learning_rate": 5.924386221752151e-07,
"loss": 1.7059,
"step": 563
},
{
"epoch": 0.07563701385784533,
"grad_norm": 1.059523546111381,
"learning_rate": 5.924118639394849e-07,
"loss": 1.6525,
"step": 564
},
{
"epoch": 0.07577112203844434,
"grad_norm": 1.0995795687250527,
"learning_rate": 5.923850591160401e-07,
"loss": 1.6524,
"step": 565
},
{
"epoch": 0.07590523021904336,
"grad_norm": 1.1092841538303688,
"learning_rate": 5.923582077096395e-07,
"loss": 1.7758,
"step": 566
},
{
"epoch": 0.07603933839964237,
"grad_norm": 2.6979584052916503,
"learning_rate": 5.923313097250504e-07,
"loss": 1.6593,
"step": 567
},
{
"epoch": 0.07617344658024139,
"grad_norm": 1.0621178435726715,
"learning_rate": 5.923043651670478e-07,
"loss": 1.6983,
"step": 568
},
{
"epoch": 0.07630755476084042,
"grad_norm": 1.1573135825405225,
"learning_rate": 5.922773740404157e-07,
"loss": 1.7572,
"step": 569
},
{
"epoch": 0.07644166294143943,
"grad_norm": 1.3034930029837637,
"learning_rate": 5.922503363499457e-07,
"loss": 1.7229,
"step": 570
},
{
"epoch": 0.07657577112203845,
"grad_norm": 1.063644093194536,
"learning_rate": 5.922232521004384e-07,
"loss": 1.6373,
"step": 571
},
{
"epoch": 0.07670987930263747,
"grad_norm": 1.0799490002557715,
"learning_rate": 5.921961212967018e-07,
"loss": 1.7291,
"step": 572
},
{
"epoch": 0.07684398748323648,
"grad_norm": 1.1456297613060256,
"learning_rate": 5.921689439435529e-07,
"loss": 1.6715,
"step": 573
},
{
"epoch": 0.0769780956638355,
"grad_norm": 1.1064438116765838,
"learning_rate": 5.921417200458166e-07,
"loss": 1.6324,
"step": 574
},
{
"epoch": 0.07711220384443451,
"grad_norm": 1.2537502156532783,
"learning_rate": 5.921144496083261e-07,
"loss": 1.6255,
"step": 575
},
{
"epoch": 0.07724631202503353,
"grad_norm": 1.1130457826739977,
"learning_rate": 5.920871326359228e-07,
"loss": 1.7305,
"step": 576
},
{
"epoch": 0.07738042020563254,
"grad_norm": 1.1106269047087995,
"learning_rate": 5.920597691334568e-07,
"loss": 1.7839,
"step": 577
},
{
"epoch": 0.07751452838623156,
"grad_norm": 1.1308110312275523,
"learning_rate": 5.920323591057858e-07,
"loss": 1.702,
"step": 578
},
{
"epoch": 0.07764863656683058,
"grad_norm": 1.1274236401107995,
"learning_rate": 5.920049025577762e-07,
"loss": 1.6345,
"step": 579
},
{
"epoch": 0.07778274474742959,
"grad_norm": 1.1274894849868589,
"learning_rate": 5.919773994943026e-07,
"loss": 1.6358,
"step": 580
},
{
"epoch": 0.07791685292802861,
"grad_norm": 1.203139388656472,
"learning_rate": 5.919498499202476e-07,
"loss": 1.7228,
"step": 581
},
{
"epoch": 0.07805096110862762,
"grad_norm": 1.1343472094184475,
"learning_rate": 5.919222538405025e-07,
"loss": 1.5995,
"step": 582
},
{
"epoch": 0.07818506928922664,
"grad_norm": 1.1211098856442396,
"learning_rate": 5.918946112599665e-07,
"loss": 1.7545,
"step": 583
},
{
"epoch": 0.07831917746982565,
"grad_norm": 1.3590410455725328,
"learning_rate": 5.918669221835472e-07,
"loss": 1.6658,
"step": 584
},
{
"epoch": 0.07845328565042467,
"grad_norm": 1.1368973789149184,
"learning_rate": 5.918391866161604e-07,
"loss": 1.6578,
"step": 585
},
{
"epoch": 0.07858739383102369,
"grad_norm": 1.144480010176944,
"learning_rate": 5.918114045627301e-07,
"loss": 1.687,
"step": 586
},
{
"epoch": 0.07872150201162272,
"grad_norm": 1.1079667555369228,
"learning_rate": 5.91783576028189e-07,
"loss": 1.6571,
"step": 587
},
{
"epoch": 0.07885561019222173,
"grad_norm": 1.1172832381186681,
"learning_rate": 5.917557010174771e-07,
"loss": 1.6347,
"step": 588
},
{
"epoch": 0.07898971837282075,
"grad_norm": 1.1477730537939723,
"learning_rate": 5.917277795355436e-07,
"loss": 1.696,
"step": 589
},
{
"epoch": 0.07912382655341976,
"grad_norm": 1.1124249695741149,
"learning_rate": 5.916998115873455e-07,
"loss": 1.7316,
"step": 590
},
{
"epoch": 0.07925793473401878,
"grad_norm": 1.2132332214863524,
"learning_rate": 5.916717971778482e-07,
"loss": 1.7529,
"step": 591
},
{
"epoch": 0.0793920429146178,
"grad_norm": 1.1308959961423235,
"learning_rate": 5.916437363120253e-07,
"loss": 1.6713,
"step": 592
},
{
"epoch": 0.07952615109521681,
"grad_norm": 1.1204029361778143,
"learning_rate": 5.916156289948584e-07,
"loss": 1.6751,
"step": 593
},
{
"epoch": 0.07966025927581583,
"grad_norm": 1.1836584994154395,
"learning_rate": 5.91587475231338e-07,
"loss": 1.7145,
"step": 594
},
{
"epoch": 0.07979436745641484,
"grad_norm": 1.0952029272098618,
"learning_rate": 5.91559275026462e-07,
"loss": 1.6849,
"step": 595
},
{
"epoch": 0.07992847563701386,
"grad_norm": 1.2564246490346886,
"learning_rate": 5.915310283852372e-07,
"loss": 1.6352,
"step": 596
},
{
"epoch": 0.08006258381761287,
"grad_norm": 1.1465710959467506,
"learning_rate": 5.915027353126783e-07,
"loss": 1.6647,
"step": 597
},
{
"epoch": 0.08019669199821189,
"grad_norm": 1.1382835508015974,
"learning_rate": 5.914743958138086e-07,
"loss": 1.7106,
"step": 598
},
{
"epoch": 0.0803308001788109,
"grad_norm": 1.1192071556571492,
"learning_rate": 5.91446009893659e-07,
"loss": 1.706,
"step": 599
},
{
"epoch": 0.08046490835940992,
"grad_norm": 1.1629696564337242,
"learning_rate": 5.914175775572693e-07,
"loss": 1.676,
"step": 600
},
{
"epoch": 0.08059901654000894,
"grad_norm": 1.1336751221713581,
"learning_rate": 5.913890988096872e-07,
"loss": 1.7061,
"step": 601
},
{
"epoch": 0.08073312472060795,
"grad_norm": 1.063751409329425,
"learning_rate": 5.913605736559689e-07,
"loss": 1.6276,
"step": 602
},
{
"epoch": 0.08086723290120697,
"grad_norm": 1.7847493987152905,
"learning_rate": 5.913320021011784e-07,
"loss": 1.7643,
"step": 603
},
{
"epoch": 0.081001341081806,
"grad_norm": 1.1752588010758491,
"learning_rate": 5.913033841503882e-07,
"loss": 1.7136,
"step": 604
},
{
"epoch": 0.08113544926240501,
"grad_norm": 1.092151629247411,
"learning_rate": 5.912747198086793e-07,
"loss": 1.6921,
"step": 605
},
{
"epoch": 0.08126955744300403,
"grad_norm": 1.1813450877374088,
"learning_rate": 5.912460090811404e-07,
"loss": 1.5961,
"step": 606
},
{
"epoch": 0.08140366562360304,
"grad_norm": 1.1386503634209713,
"learning_rate": 5.912172519728691e-07,
"loss": 1.6936,
"step": 607
},
{
"epoch": 0.08153777380420206,
"grad_norm": 1.1478659529471829,
"learning_rate": 5.911884484889702e-07,
"loss": 1.7133,
"step": 608
},
{
"epoch": 0.08167188198480108,
"grad_norm": 1.2776303627444894,
"learning_rate": 5.911595986345579e-07,
"loss": 1.686,
"step": 609
},
{
"epoch": 0.08180599016540009,
"grad_norm": 1.0774582052806807,
"learning_rate": 5.91130702414754e-07,
"loss": 1.8028,
"step": 610
},
{
"epoch": 0.08194009834599911,
"grad_norm": 1.0810859242279176,
"learning_rate": 5.911017598346885e-07,
"loss": 1.6044,
"step": 611
},
{
"epoch": 0.08207420652659812,
"grad_norm": 1.1594727731031893,
"learning_rate": 5.910727708994998e-07,
"loss": 1.7686,
"step": 612
},
{
"epoch": 0.08220831470719714,
"grad_norm": 1.1321005040254193,
"learning_rate": 5.910437356143345e-07,
"loss": 1.6522,
"step": 613
},
{
"epoch": 0.08234242288779615,
"grad_norm": 1.0653919163589205,
"learning_rate": 5.910146539843476e-07,
"loss": 1.7465,
"step": 614
},
{
"epoch": 0.08247653106839517,
"grad_norm": 1.1128916496114905,
"learning_rate": 5.90985526014702e-07,
"loss": 1.6125,
"step": 615
},
{
"epoch": 0.08261063924899419,
"grad_norm": 1.4081204838899852,
"learning_rate": 5.90956351710569e-07,
"loss": 1.7639,
"step": 616
},
{
"epoch": 0.0827447474295932,
"grad_norm": 1.1683592035720405,
"learning_rate": 5.909271310771279e-07,
"loss": 1.637,
"step": 617
},
{
"epoch": 0.08287885561019222,
"grad_norm": 1.115793940661641,
"learning_rate": 5.90897864119567e-07,
"loss": 1.6118,
"step": 618
},
{
"epoch": 0.08301296379079123,
"grad_norm": 1.0879479857779484,
"learning_rate": 5.908685508430816e-07,
"loss": 1.6846,
"step": 619
},
{
"epoch": 0.08314707197139025,
"grad_norm": 1.1428114800136786,
"learning_rate": 5.908391912528764e-07,
"loss": 1.6949,
"step": 620
},
{
"epoch": 0.08328118015198926,
"grad_norm": 1.11661524840305,
"learning_rate": 5.908097853541634e-07,
"loss": 1.754,
"step": 621
},
{
"epoch": 0.0834152883325883,
"grad_norm": 1.0762293742420466,
"learning_rate": 5.907803331521635e-07,
"loss": 1.7609,
"step": 622
},
{
"epoch": 0.08354939651318731,
"grad_norm": 1.0719203407555025,
"learning_rate": 5.907508346521054e-07,
"loss": 1.6981,
"step": 623
},
{
"epoch": 0.08368350469378633,
"grad_norm": 1.1553772926251566,
"learning_rate": 5.907212898592263e-07,
"loss": 1.7024,
"step": 624
},
{
"epoch": 0.08381761287438534,
"grad_norm": 1.1270260996688657,
"learning_rate": 5.906916987787713e-07,
"loss": 1.6906,
"step": 625
},
{
"epoch": 0.08395172105498436,
"grad_norm": 1.1229658996843206,
"learning_rate": 5.90662061415994e-07,
"loss": 1.694,
"step": 626
},
{
"epoch": 0.08408582923558337,
"grad_norm": 1.1277068299424584,
"learning_rate": 5.906323777761561e-07,
"loss": 1.5693,
"step": 627
},
{
"epoch": 0.08421993741618239,
"grad_norm": 1.1180105581479995,
"learning_rate": 5.906026478645276e-07,
"loss": 1.7247,
"step": 628
},
{
"epoch": 0.0843540455967814,
"grad_norm": 1.2224062872746266,
"learning_rate": 5.905728716863865e-07,
"loss": 1.6829,
"step": 629
},
{
"epoch": 0.08448815377738042,
"grad_norm": 1.1085889629398797,
"learning_rate": 5.905430492470195e-07,
"loss": 1.7271,
"step": 630
},
{
"epoch": 0.08462226195797944,
"grad_norm": 1.1451977446739299,
"learning_rate": 5.905131805517207e-07,
"loss": 1.5877,
"step": 631
},
{
"epoch": 0.08475637013857845,
"grad_norm": 1.1422915014499277,
"learning_rate": 5.904832656057932e-07,
"loss": 1.6977,
"step": 632
},
{
"epoch": 0.08489047831917747,
"grad_norm": 1.131510544315339,
"learning_rate": 5.904533044145479e-07,
"loss": 1.5513,
"step": 633
},
{
"epoch": 0.08502458649977648,
"grad_norm": 1.2432140035573447,
"learning_rate": 5.904232969833039e-07,
"loss": 1.6835,
"step": 634
},
{
"epoch": 0.0851586946803755,
"grad_norm": 1.0744643011300827,
"learning_rate": 5.90393243317389e-07,
"loss": 1.6052,
"step": 635
},
{
"epoch": 0.08529280286097451,
"grad_norm": 1.3098823736310086,
"learning_rate": 5.903631434221384e-07,
"loss": 1.7622,
"step": 636
},
{
"epoch": 0.08542691104157353,
"grad_norm": 1.1182788647555526,
"learning_rate": 5.903329973028961e-07,
"loss": 1.7497,
"step": 637
},
{
"epoch": 0.08556101922217255,
"grad_norm": 1.305543631329334,
"learning_rate": 5.903028049650141e-07,
"loss": 1.6732,
"step": 638
},
{
"epoch": 0.08569512740277158,
"grad_norm": 1.1108546390310376,
"learning_rate": 5.902725664138528e-07,
"loss": 1.7271,
"step": 639
},
{
"epoch": 0.08582923558337059,
"grad_norm": 1.0769425748182762,
"learning_rate": 5.902422816547804e-07,
"loss": 1.666,
"step": 640
},
{
"epoch": 0.08596334376396961,
"grad_norm": 1.0710915573180522,
"learning_rate": 5.902119506931739e-07,
"loss": 1.7208,
"step": 641
},
{
"epoch": 0.08609745194456862,
"grad_norm": 1.1265338939849623,
"learning_rate": 5.901815735344178e-07,
"loss": 1.713,
"step": 642
},
{
"epoch": 0.08623156012516764,
"grad_norm": 1.1032977967977797,
"learning_rate": 5.901511501839053e-07,
"loss": 1.655,
"step": 643
},
{
"epoch": 0.08636566830576665,
"grad_norm": 1.067089553405501,
"learning_rate": 5.901206806470377e-07,
"loss": 1.6794,
"step": 644
},
{
"epoch": 0.08649977648636567,
"grad_norm": 1.1924702814140196,
"learning_rate": 5.900901649292243e-07,
"loss": 1.6186,
"step": 645
},
{
"epoch": 0.08663388466696469,
"grad_norm": 1.1000064746041005,
"learning_rate": 5.900596030358831e-07,
"loss": 1.7316,
"step": 646
},
{
"epoch": 0.0867679928475637,
"grad_norm": 1.16787242186727,
"learning_rate": 5.900289949724397e-07,
"loss": 1.6475,
"step": 647
},
{
"epoch": 0.08690210102816272,
"grad_norm": 1.153036807295657,
"learning_rate": 5.899983407443281e-07,
"loss": 1.604,
"step": 648
},
{
"epoch": 0.08703620920876173,
"grad_norm": 1.1418227950695776,
"learning_rate": 5.899676403569906e-07,
"loss": 1.7925,
"step": 649
},
{
"epoch": 0.08717031738936075,
"grad_norm": 1.1018946533270777,
"learning_rate": 5.899368938158777e-07,
"loss": 1.5998,
"step": 650
},
{
"epoch": 0.08730442556995976,
"grad_norm": 1.0898779658636957,
"learning_rate": 5.899061011264481e-07,
"loss": 1.6772,
"step": 651
},
{
"epoch": 0.08743853375055878,
"grad_norm": 1.1828085767178107,
"learning_rate": 5.898752622941684e-07,
"loss": 1.6564,
"step": 652
},
{
"epoch": 0.0875726419311578,
"grad_norm": 1.123777742875525,
"learning_rate": 5.89844377324514e-07,
"loss": 1.7173,
"step": 653
},
{
"epoch": 0.08770675011175681,
"grad_norm": 1.1137884706219183,
"learning_rate": 5.898134462229677e-07,
"loss": 1.705,
"step": 654
},
{
"epoch": 0.08784085829235583,
"grad_norm": 1.0736901627301867,
"learning_rate": 5.89782468995021e-07,
"loss": 1.6673,
"step": 655
},
{
"epoch": 0.08797496647295484,
"grad_norm": 1.1006296755478988,
"learning_rate": 5.897514456461737e-07,
"loss": 1.662,
"step": 656
},
{
"epoch": 0.08810907465355387,
"grad_norm": 1.0993086803454002,
"learning_rate": 5.897203761819334e-07,
"loss": 1.7671,
"step": 657
},
{
"epoch": 0.08824318283415289,
"grad_norm": 1.1555576950225783,
"learning_rate": 5.896892606078163e-07,
"loss": 1.6558,
"step": 658
},
{
"epoch": 0.0883772910147519,
"grad_norm": 1.1044269950107921,
"learning_rate": 5.896580989293461e-07,
"loss": 1.6538,
"step": 659
},
{
"epoch": 0.08851139919535092,
"grad_norm": 1.1293808136662087,
"learning_rate": 5.896268911520556e-07,
"loss": 1.6734,
"step": 660
},
{
"epoch": 0.08864550737594994,
"grad_norm": 1.0799327058316142,
"learning_rate": 5.895956372814851e-07,
"loss": 1.7258,
"step": 661
},
{
"epoch": 0.08877961555654895,
"grad_norm": 1.2412270489033748,
"learning_rate": 5.895643373231834e-07,
"loss": 1.7033,
"step": 662
},
{
"epoch": 0.08891372373714797,
"grad_norm": 1.2660732052099137,
"learning_rate": 5.895329912827074e-07,
"loss": 1.6607,
"step": 663
},
{
"epoch": 0.08904783191774698,
"grad_norm": 1.0851423150565935,
"learning_rate": 5.895015991656218e-07,
"loss": 1.7365,
"step": 664
},
{
"epoch": 0.089181940098346,
"grad_norm": 1.0926935688632777,
"learning_rate": 5.894701609775004e-07,
"loss": 1.723,
"step": 665
},
{
"epoch": 0.08931604827894501,
"grad_norm": 1.1335362217269433,
"learning_rate": 5.894386767239243e-07,
"loss": 1.7482,
"step": 666
},
{
"epoch": 0.08945015645954403,
"grad_norm": 1.0690769483519065,
"learning_rate": 5.894071464104832e-07,
"loss": 1.7083,
"step": 667
},
{
"epoch": 0.08958426464014305,
"grad_norm": 1.144239086274215,
"learning_rate": 5.893755700427749e-07,
"loss": 1.6672,
"step": 668
},
{
"epoch": 0.08971837282074206,
"grad_norm": 1.154969050751237,
"learning_rate": 5.893439476264053e-07,
"loss": 1.5992,
"step": 669
},
{
"epoch": 0.08985248100134108,
"grad_norm": 1.1692487930022055,
"learning_rate": 5.893122791669886e-07,
"loss": 1.6895,
"step": 670
},
{
"epoch": 0.0899865891819401,
"grad_norm": 1.1445503009803197,
"learning_rate": 5.892805646701471e-07,
"loss": 1.6176,
"step": 671
},
{
"epoch": 0.09012069736253911,
"grad_norm": 1.0860602124973238,
"learning_rate": 5.892488041415113e-07,
"loss": 1.7431,
"step": 672
},
{
"epoch": 0.09025480554313813,
"grad_norm": 1.1840804859528216,
"learning_rate": 5.892169975867196e-07,
"loss": 1.5377,
"step": 673
},
{
"epoch": 0.09038891372373715,
"grad_norm": 1.0925936180668785,
"learning_rate": 5.891851450114193e-07,
"loss": 1.693,
"step": 674
},
{
"epoch": 0.09052302190433617,
"grad_norm": 1.1412736395289622,
"learning_rate": 5.891532464212651e-07,
"loss": 1.6782,
"step": 675
},
{
"epoch": 0.09065713008493519,
"grad_norm": 1.1014154222006858,
"learning_rate": 5.891213018219203e-07,
"loss": 1.6661,
"step": 676
},
{
"epoch": 0.0907912382655342,
"grad_norm": 1.1028682933773437,
"learning_rate": 5.89089311219056e-07,
"loss": 1.6283,
"step": 677
},
{
"epoch": 0.09092534644613322,
"grad_norm": 1.0999221111301187,
"learning_rate": 5.89057274618352e-07,
"loss": 1.6288,
"step": 678
},
{
"epoch": 0.09105945462673223,
"grad_norm": 1.0929215008817739,
"learning_rate": 5.890251920254958e-07,
"loss": 1.6966,
"step": 679
},
{
"epoch": 0.09119356280733125,
"grad_norm": 1.0995793357287673,
"learning_rate": 5.889930634461832e-07,
"loss": 1.7086,
"step": 680
},
{
"epoch": 0.09132767098793027,
"grad_norm": 1.0809381415190136,
"learning_rate": 5.889608888861182e-07,
"loss": 1.6829,
"step": 681
},
{
"epoch": 0.09146177916852928,
"grad_norm": 1.0548227913499995,
"learning_rate": 5.889286683510132e-07,
"loss": 1.6826,
"step": 682
},
{
"epoch": 0.0915958873491283,
"grad_norm": 1.1106859513783915,
"learning_rate": 5.888964018465883e-07,
"loss": 1.6544,
"step": 683
},
{
"epoch": 0.09172999552972731,
"grad_norm": 1.0878369148062472,
"learning_rate": 5.88864089378572e-07,
"loss": 1.6342,
"step": 684
},
{
"epoch": 0.09186410371032633,
"grad_norm": 1.128955444803477,
"learning_rate": 5.888317309527009e-07,
"loss": 1.6121,
"step": 685
},
{
"epoch": 0.09199821189092534,
"grad_norm": 1.246867762194091,
"learning_rate": 5.887993265747201e-07,
"loss": 1.6819,
"step": 686
},
{
"epoch": 0.09213232007152436,
"grad_norm": 1.1533855664708184,
"learning_rate": 5.887668762503822e-07,
"loss": 1.7429,
"step": 687
},
{
"epoch": 0.09226642825212338,
"grad_norm": 1.0405450268075809,
"learning_rate": 5.887343799854485e-07,
"loss": 1.6759,
"step": 688
},
{
"epoch": 0.09240053643272239,
"grad_norm": 1.1507085139636744,
"learning_rate": 5.887018377856884e-07,
"loss": 1.8036,
"step": 689
},
{
"epoch": 0.0925346446133214,
"grad_norm": 6.743658343986094,
"learning_rate": 5.886692496568789e-07,
"loss": 1.6027,
"step": 690
},
{
"epoch": 0.09266875279392042,
"grad_norm": 1.0641784107760024,
"learning_rate": 5.886366156048061e-07,
"loss": 1.6558,
"step": 691
},
{
"epoch": 0.09280286097451945,
"grad_norm": 1.0922990524942957,
"learning_rate": 5.886039356352634e-07,
"loss": 1.7383,
"step": 692
},
{
"epoch": 0.09293696915511847,
"grad_norm": 1.1742618579401762,
"learning_rate": 5.885712097540529e-07,
"loss": 1.5927,
"step": 693
},
{
"epoch": 0.09307107733571748,
"grad_norm": 1.1075189838987614,
"learning_rate": 5.885384379669844e-07,
"loss": 1.7738,
"step": 694
},
{
"epoch": 0.0932051855163165,
"grad_norm": 2.1929813163212093,
"learning_rate": 5.885056202798763e-07,
"loss": 1.7975,
"step": 695
},
{
"epoch": 0.09333929369691552,
"grad_norm": 1.0998963175774283,
"learning_rate": 5.88472756698555e-07,
"loss": 1.6156,
"step": 696
},
{
"epoch": 0.09347340187751453,
"grad_norm": 1.0824346616111722,
"learning_rate": 5.884398472288546e-07,
"loss": 1.7226,
"step": 697
},
{
"epoch": 0.09360751005811355,
"grad_norm": 1.048887980139358,
"learning_rate": 5.884068918766182e-07,
"loss": 1.7065,
"step": 698
},
{
"epoch": 0.09374161823871256,
"grad_norm": 1.0293430293240384,
"learning_rate": 5.883738906476963e-07,
"loss": 1.6596,
"step": 699
},
{
"epoch": 0.09387572641931158,
"grad_norm": 1.0943419458638883,
"learning_rate": 5.88340843547948e-07,
"loss": 1.7356,
"step": 700
},
{
"epoch": 0.0940098345999106,
"grad_norm": 1.0980484739258698,
"learning_rate": 5.883077505832403e-07,
"loss": 1.6039,
"step": 701
},
{
"epoch": 0.09414394278050961,
"grad_norm": 1.1455036041824893,
"learning_rate": 5.882746117594482e-07,
"loss": 1.6255,
"step": 702
},
{
"epoch": 0.09427805096110863,
"grad_norm": 1.4001837690870673,
"learning_rate": 5.882414270824554e-07,
"loss": 1.6008,
"step": 703
},
{
"epoch": 0.09441215914170764,
"grad_norm": 1.1130500383248842,
"learning_rate": 5.882081965581533e-07,
"loss": 1.7358,
"step": 704
},
{
"epoch": 0.09454626732230666,
"grad_norm": 1.070694937502845,
"learning_rate": 5.881749201924413e-07,
"loss": 1.6635,
"step": 705
},
{
"epoch": 0.09468037550290567,
"grad_norm": 1.1144333495898877,
"learning_rate": 5.881415979912274e-07,
"loss": 1.7066,
"step": 706
},
{
"epoch": 0.09481448368350469,
"grad_norm": 1.1422205384748831,
"learning_rate": 5.881082299604276e-07,
"loss": 1.6546,
"step": 707
},
{
"epoch": 0.0949485918641037,
"grad_norm": 1.0853098558287595,
"learning_rate": 5.880748161059657e-07,
"loss": 1.6753,
"step": 708
},
{
"epoch": 0.09508270004470273,
"grad_norm": 1.198904753001485,
"learning_rate": 5.88041356433774e-07,
"loss": 1.7569,
"step": 709
},
{
"epoch": 0.09521680822530175,
"grad_norm": 1.1071829227283936,
"learning_rate": 5.880078509497928e-07,
"loss": 1.6232,
"step": 710
},
{
"epoch": 0.09535091640590077,
"grad_norm": 1.0695300790601336,
"learning_rate": 5.879742996599706e-07,
"loss": 1.6413,
"step": 711
},
{
"epoch": 0.09548502458649978,
"grad_norm": 3.3268091455655355,
"learning_rate": 5.879407025702638e-07,
"loss": 1.593,
"step": 712
},
{
"epoch": 0.0956191327670988,
"grad_norm": 1.0722393433959394,
"learning_rate": 5.879070596866374e-07,
"loss": 1.7546,
"step": 713
},
{
"epoch": 0.09575324094769781,
"grad_norm": 1.153579196694916,
"learning_rate": 5.87873371015064e-07,
"loss": 1.657,
"step": 714
},
{
"epoch": 0.09588734912829683,
"grad_norm": 1.1213730882230093,
"learning_rate": 5.878396365615248e-07,
"loss": 1.6892,
"step": 715
},
{
"epoch": 0.09602145730889584,
"grad_norm": 1.1795757056582914,
"learning_rate": 5.878058563320086e-07,
"loss": 1.6945,
"step": 716
},
{
"epoch": 0.09615556548949486,
"grad_norm": 1.075176593983707,
"learning_rate": 5.87772030332513e-07,
"loss": 1.7196,
"step": 717
},
{
"epoch": 0.09628967367009388,
"grad_norm": 1.0441316150069637,
"learning_rate": 5.877381585690431e-07,
"loss": 1.6256,
"step": 718
},
{
"epoch": 0.09642378185069289,
"grad_norm": 1.1023538045059467,
"learning_rate": 5.877042410476124e-07,
"loss": 1.6537,
"step": 719
},
{
"epoch": 0.09655789003129191,
"grad_norm": 1.154659783031204,
"learning_rate": 5.876702777742425e-07,
"loss": 1.75,
"step": 720
},
{
"epoch": 0.09669199821189092,
"grad_norm": 1.1756635069685608,
"learning_rate": 5.876362687549632e-07,
"loss": 1.6535,
"step": 721
},
{
"epoch": 0.09682610639248994,
"grad_norm": 1.1127957017636008,
"learning_rate": 5.876022139958122e-07,
"loss": 1.6513,
"step": 722
},
{
"epoch": 0.09696021457308895,
"grad_norm": 1.1770680572803744,
"learning_rate": 5.875681135028358e-07,
"loss": 1.6897,
"step": 723
},
{
"epoch": 0.09709432275368797,
"grad_norm": 1.054488251672258,
"learning_rate": 5.875339672820877e-07,
"loss": 1.7035,
"step": 724
},
{
"epoch": 0.09722843093428699,
"grad_norm": 1.1537946876962146,
"learning_rate": 5.874997753396303e-07,
"loss": 1.6564,
"step": 725
},
{
"epoch": 0.097362539114886,
"grad_norm": 1.2650547539228134,
"learning_rate": 5.874655376815338e-07,
"loss": 1.7448,
"step": 726
},
{
"epoch": 0.09749664729548503,
"grad_norm": 1.0865445919691652,
"learning_rate": 5.874312543138768e-07,
"loss": 1.7492,
"step": 727
},
{
"epoch": 0.09763075547608405,
"grad_norm": 1.0635064685924933,
"learning_rate": 5.873969252427457e-07,
"loss": 1.569,
"step": 728
},
{
"epoch": 0.09776486365668306,
"grad_norm": 1.1242141873259432,
"learning_rate": 5.873625504742354e-07,
"loss": 1.6972,
"step": 729
},
{
"epoch": 0.09789897183728208,
"grad_norm": 1.374622796897752,
"learning_rate": 5.873281300144483e-07,
"loss": 1.66,
"step": 730
},
{
"epoch": 0.0980330800178811,
"grad_norm": 1.0742640980921085,
"learning_rate": 5.872936638694958e-07,
"loss": 1.6395,
"step": 731
},
{
"epoch": 0.09816718819848011,
"grad_norm": 1.1834566808846507,
"learning_rate": 5.872591520454964e-07,
"loss": 1.6467,
"step": 732
},
{
"epoch": 0.09830129637907913,
"grad_norm": 1.1393523410825188,
"learning_rate": 5.872245945485774e-07,
"loss": 1.6715,
"step": 733
},
{
"epoch": 0.09843540455967814,
"grad_norm": 1.133914370439065,
"learning_rate": 5.871899913848743e-07,
"loss": 1.6661,
"step": 734
},
{
"epoch": 0.09856951274027716,
"grad_norm": 1.1318819144753365,
"learning_rate": 5.871553425605299e-07,
"loss": 1.7463,
"step": 735
},
{
"epoch": 0.09870362092087617,
"grad_norm": 1.119126620886235,
"learning_rate": 5.871206480816961e-07,
"loss": 1.681,
"step": 736
},
{
"epoch": 0.09883772910147519,
"grad_norm": 1.074480380396243,
"learning_rate": 5.870859079545321e-07,
"loss": 1.6163,
"step": 737
},
{
"epoch": 0.0989718372820742,
"grad_norm": 1.1208330921778833,
"learning_rate": 5.870511221852059e-07,
"loss": 1.619,
"step": 738
},
{
"epoch": 0.09910594546267322,
"grad_norm": 1.1594847796734538,
"learning_rate": 5.870162907798928e-07,
"loss": 1.6592,
"step": 739
},
{
"epoch": 0.09924005364327224,
"grad_norm": 1.058931279874539,
"learning_rate": 5.869814137447771e-07,
"loss": 1.6851,
"step": 740
},
{
"epoch": 0.09937416182387125,
"grad_norm": 1.1378546192527486,
"learning_rate": 5.869464910860505e-07,
"loss": 1.7918,
"step": 741
},
{
"epoch": 0.09950827000447027,
"grad_norm": 1.1325033016555488,
"learning_rate": 5.869115228099131e-07,
"loss": 1.6834,
"step": 742
},
{
"epoch": 0.09964237818506928,
"grad_norm": 1.3421525418201607,
"learning_rate": 5.86876508922573e-07,
"loss": 1.6549,
"step": 743
},
{
"epoch": 0.09977648636566831,
"grad_norm": 1.1427938179025248,
"learning_rate": 5.868414494302465e-07,
"loss": 1.6589,
"step": 744
},
{
"epoch": 0.09991059454626733,
"grad_norm": 1.1974168236579015,
"learning_rate": 5.86806344339158e-07,
"loss": 1.6378,
"step": 745
},
{
"epoch": 0.10004470272686634,
"grad_norm": 1.182005807170805,
"learning_rate": 5.867711936555398e-07,
"loss": 1.6299,
"step": 746
},
{
"epoch": 0.10017881090746536,
"grad_norm": 1.1347901749058797,
"learning_rate": 5.867359973856326e-07,
"loss": 1.6285,
"step": 747
},
{
"epoch": 0.10031291908806438,
"grad_norm": 1.0865847111724278,
"learning_rate": 5.867007555356848e-07,
"loss": 1.5712,
"step": 748
},
{
"epoch": 0.10044702726866339,
"grad_norm": 1.0792499138775284,
"learning_rate": 5.866654681119534e-07,
"loss": 1.6768,
"step": 749
},
{
"epoch": 0.10058113544926241,
"grad_norm": 1.1459851366680363,
"learning_rate": 5.866301351207031e-07,
"loss": 1.6162,
"step": 750
},
{
"epoch": 0.10071524362986142,
"grad_norm": 1.0878281762208375,
"learning_rate": 5.865947565682066e-07,
"loss": 1.6656,
"step": 751
},
{
"epoch": 0.10084935181046044,
"grad_norm": 1.0847043417176385,
"learning_rate": 5.865593324607452e-07,
"loss": 1.6349,
"step": 752
},
{
"epoch": 0.10098345999105945,
"grad_norm": 1.07175506702241,
"learning_rate": 5.865238628046077e-07,
"loss": 1.646,
"step": 753
},
{
"epoch": 0.10111756817165847,
"grad_norm": 1.1573886829728748,
"learning_rate": 5.864883476060915e-07,
"loss": 1.6585,
"step": 754
},
{
"epoch": 0.10125167635225749,
"grad_norm": 1.0662183481503906,
"learning_rate": 5.864527868715017e-07,
"loss": 1.6685,
"step": 755
},
{
"epoch": 0.1013857845328565,
"grad_norm": 1.1141344678729455,
"learning_rate": 5.864171806071517e-07,
"loss": 1.7169,
"step": 756
},
{
"epoch": 0.10151989271345552,
"grad_norm": 1.100766756813705,
"learning_rate": 5.863815288193628e-07,
"loss": 1.6247,
"step": 757
},
{
"epoch": 0.10165400089405453,
"grad_norm": 1.0952255674456979,
"learning_rate": 5.863458315144646e-07,
"loss": 1.6211,
"step": 758
},
{
"epoch": 0.10178810907465355,
"grad_norm": 1.1257453114351714,
"learning_rate": 5.863100886987948e-07,
"loss": 1.7725,
"step": 759
},
{
"epoch": 0.10192221725525256,
"grad_norm": 1.1540265958163123,
"learning_rate": 5.862743003786989e-07,
"loss": 1.7236,
"step": 760
},
{
"epoch": 0.10205632543585158,
"grad_norm": 1.1525383018656805,
"learning_rate": 5.862384665605306e-07,
"loss": 1.6291,
"step": 761
},
{
"epoch": 0.10219043361645061,
"grad_norm": 1.0998304145799205,
"learning_rate": 5.862025872506518e-07,
"loss": 1.6707,
"step": 762
},
{
"epoch": 0.10232454179704963,
"grad_norm": 1.1328389993712693,
"learning_rate": 5.861666624554323e-07,
"loss": 1.7046,
"step": 763
},
{
"epoch": 0.10245864997764864,
"grad_norm": 1.1261717885021774,
"learning_rate": 5.861306921812503e-07,
"loss": 1.7154,
"step": 764
},
{
"epoch": 0.10259275815824766,
"grad_norm": 1.1225339366672114,
"learning_rate": 5.860946764344915e-07,
"loss": 1.6906,
"step": 765
},
{
"epoch": 0.10272686633884667,
"grad_norm": 1.0705179266385985,
"learning_rate": 5.860586152215504e-07,
"loss": 1.6246,
"step": 766
},
{
"epoch": 0.10286097451944569,
"grad_norm": 1.1541152561285446,
"learning_rate": 5.860225085488287e-07,
"loss": 1.7682,
"step": 767
},
{
"epoch": 0.1029950827000447,
"grad_norm": 1.0637815973415343,
"learning_rate": 5.859863564227371e-07,
"loss": 1.5644,
"step": 768
},
{
"epoch": 0.10312919088064372,
"grad_norm": 1.4548832416501927,
"learning_rate": 5.859501588496937e-07,
"loss": 1.6585,
"step": 769
},
{
"epoch": 0.10326329906124274,
"grad_norm": 1.1159025503039528,
"learning_rate": 5.859139158361249e-07,
"loss": 1.7046,
"step": 770
},
{
"epoch": 0.10339740724184175,
"grad_norm": 1.1310495005094254,
"learning_rate": 5.858776273884653e-07,
"loss": 1.6818,
"step": 771
},
{
"epoch": 0.10353151542244077,
"grad_norm": 1.0517973047871627,
"learning_rate": 5.858412935131574e-07,
"loss": 1.6145,
"step": 772
},
{
"epoch": 0.10366562360303978,
"grad_norm": 1.080650360146408,
"learning_rate": 5.858049142166517e-07,
"loss": 1.6628,
"step": 773
},
{
"epoch": 0.1037997317836388,
"grad_norm": 1.1586931721545415,
"learning_rate": 5.857684895054069e-07,
"loss": 1.6491,
"step": 774
},
{
"epoch": 0.10393383996423781,
"grad_norm": 1.1442490123077105,
"learning_rate": 5.857320193858896e-07,
"loss": 1.701,
"step": 775
},
{
"epoch": 0.10406794814483683,
"grad_norm": 1.1690889705843661,
"learning_rate": 5.856955038645748e-07,
"loss": 1.6635,
"step": 776
},
{
"epoch": 0.10420205632543585,
"grad_norm": 1.0789106990522987,
"learning_rate": 5.856589429479454e-07,
"loss": 1.7244,
"step": 777
},
{
"epoch": 0.10433616450603486,
"grad_norm": 1.1621702061459454,
"learning_rate": 5.856223366424918e-07,
"loss": 1.6577,
"step": 778
},
{
"epoch": 0.10447027268663389,
"grad_norm": 1.234518365304015,
"learning_rate": 5.855856849547135e-07,
"loss": 1.628,
"step": 779
},
{
"epoch": 0.10460438086723291,
"grad_norm": 1.0985603622430586,
"learning_rate": 5.855489878911173e-07,
"loss": 1.5708,
"step": 780
},
{
"epoch": 0.10473848904783192,
"grad_norm": 1.2290143697832727,
"learning_rate": 5.855122454582182e-07,
"loss": 1.6148,
"step": 781
},
{
"epoch": 0.10487259722843094,
"grad_norm": 1.0968718099792736,
"learning_rate": 5.854754576625395e-07,
"loss": 1.6741,
"step": 782
},
{
"epoch": 0.10500670540902995,
"grad_norm": 1.1287867540808152,
"learning_rate": 5.854386245106123e-07,
"loss": 1.6414,
"step": 783
},
{
"epoch": 0.10514081358962897,
"grad_norm": 1.23300063689037,
"learning_rate": 5.854017460089758e-07,
"loss": 1.6692,
"step": 784
},
{
"epoch": 0.10527492177022799,
"grad_norm": 1.057896247934459,
"learning_rate": 5.853648221641774e-07,
"loss": 1.5768,
"step": 785
},
{
"epoch": 0.105409029950827,
"grad_norm": 1.1246918122007368,
"learning_rate": 5.853278529827722e-07,
"loss": 1.7188,
"step": 786
},
{
"epoch": 0.10554313813142602,
"grad_norm": 1.1394479386508116,
"learning_rate": 5.852908384713238e-07,
"loss": 1.6904,
"step": 787
},
{
"epoch": 0.10567724631202503,
"grad_norm": 1.111982268532425,
"learning_rate": 5.852537786364036e-07,
"loss": 1.6384,
"step": 788
},
{
"epoch": 0.10581135449262405,
"grad_norm": 1.1240815270464448,
"learning_rate": 5.85216673484591e-07,
"loss": 1.7382,
"step": 789
},
{
"epoch": 0.10594546267322306,
"grad_norm": 1.103447231107936,
"learning_rate": 5.851795230224736e-07,
"loss": 1.7285,
"step": 790
},
{
"epoch": 0.10607957085382208,
"grad_norm": 1.124305841718373,
"learning_rate": 5.851423272566469e-07,
"loss": 1.5874,
"step": 791
},
{
"epoch": 0.1062136790344211,
"grad_norm": 1.1424352731892036,
"learning_rate": 5.851050861937145e-07,
"loss": 1.7097,
"step": 792
},
{
"epoch": 0.10634778721502011,
"grad_norm": 1.1724771511120693,
"learning_rate": 5.850677998402881e-07,
"loss": 1.6847,
"step": 793
},
{
"epoch": 0.10648189539561913,
"grad_norm": 1.1246235851433404,
"learning_rate": 5.850304682029874e-07,
"loss": 1.6735,
"step": 794
},
{
"epoch": 0.10661600357621814,
"grad_norm": 1.1044843136711693,
"learning_rate": 5.849930912884402e-07,
"loss": 1.6758,
"step": 795
},
{
"epoch": 0.10675011175681716,
"grad_norm": 1.086861760986685,
"learning_rate": 5.849556691032821e-07,
"loss": 1.6564,
"step": 796
},
{
"epoch": 0.10688421993741619,
"grad_norm": 1.1156492790718477,
"learning_rate": 5.84918201654157e-07,
"loss": 1.7699,
"step": 797
},
{
"epoch": 0.1070183281180152,
"grad_norm": 1.105919104931648,
"learning_rate": 5.848806889477168e-07,
"loss": 1.6673,
"step": 798
},
{
"epoch": 0.10715243629861422,
"grad_norm": 1.1197711837565212,
"learning_rate": 5.848431309906213e-07,
"loss": 1.6681,
"step": 799
},
{
"epoch": 0.10728654447921324,
"grad_norm": 1.0624511416416331,
"learning_rate": 5.848055277895385e-07,
"loss": 1.6102,
"step": 800
},
{
"epoch": 0.10742065265981225,
"grad_norm": 1.2004229748929618,
"learning_rate": 5.847678793511441e-07,
"loss": 1.5863,
"step": 801
},
{
"epoch": 0.10755476084041127,
"grad_norm": 1.0858125624618846,
"learning_rate": 5.847301856821225e-07,
"loss": 1.5247,
"step": 802
},
{
"epoch": 0.10768886902101028,
"grad_norm": 1.1461866619519925,
"learning_rate": 5.846924467891654e-07,
"loss": 1.6982,
"step": 803
},
{
"epoch": 0.1078229772016093,
"grad_norm": 1.072949621974548,
"learning_rate": 5.846546626789727e-07,
"loss": 1.6836,
"step": 804
},
{
"epoch": 0.10795708538220831,
"grad_norm": 1.2070245013041887,
"learning_rate": 5.846168333582527e-07,
"loss": 1.6951,
"step": 805
},
{
"epoch": 0.10809119356280733,
"grad_norm": 1.1065226823941745,
"learning_rate": 5.845789588337217e-07,
"loss": 1.6581,
"step": 806
},
{
"epoch": 0.10822530174340635,
"grad_norm": 1.1493594907559954,
"learning_rate": 5.845410391121034e-07,
"loss": 1.5682,
"step": 807
},
{
"epoch": 0.10835940992400536,
"grad_norm": 1.060419028705976,
"learning_rate": 5.845030742001301e-07,
"loss": 1.6098,
"step": 808
},
{
"epoch": 0.10849351810460438,
"grad_norm": 1.0986472798667166,
"learning_rate": 5.84465064104542e-07,
"loss": 1.6998,
"step": 809
},
{
"epoch": 0.1086276262852034,
"grad_norm": 1.0780015294363108,
"learning_rate": 5.844270088320872e-07,
"loss": 1.6396,
"step": 810
},
{
"epoch": 0.10876173446580241,
"grad_norm": 1.1471597573517582,
"learning_rate": 5.843889083895219e-07,
"loss": 1.7247,
"step": 811
},
{
"epoch": 0.10889584264640143,
"grad_norm": 1.1383862809473648,
"learning_rate": 5.843507627836106e-07,
"loss": 1.6618,
"step": 812
},
{
"epoch": 0.10902995082700044,
"grad_norm": 1.1192741205184784,
"learning_rate": 5.843125720211251e-07,
"loss": 1.6551,
"step": 813
},
{
"epoch": 0.10916405900759947,
"grad_norm": 1.137804969239655,
"learning_rate": 5.84274336108846e-07,
"loss": 1.7777,
"step": 814
},
{
"epoch": 0.10929816718819849,
"grad_norm": 1.153664414743612,
"learning_rate": 5.842360550535614e-07,
"loss": 1.693,
"step": 815
},
{
"epoch": 0.1094322753687975,
"grad_norm": 1.2362947655431056,
"learning_rate": 5.841977288620676e-07,
"loss": 1.7216,
"step": 816
},
{
"epoch": 0.10956638354939652,
"grad_norm": 1.0845642638897275,
"learning_rate": 5.84159357541169e-07,
"loss": 1.704,
"step": 817
},
{
"epoch": 0.10970049172999553,
"grad_norm": 1.1373055917212407,
"learning_rate": 5.841209410976779e-07,
"loss": 1.7146,
"step": 818
},
{
"epoch": 0.10983459991059455,
"grad_norm": 1.071610572427508,
"learning_rate": 5.840824795384146e-07,
"loss": 1.6785,
"step": 819
},
{
"epoch": 0.10996870809119356,
"grad_norm": 1.1237115070149213,
"learning_rate": 5.840439728702073e-07,
"loss": 1.7022,
"step": 820
},
{
"epoch": 0.11010281627179258,
"grad_norm": 1.1135499435889078,
"learning_rate": 5.840054210998925e-07,
"loss": 1.6762,
"step": 821
},
{
"epoch": 0.1102369244523916,
"grad_norm": 1.1412142978650357,
"learning_rate": 5.839668242343147e-07,
"loss": 1.7325,
"step": 822
},
{
"epoch": 0.11037103263299061,
"grad_norm": 1.066696944750096,
"learning_rate": 5.839281822803259e-07,
"loss": 1.7209,
"step": 823
},
{
"epoch": 0.11050514081358963,
"grad_norm": 1.109425591853705,
"learning_rate": 5.838894952447866e-07,
"loss": 1.6248,
"step": 824
},
{
"epoch": 0.11063924899418864,
"grad_norm": 1.0738541935378725,
"learning_rate": 5.838507631345652e-07,
"loss": 1.6582,
"step": 825
},
{
"epoch": 0.11077335717478766,
"grad_norm": 1.4358787492291483,
"learning_rate": 5.838119859565381e-07,
"loss": 1.807,
"step": 826
},
{
"epoch": 0.11090746535538668,
"grad_norm": 1.1425108913039257,
"learning_rate": 5.837731637175898e-07,
"loss": 1.6146,
"step": 827
},
{
"epoch": 0.11104157353598569,
"grad_norm": 1.0637227390318094,
"learning_rate": 5.837342964246123e-07,
"loss": 1.6954,
"step": 828
},
{
"epoch": 0.1111756817165847,
"grad_norm": 1.1694795366123236,
"learning_rate": 5.836953840845062e-07,
"loss": 1.6337,
"step": 829
},
{
"epoch": 0.11130978989718372,
"grad_norm": 1.1776659131207758,
"learning_rate": 5.836564267041799e-07,
"loss": 1.7132,
"step": 830
},
{
"epoch": 0.11144389807778274,
"grad_norm": 1.0835328202264551,
"learning_rate": 5.836174242905497e-07,
"loss": 1.7406,
"step": 831
},
{
"epoch": 0.11157800625838177,
"grad_norm": 1.0933003960120042,
"learning_rate": 5.835783768505399e-07,
"loss": 1.6104,
"step": 832
},
{
"epoch": 0.11171211443898078,
"grad_norm": 1.075129502416788,
"learning_rate": 5.835392843910829e-07,
"loss": 1.6599,
"step": 833
},
{
"epoch": 0.1118462226195798,
"grad_norm": 1.1891418452392997,
"learning_rate": 5.835001469191191e-07,
"loss": 1.6589,
"step": 834
},
{
"epoch": 0.11198033080017882,
"grad_norm": 1.7726602578762463,
"learning_rate": 5.834609644415967e-07,
"loss": 1.8068,
"step": 835
},
{
"epoch": 0.11211443898077783,
"grad_norm": 1.1160187069875398,
"learning_rate": 5.834217369654723e-07,
"loss": 1.7302,
"step": 836
},
{
"epoch": 0.11224854716137685,
"grad_norm": 1.2586778829179404,
"learning_rate": 5.833824644977098e-07,
"loss": 1.5899,
"step": 837
},
{
"epoch": 0.11238265534197586,
"grad_norm": 1.1096559717797458,
"learning_rate": 5.833431470452818e-07,
"loss": 1.7175,
"step": 838
},
{
"epoch": 0.11251676352257488,
"grad_norm": 1.1754882099239772,
"learning_rate": 5.833037846151686e-07,
"loss": 1.6674,
"step": 839
},
{
"epoch": 0.1126508717031739,
"grad_norm": 1.030872040717494,
"learning_rate": 5.832643772143582e-07,
"loss": 1.6117,
"step": 840
},
{
"epoch": 0.11278497988377291,
"grad_norm": 1.1260356355011998,
"learning_rate": 5.832249248498472e-07,
"loss": 1.6813,
"step": 841
},
{
"epoch": 0.11291908806437193,
"grad_norm": 1.0550888868426265,
"learning_rate": 5.831854275286396e-07,
"loss": 1.6859,
"step": 842
},
{
"epoch": 0.11305319624497094,
"grad_norm": 1.165191007399385,
"learning_rate": 5.831458852577477e-07,
"loss": 1.6982,
"step": 843
},
{
"epoch": 0.11318730442556996,
"grad_norm": 1.178851685175072,
"learning_rate": 5.831062980441918e-07,
"loss": 1.6891,
"step": 844
},
{
"epoch": 0.11332141260616897,
"grad_norm": 1.173173669662085,
"learning_rate": 5.830666658949999e-07,
"loss": 1.7388,
"step": 845
},
{
"epoch": 0.11345552078676799,
"grad_norm": 1.1552209879477302,
"learning_rate": 5.830269888172083e-07,
"loss": 1.7383,
"step": 846
},
{
"epoch": 0.113589628967367,
"grad_norm": 1.0974766482142095,
"learning_rate": 5.82987266817861e-07,
"loss": 1.7139,
"step": 847
},
{
"epoch": 0.11372373714796602,
"grad_norm": 1.1314238053001549,
"learning_rate": 5.829474999040102e-07,
"loss": 1.6041,
"step": 848
},
{
"epoch": 0.11385784532856505,
"grad_norm": 1.100933720786019,
"learning_rate": 5.829076880827159e-07,
"loss": 1.7101,
"step": 849
},
{
"epoch": 0.11399195350916407,
"grad_norm": 1.1461722995944397,
"learning_rate": 5.828678313610463e-07,
"loss": 1.7009,
"step": 850
},
{
"epoch": 0.11412606168976308,
"grad_norm": 1.2722684302580665,
"learning_rate": 5.828279297460774e-07,
"loss": 1.6484,
"step": 851
},
{
"epoch": 0.1142601698703621,
"grad_norm": 1.1151947943169025,
"learning_rate": 5.82787983244893e-07,
"loss": 1.655,
"step": 852
},
{
"epoch": 0.11439427805096111,
"grad_norm": 1.1184598730723336,
"learning_rate": 5.827479918645852e-07,
"loss": 1.6165,
"step": 853
},
{
"epoch": 0.11452838623156013,
"grad_norm": 1.023276016208069,
"learning_rate": 5.827079556122542e-07,
"loss": 1.4802,
"step": 854
},
{
"epoch": 0.11466249441215914,
"grad_norm": 1.1363089821207286,
"learning_rate": 5.826678744950074e-07,
"loss": 1.7255,
"step": 855
},
{
"epoch": 0.11479660259275816,
"grad_norm": 1.1011868598006873,
"learning_rate": 5.826277485199609e-07,
"loss": 1.6958,
"step": 856
},
{
"epoch": 0.11493071077335718,
"grad_norm": 1.1338150939022813,
"learning_rate": 5.825875776942388e-07,
"loss": 1.7061,
"step": 857
},
{
"epoch": 0.11506481895395619,
"grad_norm": 1.130051416794989,
"learning_rate": 5.825473620249724e-07,
"loss": 1.7138,
"step": 858
},
{
"epoch": 0.1151989271345552,
"grad_norm": 1.0842663625693372,
"learning_rate": 5.825071015193018e-07,
"loss": 1.6059,
"step": 859
},
{
"epoch": 0.11533303531515422,
"grad_norm": 1.126331708345394,
"learning_rate": 5.824667961843746e-07,
"loss": 1.6874,
"step": 860
},
{
"epoch": 0.11546714349575324,
"grad_norm": 1.067788867144983,
"learning_rate": 5.824264460273465e-07,
"loss": 1.7211,
"step": 861
},
{
"epoch": 0.11560125167635225,
"grad_norm": 1.0567680329056464,
"learning_rate": 5.823860510553811e-07,
"loss": 1.5729,
"step": 862
},
{
"epoch": 0.11573535985695127,
"grad_norm": 1.088021498471896,
"learning_rate": 5.823456112756498e-07,
"loss": 1.6884,
"step": 863
},
{
"epoch": 0.11586946803755029,
"grad_norm": 1.1157283518569765,
"learning_rate": 5.823051266953325e-07,
"loss": 1.6806,
"step": 864
},
{
"epoch": 0.1160035762181493,
"grad_norm": 1.0681883774872867,
"learning_rate": 5.822645973216165e-07,
"loss": 1.6397,
"step": 865
},
{
"epoch": 0.11613768439874832,
"grad_norm": 1.0861783292304394,
"learning_rate": 5.822240231616973e-07,
"loss": 1.575,
"step": 866
},
{
"epoch": 0.11627179257934735,
"grad_norm": 1.068546853668492,
"learning_rate": 5.821834042227783e-07,
"loss": 1.6436,
"step": 867
},
{
"epoch": 0.11640590075994636,
"grad_norm": 1.1370891534192904,
"learning_rate": 5.821427405120708e-07,
"loss": 1.7133,
"step": 868
},
{
"epoch": 0.11654000894054538,
"grad_norm": 1.0975985479163,
"learning_rate": 5.821020320367942e-07,
"loss": 1.7395,
"step": 869
},
{
"epoch": 0.1166741171211444,
"grad_norm": 1.0979310675749658,
"learning_rate": 5.820612788041756e-07,
"loss": 1.733,
"step": 870
},
{
"epoch": 0.11680822530174341,
"grad_norm": 1.1290790783874916,
"learning_rate": 5.820204808214503e-07,
"loss": 1.5963,
"step": 871
},
{
"epoch": 0.11694233348234243,
"grad_norm": 1.0767125460282738,
"learning_rate": 5.819796380958613e-07,
"loss": 1.7139,
"step": 872
},
{
"epoch": 0.11707644166294144,
"grad_norm": 1.242641974109421,
"learning_rate": 5.819387506346598e-07,
"loss": 1.7068,
"step": 873
},
{
"epoch": 0.11721054984354046,
"grad_norm": 1.0978061234757794,
"learning_rate": 5.818978184451048e-07,
"loss": 1.625,
"step": 874
},
{
"epoch": 0.11734465802413947,
"grad_norm": 1.0887952709463755,
"learning_rate": 5.818568415344633e-07,
"loss": 1.6017,
"step": 875
},
{
"epoch": 0.11747876620473849,
"grad_norm": 1.0584442299701264,
"learning_rate": 5.818158199100101e-07,
"loss": 1.7367,
"step": 876
},
{
"epoch": 0.1176128743853375,
"grad_norm": 1.0996935525118328,
"learning_rate": 5.817747535790283e-07,
"loss": 1.6186,
"step": 877
},
{
"epoch": 0.11774698256593652,
"grad_norm": 1.1314747020843203,
"learning_rate": 5.817336425488082e-07,
"loss": 1.6249,
"step": 878
},
{
"epoch": 0.11788109074653554,
"grad_norm": 1.1919795844521832,
"learning_rate": 5.81692486826649e-07,
"loss": 1.6532,
"step": 879
},
{
"epoch": 0.11801519892713455,
"grad_norm": 1.305262723197089,
"learning_rate": 5.816512864198571e-07,
"loss": 1.5978,
"step": 880
},
{
"epoch": 0.11814930710773357,
"grad_norm": 1.1155976857853542,
"learning_rate": 5.816100413357471e-07,
"loss": 1.6797,
"step": 881
},
{
"epoch": 0.11828341528833258,
"grad_norm": 1.123108419027786,
"learning_rate": 5.815687515816415e-07,
"loss": 1.5944,
"step": 882
},
{
"epoch": 0.1184175234689316,
"grad_norm": 1.1318300431723485,
"learning_rate": 5.815274171648709e-07,
"loss": 1.6328,
"step": 883
},
{
"epoch": 0.11855163164953063,
"grad_norm": 1.1498251619378483,
"learning_rate": 5.814860380927734e-07,
"loss": 1.6131,
"step": 884
},
{
"epoch": 0.11868573983012964,
"grad_norm": 1.0940645690658886,
"learning_rate": 5.814446143726956e-07,
"loss": 1.6142,
"step": 885
},
{
"epoch": 0.11881984801072866,
"grad_norm": 1.0820516072736348,
"learning_rate": 5.814031460119914e-07,
"loss": 1.6148,
"step": 886
},
{
"epoch": 0.11895395619132768,
"grad_norm": 1.1247339726082044,
"learning_rate": 5.813616330180233e-07,
"loss": 1.7608,
"step": 887
},
{
"epoch": 0.11908806437192669,
"grad_norm": 1.3664008359044402,
"learning_rate": 5.813200753981611e-07,
"loss": 1.6969,
"step": 888
},
{
"epoch": 0.11922217255252571,
"grad_norm": 1.1603697359280436,
"learning_rate": 5.812784731597829e-07,
"loss": 1.7402,
"step": 889
},
{
"epoch": 0.11935628073312472,
"grad_norm": 1.1010475016983683,
"learning_rate": 5.812368263102746e-07,
"loss": 1.759,
"step": 890
},
{
"epoch": 0.11949038891372374,
"grad_norm": 1.1085219941083455,
"learning_rate": 5.811951348570302e-07,
"loss": 1.667,
"step": 891
},
{
"epoch": 0.11962449709432275,
"grad_norm": 1.1139382749577305,
"learning_rate": 5.811533988074512e-07,
"loss": 1.6677,
"step": 892
},
{
"epoch": 0.11975860527492177,
"grad_norm": 1.1325956159096344,
"learning_rate": 5.811116181689475e-07,
"loss": 1.7068,
"step": 893
},
{
"epoch": 0.11989271345552079,
"grad_norm": 1.0408410504808954,
"learning_rate": 5.810697929489365e-07,
"loss": 1.6708,
"step": 894
},
{
"epoch": 0.1200268216361198,
"grad_norm": 1.0658514906014669,
"learning_rate": 5.810279231548439e-07,
"loss": 1.6833,
"step": 895
},
{
"epoch": 0.12016092981671882,
"grad_norm": 1.0840346983956348,
"learning_rate": 5.80986008794103e-07,
"loss": 1.6973,
"step": 896
},
{
"epoch": 0.12029503799731783,
"grad_norm": 1.1508325943207491,
"learning_rate": 5.809440498741552e-07,
"loss": 1.7326,
"step": 897
},
{
"epoch": 0.12042914617791685,
"grad_norm": 1.0629236207923716,
"learning_rate": 5.809020464024496e-07,
"loss": 1.5428,
"step": 898
},
{
"epoch": 0.12056325435851586,
"grad_norm": 1.112200747649366,
"learning_rate": 5.808599983864435e-07,
"loss": 1.6729,
"step": 899
},
{
"epoch": 0.12069736253911488,
"grad_norm": 1.2078470991285137,
"learning_rate": 5.80817905833602e-07,
"loss": 1.738,
"step": 900
},
{
"epoch": 0.1208314707197139,
"grad_norm": 1.1190068417460075,
"learning_rate": 5.807757687513979e-07,
"loss": 1.6607,
"step": 901
},
{
"epoch": 0.12096557890031293,
"grad_norm": 1.0450615497760403,
"learning_rate": 5.807335871473122e-07,
"loss": 1.6588,
"step": 902
},
{
"epoch": 0.12109968708091194,
"grad_norm": 1.121198054415324,
"learning_rate": 5.806913610288336e-07,
"loss": 1.662,
"step": 903
},
{
"epoch": 0.12123379526151096,
"grad_norm": 1.1054682653267978,
"learning_rate": 5.806490904034589e-07,
"loss": 1.6706,
"step": 904
},
{
"epoch": 0.12136790344210997,
"grad_norm": 1.113997411395293,
"learning_rate": 5.806067752786926e-07,
"loss": 1.7632,
"step": 905
},
{
"epoch": 0.12150201162270899,
"grad_norm": 1.1613864633248003,
"learning_rate": 5.805644156620472e-07,
"loss": 1.7098,
"step": 906
},
{
"epoch": 0.121636119803308,
"grad_norm": 1.1055893873511211,
"learning_rate": 5.805220115610431e-07,
"loss": 1.7946,
"step": 907
},
{
"epoch": 0.12177022798390702,
"grad_norm": 1.059537639783976,
"learning_rate": 5.804795629832085e-07,
"loss": 1.6377,
"step": 908
},
{
"epoch": 0.12190433616450604,
"grad_norm": 1.075756870276535,
"learning_rate": 5.804370699360796e-07,
"loss": 1.6709,
"step": 909
},
{
"epoch": 0.12203844434510505,
"grad_norm": 1.0951662603881447,
"learning_rate": 5.803945324272006e-07,
"loss": 1.6114,
"step": 910
},
{
"epoch": 0.12217255252570407,
"grad_norm": 1.0835170338297386,
"learning_rate": 5.803519504641234e-07,
"loss": 1.6945,
"step": 911
},
{
"epoch": 0.12230666070630308,
"grad_norm": 1.188508933084379,
"learning_rate": 5.803093240544077e-07,
"loss": 1.7176,
"step": 912
},
{
"epoch": 0.1224407688869021,
"grad_norm": 1.0574940351976068,
"learning_rate": 5.802666532056215e-07,
"loss": 1.6449,
"step": 913
},
{
"epoch": 0.12257487706750111,
"grad_norm": 1.1011954691706793,
"learning_rate": 5.802239379253403e-07,
"loss": 1.7403,
"step": 914
},
{
"epoch": 0.12270898524810013,
"grad_norm": 1.05289982245001,
"learning_rate": 5.801811782211476e-07,
"loss": 1.7121,
"step": 915
},
{
"epoch": 0.12284309342869915,
"grad_norm": 1.1247742251938873,
"learning_rate": 5.801383741006349e-07,
"loss": 1.6904,
"step": 916
},
{
"epoch": 0.12297720160929816,
"grad_norm": 1.1060690034689273,
"learning_rate": 5.800955255714014e-07,
"loss": 1.5423,
"step": 917
},
{
"epoch": 0.12311130978989718,
"grad_norm": 1.17690980567079,
"learning_rate": 5.800526326410544e-07,
"loss": 1.6638,
"step": 918
},
{
"epoch": 0.12324541797049621,
"grad_norm": 1.0758724475892376,
"learning_rate": 5.800096953172088e-07,
"loss": 1.7136,
"step": 919
},
{
"epoch": 0.12337952615109522,
"grad_norm": 1.0847412248840858,
"learning_rate": 5.799667136074878e-07,
"loss": 1.7712,
"step": 920
},
{
"epoch": 0.12351363433169424,
"grad_norm": 1.1331387033738405,
"learning_rate": 5.799236875195219e-07,
"loss": 1.664,
"step": 921
},
{
"epoch": 0.12364774251229325,
"grad_norm": 1.3262309930515026,
"learning_rate": 5.798806170609502e-07,
"loss": 1.6546,
"step": 922
},
{
"epoch": 0.12378185069289227,
"grad_norm": 1.1280111604345993,
"learning_rate": 5.79837502239419e-07,
"loss": 1.6623,
"step": 923
},
{
"epoch": 0.12391595887349129,
"grad_norm": 1.1001484560762704,
"learning_rate": 5.797943430625828e-07,
"loss": 1.6743,
"step": 924
},
{
"epoch": 0.1240500670540903,
"grad_norm": 1.1051963249243846,
"learning_rate": 5.79751139538104e-07,
"loss": 1.6542,
"step": 925
},
{
"epoch": 0.12418417523468932,
"grad_norm": 2.096743814606382,
"learning_rate": 5.797078916736527e-07,
"loss": 1.7618,
"step": 926
},
{
"epoch": 0.12431828341528833,
"grad_norm": 1.1918807746678728,
"learning_rate": 5.79664599476907e-07,
"loss": 1.7489,
"step": 927
},
{
"epoch": 0.12445239159588735,
"grad_norm": 1.2255902304289672,
"learning_rate": 5.79621262955553e-07,
"loss": 1.805,
"step": 928
},
{
"epoch": 0.12458649977648636,
"grad_norm": 1.1112711388204457,
"learning_rate": 5.795778821172845e-07,
"loss": 1.6535,
"step": 929
},
{
"epoch": 0.12472060795708538,
"grad_norm": 1.15632851861526,
"learning_rate": 5.79534456969803e-07,
"loss": 1.7674,
"step": 930
},
{
"epoch": 0.1248547161376844,
"grad_norm": 1.1364857063021152,
"learning_rate": 5.794909875208182e-07,
"loss": 1.6668,
"step": 931
},
{
"epoch": 0.12498882431828341,
"grad_norm": 1.1554164021245972,
"learning_rate": 5.794474737780474e-07,
"loss": 1.6862,
"step": 932
},
{
"epoch": 0.12512293249888243,
"grad_norm": 1.1360253650713825,
"learning_rate": 5.79403915749216e-07,
"loss": 1.6811,
"step": 933
},
{
"epoch": 0.12525704067948146,
"grad_norm": 1.066412847829235,
"learning_rate": 5.793603134420571e-07,
"loss": 1.6562,
"step": 934
},
{
"epoch": 0.12539114886008046,
"grad_norm": 1.081900817528408,
"learning_rate": 5.793166668643118e-07,
"loss": 1.6319,
"step": 935
},
{
"epoch": 0.1255252570406795,
"grad_norm": 1.12430422704736,
"learning_rate": 5.792729760237288e-07,
"loss": 1.6679,
"step": 936
},
{
"epoch": 0.1256593652212785,
"grad_norm": 1.1555451362888864,
"learning_rate": 5.79229240928065e-07,
"loss": 1.6272,
"step": 937
},
{
"epoch": 0.12579347340187752,
"grad_norm": 1.1120423598959,
"learning_rate": 5.791854615850848e-07,
"loss": 1.7271,
"step": 938
},
{
"epoch": 0.12592758158247652,
"grad_norm": 1.099822375040922,
"learning_rate": 5.791416380025607e-07,
"loss": 1.6762,
"step": 939
},
{
"epoch": 0.12606168976307555,
"grad_norm": 1.1055384980174303,
"learning_rate": 5.79097770188273e-07,
"loss": 1.6526,
"step": 940
},
{
"epoch": 0.12619579794367455,
"grad_norm": 1.1135160613742192,
"learning_rate": 5.7905385815001e-07,
"loss": 1.7112,
"step": 941
},
{
"epoch": 0.12632990612427358,
"grad_norm": 1.172524893436665,
"learning_rate": 5.790099018955674e-07,
"loss": 1.6629,
"step": 942
},
{
"epoch": 0.12646401430487259,
"grad_norm": 1.143908651612981,
"learning_rate": 5.789659014327492e-07,
"loss": 1.6004,
"step": 943
},
{
"epoch": 0.12659812248547161,
"grad_norm": 1.0950798365706262,
"learning_rate": 5.789218567693672e-07,
"loss": 1.6794,
"step": 944
},
{
"epoch": 0.12673223066607062,
"grad_norm": 1.0865150988933485,
"learning_rate": 5.788777679132408e-07,
"loss": 1.7733,
"step": 945
},
{
"epoch": 0.12686633884666965,
"grad_norm": 1.081699940619205,
"learning_rate": 5.788336348721972e-07,
"loss": 1.6587,
"step": 946
},
{
"epoch": 0.12700044702726868,
"grad_norm": 1.0733926398236942,
"learning_rate": 5.787894576540721e-07,
"loss": 1.6461,
"step": 947
},
{
"epoch": 0.12713455520786768,
"grad_norm": 1.126195585933314,
"learning_rate": 5.787452362667083e-07,
"loss": 1.6838,
"step": 948
},
{
"epoch": 0.1272686633884667,
"grad_norm": 1.1329864382691732,
"learning_rate": 5.787009707179567e-07,
"loss": 1.6329,
"step": 949
},
{
"epoch": 0.1274027715690657,
"grad_norm": 1.1004395022968605,
"learning_rate": 5.786566610156759e-07,
"loss": 1.7147,
"step": 950
},
{
"epoch": 0.12753687974966474,
"grad_norm": 1.0391080576189866,
"learning_rate": 5.78612307167733e-07,
"loss": 1.6315,
"step": 951
},
{
"epoch": 0.12767098793026374,
"grad_norm": 1.0855474578853979,
"learning_rate": 5.78567909182002e-07,
"loss": 1.7127,
"step": 952
},
{
"epoch": 0.12780509611086277,
"grad_norm": 1.1433214364150983,
"learning_rate": 5.785234670663652e-07,
"loss": 1.7042,
"step": 953
},
{
"epoch": 0.12793920429146177,
"grad_norm": 1.0903898099360794,
"learning_rate": 5.784789808287129e-07,
"loss": 1.749,
"step": 954
},
{
"epoch": 0.1280733124720608,
"grad_norm": 1.1462757739762268,
"learning_rate": 5.784344504769428e-07,
"loss": 1.7118,
"step": 955
},
{
"epoch": 0.1282074206526598,
"grad_norm": 1.0944948131751315,
"learning_rate": 5.783898760189609e-07,
"loss": 1.7308,
"step": 956
},
{
"epoch": 0.12834152883325883,
"grad_norm": 1.0898739853739683,
"learning_rate": 5.783452574626806e-07,
"loss": 1.5947,
"step": 957
},
{
"epoch": 0.12847563701385784,
"grad_norm": 1.1070871512716438,
"learning_rate": 5.783005948160236e-07,
"loss": 1.7032,
"step": 958
},
{
"epoch": 0.12860974519445686,
"grad_norm": 1.1173517977218599,
"learning_rate": 5.782558880869187e-07,
"loss": 1.76,
"step": 959
},
{
"epoch": 0.12874385337505587,
"grad_norm": 1.0784753543720036,
"learning_rate": 5.782111372833035e-07,
"loss": 1.6817,
"step": 960
},
{
"epoch": 0.1288779615556549,
"grad_norm": 1.099729300157914,
"learning_rate": 5.781663424131225e-07,
"loss": 1.5885,
"step": 961
},
{
"epoch": 0.1290120697362539,
"grad_norm": 1.1053155402387764,
"learning_rate": 5.781215034843288e-07,
"loss": 1.649,
"step": 962
},
{
"epoch": 0.12914617791685293,
"grad_norm": 1.0498243431495933,
"learning_rate": 5.780766205048826e-07,
"loss": 1.6,
"step": 963
},
{
"epoch": 0.12928028609745196,
"grad_norm": 1.0650679197683777,
"learning_rate": 5.780316934827524e-07,
"loss": 1.7031,
"step": 964
},
{
"epoch": 0.12941439427805096,
"grad_norm": 1.2041255427364985,
"learning_rate": 5.779867224259144e-07,
"loss": 1.7187,
"step": 965
},
{
"epoch": 0.12954850245865,
"grad_norm": 1.0678692273869028,
"learning_rate": 5.779417073423526e-07,
"loss": 1.6825,
"step": 966
},
{
"epoch": 0.129682610639249,
"grad_norm": 1.1199711834538628,
"learning_rate": 5.778966482400589e-07,
"loss": 1.6826,
"step": 967
},
{
"epoch": 0.12981671881984802,
"grad_norm": 1.3086828320370905,
"learning_rate": 5.778515451270329e-07,
"loss": 1.6527,
"step": 968
},
{
"epoch": 0.12995082700044702,
"grad_norm": 1.1283872527591725,
"learning_rate": 5.77806398011282e-07,
"loss": 1.6979,
"step": 969
},
{
"epoch": 0.13008493518104605,
"grad_norm": 1.6891339086777561,
"learning_rate": 5.777612069008215e-07,
"loss": 1.6052,
"step": 970
},
{
"epoch": 0.13021904336164505,
"grad_norm": 1.0995419197341152,
"learning_rate": 5.777159718036745e-07,
"loss": 1.6741,
"step": 971
},
{
"epoch": 0.13035315154224408,
"grad_norm": 1.0826527648905109,
"learning_rate": 5.776706927278718e-07,
"loss": 1.7414,
"step": 972
},
{
"epoch": 0.13048725972284309,
"grad_norm": 1.1749450180853513,
"learning_rate": 5.776253696814523e-07,
"loss": 1.7253,
"step": 973
},
{
"epoch": 0.13062136790344211,
"grad_norm": 1.1522644681889058,
"learning_rate": 5.775800026724622e-07,
"loss": 1.7109,
"step": 974
},
{
"epoch": 0.13075547608404112,
"grad_norm": 1.1287433508002416,
"learning_rate": 5.775345917089561e-07,
"loss": 1.7602,
"step": 975
},
{
"epoch": 0.13088958426464015,
"grad_norm": 1.1367208391544785,
"learning_rate": 5.77489136798996e-07,
"loss": 1.7096,
"step": 976
},
{
"epoch": 0.13102369244523915,
"grad_norm": 1.093651839491161,
"learning_rate": 5.774436379506516e-07,
"loss": 1.7313,
"step": 977
},
{
"epoch": 0.13115780062583818,
"grad_norm": 1.1158114646345074,
"learning_rate": 5.773980951720009e-07,
"loss": 1.7152,
"step": 978
},
{
"epoch": 0.13129190880643718,
"grad_norm": 1.1405133501951592,
"learning_rate": 5.773525084711293e-07,
"loss": 1.6721,
"step": 979
},
{
"epoch": 0.1314260169870362,
"grad_norm": 1.0757304379815442,
"learning_rate": 5.773068778561302e-07,
"loss": 1.64,
"step": 980
},
{
"epoch": 0.13156012516763524,
"grad_norm": 1.0607235063703084,
"learning_rate": 5.772612033351045e-07,
"loss": 1.7254,
"step": 981
},
{
"epoch": 0.13169423334823424,
"grad_norm": 1.0583251896426324,
"learning_rate": 5.772154849161613e-07,
"loss": 1.687,
"step": 982
},
{
"epoch": 0.13182834152883327,
"grad_norm": 1.098628320814992,
"learning_rate": 5.771697226074171e-07,
"loss": 1.635,
"step": 983
},
{
"epoch": 0.13196244970943227,
"grad_norm": 1.1805474022437217,
"learning_rate": 5.771239164169966e-07,
"loss": 1.6698,
"step": 984
},
{
"epoch": 0.1320965578900313,
"grad_norm": 1.0875587476789947,
"learning_rate": 5.77078066353032e-07,
"loss": 1.6354,
"step": 985
},
{
"epoch": 0.1322306660706303,
"grad_norm": 1.2112176511625345,
"learning_rate": 5.770321724236633e-07,
"loss": 1.7872,
"step": 986
},
{
"epoch": 0.13236477425122933,
"grad_norm": 1.2350020465740164,
"learning_rate": 5.769862346370384e-07,
"loss": 1.7646,
"step": 987
},
{
"epoch": 0.13249888243182834,
"grad_norm": 1.1782226253464931,
"learning_rate": 5.769402530013128e-07,
"loss": 1.7215,
"step": 988
},
{
"epoch": 0.13263299061242736,
"grad_norm": 1.0995226058236465,
"learning_rate": 5.768942275246503e-07,
"loss": 1.6472,
"step": 989
},
{
"epoch": 0.13276709879302637,
"grad_norm": 1.1354276853120844,
"learning_rate": 5.768481582152218e-07,
"loss": 1.7206,
"step": 990
},
{
"epoch": 0.1329012069736254,
"grad_norm": 1.1299465711204602,
"learning_rate": 5.768020450812064e-07,
"loss": 1.6917,
"step": 991
},
{
"epoch": 0.1330353151542244,
"grad_norm": 1.0767689418910376,
"learning_rate": 5.767558881307906e-07,
"loss": 1.6643,
"step": 992
},
{
"epoch": 0.13316942333482343,
"grad_norm": 1.1138902596082148,
"learning_rate": 5.767096873721693e-07,
"loss": 1.7642,
"step": 993
},
{
"epoch": 0.13330353151542243,
"grad_norm": 1.1056642001660029,
"learning_rate": 5.766634428135447e-07,
"loss": 1.689,
"step": 994
},
{
"epoch": 0.13343763969602146,
"grad_norm": 1.0482595089911335,
"learning_rate": 5.76617154463127e-07,
"loss": 1.635,
"step": 995
},
{
"epoch": 0.13357174787662046,
"grad_norm": 1.0936790475077613,
"learning_rate": 5.765708223291338e-07,
"loss": 1.6614,
"step": 996
},
{
"epoch": 0.1337058560572195,
"grad_norm": 1.1904352264236198,
"learning_rate": 5.765244464197911e-07,
"loss": 1.6631,
"step": 997
},
{
"epoch": 0.13383996423781852,
"grad_norm": 1.1399324270789883,
"learning_rate": 5.76478026743332e-07,
"loss": 1.6956,
"step": 998
},
{
"epoch": 0.13397407241841752,
"grad_norm": 1.0631541550252919,
"learning_rate": 5.76431563307998e-07,
"loss": 1.6357,
"step": 999
},
{
"epoch": 0.13410818059901655,
"grad_norm": 2.7939617071812304,
"learning_rate": 5.763850561220378e-07,
"loss": 1.7513,
"step": 1000
},
{
"epoch": 0.13424228877961555,
"grad_norm": 1.1023053650764323,
"learning_rate": 5.763385051937082e-07,
"loss": 1.6986,
"step": 1001
},
{
"epoch": 0.13437639696021458,
"grad_norm": 1.1134127723095217,
"learning_rate": 5.762919105312739e-07,
"loss": 1.6972,
"step": 1002
},
{
"epoch": 0.13451050514081359,
"grad_norm": 1.3206325684664686,
"learning_rate": 5.762452721430068e-07,
"loss": 1.6561,
"step": 1003
},
{
"epoch": 0.13464461332141262,
"grad_norm": 1.1017815335316827,
"learning_rate": 5.761985900371871e-07,
"loss": 1.6294,
"step": 1004
},
{
"epoch": 0.13477872150201162,
"grad_norm": 1.091998126330244,
"learning_rate": 5.761518642221027e-07,
"loss": 1.6645,
"step": 1005
},
{
"epoch": 0.13491282968261065,
"grad_norm": 1.1390065790034687,
"learning_rate": 5.76105094706049e-07,
"loss": 1.6634,
"step": 1006
},
{
"epoch": 0.13504693786320965,
"grad_norm": 1.1165938666136697,
"learning_rate": 5.760582814973294e-07,
"loss": 1.6884,
"step": 1007
},
{
"epoch": 0.13518104604380868,
"grad_norm": 1.1265961333800854,
"learning_rate": 5.760114246042548e-07,
"loss": 1.581,
"step": 1008
},
{
"epoch": 0.13531515422440768,
"grad_norm": 1.1108402335230954,
"learning_rate": 5.759645240351442e-07,
"loss": 1.6948,
"step": 1009
},
{
"epoch": 0.1354492624050067,
"grad_norm": 1.1540406201851725,
"learning_rate": 5.75917579798324e-07,
"loss": 1.6816,
"step": 1010
},
{
"epoch": 0.1355833705856057,
"grad_norm": 1.0776760932575635,
"learning_rate": 5.758705919021285e-07,
"loss": 1.6455,
"step": 1011
},
{
"epoch": 0.13571747876620474,
"grad_norm": 1.1626622938941558,
"learning_rate": 5.758235603549001e-07,
"loss": 1.7679,
"step": 1012
},
{
"epoch": 0.13585158694680374,
"grad_norm": 1.187443307470314,
"learning_rate": 5.757764851649882e-07,
"loss": 1.6258,
"step": 1013
},
{
"epoch": 0.13598569512740277,
"grad_norm": 1.1483737298574974,
"learning_rate": 5.757293663407507e-07,
"loss": 1.7531,
"step": 1014
},
{
"epoch": 0.13611980330800177,
"grad_norm": 1.108423451892347,
"learning_rate": 5.756822038905527e-07,
"loss": 1.5847,
"step": 1015
},
{
"epoch": 0.1362539114886008,
"grad_norm": 1.056521665647446,
"learning_rate": 5.756349978227674e-07,
"loss": 1.6545,
"step": 1016
},
{
"epoch": 0.13638801966919983,
"grad_norm": 1.122523040636454,
"learning_rate": 5.755877481457756e-07,
"loss": 1.6762,
"step": 1017
},
{
"epoch": 0.13652212784979884,
"grad_norm": 1.1104212906292141,
"learning_rate": 5.755404548679657e-07,
"loss": 1.6761,
"step": 1018
},
{
"epoch": 0.13665623603039787,
"grad_norm": 1.0971062205375117,
"learning_rate": 5.75493117997734e-07,
"loss": 1.6676,
"step": 1019
},
{
"epoch": 0.13679034421099687,
"grad_norm": 1.1923600261259284,
"learning_rate": 5.754457375434848e-07,
"loss": 1.6966,
"step": 1020
},
{
"epoch": 0.1369244523915959,
"grad_norm": 1.1577052085464195,
"learning_rate": 5.753983135136295e-07,
"loss": 1.7123,
"step": 1021
},
{
"epoch": 0.1370585605721949,
"grad_norm": 1.1404232349413184,
"learning_rate": 5.753508459165879e-07,
"loss": 1.703,
"step": 1022
},
{
"epoch": 0.13719266875279393,
"grad_norm": 1.392333260935911,
"learning_rate": 5.75303334760787e-07,
"loss": 1.7096,
"step": 1023
},
{
"epoch": 0.13732677693339293,
"grad_norm": 1.113423870991827,
"learning_rate": 5.75255780054662e-07,
"loss": 1.7556,
"step": 1024
},
{
"epoch": 0.13746088511399196,
"grad_norm": 1.0653465618827531,
"learning_rate": 5.752081818066555e-07,
"loss": 1.7324,
"step": 1025
},
{
"epoch": 0.13759499329459096,
"grad_norm": 1.0145309694174296,
"learning_rate": 5.751605400252179e-07,
"loss": 1.684,
"step": 1026
},
{
"epoch": 0.13772910147519,
"grad_norm": 1.1507242589279925,
"learning_rate": 5.751128547188073e-07,
"loss": 1.7363,
"step": 1027
},
{
"epoch": 0.137863209655789,
"grad_norm": 1.1602441710831857,
"learning_rate": 5.750651258958897e-07,
"loss": 1.6452,
"step": 1028
},
{
"epoch": 0.13799731783638802,
"grad_norm": 1.0450164574336993,
"learning_rate": 5.750173535649387e-07,
"loss": 1.6581,
"step": 1029
},
{
"epoch": 0.13813142601698702,
"grad_norm": 1.1152601638616617,
"learning_rate": 5.749695377344356e-07,
"loss": 1.7178,
"step": 1030
},
{
"epoch": 0.13826553419758605,
"grad_norm": 1.1109479531814108,
"learning_rate": 5.749216784128695e-07,
"loss": 1.6318,
"step": 1031
},
{
"epoch": 0.13839964237818506,
"grad_norm": 1.1171173194344595,
"learning_rate": 5.748737756087372e-07,
"loss": 1.7563,
"step": 1032
},
{
"epoch": 0.13853375055878409,
"grad_norm": 1.1229721774030046,
"learning_rate": 5.74825829330543e-07,
"loss": 1.6557,
"step": 1033
},
{
"epoch": 0.13866785873938312,
"grad_norm": 1.0610467262170575,
"learning_rate": 5.747778395867995e-07,
"loss": 1.5954,
"step": 1034
},
{
"epoch": 0.13880196691998212,
"grad_norm": 1.057400993985582,
"learning_rate": 5.747298063860264e-07,
"loss": 1.6836,
"step": 1035
},
{
"epoch": 0.13893607510058115,
"grad_norm": 1.2946727429654457,
"learning_rate": 5.746817297367512e-07,
"loss": 1.7718,
"step": 1036
},
{
"epoch": 0.13907018328118015,
"grad_norm": 1.0793836410907007,
"learning_rate": 5.746336096475097e-07,
"loss": 1.6192,
"step": 1037
},
{
"epoch": 0.13920429146177918,
"grad_norm": 1.0456487983417475,
"learning_rate": 5.745854461268445e-07,
"loss": 1.6997,
"step": 1038
},
{
"epoch": 0.13933839964237818,
"grad_norm": 1.0783776132275518,
"learning_rate": 5.745372391833066e-07,
"loss": 1.5643,
"step": 1039
},
{
"epoch": 0.1394725078229772,
"grad_norm": 1.1073544797133057,
"learning_rate": 5.744889888254545e-07,
"loss": 1.7453,
"step": 1040
},
{
"epoch": 0.1396066160035762,
"grad_norm": 1.0897237578625294,
"learning_rate": 5.744406950618546e-07,
"loss": 1.7507,
"step": 1041
},
{
"epoch": 0.13974072418417524,
"grad_norm": 1.1334242880215313,
"learning_rate": 5.743923579010804e-07,
"loss": 1.5952,
"step": 1042
},
{
"epoch": 0.13987483236477424,
"grad_norm": 1.0794611740077888,
"learning_rate": 5.743439773517138e-07,
"loss": 1.6699,
"step": 1043
},
{
"epoch": 0.14000894054537327,
"grad_norm": 1.2221425859227393,
"learning_rate": 5.742955534223441e-07,
"loss": 1.6667,
"step": 1044
},
{
"epoch": 0.14014304872597227,
"grad_norm": 1.0734586645398891,
"learning_rate": 5.742470861215682e-07,
"loss": 1.7595,
"step": 1045
},
{
"epoch": 0.1402771569065713,
"grad_norm": 1.1044082425274806,
"learning_rate": 5.74198575457991e-07,
"loss": 1.6741,
"step": 1046
},
{
"epoch": 0.1404112650871703,
"grad_norm": 1.114278005814131,
"learning_rate": 5.741500214402247e-07,
"loss": 1.6869,
"step": 1047
},
{
"epoch": 0.14054537326776934,
"grad_norm": 1.1185672447220645,
"learning_rate": 5.741014240768896e-07,
"loss": 1.7676,
"step": 1048
},
{
"epoch": 0.14067948144836834,
"grad_norm": 1.1307460519899954,
"learning_rate": 5.740527833766135e-07,
"loss": 1.7232,
"step": 1049
},
{
"epoch": 0.14081358962896737,
"grad_norm": 1.1013230366573936,
"learning_rate": 5.740040993480318e-07,
"loss": 1.7287,
"step": 1050
},
{
"epoch": 0.1409476978095664,
"grad_norm": 1.2887563539916567,
"learning_rate": 5.739553719997877e-07,
"loss": 1.6725,
"step": 1051
},
{
"epoch": 0.1410818059901654,
"grad_norm": 1.128200473385445,
"learning_rate": 5.739066013405322e-07,
"loss": 1.7193,
"step": 1052
},
{
"epoch": 0.14121591417076443,
"grad_norm": 1.0948929309224316,
"learning_rate": 5.738577873789237e-07,
"loss": 1.6993,
"step": 1053
},
{
"epoch": 0.14135002235136343,
"grad_norm": 1.0842896614577642,
"learning_rate": 5.738089301236286e-07,
"loss": 1.7045,
"step": 1054
},
{
"epoch": 0.14148413053196246,
"grad_norm": 1.0699301937780477,
"learning_rate": 5.73760029583321e-07,
"loss": 1.7216,
"step": 1055
},
{
"epoch": 0.14161823871256146,
"grad_norm": 1.0958889223597748,
"learning_rate": 5.737110857666822e-07,
"loss": 1.6649,
"step": 1056
},
{
"epoch": 0.1417523468931605,
"grad_norm": 1.0656247406409773,
"learning_rate": 5.736620986824017e-07,
"loss": 1.683,
"step": 1057
},
{
"epoch": 0.1418864550737595,
"grad_norm": 1.2444649158517036,
"learning_rate": 5.736130683391765e-07,
"loss": 1.6188,
"step": 1058
},
{
"epoch": 0.14202056325435852,
"grad_norm": 1.0989443966595032,
"learning_rate": 5.735639947457113e-07,
"loss": 1.7038,
"step": 1059
},
{
"epoch": 0.14215467143495752,
"grad_norm": 1.142667824771637,
"learning_rate": 5.735148779107184e-07,
"loss": 1.6156,
"step": 1060
},
{
"epoch": 0.14228877961555655,
"grad_norm": 1.1299828935757683,
"learning_rate": 5.734657178429179e-07,
"loss": 1.6754,
"step": 1061
},
{
"epoch": 0.14242288779615556,
"grad_norm": 1.0986771884553144,
"learning_rate": 5.734165145510375e-07,
"loss": 1.6201,
"step": 1062
},
{
"epoch": 0.14255699597675459,
"grad_norm": 1.0853274840023213,
"learning_rate": 5.733672680438124e-07,
"loss": 1.6885,
"step": 1063
},
{
"epoch": 0.1426911041573536,
"grad_norm": 1.0820811488797877,
"learning_rate": 5.73317978329986e-07,
"loss": 1.7995,
"step": 1064
},
{
"epoch": 0.14282521233795262,
"grad_norm": 1.1295149364952306,
"learning_rate": 5.732686454183087e-07,
"loss": 1.6925,
"step": 1065
},
{
"epoch": 0.14295932051855162,
"grad_norm": 1.057888764325057,
"learning_rate": 5.732192693175391e-07,
"loss": 1.6412,
"step": 1066
},
{
"epoch": 0.14309342869915065,
"grad_norm": 1.098616962497695,
"learning_rate": 5.731698500364434e-07,
"loss": 1.6271,
"step": 1067
},
{
"epoch": 0.14322753687974968,
"grad_norm": 1.2745609637830848,
"learning_rate": 5.731203875837949e-07,
"loss": 1.671,
"step": 1068
},
{
"epoch": 0.14336164506034868,
"grad_norm": 1.120730846705753,
"learning_rate": 5.730708819683753e-07,
"loss": 1.7433,
"step": 1069
},
{
"epoch": 0.1434957532409477,
"grad_norm": 1.1177693123454027,
"learning_rate": 5.730213331989736e-07,
"loss": 1.7291,
"step": 1070
},
{
"epoch": 0.1436298614215467,
"grad_norm": 1.0910765331643333,
"learning_rate": 5.729717412843866e-07,
"loss": 1.6739,
"step": 1071
},
{
"epoch": 0.14376396960214574,
"grad_norm": 1.1741168573690484,
"learning_rate": 5.729221062334186e-07,
"loss": 1.7401,
"step": 1072
},
{
"epoch": 0.14389807778274474,
"grad_norm": 1.2230565196681809,
"learning_rate": 5.728724280548815e-07,
"loss": 1.6466,
"step": 1073
},
{
"epoch": 0.14403218596334377,
"grad_norm": 1.075125807457348,
"learning_rate": 5.728227067575953e-07,
"loss": 1.6632,
"step": 1074
},
{
"epoch": 0.14416629414394277,
"grad_norm": 1.0629310683077087,
"learning_rate": 5.727729423503871e-07,
"loss": 1.6456,
"step": 1075
},
{
"epoch": 0.1443004023245418,
"grad_norm": 1.131277162697691,
"learning_rate": 5.72723134842092e-07,
"loss": 1.7069,
"step": 1076
},
{
"epoch": 0.1444345105051408,
"grad_norm": 1.4319225703993534,
"learning_rate": 5.726732842415527e-07,
"loss": 1.7104,
"step": 1077
},
{
"epoch": 0.14456861868573984,
"grad_norm": 1.1218543441609072,
"learning_rate": 5.726233905576194e-07,
"loss": 1.8235,
"step": 1078
},
{
"epoch": 0.14470272686633884,
"grad_norm": 1.0682688173779038,
"learning_rate": 5.725734537991502e-07,
"loss": 1.7334,
"step": 1079
},
{
"epoch": 0.14483683504693787,
"grad_norm": 1.0513899411618064,
"learning_rate": 5.725234739750106e-07,
"loss": 1.564,
"step": 1080
},
{
"epoch": 0.14497094322753687,
"grad_norm": 1.073556864405118,
"learning_rate": 5.724734510940738e-07,
"loss": 1.6191,
"step": 1081
},
{
"epoch": 0.1451050514081359,
"grad_norm": 1.1272658425201874,
"learning_rate": 5.724233851652208e-07,
"loss": 1.5812,
"step": 1082
},
{
"epoch": 0.1452391595887349,
"grad_norm": 1.1649864304286308,
"learning_rate": 5.723732761973399e-07,
"loss": 1.7974,
"step": 1083
},
{
"epoch": 0.14537326776933393,
"grad_norm": 1.1842565824330795,
"learning_rate": 5.723231241993277e-07,
"loss": 1.642,
"step": 1084
},
{
"epoch": 0.14550737594993293,
"grad_norm": 1.1226873500626315,
"learning_rate": 5.722729291800877e-07,
"loss": 1.648,
"step": 1085
},
{
"epoch": 0.14564148413053196,
"grad_norm": 1.074175742058312,
"learning_rate": 5.722226911485315e-07,
"loss": 1.6477,
"step": 1086
},
{
"epoch": 0.145775592311131,
"grad_norm": 1.6414796585857712,
"learning_rate": 5.721724101135781e-07,
"loss": 1.6099,
"step": 1087
},
{
"epoch": 0.14590970049173,
"grad_norm": 1.1490676419596029,
"learning_rate": 5.721220860841543e-07,
"loss": 1.5671,
"step": 1088
},
{
"epoch": 0.14604380867232902,
"grad_norm": 1.0434774110585503,
"learning_rate": 5.720717190691943e-07,
"loss": 1.6001,
"step": 1089
},
{
"epoch": 0.14617791685292802,
"grad_norm": 1.0806260779363936,
"learning_rate": 5.720213090776403e-07,
"loss": 1.7541,
"step": 1090
},
{
"epoch": 0.14631202503352705,
"grad_norm": 1.1814630509058974,
"learning_rate": 5.719708561184417e-07,
"loss": 1.6864,
"step": 1091
},
{
"epoch": 0.14644613321412606,
"grad_norm": 1.0965207690798646,
"learning_rate": 5.719203602005559e-07,
"loss": 1.7179,
"step": 1092
},
{
"epoch": 0.14658024139472509,
"grad_norm": 1.187634257937833,
"learning_rate": 5.718698213329479e-07,
"loss": 1.5889,
"step": 1093
},
{
"epoch": 0.1467143495753241,
"grad_norm": 1.151719981823989,
"learning_rate": 5.718192395245899e-07,
"loss": 1.6503,
"step": 1094
},
{
"epoch": 0.14684845775592312,
"grad_norm": 1.0407283688373252,
"learning_rate": 5.717686147844622e-07,
"loss": 1.5976,
"step": 1095
},
{
"epoch": 0.14698256593652212,
"grad_norm": 1.0743575974553181,
"learning_rate": 5.717179471215527e-07,
"loss": 1.7028,
"step": 1096
},
{
"epoch": 0.14711667411712115,
"grad_norm": 1.080606301144591,
"learning_rate": 5.716672365448564e-07,
"loss": 1.6827,
"step": 1097
},
{
"epoch": 0.14725078229772015,
"grad_norm": 1.0807596555370267,
"learning_rate": 5.716164830633764e-07,
"loss": 1.6778,
"step": 1098
},
{
"epoch": 0.14738489047831918,
"grad_norm": 1.1284745845133346,
"learning_rate": 5.715656866861234e-07,
"loss": 1.6209,
"step": 1099
},
{
"epoch": 0.14751899865891818,
"grad_norm": 0.989581549531516,
"learning_rate": 5.715148474221156e-07,
"loss": 1.5879,
"step": 1100
},
{
"epoch": 0.1476531068395172,
"grad_norm": 1.1254043833078187,
"learning_rate": 5.714639652803788e-07,
"loss": 1.6834,
"step": 1101
},
{
"epoch": 0.1477872150201162,
"grad_norm": 1.0789006249002853,
"learning_rate": 5.714130402699465e-07,
"loss": 1.6314,
"step": 1102
},
{
"epoch": 0.14792132320071524,
"grad_norm": 1.0792687942782158,
"learning_rate": 5.713620723998597e-07,
"loss": 1.7229,
"step": 1103
},
{
"epoch": 0.14805543138131427,
"grad_norm": 1.1190452519207015,
"learning_rate": 5.71311061679167e-07,
"loss": 1.6851,
"step": 1104
},
{
"epoch": 0.14818953956191327,
"grad_norm": 1.1240598043365235,
"learning_rate": 5.712600081169248e-07,
"loss": 1.6486,
"step": 1105
},
{
"epoch": 0.1483236477425123,
"grad_norm": 1.110168533453958,
"learning_rate": 5.71208911722197e-07,
"loss": 1.651,
"step": 1106
},
{
"epoch": 0.1484577559231113,
"grad_norm": 1.0688369448448625,
"learning_rate": 5.71157772504055e-07,
"loss": 1.709,
"step": 1107
},
{
"epoch": 0.14859186410371034,
"grad_norm": 1.1187107525701387,
"learning_rate": 5.711065904715777e-07,
"loss": 1.7167,
"step": 1108
},
{
"epoch": 0.14872597228430934,
"grad_norm": 1.1397259364080825,
"learning_rate": 5.710553656338521e-07,
"loss": 1.6975,
"step": 1109
},
{
"epoch": 0.14886008046490837,
"grad_norm": 1.1590128512082682,
"learning_rate": 5.710040979999723e-07,
"loss": 1.7414,
"step": 1110
},
{
"epoch": 0.14899418864550737,
"grad_norm": 1.167811852838392,
"learning_rate": 5.709527875790403e-07,
"loss": 1.6626,
"step": 1111
},
{
"epoch": 0.1491282968261064,
"grad_norm": 1.0973271552840278,
"learning_rate": 5.709014343801655e-07,
"loss": 1.6324,
"step": 1112
},
{
"epoch": 0.1492624050067054,
"grad_norm": 1.3487898998822019,
"learning_rate": 5.708500384124648e-07,
"loss": 1.6641,
"step": 1113
},
{
"epoch": 0.14939651318730443,
"grad_norm": 1.173261054584497,
"learning_rate": 5.707985996850633e-07,
"loss": 1.6297,
"step": 1114
},
{
"epoch": 0.14953062136790343,
"grad_norm": 1.056190301936881,
"learning_rate": 5.707471182070929e-07,
"loss": 1.7222,
"step": 1115
},
{
"epoch": 0.14966472954850246,
"grad_norm": 1.0543304581404804,
"learning_rate": 5.706955939876936e-07,
"loss": 1.6486,
"step": 1116
},
{
"epoch": 0.14979883772910146,
"grad_norm": 1.0951287089797115,
"learning_rate": 5.706440270360128e-07,
"loss": 1.6158,
"step": 1117
},
{
"epoch": 0.1499329459097005,
"grad_norm": 1.1191851976325244,
"learning_rate": 5.705924173612055e-07,
"loss": 1.7315,
"step": 1118
},
{
"epoch": 0.1500670540902995,
"grad_norm": 1.0577825904689977,
"learning_rate": 5.705407649724343e-07,
"loss": 1.6935,
"step": 1119
},
{
"epoch": 0.15020116227089853,
"grad_norm": 1.056299942663864,
"learning_rate": 5.704890698788693e-07,
"loss": 1.628,
"step": 1120
},
{
"epoch": 0.15033527045149755,
"grad_norm": 1.1590721147664085,
"learning_rate": 5.704373320896886e-07,
"loss": 1.6249,
"step": 1121
},
{
"epoch": 0.15046937863209656,
"grad_norm": 1.1117527447235374,
"learning_rate": 5.703855516140773e-07,
"loss": 1.7004,
"step": 1122
},
{
"epoch": 0.1506034868126956,
"grad_norm": 1.1049104937281078,
"learning_rate": 5.703337284612283e-07,
"loss": 1.6377,
"step": 1123
},
{
"epoch": 0.1507375949932946,
"grad_norm": 1.59710670500923,
"learning_rate": 5.702818626403422e-07,
"loss": 1.6834,
"step": 1124
},
{
"epoch": 0.15087170317389362,
"grad_norm": 1.0967048039417424,
"learning_rate": 5.702299541606271e-07,
"loss": 1.7351,
"step": 1125
},
{
"epoch": 0.15100581135449262,
"grad_norm": 1.0979605765370022,
"learning_rate": 5.701780030312985e-07,
"loss": 1.6961,
"step": 1126
},
{
"epoch": 0.15113991953509165,
"grad_norm": 1.0799636277645253,
"learning_rate": 5.701260092615798e-07,
"loss": 1.6698,
"step": 1127
},
{
"epoch": 0.15127402771569065,
"grad_norm": 1.0680391383117414,
"learning_rate": 5.700739728607018e-07,
"loss": 1.6337,
"step": 1128
},
{
"epoch": 0.15140813589628968,
"grad_norm": 1.1265492196116744,
"learning_rate": 5.700218938379027e-07,
"loss": 1.758,
"step": 1129
},
{
"epoch": 0.15154224407688868,
"grad_norm": 1.1871181924509882,
"learning_rate": 5.699697722024286e-07,
"loss": 1.7564,
"step": 1130
},
{
"epoch": 0.1516763522574877,
"grad_norm": 1.0181987331367963,
"learning_rate": 5.69917607963533e-07,
"loss": 1.5776,
"step": 1131
},
{
"epoch": 0.15181046043808671,
"grad_norm": 1.1284590442586029,
"learning_rate": 5.698654011304768e-07,
"loss": 1.6984,
"step": 1132
},
{
"epoch": 0.15194456861868574,
"grad_norm": 1.2930521652564555,
"learning_rate": 5.698131517125288e-07,
"loss": 1.6334,
"step": 1133
},
{
"epoch": 0.15207867679928475,
"grad_norm": 1.117570312123897,
"learning_rate": 5.697608597189651e-07,
"loss": 1.6531,
"step": 1134
},
{
"epoch": 0.15221278497988378,
"grad_norm": 1.4856967946676458,
"learning_rate": 5.697085251590694e-07,
"loss": 1.6406,
"step": 1135
},
{
"epoch": 0.15234689316048278,
"grad_norm": 1.1601905755705224,
"learning_rate": 5.696561480421331e-07,
"loss": 1.6839,
"step": 1136
},
{
"epoch": 0.1524810013410818,
"grad_norm": 1.1233822318963709,
"learning_rate": 5.696037283774549e-07,
"loss": 1.6607,
"step": 1137
},
{
"epoch": 0.15261510952168084,
"grad_norm": 1.1742187355064484,
"learning_rate": 5.695512661743415e-07,
"loss": 1.6646,
"step": 1138
},
{
"epoch": 0.15274921770227984,
"grad_norm": 1.086363990541314,
"learning_rate": 5.694987614421066e-07,
"loss": 1.6739,
"step": 1139
},
{
"epoch": 0.15288332588287887,
"grad_norm": 1.194737878034564,
"learning_rate": 5.694462141900719e-07,
"loss": 1.6835,
"step": 1140
},
{
"epoch": 0.15301743406347787,
"grad_norm": 1.1598758612040898,
"learning_rate": 5.693936244275662e-07,
"loss": 1.6587,
"step": 1141
},
{
"epoch": 0.1531515422440769,
"grad_norm": 1.1381348609460207,
"learning_rate": 5.693409921639263e-07,
"loss": 1.7111,
"step": 1142
},
{
"epoch": 0.1532856504246759,
"grad_norm": 1.0954642701505761,
"learning_rate": 5.692883174084963e-07,
"loss": 1.6453,
"step": 1143
},
{
"epoch": 0.15341975860527493,
"grad_norm": 1.181240368838665,
"learning_rate": 5.69235600170628e-07,
"loss": 1.7074,
"step": 1144
},
{
"epoch": 0.15355386678587393,
"grad_norm": 1.0848362523541808,
"learning_rate": 5.691828404596804e-07,
"loss": 1.7188,
"step": 1145
},
{
"epoch": 0.15368797496647296,
"grad_norm": 1.0976088776241693,
"learning_rate": 5.691300382850205e-07,
"loss": 1.6133,
"step": 1146
},
{
"epoch": 0.15382208314707196,
"grad_norm": 1.1535833554516768,
"learning_rate": 5.690771936560228e-07,
"loss": 1.6823,
"step": 1147
},
{
"epoch": 0.153956191327671,
"grad_norm": 1.1763699702630221,
"learning_rate": 5.690243065820687e-07,
"loss": 1.692,
"step": 1148
},
{
"epoch": 0.15409029950827,
"grad_norm": 1.0627345607622845,
"learning_rate": 5.689713770725477e-07,
"loss": 1.5961,
"step": 1149
},
{
"epoch": 0.15422440768886903,
"grad_norm": 1.0792270716448427,
"learning_rate": 5.689184051368572e-07,
"loss": 1.64,
"step": 1150
},
{
"epoch": 0.15435851586946803,
"grad_norm": 1.0247043986886288,
"learning_rate": 5.688653907844009e-07,
"loss": 1.5285,
"step": 1151
},
{
"epoch": 0.15449262405006706,
"grad_norm": 1.07857428312717,
"learning_rate": 5.688123340245914e-07,
"loss": 1.6444,
"step": 1152
},
{
"epoch": 0.15462673223066606,
"grad_norm": 1.0930284133542458,
"learning_rate": 5.687592348668479e-07,
"loss": 1.6882,
"step": 1153
},
{
"epoch": 0.1547608404112651,
"grad_norm": 1.0484076712069612,
"learning_rate": 5.687060933205976e-07,
"loss": 1.5796,
"step": 1154
},
{
"epoch": 0.1548949485918641,
"grad_norm": 1.1209018475352952,
"learning_rate": 5.686529093952749e-07,
"loss": 1.702,
"step": 1155
},
{
"epoch": 0.15502905677246312,
"grad_norm": 1.084792074670866,
"learning_rate": 5.685996831003221e-07,
"loss": 1.6856,
"step": 1156
},
{
"epoch": 0.15516316495306215,
"grad_norm": 1.081652083067762,
"learning_rate": 5.685464144451888e-07,
"loss": 1.6781,
"step": 1157
},
{
"epoch": 0.15529727313366115,
"grad_norm": 1.2019370572090728,
"learning_rate": 5.684931034393319e-07,
"loss": 1.6854,
"step": 1158
},
{
"epoch": 0.15543138131426018,
"grad_norm": 1.1546384235930545,
"learning_rate": 5.684397500922163e-07,
"loss": 1.5995,
"step": 1159
},
{
"epoch": 0.15556548949485918,
"grad_norm": 1.0806139711906346,
"learning_rate": 5.68386354413314e-07,
"loss": 1.6043,
"step": 1160
},
{
"epoch": 0.1556995976754582,
"grad_norm": 1.1695139264738694,
"learning_rate": 5.683329164121049e-07,
"loss": 1.6565,
"step": 1161
},
{
"epoch": 0.15583370585605721,
"grad_norm": 1.1082458941671236,
"learning_rate": 5.682794360980761e-07,
"loss": 1.6997,
"step": 1162
},
{
"epoch": 0.15596781403665624,
"grad_norm": 1.171803562739694,
"learning_rate": 5.682259134807222e-07,
"loss": 1.5452,
"step": 1163
},
{
"epoch": 0.15610192221725525,
"grad_norm": 1.0813601117636722,
"learning_rate": 5.681723485695456e-07,
"loss": 1.6468,
"step": 1164
},
{
"epoch": 0.15623603039785428,
"grad_norm": 1.0850091737441245,
"learning_rate": 5.681187413740558e-07,
"loss": 1.6521,
"step": 1165
},
{
"epoch": 0.15637013857845328,
"grad_norm": 1.0888617126493352,
"learning_rate": 5.680650919037703e-07,
"loss": 1.6318,
"step": 1166
},
{
"epoch": 0.1565042467590523,
"grad_norm": 1.0832051131221956,
"learning_rate": 5.680114001682137e-07,
"loss": 1.6244,
"step": 1167
},
{
"epoch": 0.1566383549396513,
"grad_norm": 1.1345011329722676,
"learning_rate": 5.679576661769184e-07,
"loss": 1.6903,
"step": 1168
},
{
"epoch": 0.15677246312025034,
"grad_norm": 1.0989237696533585,
"learning_rate": 5.679038899394239e-07,
"loss": 1.748,
"step": 1169
},
{
"epoch": 0.15690657130084934,
"grad_norm": 1.0586060818560636,
"learning_rate": 5.678500714652776e-07,
"loss": 1.7243,
"step": 1170
},
{
"epoch": 0.15704067948144837,
"grad_norm": 1.1184535612835667,
"learning_rate": 5.677962107640342e-07,
"loss": 1.6538,
"step": 1171
},
{
"epoch": 0.15717478766204737,
"grad_norm": 1.0607792312898765,
"learning_rate": 5.677423078452561e-07,
"loss": 1.6324,
"step": 1172
},
{
"epoch": 0.1573088958426464,
"grad_norm": 1.0442851907949064,
"learning_rate": 5.676883627185129e-07,
"loss": 1.6818,
"step": 1173
},
{
"epoch": 0.15744300402324543,
"grad_norm": 1.0805916545031482,
"learning_rate": 5.676343753933818e-07,
"loss": 1.6477,
"step": 1174
},
{
"epoch": 0.15757711220384443,
"grad_norm": 1.055305047370012,
"learning_rate": 5.675803458794477e-07,
"loss": 1.675,
"step": 1175
},
{
"epoch": 0.15771122038444346,
"grad_norm": 1.1317344965112557,
"learning_rate": 5.675262741863026e-07,
"loss": 1.6195,
"step": 1176
},
{
"epoch": 0.15784532856504246,
"grad_norm": 1.0677408822746999,
"learning_rate": 5.674721603235462e-07,
"loss": 1.673,
"step": 1177
},
{
"epoch": 0.1579794367456415,
"grad_norm": 1.1173608676015656,
"learning_rate": 5.67418004300786e-07,
"loss": 1.704,
"step": 1178
},
{
"epoch": 0.1581135449262405,
"grad_norm": 1.056889330893961,
"learning_rate": 5.673638061276364e-07,
"loss": 1.6232,
"step": 1179
},
{
"epoch": 0.15824765310683953,
"grad_norm": 1.1175288057488566,
"learning_rate": 5.673095658137197e-07,
"loss": 1.7439,
"step": 1180
},
{
"epoch": 0.15838176128743853,
"grad_norm": 1.1363903828654547,
"learning_rate": 5.672552833686654e-07,
"loss": 1.6943,
"step": 1181
},
{
"epoch": 0.15851586946803756,
"grad_norm": 1.0761526122635945,
"learning_rate": 5.672009588021108e-07,
"loss": 1.6178,
"step": 1182
},
{
"epoch": 0.15864997764863656,
"grad_norm": 1.0868039624863182,
"learning_rate": 5.671465921237003e-07,
"loss": 1.7295,
"step": 1183
},
{
"epoch": 0.1587840858292356,
"grad_norm": 1.5375983527794888,
"learning_rate": 5.670921833430861e-07,
"loss": 1.5868,
"step": 1184
},
{
"epoch": 0.1589181940098346,
"grad_norm": 1.1761526374271758,
"learning_rate": 5.670377324699277e-07,
"loss": 1.6585,
"step": 1185
},
{
"epoch": 0.15905230219043362,
"grad_norm": 1.0911545993652647,
"learning_rate": 5.669832395138923e-07,
"loss": 1.6849,
"step": 1186
},
{
"epoch": 0.15918641037103262,
"grad_norm": 1.0517360680747312,
"learning_rate": 5.669287044846542e-07,
"loss": 1.7081,
"step": 1187
},
{
"epoch": 0.15932051855163165,
"grad_norm": 1.0460736006845528,
"learning_rate": 5.668741273918952e-07,
"loss": 1.6946,
"step": 1188
},
{
"epoch": 0.15945462673223065,
"grad_norm": 1.1057544457050006,
"learning_rate": 5.668195082453052e-07,
"loss": 1.6648,
"step": 1189
},
{
"epoch": 0.15958873491282968,
"grad_norm": 1.290894867238456,
"learning_rate": 5.667648470545808e-07,
"loss": 1.6921,
"step": 1190
},
{
"epoch": 0.1597228430934287,
"grad_norm": 1.2497492674256703,
"learning_rate": 5.667101438294264e-07,
"loss": 1.7095,
"step": 1191
},
{
"epoch": 0.15985695127402771,
"grad_norm": 1.1080523067750003,
"learning_rate": 5.666553985795538e-07,
"loss": 1.6313,
"step": 1192
},
{
"epoch": 0.15999105945462674,
"grad_norm": 1.0983444417697228,
"learning_rate": 5.666006113146823e-07,
"loss": 1.6836,
"step": 1193
},
{
"epoch": 0.16012516763522575,
"grad_norm": 1.1242609644362185,
"learning_rate": 5.665457820445387e-07,
"loss": 1.6522,
"step": 1194
},
{
"epoch": 0.16025927581582478,
"grad_norm": 1.1033082182518592,
"learning_rate": 5.664909107788571e-07,
"loss": 1.6958,
"step": 1195
},
{
"epoch": 0.16039338399642378,
"grad_norm": 1.1353654965954614,
"learning_rate": 5.664359975273792e-07,
"loss": 1.6604,
"step": 1196
},
{
"epoch": 0.1605274921770228,
"grad_norm": 1.1259316457840236,
"learning_rate": 5.663810422998543e-07,
"loss": 1.7241,
"step": 1197
},
{
"epoch": 0.1606616003576218,
"grad_norm": 1.0922411903046598,
"learning_rate": 5.663260451060388e-07,
"loss": 1.6432,
"step": 1198
},
{
"epoch": 0.16079570853822084,
"grad_norm": 1.0707962447880088,
"learning_rate": 5.662710059556966e-07,
"loss": 1.6666,
"step": 1199
},
{
"epoch": 0.16092981671881984,
"grad_norm": 1.0837296784325723,
"learning_rate": 5.662159248585993e-07,
"loss": 1.6965,
"step": 1200
},
{
"epoch": 0.16106392489941887,
"grad_norm": 1.0703824186490674,
"learning_rate": 5.66160801824526e-07,
"loss": 1.7293,
"step": 1201
},
{
"epoch": 0.16119803308001787,
"grad_norm": 1.095076268284643,
"learning_rate": 5.661056368632625e-07,
"loss": 1.6433,
"step": 1202
},
{
"epoch": 0.1613321412606169,
"grad_norm": 1.0622058510882262,
"learning_rate": 5.660504299846032e-07,
"loss": 1.6237,
"step": 1203
},
{
"epoch": 0.1614662494412159,
"grad_norm": 1.0981636682859879,
"learning_rate": 5.65995181198349e-07,
"loss": 1.8076,
"step": 1204
},
{
"epoch": 0.16160035762181493,
"grad_norm": 1.1393139443072446,
"learning_rate": 5.659398905143088e-07,
"loss": 1.7572,
"step": 1205
},
{
"epoch": 0.16173446580241393,
"grad_norm": 1.0960864805053374,
"learning_rate": 5.658845579422985e-07,
"loss": 1.6836,
"step": 1206
},
{
"epoch": 0.16186857398301296,
"grad_norm": 1.0536699550048987,
"learning_rate": 5.658291834921417e-07,
"loss": 1.6933,
"step": 1207
},
{
"epoch": 0.162002682163612,
"grad_norm": 1.1996669047917732,
"learning_rate": 5.657737671736696e-07,
"loss": 1.6405,
"step": 1208
},
{
"epoch": 0.162136790344211,
"grad_norm": 1.10569454454835,
"learning_rate": 5.657183089967204e-07,
"loss": 1.5797,
"step": 1209
},
{
"epoch": 0.16227089852481003,
"grad_norm": 1.2803251145710948,
"learning_rate": 5.6566280897114e-07,
"loss": 1.6207,
"step": 1210
},
{
"epoch": 0.16240500670540903,
"grad_norm": 1.048684333970024,
"learning_rate": 5.656072671067818e-07,
"loss": 1.5924,
"step": 1211
},
{
"epoch": 0.16253911488600806,
"grad_norm": 1.0612522875516415,
"learning_rate": 5.655516834135063e-07,
"loss": 1.5299,
"step": 1212
},
{
"epoch": 0.16267322306660706,
"grad_norm": 1.0932249588392913,
"learning_rate": 5.65496057901182e-07,
"loss": 1.6653,
"step": 1213
},
{
"epoch": 0.1628073312472061,
"grad_norm": 1.0734042304698213,
"learning_rate": 5.65440390579684e-07,
"loss": 1.5442,
"step": 1214
},
{
"epoch": 0.1629414394278051,
"grad_norm": 1.1189271058187575,
"learning_rate": 5.653846814588957e-07,
"loss": 1.6881,
"step": 1215
},
{
"epoch": 0.16307554760840412,
"grad_norm": 1.1589238023336688,
"learning_rate": 5.653289305487072e-07,
"loss": 1.7461,
"step": 1216
},
{
"epoch": 0.16320965578900312,
"grad_norm": 1.02665461506197,
"learning_rate": 5.652731378590166e-07,
"loss": 1.6576,
"step": 1217
},
{
"epoch": 0.16334376396960215,
"grad_norm": 1.1444702149064363,
"learning_rate": 5.65217303399729e-07,
"loss": 1.6162,
"step": 1218
},
{
"epoch": 0.16347787215020115,
"grad_norm": 1.1311619335366723,
"learning_rate": 5.65161427180757e-07,
"loss": 1.6957,
"step": 1219
},
{
"epoch": 0.16361198033080018,
"grad_norm": 1.0555386995041562,
"learning_rate": 5.651055092120208e-07,
"loss": 1.7145,
"step": 1220
},
{
"epoch": 0.16374608851139918,
"grad_norm": 1.189321876945114,
"learning_rate": 5.650495495034477e-07,
"loss": 1.698,
"step": 1221
},
{
"epoch": 0.16388019669199821,
"grad_norm": 1.084782331393969,
"learning_rate": 5.649935480649729e-07,
"loss": 1.6739,
"step": 1222
},
{
"epoch": 0.16401430487259722,
"grad_norm": 1.1283603135723947,
"learning_rate": 5.649375049065386e-07,
"loss": 1.752,
"step": 1223
},
{
"epoch": 0.16414841305319625,
"grad_norm": 1.11896193815645,
"learning_rate": 5.648814200380943e-07,
"loss": 1.6303,
"step": 1224
},
{
"epoch": 0.16428252123379525,
"grad_norm": 1.067115391566694,
"learning_rate": 5.648252934695973e-07,
"loss": 1.6735,
"step": 1225
},
{
"epoch": 0.16441662941439428,
"grad_norm": 1.0804557718519556,
"learning_rate": 5.64769125211012e-07,
"loss": 1.6247,
"step": 1226
},
{
"epoch": 0.1645507375949933,
"grad_norm": 1.0059736180266399,
"learning_rate": 5.647129152723106e-07,
"loss": 1.5354,
"step": 1227
},
{
"epoch": 0.1646848457755923,
"grad_norm": 1.0770670756683223,
"learning_rate": 5.646566636634721e-07,
"loss": 1.6768,
"step": 1228
},
{
"epoch": 0.16481895395619134,
"grad_norm": 1.0638623481159848,
"learning_rate": 5.646003703944834e-07,
"loss": 1.6413,
"step": 1229
},
{
"epoch": 0.16495306213679034,
"grad_norm": 1.0839631787802386,
"learning_rate": 5.645440354753386e-07,
"loss": 1.6411,
"step": 1230
},
{
"epoch": 0.16508717031738937,
"grad_norm": 1.1589896172936287,
"learning_rate": 5.644876589160391e-07,
"loss": 1.6042,
"step": 1231
},
{
"epoch": 0.16522127849798837,
"grad_norm": 1.1160410996742565,
"learning_rate": 5.644312407265939e-07,
"loss": 1.6573,
"step": 1232
},
{
"epoch": 0.1653553866785874,
"grad_norm": 1.4171454379604909,
"learning_rate": 5.643747809170193e-07,
"loss": 1.6332,
"step": 1233
},
{
"epoch": 0.1654894948591864,
"grad_norm": 1.0531642470485152,
"learning_rate": 5.643182794973391e-07,
"loss": 1.6602,
"step": 1234
},
{
"epoch": 0.16562360303978543,
"grad_norm": 1.1086706049405617,
"learning_rate": 5.64261736477584e-07,
"loss": 1.7038,
"step": 1235
},
{
"epoch": 0.16575771122038443,
"grad_norm": 1.0944161073367153,
"learning_rate": 5.642051518677929e-07,
"loss": 1.6386,
"step": 1236
},
{
"epoch": 0.16589181940098346,
"grad_norm": 1.0383994077860026,
"learning_rate": 5.641485256780112e-07,
"loss": 1.6683,
"step": 1237
},
{
"epoch": 0.16602592758158247,
"grad_norm": 1.110409441026267,
"learning_rate": 5.640918579182926e-07,
"loss": 1.7666,
"step": 1238
},
{
"epoch": 0.1661600357621815,
"grad_norm": 1.062864948914823,
"learning_rate": 5.640351485986973e-07,
"loss": 1.6995,
"step": 1239
},
{
"epoch": 0.1662941439427805,
"grad_norm": 1.1144719375181737,
"learning_rate": 5.639783977292936e-07,
"loss": 1.6904,
"step": 1240
},
{
"epoch": 0.16642825212337953,
"grad_norm": 1.090081045271864,
"learning_rate": 5.639216053201565e-07,
"loss": 1.696,
"step": 1241
},
{
"epoch": 0.16656236030397853,
"grad_norm": 1.0630959169468894,
"learning_rate": 5.638647713813691e-07,
"loss": 1.6521,
"step": 1242
},
{
"epoch": 0.16669646848457756,
"grad_norm": 2.998931919925447,
"learning_rate": 5.638078959230211e-07,
"loss": 1.706,
"step": 1243
},
{
"epoch": 0.1668305766651766,
"grad_norm": 1.2341388992185853,
"learning_rate": 5.637509789552104e-07,
"loss": 1.5942,
"step": 1244
},
{
"epoch": 0.1669646848457756,
"grad_norm": 1.1027382262588608,
"learning_rate": 5.636940204880415e-07,
"loss": 1.6176,
"step": 1245
},
{
"epoch": 0.16709879302637462,
"grad_norm": 1.1453532005308322,
"learning_rate": 5.636370205316269e-07,
"loss": 1.7051,
"step": 1246
},
{
"epoch": 0.16723290120697362,
"grad_norm": 1.1774692080993565,
"learning_rate": 5.63579979096086e-07,
"loss": 1.7089,
"step": 1247
},
{
"epoch": 0.16736700938757265,
"grad_norm": 1.05810539274269,
"learning_rate": 5.635228961915458e-07,
"loss": 1.6353,
"step": 1248
},
{
"epoch": 0.16750111756817165,
"grad_norm": 1.1450836955803443,
"learning_rate": 5.634657718281407e-07,
"loss": 1.7418,
"step": 1249
},
{
"epoch": 0.16763522574877068,
"grad_norm": 1.125948952992154,
"learning_rate": 5.634086060160121e-07,
"loss": 1.7343,
"step": 1250
},
{
"epoch": 0.16776933392936969,
"grad_norm": 1.069728820008434,
"learning_rate": 5.633513987653094e-07,
"loss": 1.4826,
"step": 1251
},
{
"epoch": 0.16790344210996871,
"grad_norm": 1.0401896130830024,
"learning_rate": 5.632941500861885e-07,
"loss": 1.7211,
"step": 1252
},
{
"epoch": 0.16803755029056772,
"grad_norm": 1.09563187676157,
"learning_rate": 5.632368599888135e-07,
"loss": 1.7378,
"step": 1253
},
{
"epoch": 0.16817165847116675,
"grad_norm": 1.0701481214906692,
"learning_rate": 5.631795284833555e-07,
"loss": 1.7191,
"step": 1254
},
{
"epoch": 0.16830576665176575,
"grad_norm": 1.2554327805183711,
"learning_rate": 5.631221555799927e-07,
"loss": 1.6476,
"step": 1255
},
{
"epoch": 0.16843987483236478,
"grad_norm": 1.0867457009428256,
"learning_rate": 5.63064741288911e-07,
"loss": 1.6594,
"step": 1256
},
{
"epoch": 0.16857398301296378,
"grad_norm": 1.0587419661389497,
"learning_rate": 5.630072856203037e-07,
"loss": 1.7365,
"step": 1257
},
{
"epoch": 0.1687080911935628,
"grad_norm": 1.0437016123668459,
"learning_rate": 5.629497885843712e-07,
"loss": 1.6223,
"step": 1258
},
{
"epoch": 0.1688421993741618,
"grad_norm": 1.093304989043814,
"learning_rate": 5.628922501913211e-07,
"loss": 1.7281,
"step": 1259
},
{
"epoch": 0.16897630755476084,
"grad_norm": 1.0787876584693192,
"learning_rate": 5.628346704513689e-07,
"loss": 1.7033,
"step": 1260
},
{
"epoch": 0.16911041573535987,
"grad_norm": 1.119310868984826,
"learning_rate": 5.627770493747369e-07,
"loss": 1.6785,
"step": 1261
},
{
"epoch": 0.16924452391595887,
"grad_norm": 1.0543862123255383,
"learning_rate": 5.62719386971655e-07,
"loss": 1.6329,
"step": 1262
},
{
"epoch": 0.1693786320965579,
"grad_norm": 1.1801974734059986,
"learning_rate": 5.626616832523605e-07,
"loss": 1.6647,
"step": 1263
},
{
"epoch": 0.1695127402771569,
"grad_norm": 1.0966012840078587,
"learning_rate": 5.626039382270977e-07,
"loss": 1.7489,
"step": 1264
},
{
"epoch": 0.16964684845775593,
"grad_norm": 1.0464685107772078,
"learning_rate": 5.625461519061187e-07,
"loss": 1.613,
"step": 1265
},
{
"epoch": 0.16978095663835494,
"grad_norm": 1.1162999981242707,
"learning_rate": 5.624883242996825e-07,
"loss": 1.6777,
"step": 1266
},
{
"epoch": 0.16991506481895396,
"grad_norm": 1.0848332959906992,
"learning_rate": 5.624304554180556e-07,
"loss": 1.6708,
"step": 1267
},
{
"epoch": 0.17004917299955297,
"grad_norm": 1.0397576875295036,
"learning_rate": 5.623725452715121e-07,
"loss": 1.6809,
"step": 1268
},
{
"epoch": 0.170183281180152,
"grad_norm": 1.0775743863836376,
"learning_rate": 5.62314593870333e-07,
"loss": 1.7068,
"step": 1269
},
{
"epoch": 0.170317389360751,
"grad_norm": 1.1030270698791587,
"learning_rate": 5.622566012248068e-07,
"loss": 1.7731,
"step": 1270
},
{
"epoch": 0.17045149754135003,
"grad_norm": 1.0632600433435002,
"learning_rate": 5.621985673452292e-07,
"loss": 1.6944,
"step": 1271
},
{
"epoch": 0.17058560572194903,
"grad_norm": 2.354964154428233,
"learning_rate": 5.621404922419036e-07,
"loss": 1.5583,
"step": 1272
},
{
"epoch": 0.17071971390254806,
"grad_norm": 1.0841684512277456,
"learning_rate": 5.620823759251403e-07,
"loss": 1.6523,
"step": 1273
},
{
"epoch": 0.17085382208314706,
"grad_norm": 1.1343004749820542,
"learning_rate": 5.62024218405257e-07,
"loss": 1.6026,
"step": 1274
},
{
"epoch": 0.1709879302637461,
"grad_norm": 1.3571816054618184,
"learning_rate": 5.619660196925789e-07,
"loss": 1.6434,
"step": 1275
},
{
"epoch": 0.1711220384443451,
"grad_norm": 1.058572028264877,
"learning_rate": 5.619077797974385e-07,
"loss": 1.6225,
"step": 1276
},
{
"epoch": 0.17125614662494412,
"grad_norm": 1.068136194752418,
"learning_rate": 5.618494987301753e-07,
"loss": 1.6629,
"step": 1277
},
{
"epoch": 0.17139025480554315,
"grad_norm": 1.2779625791938292,
"learning_rate": 5.617911765011364e-07,
"loss": 1.6295,
"step": 1278
},
{
"epoch": 0.17152436298614215,
"grad_norm": 1.09073380795014,
"learning_rate": 5.617328131206761e-07,
"loss": 1.6544,
"step": 1279
},
{
"epoch": 0.17165847116674118,
"grad_norm": 1.0808553452465872,
"learning_rate": 5.616744085991562e-07,
"loss": 1.6671,
"step": 1280
},
{
"epoch": 0.17179257934734019,
"grad_norm": 1.1043939527890692,
"learning_rate": 5.616159629469456e-07,
"loss": 1.6977,
"step": 1281
},
{
"epoch": 0.17192668752793921,
"grad_norm": 1.0969178723829076,
"learning_rate": 5.615574761744202e-07,
"loss": 1.7814,
"step": 1282
},
{
"epoch": 0.17206079570853822,
"grad_norm": 1.0619478458391556,
"learning_rate": 5.614989482919641e-07,
"loss": 1.6899,
"step": 1283
},
{
"epoch": 0.17219490388913725,
"grad_norm": 1.1116637641823053,
"learning_rate": 5.614403793099678e-07,
"loss": 1.6795,
"step": 1284
},
{
"epoch": 0.17232901206973625,
"grad_norm": 1.1188139751673378,
"learning_rate": 5.613817692388295e-07,
"loss": 1.6586,
"step": 1285
},
{
"epoch": 0.17246312025033528,
"grad_norm": 1.1092151541540025,
"learning_rate": 5.613231180889545e-07,
"loss": 1.731,
"step": 1286
},
{
"epoch": 0.17259722843093428,
"grad_norm": 1.0776307968053882,
"learning_rate": 5.612644258707557e-07,
"loss": 1.639,
"step": 1287
},
{
"epoch": 0.1727313366115333,
"grad_norm": 1.1568418405932983,
"learning_rate": 5.612056925946532e-07,
"loss": 1.6265,
"step": 1288
},
{
"epoch": 0.1728654447921323,
"grad_norm": 1.1686914549112786,
"learning_rate": 5.611469182710741e-07,
"loss": 1.5635,
"step": 1289
},
{
"epoch": 0.17299955297273134,
"grad_norm": 1.0798126174498692,
"learning_rate": 5.61088102910453e-07,
"loss": 1.6009,
"step": 1290
},
{
"epoch": 0.17313366115333034,
"grad_norm": 1.0565094574884266,
"learning_rate": 5.61029246523232e-07,
"loss": 1.6236,
"step": 1291
},
{
"epoch": 0.17326776933392937,
"grad_norm": 1.1580137951907012,
"learning_rate": 5.609703491198601e-07,
"loss": 1.6664,
"step": 1292
},
{
"epoch": 0.17340187751452837,
"grad_norm": 1.0812242416939941,
"learning_rate": 5.609114107107936e-07,
"loss": 1.5541,
"step": 1293
},
{
"epoch": 0.1735359856951274,
"grad_norm": 1.0926652109752668,
"learning_rate": 5.608524313064966e-07,
"loss": 1.6495,
"step": 1294
},
{
"epoch": 0.1736700938757264,
"grad_norm": 1.116001777343314,
"learning_rate": 5.607934109174398e-07,
"loss": 1.568,
"step": 1295
},
{
"epoch": 0.17380420205632544,
"grad_norm": 1.0742848470460207,
"learning_rate": 5.607343495541017e-07,
"loss": 1.6815,
"step": 1296
},
{
"epoch": 0.17393831023692446,
"grad_norm": 1.1104040571093063,
"learning_rate": 5.606752472269675e-07,
"loss": 1.7855,
"step": 1297
},
{
"epoch": 0.17407241841752347,
"grad_norm": 1.1082815736136737,
"learning_rate": 5.606161039465304e-07,
"loss": 1.5563,
"step": 1298
},
{
"epoch": 0.1742065265981225,
"grad_norm": 1.3426693471935263,
"learning_rate": 5.605569197232904e-07,
"loss": 1.6382,
"step": 1299
},
{
"epoch": 0.1743406347787215,
"grad_norm": 1.1018630739261308,
"learning_rate": 5.604976945677547e-07,
"loss": 1.5862,
"step": 1300
},
{
"epoch": 0.17447474295932053,
"grad_norm": 1.08258660371521,
"learning_rate": 5.604384284904382e-07,
"loss": 1.7377,
"step": 1301
},
{
"epoch": 0.17460885113991953,
"grad_norm": 1.0416433850048736,
"learning_rate": 5.603791215018626e-07,
"loss": 1.6654,
"step": 1302
},
{
"epoch": 0.17474295932051856,
"grad_norm": 1.0585227638311847,
"learning_rate": 5.603197736125572e-07,
"loss": 1.6259,
"step": 1303
},
{
"epoch": 0.17487706750111756,
"grad_norm": 1.800828493151873,
"learning_rate": 5.602603848330582e-07,
"loss": 1.6681,
"step": 1304
},
{
"epoch": 0.1750111756817166,
"grad_norm": 1.2442322404337642,
"learning_rate": 5.602009551739095e-07,
"loss": 1.7388,
"step": 1305
},
{
"epoch": 0.1751452838623156,
"grad_norm": 1.0650536278693077,
"learning_rate": 5.60141484645662e-07,
"loss": 1.6913,
"step": 1306
},
{
"epoch": 0.17527939204291462,
"grad_norm": 1.0715066374394453,
"learning_rate": 5.600819732588738e-07,
"loss": 1.7508,
"step": 1307
},
{
"epoch": 0.17541350022351362,
"grad_norm": 1.2154515219706747,
"learning_rate": 5.600224210241104e-07,
"loss": 1.6431,
"step": 1308
},
{
"epoch": 0.17554760840411265,
"grad_norm": 1.0580023010334576,
"learning_rate": 5.599628279519445e-07,
"loss": 1.7028,
"step": 1309
},
{
"epoch": 0.17568171658471166,
"grad_norm": 1.0649573978054163,
"learning_rate": 5.599031940529562e-07,
"loss": 1.7045,
"step": 1310
},
{
"epoch": 0.17581582476531069,
"grad_norm": 1.066600801218827,
"learning_rate": 5.598435193377324e-07,
"loss": 1.6888,
"step": 1311
},
{
"epoch": 0.1759499329459097,
"grad_norm": 1.2123022138020687,
"learning_rate": 5.597838038168678e-07,
"loss": 1.7297,
"step": 1312
},
{
"epoch": 0.17608404112650872,
"grad_norm": 1.0436067677488805,
"learning_rate": 5.59724047500964e-07,
"loss": 1.652,
"step": 1313
},
{
"epoch": 0.17621814930710775,
"grad_norm": 1.0487601395222634,
"learning_rate": 5.5966425040063e-07,
"loss": 1.7444,
"step": 1314
},
{
"epoch": 0.17635225748770675,
"grad_norm": 1.117082389094809,
"learning_rate": 5.596044125264818e-07,
"loss": 1.64,
"step": 1315
},
{
"epoch": 0.17648636566830578,
"grad_norm": 1.0558238043899169,
"learning_rate": 5.595445338891431e-07,
"loss": 1.6659,
"step": 1316
},
{
"epoch": 0.17662047384890478,
"grad_norm": 1.0478981037852866,
"learning_rate": 5.594846144992443e-07,
"loss": 1.52,
"step": 1317
},
{
"epoch": 0.1767545820295038,
"grad_norm": 1.257918943849832,
"learning_rate": 5.594246543674234e-07,
"loss": 1.7601,
"step": 1318
},
{
"epoch": 0.1768886902101028,
"grad_norm": 1.4225322949034613,
"learning_rate": 5.593646535043253e-07,
"loss": 1.7307,
"step": 1319
},
{
"epoch": 0.17702279839070184,
"grad_norm": 1.1490395041861463,
"learning_rate": 5.593046119206027e-07,
"loss": 1.7181,
"step": 1320
},
{
"epoch": 0.17715690657130084,
"grad_norm": 1.0611730445421508,
"learning_rate": 5.59244529626915e-07,
"loss": 1.6528,
"step": 1321
},
{
"epoch": 0.17729101475189987,
"grad_norm": 1.204549135410644,
"learning_rate": 5.591844066339289e-07,
"loss": 1.7908,
"step": 1322
},
{
"epoch": 0.17742512293249887,
"grad_norm": 1.1001829655239295,
"learning_rate": 5.591242429523187e-07,
"loss": 1.6403,
"step": 1323
},
{
"epoch": 0.1775592311130979,
"grad_norm": 1.1251080236723472,
"learning_rate": 5.590640385927655e-07,
"loss": 1.6476,
"step": 1324
},
{
"epoch": 0.1776933392936969,
"grad_norm": 1.0879047909659794,
"learning_rate": 5.590037935659577e-07,
"loss": 1.7197,
"step": 1325
},
{
"epoch": 0.17782744747429594,
"grad_norm": 1.0406989517811054,
"learning_rate": 5.589435078825912e-07,
"loss": 1.5898,
"step": 1326
},
{
"epoch": 0.17796155565489494,
"grad_norm": 1.055284942749228,
"learning_rate": 5.588831815533688e-07,
"loss": 1.6537,
"step": 1327
},
{
"epoch": 0.17809566383549397,
"grad_norm": 1.1132782384590842,
"learning_rate": 5.588228145890006e-07,
"loss": 1.6304,
"step": 1328
},
{
"epoch": 0.17822977201609297,
"grad_norm": 1.1856096238614278,
"learning_rate": 5.587624070002039e-07,
"loss": 1.6901,
"step": 1329
},
{
"epoch": 0.178363880196692,
"grad_norm": 1.0716839423819353,
"learning_rate": 5.587019587977035e-07,
"loss": 1.6256,
"step": 1330
},
{
"epoch": 0.17849798837729103,
"grad_norm": 1.0832321039520167,
"learning_rate": 5.586414699922309e-07,
"loss": 1.6811,
"step": 1331
},
{
"epoch": 0.17863209655789003,
"grad_norm": 1.0997046830321784,
"learning_rate": 5.585809405945252e-07,
"loss": 1.5625,
"step": 1332
},
{
"epoch": 0.17876620473848906,
"grad_norm": 1.0713255103444261,
"learning_rate": 5.585203706153326e-07,
"loss": 1.6532,
"step": 1333
},
{
"epoch": 0.17890031291908806,
"grad_norm": 1.097655546141729,
"learning_rate": 5.584597600654066e-07,
"loss": 1.561,
"step": 1334
},
{
"epoch": 0.1790344210996871,
"grad_norm": 1.118524842313588,
"learning_rate": 5.583991089555074e-07,
"loss": 1.6562,
"step": 1335
},
{
"epoch": 0.1791685292802861,
"grad_norm": 1.143484492621255,
"learning_rate": 5.583384172964032e-07,
"loss": 1.6106,
"step": 1336
},
{
"epoch": 0.17930263746088512,
"grad_norm": 1.1214046342101587,
"learning_rate": 5.582776850988688e-07,
"loss": 1.6307,
"step": 1337
},
{
"epoch": 0.17943674564148412,
"grad_norm": 1.1213846092161437,
"learning_rate": 5.582169123736864e-07,
"loss": 1.7581,
"step": 1338
},
{
"epoch": 0.17957085382208315,
"grad_norm": 1.1045643310044297,
"learning_rate": 5.581560991316455e-07,
"loss": 1.7356,
"step": 1339
},
{
"epoch": 0.17970496200268216,
"grad_norm": 1.1684585589911254,
"learning_rate": 5.580952453835426e-07,
"loss": 1.7319,
"step": 1340
},
{
"epoch": 0.17983907018328119,
"grad_norm": 1.3021764184252913,
"learning_rate": 5.580343511401813e-07,
"loss": 1.7263,
"step": 1341
},
{
"epoch": 0.1799731783638802,
"grad_norm": 1.113861073703856,
"learning_rate": 5.579734164123729e-07,
"loss": 1.6896,
"step": 1342
},
{
"epoch": 0.18010728654447922,
"grad_norm": 1.081482477946928,
"learning_rate": 5.579124412109352e-07,
"loss": 1.7272,
"step": 1343
},
{
"epoch": 0.18024139472507822,
"grad_norm": 1.2066355523363086,
"learning_rate": 5.578514255466939e-07,
"loss": 1.7111,
"step": 1344
},
{
"epoch": 0.18037550290567725,
"grad_norm": 1.0985468030112344,
"learning_rate": 5.577903694304811e-07,
"loss": 1.6341,
"step": 1345
},
{
"epoch": 0.18050961108627625,
"grad_norm": 1.171300719246094,
"learning_rate": 5.577292728731368e-07,
"loss": 1.7271,
"step": 1346
},
{
"epoch": 0.18064371926687528,
"grad_norm": 1.0938624509126613,
"learning_rate": 5.576681358855078e-07,
"loss": 1.6505,
"step": 1347
},
{
"epoch": 0.1807778274474743,
"grad_norm": 1.1376662655489747,
"learning_rate": 5.57606958478448e-07,
"loss": 1.6729,
"step": 1348
},
{
"epoch": 0.1809119356280733,
"grad_norm": 1.1248050141842243,
"learning_rate": 5.575457406628189e-07,
"loss": 1.6139,
"step": 1349
},
{
"epoch": 0.18104604380867234,
"grad_norm": 1.0939373053874768,
"learning_rate": 5.574844824494888e-07,
"loss": 1.6295,
"step": 1350
},
{
"epoch": 0.18118015198927134,
"grad_norm": 1.0842961883880395,
"learning_rate": 5.574231838493333e-07,
"loss": 1.5905,
"step": 1351
},
{
"epoch": 0.18131426016987037,
"grad_norm": 1.1099129964326464,
"learning_rate": 5.573618448732349e-07,
"loss": 1.5986,
"step": 1352
},
{
"epoch": 0.18144836835046937,
"grad_norm": 1.1232448273106495,
"learning_rate": 5.573004655320838e-07,
"loss": 1.7579,
"step": 1353
},
{
"epoch": 0.1815824765310684,
"grad_norm": 1.1666528664724998,
"learning_rate": 5.57239045836777e-07,
"loss": 1.6152,
"step": 1354
},
{
"epoch": 0.1817165847116674,
"grad_norm": 1.1370227967293582,
"learning_rate": 5.571775857982186e-07,
"loss": 1.7261,
"step": 1355
},
{
"epoch": 0.18185069289226644,
"grad_norm": 1.1281838118145104,
"learning_rate": 5.571160854273203e-07,
"loss": 1.7791,
"step": 1356
},
{
"epoch": 0.18198480107286544,
"grad_norm": 1.1128745175377743,
"learning_rate": 5.570545447350004e-07,
"loss": 1.6613,
"step": 1357
},
{
"epoch": 0.18211890925346447,
"grad_norm": 1.0867439824153309,
"learning_rate": 5.569929637321848e-07,
"loss": 1.7577,
"step": 1358
},
{
"epoch": 0.18225301743406347,
"grad_norm": 1.1168304669263995,
"learning_rate": 5.569313424298063e-07,
"loss": 1.6313,
"step": 1359
},
{
"epoch": 0.1823871256146625,
"grad_norm": 1.0783686555511454,
"learning_rate": 5.56869680838805e-07,
"loss": 1.6155,
"step": 1360
},
{
"epoch": 0.1825212337952615,
"grad_norm": 1.1849330577729977,
"learning_rate": 5.568079789701281e-07,
"loss": 1.7919,
"step": 1361
},
{
"epoch": 0.18265534197586053,
"grad_norm": 1.0642283339220127,
"learning_rate": 5.567462368347296e-07,
"loss": 1.6483,
"step": 1362
},
{
"epoch": 0.18278945015645953,
"grad_norm": 1.0762888034859384,
"learning_rate": 5.566844544435715e-07,
"loss": 1.6447,
"step": 1363
},
{
"epoch": 0.18292355833705856,
"grad_norm": 1.1102699057236556,
"learning_rate": 5.566226318076221e-07,
"loss": 1.6753,
"step": 1364
},
{
"epoch": 0.18305766651765756,
"grad_norm": 1.0900024036456375,
"learning_rate": 5.565607689378574e-07,
"loss": 1.6932,
"step": 1365
},
{
"epoch": 0.1831917746982566,
"grad_norm": 1.170525713074084,
"learning_rate": 5.564988658452601e-07,
"loss": 1.6378,
"step": 1366
},
{
"epoch": 0.18332588287885562,
"grad_norm": 1.1252580693238932,
"learning_rate": 5.564369225408206e-07,
"loss": 1.7611,
"step": 1367
},
{
"epoch": 0.18345999105945462,
"grad_norm": 1.0779299976202001,
"learning_rate": 5.563749390355356e-07,
"loss": 1.6517,
"step": 1368
},
{
"epoch": 0.18359409924005365,
"grad_norm": 1.0810638342875853,
"learning_rate": 5.563129153404099e-07,
"loss": 1.5525,
"step": 1369
},
{
"epoch": 0.18372820742065266,
"grad_norm": 1.061240323219775,
"learning_rate": 5.562508514664548e-07,
"loss": 1.7482,
"step": 1370
},
{
"epoch": 0.18386231560125169,
"grad_norm": 1.1362519090350038,
"learning_rate": 5.561887474246889e-07,
"loss": 1.5771,
"step": 1371
},
{
"epoch": 0.1839964237818507,
"grad_norm": 1.7306083620793078,
"learning_rate": 5.561266032261379e-07,
"loss": 1.6738,
"step": 1372
},
{
"epoch": 0.18413053196244972,
"grad_norm": 1.1266147426655102,
"learning_rate": 5.560644188818348e-07,
"loss": 1.6809,
"step": 1373
},
{
"epoch": 0.18426464014304872,
"grad_norm": 1.4560506903910069,
"learning_rate": 5.560021944028195e-07,
"loss": 1.7862,
"step": 1374
},
{
"epoch": 0.18439874832364775,
"grad_norm": 1.1339717685703572,
"learning_rate": 5.559399298001391e-07,
"loss": 1.7362,
"step": 1375
},
{
"epoch": 0.18453285650424675,
"grad_norm": 1.0605805880234964,
"learning_rate": 5.55877625084848e-07,
"loss": 1.6264,
"step": 1376
},
{
"epoch": 0.18466696468484578,
"grad_norm": 1.1589703072777433,
"learning_rate": 5.558152802680075e-07,
"loss": 1.6524,
"step": 1377
},
{
"epoch": 0.18480107286544478,
"grad_norm": 1.0842230894260985,
"learning_rate": 5.557528953606858e-07,
"loss": 1.8047,
"step": 1378
},
{
"epoch": 0.1849351810460438,
"grad_norm": 1.1794280210617787,
"learning_rate": 5.55690470373959e-07,
"loss": 1.6757,
"step": 1379
},
{
"epoch": 0.1850692892266428,
"grad_norm": 1.097631119847551,
"learning_rate": 5.556280053189095e-07,
"loss": 1.6108,
"step": 1380
},
{
"epoch": 0.18520339740724184,
"grad_norm": 1.1017129023282082,
"learning_rate": 5.555655002066273e-07,
"loss": 1.7577,
"step": 1381
},
{
"epoch": 0.18533750558784085,
"grad_norm": 1.1361790282178577,
"learning_rate": 5.555029550482091e-07,
"loss": 1.7294,
"step": 1382
},
{
"epoch": 0.18547161376843987,
"grad_norm": 1.055142090473337,
"learning_rate": 5.554403698547593e-07,
"loss": 1.6388,
"step": 1383
},
{
"epoch": 0.1856057219490389,
"grad_norm": 7.061910083877572,
"learning_rate": 5.553777446373886e-07,
"loss": 1.6087,
"step": 1384
},
{
"epoch": 0.1857398301296379,
"grad_norm": 1.1547867916367462,
"learning_rate": 5.553150794072159e-07,
"loss": 1.6509,
"step": 1385
},
{
"epoch": 0.18587393831023694,
"grad_norm": 1.193219273609135,
"learning_rate": 5.552523741753659e-07,
"loss": 1.8231,
"step": 1386
},
{
"epoch": 0.18600804649083594,
"grad_norm": 1.0693060290055107,
"learning_rate": 5.551896289529716e-07,
"loss": 1.656,
"step": 1387
},
{
"epoch": 0.18614215467143497,
"grad_norm": 1.1745807906563366,
"learning_rate": 5.551268437511724e-07,
"loss": 1.6985,
"step": 1388
},
{
"epoch": 0.18627626285203397,
"grad_norm": 1.099307648055397,
"learning_rate": 5.550640185811148e-07,
"loss": 1.6393,
"step": 1389
},
{
"epoch": 0.186410371032633,
"grad_norm": 1.1139438125947954,
"learning_rate": 5.550011534539527e-07,
"loss": 1.6638,
"step": 1390
},
{
"epoch": 0.186544479213232,
"grad_norm": 1.0670126218487324,
"learning_rate": 5.549382483808472e-07,
"loss": 1.6649,
"step": 1391
},
{
"epoch": 0.18667858739383103,
"grad_norm": 1.1017328082824618,
"learning_rate": 5.548753033729658e-07,
"loss": 1.6979,
"step": 1392
},
{
"epoch": 0.18681269557443003,
"grad_norm": 1.1113229457677472,
"learning_rate": 5.548123184414838e-07,
"loss": 1.6629,
"step": 1393
},
{
"epoch": 0.18694680375502906,
"grad_norm": 1.061154042048288,
"learning_rate": 5.547492935975834e-07,
"loss": 1.6141,
"step": 1394
},
{
"epoch": 0.18708091193562806,
"grad_norm": 1.1037785149371337,
"learning_rate": 5.546862288524536e-07,
"loss": 1.619,
"step": 1395
},
{
"epoch": 0.1872150201162271,
"grad_norm": 1.042211773070437,
"learning_rate": 5.546231242172909e-07,
"loss": 1.6314,
"step": 1396
},
{
"epoch": 0.1873491282968261,
"grad_norm": 1.0991209850271397,
"learning_rate": 5.545599797032986e-07,
"loss": 1.6851,
"step": 1397
},
{
"epoch": 0.18748323647742512,
"grad_norm": 1.0820523032730966,
"learning_rate": 5.544967953216872e-07,
"loss": 1.614,
"step": 1398
},
{
"epoch": 0.18761734465802413,
"grad_norm": 1.0852415180954882,
"learning_rate": 5.544335710836741e-07,
"loss": 1.7069,
"step": 1399
},
{
"epoch": 0.18775145283862316,
"grad_norm": 1.1386169714316008,
"learning_rate": 5.543703070004842e-07,
"loss": 1.7039,
"step": 1400
},
{
"epoch": 0.18788556101922219,
"grad_norm": 1.1108755081549002,
"learning_rate": 5.543070030833488e-07,
"loss": 1.5328,
"step": 1401
},
{
"epoch": 0.1880196691998212,
"grad_norm": 1.1300665980334217,
"learning_rate": 5.542436593435071e-07,
"loss": 1.5492,
"step": 1402
},
{
"epoch": 0.18815377738042022,
"grad_norm": 1.1648000097455284,
"learning_rate": 5.541802757922047e-07,
"loss": 1.7602,
"step": 1403
},
{
"epoch": 0.18828788556101922,
"grad_norm": 1.2562290482462215,
"learning_rate": 5.541168524406944e-07,
"loss": 1.7935,
"step": 1404
},
{
"epoch": 0.18842199374161825,
"grad_norm": 1.0533823510103384,
"learning_rate": 5.540533893002363e-07,
"loss": 1.6259,
"step": 1405
},
{
"epoch": 0.18855610192221725,
"grad_norm": 1.1167925185725207,
"learning_rate": 5.539898863820975e-07,
"loss": 1.6887,
"step": 1406
},
{
"epoch": 0.18869021010281628,
"grad_norm": 1.1001250134186094,
"learning_rate": 5.539263436975518e-07,
"loss": 1.6111,
"step": 1407
},
{
"epoch": 0.18882431828341528,
"grad_norm": 1.0625193817660576,
"learning_rate": 5.538627612578808e-07,
"loss": 1.6671,
"step": 1408
},
{
"epoch": 0.1889584264640143,
"grad_norm": 1.080439840869442,
"learning_rate": 5.537991390743723e-07,
"loss": 1.6131,
"step": 1409
},
{
"epoch": 0.1890925346446133,
"grad_norm": 1.0665498302900025,
"learning_rate": 5.537354771583218e-07,
"loss": 1.6202,
"step": 1410
},
{
"epoch": 0.18922664282521234,
"grad_norm": 1.062235350568671,
"learning_rate": 5.536717755210317e-07,
"loss": 1.7539,
"step": 1411
},
{
"epoch": 0.18936075100581135,
"grad_norm": 1.1086535902273902,
"learning_rate": 5.536080341738112e-07,
"loss": 1.6395,
"step": 1412
},
{
"epoch": 0.18949485918641037,
"grad_norm": 1.0667252815429409,
"learning_rate": 5.535442531279765e-07,
"loss": 1.6353,
"step": 1413
},
{
"epoch": 0.18962896736700938,
"grad_norm": 1.046416301897227,
"learning_rate": 5.534804323948516e-07,
"loss": 1.6511,
"step": 1414
},
{
"epoch": 0.1897630755476084,
"grad_norm": 1.075372549798005,
"learning_rate": 5.534165719857666e-07,
"loss": 1.7723,
"step": 1415
},
{
"epoch": 0.1898971837282074,
"grad_norm": 1.104299289930867,
"learning_rate": 5.533526719120594e-07,
"loss": 1.6641,
"step": 1416
},
{
"epoch": 0.19003129190880644,
"grad_norm": 1.0927891670744394,
"learning_rate": 5.532887321850742e-07,
"loss": 1.5863,
"step": 1417
},
{
"epoch": 0.19016540008940547,
"grad_norm": 1.1166521049854372,
"learning_rate": 5.532247528161629e-07,
"loss": 1.6574,
"step": 1418
},
{
"epoch": 0.19029950827000447,
"grad_norm": 1.1320778263461202,
"learning_rate": 5.531607338166842e-07,
"loss": 1.6688,
"step": 1419
},
{
"epoch": 0.1904336164506035,
"grad_norm": 1.1471242711580207,
"learning_rate": 5.530966751980036e-07,
"loss": 1.6654,
"step": 1420
},
{
"epoch": 0.1905677246312025,
"grad_norm": 1.0867888184745689,
"learning_rate": 5.530325769714941e-07,
"loss": 1.5906,
"step": 1421
},
{
"epoch": 0.19070183281180153,
"grad_norm": 1.4483826712692085,
"learning_rate": 5.529684391485354e-07,
"loss": 1.5822,
"step": 1422
},
{
"epoch": 0.19083594099240053,
"grad_norm": 1.255407468844781,
"learning_rate": 5.529042617405144e-07,
"loss": 1.7131,
"step": 1423
},
{
"epoch": 0.19097004917299956,
"grad_norm": 1.158464569939825,
"learning_rate": 5.528400447588247e-07,
"loss": 1.7756,
"step": 1424
},
{
"epoch": 0.19110415735359856,
"grad_norm": 1.0950885308074678,
"learning_rate": 5.527757882148672e-07,
"loss": 1.5582,
"step": 1425
},
{
"epoch": 0.1912382655341976,
"grad_norm": 1.1070256742947473,
"learning_rate": 5.527114921200501e-07,
"loss": 1.6467,
"step": 1426
},
{
"epoch": 0.1913723737147966,
"grad_norm": 1.0928498062976033,
"learning_rate": 5.52647156485788e-07,
"loss": 1.7125,
"step": 1427
},
{
"epoch": 0.19150648189539562,
"grad_norm": 1.1327469366060336,
"learning_rate": 5.525827813235029e-07,
"loss": 1.6743,
"step": 1428
},
{
"epoch": 0.19164059007599463,
"grad_norm": 1.0882012662709442,
"learning_rate": 5.525183666446239e-07,
"loss": 1.6799,
"step": 1429
},
{
"epoch": 0.19177469825659366,
"grad_norm": 1.1709943857735898,
"learning_rate": 5.524539124605868e-07,
"loss": 1.766,
"step": 1430
},
{
"epoch": 0.19190880643719266,
"grad_norm": 1.0839291014706198,
"learning_rate": 5.523894187828345e-07,
"loss": 1.6322,
"step": 1431
},
{
"epoch": 0.1920429146177917,
"grad_norm": 1.0975188778434444,
"learning_rate": 5.523248856228172e-07,
"loss": 1.7589,
"step": 1432
},
{
"epoch": 0.1921770227983907,
"grad_norm": 1.1022611138397802,
"learning_rate": 5.522603129919919e-07,
"loss": 1.6493,
"step": 1433
},
{
"epoch": 0.19231113097898972,
"grad_norm": 1.0944356638645014,
"learning_rate": 5.521957009018224e-07,
"loss": 1.6845,
"step": 1434
},
{
"epoch": 0.19244523915958872,
"grad_norm": 1.1206597827966063,
"learning_rate": 5.521310493637798e-07,
"loss": 1.6926,
"step": 1435
},
{
"epoch": 0.19257934734018775,
"grad_norm": 1.0956992634305383,
"learning_rate": 5.520663583893422e-07,
"loss": 1.6463,
"step": 1436
},
{
"epoch": 0.19271345552078678,
"grad_norm": 1.0831083719944854,
"learning_rate": 5.520016279899947e-07,
"loss": 1.599,
"step": 1437
},
{
"epoch": 0.19284756370138578,
"grad_norm": 1.391549260981187,
"learning_rate": 5.51936858177229e-07,
"loss": 1.6344,
"step": 1438
},
{
"epoch": 0.1929816718819848,
"grad_norm": 1.1524973265055787,
"learning_rate": 5.518720489625443e-07,
"loss": 1.7242,
"step": 1439
},
{
"epoch": 0.19311578006258381,
"grad_norm": 1.1802426876707486,
"learning_rate": 5.518072003574467e-07,
"loss": 1.6515,
"step": 1440
},
{
"epoch": 0.19324988824318284,
"grad_norm": 1.1402824833918361,
"learning_rate": 5.51742312373449e-07,
"loss": 1.8068,
"step": 1441
},
{
"epoch": 0.19338399642378185,
"grad_norm": 1.3034827789380141,
"learning_rate": 5.516773850220713e-07,
"loss": 1.5961,
"step": 1442
},
{
"epoch": 0.19351810460438088,
"grad_norm": 1.0690564805797904,
"learning_rate": 5.516124183148406e-07,
"loss": 1.6845,
"step": 1443
},
{
"epoch": 0.19365221278497988,
"grad_norm": 1.0643025118264189,
"learning_rate": 5.515474122632908e-07,
"loss": 1.6856,
"step": 1444
},
{
"epoch": 0.1937863209655789,
"grad_norm": 1.1264779418191524,
"learning_rate": 5.51482366878963e-07,
"loss": 1.6055,
"step": 1445
},
{
"epoch": 0.1939204291461779,
"grad_norm": 1.024225937952105,
"learning_rate": 5.51417282173405e-07,
"loss": 1.6615,
"step": 1446
},
{
"epoch": 0.19405453732677694,
"grad_norm": 1.161971897328525,
"learning_rate": 5.513521581581719e-07,
"loss": 1.6043,
"step": 1447
},
{
"epoch": 0.19418864550737594,
"grad_norm": 1.0885797045193277,
"learning_rate": 5.512869948448252e-07,
"loss": 1.701,
"step": 1448
},
{
"epoch": 0.19432275368797497,
"grad_norm": 1.1421314031719336,
"learning_rate": 5.512217922449342e-07,
"loss": 1.6471,
"step": 1449
},
{
"epoch": 0.19445686186857397,
"grad_norm": 1.077561352558914,
"learning_rate": 5.511565503700745e-07,
"loss": 1.7467,
"step": 1450
},
{
"epoch": 0.194590970049173,
"grad_norm": 1.1713587803273386,
"learning_rate": 5.51091269231829e-07,
"loss": 1.833,
"step": 1451
},
{
"epoch": 0.194725078229772,
"grad_norm": 1.1325441610620945,
"learning_rate": 5.510259488417875e-07,
"loss": 1.6516,
"step": 1452
},
{
"epoch": 0.19485918641037103,
"grad_norm": 1.105302401232543,
"learning_rate": 5.509605892115468e-07,
"loss": 1.6555,
"step": 1453
},
{
"epoch": 0.19499329459097006,
"grad_norm": 1.1082502770943088,
"learning_rate": 5.508951903527105e-07,
"loss": 1.6901,
"step": 1454
},
{
"epoch": 0.19512740277156906,
"grad_norm": 1.2100417158092283,
"learning_rate": 5.508297522768895e-07,
"loss": 1.7645,
"step": 1455
},
{
"epoch": 0.1952615109521681,
"grad_norm": 1.054087647701517,
"learning_rate": 5.507642749957011e-07,
"loss": 1.714,
"step": 1456
},
{
"epoch": 0.1953956191327671,
"grad_norm": 1.0560637240698765,
"learning_rate": 5.506987585207703e-07,
"loss": 1.6332,
"step": 1457
},
{
"epoch": 0.19552972731336613,
"grad_norm": 1.110689185152269,
"learning_rate": 5.506332028637285e-07,
"loss": 1.6175,
"step": 1458
},
{
"epoch": 0.19566383549396513,
"grad_norm": 1.0676099046686827,
"learning_rate": 5.505676080362142e-07,
"loss": 1.753,
"step": 1459
},
{
"epoch": 0.19579794367456416,
"grad_norm": 1.0306885085920625,
"learning_rate": 5.505019740498731e-07,
"loss": 1.5685,
"step": 1460
},
{
"epoch": 0.19593205185516316,
"grad_norm": 1.0775372740576943,
"learning_rate": 5.504363009163573e-07,
"loss": 1.6199,
"step": 1461
},
{
"epoch": 0.1960661600357622,
"grad_norm": 1.0643274573728114,
"learning_rate": 5.503705886473264e-07,
"loss": 1.6547,
"step": 1462
},
{
"epoch": 0.1962002682163612,
"grad_norm": 1.0711004226035805,
"learning_rate": 5.503048372544466e-07,
"loss": 1.7047,
"step": 1463
},
{
"epoch": 0.19633437639696022,
"grad_norm": 1.123667947934815,
"learning_rate": 5.502390467493915e-07,
"loss": 1.7008,
"step": 1464
},
{
"epoch": 0.19646848457755922,
"grad_norm": 1.0844329149084733,
"learning_rate": 5.501732171438408e-07,
"loss": 1.6279,
"step": 1465
},
{
"epoch": 0.19660259275815825,
"grad_norm": 1.436970815874584,
"learning_rate": 5.501073484494822e-07,
"loss": 1.6543,
"step": 1466
},
{
"epoch": 0.19673670093875725,
"grad_norm": 1.1579140829195231,
"learning_rate": 5.500414406780093e-07,
"loss": 1.6149,
"step": 1467
},
{
"epoch": 0.19687080911935628,
"grad_norm": 1.1219759034001007,
"learning_rate": 5.499754938411235e-07,
"loss": 1.6853,
"step": 1468
},
{
"epoch": 0.19700491729995528,
"grad_norm": 1.1456958318708046,
"learning_rate": 5.499095079505327e-07,
"loss": 1.6056,
"step": 1469
},
{
"epoch": 0.19713902548055431,
"grad_norm": 1.135367963109951,
"learning_rate": 5.498434830179519e-07,
"loss": 1.6775,
"step": 1470
},
{
"epoch": 0.19727313366115334,
"grad_norm": 1.0665068451667536,
"learning_rate": 5.497774190551028e-07,
"loss": 1.6953,
"step": 1471
},
{
"epoch": 0.19740724184175235,
"grad_norm": 1.0531212330423794,
"learning_rate": 5.497113160737142e-07,
"loss": 1.6531,
"step": 1472
},
{
"epoch": 0.19754135002235138,
"grad_norm": 1.1454744923401645,
"learning_rate": 5.496451740855217e-07,
"loss": 1.7061,
"step": 1473
},
{
"epoch": 0.19767545820295038,
"grad_norm": 1.1044037302229577,
"learning_rate": 5.49578993102268e-07,
"loss": 1.6111,
"step": 1474
},
{
"epoch": 0.1978095663835494,
"grad_norm": 1.0685087974547518,
"learning_rate": 5.495127731357029e-07,
"loss": 1.572,
"step": 1475
},
{
"epoch": 0.1979436745641484,
"grad_norm": 1.0974414948618096,
"learning_rate": 5.494465141975826e-07,
"loss": 1.6854,
"step": 1476
},
{
"epoch": 0.19807778274474744,
"grad_norm": 1.0834578832501205,
"learning_rate": 5.493802162996703e-07,
"loss": 1.6889,
"step": 1477
},
{
"epoch": 0.19821189092534644,
"grad_norm": 1.070274290599906,
"learning_rate": 5.493138794537367e-07,
"loss": 1.6939,
"step": 1478
},
{
"epoch": 0.19834599910594547,
"grad_norm": 1.115057911105637,
"learning_rate": 5.49247503671559e-07,
"loss": 1.6584,
"step": 1479
},
{
"epoch": 0.19848010728654447,
"grad_norm": 1.1561061527897827,
"learning_rate": 5.491810889649211e-07,
"loss": 1.7095,
"step": 1480
},
{
"epoch": 0.1986142154671435,
"grad_norm": 1.1456838684818837,
"learning_rate": 5.491146353456139e-07,
"loss": 1.5911,
"step": 1481
},
{
"epoch": 0.1987483236477425,
"grad_norm": 1.0828440940723576,
"learning_rate": 5.490481428254358e-07,
"loss": 1.6674,
"step": 1482
},
{
"epoch": 0.19888243182834153,
"grad_norm": 1.1636923332921367,
"learning_rate": 5.489816114161914e-07,
"loss": 1.7205,
"step": 1483
},
{
"epoch": 0.19901654000894053,
"grad_norm": 1.197166180009061,
"learning_rate": 5.489150411296926e-07,
"loss": 1.5965,
"step": 1484
},
{
"epoch": 0.19915064818953956,
"grad_norm": 1.9827106666547534,
"learning_rate": 5.488484319777578e-07,
"loss": 1.7469,
"step": 1485
},
{
"epoch": 0.19928475637013857,
"grad_norm": 1.140838885188788,
"learning_rate": 5.487817839722128e-07,
"loss": 1.7168,
"step": 1486
},
{
"epoch": 0.1994188645507376,
"grad_norm": 1.0817633006855307,
"learning_rate": 5.487150971248901e-07,
"loss": 1.5428,
"step": 1487
},
{
"epoch": 0.19955297273133663,
"grad_norm": 1.076792423128002,
"learning_rate": 5.486483714476288e-07,
"loss": 1.788,
"step": 1488
},
{
"epoch": 0.19968708091193563,
"grad_norm": 1.1267981548935038,
"learning_rate": 5.485816069522754e-07,
"loss": 1.692,
"step": 1489
},
{
"epoch": 0.19982118909253466,
"grad_norm": 1.0735390096180335,
"learning_rate": 5.485148036506829e-07,
"loss": 1.6896,
"step": 1490
},
{
"epoch": 0.19995529727313366,
"grad_norm": 1.067799342487284,
"learning_rate": 5.484479615547114e-07,
"loss": 1.5558,
"step": 1491
},
{
"epoch": 0.2000894054537327,
"grad_norm": 1.134188777380917,
"learning_rate": 5.483810806762278e-07,
"loss": 1.6667,
"step": 1492
},
{
"epoch": 0.2002235136343317,
"grad_norm": 1.0312169428441251,
"learning_rate": 5.483141610271059e-07,
"loss": 1.5311,
"step": 1493
},
{
"epoch": 0.20035762181493072,
"grad_norm": 1.113434318828811,
"learning_rate": 5.482472026192263e-07,
"loss": 1.662,
"step": 1494
},
{
"epoch": 0.20049172999552972,
"grad_norm": 1.0830554984993648,
"learning_rate": 5.481802054644767e-07,
"loss": 1.6549,
"step": 1495
},
{
"epoch": 0.20062583817612875,
"grad_norm": 1.1263172768039542,
"learning_rate": 5.481131695747516e-07,
"loss": 1.7273,
"step": 1496
},
{
"epoch": 0.20075994635672775,
"grad_norm": 1.0175973585933547,
"learning_rate": 5.480460949619521e-07,
"loss": 1.6573,
"step": 1497
},
{
"epoch": 0.20089405453732678,
"grad_norm": 1.0684638665771677,
"learning_rate": 5.479789816379866e-07,
"loss": 1.5783,
"step": 1498
},
{
"epoch": 0.20102816271792578,
"grad_norm": 1.100911731230959,
"learning_rate": 5.479118296147701e-07,
"loss": 1.7139,
"step": 1499
},
{
"epoch": 0.20116227089852481,
"grad_norm": 1.0645364314712737,
"learning_rate": 5.478446389042245e-07,
"loss": 1.6684,
"step": 1500
},
{
"epoch": 0.20129637907912382,
"grad_norm": 1.0556389823591241,
"learning_rate": 5.477774095182787e-07,
"loss": 1.5132,
"step": 1501
},
{
"epoch": 0.20143048725972285,
"grad_norm": 1.2334210157237786,
"learning_rate": 5.477101414688683e-07,
"loss": 1.6951,
"step": 1502
},
{
"epoch": 0.20156459544032185,
"grad_norm": 1.058485353217571,
"learning_rate": 5.47642834767936e-07,
"loss": 1.6295,
"step": 1503
},
{
"epoch": 0.20169870362092088,
"grad_norm": 1.0445219837933504,
"learning_rate": 5.475754894274309e-07,
"loss": 1.6173,
"step": 1504
},
{
"epoch": 0.20183281180151988,
"grad_norm": 1.1004187774444296,
"learning_rate": 5.475081054593096e-07,
"loss": 1.739,
"step": 1505
},
{
"epoch": 0.2019669199821189,
"grad_norm": 1.1602467124536924,
"learning_rate": 5.47440682875535e-07,
"loss": 1.6625,
"step": 1506
},
{
"epoch": 0.20210102816271794,
"grad_norm": 1.0567141838600442,
"learning_rate": 5.47373221688077e-07,
"loss": 1.7637,
"step": 1507
},
{
"epoch": 0.20223513634331694,
"grad_norm": 1.1231422155189525,
"learning_rate": 5.473057219089128e-07,
"loss": 1.6322,
"step": 1508
},
{
"epoch": 0.20236924452391597,
"grad_norm": 1.090099414447627,
"learning_rate": 5.472381835500258e-07,
"loss": 1.7463,
"step": 1509
},
{
"epoch": 0.20250335270451497,
"grad_norm": 1.036240114395212,
"learning_rate": 5.471706066234064e-07,
"loss": 1.5938,
"step": 1510
},
{
"epoch": 0.202637460885114,
"grad_norm": 1.0971271814274632,
"learning_rate": 5.471029911410524e-07,
"loss": 1.729,
"step": 1511
},
{
"epoch": 0.202771569065713,
"grad_norm": 1.0884227452009132,
"learning_rate": 5.470353371149678e-07,
"loss": 1.6752,
"step": 1512
},
{
"epoch": 0.20290567724631203,
"grad_norm": 1.0387697751366196,
"learning_rate": 5.469676445571636e-07,
"loss": 1.6329,
"step": 1513
},
{
"epoch": 0.20303978542691103,
"grad_norm": 1.0513306520797294,
"learning_rate": 5.468999134796577e-07,
"loss": 1.7112,
"step": 1514
},
{
"epoch": 0.20317389360751006,
"grad_norm": 1.0894137924530085,
"learning_rate": 5.46832143894475e-07,
"loss": 1.6982,
"step": 1515
},
{
"epoch": 0.20330800178810907,
"grad_norm": 1.0770824339698073,
"learning_rate": 5.467643358136469e-07,
"loss": 1.7484,
"step": 1516
},
{
"epoch": 0.2034421099687081,
"grad_norm": 1.095801657453924,
"learning_rate": 5.466964892492119e-07,
"loss": 1.6417,
"step": 1517
},
{
"epoch": 0.2035762181493071,
"grad_norm": 1.0796491311299437,
"learning_rate": 5.466286042132154e-07,
"loss": 1.701,
"step": 1518
},
{
"epoch": 0.20371032632990613,
"grad_norm": 1.1233329399203666,
"learning_rate": 5.465606807177093e-07,
"loss": 1.7951,
"step": 1519
},
{
"epoch": 0.20384443451050513,
"grad_norm": 1.1327885765244115,
"learning_rate": 5.464927187747525e-07,
"loss": 1.7971,
"step": 1520
},
{
"epoch": 0.20397854269110416,
"grad_norm": 1.088717235432573,
"learning_rate": 5.464247183964108e-07,
"loss": 1.7474,
"step": 1521
},
{
"epoch": 0.20411265087170316,
"grad_norm": 1.1850510030757087,
"learning_rate": 5.463566795947566e-07,
"loss": 1.755,
"step": 1522
},
{
"epoch": 0.2042467590523022,
"grad_norm": 1.0812752508540497,
"learning_rate": 5.462886023818697e-07,
"loss": 1.7443,
"step": 1523
},
{
"epoch": 0.20438086723290122,
"grad_norm": 1.1119200217165637,
"learning_rate": 5.462204867698359e-07,
"loss": 1.7364,
"step": 1524
},
{
"epoch": 0.20451497541350022,
"grad_norm": 1.0637313799778825,
"learning_rate": 5.461523327707483e-07,
"loss": 1.6503,
"step": 1525
},
{
"epoch": 0.20464908359409925,
"grad_norm": 1.0673393108107518,
"learning_rate": 5.460841403967067e-07,
"loss": 1.7131,
"step": 1526
},
{
"epoch": 0.20478319177469825,
"grad_norm": 1.1295826465075736,
"learning_rate": 5.46015909659818e-07,
"loss": 1.6669,
"step": 1527
},
{
"epoch": 0.20491729995529728,
"grad_norm": 1.037795106149209,
"learning_rate": 5.459476405721954e-07,
"loss": 1.7402,
"step": 1528
},
{
"epoch": 0.20505140813589628,
"grad_norm": 1.0645070431850514,
"learning_rate": 5.458793331459591e-07,
"loss": 1.5445,
"step": 1529
},
{
"epoch": 0.20518551631649531,
"grad_norm": 1.128995508468257,
"learning_rate": 5.458109873932364e-07,
"loss": 1.648,
"step": 1530
},
{
"epoch": 0.20531962449709432,
"grad_norm": 1.1073104845376167,
"learning_rate": 5.45742603326161e-07,
"loss": 1.6629,
"step": 1531
},
{
"epoch": 0.20545373267769335,
"grad_norm": 1.0389720964404514,
"learning_rate": 5.456741809568737e-07,
"loss": 1.6007,
"step": 1532
},
{
"epoch": 0.20558784085829235,
"grad_norm": 1.0874355308974621,
"learning_rate": 5.456057202975218e-07,
"loss": 1.7692,
"step": 1533
},
{
"epoch": 0.20572194903889138,
"grad_norm": 1.1762066099415274,
"learning_rate": 5.455372213602598e-07,
"loss": 1.7199,
"step": 1534
},
{
"epoch": 0.20585605721949038,
"grad_norm": 1.1248545879023728,
"learning_rate": 5.454686841572487e-07,
"loss": 1.6949,
"step": 1535
},
{
"epoch": 0.2059901654000894,
"grad_norm": 1.1062297817819333,
"learning_rate": 5.454001087006563e-07,
"loss": 1.6879,
"step": 1536
},
{
"epoch": 0.2061242735806884,
"grad_norm": 1.5278212260735322,
"learning_rate": 5.453314950026572e-07,
"loss": 1.6452,
"step": 1537
},
{
"epoch": 0.20625838176128744,
"grad_norm": 1.1382568321141864,
"learning_rate": 5.452628430754329e-07,
"loss": 1.6296,
"step": 1538
},
{
"epoch": 0.20639248994188644,
"grad_norm": 1.0827447066590228,
"learning_rate": 5.451941529311719e-07,
"loss": 1.6213,
"step": 1539
},
{
"epoch": 0.20652659812248547,
"grad_norm": 1.090225177526994,
"learning_rate": 5.451254245820687e-07,
"loss": 1.7525,
"step": 1540
},
{
"epoch": 0.2066607063030845,
"grad_norm": 1.1632282700056857,
"learning_rate": 5.450566580403255e-07,
"loss": 1.7183,
"step": 1541
},
{
"epoch": 0.2067948144836835,
"grad_norm": 1.0773895407601781,
"learning_rate": 5.449878533181507e-07,
"loss": 1.5786,
"step": 1542
},
{
"epoch": 0.20692892266428253,
"grad_norm": 1.1177081269020515,
"learning_rate": 5.449190104277597e-07,
"loss": 1.6153,
"step": 1543
},
{
"epoch": 0.20706303084488153,
"grad_norm": 1.0715060717734257,
"learning_rate": 5.448501293813747e-07,
"loss": 1.6768,
"step": 1544
},
{
"epoch": 0.20719713902548056,
"grad_norm": 1.0810287574993174,
"learning_rate": 5.447812101912244e-07,
"loss": 1.6401,
"step": 1545
},
{
"epoch": 0.20733124720607957,
"grad_norm": 1.130608952204106,
"learning_rate": 5.447122528695449e-07,
"loss": 1.6824,
"step": 1546
},
{
"epoch": 0.2074653553866786,
"grad_norm": 1.0467682842596422,
"learning_rate": 5.446432574285782e-07,
"loss": 1.6087,
"step": 1547
},
{
"epoch": 0.2075994635672776,
"grad_norm": 1.139618228642282,
"learning_rate": 5.445742238805737e-07,
"loss": 1.7645,
"step": 1548
},
{
"epoch": 0.20773357174787663,
"grad_norm": 1.1216742451759847,
"learning_rate": 5.445051522377873e-07,
"loss": 1.7316,
"step": 1549
},
{
"epoch": 0.20786767992847563,
"grad_norm": 1.0899102977167905,
"learning_rate": 5.44436042512482e-07,
"loss": 1.6322,
"step": 1550
},
{
"epoch": 0.20800178810907466,
"grad_norm": 1.0497718485142342,
"learning_rate": 5.44366894716927e-07,
"loss": 1.6566,
"step": 1551
},
{
"epoch": 0.20813589628967366,
"grad_norm": 1.0712432967566454,
"learning_rate": 5.442977088633988e-07,
"loss": 1.6461,
"step": 1552
},
{
"epoch": 0.2082700044702727,
"grad_norm": 1.1933916778735016,
"learning_rate": 5.442284849641803e-07,
"loss": 1.7043,
"step": 1553
},
{
"epoch": 0.2084041126508717,
"grad_norm": 1.0126599257311222,
"learning_rate": 5.441592230315611e-07,
"loss": 1.6054,
"step": 1554
},
{
"epoch": 0.20853822083147072,
"grad_norm": 1.3982183722799013,
"learning_rate": 5.440899230778381e-07,
"loss": 1.6898,
"step": 1555
},
{
"epoch": 0.20867232901206972,
"grad_norm": 1.056858598949215,
"learning_rate": 5.440205851153145e-07,
"loss": 1.6916,
"step": 1556
},
{
"epoch": 0.20880643719266875,
"grad_norm": 1.176924033372761,
"learning_rate": 5.439512091563e-07,
"loss": 1.7511,
"step": 1557
},
{
"epoch": 0.20894054537326778,
"grad_norm": 1.057297882595847,
"learning_rate": 5.438817952131117e-07,
"loss": 1.6588,
"step": 1558
},
{
"epoch": 0.20907465355386678,
"grad_norm": 1.0967767040598801,
"learning_rate": 5.43812343298073e-07,
"loss": 1.6058,
"step": 1559
},
{
"epoch": 0.20920876173446581,
"grad_norm": 1.0935764349197725,
"learning_rate": 5.437428534235142e-07,
"loss": 1.7097,
"step": 1560
},
{
"epoch": 0.20934286991506482,
"grad_norm": 1.1271250900157348,
"learning_rate": 5.436733256017723e-07,
"loss": 1.6236,
"step": 1561
},
{
"epoch": 0.20947697809566385,
"grad_norm": 1.1745352200541934,
"learning_rate": 5.43603759845191e-07,
"loss": 1.6031,
"step": 1562
},
{
"epoch": 0.20961108627626285,
"grad_norm": 1.110585453023522,
"learning_rate": 5.435341561661208e-07,
"loss": 1.6934,
"step": 1563
},
{
"epoch": 0.20974519445686188,
"grad_norm": 1.1030892366405238,
"learning_rate": 5.434645145769189e-07,
"loss": 1.6745,
"step": 1564
},
{
"epoch": 0.20987930263746088,
"grad_norm": 1.0681781865728208,
"learning_rate": 5.433948350899491e-07,
"loss": 1.6327,
"step": 1565
},
{
"epoch": 0.2100134108180599,
"grad_norm": 1.1588290451716836,
"learning_rate": 5.433251177175822e-07,
"loss": 1.6737,
"step": 1566
},
{
"epoch": 0.2101475189986589,
"grad_norm": 1.055357765245883,
"learning_rate": 5.432553624721957e-07,
"loss": 1.6018,
"step": 1567
},
{
"epoch": 0.21028162717925794,
"grad_norm": 1.2241168862848832,
"learning_rate": 5.431855693661734e-07,
"loss": 1.6702,
"step": 1568
},
{
"epoch": 0.21041573535985694,
"grad_norm": 1.0592720322600389,
"learning_rate": 5.431157384119064e-07,
"loss": 1.6243,
"step": 1569
},
{
"epoch": 0.21054984354045597,
"grad_norm": 1.0780860574912356,
"learning_rate": 5.43045869621792e-07,
"loss": 1.5921,
"step": 1570
},
{
"epoch": 0.21068395172105497,
"grad_norm": 1.0964102584808006,
"learning_rate": 5.429759630082348e-07,
"loss": 1.6461,
"step": 1571
},
{
"epoch": 0.210818059901654,
"grad_norm": 1.135891674611892,
"learning_rate": 5.429060185836456e-07,
"loss": 1.6602,
"step": 1572
},
{
"epoch": 0.210952168082253,
"grad_norm": 1.104678715415077,
"learning_rate": 5.42836036360442e-07,
"loss": 1.5908,
"step": 1573
},
{
"epoch": 0.21108627626285204,
"grad_norm": 1.1405223716065391,
"learning_rate": 5.427660163510486e-07,
"loss": 1.6062,
"step": 1574
},
{
"epoch": 0.21122038444345104,
"grad_norm": 1.055115497261772,
"learning_rate": 5.426959585678964e-07,
"loss": 1.614,
"step": 1575
},
{
"epoch": 0.21135449262405007,
"grad_norm": 1.0866284593737212,
"learning_rate": 5.426258630234232e-07,
"loss": 1.623,
"step": 1576
},
{
"epoch": 0.2114886008046491,
"grad_norm": 1.1082738074471385,
"learning_rate": 5.425557297300736e-07,
"loss": 1.6905,
"step": 1577
},
{
"epoch": 0.2116227089852481,
"grad_norm": 1.0561977130172522,
"learning_rate": 5.424855587002988e-07,
"loss": 1.7265,
"step": 1578
},
{
"epoch": 0.21175681716584713,
"grad_norm": 1.111034072593952,
"learning_rate": 5.424153499465566e-07,
"loss": 1.5797,
"step": 1579
},
{
"epoch": 0.21189092534644613,
"grad_norm": 1.110485425151033,
"learning_rate": 5.42345103481312e-07,
"loss": 1.7321,
"step": 1580
},
{
"epoch": 0.21202503352704516,
"grad_norm": 1.057458554660141,
"learning_rate": 5.42274819317036e-07,
"loss": 1.6052,
"step": 1581
},
{
"epoch": 0.21215914170764416,
"grad_norm": 1.0759547522338926,
"learning_rate": 5.422044974662066e-07,
"loss": 1.5403,
"step": 1582
},
{
"epoch": 0.2122932498882432,
"grad_norm": 1.09889881778652,
"learning_rate": 5.421341379413087e-07,
"loss": 1.6477,
"step": 1583
},
{
"epoch": 0.2124273580688422,
"grad_norm": 1.0824182868909191,
"learning_rate": 5.420637407548336e-07,
"loss": 1.6666,
"step": 1584
},
{
"epoch": 0.21256146624944122,
"grad_norm": 1.1246790227619754,
"learning_rate": 5.419933059192792e-07,
"loss": 1.7284,
"step": 1585
},
{
"epoch": 0.21269557443004022,
"grad_norm": 1.1784965009347046,
"learning_rate": 5.419228334471505e-07,
"loss": 1.6751,
"step": 1586
},
{
"epoch": 0.21282968261063925,
"grad_norm": 1.0981401155317758,
"learning_rate": 5.418523233509588e-07,
"loss": 1.5569,
"step": 1587
},
{
"epoch": 0.21296379079123826,
"grad_norm": 1.059671249600233,
"learning_rate": 5.417817756432223e-07,
"loss": 1.6094,
"step": 1588
},
{
"epoch": 0.21309789897183729,
"grad_norm": 1.0850751309161322,
"learning_rate": 5.417111903364658e-07,
"loss": 1.6205,
"step": 1589
},
{
"epoch": 0.2132320071524363,
"grad_norm": 1.1513764671534936,
"learning_rate": 5.416405674432208e-07,
"loss": 1.6778,
"step": 1590
},
{
"epoch": 0.21336611533303532,
"grad_norm": 1.0380273585127677,
"learning_rate": 5.415699069760254e-07,
"loss": 1.6195,
"step": 1591
},
{
"epoch": 0.21350022351363432,
"grad_norm": 1.166702747823365,
"learning_rate": 5.414992089474245e-07,
"loss": 1.6814,
"step": 1592
},
{
"epoch": 0.21363433169423335,
"grad_norm": 1.1893324397979834,
"learning_rate": 5.414284733699695e-07,
"loss": 1.773,
"step": 1593
},
{
"epoch": 0.21376843987483238,
"grad_norm": 1.1127641897298384,
"learning_rate": 5.413577002562186e-07,
"loss": 1.7076,
"step": 1594
},
{
"epoch": 0.21390254805543138,
"grad_norm": 1.080383382708094,
"learning_rate": 5.412868896187365e-07,
"loss": 1.7324,
"step": 1595
},
{
"epoch": 0.2140366562360304,
"grad_norm": 1.0952540724207267,
"learning_rate": 5.412160414700948e-07,
"loss": 1.7437,
"step": 1596
},
{
"epoch": 0.2141707644166294,
"grad_norm": 1.153542257175551,
"learning_rate": 5.411451558228716e-07,
"loss": 1.7386,
"step": 1597
},
{
"epoch": 0.21430487259722844,
"grad_norm": 1.111562609679836,
"learning_rate": 5.410742326896519e-07,
"loss": 1.6339,
"step": 1598
},
{
"epoch": 0.21443898077782744,
"grad_norm": 1.0752256282606487,
"learning_rate": 5.410032720830268e-07,
"loss": 1.6502,
"step": 1599
},
{
"epoch": 0.21457308895842647,
"grad_norm": 1.1124138961511616,
"learning_rate": 5.409322740155947e-07,
"loss": 1.6977,
"step": 1600
},
{
"epoch": 0.21470719713902547,
"grad_norm": 1.1079557958778445,
"learning_rate": 5.408612384999601e-07,
"loss": 1.752,
"step": 1601
},
{
"epoch": 0.2148413053196245,
"grad_norm": 1.0753628455770323,
"learning_rate": 5.407901655487346e-07,
"loss": 1.6314,
"step": 1602
},
{
"epoch": 0.2149754135002235,
"grad_norm": 1.083459999091914,
"learning_rate": 5.407190551745362e-07,
"loss": 1.6034,
"step": 1603
},
{
"epoch": 0.21510952168082254,
"grad_norm": 1.0970151998487565,
"learning_rate": 5.406479073899896e-07,
"loss": 1.6246,
"step": 1604
},
{
"epoch": 0.21524362986142154,
"grad_norm": 1.0937201976032398,
"learning_rate": 5.405767222077262e-07,
"loss": 1.7172,
"step": 1605
},
{
"epoch": 0.21537773804202057,
"grad_norm": 1.0450933325728613,
"learning_rate": 5.405054996403838e-07,
"loss": 1.6418,
"step": 1606
},
{
"epoch": 0.21551184622261957,
"grad_norm": 1.1080460169200497,
"learning_rate": 5.40434239700607e-07,
"loss": 1.5472,
"step": 1607
},
{
"epoch": 0.2156459544032186,
"grad_norm": 1.1272243483080113,
"learning_rate": 5.403629424010473e-07,
"loss": 1.6365,
"step": 1608
},
{
"epoch": 0.2157800625838176,
"grad_norm": 1.0764797457941864,
"learning_rate": 5.402916077543625e-07,
"loss": 1.6407,
"step": 1609
},
{
"epoch": 0.21591417076441663,
"grad_norm": 1.113524889126991,
"learning_rate": 5.402202357732169e-07,
"loss": 1.6827,
"step": 1610
},
{
"epoch": 0.21604827894501566,
"grad_norm": 1.0108430825355625,
"learning_rate": 5.40148826470282e-07,
"loss": 1.6089,
"step": 1611
},
{
"epoch": 0.21618238712561466,
"grad_norm": 1.0591615486944377,
"learning_rate": 5.400773798582352e-07,
"loss": 1.6503,
"step": 1612
},
{
"epoch": 0.2163164953062137,
"grad_norm": 1.0340063487662052,
"learning_rate": 5.400058959497611e-07,
"loss": 1.6383,
"step": 1613
},
{
"epoch": 0.2164506034868127,
"grad_norm": 1.1516572358715267,
"learning_rate": 5.399343747575507e-07,
"loss": 1.6974,
"step": 1614
},
{
"epoch": 0.21658471166741172,
"grad_norm": 1.0592103543406746,
"learning_rate": 5.398628162943016e-07,
"loss": 1.6353,
"step": 1615
},
{
"epoch": 0.21671881984801072,
"grad_norm": 1.0385313908985447,
"learning_rate": 5.39791220572718e-07,
"loss": 1.6162,
"step": 1616
},
{
"epoch": 0.21685292802860975,
"grad_norm": 1.2744072569777416,
"learning_rate": 5.397195876055107e-07,
"loss": 1.6091,
"step": 1617
},
{
"epoch": 0.21698703620920876,
"grad_norm": 1.1238614219371639,
"learning_rate": 5.396479174053974e-07,
"loss": 1.6806,
"step": 1618
},
{
"epoch": 0.21712114438980779,
"grad_norm": 1.1243988511377025,
"learning_rate": 5.39576209985102e-07,
"loss": 1.6404,
"step": 1619
},
{
"epoch": 0.2172552525704068,
"grad_norm": 1.110274303539327,
"learning_rate": 5.395044653573553e-07,
"loss": 1.7572,
"step": 1620
},
{
"epoch": 0.21738936075100582,
"grad_norm": 1.485784445158895,
"learning_rate": 5.394326835348946e-07,
"loss": 1.6521,
"step": 1621
},
{
"epoch": 0.21752346893160482,
"grad_norm": 1.1075544133593012,
"learning_rate": 5.393608645304638e-07,
"loss": 1.6241,
"step": 1622
},
{
"epoch": 0.21765757711220385,
"grad_norm": 1.1036354518105045,
"learning_rate": 5.392890083568133e-07,
"loss": 1.7734,
"step": 1623
},
{
"epoch": 0.21779168529280285,
"grad_norm": 1.1528361438777202,
"learning_rate": 5.392171150267002e-07,
"loss": 1.6317,
"step": 1624
},
{
"epoch": 0.21792579347340188,
"grad_norm": 1.093945976907915,
"learning_rate": 5.391451845528883e-07,
"loss": 1.6645,
"step": 1625
},
{
"epoch": 0.21805990165400088,
"grad_norm": 1.0725853841774324,
"learning_rate": 5.390732169481478e-07,
"loss": 1.6491,
"step": 1626
},
{
"epoch": 0.2181940098345999,
"grad_norm": 1.1106862604843828,
"learning_rate": 5.390012122252557e-07,
"loss": 1.6931,
"step": 1627
},
{
"epoch": 0.21832811801519894,
"grad_norm": 1.2277327010437984,
"learning_rate": 5.389291703969954e-07,
"loss": 1.6584,
"step": 1628
},
{
"epoch": 0.21846222619579794,
"grad_norm": 1.1082783806832028,
"learning_rate": 5.388570914761571e-07,
"loss": 1.6083,
"step": 1629
},
{
"epoch": 0.21859633437639697,
"grad_norm": 1.0835070473943422,
"learning_rate": 5.387849754755371e-07,
"loss": 1.6693,
"step": 1630
},
{
"epoch": 0.21873044255699597,
"grad_norm": 1.0984810480873552,
"learning_rate": 5.38712822407939e-07,
"loss": 1.7465,
"step": 1631
},
{
"epoch": 0.218864550737595,
"grad_norm": 1.0824052521651053,
"learning_rate": 5.386406322861723e-07,
"loss": 1.6514,
"step": 1632
},
{
"epoch": 0.218998658918194,
"grad_norm": 1.1359714482507233,
"learning_rate": 5.385684051230537e-07,
"loss": 1.7069,
"step": 1633
},
{
"epoch": 0.21913276709879304,
"grad_norm": 1.1071556040519455,
"learning_rate": 5.384961409314061e-07,
"loss": 1.7147,
"step": 1634
},
{
"epoch": 0.21926687527939204,
"grad_norm": 1.2083127255075479,
"learning_rate": 5.384238397240588e-07,
"loss": 1.6825,
"step": 1635
},
{
"epoch": 0.21940098345999107,
"grad_norm": 1.090487031491975,
"learning_rate": 5.383515015138481e-07,
"loss": 1.6754,
"step": 1636
},
{
"epoch": 0.21953509164059007,
"grad_norm": 1.1766814612885304,
"learning_rate": 5.382791263136168e-07,
"loss": 1.6694,
"step": 1637
},
{
"epoch": 0.2196691998211891,
"grad_norm": 1.122843389486521,
"learning_rate": 5.382067141362139e-07,
"loss": 1.6044,
"step": 1638
},
{
"epoch": 0.2198033080017881,
"grad_norm": 1.223339411577744,
"learning_rate": 5.381342649944952e-07,
"loss": 1.6101,
"step": 1639
},
{
"epoch": 0.21993741618238713,
"grad_norm": 1.0694591790206647,
"learning_rate": 5.380617789013233e-07,
"loss": 1.6867,
"step": 1640
},
{
"epoch": 0.22007152436298613,
"grad_norm": 1.2184481374104812,
"learning_rate": 5.379892558695671e-07,
"loss": 1.8251,
"step": 1641
},
{
"epoch": 0.22020563254358516,
"grad_norm": 1.144903181431307,
"learning_rate": 5.37916695912102e-07,
"loss": 1.6531,
"step": 1642
},
{
"epoch": 0.22033974072418416,
"grad_norm": 1.0887276474568761,
"learning_rate": 5.378440990418099e-07,
"loss": 1.6042,
"step": 1643
},
{
"epoch": 0.2204738489047832,
"grad_norm": 1.0674234053275629,
"learning_rate": 5.377714652715797e-07,
"loss": 1.6711,
"step": 1644
},
{
"epoch": 0.2206079570853822,
"grad_norm": 1.0790696186946844,
"learning_rate": 5.376987946143065e-07,
"loss": 1.6381,
"step": 1645
},
{
"epoch": 0.22074206526598122,
"grad_norm": 1.1045544089627806,
"learning_rate": 5.376260870828918e-07,
"loss": 1.6532,
"step": 1646
},
{
"epoch": 0.22087617344658025,
"grad_norm": 1.1325732851922752,
"learning_rate": 5.375533426902441e-07,
"loss": 1.698,
"step": 1647
},
{
"epoch": 0.22101028162717926,
"grad_norm": 1.1364383071296065,
"learning_rate": 5.37480561449278e-07,
"loss": 1.6822,
"step": 1648
},
{
"epoch": 0.22114438980777829,
"grad_norm": 1.2662493806229793,
"learning_rate": 5.374077433729149e-07,
"loss": 1.6811,
"step": 1649
},
{
"epoch": 0.2212784979883773,
"grad_norm": 1.0631367908379292,
"learning_rate": 5.373348884740827e-07,
"loss": 1.6659,
"step": 1650
},
{
"epoch": 0.22141260616897632,
"grad_norm": 1.041940858543604,
"learning_rate": 5.372619967657157e-07,
"loss": 1.6331,
"step": 1651
},
{
"epoch": 0.22154671434957532,
"grad_norm": 1.1280546628953805,
"learning_rate": 5.37189068260755e-07,
"loss": 1.56,
"step": 1652
},
{
"epoch": 0.22168082253017435,
"grad_norm": 1.1849258060825412,
"learning_rate": 5.371161029721481e-07,
"loss": 1.7092,
"step": 1653
},
{
"epoch": 0.22181493071077335,
"grad_norm": 1.049528339776241,
"learning_rate": 5.370431009128489e-07,
"loss": 1.6428,
"step": 1654
},
{
"epoch": 0.22194903889137238,
"grad_norm": 1.0820046738092695,
"learning_rate": 5.36970062095818e-07,
"loss": 1.7025,
"step": 1655
},
{
"epoch": 0.22208314707197138,
"grad_norm": 1.154353230216256,
"learning_rate": 5.368969865340224e-07,
"loss": 1.6826,
"step": 1656
},
{
"epoch": 0.2222172552525704,
"grad_norm": 1.053650977152218,
"learning_rate": 5.368238742404357e-07,
"loss": 1.6172,
"step": 1657
},
{
"epoch": 0.2223513634331694,
"grad_norm": 1.1279575224119966,
"learning_rate": 5.367507252280381e-07,
"loss": 1.6856,
"step": 1658
},
{
"epoch": 0.22248547161376844,
"grad_norm": 1.084009451627439,
"learning_rate": 5.36677539509816e-07,
"loss": 1.7398,
"step": 1659
},
{
"epoch": 0.22261957979436744,
"grad_norm": 1.1545149862581074,
"learning_rate": 5.366043170987628e-07,
"loss": 1.7321,
"step": 1660
},
{
"epoch": 0.22275368797496647,
"grad_norm": 1.1304140083027916,
"learning_rate": 5.365310580078781e-07,
"loss": 1.773,
"step": 1661
},
{
"epoch": 0.22288779615556548,
"grad_norm": 1.0642630051886424,
"learning_rate": 5.364577622501681e-07,
"loss": 1.711,
"step": 1662
},
{
"epoch": 0.2230219043361645,
"grad_norm": 1.040347865228387,
"learning_rate": 5.363844298386453e-07,
"loss": 1.631,
"step": 1663
},
{
"epoch": 0.22315601251676354,
"grad_norm": 1.0625862966142028,
"learning_rate": 5.36311060786329e-07,
"loss": 1.7056,
"step": 1664
},
{
"epoch": 0.22329012069736254,
"grad_norm": 1.051398453698011,
"learning_rate": 5.36237655106245e-07,
"loss": 1.5779,
"step": 1665
},
{
"epoch": 0.22342422887796157,
"grad_norm": 1.0373708741511485,
"learning_rate": 5.361642128114253e-07,
"loss": 1.6937,
"step": 1666
},
{
"epoch": 0.22355833705856057,
"grad_norm": 1.0970775365230832,
"learning_rate": 5.360907339149088e-07,
"loss": 1.7652,
"step": 1667
},
{
"epoch": 0.2236924452391596,
"grad_norm": 1.0939499626158076,
"learning_rate": 5.360172184297405e-07,
"loss": 1.7164,
"step": 1668
},
{
"epoch": 0.2238265534197586,
"grad_norm": 1.2815989841015132,
"learning_rate": 5.359436663689721e-07,
"loss": 1.6641,
"step": 1669
},
{
"epoch": 0.22396066160035763,
"grad_norm": 1.143698149806719,
"learning_rate": 5.358700777456621e-07,
"loss": 1.6344,
"step": 1670
},
{
"epoch": 0.22409476978095663,
"grad_norm": 1.1716879090974532,
"learning_rate": 5.357964525728747e-07,
"loss": 1.6979,
"step": 1671
},
{
"epoch": 0.22422887796155566,
"grad_norm": 1.063819709741502,
"learning_rate": 5.357227908636814e-07,
"loss": 1.624,
"step": 1672
},
{
"epoch": 0.22436298614215466,
"grad_norm": 1.2013467122145707,
"learning_rate": 5.356490926311598e-07,
"loss": 1.6952,
"step": 1673
},
{
"epoch": 0.2244970943227537,
"grad_norm": 1.0555387980604758,
"learning_rate": 5.355753578883939e-07,
"loss": 1.6313,
"step": 1674
},
{
"epoch": 0.2246312025033527,
"grad_norm": 1.0893242689976388,
"learning_rate": 5.355015866484744e-07,
"loss": 1.6749,
"step": 1675
},
{
"epoch": 0.22476531068395172,
"grad_norm": 1.1013312078930966,
"learning_rate": 5.354277789244984e-07,
"loss": 1.6346,
"step": 1676
},
{
"epoch": 0.22489941886455073,
"grad_norm": 1.0396725082524636,
"learning_rate": 5.353539347295696e-07,
"loss": 1.6516,
"step": 1677
},
{
"epoch": 0.22503352704514976,
"grad_norm": 1.1068093515212976,
"learning_rate": 5.352800540767978e-07,
"loss": 1.6229,
"step": 1678
},
{
"epoch": 0.22516763522574876,
"grad_norm": 1.0984721962823492,
"learning_rate": 5.352061369792997e-07,
"loss": 1.6208,
"step": 1679
},
{
"epoch": 0.2253017434063478,
"grad_norm": 1.0826869933413177,
"learning_rate": 5.351321834501981e-07,
"loss": 1.677,
"step": 1680
},
{
"epoch": 0.22543585158694682,
"grad_norm": 1.084000373067938,
"learning_rate": 5.350581935026227e-07,
"loss": 1.7401,
"step": 1681
},
{
"epoch": 0.22556995976754582,
"grad_norm": 1.0851285225408938,
"learning_rate": 5.349841671497093e-07,
"loss": 1.7231,
"step": 1682
},
{
"epoch": 0.22570406794814485,
"grad_norm": 1.1364065037848023,
"learning_rate": 5.349101044046004e-07,
"loss": 1.6977,
"step": 1683
},
{
"epoch": 0.22583817612874385,
"grad_norm": 1.1009000528239055,
"learning_rate": 5.348360052804447e-07,
"loss": 1.7396,
"step": 1684
},
{
"epoch": 0.22597228430934288,
"grad_norm": 1.0627127199486133,
"learning_rate": 5.347618697903976e-07,
"loss": 1.6,
"step": 1685
},
{
"epoch": 0.22610639248994188,
"grad_norm": 1.0936508465446555,
"learning_rate": 5.346876979476206e-07,
"loss": 1.6898,
"step": 1686
},
{
"epoch": 0.2262405006705409,
"grad_norm": 1.158039404421018,
"learning_rate": 5.346134897652824e-07,
"loss": 1.6173,
"step": 1687
},
{
"epoch": 0.2263746088511399,
"grad_norm": 1.1476901068480616,
"learning_rate": 5.345392452565574e-07,
"loss": 1.6939,
"step": 1688
},
{
"epoch": 0.22650871703173894,
"grad_norm": 1.1331738396979525,
"learning_rate": 5.344649644346266e-07,
"loss": 1.7156,
"step": 1689
},
{
"epoch": 0.22664282521233794,
"grad_norm": 1.0799876163240634,
"learning_rate": 5.343906473126778e-07,
"loss": 1.716,
"step": 1690
},
{
"epoch": 0.22677693339293697,
"grad_norm": 1.082964627665107,
"learning_rate": 5.343162939039048e-07,
"loss": 1.7274,
"step": 1691
},
{
"epoch": 0.22691104157353598,
"grad_norm": 1.0606670008679837,
"learning_rate": 5.342419042215082e-07,
"loss": 1.6872,
"step": 1692
},
{
"epoch": 0.227045149754135,
"grad_norm": 1.2139606651511192,
"learning_rate": 5.341674782786949e-07,
"loss": 1.6144,
"step": 1693
},
{
"epoch": 0.227179257934734,
"grad_norm": 1.1259721685135795,
"learning_rate": 5.340930160886783e-07,
"loss": 1.682,
"step": 1694
},
{
"epoch": 0.22731336611533304,
"grad_norm": 1.1971458828681856,
"learning_rate": 5.340185176646779e-07,
"loss": 1.666,
"step": 1695
},
{
"epoch": 0.22744747429593204,
"grad_norm": 1.0623938370168757,
"learning_rate": 5.339439830199201e-07,
"loss": 1.6716,
"step": 1696
},
{
"epoch": 0.22758158247653107,
"grad_norm": 1.0291752731398527,
"learning_rate": 5.338694121676374e-07,
"loss": 1.5643,
"step": 1697
},
{
"epoch": 0.2277156906571301,
"grad_norm": 1.073415400659899,
"learning_rate": 5.33794805121069e-07,
"loss": 1.7113,
"step": 1698
},
{
"epoch": 0.2278497988377291,
"grad_norm": 1.0719841904118037,
"learning_rate": 5.337201618934604e-07,
"loss": 1.6904,
"step": 1699
},
{
"epoch": 0.22798390701832813,
"grad_norm": 1.0589482779303245,
"learning_rate": 5.336454824980633e-07,
"loss": 1.6258,
"step": 1700
},
{
"epoch": 0.22811801519892713,
"grad_norm": 1.1032497481356218,
"learning_rate": 5.335707669481362e-07,
"loss": 1.6656,
"step": 1701
},
{
"epoch": 0.22825212337952616,
"grad_norm": 1.0840451749643811,
"learning_rate": 5.334960152569437e-07,
"loss": 1.5383,
"step": 1702
},
{
"epoch": 0.22838623156012516,
"grad_norm": 1.2721911706046112,
"learning_rate": 5.334212274377572e-07,
"loss": 1.6877,
"step": 1703
},
{
"epoch": 0.2285203397407242,
"grad_norm": 1.113467373081235,
"learning_rate": 5.333464035038541e-07,
"loss": 1.7795,
"step": 1704
},
{
"epoch": 0.2286544479213232,
"grad_norm": 1.0985371740747398,
"learning_rate": 5.332715434685184e-07,
"loss": 1.646,
"step": 1705
},
{
"epoch": 0.22878855610192222,
"grad_norm": 1.0986088766126445,
"learning_rate": 5.331966473450405e-07,
"loss": 1.7123,
"step": 1706
},
{
"epoch": 0.22892266428252123,
"grad_norm": 1.0916765886457365,
"learning_rate": 5.331217151467172e-07,
"loss": 1.6558,
"step": 1707
},
{
"epoch": 0.22905677246312026,
"grad_norm": 1.1105626967058537,
"learning_rate": 5.330467468868518e-07,
"loss": 1.6464,
"step": 1708
},
{
"epoch": 0.22919088064371926,
"grad_norm": 1.060186115294533,
"learning_rate": 5.329717425787539e-07,
"loss": 1.7554,
"step": 1709
},
{
"epoch": 0.2293249888243183,
"grad_norm": 1.1194774279858801,
"learning_rate": 5.328967022357393e-07,
"loss": 1.6726,
"step": 1710
},
{
"epoch": 0.2294590970049173,
"grad_norm": 1.04897630046238,
"learning_rate": 5.328216258711307e-07,
"loss": 1.658,
"step": 1711
},
{
"epoch": 0.22959320518551632,
"grad_norm": 1.0978402523327002,
"learning_rate": 5.327465134982568e-07,
"loss": 1.7228,
"step": 1712
},
{
"epoch": 0.22972731336611532,
"grad_norm": 1.0849254385283391,
"learning_rate": 5.326713651304527e-07,
"loss": 1.5941,
"step": 1713
},
{
"epoch": 0.22986142154671435,
"grad_norm": 1.1076316095810992,
"learning_rate": 5.3259618078106e-07,
"loss": 1.6087,
"step": 1714
},
{
"epoch": 0.22999552972731335,
"grad_norm": 1.173053113513891,
"learning_rate": 5.325209604634268e-07,
"loss": 1.6916,
"step": 1715
},
{
"epoch": 0.23012963790791238,
"grad_norm": 1.0524457049873044,
"learning_rate": 5.324457041909073e-07,
"loss": 1.7742,
"step": 1716
},
{
"epoch": 0.2302637460885114,
"grad_norm": 1.0634034874984304,
"learning_rate": 5.323704119768625e-07,
"loss": 1.676,
"step": 1717
},
{
"epoch": 0.2303978542691104,
"grad_norm": 1.1156008079132087,
"learning_rate": 5.322950838346592e-07,
"loss": 1.7271,
"step": 1718
},
{
"epoch": 0.23053196244970944,
"grad_norm": 1.1047727328230366,
"learning_rate": 5.322197197776711e-07,
"loss": 1.7865,
"step": 1719
},
{
"epoch": 0.23066607063030845,
"grad_norm": 1.027356701503526,
"learning_rate": 5.321443198192781e-07,
"loss": 1.709,
"step": 1720
},
{
"epoch": 0.23080017881090747,
"grad_norm": 1.136877539749875,
"learning_rate": 5.320688839728663e-07,
"loss": 1.6582,
"step": 1721
},
{
"epoch": 0.23093428699150648,
"grad_norm": 1.0127690499338695,
"learning_rate": 5.319934122518285e-07,
"loss": 1.7492,
"step": 1722
},
{
"epoch": 0.2310683951721055,
"grad_norm": 1.0939228317341436,
"learning_rate": 5.319179046695635e-07,
"loss": 1.5875,
"step": 1723
},
{
"epoch": 0.2312025033527045,
"grad_norm": 1.1310800565403134,
"learning_rate": 5.318423612394769e-07,
"loss": 1.6674,
"step": 1724
},
{
"epoch": 0.23133661153330354,
"grad_norm": 1.1687734972345458,
"learning_rate": 5.317667819749803e-07,
"loss": 1.6984,
"step": 1725
},
{
"epoch": 0.23147071971390254,
"grad_norm": 1.3079097416665406,
"learning_rate": 5.316911668894917e-07,
"loss": 1.7021,
"step": 1726
},
{
"epoch": 0.23160482789450157,
"grad_norm": 1.121551582881909,
"learning_rate": 5.316155159964357e-07,
"loss": 1.6389,
"step": 1727
},
{
"epoch": 0.23173893607510057,
"grad_norm": 1.110653445896344,
"learning_rate": 5.31539829309243e-07,
"loss": 1.6069,
"step": 1728
},
{
"epoch": 0.2318730442556996,
"grad_norm": 1.0532131317248028,
"learning_rate": 5.314641068413509e-07,
"loss": 1.6365,
"step": 1729
},
{
"epoch": 0.2320071524362986,
"grad_norm": 1.0606458320174244,
"learning_rate": 5.313883486062026e-07,
"loss": 1.7264,
"step": 1730
},
{
"epoch": 0.23214126061689763,
"grad_norm": 1.341898889664279,
"learning_rate": 5.313125546172484e-07,
"loss": 1.6649,
"step": 1731
},
{
"epoch": 0.23227536879749663,
"grad_norm": 1.1400544409976623,
"learning_rate": 5.312367248879441e-07,
"loss": 1.7331,
"step": 1732
},
{
"epoch": 0.23240947697809566,
"grad_norm": 1.0680650695769265,
"learning_rate": 5.311608594317525e-07,
"loss": 1.6919,
"step": 1733
},
{
"epoch": 0.2325435851586947,
"grad_norm": 1.1255461157368476,
"learning_rate": 5.310849582621425e-07,
"loss": 1.6049,
"step": 1734
},
{
"epoch": 0.2326776933392937,
"grad_norm": 1.1072444623083968,
"learning_rate": 5.310090213925891e-07,
"loss": 1.5269,
"step": 1735
},
{
"epoch": 0.23281180151989272,
"grad_norm": 1.0710603367422178,
"learning_rate": 5.309330488365741e-07,
"loss": 1.5994,
"step": 1736
},
{
"epoch": 0.23294590970049173,
"grad_norm": 1.0644784872053028,
"learning_rate": 5.308570406075853e-07,
"loss": 1.7374,
"step": 1737
},
{
"epoch": 0.23308001788109076,
"grad_norm": 1.1498695736382247,
"learning_rate": 5.307809967191172e-07,
"loss": 1.7718,
"step": 1738
},
{
"epoch": 0.23321412606168976,
"grad_norm": 1.1460626302338928,
"learning_rate": 5.307049171846698e-07,
"loss": 1.7527,
"step": 1739
},
{
"epoch": 0.2333482342422888,
"grad_norm": 1.0375010028149447,
"learning_rate": 5.306288020177507e-07,
"loss": 1.6096,
"step": 1740
},
{
"epoch": 0.2334823424228878,
"grad_norm": 1.0840298111802271,
"learning_rate": 5.305526512318727e-07,
"loss": 1.6765,
"step": 1741
},
{
"epoch": 0.23361645060348682,
"grad_norm": 1.175481103771977,
"learning_rate": 5.304764648405554e-07,
"loss": 1.6737,
"step": 1742
},
{
"epoch": 0.23375055878408582,
"grad_norm": 1.0760963915335215,
"learning_rate": 5.304002428573248e-07,
"loss": 1.6407,
"step": 1743
},
{
"epoch": 0.23388466696468485,
"grad_norm": 1.0391117459687709,
"learning_rate": 5.303239852957129e-07,
"loss": 1.7296,
"step": 1744
},
{
"epoch": 0.23401877514528385,
"grad_norm": 1.2433142693729942,
"learning_rate": 5.302476921692584e-07,
"loss": 1.6453,
"step": 1745
},
{
"epoch": 0.23415288332588288,
"grad_norm": 1.1097947586973798,
"learning_rate": 5.30171363491506e-07,
"loss": 1.6873,
"step": 1746
},
{
"epoch": 0.23428699150648188,
"grad_norm": 1.044700396070487,
"learning_rate": 5.30094999276007e-07,
"loss": 1.5877,
"step": 1747
},
{
"epoch": 0.2344210996870809,
"grad_norm": 1.1166075784138738,
"learning_rate": 5.300185995363186e-07,
"loss": 1.6547,
"step": 1748
},
{
"epoch": 0.23455520786767992,
"grad_norm": 1.1455525392590689,
"learning_rate": 5.299421642860049e-07,
"loss": 1.6328,
"step": 1749
},
{
"epoch": 0.23468931604827895,
"grad_norm": 1.0432073116091243,
"learning_rate": 5.298656935386355e-07,
"loss": 1.6934,
"step": 1750
},
{
"epoch": 0.23482342422887797,
"grad_norm": 1.301933185584584,
"learning_rate": 5.297891873077872e-07,
"loss": 1.6322,
"step": 1751
},
{
"epoch": 0.23495753240947698,
"grad_norm": 1.1184463227985266,
"learning_rate": 5.297126456070423e-07,
"loss": 1.5901,
"step": 1752
},
{
"epoch": 0.235091640590076,
"grad_norm": 1.0894760385328393,
"learning_rate": 5.296360684499899e-07,
"loss": 1.6307,
"step": 1753
},
{
"epoch": 0.235225748770675,
"grad_norm": 1.0810964826554634,
"learning_rate": 5.295594558502254e-07,
"loss": 1.671,
"step": 1754
},
{
"epoch": 0.23535985695127404,
"grad_norm": 1.0867830593910155,
"learning_rate": 5.2948280782135e-07,
"loss": 1.5898,
"step": 1755
},
{
"epoch": 0.23549396513187304,
"grad_norm": 1.0826732184990124,
"learning_rate": 5.29406124376972e-07,
"loss": 1.6753,
"step": 1756
},
{
"epoch": 0.23562807331247207,
"grad_norm": 1.1750857610640004,
"learning_rate": 5.29329405530705e-07,
"loss": 1.6238,
"step": 1757
},
{
"epoch": 0.23576218149307107,
"grad_norm": 1.145244574282678,
"learning_rate": 5.292526512961698e-07,
"loss": 1.7374,
"step": 1758
},
{
"epoch": 0.2358962896736701,
"grad_norm": 1.0998728885819122,
"learning_rate": 5.291758616869928e-07,
"loss": 1.7178,
"step": 1759
},
{
"epoch": 0.2360303978542691,
"grad_norm": 1.122069140362572,
"learning_rate": 5.290990367168073e-07,
"loss": 1.634,
"step": 1760
},
{
"epoch": 0.23616450603486813,
"grad_norm": 1.1231670039812451,
"learning_rate": 5.290221763992522e-07,
"loss": 1.6238,
"step": 1761
},
{
"epoch": 0.23629861421546713,
"grad_norm": 1.0647516707650018,
"learning_rate": 5.289452807479734e-07,
"loss": 1.6579,
"step": 1762
},
{
"epoch": 0.23643272239606616,
"grad_norm": 1.2107894163734518,
"learning_rate": 5.288683497766222e-07,
"loss": 1.7207,
"step": 1763
},
{
"epoch": 0.23656683057666517,
"grad_norm": 1.1025744988730661,
"learning_rate": 5.287913834988569e-07,
"loss": 1.7006,
"step": 1764
},
{
"epoch": 0.2367009387572642,
"grad_norm": 1.0797524236014637,
"learning_rate": 5.287143819283421e-07,
"loss": 1.7584,
"step": 1765
},
{
"epoch": 0.2368350469378632,
"grad_norm": 1.0751286199968113,
"learning_rate": 5.286373450787481e-07,
"loss": 1.5611,
"step": 1766
},
{
"epoch": 0.23696915511846223,
"grad_norm": 1.0636517626500344,
"learning_rate": 5.285602729637518e-07,
"loss": 1.6433,
"step": 1767
},
{
"epoch": 0.23710326329906126,
"grad_norm": 1.048651758235017,
"learning_rate": 5.284831655970363e-07,
"loss": 1.6267,
"step": 1768
},
{
"epoch": 0.23723737147966026,
"grad_norm": 1.0862538156700035,
"learning_rate": 5.28406022992291e-07,
"loss": 1.591,
"step": 1769
},
{
"epoch": 0.2373714796602593,
"grad_norm": 1.112560210549691,
"learning_rate": 5.283288451632116e-07,
"loss": 1.6387,
"step": 1770
},
{
"epoch": 0.2375055878408583,
"grad_norm": 1.163175696596488,
"learning_rate": 5.282516321235001e-07,
"loss": 1.8051,
"step": 1771
},
{
"epoch": 0.23763969602145732,
"grad_norm": 1.112481677106296,
"learning_rate": 5.281743838868644e-07,
"loss": 1.5411,
"step": 1772
},
{
"epoch": 0.23777380420205632,
"grad_norm": 1.1911416700291582,
"learning_rate": 5.28097100467019e-07,
"loss": 1.6194,
"step": 1773
},
{
"epoch": 0.23790791238265535,
"grad_norm": 1.0990682965946412,
"learning_rate": 5.280197818776845e-07,
"loss": 1.6605,
"step": 1774
},
{
"epoch": 0.23804202056325435,
"grad_norm": 1.0591136451690275,
"learning_rate": 5.279424281325878e-07,
"loss": 1.6389,
"step": 1775
},
{
"epoch": 0.23817612874385338,
"grad_norm": 1.0683888995182673,
"learning_rate": 5.278650392454621e-07,
"loss": 1.6092,
"step": 1776
},
{
"epoch": 0.23831023692445238,
"grad_norm": 1.1224739302408693,
"learning_rate": 5.277876152300467e-07,
"loss": 1.6494,
"step": 1777
},
{
"epoch": 0.23844434510505141,
"grad_norm": 1.0723497695462585,
"learning_rate": 5.27710156100087e-07,
"loss": 1.7937,
"step": 1778
},
{
"epoch": 0.23857845328565042,
"grad_norm": 1.1351190756385903,
"learning_rate": 5.276326618693352e-07,
"loss": 1.7266,
"step": 1779
},
{
"epoch": 0.23871256146624945,
"grad_norm": 1.0579576318516895,
"learning_rate": 5.275551325515491e-07,
"loss": 1.6662,
"step": 1780
},
{
"epoch": 0.23884666964684845,
"grad_norm": 1.1337655082128173,
"learning_rate": 5.27477568160493e-07,
"loss": 1.6656,
"step": 1781
},
{
"epoch": 0.23898077782744748,
"grad_norm": 1.3625169955042795,
"learning_rate": 5.273999687099377e-07,
"loss": 1.6154,
"step": 1782
},
{
"epoch": 0.23911488600804648,
"grad_norm": 1.0606076186008175,
"learning_rate": 5.273223342136596e-07,
"loss": 1.6295,
"step": 1783
},
{
"epoch": 0.2392489941886455,
"grad_norm": 3.7952746706102753,
"learning_rate": 5.27244664685442e-07,
"loss": 1.593,
"step": 1784
},
{
"epoch": 0.2393831023692445,
"grad_norm": 1.1015598004917457,
"learning_rate": 5.271669601390737e-07,
"loss": 1.659,
"step": 1785
},
{
"epoch": 0.23951721054984354,
"grad_norm": 1.1429465431928834,
"learning_rate": 5.270892205883503e-07,
"loss": 1.7055,
"step": 1786
},
{
"epoch": 0.23965131873044257,
"grad_norm": 1.1572569512743107,
"learning_rate": 5.270114460470735e-07,
"loss": 1.75,
"step": 1787
},
{
"epoch": 0.23978542691104157,
"grad_norm": 1.1342505841464177,
"learning_rate": 5.269336365290511e-07,
"loss": 1.692,
"step": 1788
},
{
"epoch": 0.2399195350916406,
"grad_norm": 1.1491667363729234,
"learning_rate": 5.268557920480969e-07,
"loss": 1.6956,
"step": 1789
},
{
"epoch": 0.2400536432722396,
"grad_norm": 1.1290663441601718,
"learning_rate": 5.267779126180313e-07,
"loss": 1.7194,
"step": 1790
},
{
"epoch": 0.24018775145283863,
"grad_norm": 1.1068721597891535,
"learning_rate": 5.26699998252681e-07,
"loss": 1.6775,
"step": 1791
},
{
"epoch": 0.24032185963343763,
"grad_norm": 1.0965127649518425,
"learning_rate": 5.266220489658783e-07,
"loss": 1.7381,
"step": 1792
},
{
"epoch": 0.24045596781403666,
"grad_norm": 1.0539192312552248,
"learning_rate": 5.265440647714622e-07,
"loss": 1.6916,
"step": 1793
},
{
"epoch": 0.24059007599463567,
"grad_norm": 1.3925405964228643,
"learning_rate": 5.264660456832777e-07,
"loss": 1.6934,
"step": 1794
},
{
"epoch": 0.2407241841752347,
"grad_norm": 1.0796598245896871,
"learning_rate": 5.263879917151761e-07,
"loss": 1.6891,
"step": 1795
},
{
"epoch": 0.2408582923558337,
"grad_norm": 1.0549168383726284,
"learning_rate": 5.263099028810148e-07,
"loss": 1.6417,
"step": 1796
},
{
"epoch": 0.24099240053643273,
"grad_norm": 1.0854208022859217,
"learning_rate": 5.262317791946574e-07,
"loss": 1.6132,
"step": 1797
},
{
"epoch": 0.24112650871703173,
"grad_norm": 1.1038896542176981,
"learning_rate": 5.261536206699738e-07,
"loss": 1.6074,
"step": 1798
},
{
"epoch": 0.24126061689763076,
"grad_norm": 1.0646960968846464,
"learning_rate": 5.2607542732084e-07,
"loss": 1.601,
"step": 1799
},
{
"epoch": 0.24139472507822976,
"grad_norm": 1.1557060399556212,
"learning_rate": 5.259971991611381e-07,
"loss": 1.7684,
"step": 1800
},
{
"epoch": 0.2415288332588288,
"grad_norm": 1.0313305926934546,
"learning_rate": 5.259189362047565e-07,
"loss": 1.6322,
"step": 1801
},
{
"epoch": 0.2416629414394278,
"grad_norm": 1.0974406411588324,
"learning_rate": 5.258406384655897e-07,
"loss": 1.6857,
"step": 1802
},
{
"epoch": 0.24179704962002682,
"grad_norm": 1.1146673930740303,
"learning_rate": 5.257623059575385e-07,
"loss": 1.6456,
"step": 1803
},
{
"epoch": 0.24193115780062585,
"grad_norm": 1.0970256705246042,
"learning_rate": 5.256839386945097e-07,
"loss": 1.7583,
"step": 1804
},
{
"epoch": 0.24206526598122485,
"grad_norm": 1.107274760930789,
"learning_rate": 5.256055366904164e-07,
"loss": 1.6586,
"step": 1805
},
{
"epoch": 0.24219937416182388,
"grad_norm": 1.1073843937392611,
"learning_rate": 5.255270999591779e-07,
"loss": 1.7062,
"step": 1806
},
{
"epoch": 0.24233348234242288,
"grad_norm": 1.0566525499472572,
"learning_rate": 5.254486285147196e-07,
"loss": 1.6526,
"step": 1807
},
{
"epoch": 0.24246759052302191,
"grad_norm": 1.1537228290096582,
"learning_rate": 5.253701223709729e-07,
"loss": 1.6933,
"step": 1808
},
{
"epoch": 0.24260169870362092,
"grad_norm": 1.0990727257935735,
"learning_rate": 5.252915815418755e-07,
"loss": 1.7125,
"step": 1809
},
{
"epoch": 0.24273580688421995,
"grad_norm": 1.244262115292612,
"learning_rate": 5.252130060413716e-07,
"loss": 1.6264,
"step": 1810
},
{
"epoch": 0.24286991506481895,
"grad_norm": 1.1688493530359219,
"learning_rate": 5.251343958834107e-07,
"loss": 1.6785,
"step": 1811
},
{
"epoch": 0.24300402324541798,
"grad_norm": 1.2285366933673156,
"learning_rate": 5.250557510819494e-07,
"loss": 1.572,
"step": 1812
},
{
"epoch": 0.24313813142601698,
"grad_norm": 1.1296607396854323,
"learning_rate": 5.249770716509499e-07,
"loss": 1.6761,
"step": 1813
},
{
"epoch": 0.243272239606616,
"grad_norm": 1.1537668172261726,
"learning_rate": 5.248983576043808e-07,
"loss": 1.6839,
"step": 1814
},
{
"epoch": 0.243406347787215,
"grad_norm": 1.2774536095786413,
"learning_rate": 5.248196089562165e-07,
"loss": 1.6752,
"step": 1815
},
{
"epoch": 0.24354045596781404,
"grad_norm": 1.0391234761075887,
"learning_rate": 5.247408257204379e-07,
"loss": 1.713,
"step": 1816
},
{
"epoch": 0.24367456414841304,
"grad_norm": 1.1351662284778345,
"learning_rate": 5.24662007911032e-07,
"loss": 1.741,
"step": 1817
},
{
"epoch": 0.24380867232901207,
"grad_norm": 1.101327635041692,
"learning_rate": 5.245831555419915e-07,
"loss": 1.6196,
"step": 1818
},
{
"epoch": 0.24394278050961107,
"grad_norm": 1.0713266982503056,
"learning_rate": 5.24504268627316e-07,
"loss": 1.6454,
"step": 1819
},
{
"epoch": 0.2440768886902101,
"grad_norm": 1.1530834766346107,
"learning_rate": 5.244253471810106e-07,
"loss": 1.7217,
"step": 1820
},
{
"epoch": 0.24421099687080913,
"grad_norm": 1.121128499361746,
"learning_rate": 5.243463912170868e-07,
"loss": 1.635,
"step": 1821
},
{
"epoch": 0.24434510505140813,
"grad_norm": 1.1890728819475802,
"learning_rate": 5.242674007495621e-07,
"loss": 1.6498,
"step": 1822
},
{
"epoch": 0.24447921323200716,
"grad_norm": 1.0869958269746995,
"learning_rate": 5.241883757924604e-07,
"loss": 1.6685,
"step": 1823
},
{
"epoch": 0.24461332141260617,
"grad_norm": 1.072161128457571,
"learning_rate": 5.241093163598111e-07,
"loss": 1.613,
"step": 1824
},
{
"epoch": 0.2447474295932052,
"grad_norm": 1.0697959147126053,
"learning_rate": 5.240302224656507e-07,
"loss": 1.7839,
"step": 1825
},
{
"epoch": 0.2448815377738042,
"grad_norm": 1.0447563021570512,
"learning_rate": 5.239510941240209e-07,
"loss": 1.553,
"step": 1826
},
{
"epoch": 0.24501564595440323,
"grad_norm": 1.1246283994835846,
"learning_rate": 5.2387193134897e-07,
"loss": 1.7167,
"step": 1827
},
{
"epoch": 0.24514975413500223,
"grad_norm": 1.0539923982868098,
"learning_rate": 5.237927341545521e-07,
"loss": 1.6228,
"step": 1828
},
{
"epoch": 0.24528386231560126,
"grad_norm": 1.1056807313462267,
"learning_rate": 5.23713502554828e-07,
"loss": 1.6631,
"step": 1829
},
{
"epoch": 0.24541797049620026,
"grad_norm": 1.1081084022345968,
"learning_rate": 5.236342365638638e-07,
"loss": 1.7182,
"step": 1830
},
{
"epoch": 0.2455520786767993,
"grad_norm": 1.1259734401016548,
"learning_rate": 5.235549361957323e-07,
"loss": 1.6281,
"step": 1831
},
{
"epoch": 0.2456861868573983,
"grad_norm": 1.073575909581403,
"learning_rate": 5.234756014645123e-07,
"loss": 1.7089,
"step": 1832
},
{
"epoch": 0.24582029503799732,
"grad_norm": 1.182395764700481,
"learning_rate": 5.233962323842885e-07,
"loss": 1.6138,
"step": 1833
},
{
"epoch": 0.24595440321859632,
"grad_norm": 1.067652195605279,
"learning_rate": 5.233168289691518e-07,
"loss": 1.6409,
"step": 1834
},
{
"epoch": 0.24608851139919535,
"grad_norm": 1.0539945315127641,
"learning_rate": 5.232373912331994e-07,
"loss": 1.6632,
"step": 1835
},
{
"epoch": 0.24622261957979436,
"grad_norm": 1.1353497557175543,
"learning_rate": 5.231579191905341e-07,
"loss": 1.6481,
"step": 1836
},
{
"epoch": 0.24635672776039338,
"grad_norm": 1.0518079931176558,
"learning_rate": 5.230784128552653e-07,
"loss": 1.641,
"step": 1837
},
{
"epoch": 0.24649083594099241,
"grad_norm": 1.068415705515305,
"learning_rate": 5.229988722415082e-07,
"loss": 1.706,
"step": 1838
},
{
"epoch": 0.24662494412159142,
"grad_norm": 1.128403860172621,
"learning_rate": 5.229192973633844e-07,
"loss": 1.6095,
"step": 1839
},
{
"epoch": 0.24675905230219045,
"grad_norm": 1.069414952826673,
"learning_rate": 5.22839688235021e-07,
"loss": 1.6543,
"step": 1840
},
{
"epoch": 0.24689316048278945,
"grad_norm": 1.0821194973907244,
"learning_rate": 5.227600448705517e-07,
"loss": 1.556,
"step": 1841
},
{
"epoch": 0.24702726866338848,
"grad_norm": 1.084344318240152,
"learning_rate": 5.226803672841162e-07,
"loss": 1.6034,
"step": 1842
},
{
"epoch": 0.24716137684398748,
"grad_norm": 1.1202391548493928,
"learning_rate": 5.226006554898601e-07,
"loss": 1.6966,
"step": 1843
},
{
"epoch": 0.2472954850245865,
"grad_norm": 1.0911354590278528,
"learning_rate": 5.225209095019351e-07,
"loss": 1.6948,
"step": 1844
},
{
"epoch": 0.2474295932051855,
"grad_norm": 1.1062195036954834,
"learning_rate": 5.224411293344992e-07,
"loss": 1.5054,
"step": 1845
},
{
"epoch": 0.24756370138578454,
"grad_norm": 1.0581940583028457,
"learning_rate": 5.223613150017162e-07,
"loss": 1.6027,
"step": 1846
},
{
"epoch": 0.24769780956638354,
"grad_norm": 1.0564622037081781,
"learning_rate": 5.22281466517756e-07,
"loss": 1.6139,
"step": 1847
},
{
"epoch": 0.24783191774698257,
"grad_norm": 1.0965905968449954,
"learning_rate": 5.222015838967948e-07,
"loss": 1.6531,
"step": 1848
},
{
"epoch": 0.24796602592758157,
"grad_norm": 1.1162216415234159,
"learning_rate": 5.221216671530146e-07,
"loss": 1.6434,
"step": 1849
},
{
"epoch": 0.2481001341081806,
"grad_norm": 1.0760593930698765,
"learning_rate": 5.220417163006035e-07,
"loss": 1.7068,
"step": 1850
},
{
"epoch": 0.2482342422887796,
"grad_norm": 1.3461498868058117,
"learning_rate": 5.219617313537557e-07,
"loss": 1.6895,
"step": 1851
},
{
"epoch": 0.24836835046937863,
"grad_norm": 1.116707873551399,
"learning_rate": 5.218817123266716e-07,
"loss": 1.6986,
"step": 1852
},
{
"epoch": 0.24850245864997764,
"grad_norm": 1.0874229858859366,
"learning_rate": 5.218016592335574e-07,
"loss": 1.696,
"step": 1853
},
{
"epoch": 0.24863656683057667,
"grad_norm": 1.2149675834773461,
"learning_rate": 5.217215720886254e-07,
"loss": 1.6334,
"step": 1854
},
{
"epoch": 0.24877067501117567,
"grad_norm": 1.0673684982385807,
"learning_rate": 5.21641450906094e-07,
"loss": 1.6445,
"step": 1855
},
{
"epoch": 0.2489047831917747,
"grad_norm": 1.0639747826797143,
"learning_rate": 5.215612957001879e-07,
"loss": 1.7352,
"step": 1856
},
{
"epoch": 0.24903889137237373,
"grad_norm": 1.1955320747693832,
"learning_rate": 5.214811064851373e-07,
"loss": 1.6991,
"step": 1857
},
{
"epoch": 0.24917299955297273,
"grad_norm": 1.1925934103789766,
"learning_rate": 5.214008832751788e-07,
"loss": 1.6421,
"step": 1858
},
{
"epoch": 0.24930710773357176,
"grad_norm": 1.152167600482823,
"learning_rate": 5.21320626084555e-07,
"loss": 1.6614,
"step": 1859
},
{
"epoch": 0.24944121591417076,
"grad_norm": 1.115689117193753,
"learning_rate": 5.212403349275145e-07,
"loss": 1.67,
"step": 1860
},
{
"epoch": 0.2495753240947698,
"grad_norm": 1.0409261709804483,
"learning_rate": 5.211600098183119e-07,
"loss": 1.5712,
"step": 1861
},
{
"epoch": 0.2497094322753688,
"grad_norm": 1.1645359690711583,
"learning_rate": 5.210796507712078e-07,
"loss": 1.6747,
"step": 1862
},
{
"epoch": 0.24984354045596782,
"grad_norm": 1.1220835902669124,
"learning_rate": 5.209992578004688e-07,
"loss": 1.6994,
"step": 1863
},
{
"epoch": 0.24997764863656682,
"grad_norm": 1.0574464835321717,
"learning_rate": 5.209188309203678e-07,
"loss": 1.6434,
"step": 1864
}
],
"logging_steps": 1,
"max_steps": 7456,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1864,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 498613144780800.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}