Qwen2.5-1.5B-Open-R1-Distill / trainer_state.json
imliiny1's picture
Model save
b3e8f7d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5442,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009187798603454612,
"grad_norm": 3.0641477925906218,
"learning_rate": 9.157509157509158e-07,
"loss": 1.1494,
"mean_token_accuracy": 0.6977963209152221,
"step": 5
},
{
"epoch": 0.0018375597206909224,
"grad_norm": 3.2105399334572433,
"learning_rate": 1.8315018315018316e-06,
"loss": 1.1392,
"mean_token_accuracy": 0.7049754142761231,
"step": 10
},
{
"epoch": 0.0027563395810363835,
"grad_norm": 2.9642594903358033,
"learning_rate": 2.747252747252747e-06,
"loss": 1.115,
"mean_token_accuracy": 0.7039887428283691,
"step": 15
},
{
"epoch": 0.003675119441381845,
"grad_norm": 2.220104102612778,
"learning_rate": 3.663003663003663e-06,
"loss": 0.9948,
"mean_token_accuracy": 0.7305674076080322,
"step": 20
},
{
"epoch": 0.004593899301727306,
"grad_norm": 1.9105601134372614,
"learning_rate": 4.578754578754579e-06,
"loss": 0.9409,
"mean_token_accuracy": 0.7411547303199768,
"step": 25
},
{
"epoch": 0.005512679162072767,
"grad_norm": 1.8598985707265425,
"learning_rate": 5.494505494505494e-06,
"loss": 0.9798,
"mean_token_accuracy": 0.7281824707984924,
"step": 30
},
{
"epoch": 0.006431459022418228,
"grad_norm": 2.585520808525946,
"learning_rate": 6.41025641025641e-06,
"loss": 0.9642,
"mean_token_accuracy": 0.7303176164627075,
"step": 35
},
{
"epoch": 0.00735023888276369,
"grad_norm": 3.0068202552964665,
"learning_rate": 7.326007326007326e-06,
"loss": 0.9358,
"mean_token_accuracy": 0.7378472447395324,
"step": 40
},
{
"epoch": 0.008269018743109152,
"grad_norm": 1.6811073433803756,
"learning_rate": 8.241758241758243e-06,
"loss": 0.959,
"mean_token_accuracy": 0.7312322854995728,
"step": 45
},
{
"epoch": 0.009187798603454611,
"grad_norm": 1.8266484737246598,
"learning_rate": 9.157509157509158e-06,
"loss": 0.8436,
"mean_token_accuracy": 0.7629137873649597,
"step": 50
},
{
"epoch": 0.010106578463800073,
"grad_norm": 1.7588858560286977,
"learning_rate": 1.0073260073260074e-05,
"loss": 0.8931,
"mean_token_accuracy": 0.7447961091995239,
"step": 55
},
{
"epoch": 0.011025358324145534,
"grad_norm": 1.9424500939228353,
"learning_rate": 1.0989010989010989e-05,
"loss": 0.9072,
"mean_token_accuracy": 0.7389389991760253,
"step": 60
},
{
"epoch": 0.011944138184490995,
"grad_norm": 1.93385540963114,
"learning_rate": 1.1904761904761905e-05,
"loss": 0.8793,
"mean_token_accuracy": 0.748711359500885,
"step": 65
},
{
"epoch": 0.012862918044836457,
"grad_norm": 1.5600948544295363,
"learning_rate": 1.282051282051282e-05,
"loss": 0.9042,
"mean_token_accuracy": 0.7399611473083496,
"step": 70
},
{
"epoch": 0.013781697905181918,
"grad_norm": 1.7742267258287694,
"learning_rate": 1.3736263736263738e-05,
"loss": 0.9196,
"mean_token_accuracy": 0.7337918877601624,
"step": 75
},
{
"epoch": 0.01470047776552738,
"grad_norm": 1.7237265700273807,
"learning_rate": 1.4652014652014653e-05,
"loss": 0.8143,
"mean_token_accuracy": 0.7621858239173889,
"step": 80
},
{
"epoch": 0.01561925762587284,
"grad_norm": 2.032178530068195,
"learning_rate": 1.556776556776557e-05,
"loss": 0.881,
"mean_token_accuracy": 0.7456439375877381,
"step": 85
},
{
"epoch": 0.016538037486218304,
"grad_norm": 1.5476057141233361,
"learning_rate": 1.6483516483516486e-05,
"loss": 0.774,
"mean_token_accuracy": 0.7747476458549499,
"step": 90
},
{
"epoch": 0.017456817346563763,
"grad_norm": 1.7859288763560526,
"learning_rate": 1.73992673992674e-05,
"loss": 0.8551,
"mean_token_accuracy": 0.7515540599822998,
"step": 95
},
{
"epoch": 0.018375597206909223,
"grad_norm": 1.7870239578433738,
"learning_rate": 1.8315018315018315e-05,
"loss": 0.8048,
"mean_token_accuracy": 0.7651323318481446,
"step": 100
},
{
"epoch": 0.019294377067254686,
"grad_norm": 2.0924810885176077,
"learning_rate": 1.923076923076923e-05,
"loss": 0.8006,
"mean_token_accuracy": 0.76546790599823,
"step": 105
},
{
"epoch": 0.020213156927600145,
"grad_norm": 1.7880225651210455,
"learning_rate": 2.0146520146520148e-05,
"loss": 0.8121,
"mean_token_accuracy": 0.7632635951042175,
"step": 110
},
{
"epoch": 0.02113193678794561,
"grad_norm": 1.6627547537831113,
"learning_rate": 2.1062271062271064e-05,
"loss": 0.7817,
"mean_token_accuracy": 0.772648024559021,
"step": 115
},
{
"epoch": 0.022050716648291068,
"grad_norm": 1.820743441798809,
"learning_rate": 2.1978021978021977e-05,
"loss": 0.8784,
"mean_token_accuracy": 0.7446165084838867,
"step": 120
},
{
"epoch": 0.02296949650863653,
"grad_norm": 2.099675060940829,
"learning_rate": 2.2893772893772894e-05,
"loss": 0.8524,
"mean_token_accuracy": 0.749963927268982,
"step": 125
},
{
"epoch": 0.02388827636898199,
"grad_norm": 1.954655640615228,
"learning_rate": 2.380952380952381e-05,
"loss": 0.8528,
"mean_token_accuracy": 0.7490226745605468,
"step": 130
},
{
"epoch": 0.024807056229327454,
"grad_norm": 1.6762777606996013,
"learning_rate": 2.4725274725274727e-05,
"loss": 0.7488,
"mean_token_accuracy": 0.7781027436256409,
"step": 135
},
{
"epoch": 0.025725836089672913,
"grad_norm": 1.818950775045492,
"learning_rate": 2.564102564102564e-05,
"loss": 0.8932,
"mean_token_accuracy": 0.7370708823204041,
"step": 140
},
{
"epoch": 0.026644615950018376,
"grad_norm": 1.7549497495325217,
"learning_rate": 2.655677655677656e-05,
"loss": 0.8299,
"mean_token_accuracy": 0.7562183260917663,
"step": 145
},
{
"epoch": 0.027563395810363836,
"grad_norm": 1.9651810255018103,
"learning_rate": 2.7472527472527476e-05,
"loss": 0.8961,
"mean_token_accuracy": 0.7379685401916504,
"step": 150
},
{
"epoch": 0.0284821756707093,
"grad_norm": 1.7437158607043617,
"learning_rate": 2.838827838827839e-05,
"loss": 0.8796,
"mean_token_accuracy": 0.7450518012046814,
"step": 155
},
{
"epoch": 0.02940095553105476,
"grad_norm": 1.6994422932790823,
"learning_rate": 2.9304029304029305e-05,
"loss": 0.8178,
"mean_token_accuracy": 0.7607359051704407,
"step": 160
},
{
"epoch": 0.03031973539140022,
"grad_norm": 1.858782783599879,
"learning_rate": 3.021978021978022e-05,
"loss": 0.8074,
"mean_token_accuracy": 0.7634272456169129,
"step": 165
},
{
"epoch": 0.03123851525174568,
"grad_norm": 1.8619803118890237,
"learning_rate": 3.113553113553114e-05,
"loss": 0.8546,
"mean_token_accuracy": 0.7492664337158204,
"step": 170
},
{
"epoch": 0.032157295112091144,
"grad_norm": 1.609592048201869,
"learning_rate": 3.205128205128206e-05,
"loss": 0.7985,
"mean_token_accuracy": 0.7690797805786133,
"step": 175
},
{
"epoch": 0.03307607497243661,
"grad_norm": 1.6908537703053188,
"learning_rate": 3.296703296703297e-05,
"loss": 0.8704,
"mean_token_accuracy": 0.744064450263977,
"step": 180
},
{
"epoch": 0.033994854832782063,
"grad_norm": 1.8165162354197588,
"learning_rate": 3.3882783882783884e-05,
"loss": 0.8849,
"mean_token_accuracy": 0.7387139916419982,
"step": 185
},
{
"epoch": 0.03491363469312753,
"grad_norm": 1.8897649307357878,
"learning_rate": 3.47985347985348e-05,
"loss": 0.9233,
"mean_token_accuracy": 0.7323238730430603,
"step": 190
},
{
"epoch": 0.03583241455347299,
"grad_norm": 1.7585678570520256,
"learning_rate": 3.571428571428572e-05,
"loss": 0.9059,
"mean_token_accuracy": 0.7369635701179504,
"step": 195
},
{
"epoch": 0.036751194413818446,
"grad_norm": 1.8196207496857761,
"learning_rate": 3.663003663003663e-05,
"loss": 0.7984,
"mean_token_accuracy": 0.7655532121658325,
"step": 200
},
{
"epoch": 0.03766997427416391,
"grad_norm": 1.8001783116526198,
"learning_rate": 3.754578754578755e-05,
"loss": 0.9032,
"mean_token_accuracy": 0.7356508016586304,
"step": 205
},
{
"epoch": 0.03858875413450937,
"grad_norm": 2.1616041041174143,
"learning_rate": 3.846153846153846e-05,
"loss": 0.8959,
"mean_token_accuracy": 0.7379406213760376,
"step": 210
},
{
"epoch": 0.039507533994854835,
"grad_norm": 1.6869890778597507,
"learning_rate": 3.9377289377289376e-05,
"loss": 0.7926,
"mean_token_accuracy": 0.7656057000160217,
"step": 215
},
{
"epoch": 0.04042631385520029,
"grad_norm": 1.5982107775692642,
"learning_rate": 4.0293040293040296e-05,
"loss": 0.8742,
"mean_token_accuracy": 0.743067741394043,
"step": 220
},
{
"epoch": 0.041345093715545754,
"grad_norm": 1.8744347527261311,
"learning_rate": 4.120879120879121e-05,
"loss": 0.8344,
"mean_token_accuracy": 0.7555591464042664,
"step": 225
},
{
"epoch": 0.04226387357589122,
"grad_norm": 1.7387305953271162,
"learning_rate": 4.212454212454213e-05,
"loss": 0.8268,
"mean_token_accuracy": 0.7600399851799011,
"step": 230
},
{
"epoch": 0.04318265343623668,
"grad_norm": 1.5706104238018215,
"learning_rate": 4.304029304029304e-05,
"loss": 0.8534,
"mean_token_accuracy": 0.7509700894355774,
"step": 235
},
{
"epoch": 0.044101433296582136,
"grad_norm": 2.3765119929284566,
"learning_rate": 4.3956043956043955e-05,
"loss": 0.833,
"mean_token_accuracy": 0.7555877804756165,
"step": 240
},
{
"epoch": 0.0450202131569276,
"grad_norm": 1.8036798399760634,
"learning_rate": 4.4871794871794874e-05,
"loss": 0.9457,
"mean_token_accuracy": 0.7283176898956298,
"step": 245
},
{
"epoch": 0.04593899301727306,
"grad_norm": 1.5687281786415206,
"learning_rate": 4.578754578754579e-05,
"loss": 0.9033,
"mean_token_accuracy": 0.7391088247299195,
"step": 250
},
{
"epoch": 0.046857772877618525,
"grad_norm": 1.906886876307677,
"learning_rate": 4.670329670329671e-05,
"loss": 0.8814,
"mean_token_accuracy": 0.7455077290534973,
"step": 255
},
{
"epoch": 0.04777655273796398,
"grad_norm": 2.459151452178755,
"learning_rate": 4.761904761904762e-05,
"loss": 0.8482,
"mean_token_accuracy": 0.7487390875816345,
"step": 260
},
{
"epoch": 0.048695332598309445,
"grad_norm": 2.090972233335094,
"learning_rate": 4.8534798534798533e-05,
"loss": 0.8361,
"mean_token_accuracy": 0.7550442337989807,
"step": 265
},
{
"epoch": 0.04961411245865491,
"grad_norm": 2.3215263657474834,
"learning_rate": 4.945054945054945e-05,
"loss": 0.9193,
"mean_token_accuracy": 0.7319010257720947,
"step": 270
},
{
"epoch": 0.05053289231900037,
"grad_norm": 1.603682503261401,
"learning_rate": 4.999998337739284e-05,
"loss": 0.8252,
"mean_token_accuracy": 0.7569657444953919,
"step": 275
},
{
"epoch": 0.05145167217934583,
"grad_norm": 1.5710545327372318,
"learning_rate": 4.999979637334437e-05,
"loss": 0.8146,
"mean_token_accuracy": 0.7597472548484803,
"step": 280
},
{
"epoch": 0.05237045203969129,
"grad_norm": 1.7769444956116853,
"learning_rate": 4.9999401588721174e-05,
"loss": 0.922,
"mean_token_accuracy": 0.7304396867752075,
"step": 285
},
{
"epoch": 0.05328923190003675,
"grad_norm": 1.5021517935884932,
"learning_rate": 4.999879902716899e-05,
"loss": 0.7971,
"mean_token_accuracy": 0.7652329564094543,
"step": 290
},
{
"epoch": 0.05420801176038221,
"grad_norm": 1.7950075085630441,
"learning_rate": 4.999798869425236e-05,
"loss": 0.9554,
"mean_token_accuracy": 0.7232900500297547,
"step": 295
},
{
"epoch": 0.05512679162072767,
"grad_norm": 1.5725822266760454,
"learning_rate": 4.999697059745451e-05,
"loss": 0.8468,
"mean_token_accuracy": 0.7524639129638672,
"step": 300
},
{
"epoch": 0.056045571481073135,
"grad_norm": 1.6589539268411269,
"learning_rate": 4.999574474617734e-05,
"loss": 0.9022,
"mean_token_accuracy": 0.7418521285057068,
"step": 305
},
{
"epoch": 0.0569643513414186,
"grad_norm": 1.6096903752899234,
"learning_rate": 4.999431115174131e-05,
"loss": 0.8665,
"mean_token_accuracy": 0.7496641755104065,
"step": 310
},
{
"epoch": 0.057883131201764054,
"grad_norm": 1.4434886028998843,
"learning_rate": 4.999266982738535e-05,
"loss": 0.9006,
"mean_token_accuracy": 0.737899649143219,
"step": 315
},
{
"epoch": 0.05880191106210952,
"grad_norm": 1.3481221265945444,
"learning_rate": 4.999082078826671e-05,
"loss": 0.8867,
"mean_token_accuracy": 0.744273555278778,
"step": 320
},
{
"epoch": 0.05972069092245498,
"grad_norm": 1.4681948954197888,
"learning_rate": 4.998876405146087e-05,
"loss": 0.9196,
"mean_token_accuracy": 0.7311511278152466,
"step": 325
},
{
"epoch": 0.06063947078280044,
"grad_norm": 1.4689641413707268,
"learning_rate": 4.998649963596131e-05,
"loss": 0.9618,
"mean_token_accuracy": 0.7219404339790344,
"step": 330
},
{
"epoch": 0.0615582506431459,
"grad_norm": 1.5996506188069943,
"learning_rate": 4.998402756267943e-05,
"loss": 0.8775,
"mean_token_accuracy": 0.7418051362037659,
"step": 335
},
{
"epoch": 0.06247703050349136,
"grad_norm": 1.3723739821198824,
"learning_rate": 4.998134785444425e-05,
"loss": 0.8755,
"mean_token_accuracy": 0.7445154070854187,
"step": 340
},
{
"epoch": 0.06339581036383682,
"grad_norm": 1.5484060868058451,
"learning_rate": 4.997846053600227e-05,
"loss": 0.8893,
"mean_token_accuracy": 0.7417627811431885,
"step": 345
},
{
"epoch": 0.06431459022418229,
"grad_norm": 1.374032479571469,
"learning_rate": 4.997536563401724e-05,
"loss": 0.926,
"mean_token_accuracy": 0.7293352723121643,
"step": 350
},
{
"epoch": 0.06523337008452774,
"grad_norm": 1.3396355962311042,
"learning_rate": 4.9972063177069894e-05,
"loss": 0.8363,
"mean_token_accuracy": 0.7584239602088928,
"step": 355
},
{
"epoch": 0.06615214994487321,
"grad_norm": 1.4157203063425363,
"learning_rate": 4.9968553195657665e-05,
"loss": 0.796,
"mean_token_accuracy": 0.7692983031272889,
"step": 360
},
{
"epoch": 0.06707092980521867,
"grad_norm": 1.4641176633277067,
"learning_rate": 4.9964835722194455e-05,
"loss": 0.8386,
"mean_token_accuracy": 0.7571163177490234,
"step": 365
},
{
"epoch": 0.06798970966556413,
"grad_norm": 1.3754550053722292,
"learning_rate": 4.996091079101028e-05,
"loss": 0.8487,
"mean_token_accuracy": 0.7535093784332275,
"step": 370
},
{
"epoch": 0.0689084895259096,
"grad_norm": 1.434562106690179,
"learning_rate": 4.995677843835103e-05,
"loss": 0.9616,
"mean_token_accuracy": 0.7227682590484619,
"step": 375
},
{
"epoch": 0.06982726938625505,
"grad_norm": 1.625727387715634,
"learning_rate": 4.995243870237803e-05,
"loss": 0.8748,
"mean_token_accuracy": 0.7452502608299255,
"step": 380
},
{
"epoch": 0.07074604924660051,
"grad_norm": 1.377875493136272,
"learning_rate": 4.994789162316778e-05,
"loss": 0.8707,
"mean_token_accuracy": 0.7500712752342225,
"step": 385
},
{
"epoch": 0.07166482910694598,
"grad_norm": 1.4193673422415896,
"learning_rate": 4.994313724271153e-05,
"loss": 0.9701,
"mean_token_accuracy": 0.720650053024292,
"step": 390
},
{
"epoch": 0.07258360896729144,
"grad_norm": 1.3011757424952435,
"learning_rate": 4.993817560491493e-05,
"loss": 0.9133,
"mean_token_accuracy": 0.7373546719551086,
"step": 395
},
{
"epoch": 0.07350238882763689,
"grad_norm": 1.53341664668217,
"learning_rate": 4.993300675559757e-05,
"loss": 0.903,
"mean_token_accuracy": 0.7406406998634338,
"step": 400
},
{
"epoch": 0.07442116868798236,
"grad_norm": 1.2526811498029908,
"learning_rate": 4.9927630742492644e-05,
"loss": 0.8457,
"mean_token_accuracy": 0.7558913826942444,
"step": 405
},
{
"epoch": 0.07533994854832782,
"grad_norm": 1.2395634856051558,
"learning_rate": 4.992204761524641e-05,
"loss": 0.7751,
"mean_token_accuracy": 0.7746022939682007,
"step": 410
},
{
"epoch": 0.07625872840867329,
"grad_norm": 1.500918488887097,
"learning_rate": 4.9916257425417796e-05,
"loss": 0.9186,
"mean_token_accuracy": 0.7323673367500305,
"step": 415
},
{
"epoch": 0.07717750826901874,
"grad_norm": 1.4971365871256093,
"learning_rate": 4.99102602264779e-05,
"loss": 0.8465,
"mean_token_accuracy": 0.7565265655517578,
"step": 420
},
{
"epoch": 0.0780962881293642,
"grad_norm": 1.2649360311231244,
"learning_rate": 4.990405607380953e-05,
"loss": 0.9161,
"mean_token_accuracy": 0.7346989989280701,
"step": 425
},
{
"epoch": 0.07901506798970967,
"grad_norm": 1.2585193614261605,
"learning_rate": 4.9897645024706634e-05,
"loss": 0.8489,
"mean_token_accuracy": 0.7520861387252807,
"step": 430
},
{
"epoch": 0.07993384785005513,
"grad_norm": 1.4511781114343052,
"learning_rate": 4.989102713837381e-05,
"loss": 0.8646,
"mean_token_accuracy": 0.7488693952560425,
"step": 435
},
{
"epoch": 0.08085262771040058,
"grad_norm": 1.3187982044036815,
"learning_rate": 4.9884202475925754e-05,
"loss": 0.8395,
"mean_token_accuracy": 0.7578373312950134,
"step": 440
},
{
"epoch": 0.08177140757074605,
"grad_norm": 1.3677439877875948,
"learning_rate": 4.9877171100386704e-05,
"loss": 0.8294,
"mean_token_accuracy": 0.7562382102012635,
"step": 445
},
{
"epoch": 0.08269018743109151,
"grad_norm": 1.4162808960942412,
"learning_rate": 4.9869933076689826e-05,
"loss": 0.9207,
"mean_token_accuracy": 0.7346067547798156,
"step": 450
},
{
"epoch": 0.08360896729143698,
"grad_norm": 1.3055956101463557,
"learning_rate": 4.9862488471676646e-05,
"loss": 0.863,
"mean_token_accuracy": 0.749237322807312,
"step": 455
},
{
"epoch": 0.08452774715178243,
"grad_norm": 1.273094086173784,
"learning_rate": 4.985483735409643e-05,
"loss": 0.869,
"mean_token_accuracy": 0.7482211112976074,
"step": 460
},
{
"epoch": 0.08544652701212789,
"grad_norm": 1.4557355335456608,
"learning_rate": 4.9846979794605526e-05,
"loss": 0.8914,
"mean_token_accuracy": 0.7409134864807129,
"step": 465
},
{
"epoch": 0.08636530687247336,
"grad_norm": 1.3769618557958148,
"learning_rate": 4.983891586576674e-05,
"loss": 0.9477,
"mean_token_accuracy": 0.7257415533065796,
"step": 470
},
{
"epoch": 0.08728408673281882,
"grad_norm": 1.2865943966930515,
"learning_rate": 4.983064564204864e-05,
"loss": 0.8597,
"mean_token_accuracy": 0.7478152275085449,
"step": 475
},
{
"epoch": 0.08820286659316427,
"grad_norm": 1.5688420565520587,
"learning_rate": 4.98221691998249e-05,
"loss": 0.9275,
"mean_token_accuracy": 0.7304094076156616,
"step": 480
},
{
"epoch": 0.08912164645350974,
"grad_norm": 1.518182798525247,
"learning_rate": 4.9813486617373545e-05,
"loss": 0.9003,
"mean_token_accuracy": 0.7381687164306641,
"step": 485
},
{
"epoch": 0.0900404263138552,
"grad_norm": 1.2240176455308946,
"learning_rate": 4.980459797487629e-05,
"loss": 0.8663,
"mean_token_accuracy": 0.7481726765632629,
"step": 490
},
{
"epoch": 0.09095920617420065,
"grad_norm": 1.348170741270758,
"learning_rate": 4.979550335441776e-05,
"loss": 0.9427,
"mean_token_accuracy": 0.7272454261779785,
"step": 495
},
{
"epoch": 0.09187798603454612,
"grad_norm": 1.3960656738373651,
"learning_rate": 4.978620283998472e-05,
"loss": 0.8659,
"mean_token_accuracy": 0.7438789248466492,
"step": 500
},
{
"epoch": 0.09279676589489158,
"grad_norm": 1.3587049381306813,
"learning_rate": 4.977669651746534e-05,
"loss": 0.9216,
"mean_token_accuracy": 0.7308396458625793,
"step": 505
},
{
"epoch": 0.09371554575523705,
"grad_norm": 1.2831086518032893,
"learning_rate": 4.976698447464839e-05,
"loss": 0.8296,
"mean_token_accuracy": 0.7564551353454589,
"step": 510
},
{
"epoch": 0.0946343256155825,
"grad_norm": 1.3126315362065073,
"learning_rate": 4.975706680122239e-05,
"loss": 0.8901,
"mean_token_accuracy": 0.7394080519676208,
"step": 515
},
{
"epoch": 0.09555310547592796,
"grad_norm": 1.326727886512097,
"learning_rate": 4.9746943588774845e-05,
"loss": 0.8862,
"mean_token_accuracy": 0.7405213117599487,
"step": 520
},
{
"epoch": 0.09647188533627343,
"grad_norm": 1.1564203773195028,
"learning_rate": 4.9736614930791345e-05,
"loss": 0.8734,
"mean_token_accuracy": 0.7428488969802857,
"step": 525
},
{
"epoch": 0.09739066519661889,
"grad_norm": 1.32950010677836,
"learning_rate": 4.972608092265473e-05,
"loss": 0.9342,
"mean_token_accuracy": 0.7337097644805908,
"step": 530
},
{
"epoch": 0.09830944505696435,
"grad_norm": 1.465381788689711,
"learning_rate": 4.971534166164421e-05,
"loss": 0.873,
"mean_token_accuracy": 0.7453126430511474,
"step": 535
},
{
"epoch": 0.09922822491730982,
"grad_norm": 1.2034166956705263,
"learning_rate": 4.970439724693445e-05,
"loss": 0.8915,
"mean_token_accuracy": 0.7360377907752991,
"step": 540
},
{
"epoch": 0.10014700477765527,
"grad_norm": 1.411243613830083,
"learning_rate": 4.969324777959465e-05,
"loss": 0.881,
"mean_token_accuracy": 0.7421476721763611,
"step": 545
},
{
"epoch": 0.10106578463800074,
"grad_norm": 1.2799706017877437,
"learning_rate": 4.968189336258767e-05,
"loss": 0.8403,
"mean_token_accuracy": 0.7561326265335083,
"step": 550
},
{
"epoch": 0.1019845644983462,
"grad_norm": 1.4044688342965088,
"learning_rate": 4.967033410076898e-05,
"loss": 0.9382,
"mean_token_accuracy": 0.7271358609199524,
"step": 555
},
{
"epoch": 0.10290334435869165,
"grad_norm": 1.4228321929552081,
"learning_rate": 4.965857010088579e-05,
"loss": 0.7972,
"mean_token_accuracy": 0.7670902132987976,
"step": 560
},
{
"epoch": 0.10382212421903712,
"grad_norm": 1.2255950177016557,
"learning_rate": 4.964660147157599e-05,
"loss": 0.8801,
"mean_token_accuracy": 0.7435322165489197,
"step": 565
},
{
"epoch": 0.10474090407938258,
"grad_norm": 1.2075368004666547,
"learning_rate": 4.9634428323367184e-05,
"loss": 0.807,
"mean_token_accuracy": 0.7603043556213379,
"step": 570
},
{
"epoch": 0.10565968393972804,
"grad_norm": 1.2015436573449916,
"learning_rate": 4.962205076867567e-05,
"loss": 0.8521,
"mean_token_accuracy": 0.7509374380111694,
"step": 575
},
{
"epoch": 0.1065784638000735,
"grad_norm": 1.2414360388934507,
"learning_rate": 4.96094689218054e-05,
"loss": 0.8354,
"mean_token_accuracy": 0.7580932378768921,
"step": 580
},
{
"epoch": 0.10749724366041896,
"grad_norm": 1.4551894189161494,
"learning_rate": 4.959668289894691e-05,
"loss": 0.9427,
"mean_token_accuracy": 0.7268964529037476,
"step": 585
},
{
"epoch": 0.10841602352076442,
"grad_norm": 1.1543433517316415,
"learning_rate": 4.9583692818176224e-05,
"loss": 0.8493,
"mean_token_accuracy": 0.7501084446907044,
"step": 590
},
{
"epoch": 0.10933480338110989,
"grad_norm": 1.320648188350521,
"learning_rate": 4.9570498799453864e-05,
"loss": 0.8808,
"mean_token_accuracy": 0.7438010811805725,
"step": 595
},
{
"epoch": 0.11025358324145534,
"grad_norm": 1.2877252561167056,
"learning_rate": 4.955710096462362e-05,
"loss": 0.8779,
"mean_token_accuracy": 0.7458134055137634,
"step": 600
},
{
"epoch": 0.11117236310180081,
"grad_norm": 1.3602330448083289,
"learning_rate": 4.954349943741148e-05,
"loss": 0.8569,
"mean_token_accuracy": 0.7528672575950622,
"step": 605
},
{
"epoch": 0.11209114296214627,
"grad_norm": 1.3741249752237,
"learning_rate": 4.952969434342452e-05,
"loss": 0.9154,
"mean_token_accuracy": 0.7366644501686096,
"step": 610
},
{
"epoch": 0.11300992282249173,
"grad_norm": 1.2731189631033297,
"learning_rate": 4.951568581014967e-05,
"loss": 0.8101,
"mean_token_accuracy": 0.7622666001319885,
"step": 615
},
{
"epoch": 0.1139287026828372,
"grad_norm": 1.3238359092320755,
"learning_rate": 4.95014739669526e-05,
"loss": 0.8681,
"mean_token_accuracy": 0.7467660307884216,
"step": 620
},
{
"epoch": 0.11484748254318265,
"grad_norm": 1.202393469716897,
"learning_rate": 4.94870589450765e-05,
"loss": 0.8455,
"mean_token_accuracy": 0.7547730088233948,
"step": 625
},
{
"epoch": 0.11576626240352811,
"grad_norm": 1.1456259434669744,
"learning_rate": 4.9472440877640856e-05,
"loss": 0.9136,
"mean_token_accuracy": 0.7327568888664245,
"step": 630
},
{
"epoch": 0.11668504226387358,
"grad_norm": 1.3091846984844044,
"learning_rate": 4.945761989964025e-05,
"loss": 0.8093,
"mean_token_accuracy": 0.7623311281204224,
"step": 635
},
{
"epoch": 0.11760382212421903,
"grad_norm": 1.2205761022635602,
"learning_rate": 4.9442596147943095e-05,
"loss": 0.8025,
"mean_token_accuracy": 0.7651844978332519,
"step": 640
},
{
"epoch": 0.1185226019845645,
"grad_norm": 1.3351343961105993,
"learning_rate": 4.942736976129035e-05,
"loss": 0.8144,
"mean_token_accuracy": 0.7613680481910705,
"step": 645
},
{
"epoch": 0.11944138184490996,
"grad_norm": 1.430243553624981,
"learning_rate": 4.941194088029431e-05,
"loss": 0.9086,
"mean_token_accuracy": 0.7342401266098022,
"step": 650
},
{
"epoch": 0.12036016170525542,
"grad_norm": 1.2556349548265657,
"learning_rate": 4.939630964743721e-05,
"loss": 0.8369,
"mean_token_accuracy": 0.7542879939079284,
"step": 655
},
{
"epoch": 0.12127894156560089,
"grad_norm": 1.3152471325239643,
"learning_rate": 4.9380476207069984e-05,
"loss": 0.8687,
"mean_token_accuracy": 0.7439038634300232,
"step": 660
},
{
"epoch": 0.12219772142594634,
"grad_norm": 1.3345822562447953,
"learning_rate": 4.936444070541091e-05,
"loss": 0.8826,
"mean_token_accuracy": 0.7404947400093078,
"step": 665
},
{
"epoch": 0.1231165012862918,
"grad_norm": 1.1814130966039975,
"learning_rate": 4.9348203290544245e-05,
"loss": 0.8797,
"mean_token_accuracy": 0.7429808259010315,
"step": 670
},
{
"epoch": 0.12403528114663727,
"grad_norm": 1.2046715413009972,
"learning_rate": 4.933176411241888e-05,
"loss": 0.7764,
"mean_token_accuracy": 0.7709425568580628,
"step": 675
},
{
"epoch": 0.12495406100698273,
"grad_norm": 1.2359612025083841,
"learning_rate": 4.9315123322846934e-05,
"loss": 0.8757,
"mean_token_accuracy": 0.7420969247817993,
"step": 680
},
{
"epoch": 0.1258728408673282,
"grad_norm": 1.1918939688175052,
"learning_rate": 4.929828107550237e-05,
"loss": 0.8439,
"mean_token_accuracy": 0.7540834426879883,
"step": 685
},
{
"epoch": 0.12679162072767364,
"grad_norm": 1.1826184909342607,
"learning_rate": 4.928123752591957e-05,
"loss": 0.8801,
"mean_token_accuracy": 0.7422125935554504,
"step": 690
},
{
"epoch": 0.1277104005880191,
"grad_norm": 1.255648051492681,
"learning_rate": 4.926399283149188e-05,
"loss": 0.8429,
"mean_token_accuracy": 0.7524136543273926,
"step": 695
},
{
"epoch": 0.12862918044836458,
"grad_norm": 1.1605510458950452,
"learning_rate": 4.9246547151470205e-05,
"loss": 0.9021,
"mean_token_accuracy": 0.7373670816421509,
"step": 700
},
{
"epoch": 0.12954796030871002,
"grad_norm": 1.3238344494004473,
"learning_rate": 4.9228900646961474e-05,
"loss": 0.9057,
"mean_token_accuracy": 0.7368204951286316,
"step": 705
},
{
"epoch": 0.1304667401690555,
"grad_norm": 1.4306839271560785,
"learning_rate": 4.921105348092721e-05,
"loss": 0.7625,
"mean_token_accuracy": 0.7744701862335205,
"step": 710
},
{
"epoch": 0.13138552002940096,
"grad_norm": 1.2165530726795277,
"learning_rate": 4.919300581818197e-05,
"loss": 0.9154,
"mean_token_accuracy": 0.7359979271888732,
"step": 715
},
{
"epoch": 0.13230429988974643,
"grad_norm": 1.286173428804154,
"learning_rate": 4.91747578253919e-05,
"loss": 0.8585,
"mean_token_accuracy": 0.7492180824279785,
"step": 720
},
{
"epoch": 0.13322307975009187,
"grad_norm": 1.1184283075387254,
"learning_rate": 4.91563096710731e-05,
"loss": 0.8903,
"mean_token_accuracy": 0.7384656071662903,
"step": 725
},
{
"epoch": 0.13414185961043734,
"grad_norm": 1.3568563512463876,
"learning_rate": 4.913766152559015e-05,
"loss": 0.9028,
"mean_token_accuracy": 0.7395498275756835,
"step": 730
},
{
"epoch": 0.1350606394707828,
"grad_norm": 1.4808915141658863,
"learning_rate": 4.911881356115449e-05,
"loss": 0.9084,
"mean_token_accuracy": 0.7352772116661072,
"step": 735
},
{
"epoch": 0.13597941933112825,
"grad_norm": 1.2602878053642064,
"learning_rate": 4.909976595182285e-05,
"loss": 0.8593,
"mean_token_accuracy": 0.7459996342658997,
"step": 740
},
{
"epoch": 0.13689819919147372,
"grad_norm": 1.1270873445880305,
"learning_rate": 4.908051887349562e-05,
"loss": 0.85,
"mean_token_accuracy": 0.751087772846222,
"step": 745
},
{
"epoch": 0.1378169790518192,
"grad_norm": 1.2900722126857331,
"learning_rate": 4.906107250391527e-05,
"loss": 0.8333,
"mean_token_accuracy": 0.755375337600708,
"step": 750
},
{
"epoch": 0.13873575891216464,
"grad_norm": 1.3067285908120727,
"learning_rate": 4.9041427022664645e-05,
"loss": 0.8661,
"mean_token_accuracy": 0.7458638072013855,
"step": 755
},
{
"epoch": 0.1396545387725101,
"grad_norm": 1.1705361580698839,
"learning_rate": 4.902158261116537e-05,
"loss": 0.9127,
"mean_token_accuracy": 0.736179769039154,
"step": 760
},
{
"epoch": 0.14057331863285558,
"grad_norm": 1.0599820145259966,
"learning_rate": 4.900153945267612e-05,
"loss": 0.8433,
"mean_token_accuracy": 0.7551376700401307,
"step": 765
},
{
"epoch": 0.14149209849320102,
"grad_norm": 1.282905055646039,
"learning_rate": 4.8981297732291e-05,
"loss": 0.8554,
"mean_token_accuracy": 0.7511672616004944,
"step": 770
},
{
"epoch": 0.1424108783535465,
"grad_norm": 1.3437700886513289,
"learning_rate": 4.896085763693773e-05,
"loss": 0.9227,
"mean_token_accuracy": 0.733566403388977,
"step": 775
},
{
"epoch": 0.14332965821389196,
"grad_norm": 1.1297724344725806,
"learning_rate": 4.894021935537603e-05,
"loss": 0.8507,
"mean_token_accuracy": 0.7513582468032837,
"step": 780
},
{
"epoch": 0.1442484380742374,
"grad_norm": 1.3689332887934986,
"learning_rate": 4.891938307819578e-05,
"loss": 0.8849,
"mean_token_accuracy": 0.7436878085136414,
"step": 785
},
{
"epoch": 0.14516721793458287,
"grad_norm": 1.1498634876050613,
"learning_rate": 4.889834899781535e-05,
"loss": 0.8429,
"mean_token_accuracy": 0.753303873538971,
"step": 790
},
{
"epoch": 0.14608599779492834,
"grad_norm": 1.140208758740646,
"learning_rate": 4.887711730847975e-05,
"loss": 0.7601,
"mean_token_accuracy": 0.773739755153656,
"step": 795
},
{
"epoch": 0.14700477765527378,
"grad_norm": 1.3016614852116664,
"learning_rate": 4.885568820625885e-05,
"loss": 0.9065,
"mean_token_accuracy": 0.738306713104248,
"step": 800
},
{
"epoch": 0.14792355751561925,
"grad_norm": 1.2509551297005208,
"learning_rate": 4.883406188904564e-05,
"loss": 0.7737,
"mean_token_accuracy": 0.7723647236824036,
"step": 805
},
{
"epoch": 0.14884233737596472,
"grad_norm": 1.2304502037360896,
"learning_rate": 4.8812238556554284e-05,
"loss": 0.9195,
"mean_token_accuracy": 0.7318793773651123,
"step": 810
},
{
"epoch": 0.1497611172363102,
"grad_norm": 1.160104764900568,
"learning_rate": 4.8790218410318374e-05,
"loss": 0.8646,
"mean_token_accuracy": 0.7459649324417115,
"step": 815
},
{
"epoch": 0.15067989709665564,
"grad_norm": 1.4673570710895905,
"learning_rate": 4.8768001653689024e-05,
"loss": 0.9062,
"mean_token_accuracy": 0.7345248699188233,
"step": 820
},
{
"epoch": 0.1515986769570011,
"grad_norm": 1.171954013640319,
"learning_rate": 4.874558849183299e-05,
"loss": 0.8867,
"mean_token_accuracy": 0.7401813983917236,
"step": 825
},
{
"epoch": 0.15251745681734658,
"grad_norm": 1.1448163904842934,
"learning_rate": 4.872297913173081e-05,
"loss": 0.8006,
"mean_token_accuracy": 0.7656459212303162,
"step": 830
},
{
"epoch": 0.15343623667769202,
"grad_norm": 1.2603194785671739,
"learning_rate": 4.870017378217485e-05,
"loss": 0.9037,
"mean_token_accuracy": 0.7398361563682556,
"step": 835
},
{
"epoch": 0.1543550165380375,
"grad_norm": 1.3408423677606498,
"learning_rate": 4.86771726537674e-05,
"loss": 0.9383,
"mean_token_accuracy": 0.7285741686820983,
"step": 840
},
{
"epoch": 0.15527379639838296,
"grad_norm": 1.2835325171738854,
"learning_rate": 4.865397595891872e-05,
"loss": 0.8478,
"mean_token_accuracy": 0.75036780834198,
"step": 845
},
{
"epoch": 0.1561925762587284,
"grad_norm": 1.1980068838306206,
"learning_rate": 4.8630583911845084e-05,
"loss": 0.7627,
"mean_token_accuracy": 0.7726967930793762,
"step": 850
},
{
"epoch": 0.15711135611907387,
"grad_norm": 1.34919534477724,
"learning_rate": 4.860699672856682e-05,
"loss": 0.8838,
"mean_token_accuracy": 0.7415394306182861,
"step": 855
},
{
"epoch": 0.15803013597941934,
"grad_norm": 0.9624608396618806,
"learning_rate": 4.8583214626906246e-05,
"loss": 0.8601,
"mean_token_accuracy": 0.7497328519821167,
"step": 860
},
{
"epoch": 0.15894891583976478,
"grad_norm": 1.1697672429495145,
"learning_rate": 4.8559237826485766e-05,
"loss": 0.8228,
"mean_token_accuracy": 0.7570769906044006,
"step": 865
},
{
"epoch": 0.15986769570011025,
"grad_norm": 1.1935364635660837,
"learning_rate": 4.853506654872575e-05,
"loss": 0.9142,
"mean_token_accuracy": 0.7316269755363465,
"step": 870
},
{
"epoch": 0.16078647556045572,
"grad_norm": 1.2042547413733748,
"learning_rate": 4.851070101684252e-05,
"loss": 0.8742,
"mean_token_accuracy": 0.7418438553810119,
"step": 875
},
{
"epoch": 0.16170525542080116,
"grad_norm": 1.0707432913591217,
"learning_rate": 4.84861414558463e-05,
"loss": 0.8196,
"mean_token_accuracy": 0.759805703163147,
"step": 880
},
{
"epoch": 0.16262403528114663,
"grad_norm": 1.0629646526131367,
"learning_rate": 4.846138809253914e-05,
"loss": 0.874,
"mean_token_accuracy": 0.7462024927139282,
"step": 885
},
{
"epoch": 0.1635428151414921,
"grad_norm": 1.2106261509029042,
"learning_rate": 4.843644115551279e-05,
"loss": 0.9328,
"mean_token_accuracy": 0.7267791390419006,
"step": 890
},
{
"epoch": 0.16446159500183755,
"grad_norm": 1.1324590773137175,
"learning_rate": 4.841130087514662e-05,
"loss": 0.9211,
"mean_token_accuracy": 0.7309597492218017,
"step": 895
},
{
"epoch": 0.16538037486218302,
"grad_norm": 1.184689325779188,
"learning_rate": 4.8385967483605496e-05,
"loss": 0.8618,
"mean_token_accuracy": 0.7446626782417297,
"step": 900
},
{
"epoch": 0.16629915472252849,
"grad_norm": 1.138313035203993,
"learning_rate": 4.836044121483759e-05,
"loss": 0.8447,
"mean_token_accuracy": 0.7529171824455261,
"step": 905
},
{
"epoch": 0.16721793458287396,
"grad_norm": 1.1673447148654126,
"learning_rate": 4.833472230457229e-05,
"loss": 0.8979,
"mean_token_accuracy": 0.7358499765396118,
"step": 910
},
{
"epoch": 0.1681367144432194,
"grad_norm": 1.084780317701543,
"learning_rate": 4.830881099031795e-05,
"loss": 0.9185,
"mean_token_accuracy": 0.7328409552574158,
"step": 915
},
{
"epoch": 0.16905549430356487,
"grad_norm": 1.1240239095787248,
"learning_rate": 4.828270751135975e-05,
"loss": 0.7975,
"mean_token_accuracy": 0.7656158566474914,
"step": 920
},
{
"epoch": 0.16997427416391034,
"grad_norm": 1.2150981390103572,
"learning_rate": 4.8256412108757466e-05,
"loss": 0.9078,
"mean_token_accuracy": 0.7345719337463379,
"step": 925
},
{
"epoch": 0.17089305402425578,
"grad_norm": 1.1664463354681245,
"learning_rate": 4.822992502534325e-05,
"loss": 0.9038,
"mean_token_accuracy": 0.7323048114776611,
"step": 930
},
{
"epoch": 0.17181183388460125,
"grad_norm": 1.3331259616500464,
"learning_rate": 4.820324650571938e-05,
"loss": 0.8287,
"mean_token_accuracy": 0.7578937888145447,
"step": 935
},
{
"epoch": 0.17273061374494672,
"grad_norm": 1.2600746932123381,
"learning_rate": 4.8176376796256e-05,
"loss": 0.9795,
"mean_token_accuracy": 0.7109430193901062,
"step": 940
},
{
"epoch": 0.17364939360529216,
"grad_norm": 1.4190547333037917,
"learning_rate": 4.814931614508884e-05,
"loss": 0.8004,
"mean_token_accuracy": 0.7619459390640259,
"step": 945
},
{
"epoch": 0.17456817346563763,
"grad_norm": 1.149443686205735,
"learning_rate": 4.812206480211697e-05,
"loss": 0.8498,
"mean_token_accuracy": 0.7484025120735168,
"step": 950
},
{
"epoch": 0.1754869533259831,
"grad_norm": 1.717641721438028,
"learning_rate": 4.809462301900042e-05,
"loss": 0.8926,
"mean_token_accuracy": 0.7387519717216492,
"step": 955
},
{
"epoch": 0.17640573318632854,
"grad_norm": 1.286493811954347,
"learning_rate": 4.806699104915789e-05,
"loss": 0.9063,
"mean_token_accuracy": 0.733875036239624,
"step": 960
},
{
"epoch": 0.17732451304667401,
"grad_norm": 0.9127620652567309,
"learning_rate": 4.803916914776445e-05,
"loss": 0.7582,
"mean_token_accuracy": 0.7734929203987122,
"step": 965
},
{
"epoch": 0.17824329290701948,
"grad_norm": 1.1385225837750934,
"learning_rate": 4.801115757174911e-05,
"loss": 0.8003,
"mean_token_accuracy": 0.7619274735450745,
"step": 970
},
{
"epoch": 0.17916207276736493,
"grad_norm": 1.0365511688061633,
"learning_rate": 4.798295657979249e-05,
"loss": 0.8788,
"mean_token_accuracy": 0.7446885228157043,
"step": 975
},
{
"epoch": 0.1800808526277104,
"grad_norm": 1.168341096775078,
"learning_rate": 4.795456643232444e-05,
"loss": 0.8201,
"mean_token_accuracy": 0.7583209872245789,
"step": 980
},
{
"epoch": 0.18099963248805587,
"grad_norm": 1.2070861524091874,
"learning_rate": 4.79259873915216e-05,
"loss": 0.8247,
"mean_token_accuracy": 0.754137146472931,
"step": 985
},
{
"epoch": 0.1819184123484013,
"grad_norm": 1.1865394539255603,
"learning_rate": 4.789721972130499e-05,
"loss": 0.8068,
"mean_token_accuracy": 0.7631414651870727,
"step": 990
},
{
"epoch": 0.18283719220874678,
"grad_norm": 1.052924845633483,
"learning_rate": 4.7868263687337613e-05,
"loss": 0.7659,
"mean_token_accuracy": 0.7754044890403747,
"step": 995
},
{
"epoch": 0.18375597206909225,
"grad_norm": 1.1878988268801094,
"learning_rate": 4.783911955702196e-05,
"loss": 0.8474,
"mean_token_accuracy": 0.7484631299972534,
"step": 1000
},
{
"epoch": 0.18467475192943772,
"grad_norm": 1.2268828963125744,
"learning_rate": 4.7809787599497504e-05,
"loss": 0.8361,
"mean_token_accuracy": 0.756050968170166,
"step": 1005
},
{
"epoch": 0.18559353178978316,
"grad_norm": 1.1621815374305198,
"learning_rate": 4.778026808563833e-05,
"loss": 0.8081,
"mean_token_accuracy": 0.7624092817306518,
"step": 1010
},
{
"epoch": 0.18651231165012863,
"grad_norm": 1.3537024766805776,
"learning_rate": 4.775056128805051e-05,
"loss": 0.8903,
"mean_token_accuracy": 0.7347793221473694,
"step": 1015
},
{
"epoch": 0.1874310915104741,
"grad_norm": 1.1053617147333854,
"learning_rate": 4.772066748106967e-05,
"loss": 0.8345,
"mean_token_accuracy": 0.7528262138366699,
"step": 1020
},
{
"epoch": 0.18834987137081954,
"grad_norm": 1.270871735891865,
"learning_rate": 4.7690586940758405e-05,
"loss": 0.8519,
"mean_token_accuracy": 0.7496292948722839,
"step": 1025
},
{
"epoch": 0.189268651231165,
"grad_norm": 1.1951114465300396,
"learning_rate": 4.766031994490377e-05,
"loss": 0.8632,
"mean_token_accuracy": 0.7459157705307007,
"step": 1030
},
{
"epoch": 0.19018743109151048,
"grad_norm": 1.2145135516278585,
"learning_rate": 4.762986677301468e-05,
"loss": 0.7844,
"mean_token_accuracy": 0.7638005137443542,
"step": 1035
},
{
"epoch": 0.19110621095185593,
"grad_norm": 1.2377029681542826,
"learning_rate": 4.759922770631935e-05,
"loss": 0.8294,
"mean_token_accuracy": 0.7549967885017395,
"step": 1040
},
{
"epoch": 0.1920249908122014,
"grad_norm": 1.2317484830433059,
"learning_rate": 4.7568403027762696e-05,
"loss": 0.7993,
"mean_token_accuracy": 0.763549017906189,
"step": 1045
},
{
"epoch": 0.19294377067254687,
"grad_norm": 1.106617224635935,
"learning_rate": 4.75373930220037e-05,
"loss": 0.8114,
"mean_token_accuracy": 0.7601318001747132,
"step": 1050
},
{
"epoch": 0.1938625505328923,
"grad_norm": 1.2406149376598858,
"learning_rate": 4.7506197975412826e-05,
"loss": 0.901,
"mean_token_accuracy": 0.7375799655914307,
"step": 1055
},
{
"epoch": 0.19478133039323778,
"grad_norm": 1.368471676340253,
"learning_rate": 4.747481817606933e-05,
"loss": 0.9158,
"mean_token_accuracy": 0.730099368095398,
"step": 1060
},
{
"epoch": 0.19570011025358325,
"grad_norm": 1.2045592901618265,
"learning_rate": 4.7443253913758617e-05,
"loss": 0.8766,
"mean_token_accuracy": 0.7418853521347046,
"step": 1065
},
{
"epoch": 0.1966188901139287,
"grad_norm": 1.0674799173203142,
"learning_rate": 4.741150547996958e-05,
"loss": 0.8079,
"mean_token_accuracy": 0.763364028930664,
"step": 1070
},
{
"epoch": 0.19753766997427416,
"grad_norm": 1.3569414017684345,
"learning_rate": 4.737957316789189e-05,
"loss": 0.8038,
"mean_token_accuracy": 0.7652618408203125,
"step": 1075
},
{
"epoch": 0.19845644983461963,
"grad_norm": 1.237112155686031,
"learning_rate": 4.734745727241328e-05,
"loss": 0.9153,
"mean_token_accuracy": 0.7342644929885864,
"step": 1080
},
{
"epoch": 0.19937522969496507,
"grad_norm": 1.1267547624767125,
"learning_rate": 4.7315158090116854e-05,
"loss": 0.8808,
"mean_token_accuracy": 0.7401048541069031,
"step": 1085
},
{
"epoch": 0.20029400955531054,
"grad_norm": 1.1393796581940456,
"learning_rate": 4.728267591927831e-05,
"loss": 0.8232,
"mean_token_accuracy": 0.7574564695358277,
"step": 1090
},
{
"epoch": 0.201212789415656,
"grad_norm": 1.0439059841475136,
"learning_rate": 4.7250011059863207e-05,
"loss": 0.8255,
"mean_token_accuracy": 0.7512354731559754,
"step": 1095
},
{
"epoch": 0.20213156927600148,
"grad_norm": 1.1400055909091193,
"learning_rate": 4.721716381352422e-05,
"loss": 0.8547,
"mean_token_accuracy": 0.7499767065048217,
"step": 1100
},
{
"epoch": 0.20305034913634692,
"grad_norm": 1.1315286051266153,
"learning_rate": 4.718413448359828e-05,
"loss": 0.8083,
"mean_token_accuracy": 0.7595677256584168,
"step": 1105
},
{
"epoch": 0.2039691289966924,
"grad_norm": 1.1864811752644395,
"learning_rate": 4.715092337510386e-05,
"loss": 0.8823,
"mean_token_accuracy": 0.7407166361808777,
"step": 1110
},
{
"epoch": 0.20488790885703786,
"grad_norm": 1.1151247170457734,
"learning_rate": 4.711753079473809e-05,
"loss": 0.8344,
"mean_token_accuracy": 0.7524962782859802,
"step": 1115
},
{
"epoch": 0.2058066887173833,
"grad_norm": 1.1211350911528808,
"learning_rate": 4.7083957050873965e-05,
"loss": 0.8168,
"mean_token_accuracy": 0.755139684677124,
"step": 1120
},
{
"epoch": 0.20672546857772878,
"grad_norm": 1.2831690134615248,
"learning_rate": 4.705020245355749e-05,
"loss": 0.9413,
"mean_token_accuracy": 0.72357656955719,
"step": 1125
},
{
"epoch": 0.20764424843807425,
"grad_norm": 1.034972121395733,
"learning_rate": 4.701626731450479e-05,
"loss": 0.8167,
"mean_token_accuracy": 0.7568554997444152,
"step": 1130
},
{
"epoch": 0.2085630282984197,
"grad_norm": 1.012657589652359,
"learning_rate": 4.6982151947099276e-05,
"loss": 0.833,
"mean_token_accuracy": 0.7557546138763428,
"step": 1135
},
{
"epoch": 0.20948180815876516,
"grad_norm": 1.1282926356775929,
"learning_rate": 4.694785666638871e-05,
"loss": 0.8341,
"mean_token_accuracy": 0.7547509074211121,
"step": 1140
},
{
"epoch": 0.21040058801911063,
"grad_norm": 1.0977221337381091,
"learning_rate": 4.691338178908232e-05,
"loss": 0.8154,
"mean_token_accuracy": 0.7610322952270507,
"step": 1145
},
{
"epoch": 0.21131936787945607,
"grad_norm": 1.093844804786045,
"learning_rate": 4.687872763354788e-05,
"loss": 0.8406,
"mean_token_accuracy": 0.7520750164985657,
"step": 1150
},
{
"epoch": 0.21223814773980154,
"grad_norm": 1.0307882599984655,
"learning_rate": 4.684389451980873e-05,
"loss": 0.7764,
"mean_token_accuracy": 0.7720999121665955,
"step": 1155
},
{
"epoch": 0.213156927600147,
"grad_norm": 1.2074018111359583,
"learning_rate": 4.680888276954087e-05,
"loss": 0.8309,
"mean_token_accuracy": 0.7553021907806396,
"step": 1160
},
{
"epoch": 0.21407570746049245,
"grad_norm": 1.0917419046828303,
"learning_rate": 4.677369270606997e-05,
"loss": 0.8418,
"mean_token_accuracy": 0.7502257823944092,
"step": 1165
},
{
"epoch": 0.21499448732083792,
"grad_norm": 1.0820900629957635,
"learning_rate": 4.673832465436837e-05,
"loss": 0.7671,
"mean_token_accuracy": 0.7708743929862976,
"step": 1170
},
{
"epoch": 0.2159132671811834,
"grad_norm": 1.089247755922322,
"learning_rate": 4.67027789410521e-05,
"loss": 0.8538,
"mean_token_accuracy": 0.7494909524917602,
"step": 1175
},
{
"epoch": 0.21683204704152884,
"grad_norm": 1.1197011602210687,
"learning_rate": 4.6667055894377857e-05,
"loss": 0.8645,
"mean_token_accuracy": 0.7444219350814819,
"step": 1180
},
{
"epoch": 0.2177508269018743,
"grad_norm": 1.1249433607043806,
"learning_rate": 4.663115584423995e-05,
"loss": 0.7794,
"mean_token_accuracy": 0.7685939073562622,
"step": 1185
},
{
"epoch": 0.21866960676221978,
"grad_norm": 1.0486000440190792,
"learning_rate": 4.659507912216732e-05,
"loss": 0.9305,
"mean_token_accuracy": 0.7281524419784546,
"step": 1190
},
{
"epoch": 0.21958838662256525,
"grad_norm": 1.167591023080102,
"learning_rate": 4.6558826061320384e-05,
"loss": 0.7969,
"mean_token_accuracy": 0.7660298943519592,
"step": 1195
},
{
"epoch": 0.2205071664829107,
"grad_norm": 1.1633015665730886,
"learning_rate": 4.652239699648803e-05,
"loss": 0.8005,
"mean_token_accuracy": 0.7678845167160034,
"step": 1200
},
{
"epoch": 0.22142594634325616,
"grad_norm": 1.1234875884233444,
"learning_rate": 4.648579226408452e-05,
"loss": 0.8267,
"mean_token_accuracy": 0.7536736965179444,
"step": 1205
},
{
"epoch": 0.22234472620360163,
"grad_norm": 1.1117529930711065,
"learning_rate": 4.644901220214634e-05,
"loss": 0.8249,
"mean_token_accuracy": 0.7595484375953674,
"step": 1210
},
{
"epoch": 0.22326350606394707,
"grad_norm": 1.3136389068951135,
"learning_rate": 4.641205715032912e-05,
"loss": 0.7867,
"mean_token_accuracy": 0.7665369153022766,
"step": 1215
},
{
"epoch": 0.22418228592429254,
"grad_norm": 1.1409276324481026,
"learning_rate": 4.637492744990448e-05,
"loss": 0.8867,
"mean_token_accuracy": 0.7416447997093201,
"step": 1220
},
{
"epoch": 0.225101065784638,
"grad_norm": 1.2345145683805576,
"learning_rate": 4.6337623443756866e-05,
"loss": 0.7859,
"mean_token_accuracy": 0.7682509303092957,
"step": 1225
},
{
"epoch": 0.22601984564498345,
"grad_norm": 1.052220032293985,
"learning_rate": 4.630014547638043e-05,
"loss": 0.8437,
"mean_token_accuracy": 0.7497885942459106,
"step": 1230
},
{
"epoch": 0.22693862550532892,
"grad_norm": 1.1364062731073377,
"learning_rate": 4.626249389387577e-05,
"loss": 0.7733,
"mean_token_accuracy": 0.769334900379181,
"step": 1235
},
{
"epoch": 0.2278574053656744,
"grad_norm": 1.21546684775545,
"learning_rate": 4.622466904394683e-05,
"loss": 0.8526,
"mean_token_accuracy": 0.7492899537086487,
"step": 1240
},
{
"epoch": 0.22877618522601983,
"grad_norm": 1.2541380409672236,
"learning_rate": 4.6186671275897615e-05,
"loss": 0.8368,
"mean_token_accuracy": 0.7558955073356628,
"step": 1245
},
{
"epoch": 0.2296949650863653,
"grad_norm": 1.2101979681857873,
"learning_rate": 4.614850094062899e-05,
"loss": 0.8771,
"mean_token_accuracy": 0.7446130990982056,
"step": 1250
},
{
"epoch": 0.23061374494671077,
"grad_norm": 1.0752792678811776,
"learning_rate": 4.6110158390635444e-05,
"loss": 0.8294,
"mean_token_accuracy": 0.7560481548309326,
"step": 1255
},
{
"epoch": 0.23153252480705622,
"grad_norm": 1.0122734972045033,
"learning_rate": 4.6071643980001825e-05,
"loss": 0.8331,
"mean_token_accuracy": 0.7490222334861756,
"step": 1260
},
{
"epoch": 0.2324513046674017,
"grad_norm": 1.1910590877588172,
"learning_rate": 4.603295806440009e-05,
"loss": 0.7723,
"mean_token_accuracy": 0.7715782880783081,
"step": 1265
},
{
"epoch": 0.23337008452774716,
"grad_norm": 1.0660522556095817,
"learning_rate": 4.599410100108598e-05,
"loss": 0.8337,
"mean_token_accuracy": 0.7520880579948426,
"step": 1270
},
{
"epoch": 0.2342888643880926,
"grad_norm": 0.9720081756723926,
"learning_rate": 4.5955073148895784e-05,
"loss": 0.8179,
"mean_token_accuracy": 0.7595946788787842,
"step": 1275
},
{
"epoch": 0.23520764424843807,
"grad_norm": 1.1185494512129268,
"learning_rate": 4.5915874868242944e-05,
"loss": 0.8655,
"mean_token_accuracy": 0.7462962985038757,
"step": 1280
},
{
"epoch": 0.23612642410878354,
"grad_norm": 1.2897065646338821,
"learning_rate": 4.5876506521114805e-05,
"loss": 0.8233,
"mean_token_accuracy": 0.7591111898422241,
"step": 1285
},
{
"epoch": 0.237045203969129,
"grad_norm": 1.1197184208975648,
"learning_rate": 4.583696847106923e-05,
"loss": 0.8585,
"mean_token_accuracy": 0.7474006295204163,
"step": 1290
},
{
"epoch": 0.23796398382947445,
"grad_norm": 1.2068264298929217,
"learning_rate": 4.579726108323123e-05,
"loss": 0.9136,
"mean_token_accuracy": 0.7314973592758178,
"step": 1295
},
{
"epoch": 0.23888276368981992,
"grad_norm": 1.109906508524664,
"learning_rate": 4.5757384724289646e-05,
"loss": 0.7947,
"mean_token_accuracy": 0.765422809123993,
"step": 1300
},
{
"epoch": 0.2398015435501654,
"grad_norm": 1.2554961190022804,
"learning_rate": 4.57173397624937e-05,
"loss": 0.8618,
"mean_token_accuracy": 0.7456292510032654,
"step": 1305
},
{
"epoch": 0.24072032341051083,
"grad_norm": 1.358639687516225,
"learning_rate": 4.567712656764964e-05,
"loss": 0.9191,
"mean_token_accuracy": 0.734754741191864,
"step": 1310
},
{
"epoch": 0.2416391032708563,
"grad_norm": 1.088229104509902,
"learning_rate": 4.5636745511117305e-05,
"loss": 0.8064,
"mean_token_accuracy": 0.7617093205451966,
"step": 1315
},
{
"epoch": 0.24255788313120177,
"grad_norm": 0.9758498688217456,
"learning_rate": 4.559619696580671e-05,
"loss": 0.7845,
"mean_token_accuracy": 0.7633411526679993,
"step": 1320
},
{
"epoch": 0.24347666299154722,
"grad_norm": 1.2094716352749706,
"learning_rate": 4.555548130617455e-05,
"loss": 0.7992,
"mean_token_accuracy": 0.7611837387084961,
"step": 1325
},
{
"epoch": 0.24439544285189269,
"grad_norm": 1.1427008905651062,
"learning_rate": 4.551459890822083e-05,
"loss": 0.9158,
"mean_token_accuracy": 0.733444607257843,
"step": 1330
},
{
"epoch": 0.24531422271223816,
"grad_norm": 1.1713473006194377,
"learning_rate": 4.547355014948534e-05,
"loss": 0.845,
"mean_token_accuracy": 0.7504712104797363,
"step": 1335
},
{
"epoch": 0.2462330025725836,
"grad_norm": 1.2702712471344686,
"learning_rate": 4.543233540904414e-05,
"loss": 0.8789,
"mean_token_accuracy": 0.7380323767662048,
"step": 1340
},
{
"epoch": 0.24715178243292907,
"grad_norm": 0.9938570050140668,
"learning_rate": 4.539095506750614e-05,
"loss": 0.8884,
"mean_token_accuracy": 0.7428679585456848,
"step": 1345
},
{
"epoch": 0.24807056229327454,
"grad_norm": 1.1279522970582605,
"learning_rate": 4.534940950700949e-05,
"loss": 0.8753,
"mean_token_accuracy": 0.7421611309051513,
"step": 1350
},
{
"epoch": 0.24898934215361998,
"grad_norm": 1.2588052277827508,
"learning_rate": 4.530769911121815e-05,
"loss": 0.8473,
"mean_token_accuracy": 0.7505762934684753,
"step": 1355
},
{
"epoch": 0.24990812201396545,
"grad_norm": 1.096669421502074,
"learning_rate": 4.526582426531826e-05,
"loss": 0.8353,
"mean_token_accuracy": 0.7524473786354064,
"step": 1360
},
{
"epoch": 0.2508269018743109,
"grad_norm": 1.0282246931353665,
"learning_rate": 4.5223785356014634e-05,
"loss": 0.8702,
"mean_token_accuracy": 0.7407379150390625,
"step": 1365
},
{
"epoch": 0.2517456817346564,
"grad_norm": 1.093689037275291,
"learning_rate": 4.518158277152717e-05,
"loss": 0.8428,
"mean_token_accuracy": 0.7473413228988648,
"step": 1370
},
{
"epoch": 0.25266446159500183,
"grad_norm": 1.0139841625812813,
"learning_rate": 4.51392169015873e-05,
"loss": 0.7938,
"mean_token_accuracy": 0.7664546370506287,
"step": 1375
},
{
"epoch": 0.2535832414553473,
"grad_norm": 1.1094571215928501,
"learning_rate": 4.509668813743429e-05,
"loss": 0.8017,
"mean_token_accuracy": 0.7636664628982544,
"step": 1380
},
{
"epoch": 0.2545020213156928,
"grad_norm": 1.2159505822364018,
"learning_rate": 4.505399687181178e-05,
"loss": 0.8561,
"mean_token_accuracy": 0.7478325366973877,
"step": 1385
},
{
"epoch": 0.2554208011760382,
"grad_norm": 1.0494835563541474,
"learning_rate": 4.501114349896401e-05,
"loss": 0.8611,
"mean_token_accuracy": 0.7463506817817688,
"step": 1390
},
{
"epoch": 0.25633958103638366,
"grad_norm": 1.5114697063891234,
"learning_rate": 4.496812841463229e-05,
"loss": 0.755,
"mean_token_accuracy": 0.7759661912918091,
"step": 1395
},
{
"epoch": 0.25725836089672915,
"grad_norm": 1.0714063923864912,
"learning_rate": 4.492495201605126e-05,
"loss": 0.7358,
"mean_token_accuracy": 0.7826925754547119,
"step": 1400
},
{
"epoch": 0.2581771407570746,
"grad_norm": 1.0758074940838653,
"learning_rate": 4.4881614701945296e-05,
"loss": 0.8875,
"mean_token_accuracy": 0.7398916482925415,
"step": 1405
},
{
"epoch": 0.25909592061742004,
"grad_norm": 1.215294771351842,
"learning_rate": 4.483811687252477e-05,
"loss": 0.8486,
"mean_token_accuracy": 0.7489311933517456,
"step": 1410
},
{
"epoch": 0.26001470047776554,
"grad_norm": 1.0794344380786876,
"learning_rate": 4.479445892948238e-05,
"loss": 0.7227,
"mean_token_accuracy": 0.784658420085907,
"step": 1415
},
{
"epoch": 0.260933480338111,
"grad_norm": 1.069221904312748,
"learning_rate": 4.4750641275989454e-05,
"loss": 0.8486,
"mean_token_accuracy": 0.7488225340843201,
"step": 1420
},
{
"epoch": 0.2618522601984565,
"grad_norm": 0.9620708178874104,
"learning_rate": 4.470666431669217e-05,
"loss": 0.8034,
"mean_token_accuracy": 0.764237916469574,
"step": 1425
},
{
"epoch": 0.2627710400588019,
"grad_norm": 1.638007368384327,
"learning_rate": 4.4662528457707925e-05,
"loss": 0.8552,
"mean_token_accuracy": 0.7481104493141174,
"step": 1430
},
{
"epoch": 0.26368981991914736,
"grad_norm": 1.057487407894169,
"learning_rate": 4.4618234106621464e-05,
"loss": 0.8672,
"mean_token_accuracy": 0.7452296495437623,
"step": 1435
},
{
"epoch": 0.26460859977949286,
"grad_norm": 0.9574304406812039,
"learning_rate": 4.457378167248117e-05,
"loss": 0.834,
"mean_token_accuracy": 0.7546884775161743,
"step": 1440
},
{
"epoch": 0.2655273796398383,
"grad_norm": 0.9556755030834319,
"learning_rate": 4.452917156579533e-05,
"loss": 0.8089,
"mean_token_accuracy": 0.7618599176406861,
"step": 1445
},
{
"epoch": 0.26644615950018374,
"grad_norm": 1.0854572977327381,
"learning_rate": 4.4484404198528275e-05,
"loss": 0.8759,
"mean_token_accuracy": 0.7410173654556275,
"step": 1450
},
{
"epoch": 0.26736493936052924,
"grad_norm": 1.2057318993172499,
"learning_rate": 4.443947998409658e-05,
"loss": 0.8436,
"mean_token_accuracy": 0.7513974785804749,
"step": 1455
},
{
"epoch": 0.2682837192208747,
"grad_norm": 1.0376956546110065,
"learning_rate": 4.439439933736532e-05,
"loss": 0.849,
"mean_token_accuracy": 0.7492346167564392,
"step": 1460
},
{
"epoch": 0.2692024990812201,
"grad_norm": 1.1016805520406512,
"learning_rate": 4.434916267464416e-05,
"loss": 0.7783,
"mean_token_accuracy": 0.7683018922805787,
"step": 1465
},
{
"epoch": 0.2701212789415656,
"grad_norm": 1.2408322866975516,
"learning_rate": 4.430377041368351e-05,
"loss": 0.8772,
"mean_token_accuracy": 0.738334059715271,
"step": 1470
},
{
"epoch": 0.27104005880191107,
"grad_norm": 1.090229737310603,
"learning_rate": 4.425822297367075e-05,
"loss": 0.7981,
"mean_token_accuracy": 0.7645934343338012,
"step": 1475
},
{
"epoch": 0.2719588386622565,
"grad_norm": 1.1864254928194882,
"learning_rate": 4.4212520775226256e-05,
"loss": 0.8155,
"mean_token_accuracy": 0.7581284165382385,
"step": 1480
},
{
"epoch": 0.272877618522602,
"grad_norm": 0.9904815545158214,
"learning_rate": 4.4166664240399606e-05,
"loss": 0.8076,
"mean_token_accuracy": 0.7610304713249206,
"step": 1485
},
{
"epoch": 0.27379639838294745,
"grad_norm": 1.0991992028451756,
"learning_rate": 4.412065379266559e-05,
"loss": 0.9142,
"mean_token_accuracy": 0.7303188562393188,
"step": 1490
},
{
"epoch": 0.2747151782432929,
"grad_norm": 1.0413802785892232,
"learning_rate": 4.4074489856920406e-05,
"loss": 0.8434,
"mean_token_accuracy": 0.7503148317337036,
"step": 1495
},
{
"epoch": 0.2756339581036384,
"grad_norm": 0.8788289426619142,
"learning_rate": 4.4028172859477626e-05,
"loss": 0.7476,
"mean_token_accuracy": 0.7781436324119568,
"step": 1500
},
{
"epoch": 0.27655273796398383,
"grad_norm": 1.1409867687324795,
"learning_rate": 4.398170322806435e-05,
"loss": 0.9066,
"mean_token_accuracy": 0.7312582850456237,
"step": 1505
},
{
"epoch": 0.2774715178243293,
"grad_norm": 1.0726549315103535,
"learning_rate": 4.3935081391817194e-05,
"loss": 0.8533,
"mean_token_accuracy": 0.7442232012748718,
"step": 1510
},
{
"epoch": 0.27839029768467477,
"grad_norm": 1.0743736375904043,
"learning_rate": 4.388830778127837e-05,
"loss": 0.8109,
"mean_token_accuracy": 0.755815064907074,
"step": 1515
},
{
"epoch": 0.2793090775450202,
"grad_norm": 1.1737139300743868,
"learning_rate": 4.3841382828391684e-05,
"loss": 0.783,
"mean_token_accuracy": 0.770452618598938,
"step": 1520
},
{
"epoch": 0.28022785740536565,
"grad_norm": 0.9732962716875773,
"learning_rate": 4.379430696649856e-05,
"loss": 0.8423,
"mean_token_accuracy": 0.7509778499603271,
"step": 1525
},
{
"epoch": 0.28114663726571115,
"grad_norm": 1.2143466656736133,
"learning_rate": 4.374708063033403e-05,
"loss": 0.8262,
"mean_token_accuracy": 0.7557825446128845,
"step": 1530
},
{
"epoch": 0.2820654171260566,
"grad_norm": 1.1347429120882544,
"learning_rate": 4.369970425602269e-05,
"loss": 0.7872,
"mean_token_accuracy": 0.7643797039985657,
"step": 1535
},
{
"epoch": 0.28298419698640204,
"grad_norm": 1.0471262947053317,
"learning_rate": 4.365217828107476e-05,
"loss": 0.8227,
"mean_token_accuracy": 0.7580597996711731,
"step": 1540
},
{
"epoch": 0.28390297684674753,
"grad_norm": 0.9720065253460837,
"learning_rate": 4.3604503144381964e-05,
"loss": 0.8133,
"mean_token_accuracy": 0.7570616483688355,
"step": 1545
},
{
"epoch": 0.284821756707093,
"grad_norm": 1.0983353140713001,
"learning_rate": 4.3556679286213495e-05,
"loss": 0.8416,
"mean_token_accuracy": 0.7502852201461792,
"step": 1550
},
{
"epoch": 0.2857405365674384,
"grad_norm": 0.9936040636732534,
"learning_rate": 4.3508707148211946e-05,
"loss": 0.7351,
"mean_token_accuracy": 0.7779555797576905,
"step": 1555
},
{
"epoch": 0.2866593164277839,
"grad_norm": 1.0923713064546872,
"learning_rate": 4.3460587173389284e-05,
"loss": 0.8502,
"mean_token_accuracy": 0.7482675671577453,
"step": 1560
},
{
"epoch": 0.28757809628812936,
"grad_norm": 1.0011444400816414,
"learning_rate": 4.341231980612266e-05,
"loss": 0.8008,
"mean_token_accuracy": 0.7629394650459289,
"step": 1565
},
{
"epoch": 0.2884968761484748,
"grad_norm": 1.057600337329318,
"learning_rate": 4.336390549215041e-05,
"loss": 0.8052,
"mean_token_accuracy": 0.7602485775947571,
"step": 1570
},
{
"epoch": 0.2894156560088203,
"grad_norm": 0.9400172537775919,
"learning_rate": 4.331534467856785e-05,
"loss": 0.8037,
"mean_token_accuracy": 0.7623314976692199,
"step": 1575
},
{
"epoch": 0.29033443586916574,
"grad_norm": 1.1540389749120974,
"learning_rate": 4.3266637813823216e-05,
"loss": 0.8087,
"mean_token_accuracy": 0.7602805018424987,
"step": 1580
},
{
"epoch": 0.2912532157295112,
"grad_norm": 1.1256888029606915,
"learning_rate": 4.3217785347713486e-05,
"loss": 0.8196,
"mean_token_accuracy": 0.7612602710723877,
"step": 1585
},
{
"epoch": 0.2921719955898567,
"grad_norm": 1.1230902967533625,
"learning_rate": 4.3168787731380224e-05,
"loss": 0.7872,
"mean_token_accuracy": 0.7645440459251404,
"step": 1590
},
{
"epoch": 0.2930907754502021,
"grad_norm": 1.0556955876997007,
"learning_rate": 4.3119645417305435e-05,
"loss": 0.8697,
"mean_token_accuracy": 0.7421263337135315,
"step": 1595
},
{
"epoch": 0.29400955531054757,
"grad_norm": 1.2917611391283017,
"learning_rate": 4.307035885930736e-05,
"loss": 0.7776,
"mean_token_accuracy": 0.7674265027046203,
"step": 1600
},
{
"epoch": 0.29492833517089306,
"grad_norm": 1.1648596155514652,
"learning_rate": 4.3020928512536326e-05,
"loss": 0.7851,
"mean_token_accuracy": 0.7669198989868165,
"step": 1605
},
{
"epoch": 0.2958471150312385,
"grad_norm": 1.0970597977003158,
"learning_rate": 4.29713548334705e-05,
"loss": 0.8279,
"mean_token_accuracy": 0.7507619738578797,
"step": 1610
},
{
"epoch": 0.296765894891584,
"grad_norm": 1.054076923212483,
"learning_rate": 4.292163827991168e-05,
"loss": 0.7722,
"mean_token_accuracy": 0.7705003499984742,
"step": 1615
},
{
"epoch": 0.29768467475192945,
"grad_norm": 1.2125775832361394,
"learning_rate": 4.2871779310981114e-05,
"loss": 0.8192,
"mean_token_accuracy": 0.7588199496269226,
"step": 1620
},
{
"epoch": 0.2986034546122749,
"grad_norm": 0.9888172610894965,
"learning_rate": 4.282177838711518e-05,
"loss": 0.7953,
"mean_token_accuracy": 0.7682381868362427,
"step": 1625
},
{
"epoch": 0.2995222344726204,
"grad_norm": 0.9981457336248658,
"learning_rate": 4.277163597006121e-05,
"loss": 0.824,
"mean_token_accuracy": 0.7541024565696717,
"step": 1630
},
{
"epoch": 0.30044101433296583,
"grad_norm": 1.036834210307202,
"learning_rate": 4.2721352522873184e-05,
"loss": 0.7632,
"mean_token_accuracy": 0.7723967909812928,
"step": 1635
},
{
"epoch": 0.30135979419331127,
"grad_norm": 1.1851229887536607,
"learning_rate": 4.2670928509907446e-05,
"loss": 0.8349,
"mean_token_accuracy": 0.7524407744407654,
"step": 1640
},
{
"epoch": 0.30227857405365677,
"grad_norm": 1.0536941977185987,
"learning_rate": 4.262036439681847e-05,
"loss": 0.8138,
"mean_token_accuracy": 0.7575963020324707,
"step": 1645
},
{
"epoch": 0.3031973539140022,
"grad_norm": 1.0372574521222562,
"learning_rate": 4.256966065055449e-05,
"loss": 0.7325,
"mean_token_accuracy": 0.7790537357330323,
"step": 1650
},
{
"epoch": 0.30411613377434765,
"grad_norm": 1.0174389909956805,
"learning_rate": 4.251881773935325e-05,
"loss": 0.864,
"mean_token_accuracy": 0.74665367603302,
"step": 1655
},
{
"epoch": 0.30503491363469315,
"grad_norm": 0.9845084927156172,
"learning_rate": 4.246783613273761e-05,
"loss": 0.7645,
"mean_token_accuracy": 0.7687517642974854,
"step": 1660
},
{
"epoch": 0.3059536934950386,
"grad_norm": 1.0336916342848663,
"learning_rate": 4.2416716301511305e-05,
"loss": 0.8479,
"mean_token_accuracy": 0.7481852293014526,
"step": 1665
},
{
"epoch": 0.30687247335538403,
"grad_norm": 1.0381531094343786,
"learning_rate": 4.2365458717754494e-05,
"loss": 0.8085,
"mean_token_accuracy": 0.75991370677948,
"step": 1670
},
{
"epoch": 0.30779125321572953,
"grad_norm": 1.110015448227854,
"learning_rate": 4.231406385481947e-05,
"loss": 0.7717,
"mean_token_accuracy": 0.7670859694480896,
"step": 1675
},
{
"epoch": 0.308710033076075,
"grad_norm": 1.036136344871459,
"learning_rate": 4.226253218732629e-05,
"loss": 0.7949,
"mean_token_accuracy": 0.7634945988655091,
"step": 1680
},
{
"epoch": 0.3096288129364204,
"grad_norm": 1.0484877630675096,
"learning_rate": 4.221086419115832e-05,
"loss": 0.8448,
"mean_token_accuracy": 0.751638388633728,
"step": 1685
},
{
"epoch": 0.3105475927967659,
"grad_norm": 6.349506903012944,
"learning_rate": 4.2159060343457947e-05,
"loss": 0.9101,
"mean_token_accuracy": 0.7370145440101623,
"step": 1690
},
{
"epoch": 0.31146637265711136,
"grad_norm": 1.1977614262895908,
"learning_rate": 4.2107121122622066e-05,
"loss": 0.8389,
"mean_token_accuracy": 0.7488813638687134,
"step": 1695
},
{
"epoch": 0.3123851525174568,
"grad_norm": 1.2380653768153889,
"learning_rate": 4.2055047008297757e-05,
"loss": 0.8342,
"mean_token_accuracy": 0.7505980730056763,
"step": 1700
},
{
"epoch": 0.3133039323778023,
"grad_norm": 0.9546362693630366,
"learning_rate": 4.200283848137777e-05,
"loss": 0.7855,
"mean_token_accuracy": 0.7642045140266418,
"step": 1705
},
{
"epoch": 0.31422271223814774,
"grad_norm": 1.022441012881404,
"learning_rate": 4.195049602399616e-05,
"loss": 0.7877,
"mean_token_accuracy": 0.7621595740318299,
"step": 1710
},
{
"epoch": 0.3151414920984932,
"grad_norm": 1.0392246094486983,
"learning_rate": 4.189802011952378e-05,
"loss": 0.878,
"mean_token_accuracy": 0.744194757938385,
"step": 1715
},
{
"epoch": 0.3160602719588387,
"grad_norm": 1.077282143260173,
"learning_rate": 4.184541125256385e-05,
"loss": 0.7917,
"mean_token_accuracy": 0.7647501945495605,
"step": 1720
},
{
"epoch": 0.3169790518191841,
"grad_norm": 0.9775790537668483,
"learning_rate": 4.1792669908947436e-05,
"loss": 0.8597,
"mean_token_accuracy": 0.74363933801651,
"step": 1725
},
{
"epoch": 0.31789783167952956,
"grad_norm": 1.1117317785310954,
"learning_rate": 4.1739796575729045e-05,
"loss": 0.8114,
"mean_token_accuracy": 0.7558189272880554,
"step": 1730
},
{
"epoch": 0.31881661153987506,
"grad_norm": 1.1369274014331534,
"learning_rate": 4.168679174118205e-05,
"loss": 0.8715,
"mean_token_accuracy": 0.7428115725517273,
"step": 1735
},
{
"epoch": 0.3197353914002205,
"grad_norm": 1.0490276575831161,
"learning_rate": 4.1633655894794206e-05,
"loss": 0.8579,
"mean_token_accuracy": 0.7467806100845337,
"step": 1740
},
{
"epoch": 0.32065417126056595,
"grad_norm": 1.0117146921952147,
"learning_rate": 4.158038952726315e-05,
"loss": 0.7832,
"mean_token_accuracy": 0.7676323890686035,
"step": 1745
},
{
"epoch": 0.32157295112091144,
"grad_norm": 1.0514207056273204,
"learning_rate": 4.1526993130491834e-05,
"loss": 0.7417,
"mean_token_accuracy": 0.779768443107605,
"step": 1750
},
{
"epoch": 0.3224917309812569,
"grad_norm": 1.1710593436653487,
"learning_rate": 4.147346719758401e-05,
"loss": 0.759,
"mean_token_accuracy": 0.7754043459892273,
"step": 1755
},
{
"epoch": 0.32341051084160233,
"grad_norm": 1.1210033487742597,
"learning_rate": 4.141981222283969e-05,
"loss": 0.8426,
"mean_token_accuracy": 0.7512526273727417,
"step": 1760
},
{
"epoch": 0.3243292907019478,
"grad_norm": 1.067779284913716,
"learning_rate": 4.136602870175049e-05,
"loss": 0.7312,
"mean_token_accuracy": 0.7808745861053467,
"step": 1765
},
{
"epoch": 0.32524807056229327,
"grad_norm": 0.9739576922638749,
"learning_rate": 4.131211713099522e-05,
"loss": 0.7442,
"mean_token_accuracy": 0.7744468688964844,
"step": 1770
},
{
"epoch": 0.3261668504226387,
"grad_norm": 1.013655175975763,
"learning_rate": 4.1258078008435103e-05,
"loss": 0.7824,
"mean_token_accuracy": 0.7647914290428162,
"step": 1775
},
{
"epoch": 0.3270856302829842,
"grad_norm": 0.9850930887046532,
"learning_rate": 4.120391183310934e-05,
"loss": 0.7274,
"mean_token_accuracy": 0.7834605932235718,
"step": 1780
},
{
"epoch": 0.32800441014332965,
"grad_norm": 1.323489905547871,
"learning_rate": 4.114961910523042e-05,
"loss": 0.8074,
"mean_token_accuracy": 0.7612802505493164,
"step": 1785
},
{
"epoch": 0.3289231900036751,
"grad_norm": 1.035219788914723,
"learning_rate": 4.109520032617952e-05,
"loss": 0.8369,
"mean_token_accuracy": 0.7539438486099244,
"step": 1790
},
{
"epoch": 0.3298419698640206,
"grad_norm": 0.984325460018373,
"learning_rate": 4.104065599850183e-05,
"loss": 0.8593,
"mean_token_accuracy": 0.7480033159255981,
"step": 1795
},
{
"epoch": 0.33076074972436603,
"grad_norm": 1.1555611010512028,
"learning_rate": 4.098598662590202e-05,
"loss": 0.7045,
"mean_token_accuracy": 0.7892690062522888,
"step": 1800
},
{
"epoch": 0.33167952958471153,
"grad_norm": 1.0781062858261419,
"learning_rate": 4.093119271323947e-05,
"loss": 0.8231,
"mean_token_accuracy": 0.75406334400177,
"step": 1805
},
{
"epoch": 0.33259830944505697,
"grad_norm": 1.0129880605444779,
"learning_rate": 4.0876274766523674e-05,
"loss": 0.9059,
"mean_token_accuracy": 0.7340885043144226,
"step": 1810
},
{
"epoch": 0.3335170893054024,
"grad_norm": 1.214320088802432,
"learning_rate": 4.0821233292909575e-05,
"loss": 0.8751,
"mean_token_accuracy": 0.7407148957252503,
"step": 1815
},
{
"epoch": 0.3344358691657479,
"grad_norm": 1.000936871191356,
"learning_rate": 4.076606880069283e-05,
"loss": 0.7856,
"mean_token_accuracy": 0.7644298434257507,
"step": 1820
},
{
"epoch": 0.33535464902609335,
"grad_norm": 1.1385038309062536,
"learning_rate": 4.0710781799305146e-05,
"loss": 0.8165,
"mean_token_accuracy": 0.7551571488380432,
"step": 1825
},
{
"epoch": 0.3362734288864388,
"grad_norm": 1.308386913579212,
"learning_rate": 4.065537279930961e-05,
"loss": 0.8436,
"mean_token_accuracy": 0.7464751482009888,
"step": 1830
},
{
"epoch": 0.3371922087467843,
"grad_norm": 1.1854394162632642,
"learning_rate": 4.059984231239587e-05,
"loss": 0.8499,
"mean_token_accuracy": 0.7523553133010864,
"step": 1835
},
{
"epoch": 0.33811098860712974,
"grad_norm": 1.0597911593288654,
"learning_rate": 4.054419085137558e-05,
"loss": 0.7912,
"mean_token_accuracy": 0.7623480677604675,
"step": 1840
},
{
"epoch": 0.3390297684674752,
"grad_norm": 1.1332445452310214,
"learning_rate": 4.0488418930177464e-05,
"loss": 0.7861,
"mean_token_accuracy": 0.7626782655715942,
"step": 1845
},
{
"epoch": 0.3399485483278207,
"grad_norm": 1.1468762268738129,
"learning_rate": 4.043252706384273e-05,
"loss": 0.8866,
"mean_token_accuracy": 0.7364044427871704,
"step": 1850
},
{
"epoch": 0.3408673281881661,
"grad_norm": 1.0468393046787807,
"learning_rate": 4.037651576852021e-05,
"loss": 0.8192,
"mean_token_accuracy": 0.7569101452827454,
"step": 1855
},
{
"epoch": 0.34178610804851156,
"grad_norm": 1.034873991581434,
"learning_rate": 4.032038556146167e-05,
"loss": 0.7799,
"mean_token_accuracy": 0.7652035236358643,
"step": 1860
},
{
"epoch": 0.34270488790885706,
"grad_norm": 1.0816344286074944,
"learning_rate": 4.0264136961017e-05,
"loss": 0.8062,
"mean_token_accuracy": 0.7586339831352233,
"step": 1865
},
{
"epoch": 0.3436236677692025,
"grad_norm": 1.1216437423138468,
"learning_rate": 4.020777048662939e-05,
"loss": 0.8526,
"mean_token_accuracy": 0.7471354722976684,
"step": 1870
},
{
"epoch": 0.34454244762954794,
"grad_norm": 1.2787020788596146,
"learning_rate": 4.01512866588306e-05,
"loss": 0.8337,
"mean_token_accuracy": 0.7524662256240845,
"step": 1875
},
{
"epoch": 0.34546122748989344,
"grad_norm": 0.9726950672685023,
"learning_rate": 4.009468599923613e-05,
"loss": 0.8254,
"mean_token_accuracy": 0.7547502636909484,
"step": 1880
},
{
"epoch": 0.3463800073502389,
"grad_norm": 1.1522393714280965,
"learning_rate": 4.0037969030540356e-05,
"loss": 0.8788,
"mean_token_accuracy": 0.7409179091453553,
"step": 1885
},
{
"epoch": 0.3472987872105843,
"grad_norm": 1.0134958833049312,
"learning_rate": 3.9981136276511786e-05,
"loss": 0.7966,
"mean_token_accuracy": 0.7628639936447144,
"step": 1890
},
{
"epoch": 0.3482175670709298,
"grad_norm": 1.0067212082614574,
"learning_rate": 3.992418826198816e-05,
"loss": 0.7483,
"mean_token_accuracy": 0.7759244441986084,
"step": 1895
},
{
"epoch": 0.34913634693127527,
"grad_norm": 1.1440056397503298,
"learning_rate": 3.9867125512871604e-05,
"loss": 0.8465,
"mean_token_accuracy": 0.7483215093612671,
"step": 1900
},
{
"epoch": 0.3500551267916207,
"grad_norm": 1.015411004374869,
"learning_rate": 3.980994855612384e-05,
"loss": 0.7634,
"mean_token_accuracy": 0.768380320072174,
"step": 1905
},
{
"epoch": 0.3509739066519662,
"grad_norm": 1.0214270916485333,
"learning_rate": 3.975265791976122e-05,
"loss": 0.8031,
"mean_token_accuracy": 0.7592991948127746,
"step": 1910
},
{
"epoch": 0.35189268651231165,
"grad_norm": 1.0257322832506945,
"learning_rate": 3.969525413284994e-05,
"loss": 0.7808,
"mean_token_accuracy": 0.7686658024787902,
"step": 1915
},
{
"epoch": 0.3528114663726571,
"grad_norm": 1.0256575127828191,
"learning_rate": 3.96377377255011e-05,
"loss": 0.8711,
"mean_token_accuracy": 0.7431510090827942,
"step": 1920
},
{
"epoch": 0.3537302462330026,
"grad_norm": 1.2324007321771868,
"learning_rate": 3.958010922886582e-05,
"loss": 0.8813,
"mean_token_accuracy": 0.7428903222084046,
"step": 1925
},
{
"epoch": 0.35464902609334803,
"grad_norm": 1.3413818534531692,
"learning_rate": 3.9522369175130345e-05,
"loss": 0.8645,
"mean_token_accuracy": 0.7381167054176331,
"step": 1930
},
{
"epoch": 0.35556780595369347,
"grad_norm": 1.1450695156011848,
"learning_rate": 3.946451809751114e-05,
"loss": 0.8475,
"mean_token_accuracy": 0.7497512817382812,
"step": 1935
},
{
"epoch": 0.35648658581403897,
"grad_norm": 1.2054216083900955,
"learning_rate": 3.9406556530249905e-05,
"loss": 0.8103,
"mean_token_accuracy": 0.7571905732154847,
"step": 1940
},
{
"epoch": 0.3574053656743844,
"grad_norm": 1.06377210645749,
"learning_rate": 3.934848500860875e-05,
"loss": 0.7883,
"mean_token_accuracy": 0.7618215918540955,
"step": 1945
},
{
"epoch": 0.35832414553472985,
"grad_norm": 1.0312951220792854,
"learning_rate": 3.9290304068865144e-05,
"loss": 0.8129,
"mean_token_accuracy": 0.7582242131233216,
"step": 1950
},
{
"epoch": 0.35924292539507535,
"grad_norm": 0.9872094884682482,
"learning_rate": 3.923201424830701e-05,
"loss": 0.7861,
"mean_token_accuracy": 0.765390944480896,
"step": 1955
},
{
"epoch": 0.3601617052554208,
"grad_norm": 0.9446381855103185,
"learning_rate": 3.917361608522778e-05,
"loss": 0.8067,
"mean_token_accuracy": 0.7581991076469421,
"step": 1960
},
{
"epoch": 0.36108048511576624,
"grad_norm": 1.0047794050440646,
"learning_rate": 3.911511011892141e-05,
"loss": 0.815,
"mean_token_accuracy": 0.7577335119247437,
"step": 1965
},
{
"epoch": 0.36199926497611173,
"grad_norm": 1.0514135855864823,
"learning_rate": 3.905649688967736e-05,
"loss": 0.8003,
"mean_token_accuracy": 0.7607754588127136,
"step": 1970
},
{
"epoch": 0.3629180448364572,
"grad_norm": 0.9011315096836434,
"learning_rate": 3.8997776938775664e-05,
"loss": 0.8548,
"mean_token_accuracy": 0.748826515674591,
"step": 1975
},
{
"epoch": 0.3638368246968026,
"grad_norm": 1.0629251850472634,
"learning_rate": 3.893895080848192e-05,
"loss": 0.8871,
"mean_token_accuracy": 0.7375021696090698,
"step": 1980
},
{
"epoch": 0.3647556045571481,
"grad_norm": 1.002663862883126,
"learning_rate": 3.888001904204223e-05,
"loss": 0.7724,
"mean_token_accuracy": 0.769203269481659,
"step": 1985
},
{
"epoch": 0.36567438441749356,
"grad_norm": 0.9877909023393563,
"learning_rate": 3.882098218367826e-05,
"loss": 0.7703,
"mean_token_accuracy": 0.7695886373519898,
"step": 1990
},
{
"epoch": 0.36659316427783906,
"grad_norm": 1.148691358597791,
"learning_rate": 3.876184077858214e-05,
"loss": 0.707,
"mean_token_accuracy": 0.7888103008270264,
"step": 1995
},
{
"epoch": 0.3675119441381845,
"grad_norm": 0.9571216367735665,
"learning_rate": 3.8702595372911524e-05,
"loss": 0.7846,
"mean_token_accuracy": 0.769954240322113,
"step": 2000
},
{
"epoch": 0.36843072399852994,
"grad_norm": 0.8971436133233381,
"learning_rate": 3.86432465137844e-05,
"loss": 0.783,
"mean_token_accuracy": 0.7654212713241577,
"step": 2005
},
{
"epoch": 0.36934950385887544,
"grad_norm": 1.0855213777834216,
"learning_rate": 3.8583794749274197e-05,
"loss": 0.7858,
"mean_token_accuracy": 0.7648387908935547,
"step": 2010
},
{
"epoch": 0.3702682837192209,
"grad_norm": 1.0016490287283184,
"learning_rate": 3.852424062840465e-05,
"loss": 0.7997,
"mean_token_accuracy": 0.7611153483390808,
"step": 2015
},
{
"epoch": 0.3711870635795663,
"grad_norm": 1.0410544587880413,
"learning_rate": 3.846458470114469e-05,
"loss": 0.8434,
"mean_token_accuracy": 0.745345389842987,
"step": 2020
},
{
"epoch": 0.3721058434399118,
"grad_norm": 1.0734066596059144,
"learning_rate": 3.8404827518403424e-05,
"loss": 0.8303,
"mean_token_accuracy": 0.7534924626350403,
"step": 2025
},
{
"epoch": 0.37302462330025726,
"grad_norm": 1.1067287910564152,
"learning_rate": 3.834496963202506e-05,
"loss": 0.7138,
"mean_token_accuracy": 0.7858679056167602,
"step": 2030
},
{
"epoch": 0.3739434031606027,
"grad_norm": 1.0240680595901026,
"learning_rate": 3.828501159478374e-05,
"loss": 0.7816,
"mean_token_accuracy": 0.767118227481842,
"step": 2035
},
{
"epoch": 0.3748621830209482,
"grad_norm": 0.9108092672977341,
"learning_rate": 3.822495396037849e-05,
"loss": 0.7888,
"mean_token_accuracy": 0.7624866485595703,
"step": 2040
},
{
"epoch": 0.37578096288129365,
"grad_norm": 0.9408803997681696,
"learning_rate": 3.816479728342811e-05,
"loss": 0.7799,
"mean_token_accuracy": 0.7651725172996521,
"step": 2045
},
{
"epoch": 0.3766997427416391,
"grad_norm": 1.0909405717967111,
"learning_rate": 3.8104542119466024e-05,
"loss": 0.8526,
"mean_token_accuracy": 0.7467872500419617,
"step": 2050
},
{
"epoch": 0.3776185226019846,
"grad_norm": 1.0413672136227896,
"learning_rate": 3.804418902493515e-05,
"loss": 0.8557,
"mean_token_accuracy": 0.7429945468902588,
"step": 2055
},
{
"epoch": 0.37853730246233,
"grad_norm": 1.013837379671012,
"learning_rate": 3.798373855718281e-05,
"loss": 0.7514,
"mean_token_accuracy": 0.7755364179611206,
"step": 2060
},
{
"epoch": 0.37945608232267547,
"grad_norm": 1.1123775153381206,
"learning_rate": 3.7923191274455485e-05,
"loss": 0.8503,
"mean_token_accuracy": 0.746312165260315,
"step": 2065
},
{
"epoch": 0.38037486218302097,
"grad_norm": 1.1418197878690564,
"learning_rate": 3.786254773589378e-05,
"loss": 0.8214,
"mean_token_accuracy": 0.7556997299194336,
"step": 2070
},
{
"epoch": 0.3812936420433664,
"grad_norm": 1.067367442039563,
"learning_rate": 3.780180850152716e-05,
"loss": 0.8306,
"mean_token_accuracy": 0.7545937180519104,
"step": 2075
},
{
"epoch": 0.38221242190371185,
"grad_norm": 0.9953541285561653,
"learning_rate": 3.774097413226885e-05,
"loss": 0.8767,
"mean_token_accuracy": 0.7388915061950684,
"step": 2080
},
{
"epoch": 0.38313120176405735,
"grad_norm": 0.9620898865674795,
"learning_rate": 3.768004518991061e-05,
"loss": 0.8024,
"mean_token_accuracy": 0.7610628366470337,
"step": 2085
},
{
"epoch": 0.3840499816244028,
"grad_norm": 1.040454257360025,
"learning_rate": 3.761902223711754e-05,
"loss": 0.837,
"mean_token_accuracy": 0.749622106552124,
"step": 2090
},
{
"epoch": 0.38496876148474823,
"grad_norm": 0.9516590882481228,
"learning_rate": 3.755790583742296e-05,
"loss": 0.8153,
"mean_token_accuracy": 0.758633291721344,
"step": 2095
},
{
"epoch": 0.38588754134509373,
"grad_norm": 1.0838072652279054,
"learning_rate": 3.749669655522308e-05,
"loss": 0.8902,
"mean_token_accuracy": 0.7317674040794373,
"step": 2100
},
{
"epoch": 0.3868063212054392,
"grad_norm": 0.9749499461968839,
"learning_rate": 3.743539495577193e-05,
"loss": 0.8897,
"mean_token_accuracy": 0.739715039730072,
"step": 2105
},
{
"epoch": 0.3877251010657846,
"grad_norm": 0.9972815952612917,
"learning_rate": 3.7374001605176026e-05,
"loss": 0.7977,
"mean_token_accuracy": 0.7625495314598083,
"step": 2110
},
{
"epoch": 0.3886438809261301,
"grad_norm": 0.9037607701703367,
"learning_rate": 3.731251707038919e-05,
"loss": 0.7822,
"mean_token_accuracy": 0.7656629920005799,
"step": 2115
},
{
"epoch": 0.38956266078647556,
"grad_norm": 1.0416251065743547,
"learning_rate": 3.725094191920731e-05,
"loss": 0.8298,
"mean_token_accuracy": 0.7543026089668274,
"step": 2120
},
{
"epoch": 0.390481440646821,
"grad_norm": 0.947462414303698,
"learning_rate": 3.7189276720263124e-05,
"loss": 0.7782,
"mean_token_accuracy": 0.7649134397506714,
"step": 2125
},
{
"epoch": 0.3914002205071665,
"grad_norm": 0.8871190316863635,
"learning_rate": 3.712752204302089e-05,
"loss": 0.8158,
"mean_token_accuracy": 0.7549549221992493,
"step": 2130
},
{
"epoch": 0.39231900036751194,
"grad_norm": 1.0445560118226038,
"learning_rate": 3.7065678457771224e-05,
"loss": 0.817,
"mean_token_accuracy": 0.7530762314796448,
"step": 2135
},
{
"epoch": 0.3932377802278574,
"grad_norm": 0.9686024583555402,
"learning_rate": 3.700374653562577e-05,
"loss": 0.7923,
"mean_token_accuracy": 0.7622018694877625,
"step": 2140
},
{
"epoch": 0.3941565600882029,
"grad_norm": 0.9213469714809804,
"learning_rate": 3.694172684851193e-05,
"loss": 0.7721,
"mean_token_accuracy": 0.7674794912338256,
"step": 2145
},
{
"epoch": 0.3950753399485483,
"grad_norm": 0.9345497817342381,
"learning_rate": 3.6879619969167614e-05,
"loss": 0.7492,
"mean_token_accuracy": 0.776430857181549,
"step": 2150
},
{
"epoch": 0.39599411980889376,
"grad_norm": 1.0666312061499093,
"learning_rate": 3.681742647113594e-05,
"loss": 0.8168,
"mean_token_accuracy": 0.7584180355072021,
"step": 2155
},
{
"epoch": 0.39691289966923926,
"grad_norm": 1.1091479631196337,
"learning_rate": 3.67551469287599e-05,
"loss": 0.8346,
"mean_token_accuracy": 0.7529264330863953,
"step": 2160
},
{
"epoch": 0.3978316795295847,
"grad_norm": 1.0802706313899886,
"learning_rate": 3.669278191717712e-05,
"loss": 0.8326,
"mean_token_accuracy": 0.748162055015564,
"step": 2165
},
{
"epoch": 0.39875045938993015,
"grad_norm": 1.2155117409623286,
"learning_rate": 3.6630332012314485e-05,
"loss": 0.8257,
"mean_token_accuracy": 0.7545464992523193,
"step": 2170
},
{
"epoch": 0.39966923925027564,
"grad_norm": 1.0387043754774363,
"learning_rate": 3.656779779088287e-05,
"loss": 0.7581,
"mean_token_accuracy": 0.769706928730011,
"step": 2175
},
{
"epoch": 0.4005880191106211,
"grad_norm": 0.9698917316969403,
"learning_rate": 3.650517983037179e-05,
"loss": 0.7412,
"mean_token_accuracy": 0.7771862506866455,
"step": 2180
},
{
"epoch": 0.4015067989709666,
"grad_norm": 0.9123240142029096,
"learning_rate": 3.6442478709044065e-05,
"loss": 0.7079,
"mean_token_accuracy": 0.7833864569664002,
"step": 2185
},
{
"epoch": 0.402425578831312,
"grad_norm": 1.0355123737426308,
"learning_rate": 3.6379695005930504e-05,
"loss": 0.7094,
"mean_token_accuracy": 0.7866922855377197,
"step": 2190
},
{
"epoch": 0.40334435869165747,
"grad_norm": 1.1267999437797982,
"learning_rate": 3.6316829300824514e-05,
"loss": 0.7638,
"mean_token_accuracy": 0.7694135665893554,
"step": 2195
},
{
"epoch": 0.40426313855200297,
"grad_norm": 0.9483752352311176,
"learning_rate": 3.6253882174276784e-05,
"loss": 0.8328,
"mean_token_accuracy": 0.7523651957511902,
"step": 2200
},
{
"epoch": 0.4051819184123484,
"grad_norm": 1.081747935405834,
"learning_rate": 3.619085420758994e-05,
"loss": 0.8821,
"mean_token_accuracy": 0.7345280289649964,
"step": 2205
},
{
"epoch": 0.40610069827269385,
"grad_norm": 1.2409774807789806,
"learning_rate": 3.612774598281309e-05,
"loss": 0.8638,
"mean_token_accuracy": 0.7448987007141114,
"step": 2210
},
{
"epoch": 0.40701947813303935,
"grad_norm": 1.0398975018242138,
"learning_rate": 3.606455808273656e-05,
"loss": 0.7303,
"mean_token_accuracy": 0.7799215197563172,
"step": 2215
},
{
"epoch": 0.4079382579933848,
"grad_norm": 1.1058430385137907,
"learning_rate": 3.600129109088644e-05,
"loss": 0.7463,
"mean_token_accuracy": 0.7737818479537963,
"step": 2220
},
{
"epoch": 0.40885703785373023,
"grad_norm": 2.3840075862300405,
"learning_rate": 3.593794559151921e-05,
"loss": 0.827,
"mean_token_accuracy": 0.7540715932846069,
"step": 2225
},
{
"epoch": 0.40977581771407573,
"grad_norm": 1.0043211971634207,
"learning_rate": 3.5874522169616346e-05,
"loss": 0.8156,
"mean_token_accuracy": 0.7552896976470947,
"step": 2230
},
{
"epoch": 0.41069459757442117,
"grad_norm": 1.040415072100305,
"learning_rate": 3.581102141087893e-05,
"loss": 0.7356,
"mean_token_accuracy": 0.7774260997772217,
"step": 2235
},
{
"epoch": 0.4116133774347666,
"grad_norm": 1.0107449068601633,
"learning_rate": 3.5747443901722246e-05,
"loss": 0.8332,
"mean_token_accuracy": 0.7481484651565552,
"step": 2240
},
{
"epoch": 0.4125321572951121,
"grad_norm": 1.0982955854264607,
"learning_rate": 3.568379022927032e-05,
"loss": 0.8514,
"mean_token_accuracy": 0.7456109881401062,
"step": 2245
},
{
"epoch": 0.41345093715545755,
"grad_norm": 0.9554863556690953,
"learning_rate": 3.562006098135056e-05,
"loss": 0.8014,
"mean_token_accuracy": 0.7609956502914429,
"step": 2250
},
{
"epoch": 0.414369717015803,
"grad_norm": 1.076779827899152,
"learning_rate": 3.5556256746488256e-05,
"loss": 0.7832,
"mean_token_accuracy": 0.7661887645721436,
"step": 2255
},
{
"epoch": 0.4152884968761485,
"grad_norm": 0.9529179922047704,
"learning_rate": 3.549237811390125e-05,
"loss": 0.8153,
"mean_token_accuracy": 0.7538660645484925,
"step": 2260
},
{
"epoch": 0.41620727673649394,
"grad_norm": 1.1195875955943517,
"learning_rate": 3.542842567349435e-05,
"loss": 0.7221,
"mean_token_accuracy": 0.7824627161026001,
"step": 2265
},
{
"epoch": 0.4171260565968394,
"grad_norm": 0.92653979162294,
"learning_rate": 3.536440001585405e-05,
"loss": 0.7777,
"mean_token_accuracy": 0.7661702513694764,
"step": 2270
},
{
"epoch": 0.4180448364571849,
"grad_norm": 1.1097040235889317,
"learning_rate": 3.5300301732242894e-05,
"loss": 0.6985,
"mean_token_accuracy": 0.7891063332557678,
"step": 2275
},
{
"epoch": 0.4189636163175303,
"grad_norm": 1.008299555421031,
"learning_rate": 3.523613141459418e-05,
"loss": 0.7802,
"mean_token_accuracy": 0.7641416311264038,
"step": 2280
},
{
"epoch": 0.41988239617787576,
"grad_norm": 1.032289956768311,
"learning_rate": 3.5171889655506415e-05,
"loss": 0.8249,
"mean_token_accuracy": 0.7521484732627869,
"step": 2285
},
{
"epoch": 0.42080117603822126,
"grad_norm": 1.0467412920187205,
"learning_rate": 3.510757704823784e-05,
"loss": 0.858,
"mean_token_accuracy": 0.746031641960144,
"step": 2290
},
{
"epoch": 0.4217199558985667,
"grad_norm": 0.9244613024912548,
"learning_rate": 3.5043194186700936e-05,
"loss": 0.7074,
"mean_token_accuracy": 0.7865028500556945,
"step": 2295
},
{
"epoch": 0.42263873575891214,
"grad_norm": 1.2481817962042026,
"learning_rate": 3.4978741665457025e-05,
"loss": 0.8653,
"mean_token_accuracy": 0.7401462674140931,
"step": 2300
},
{
"epoch": 0.42355751561925764,
"grad_norm": 1.1106698371323385,
"learning_rate": 3.4914220079710666e-05,
"loss": 0.7935,
"mean_token_accuracy": 0.7622707843780517,
"step": 2305
},
{
"epoch": 0.4244762954796031,
"grad_norm": 1.0106898769557273,
"learning_rate": 3.484963002530425e-05,
"loss": 0.7434,
"mean_token_accuracy": 0.7763538002967835,
"step": 2310
},
{
"epoch": 0.4253950753399485,
"grad_norm": 0.8849864404343236,
"learning_rate": 3.478497209871245e-05,
"loss": 0.6992,
"mean_token_accuracy": 0.7923735499382019,
"step": 2315
},
{
"epoch": 0.426313855200294,
"grad_norm": 0.9563259804283883,
"learning_rate": 3.472024689703671e-05,
"loss": 0.7486,
"mean_token_accuracy": 0.7754692554473877,
"step": 2320
},
{
"epoch": 0.42723263506063947,
"grad_norm": 0.9606543568940551,
"learning_rate": 3.465545501799976e-05,
"loss": 0.7453,
"mean_token_accuracy": 0.7748713374137879,
"step": 2325
},
{
"epoch": 0.4281514149209849,
"grad_norm": 0.9591747976619429,
"learning_rate": 3.4590597059940075e-05,
"loss": 0.7557,
"mean_token_accuracy": 0.7716753005981445,
"step": 2330
},
{
"epoch": 0.4290701947813304,
"grad_norm": 1.0062222102856666,
"learning_rate": 3.4525673621806365e-05,
"loss": 0.7196,
"mean_token_accuracy": 0.7826886892318725,
"step": 2335
},
{
"epoch": 0.42998897464167585,
"grad_norm": 0.9978547883457327,
"learning_rate": 3.4460685303152014e-05,
"loss": 0.7528,
"mean_token_accuracy": 0.773649275302887,
"step": 2340
},
{
"epoch": 0.4309077545020213,
"grad_norm": 1.0567272451689054,
"learning_rate": 3.4395632704129565e-05,
"loss": 0.7358,
"mean_token_accuracy": 0.7783871531486511,
"step": 2345
},
{
"epoch": 0.4318265343623668,
"grad_norm": 0.9796518204189265,
"learning_rate": 3.43305164254852e-05,
"loss": 0.8094,
"mean_token_accuracy": 0.7556680321693421,
"step": 2350
},
{
"epoch": 0.43274531422271223,
"grad_norm": 1.2145226004304532,
"learning_rate": 3.426533706855314e-05,
"loss": 0.8687,
"mean_token_accuracy": 0.7412409901618957,
"step": 2355
},
{
"epoch": 0.43366409408305767,
"grad_norm": 1.07424243219785,
"learning_rate": 3.420009523525016e-05,
"loss": 0.798,
"mean_token_accuracy": 0.7611199259757996,
"step": 2360
},
{
"epoch": 0.43458287394340317,
"grad_norm": 0.9910321317767857,
"learning_rate": 3.4134791528069924e-05,
"loss": 0.7826,
"mean_token_accuracy": 0.7638975620269776,
"step": 2365
},
{
"epoch": 0.4355016538037486,
"grad_norm": 0.9247920341694345,
"learning_rate": 3.406942655007755e-05,
"loss": 0.8644,
"mean_token_accuracy": 0.7422878861427307,
"step": 2370
},
{
"epoch": 0.4364204336640941,
"grad_norm": 1.1061133237797016,
"learning_rate": 3.400400090490394e-05,
"loss": 0.7632,
"mean_token_accuracy": 0.7687703609466553,
"step": 2375
},
{
"epoch": 0.43733921352443955,
"grad_norm": 1.0542956872190579,
"learning_rate": 3.393851519674027e-05,
"loss": 0.7244,
"mean_token_accuracy": 0.7811267971992493,
"step": 2380
},
{
"epoch": 0.438257993384785,
"grad_norm": 0.9897649894527137,
"learning_rate": 3.387297003033237e-05,
"loss": 0.8368,
"mean_token_accuracy": 0.7458428382873535,
"step": 2385
},
{
"epoch": 0.4391767732451305,
"grad_norm": 0.9767768078602704,
"learning_rate": 3.380736601097514e-05,
"loss": 0.764,
"mean_token_accuracy": 0.7723331332206727,
"step": 2390
},
{
"epoch": 0.44009555310547593,
"grad_norm": 0.9595716112205526,
"learning_rate": 3.374170374450701e-05,
"loss": 0.7663,
"mean_token_accuracy": 0.7720773100852967,
"step": 2395
},
{
"epoch": 0.4410143329658214,
"grad_norm": 1.076074919473048,
"learning_rate": 3.367598383730429e-05,
"loss": 0.7088,
"mean_token_accuracy": 0.785472309589386,
"step": 2400
},
{
"epoch": 0.4419331128261669,
"grad_norm": 1.082210949676486,
"learning_rate": 3.361020689627556e-05,
"loss": 0.7326,
"mean_token_accuracy": 0.7807153582572937,
"step": 2405
},
{
"epoch": 0.4428518926865123,
"grad_norm": 0.9481216990417377,
"learning_rate": 3.354437352885616e-05,
"loss": 0.7801,
"mean_token_accuracy": 0.7678476572036743,
"step": 2410
},
{
"epoch": 0.44377067254685776,
"grad_norm": 0.9841501438162628,
"learning_rate": 3.347848434300244e-05,
"loss": 0.774,
"mean_token_accuracy": 0.7663671970367432,
"step": 2415
},
{
"epoch": 0.44468945240720326,
"grad_norm": 1.0107998180735964,
"learning_rate": 3.341253994718628e-05,
"loss": 0.7629,
"mean_token_accuracy": 0.7694483995437622,
"step": 2420
},
{
"epoch": 0.4456082322675487,
"grad_norm": 1.0972419453877216,
"learning_rate": 3.334654095038939e-05,
"loss": 0.8412,
"mean_token_accuracy": 0.7457273244857788,
"step": 2425
},
{
"epoch": 0.44652701212789414,
"grad_norm": 1.2432204140882914,
"learning_rate": 3.3280487962097696e-05,
"loss": 0.8691,
"mean_token_accuracy": 0.7431544780731201,
"step": 2430
},
{
"epoch": 0.44744579198823964,
"grad_norm": 0.9401271930787076,
"learning_rate": 3.3214381592295743e-05,
"loss": 0.7024,
"mean_token_accuracy": 0.784889030456543,
"step": 2435
},
{
"epoch": 0.4483645718485851,
"grad_norm": 1.041383480262174,
"learning_rate": 3.3148222451461035e-05,
"loss": 0.8058,
"mean_token_accuracy": 0.7577178955078125,
"step": 2440
},
{
"epoch": 0.4492833517089305,
"grad_norm": 1.0194759744273139,
"learning_rate": 3.308201115055841e-05,
"loss": 0.712,
"mean_token_accuracy": 0.7870323181152343,
"step": 2445
},
{
"epoch": 0.450202131569276,
"grad_norm": 1.4686250705410333,
"learning_rate": 3.301574830103437e-05,
"loss": 0.8155,
"mean_token_accuracy": 0.7529638648033142,
"step": 2450
},
{
"epoch": 0.45112091142962146,
"grad_norm": 1.031770370598432,
"learning_rate": 3.294943451481148e-05,
"loss": 0.7707,
"mean_token_accuracy": 0.7680568814277648,
"step": 2455
},
{
"epoch": 0.4520396912899669,
"grad_norm": 1.0322162169498617,
"learning_rate": 3.288307040428269e-05,
"loss": 0.7308,
"mean_token_accuracy": 0.7787389516830444,
"step": 2460
},
{
"epoch": 0.4529584711503124,
"grad_norm": 0.9740443305657719,
"learning_rate": 3.281665658230568e-05,
"loss": 0.7369,
"mean_token_accuracy": 0.7813670396804809,
"step": 2465
},
{
"epoch": 0.45387725101065785,
"grad_norm": 0.9656209530299383,
"learning_rate": 3.2750193662197196e-05,
"loss": 0.799,
"mean_token_accuracy": 0.7627607464790345,
"step": 2470
},
{
"epoch": 0.4547960308710033,
"grad_norm": 0.9955461958313095,
"learning_rate": 3.2683682257727424e-05,
"loss": 0.7435,
"mean_token_accuracy": 0.7717449307441712,
"step": 2475
},
{
"epoch": 0.4557148107313488,
"grad_norm": 1.0088438877085566,
"learning_rate": 3.261712298311425e-05,
"loss": 0.8432,
"mean_token_accuracy": 0.7495060801506043,
"step": 2480
},
{
"epoch": 0.4566335905916942,
"grad_norm": 0.9852421798418514,
"learning_rate": 3.255051645301766e-05,
"loss": 0.7598,
"mean_token_accuracy": 0.7723948240280152,
"step": 2485
},
{
"epoch": 0.45755237045203967,
"grad_norm": 1.1475584718237826,
"learning_rate": 3.2483863282534034e-05,
"loss": 0.7946,
"mean_token_accuracy": 0.7613343358039856,
"step": 2490
},
{
"epoch": 0.45847115031238517,
"grad_norm": 0.9748547683528608,
"learning_rate": 3.241716408719044e-05,
"loss": 0.7791,
"mean_token_accuracy": 0.7649445414543152,
"step": 2495
},
{
"epoch": 0.4593899301727306,
"grad_norm": 1.0462773904867664,
"learning_rate": 3.2350419482939006e-05,
"loss": 0.7762,
"mean_token_accuracy": 0.7663216352462768,
"step": 2500
},
{
"epoch": 0.46030871003307605,
"grad_norm": 1.0030710057983563,
"learning_rate": 3.228363008615117e-05,
"loss": 0.8001,
"mean_token_accuracy": 0.7575832843780518,
"step": 2505
},
{
"epoch": 0.46122748989342155,
"grad_norm": 0.9883286938382098,
"learning_rate": 3.2216796513612063e-05,
"loss": 0.7871,
"mean_token_accuracy": 0.7624288439750672,
"step": 2510
},
{
"epoch": 0.462146269753767,
"grad_norm": 1.0525039212029965,
"learning_rate": 3.214991938251472e-05,
"loss": 0.7558,
"mean_token_accuracy": 0.7720568418502808,
"step": 2515
},
{
"epoch": 0.46306504961411243,
"grad_norm": 1.0429696791102898,
"learning_rate": 3.208299931045446e-05,
"loss": 0.7642,
"mean_token_accuracy": 0.7731514692306518,
"step": 2520
},
{
"epoch": 0.46398382947445793,
"grad_norm": 1.0242365972013217,
"learning_rate": 3.2016036915423145e-05,
"loss": 0.7633,
"mean_token_accuracy": 0.7699605584144592,
"step": 2525
},
{
"epoch": 0.4649026093348034,
"grad_norm": 1.2324183314573516,
"learning_rate": 3.1949032815803475e-05,
"loss": 0.7682,
"mean_token_accuracy": 0.7663087368011474,
"step": 2530
},
{
"epoch": 0.4658213891951488,
"grad_norm": 0.9943168933618857,
"learning_rate": 3.188198763036329e-05,
"loss": 0.8362,
"mean_token_accuracy": 0.7509650230407715,
"step": 2535
},
{
"epoch": 0.4667401690554943,
"grad_norm": 0.9854698386880044,
"learning_rate": 3.181490197824985e-05,
"loss": 0.7956,
"mean_token_accuracy": 0.7612180948257447,
"step": 2540
},
{
"epoch": 0.46765894891583976,
"grad_norm": 0.960141519847968,
"learning_rate": 3.1747776478984096e-05,
"loss": 0.7204,
"mean_token_accuracy": 0.7808646440505982,
"step": 2545
},
{
"epoch": 0.4685777287761852,
"grad_norm": 1.1016288208270253,
"learning_rate": 3.168061175245497e-05,
"loss": 0.8181,
"mean_token_accuracy": 0.7522975325584411,
"step": 2550
},
{
"epoch": 0.4694965086365307,
"grad_norm": 1.0470917775420199,
"learning_rate": 3.1613408418913676e-05,
"loss": 0.7684,
"mean_token_accuracy": 0.7654074668884278,
"step": 2555
},
{
"epoch": 0.47041528849687614,
"grad_norm": 1.0068385295701907,
"learning_rate": 3.154616709896791e-05,
"loss": 0.8036,
"mean_token_accuracy": 0.7603312849998474,
"step": 2560
},
{
"epoch": 0.47133406835722164,
"grad_norm": 0.9829615470201221,
"learning_rate": 3.147888841357619e-05,
"loss": 0.813,
"mean_token_accuracy": 0.759647810459137,
"step": 2565
},
{
"epoch": 0.4722528482175671,
"grad_norm": 0.8978243887281022,
"learning_rate": 3.141157298404211e-05,
"loss": 0.7915,
"mean_token_accuracy": 0.7597061276435852,
"step": 2570
},
{
"epoch": 0.4731716280779125,
"grad_norm": 1.0947782890080606,
"learning_rate": 3.134422143200854e-05,
"loss": 0.8269,
"mean_token_accuracy": 0.7519834399223327,
"step": 2575
},
{
"epoch": 0.474090407938258,
"grad_norm": 0.9630113626806391,
"learning_rate": 3.127683437945199e-05,
"loss": 0.8306,
"mean_token_accuracy": 0.7524376153945923,
"step": 2580
},
{
"epoch": 0.47500918779860346,
"grad_norm": 0.912625232111843,
"learning_rate": 3.120941244867675e-05,
"loss": 0.7851,
"mean_token_accuracy": 0.7631929993629456,
"step": 2585
},
{
"epoch": 0.4759279676589489,
"grad_norm": 1.0798990597432274,
"learning_rate": 3.1141956262309265e-05,
"loss": 0.8272,
"mean_token_accuracy": 0.7549837350845336,
"step": 2590
},
{
"epoch": 0.4768467475192944,
"grad_norm": 0.9446129759073334,
"learning_rate": 3.1074466443292276e-05,
"loss": 0.7756,
"mean_token_accuracy": 0.7657612562179565,
"step": 2595
},
{
"epoch": 0.47776552737963984,
"grad_norm": 0.9259352218504912,
"learning_rate": 3.1006943614879127e-05,
"loss": 0.7342,
"mean_token_accuracy": 0.7782540440559387,
"step": 2600
},
{
"epoch": 0.4786843072399853,
"grad_norm": 1.0388874910734989,
"learning_rate": 3.0939388400628e-05,
"loss": 0.8209,
"mean_token_accuracy": 0.757353937625885,
"step": 2605
},
{
"epoch": 0.4796030871003308,
"grad_norm": 1.0351799688172598,
"learning_rate": 3.087180142439615e-05,
"loss": 0.7712,
"mean_token_accuracy": 0.7672750115394592,
"step": 2610
},
{
"epoch": 0.4805218669606762,
"grad_norm": 0.994898643969558,
"learning_rate": 3.080418331033416e-05,
"loss": 0.7542,
"mean_token_accuracy": 0.7735359907150269,
"step": 2615
},
{
"epoch": 0.48144064682102167,
"grad_norm": 0.8832020370330693,
"learning_rate": 3.073653468288014e-05,
"loss": 0.6924,
"mean_token_accuracy": 0.792470920085907,
"step": 2620
},
{
"epoch": 0.48235942668136716,
"grad_norm": 1.0051745659888849,
"learning_rate": 3.0668856166754014e-05,
"loss": 0.8004,
"mean_token_accuracy": 0.7600342750549316,
"step": 2625
},
{
"epoch": 0.4832782065417126,
"grad_norm": 1.0055252007007038,
"learning_rate": 3.060114838695168e-05,
"loss": 0.8243,
"mean_token_accuracy": 0.7516715884208679,
"step": 2630
},
{
"epoch": 0.48419698640205805,
"grad_norm": 1.0629225726483997,
"learning_rate": 3.0533411968739315e-05,
"loss": 0.7152,
"mean_token_accuracy": 0.7828492283821106,
"step": 2635
},
{
"epoch": 0.48511576626240355,
"grad_norm": 1.018552086343459,
"learning_rate": 3.0465647537647564e-05,
"loss": 0.7561,
"mean_token_accuracy": 0.7733739614486694,
"step": 2640
},
{
"epoch": 0.486034546122749,
"grad_norm": 1.110183845818711,
"learning_rate": 3.0397855719465736e-05,
"loss": 0.8057,
"mean_token_accuracy": 0.7568628549575805,
"step": 2645
},
{
"epoch": 0.48695332598309443,
"grad_norm": 0.9676140585791341,
"learning_rate": 3.0330037140236083e-05,
"loss": 0.795,
"mean_token_accuracy": 0.7640480756759643,
"step": 2650
},
{
"epoch": 0.48787210584343993,
"grad_norm": 0.9401353123276465,
"learning_rate": 3.026219242624797e-05,
"loss": 0.8139,
"mean_token_accuracy": 0.7546276330947876,
"step": 2655
},
{
"epoch": 0.48879088570378537,
"grad_norm": 1.0235062426914774,
"learning_rate": 3.019432220403212e-05,
"loss": 0.7659,
"mean_token_accuracy": 0.7717217683792115,
"step": 2660
},
{
"epoch": 0.4897096655641308,
"grad_norm": 1.1394523922744142,
"learning_rate": 3.012642710035484e-05,
"loss": 0.8078,
"mean_token_accuracy": 0.7566407918930054,
"step": 2665
},
{
"epoch": 0.4906284454244763,
"grad_norm": 1.059673146029454,
"learning_rate": 3.0058507742212162e-05,
"loss": 0.7741,
"mean_token_accuracy": 0.7690371632575989,
"step": 2670
},
{
"epoch": 0.49154722528482175,
"grad_norm": 0.9607137840570246,
"learning_rate": 2.999056475682414e-05,
"loss": 0.7948,
"mean_token_accuracy": 0.7632219791412354,
"step": 2675
},
{
"epoch": 0.4924660051451672,
"grad_norm": 0.9922439140299888,
"learning_rate": 2.9922598771629005e-05,
"loss": 0.7874,
"mean_token_accuracy": 0.7601206183433533,
"step": 2680
},
{
"epoch": 0.4933847850055127,
"grad_norm": 0.9338793477773762,
"learning_rate": 2.9854610414277402e-05,
"loss": 0.7693,
"mean_token_accuracy": 0.7690744280815125,
"step": 2685
},
{
"epoch": 0.49430356486585814,
"grad_norm": 1.0330996973903106,
"learning_rate": 2.9786600312626563e-05,
"loss": 0.7401,
"mean_token_accuracy": 0.7735617399215698,
"step": 2690
},
{
"epoch": 0.4952223447262036,
"grad_norm": 1.020650007316944,
"learning_rate": 2.9718569094734515e-05,
"loss": 0.8004,
"mean_token_accuracy": 0.7575301885604858,
"step": 2695
},
{
"epoch": 0.4961411245865491,
"grad_norm": 0.8826033529136437,
"learning_rate": 2.965051738885432e-05,
"loss": 0.7503,
"mean_token_accuracy": 0.773734736442566,
"step": 2700
},
{
"epoch": 0.4970599044468945,
"grad_norm": 1.2165920422760765,
"learning_rate": 2.958244582342822e-05,
"loss": 0.8248,
"mean_token_accuracy": 0.7535989284515381,
"step": 2705
},
{
"epoch": 0.49797868430723996,
"grad_norm": 0.978788492706274,
"learning_rate": 2.9514355027081846e-05,
"loss": 0.7831,
"mean_token_accuracy": 0.7655808568000794,
"step": 2710
},
{
"epoch": 0.49889746416758546,
"grad_norm": 0.9950576805013585,
"learning_rate": 2.944624562861845e-05,
"loss": 0.807,
"mean_token_accuracy": 0.7562234044075012,
"step": 2715
},
{
"epoch": 0.4998162440279309,
"grad_norm": 1.0612404822026047,
"learning_rate": 2.9378118257013054e-05,
"loss": 0.7904,
"mean_token_accuracy": 0.7584082007408142,
"step": 2720
},
{
"epoch": 0.5007350238882764,
"grad_norm": 0.915675944397533,
"learning_rate": 2.930997354140665e-05,
"loss": 0.7464,
"mean_token_accuracy": 0.7732234835624695,
"step": 2725
},
{
"epoch": 0.5016538037486218,
"grad_norm": 0.9232965003519343,
"learning_rate": 2.9241812111100414e-05,
"loss": 0.8088,
"mean_token_accuracy": 0.7525614619255065,
"step": 2730
},
{
"epoch": 0.5025725836089673,
"grad_norm": 1.52977648060804,
"learning_rate": 2.9173634595549876e-05,
"loss": 0.8046,
"mean_token_accuracy": 0.7584918379783631,
"step": 2735
},
{
"epoch": 0.5034913634693128,
"grad_norm": 0.9438156907635481,
"learning_rate": 2.910544162435909e-05,
"loss": 0.832,
"mean_token_accuracy": 0.751025402545929,
"step": 2740
},
{
"epoch": 0.5044101433296582,
"grad_norm": 0.9428923251443018,
"learning_rate": 2.9037233827274885e-05,
"loss": 0.8152,
"mean_token_accuracy": 0.7552414298057556,
"step": 2745
},
{
"epoch": 0.5053289231900037,
"grad_norm": 1.0224336975686246,
"learning_rate": 2.8969011834180937e-05,
"loss": 0.8284,
"mean_token_accuracy": 0.7509586930274963,
"step": 2750
},
{
"epoch": 0.5062477030503492,
"grad_norm": 0.985144401173651,
"learning_rate": 2.8900776275092083e-05,
"loss": 0.7888,
"mean_token_accuracy": 0.7600571990013123,
"step": 2755
},
{
"epoch": 0.5071664829106945,
"grad_norm": 0.9789390297665671,
"learning_rate": 2.8832527780148406e-05,
"loss": 0.7962,
"mean_token_accuracy": 0.7582376718521118,
"step": 2760
},
{
"epoch": 0.50808526277104,
"grad_norm": 1.0148219491244725,
"learning_rate": 2.8764266979609445e-05,
"loss": 0.8469,
"mean_token_accuracy": 0.7480961322784424,
"step": 2765
},
{
"epoch": 0.5090040426313855,
"grad_norm": 1.0037544910691434,
"learning_rate": 2.8695994503848395e-05,
"loss": 0.7421,
"mean_token_accuracy": 0.7739938139915467,
"step": 2770
},
{
"epoch": 0.5099228224917309,
"grad_norm": 0.9754806954239216,
"learning_rate": 2.8627710983346262e-05,
"loss": 0.7697,
"mean_token_accuracy": 0.7668745636940002,
"step": 2775
},
{
"epoch": 0.5108416023520764,
"grad_norm": 0.8806986584931391,
"learning_rate": 2.855941704868605e-05,
"loss": 0.7866,
"mean_token_accuracy": 0.7633078217506408,
"step": 2780
},
{
"epoch": 0.5117603822124219,
"grad_norm": 1.091789028436644,
"learning_rate": 2.8491113330546925e-05,
"loss": 0.8046,
"mean_token_accuracy": 0.7567707419395446,
"step": 2785
},
{
"epoch": 0.5126791620727673,
"grad_norm": 1.061632989714619,
"learning_rate": 2.8422800459698423e-05,
"loss": 0.7922,
"mean_token_accuracy": 0.7623422026634217,
"step": 2790
},
{
"epoch": 0.5135979419331128,
"grad_norm": 0.939885286311399,
"learning_rate": 2.835447906699457e-05,
"loss": 0.7691,
"mean_token_accuracy": 0.7693052887916565,
"step": 2795
},
{
"epoch": 0.5145167217934583,
"grad_norm": 0.9959159666772467,
"learning_rate": 2.8286149783368132e-05,
"loss": 0.7627,
"mean_token_accuracy": 0.7660305023193359,
"step": 2800
},
{
"epoch": 0.5154355016538037,
"grad_norm": 1.0033698323194213,
"learning_rate": 2.82178132398247e-05,
"loss": 0.7825,
"mean_token_accuracy": 0.7651003241539002,
"step": 2805
},
{
"epoch": 0.5163542815141492,
"grad_norm": 1.0942052865929373,
"learning_rate": 2.8149470067436945e-05,
"loss": 0.8091,
"mean_token_accuracy": 0.7585999965667725,
"step": 2810
},
{
"epoch": 0.5172730613744947,
"grad_norm": 1.0275503721326753,
"learning_rate": 2.8081120897338748e-05,
"loss": 0.7622,
"mean_token_accuracy": 0.7666819214820861,
"step": 2815
},
{
"epoch": 0.5181918412348401,
"grad_norm": 1.0162262995217026,
"learning_rate": 2.8012766360719346e-05,
"loss": 0.8351,
"mean_token_accuracy": 0.7463697791099548,
"step": 2820
},
{
"epoch": 0.5191106210951856,
"grad_norm": 0.8735638518849154,
"learning_rate": 2.794440708881758e-05,
"loss": 0.7526,
"mean_token_accuracy": 0.7740337014198303,
"step": 2825
},
{
"epoch": 0.5200294009555311,
"grad_norm": 0.8889407557447864,
"learning_rate": 2.787604371291599e-05,
"loss": 0.686,
"mean_token_accuracy": 0.7896162033081054,
"step": 2830
},
{
"epoch": 0.5209481808158766,
"grad_norm": 0.9682433350662344,
"learning_rate": 2.780767686433502e-05,
"loss": 0.7646,
"mean_token_accuracy": 0.7699775457382202,
"step": 2835
},
{
"epoch": 0.521866960676222,
"grad_norm": 1.0567812728267838,
"learning_rate": 2.7739307174427204e-05,
"loss": 0.769,
"mean_token_accuracy": 0.7676406979560852,
"step": 2840
},
{
"epoch": 0.5227857405365675,
"grad_norm": 0.8954213877251977,
"learning_rate": 2.767093527457128e-05,
"loss": 0.7958,
"mean_token_accuracy": 0.7604862689971924,
"step": 2845
},
{
"epoch": 0.523704520396913,
"grad_norm": 0.9391105206530606,
"learning_rate": 2.7602561796166426e-05,
"loss": 0.7794,
"mean_token_accuracy": 0.7647231101989747,
"step": 2850
},
{
"epoch": 0.5246233002572583,
"grad_norm": 0.952523275474733,
"learning_rate": 2.753418737062638e-05,
"loss": 0.7628,
"mean_token_accuracy": 0.7682720065116883,
"step": 2855
},
{
"epoch": 0.5255420801176038,
"grad_norm": 0.9201973347920627,
"learning_rate": 2.746581262937363e-05,
"loss": 0.7777,
"mean_token_accuracy": 0.7642363786697388,
"step": 2860
},
{
"epoch": 0.5264608599779493,
"grad_norm": 0.9335982392105177,
"learning_rate": 2.739743820383358e-05,
"loss": 0.7338,
"mean_token_accuracy": 0.7742905497550965,
"step": 2865
},
{
"epoch": 0.5273796398382947,
"grad_norm": 0.9846844485906041,
"learning_rate": 2.732906472542872e-05,
"loss": 0.7486,
"mean_token_accuracy": 0.7750791192054749,
"step": 2870
},
{
"epoch": 0.5282984196986402,
"grad_norm": 0.9865121950565456,
"learning_rate": 2.7260692825572808e-05,
"loss": 0.7584,
"mean_token_accuracy": 0.7667729616165161,
"step": 2875
},
{
"epoch": 0.5292171995589857,
"grad_norm": 1.0573279084656204,
"learning_rate": 2.7192323135664988e-05,
"loss": 0.8901,
"mean_token_accuracy": 0.7370211601257324,
"step": 2880
},
{
"epoch": 0.5301359794193311,
"grad_norm": 1.0198244813321502,
"learning_rate": 2.712395628708402e-05,
"loss": 0.7471,
"mean_token_accuracy": 0.771734893321991,
"step": 2885
},
{
"epoch": 0.5310547592796766,
"grad_norm": 1.0967316052811202,
"learning_rate": 2.7055592911182425e-05,
"loss": 0.7543,
"mean_token_accuracy": 0.7691154241561889,
"step": 2890
},
{
"epoch": 0.5319735391400221,
"grad_norm": 0.9658875640110149,
"learning_rate": 2.6987233639280656e-05,
"loss": 0.6979,
"mean_token_accuracy": 0.7853469371795654,
"step": 2895
},
{
"epoch": 0.5328923190003675,
"grad_norm": 1.041207894864275,
"learning_rate": 2.6918879102661264e-05,
"loss": 0.7403,
"mean_token_accuracy": 0.7763397812843322,
"step": 2900
},
{
"epoch": 0.533811098860713,
"grad_norm": 0.9959626369836271,
"learning_rate": 2.6850529932563057e-05,
"loss": 0.7526,
"mean_token_accuracy": 0.7707386016845703,
"step": 2905
},
{
"epoch": 0.5347298787210585,
"grad_norm": 0.994177922519465,
"learning_rate": 2.6782186760175303e-05,
"loss": 0.8229,
"mean_token_accuracy": 0.754144036769867,
"step": 2910
},
{
"epoch": 0.5356486585814039,
"grad_norm": 1.0745632939643772,
"learning_rate": 2.6713850216631876e-05,
"loss": 0.8191,
"mean_token_accuracy": 0.7573227047920227,
"step": 2915
},
{
"epoch": 0.5365674384417494,
"grad_norm": 1.0072152395633065,
"learning_rate": 2.6645520933005432e-05,
"loss": 0.7212,
"mean_token_accuracy": 0.7815118074417114,
"step": 2920
},
{
"epoch": 0.5374862183020949,
"grad_norm": 0.9856013490140734,
"learning_rate": 2.6577199540301583e-05,
"loss": 0.8058,
"mean_token_accuracy": 0.7568701386451722,
"step": 2925
},
{
"epoch": 0.5384049981624403,
"grad_norm": 1.0799881365103963,
"learning_rate": 2.6508886669453077e-05,
"loss": 0.722,
"mean_token_accuracy": 0.7820630311965943,
"step": 2930
},
{
"epoch": 0.5393237780227857,
"grad_norm": 0.9808614748561544,
"learning_rate": 2.6440582951313958e-05,
"loss": 0.7312,
"mean_token_accuracy": 0.7761293530464173,
"step": 2935
},
{
"epoch": 0.5402425578831312,
"grad_norm": 0.9988071044503206,
"learning_rate": 2.6372289016653747e-05,
"loss": 0.8052,
"mean_token_accuracy": 0.755142867565155,
"step": 2940
},
{
"epoch": 0.5411613377434766,
"grad_norm": 0.9640983566459411,
"learning_rate": 2.6304005496151607e-05,
"loss": 0.7501,
"mean_token_accuracy": 0.7724974870681762,
"step": 2945
},
{
"epoch": 0.5420801176038221,
"grad_norm": 0.9753132303800915,
"learning_rate": 2.6235733020390557e-05,
"loss": 0.7507,
"mean_token_accuracy": 0.7731342792510987,
"step": 2950
},
{
"epoch": 0.5429988974641676,
"grad_norm": 0.9941788148526961,
"learning_rate": 2.6167472219851606e-05,
"loss": 0.7777,
"mean_token_accuracy": 0.7659435391426086,
"step": 2955
},
{
"epoch": 0.543917677324513,
"grad_norm": 1.0342888870006444,
"learning_rate": 2.6099223724907922e-05,
"loss": 0.7954,
"mean_token_accuracy": 0.7611855626106262,
"step": 2960
},
{
"epoch": 0.5448364571848585,
"grad_norm": 0.9947482977334893,
"learning_rate": 2.603098816581907e-05,
"loss": 0.79,
"mean_token_accuracy": 0.7604100823402404,
"step": 2965
},
{
"epoch": 0.545755237045204,
"grad_norm": 0.9369443584148152,
"learning_rate": 2.5962766172725127e-05,
"loss": 0.7501,
"mean_token_accuracy": 0.7741901755332947,
"step": 2970
},
{
"epoch": 0.5466740169055494,
"grad_norm": 1.018205972168573,
"learning_rate": 2.589455837564091e-05,
"loss": 0.8146,
"mean_token_accuracy": 0.7549449682235718,
"step": 2975
},
{
"epoch": 0.5475927967658949,
"grad_norm": 0.9840855304963227,
"learning_rate": 2.5826365404450136e-05,
"loss": 0.6928,
"mean_token_accuracy": 0.7882686018943786,
"step": 2980
},
{
"epoch": 0.5485115766262404,
"grad_norm": 0.9765299396635874,
"learning_rate": 2.57581878888996e-05,
"loss": 0.7904,
"mean_token_accuracy": 0.7627172827720642,
"step": 2985
},
{
"epoch": 0.5494303564865858,
"grad_norm": 0.9611858171978122,
"learning_rate": 2.5690026458593362e-05,
"loss": 0.7849,
"mean_token_accuracy": 0.7626663684844971,
"step": 2990
},
{
"epoch": 0.5503491363469313,
"grad_norm": 0.9918441521186859,
"learning_rate": 2.562188174298695e-05,
"loss": 0.7139,
"mean_token_accuracy": 0.7834156632423401,
"step": 2995
},
{
"epoch": 0.5512679162072768,
"grad_norm": 0.9168184741389104,
"learning_rate": 2.5553754371381555e-05,
"loss": 0.7595,
"mean_token_accuracy": 0.7676758289337158,
"step": 3000
},
{
"epoch": 0.5521866960676222,
"grad_norm": 0.9307353989568666,
"learning_rate": 2.5485644972918153e-05,
"loss": 0.7309,
"mean_token_accuracy": 0.7800590991973877,
"step": 3005
},
{
"epoch": 0.5531054759279677,
"grad_norm": 0.9683740817546714,
"learning_rate": 2.541755417657179e-05,
"loss": 0.7913,
"mean_token_accuracy": 0.7614364624023438,
"step": 3010
},
{
"epoch": 0.5540242557883132,
"grad_norm": 0.9961298115995415,
"learning_rate": 2.5349482611145685e-05,
"loss": 0.8041,
"mean_token_accuracy": 0.7568534970283508,
"step": 3015
},
{
"epoch": 0.5549430356486585,
"grad_norm": 0.981662259480835,
"learning_rate": 2.528143090526549e-05,
"loss": 0.6952,
"mean_token_accuracy": 0.7897186994552612,
"step": 3020
},
{
"epoch": 0.555861815509004,
"grad_norm": 0.9140182959744487,
"learning_rate": 2.5213399687373446e-05,
"loss": 0.6967,
"mean_token_accuracy": 0.7841851711273193,
"step": 3025
},
{
"epoch": 0.5567805953693495,
"grad_norm": 1.05668077131703,
"learning_rate": 2.51453895857226e-05,
"loss": 0.751,
"mean_token_accuracy": 0.7735855102539062,
"step": 3030
},
{
"epoch": 0.5576993752296949,
"grad_norm": 0.9377501197010149,
"learning_rate": 2.5077401228371007e-05,
"loss": 0.7319,
"mean_token_accuracy": 0.7791807889938355,
"step": 3035
},
{
"epoch": 0.5586181550900404,
"grad_norm": 1.116838452205624,
"learning_rate": 2.5009435243175865e-05,
"loss": 0.8436,
"mean_token_accuracy": 0.7444709777832031,
"step": 3040
},
{
"epoch": 0.5595369349503859,
"grad_norm": 1.133786669142971,
"learning_rate": 2.4941492257787847e-05,
"loss": 0.7451,
"mean_token_accuracy": 0.7729416728019715,
"step": 3045
},
{
"epoch": 0.5604557148107313,
"grad_norm": 1.0531439168923706,
"learning_rate": 2.4873572899645164e-05,
"loss": 0.7914,
"mean_token_accuracy": 0.7595977902412414,
"step": 3050
},
{
"epoch": 0.5613744946710768,
"grad_norm": 0.9370063066983946,
"learning_rate": 2.4805677795967874e-05,
"loss": 0.7787,
"mean_token_accuracy": 0.762716269493103,
"step": 3055
},
{
"epoch": 0.5622932745314223,
"grad_norm": 0.994949145579561,
"learning_rate": 2.4737807573752036e-05,
"loss": 0.7431,
"mean_token_accuracy": 0.7747965931892395,
"step": 3060
},
{
"epoch": 0.5632120543917677,
"grad_norm": 1.0671188110858503,
"learning_rate": 2.466996285976393e-05,
"loss": 0.7917,
"mean_token_accuracy": 0.7583362698554993,
"step": 3065
},
{
"epoch": 0.5641308342521132,
"grad_norm": 0.892710934926214,
"learning_rate": 2.4602144280534273e-05,
"loss": 0.7498,
"mean_token_accuracy": 0.7732946038246155,
"step": 3070
},
{
"epoch": 0.5650496141124587,
"grad_norm": 0.99980677185357,
"learning_rate": 2.4534352462352445e-05,
"loss": 0.8074,
"mean_token_accuracy": 0.7578684329986572,
"step": 3075
},
{
"epoch": 0.5659683939728041,
"grad_norm": 0.9159293875905319,
"learning_rate": 2.4466588031260684e-05,
"loss": 0.7809,
"mean_token_accuracy": 0.7624441385269165,
"step": 3080
},
{
"epoch": 0.5668871738331496,
"grad_norm": 1.0362770618116839,
"learning_rate": 2.4398851613048322e-05,
"loss": 0.797,
"mean_token_accuracy": 0.760871410369873,
"step": 3085
},
{
"epoch": 0.5678059536934951,
"grad_norm": 1.0847148009284608,
"learning_rate": 2.4331143833245994e-05,
"loss": 0.7395,
"mean_token_accuracy": 0.7746615648269654,
"step": 3090
},
{
"epoch": 0.5687247335538405,
"grad_norm": 1.0671537398957074,
"learning_rate": 2.426346531711986e-05,
"loss": 0.774,
"mean_token_accuracy": 0.7641933798789978,
"step": 3095
},
{
"epoch": 0.569643513414186,
"grad_norm": 1.0063509161122495,
"learning_rate": 2.4195816689665847e-05,
"loss": 0.7038,
"mean_token_accuracy": 0.7864096641540528,
"step": 3100
},
{
"epoch": 0.5705622932745315,
"grad_norm": 0.9712630519098367,
"learning_rate": 2.4128198575603857e-05,
"loss": 0.7839,
"mean_token_accuracy": 0.7611940979957581,
"step": 3105
},
{
"epoch": 0.5714810731348768,
"grad_norm": 0.9158850127968227,
"learning_rate": 2.4060611599372007e-05,
"loss": 0.775,
"mean_token_accuracy": 0.7655367732048035,
"step": 3110
},
{
"epoch": 0.5723998529952223,
"grad_norm": 0.9829867717200517,
"learning_rate": 2.399305638512089e-05,
"loss": 0.8531,
"mean_token_accuracy": 0.742165744304657,
"step": 3115
},
{
"epoch": 0.5733186328555678,
"grad_norm": 0.9676209759663041,
"learning_rate": 2.3925533556707736e-05,
"loss": 0.7669,
"mean_token_accuracy": 0.7683526515960694,
"step": 3120
},
{
"epoch": 0.5742374127159132,
"grad_norm": 0.9456863854288068,
"learning_rate": 2.385804373769074e-05,
"loss": 0.736,
"mean_token_accuracy": 0.7773837327957154,
"step": 3125
},
{
"epoch": 0.5751561925762587,
"grad_norm": 1.042769286037687,
"learning_rate": 2.3790587551323252e-05,
"loss": 0.7869,
"mean_token_accuracy": 0.761770761013031,
"step": 3130
},
{
"epoch": 0.5760749724366042,
"grad_norm": 0.898131943412606,
"learning_rate": 2.372316562054802e-05,
"loss": 0.7311,
"mean_token_accuracy": 0.780720841884613,
"step": 3135
},
{
"epoch": 0.5769937522969496,
"grad_norm": 0.937736313205156,
"learning_rate": 2.3655778567991456e-05,
"loss": 0.8486,
"mean_token_accuracy": 0.7416357159614563,
"step": 3140
},
{
"epoch": 0.5779125321572951,
"grad_norm": 0.9983900168625015,
"learning_rate": 2.3588427015957904e-05,
"loss": 0.7432,
"mean_token_accuracy": 0.7713735103607178,
"step": 3145
},
{
"epoch": 0.5788313120176406,
"grad_norm": 1.0896357887586694,
"learning_rate": 2.352111158642381e-05,
"loss": 0.7843,
"mean_token_accuracy": 0.76038818359375,
"step": 3150
},
{
"epoch": 0.579750091877986,
"grad_norm": 0.9547336671541522,
"learning_rate": 2.3453832901032097e-05,
"loss": 0.7723,
"mean_token_accuracy": 0.7679526925086975,
"step": 3155
},
{
"epoch": 0.5806688717383315,
"grad_norm": 0.9486005629151948,
"learning_rate": 2.3386591581086333e-05,
"loss": 0.6867,
"mean_token_accuracy": 0.7872913122177124,
"step": 3160
},
{
"epoch": 0.581587651598677,
"grad_norm": 1.4360094460321793,
"learning_rate": 2.3319388247545026e-05,
"loss": 0.6946,
"mean_token_accuracy": 0.7893529891967773,
"step": 3165
},
{
"epoch": 0.5825064314590224,
"grad_norm": 0.9675717631201467,
"learning_rate": 2.325222352101591e-05,
"loss": 0.794,
"mean_token_accuracy": 0.7627562046051025,
"step": 3170
},
{
"epoch": 0.5834252113193679,
"grad_norm": 0.8611284135924058,
"learning_rate": 2.3185098021750163e-05,
"loss": 0.7647,
"mean_token_accuracy": 0.7697438478469849,
"step": 3175
},
{
"epoch": 0.5843439911797134,
"grad_norm": 1.0945648293831518,
"learning_rate": 2.3118012369636715e-05,
"loss": 0.7374,
"mean_token_accuracy": 0.7741273403167724,
"step": 3180
},
{
"epoch": 0.5852627710400587,
"grad_norm": 0.9850152813442956,
"learning_rate": 2.3050967184196526e-05,
"loss": 0.7387,
"mean_token_accuracy": 0.7777738809585572,
"step": 3185
},
{
"epoch": 0.5861815509004042,
"grad_norm": 0.8639589995274697,
"learning_rate": 2.2983963084576854e-05,
"loss": 0.77,
"mean_token_accuracy": 0.7680123209953308,
"step": 3190
},
{
"epoch": 0.5871003307607497,
"grad_norm": 0.969485320702538,
"learning_rate": 2.2917000689545535e-05,
"loss": 0.8023,
"mean_token_accuracy": 0.759474766254425,
"step": 3195
},
{
"epoch": 0.5880191106210951,
"grad_norm": 0.9691992055808628,
"learning_rate": 2.2850080617485286e-05,
"loss": 0.7576,
"mean_token_accuracy": 0.7699379682540893,
"step": 3200
},
{
"epoch": 0.5889378904814406,
"grad_norm": 0.9709025550626744,
"learning_rate": 2.2783203486387945e-05,
"loss": 0.764,
"mean_token_accuracy": 0.7677761912345886,
"step": 3205
},
{
"epoch": 0.5898566703417861,
"grad_norm": 0.9395191069096172,
"learning_rate": 2.2716369913848827e-05,
"loss": 0.7572,
"mean_token_accuracy": 0.7745106220245361,
"step": 3210
},
{
"epoch": 0.5907754502021316,
"grad_norm": 0.9777159547594203,
"learning_rate": 2.2649580517061003e-05,
"loss": 0.7136,
"mean_token_accuracy": 0.7840847253799439,
"step": 3215
},
{
"epoch": 0.591694230062477,
"grad_norm": 0.9390454307687789,
"learning_rate": 2.2582835912809564e-05,
"loss": 0.7614,
"mean_token_accuracy": 0.7697038054466248,
"step": 3220
},
{
"epoch": 0.5926130099228225,
"grad_norm": 1.0367038259917516,
"learning_rate": 2.251613671746598e-05,
"loss": 0.7796,
"mean_token_accuracy": 0.7627864122390747,
"step": 3225
},
{
"epoch": 0.593531789783168,
"grad_norm": 0.9251948352297976,
"learning_rate": 2.2449483546982347e-05,
"loss": 0.6893,
"mean_token_accuracy": 0.7888349413871765,
"step": 3230
},
{
"epoch": 0.5944505696435134,
"grad_norm": 1.05108523432423,
"learning_rate": 2.2382877016885757e-05,
"loss": 0.7052,
"mean_token_accuracy": 0.7855964303016663,
"step": 3235
},
{
"epoch": 0.5953693495038589,
"grad_norm": 0.9994956513098704,
"learning_rate": 2.2316317742272585e-05,
"loss": 0.7682,
"mean_token_accuracy": 0.7651132106781006,
"step": 3240
},
{
"epoch": 0.5962881293642044,
"grad_norm": 0.9539355388832639,
"learning_rate": 2.224980633780281e-05,
"loss": 0.7181,
"mean_token_accuracy": 0.7789011836051941,
"step": 3245
},
{
"epoch": 0.5972069092245498,
"grad_norm": 0.9721681364733832,
"learning_rate": 2.2183343417694334e-05,
"loss": 0.7583,
"mean_token_accuracy": 0.7710484743118287,
"step": 3250
},
{
"epoch": 0.5981256890848953,
"grad_norm": 1.0132996609635718,
"learning_rate": 2.2116929595717317e-05,
"loss": 0.7719,
"mean_token_accuracy": 0.765598726272583,
"step": 3255
},
{
"epoch": 0.5990444689452408,
"grad_norm": 0.9659020670904003,
"learning_rate": 2.205056548518853e-05,
"loss": 0.7958,
"mean_token_accuracy": 0.7573135375976563,
"step": 3260
},
{
"epoch": 0.5999632488055862,
"grad_norm": 1.0145461160760352,
"learning_rate": 2.1984251698965637e-05,
"loss": 0.7506,
"mean_token_accuracy": 0.7711923003196717,
"step": 3265
},
{
"epoch": 0.6008820286659317,
"grad_norm": 1.0804834048147398,
"learning_rate": 2.1917988849441594e-05,
"loss": 0.8049,
"mean_token_accuracy": 0.755113685131073,
"step": 3270
},
{
"epoch": 0.6018008085262772,
"grad_norm": 0.9733796804471042,
"learning_rate": 2.185177754853896e-05,
"loss": 0.6773,
"mean_token_accuracy": 0.7920406103134155,
"step": 3275
},
{
"epoch": 0.6027195883866225,
"grad_norm": 0.990871804097787,
"learning_rate": 2.1785618407704255e-05,
"loss": 0.7619,
"mean_token_accuracy": 0.7680476665496826,
"step": 3280
},
{
"epoch": 0.603638368246968,
"grad_norm": 0.9094240503163677,
"learning_rate": 2.1719512037902306e-05,
"loss": 0.758,
"mean_token_accuracy": 0.7682316303253174,
"step": 3285
},
{
"epoch": 0.6045571481073135,
"grad_norm": 0.9504426996357046,
"learning_rate": 2.1653459049610618e-05,
"loss": 0.7037,
"mean_token_accuracy": 0.7844570279121399,
"step": 3290
},
{
"epoch": 0.6054759279676589,
"grad_norm": 1.0419237413786735,
"learning_rate": 2.1587460052813724e-05,
"loss": 0.7797,
"mean_token_accuracy": 0.7651678204536438,
"step": 3295
},
{
"epoch": 0.6063947078280044,
"grad_norm": 1.0189296741711382,
"learning_rate": 2.1521515656997567e-05,
"loss": 0.8125,
"mean_token_accuracy": 0.7538291454315186,
"step": 3300
},
{
"epoch": 0.6073134876883499,
"grad_norm": 0.9647782169864347,
"learning_rate": 2.145562647114386e-05,
"loss": 0.7115,
"mean_token_accuracy": 0.7819002747535706,
"step": 3305
},
{
"epoch": 0.6082322675486953,
"grad_norm": 0.8719676861547915,
"learning_rate": 2.1389793103724443e-05,
"loss": 0.7175,
"mean_token_accuracy": 0.7793567061424256,
"step": 3310
},
{
"epoch": 0.6091510474090408,
"grad_norm": 0.9761609575734019,
"learning_rate": 2.1324016162695722e-05,
"loss": 0.6784,
"mean_token_accuracy": 0.7919653534889222,
"step": 3315
},
{
"epoch": 0.6100698272693863,
"grad_norm": 0.8974448563579739,
"learning_rate": 2.125829625549299e-05,
"loss": 0.6786,
"mean_token_accuracy": 0.7931641936302185,
"step": 3320
},
{
"epoch": 0.6109886071297317,
"grad_norm": 1.0099374622071293,
"learning_rate": 2.1192633989024856e-05,
"loss": 0.8367,
"mean_token_accuracy": 0.7453663229942322,
"step": 3325
},
{
"epoch": 0.6119073869900772,
"grad_norm": 1.0422892359273228,
"learning_rate": 2.112702996966764e-05,
"loss": 0.7187,
"mean_token_accuracy": 0.7798493385314942,
"step": 3330
},
{
"epoch": 0.6128261668504227,
"grad_norm": 1.1083604247420085,
"learning_rate": 2.106148480325974e-05,
"loss": 0.7806,
"mean_token_accuracy": 0.761151397228241,
"step": 3335
},
{
"epoch": 0.6137449467107681,
"grad_norm": 1.0270421311335494,
"learning_rate": 2.0995999095096068e-05,
"loss": 0.7843,
"mean_token_accuracy": 0.7627219676971435,
"step": 3340
},
{
"epoch": 0.6146637265711136,
"grad_norm": 1.215757454497741,
"learning_rate": 2.0930573449922457e-05,
"loss": 0.7597,
"mean_token_accuracy": 0.769752562046051,
"step": 3345
},
{
"epoch": 0.6155825064314591,
"grad_norm": 1.2153983619499056,
"learning_rate": 2.086520847193008e-05,
"loss": 0.7792,
"mean_token_accuracy": 0.7656338334083557,
"step": 3350
},
{
"epoch": 0.6165012862918045,
"grad_norm": 0.952171476221175,
"learning_rate": 2.079990476474985e-05,
"loss": 0.683,
"mean_token_accuracy": 0.7914249539375305,
"step": 3355
},
{
"epoch": 0.61742006615215,
"grad_norm": 1.037769469357623,
"learning_rate": 2.0734662931446858e-05,
"loss": 0.7692,
"mean_token_accuracy": 0.7642071366310119,
"step": 3360
},
{
"epoch": 0.6183388460124954,
"grad_norm": 0.9750207162668445,
"learning_rate": 2.0669483574514807e-05,
"loss": 0.8355,
"mean_token_accuracy": 0.7468725085258484,
"step": 3365
},
{
"epoch": 0.6192576258728408,
"grad_norm": 1.0187302049829796,
"learning_rate": 2.060436729587044e-05,
"loss": 0.7502,
"mean_token_accuracy": 0.7742531776428223,
"step": 3370
},
{
"epoch": 0.6201764057331863,
"grad_norm": 0.943777765061105,
"learning_rate": 2.0539314696848e-05,
"loss": 0.7062,
"mean_token_accuracy": 0.7855054616928101,
"step": 3375
},
{
"epoch": 0.6210951855935318,
"grad_norm": 0.8930105332009788,
"learning_rate": 2.0474326378193637e-05,
"loss": 0.7458,
"mean_token_accuracy": 0.773654580116272,
"step": 3380
},
{
"epoch": 0.6220139654538772,
"grad_norm": 0.9035160403431316,
"learning_rate": 2.0409402940059937e-05,
"loss": 0.7268,
"mean_token_accuracy": 0.7792444586753845,
"step": 3385
},
{
"epoch": 0.6229327453142227,
"grad_norm": 1.0410393906012252,
"learning_rate": 2.0344544982000246e-05,
"loss": 0.7038,
"mean_token_accuracy": 0.7828059315681457,
"step": 3390
},
{
"epoch": 0.6238515251745682,
"grad_norm": 0.9123527907550557,
"learning_rate": 2.0279753102963296e-05,
"loss": 0.667,
"mean_token_accuracy": 0.7945937156677246,
"step": 3395
},
{
"epoch": 0.6247703050349136,
"grad_norm": 1.0453020521936442,
"learning_rate": 2.0215027901287555e-05,
"loss": 0.7062,
"mean_token_accuracy": 0.7823508858680726,
"step": 3400
},
{
"epoch": 0.6256890848952591,
"grad_norm": 0.9075834943890148,
"learning_rate": 2.0150369974695755e-05,
"loss": 0.7027,
"mean_token_accuracy": 0.7846097946166992,
"step": 3405
},
{
"epoch": 0.6266078647556046,
"grad_norm": 0.9405566568561052,
"learning_rate": 2.008577992028934e-05,
"loss": 0.7387,
"mean_token_accuracy": 0.7755053520202637,
"step": 3410
},
{
"epoch": 0.62752664461595,
"grad_norm": 0.9311470578940665,
"learning_rate": 2.0021258334542987e-05,
"loss": 0.7867,
"mean_token_accuracy": 0.7582219243049622,
"step": 3415
},
{
"epoch": 0.6284454244762955,
"grad_norm": 0.9465824966191277,
"learning_rate": 1.9956805813299066e-05,
"loss": 0.7295,
"mean_token_accuracy": 0.7787384033203125,
"step": 3420
},
{
"epoch": 0.629364204336641,
"grad_norm": 0.9314873192239379,
"learning_rate": 1.9892422951762167e-05,
"loss": 0.7732,
"mean_token_accuracy": 0.7635803461074829,
"step": 3425
},
{
"epoch": 0.6302829841969864,
"grad_norm": 1.0303249115412232,
"learning_rate": 1.9828110344493583e-05,
"loss": 0.8374,
"mean_token_accuracy": 0.7502556920051575,
"step": 3430
},
{
"epoch": 0.6312017640573319,
"grad_norm": 0.8566968324816928,
"learning_rate": 1.9763868585405813e-05,
"loss": 0.6606,
"mean_token_accuracy": 0.7972531080245971,
"step": 3435
},
{
"epoch": 0.6321205439176774,
"grad_norm": 1.0376272306600982,
"learning_rate": 1.9699698267757115e-05,
"loss": 0.6992,
"mean_token_accuracy": 0.784684681892395,
"step": 3440
},
{
"epoch": 0.6330393237780227,
"grad_norm": 1.0897400517305982,
"learning_rate": 1.9635599984145965e-05,
"loss": 0.8341,
"mean_token_accuracy": 0.7503707766532898,
"step": 3445
},
{
"epoch": 0.6339581036383682,
"grad_norm": 0.9399776090183068,
"learning_rate": 1.9571574326505648e-05,
"loss": 0.7555,
"mean_token_accuracy": 0.7727354645729065,
"step": 3450
},
{
"epoch": 0.6348768834987137,
"grad_norm": 1.422297410503556,
"learning_rate": 1.950762188609876e-05,
"loss": 0.7891,
"mean_token_accuracy": 0.761411714553833,
"step": 3455
},
{
"epoch": 0.6357956633590591,
"grad_norm": 0.9985847497683605,
"learning_rate": 1.9443743253511736e-05,
"loss": 0.773,
"mean_token_accuracy": 0.7664777278900147,
"step": 3460
},
{
"epoch": 0.6367144432194046,
"grad_norm": 0.8928670622147518,
"learning_rate": 1.9379939018649447e-05,
"loss": 0.6888,
"mean_token_accuracy": 0.7935372710227966,
"step": 3465
},
{
"epoch": 0.6376332230797501,
"grad_norm": 0.8818099232101203,
"learning_rate": 1.9316209770729686e-05,
"loss": 0.6876,
"mean_token_accuracy": 0.7872507929801941,
"step": 3470
},
{
"epoch": 0.6385520029400955,
"grad_norm": 1.0220008034223411,
"learning_rate": 1.9252556098277762e-05,
"loss": 0.7423,
"mean_token_accuracy": 0.7761957883834839,
"step": 3475
},
{
"epoch": 0.639470782800441,
"grad_norm": 1.00646363617204,
"learning_rate": 1.9188978589121076e-05,
"loss": 0.7799,
"mean_token_accuracy": 0.763306987285614,
"step": 3480
},
{
"epoch": 0.6403895626607865,
"grad_norm": 1.0853274497927512,
"learning_rate": 1.9125477830383663e-05,
"loss": 0.7638,
"mean_token_accuracy": 0.7669495463371276,
"step": 3485
},
{
"epoch": 0.6413083425211319,
"grad_norm": 1.065184143072551,
"learning_rate": 1.9062054408480804e-05,
"loss": 0.7743,
"mean_token_accuracy": 0.7621343255043029,
"step": 3490
},
{
"epoch": 0.6422271223814774,
"grad_norm": 0.9447419338666605,
"learning_rate": 1.899870890911357e-05,
"loss": 0.6677,
"mean_token_accuracy": 0.7952073097229004,
"step": 3495
},
{
"epoch": 0.6431459022418229,
"grad_norm": 1.0005267248842091,
"learning_rate": 1.8935441917263448e-05,
"loss": 0.6775,
"mean_token_accuracy": 0.7901732444763183,
"step": 3500
},
{
"epoch": 0.6440646821021683,
"grad_norm": 1.0892429541179653,
"learning_rate": 1.8872254017186915e-05,
"loss": 0.7837,
"mean_token_accuracy": 0.7588755011558532,
"step": 3505
},
{
"epoch": 0.6449834619625138,
"grad_norm": 1.0127229878185544,
"learning_rate": 1.880914579241007e-05,
"loss": 0.7123,
"mean_token_accuracy": 0.7827209591865539,
"step": 3510
},
{
"epoch": 0.6459022418228593,
"grad_norm": 1.0281554271788436,
"learning_rate": 1.8746117825723214e-05,
"loss": 0.6835,
"mean_token_accuracy": 0.7939071655273438,
"step": 3515
},
{
"epoch": 0.6468210216832047,
"grad_norm": 0.8913736086248946,
"learning_rate": 1.86831706991755e-05,
"loss": 0.7223,
"mean_token_accuracy": 0.7790691494941712,
"step": 3520
},
{
"epoch": 0.6477398015435502,
"grad_norm": 1.0046759104597491,
"learning_rate": 1.8620304994069508e-05,
"loss": 0.7165,
"mean_token_accuracy": 0.7822145223617554,
"step": 3525
},
{
"epoch": 0.6486585814038957,
"grad_norm": 0.8761990362360018,
"learning_rate": 1.8557521290955943e-05,
"loss": 0.6898,
"mean_token_accuracy": 0.7909232258796692,
"step": 3530
},
{
"epoch": 0.649577361264241,
"grad_norm": 1.0010762980226218,
"learning_rate": 1.849482016962822e-05,
"loss": 0.7426,
"mean_token_accuracy": 0.773716127872467,
"step": 3535
},
{
"epoch": 0.6504961411245865,
"grad_norm": 0.8997455734948419,
"learning_rate": 1.8432202209117132e-05,
"loss": 0.7354,
"mean_token_accuracy": 0.7769456744194031,
"step": 3540
},
{
"epoch": 0.651414920984932,
"grad_norm": 1.0530586816733762,
"learning_rate": 1.8369667987685517e-05,
"loss": 0.7285,
"mean_token_accuracy": 0.7756595969200134,
"step": 3545
},
{
"epoch": 0.6523337008452774,
"grad_norm": 0.9090366247053898,
"learning_rate": 1.830721808282289e-05,
"loss": 0.7539,
"mean_token_accuracy": 0.7681886911392212,
"step": 3550
},
{
"epoch": 0.6532524807056229,
"grad_norm": 0.8963988021562999,
"learning_rate": 1.8244853071240103e-05,
"loss": 0.7189,
"mean_token_accuracy": 0.7818469524383544,
"step": 3555
},
{
"epoch": 0.6541712605659684,
"grad_norm": 1.0901215733279344,
"learning_rate": 1.8182573528864066e-05,
"loss": 0.8269,
"mean_token_accuracy": 0.7504664659500122,
"step": 3560
},
{
"epoch": 0.6550900404263138,
"grad_norm": 1.1645208492459995,
"learning_rate": 1.812038003083239e-05,
"loss": 0.7093,
"mean_token_accuracy": 0.7835473537445068,
"step": 3565
},
{
"epoch": 0.6560088202866593,
"grad_norm": 1.0256562034267807,
"learning_rate": 1.805827315148808e-05,
"loss": 0.8014,
"mean_token_accuracy": 0.7579211831092835,
"step": 3570
},
{
"epoch": 0.6569276001470048,
"grad_norm": 0.9529727803585198,
"learning_rate": 1.799625346437424e-05,
"loss": 0.7738,
"mean_token_accuracy": 0.7688822269439697,
"step": 3575
},
{
"epoch": 0.6578463800073502,
"grad_norm": 0.94739155264938,
"learning_rate": 1.793432154222878e-05,
"loss": 0.7292,
"mean_token_accuracy": 0.7785593032836914,
"step": 3580
},
{
"epoch": 0.6587651598676957,
"grad_norm": 0.9662895425381987,
"learning_rate": 1.7872477956979117e-05,
"loss": 0.7436,
"mean_token_accuracy": 0.7758478641510009,
"step": 3585
},
{
"epoch": 0.6596839397280412,
"grad_norm": 0.9840960134871575,
"learning_rate": 1.7810723279736885e-05,
"loss": 0.7916,
"mean_token_accuracy": 0.7603202104568482,
"step": 3590
},
{
"epoch": 0.6606027195883867,
"grad_norm": 1.0261319427030933,
"learning_rate": 1.774905808079269e-05,
"loss": 0.6979,
"mean_token_accuracy": 0.7864163637161254,
"step": 3595
},
{
"epoch": 0.6615214994487321,
"grad_norm": 1.0500468697129208,
"learning_rate": 1.768748292961082e-05,
"loss": 0.8148,
"mean_token_accuracy": 0.7488227248191833,
"step": 3600
},
{
"epoch": 0.6624402793090776,
"grad_norm": 1.0731369628716187,
"learning_rate": 1.7625998394823983e-05,
"loss": 0.8241,
"mean_token_accuracy": 0.7512738227844238,
"step": 3605
},
{
"epoch": 0.6633590591694231,
"grad_norm": 1.034876172959453,
"learning_rate": 1.756460504422807e-05,
"loss": 0.7318,
"mean_token_accuracy": 0.7753666043281555,
"step": 3610
},
{
"epoch": 0.6642778390297684,
"grad_norm": 0.9683609087211331,
"learning_rate": 1.750330344477692e-05,
"loss": 0.7759,
"mean_token_accuracy": 0.7623879432678222,
"step": 3615
},
{
"epoch": 0.6651966188901139,
"grad_norm": 1.0746172926951512,
"learning_rate": 1.7442094162577048e-05,
"loss": 0.7414,
"mean_token_accuracy": 0.7732792139053345,
"step": 3620
},
{
"epoch": 0.6661153987504594,
"grad_norm": 1.0348081377114133,
"learning_rate": 1.7380977762882462e-05,
"loss": 0.7379,
"mean_token_accuracy": 0.7739031314849854,
"step": 3625
},
{
"epoch": 0.6670341786108048,
"grad_norm": 1.0461877004048412,
"learning_rate": 1.731995481008941e-05,
"loss": 0.7448,
"mean_token_accuracy": 0.773258650302887,
"step": 3630
},
{
"epoch": 0.6679529584711503,
"grad_norm": 0.9323745094099202,
"learning_rate": 1.725902586773116e-05,
"loss": 0.6793,
"mean_token_accuracy": 0.7933961987495423,
"step": 3635
},
{
"epoch": 0.6688717383314958,
"grad_norm": 1.046949059494339,
"learning_rate": 1.7198191498472838e-05,
"loss": 0.7922,
"mean_token_accuracy": 0.7601482748985291,
"step": 3640
},
{
"epoch": 0.6697905181918412,
"grad_norm": 1.022387930805979,
"learning_rate": 1.7137452264106223e-05,
"loss": 0.7352,
"mean_token_accuracy": 0.7750853300094604,
"step": 3645
},
{
"epoch": 0.6707092980521867,
"grad_norm": 1.0168638470278177,
"learning_rate": 1.7076808725544513e-05,
"loss": 0.7946,
"mean_token_accuracy": 0.76027911901474,
"step": 3650
},
{
"epoch": 0.6716280779125322,
"grad_norm": 1.011273043579098,
"learning_rate": 1.7016261442817195e-05,
"loss": 0.7686,
"mean_token_accuracy": 0.7633870005607605,
"step": 3655
},
{
"epoch": 0.6725468577728776,
"grad_norm": 1.0527976338992284,
"learning_rate": 1.6955810975064852e-05,
"loss": 0.744,
"mean_token_accuracy": 0.7737329721450805,
"step": 3660
},
{
"epoch": 0.6734656376332231,
"grad_norm": 0.9597608034824768,
"learning_rate": 1.689545788053398e-05,
"loss": 0.7701,
"mean_token_accuracy": 0.7696826219558716,
"step": 3665
},
{
"epoch": 0.6743844174935686,
"grad_norm": 1.0258518237885876,
"learning_rate": 1.6835202716571896e-05,
"loss": 0.7254,
"mean_token_accuracy": 0.7749346971511841,
"step": 3670
},
{
"epoch": 0.675303197353914,
"grad_norm": 0.9578329259933241,
"learning_rate": 1.677504603962151e-05,
"loss": 0.7372,
"mean_token_accuracy": 0.7727353811264038,
"step": 3675
},
{
"epoch": 0.6762219772142595,
"grad_norm": 0.9425151659951094,
"learning_rate": 1.6714988405216268e-05,
"loss": 0.7622,
"mean_token_accuracy": 0.768218743801117,
"step": 3680
},
{
"epoch": 0.677140757074605,
"grad_norm": 0.9402489651093421,
"learning_rate": 1.6655030367974956e-05,
"loss": 0.7042,
"mean_token_accuracy": 0.7838626861572265,
"step": 3685
},
{
"epoch": 0.6780595369349504,
"grad_norm": 0.9434658856603072,
"learning_rate": 1.659517248159658e-05,
"loss": 0.6985,
"mean_token_accuracy": 0.7856457352638244,
"step": 3690
},
{
"epoch": 0.6789783167952959,
"grad_norm": 0.9784597438802801,
"learning_rate": 1.6535415298855327e-05,
"loss": 0.724,
"mean_token_accuracy": 0.7787894964218139,
"step": 3695
},
{
"epoch": 0.6798970966556414,
"grad_norm": 0.9287408780062713,
"learning_rate": 1.6475759371595363e-05,
"loss": 0.7246,
"mean_token_accuracy": 0.7800618410110474,
"step": 3700
},
{
"epoch": 0.6808158765159867,
"grad_norm": 0.8854707560115899,
"learning_rate": 1.6416205250725805e-05,
"loss": 0.7302,
"mean_token_accuracy": 0.7747718214988708,
"step": 3705
},
{
"epoch": 0.6817346563763322,
"grad_norm": 1.0559953134942033,
"learning_rate": 1.635675348621561e-05,
"loss": 0.7812,
"mean_token_accuracy": 0.7618914604187011,
"step": 3710
},
{
"epoch": 0.6826534362366777,
"grad_norm": 0.904527688485687,
"learning_rate": 1.6297404627088495e-05,
"loss": 0.6821,
"mean_token_accuracy": 0.7847250699996948,
"step": 3715
},
{
"epoch": 0.6835722160970231,
"grad_norm": 0.9469214300582695,
"learning_rate": 1.623815922141786e-05,
"loss": 0.7542,
"mean_token_accuracy": 0.7689258933067322,
"step": 3720
},
{
"epoch": 0.6844909959573686,
"grad_norm": 0.967860721114202,
"learning_rate": 1.6179017816321747e-05,
"loss": 0.7363,
"mean_token_accuracy": 0.7743378639221191,
"step": 3725
},
{
"epoch": 0.6854097758177141,
"grad_norm": 0.8886741465453643,
"learning_rate": 1.6119980957957777e-05,
"loss": 0.6988,
"mean_token_accuracy": 0.7837384343147278,
"step": 3730
},
{
"epoch": 0.6863285556780595,
"grad_norm": 0.8776280447144813,
"learning_rate": 1.6061049191518085e-05,
"loss": 0.7209,
"mean_token_accuracy": 0.7783106327056885,
"step": 3735
},
{
"epoch": 0.687247335538405,
"grad_norm": 0.9158307911784594,
"learning_rate": 1.6002223061224335e-05,
"loss": 0.7088,
"mean_token_accuracy": 0.781765878200531,
"step": 3740
},
{
"epoch": 0.6881661153987505,
"grad_norm": 1.162396078380293,
"learning_rate": 1.5943503110322645e-05,
"loss": 0.7807,
"mean_token_accuracy": 0.7625959992408753,
"step": 3745
},
{
"epoch": 0.6890848952590959,
"grad_norm": 1.0152287109252447,
"learning_rate": 1.5884889881078597e-05,
"loss": 0.7434,
"mean_token_accuracy": 0.7718896269798279,
"step": 3750
},
{
"epoch": 0.6900036751194414,
"grad_norm": 1.008310002000136,
"learning_rate": 1.5826383914772224e-05,
"loss": 0.7251,
"mean_token_accuracy": 0.7803327202796936,
"step": 3755
},
{
"epoch": 0.6909224549797869,
"grad_norm": 0.9966365535572344,
"learning_rate": 1.5767985751693e-05,
"loss": 0.7973,
"mean_token_accuracy": 0.755574083328247,
"step": 3760
},
{
"epoch": 0.6918412348401323,
"grad_norm": 0.9091331211868702,
"learning_rate": 1.5709695931134865e-05,
"loss": 0.6733,
"mean_token_accuracy": 0.7941539287567139,
"step": 3765
},
{
"epoch": 0.6927600147004778,
"grad_norm": 0.9104102247083076,
"learning_rate": 1.5651514991391257e-05,
"loss": 0.776,
"mean_token_accuracy": 0.7669570446014404,
"step": 3770
},
{
"epoch": 0.6936787945608233,
"grad_norm": 1.0293997774105645,
"learning_rate": 1.5593443469750096e-05,
"loss": 0.8177,
"mean_token_accuracy": 0.7502638220787048,
"step": 3775
},
{
"epoch": 0.6945975744211687,
"grad_norm": 0.929448240683312,
"learning_rate": 1.5535481902488867e-05,
"loss": 0.7637,
"mean_token_accuracy": 0.7701873660087586,
"step": 3780
},
{
"epoch": 0.6955163542815141,
"grad_norm": 0.9731391197018507,
"learning_rate": 1.5477630824869654e-05,
"loss": 0.7091,
"mean_token_accuracy": 0.7808983325958252,
"step": 3785
},
{
"epoch": 0.6964351341418596,
"grad_norm": 1.0202913846698398,
"learning_rate": 1.541989077113418e-05,
"loss": 0.7465,
"mean_token_accuracy": 0.7717735052108765,
"step": 3790
},
{
"epoch": 0.697353914002205,
"grad_norm": 0.9301401711992584,
"learning_rate": 1.5362262274498905e-05,
"loss": 0.6822,
"mean_token_accuracy": 0.7897647023200989,
"step": 3795
},
{
"epoch": 0.6982726938625505,
"grad_norm": 1.0207577033975543,
"learning_rate": 1.5304745867150057e-05,
"loss": 0.7438,
"mean_token_accuracy": 0.774781858921051,
"step": 3800
},
{
"epoch": 0.699191473722896,
"grad_norm": 1.0524548953066566,
"learning_rate": 1.524734208023878e-05,
"loss": 0.7102,
"mean_token_accuracy": 0.781788682937622,
"step": 3805
},
{
"epoch": 0.7001102535832414,
"grad_norm": 1.5303502399912878,
"learning_rate": 1.5190051443876164e-05,
"loss": 0.75,
"mean_token_accuracy": 0.7729594349861145,
"step": 3810
},
{
"epoch": 0.7010290334435869,
"grad_norm": 0.9420746725481757,
"learning_rate": 1.5132874487128395e-05,
"loss": 0.7092,
"mean_token_accuracy": 0.7798316001892089,
"step": 3815
},
{
"epoch": 0.7019478133039324,
"grad_norm": 0.9519856865931159,
"learning_rate": 1.5075811738011856e-05,
"loss": 0.7228,
"mean_token_accuracy": 0.7796306014060974,
"step": 3820
},
{
"epoch": 0.7028665931642778,
"grad_norm": 0.9860104895952649,
"learning_rate": 1.5018863723488225e-05,
"loss": 0.7966,
"mean_token_accuracy": 0.7599681258201599,
"step": 3825
},
{
"epoch": 0.7037853730246233,
"grad_norm": 1.0271424947345908,
"learning_rate": 1.4962030969459653e-05,
"loss": 0.7635,
"mean_token_accuracy": 0.7661967992782592,
"step": 3830
},
{
"epoch": 0.7047041528849688,
"grad_norm": 0.9528580532465843,
"learning_rate": 1.4905314000763879e-05,
"loss": 0.8305,
"mean_token_accuracy": 0.748454475402832,
"step": 3835
},
{
"epoch": 0.7056229327453142,
"grad_norm": 1.127501991853972,
"learning_rate": 1.48487133411694e-05,
"loss": 0.7435,
"mean_token_accuracy": 0.7706256151199341,
"step": 3840
},
{
"epoch": 0.7065417126056597,
"grad_norm": 0.9715164350420321,
"learning_rate": 1.4792229513370623e-05,
"loss": 0.7749,
"mean_token_accuracy": 0.7648235201835633,
"step": 3845
},
{
"epoch": 0.7074604924660052,
"grad_norm": 1.0410599346511435,
"learning_rate": 1.4735863038983017e-05,
"loss": 0.7929,
"mean_token_accuracy": 0.7620292901992798,
"step": 3850
},
{
"epoch": 0.7083792723263506,
"grad_norm": 1.0230288642653715,
"learning_rate": 1.4679614438538336e-05,
"loss": 0.7096,
"mean_token_accuracy": 0.7822004795074463,
"step": 3855
},
{
"epoch": 0.7092980521866961,
"grad_norm": 0.9688977031604671,
"learning_rate": 1.4623484231479797e-05,
"loss": 0.7349,
"mean_token_accuracy": 0.7777714133262634,
"step": 3860
},
{
"epoch": 0.7102168320470416,
"grad_norm": 0.9626561356173854,
"learning_rate": 1.4567472936157272e-05,
"loss": 0.7146,
"mean_token_accuracy": 0.781309711933136,
"step": 3865
},
{
"epoch": 0.7111356119073869,
"grad_norm": 0.9708419426566777,
"learning_rate": 1.451158106982253e-05,
"loss": 0.7092,
"mean_token_accuracy": 0.783543837070465,
"step": 3870
},
{
"epoch": 0.7120543917677324,
"grad_norm": 1.109902456844337,
"learning_rate": 1.4455809148624427e-05,
"loss": 0.6661,
"mean_token_accuracy": 0.7925106644630432,
"step": 3875
},
{
"epoch": 0.7129731716280779,
"grad_norm": 1.0531361212213257,
"learning_rate": 1.4400157687604127e-05,
"loss": 0.7478,
"mean_token_accuracy": 0.7699988007545471,
"step": 3880
},
{
"epoch": 0.7138919514884233,
"grad_norm": 0.9181223816529849,
"learning_rate": 1.4344627200690408e-05,
"loss": 0.7828,
"mean_token_accuracy": 0.7599815845489502,
"step": 3885
},
{
"epoch": 0.7148107313487688,
"grad_norm": 0.960236605434846,
"learning_rate": 1.4289218200694863e-05,
"loss": 0.6859,
"mean_token_accuracy": 0.7898363471031189,
"step": 3890
},
{
"epoch": 0.7157295112091143,
"grad_norm": 1.0122066887422,
"learning_rate": 1.4233931199307182e-05,
"loss": 0.7232,
"mean_token_accuracy": 0.7770133495330811,
"step": 3895
},
{
"epoch": 0.7166482910694597,
"grad_norm": 1.040565449358011,
"learning_rate": 1.4178766707090435e-05,
"loss": 0.6839,
"mean_token_accuracy": 0.7898031234741211,
"step": 3900
},
{
"epoch": 0.7175670709298052,
"grad_norm": 1.0510693995270706,
"learning_rate": 1.4123725233476331e-05,
"loss": 0.7013,
"mean_token_accuracy": 0.7850608229637146,
"step": 3905
},
{
"epoch": 0.7184858507901507,
"grad_norm": 0.9406082797289776,
"learning_rate": 1.406880728676054e-05,
"loss": 0.694,
"mean_token_accuracy": 0.7835015416145324,
"step": 3910
},
{
"epoch": 0.7194046306504961,
"grad_norm": 0.86803820647997,
"learning_rate": 1.401401337409799e-05,
"loss": 0.7519,
"mean_token_accuracy": 0.7705330729484559,
"step": 3915
},
{
"epoch": 0.7203234105108416,
"grad_norm": 1.004330967776519,
"learning_rate": 1.3959344001498173e-05,
"loss": 0.7427,
"mean_token_accuracy": 0.775149667263031,
"step": 3920
},
{
"epoch": 0.7212421903711871,
"grad_norm": 0.9559190227857477,
"learning_rate": 1.390479967382049e-05,
"loss": 0.791,
"mean_token_accuracy": 0.7609505772590637,
"step": 3925
},
{
"epoch": 0.7221609702315325,
"grad_norm": 1.028049651883388,
"learning_rate": 1.3850380894769577e-05,
"loss": 0.7556,
"mean_token_accuracy": 0.76885005235672,
"step": 3930
},
{
"epoch": 0.723079750091878,
"grad_norm": 0.9303174472709201,
"learning_rate": 1.3796088166890658e-05,
"loss": 0.7354,
"mean_token_accuracy": 0.7731772422790527,
"step": 3935
},
{
"epoch": 0.7239985299522235,
"grad_norm": 0.9187982243033715,
"learning_rate": 1.3741921991564902e-05,
"loss": 0.7279,
"mean_token_accuracy": 0.7771438717842102,
"step": 3940
},
{
"epoch": 0.7249173098125689,
"grad_norm": 1.0153656088945144,
"learning_rate": 1.3687882869004793e-05,
"loss": 0.7822,
"mean_token_accuracy": 0.7594830989837646,
"step": 3945
},
{
"epoch": 0.7258360896729144,
"grad_norm": 0.9068523793270754,
"learning_rate": 1.3633971298249509e-05,
"loss": 0.726,
"mean_token_accuracy": 0.7766485810279846,
"step": 3950
},
{
"epoch": 0.7267548695332599,
"grad_norm": 0.9192709791074424,
"learning_rate": 1.358018777716033e-05,
"loss": 0.6736,
"mean_token_accuracy": 0.7924223423004151,
"step": 3955
},
{
"epoch": 0.7276736493936052,
"grad_norm": 0.9362834673989403,
"learning_rate": 1.3526532802415986e-05,
"loss": 0.7237,
"mean_token_accuracy": 0.7815822243690491,
"step": 3960
},
{
"epoch": 0.7285924292539507,
"grad_norm": 0.9618182073527164,
"learning_rate": 1.347300686950817e-05,
"loss": 0.7136,
"mean_token_accuracy": 0.7804886937141419,
"step": 3965
},
{
"epoch": 0.7295112091142962,
"grad_norm": 1.0033704504611825,
"learning_rate": 1.3419610472736854e-05,
"loss": 0.7774,
"mean_token_accuracy": 0.7617066621780395,
"step": 3970
},
{
"epoch": 0.7304299889746417,
"grad_norm": 0.9824057566253805,
"learning_rate": 1.3366344105205795e-05,
"loss": 0.7415,
"mean_token_accuracy": 0.7728252649307251,
"step": 3975
},
{
"epoch": 0.7313487688349871,
"grad_norm": 0.9259208825526174,
"learning_rate": 1.3313208258817961e-05,
"loss": 0.668,
"mean_token_accuracy": 0.7945244908332825,
"step": 3980
},
{
"epoch": 0.7322675486953326,
"grad_norm": 1.0678551747273641,
"learning_rate": 1.3260203424270962e-05,
"loss": 0.6779,
"mean_token_accuracy": 0.7914282798767089,
"step": 3985
},
{
"epoch": 0.7331863285556781,
"grad_norm": 0.9074661173815383,
"learning_rate": 1.3207330091052564e-05,
"loss": 0.7319,
"mean_token_accuracy": 0.7765037894248963,
"step": 3990
},
{
"epoch": 0.7341051084160235,
"grad_norm": 0.9568097933924034,
"learning_rate": 1.3154588747436159e-05,
"loss": 0.7078,
"mean_token_accuracy": 0.7828231930732727,
"step": 3995
},
{
"epoch": 0.735023888276369,
"grad_norm": 0.9371717410955112,
"learning_rate": 1.310197988047622e-05,
"loss": 0.6858,
"mean_token_accuracy": 0.7882918357849121,
"step": 4000
},
{
"epoch": 0.7359426681367145,
"grad_norm": 0.9954273414248298,
"learning_rate": 1.3049503976003838e-05,
"loss": 0.7514,
"mean_token_accuracy": 0.7692143678665161,
"step": 4005
},
{
"epoch": 0.7368614479970599,
"grad_norm": 0.9856979788439847,
"learning_rate": 1.2997161518622236e-05,
"loss": 0.7208,
"mean_token_accuracy": 0.7764803051948548,
"step": 4010
},
{
"epoch": 0.7377802278574054,
"grad_norm": 0.9346284220975282,
"learning_rate": 1.2944952991702252e-05,
"loss": 0.6963,
"mean_token_accuracy": 0.7852182865142823,
"step": 4015
},
{
"epoch": 0.7386990077177509,
"grad_norm": 1.013720256514248,
"learning_rate": 1.289287887737794e-05,
"loss": 0.7001,
"mean_token_accuracy": 0.7830116629600525,
"step": 4020
},
{
"epoch": 0.7396177875780963,
"grad_norm": 0.9031619991653583,
"learning_rate": 1.2840939656542055e-05,
"loss": 0.6997,
"mean_token_accuracy": 0.7874221801757812,
"step": 4025
},
{
"epoch": 0.7405365674384418,
"grad_norm": 0.9825845455381703,
"learning_rate": 1.2789135808841677e-05,
"loss": 0.6957,
"mean_token_accuracy": 0.7857596635818481,
"step": 4030
},
{
"epoch": 0.7414553472987873,
"grad_norm": 0.9061423884854145,
"learning_rate": 1.2737467812673723e-05,
"loss": 0.7169,
"mean_token_accuracy": 0.781167495250702,
"step": 4035
},
{
"epoch": 0.7423741271591326,
"grad_norm": 0.9638102626531402,
"learning_rate": 1.2685936145180532e-05,
"loss": 0.69,
"mean_token_accuracy": 0.7890314221382141,
"step": 4040
},
{
"epoch": 0.7432929070194781,
"grad_norm": 1.0053384054052024,
"learning_rate": 1.2634541282245516e-05,
"loss": 0.807,
"mean_token_accuracy": 0.7533567190170288,
"step": 4045
},
{
"epoch": 0.7442116868798236,
"grad_norm": 1.04109340250319,
"learning_rate": 1.2583283698488704e-05,
"loss": 0.7067,
"mean_token_accuracy": 0.7812132358551025,
"step": 4050
},
{
"epoch": 0.745130466740169,
"grad_norm": 1.0112381563115767,
"learning_rate": 1.2532163867262392e-05,
"loss": 0.7399,
"mean_token_accuracy": 0.7726234674453736,
"step": 4055
},
{
"epoch": 0.7460492466005145,
"grad_norm": 0.8721284775041804,
"learning_rate": 1.2481182260646752e-05,
"loss": 0.7306,
"mean_token_accuracy": 0.7757495403289795,
"step": 4060
},
{
"epoch": 0.74696802646086,
"grad_norm": 1.0082015116923577,
"learning_rate": 1.2430339349445513e-05,
"loss": 0.7431,
"mean_token_accuracy": 0.7711400389671326,
"step": 4065
},
{
"epoch": 0.7478868063212054,
"grad_norm": 0.9592384870972069,
"learning_rate": 1.2379635603181537e-05,
"loss": 0.7367,
"mean_token_accuracy": 0.7738732933998108,
"step": 4070
},
{
"epoch": 0.7488055861815509,
"grad_norm": 1.0584048963162198,
"learning_rate": 1.2329071490092558e-05,
"loss": 0.768,
"mean_token_accuracy": 0.7642792701721192,
"step": 4075
},
{
"epoch": 0.7497243660418964,
"grad_norm": 1.006126544893952,
"learning_rate": 1.2278647477126825e-05,
"loss": 0.7155,
"mean_token_accuracy": 0.7793737649917603,
"step": 4080
},
{
"epoch": 0.7506431459022418,
"grad_norm": 0.9059136663503262,
"learning_rate": 1.2228364029938794e-05,
"loss": 0.6934,
"mean_token_accuracy": 0.7861241817474365,
"step": 4085
},
{
"epoch": 0.7515619257625873,
"grad_norm": 0.9164252616053726,
"learning_rate": 1.2178221612884821e-05,
"loss": 0.6858,
"mean_token_accuracy": 0.7915996551513672,
"step": 4090
},
{
"epoch": 0.7524807056229328,
"grad_norm": 0.9705885896913202,
"learning_rate": 1.212822068901889e-05,
"loss": 0.7124,
"mean_token_accuracy": 0.7814687609672546,
"step": 4095
},
{
"epoch": 0.7533994854832782,
"grad_norm": 0.9020603765566574,
"learning_rate": 1.2078361720088317e-05,
"loss": 0.6295,
"mean_token_accuracy": 0.8092963337898255,
"step": 4100
},
{
"epoch": 0.7543182653436237,
"grad_norm": 1.0248400255161636,
"learning_rate": 1.2028645166529502e-05,
"loss": 0.6836,
"mean_token_accuracy": 0.7892769694328308,
"step": 4105
},
{
"epoch": 0.7552370452039692,
"grad_norm": 0.9961571096740167,
"learning_rate": 1.1979071487463676e-05,
"loss": 0.7571,
"mean_token_accuracy": 0.7688749432563782,
"step": 4110
},
{
"epoch": 0.7561558250643146,
"grad_norm": 0.944702960313809,
"learning_rate": 1.1929641140692642e-05,
"loss": 0.7449,
"mean_token_accuracy": 0.7719345092773438,
"step": 4115
},
{
"epoch": 0.75707460492466,
"grad_norm": 0.901399230174195,
"learning_rate": 1.1880354582694574e-05,
"loss": 0.7247,
"mean_token_accuracy": 0.7755969166755676,
"step": 4120
},
{
"epoch": 0.7579933847850056,
"grad_norm": 0.8741911991495609,
"learning_rate": 1.183121226861978e-05,
"loss": 0.6733,
"mean_token_accuracy": 0.791340708732605,
"step": 4125
},
{
"epoch": 0.7589121646453509,
"grad_norm": 0.9630743810579115,
"learning_rate": 1.1782214652286517e-05,
"loss": 0.7611,
"mean_token_accuracy": 0.7667882919311524,
"step": 4130
},
{
"epoch": 0.7598309445056964,
"grad_norm": 0.9810984916908249,
"learning_rate": 1.1733362186176783e-05,
"loss": 0.7248,
"mean_token_accuracy": 0.7780536890029908,
"step": 4135
},
{
"epoch": 0.7607497243660419,
"grad_norm": 1.0335671142572738,
"learning_rate": 1.1684655321432151e-05,
"loss": 0.8171,
"mean_token_accuracy": 0.7540881991386413,
"step": 4140
},
{
"epoch": 0.7616685042263873,
"grad_norm": 1.131108893177244,
"learning_rate": 1.1636094507849602e-05,
"loss": 0.8238,
"mean_token_accuracy": 0.7476210117340087,
"step": 4145
},
{
"epoch": 0.7625872840867328,
"grad_norm": 0.936028514421637,
"learning_rate": 1.1587680193877339e-05,
"loss": 0.7193,
"mean_token_accuracy": 0.778421950340271,
"step": 4150
},
{
"epoch": 0.7635060639470783,
"grad_norm": 1.0740607162155091,
"learning_rate": 1.153941282661072e-05,
"loss": 0.7396,
"mean_token_accuracy": 0.7715651631355286,
"step": 4155
},
{
"epoch": 0.7644248438074237,
"grad_norm": 1.0319017337525411,
"learning_rate": 1.149129285178805e-05,
"loss": 0.7647,
"mean_token_accuracy": 0.7646437525749207,
"step": 4160
},
{
"epoch": 0.7653436236677692,
"grad_norm": 1.0657076517097614,
"learning_rate": 1.1443320713786512e-05,
"loss": 0.761,
"mean_token_accuracy": 0.7698405861854554,
"step": 4165
},
{
"epoch": 0.7662624035281147,
"grad_norm": 1.1933980574421776,
"learning_rate": 1.1395496855618047e-05,
"loss": 0.6857,
"mean_token_accuracy": 0.786463487148285,
"step": 4170
},
{
"epoch": 0.7671811833884601,
"grad_norm": 1.0399268658004284,
"learning_rate": 1.1347821718925246e-05,
"loss": 0.6951,
"mean_token_accuracy": 0.7830422759056092,
"step": 4175
},
{
"epoch": 0.7680999632488056,
"grad_norm": 0.9523591554987267,
"learning_rate": 1.1300295743977319e-05,
"loss": 0.7314,
"mean_token_accuracy": 0.7771936178207397,
"step": 4180
},
{
"epoch": 0.7690187431091511,
"grad_norm": 0.9704227649157859,
"learning_rate": 1.1252919369665982e-05,
"loss": 0.6644,
"mean_token_accuracy": 0.7947867512702942,
"step": 4185
},
{
"epoch": 0.7699375229694965,
"grad_norm": 1.1427236915156969,
"learning_rate": 1.1205693033501438e-05,
"loss": 0.8105,
"mean_token_accuracy": 0.7546621441841126,
"step": 4190
},
{
"epoch": 0.770856302829842,
"grad_norm": 0.9971989693482147,
"learning_rate": 1.115861717160831e-05,
"loss": 0.7324,
"mean_token_accuracy": 0.7745411634445191,
"step": 4195
},
{
"epoch": 0.7717750826901875,
"grad_norm": 0.9708619660373234,
"learning_rate": 1.1111692218721634e-05,
"loss": 0.7248,
"mean_token_accuracy": 0.7753921389579773,
"step": 4200
},
{
"epoch": 0.7726938625505329,
"grad_norm": 1.0024258510538886,
"learning_rate": 1.1064918608182811e-05,
"loss": 0.7042,
"mean_token_accuracy": 0.7805647253990173,
"step": 4205
},
{
"epoch": 0.7736126424108783,
"grad_norm": 1.043536654965297,
"learning_rate": 1.1018296771935662e-05,
"loss": 0.7479,
"mean_token_accuracy": 0.7747788667678833,
"step": 4210
},
{
"epoch": 0.7745314222712238,
"grad_norm": 0.9600156217205023,
"learning_rate": 1.097182714052238e-05,
"loss": 0.7103,
"mean_token_accuracy": 0.7833921790122986,
"step": 4215
},
{
"epoch": 0.7754502021315692,
"grad_norm": 1.0045159954634846,
"learning_rate": 1.0925510143079597e-05,
"loss": 0.7374,
"mean_token_accuracy": 0.7714961647987366,
"step": 4220
},
{
"epoch": 0.7763689819919147,
"grad_norm": 0.9309385715533749,
"learning_rate": 1.0879346207334413e-05,
"loss": 0.7726,
"mean_token_accuracy": 0.7604559183120727,
"step": 4225
},
{
"epoch": 0.7772877618522602,
"grad_norm": 1.0081696884584601,
"learning_rate": 1.0833335759600405e-05,
"loss": 0.7722,
"mean_token_accuracy": 0.7622892618179321,
"step": 4230
},
{
"epoch": 0.7782065417126056,
"grad_norm": 1.0452970764251144,
"learning_rate": 1.0787479224773747e-05,
"loss": 0.828,
"mean_token_accuracy": 0.7479719400405884,
"step": 4235
},
{
"epoch": 0.7791253215729511,
"grad_norm": 1.0247302466447017,
"learning_rate": 1.0741777026329258e-05,
"loss": 0.7903,
"mean_token_accuracy": 0.7618830919265747,
"step": 4240
},
{
"epoch": 0.7800441014332966,
"grad_norm": 0.9573944252741409,
"learning_rate": 1.0696229586316494e-05,
"loss": 0.7805,
"mean_token_accuracy": 0.7581877827644348,
"step": 4245
},
{
"epoch": 0.780962881293642,
"grad_norm": 0.9549288805564692,
"learning_rate": 1.065083732535585e-05,
"loss": 0.7232,
"mean_token_accuracy": 0.7758707642555237,
"step": 4250
},
{
"epoch": 0.7818816611539875,
"grad_norm": 1.0135283642977615,
"learning_rate": 1.060560066263468e-05,
"loss": 0.6985,
"mean_token_accuracy": 0.7865156173706055,
"step": 4255
},
{
"epoch": 0.782800441014333,
"grad_norm": 0.9768410936733116,
"learning_rate": 1.0560520015903421e-05,
"loss": 0.6995,
"mean_token_accuracy": 0.7879634141921997,
"step": 4260
},
{
"epoch": 0.7837192208746784,
"grad_norm": 1.0406851160802406,
"learning_rate": 1.0515595801471734e-05,
"loss": 0.7099,
"mean_token_accuracy": 0.7844684720039368,
"step": 4265
},
{
"epoch": 0.7846380007350239,
"grad_norm": 1.1640626929289857,
"learning_rate": 1.0470828434204672e-05,
"loss": 0.7507,
"mean_token_accuracy": 0.7699440717697144,
"step": 4270
},
{
"epoch": 0.7855567805953694,
"grad_norm": 0.9770361775953404,
"learning_rate": 1.0426218327518831e-05,
"loss": 0.7241,
"mean_token_accuracy": 0.7754392981529236,
"step": 4275
},
{
"epoch": 0.7864755604557148,
"grad_norm": 0.9511388600942011,
"learning_rate": 1.0381765893378545e-05,
"loss": 0.7491,
"mean_token_accuracy": 0.768705952167511,
"step": 4280
},
{
"epoch": 0.7873943403160603,
"grad_norm": 0.8951535359455135,
"learning_rate": 1.0337471542292076e-05,
"loss": 0.6546,
"mean_token_accuracy": 0.7975376367568969,
"step": 4285
},
{
"epoch": 0.7883131201764058,
"grad_norm": 0.9998894325690464,
"learning_rate": 1.0293335683307825e-05,
"loss": 0.717,
"mean_token_accuracy": 0.7781760573387146,
"step": 4290
},
{
"epoch": 0.7892319000367511,
"grad_norm": 0.9660670108974215,
"learning_rate": 1.0249358724010555e-05,
"loss": 0.7081,
"mean_token_accuracy": 0.7858733177185059,
"step": 4295
},
{
"epoch": 0.7901506798970966,
"grad_norm": 0.9012514691895656,
"learning_rate": 1.0205541070517624e-05,
"loss": 0.6758,
"mean_token_accuracy": 0.7909941792488098,
"step": 4300
},
{
"epoch": 0.7910694597574421,
"grad_norm": 0.9948867453145167,
"learning_rate": 1.0161883127475242e-05,
"loss": 0.6938,
"mean_token_accuracy": 0.7855447053909301,
"step": 4305
},
{
"epoch": 0.7919882396177875,
"grad_norm": 0.984814744134321,
"learning_rate": 1.0118385298054711e-05,
"loss": 0.7587,
"mean_token_accuracy": 0.7694467306137085,
"step": 4310
},
{
"epoch": 0.792907019478133,
"grad_norm": 0.9320375007219053,
"learning_rate": 1.0075047983948743e-05,
"loss": 0.7049,
"mean_token_accuracy": 0.7814609169960022,
"step": 4315
},
{
"epoch": 0.7938257993384785,
"grad_norm": 0.9094613720358601,
"learning_rate": 1.0031871585367718e-05,
"loss": 0.6712,
"mean_token_accuracy": 0.7946569919586182,
"step": 4320
},
{
"epoch": 0.7947445791988239,
"grad_norm": 0.956979005267424,
"learning_rate": 9.988856501035992e-06,
"loss": 0.6935,
"mean_token_accuracy": 0.7856648206710816,
"step": 4325
},
{
"epoch": 0.7956633590591694,
"grad_norm": 0.9605421546046741,
"learning_rate": 9.946003128188227e-06,
"loss": 0.7125,
"mean_token_accuracy": 0.7815356492996216,
"step": 4330
},
{
"epoch": 0.7965821389195149,
"grad_norm": 1.0112492842719905,
"learning_rate": 9.903311862565718e-06,
"loss": 0.7767,
"mean_token_accuracy": 0.7658894777297973,
"step": 4335
},
{
"epoch": 0.7975009187798603,
"grad_norm": 0.9643270276794048,
"learning_rate": 9.860783098412718e-06,
"loss": 0.7266,
"mean_token_accuracy": 0.7743983864784241,
"step": 4340
},
{
"epoch": 0.7984196986402058,
"grad_norm": 1.0552126857079376,
"learning_rate": 9.818417228472828e-06,
"loss": 0.784,
"mean_token_accuracy": 0.757087242603302,
"step": 4345
},
{
"epoch": 0.7993384785005513,
"grad_norm": 0.9398083766031105,
"learning_rate": 9.776214643985372e-06,
"loss": 0.7362,
"mean_token_accuracy": 0.7717908382415771,
"step": 4350
},
{
"epoch": 0.8002572583608968,
"grad_norm": 1.1034049454029626,
"learning_rate": 9.734175734681746e-06,
"loss": 0.745,
"mean_token_accuracy": 0.7712400317192077,
"step": 4355
},
{
"epoch": 0.8011760382212422,
"grad_norm": 0.9955484033373456,
"learning_rate": 9.69230088878186e-06,
"loss": 0.7156,
"mean_token_accuracy": 0.778664481639862,
"step": 4360
},
{
"epoch": 0.8020948180815877,
"grad_norm": 0.9268939856852966,
"learning_rate": 9.650590492990517e-06,
"loss": 0.6814,
"mean_token_accuracy": 0.7887930870056152,
"step": 4365
},
{
"epoch": 0.8030135979419332,
"grad_norm": 1.016524051741597,
"learning_rate": 9.609044932493873e-06,
"loss": 0.761,
"mean_token_accuracy": 0.7663564682006836,
"step": 4370
},
{
"epoch": 0.8039323778022786,
"grad_norm": 0.9217374732022973,
"learning_rate": 9.567664590955861e-06,
"loss": 0.7344,
"mean_token_accuracy": 0.7756752133369446,
"step": 4375
},
{
"epoch": 0.804851157662624,
"grad_norm": 1.010030392708083,
"learning_rate": 9.526449850514662e-06,
"loss": 0.7442,
"mean_token_accuracy": 0.770478630065918,
"step": 4380
},
{
"epoch": 0.8057699375229695,
"grad_norm": 1.0170619440794504,
"learning_rate": 9.485401091779171e-06,
"loss": 0.7571,
"mean_token_accuracy": 0.7664804577827453,
"step": 4385
},
{
"epoch": 0.8066887173833149,
"grad_norm": 0.9571205775213486,
"learning_rate": 9.444518693825456e-06,
"loss": 0.7053,
"mean_token_accuracy": 0.7821534872055054,
"step": 4390
},
{
"epoch": 0.8076074972436604,
"grad_norm": 1.014166913858275,
"learning_rate": 9.403803034193302e-06,
"loss": 0.7171,
"mean_token_accuracy": 0.7787631750106812,
"step": 4395
},
{
"epoch": 0.8085262771040059,
"grad_norm": 1.0747929086580883,
"learning_rate": 9.363254488882694e-06,
"loss": 0.7338,
"mean_token_accuracy": 0.7740719437599182,
"step": 4400
},
{
"epoch": 0.8094450569643513,
"grad_norm": 1.0631010384857345,
"learning_rate": 9.322873432350361e-06,
"loss": 0.7597,
"mean_token_accuracy": 0.7654994845390319,
"step": 4405
},
{
"epoch": 0.8103638368246968,
"grad_norm": 1.9306478337494233,
"learning_rate": 9.282660237506296e-06,
"loss": 0.7027,
"mean_token_accuracy": 0.7840522766113281,
"step": 4410
},
{
"epoch": 0.8112826166850423,
"grad_norm": 0.9881521244458075,
"learning_rate": 9.242615275710359e-06,
"loss": 0.7735,
"mean_token_accuracy": 0.765105926990509,
"step": 4415
},
{
"epoch": 0.8122013965453877,
"grad_norm": 0.9822768789236431,
"learning_rate": 9.202738916768773e-06,
"loss": 0.7742,
"mean_token_accuracy": 0.7636497378349304,
"step": 4420
},
{
"epoch": 0.8131201764057332,
"grad_norm": 0.9597545385693992,
"learning_rate": 9.16303152893078e-06,
"loss": 0.7168,
"mean_token_accuracy": 0.7784831523895264,
"step": 4425
},
{
"epoch": 0.8140389562660787,
"grad_norm": 1.018005126822509,
"learning_rate": 9.123493478885197e-06,
"loss": 0.7051,
"mean_token_accuracy": 0.7817409634590149,
"step": 4430
},
{
"epoch": 0.8149577361264241,
"grad_norm": 0.8612534899442146,
"learning_rate": 9.084125131757061e-06,
"loss": 0.6905,
"mean_token_accuracy": 0.7883997678756713,
"step": 4435
},
{
"epoch": 0.8158765159867696,
"grad_norm": 1.004828814027327,
"learning_rate": 9.044926851104225e-06,
"loss": 0.7088,
"mean_token_accuracy": 0.7787980914115906,
"step": 4440
},
{
"epoch": 0.8167952958471151,
"grad_norm": 1.034772375773089,
"learning_rate": 9.005898998914021e-06,
"loss": 0.7563,
"mean_token_accuracy": 0.7687358140945435,
"step": 4445
},
{
"epoch": 0.8177140757074605,
"grad_norm": 0.9599050149742322,
"learning_rate": 8.967041935599915e-06,
"loss": 0.7534,
"mean_token_accuracy": 0.7682107329368592,
"step": 4450
},
{
"epoch": 0.818632855567806,
"grad_norm": 0.9069132664938562,
"learning_rate": 8.928356019998177e-06,
"loss": 0.725,
"mean_token_accuracy": 0.7773229837417602,
"step": 4455
},
{
"epoch": 0.8195516354281515,
"grad_norm": 0.8705834756972108,
"learning_rate": 8.88984160936456e-06,
"loss": 0.7287,
"mean_token_accuracy": 0.7764084458351135,
"step": 4460
},
{
"epoch": 0.8204704152884968,
"grad_norm": 1.0456174549561577,
"learning_rate": 8.851499059371016e-06,
"loss": 0.7831,
"mean_token_accuracy": 0.7606392741203308,
"step": 4465
},
{
"epoch": 0.8213891951488423,
"grad_norm": 1.02656987229662,
"learning_rate": 8.813328724102389e-06,
"loss": 0.6944,
"mean_token_accuracy": 0.7881085634231567,
"step": 4470
},
{
"epoch": 0.8223079750091878,
"grad_norm": 0.959837964691812,
"learning_rate": 8.775330956053171e-06,
"loss": 0.7732,
"mean_token_accuracy": 0.7633563637733459,
"step": 4475
},
{
"epoch": 0.8232267548695332,
"grad_norm": 1.1225654804198197,
"learning_rate": 8.737506106124235e-06,
"loss": 0.7812,
"mean_token_accuracy": 0.7637458205223083,
"step": 4480
},
{
"epoch": 0.8241455347298787,
"grad_norm": 0.9381645378723508,
"learning_rate": 8.69985452361958e-06,
"loss": 0.704,
"mean_token_accuracy": 0.7797535300254822,
"step": 4485
},
{
"epoch": 0.8250643145902242,
"grad_norm": 0.9968842713077971,
"learning_rate": 8.662376556243134e-06,
"loss": 0.7743,
"mean_token_accuracy": 0.7624358177185059,
"step": 4490
},
{
"epoch": 0.8259830944505696,
"grad_norm": 0.8956734493127111,
"learning_rate": 8.625072550095529e-06,
"loss": 0.6901,
"mean_token_accuracy": 0.7880960464477539,
"step": 4495
},
{
"epoch": 0.8269018743109151,
"grad_norm": 0.9031387569534376,
"learning_rate": 8.587942849670877e-06,
"loss": 0.719,
"mean_token_accuracy": 0.7778927087783813,
"step": 4500
},
{
"epoch": 0.8278206541712606,
"grad_norm": 1.008720758966792,
"learning_rate": 8.550987797853658e-06,
"loss": 0.6524,
"mean_token_accuracy": 0.7953348755836487,
"step": 4505
},
{
"epoch": 0.828739434031606,
"grad_norm": 0.9852433501783017,
"learning_rate": 8.51420773591548e-06,
"loss": 0.6965,
"mean_token_accuracy": 0.7834087967872619,
"step": 4510
},
{
"epoch": 0.8296582138919515,
"grad_norm": 0.9178304984994476,
"learning_rate": 8.47760300351197e-06,
"loss": 0.6903,
"mean_token_accuracy": 0.7837494611740112,
"step": 4515
},
{
"epoch": 0.830576993752297,
"grad_norm": 0.9503212461224423,
"learning_rate": 8.441173938679624e-06,
"loss": 0.729,
"mean_token_accuracy": 0.7761277437210083,
"step": 4520
},
{
"epoch": 0.8314957736126424,
"grad_norm": 0.9554820837395569,
"learning_rate": 8.404920877832693e-06,
"loss": 0.6229,
"mean_token_accuracy": 0.8066902041435242,
"step": 4525
},
{
"epoch": 0.8324145534729879,
"grad_norm": 0.9220834077432549,
"learning_rate": 8.368844155760054e-06,
"loss": 0.7483,
"mean_token_accuracy": 0.7662014603614807,
"step": 4530
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.9412463783905453,
"learning_rate": 8.33294410562215e-06,
"loss": 0.6702,
"mean_token_accuracy": 0.792554771900177,
"step": 4535
},
{
"epoch": 0.8342521131936788,
"grad_norm": 0.9594825001322896,
"learning_rate": 8.297221058947901e-06,
"loss": 0.6827,
"mean_token_accuracy": 0.7895113706588746,
"step": 4540
},
{
"epoch": 0.8351708930540243,
"grad_norm": 1.0075829456004026,
"learning_rate": 8.26167534563163e-06,
"loss": 0.7226,
"mean_token_accuracy": 0.7793241381645203,
"step": 4545
},
{
"epoch": 0.8360896729143698,
"grad_norm": 0.9667343292842628,
"learning_rate": 8.226307293930038e-06,
"loss": 0.6909,
"mean_token_accuracy": 0.7852010130882263,
"step": 4550
},
{
"epoch": 0.8370084527747151,
"grad_norm": 1.123375486497552,
"learning_rate": 8.191117230459137e-06,
"loss": 0.7471,
"mean_token_accuracy": 0.7665111303329468,
"step": 4555
},
{
"epoch": 0.8379272326350606,
"grad_norm": 1.0938857594365514,
"learning_rate": 8.156105480191279e-06,
"loss": 0.7277,
"mean_token_accuracy": 0.7771796703338623,
"step": 4560
},
{
"epoch": 0.8388460124954061,
"grad_norm": 1.1106743041566718,
"learning_rate": 8.12127236645213e-06,
"loss": 0.686,
"mean_token_accuracy": 0.7906163096427917,
"step": 4565
},
{
"epoch": 0.8397647923557515,
"grad_norm": 1.0123275172064825,
"learning_rate": 8.08661821091768e-06,
"loss": 0.6952,
"mean_token_accuracy": 0.7839462161064148,
"step": 4570
},
{
"epoch": 0.840683572216097,
"grad_norm": 0.9381954109006908,
"learning_rate": 8.052143333611299e-06,
"loss": 0.6343,
"mean_token_accuracy": 0.8027025938034058,
"step": 4575
},
{
"epoch": 0.8416023520764425,
"grad_norm": 1.0894046730967675,
"learning_rate": 8.017848052900732e-06,
"loss": 0.7705,
"mean_token_accuracy": 0.7612823724746705,
"step": 4580
},
{
"epoch": 0.8425211319367879,
"grad_norm": 0.8738132153298477,
"learning_rate": 7.983732685495216e-06,
"loss": 0.6674,
"mean_token_accuracy": 0.7931976318359375,
"step": 4585
},
{
"epoch": 0.8434399117971334,
"grad_norm": 0.9130249915825608,
"learning_rate": 7.94979754644252e-06,
"loss": 0.7047,
"mean_token_accuracy": 0.780854594707489,
"step": 4590
},
{
"epoch": 0.8443586916574789,
"grad_norm": 0.8969413048245845,
"learning_rate": 7.91604294912604e-06,
"loss": 0.6926,
"mean_token_accuracy": 0.7844374537467956,
"step": 4595
},
{
"epoch": 0.8452774715178243,
"grad_norm": 0.9969218679518915,
"learning_rate": 7.882469205261912e-06,
"loss": 0.725,
"mean_token_accuracy": 0.7759457588195801,
"step": 4600
},
{
"epoch": 0.8461962513781698,
"grad_norm": 0.9327495309294612,
"learning_rate": 7.849076624896148e-06,
"loss": 0.7338,
"mean_token_accuracy": 0.7735802173614502,
"step": 4605
},
{
"epoch": 0.8471150312385153,
"grad_norm": 1.0765849478605605,
"learning_rate": 7.815865516401724e-06,
"loss": 0.7843,
"mean_token_accuracy": 0.7614648342132568,
"step": 4610
},
{
"epoch": 0.8480338110988607,
"grad_norm": 1.055107377021268,
"learning_rate": 7.782836186475787e-06,
"loss": 0.7252,
"mean_token_accuracy": 0.7763318181037903,
"step": 4615
},
{
"epoch": 0.8489525909592062,
"grad_norm": 0.9991555504178357,
"learning_rate": 7.749988940136794e-06,
"loss": 0.7624,
"mean_token_accuracy": 0.7689498424530029,
"step": 4620
},
{
"epoch": 0.8498713708195517,
"grad_norm": 0.9862061842467479,
"learning_rate": 7.717324080721698e-06,
"loss": 0.7724,
"mean_token_accuracy": 0.7633493065834045,
"step": 4625
},
{
"epoch": 0.850790150679897,
"grad_norm": 0.9444771429350269,
"learning_rate": 7.684841909883153e-06,
"loss": 0.736,
"mean_token_accuracy": 0.772719144821167,
"step": 4630
},
{
"epoch": 0.8517089305402425,
"grad_norm": 0.9384955588777603,
"learning_rate": 7.652542727586722e-06,
"loss": 0.732,
"mean_token_accuracy": 0.7758396387100219,
"step": 4635
},
{
"epoch": 0.852627710400588,
"grad_norm": 0.9275047945197651,
"learning_rate": 7.620426832108114e-06,
"loss": 0.6528,
"mean_token_accuracy": 0.7958010911941529,
"step": 4640
},
{
"epoch": 0.8535464902609334,
"grad_norm": 1.0403097729529958,
"learning_rate": 7.588494520030422e-06,
"loss": 0.6619,
"mean_token_accuracy": 0.7915726184844971,
"step": 4645
},
{
"epoch": 0.8544652701212789,
"grad_norm": 0.947341642816108,
"learning_rate": 7.556746086241387e-06,
"loss": 0.731,
"mean_token_accuracy": 0.775021767616272,
"step": 4650
},
{
"epoch": 0.8553840499816244,
"grad_norm": 0.920825040895765,
"learning_rate": 7.52518182393068e-06,
"loss": 0.7206,
"mean_token_accuracy": 0.7786232590675354,
"step": 4655
},
{
"epoch": 0.8563028298419698,
"grad_norm": 1.00451196504111,
"learning_rate": 7.493802024587182e-06,
"loss": 0.7028,
"mean_token_accuracy": 0.7852194666862488,
"step": 4660
},
{
"epoch": 0.8572216097023153,
"grad_norm": 1.0502227606850303,
"learning_rate": 7.4626069779963044e-06,
"loss": 0.7102,
"mean_token_accuracy": 0.7817628145217895,
"step": 4665
},
{
"epoch": 0.8581403895626608,
"grad_norm": 0.9506022079001433,
"learning_rate": 7.431596972237313e-06,
"loss": 0.7541,
"mean_token_accuracy": 0.7674841046333313,
"step": 4670
},
{
"epoch": 0.8590591694230062,
"grad_norm": 0.8995410623765239,
"learning_rate": 7.400772293680655e-06,
"loss": 0.6585,
"mean_token_accuracy": 0.7939380526542663,
"step": 4675
},
{
"epoch": 0.8599779492833517,
"grad_norm": 1.0524188630602864,
"learning_rate": 7.370133226985324e-06,
"loss": 0.7053,
"mean_token_accuracy": 0.7818097829818725,
"step": 4680
},
{
"epoch": 0.8608967291436972,
"grad_norm": 1.0628395588693211,
"learning_rate": 7.339680055096238e-06,
"loss": 0.7268,
"mean_token_accuracy": 0.7778566598892211,
"step": 4685
},
{
"epoch": 0.8618155090040426,
"grad_norm": 1.0409171879354133,
"learning_rate": 7.3094130592416e-06,
"loss": 0.7509,
"mean_token_accuracy": 0.7721391081809997,
"step": 4690
},
{
"epoch": 0.8627342888643881,
"grad_norm": 0.9450810576486318,
"learning_rate": 7.279332518930333e-06,
"loss": 0.7354,
"mean_token_accuracy": 0.7757332921028137,
"step": 4695
},
{
"epoch": 0.8636530687247336,
"grad_norm": 1.005547743201597,
"learning_rate": 7.24943871194949e-06,
"loss": 0.7123,
"mean_token_accuracy": 0.784231448173523,
"step": 4700
},
{
"epoch": 0.864571848585079,
"grad_norm": 1.0321990207147438,
"learning_rate": 7.219731914361673e-06,
"loss": 0.7119,
"mean_token_accuracy": 0.7776958227157593,
"step": 4705
},
{
"epoch": 0.8654906284454245,
"grad_norm": 1.034071085635404,
"learning_rate": 7.190212400502496e-06,
"loss": 0.6915,
"mean_token_accuracy": 0.7870635032653809,
"step": 4710
},
{
"epoch": 0.86640940830577,
"grad_norm": 0.9373543843862668,
"learning_rate": 7.160880442978049e-06,
"loss": 0.6896,
"mean_token_accuracy": 0.7847368240356445,
"step": 4715
},
{
"epoch": 0.8673281881661153,
"grad_norm": 0.9200571177499256,
"learning_rate": 7.131736312662385e-06,
"loss": 0.7087,
"mean_token_accuracy": 0.7825104236602783,
"step": 4720
},
{
"epoch": 0.8682469680264608,
"grad_norm": 0.9546006111783019,
"learning_rate": 7.1027802786950064e-06,
"loss": 0.803,
"mean_token_accuracy": 0.7619799256324769,
"step": 4725
},
{
"epoch": 0.8691657478868063,
"grad_norm": 1.0153027436586384,
"learning_rate": 7.074012608478406e-06,
"loss": 0.745,
"mean_token_accuracy": 0.771528446674347,
"step": 4730
},
{
"epoch": 0.8700845277471518,
"grad_norm": 0.9597117720266102,
"learning_rate": 7.04543356767556e-06,
"loss": 0.6761,
"mean_token_accuracy": 0.7918476939201355,
"step": 4735
},
{
"epoch": 0.8710033076074972,
"grad_norm": 0.978172489455684,
"learning_rate": 7.0170434202075115e-06,
"loss": 0.7295,
"mean_token_accuracy": 0.7755934953689575,
"step": 4740
},
{
"epoch": 0.8719220874678427,
"grad_norm": 0.9901049864461209,
"learning_rate": 6.9888424282508955e-06,
"loss": 0.6808,
"mean_token_accuracy": 0.7899694681167603,
"step": 4745
},
{
"epoch": 0.8728408673281882,
"grad_norm": 0.9937024532653225,
"learning_rate": 6.960830852235556e-06,
"loss": 0.7051,
"mean_token_accuracy": 0.784772801399231,
"step": 4750
},
{
"epoch": 0.8737596471885336,
"grad_norm": 0.9976292512478198,
"learning_rate": 6.9330089508421125e-06,
"loss": 0.7258,
"mean_token_accuracy": 0.778191328048706,
"step": 4755
},
{
"epoch": 0.8746784270488791,
"grad_norm": 1.0403981175236854,
"learning_rate": 6.905376980999588e-06,
"loss": 0.7431,
"mean_token_accuracy": 0.7714271783828736,
"step": 4760
},
{
"epoch": 0.8755972069092246,
"grad_norm": 1.0581186557788087,
"learning_rate": 6.877935197883034e-06,
"loss": 0.712,
"mean_token_accuracy": 0.782374131679535,
"step": 4765
},
{
"epoch": 0.87651598676957,
"grad_norm": 0.9127662765794763,
"learning_rate": 6.85068385491116e-06,
"loss": 0.6783,
"mean_token_accuracy": 0.7902564287185669,
"step": 4770
},
{
"epoch": 0.8774347666299155,
"grad_norm": 1.0211009545118401,
"learning_rate": 6.823623203744009e-06,
"loss": 0.7424,
"mean_token_accuracy": 0.7728718996047974,
"step": 4775
},
{
"epoch": 0.878353546490261,
"grad_norm": 0.9378238243616713,
"learning_rate": 6.796753494280624e-06,
"loss": 0.6775,
"mean_token_accuracy": 0.7908179044723511,
"step": 4780
},
{
"epoch": 0.8792723263506064,
"grad_norm": 1.0104116736997681,
"learning_rate": 6.770074974656751e-06,
"loss": 0.6963,
"mean_token_accuracy": 0.7875288009643555,
"step": 4785
},
{
"epoch": 0.8801911062109519,
"grad_norm": 0.9673097547324119,
"learning_rate": 6.743587891242536e-06,
"loss": 0.7006,
"mean_token_accuracy": 0.7846636652946473,
"step": 4790
},
{
"epoch": 0.8811098860712974,
"grad_norm": 1.0556257979311925,
"learning_rate": 6.717292488640256e-06,
"loss": 0.8204,
"mean_token_accuracy": 0.7494418621063232,
"step": 4795
},
{
"epoch": 0.8820286659316428,
"grad_norm": 0.9770128218279439,
"learning_rate": 6.691189009682059e-06,
"loss": 0.6983,
"mean_token_accuracy": 0.787074613571167,
"step": 4800
},
{
"epoch": 0.8829474457919883,
"grad_norm": 0.8883173557701798,
"learning_rate": 6.665277695427717e-06,
"loss": 0.7084,
"mean_token_accuracy": 0.7833673834800721,
"step": 4805
},
{
"epoch": 0.8838662256523337,
"grad_norm": 0.9692250808371157,
"learning_rate": 6.63955878516241e-06,
"loss": 0.6823,
"mean_token_accuracy": 0.7903570294380188,
"step": 4810
},
{
"epoch": 0.8847850055126791,
"grad_norm": 0.8810560098191498,
"learning_rate": 6.614032516394509e-06,
"loss": 0.6837,
"mean_token_accuracy": 0.7892964124679566,
"step": 4815
},
{
"epoch": 0.8857037853730246,
"grad_norm": 0.9613974496107343,
"learning_rate": 6.588699124853379e-06,
"loss": 0.7171,
"mean_token_accuracy": 0.7778627634048462,
"step": 4820
},
{
"epoch": 0.8866225652333701,
"grad_norm": 0.9155720741178934,
"learning_rate": 6.563558844487215e-06,
"loss": 0.6238,
"mean_token_accuracy": 0.806905460357666,
"step": 4825
},
{
"epoch": 0.8875413450937155,
"grad_norm": 0.8747125126149234,
"learning_rate": 6.538611907460866e-06,
"loss": 0.6377,
"mean_token_accuracy": 0.8006066799163818,
"step": 4830
},
{
"epoch": 0.888460124954061,
"grad_norm": 1.007447868920518,
"learning_rate": 6.513858544153706e-06,
"loss": 0.7043,
"mean_token_accuracy": 0.7833499908447266,
"step": 4835
},
{
"epoch": 0.8893789048144065,
"grad_norm": 0.9255281703679149,
"learning_rate": 6.48929898315749e-06,
"loss": 0.6973,
"mean_token_accuracy": 0.782793152332306,
"step": 4840
},
{
"epoch": 0.8902976846747519,
"grad_norm": 0.9388975020879022,
"learning_rate": 6.464933451274261e-06,
"loss": 0.6256,
"mean_token_accuracy": 0.8049848675727844,
"step": 4845
},
{
"epoch": 0.8912164645350974,
"grad_norm": 1.012760049959928,
"learning_rate": 6.440762173514238e-06,
"loss": 0.7309,
"mean_token_accuracy": 0.7747970938682556,
"step": 4850
},
{
"epoch": 0.8921352443954429,
"grad_norm": 0.9633483639424572,
"learning_rate": 6.416785373093756e-06,
"loss": 0.7864,
"mean_token_accuracy": 0.7609822034835816,
"step": 4855
},
{
"epoch": 0.8930540242557883,
"grad_norm": 0.9375903231729885,
"learning_rate": 6.39300327143319e-06,
"loss": 0.7045,
"mean_token_accuracy": 0.7813974022865295,
"step": 4860
},
{
"epoch": 0.8939728041161338,
"grad_norm": 0.9116788058926382,
"learning_rate": 6.369416088154917e-06,
"loss": 0.7031,
"mean_token_accuracy": 0.7866278529167176,
"step": 4865
},
{
"epoch": 0.8948915839764793,
"grad_norm": 0.8514994068032359,
"learning_rate": 6.346024041081286e-06,
"loss": 0.6053,
"mean_token_accuracy": 0.8105675458908081,
"step": 4870
},
{
"epoch": 0.8958103638368247,
"grad_norm": 1.0010033445452702,
"learning_rate": 6.32282734623261e-06,
"loss": 0.7069,
"mean_token_accuracy": 0.7806268095970154,
"step": 4875
},
{
"epoch": 0.8967291436971702,
"grad_norm": 0.9335886122342348,
"learning_rate": 6.299826217825156e-06,
"loss": 0.7464,
"mean_token_accuracy": 0.770034921169281,
"step": 4880
},
{
"epoch": 0.8976479235575157,
"grad_norm": 0.9307585065246715,
"learning_rate": 6.277020868269191e-06,
"loss": 0.7473,
"mean_token_accuracy": 0.7723272204399109,
"step": 4885
},
{
"epoch": 0.898566703417861,
"grad_norm": 0.9507339920147042,
"learning_rate": 6.254411508167009e-06,
"loss": 0.7498,
"mean_token_accuracy": 0.7717328667640686,
"step": 4890
},
{
"epoch": 0.8994854832782065,
"grad_norm": 0.9826282972840381,
"learning_rate": 6.23199834631098e-06,
"loss": 0.7964,
"mean_token_accuracy": 0.7543912172317505,
"step": 4895
},
{
"epoch": 0.900404263138552,
"grad_norm": 0.9073522447780296,
"learning_rate": 6.2097815896816306e-06,
"loss": 0.7151,
"mean_token_accuracy": 0.7821811556816101,
"step": 4900
},
{
"epoch": 0.9013230429988974,
"grad_norm": 1.0820009074965473,
"learning_rate": 6.187761443445719e-06,
"loss": 0.7168,
"mean_token_accuracy": 0.779189658164978,
"step": 4905
},
{
"epoch": 0.9022418228592429,
"grad_norm": 1.0031779458182357,
"learning_rate": 6.165938110954365e-06,
"loss": 0.7316,
"mean_token_accuracy": 0.7749498963356019,
"step": 4910
},
{
"epoch": 0.9031606027195884,
"grad_norm": 0.9350157804822582,
"learning_rate": 6.144311793741147e-06,
"loss": 0.7289,
"mean_token_accuracy": 0.7790675520896911,
"step": 4915
},
{
"epoch": 0.9040793825799338,
"grad_norm": 0.9268769698711982,
"learning_rate": 6.122882691520254e-06,
"loss": 0.7369,
"mean_token_accuracy": 0.7718736052513122,
"step": 4920
},
{
"epoch": 0.9049981624402793,
"grad_norm": 0.9014435353019561,
"learning_rate": 6.101651002184649e-06,
"loss": 0.7007,
"mean_token_accuracy": 0.7885142803192139,
"step": 4925
},
{
"epoch": 0.9059169423006248,
"grad_norm": 1.1124655246934698,
"learning_rate": 6.0806169218042185e-06,
"loss": 0.7541,
"mean_token_accuracy": 0.7671143889427186,
"step": 4930
},
{
"epoch": 0.9068357221609702,
"grad_norm": 0.941789239472656,
"learning_rate": 6.0597806446239775e-06,
"loss": 0.6182,
"mean_token_accuracy": 0.8084997653961181,
"step": 4935
},
{
"epoch": 0.9077545020213157,
"grad_norm": 0.9986182896305318,
"learning_rate": 6.039142363062271e-06,
"loss": 0.6456,
"mean_token_accuracy": 0.8005677580833435,
"step": 4940
},
{
"epoch": 0.9086732818816612,
"grad_norm": 0.9291170901784223,
"learning_rate": 6.018702267709008e-06,
"loss": 0.7112,
"mean_token_accuracy": 0.7811415076255799,
"step": 4945
},
{
"epoch": 0.9095920617420066,
"grad_norm": 1.0595420029395017,
"learning_rate": 5.998460547323881e-06,
"loss": 0.7741,
"mean_token_accuracy": 0.7643965363502503,
"step": 4950
},
{
"epoch": 0.9105108416023521,
"grad_norm": 1.086020272651734,
"learning_rate": 5.978417388834642e-06,
"loss": 0.8087,
"mean_token_accuracy": 0.754006028175354,
"step": 4955
},
{
"epoch": 0.9114296214626976,
"grad_norm": 1.0314753490097466,
"learning_rate": 5.958572977335365e-06,
"loss": 0.647,
"mean_token_accuracy": 0.8025306582450866,
"step": 4960
},
{
"epoch": 0.912348401323043,
"grad_norm": 0.8858957049531258,
"learning_rate": 5.93892749608474e-06,
"loss": 0.6734,
"mean_token_accuracy": 0.789763331413269,
"step": 4965
},
{
"epoch": 0.9132671811833885,
"grad_norm": 0.9228304919571796,
"learning_rate": 5.919481126504383e-06,
"loss": 0.6979,
"mean_token_accuracy": 0.7835509300231933,
"step": 4970
},
{
"epoch": 0.914185961043734,
"grad_norm": 0.9172131766669608,
"learning_rate": 5.900234048177156e-06,
"loss": 0.7468,
"mean_token_accuracy": 0.7716853857040405,
"step": 4975
},
{
"epoch": 0.9151047409040793,
"grad_norm": 0.9717495119840875,
"learning_rate": 5.881186438845511e-06,
"loss": 0.6534,
"mean_token_accuracy": 0.7953248977661133,
"step": 4980
},
{
"epoch": 0.9160235207644248,
"grad_norm": 0.9387494667741174,
"learning_rate": 5.862338474409852e-06,
"loss": 0.7276,
"mean_token_accuracy": 0.7760698676109314,
"step": 4985
},
{
"epoch": 0.9169423006247703,
"grad_norm": 0.9759164760125104,
"learning_rate": 5.843690328926905e-06,
"loss": 0.7429,
"mean_token_accuracy": 0.7714617133140564,
"step": 4990
},
{
"epoch": 0.9178610804851157,
"grad_norm": 0.9934351587205096,
"learning_rate": 5.825242174608107e-06,
"loss": 0.7111,
"mean_token_accuracy": 0.7826705813407898,
"step": 4995
},
{
"epoch": 0.9187798603454612,
"grad_norm": 0.9923527821765938,
"learning_rate": 5.8069941818180335e-06,
"loss": 0.6332,
"mean_token_accuracy": 0.8037675261497498,
"step": 5000
},
{
"epoch": 0.9196986402058067,
"grad_norm": 1.0549322241395105,
"learning_rate": 5.788946519072802e-06,
"loss": 0.7442,
"mean_token_accuracy": 0.7685003876686096,
"step": 5005
},
{
"epoch": 0.9206174200661521,
"grad_norm": 0.9976060100023064,
"learning_rate": 5.771099353038532e-06,
"loss": 0.7271,
"mean_token_accuracy": 0.7736078143119812,
"step": 5010
},
{
"epoch": 0.9215361999264976,
"grad_norm": 0.9383299705731176,
"learning_rate": 5.7534528485298e-06,
"loss": 0.725,
"mean_token_accuracy": 0.7763983011245728,
"step": 5015
},
{
"epoch": 0.9224549797868431,
"grad_norm": 0.963672340776997,
"learning_rate": 5.736007168508121e-06,
"loss": 0.6831,
"mean_token_accuracy": 0.7851462960243225,
"step": 5020
},
{
"epoch": 0.9233737596471885,
"grad_norm": 0.9469619269658613,
"learning_rate": 5.7187624740804345e-06,
"loss": 0.7275,
"mean_token_accuracy": 0.7783573985099792,
"step": 5025
},
{
"epoch": 0.924292539507534,
"grad_norm": 0.9984975559017129,
"learning_rate": 5.701718924497633e-06,
"loss": 0.7006,
"mean_token_accuracy": 0.786095142364502,
"step": 5030
},
{
"epoch": 0.9252113193678795,
"grad_norm": 0.9976875410663416,
"learning_rate": 5.684876677153069e-06,
"loss": 0.78,
"mean_token_accuracy": 0.7615157961845398,
"step": 5035
},
{
"epoch": 0.9261300992282249,
"grad_norm": 0.9677019035350851,
"learning_rate": 5.668235887581126e-06,
"loss": 0.7171,
"mean_token_accuracy": 0.7806509256362915,
"step": 5040
},
{
"epoch": 0.9270488790885704,
"grad_norm": 1.021088055495027,
"learning_rate": 5.651796709455757e-06,
"loss": 0.7329,
"mean_token_accuracy": 0.7730448007583618,
"step": 5045
},
{
"epoch": 0.9279676589489159,
"grad_norm": 1.0552382975961647,
"learning_rate": 5.6355592945890934e-06,
"loss": 0.6811,
"mean_token_accuracy": 0.786517608165741,
"step": 5050
},
{
"epoch": 0.9288864388092613,
"grad_norm": 1.0701013175318246,
"learning_rate": 5.619523792930021e-06,
"loss": 0.7371,
"mean_token_accuracy": 0.7725589275360107,
"step": 5055
},
{
"epoch": 0.9298052186696067,
"grad_norm": 0.9720134878474826,
"learning_rate": 5.6036903525627975e-06,
"loss": 0.6481,
"mean_token_accuracy": 0.8003619313240051,
"step": 5060
},
{
"epoch": 0.9307239985299522,
"grad_norm": 1.129850002758599,
"learning_rate": 5.588059119705699e-06,
"loss": 0.753,
"mean_token_accuracy": 0.7675109386444092,
"step": 5065
},
{
"epoch": 0.9316427783902976,
"grad_norm": 0.9756659000260551,
"learning_rate": 5.5726302387096506e-06,
"loss": 0.7282,
"mean_token_accuracy": 0.7749423146247864,
"step": 5070
},
{
"epoch": 0.9325615582506431,
"grad_norm": 0.9956804499004365,
"learning_rate": 5.557403852056914e-06,
"loss": 0.7106,
"mean_token_accuracy": 0.7811750769615173,
"step": 5075
},
{
"epoch": 0.9334803381109886,
"grad_norm": 0.9560365954578341,
"learning_rate": 5.542380100359751e-06,
"loss": 0.71,
"mean_token_accuracy": 0.7802268862724304,
"step": 5080
},
{
"epoch": 0.934399117971334,
"grad_norm": 0.9454250302290028,
"learning_rate": 5.527559122359145e-06,
"loss": 0.6968,
"mean_token_accuracy": 0.7844350814819336,
"step": 5085
},
{
"epoch": 0.9353178978316795,
"grad_norm": 1.0545713849704708,
"learning_rate": 5.512941054923507e-06,
"loss": 0.7085,
"mean_token_accuracy": 0.7803709745407105,
"step": 5090
},
{
"epoch": 0.936236677692025,
"grad_norm": 1.103412663642094,
"learning_rate": 5.498526033047404e-06,
"loss": 0.726,
"mean_token_accuracy": 0.7740720987319947,
"step": 5095
},
{
"epoch": 0.9371554575523704,
"grad_norm": 1.0018246605663172,
"learning_rate": 5.484314189850335e-06,
"loss": 0.6914,
"mean_token_accuracy": 0.7853707432746887,
"step": 5100
},
{
"epoch": 0.9380742374127159,
"grad_norm": 0.9292037350722813,
"learning_rate": 5.470305656575487e-06,
"loss": 0.6809,
"mean_token_accuracy": 0.7880851745605468,
"step": 5105
},
{
"epoch": 0.9389930172730614,
"grad_norm": 0.942507726345383,
"learning_rate": 5.45650056258852e-06,
"loss": 0.6444,
"mean_token_accuracy": 0.8017876148223877,
"step": 5110
},
{
"epoch": 0.9399117971334069,
"grad_norm": 0.950473741974958,
"learning_rate": 5.442899035376386e-06,
"loss": 0.6918,
"mean_token_accuracy": 0.7830354809761048,
"step": 5115
},
{
"epoch": 0.9408305769937523,
"grad_norm": 1.0328707402514066,
"learning_rate": 5.429501200546137e-06,
"loss": 0.6809,
"mean_token_accuracy": 0.7883656024932861,
"step": 5120
},
{
"epoch": 0.9417493568540978,
"grad_norm": 0.9106187724534776,
"learning_rate": 5.416307181823773e-06,
"loss": 0.6529,
"mean_token_accuracy": 0.7979433417320252,
"step": 5125
},
{
"epoch": 0.9426681367144433,
"grad_norm": 0.8301715826585289,
"learning_rate": 5.403317101053101e-06,
"loss": 0.6319,
"mean_token_accuracy": 0.8041222810745239,
"step": 5130
},
{
"epoch": 0.9435869165747887,
"grad_norm": 0.9401314468555536,
"learning_rate": 5.3905310781946005e-06,
"loss": 0.7681,
"mean_token_accuracy": 0.7674003601074219,
"step": 5135
},
{
"epoch": 0.9445056964351342,
"grad_norm": 0.9634667604340411,
"learning_rate": 5.377949231324331e-06,
"loss": 0.6745,
"mean_token_accuracy": 0.7905578970909118,
"step": 5140
},
{
"epoch": 0.9454244762954797,
"grad_norm": 0.895622215360212,
"learning_rate": 5.3655716766328235e-06,
"loss": 0.678,
"mean_token_accuracy": 0.7900139689445496,
"step": 5145
},
{
"epoch": 0.946343256155825,
"grad_norm": 0.9199451561014395,
"learning_rate": 5.353398528424019e-06,
"loss": 0.6503,
"mean_token_accuracy": 0.8004558086395264,
"step": 5150
},
{
"epoch": 0.9472620360161705,
"grad_norm": 1.0795462562761053,
"learning_rate": 5.341429899114216e-06,
"loss": 0.796,
"mean_token_accuracy": 0.7549399971961975,
"step": 5155
},
{
"epoch": 0.948180815876516,
"grad_norm": 1.0593495804757314,
"learning_rate": 5.3296658992310215e-06,
"loss": 0.7738,
"mean_token_accuracy": 0.7613824367523193,
"step": 5160
},
{
"epoch": 0.9490995957368614,
"grad_norm": 0.87159278381413,
"learning_rate": 5.318106637412333e-06,
"loss": 0.7275,
"mean_token_accuracy": 0.7751541256904602,
"step": 5165
},
{
"epoch": 0.9500183755972069,
"grad_norm": 0.9358627934311341,
"learning_rate": 5.306752220405349e-06,
"loss": 0.6959,
"mean_token_accuracy": 0.7841734409332275,
"step": 5170
},
{
"epoch": 0.9509371554575524,
"grad_norm": 1.046481143355732,
"learning_rate": 5.295602753065557e-06,
"loss": 0.7627,
"mean_token_accuracy": 0.7681636333465576,
"step": 5175
},
{
"epoch": 0.9518559353178978,
"grad_norm": 0.9759465901582869,
"learning_rate": 5.284658338355793e-06,
"loss": 0.6722,
"mean_token_accuracy": 0.7921370148658753,
"step": 5180
},
{
"epoch": 0.9527747151782433,
"grad_norm": 0.9867516760566891,
"learning_rate": 5.27391907734527e-06,
"loss": 0.6645,
"mean_token_accuracy": 0.7929692029953003,
"step": 5185
},
{
"epoch": 0.9536934950385888,
"grad_norm": 1.0576850268434166,
"learning_rate": 5.263385069208657e-06,
"loss": 0.6768,
"mean_token_accuracy": 0.7890636920928955,
"step": 5190
},
{
"epoch": 0.9546122748989342,
"grad_norm": 0.9776114467729704,
"learning_rate": 5.253056411225155e-06,
"loss": 0.742,
"mean_token_accuracy": 0.7734999060630798,
"step": 5195
},
{
"epoch": 0.9555310547592797,
"grad_norm": 1.0194931640790486,
"learning_rate": 5.242933198777612e-06,
"loss": 0.7576,
"mean_token_accuracy": 0.7681198120117188,
"step": 5200
},
{
"epoch": 0.9564498346196252,
"grad_norm": 1.0129376817436055,
"learning_rate": 5.233015525351615e-06,
"loss": 0.7419,
"mean_token_accuracy": 0.7743308544158936,
"step": 5205
},
{
"epoch": 0.9573686144799706,
"grad_norm": 1.0893813952165832,
"learning_rate": 5.223303482534663e-06,
"loss": 0.7326,
"mean_token_accuracy": 0.7767649531364441,
"step": 5210
},
{
"epoch": 0.9582873943403161,
"grad_norm": 0.9505887964318166,
"learning_rate": 5.213797160015287e-06,
"loss": 0.6831,
"mean_token_accuracy": 0.7896853566169739,
"step": 5215
},
{
"epoch": 0.9592061742006616,
"grad_norm": 0.9619901073178158,
"learning_rate": 5.204496645582251e-06,
"loss": 0.6331,
"mean_token_accuracy": 0.8018953204154968,
"step": 5220
},
{
"epoch": 0.960124954061007,
"grad_norm": 0.9557772773728191,
"learning_rate": 5.195402025123713e-06,
"loss": 0.7335,
"mean_token_accuracy": 0.7740659594535828,
"step": 5225
},
{
"epoch": 0.9610437339213524,
"grad_norm": 1.0297793279179666,
"learning_rate": 5.18651338262646e-06,
"loss": 0.7168,
"mean_token_accuracy": 0.7795220851898194,
"step": 5230
},
{
"epoch": 0.961962513781698,
"grad_norm": 1.2404081410589938,
"learning_rate": 5.177830800175107e-06,
"loss": 0.8105,
"mean_token_accuracy": 0.7509904742240906,
"step": 5235
},
{
"epoch": 0.9628812936420433,
"grad_norm": 0.9134300813748663,
"learning_rate": 5.169354357951361e-06,
"loss": 0.6651,
"mean_token_accuracy": 0.7976749420166016,
"step": 5240
},
{
"epoch": 0.9638000735023888,
"grad_norm": 1.0273400541069664,
"learning_rate": 5.161084134233264e-06,
"loss": 0.7448,
"mean_token_accuracy": 0.7694393396377563,
"step": 5245
},
{
"epoch": 0.9647188533627343,
"grad_norm": 0.8905853308436543,
"learning_rate": 5.153020205394477e-06,
"loss": 0.7163,
"mean_token_accuracy": 0.7769248247146606,
"step": 5250
},
{
"epoch": 0.9656376332230797,
"grad_norm": 0.9529346136037607,
"learning_rate": 5.145162645903574e-06,
"loss": 0.761,
"mean_token_accuracy": 0.7652369141578674,
"step": 5255
},
{
"epoch": 0.9665564130834252,
"grad_norm": 0.9650979477578159,
"learning_rate": 5.1375115283233555e-06,
"loss": 0.6776,
"mean_token_accuracy": 0.7902606129646301,
"step": 5260
},
{
"epoch": 0.9674751929437707,
"grad_norm": 0.9641271433925336,
"learning_rate": 5.130066923310179e-06,
"loss": 0.7631,
"mean_token_accuracy": 0.7635928511619567,
"step": 5265
},
{
"epoch": 0.9683939728041161,
"grad_norm": 0.9213872430466661,
"learning_rate": 5.122828899613301e-06,
"loss": 0.7167,
"mean_token_accuracy": 0.7826451182365417,
"step": 5270
},
{
"epoch": 0.9693127526644616,
"grad_norm": 0.944684513184743,
"learning_rate": 5.115797524074245e-06,
"loss": 0.7028,
"mean_token_accuracy": 0.7823728322982788,
"step": 5275
},
{
"epoch": 0.9702315325248071,
"grad_norm": 0.933365103880651,
"learning_rate": 5.108972861626195e-06,
"loss": 0.675,
"mean_token_accuracy": 0.7928666949272156,
"step": 5280
},
{
"epoch": 0.9711503123851525,
"grad_norm": 0.9439434977514946,
"learning_rate": 5.102354975293371e-06,
"loss": 0.737,
"mean_token_accuracy": 0.7746310830116272,
"step": 5285
},
{
"epoch": 0.972069092245498,
"grad_norm": 0.9602834611038171,
"learning_rate": 5.0959439261904715e-06,
"loss": 0.6584,
"mean_token_accuracy": 0.7969113111495971,
"step": 5290
},
{
"epoch": 0.9729878721058435,
"grad_norm": 0.9686252030086645,
"learning_rate": 5.089739773522099e-06,
"loss": 0.7185,
"mean_token_accuracy": 0.7791404247283935,
"step": 5295
},
{
"epoch": 0.9739066519661889,
"grad_norm": 0.9606975967780023,
"learning_rate": 5.083742574582211e-06,
"loss": 0.6582,
"mean_token_accuracy": 0.7957128643989563,
"step": 5300
},
{
"epoch": 0.9748254318265344,
"grad_norm": 0.9508520485094629,
"learning_rate": 5.077952384753596e-06,
"loss": 0.6683,
"mean_token_accuracy": 0.794218647480011,
"step": 5305
},
{
"epoch": 0.9757442116868799,
"grad_norm": 1.019210345399437,
"learning_rate": 5.072369257507359e-06,
"loss": 0.6832,
"mean_token_accuracy": 0.7870172739028931,
"step": 5310
},
{
"epoch": 0.9766629915472252,
"grad_norm": 0.8796036807363417,
"learning_rate": 5.066993244402426e-06,
"loss": 0.6779,
"mean_token_accuracy": 0.7899052858352661,
"step": 5315
},
{
"epoch": 0.9775817714075707,
"grad_norm": 0.9949077138684812,
"learning_rate": 5.061824395085075e-06,
"loss": 0.7555,
"mean_token_accuracy": 0.7675400733947754,
"step": 5320
},
{
"epoch": 0.9785005512679162,
"grad_norm": 1.0386477233819087,
"learning_rate": 5.056862757288469e-06,
"loss": 0.6976,
"mean_token_accuracy": 0.785806167125702,
"step": 5325
},
{
"epoch": 0.9794193311282616,
"grad_norm": 1.1386145757601591,
"learning_rate": 5.052108376832222e-06,
"loss": 0.7237,
"mean_token_accuracy": 0.774617874622345,
"step": 5330
},
{
"epoch": 0.9803381109886071,
"grad_norm": 1.0398368664600088,
"learning_rate": 5.04756129762197e-06,
"loss": 0.6786,
"mean_token_accuracy": 0.7888760805130005,
"step": 5335
},
{
"epoch": 0.9812568908489526,
"grad_norm": 0.8969882432761646,
"learning_rate": 5.043221561648972e-06,
"loss": 0.7473,
"mean_token_accuracy": 0.7688890337944031,
"step": 5340
},
{
"epoch": 0.982175670709298,
"grad_norm": 0.9121065335737262,
"learning_rate": 5.039089208989717e-06,
"loss": 0.6528,
"mean_token_accuracy": 0.8000433087348938,
"step": 5345
},
{
"epoch": 0.9830944505696435,
"grad_norm": 1.0556010790241608,
"learning_rate": 5.035164277805552e-06,
"loss": 0.6922,
"mean_token_accuracy": 0.7864078521728516,
"step": 5350
},
{
"epoch": 0.984013230429989,
"grad_norm": 0.875446950379487,
"learning_rate": 5.031446804342338e-06,
"loss": 0.6736,
"mean_token_accuracy": 0.7885773062705994,
"step": 5355
},
{
"epoch": 0.9849320102903344,
"grad_norm": 1.0577726392343791,
"learning_rate": 5.027936822930111e-06,
"loss": 0.6329,
"mean_token_accuracy": 0.8035769701004029,
"step": 5360
},
{
"epoch": 0.9858507901506799,
"grad_norm": 0.9114969331526765,
"learning_rate": 5.024634365982759e-06,
"loss": 0.6764,
"mean_token_accuracy": 0.7900681734085083,
"step": 5365
},
{
"epoch": 0.9867695700110254,
"grad_norm": 1.0562487370376004,
"learning_rate": 5.021539463997731e-06,
"loss": 0.7614,
"mean_token_accuracy": 0.7608750462532043,
"step": 5370
},
{
"epoch": 0.9876883498713708,
"grad_norm": 0.9772893925508611,
"learning_rate": 5.018652145555758e-06,
"loss": 0.7183,
"mean_token_accuracy": 0.780507218837738,
"step": 5375
},
{
"epoch": 0.9886071297317163,
"grad_norm": 0.9153996098402806,
"learning_rate": 5.015972437320575e-06,
"loss": 0.6912,
"mean_token_accuracy": 0.7845159888267517,
"step": 5380
},
{
"epoch": 0.9895259095920618,
"grad_norm": 0.9730616558454289,
"learning_rate": 5.013500364038685e-06,
"loss": 0.7219,
"mean_token_accuracy": 0.7788301348686218,
"step": 5385
},
{
"epoch": 0.9904446894524072,
"grad_norm": 1.0739524554988746,
"learning_rate": 5.011235948539137e-06,
"loss": 0.7236,
"mean_token_accuracy": 0.7729606032371521,
"step": 5390
},
{
"epoch": 0.9913634693127527,
"grad_norm": 0.9269482984472148,
"learning_rate": 5.00917921173329e-06,
"loss": 0.6858,
"mean_token_accuracy": 0.7901342034339904,
"step": 5395
},
{
"epoch": 0.9922822491730982,
"grad_norm": 1.0961849910542936,
"learning_rate": 5.007330172614658e-06,
"loss": 0.7486,
"mean_token_accuracy": 0.7710240960121155,
"step": 5400
},
{
"epoch": 0.9932010290334435,
"grad_norm": 0.9749225295480475,
"learning_rate": 5.005688848258695e-06,
"loss": 0.6663,
"mean_token_accuracy": 0.7941651105880737,
"step": 5405
},
{
"epoch": 0.994119808893789,
"grad_norm": 0.9260821925718788,
"learning_rate": 5.004255253822668e-06,
"loss": 0.6861,
"mean_token_accuracy": 0.7862310886383057,
"step": 5410
},
{
"epoch": 0.9950385887541345,
"grad_norm": 1.088283597268154,
"learning_rate": 5.0030294025454985e-06,
"loss": 0.7656,
"mean_token_accuracy": 0.7660502552986145,
"step": 5415
},
{
"epoch": 0.9959573686144799,
"grad_norm": 0.9441714298335138,
"learning_rate": 5.002011305747647e-06,
"loss": 0.6971,
"mean_token_accuracy": 0.7839124202728271,
"step": 5420
},
{
"epoch": 0.9968761484748254,
"grad_norm": 1.0363518119067192,
"learning_rate": 5.0012009728310115e-06,
"loss": 0.7004,
"mean_token_accuracy": 0.7838043451309205,
"step": 5425
},
{
"epoch": 0.9977949283351709,
"grad_norm": 0.9601846400078509,
"learning_rate": 5.0005984112788325e-06,
"loss": 0.7021,
"mean_token_accuracy": 0.7850103259086609,
"step": 5430
},
{
"epoch": 0.9987137081955163,
"grad_norm": 1.00552667894893,
"learning_rate": 5.0002036266556325e-06,
"loss": 0.7154,
"mean_token_accuracy": 0.7820982575416565,
"step": 5435
},
{
"epoch": 0.9996324880558618,
"grad_norm": 0.9185989598293809,
"learning_rate": 5.000016622607158e-06,
"loss": 0.7066,
"mean_token_accuracy": 0.7812744855880738,
"step": 5440
},
{
"epoch": 1.0,
"step": 5442,
"total_flos": 77507944513536.0,
"train_loss": 0.0,
"train_runtime": 1.7307,
"train_samples_per_second": 12576.308,
"train_steps_per_second": 3144.366
}
],
"logging_steps": 5,
"max_steps": 5442,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 77507944513536.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}