{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5442, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009187798603454612, "grad_norm": 3.0641477925906218, "learning_rate": 9.157509157509158e-07, "loss": 1.1494, "mean_token_accuracy": 0.6977963209152221, "step": 5 }, { "epoch": 0.0018375597206909224, "grad_norm": 3.2105399334572433, "learning_rate": 1.8315018315018316e-06, "loss": 1.1392, "mean_token_accuracy": 0.7049754142761231, "step": 10 }, { "epoch": 0.0027563395810363835, "grad_norm": 2.9642594903358033, "learning_rate": 2.747252747252747e-06, "loss": 1.115, "mean_token_accuracy": 0.7039887428283691, "step": 15 }, { "epoch": 0.003675119441381845, "grad_norm": 2.220104102612778, "learning_rate": 3.663003663003663e-06, "loss": 0.9948, "mean_token_accuracy": 0.7305674076080322, "step": 20 }, { "epoch": 0.004593899301727306, "grad_norm": 1.9105601134372614, "learning_rate": 4.578754578754579e-06, "loss": 0.9409, "mean_token_accuracy": 0.7411547303199768, "step": 25 }, { "epoch": 0.005512679162072767, "grad_norm": 1.8598985707265425, "learning_rate": 5.494505494505494e-06, "loss": 0.9798, "mean_token_accuracy": 0.7281824707984924, "step": 30 }, { "epoch": 0.006431459022418228, "grad_norm": 2.585520808525946, "learning_rate": 6.41025641025641e-06, "loss": 0.9642, "mean_token_accuracy": 0.7303176164627075, "step": 35 }, { "epoch": 0.00735023888276369, "grad_norm": 3.0068202552964665, "learning_rate": 7.326007326007326e-06, "loss": 0.9358, "mean_token_accuracy": 0.7378472447395324, "step": 40 }, { "epoch": 0.008269018743109152, "grad_norm": 1.6811073433803756, "learning_rate": 8.241758241758243e-06, "loss": 0.959, "mean_token_accuracy": 0.7312322854995728, "step": 45 }, { "epoch": 0.009187798603454611, "grad_norm": 1.8266484737246598, "learning_rate": 9.157509157509158e-06, "loss": 0.8436, "mean_token_accuracy": 0.7629137873649597, "step": 50 }, { "epoch": 0.010106578463800073, "grad_norm": 1.7588858560286977, "learning_rate": 1.0073260073260074e-05, "loss": 0.8931, "mean_token_accuracy": 0.7447961091995239, "step": 55 }, { "epoch": 0.011025358324145534, "grad_norm": 1.9424500939228353, "learning_rate": 1.0989010989010989e-05, "loss": 0.9072, "mean_token_accuracy": 0.7389389991760253, "step": 60 }, { "epoch": 0.011944138184490995, "grad_norm": 1.93385540963114, "learning_rate": 1.1904761904761905e-05, "loss": 0.8793, "mean_token_accuracy": 0.748711359500885, "step": 65 }, { "epoch": 0.012862918044836457, "grad_norm": 1.5600948544295363, "learning_rate": 1.282051282051282e-05, "loss": 0.9042, "mean_token_accuracy": 0.7399611473083496, "step": 70 }, { "epoch": 0.013781697905181918, "grad_norm": 1.7742267258287694, "learning_rate": 1.3736263736263738e-05, "loss": 0.9196, "mean_token_accuracy": 0.7337918877601624, "step": 75 }, { "epoch": 0.01470047776552738, "grad_norm": 1.7237265700273807, "learning_rate": 1.4652014652014653e-05, "loss": 0.8143, "mean_token_accuracy": 0.7621858239173889, "step": 80 }, { "epoch": 0.01561925762587284, "grad_norm": 2.032178530068195, "learning_rate": 1.556776556776557e-05, "loss": 0.881, "mean_token_accuracy": 0.7456439375877381, "step": 85 }, { "epoch": 0.016538037486218304, "grad_norm": 1.5476057141233361, "learning_rate": 1.6483516483516486e-05, "loss": 0.774, "mean_token_accuracy": 0.7747476458549499, "step": 90 }, { "epoch": 0.017456817346563763, "grad_norm": 1.7859288763560526, "learning_rate": 1.73992673992674e-05, "loss": 0.8551, "mean_token_accuracy": 0.7515540599822998, "step": 95 }, { "epoch": 0.018375597206909223, "grad_norm": 1.7870239578433738, "learning_rate": 1.8315018315018315e-05, "loss": 0.8048, "mean_token_accuracy": 0.7651323318481446, "step": 100 }, { "epoch": 0.019294377067254686, "grad_norm": 2.0924810885176077, "learning_rate": 1.923076923076923e-05, "loss": 0.8006, "mean_token_accuracy": 0.76546790599823, "step": 105 }, { "epoch": 0.020213156927600145, "grad_norm": 1.7880225651210455, "learning_rate": 2.0146520146520148e-05, "loss": 0.8121, "mean_token_accuracy": 0.7632635951042175, "step": 110 }, { "epoch": 0.02113193678794561, "grad_norm": 1.6627547537831113, "learning_rate": 2.1062271062271064e-05, "loss": 0.7817, "mean_token_accuracy": 0.772648024559021, "step": 115 }, { "epoch": 0.022050716648291068, "grad_norm": 1.820743441798809, "learning_rate": 2.1978021978021977e-05, "loss": 0.8784, "mean_token_accuracy": 0.7446165084838867, "step": 120 }, { "epoch": 0.02296949650863653, "grad_norm": 2.099675060940829, "learning_rate": 2.2893772893772894e-05, "loss": 0.8524, "mean_token_accuracy": 0.749963927268982, "step": 125 }, { "epoch": 0.02388827636898199, "grad_norm": 1.954655640615228, "learning_rate": 2.380952380952381e-05, "loss": 0.8528, "mean_token_accuracy": 0.7490226745605468, "step": 130 }, { "epoch": 0.024807056229327454, "grad_norm": 1.6762777606996013, "learning_rate": 2.4725274725274727e-05, "loss": 0.7488, "mean_token_accuracy": 0.7781027436256409, "step": 135 }, { "epoch": 0.025725836089672913, "grad_norm": 1.818950775045492, "learning_rate": 2.564102564102564e-05, "loss": 0.8932, "mean_token_accuracy": 0.7370708823204041, "step": 140 }, { "epoch": 0.026644615950018376, "grad_norm": 1.7549497495325217, "learning_rate": 2.655677655677656e-05, "loss": 0.8299, "mean_token_accuracy": 0.7562183260917663, "step": 145 }, { "epoch": 0.027563395810363836, "grad_norm": 1.9651810255018103, "learning_rate": 2.7472527472527476e-05, "loss": 0.8961, "mean_token_accuracy": 0.7379685401916504, "step": 150 }, { "epoch": 0.0284821756707093, "grad_norm": 1.7437158607043617, "learning_rate": 2.838827838827839e-05, "loss": 0.8796, "mean_token_accuracy": 0.7450518012046814, "step": 155 }, { "epoch": 0.02940095553105476, "grad_norm": 1.6994422932790823, "learning_rate": 2.9304029304029305e-05, "loss": 0.8178, "mean_token_accuracy": 0.7607359051704407, "step": 160 }, { "epoch": 0.03031973539140022, "grad_norm": 1.858782783599879, "learning_rate": 3.021978021978022e-05, "loss": 0.8074, "mean_token_accuracy": 0.7634272456169129, "step": 165 }, { "epoch": 0.03123851525174568, "grad_norm": 1.8619803118890237, "learning_rate": 3.113553113553114e-05, "loss": 0.8546, "mean_token_accuracy": 0.7492664337158204, "step": 170 }, { "epoch": 0.032157295112091144, "grad_norm": 1.609592048201869, "learning_rate": 3.205128205128206e-05, "loss": 0.7985, "mean_token_accuracy": 0.7690797805786133, "step": 175 }, { "epoch": 0.03307607497243661, "grad_norm": 1.6908537703053188, "learning_rate": 3.296703296703297e-05, "loss": 0.8704, "mean_token_accuracy": 0.744064450263977, "step": 180 }, { "epoch": 0.033994854832782063, "grad_norm": 1.8165162354197588, "learning_rate": 3.3882783882783884e-05, "loss": 0.8849, "mean_token_accuracy": 0.7387139916419982, "step": 185 }, { "epoch": 0.03491363469312753, "grad_norm": 1.8897649307357878, "learning_rate": 3.47985347985348e-05, "loss": 0.9233, "mean_token_accuracy": 0.7323238730430603, "step": 190 }, { "epoch": 0.03583241455347299, "grad_norm": 1.7585678570520256, "learning_rate": 3.571428571428572e-05, "loss": 0.9059, "mean_token_accuracy": 0.7369635701179504, "step": 195 }, { "epoch": 0.036751194413818446, "grad_norm": 1.8196207496857761, "learning_rate": 3.663003663003663e-05, "loss": 0.7984, "mean_token_accuracy": 0.7655532121658325, "step": 200 }, { "epoch": 0.03766997427416391, "grad_norm": 1.8001783116526198, "learning_rate": 3.754578754578755e-05, "loss": 0.9032, "mean_token_accuracy": 0.7356508016586304, "step": 205 }, { "epoch": 0.03858875413450937, "grad_norm": 2.1616041041174143, "learning_rate": 3.846153846153846e-05, "loss": 0.8959, "mean_token_accuracy": 0.7379406213760376, "step": 210 }, { "epoch": 0.039507533994854835, "grad_norm": 1.6869890778597507, "learning_rate": 3.9377289377289376e-05, "loss": 0.7926, "mean_token_accuracy": 0.7656057000160217, "step": 215 }, { "epoch": 0.04042631385520029, "grad_norm": 1.5982107775692642, "learning_rate": 4.0293040293040296e-05, "loss": 0.8742, "mean_token_accuracy": 0.743067741394043, "step": 220 }, { "epoch": 0.041345093715545754, "grad_norm": 1.8744347527261311, "learning_rate": 4.120879120879121e-05, "loss": 0.8344, "mean_token_accuracy": 0.7555591464042664, "step": 225 }, { "epoch": 0.04226387357589122, "grad_norm": 1.7387305953271162, "learning_rate": 4.212454212454213e-05, "loss": 0.8268, "mean_token_accuracy": 0.7600399851799011, "step": 230 }, { "epoch": 0.04318265343623668, "grad_norm": 1.5706104238018215, "learning_rate": 4.304029304029304e-05, "loss": 0.8534, "mean_token_accuracy": 0.7509700894355774, "step": 235 }, { "epoch": 0.044101433296582136, "grad_norm": 2.3765119929284566, "learning_rate": 4.3956043956043955e-05, "loss": 0.833, "mean_token_accuracy": 0.7555877804756165, "step": 240 }, { "epoch": 0.0450202131569276, "grad_norm": 1.8036798399760634, "learning_rate": 4.4871794871794874e-05, "loss": 0.9457, "mean_token_accuracy": 0.7283176898956298, "step": 245 }, { "epoch": 0.04593899301727306, "grad_norm": 1.5687281786415206, "learning_rate": 4.578754578754579e-05, "loss": 0.9033, "mean_token_accuracy": 0.7391088247299195, "step": 250 }, { "epoch": 0.046857772877618525, "grad_norm": 1.906886876307677, "learning_rate": 4.670329670329671e-05, "loss": 0.8814, "mean_token_accuracy": 0.7455077290534973, "step": 255 }, { "epoch": 0.04777655273796398, "grad_norm": 2.459151452178755, "learning_rate": 4.761904761904762e-05, "loss": 0.8482, "mean_token_accuracy": 0.7487390875816345, "step": 260 }, { "epoch": 0.048695332598309445, "grad_norm": 2.090972233335094, "learning_rate": 4.8534798534798533e-05, "loss": 0.8361, "mean_token_accuracy": 0.7550442337989807, "step": 265 }, { "epoch": 0.04961411245865491, "grad_norm": 2.3215263657474834, "learning_rate": 4.945054945054945e-05, "loss": 0.9193, "mean_token_accuracy": 0.7319010257720947, "step": 270 }, { "epoch": 0.05053289231900037, "grad_norm": 1.603682503261401, "learning_rate": 4.999998337739284e-05, "loss": 0.8252, "mean_token_accuracy": 0.7569657444953919, "step": 275 }, { "epoch": 0.05145167217934583, "grad_norm": 1.5710545327372318, "learning_rate": 4.999979637334437e-05, "loss": 0.8146, "mean_token_accuracy": 0.7597472548484803, "step": 280 }, { "epoch": 0.05237045203969129, "grad_norm": 1.7769444956116853, "learning_rate": 4.9999401588721174e-05, "loss": 0.922, "mean_token_accuracy": 0.7304396867752075, "step": 285 }, { "epoch": 0.05328923190003675, "grad_norm": 1.5021517935884932, "learning_rate": 4.999879902716899e-05, "loss": 0.7971, "mean_token_accuracy": 0.7652329564094543, "step": 290 }, { "epoch": 0.05420801176038221, "grad_norm": 1.7950075085630441, "learning_rate": 4.999798869425236e-05, "loss": 0.9554, "mean_token_accuracy": 0.7232900500297547, "step": 295 }, { "epoch": 0.05512679162072767, "grad_norm": 1.5725822266760454, "learning_rate": 4.999697059745451e-05, "loss": 0.8468, "mean_token_accuracy": 0.7524639129638672, "step": 300 }, { "epoch": 0.056045571481073135, "grad_norm": 1.6589539268411269, "learning_rate": 4.999574474617734e-05, "loss": 0.9022, "mean_token_accuracy": 0.7418521285057068, "step": 305 }, { "epoch": 0.0569643513414186, "grad_norm": 1.6096903752899234, "learning_rate": 4.999431115174131e-05, "loss": 0.8665, "mean_token_accuracy": 0.7496641755104065, "step": 310 }, { "epoch": 0.057883131201764054, "grad_norm": 1.4434886028998843, "learning_rate": 4.999266982738535e-05, "loss": 0.9006, "mean_token_accuracy": 0.737899649143219, "step": 315 }, { "epoch": 0.05880191106210952, "grad_norm": 1.3481221265945444, "learning_rate": 4.999082078826671e-05, "loss": 0.8867, "mean_token_accuracy": 0.744273555278778, "step": 320 }, { "epoch": 0.05972069092245498, "grad_norm": 1.4681948954197888, "learning_rate": 4.998876405146087e-05, "loss": 0.9196, "mean_token_accuracy": 0.7311511278152466, "step": 325 }, { "epoch": 0.06063947078280044, "grad_norm": 1.4689641413707268, "learning_rate": 4.998649963596131e-05, "loss": 0.9618, "mean_token_accuracy": 0.7219404339790344, "step": 330 }, { "epoch": 0.0615582506431459, "grad_norm": 1.5996506188069943, "learning_rate": 4.998402756267943e-05, "loss": 0.8775, "mean_token_accuracy": 0.7418051362037659, "step": 335 }, { "epoch": 0.06247703050349136, "grad_norm": 1.3723739821198824, "learning_rate": 4.998134785444425e-05, "loss": 0.8755, "mean_token_accuracy": 0.7445154070854187, "step": 340 }, { "epoch": 0.06339581036383682, "grad_norm": 1.5484060868058451, "learning_rate": 4.997846053600227e-05, "loss": 0.8893, "mean_token_accuracy": 0.7417627811431885, "step": 345 }, { "epoch": 0.06431459022418229, "grad_norm": 1.374032479571469, "learning_rate": 4.997536563401724e-05, "loss": 0.926, "mean_token_accuracy": 0.7293352723121643, "step": 350 }, { "epoch": 0.06523337008452774, "grad_norm": 1.3396355962311042, "learning_rate": 4.9972063177069894e-05, "loss": 0.8363, "mean_token_accuracy": 0.7584239602088928, "step": 355 }, { "epoch": 0.06615214994487321, "grad_norm": 1.4157203063425363, "learning_rate": 4.9968553195657665e-05, "loss": 0.796, "mean_token_accuracy": 0.7692983031272889, "step": 360 }, { "epoch": 0.06707092980521867, "grad_norm": 1.4641176633277067, "learning_rate": 4.9964835722194455e-05, "loss": 0.8386, "mean_token_accuracy": 0.7571163177490234, "step": 365 }, { "epoch": 0.06798970966556413, "grad_norm": 1.3754550053722292, "learning_rate": 4.996091079101028e-05, "loss": 0.8487, "mean_token_accuracy": 0.7535093784332275, "step": 370 }, { "epoch": 0.0689084895259096, "grad_norm": 1.434562106690179, "learning_rate": 4.995677843835103e-05, "loss": 0.9616, "mean_token_accuracy": 0.7227682590484619, "step": 375 }, { "epoch": 0.06982726938625505, "grad_norm": 1.625727387715634, "learning_rate": 4.995243870237803e-05, "loss": 0.8748, "mean_token_accuracy": 0.7452502608299255, "step": 380 }, { "epoch": 0.07074604924660051, "grad_norm": 1.377875493136272, "learning_rate": 4.994789162316778e-05, "loss": 0.8707, "mean_token_accuracy": 0.7500712752342225, "step": 385 }, { "epoch": 0.07166482910694598, "grad_norm": 1.4193673422415896, "learning_rate": 4.994313724271153e-05, "loss": 0.9701, "mean_token_accuracy": 0.720650053024292, "step": 390 }, { "epoch": 0.07258360896729144, "grad_norm": 1.3011757424952435, "learning_rate": 4.993817560491493e-05, "loss": 0.9133, "mean_token_accuracy": 0.7373546719551086, "step": 395 }, { "epoch": 0.07350238882763689, "grad_norm": 1.53341664668217, "learning_rate": 4.993300675559757e-05, "loss": 0.903, "mean_token_accuracy": 0.7406406998634338, "step": 400 }, { "epoch": 0.07442116868798236, "grad_norm": 1.2526811498029908, "learning_rate": 4.9927630742492644e-05, "loss": 0.8457, "mean_token_accuracy": 0.7558913826942444, "step": 405 }, { "epoch": 0.07533994854832782, "grad_norm": 1.2395634856051558, "learning_rate": 4.992204761524641e-05, "loss": 0.7751, "mean_token_accuracy": 0.7746022939682007, "step": 410 }, { "epoch": 0.07625872840867329, "grad_norm": 1.500918488887097, "learning_rate": 4.9916257425417796e-05, "loss": 0.9186, "mean_token_accuracy": 0.7323673367500305, "step": 415 }, { "epoch": 0.07717750826901874, "grad_norm": 1.4971365871256093, "learning_rate": 4.99102602264779e-05, "loss": 0.8465, "mean_token_accuracy": 0.7565265655517578, "step": 420 }, { "epoch": 0.0780962881293642, "grad_norm": 1.2649360311231244, "learning_rate": 4.990405607380953e-05, "loss": 0.9161, "mean_token_accuracy": 0.7346989989280701, "step": 425 }, { "epoch": 0.07901506798970967, "grad_norm": 1.2585193614261605, "learning_rate": 4.9897645024706634e-05, "loss": 0.8489, "mean_token_accuracy": 0.7520861387252807, "step": 430 }, { "epoch": 0.07993384785005513, "grad_norm": 1.4511781114343052, "learning_rate": 4.989102713837381e-05, "loss": 0.8646, "mean_token_accuracy": 0.7488693952560425, "step": 435 }, { "epoch": 0.08085262771040058, "grad_norm": 1.3187982044036815, "learning_rate": 4.9884202475925754e-05, "loss": 0.8395, "mean_token_accuracy": 0.7578373312950134, "step": 440 }, { "epoch": 0.08177140757074605, "grad_norm": 1.3677439877875948, "learning_rate": 4.9877171100386704e-05, "loss": 0.8294, "mean_token_accuracy": 0.7562382102012635, "step": 445 }, { "epoch": 0.08269018743109151, "grad_norm": 1.4162808960942412, "learning_rate": 4.9869933076689826e-05, "loss": 0.9207, "mean_token_accuracy": 0.7346067547798156, "step": 450 }, { "epoch": 0.08360896729143698, "grad_norm": 1.3055956101463557, "learning_rate": 4.9862488471676646e-05, "loss": 0.863, "mean_token_accuracy": 0.749237322807312, "step": 455 }, { "epoch": 0.08452774715178243, "grad_norm": 1.273094086173784, "learning_rate": 4.985483735409643e-05, "loss": 0.869, "mean_token_accuracy": 0.7482211112976074, "step": 460 }, { "epoch": 0.08544652701212789, "grad_norm": 1.4557355335456608, "learning_rate": 4.9846979794605526e-05, "loss": 0.8914, "mean_token_accuracy": 0.7409134864807129, "step": 465 }, { "epoch": 0.08636530687247336, "grad_norm": 1.3769618557958148, "learning_rate": 4.983891586576674e-05, "loss": 0.9477, "mean_token_accuracy": 0.7257415533065796, "step": 470 }, { "epoch": 0.08728408673281882, "grad_norm": 1.2865943966930515, "learning_rate": 4.983064564204864e-05, "loss": 0.8597, "mean_token_accuracy": 0.7478152275085449, "step": 475 }, { "epoch": 0.08820286659316427, "grad_norm": 1.5688420565520587, "learning_rate": 4.98221691998249e-05, "loss": 0.9275, "mean_token_accuracy": 0.7304094076156616, "step": 480 }, { "epoch": 0.08912164645350974, "grad_norm": 1.518182798525247, "learning_rate": 4.9813486617373545e-05, "loss": 0.9003, "mean_token_accuracy": 0.7381687164306641, "step": 485 }, { "epoch": 0.0900404263138552, "grad_norm": 1.2240176455308946, "learning_rate": 4.980459797487629e-05, "loss": 0.8663, "mean_token_accuracy": 0.7481726765632629, "step": 490 }, { "epoch": 0.09095920617420065, "grad_norm": 1.348170741270758, "learning_rate": 4.979550335441776e-05, "loss": 0.9427, "mean_token_accuracy": 0.7272454261779785, "step": 495 }, { "epoch": 0.09187798603454612, "grad_norm": 1.3960656738373651, "learning_rate": 4.978620283998472e-05, "loss": 0.8659, "mean_token_accuracy": 0.7438789248466492, "step": 500 }, { "epoch": 0.09279676589489158, "grad_norm": 1.3587049381306813, "learning_rate": 4.977669651746534e-05, "loss": 0.9216, "mean_token_accuracy": 0.7308396458625793, "step": 505 }, { "epoch": 0.09371554575523705, "grad_norm": 1.2831086518032893, "learning_rate": 4.976698447464839e-05, "loss": 0.8296, "mean_token_accuracy": 0.7564551353454589, "step": 510 }, { "epoch": 0.0946343256155825, "grad_norm": 1.3126315362065073, "learning_rate": 4.975706680122239e-05, "loss": 0.8901, "mean_token_accuracy": 0.7394080519676208, "step": 515 }, { "epoch": 0.09555310547592796, "grad_norm": 1.326727886512097, "learning_rate": 4.9746943588774845e-05, "loss": 0.8862, "mean_token_accuracy": 0.7405213117599487, "step": 520 }, { "epoch": 0.09647188533627343, "grad_norm": 1.1564203773195028, "learning_rate": 4.9736614930791345e-05, "loss": 0.8734, "mean_token_accuracy": 0.7428488969802857, "step": 525 }, { "epoch": 0.09739066519661889, "grad_norm": 1.32950010677836, "learning_rate": 4.972608092265473e-05, "loss": 0.9342, "mean_token_accuracy": 0.7337097644805908, "step": 530 }, { "epoch": 0.09830944505696435, "grad_norm": 1.465381788689711, "learning_rate": 4.971534166164421e-05, "loss": 0.873, "mean_token_accuracy": 0.7453126430511474, "step": 535 }, { "epoch": 0.09922822491730982, "grad_norm": 1.2034166956705263, "learning_rate": 4.970439724693445e-05, "loss": 0.8915, "mean_token_accuracy": 0.7360377907752991, "step": 540 }, { "epoch": 0.10014700477765527, "grad_norm": 1.411243613830083, "learning_rate": 4.969324777959465e-05, "loss": 0.881, "mean_token_accuracy": 0.7421476721763611, "step": 545 }, { "epoch": 0.10106578463800074, "grad_norm": 1.2799706017877437, "learning_rate": 4.968189336258767e-05, "loss": 0.8403, "mean_token_accuracy": 0.7561326265335083, "step": 550 }, { "epoch": 0.1019845644983462, "grad_norm": 1.4044688342965088, "learning_rate": 4.967033410076898e-05, "loss": 0.9382, "mean_token_accuracy": 0.7271358609199524, "step": 555 }, { "epoch": 0.10290334435869165, "grad_norm": 1.4228321929552081, "learning_rate": 4.965857010088579e-05, "loss": 0.7972, "mean_token_accuracy": 0.7670902132987976, "step": 560 }, { "epoch": 0.10382212421903712, "grad_norm": 1.2255950177016557, "learning_rate": 4.964660147157599e-05, "loss": 0.8801, "mean_token_accuracy": 0.7435322165489197, "step": 565 }, { "epoch": 0.10474090407938258, "grad_norm": 1.2075368004666547, "learning_rate": 4.9634428323367184e-05, "loss": 0.807, "mean_token_accuracy": 0.7603043556213379, "step": 570 }, { "epoch": 0.10565968393972804, "grad_norm": 1.2015436573449916, "learning_rate": 4.962205076867567e-05, "loss": 0.8521, "mean_token_accuracy": 0.7509374380111694, "step": 575 }, { "epoch": 0.1065784638000735, "grad_norm": 1.2414360388934507, "learning_rate": 4.96094689218054e-05, "loss": 0.8354, "mean_token_accuracy": 0.7580932378768921, "step": 580 }, { "epoch": 0.10749724366041896, "grad_norm": 1.4551894189161494, "learning_rate": 4.959668289894691e-05, "loss": 0.9427, "mean_token_accuracy": 0.7268964529037476, "step": 585 }, { "epoch": 0.10841602352076442, "grad_norm": 1.1543433517316415, "learning_rate": 4.9583692818176224e-05, "loss": 0.8493, "mean_token_accuracy": 0.7501084446907044, "step": 590 }, { "epoch": 0.10933480338110989, "grad_norm": 1.320648188350521, "learning_rate": 4.9570498799453864e-05, "loss": 0.8808, "mean_token_accuracy": 0.7438010811805725, "step": 595 }, { "epoch": 0.11025358324145534, "grad_norm": 1.2877252561167056, "learning_rate": 4.955710096462362e-05, "loss": 0.8779, "mean_token_accuracy": 0.7458134055137634, "step": 600 }, { "epoch": 0.11117236310180081, "grad_norm": 1.3602330448083289, "learning_rate": 4.954349943741148e-05, "loss": 0.8569, "mean_token_accuracy": 0.7528672575950622, "step": 605 }, { "epoch": 0.11209114296214627, "grad_norm": 1.3741249752237, "learning_rate": 4.952969434342452e-05, "loss": 0.9154, "mean_token_accuracy": 0.7366644501686096, "step": 610 }, { "epoch": 0.11300992282249173, "grad_norm": 1.2731189631033297, "learning_rate": 4.951568581014967e-05, "loss": 0.8101, "mean_token_accuracy": 0.7622666001319885, "step": 615 }, { "epoch": 0.1139287026828372, "grad_norm": 1.3238359092320755, "learning_rate": 4.95014739669526e-05, "loss": 0.8681, "mean_token_accuracy": 0.7467660307884216, "step": 620 }, { "epoch": 0.11484748254318265, "grad_norm": 1.202393469716897, "learning_rate": 4.94870589450765e-05, "loss": 0.8455, "mean_token_accuracy": 0.7547730088233948, "step": 625 }, { "epoch": 0.11576626240352811, "grad_norm": 1.1456259434669744, "learning_rate": 4.9472440877640856e-05, "loss": 0.9136, "mean_token_accuracy": 0.7327568888664245, "step": 630 }, { "epoch": 0.11668504226387358, "grad_norm": 1.3091846984844044, "learning_rate": 4.945761989964025e-05, "loss": 0.8093, "mean_token_accuracy": 0.7623311281204224, "step": 635 }, { "epoch": 0.11760382212421903, "grad_norm": 1.2205761022635602, "learning_rate": 4.9442596147943095e-05, "loss": 0.8025, "mean_token_accuracy": 0.7651844978332519, "step": 640 }, { "epoch": 0.1185226019845645, "grad_norm": 1.3351343961105993, "learning_rate": 4.942736976129035e-05, "loss": 0.8144, "mean_token_accuracy": 0.7613680481910705, "step": 645 }, { "epoch": 0.11944138184490996, "grad_norm": 1.430243553624981, "learning_rate": 4.941194088029431e-05, "loss": 0.9086, "mean_token_accuracy": 0.7342401266098022, "step": 650 }, { "epoch": 0.12036016170525542, "grad_norm": 1.2556349548265657, "learning_rate": 4.939630964743721e-05, "loss": 0.8369, "mean_token_accuracy": 0.7542879939079284, "step": 655 }, { "epoch": 0.12127894156560089, "grad_norm": 1.3152471325239643, "learning_rate": 4.9380476207069984e-05, "loss": 0.8687, "mean_token_accuracy": 0.7439038634300232, "step": 660 }, { "epoch": 0.12219772142594634, "grad_norm": 1.3345822562447953, "learning_rate": 4.936444070541091e-05, "loss": 0.8826, "mean_token_accuracy": 0.7404947400093078, "step": 665 }, { "epoch": 0.1231165012862918, "grad_norm": 1.1814130966039975, "learning_rate": 4.9348203290544245e-05, "loss": 0.8797, "mean_token_accuracy": 0.7429808259010315, "step": 670 }, { "epoch": 0.12403528114663727, "grad_norm": 1.2046715413009972, "learning_rate": 4.933176411241888e-05, "loss": 0.7764, "mean_token_accuracy": 0.7709425568580628, "step": 675 }, { "epoch": 0.12495406100698273, "grad_norm": 1.2359612025083841, "learning_rate": 4.9315123322846934e-05, "loss": 0.8757, "mean_token_accuracy": 0.7420969247817993, "step": 680 }, { "epoch": 0.1258728408673282, "grad_norm": 1.1918939688175052, "learning_rate": 4.929828107550237e-05, "loss": 0.8439, "mean_token_accuracy": 0.7540834426879883, "step": 685 }, { "epoch": 0.12679162072767364, "grad_norm": 1.1826184909342607, "learning_rate": 4.928123752591957e-05, "loss": 0.8801, "mean_token_accuracy": 0.7422125935554504, "step": 690 }, { "epoch": 0.1277104005880191, "grad_norm": 1.255648051492681, "learning_rate": 4.926399283149188e-05, "loss": 0.8429, "mean_token_accuracy": 0.7524136543273926, "step": 695 }, { "epoch": 0.12862918044836458, "grad_norm": 1.1605510458950452, "learning_rate": 4.9246547151470205e-05, "loss": 0.9021, "mean_token_accuracy": 0.7373670816421509, "step": 700 }, { "epoch": 0.12954796030871002, "grad_norm": 1.3238344494004473, "learning_rate": 4.9228900646961474e-05, "loss": 0.9057, "mean_token_accuracy": 0.7368204951286316, "step": 705 }, { "epoch": 0.1304667401690555, "grad_norm": 1.4306839271560785, "learning_rate": 4.921105348092721e-05, "loss": 0.7625, "mean_token_accuracy": 0.7744701862335205, "step": 710 }, { "epoch": 0.13138552002940096, "grad_norm": 1.2165530726795277, "learning_rate": 4.919300581818197e-05, "loss": 0.9154, "mean_token_accuracy": 0.7359979271888732, "step": 715 }, { "epoch": 0.13230429988974643, "grad_norm": 1.286173428804154, "learning_rate": 4.91747578253919e-05, "loss": 0.8585, "mean_token_accuracy": 0.7492180824279785, "step": 720 }, { "epoch": 0.13322307975009187, "grad_norm": 1.1184283075387254, "learning_rate": 4.91563096710731e-05, "loss": 0.8903, "mean_token_accuracy": 0.7384656071662903, "step": 725 }, { "epoch": 0.13414185961043734, "grad_norm": 1.3568563512463876, "learning_rate": 4.913766152559015e-05, "loss": 0.9028, "mean_token_accuracy": 0.7395498275756835, "step": 730 }, { "epoch": 0.1350606394707828, "grad_norm": 1.4808915141658863, "learning_rate": 4.911881356115449e-05, "loss": 0.9084, "mean_token_accuracy": 0.7352772116661072, "step": 735 }, { "epoch": 0.13597941933112825, "grad_norm": 1.2602878053642064, "learning_rate": 4.909976595182285e-05, "loss": 0.8593, "mean_token_accuracy": 0.7459996342658997, "step": 740 }, { "epoch": 0.13689819919147372, "grad_norm": 1.1270873445880305, "learning_rate": 4.908051887349562e-05, "loss": 0.85, "mean_token_accuracy": 0.751087772846222, "step": 745 }, { "epoch": 0.1378169790518192, "grad_norm": 1.2900722126857331, "learning_rate": 4.906107250391527e-05, "loss": 0.8333, "mean_token_accuracy": 0.755375337600708, "step": 750 }, { "epoch": 0.13873575891216464, "grad_norm": 1.3067285908120727, "learning_rate": 4.9041427022664645e-05, "loss": 0.8661, "mean_token_accuracy": 0.7458638072013855, "step": 755 }, { "epoch": 0.1396545387725101, "grad_norm": 1.1705361580698839, "learning_rate": 4.902158261116537e-05, "loss": 0.9127, "mean_token_accuracy": 0.736179769039154, "step": 760 }, { "epoch": 0.14057331863285558, "grad_norm": 1.0599820145259966, "learning_rate": 4.900153945267612e-05, "loss": 0.8433, "mean_token_accuracy": 0.7551376700401307, "step": 765 }, { "epoch": 0.14149209849320102, "grad_norm": 1.282905055646039, "learning_rate": 4.8981297732291e-05, "loss": 0.8554, "mean_token_accuracy": 0.7511672616004944, "step": 770 }, { "epoch": 0.1424108783535465, "grad_norm": 1.3437700886513289, "learning_rate": 4.896085763693773e-05, "loss": 0.9227, "mean_token_accuracy": 0.733566403388977, "step": 775 }, { "epoch": 0.14332965821389196, "grad_norm": 1.1297724344725806, "learning_rate": 4.894021935537603e-05, "loss": 0.8507, "mean_token_accuracy": 0.7513582468032837, "step": 780 }, { "epoch": 0.1442484380742374, "grad_norm": 1.3689332887934986, "learning_rate": 4.891938307819578e-05, "loss": 0.8849, "mean_token_accuracy": 0.7436878085136414, "step": 785 }, { "epoch": 0.14516721793458287, "grad_norm": 1.1498634876050613, "learning_rate": 4.889834899781535e-05, "loss": 0.8429, "mean_token_accuracy": 0.753303873538971, "step": 790 }, { "epoch": 0.14608599779492834, "grad_norm": 1.140208758740646, "learning_rate": 4.887711730847975e-05, "loss": 0.7601, "mean_token_accuracy": 0.773739755153656, "step": 795 }, { "epoch": 0.14700477765527378, "grad_norm": 1.3016614852116664, "learning_rate": 4.885568820625885e-05, "loss": 0.9065, "mean_token_accuracy": 0.738306713104248, "step": 800 }, { "epoch": 0.14792355751561925, "grad_norm": 1.2509551297005208, "learning_rate": 4.883406188904564e-05, "loss": 0.7737, "mean_token_accuracy": 0.7723647236824036, "step": 805 }, { "epoch": 0.14884233737596472, "grad_norm": 1.2304502037360896, "learning_rate": 4.8812238556554284e-05, "loss": 0.9195, "mean_token_accuracy": 0.7318793773651123, "step": 810 }, { "epoch": 0.1497611172363102, "grad_norm": 1.160104764900568, "learning_rate": 4.8790218410318374e-05, "loss": 0.8646, "mean_token_accuracy": 0.7459649324417115, "step": 815 }, { "epoch": 0.15067989709665564, "grad_norm": 1.4673570710895905, "learning_rate": 4.8768001653689024e-05, "loss": 0.9062, "mean_token_accuracy": 0.7345248699188233, "step": 820 }, { "epoch": 0.1515986769570011, "grad_norm": 1.171954013640319, "learning_rate": 4.874558849183299e-05, "loss": 0.8867, "mean_token_accuracy": 0.7401813983917236, "step": 825 }, { "epoch": 0.15251745681734658, "grad_norm": 1.1448163904842934, "learning_rate": 4.872297913173081e-05, "loss": 0.8006, "mean_token_accuracy": 0.7656459212303162, "step": 830 }, { "epoch": 0.15343623667769202, "grad_norm": 1.2603194785671739, "learning_rate": 4.870017378217485e-05, "loss": 0.9037, "mean_token_accuracy": 0.7398361563682556, "step": 835 }, { "epoch": 0.1543550165380375, "grad_norm": 1.3408423677606498, "learning_rate": 4.86771726537674e-05, "loss": 0.9383, "mean_token_accuracy": 0.7285741686820983, "step": 840 }, { "epoch": 0.15527379639838296, "grad_norm": 1.2835325171738854, "learning_rate": 4.865397595891872e-05, "loss": 0.8478, "mean_token_accuracy": 0.75036780834198, "step": 845 }, { "epoch": 0.1561925762587284, "grad_norm": 1.1980068838306206, "learning_rate": 4.8630583911845084e-05, "loss": 0.7627, "mean_token_accuracy": 0.7726967930793762, "step": 850 }, { "epoch": 0.15711135611907387, "grad_norm": 1.34919534477724, "learning_rate": 4.860699672856682e-05, "loss": 0.8838, "mean_token_accuracy": 0.7415394306182861, "step": 855 }, { "epoch": 0.15803013597941934, "grad_norm": 0.9624608396618806, "learning_rate": 4.8583214626906246e-05, "loss": 0.8601, "mean_token_accuracy": 0.7497328519821167, "step": 860 }, { "epoch": 0.15894891583976478, "grad_norm": 1.1697672429495145, "learning_rate": 4.8559237826485766e-05, "loss": 0.8228, "mean_token_accuracy": 0.7570769906044006, "step": 865 }, { "epoch": 0.15986769570011025, "grad_norm": 1.1935364635660837, "learning_rate": 4.853506654872575e-05, "loss": 0.9142, "mean_token_accuracy": 0.7316269755363465, "step": 870 }, { "epoch": 0.16078647556045572, "grad_norm": 1.2042547413733748, "learning_rate": 4.851070101684252e-05, "loss": 0.8742, "mean_token_accuracy": 0.7418438553810119, "step": 875 }, { "epoch": 0.16170525542080116, "grad_norm": 1.0707432913591217, "learning_rate": 4.84861414558463e-05, "loss": 0.8196, "mean_token_accuracy": 0.759805703163147, "step": 880 }, { "epoch": 0.16262403528114663, "grad_norm": 1.0629646526131367, "learning_rate": 4.846138809253914e-05, "loss": 0.874, "mean_token_accuracy": 0.7462024927139282, "step": 885 }, { "epoch": 0.1635428151414921, "grad_norm": 1.2106261509029042, "learning_rate": 4.843644115551279e-05, "loss": 0.9328, "mean_token_accuracy": 0.7267791390419006, "step": 890 }, { "epoch": 0.16446159500183755, "grad_norm": 1.1324590773137175, "learning_rate": 4.841130087514662e-05, "loss": 0.9211, "mean_token_accuracy": 0.7309597492218017, "step": 895 }, { "epoch": 0.16538037486218302, "grad_norm": 1.184689325779188, "learning_rate": 4.8385967483605496e-05, "loss": 0.8618, "mean_token_accuracy": 0.7446626782417297, "step": 900 }, { "epoch": 0.16629915472252849, "grad_norm": 1.138313035203993, "learning_rate": 4.836044121483759e-05, "loss": 0.8447, "mean_token_accuracy": 0.7529171824455261, "step": 905 }, { "epoch": 0.16721793458287396, "grad_norm": 1.1673447148654126, "learning_rate": 4.833472230457229e-05, "loss": 0.8979, "mean_token_accuracy": 0.7358499765396118, "step": 910 }, { "epoch": 0.1681367144432194, "grad_norm": 1.084780317701543, "learning_rate": 4.830881099031795e-05, "loss": 0.9185, "mean_token_accuracy": 0.7328409552574158, "step": 915 }, { "epoch": 0.16905549430356487, "grad_norm": 1.1240239095787248, "learning_rate": 4.828270751135975e-05, "loss": 0.7975, "mean_token_accuracy": 0.7656158566474914, "step": 920 }, { "epoch": 0.16997427416391034, "grad_norm": 1.2150981390103572, "learning_rate": 4.8256412108757466e-05, "loss": 0.9078, "mean_token_accuracy": 0.7345719337463379, "step": 925 }, { "epoch": 0.17089305402425578, "grad_norm": 1.1664463354681245, "learning_rate": 4.822992502534325e-05, "loss": 0.9038, "mean_token_accuracy": 0.7323048114776611, "step": 930 }, { "epoch": 0.17181183388460125, "grad_norm": 1.3331259616500464, "learning_rate": 4.820324650571938e-05, "loss": 0.8287, "mean_token_accuracy": 0.7578937888145447, "step": 935 }, { "epoch": 0.17273061374494672, "grad_norm": 1.2600746932123381, "learning_rate": 4.8176376796256e-05, "loss": 0.9795, "mean_token_accuracy": 0.7109430193901062, "step": 940 }, { "epoch": 0.17364939360529216, "grad_norm": 1.4190547333037917, "learning_rate": 4.814931614508884e-05, "loss": 0.8004, "mean_token_accuracy": 0.7619459390640259, "step": 945 }, { "epoch": 0.17456817346563763, "grad_norm": 1.149443686205735, "learning_rate": 4.812206480211697e-05, "loss": 0.8498, "mean_token_accuracy": 0.7484025120735168, "step": 950 }, { "epoch": 0.1754869533259831, "grad_norm": 1.717641721438028, "learning_rate": 4.809462301900042e-05, "loss": 0.8926, "mean_token_accuracy": 0.7387519717216492, "step": 955 }, { "epoch": 0.17640573318632854, "grad_norm": 1.286493811954347, "learning_rate": 4.806699104915789e-05, "loss": 0.9063, "mean_token_accuracy": 0.733875036239624, "step": 960 }, { "epoch": 0.17732451304667401, "grad_norm": 0.9127620652567309, "learning_rate": 4.803916914776445e-05, "loss": 0.7582, "mean_token_accuracy": 0.7734929203987122, "step": 965 }, { "epoch": 0.17824329290701948, "grad_norm": 1.1385225837750934, "learning_rate": 4.801115757174911e-05, "loss": 0.8003, "mean_token_accuracy": 0.7619274735450745, "step": 970 }, { "epoch": 0.17916207276736493, "grad_norm": 1.0365511688061633, "learning_rate": 4.798295657979249e-05, "loss": 0.8788, "mean_token_accuracy": 0.7446885228157043, "step": 975 }, { "epoch": 0.1800808526277104, "grad_norm": 1.168341096775078, "learning_rate": 4.795456643232444e-05, "loss": 0.8201, "mean_token_accuracy": 0.7583209872245789, "step": 980 }, { "epoch": 0.18099963248805587, "grad_norm": 1.2070861524091874, "learning_rate": 4.79259873915216e-05, "loss": 0.8247, "mean_token_accuracy": 0.754137146472931, "step": 985 }, { "epoch": 0.1819184123484013, "grad_norm": 1.1865394539255603, "learning_rate": 4.789721972130499e-05, "loss": 0.8068, "mean_token_accuracy": 0.7631414651870727, "step": 990 }, { "epoch": 0.18283719220874678, "grad_norm": 1.052924845633483, "learning_rate": 4.7868263687337613e-05, "loss": 0.7659, "mean_token_accuracy": 0.7754044890403747, "step": 995 }, { "epoch": 0.18375597206909225, "grad_norm": 1.1878988268801094, "learning_rate": 4.783911955702196e-05, "loss": 0.8474, "mean_token_accuracy": 0.7484631299972534, "step": 1000 }, { "epoch": 0.18467475192943772, "grad_norm": 1.2268828963125744, "learning_rate": 4.7809787599497504e-05, "loss": 0.8361, "mean_token_accuracy": 0.756050968170166, "step": 1005 }, { "epoch": 0.18559353178978316, "grad_norm": 1.1621815374305198, "learning_rate": 4.778026808563833e-05, "loss": 0.8081, "mean_token_accuracy": 0.7624092817306518, "step": 1010 }, { "epoch": 0.18651231165012863, "grad_norm": 1.3537024766805776, "learning_rate": 4.775056128805051e-05, "loss": 0.8903, "mean_token_accuracy": 0.7347793221473694, "step": 1015 }, { "epoch": 0.1874310915104741, "grad_norm": 1.1053617147333854, "learning_rate": 4.772066748106967e-05, "loss": 0.8345, "mean_token_accuracy": 0.7528262138366699, "step": 1020 }, { "epoch": 0.18834987137081954, "grad_norm": 1.270871735891865, "learning_rate": 4.7690586940758405e-05, "loss": 0.8519, "mean_token_accuracy": 0.7496292948722839, "step": 1025 }, { "epoch": 0.189268651231165, "grad_norm": 1.1951114465300396, "learning_rate": 4.766031994490377e-05, "loss": 0.8632, "mean_token_accuracy": 0.7459157705307007, "step": 1030 }, { "epoch": 0.19018743109151048, "grad_norm": 1.2145135516278585, "learning_rate": 4.762986677301468e-05, "loss": 0.7844, "mean_token_accuracy": 0.7638005137443542, "step": 1035 }, { "epoch": 0.19110621095185593, "grad_norm": 1.2377029681542826, "learning_rate": 4.759922770631935e-05, "loss": 0.8294, "mean_token_accuracy": 0.7549967885017395, "step": 1040 }, { "epoch": 0.1920249908122014, "grad_norm": 1.2317484830433059, "learning_rate": 4.7568403027762696e-05, "loss": 0.7993, "mean_token_accuracy": 0.763549017906189, "step": 1045 }, { "epoch": 0.19294377067254687, "grad_norm": 1.106617224635935, "learning_rate": 4.75373930220037e-05, "loss": 0.8114, "mean_token_accuracy": 0.7601318001747132, "step": 1050 }, { "epoch": 0.1938625505328923, "grad_norm": 1.2406149376598858, "learning_rate": 4.7506197975412826e-05, "loss": 0.901, "mean_token_accuracy": 0.7375799655914307, "step": 1055 }, { "epoch": 0.19478133039323778, "grad_norm": 1.368471676340253, "learning_rate": 4.747481817606933e-05, "loss": 0.9158, "mean_token_accuracy": 0.730099368095398, "step": 1060 }, { "epoch": 0.19570011025358325, "grad_norm": 1.2045592901618265, "learning_rate": 4.7443253913758617e-05, "loss": 0.8766, "mean_token_accuracy": 0.7418853521347046, "step": 1065 }, { "epoch": 0.1966188901139287, "grad_norm": 1.0674799173203142, "learning_rate": 4.741150547996958e-05, "loss": 0.8079, "mean_token_accuracy": 0.763364028930664, "step": 1070 }, { "epoch": 0.19753766997427416, "grad_norm": 1.3569414017684345, "learning_rate": 4.737957316789189e-05, "loss": 0.8038, "mean_token_accuracy": 0.7652618408203125, "step": 1075 }, { "epoch": 0.19845644983461963, "grad_norm": 1.237112155686031, "learning_rate": 4.734745727241328e-05, "loss": 0.9153, "mean_token_accuracy": 0.7342644929885864, "step": 1080 }, { "epoch": 0.19937522969496507, "grad_norm": 1.1267547624767125, "learning_rate": 4.7315158090116854e-05, "loss": 0.8808, "mean_token_accuracy": 0.7401048541069031, "step": 1085 }, { "epoch": 0.20029400955531054, "grad_norm": 1.1393796581940456, "learning_rate": 4.728267591927831e-05, "loss": 0.8232, "mean_token_accuracy": 0.7574564695358277, "step": 1090 }, { "epoch": 0.201212789415656, "grad_norm": 1.0439059841475136, "learning_rate": 4.7250011059863207e-05, "loss": 0.8255, "mean_token_accuracy": 0.7512354731559754, "step": 1095 }, { "epoch": 0.20213156927600148, "grad_norm": 1.1400055909091193, "learning_rate": 4.721716381352422e-05, "loss": 0.8547, "mean_token_accuracy": 0.7499767065048217, "step": 1100 }, { "epoch": 0.20305034913634692, "grad_norm": 1.1315286051266153, "learning_rate": 4.718413448359828e-05, "loss": 0.8083, "mean_token_accuracy": 0.7595677256584168, "step": 1105 }, { "epoch": 0.2039691289966924, "grad_norm": 1.1864811752644395, "learning_rate": 4.715092337510386e-05, "loss": 0.8823, "mean_token_accuracy": 0.7407166361808777, "step": 1110 }, { "epoch": 0.20488790885703786, "grad_norm": 1.1151247170457734, "learning_rate": 4.711753079473809e-05, "loss": 0.8344, "mean_token_accuracy": 0.7524962782859802, "step": 1115 }, { "epoch": 0.2058066887173833, "grad_norm": 1.1211350911528808, "learning_rate": 4.7083957050873965e-05, "loss": 0.8168, "mean_token_accuracy": 0.755139684677124, "step": 1120 }, { "epoch": 0.20672546857772878, "grad_norm": 1.2831690134615248, "learning_rate": 4.705020245355749e-05, "loss": 0.9413, "mean_token_accuracy": 0.72357656955719, "step": 1125 }, { "epoch": 0.20764424843807425, "grad_norm": 1.034972121395733, "learning_rate": 4.701626731450479e-05, "loss": 0.8167, "mean_token_accuracy": 0.7568554997444152, "step": 1130 }, { "epoch": 0.2085630282984197, "grad_norm": 1.012657589652359, "learning_rate": 4.6982151947099276e-05, "loss": 0.833, "mean_token_accuracy": 0.7557546138763428, "step": 1135 }, { "epoch": 0.20948180815876516, "grad_norm": 1.1282926356775929, "learning_rate": 4.694785666638871e-05, "loss": 0.8341, "mean_token_accuracy": 0.7547509074211121, "step": 1140 }, { "epoch": 0.21040058801911063, "grad_norm": 1.0977221337381091, "learning_rate": 4.691338178908232e-05, "loss": 0.8154, "mean_token_accuracy": 0.7610322952270507, "step": 1145 }, { "epoch": 0.21131936787945607, "grad_norm": 1.093844804786045, "learning_rate": 4.687872763354788e-05, "loss": 0.8406, "mean_token_accuracy": 0.7520750164985657, "step": 1150 }, { "epoch": 0.21223814773980154, "grad_norm": 1.0307882599984655, "learning_rate": 4.684389451980873e-05, "loss": 0.7764, "mean_token_accuracy": 0.7720999121665955, "step": 1155 }, { "epoch": 0.213156927600147, "grad_norm": 1.2074018111359583, "learning_rate": 4.680888276954087e-05, "loss": 0.8309, "mean_token_accuracy": 0.7553021907806396, "step": 1160 }, { "epoch": 0.21407570746049245, "grad_norm": 1.0917419046828303, "learning_rate": 4.677369270606997e-05, "loss": 0.8418, "mean_token_accuracy": 0.7502257823944092, "step": 1165 }, { "epoch": 0.21499448732083792, "grad_norm": 1.0820900629957635, "learning_rate": 4.673832465436837e-05, "loss": 0.7671, "mean_token_accuracy": 0.7708743929862976, "step": 1170 }, { "epoch": 0.2159132671811834, "grad_norm": 1.089247755922322, "learning_rate": 4.67027789410521e-05, "loss": 0.8538, "mean_token_accuracy": 0.7494909524917602, "step": 1175 }, { "epoch": 0.21683204704152884, "grad_norm": 1.1197011602210687, "learning_rate": 4.6667055894377857e-05, "loss": 0.8645, "mean_token_accuracy": 0.7444219350814819, "step": 1180 }, { "epoch": 0.2177508269018743, "grad_norm": 1.1249433607043806, "learning_rate": 4.663115584423995e-05, "loss": 0.7794, "mean_token_accuracy": 0.7685939073562622, "step": 1185 }, { "epoch": 0.21866960676221978, "grad_norm": 1.0486000440190792, "learning_rate": 4.659507912216732e-05, "loss": 0.9305, "mean_token_accuracy": 0.7281524419784546, "step": 1190 }, { "epoch": 0.21958838662256525, "grad_norm": 1.167591023080102, "learning_rate": 4.6558826061320384e-05, "loss": 0.7969, "mean_token_accuracy": 0.7660298943519592, "step": 1195 }, { "epoch": 0.2205071664829107, "grad_norm": 1.1633015665730886, "learning_rate": 4.652239699648803e-05, "loss": 0.8005, "mean_token_accuracy": 0.7678845167160034, "step": 1200 }, { "epoch": 0.22142594634325616, "grad_norm": 1.1234875884233444, "learning_rate": 4.648579226408452e-05, "loss": 0.8267, "mean_token_accuracy": 0.7536736965179444, "step": 1205 }, { "epoch": 0.22234472620360163, "grad_norm": 1.1117529930711065, "learning_rate": 4.644901220214634e-05, "loss": 0.8249, "mean_token_accuracy": 0.7595484375953674, "step": 1210 }, { "epoch": 0.22326350606394707, "grad_norm": 1.3136389068951135, "learning_rate": 4.641205715032912e-05, "loss": 0.7867, "mean_token_accuracy": 0.7665369153022766, "step": 1215 }, { "epoch": 0.22418228592429254, "grad_norm": 1.1409276324481026, "learning_rate": 4.637492744990448e-05, "loss": 0.8867, "mean_token_accuracy": 0.7416447997093201, "step": 1220 }, { "epoch": 0.225101065784638, "grad_norm": 1.2345145683805576, "learning_rate": 4.6337623443756866e-05, "loss": 0.7859, "mean_token_accuracy": 0.7682509303092957, "step": 1225 }, { "epoch": 0.22601984564498345, "grad_norm": 1.052220032293985, "learning_rate": 4.630014547638043e-05, "loss": 0.8437, "mean_token_accuracy": 0.7497885942459106, "step": 1230 }, { "epoch": 0.22693862550532892, "grad_norm": 1.1364062731073377, "learning_rate": 4.626249389387577e-05, "loss": 0.7733, "mean_token_accuracy": 0.769334900379181, "step": 1235 }, { "epoch": 0.2278574053656744, "grad_norm": 1.21546684775545, "learning_rate": 4.622466904394683e-05, "loss": 0.8526, "mean_token_accuracy": 0.7492899537086487, "step": 1240 }, { "epoch": 0.22877618522601983, "grad_norm": 1.2541380409672236, "learning_rate": 4.6186671275897615e-05, "loss": 0.8368, "mean_token_accuracy": 0.7558955073356628, "step": 1245 }, { "epoch": 0.2296949650863653, "grad_norm": 1.2101979681857873, "learning_rate": 4.614850094062899e-05, "loss": 0.8771, "mean_token_accuracy": 0.7446130990982056, "step": 1250 }, { "epoch": 0.23061374494671077, "grad_norm": 1.0752792678811776, "learning_rate": 4.6110158390635444e-05, "loss": 0.8294, "mean_token_accuracy": 0.7560481548309326, "step": 1255 }, { "epoch": 0.23153252480705622, "grad_norm": 1.0122734972045033, "learning_rate": 4.6071643980001825e-05, "loss": 0.8331, "mean_token_accuracy": 0.7490222334861756, "step": 1260 }, { "epoch": 0.2324513046674017, "grad_norm": 1.1910590877588172, "learning_rate": 4.603295806440009e-05, "loss": 0.7723, "mean_token_accuracy": 0.7715782880783081, "step": 1265 }, { "epoch": 0.23337008452774716, "grad_norm": 1.0660522556095817, "learning_rate": 4.599410100108598e-05, "loss": 0.8337, "mean_token_accuracy": 0.7520880579948426, "step": 1270 }, { "epoch": 0.2342888643880926, "grad_norm": 0.9720081756723926, "learning_rate": 4.5955073148895784e-05, "loss": 0.8179, "mean_token_accuracy": 0.7595946788787842, "step": 1275 }, { "epoch": 0.23520764424843807, "grad_norm": 1.1185494512129268, "learning_rate": 4.5915874868242944e-05, "loss": 0.8655, "mean_token_accuracy": 0.7462962985038757, "step": 1280 }, { "epoch": 0.23612642410878354, "grad_norm": 1.2897065646338821, "learning_rate": 4.5876506521114805e-05, "loss": 0.8233, "mean_token_accuracy": 0.7591111898422241, "step": 1285 }, { "epoch": 0.237045203969129, "grad_norm": 1.1197184208975648, "learning_rate": 4.583696847106923e-05, "loss": 0.8585, "mean_token_accuracy": 0.7474006295204163, "step": 1290 }, { "epoch": 0.23796398382947445, "grad_norm": 1.2068264298929217, "learning_rate": 4.579726108323123e-05, "loss": 0.9136, "mean_token_accuracy": 0.7314973592758178, "step": 1295 }, { "epoch": 0.23888276368981992, "grad_norm": 1.109906508524664, "learning_rate": 4.5757384724289646e-05, "loss": 0.7947, "mean_token_accuracy": 0.765422809123993, "step": 1300 }, { "epoch": 0.2398015435501654, "grad_norm": 1.2554961190022804, "learning_rate": 4.57173397624937e-05, "loss": 0.8618, "mean_token_accuracy": 0.7456292510032654, "step": 1305 }, { "epoch": 0.24072032341051083, "grad_norm": 1.358639687516225, "learning_rate": 4.567712656764964e-05, "loss": 0.9191, "mean_token_accuracy": 0.734754741191864, "step": 1310 }, { "epoch": 0.2416391032708563, "grad_norm": 1.088229104509902, "learning_rate": 4.5636745511117305e-05, "loss": 0.8064, "mean_token_accuracy": 0.7617093205451966, "step": 1315 }, { "epoch": 0.24255788313120177, "grad_norm": 0.9758498688217456, "learning_rate": 4.559619696580671e-05, "loss": 0.7845, "mean_token_accuracy": 0.7633411526679993, "step": 1320 }, { "epoch": 0.24347666299154722, "grad_norm": 1.2094716352749706, "learning_rate": 4.555548130617455e-05, "loss": 0.7992, "mean_token_accuracy": 0.7611837387084961, "step": 1325 }, { "epoch": 0.24439544285189269, "grad_norm": 1.1427008905651062, "learning_rate": 4.551459890822083e-05, "loss": 0.9158, "mean_token_accuracy": 0.733444607257843, "step": 1330 }, { "epoch": 0.24531422271223816, "grad_norm": 1.1713473006194377, "learning_rate": 4.547355014948534e-05, "loss": 0.845, "mean_token_accuracy": 0.7504712104797363, "step": 1335 }, { "epoch": 0.2462330025725836, "grad_norm": 1.2702712471344686, "learning_rate": 4.543233540904414e-05, "loss": 0.8789, "mean_token_accuracy": 0.7380323767662048, "step": 1340 }, { "epoch": 0.24715178243292907, "grad_norm": 0.9938570050140668, "learning_rate": 4.539095506750614e-05, "loss": 0.8884, "mean_token_accuracy": 0.7428679585456848, "step": 1345 }, { "epoch": 0.24807056229327454, "grad_norm": 1.1279522970582605, "learning_rate": 4.534940950700949e-05, "loss": 0.8753, "mean_token_accuracy": 0.7421611309051513, "step": 1350 }, { "epoch": 0.24898934215361998, "grad_norm": 1.2588052277827508, "learning_rate": 4.530769911121815e-05, "loss": 0.8473, "mean_token_accuracy": 0.7505762934684753, "step": 1355 }, { "epoch": 0.24990812201396545, "grad_norm": 1.096669421502074, "learning_rate": 4.526582426531826e-05, "loss": 0.8353, "mean_token_accuracy": 0.7524473786354064, "step": 1360 }, { "epoch": 0.2508269018743109, "grad_norm": 1.0282246931353665, "learning_rate": 4.5223785356014634e-05, "loss": 0.8702, "mean_token_accuracy": 0.7407379150390625, "step": 1365 }, { "epoch": 0.2517456817346564, "grad_norm": 1.093689037275291, "learning_rate": 4.518158277152717e-05, "loss": 0.8428, "mean_token_accuracy": 0.7473413228988648, "step": 1370 }, { "epoch": 0.25266446159500183, "grad_norm": 1.0139841625812813, "learning_rate": 4.51392169015873e-05, "loss": 0.7938, "mean_token_accuracy": 0.7664546370506287, "step": 1375 }, { "epoch": 0.2535832414553473, "grad_norm": 1.1094571215928501, "learning_rate": 4.509668813743429e-05, "loss": 0.8017, "mean_token_accuracy": 0.7636664628982544, "step": 1380 }, { "epoch": 0.2545020213156928, "grad_norm": 1.2159505822364018, "learning_rate": 4.505399687181178e-05, "loss": 0.8561, "mean_token_accuracy": 0.7478325366973877, "step": 1385 }, { "epoch": 0.2554208011760382, "grad_norm": 1.0494835563541474, "learning_rate": 4.501114349896401e-05, "loss": 0.8611, "mean_token_accuracy": 0.7463506817817688, "step": 1390 }, { "epoch": 0.25633958103638366, "grad_norm": 1.5114697063891234, "learning_rate": 4.496812841463229e-05, "loss": 0.755, "mean_token_accuracy": 0.7759661912918091, "step": 1395 }, { "epoch": 0.25725836089672915, "grad_norm": 1.0714063923864912, "learning_rate": 4.492495201605126e-05, "loss": 0.7358, "mean_token_accuracy": 0.7826925754547119, "step": 1400 }, { "epoch": 0.2581771407570746, "grad_norm": 1.0758074940838653, "learning_rate": 4.4881614701945296e-05, "loss": 0.8875, "mean_token_accuracy": 0.7398916482925415, "step": 1405 }, { "epoch": 0.25909592061742004, "grad_norm": 1.215294771351842, "learning_rate": 4.483811687252477e-05, "loss": 0.8486, "mean_token_accuracy": 0.7489311933517456, "step": 1410 }, { "epoch": 0.26001470047776554, "grad_norm": 1.0794344380786876, "learning_rate": 4.479445892948238e-05, "loss": 0.7227, "mean_token_accuracy": 0.784658420085907, "step": 1415 }, { "epoch": 0.260933480338111, "grad_norm": 1.069221904312748, "learning_rate": 4.4750641275989454e-05, "loss": 0.8486, "mean_token_accuracy": 0.7488225340843201, "step": 1420 }, { "epoch": 0.2618522601984565, "grad_norm": 0.9620708178874104, "learning_rate": 4.470666431669217e-05, "loss": 0.8034, "mean_token_accuracy": 0.764237916469574, "step": 1425 }, { "epoch": 0.2627710400588019, "grad_norm": 1.638007368384327, "learning_rate": 4.4662528457707925e-05, "loss": 0.8552, "mean_token_accuracy": 0.7481104493141174, "step": 1430 }, { "epoch": 0.26368981991914736, "grad_norm": 1.057487407894169, "learning_rate": 4.4618234106621464e-05, "loss": 0.8672, "mean_token_accuracy": 0.7452296495437623, "step": 1435 }, { "epoch": 0.26460859977949286, "grad_norm": 0.9574304406812039, "learning_rate": 4.457378167248117e-05, "loss": 0.834, "mean_token_accuracy": 0.7546884775161743, "step": 1440 }, { "epoch": 0.2655273796398383, "grad_norm": 0.9556755030834319, "learning_rate": 4.452917156579533e-05, "loss": 0.8089, "mean_token_accuracy": 0.7618599176406861, "step": 1445 }, { "epoch": 0.26644615950018374, "grad_norm": 1.0854572977327381, "learning_rate": 4.4484404198528275e-05, "loss": 0.8759, "mean_token_accuracy": 0.7410173654556275, "step": 1450 }, { "epoch": 0.26736493936052924, "grad_norm": 1.2057318993172499, "learning_rate": 4.443947998409658e-05, "loss": 0.8436, "mean_token_accuracy": 0.7513974785804749, "step": 1455 }, { "epoch": 0.2682837192208747, "grad_norm": 1.0376956546110065, "learning_rate": 4.439439933736532e-05, "loss": 0.849, "mean_token_accuracy": 0.7492346167564392, "step": 1460 }, { "epoch": 0.2692024990812201, "grad_norm": 1.1016805520406512, "learning_rate": 4.434916267464416e-05, "loss": 0.7783, "mean_token_accuracy": 0.7683018922805787, "step": 1465 }, { "epoch": 0.2701212789415656, "grad_norm": 1.2408322866975516, "learning_rate": 4.430377041368351e-05, "loss": 0.8772, "mean_token_accuracy": 0.738334059715271, "step": 1470 }, { "epoch": 0.27104005880191107, "grad_norm": 1.090229737310603, "learning_rate": 4.425822297367075e-05, "loss": 0.7981, "mean_token_accuracy": 0.7645934343338012, "step": 1475 }, { "epoch": 0.2719588386622565, "grad_norm": 1.1864254928194882, "learning_rate": 4.4212520775226256e-05, "loss": 0.8155, "mean_token_accuracy": 0.7581284165382385, "step": 1480 }, { "epoch": 0.272877618522602, "grad_norm": 0.9904815545158214, "learning_rate": 4.4166664240399606e-05, "loss": 0.8076, "mean_token_accuracy": 0.7610304713249206, "step": 1485 }, { "epoch": 0.27379639838294745, "grad_norm": 1.0991992028451756, "learning_rate": 4.412065379266559e-05, "loss": 0.9142, "mean_token_accuracy": 0.7303188562393188, "step": 1490 }, { "epoch": 0.2747151782432929, "grad_norm": 1.0413802785892232, "learning_rate": 4.4074489856920406e-05, "loss": 0.8434, "mean_token_accuracy": 0.7503148317337036, "step": 1495 }, { "epoch": 0.2756339581036384, "grad_norm": 0.8788289426619142, "learning_rate": 4.4028172859477626e-05, "loss": 0.7476, "mean_token_accuracy": 0.7781436324119568, "step": 1500 }, { "epoch": 0.27655273796398383, "grad_norm": 1.1409867687324795, "learning_rate": 4.398170322806435e-05, "loss": 0.9066, "mean_token_accuracy": 0.7312582850456237, "step": 1505 }, { "epoch": 0.2774715178243293, "grad_norm": 1.0726549315103535, "learning_rate": 4.3935081391817194e-05, "loss": 0.8533, "mean_token_accuracy": 0.7442232012748718, "step": 1510 }, { "epoch": 0.27839029768467477, "grad_norm": 1.0743736375904043, "learning_rate": 4.388830778127837e-05, "loss": 0.8109, "mean_token_accuracy": 0.755815064907074, "step": 1515 }, { "epoch": 0.2793090775450202, "grad_norm": 1.1737139300743868, "learning_rate": 4.3841382828391684e-05, "loss": 0.783, "mean_token_accuracy": 0.770452618598938, "step": 1520 }, { "epoch": 0.28022785740536565, "grad_norm": 0.9732962716875773, "learning_rate": 4.379430696649856e-05, "loss": 0.8423, "mean_token_accuracy": 0.7509778499603271, "step": 1525 }, { "epoch": 0.28114663726571115, "grad_norm": 1.2143466656736133, "learning_rate": 4.374708063033403e-05, "loss": 0.8262, "mean_token_accuracy": 0.7557825446128845, "step": 1530 }, { "epoch": 0.2820654171260566, "grad_norm": 1.1347429120882544, "learning_rate": 4.369970425602269e-05, "loss": 0.7872, "mean_token_accuracy": 0.7643797039985657, "step": 1535 }, { "epoch": 0.28298419698640204, "grad_norm": 1.0471262947053317, "learning_rate": 4.365217828107476e-05, "loss": 0.8227, "mean_token_accuracy": 0.7580597996711731, "step": 1540 }, { "epoch": 0.28390297684674753, "grad_norm": 0.9720065253460837, "learning_rate": 4.3604503144381964e-05, "loss": 0.8133, "mean_token_accuracy": 0.7570616483688355, "step": 1545 }, { "epoch": 0.284821756707093, "grad_norm": 1.0983353140713001, "learning_rate": 4.3556679286213495e-05, "loss": 0.8416, "mean_token_accuracy": 0.7502852201461792, "step": 1550 }, { "epoch": 0.2857405365674384, "grad_norm": 0.9936040636732534, "learning_rate": 4.3508707148211946e-05, "loss": 0.7351, "mean_token_accuracy": 0.7779555797576905, "step": 1555 }, { "epoch": 0.2866593164277839, "grad_norm": 1.0923713064546872, "learning_rate": 4.3460587173389284e-05, "loss": 0.8502, "mean_token_accuracy": 0.7482675671577453, "step": 1560 }, { "epoch": 0.28757809628812936, "grad_norm": 1.0011444400816414, "learning_rate": 4.341231980612266e-05, "loss": 0.8008, "mean_token_accuracy": 0.7629394650459289, "step": 1565 }, { "epoch": 0.2884968761484748, "grad_norm": 1.057600337329318, "learning_rate": 4.336390549215041e-05, "loss": 0.8052, "mean_token_accuracy": 0.7602485775947571, "step": 1570 }, { "epoch": 0.2894156560088203, "grad_norm": 0.9400172537775919, "learning_rate": 4.331534467856785e-05, "loss": 0.8037, "mean_token_accuracy": 0.7623314976692199, "step": 1575 }, { "epoch": 0.29033443586916574, "grad_norm": 1.1540389749120974, "learning_rate": 4.3266637813823216e-05, "loss": 0.8087, "mean_token_accuracy": 0.7602805018424987, "step": 1580 }, { "epoch": 0.2912532157295112, "grad_norm": 1.1256888029606915, "learning_rate": 4.3217785347713486e-05, "loss": 0.8196, "mean_token_accuracy": 0.7612602710723877, "step": 1585 }, { "epoch": 0.2921719955898567, "grad_norm": 1.1230902967533625, "learning_rate": 4.3168787731380224e-05, "loss": 0.7872, "mean_token_accuracy": 0.7645440459251404, "step": 1590 }, { "epoch": 0.2930907754502021, "grad_norm": 1.0556955876997007, "learning_rate": 4.3119645417305435e-05, "loss": 0.8697, "mean_token_accuracy": 0.7421263337135315, "step": 1595 }, { "epoch": 0.29400955531054757, "grad_norm": 1.2917611391283017, "learning_rate": 4.307035885930736e-05, "loss": 0.7776, "mean_token_accuracy": 0.7674265027046203, "step": 1600 }, { "epoch": 0.29492833517089306, "grad_norm": 1.1648596155514652, "learning_rate": 4.3020928512536326e-05, "loss": 0.7851, "mean_token_accuracy": 0.7669198989868165, "step": 1605 }, { "epoch": 0.2958471150312385, "grad_norm": 1.0970597977003158, "learning_rate": 4.29713548334705e-05, "loss": 0.8279, "mean_token_accuracy": 0.7507619738578797, "step": 1610 }, { "epoch": 0.296765894891584, "grad_norm": 1.054076923212483, "learning_rate": 4.292163827991168e-05, "loss": 0.7722, "mean_token_accuracy": 0.7705003499984742, "step": 1615 }, { "epoch": 0.29768467475192945, "grad_norm": 1.2125775832361394, "learning_rate": 4.2871779310981114e-05, "loss": 0.8192, "mean_token_accuracy": 0.7588199496269226, "step": 1620 }, { "epoch": 0.2986034546122749, "grad_norm": 0.9888172610894965, "learning_rate": 4.282177838711518e-05, "loss": 0.7953, "mean_token_accuracy": 0.7682381868362427, "step": 1625 }, { "epoch": 0.2995222344726204, "grad_norm": 0.9981457336248658, "learning_rate": 4.277163597006121e-05, "loss": 0.824, "mean_token_accuracy": 0.7541024565696717, "step": 1630 }, { "epoch": 0.30044101433296583, "grad_norm": 1.036834210307202, "learning_rate": 4.2721352522873184e-05, "loss": 0.7632, "mean_token_accuracy": 0.7723967909812928, "step": 1635 }, { "epoch": 0.30135979419331127, "grad_norm": 1.1851229887536607, "learning_rate": 4.2670928509907446e-05, "loss": 0.8349, "mean_token_accuracy": 0.7524407744407654, "step": 1640 }, { "epoch": 0.30227857405365677, "grad_norm": 1.0536941977185987, "learning_rate": 4.262036439681847e-05, "loss": 0.8138, "mean_token_accuracy": 0.7575963020324707, "step": 1645 }, { "epoch": 0.3031973539140022, "grad_norm": 1.0372574521222562, "learning_rate": 4.256966065055449e-05, "loss": 0.7325, "mean_token_accuracy": 0.7790537357330323, "step": 1650 }, { "epoch": 0.30411613377434765, "grad_norm": 1.0174389909956805, "learning_rate": 4.251881773935325e-05, "loss": 0.864, "mean_token_accuracy": 0.74665367603302, "step": 1655 }, { "epoch": 0.30503491363469315, "grad_norm": 0.9845084927156172, "learning_rate": 4.246783613273761e-05, "loss": 0.7645, "mean_token_accuracy": 0.7687517642974854, "step": 1660 }, { "epoch": 0.3059536934950386, "grad_norm": 1.0336916342848663, "learning_rate": 4.2416716301511305e-05, "loss": 0.8479, "mean_token_accuracy": 0.7481852293014526, "step": 1665 }, { "epoch": 0.30687247335538403, "grad_norm": 1.0381531094343786, "learning_rate": 4.2365458717754494e-05, "loss": 0.8085, "mean_token_accuracy": 0.75991370677948, "step": 1670 }, { "epoch": 0.30779125321572953, "grad_norm": 1.110015448227854, "learning_rate": 4.231406385481947e-05, "loss": 0.7717, "mean_token_accuracy": 0.7670859694480896, "step": 1675 }, { "epoch": 0.308710033076075, "grad_norm": 1.036136344871459, "learning_rate": 4.226253218732629e-05, "loss": 0.7949, "mean_token_accuracy": 0.7634945988655091, "step": 1680 }, { "epoch": 0.3096288129364204, "grad_norm": 1.0484877630675096, "learning_rate": 4.221086419115832e-05, "loss": 0.8448, "mean_token_accuracy": 0.751638388633728, "step": 1685 }, { "epoch": 0.3105475927967659, "grad_norm": 6.349506903012944, "learning_rate": 4.2159060343457947e-05, "loss": 0.9101, "mean_token_accuracy": 0.7370145440101623, "step": 1690 }, { "epoch": 0.31146637265711136, "grad_norm": 1.1977614262895908, "learning_rate": 4.2107121122622066e-05, "loss": 0.8389, "mean_token_accuracy": 0.7488813638687134, "step": 1695 }, { "epoch": 0.3123851525174568, "grad_norm": 1.2380653768153889, "learning_rate": 4.2055047008297757e-05, "loss": 0.8342, "mean_token_accuracy": 0.7505980730056763, "step": 1700 }, { "epoch": 0.3133039323778023, "grad_norm": 0.9546362693630366, "learning_rate": 4.200283848137777e-05, "loss": 0.7855, "mean_token_accuracy": 0.7642045140266418, "step": 1705 }, { "epoch": 0.31422271223814774, "grad_norm": 1.022441012881404, "learning_rate": 4.195049602399616e-05, "loss": 0.7877, "mean_token_accuracy": 0.7621595740318299, "step": 1710 }, { "epoch": 0.3151414920984932, "grad_norm": 1.0392246094486983, "learning_rate": 4.189802011952378e-05, "loss": 0.878, "mean_token_accuracy": 0.744194757938385, "step": 1715 }, { "epoch": 0.3160602719588387, "grad_norm": 1.077282143260173, "learning_rate": 4.184541125256385e-05, "loss": 0.7917, "mean_token_accuracy": 0.7647501945495605, "step": 1720 }, { "epoch": 0.3169790518191841, "grad_norm": 0.9775790537668483, "learning_rate": 4.1792669908947436e-05, "loss": 0.8597, "mean_token_accuracy": 0.74363933801651, "step": 1725 }, { "epoch": 0.31789783167952956, "grad_norm": 1.1117317785310954, "learning_rate": 4.1739796575729045e-05, "loss": 0.8114, "mean_token_accuracy": 0.7558189272880554, "step": 1730 }, { "epoch": 0.31881661153987506, "grad_norm": 1.1369274014331534, "learning_rate": 4.168679174118205e-05, "loss": 0.8715, "mean_token_accuracy": 0.7428115725517273, "step": 1735 }, { "epoch": 0.3197353914002205, "grad_norm": 1.0490276575831161, "learning_rate": 4.1633655894794206e-05, "loss": 0.8579, "mean_token_accuracy": 0.7467806100845337, "step": 1740 }, { "epoch": 0.32065417126056595, "grad_norm": 1.0117146921952147, "learning_rate": 4.158038952726315e-05, "loss": 0.7832, "mean_token_accuracy": 0.7676323890686035, "step": 1745 }, { "epoch": 0.32157295112091144, "grad_norm": 1.0514207056273204, "learning_rate": 4.1526993130491834e-05, "loss": 0.7417, "mean_token_accuracy": 0.779768443107605, "step": 1750 }, { "epoch": 0.3224917309812569, "grad_norm": 1.1710593436653487, "learning_rate": 4.147346719758401e-05, "loss": 0.759, "mean_token_accuracy": 0.7754043459892273, "step": 1755 }, { "epoch": 0.32341051084160233, "grad_norm": 1.1210033487742597, "learning_rate": 4.141981222283969e-05, "loss": 0.8426, "mean_token_accuracy": 0.7512526273727417, "step": 1760 }, { "epoch": 0.3243292907019478, "grad_norm": 1.067779284913716, "learning_rate": 4.136602870175049e-05, "loss": 0.7312, "mean_token_accuracy": 0.7808745861053467, "step": 1765 }, { "epoch": 0.32524807056229327, "grad_norm": 0.9739576922638749, "learning_rate": 4.131211713099522e-05, "loss": 0.7442, "mean_token_accuracy": 0.7744468688964844, "step": 1770 }, { "epoch": 0.3261668504226387, "grad_norm": 1.013655175975763, "learning_rate": 4.1258078008435103e-05, "loss": 0.7824, "mean_token_accuracy": 0.7647914290428162, "step": 1775 }, { "epoch": 0.3270856302829842, "grad_norm": 0.9850930887046532, "learning_rate": 4.120391183310934e-05, "loss": 0.7274, "mean_token_accuracy": 0.7834605932235718, "step": 1780 }, { "epoch": 0.32800441014332965, "grad_norm": 1.323489905547871, "learning_rate": 4.114961910523042e-05, "loss": 0.8074, "mean_token_accuracy": 0.7612802505493164, "step": 1785 }, { "epoch": 0.3289231900036751, "grad_norm": 1.035219788914723, "learning_rate": 4.109520032617952e-05, "loss": 0.8369, "mean_token_accuracy": 0.7539438486099244, "step": 1790 }, { "epoch": 0.3298419698640206, "grad_norm": 0.984325460018373, "learning_rate": 4.104065599850183e-05, "loss": 0.8593, "mean_token_accuracy": 0.7480033159255981, "step": 1795 }, { "epoch": 0.33076074972436603, "grad_norm": 1.1555611010512028, "learning_rate": 4.098598662590202e-05, "loss": 0.7045, "mean_token_accuracy": 0.7892690062522888, "step": 1800 }, { "epoch": 0.33167952958471153, "grad_norm": 1.0781062858261419, "learning_rate": 4.093119271323947e-05, "loss": 0.8231, "mean_token_accuracy": 0.75406334400177, "step": 1805 }, { "epoch": 0.33259830944505697, "grad_norm": 1.0129880605444779, "learning_rate": 4.0876274766523674e-05, "loss": 0.9059, "mean_token_accuracy": 0.7340885043144226, "step": 1810 }, { "epoch": 0.3335170893054024, "grad_norm": 1.214320088802432, "learning_rate": 4.0821233292909575e-05, "loss": 0.8751, "mean_token_accuracy": 0.7407148957252503, "step": 1815 }, { "epoch": 0.3344358691657479, "grad_norm": 1.000936871191356, "learning_rate": 4.076606880069283e-05, "loss": 0.7856, "mean_token_accuracy": 0.7644298434257507, "step": 1820 }, { "epoch": 0.33535464902609335, "grad_norm": 1.1385038309062536, "learning_rate": 4.0710781799305146e-05, "loss": 0.8165, "mean_token_accuracy": 0.7551571488380432, "step": 1825 }, { "epoch": 0.3362734288864388, "grad_norm": 1.308386913579212, "learning_rate": 4.065537279930961e-05, "loss": 0.8436, "mean_token_accuracy": 0.7464751482009888, "step": 1830 }, { "epoch": 0.3371922087467843, "grad_norm": 1.1854394162632642, "learning_rate": 4.059984231239587e-05, "loss": 0.8499, "mean_token_accuracy": 0.7523553133010864, "step": 1835 }, { "epoch": 0.33811098860712974, "grad_norm": 1.0597911593288654, "learning_rate": 4.054419085137558e-05, "loss": 0.7912, "mean_token_accuracy": 0.7623480677604675, "step": 1840 }, { "epoch": 0.3390297684674752, "grad_norm": 1.1332445452310214, "learning_rate": 4.0488418930177464e-05, "loss": 0.7861, "mean_token_accuracy": 0.7626782655715942, "step": 1845 }, { "epoch": 0.3399485483278207, "grad_norm": 1.1468762268738129, "learning_rate": 4.043252706384273e-05, "loss": 0.8866, "mean_token_accuracy": 0.7364044427871704, "step": 1850 }, { "epoch": 0.3408673281881661, "grad_norm": 1.0468393046787807, "learning_rate": 4.037651576852021e-05, "loss": 0.8192, "mean_token_accuracy": 0.7569101452827454, "step": 1855 }, { "epoch": 0.34178610804851156, "grad_norm": 1.034873991581434, "learning_rate": 4.032038556146167e-05, "loss": 0.7799, "mean_token_accuracy": 0.7652035236358643, "step": 1860 }, { "epoch": 0.34270488790885706, "grad_norm": 1.0816344286074944, "learning_rate": 4.0264136961017e-05, "loss": 0.8062, "mean_token_accuracy": 0.7586339831352233, "step": 1865 }, { "epoch": 0.3436236677692025, "grad_norm": 1.1216437423138468, "learning_rate": 4.020777048662939e-05, "loss": 0.8526, "mean_token_accuracy": 0.7471354722976684, "step": 1870 }, { "epoch": 0.34454244762954794, "grad_norm": 1.2787020788596146, "learning_rate": 4.01512866588306e-05, "loss": 0.8337, "mean_token_accuracy": 0.7524662256240845, "step": 1875 }, { "epoch": 0.34546122748989344, "grad_norm": 0.9726950672685023, "learning_rate": 4.009468599923613e-05, "loss": 0.8254, "mean_token_accuracy": 0.7547502636909484, "step": 1880 }, { "epoch": 0.3463800073502389, "grad_norm": 1.1522393714280965, "learning_rate": 4.0037969030540356e-05, "loss": 0.8788, "mean_token_accuracy": 0.7409179091453553, "step": 1885 }, { "epoch": 0.3472987872105843, "grad_norm": 1.0134958833049312, "learning_rate": 3.9981136276511786e-05, "loss": 0.7966, "mean_token_accuracy": 0.7628639936447144, "step": 1890 }, { "epoch": 0.3482175670709298, "grad_norm": 1.0067212082614574, "learning_rate": 3.992418826198816e-05, "loss": 0.7483, "mean_token_accuracy": 0.7759244441986084, "step": 1895 }, { "epoch": 0.34913634693127527, "grad_norm": 1.1440056397503298, "learning_rate": 3.9867125512871604e-05, "loss": 0.8465, "mean_token_accuracy": 0.7483215093612671, "step": 1900 }, { "epoch": 0.3500551267916207, "grad_norm": 1.015411004374869, "learning_rate": 3.980994855612384e-05, "loss": 0.7634, "mean_token_accuracy": 0.768380320072174, "step": 1905 }, { "epoch": 0.3509739066519662, "grad_norm": 1.0214270916485333, "learning_rate": 3.975265791976122e-05, "loss": 0.8031, "mean_token_accuracy": 0.7592991948127746, "step": 1910 }, { "epoch": 0.35189268651231165, "grad_norm": 1.0257322832506945, "learning_rate": 3.969525413284994e-05, "loss": 0.7808, "mean_token_accuracy": 0.7686658024787902, "step": 1915 }, { "epoch": 0.3528114663726571, "grad_norm": 1.0256575127828191, "learning_rate": 3.96377377255011e-05, "loss": 0.8711, "mean_token_accuracy": 0.7431510090827942, "step": 1920 }, { "epoch": 0.3537302462330026, "grad_norm": 1.2324007321771868, "learning_rate": 3.958010922886582e-05, "loss": 0.8813, "mean_token_accuracy": 0.7428903222084046, "step": 1925 }, { "epoch": 0.35464902609334803, "grad_norm": 1.3413818534531692, "learning_rate": 3.9522369175130345e-05, "loss": 0.8645, "mean_token_accuracy": 0.7381167054176331, "step": 1930 }, { "epoch": 0.35556780595369347, "grad_norm": 1.1450695156011848, "learning_rate": 3.946451809751114e-05, "loss": 0.8475, "mean_token_accuracy": 0.7497512817382812, "step": 1935 }, { "epoch": 0.35648658581403897, "grad_norm": 1.2054216083900955, "learning_rate": 3.9406556530249905e-05, "loss": 0.8103, "mean_token_accuracy": 0.7571905732154847, "step": 1940 }, { "epoch": 0.3574053656743844, "grad_norm": 1.06377210645749, "learning_rate": 3.934848500860875e-05, "loss": 0.7883, "mean_token_accuracy": 0.7618215918540955, "step": 1945 }, { "epoch": 0.35832414553472985, "grad_norm": 1.0312951220792854, "learning_rate": 3.9290304068865144e-05, "loss": 0.8129, "mean_token_accuracy": 0.7582242131233216, "step": 1950 }, { "epoch": 0.35924292539507535, "grad_norm": 0.9872094884682482, "learning_rate": 3.923201424830701e-05, "loss": 0.7861, "mean_token_accuracy": 0.765390944480896, "step": 1955 }, { "epoch": 0.3601617052554208, "grad_norm": 0.9446381855103185, "learning_rate": 3.917361608522778e-05, "loss": 0.8067, "mean_token_accuracy": 0.7581991076469421, "step": 1960 }, { "epoch": 0.36108048511576624, "grad_norm": 1.0047794050440646, "learning_rate": 3.911511011892141e-05, "loss": 0.815, "mean_token_accuracy": 0.7577335119247437, "step": 1965 }, { "epoch": 0.36199926497611173, "grad_norm": 1.0514135855864823, "learning_rate": 3.905649688967736e-05, "loss": 0.8003, "mean_token_accuracy": 0.7607754588127136, "step": 1970 }, { "epoch": 0.3629180448364572, "grad_norm": 0.9011315096836434, "learning_rate": 3.8997776938775664e-05, "loss": 0.8548, "mean_token_accuracy": 0.748826515674591, "step": 1975 }, { "epoch": 0.3638368246968026, "grad_norm": 1.0629251850472634, "learning_rate": 3.893895080848192e-05, "loss": 0.8871, "mean_token_accuracy": 0.7375021696090698, "step": 1980 }, { "epoch": 0.3647556045571481, "grad_norm": 1.002663862883126, "learning_rate": 3.888001904204223e-05, "loss": 0.7724, "mean_token_accuracy": 0.769203269481659, "step": 1985 }, { "epoch": 0.36567438441749356, "grad_norm": 0.9877909023393563, "learning_rate": 3.882098218367826e-05, "loss": 0.7703, "mean_token_accuracy": 0.7695886373519898, "step": 1990 }, { "epoch": 0.36659316427783906, "grad_norm": 1.148691358597791, "learning_rate": 3.876184077858214e-05, "loss": 0.707, "mean_token_accuracy": 0.7888103008270264, "step": 1995 }, { "epoch": 0.3675119441381845, "grad_norm": 0.9571216367735665, "learning_rate": 3.8702595372911524e-05, "loss": 0.7846, "mean_token_accuracy": 0.769954240322113, "step": 2000 }, { "epoch": 0.36843072399852994, "grad_norm": 0.8971436133233381, "learning_rate": 3.86432465137844e-05, "loss": 0.783, "mean_token_accuracy": 0.7654212713241577, "step": 2005 }, { "epoch": 0.36934950385887544, "grad_norm": 1.0855213777834216, "learning_rate": 3.8583794749274197e-05, "loss": 0.7858, "mean_token_accuracy": 0.7648387908935547, "step": 2010 }, { "epoch": 0.3702682837192209, "grad_norm": 1.0016490287283184, "learning_rate": 3.852424062840465e-05, "loss": 0.7997, "mean_token_accuracy": 0.7611153483390808, "step": 2015 }, { "epoch": 0.3711870635795663, "grad_norm": 1.0410544587880413, "learning_rate": 3.846458470114469e-05, "loss": 0.8434, "mean_token_accuracy": 0.745345389842987, "step": 2020 }, { "epoch": 0.3721058434399118, "grad_norm": 1.0734066596059144, "learning_rate": 3.8404827518403424e-05, "loss": 0.8303, "mean_token_accuracy": 0.7534924626350403, "step": 2025 }, { "epoch": 0.37302462330025726, "grad_norm": 1.1067287910564152, "learning_rate": 3.834496963202506e-05, "loss": 0.7138, "mean_token_accuracy": 0.7858679056167602, "step": 2030 }, { "epoch": 0.3739434031606027, "grad_norm": 1.0240680595901026, "learning_rate": 3.828501159478374e-05, "loss": 0.7816, "mean_token_accuracy": 0.767118227481842, "step": 2035 }, { "epoch": 0.3748621830209482, "grad_norm": 0.9108092672977341, "learning_rate": 3.822495396037849e-05, "loss": 0.7888, "mean_token_accuracy": 0.7624866485595703, "step": 2040 }, { "epoch": 0.37578096288129365, "grad_norm": 0.9408803997681696, "learning_rate": 3.816479728342811e-05, "loss": 0.7799, "mean_token_accuracy": 0.7651725172996521, "step": 2045 }, { "epoch": 0.3766997427416391, "grad_norm": 1.0909405717967111, "learning_rate": 3.8104542119466024e-05, "loss": 0.8526, "mean_token_accuracy": 0.7467872500419617, "step": 2050 }, { "epoch": 0.3776185226019846, "grad_norm": 1.0413672136227896, "learning_rate": 3.804418902493515e-05, "loss": 0.8557, "mean_token_accuracy": 0.7429945468902588, "step": 2055 }, { "epoch": 0.37853730246233, "grad_norm": 1.013837379671012, "learning_rate": 3.798373855718281e-05, "loss": 0.7514, "mean_token_accuracy": 0.7755364179611206, "step": 2060 }, { "epoch": 0.37945608232267547, "grad_norm": 1.1123775153381206, "learning_rate": 3.7923191274455485e-05, "loss": 0.8503, "mean_token_accuracy": 0.746312165260315, "step": 2065 }, { "epoch": 0.38037486218302097, "grad_norm": 1.1418197878690564, "learning_rate": 3.786254773589378e-05, "loss": 0.8214, "mean_token_accuracy": 0.7556997299194336, "step": 2070 }, { "epoch": 0.3812936420433664, "grad_norm": 1.067367442039563, "learning_rate": 3.780180850152716e-05, "loss": 0.8306, "mean_token_accuracy": 0.7545937180519104, "step": 2075 }, { "epoch": 0.38221242190371185, "grad_norm": 0.9953541285561653, "learning_rate": 3.774097413226885e-05, "loss": 0.8767, "mean_token_accuracy": 0.7388915061950684, "step": 2080 }, { "epoch": 0.38313120176405735, "grad_norm": 0.9620898865674795, "learning_rate": 3.768004518991061e-05, "loss": 0.8024, "mean_token_accuracy": 0.7610628366470337, "step": 2085 }, { "epoch": 0.3840499816244028, "grad_norm": 1.040454257360025, "learning_rate": 3.761902223711754e-05, "loss": 0.837, "mean_token_accuracy": 0.749622106552124, "step": 2090 }, { "epoch": 0.38496876148474823, "grad_norm": 0.9516590882481228, "learning_rate": 3.755790583742296e-05, "loss": 0.8153, "mean_token_accuracy": 0.758633291721344, "step": 2095 }, { "epoch": 0.38588754134509373, "grad_norm": 1.0838072652279054, "learning_rate": 3.749669655522308e-05, "loss": 0.8902, "mean_token_accuracy": 0.7317674040794373, "step": 2100 }, { "epoch": 0.3868063212054392, "grad_norm": 0.9749499461968839, "learning_rate": 3.743539495577193e-05, "loss": 0.8897, "mean_token_accuracy": 0.739715039730072, "step": 2105 }, { "epoch": 0.3877251010657846, "grad_norm": 0.9972815952612917, "learning_rate": 3.7374001605176026e-05, "loss": 0.7977, "mean_token_accuracy": 0.7625495314598083, "step": 2110 }, { "epoch": 0.3886438809261301, "grad_norm": 0.9037607701703367, "learning_rate": 3.731251707038919e-05, "loss": 0.7822, "mean_token_accuracy": 0.7656629920005799, "step": 2115 }, { "epoch": 0.38956266078647556, "grad_norm": 1.0416251065743547, "learning_rate": 3.725094191920731e-05, "loss": 0.8298, "mean_token_accuracy": 0.7543026089668274, "step": 2120 }, { "epoch": 0.390481440646821, "grad_norm": 0.947462414303698, "learning_rate": 3.7189276720263124e-05, "loss": 0.7782, "mean_token_accuracy": 0.7649134397506714, "step": 2125 }, { "epoch": 0.3914002205071665, "grad_norm": 0.8871190316863635, "learning_rate": 3.712752204302089e-05, "loss": 0.8158, "mean_token_accuracy": 0.7549549221992493, "step": 2130 }, { "epoch": 0.39231900036751194, "grad_norm": 1.0445560118226038, "learning_rate": 3.7065678457771224e-05, "loss": 0.817, "mean_token_accuracy": 0.7530762314796448, "step": 2135 }, { "epoch": 0.3932377802278574, "grad_norm": 0.9686024583555402, "learning_rate": 3.700374653562577e-05, "loss": 0.7923, "mean_token_accuracy": 0.7622018694877625, "step": 2140 }, { "epoch": 0.3941565600882029, "grad_norm": 0.9213469714809804, "learning_rate": 3.694172684851193e-05, "loss": 0.7721, "mean_token_accuracy": 0.7674794912338256, "step": 2145 }, { "epoch": 0.3950753399485483, "grad_norm": 0.9345497817342381, "learning_rate": 3.6879619969167614e-05, "loss": 0.7492, "mean_token_accuracy": 0.776430857181549, "step": 2150 }, { "epoch": 0.39599411980889376, "grad_norm": 1.0666312061499093, "learning_rate": 3.681742647113594e-05, "loss": 0.8168, "mean_token_accuracy": 0.7584180355072021, "step": 2155 }, { "epoch": 0.39691289966923926, "grad_norm": 1.1091479631196337, "learning_rate": 3.67551469287599e-05, "loss": 0.8346, "mean_token_accuracy": 0.7529264330863953, "step": 2160 }, { "epoch": 0.3978316795295847, "grad_norm": 1.0802706313899886, "learning_rate": 3.669278191717712e-05, "loss": 0.8326, "mean_token_accuracy": 0.748162055015564, "step": 2165 }, { "epoch": 0.39875045938993015, "grad_norm": 1.2155117409623286, "learning_rate": 3.6630332012314485e-05, "loss": 0.8257, "mean_token_accuracy": 0.7545464992523193, "step": 2170 }, { "epoch": 0.39966923925027564, "grad_norm": 1.0387043754774363, "learning_rate": 3.656779779088287e-05, "loss": 0.7581, "mean_token_accuracy": 0.769706928730011, "step": 2175 }, { "epoch": 0.4005880191106211, "grad_norm": 0.9698917316969403, "learning_rate": 3.650517983037179e-05, "loss": 0.7412, "mean_token_accuracy": 0.7771862506866455, "step": 2180 }, { "epoch": 0.4015067989709666, "grad_norm": 0.9123240142029096, "learning_rate": 3.6442478709044065e-05, "loss": 0.7079, "mean_token_accuracy": 0.7833864569664002, "step": 2185 }, { "epoch": 0.402425578831312, "grad_norm": 1.0355123737426308, "learning_rate": 3.6379695005930504e-05, "loss": 0.7094, "mean_token_accuracy": 0.7866922855377197, "step": 2190 }, { "epoch": 0.40334435869165747, "grad_norm": 1.1267999437797982, "learning_rate": 3.6316829300824514e-05, "loss": 0.7638, "mean_token_accuracy": 0.7694135665893554, "step": 2195 }, { "epoch": 0.40426313855200297, "grad_norm": 0.9483752352311176, "learning_rate": 3.6253882174276784e-05, "loss": 0.8328, "mean_token_accuracy": 0.7523651957511902, "step": 2200 }, { "epoch": 0.4051819184123484, "grad_norm": 1.081747935405834, "learning_rate": 3.619085420758994e-05, "loss": 0.8821, "mean_token_accuracy": 0.7345280289649964, "step": 2205 }, { "epoch": 0.40610069827269385, "grad_norm": 1.2409774807789806, "learning_rate": 3.612774598281309e-05, "loss": 0.8638, "mean_token_accuracy": 0.7448987007141114, "step": 2210 }, { "epoch": 0.40701947813303935, "grad_norm": 1.0398975018242138, "learning_rate": 3.606455808273656e-05, "loss": 0.7303, "mean_token_accuracy": 0.7799215197563172, "step": 2215 }, { "epoch": 0.4079382579933848, "grad_norm": 1.1058430385137907, "learning_rate": 3.600129109088644e-05, "loss": 0.7463, "mean_token_accuracy": 0.7737818479537963, "step": 2220 }, { "epoch": 0.40885703785373023, "grad_norm": 2.3840075862300405, "learning_rate": 3.593794559151921e-05, "loss": 0.827, "mean_token_accuracy": 0.7540715932846069, "step": 2225 }, { "epoch": 0.40977581771407573, "grad_norm": 1.0043211971634207, "learning_rate": 3.5874522169616346e-05, "loss": 0.8156, "mean_token_accuracy": 0.7552896976470947, "step": 2230 }, { "epoch": 0.41069459757442117, "grad_norm": 1.040415072100305, "learning_rate": 3.581102141087893e-05, "loss": 0.7356, "mean_token_accuracy": 0.7774260997772217, "step": 2235 }, { "epoch": 0.4116133774347666, "grad_norm": 1.0107449068601633, "learning_rate": 3.5747443901722246e-05, "loss": 0.8332, "mean_token_accuracy": 0.7481484651565552, "step": 2240 }, { "epoch": 0.4125321572951121, "grad_norm": 1.0982955854264607, "learning_rate": 3.568379022927032e-05, "loss": 0.8514, "mean_token_accuracy": 0.7456109881401062, "step": 2245 }, { "epoch": 0.41345093715545755, "grad_norm": 0.9554863556690953, "learning_rate": 3.562006098135056e-05, "loss": 0.8014, "mean_token_accuracy": 0.7609956502914429, "step": 2250 }, { "epoch": 0.414369717015803, "grad_norm": 1.076779827899152, "learning_rate": 3.5556256746488256e-05, "loss": 0.7832, "mean_token_accuracy": 0.7661887645721436, "step": 2255 }, { "epoch": 0.4152884968761485, "grad_norm": 0.9529179922047704, "learning_rate": 3.549237811390125e-05, "loss": 0.8153, "mean_token_accuracy": 0.7538660645484925, "step": 2260 }, { "epoch": 0.41620727673649394, "grad_norm": 1.1195875955943517, "learning_rate": 3.542842567349435e-05, "loss": 0.7221, "mean_token_accuracy": 0.7824627161026001, "step": 2265 }, { "epoch": 0.4171260565968394, "grad_norm": 0.92653979162294, "learning_rate": 3.536440001585405e-05, "loss": 0.7777, "mean_token_accuracy": 0.7661702513694764, "step": 2270 }, { "epoch": 0.4180448364571849, "grad_norm": 1.1097040235889317, "learning_rate": 3.5300301732242894e-05, "loss": 0.6985, "mean_token_accuracy": 0.7891063332557678, "step": 2275 }, { "epoch": 0.4189636163175303, "grad_norm": 1.008299555421031, "learning_rate": 3.523613141459418e-05, "loss": 0.7802, "mean_token_accuracy": 0.7641416311264038, "step": 2280 }, { "epoch": 0.41988239617787576, "grad_norm": 1.032289956768311, "learning_rate": 3.5171889655506415e-05, "loss": 0.8249, "mean_token_accuracy": 0.7521484732627869, "step": 2285 }, { "epoch": 0.42080117603822126, "grad_norm": 1.0467412920187205, "learning_rate": 3.510757704823784e-05, "loss": 0.858, "mean_token_accuracy": 0.746031641960144, "step": 2290 }, { "epoch": 0.4217199558985667, "grad_norm": 0.9244613024912548, "learning_rate": 3.5043194186700936e-05, "loss": 0.7074, "mean_token_accuracy": 0.7865028500556945, "step": 2295 }, { "epoch": 0.42263873575891214, "grad_norm": 1.2481817962042026, "learning_rate": 3.4978741665457025e-05, "loss": 0.8653, "mean_token_accuracy": 0.7401462674140931, "step": 2300 }, { "epoch": 0.42355751561925764, "grad_norm": 1.1106698371323385, "learning_rate": 3.4914220079710666e-05, "loss": 0.7935, "mean_token_accuracy": 0.7622707843780517, "step": 2305 }, { "epoch": 0.4244762954796031, "grad_norm": 1.0106898769557273, "learning_rate": 3.484963002530425e-05, "loss": 0.7434, "mean_token_accuracy": 0.7763538002967835, "step": 2310 }, { "epoch": 0.4253950753399485, "grad_norm": 0.8849864404343236, "learning_rate": 3.478497209871245e-05, "loss": 0.6992, "mean_token_accuracy": 0.7923735499382019, "step": 2315 }, { "epoch": 0.426313855200294, "grad_norm": 0.9563259804283883, "learning_rate": 3.472024689703671e-05, "loss": 0.7486, "mean_token_accuracy": 0.7754692554473877, "step": 2320 }, { "epoch": 0.42723263506063947, "grad_norm": 0.9606543568940551, "learning_rate": 3.465545501799976e-05, "loss": 0.7453, "mean_token_accuracy": 0.7748713374137879, "step": 2325 }, { "epoch": 0.4281514149209849, "grad_norm": 0.9591747976619429, "learning_rate": 3.4590597059940075e-05, "loss": 0.7557, "mean_token_accuracy": 0.7716753005981445, "step": 2330 }, { "epoch": 0.4290701947813304, "grad_norm": 1.0062222102856666, "learning_rate": 3.4525673621806365e-05, "loss": 0.7196, "mean_token_accuracy": 0.7826886892318725, "step": 2335 }, { "epoch": 0.42998897464167585, "grad_norm": 0.9978547883457327, "learning_rate": 3.4460685303152014e-05, "loss": 0.7528, "mean_token_accuracy": 0.773649275302887, "step": 2340 }, { "epoch": 0.4309077545020213, "grad_norm": 1.0567272451689054, "learning_rate": 3.4395632704129565e-05, "loss": 0.7358, "mean_token_accuracy": 0.7783871531486511, "step": 2345 }, { "epoch": 0.4318265343623668, "grad_norm": 0.9796518204189265, "learning_rate": 3.43305164254852e-05, "loss": 0.8094, "mean_token_accuracy": 0.7556680321693421, "step": 2350 }, { "epoch": 0.43274531422271223, "grad_norm": 1.2145226004304532, "learning_rate": 3.426533706855314e-05, "loss": 0.8687, "mean_token_accuracy": 0.7412409901618957, "step": 2355 }, { "epoch": 0.43366409408305767, "grad_norm": 1.07424243219785, "learning_rate": 3.420009523525016e-05, "loss": 0.798, "mean_token_accuracy": 0.7611199259757996, "step": 2360 }, { "epoch": 0.43458287394340317, "grad_norm": 0.9910321317767857, "learning_rate": 3.4134791528069924e-05, "loss": 0.7826, "mean_token_accuracy": 0.7638975620269776, "step": 2365 }, { "epoch": 0.4355016538037486, "grad_norm": 0.9247920341694345, "learning_rate": 3.406942655007755e-05, "loss": 0.8644, "mean_token_accuracy": 0.7422878861427307, "step": 2370 }, { "epoch": 0.4364204336640941, "grad_norm": 1.1061133237797016, "learning_rate": 3.400400090490394e-05, "loss": 0.7632, "mean_token_accuracy": 0.7687703609466553, "step": 2375 }, { "epoch": 0.43733921352443955, "grad_norm": 1.0542956872190579, "learning_rate": 3.393851519674027e-05, "loss": 0.7244, "mean_token_accuracy": 0.7811267971992493, "step": 2380 }, { "epoch": 0.438257993384785, "grad_norm": 0.9897649894527137, "learning_rate": 3.387297003033237e-05, "loss": 0.8368, "mean_token_accuracy": 0.7458428382873535, "step": 2385 }, { "epoch": 0.4391767732451305, "grad_norm": 0.9767768078602704, "learning_rate": 3.380736601097514e-05, "loss": 0.764, "mean_token_accuracy": 0.7723331332206727, "step": 2390 }, { "epoch": 0.44009555310547593, "grad_norm": 0.9595716112205526, "learning_rate": 3.374170374450701e-05, "loss": 0.7663, "mean_token_accuracy": 0.7720773100852967, "step": 2395 }, { "epoch": 0.4410143329658214, "grad_norm": 1.076074919473048, "learning_rate": 3.367598383730429e-05, "loss": 0.7088, "mean_token_accuracy": 0.785472309589386, "step": 2400 }, { "epoch": 0.4419331128261669, "grad_norm": 1.082210949676486, "learning_rate": 3.361020689627556e-05, "loss": 0.7326, "mean_token_accuracy": 0.7807153582572937, "step": 2405 }, { "epoch": 0.4428518926865123, "grad_norm": 0.9481216990417377, "learning_rate": 3.354437352885616e-05, "loss": 0.7801, "mean_token_accuracy": 0.7678476572036743, "step": 2410 }, { "epoch": 0.44377067254685776, "grad_norm": 0.9841501438162628, "learning_rate": 3.347848434300244e-05, "loss": 0.774, "mean_token_accuracy": 0.7663671970367432, "step": 2415 }, { "epoch": 0.44468945240720326, "grad_norm": 1.0107998180735964, "learning_rate": 3.341253994718628e-05, "loss": 0.7629, "mean_token_accuracy": 0.7694483995437622, "step": 2420 }, { "epoch": 0.4456082322675487, "grad_norm": 1.0972419453877216, "learning_rate": 3.334654095038939e-05, "loss": 0.8412, "mean_token_accuracy": 0.7457273244857788, "step": 2425 }, { "epoch": 0.44652701212789414, "grad_norm": 1.2432204140882914, "learning_rate": 3.3280487962097696e-05, "loss": 0.8691, "mean_token_accuracy": 0.7431544780731201, "step": 2430 }, { "epoch": 0.44744579198823964, "grad_norm": 0.9401271930787076, "learning_rate": 3.3214381592295743e-05, "loss": 0.7024, "mean_token_accuracy": 0.784889030456543, "step": 2435 }, { "epoch": 0.4483645718485851, "grad_norm": 1.041383480262174, "learning_rate": 3.3148222451461035e-05, "loss": 0.8058, "mean_token_accuracy": 0.7577178955078125, "step": 2440 }, { "epoch": 0.4492833517089305, "grad_norm": 1.0194759744273139, "learning_rate": 3.308201115055841e-05, "loss": 0.712, "mean_token_accuracy": 0.7870323181152343, "step": 2445 }, { "epoch": 0.450202131569276, "grad_norm": 1.4686250705410333, "learning_rate": 3.301574830103437e-05, "loss": 0.8155, "mean_token_accuracy": 0.7529638648033142, "step": 2450 }, { "epoch": 0.45112091142962146, "grad_norm": 1.031770370598432, "learning_rate": 3.294943451481148e-05, "loss": 0.7707, "mean_token_accuracy": 0.7680568814277648, "step": 2455 }, { "epoch": 0.4520396912899669, "grad_norm": 1.0322162169498617, "learning_rate": 3.288307040428269e-05, "loss": 0.7308, "mean_token_accuracy": 0.7787389516830444, "step": 2460 }, { "epoch": 0.4529584711503124, "grad_norm": 0.9740443305657719, "learning_rate": 3.281665658230568e-05, "loss": 0.7369, "mean_token_accuracy": 0.7813670396804809, "step": 2465 }, { "epoch": 0.45387725101065785, "grad_norm": 0.9656209530299383, "learning_rate": 3.2750193662197196e-05, "loss": 0.799, "mean_token_accuracy": 0.7627607464790345, "step": 2470 }, { "epoch": 0.4547960308710033, "grad_norm": 0.9955461958313095, "learning_rate": 3.2683682257727424e-05, "loss": 0.7435, "mean_token_accuracy": 0.7717449307441712, "step": 2475 }, { "epoch": 0.4557148107313488, "grad_norm": 1.0088438877085566, "learning_rate": 3.261712298311425e-05, "loss": 0.8432, "mean_token_accuracy": 0.7495060801506043, "step": 2480 }, { "epoch": 0.4566335905916942, "grad_norm": 0.9852421798418514, "learning_rate": 3.255051645301766e-05, "loss": 0.7598, "mean_token_accuracy": 0.7723948240280152, "step": 2485 }, { "epoch": 0.45755237045203967, "grad_norm": 1.1475584718237826, "learning_rate": 3.2483863282534034e-05, "loss": 0.7946, "mean_token_accuracy": 0.7613343358039856, "step": 2490 }, { "epoch": 0.45847115031238517, "grad_norm": 0.9748547683528608, "learning_rate": 3.241716408719044e-05, "loss": 0.7791, "mean_token_accuracy": 0.7649445414543152, "step": 2495 }, { "epoch": 0.4593899301727306, "grad_norm": 1.0462773904867664, "learning_rate": 3.2350419482939006e-05, "loss": 0.7762, "mean_token_accuracy": 0.7663216352462768, "step": 2500 }, { "epoch": 0.46030871003307605, "grad_norm": 1.0030710057983563, "learning_rate": 3.228363008615117e-05, "loss": 0.8001, "mean_token_accuracy": 0.7575832843780518, "step": 2505 }, { "epoch": 0.46122748989342155, "grad_norm": 0.9883286938382098, "learning_rate": 3.2216796513612063e-05, "loss": 0.7871, "mean_token_accuracy": 0.7624288439750672, "step": 2510 }, { "epoch": 0.462146269753767, "grad_norm": 1.0525039212029965, "learning_rate": 3.214991938251472e-05, "loss": 0.7558, "mean_token_accuracy": 0.7720568418502808, "step": 2515 }, { "epoch": 0.46306504961411243, "grad_norm": 1.0429696791102898, "learning_rate": 3.208299931045446e-05, "loss": 0.7642, "mean_token_accuracy": 0.7731514692306518, "step": 2520 }, { "epoch": 0.46398382947445793, "grad_norm": 1.0242365972013217, "learning_rate": 3.2016036915423145e-05, "loss": 0.7633, "mean_token_accuracy": 0.7699605584144592, "step": 2525 }, { "epoch": 0.4649026093348034, "grad_norm": 1.2324183314573516, "learning_rate": 3.1949032815803475e-05, "loss": 0.7682, "mean_token_accuracy": 0.7663087368011474, "step": 2530 }, { "epoch": 0.4658213891951488, "grad_norm": 0.9943168933618857, "learning_rate": 3.188198763036329e-05, "loss": 0.8362, "mean_token_accuracy": 0.7509650230407715, "step": 2535 }, { "epoch": 0.4667401690554943, "grad_norm": 0.9854698386880044, "learning_rate": 3.181490197824985e-05, "loss": 0.7956, "mean_token_accuracy": 0.7612180948257447, "step": 2540 }, { "epoch": 0.46765894891583976, "grad_norm": 0.960141519847968, "learning_rate": 3.1747776478984096e-05, "loss": 0.7204, "mean_token_accuracy": 0.7808646440505982, "step": 2545 }, { "epoch": 0.4685777287761852, "grad_norm": 1.1016288208270253, "learning_rate": 3.168061175245497e-05, "loss": 0.8181, "mean_token_accuracy": 0.7522975325584411, "step": 2550 }, { "epoch": 0.4694965086365307, "grad_norm": 1.0470917775420199, "learning_rate": 3.1613408418913676e-05, "loss": 0.7684, "mean_token_accuracy": 0.7654074668884278, "step": 2555 }, { "epoch": 0.47041528849687614, "grad_norm": 1.0068385295701907, "learning_rate": 3.154616709896791e-05, "loss": 0.8036, "mean_token_accuracy": 0.7603312849998474, "step": 2560 }, { "epoch": 0.47133406835722164, "grad_norm": 0.9829615470201221, "learning_rate": 3.147888841357619e-05, "loss": 0.813, "mean_token_accuracy": 0.759647810459137, "step": 2565 }, { "epoch": 0.4722528482175671, "grad_norm": 0.8978243887281022, "learning_rate": 3.141157298404211e-05, "loss": 0.7915, "mean_token_accuracy": 0.7597061276435852, "step": 2570 }, { "epoch": 0.4731716280779125, "grad_norm": 1.0947782890080606, "learning_rate": 3.134422143200854e-05, "loss": 0.8269, "mean_token_accuracy": 0.7519834399223327, "step": 2575 }, { "epoch": 0.474090407938258, "grad_norm": 0.9630113626806391, "learning_rate": 3.127683437945199e-05, "loss": 0.8306, "mean_token_accuracy": 0.7524376153945923, "step": 2580 }, { "epoch": 0.47500918779860346, "grad_norm": 0.912625232111843, "learning_rate": 3.120941244867675e-05, "loss": 0.7851, "mean_token_accuracy": 0.7631929993629456, "step": 2585 }, { "epoch": 0.4759279676589489, "grad_norm": 1.0798990597432274, "learning_rate": 3.1141956262309265e-05, "loss": 0.8272, "mean_token_accuracy": 0.7549837350845336, "step": 2590 }, { "epoch": 0.4768467475192944, "grad_norm": 0.9446129759073334, "learning_rate": 3.1074466443292276e-05, "loss": 0.7756, "mean_token_accuracy": 0.7657612562179565, "step": 2595 }, { "epoch": 0.47776552737963984, "grad_norm": 0.9259352218504912, "learning_rate": 3.1006943614879127e-05, "loss": 0.7342, "mean_token_accuracy": 0.7782540440559387, "step": 2600 }, { "epoch": 0.4786843072399853, "grad_norm": 1.0388874910734989, "learning_rate": 3.0939388400628e-05, "loss": 0.8209, "mean_token_accuracy": 0.757353937625885, "step": 2605 }, { "epoch": 0.4796030871003308, "grad_norm": 1.0351799688172598, "learning_rate": 3.087180142439615e-05, "loss": 0.7712, "mean_token_accuracy": 0.7672750115394592, "step": 2610 }, { "epoch": 0.4805218669606762, "grad_norm": 0.994898643969558, "learning_rate": 3.080418331033416e-05, "loss": 0.7542, "mean_token_accuracy": 0.7735359907150269, "step": 2615 }, { "epoch": 0.48144064682102167, "grad_norm": 0.8832020370330693, "learning_rate": 3.073653468288014e-05, "loss": 0.6924, "mean_token_accuracy": 0.792470920085907, "step": 2620 }, { "epoch": 0.48235942668136716, "grad_norm": 1.0051745659888849, "learning_rate": 3.0668856166754014e-05, "loss": 0.8004, "mean_token_accuracy": 0.7600342750549316, "step": 2625 }, { "epoch": 0.4832782065417126, "grad_norm": 1.0055252007007038, "learning_rate": 3.060114838695168e-05, "loss": 0.8243, "mean_token_accuracy": 0.7516715884208679, "step": 2630 }, { "epoch": 0.48419698640205805, "grad_norm": 1.0629225726483997, "learning_rate": 3.0533411968739315e-05, "loss": 0.7152, "mean_token_accuracy": 0.7828492283821106, "step": 2635 }, { "epoch": 0.48511576626240355, "grad_norm": 1.018552086343459, "learning_rate": 3.0465647537647564e-05, "loss": 0.7561, "mean_token_accuracy": 0.7733739614486694, "step": 2640 }, { "epoch": 0.486034546122749, "grad_norm": 1.110183845818711, "learning_rate": 3.0397855719465736e-05, "loss": 0.8057, "mean_token_accuracy": 0.7568628549575805, "step": 2645 }, { "epoch": 0.48695332598309443, "grad_norm": 0.9676140585791341, "learning_rate": 3.0330037140236083e-05, "loss": 0.795, "mean_token_accuracy": 0.7640480756759643, "step": 2650 }, { "epoch": 0.48787210584343993, "grad_norm": 0.9401353123276465, "learning_rate": 3.026219242624797e-05, "loss": 0.8139, "mean_token_accuracy": 0.7546276330947876, "step": 2655 }, { "epoch": 0.48879088570378537, "grad_norm": 1.0235062426914774, "learning_rate": 3.019432220403212e-05, "loss": 0.7659, "mean_token_accuracy": 0.7717217683792115, "step": 2660 }, { "epoch": 0.4897096655641308, "grad_norm": 1.1394523922744142, "learning_rate": 3.012642710035484e-05, "loss": 0.8078, "mean_token_accuracy": 0.7566407918930054, "step": 2665 }, { "epoch": 0.4906284454244763, "grad_norm": 1.059673146029454, "learning_rate": 3.0058507742212162e-05, "loss": 0.7741, "mean_token_accuracy": 0.7690371632575989, "step": 2670 }, { "epoch": 0.49154722528482175, "grad_norm": 0.9607137840570246, "learning_rate": 2.999056475682414e-05, "loss": 0.7948, "mean_token_accuracy": 0.7632219791412354, "step": 2675 }, { "epoch": 0.4924660051451672, "grad_norm": 0.9922439140299888, "learning_rate": 2.9922598771629005e-05, "loss": 0.7874, "mean_token_accuracy": 0.7601206183433533, "step": 2680 }, { "epoch": 0.4933847850055127, "grad_norm": 0.9338793477773762, "learning_rate": 2.9854610414277402e-05, "loss": 0.7693, "mean_token_accuracy": 0.7690744280815125, "step": 2685 }, { "epoch": 0.49430356486585814, "grad_norm": 1.0330996973903106, "learning_rate": 2.9786600312626563e-05, "loss": 0.7401, "mean_token_accuracy": 0.7735617399215698, "step": 2690 }, { "epoch": 0.4952223447262036, "grad_norm": 1.020650007316944, "learning_rate": 2.9718569094734515e-05, "loss": 0.8004, "mean_token_accuracy": 0.7575301885604858, "step": 2695 }, { "epoch": 0.4961411245865491, "grad_norm": 0.8826033529136437, "learning_rate": 2.965051738885432e-05, "loss": 0.7503, "mean_token_accuracy": 0.773734736442566, "step": 2700 }, { "epoch": 0.4970599044468945, "grad_norm": 1.2165920422760765, "learning_rate": 2.958244582342822e-05, "loss": 0.8248, "mean_token_accuracy": 0.7535989284515381, "step": 2705 }, { "epoch": 0.49797868430723996, "grad_norm": 0.978788492706274, "learning_rate": 2.9514355027081846e-05, "loss": 0.7831, "mean_token_accuracy": 0.7655808568000794, "step": 2710 }, { "epoch": 0.49889746416758546, "grad_norm": 0.9950576805013585, "learning_rate": 2.944624562861845e-05, "loss": 0.807, "mean_token_accuracy": 0.7562234044075012, "step": 2715 }, { "epoch": 0.4998162440279309, "grad_norm": 1.0612404822026047, "learning_rate": 2.9378118257013054e-05, "loss": 0.7904, "mean_token_accuracy": 0.7584082007408142, "step": 2720 }, { "epoch": 0.5007350238882764, "grad_norm": 0.915675944397533, "learning_rate": 2.930997354140665e-05, "loss": 0.7464, "mean_token_accuracy": 0.7732234835624695, "step": 2725 }, { "epoch": 0.5016538037486218, "grad_norm": 0.9232965003519343, "learning_rate": 2.9241812111100414e-05, "loss": 0.8088, "mean_token_accuracy": 0.7525614619255065, "step": 2730 }, { "epoch": 0.5025725836089673, "grad_norm": 1.52977648060804, "learning_rate": 2.9173634595549876e-05, "loss": 0.8046, "mean_token_accuracy": 0.7584918379783631, "step": 2735 }, { "epoch": 0.5034913634693128, "grad_norm": 0.9438156907635481, "learning_rate": 2.910544162435909e-05, "loss": 0.832, "mean_token_accuracy": 0.751025402545929, "step": 2740 }, { "epoch": 0.5044101433296582, "grad_norm": 0.9428923251443018, "learning_rate": 2.9037233827274885e-05, "loss": 0.8152, "mean_token_accuracy": 0.7552414298057556, "step": 2745 }, { "epoch": 0.5053289231900037, "grad_norm": 1.0224336975686246, "learning_rate": 2.8969011834180937e-05, "loss": 0.8284, "mean_token_accuracy": 0.7509586930274963, "step": 2750 }, { "epoch": 0.5062477030503492, "grad_norm": 0.985144401173651, "learning_rate": 2.8900776275092083e-05, "loss": 0.7888, "mean_token_accuracy": 0.7600571990013123, "step": 2755 }, { "epoch": 0.5071664829106945, "grad_norm": 0.9789390297665671, "learning_rate": 2.8832527780148406e-05, "loss": 0.7962, "mean_token_accuracy": 0.7582376718521118, "step": 2760 }, { "epoch": 0.50808526277104, "grad_norm": 1.0148219491244725, "learning_rate": 2.8764266979609445e-05, "loss": 0.8469, "mean_token_accuracy": 0.7480961322784424, "step": 2765 }, { "epoch": 0.5090040426313855, "grad_norm": 1.0037544910691434, "learning_rate": 2.8695994503848395e-05, "loss": 0.7421, "mean_token_accuracy": 0.7739938139915467, "step": 2770 }, { "epoch": 0.5099228224917309, "grad_norm": 0.9754806954239216, "learning_rate": 2.8627710983346262e-05, "loss": 0.7697, "mean_token_accuracy": 0.7668745636940002, "step": 2775 }, { "epoch": 0.5108416023520764, "grad_norm": 0.8806986584931391, "learning_rate": 2.855941704868605e-05, "loss": 0.7866, "mean_token_accuracy": 0.7633078217506408, "step": 2780 }, { "epoch": 0.5117603822124219, "grad_norm": 1.091789028436644, "learning_rate": 2.8491113330546925e-05, "loss": 0.8046, "mean_token_accuracy": 0.7567707419395446, "step": 2785 }, { "epoch": 0.5126791620727673, "grad_norm": 1.061632989714619, "learning_rate": 2.8422800459698423e-05, "loss": 0.7922, "mean_token_accuracy": 0.7623422026634217, "step": 2790 }, { "epoch": 0.5135979419331128, "grad_norm": 0.939885286311399, "learning_rate": 2.835447906699457e-05, "loss": 0.7691, "mean_token_accuracy": 0.7693052887916565, "step": 2795 }, { "epoch": 0.5145167217934583, "grad_norm": 0.9959159666772467, "learning_rate": 2.8286149783368132e-05, "loss": 0.7627, "mean_token_accuracy": 0.7660305023193359, "step": 2800 }, { "epoch": 0.5154355016538037, "grad_norm": 1.0033698323194213, "learning_rate": 2.82178132398247e-05, "loss": 0.7825, "mean_token_accuracy": 0.7651003241539002, "step": 2805 }, { "epoch": 0.5163542815141492, "grad_norm": 1.0942052865929373, "learning_rate": 2.8149470067436945e-05, "loss": 0.8091, "mean_token_accuracy": 0.7585999965667725, "step": 2810 }, { "epoch": 0.5172730613744947, "grad_norm": 1.0275503721326753, "learning_rate": 2.8081120897338748e-05, "loss": 0.7622, "mean_token_accuracy": 0.7666819214820861, "step": 2815 }, { "epoch": 0.5181918412348401, "grad_norm": 1.0162262995217026, "learning_rate": 2.8012766360719346e-05, "loss": 0.8351, "mean_token_accuracy": 0.7463697791099548, "step": 2820 }, { "epoch": 0.5191106210951856, "grad_norm": 0.8735638518849154, "learning_rate": 2.794440708881758e-05, "loss": 0.7526, "mean_token_accuracy": 0.7740337014198303, "step": 2825 }, { "epoch": 0.5200294009555311, "grad_norm": 0.8889407557447864, "learning_rate": 2.787604371291599e-05, "loss": 0.686, "mean_token_accuracy": 0.7896162033081054, "step": 2830 }, { "epoch": 0.5209481808158766, "grad_norm": 0.9682433350662344, "learning_rate": 2.780767686433502e-05, "loss": 0.7646, "mean_token_accuracy": 0.7699775457382202, "step": 2835 }, { "epoch": 0.521866960676222, "grad_norm": 1.0567812728267838, "learning_rate": 2.7739307174427204e-05, "loss": 0.769, "mean_token_accuracy": 0.7676406979560852, "step": 2840 }, { "epoch": 0.5227857405365675, "grad_norm": 0.8954213877251977, "learning_rate": 2.767093527457128e-05, "loss": 0.7958, "mean_token_accuracy": 0.7604862689971924, "step": 2845 }, { "epoch": 0.523704520396913, "grad_norm": 0.9391105206530606, "learning_rate": 2.7602561796166426e-05, "loss": 0.7794, "mean_token_accuracy": 0.7647231101989747, "step": 2850 }, { "epoch": 0.5246233002572583, "grad_norm": 0.952523275474733, "learning_rate": 2.753418737062638e-05, "loss": 0.7628, "mean_token_accuracy": 0.7682720065116883, "step": 2855 }, { "epoch": 0.5255420801176038, "grad_norm": 0.9201973347920627, "learning_rate": 2.746581262937363e-05, "loss": 0.7777, "mean_token_accuracy": 0.7642363786697388, "step": 2860 }, { "epoch": 0.5264608599779493, "grad_norm": 0.9335982392105177, "learning_rate": 2.739743820383358e-05, "loss": 0.7338, "mean_token_accuracy": 0.7742905497550965, "step": 2865 }, { "epoch": 0.5273796398382947, "grad_norm": 0.9846844485906041, "learning_rate": 2.732906472542872e-05, "loss": 0.7486, "mean_token_accuracy": 0.7750791192054749, "step": 2870 }, { "epoch": 0.5282984196986402, "grad_norm": 0.9865121950565456, "learning_rate": 2.7260692825572808e-05, "loss": 0.7584, "mean_token_accuracy": 0.7667729616165161, "step": 2875 }, { "epoch": 0.5292171995589857, "grad_norm": 1.0573279084656204, "learning_rate": 2.7192323135664988e-05, "loss": 0.8901, "mean_token_accuracy": 0.7370211601257324, "step": 2880 }, { "epoch": 0.5301359794193311, "grad_norm": 1.0198244813321502, "learning_rate": 2.712395628708402e-05, "loss": 0.7471, "mean_token_accuracy": 0.771734893321991, "step": 2885 }, { "epoch": 0.5310547592796766, "grad_norm": 1.0967316052811202, "learning_rate": 2.7055592911182425e-05, "loss": 0.7543, "mean_token_accuracy": 0.7691154241561889, "step": 2890 }, { "epoch": 0.5319735391400221, "grad_norm": 0.9658875640110149, "learning_rate": 2.6987233639280656e-05, "loss": 0.6979, "mean_token_accuracy": 0.7853469371795654, "step": 2895 }, { "epoch": 0.5328923190003675, "grad_norm": 1.041207894864275, "learning_rate": 2.6918879102661264e-05, "loss": 0.7403, "mean_token_accuracy": 0.7763397812843322, "step": 2900 }, { "epoch": 0.533811098860713, "grad_norm": 0.9959626369836271, "learning_rate": 2.6850529932563057e-05, "loss": 0.7526, "mean_token_accuracy": 0.7707386016845703, "step": 2905 }, { "epoch": 0.5347298787210585, "grad_norm": 0.994177922519465, "learning_rate": 2.6782186760175303e-05, "loss": 0.8229, "mean_token_accuracy": 0.754144036769867, "step": 2910 }, { "epoch": 0.5356486585814039, "grad_norm": 1.0745632939643772, "learning_rate": 2.6713850216631876e-05, "loss": 0.8191, "mean_token_accuracy": 0.7573227047920227, "step": 2915 }, { "epoch": 0.5365674384417494, "grad_norm": 1.0072152395633065, "learning_rate": 2.6645520933005432e-05, "loss": 0.7212, "mean_token_accuracy": 0.7815118074417114, "step": 2920 }, { "epoch": 0.5374862183020949, "grad_norm": 0.9856013490140734, "learning_rate": 2.6577199540301583e-05, "loss": 0.8058, "mean_token_accuracy": 0.7568701386451722, "step": 2925 }, { "epoch": 0.5384049981624403, "grad_norm": 1.0799881365103963, "learning_rate": 2.6508886669453077e-05, "loss": 0.722, "mean_token_accuracy": 0.7820630311965943, "step": 2930 }, { "epoch": 0.5393237780227857, "grad_norm": 0.9808614748561544, "learning_rate": 2.6440582951313958e-05, "loss": 0.7312, "mean_token_accuracy": 0.7761293530464173, "step": 2935 }, { "epoch": 0.5402425578831312, "grad_norm": 0.9988071044503206, "learning_rate": 2.6372289016653747e-05, "loss": 0.8052, "mean_token_accuracy": 0.755142867565155, "step": 2940 }, { "epoch": 0.5411613377434766, "grad_norm": 0.9640983566459411, "learning_rate": 2.6304005496151607e-05, "loss": 0.7501, "mean_token_accuracy": 0.7724974870681762, "step": 2945 }, { "epoch": 0.5420801176038221, "grad_norm": 0.9753132303800915, "learning_rate": 2.6235733020390557e-05, "loss": 0.7507, "mean_token_accuracy": 0.7731342792510987, "step": 2950 }, { "epoch": 0.5429988974641676, "grad_norm": 0.9941788148526961, "learning_rate": 2.6167472219851606e-05, "loss": 0.7777, "mean_token_accuracy": 0.7659435391426086, "step": 2955 }, { "epoch": 0.543917677324513, "grad_norm": 1.0342888870006444, "learning_rate": 2.6099223724907922e-05, "loss": 0.7954, "mean_token_accuracy": 0.7611855626106262, "step": 2960 }, { "epoch": 0.5448364571848585, "grad_norm": 0.9947482977334893, "learning_rate": 2.603098816581907e-05, "loss": 0.79, "mean_token_accuracy": 0.7604100823402404, "step": 2965 }, { "epoch": 0.545755237045204, "grad_norm": 0.9369443584148152, "learning_rate": 2.5962766172725127e-05, "loss": 0.7501, "mean_token_accuracy": 0.7741901755332947, "step": 2970 }, { "epoch": 0.5466740169055494, "grad_norm": 1.018205972168573, "learning_rate": 2.589455837564091e-05, "loss": 0.8146, "mean_token_accuracy": 0.7549449682235718, "step": 2975 }, { "epoch": 0.5475927967658949, "grad_norm": 0.9840855304963227, "learning_rate": 2.5826365404450136e-05, "loss": 0.6928, "mean_token_accuracy": 0.7882686018943786, "step": 2980 }, { "epoch": 0.5485115766262404, "grad_norm": 0.9765299396635874, "learning_rate": 2.57581878888996e-05, "loss": 0.7904, "mean_token_accuracy": 0.7627172827720642, "step": 2985 }, { "epoch": 0.5494303564865858, "grad_norm": 0.9611858171978122, "learning_rate": 2.5690026458593362e-05, "loss": 0.7849, "mean_token_accuracy": 0.7626663684844971, "step": 2990 }, { "epoch": 0.5503491363469313, "grad_norm": 0.9918441521186859, "learning_rate": 2.562188174298695e-05, "loss": 0.7139, "mean_token_accuracy": 0.7834156632423401, "step": 2995 }, { "epoch": 0.5512679162072768, "grad_norm": 0.9168184741389104, "learning_rate": 2.5553754371381555e-05, "loss": 0.7595, "mean_token_accuracy": 0.7676758289337158, "step": 3000 }, { "epoch": 0.5521866960676222, "grad_norm": 0.9307353989568666, "learning_rate": 2.5485644972918153e-05, "loss": 0.7309, "mean_token_accuracy": 0.7800590991973877, "step": 3005 }, { "epoch": 0.5531054759279677, "grad_norm": 0.9683740817546714, "learning_rate": 2.541755417657179e-05, "loss": 0.7913, "mean_token_accuracy": 0.7614364624023438, "step": 3010 }, { "epoch": 0.5540242557883132, "grad_norm": 0.9961298115995415, "learning_rate": 2.5349482611145685e-05, "loss": 0.8041, "mean_token_accuracy": 0.7568534970283508, "step": 3015 }, { "epoch": 0.5549430356486585, "grad_norm": 0.981662259480835, "learning_rate": 2.528143090526549e-05, "loss": 0.6952, "mean_token_accuracy": 0.7897186994552612, "step": 3020 }, { "epoch": 0.555861815509004, "grad_norm": 0.9140182959744487, "learning_rate": 2.5213399687373446e-05, "loss": 0.6967, "mean_token_accuracy": 0.7841851711273193, "step": 3025 }, { "epoch": 0.5567805953693495, "grad_norm": 1.05668077131703, "learning_rate": 2.51453895857226e-05, "loss": 0.751, "mean_token_accuracy": 0.7735855102539062, "step": 3030 }, { "epoch": 0.5576993752296949, "grad_norm": 0.9377501197010149, "learning_rate": 2.5077401228371007e-05, "loss": 0.7319, "mean_token_accuracy": 0.7791807889938355, "step": 3035 }, { "epoch": 0.5586181550900404, "grad_norm": 1.116838452205624, "learning_rate": 2.5009435243175865e-05, "loss": 0.8436, "mean_token_accuracy": 0.7444709777832031, "step": 3040 }, { "epoch": 0.5595369349503859, "grad_norm": 1.133786669142971, "learning_rate": 2.4941492257787847e-05, "loss": 0.7451, "mean_token_accuracy": 0.7729416728019715, "step": 3045 }, { "epoch": 0.5604557148107313, "grad_norm": 1.0531439168923706, "learning_rate": 2.4873572899645164e-05, "loss": 0.7914, "mean_token_accuracy": 0.7595977902412414, "step": 3050 }, { "epoch": 0.5613744946710768, "grad_norm": 0.9370063066983946, "learning_rate": 2.4805677795967874e-05, "loss": 0.7787, "mean_token_accuracy": 0.762716269493103, "step": 3055 }, { "epoch": 0.5622932745314223, "grad_norm": 0.994949145579561, "learning_rate": 2.4737807573752036e-05, "loss": 0.7431, "mean_token_accuracy": 0.7747965931892395, "step": 3060 }, { "epoch": 0.5632120543917677, "grad_norm": 1.0671188110858503, "learning_rate": 2.466996285976393e-05, "loss": 0.7917, "mean_token_accuracy": 0.7583362698554993, "step": 3065 }, { "epoch": 0.5641308342521132, "grad_norm": 0.892710934926214, "learning_rate": 2.4602144280534273e-05, "loss": 0.7498, "mean_token_accuracy": 0.7732946038246155, "step": 3070 }, { "epoch": 0.5650496141124587, "grad_norm": 0.99980677185357, "learning_rate": 2.4534352462352445e-05, "loss": 0.8074, "mean_token_accuracy": 0.7578684329986572, "step": 3075 }, { "epoch": 0.5659683939728041, "grad_norm": 0.9159293875905319, "learning_rate": 2.4466588031260684e-05, "loss": 0.7809, "mean_token_accuracy": 0.7624441385269165, "step": 3080 }, { "epoch": 0.5668871738331496, "grad_norm": 1.0362770618116839, "learning_rate": 2.4398851613048322e-05, "loss": 0.797, "mean_token_accuracy": 0.760871410369873, "step": 3085 }, { "epoch": 0.5678059536934951, "grad_norm": 1.0847148009284608, "learning_rate": 2.4331143833245994e-05, "loss": 0.7395, "mean_token_accuracy": 0.7746615648269654, "step": 3090 }, { "epoch": 0.5687247335538405, "grad_norm": 1.0671537398957074, "learning_rate": 2.426346531711986e-05, "loss": 0.774, "mean_token_accuracy": 0.7641933798789978, "step": 3095 }, { "epoch": 0.569643513414186, "grad_norm": 1.0063509161122495, "learning_rate": 2.4195816689665847e-05, "loss": 0.7038, "mean_token_accuracy": 0.7864096641540528, "step": 3100 }, { "epoch": 0.5705622932745315, "grad_norm": 0.9712630519098367, "learning_rate": 2.4128198575603857e-05, "loss": 0.7839, "mean_token_accuracy": 0.7611940979957581, "step": 3105 }, { "epoch": 0.5714810731348768, "grad_norm": 0.9158850127968227, "learning_rate": 2.4060611599372007e-05, "loss": 0.775, "mean_token_accuracy": 0.7655367732048035, "step": 3110 }, { "epoch": 0.5723998529952223, "grad_norm": 0.9829867717200517, "learning_rate": 2.399305638512089e-05, "loss": 0.8531, "mean_token_accuracy": 0.742165744304657, "step": 3115 }, { "epoch": 0.5733186328555678, "grad_norm": 0.9676209759663041, "learning_rate": 2.3925533556707736e-05, "loss": 0.7669, "mean_token_accuracy": 0.7683526515960694, "step": 3120 }, { "epoch": 0.5742374127159132, "grad_norm": 0.9456863854288068, "learning_rate": 2.385804373769074e-05, "loss": 0.736, "mean_token_accuracy": 0.7773837327957154, "step": 3125 }, { "epoch": 0.5751561925762587, "grad_norm": 1.042769286037687, "learning_rate": 2.3790587551323252e-05, "loss": 0.7869, "mean_token_accuracy": 0.761770761013031, "step": 3130 }, { "epoch": 0.5760749724366042, "grad_norm": 0.898131943412606, "learning_rate": 2.372316562054802e-05, "loss": 0.7311, "mean_token_accuracy": 0.780720841884613, "step": 3135 }, { "epoch": 0.5769937522969496, "grad_norm": 0.937736313205156, "learning_rate": 2.3655778567991456e-05, "loss": 0.8486, "mean_token_accuracy": 0.7416357159614563, "step": 3140 }, { "epoch": 0.5779125321572951, "grad_norm": 0.9983900168625015, "learning_rate": 2.3588427015957904e-05, "loss": 0.7432, "mean_token_accuracy": 0.7713735103607178, "step": 3145 }, { "epoch": 0.5788313120176406, "grad_norm": 1.0896357887586694, "learning_rate": 2.352111158642381e-05, "loss": 0.7843, "mean_token_accuracy": 0.76038818359375, "step": 3150 }, { "epoch": 0.579750091877986, "grad_norm": 0.9547336671541522, "learning_rate": 2.3453832901032097e-05, "loss": 0.7723, "mean_token_accuracy": 0.7679526925086975, "step": 3155 }, { "epoch": 0.5806688717383315, "grad_norm": 0.9486005629151948, "learning_rate": 2.3386591581086333e-05, "loss": 0.6867, "mean_token_accuracy": 0.7872913122177124, "step": 3160 }, { "epoch": 0.581587651598677, "grad_norm": 1.4360094460321793, "learning_rate": 2.3319388247545026e-05, "loss": 0.6946, "mean_token_accuracy": 0.7893529891967773, "step": 3165 }, { "epoch": 0.5825064314590224, "grad_norm": 0.9675717631201467, "learning_rate": 2.325222352101591e-05, "loss": 0.794, "mean_token_accuracy": 0.7627562046051025, "step": 3170 }, { "epoch": 0.5834252113193679, "grad_norm": 0.8611284135924058, "learning_rate": 2.3185098021750163e-05, "loss": 0.7647, "mean_token_accuracy": 0.7697438478469849, "step": 3175 }, { "epoch": 0.5843439911797134, "grad_norm": 1.0945648293831518, "learning_rate": 2.3118012369636715e-05, "loss": 0.7374, "mean_token_accuracy": 0.7741273403167724, "step": 3180 }, { "epoch": 0.5852627710400587, "grad_norm": 0.9850152813442956, "learning_rate": 2.3050967184196526e-05, "loss": 0.7387, "mean_token_accuracy": 0.7777738809585572, "step": 3185 }, { "epoch": 0.5861815509004042, "grad_norm": 0.8639589995274697, "learning_rate": 2.2983963084576854e-05, "loss": 0.77, "mean_token_accuracy": 0.7680123209953308, "step": 3190 }, { "epoch": 0.5871003307607497, "grad_norm": 0.969485320702538, "learning_rate": 2.2917000689545535e-05, "loss": 0.8023, "mean_token_accuracy": 0.759474766254425, "step": 3195 }, { "epoch": 0.5880191106210951, "grad_norm": 0.9691992055808628, "learning_rate": 2.2850080617485286e-05, "loss": 0.7576, "mean_token_accuracy": 0.7699379682540893, "step": 3200 }, { "epoch": 0.5889378904814406, "grad_norm": 0.9709025550626744, "learning_rate": 2.2783203486387945e-05, "loss": 0.764, "mean_token_accuracy": 0.7677761912345886, "step": 3205 }, { "epoch": 0.5898566703417861, "grad_norm": 0.9395191069096172, "learning_rate": 2.2716369913848827e-05, "loss": 0.7572, "mean_token_accuracy": 0.7745106220245361, "step": 3210 }, { "epoch": 0.5907754502021316, "grad_norm": 0.9777159547594203, "learning_rate": 2.2649580517061003e-05, "loss": 0.7136, "mean_token_accuracy": 0.7840847253799439, "step": 3215 }, { "epoch": 0.591694230062477, "grad_norm": 0.9390454307687789, "learning_rate": 2.2582835912809564e-05, "loss": 0.7614, "mean_token_accuracy": 0.7697038054466248, "step": 3220 }, { "epoch": 0.5926130099228225, "grad_norm": 1.0367038259917516, "learning_rate": 2.251613671746598e-05, "loss": 0.7796, "mean_token_accuracy": 0.7627864122390747, "step": 3225 }, { "epoch": 0.593531789783168, "grad_norm": 0.9251948352297976, "learning_rate": 2.2449483546982347e-05, "loss": 0.6893, "mean_token_accuracy": 0.7888349413871765, "step": 3230 }, { "epoch": 0.5944505696435134, "grad_norm": 1.05108523432423, "learning_rate": 2.2382877016885757e-05, "loss": 0.7052, "mean_token_accuracy": 0.7855964303016663, "step": 3235 }, { "epoch": 0.5953693495038589, "grad_norm": 0.9994956513098704, "learning_rate": 2.2316317742272585e-05, "loss": 0.7682, "mean_token_accuracy": 0.7651132106781006, "step": 3240 }, { "epoch": 0.5962881293642044, "grad_norm": 0.9539355388832639, "learning_rate": 2.224980633780281e-05, "loss": 0.7181, "mean_token_accuracy": 0.7789011836051941, "step": 3245 }, { "epoch": 0.5972069092245498, "grad_norm": 0.9721681364733832, "learning_rate": 2.2183343417694334e-05, "loss": 0.7583, "mean_token_accuracy": 0.7710484743118287, "step": 3250 }, { "epoch": 0.5981256890848953, "grad_norm": 1.0132996609635718, "learning_rate": 2.2116929595717317e-05, "loss": 0.7719, "mean_token_accuracy": 0.765598726272583, "step": 3255 }, { "epoch": 0.5990444689452408, "grad_norm": 0.9659020670904003, "learning_rate": 2.205056548518853e-05, "loss": 0.7958, "mean_token_accuracy": 0.7573135375976563, "step": 3260 }, { "epoch": 0.5999632488055862, "grad_norm": 1.0145461160760352, "learning_rate": 2.1984251698965637e-05, "loss": 0.7506, "mean_token_accuracy": 0.7711923003196717, "step": 3265 }, { "epoch": 0.6008820286659317, "grad_norm": 1.0804834048147398, "learning_rate": 2.1917988849441594e-05, "loss": 0.8049, "mean_token_accuracy": 0.755113685131073, "step": 3270 }, { "epoch": 0.6018008085262772, "grad_norm": 0.9733796804471042, "learning_rate": 2.185177754853896e-05, "loss": 0.6773, "mean_token_accuracy": 0.7920406103134155, "step": 3275 }, { "epoch": 0.6027195883866225, "grad_norm": 0.990871804097787, "learning_rate": 2.1785618407704255e-05, "loss": 0.7619, "mean_token_accuracy": 0.7680476665496826, "step": 3280 }, { "epoch": 0.603638368246968, "grad_norm": 0.9094240503163677, "learning_rate": 2.1719512037902306e-05, "loss": 0.758, "mean_token_accuracy": 0.7682316303253174, "step": 3285 }, { "epoch": 0.6045571481073135, "grad_norm": 0.9504426996357046, "learning_rate": 2.1653459049610618e-05, "loss": 0.7037, "mean_token_accuracy": 0.7844570279121399, "step": 3290 }, { "epoch": 0.6054759279676589, "grad_norm": 1.0419237413786735, "learning_rate": 2.1587460052813724e-05, "loss": 0.7797, "mean_token_accuracy": 0.7651678204536438, "step": 3295 }, { "epoch": 0.6063947078280044, "grad_norm": 1.0189296741711382, "learning_rate": 2.1521515656997567e-05, "loss": 0.8125, "mean_token_accuracy": 0.7538291454315186, "step": 3300 }, { "epoch": 0.6073134876883499, "grad_norm": 0.9647782169864347, "learning_rate": 2.145562647114386e-05, "loss": 0.7115, "mean_token_accuracy": 0.7819002747535706, "step": 3305 }, { "epoch": 0.6082322675486953, "grad_norm": 0.8719676861547915, "learning_rate": 2.1389793103724443e-05, "loss": 0.7175, "mean_token_accuracy": 0.7793567061424256, "step": 3310 }, { "epoch": 0.6091510474090408, "grad_norm": 0.9761609575734019, "learning_rate": 2.1324016162695722e-05, "loss": 0.6784, "mean_token_accuracy": 0.7919653534889222, "step": 3315 }, { "epoch": 0.6100698272693863, "grad_norm": 0.8974448563579739, "learning_rate": 2.125829625549299e-05, "loss": 0.6786, "mean_token_accuracy": 0.7931641936302185, "step": 3320 }, { "epoch": 0.6109886071297317, "grad_norm": 1.0099374622071293, "learning_rate": 2.1192633989024856e-05, "loss": 0.8367, "mean_token_accuracy": 0.7453663229942322, "step": 3325 }, { "epoch": 0.6119073869900772, "grad_norm": 1.0422892359273228, "learning_rate": 2.112702996966764e-05, "loss": 0.7187, "mean_token_accuracy": 0.7798493385314942, "step": 3330 }, { "epoch": 0.6128261668504227, "grad_norm": 1.1083604247420085, "learning_rate": 2.106148480325974e-05, "loss": 0.7806, "mean_token_accuracy": 0.761151397228241, "step": 3335 }, { "epoch": 0.6137449467107681, "grad_norm": 1.0270421311335494, "learning_rate": 2.0995999095096068e-05, "loss": 0.7843, "mean_token_accuracy": 0.7627219676971435, "step": 3340 }, { "epoch": 0.6146637265711136, "grad_norm": 1.215757454497741, "learning_rate": 2.0930573449922457e-05, "loss": 0.7597, "mean_token_accuracy": 0.769752562046051, "step": 3345 }, { "epoch": 0.6155825064314591, "grad_norm": 1.2153983619499056, "learning_rate": 2.086520847193008e-05, "loss": 0.7792, "mean_token_accuracy": 0.7656338334083557, "step": 3350 }, { "epoch": 0.6165012862918045, "grad_norm": 0.952171476221175, "learning_rate": 2.079990476474985e-05, "loss": 0.683, "mean_token_accuracy": 0.7914249539375305, "step": 3355 }, { "epoch": 0.61742006615215, "grad_norm": 1.037769469357623, "learning_rate": 2.0734662931446858e-05, "loss": 0.7692, "mean_token_accuracy": 0.7642071366310119, "step": 3360 }, { "epoch": 0.6183388460124954, "grad_norm": 0.9750207162668445, "learning_rate": 2.0669483574514807e-05, "loss": 0.8355, "mean_token_accuracy": 0.7468725085258484, "step": 3365 }, { "epoch": 0.6192576258728408, "grad_norm": 1.0187302049829796, "learning_rate": 2.060436729587044e-05, "loss": 0.7502, "mean_token_accuracy": 0.7742531776428223, "step": 3370 }, { "epoch": 0.6201764057331863, "grad_norm": 0.943777765061105, "learning_rate": 2.0539314696848e-05, "loss": 0.7062, "mean_token_accuracy": 0.7855054616928101, "step": 3375 }, { "epoch": 0.6210951855935318, "grad_norm": 0.8930105332009788, "learning_rate": 2.0474326378193637e-05, "loss": 0.7458, "mean_token_accuracy": 0.773654580116272, "step": 3380 }, { "epoch": 0.6220139654538772, "grad_norm": 0.9035160403431316, "learning_rate": 2.0409402940059937e-05, "loss": 0.7268, "mean_token_accuracy": 0.7792444586753845, "step": 3385 }, { "epoch": 0.6229327453142227, "grad_norm": 1.0410393906012252, "learning_rate": 2.0344544982000246e-05, "loss": 0.7038, "mean_token_accuracy": 0.7828059315681457, "step": 3390 }, { "epoch": 0.6238515251745682, "grad_norm": 0.9123527907550557, "learning_rate": 2.0279753102963296e-05, "loss": 0.667, "mean_token_accuracy": 0.7945937156677246, "step": 3395 }, { "epoch": 0.6247703050349136, "grad_norm": 1.0453020521936442, "learning_rate": 2.0215027901287555e-05, "loss": 0.7062, "mean_token_accuracy": 0.7823508858680726, "step": 3400 }, { "epoch": 0.6256890848952591, "grad_norm": 0.9075834943890148, "learning_rate": 2.0150369974695755e-05, "loss": 0.7027, "mean_token_accuracy": 0.7846097946166992, "step": 3405 }, { "epoch": 0.6266078647556046, "grad_norm": 0.9405566568561052, "learning_rate": 2.008577992028934e-05, "loss": 0.7387, "mean_token_accuracy": 0.7755053520202637, "step": 3410 }, { "epoch": 0.62752664461595, "grad_norm": 0.9311470578940665, "learning_rate": 2.0021258334542987e-05, "loss": 0.7867, "mean_token_accuracy": 0.7582219243049622, "step": 3415 }, { "epoch": 0.6284454244762955, "grad_norm": 0.9465824966191277, "learning_rate": 1.9956805813299066e-05, "loss": 0.7295, "mean_token_accuracy": 0.7787384033203125, "step": 3420 }, { "epoch": 0.629364204336641, "grad_norm": 0.9314873192239379, "learning_rate": 1.9892422951762167e-05, "loss": 0.7732, "mean_token_accuracy": 0.7635803461074829, "step": 3425 }, { "epoch": 0.6302829841969864, "grad_norm": 1.0303249115412232, "learning_rate": 1.9828110344493583e-05, "loss": 0.8374, "mean_token_accuracy": 0.7502556920051575, "step": 3430 }, { "epoch": 0.6312017640573319, "grad_norm": 0.8566968324816928, "learning_rate": 1.9763868585405813e-05, "loss": 0.6606, "mean_token_accuracy": 0.7972531080245971, "step": 3435 }, { "epoch": 0.6321205439176774, "grad_norm": 1.0376272306600982, "learning_rate": 1.9699698267757115e-05, "loss": 0.6992, "mean_token_accuracy": 0.784684681892395, "step": 3440 }, { "epoch": 0.6330393237780227, "grad_norm": 1.0897400517305982, "learning_rate": 1.9635599984145965e-05, "loss": 0.8341, "mean_token_accuracy": 0.7503707766532898, "step": 3445 }, { "epoch": 0.6339581036383682, "grad_norm": 0.9399776090183068, "learning_rate": 1.9571574326505648e-05, "loss": 0.7555, "mean_token_accuracy": 0.7727354645729065, "step": 3450 }, { "epoch": 0.6348768834987137, "grad_norm": 1.422297410503556, "learning_rate": 1.950762188609876e-05, "loss": 0.7891, "mean_token_accuracy": 0.761411714553833, "step": 3455 }, { "epoch": 0.6357956633590591, "grad_norm": 0.9985847497683605, "learning_rate": 1.9443743253511736e-05, "loss": 0.773, "mean_token_accuracy": 0.7664777278900147, "step": 3460 }, { "epoch": 0.6367144432194046, "grad_norm": 0.8928670622147518, "learning_rate": 1.9379939018649447e-05, "loss": 0.6888, "mean_token_accuracy": 0.7935372710227966, "step": 3465 }, { "epoch": 0.6376332230797501, "grad_norm": 0.8818099232101203, "learning_rate": 1.9316209770729686e-05, "loss": 0.6876, "mean_token_accuracy": 0.7872507929801941, "step": 3470 }, { "epoch": 0.6385520029400955, "grad_norm": 1.0220008034223411, "learning_rate": 1.9252556098277762e-05, "loss": 0.7423, "mean_token_accuracy": 0.7761957883834839, "step": 3475 }, { "epoch": 0.639470782800441, "grad_norm": 1.00646363617204, "learning_rate": 1.9188978589121076e-05, "loss": 0.7799, "mean_token_accuracy": 0.763306987285614, "step": 3480 }, { "epoch": 0.6403895626607865, "grad_norm": 1.0853274497927512, "learning_rate": 1.9125477830383663e-05, "loss": 0.7638, "mean_token_accuracy": 0.7669495463371276, "step": 3485 }, { "epoch": 0.6413083425211319, "grad_norm": 1.065184143072551, "learning_rate": 1.9062054408480804e-05, "loss": 0.7743, "mean_token_accuracy": 0.7621343255043029, "step": 3490 }, { "epoch": 0.6422271223814774, "grad_norm": 0.9447419338666605, "learning_rate": 1.899870890911357e-05, "loss": 0.6677, "mean_token_accuracy": 0.7952073097229004, "step": 3495 }, { "epoch": 0.6431459022418229, "grad_norm": 1.0005267248842091, "learning_rate": 1.8935441917263448e-05, "loss": 0.6775, "mean_token_accuracy": 0.7901732444763183, "step": 3500 }, { "epoch": 0.6440646821021683, "grad_norm": 1.0892429541179653, "learning_rate": 1.8872254017186915e-05, "loss": 0.7837, "mean_token_accuracy": 0.7588755011558532, "step": 3505 }, { "epoch": 0.6449834619625138, "grad_norm": 1.0127229878185544, "learning_rate": 1.880914579241007e-05, "loss": 0.7123, "mean_token_accuracy": 0.7827209591865539, "step": 3510 }, { "epoch": 0.6459022418228593, "grad_norm": 1.0281554271788436, "learning_rate": 1.8746117825723214e-05, "loss": 0.6835, "mean_token_accuracy": 0.7939071655273438, "step": 3515 }, { "epoch": 0.6468210216832047, "grad_norm": 0.8913736086248946, "learning_rate": 1.86831706991755e-05, "loss": 0.7223, "mean_token_accuracy": 0.7790691494941712, "step": 3520 }, { "epoch": 0.6477398015435502, "grad_norm": 1.0046759104597491, "learning_rate": 1.8620304994069508e-05, "loss": 0.7165, "mean_token_accuracy": 0.7822145223617554, "step": 3525 }, { "epoch": 0.6486585814038957, "grad_norm": 0.8761990362360018, "learning_rate": 1.8557521290955943e-05, "loss": 0.6898, "mean_token_accuracy": 0.7909232258796692, "step": 3530 }, { "epoch": 0.649577361264241, "grad_norm": 1.0010762980226218, "learning_rate": 1.849482016962822e-05, "loss": 0.7426, "mean_token_accuracy": 0.773716127872467, "step": 3535 }, { "epoch": 0.6504961411245865, "grad_norm": 0.8997455734948419, "learning_rate": 1.8432202209117132e-05, "loss": 0.7354, "mean_token_accuracy": 0.7769456744194031, "step": 3540 }, { "epoch": 0.651414920984932, "grad_norm": 1.0530586816733762, "learning_rate": 1.8369667987685517e-05, "loss": 0.7285, "mean_token_accuracy": 0.7756595969200134, "step": 3545 }, { "epoch": 0.6523337008452774, "grad_norm": 0.9090366247053898, "learning_rate": 1.830721808282289e-05, "loss": 0.7539, "mean_token_accuracy": 0.7681886911392212, "step": 3550 }, { "epoch": 0.6532524807056229, "grad_norm": 0.8963988021562999, "learning_rate": 1.8244853071240103e-05, "loss": 0.7189, "mean_token_accuracy": 0.7818469524383544, "step": 3555 }, { "epoch": 0.6541712605659684, "grad_norm": 1.0901215733279344, "learning_rate": 1.8182573528864066e-05, "loss": 0.8269, "mean_token_accuracy": 0.7504664659500122, "step": 3560 }, { "epoch": 0.6550900404263138, "grad_norm": 1.1645208492459995, "learning_rate": 1.812038003083239e-05, "loss": 0.7093, "mean_token_accuracy": 0.7835473537445068, "step": 3565 }, { "epoch": 0.6560088202866593, "grad_norm": 1.0256562034267807, "learning_rate": 1.805827315148808e-05, "loss": 0.8014, "mean_token_accuracy": 0.7579211831092835, "step": 3570 }, { "epoch": 0.6569276001470048, "grad_norm": 0.9529727803585198, "learning_rate": 1.799625346437424e-05, "loss": 0.7738, "mean_token_accuracy": 0.7688822269439697, "step": 3575 }, { "epoch": 0.6578463800073502, "grad_norm": 0.94739155264938, "learning_rate": 1.793432154222878e-05, "loss": 0.7292, "mean_token_accuracy": 0.7785593032836914, "step": 3580 }, { "epoch": 0.6587651598676957, "grad_norm": 0.9662895425381987, "learning_rate": 1.7872477956979117e-05, "loss": 0.7436, "mean_token_accuracy": 0.7758478641510009, "step": 3585 }, { "epoch": 0.6596839397280412, "grad_norm": 0.9840960134871575, "learning_rate": 1.7810723279736885e-05, "loss": 0.7916, "mean_token_accuracy": 0.7603202104568482, "step": 3590 }, { "epoch": 0.6606027195883867, "grad_norm": 1.0261319427030933, "learning_rate": 1.774905808079269e-05, "loss": 0.6979, "mean_token_accuracy": 0.7864163637161254, "step": 3595 }, { "epoch": 0.6615214994487321, "grad_norm": 1.0500468697129208, "learning_rate": 1.768748292961082e-05, "loss": 0.8148, "mean_token_accuracy": 0.7488227248191833, "step": 3600 }, { "epoch": 0.6624402793090776, "grad_norm": 1.0731369628716187, "learning_rate": 1.7625998394823983e-05, "loss": 0.8241, "mean_token_accuracy": 0.7512738227844238, "step": 3605 }, { "epoch": 0.6633590591694231, "grad_norm": 1.034876172959453, "learning_rate": 1.756460504422807e-05, "loss": 0.7318, "mean_token_accuracy": 0.7753666043281555, "step": 3610 }, { "epoch": 0.6642778390297684, "grad_norm": 0.9683609087211331, "learning_rate": 1.750330344477692e-05, "loss": 0.7759, "mean_token_accuracy": 0.7623879432678222, "step": 3615 }, { "epoch": 0.6651966188901139, "grad_norm": 1.0746172926951512, "learning_rate": 1.7442094162577048e-05, "loss": 0.7414, "mean_token_accuracy": 0.7732792139053345, "step": 3620 }, { "epoch": 0.6661153987504594, "grad_norm": 1.0348081377114133, "learning_rate": 1.7380977762882462e-05, "loss": 0.7379, "mean_token_accuracy": 0.7739031314849854, "step": 3625 }, { "epoch": 0.6670341786108048, "grad_norm": 1.0461877004048412, "learning_rate": 1.731995481008941e-05, "loss": 0.7448, "mean_token_accuracy": 0.773258650302887, "step": 3630 }, { "epoch": 0.6679529584711503, "grad_norm": 0.9323745094099202, "learning_rate": 1.725902586773116e-05, "loss": 0.6793, "mean_token_accuracy": 0.7933961987495423, "step": 3635 }, { "epoch": 0.6688717383314958, "grad_norm": 1.046949059494339, "learning_rate": 1.7198191498472838e-05, "loss": 0.7922, "mean_token_accuracy": 0.7601482748985291, "step": 3640 }, { "epoch": 0.6697905181918412, "grad_norm": 1.022387930805979, "learning_rate": 1.7137452264106223e-05, "loss": 0.7352, "mean_token_accuracy": 0.7750853300094604, "step": 3645 }, { "epoch": 0.6707092980521867, "grad_norm": 1.0168638470278177, "learning_rate": 1.7076808725544513e-05, "loss": 0.7946, "mean_token_accuracy": 0.76027911901474, "step": 3650 }, { "epoch": 0.6716280779125322, "grad_norm": 1.011273043579098, "learning_rate": 1.7016261442817195e-05, "loss": 0.7686, "mean_token_accuracy": 0.7633870005607605, "step": 3655 }, { "epoch": 0.6725468577728776, "grad_norm": 1.0527976338992284, "learning_rate": 1.6955810975064852e-05, "loss": 0.744, "mean_token_accuracy": 0.7737329721450805, "step": 3660 }, { "epoch": 0.6734656376332231, "grad_norm": 0.9597608034824768, "learning_rate": 1.689545788053398e-05, "loss": 0.7701, "mean_token_accuracy": 0.7696826219558716, "step": 3665 }, { "epoch": 0.6743844174935686, "grad_norm": 1.0258518237885876, "learning_rate": 1.6835202716571896e-05, "loss": 0.7254, "mean_token_accuracy": 0.7749346971511841, "step": 3670 }, { "epoch": 0.675303197353914, "grad_norm": 0.9578329259933241, "learning_rate": 1.677504603962151e-05, "loss": 0.7372, "mean_token_accuracy": 0.7727353811264038, "step": 3675 }, { "epoch": 0.6762219772142595, "grad_norm": 0.9425151659951094, "learning_rate": 1.6714988405216268e-05, "loss": 0.7622, "mean_token_accuracy": 0.768218743801117, "step": 3680 }, { "epoch": 0.677140757074605, "grad_norm": 0.9402489651093421, "learning_rate": 1.6655030367974956e-05, "loss": 0.7042, "mean_token_accuracy": 0.7838626861572265, "step": 3685 }, { "epoch": 0.6780595369349504, "grad_norm": 0.9434658856603072, "learning_rate": 1.659517248159658e-05, "loss": 0.6985, "mean_token_accuracy": 0.7856457352638244, "step": 3690 }, { "epoch": 0.6789783167952959, "grad_norm": 0.9784597438802801, "learning_rate": 1.6535415298855327e-05, "loss": 0.724, "mean_token_accuracy": 0.7787894964218139, "step": 3695 }, { "epoch": 0.6798970966556414, "grad_norm": 0.9287408780062713, "learning_rate": 1.6475759371595363e-05, "loss": 0.7246, "mean_token_accuracy": 0.7800618410110474, "step": 3700 }, { "epoch": 0.6808158765159867, "grad_norm": 0.8854707560115899, "learning_rate": 1.6416205250725805e-05, "loss": 0.7302, "mean_token_accuracy": 0.7747718214988708, "step": 3705 }, { "epoch": 0.6817346563763322, "grad_norm": 1.0559953134942033, "learning_rate": 1.635675348621561e-05, "loss": 0.7812, "mean_token_accuracy": 0.7618914604187011, "step": 3710 }, { "epoch": 0.6826534362366777, "grad_norm": 0.904527688485687, "learning_rate": 1.6297404627088495e-05, "loss": 0.6821, "mean_token_accuracy": 0.7847250699996948, "step": 3715 }, { "epoch": 0.6835722160970231, "grad_norm": 0.9469214300582695, "learning_rate": 1.623815922141786e-05, "loss": 0.7542, "mean_token_accuracy": 0.7689258933067322, "step": 3720 }, { "epoch": 0.6844909959573686, "grad_norm": 0.967860721114202, "learning_rate": 1.6179017816321747e-05, "loss": 0.7363, "mean_token_accuracy": 0.7743378639221191, "step": 3725 }, { "epoch": 0.6854097758177141, "grad_norm": 0.8886741465453643, "learning_rate": 1.6119980957957777e-05, "loss": 0.6988, "mean_token_accuracy": 0.7837384343147278, "step": 3730 }, { "epoch": 0.6863285556780595, "grad_norm": 0.8776280447144813, "learning_rate": 1.6061049191518085e-05, "loss": 0.7209, "mean_token_accuracy": 0.7783106327056885, "step": 3735 }, { "epoch": 0.687247335538405, "grad_norm": 0.9158307911784594, "learning_rate": 1.6002223061224335e-05, "loss": 0.7088, "mean_token_accuracy": 0.781765878200531, "step": 3740 }, { "epoch": 0.6881661153987505, "grad_norm": 1.162396078380293, "learning_rate": 1.5943503110322645e-05, "loss": 0.7807, "mean_token_accuracy": 0.7625959992408753, "step": 3745 }, { "epoch": 0.6890848952590959, "grad_norm": 1.0152287109252447, "learning_rate": 1.5884889881078597e-05, "loss": 0.7434, "mean_token_accuracy": 0.7718896269798279, "step": 3750 }, { "epoch": 0.6900036751194414, "grad_norm": 1.008310002000136, "learning_rate": 1.5826383914772224e-05, "loss": 0.7251, "mean_token_accuracy": 0.7803327202796936, "step": 3755 }, { "epoch": 0.6909224549797869, "grad_norm": 0.9966365535572344, "learning_rate": 1.5767985751693e-05, "loss": 0.7973, "mean_token_accuracy": 0.755574083328247, "step": 3760 }, { "epoch": 0.6918412348401323, "grad_norm": 0.9091331211868702, "learning_rate": 1.5709695931134865e-05, "loss": 0.6733, "mean_token_accuracy": 0.7941539287567139, "step": 3765 }, { "epoch": 0.6927600147004778, "grad_norm": 0.9104102247083076, "learning_rate": 1.5651514991391257e-05, "loss": 0.776, "mean_token_accuracy": 0.7669570446014404, "step": 3770 }, { "epoch": 0.6936787945608233, "grad_norm": 1.0293997774105645, "learning_rate": 1.5593443469750096e-05, "loss": 0.8177, "mean_token_accuracy": 0.7502638220787048, "step": 3775 }, { "epoch": 0.6945975744211687, "grad_norm": 0.929448240683312, "learning_rate": 1.5535481902488867e-05, "loss": 0.7637, "mean_token_accuracy": 0.7701873660087586, "step": 3780 }, { "epoch": 0.6955163542815141, "grad_norm": 0.9731391197018507, "learning_rate": 1.5477630824869654e-05, "loss": 0.7091, "mean_token_accuracy": 0.7808983325958252, "step": 3785 }, { "epoch": 0.6964351341418596, "grad_norm": 1.0202913846698398, "learning_rate": 1.541989077113418e-05, "loss": 0.7465, "mean_token_accuracy": 0.7717735052108765, "step": 3790 }, { "epoch": 0.697353914002205, "grad_norm": 0.9301401711992584, "learning_rate": 1.5362262274498905e-05, "loss": 0.6822, "mean_token_accuracy": 0.7897647023200989, "step": 3795 }, { "epoch": 0.6982726938625505, "grad_norm": 1.0207577033975543, "learning_rate": 1.5304745867150057e-05, "loss": 0.7438, "mean_token_accuracy": 0.774781858921051, "step": 3800 }, { "epoch": 0.699191473722896, "grad_norm": 1.0524548953066566, "learning_rate": 1.524734208023878e-05, "loss": 0.7102, "mean_token_accuracy": 0.781788682937622, "step": 3805 }, { "epoch": 0.7001102535832414, "grad_norm": 1.5303502399912878, "learning_rate": 1.5190051443876164e-05, "loss": 0.75, "mean_token_accuracy": 0.7729594349861145, "step": 3810 }, { "epoch": 0.7010290334435869, "grad_norm": 0.9420746725481757, "learning_rate": 1.5132874487128395e-05, "loss": 0.7092, "mean_token_accuracy": 0.7798316001892089, "step": 3815 }, { "epoch": 0.7019478133039324, "grad_norm": 0.9519856865931159, "learning_rate": 1.5075811738011856e-05, "loss": 0.7228, "mean_token_accuracy": 0.7796306014060974, "step": 3820 }, { "epoch": 0.7028665931642778, "grad_norm": 0.9860104895952649, "learning_rate": 1.5018863723488225e-05, "loss": 0.7966, "mean_token_accuracy": 0.7599681258201599, "step": 3825 }, { "epoch": 0.7037853730246233, "grad_norm": 1.0271424947345908, "learning_rate": 1.4962030969459653e-05, "loss": 0.7635, "mean_token_accuracy": 0.7661967992782592, "step": 3830 }, { "epoch": 0.7047041528849688, "grad_norm": 0.9528580532465843, "learning_rate": 1.4905314000763879e-05, "loss": 0.8305, "mean_token_accuracy": 0.748454475402832, "step": 3835 }, { "epoch": 0.7056229327453142, "grad_norm": 1.127501991853972, "learning_rate": 1.48487133411694e-05, "loss": 0.7435, "mean_token_accuracy": 0.7706256151199341, "step": 3840 }, { "epoch": 0.7065417126056597, "grad_norm": 0.9715164350420321, "learning_rate": 1.4792229513370623e-05, "loss": 0.7749, "mean_token_accuracy": 0.7648235201835633, "step": 3845 }, { "epoch": 0.7074604924660052, "grad_norm": 1.0410599346511435, "learning_rate": 1.4735863038983017e-05, "loss": 0.7929, "mean_token_accuracy": 0.7620292901992798, "step": 3850 }, { "epoch": 0.7083792723263506, "grad_norm": 1.0230288642653715, "learning_rate": 1.4679614438538336e-05, "loss": 0.7096, "mean_token_accuracy": 0.7822004795074463, "step": 3855 }, { "epoch": 0.7092980521866961, "grad_norm": 0.9688977031604671, "learning_rate": 1.4623484231479797e-05, "loss": 0.7349, "mean_token_accuracy": 0.7777714133262634, "step": 3860 }, { "epoch": 0.7102168320470416, "grad_norm": 0.9626561356173854, "learning_rate": 1.4567472936157272e-05, "loss": 0.7146, "mean_token_accuracy": 0.781309711933136, "step": 3865 }, { "epoch": 0.7111356119073869, "grad_norm": 0.9708419426566777, "learning_rate": 1.451158106982253e-05, "loss": 0.7092, "mean_token_accuracy": 0.783543837070465, "step": 3870 }, { "epoch": 0.7120543917677324, "grad_norm": 1.109902456844337, "learning_rate": 1.4455809148624427e-05, "loss": 0.6661, "mean_token_accuracy": 0.7925106644630432, "step": 3875 }, { "epoch": 0.7129731716280779, "grad_norm": 1.0531361212213257, "learning_rate": 1.4400157687604127e-05, "loss": 0.7478, "mean_token_accuracy": 0.7699988007545471, "step": 3880 }, { "epoch": 0.7138919514884233, "grad_norm": 0.9181223816529849, "learning_rate": 1.4344627200690408e-05, "loss": 0.7828, "mean_token_accuracy": 0.7599815845489502, "step": 3885 }, { "epoch": 0.7148107313487688, "grad_norm": 0.960236605434846, "learning_rate": 1.4289218200694863e-05, "loss": 0.6859, "mean_token_accuracy": 0.7898363471031189, "step": 3890 }, { "epoch": 0.7157295112091143, "grad_norm": 1.0122066887422, "learning_rate": 1.4233931199307182e-05, "loss": 0.7232, "mean_token_accuracy": 0.7770133495330811, "step": 3895 }, { "epoch": 0.7166482910694597, "grad_norm": 1.040565449358011, "learning_rate": 1.4178766707090435e-05, "loss": 0.6839, "mean_token_accuracy": 0.7898031234741211, "step": 3900 }, { "epoch": 0.7175670709298052, "grad_norm": 1.0510693995270706, "learning_rate": 1.4123725233476331e-05, "loss": 0.7013, "mean_token_accuracy": 0.7850608229637146, "step": 3905 }, { "epoch": 0.7184858507901507, "grad_norm": 0.9406082797289776, "learning_rate": 1.406880728676054e-05, "loss": 0.694, "mean_token_accuracy": 0.7835015416145324, "step": 3910 }, { "epoch": 0.7194046306504961, "grad_norm": 0.86803820647997, "learning_rate": 1.401401337409799e-05, "loss": 0.7519, "mean_token_accuracy": 0.7705330729484559, "step": 3915 }, { "epoch": 0.7203234105108416, "grad_norm": 1.004330967776519, "learning_rate": 1.3959344001498173e-05, "loss": 0.7427, "mean_token_accuracy": 0.775149667263031, "step": 3920 }, { "epoch": 0.7212421903711871, "grad_norm": 0.9559190227857477, "learning_rate": 1.390479967382049e-05, "loss": 0.791, "mean_token_accuracy": 0.7609505772590637, "step": 3925 }, { "epoch": 0.7221609702315325, "grad_norm": 1.028049651883388, "learning_rate": 1.3850380894769577e-05, "loss": 0.7556, "mean_token_accuracy": 0.76885005235672, "step": 3930 }, { "epoch": 0.723079750091878, "grad_norm": 0.9303174472709201, "learning_rate": 1.3796088166890658e-05, "loss": 0.7354, "mean_token_accuracy": 0.7731772422790527, "step": 3935 }, { "epoch": 0.7239985299522235, "grad_norm": 0.9187982243033715, "learning_rate": 1.3741921991564902e-05, "loss": 0.7279, "mean_token_accuracy": 0.7771438717842102, "step": 3940 }, { "epoch": 0.7249173098125689, "grad_norm": 1.0153656088945144, "learning_rate": 1.3687882869004793e-05, "loss": 0.7822, "mean_token_accuracy": 0.7594830989837646, "step": 3945 }, { "epoch": 0.7258360896729144, "grad_norm": 0.9068523793270754, "learning_rate": 1.3633971298249509e-05, "loss": 0.726, "mean_token_accuracy": 0.7766485810279846, "step": 3950 }, { "epoch": 0.7267548695332599, "grad_norm": 0.9192709791074424, "learning_rate": 1.358018777716033e-05, "loss": 0.6736, "mean_token_accuracy": 0.7924223423004151, "step": 3955 }, { "epoch": 0.7276736493936052, "grad_norm": 0.9362834673989403, "learning_rate": 1.3526532802415986e-05, "loss": 0.7237, "mean_token_accuracy": 0.7815822243690491, "step": 3960 }, { "epoch": 0.7285924292539507, "grad_norm": 0.9618182073527164, "learning_rate": 1.347300686950817e-05, "loss": 0.7136, "mean_token_accuracy": 0.7804886937141419, "step": 3965 }, { "epoch": 0.7295112091142962, "grad_norm": 1.0033704504611825, "learning_rate": 1.3419610472736854e-05, "loss": 0.7774, "mean_token_accuracy": 0.7617066621780395, "step": 3970 }, { "epoch": 0.7304299889746417, "grad_norm": 0.9824057566253805, "learning_rate": 1.3366344105205795e-05, "loss": 0.7415, "mean_token_accuracy": 0.7728252649307251, "step": 3975 }, { "epoch": 0.7313487688349871, "grad_norm": 0.9259208825526174, "learning_rate": 1.3313208258817961e-05, "loss": 0.668, "mean_token_accuracy": 0.7945244908332825, "step": 3980 }, { "epoch": 0.7322675486953326, "grad_norm": 1.0678551747273641, "learning_rate": 1.3260203424270962e-05, "loss": 0.6779, "mean_token_accuracy": 0.7914282798767089, "step": 3985 }, { "epoch": 0.7331863285556781, "grad_norm": 0.9074661173815383, "learning_rate": 1.3207330091052564e-05, "loss": 0.7319, "mean_token_accuracy": 0.7765037894248963, "step": 3990 }, { "epoch": 0.7341051084160235, "grad_norm": 0.9568097933924034, "learning_rate": 1.3154588747436159e-05, "loss": 0.7078, "mean_token_accuracy": 0.7828231930732727, "step": 3995 }, { "epoch": 0.735023888276369, "grad_norm": 0.9371717410955112, "learning_rate": 1.310197988047622e-05, "loss": 0.6858, "mean_token_accuracy": 0.7882918357849121, "step": 4000 }, { "epoch": 0.7359426681367145, "grad_norm": 0.9954273414248298, "learning_rate": 1.3049503976003838e-05, "loss": 0.7514, "mean_token_accuracy": 0.7692143678665161, "step": 4005 }, { "epoch": 0.7368614479970599, "grad_norm": 0.9856979788439847, "learning_rate": 1.2997161518622236e-05, "loss": 0.7208, "mean_token_accuracy": 0.7764803051948548, "step": 4010 }, { "epoch": 0.7377802278574054, "grad_norm": 0.9346284220975282, "learning_rate": 1.2944952991702252e-05, "loss": 0.6963, "mean_token_accuracy": 0.7852182865142823, "step": 4015 }, { "epoch": 0.7386990077177509, "grad_norm": 1.013720256514248, "learning_rate": 1.289287887737794e-05, "loss": 0.7001, "mean_token_accuracy": 0.7830116629600525, "step": 4020 }, { "epoch": 0.7396177875780963, "grad_norm": 0.9031619991653583, "learning_rate": 1.2840939656542055e-05, "loss": 0.6997, "mean_token_accuracy": 0.7874221801757812, "step": 4025 }, { "epoch": 0.7405365674384418, "grad_norm": 0.9825845455381703, "learning_rate": 1.2789135808841677e-05, "loss": 0.6957, "mean_token_accuracy": 0.7857596635818481, "step": 4030 }, { "epoch": 0.7414553472987873, "grad_norm": 0.9061423884854145, "learning_rate": 1.2737467812673723e-05, "loss": 0.7169, "mean_token_accuracy": 0.781167495250702, "step": 4035 }, { "epoch": 0.7423741271591326, "grad_norm": 0.9638102626531402, "learning_rate": 1.2685936145180532e-05, "loss": 0.69, "mean_token_accuracy": 0.7890314221382141, "step": 4040 }, { "epoch": 0.7432929070194781, "grad_norm": 1.0053384054052024, "learning_rate": 1.2634541282245516e-05, "loss": 0.807, "mean_token_accuracy": 0.7533567190170288, "step": 4045 }, { "epoch": 0.7442116868798236, "grad_norm": 1.04109340250319, "learning_rate": 1.2583283698488704e-05, "loss": 0.7067, "mean_token_accuracy": 0.7812132358551025, "step": 4050 }, { "epoch": 0.745130466740169, "grad_norm": 1.0112381563115767, "learning_rate": 1.2532163867262392e-05, "loss": 0.7399, "mean_token_accuracy": 0.7726234674453736, "step": 4055 }, { "epoch": 0.7460492466005145, "grad_norm": 0.8721284775041804, "learning_rate": 1.2481182260646752e-05, "loss": 0.7306, "mean_token_accuracy": 0.7757495403289795, "step": 4060 }, { "epoch": 0.74696802646086, "grad_norm": 1.0082015116923577, "learning_rate": 1.2430339349445513e-05, "loss": 0.7431, "mean_token_accuracy": 0.7711400389671326, "step": 4065 }, { "epoch": 0.7478868063212054, "grad_norm": 0.9592384870972069, "learning_rate": 1.2379635603181537e-05, "loss": 0.7367, "mean_token_accuracy": 0.7738732933998108, "step": 4070 }, { "epoch": 0.7488055861815509, "grad_norm": 1.0584048963162198, "learning_rate": 1.2329071490092558e-05, "loss": 0.768, "mean_token_accuracy": 0.7642792701721192, "step": 4075 }, { "epoch": 0.7497243660418964, "grad_norm": 1.006126544893952, "learning_rate": 1.2278647477126825e-05, "loss": 0.7155, "mean_token_accuracy": 0.7793737649917603, "step": 4080 }, { "epoch": 0.7506431459022418, "grad_norm": 0.9059136663503262, "learning_rate": 1.2228364029938794e-05, "loss": 0.6934, "mean_token_accuracy": 0.7861241817474365, "step": 4085 }, { "epoch": 0.7515619257625873, "grad_norm": 0.9164252616053726, "learning_rate": 1.2178221612884821e-05, "loss": 0.6858, "mean_token_accuracy": 0.7915996551513672, "step": 4090 }, { "epoch": 0.7524807056229328, "grad_norm": 0.9705885896913202, "learning_rate": 1.212822068901889e-05, "loss": 0.7124, "mean_token_accuracy": 0.7814687609672546, "step": 4095 }, { "epoch": 0.7533994854832782, "grad_norm": 0.9020603765566574, "learning_rate": 1.2078361720088317e-05, "loss": 0.6295, "mean_token_accuracy": 0.8092963337898255, "step": 4100 }, { "epoch": 0.7543182653436237, "grad_norm": 1.0248400255161636, "learning_rate": 1.2028645166529502e-05, "loss": 0.6836, "mean_token_accuracy": 0.7892769694328308, "step": 4105 }, { "epoch": 0.7552370452039692, "grad_norm": 0.9961571096740167, "learning_rate": 1.1979071487463676e-05, "loss": 0.7571, "mean_token_accuracy": 0.7688749432563782, "step": 4110 }, { "epoch": 0.7561558250643146, "grad_norm": 0.944702960313809, "learning_rate": 1.1929641140692642e-05, "loss": 0.7449, "mean_token_accuracy": 0.7719345092773438, "step": 4115 }, { "epoch": 0.75707460492466, "grad_norm": 0.901399230174195, "learning_rate": 1.1880354582694574e-05, "loss": 0.7247, "mean_token_accuracy": 0.7755969166755676, "step": 4120 }, { "epoch": 0.7579933847850056, "grad_norm": 0.8741911991495609, "learning_rate": 1.183121226861978e-05, "loss": 0.6733, "mean_token_accuracy": 0.791340708732605, "step": 4125 }, { "epoch": 0.7589121646453509, "grad_norm": 0.9630743810579115, "learning_rate": 1.1782214652286517e-05, "loss": 0.7611, "mean_token_accuracy": 0.7667882919311524, "step": 4130 }, { "epoch": 0.7598309445056964, "grad_norm": 0.9810984916908249, "learning_rate": 1.1733362186176783e-05, "loss": 0.7248, "mean_token_accuracy": 0.7780536890029908, "step": 4135 }, { "epoch": 0.7607497243660419, "grad_norm": 1.0335671142572738, "learning_rate": 1.1684655321432151e-05, "loss": 0.8171, "mean_token_accuracy": 0.7540881991386413, "step": 4140 }, { "epoch": 0.7616685042263873, "grad_norm": 1.131108893177244, "learning_rate": 1.1636094507849602e-05, "loss": 0.8238, "mean_token_accuracy": 0.7476210117340087, "step": 4145 }, { "epoch": 0.7625872840867328, "grad_norm": 0.936028514421637, "learning_rate": 1.1587680193877339e-05, "loss": 0.7193, "mean_token_accuracy": 0.778421950340271, "step": 4150 }, { "epoch": 0.7635060639470783, "grad_norm": 1.0740607162155091, "learning_rate": 1.153941282661072e-05, "loss": 0.7396, "mean_token_accuracy": 0.7715651631355286, "step": 4155 }, { "epoch": 0.7644248438074237, "grad_norm": 1.0319017337525411, "learning_rate": 1.149129285178805e-05, "loss": 0.7647, "mean_token_accuracy": 0.7646437525749207, "step": 4160 }, { "epoch": 0.7653436236677692, "grad_norm": 1.0657076517097614, "learning_rate": 1.1443320713786512e-05, "loss": 0.761, "mean_token_accuracy": 0.7698405861854554, "step": 4165 }, { "epoch": 0.7662624035281147, "grad_norm": 1.1933980574421776, "learning_rate": 1.1395496855618047e-05, "loss": 0.6857, "mean_token_accuracy": 0.786463487148285, "step": 4170 }, { "epoch": 0.7671811833884601, "grad_norm": 1.0399268658004284, "learning_rate": 1.1347821718925246e-05, "loss": 0.6951, "mean_token_accuracy": 0.7830422759056092, "step": 4175 }, { "epoch": 0.7680999632488056, "grad_norm": 0.9523591554987267, "learning_rate": 1.1300295743977319e-05, "loss": 0.7314, "mean_token_accuracy": 0.7771936178207397, "step": 4180 }, { "epoch": 0.7690187431091511, "grad_norm": 0.9704227649157859, "learning_rate": 1.1252919369665982e-05, "loss": 0.6644, "mean_token_accuracy": 0.7947867512702942, "step": 4185 }, { "epoch": 0.7699375229694965, "grad_norm": 1.1427236915156969, "learning_rate": 1.1205693033501438e-05, "loss": 0.8105, "mean_token_accuracy": 0.7546621441841126, "step": 4190 }, { "epoch": 0.770856302829842, "grad_norm": 0.9971989693482147, "learning_rate": 1.115861717160831e-05, "loss": 0.7324, "mean_token_accuracy": 0.7745411634445191, "step": 4195 }, { "epoch": 0.7717750826901875, "grad_norm": 0.9708619660373234, "learning_rate": 1.1111692218721634e-05, "loss": 0.7248, "mean_token_accuracy": 0.7753921389579773, "step": 4200 }, { "epoch": 0.7726938625505329, "grad_norm": 1.0024258510538886, "learning_rate": 1.1064918608182811e-05, "loss": 0.7042, "mean_token_accuracy": 0.7805647253990173, "step": 4205 }, { "epoch": 0.7736126424108783, "grad_norm": 1.043536654965297, "learning_rate": 1.1018296771935662e-05, "loss": 0.7479, "mean_token_accuracy": 0.7747788667678833, "step": 4210 }, { "epoch": 0.7745314222712238, "grad_norm": 0.9600156217205023, "learning_rate": 1.097182714052238e-05, "loss": 0.7103, "mean_token_accuracy": 0.7833921790122986, "step": 4215 }, { "epoch": 0.7754502021315692, "grad_norm": 1.0045159954634846, "learning_rate": 1.0925510143079597e-05, "loss": 0.7374, "mean_token_accuracy": 0.7714961647987366, "step": 4220 }, { "epoch": 0.7763689819919147, "grad_norm": 0.9309385715533749, "learning_rate": 1.0879346207334413e-05, "loss": 0.7726, "mean_token_accuracy": 0.7604559183120727, "step": 4225 }, { "epoch": 0.7772877618522602, "grad_norm": 1.0081696884584601, "learning_rate": 1.0833335759600405e-05, "loss": 0.7722, "mean_token_accuracy": 0.7622892618179321, "step": 4230 }, { "epoch": 0.7782065417126056, "grad_norm": 1.0452970764251144, "learning_rate": 1.0787479224773747e-05, "loss": 0.828, "mean_token_accuracy": 0.7479719400405884, "step": 4235 }, { "epoch": 0.7791253215729511, "grad_norm": 1.0247302466447017, "learning_rate": 1.0741777026329258e-05, "loss": 0.7903, "mean_token_accuracy": 0.7618830919265747, "step": 4240 }, { "epoch": 0.7800441014332966, "grad_norm": 0.9573944252741409, "learning_rate": 1.0696229586316494e-05, "loss": 0.7805, "mean_token_accuracy": 0.7581877827644348, "step": 4245 }, { "epoch": 0.780962881293642, "grad_norm": 0.9549288805564692, "learning_rate": 1.065083732535585e-05, "loss": 0.7232, "mean_token_accuracy": 0.7758707642555237, "step": 4250 }, { "epoch": 0.7818816611539875, "grad_norm": 1.0135283642977615, "learning_rate": 1.060560066263468e-05, "loss": 0.6985, "mean_token_accuracy": 0.7865156173706055, "step": 4255 }, { "epoch": 0.782800441014333, "grad_norm": 0.9768410936733116, "learning_rate": 1.0560520015903421e-05, "loss": 0.6995, "mean_token_accuracy": 0.7879634141921997, "step": 4260 }, { "epoch": 0.7837192208746784, "grad_norm": 1.0406851160802406, "learning_rate": 1.0515595801471734e-05, "loss": 0.7099, "mean_token_accuracy": 0.7844684720039368, "step": 4265 }, { "epoch": 0.7846380007350239, "grad_norm": 1.1640626929289857, "learning_rate": 1.0470828434204672e-05, "loss": 0.7507, "mean_token_accuracy": 0.7699440717697144, "step": 4270 }, { "epoch": 0.7855567805953694, "grad_norm": 0.9770361775953404, "learning_rate": 1.0426218327518831e-05, "loss": 0.7241, "mean_token_accuracy": 0.7754392981529236, "step": 4275 }, { "epoch": 0.7864755604557148, "grad_norm": 0.9511388600942011, "learning_rate": 1.0381765893378545e-05, "loss": 0.7491, "mean_token_accuracy": 0.768705952167511, "step": 4280 }, { "epoch": 0.7873943403160603, "grad_norm": 0.8951535359455135, "learning_rate": 1.0337471542292076e-05, "loss": 0.6546, "mean_token_accuracy": 0.7975376367568969, "step": 4285 }, { "epoch": 0.7883131201764058, "grad_norm": 0.9998894325690464, "learning_rate": 1.0293335683307825e-05, "loss": 0.717, "mean_token_accuracy": 0.7781760573387146, "step": 4290 }, { "epoch": 0.7892319000367511, "grad_norm": 0.9660670108974215, "learning_rate": 1.0249358724010555e-05, "loss": 0.7081, "mean_token_accuracy": 0.7858733177185059, "step": 4295 }, { "epoch": 0.7901506798970966, "grad_norm": 0.9012514691895656, "learning_rate": 1.0205541070517624e-05, "loss": 0.6758, "mean_token_accuracy": 0.7909941792488098, "step": 4300 }, { "epoch": 0.7910694597574421, "grad_norm": 0.9948867453145167, "learning_rate": 1.0161883127475242e-05, "loss": 0.6938, "mean_token_accuracy": 0.7855447053909301, "step": 4305 }, { "epoch": 0.7919882396177875, "grad_norm": 0.984814744134321, "learning_rate": 1.0118385298054711e-05, "loss": 0.7587, "mean_token_accuracy": 0.7694467306137085, "step": 4310 }, { "epoch": 0.792907019478133, "grad_norm": 0.9320375007219053, "learning_rate": 1.0075047983948743e-05, "loss": 0.7049, "mean_token_accuracy": 0.7814609169960022, "step": 4315 }, { "epoch": 0.7938257993384785, "grad_norm": 0.9094613720358601, "learning_rate": 1.0031871585367718e-05, "loss": 0.6712, "mean_token_accuracy": 0.7946569919586182, "step": 4320 }, { "epoch": 0.7947445791988239, "grad_norm": 0.956979005267424, "learning_rate": 9.988856501035992e-06, "loss": 0.6935, "mean_token_accuracy": 0.7856648206710816, "step": 4325 }, { "epoch": 0.7956633590591694, "grad_norm": 0.9605421546046741, "learning_rate": 9.946003128188227e-06, "loss": 0.7125, "mean_token_accuracy": 0.7815356492996216, "step": 4330 }, { "epoch": 0.7965821389195149, "grad_norm": 1.0112492842719905, "learning_rate": 9.903311862565718e-06, "loss": 0.7767, "mean_token_accuracy": 0.7658894777297973, "step": 4335 }, { "epoch": 0.7975009187798603, "grad_norm": 0.9643270276794048, "learning_rate": 9.860783098412718e-06, "loss": 0.7266, "mean_token_accuracy": 0.7743983864784241, "step": 4340 }, { "epoch": 0.7984196986402058, "grad_norm": 1.0552126857079376, "learning_rate": 9.818417228472828e-06, "loss": 0.784, "mean_token_accuracy": 0.757087242603302, "step": 4345 }, { "epoch": 0.7993384785005513, "grad_norm": 0.9398083766031105, "learning_rate": 9.776214643985372e-06, "loss": 0.7362, "mean_token_accuracy": 0.7717908382415771, "step": 4350 }, { "epoch": 0.8002572583608968, "grad_norm": 1.1034049454029626, "learning_rate": 9.734175734681746e-06, "loss": 0.745, "mean_token_accuracy": 0.7712400317192077, "step": 4355 }, { "epoch": 0.8011760382212422, "grad_norm": 0.9955484033373456, "learning_rate": 9.69230088878186e-06, "loss": 0.7156, "mean_token_accuracy": 0.778664481639862, "step": 4360 }, { "epoch": 0.8020948180815877, "grad_norm": 0.9268939856852966, "learning_rate": 9.650590492990517e-06, "loss": 0.6814, "mean_token_accuracy": 0.7887930870056152, "step": 4365 }, { "epoch": 0.8030135979419332, "grad_norm": 1.016524051741597, "learning_rate": 9.609044932493873e-06, "loss": 0.761, "mean_token_accuracy": 0.7663564682006836, "step": 4370 }, { "epoch": 0.8039323778022786, "grad_norm": 0.9217374732022973, "learning_rate": 9.567664590955861e-06, "loss": 0.7344, "mean_token_accuracy": 0.7756752133369446, "step": 4375 }, { "epoch": 0.804851157662624, "grad_norm": 1.010030392708083, "learning_rate": 9.526449850514662e-06, "loss": 0.7442, "mean_token_accuracy": 0.770478630065918, "step": 4380 }, { "epoch": 0.8057699375229695, "grad_norm": 1.0170619440794504, "learning_rate": 9.485401091779171e-06, "loss": 0.7571, "mean_token_accuracy": 0.7664804577827453, "step": 4385 }, { "epoch": 0.8066887173833149, "grad_norm": 0.9571205775213486, "learning_rate": 9.444518693825456e-06, "loss": 0.7053, "mean_token_accuracy": 0.7821534872055054, "step": 4390 }, { "epoch": 0.8076074972436604, "grad_norm": 1.014166913858275, "learning_rate": 9.403803034193302e-06, "loss": 0.7171, "mean_token_accuracy": 0.7787631750106812, "step": 4395 }, { "epoch": 0.8085262771040059, "grad_norm": 1.0747929086580883, "learning_rate": 9.363254488882694e-06, "loss": 0.7338, "mean_token_accuracy": 0.7740719437599182, "step": 4400 }, { "epoch": 0.8094450569643513, "grad_norm": 1.0631010384857345, "learning_rate": 9.322873432350361e-06, "loss": 0.7597, "mean_token_accuracy": 0.7654994845390319, "step": 4405 }, { "epoch": 0.8103638368246968, "grad_norm": 1.9306478337494233, "learning_rate": 9.282660237506296e-06, "loss": 0.7027, "mean_token_accuracy": 0.7840522766113281, "step": 4410 }, { "epoch": 0.8112826166850423, "grad_norm": 0.9881521244458075, "learning_rate": 9.242615275710359e-06, "loss": 0.7735, "mean_token_accuracy": 0.765105926990509, "step": 4415 }, { "epoch": 0.8122013965453877, "grad_norm": 0.9822768789236431, "learning_rate": 9.202738916768773e-06, "loss": 0.7742, "mean_token_accuracy": 0.7636497378349304, "step": 4420 }, { "epoch": 0.8131201764057332, "grad_norm": 0.9597545385693992, "learning_rate": 9.16303152893078e-06, "loss": 0.7168, "mean_token_accuracy": 0.7784831523895264, "step": 4425 }, { "epoch": 0.8140389562660787, "grad_norm": 1.018005126822509, "learning_rate": 9.123493478885197e-06, "loss": 0.7051, "mean_token_accuracy": 0.7817409634590149, "step": 4430 }, { "epoch": 0.8149577361264241, "grad_norm": 0.8612534899442146, "learning_rate": 9.084125131757061e-06, "loss": 0.6905, "mean_token_accuracy": 0.7883997678756713, "step": 4435 }, { "epoch": 0.8158765159867696, "grad_norm": 1.004828814027327, "learning_rate": 9.044926851104225e-06, "loss": 0.7088, "mean_token_accuracy": 0.7787980914115906, "step": 4440 }, { "epoch": 0.8167952958471151, "grad_norm": 1.034772375773089, "learning_rate": 9.005898998914021e-06, "loss": 0.7563, "mean_token_accuracy": 0.7687358140945435, "step": 4445 }, { "epoch": 0.8177140757074605, "grad_norm": 0.9599050149742322, "learning_rate": 8.967041935599915e-06, "loss": 0.7534, "mean_token_accuracy": 0.7682107329368592, "step": 4450 }, { "epoch": 0.818632855567806, "grad_norm": 0.9069132664938562, "learning_rate": 8.928356019998177e-06, "loss": 0.725, "mean_token_accuracy": 0.7773229837417602, "step": 4455 }, { "epoch": 0.8195516354281515, "grad_norm": 0.8705834756972108, "learning_rate": 8.88984160936456e-06, "loss": 0.7287, "mean_token_accuracy": 0.7764084458351135, "step": 4460 }, { "epoch": 0.8204704152884968, "grad_norm": 1.0456174549561577, "learning_rate": 8.851499059371016e-06, "loss": 0.7831, "mean_token_accuracy": 0.7606392741203308, "step": 4465 }, { "epoch": 0.8213891951488423, "grad_norm": 1.02656987229662, "learning_rate": 8.813328724102389e-06, "loss": 0.6944, "mean_token_accuracy": 0.7881085634231567, "step": 4470 }, { "epoch": 0.8223079750091878, "grad_norm": 0.959837964691812, "learning_rate": 8.775330956053171e-06, "loss": 0.7732, "mean_token_accuracy": 0.7633563637733459, "step": 4475 }, { "epoch": 0.8232267548695332, "grad_norm": 1.1225654804198197, "learning_rate": 8.737506106124235e-06, "loss": 0.7812, "mean_token_accuracy": 0.7637458205223083, "step": 4480 }, { "epoch": 0.8241455347298787, "grad_norm": 0.9381645378723508, "learning_rate": 8.69985452361958e-06, "loss": 0.704, "mean_token_accuracy": 0.7797535300254822, "step": 4485 }, { "epoch": 0.8250643145902242, "grad_norm": 0.9968842713077971, "learning_rate": 8.662376556243134e-06, "loss": 0.7743, "mean_token_accuracy": 0.7624358177185059, "step": 4490 }, { "epoch": 0.8259830944505696, "grad_norm": 0.8956734493127111, "learning_rate": 8.625072550095529e-06, "loss": 0.6901, "mean_token_accuracy": 0.7880960464477539, "step": 4495 }, { "epoch": 0.8269018743109151, "grad_norm": 0.9031387569534376, "learning_rate": 8.587942849670877e-06, "loss": 0.719, "mean_token_accuracy": 0.7778927087783813, "step": 4500 }, { "epoch": 0.8278206541712606, "grad_norm": 1.008720758966792, "learning_rate": 8.550987797853658e-06, "loss": 0.6524, "mean_token_accuracy": 0.7953348755836487, "step": 4505 }, { "epoch": 0.828739434031606, "grad_norm": 0.9852433501783017, "learning_rate": 8.51420773591548e-06, "loss": 0.6965, "mean_token_accuracy": 0.7834087967872619, "step": 4510 }, { "epoch": 0.8296582138919515, "grad_norm": 0.9178304984994476, "learning_rate": 8.47760300351197e-06, "loss": 0.6903, "mean_token_accuracy": 0.7837494611740112, "step": 4515 }, { "epoch": 0.830576993752297, "grad_norm": 0.9503212461224423, "learning_rate": 8.441173938679624e-06, "loss": 0.729, "mean_token_accuracy": 0.7761277437210083, "step": 4520 }, { "epoch": 0.8314957736126424, "grad_norm": 0.9554820837395569, "learning_rate": 8.404920877832693e-06, "loss": 0.6229, "mean_token_accuracy": 0.8066902041435242, "step": 4525 }, { "epoch": 0.8324145534729879, "grad_norm": 0.9220834077432549, "learning_rate": 8.368844155760054e-06, "loss": 0.7483, "mean_token_accuracy": 0.7662014603614807, "step": 4530 }, { "epoch": 0.8333333333333334, "grad_norm": 0.9412463783905453, "learning_rate": 8.33294410562215e-06, "loss": 0.6702, "mean_token_accuracy": 0.792554771900177, "step": 4535 }, { "epoch": 0.8342521131936788, "grad_norm": 0.9594825001322896, "learning_rate": 8.297221058947901e-06, "loss": 0.6827, "mean_token_accuracy": 0.7895113706588746, "step": 4540 }, { "epoch": 0.8351708930540243, "grad_norm": 1.0075829456004026, "learning_rate": 8.26167534563163e-06, "loss": 0.7226, "mean_token_accuracy": 0.7793241381645203, "step": 4545 }, { "epoch": 0.8360896729143698, "grad_norm": 0.9667343292842628, "learning_rate": 8.226307293930038e-06, "loss": 0.6909, "mean_token_accuracy": 0.7852010130882263, "step": 4550 }, { "epoch": 0.8370084527747151, "grad_norm": 1.123375486497552, "learning_rate": 8.191117230459137e-06, "loss": 0.7471, "mean_token_accuracy": 0.7665111303329468, "step": 4555 }, { "epoch": 0.8379272326350606, "grad_norm": 1.0938857594365514, "learning_rate": 8.156105480191279e-06, "loss": 0.7277, "mean_token_accuracy": 0.7771796703338623, "step": 4560 }, { "epoch": 0.8388460124954061, "grad_norm": 1.1106743041566718, "learning_rate": 8.12127236645213e-06, "loss": 0.686, "mean_token_accuracy": 0.7906163096427917, "step": 4565 }, { "epoch": 0.8397647923557515, "grad_norm": 1.0123275172064825, "learning_rate": 8.08661821091768e-06, "loss": 0.6952, "mean_token_accuracy": 0.7839462161064148, "step": 4570 }, { "epoch": 0.840683572216097, "grad_norm": 0.9381954109006908, "learning_rate": 8.052143333611299e-06, "loss": 0.6343, "mean_token_accuracy": 0.8027025938034058, "step": 4575 }, { "epoch": 0.8416023520764425, "grad_norm": 1.0894046730967675, "learning_rate": 8.017848052900732e-06, "loss": 0.7705, "mean_token_accuracy": 0.7612823724746705, "step": 4580 }, { "epoch": 0.8425211319367879, "grad_norm": 0.8738132153298477, "learning_rate": 7.983732685495216e-06, "loss": 0.6674, "mean_token_accuracy": 0.7931976318359375, "step": 4585 }, { "epoch": 0.8434399117971334, "grad_norm": 0.9130249915825608, "learning_rate": 7.94979754644252e-06, "loss": 0.7047, "mean_token_accuracy": 0.780854594707489, "step": 4590 }, { "epoch": 0.8443586916574789, "grad_norm": 0.8969413048245845, "learning_rate": 7.91604294912604e-06, "loss": 0.6926, "mean_token_accuracy": 0.7844374537467956, "step": 4595 }, { "epoch": 0.8452774715178243, "grad_norm": 0.9969218679518915, "learning_rate": 7.882469205261912e-06, "loss": 0.725, "mean_token_accuracy": 0.7759457588195801, "step": 4600 }, { "epoch": 0.8461962513781698, "grad_norm": 0.9327495309294612, "learning_rate": 7.849076624896148e-06, "loss": 0.7338, "mean_token_accuracy": 0.7735802173614502, "step": 4605 }, { "epoch": 0.8471150312385153, "grad_norm": 1.0765849478605605, "learning_rate": 7.815865516401724e-06, "loss": 0.7843, "mean_token_accuracy": 0.7614648342132568, "step": 4610 }, { "epoch": 0.8480338110988607, "grad_norm": 1.055107377021268, "learning_rate": 7.782836186475787e-06, "loss": 0.7252, "mean_token_accuracy": 0.7763318181037903, "step": 4615 }, { "epoch": 0.8489525909592062, "grad_norm": 0.9991555504178357, "learning_rate": 7.749988940136794e-06, "loss": 0.7624, "mean_token_accuracy": 0.7689498424530029, "step": 4620 }, { "epoch": 0.8498713708195517, "grad_norm": 0.9862061842467479, "learning_rate": 7.717324080721698e-06, "loss": 0.7724, "mean_token_accuracy": 0.7633493065834045, "step": 4625 }, { "epoch": 0.850790150679897, "grad_norm": 0.9444771429350269, "learning_rate": 7.684841909883153e-06, "loss": 0.736, "mean_token_accuracy": 0.772719144821167, "step": 4630 }, { "epoch": 0.8517089305402425, "grad_norm": 0.9384955588777603, "learning_rate": 7.652542727586722e-06, "loss": 0.732, "mean_token_accuracy": 0.7758396387100219, "step": 4635 }, { "epoch": 0.852627710400588, "grad_norm": 0.9275047945197651, "learning_rate": 7.620426832108114e-06, "loss": 0.6528, "mean_token_accuracy": 0.7958010911941529, "step": 4640 }, { "epoch": 0.8535464902609334, "grad_norm": 1.0403097729529958, "learning_rate": 7.588494520030422e-06, "loss": 0.6619, "mean_token_accuracy": 0.7915726184844971, "step": 4645 }, { "epoch": 0.8544652701212789, "grad_norm": 0.947341642816108, "learning_rate": 7.556746086241387e-06, "loss": 0.731, "mean_token_accuracy": 0.775021767616272, "step": 4650 }, { "epoch": 0.8553840499816244, "grad_norm": 0.920825040895765, "learning_rate": 7.52518182393068e-06, "loss": 0.7206, "mean_token_accuracy": 0.7786232590675354, "step": 4655 }, { "epoch": 0.8563028298419698, "grad_norm": 1.00451196504111, "learning_rate": 7.493802024587182e-06, "loss": 0.7028, "mean_token_accuracy": 0.7852194666862488, "step": 4660 }, { "epoch": 0.8572216097023153, "grad_norm": 1.0502227606850303, "learning_rate": 7.4626069779963044e-06, "loss": 0.7102, "mean_token_accuracy": 0.7817628145217895, "step": 4665 }, { "epoch": 0.8581403895626608, "grad_norm": 0.9506022079001433, "learning_rate": 7.431596972237313e-06, "loss": 0.7541, "mean_token_accuracy": 0.7674841046333313, "step": 4670 }, { "epoch": 0.8590591694230062, "grad_norm": 0.8995410623765239, "learning_rate": 7.400772293680655e-06, "loss": 0.6585, "mean_token_accuracy": 0.7939380526542663, "step": 4675 }, { "epoch": 0.8599779492833517, "grad_norm": 1.0524188630602864, "learning_rate": 7.370133226985324e-06, "loss": 0.7053, "mean_token_accuracy": 0.7818097829818725, "step": 4680 }, { "epoch": 0.8608967291436972, "grad_norm": 1.0628395588693211, "learning_rate": 7.339680055096238e-06, "loss": 0.7268, "mean_token_accuracy": 0.7778566598892211, "step": 4685 }, { "epoch": 0.8618155090040426, "grad_norm": 1.0409171879354133, "learning_rate": 7.3094130592416e-06, "loss": 0.7509, "mean_token_accuracy": 0.7721391081809997, "step": 4690 }, { "epoch": 0.8627342888643881, "grad_norm": 0.9450810576486318, "learning_rate": 7.279332518930333e-06, "loss": 0.7354, "mean_token_accuracy": 0.7757332921028137, "step": 4695 }, { "epoch": 0.8636530687247336, "grad_norm": 1.005547743201597, "learning_rate": 7.24943871194949e-06, "loss": 0.7123, "mean_token_accuracy": 0.784231448173523, "step": 4700 }, { "epoch": 0.864571848585079, "grad_norm": 1.0321990207147438, "learning_rate": 7.219731914361673e-06, "loss": 0.7119, "mean_token_accuracy": 0.7776958227157593, "step": 4705 }, { "epoch": 0.8654906284454245, "grad_norm": 1.034071085635404, "learning_rate": 7.190212400502496e-06, "loss": 0.6915, "mean_token_accuracy": 0.7870635032653809, "step": 4710 }, { "epoch": 0.86640940830577, "grad_norm": 0.9373543843862668, "learning_rate": 7.160880442978049e-06, "loss": 0.6896, "mean_token_accuracy": 0.7847368240356445, "step": 4715 }, { "epoch": 0.8673281881661153, "grad_norm": 0.9200571177499256, "learning_rate": 7.131736312662385e-06, "loss": 0.7087, "mean_token_accuracy": 0.7825104236602783, "step": 4720 }, { "epoch": 0.8682469680264608, "grad_norm": 0.9546006111783019, "learning_rate": 7.1027802786950064e-06, "loss": 0.803, "mean_token_accuracy": 0.7619799256324769, "step": 4725 }, { "epoch": 0.8691657478868063, "grad_norm": 1.0153027436586384, "learning_rate": 7.074012608478406e-06, "loss": 0.745, "mean_token_accuracy": 0.771528446674347, "step": 4730 }, { "epoch": 0.8700845277471518, "grad_norm": 0.9597117720266102, "learning_rate": 7.04543356767556e-06, "loss": 0.6761, "mean_token_accuracy": 0.7918476939201355, "step": 4735 }, { "epoch": 0.8710033076074972, "grad_norm": 0.978172489455684, "learning_rate": 7.0170434202075115e-06, "loss": 0.7295, "mean_token_accuracy": 0.7755934953689575, "step": 4740 }, { "epoch": 0.8719220874678427, "grad_norm": 0.9901049864461209, "learning_rate": 6.9888424282508955e-06, "loss": 0.6808, "mean_token_accuracy": 0.7899694681167603, "step": 4745 }, { "epoch": 0.8728408673281882, "grad_norm": 0.9937024532653225, "learning_rate": 6.960830852235556e-06, "loss": 0.7051, "mean_token_accuracy": 0.784772801399231, "step": 4750 }, { "epoch": 0.8737596471885336, "grad_norm": 0.9976292512478198, "learning_rate": 6.9330089508421125e-06, "loss": 0.7258, "mean_token_accuracy": 0.778191328048706, "step": 4755 }, { "epoch": 0.8746784270488791, "grad_norm": 1.0403981175236854, "learning_rate": 6.905376980999588e-06, "loss": 0.7431, "mean_token_accuracy": 0.7714271783828736, "step": 4760 }, { "epoch": 0.8755972069092246, "grad_norm": 1.0581186557788087, "learning_rate": 6.877935197883034e-06, "loss": 0.712, "mean_token_accuracy": 0.782374131679535, "step": 4765 }, { "epoch": 0.87651598676957, "grad_norm": 0.9127662765794763, "learning_rate": 6.85068385491116e-06, "loss": 0.6783, "mean_token_accuracy": 0.7902564287185669, "step": 4770 }, { "epoch": 0.8774347666299155, "grad_norm": 1.0211009545118401, "learning_rate": 6.823623203744009e-06, "loss": 0.7424, "mean_token_accuracy": 0.7728718996047974, "step": 4775 }, { "epoch": 0.878353546490261, "grad_norm": 0.9378238243616713, "learning_rate": 6.796753494280624e-06, "loss": 0.6775, "mean_token_accuracy": 0.7908179044723511, "step": 4780 }, { "epoch": 0.8792723263506064, "grad_norm": 1.0104116736997681, "learning_rate": 6.770074974656751e-06, "loss": 0.6963, "mean_token_accuracy": 0.7875288009643555, "step": 4785 }, { "epoch": 0.8801911062109519, "grad_norm": 0.9673097547324119, "learning_rate": 6.743587891242536e-06, "loss": 0.7006, "mean_token_accuracy": 0.7846636652946473, "step": 4790 }, { "epoch": 0.8811098860712974, "grad_norm": 1.0556257979311925, "learning_rate": 6.717292488640256e-06, "loss": 0.8204, "mean_token_accuracy": 0.7494418621063232, "step": 4795 }, { "epoch": 0.8820286659316428, "grad_norm": 0.9770128218279439, "learning_rate": 6.691189009682059e-06, "loss": 0.6983, "mean_token_accuracy": 0.787074613571167, "step": 4800 }, { "epoch": 0.8829474457919883, "grad_norm": 0.8883173557701798, "learning_rate": 6.665277695427717e-06, "loss": 0.7084, "mean_token_accuracy": 0.7833673834800721, "step": 4805 }, { "epoch": 0.8838662256523337, "grad_norm": 0.9692250808371157, "learning_rate": 6.63955878516241e-06, "loss": 0.6823, "mean_token_accuracy": 0.7903570294380188, "step": 4810 }, { "epoch": 0.8847850055126791, "grad_norm": 0.8810560098191498, "learning_rate": 6.614032516394509e-06, "loss": 0.6837, "mean_token_accuracy": 0.7892964124679566, "step": 4815 }, { "epoch": 0.8857037853730246, "grad_norm": 0.9613974496107343, "learning_rate": 6.588699124853379e-06, "loss": 0.7171, "mean_token_accuracy": 0.7778627634048462, "step": 4820 }, { "epoch": 0.8866225652333701, "grad_norm": 0.9155720741178934, "learning_rate": 6.563558844487215e-06, "loss": 0.6238, "mean_token_accuracy": 0.806905460357666, "step": 4825 }, { "epoch": 0.8875413450937155, "grad_norm": 0.8747125126149234, "learning_rate": 6.538611907460866e-06, "loss": 0.6377, "mean_token_accuracy": 0.8006066799163818, "step": 4830 }, { "epoch": 0.888460124954061, "grad_norm": 1.007447868920518, "learning_rate": 6.513858544153706e-06, "loss": 0.7043, "mean_token_accuracy": 0.7833499908447266, "step": 4835 }, { "epoch": 0.8893789048144065, "grad_norm": 0.9255281703679149, "learning_rate": 6.48929898315749e-06, "loss": 0.6973, "mean_token_accuracy": 0.782793152332306, "step": 4840 }, { "epoch": 0.8902976846747519, "grad_norm": 0.9388975020879022, "learning_rate": 6.464933451274261e-06, "loss": 0.6256, "mean_token_accuracy": 0.8049848675727844, "step": 4845 }, { "epoch": 0.8912164645350974, "grad_norm": 1.012760049959928, "learning_rate": 6.440762173514238e-06, "loss": 0.7309, "mean_token_accuracy": 0.7747970938682556, "step": 4850 }, { "epoch": 0.8921352443954429, "grad_norm": 0.9633483639424572, "learning_rate": 6.416785373093756e-06, "loss": 0.7864, "mean_token_accuracy": 0.7609822034835816, "step": 4855 }, { "epoch": 0.8930540242557883, "grad_norm": 0.9375903231729885, "learning_rate": 6.39300327143319e-06, "loss": 0.7045, "mean_token_accuracy": 0.7813974022865295, "step": 4860 }, { "epoch": 0.8939728041161338, "grad_norm": 0.9116788058926382, "learning_rate": 6.369416088154917e-06, "loss": 0.7031, "mean_token_accuracy": 0.7866278529167176, "step": 4865 }, { "epoch": 0.8948915839764793, "grad_norm": 0.8514994068032359, "learning_rate": 6.346024041081286e-06, "loss": 0.6053, "mean_token_accuracy": 0.8105675458908081, "step": 4870 }, { "epoch": 0.8958103638368247, "grad_norm": 1.0010033445452702, "learning_rate": 6.32282734623261e-06, "loss": 0.7069, "mean_token_accuracy": 0.7806268095970154, "step": 4875 }, { "epoch": 0.8967291436971702, "grad_norm": 0.9335886122342348, "learning_rate": 6.299826217825156e-06, "loss": 0.7464, "mean_token_accuracy": 0.770034921169281, "step": 4880 }, { "epoch": 0.8976479235575157, "grad_norm": 0.9307585065246715, "learning_rate": 6.277020868269191e-06, "loss": 0.7473, "mean_token_accuracy": 0.7723272204399109, "step": 4885 }, { "epoch": 0.898566703417861, "grad_norm": 0.9507339920147042, "learning_rate": 6.254411508167009e-06, "loss": 0.7498, "mean_token_accuracy": 0.7717328667640686, "step": 4890 }, { "epoch": 0.8994854832782065, "grad_norm": 0.9826282972840381, "learning_rate": 6.23199834631098e-06, "loss": 0.7964, "mean_token_accuracy": 0.7543912172317505, "step": 4895 }, { "epoch": 0.900404263138552, "grad_norm": 0.9073522447780296, "learning_rate": 6.2097815896816306e-06, "loss": 0.7151, "mean_token_accuracy": 0.7821811556816101, "step": 4900 }, { "epoch": 0.9013230429988974, "grad_norm": 1.0820009074965473, "learning_rate": 6.187761443445719e-06, "loss": 0.7168, "mean_token_accuracy": 0.779189658164978, "step": 4905 }, { "epoch": 0.9022418228592429, "grad_norm": 1.0031779458182357, "learning_rate": 6.165938110954365e-06, "loss": 0.7316, "mean_token_accuracy": 0.7749498963356019, "step": 4910 }, { "epoch": 0.9031606027195884, "grad_norm": 0.9350157804822582, "learning_rate": 6.144311793741147e-06, "loss": 0.7289, "mean_token_accuracy": 0.7790675520896911, "step": 4915 }, { "epoch": 0.9040793825799338, "grad_norm": 0.9268769698711982, "learning_rate": 6.122882691520254e-06, "loss": 0.7369, "mean_token_accuracy": 0.7718736052513122, "step": 4920 }, { "epoch": 0.9049981624402793, "grad_norm": 0.9014435353019561, "learning_rate": 6.101651002184649e-06, "loss": 0.7007, "mean_token_accuracy": 0.7885142803192139, "step": 4925 }, { "epoch": 0.9059169423006248, "grad_norm": 1.1124655246934698, "learning_rate": 6.0806169218042185e-06, "loss": 0.7541, "mean_token_accuracy": 0.7671143889427186, "step": 4930 }, { "epoch": 0.9068357221609702, "grad_norm": 0.941789239472656, "learning_rate": 6.0597806446239775e-06, "loss": 0.6182, "mean_token_accuracy": 0.8084997653961181, "step": 4935 }, { "epoch": 0.9077545020213157, "grad_norm": 0.9986182896305318, "learning_rate": 6.039142363062271e-06, "loss": 0.6456, "mean_token_accuracy": 0.8005677580833435, "step": 4940 }, { "epoch": 0.9086732818816612, "grad_norm": 0.9291170901784223, "learning_rate": 6.018702267709008e-06, "loss": 0.7112, "mean_token_accuracy": 0.7811415076255799, "step": 4945 }, { "epoch": 0.9095920617420066, "grad_norm": 1.0595420029395017, "learning_rate": 5.998460547323881e-06, "loss": 0.7741, "mean_token_accuracy": 0.7643965363502503, "step": 4950 }, { "epoch": 0.9105108416023521, "grad_norm": 1.086020272651734, "learning_rate": 5.978417388834642e-06, "loss": 0.8087, "mean_token_accuracy": 0.754006028175354, "step": 4955 }, { "epoch": 0.9114296214626976, "grad_norm": 1.0314753490097466, "learning_rate": 5.958572977335365e-06, "loss": 0.647, "mean_token_accuracy": 0.8025306582450866, "step": 4960 }, { "epoch": 0.912348401323043, "grad_norm": 0.8858957049531258, "learning_rate": 5.93892749608474e-06, "loss": 0.6734, "mean_token_accuracy": 0.789763331413269, "step": 4965 }, { "epoch": 0.9132671811833885, "grad_norm": 0.9228304919571796, "learning_rate": 5.919481126504383e-06, "loss": 0.6979, "mean_token_accuracy": 0.7835509300231933, "step": 4970 }, { "epoch": 0.914185961043734, "grad_norm": 0.9172131766669608, "learning_rate": 5.900234048177156e-06, "loss": 0.7468, "mean_token_accuracy": 0.7716853857040405, "step": 4975 }, { "epoch": 0.9151047409040793, "grad_norm": 0.9717495119840875, "learning_rate": 5.881186438845511e-06, "loss": 0.6534, "mean_token_accuracy": 0.7953248977661133, "step": 4980 }, { "epoch": 0.9160235207644248, "grad_norm": 0.9387494667741174, "learning_rate": 5.862338474409852e-06, "loss": 0.7276, "mean_token_accuracy": 0.7760698676109314, "step": 4985 }, { "epoch": 0.9169423006247703, "grad_norm": 0.9759164760125104, "learning_rate": 5.843690328926905e-06, "loss": 0.7429, "mean_token_accuracy": 0.7714617133140564, "step": 4990 }, { "epoch": 0.9178610804851157, "grad_norm": 0.9934351587205096, "learning_rate": 5.825242174608107e-06, "loss": 0.7111, "mean_token_accuracy": 0.7826705813407898, "step": 4995 }, { "epoch": 0.9187798603454612, "grad_norm": 0.9923527821765938, "learning_rate": 5.8069941818180335e-06, "loss": 0.6332, "mean_token_accuracy": 0.8037675261497498, "step": 5000 }, { "epoch": 0.9196986402058067, "grad_norm": 1.0549322241395105, "learning_rate": 5.788946519072802e-06, "loss": 0.7442, "mean_token_accuracy": 0.7685003876686096, "step": 5005 }, { "epoch": 0.9206174200661521, "grad_norm": 0.9976060100023064, "learning_rate": 5.771099353038532e-06, "loss": 0.7271, "mean_token_accuracy": 0.7736078143119812, "step": 5010 }, { "epoch": 0.9215361999264976, "grad_norm": 0.9383299705731176, "learning_rate": 5.7534528485298e-06, "loss": 0.725, "mean_token_accuracy": 0.7763983011245728, "step": 5015 }, { "epoch": 0.9224549797868431, "grad_norm": 0.963672340776997, "learning_rate": 5.736007168508121e-06, "loss": 0.6831, "mean_token_accuracy": 0.7851462960243225, "step": 5020 }, { "epoch": 0.9233737596471885, "grad_norm": 0.9469619269658613, "learning_rate": 5.7187624740804345e-06, "loss": 0.7275, "mean_token_accuracy": 0.7783573985099792, "step": 5025 }, { "epoch": 0.924292539507534, "grad_norm": 0.9984975559017129, "learning_rate": 5.701718924497633e-06, "loss": 0.7006, "mean_token_accuracy": 0.786095142364502, "step": 5030 }, { "epoch": 0.9252113193678795, "grad_norm": 0.9976875410663416, "learning_rate": 5.684876677153069e-06, "loss": 0.78, "mean_token_accuracy": 0.7615157961845398, "step": 5035 }, { "epoch": 0.9261300992282249, "grad_norm": 0.9677019035350851, "learning_rate": 5.668235887581126e-06, "loss": 0.7171, "mean_token_accuracy": 0.7806509256362915, "step": 5040 }, { "epoch": 0.9270488790885704, "grad_norm": 1.021088055495027, "learning_rate": 5.651796709455757e-06, "loss": 0.7329, "mean_token_accuracy": 0.7730448007583618, "step": 5045 }, { "epoch": 0.9279676589489159, "grad_norm": 1.0552382975961647, "learning_rate": 5.6355592945890934e-06, "loss": 0.6811, "mean_token_accuracy": 0.786517608165741, "step": 5050 }, { "epoch": 0.9288864388092613, "grad_norm": 1.0701013175318246, "learning_rate": 5.619523792930021e-06, "loss": 0.7371, "mean_token_accuracy": 0.7725589275360107, "step": 5055 }, { "epoch": 0.9298052186696067, "grad_norm": 0.9720134878474826, "learning_rate": 5.6036903525627975e-06, "loss": 0.6481, "mean_token_accuracy": 0.8003619313240051, "step": 5060 }, { "epoch": 0.9307239985299522, "grad_norm": 1.129850002758599, "learning_rate": 5.588059119705699e-06, "loss": 0.753, "mean_token_accuracy": 0.7675109386444092, "step": 5065 }, { "epoch": 0.9316427783902976, "grad_norm": 0.9756659000260551, "learning_rate": 5.5726302387096506e-06, "loss": 0.7282, "mean_token_accuracy": 0.7749423146247864, "step": 5070 }, { "epoch": 0.9325615582506431, "grad_norm": 0.9956804499004365, "learning_rate": 5.557403852056914e-06, "loss": 0.7106, "mean_token_accuracy": 0.7811750769615173, "step": 5075 }, { "epoch": 0.9334803381109886, "grad_norm": 0.9560365954578341, "learning_rate": 5.542380100359751e-06, "loss": 0.71, "mean_token_accuracy": 0.7802268862724304, "step": 5080 }, { "epoch": 0.934399117971334, "grad_norm": 0.9454250302290028, "learning_rate": 5.527559122359145e-06, "loss": 0.6968, "mean_token_accuracy": 0.7844350814819336, "step": 5085 }, { "epoch": 0.9353178978316795, "grad_norm": 1.0545713849704708, "learning_rate": 5.512941054923507e-06, "loss": 0.7085, "mean_token_accuracy": 0.7803709745407105, "step": 5090 }, { "epoch": 0.936236677692025, "grad_norm": 1.103412663642094, "learning_rate": 5.498526033047404e-06, "loss": 0.726, "mean_token_accuracy": 0.7740720987319947, "step": 5095 }, { "epoch": 0.9371554575523704, "grad_norm": 1.0018246605663172, "learning_rate": 5.484314189850335e-06, "loss": 0.6914, "mean_token_accuracy": 0.7853707432746887, "step": 5100 }, { "epoch": 0.9380742374127159, "grad_norm": 0.9292037350722813, "learning_rate": 5.470305656575487e-06, "loss": 0.6809, "mean_token_accuracy": 0.7880851745605468, "step": 5105 }, { "epoch": 0.9389930172730614, "grad_norm": 0.942507726345383, "learning_rate": 5.45650056258852e-06, "loss": 0.6444, "mean_token_accuracy": 0.8017876148223877, "step": 5110 }, { "epoch": 0.9399117971334069, "grad_norm": 0.950473741974958, "learning_rate": 5.442899035376386e-06, "loss": 0.6918, "mean_token_accuracy": 0.7830354809761048, "step": 5115 }, { "epoch": 0.9408305769937523, "grad_norm": 1.0328707402514066, "learning_rate": 5.429501200546137e-06, "loss": 0.6809, "mean_token_accuracy": 0.7883656024932861, "step": 5120 }, { "epoch": 0.9417493568540978, "grad_norm": 0.9106187724534776, "learning_rate": 5.416307181823773e-06, "loss": 0.6529, "mean_token_accuracy": 0.7979433417320252, "step": 5125 }, { "epoch": 0.9426681367144433, "grad_norm": 0.8301715826585289, "learning_rate": 5.403317101053101e-06, "loss": 0.6319, "mean_token_accuracy": 0.8041222810745239, "step": 5130 }, { "epoch": 0.9435869165747887, "grad_norm": 0.9401314468555536, "learning_rate": 5.3905310781946005e-06, "loss": 0.7681, "mean_token_accuracy": 0.7674003601074219, "step": 5135 }, { "epoch": 0.9445056964351342, "grad_norm": 0.9634667604340411, "learning_rate": 5.377949231324331e-06, "loss": 0.6745, "mean_token_accuracy": 0.7905578970909118, "step": 5140 }, { "epoch": 0.9454244762954797, "grad_norm": 0.895622215360212, "learning_rate": 5.3655716766328235e-06, "loss": 0.678, "mean_token_accuracy": 0.7900139689445496, "step": 5145 }, { "epoch": 0.946343256155825, "grad_norm": 0.9199451561014395, "learning_rate": 5.353398528424019e-06, "loss": 0.6503, "mean_token_accuracy": 0.8004558086395264, "step": 5150 }, { "epoch": 0.9472620360161705, "grad_norm": 1.0795462562761053, "learning_rate": 5.341429899114216e-06, "loss": 0.796, "mean_token_accuracy": 0.7549399971961975, "step": 5155 }, { "epoch": 0.948180815876516, "grad_norm": 1.0593495804757314, "learning_rate": 5.3296658992310215e-06, "loss": 0.7738, "mean_token_accuracy": 0.7613824367523193, "step": 5160 }, { "epoch": 0.9490995957368614, "grad_norm": 0.87159278381413, "learning_rate": 5.318106637412333e-06, "loss": 0.7275, "mean_token_accuracy": 0.7751541256904602, "step": 5165 }, { "epoch": 0.9500183755972069, "grad_norm": 0.9358627934311341, "learning_rate": 5.306752220405349e-06, "loss": 0.6959, "mean_token_accuracy": 0.7841734409332275, "step": 5170 }, { "epoch": 0.9509371554575524, "grad_norm": 1.046481143355732, "learning_rate": 5.295602753065557e-06, "loss": 0.7627, "mean_token_accuracy": 0.7681636333465576, "step": 5175 }, { "epoch": 0.9518559353178978, "grad_norm": 0.9759465901582869, "learning_rate": 5.284658338355793e-06, "loss": 0.6722, "mean_token_accuracy": 0.7921370148658753, "step": 5180 }, { "epoch": 0.9527747151782433, "grad_norm": 0.9867516760566891, "learning_rate": 5.27391907734527e-06, "loss": 0.6645, "mean_token_accuracy": 0.7929692029953003, "step": 5185 }, { "epoch": 0.9536934950385888, "grad_norm": 1.0576850268434166, "learning_rate": 5.263385069208657e-06, "loss": 0.6768, "mean_token_accuracy": 0.7890636920928955, "step": 5190 }, { "epoch": 0.9546122748989342, "grad_norm": 0.9776114467729704, "learning_rate": 5.253056411225155e-06, "loss": 0.742, "mean_token_accuracy": 0.7734999060630798, "step": 5195 }, { "epoch": 0.9555310547592797, "grad_norm": 1.0194931640790486, "learning_rate": 5.242933198777612e-06, "loss": 0.7576, "mean_token_accuracy": 0.7681198120117188, "step": 5200 }, { "epoch": 0.9564498346196252, "grad_norm": 1.0129376817436055, "learning_rate": 5.233015525351615e-06, "loss": 0.7419, "mean_token_accuracy": 0.7743308544158936, "step": 5205 }, { "epoch": 0.9573686144799706, "grad_norm": 1.0893813952165832, "learning_rate": 5.223303482534663e-06, "loss": 0.7326, "mean_token_accuracy": 0.7767649531364441, "step": 5210 }, { "epoch": 0.9582873943403161, "grad_norm": 0.9505887964318166, "learning_rate": 5.213797160015287e-06, "loss": 0.6831, "mean_token_accuracy": 0.7896853566169739, "step": 5215 }, { "epoch": 0.9592061742006616, "grad_norm": 0.9619901073178158, "learning_rate": 5.204496645582251e-06, "loss": 0.6331, "mean_token_accuracy": 0.8018953204154968, "step": 5220 }, { "epoch": 0.960124954061007, "grad_norm": 0.9557772773728191, "learning_rate": 5.195402025123713e-06, "loss": 0.7335, "mean_token_accuracy": 0.7740659594535828, "step": 5225 }, { "epoch": 0.9610437339213524, "grad_norm": 1.0297793279179666, "learning_rate": 5.18651338262646e-06, "loss": 0.7168, "mean_token_accuracy": 0.7795220851898194, "step": 5230 }, { "epoch": 0.961962513781698, "grad_norm": 1.2404081410589938, "learning_rate": 5.177830800175107e-06, "loss": 0.8105, "mean_token_accuracy": 0.7509904742240906, "step": 5235 }, { "epoch": 0.9628812936420433, "grad_norm": 0.9134300813748663, "learning_rate": 5.169354357951361e-06, "loss": 0.6651, "mean_token_accuracy": 0.7976749420166016, "step": 5240 }, { "epoch": 0.9638000735023888, "grad_norm": 1.0273400541069664, "learning_rate": 5.161084134233264e-06, "loss": 0.7448, "mean_token_accuracy": 0.7694393396377563, "step": 5245 }, { "epoch": 0.9647188533627343, "grad_norm": 0.8905853308436543, "learning_rate": 5.153020205394477e-06, "loss": 0.7163, "mean_token_accuracy": 0.7769248247146606, "step": 5250 }, { "epoch": 0.9656376332230797, "grad_norm": 0.9529346136037607, "learning_rate": 5.145162645903574e-06, "loss": 0.761, "mean_token_accuracy": 0.7652369141578674, "step": 5255 }, { "epoch": 0.9665564130834252, "grad_norm": 0.9650979477578159, "learning_rate": 5.1375115283233555e-06, "loss": 0.6776, "mean_token_accuracy": 0.7902606129646301, "step": 5260 }, { "epoch": 0.9674751929437707, "grad_norm": 0.9641271433925336, "learning_rate": 5.130066923310179e-06, "loss": 0.7631, "mean_token_accuracy": 0.7635928511619567, "step": 5265 }, { "epoch": 0.9683939728041161, "grad_norm": 0.9213872430466661, "learning_rate": 5.122828899613301e-06, "loss": 0.7167, "mean_token_accuracy": 0.7826451182365417, "step": 5270 }, { "epoch": 0.9693127526644616, "grad_norm": 0.944684513184743, "learning_rate": 5.115797524074245e-06, "loss": 0.7028, "mean_token_accuracy": 0.7823728322982788, "step": 5275 }, { "epoch": 0.9702315325248071, "grad_norm": 0.933365103880651, "learning_rate": 5.108972861626195e-06, "loss": 0.675, "mean_token_accuracy": 0.7928666949272156, "step": 5280 }, { "epoch": 0.9711503123851525, "grad_norm": 0.9439434977514946, "learning_rate": 5.102354975293371e-06, "loss": 0.737, "mean_token_accuracy": 0.7746310830116272, "step": 5285 }, { "epoch": 0.972069092245498, "grad_norm": 0.9602834611038171, "learning_rate": 5.0959439261904715e-06, "loss": 0.6584, "mean_token_accuracy": 0.7969113111495971, "step": 5290 }, { "epoch": 0.9729878721058435, "grad_norm": 0.9686252030086645, "learning_rate": 5.089739773522099e-06, "loss": 0.7185, "mean_token_accuracy": 0.7791404247283935, "step": 5295 }, { "epoch": 0.9739066519661889, "grad_norm": 0.9606975967780023, "learning_rate": 5.083742574582211e-06, "loss": 0.6582, "mean_token_accuracy": 0.7957128643989563, "step": 5300 }, { "epoch": 0.9748254318265344, "grad_norm": 0.9508520485094629, "learning_rate": 5.077952384753596e-06, "loss": 0.6683, "mean_token_accuracy": 0.794218647480011, "step": 5305 }, { "epoch": 0.9757442116868799, "grad_norm": 1.019210345399437, "learning_rate": 5.072369257507359e-06, "loss": 0.6832, "mean_token_accuracy": 0.7870172739028931, "step": 5310 }, { "epoch": 0.9766629915472252, "grad_norm": 0.8796036807363417, "learning_rate": 5.066993244402426e-06, "loss": 0.6779, "mean_token_accuracy": 0.7899052858352661, "step": 5315 }, { "epoch": 0.9775817714075707, "grad_norm": 0.9949077138684812, "learning_rate": 5.061824395085075e-06, "loss": 0.7555, "mean_token_accuracy": 0.7675400733947754, "step": 5320 }, { "epoch": 0.9785005512679162, "grad_norm": 1.0386477233819087, "learning_rate": 5.056862757288469e-06, "loss": 0.6976, "mean_token_accuracy": 0.785806167125702, "step": 5325 }, { "epoch": 0.9794193311282616, "grad_norm": 1.1386145757601591, "learning_rate": 5.052108376832222e-06, "loss": 0.7237, "mean_token_accuracy": 0.774617874622345, "step": 5330 }, { "epoch": 0.9803381109886071, "grad_norm": 1.0398368664600088, "learning_rate": 5.04756129762197e-06, "loss": 0.6786, "mean_token_accuracy": 0.7888760805130005, "step": 5335 }, { "epoch": 0.9812568908489526, "grad_norm": 0.8969882432761646, "learning_rate": 5.043221561648972e-06, "loss": 0.7473, "mean_token_accuracy": 0.7688890337944031, "step": 5340 }, { "epoch": 0.982175670709298, "grad_norm": 0.9121065335737262, "learning_rate": 5.039089208989717e-06, "loss": 0.6528, "mean_token_accuracy": 0.8000433087348938, "step": 5345 }, { "epoch": 0.9830944505696435, "grad_norm": 1.0556010790241608, "learning_rate": 5.035164277805552e-06, "loss": 0.6922, "mean_token_accuracy": 0.7864078521728516, "step": 5350 }, { "epoch": 0.984013230429989, "grad_norm": 0.875446950379487, "learning_rate": 5.031446804342338e-06, "loss": 0.6736, "mean_token_accuracy": 0.7885773062705994, "step": 5355 }, { "epoch": 0.9849320102903344, "grad_norm": 1.0577726392343791, "learning_rate": 5.027936822930111e-06, "loss": 0.6329, "mean_token_accuracy": 0.8035769701004029, "step": 5360 }, { "epoch": 0.9858507901506799, "grad_norm": 0.9114969331526765, "learning_rate": 5.024634365982759e-06, "loss": 0.6764, "mean_token_accuracy": 0.7900681734085083, "step": 5365 }, { "epoch": 0.9867695700110254, "grad_norm": 1.0562487370376004, "learning_rate": 5.021539463997731e-06, "loss": 0.7614, "mean_token_accuracy": 0.7608750462532043, "step": 5370 }, { "epoch": 0.9876883498713708, "grad_norm": 0.9772893925508611, "learning_rate": 5.018652145555758e-06, "loss": 0.7183, "mean_token_accuracy": 0.780507218837738, "step": 5375 }, { "epoch": 0.9886071297317163, "grad_norm": 0.9153996098402806, "learning_rate": 5.015972437320575e-06, "loss": 0.6912, "mean_token_accuracy": 0.7845159888267517, "step": 5380 }, { "epoch": 0.9895259095920618, "grad_norm": 0.9730616558454289, "learning_rate": 5.013500364038685e-06, "loss": 0.7219, "mean_token_accuracy": 0.7788301348686218, "step": 5385 }, { "epoch": 0.9904446894524072, "grad_norm": 1.0739524554988746, "learning_rate": 5.011235948539137e-06, "loss": 0.7236, "mean_token_accuracy": 0.7729606032371521, "step": 5390 }, { "epoch": 0.9913634693127527, "grad_norm": 0.9269482984472148, "learning_rate": 5.00917921173329e-06, "loss": 0.6858, "mean_token_accuracy": 0.7901342034339904, "step": 5395 }, { "epoch": 0.9922822491730982, "grad_norm": 1.0961849910542936, "learning_rate": 5.007330172614658e-06, "loss": 0.7486, "mean_token_accuracy": 0.7710240960121155, "step": 5400 }, { "epoch": 0.9932010290334435, "grad_norm": 0.9749225295480475, "learning_rate": 5.005688848258695e-06, "loss": 0.6663, "mean_token_accuracy": 0.7941651105880737, "step": 5405 }, { "epoch": 0.994119808893789, "grad_norm": 0.9260821925718788, "learning_rate": 5.004255253822668e-06, "loss": 0.6861, "mean_token_accuracy": 0.7862310886383057, "step": 5410 }, { "epoch": 0.9950385887541345, "grad_norm": 1.088283597268154, "learning_rate": 5.0030294025454985e-06, "loss": 0.7656, "mean_token_accuracy": 0.7660502552986145, "step": 5415 }, { "epoch": 0.9959573686144799, "grad_norm": 0.9441714298335138, "learning_rate": 5.002011305747647e-06, "loss": 0.6971, "mean_token_accuracy": 0.7839124202728271, "step": 5420 }, { "epoch": 0.9968761484748254, "grad_norm": 1.0363518119067192, "learning_rate": 5.0012009728310115e-06, "loss": 0.7004, "mean_token_accuracy": 0.7838043451309205, "step": 5425 }, { "epoch": 0.9977949283351709, "grad_norm": 0.9601846400078509, "learning_rate": 5.0005984112788325e-06, "loss": 0.7021, "mean_token_accuracy": 0.7850103259086609, "step": 5430 }, { "epoch": 0.9987137081955163, "grad_norm": 1.00552667894893, "learning_rate": 5.0002036266556325e-06, "loss": 0.7154, "mean_token_accuracy": 0.7820982575416565, "step": 5435 }, { "epoch": 0.9996324880558618, "grad_norm": 0.9185989598293809, "learning_rate": 5.000016622607158e-06, "loss": 0.7066, "mean_token_accuracy": 0.7812744855880738, "step": 5440 }, { "epoch": 1.0, "step": 5442, "total_flos": 77507944513536.0, "train_loss": 0.0, "train_runtime": 1.7307, "train_samples_per_second": 12576.308, "train_steps_per_second": 3144.366 } ], "logging_steps": 5, "max_steps": 5442, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 77507944513536.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }