{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5437, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009196247930844216, "grad_norm": 4.087223679622462, "learning_rate": 9.191176470588236e-07, "loss": 1.3446, "mean_token_accuracy": 0.6661458969116211, "step": 5 }, { "epoch": 0.0018392495861688431, "grad_norm": 3.3376471514991324, "learning_rate": 1.8382352941176471e-06, "loss": 1.2534, "mean_token_accuracy": 0.6856188654899598, "step": 10 }, { "epoch": 0.0027588743792532648, "grad_norm": 3.1883807133419646, "learning_rate": 2.7573529411764708e-06, "loss": 1.2495, "mean_token_accuracy": 0.6844112038612366, "step": 15 }, { "epoch": 0.0036784991723376862, "grad_norm": 2.5757356327081826, "learning_rate": 3.6764705882352942e-06, "loss": 1.1962, "mean_token_accuracy": 0.6918170928955079, "step": 20 }, { "epoch": 0.004598123965422108, "grad_norm": 2.3971194855376092, "learning_rate": 4.595588235294118e-06, "loss": 1.2274, "mean_token_accuracy": 0.6844529986381531, "step": 25 }, { "epoch": 0.0055177487585065296, "grad_norm": 2.00434532423879, "learning_rate": 5.5147058823529415e-06, "loss": 1.1506, "mean_token_accuracy": 0.697660756111145, "step": 30 }, { "epoch": 0.006437373551590951, "grad_norm": 2.0663662496595543, "learning_rate": 6.433823529411764e-06, "loss": 1.1278, "mean_token_accuracy": 0.6973050832748413, "step": 35 }, { "epoch": 0.0073569983446753725, "grad_norm": 1.9519049901829761, "learning_rate": 7.3529411764705884e-06, "loss": 1.102, "mean_token_accuracy": 0.7046478033065796, "step": 40 }, { "epoch": 0.008276623137759793, "grad_norm": 1.8451875842176761, "learning_rate": 8.272058823529413e-06, "loss": 1.125, "mean_token_accuracy": 0.6951346158981323, "step": 45 }, { "epoch": 0.009196247930844215, "grad_norm": 2.000034845742239, "learning_rate": 9.191176470588236e-06, "loss": 1.0295, "mean_token_accuracy": 0.7154734015464783, "step": 50 }, { "epoch": 0.010115872723928637, "grad_norm": 1.621484821283711, "learning_rate": 1.011029411764706e-05, "loss": 1.0762, "mean_token_accuracy": 0.706468117237091, "step": 55 }, { "epoch": 0.011035497517013059, "grad_norm": 1.753826025706781, "learning_rate": 1.1029411764705883e-05, "loss": 1.0394, "mean_token_accuracy": 0.7156139016151428, "step": 60 }, { "epoch": 0.011955122310097481, "grad_norm": 1.6505676536191385, "learning_rate": 1.1948529411764707e-05, "loss": 1.0338, "mean_token_accuracy": 0.7132004976272583, "step": 65 }, { "epoch": 0.012874747103181901, "grad_norm": 1.8513933357249144, "learning_rate": 1.2867647058823528e-05, "loss": 0.9804, "mean_token_accuracy": 0.7274341702461242, "step": 70 }, { "epoch": 0.013794371896266323, "grad_norm": 2.4070230665851993, "learning_rate": 1.3786764705882355e-05, "loss": 1.0398, "mean_token_accuracy": 0.7116599082946777, "step": 75 }, { "epoch": 0.014713996689350745, "grad_norm": 1.798866895809756, "learning_rate": 1.4705882352941177e-05, "loss": 0.9922, "mean_token_accuracy": 0.720504081249237, "step": 80 }, { "epoch": 0.015633621482435165, "grad_norm": 1.709611126629724, "learning_rate": 1.5625e-05, "loss": 0.9938, "mean_token_accuracy": 0.7247263193130493, "step": 85 }, { "epoch": 0.016553246275519587, "grad_norm": 1.7626425485303618, "learning_rate": 1.6544117647058825e-05, "loss": 1.0122, "mean_token_accuracy": 0.717292582988739, "step": 90 }, { "epoch": 0.01747287106860401, "grad_norm": 2.036503882503329, "learning_rate": 1.7463235294117647e-05, "loss": 1.0109, "mean_token_accuracy": 0.7172105073928833, "step": 95 }, { "epoch": 0.01839249586168843, "grad_norm": 1.927409741133158, "learning_rate": 1.8382352941176472e-05, "loss": 1.0434, "mean_token_accuracy": 0.7078547954559327, "step": 100 }, { "epoch": 0.019312120654772853, "grad_norm": 2.079665033278075, "learning_rate": 1.9301470588235298e-05, "loss": 0.9959, "mean_token_accuracy": 0.7182355523109436, "step": 105 }, { "epoch": 0.020231745447857274, "grad_norm": 1.8479982769163703, "learning_rate": 2.022058823529412e-05, "loss": 1.0194, "mean_token_accuracy": 0.7173629522323608, "step": 110 }, { "epoch": 0.021151370240941696, "grad_norm": 1.831806807070413, "learning_rate": 2.113970588235294e-05, "loss": 0.9569, "mean_token_accuracy": 0.7312556385993958, "step": 115 }, { "epoch": 0.022070995034026118, "grad_norm": 1.7952413093248756, "learning_rate": 2.2058823529411766e-05, "loss": 1.0149, "mean_token_accuracy": 0.7192024111747741, "step": 120 }, { "epoch": 0.02299061982711054, "grad_norm": 1.6441769080980864, "learning_rate": 2.2977941176470588e-05, "loss": 0.9668, "mean_token_accuracy": 0.7280102610588074, "step": 125 }, { "epoch": 0.023910244620194962, "grad_norm": 1.7182187182460715, "learning_rate": 2.3897058823529413e-05, "loss": 1.025, "mean_token_accuracy": 0.7164386153221131, "step": 130 }, { "epoch": 0.02482986941327938, "grad_norm": 1.7665031820505241, "learning_rate": 2.4816176470588238e-05, "loss": 0.9879, "mean_token_accuracy": 0.7216517567634583, "step": 135 }, { "epoch": 0.025749494206363802, "grad_norm": 1.65781753659198, "learning_rate": 2.5735294117647057e-05, "loss": 1.0204, "mean_token_accuracy": 0.7183511853218079, "step": 140 }, { "epoch": 0.026669118999448224, "grad_norm": 1.5947996494100198, "learning_rate": 2.6654411764705882e-05, "loss": 0.9915, "mean_token_accuracy": 0.7210009098052979, "step": 145 }, { "epoch": 0.027588743792532646, "grad_norm": 1.6195741488866147, "learning_rate": 2.757352941176471e-05, "loss": 0.9609, "mean_token_accuracy": 0.7290344476699829, "step": 150 }, { "epoch": 0.028508368585617068, "grad_norm": 1.700795937176488, "learning_rate": 2.849264705882353e-05, "loss": 1.0017, "mean_token_accuracy": 0.7190845251083374, "step": 155 }, { "epoch": 0.02942799337870149, "grad_norm": 1.6626957868958252, "learning_rate": 2.9411764705882354e-05, "loss": 0.9801, "mean_token_accuracy": 0.7264268517494201, "step": 160 }, { "epoch": 0.03034761817178591, "grad_norm": 1.646176772035618, "learning_rate": 3.0330882352941176e-05, "loss": 0.9819, "mean_token_accuracy": 0.7258347868919373, "step": 165 }, { "epoch": 0.03126724296487033, "grad_norm": 1.7051406597026453, "learning_rate": 3.125e-05, "loss": 1.0021, "mean_token_accuracy": 0.7193678379058838, "step": 170 }, { "epoch": 0.032186867757954755, "grad_norm": 1.6583599673202631, "learning_rate": 3.2169117647058826e-05, "loss": 0.9863, "mean_token_accuracy": 0.7218608260154724, "step": 175 }, { "epoch": 0.033106492551039174, "grad_norm": 1.6811054631655953, "learning_rate": 3.308823529411765e-05, "loss": 0.9776, "mean_token_accuracy": 0.7252245903015136, "step": 180 }, { "epoch": 0.0340261173441236, "grad_norm": 1.6005295960642778, "learning_rate": 3.4007352941176476e-05, "loss": 0.952, "mean_token_accuracy": 0.7300998091697692, "step": 185 }, { "epoch": 0.03494574213720802, "grad_norm": 1.884741061084924, "learning_rate": 3.4926470588235294e-05, "loss": 1.0216, "mean_token_accuracy": 0.7144460439682007, "step": 190 }, { "epoch": 0.03586536693029244, "grad_norm": 1.61333499821342, "learning_rate": 3.584558823529412e-05, "loss": 1.0067, "mean_token_accuracy": 0.7160724878311158, "step": 195 }, { "epoch": 0.03678499172337686, "grad_norm": 1.592957572722435, "learning_rate": 3.6764705882352945e-05, "loss": 0.9367, "mean_token_accuracy": 0.7348474979400634, "step": 200 }, { "epoch": 0.03770461651646129, "grad_norm": 1.7666690880786284, "learning_rate": 3.768382352941176e-05, "loss": 0.9545, "mean_token_accuracy": 0.7297826528549194, "step": 205 }, { "epoch": 0.038624241309545705, "grad_norm": 1.5696177739032589, "learning_rate": 3.8602941176470595e-05, "loss": 1.0076, "mean_token_accuracy": 0.7160616636276245, "step": 210 }, { "epoch": 0.039543866102630124, "grad_norm": 1.5375849975431441, "learning_rate": 3.952205882352941e-05, "loss": 1.0082, "mean_token_accuracy": 0.7139402985572815, "step": 215 }, { "epoch": 0.04046349089571455, "grad_norm": 1.6613621558577687, "learning_rate": 4.044117647058824e-05, "loss": 1.0047, "mean_token_accuracy": 0.7157810091972351, "step": 220 }, { "epoch": 0.04138311568879897, "grad_norm": 1.6712866586887962, "learning_rate": 4.136029411764706e-05, "loss": 0.9841, "mean_token_accuracy": 0.7261144757270813, "step": 225 }, { "epoch": 0.04230274048188339, "grad_norm": 1.5868739813391535, "learning_rate": 4.227941176470588e-05, "loss": 1.0063, "mean_token_accuracy": 0.7146228194236756, "step": 230 }, { "epoch": 0.04322236527496781, "grad_norm": 1.4745940440239442, "learning_rate": 4.319852941176471e-05, "loss": 0.9895, "mean_token_accuracy": 0.7205227255821228, "step": 235 }, { "epoch": 0.044141990068052236, "grad_norm": 1.565812920746474, "learning_rate": 4.411764705882353e-05, "loss": 0.9883, "mean_token_accuracy": 0.7221224546432495, "step": 240 }, { "epoch": 0.045061614861136655, "grad_norm": 1.579279007990175, "learning_rate": 4.503676470588236e-05, "loss": 1.0339, "mean_token_accuracy": 0.7140692472457886, "step": 245 }, { "epoch": 0.04598123965422108, "grad_norm": 1.550674625710887, "learning_rate": 4.5955882352941176e-05, "loss": 1.009, "mean_token_accuracy": 0.717827045917511, "step": 250 }, { "epoch": 0.0469008644473055, "grad_norm": 1.494069442893164, "learning_rate": 4.6875e-05, "loss": 1.0163, "mean_token_accuracy": 0.7157993316650391, "step": 255 }, { "epoch": 0.047820489240389924, "grad_norm": 1.585433590429472, "learning_rate": 4.7794117647058826e-05, "loss": 0.9662, "mean_token_accuracy": 0.7260660767555237, "step": 260 }, { "epoch": 0.04874011403347434, "grad_norm": 1.5561077784742092, "learning_rate": 4.871323529411765e-05, "loss": 1.0521, "mean_token_accuracy": 0.7059531569480896, "step": 265 }, { "epoch": 0.04965973882655876, "grad_norm": 1.3842507274813078, "learning_rate": 4.9632352941176476e-05, "loss": 0.96, "mean_token_accuracy": 0.7317641496658325, "step": 270 }, { "epoch": 0.050579363619643186, "grad_norm": 1.4379239878799341, "learning_rate": 4.999996254118754e-05, "loss": 0.972, "mean_token_accuracy": 0.7297493696212769, "step": 275 }, { "epoch": 0.051498988412727605, "grad_norm": 1.3761784967587591, "learning_rate": 4.999973362667417e-05, "loss": 0.9844, "mean_token_accuracy": 0.724224853515625, "step": 280 }, { "epoch": 0.05241861320581203, "grad_norm": 1.4249636066532947, "learning_rate": 4.999929661021346e-05, "loss": 0.9974, "mean_token_accuracy": 0.7186186075210571, "step": 285 }, { "epoch": 0.05333823799889645, "grad_norm": 1.6467747117004, "learning_rate": 4.9998651495847435e-05, "loss": 1.0296, "mean_token_accuracy": 0.7110173583030701, "step": 290 }, { "epoch": 0.054257862791980874, "grad_norm": 1.3761801455599358, "learning_rate": 4.9997798289542816e-05, "loss": 1.0209, "mean_token_accuracy": 0.7124481081962586, "step": 295 }, { "epoch": 0.05517748758506529, "grad_norm": 1.4585308096786376, "learning_rate": 4.9996736999190965e-05, "loss": 1.0248, "mean_token_accuracy": 0.7100600242614746, "step": 300 }, { "epoch": 0.05609711237814972, "grad_norm": 1.4301378065367794, "learning_rate": 4.999546763460785e-05, "loss": 0.9864, "mean_token_accuracy": 0.7253738522529602, "step": 305 }, { "epoch": 0.057016737171234136, "grad_norm": 1.4586102770676173, "learning_rate": 4.999399020753393e-05, "loss": 0.9541, "mean_token_accuracy": 0.7308779239654541, "step": 310 }, { "epoch": 0.05793636196431856, "grad_norm": 1.5007400960218442, "learning_rate": 4.999230473163406e-05, "loss": 1.0123, "mean_token_accuracy": 0.7142405152320862, "step": 315 }, { "epoch": 0.05885598675740298, "grad_norm": 1.4247385882584611, "learning_rate": 4.999041122249735e-05, "loss": 1.0097, "mean_token_accuracy": 0.7164065957069397, "step": 320 }, { "epoch": 0.0597756115504874, "grad_norm": 1.4338281584111965, "learning_rate": 4.9988309697637025e-05, "loss": 1.0381, "mean_token_accuracy": 0.7093045115470886, "step": 325 }, { "epoch": 0.06069523634357182, "grad_norm": 1.3206321897141915, "learning_rate": 4.9986000176490264e-05, "loss": 1.0378, "mean_token_accuracy": 0.7081658363342285, "step": 330 }, { "epoch": 0.06161486113665624, "grad_norm": 1.4771390057019052, "learning_rate": 4.998348268041803e-05, "loss": 1.0473, "mean_token_accuracy": 0.7044042825698853, "step": 335 }, { "epoch": 0.06253448592974066, "grad_norm": 1.410427294901373, "learning_rate": 4.9980757232704836e-05, "loss": 1.0476, "mean_token_accuracy": 0.7044672727584839, "step": 340 }, { "epoch": 0.06345411072282509, "grad_norm": 1.293731368317575, "learning_rate": 4.997782385855862e-05, "loss": 0.9809, "mean_token_accuracy": 0.7207650065422058, "step": 345 }, { "epoch": 0.06437373551590951, "grad_norm": 1.373213488697433, "learning_rate": 4.9974682585110375e-05, "loss": 1.0238, "mean_token_accuracy": 0.713714337348938, "step": 350 }, { "epoch": 0.06529336030899394, "grad_norm": 1.4173612737543944, "learning_rate": 4.997133344141402e-05, "loss": 0.9995, "mean_token_accuracy": 0.7182128310203553, "step": 355 }, { "epoch": 0.06621298510207835, "grad_norm": 1.4208487527297817, "learning_rate": 4.9967776458446067e-05, "loss": 1.0247, "mean_token_accuracy": 0.7120985150337219, "step": 360 }, { "epoch": 0.06713260989516277, "grad_norm": 1.3468936690832556, "learning_rate": 4.996401166910535e-05, "loss": 1.0257, "mean_token_accuracy": 0.711448609828949, "step": 365 }, { "epoch": 0.0680522346882472, "grad_norm": 1.3418384776624692, "learning_rate": 4.996003910821273e-05, "loss": 0.9908, "mean_token_accuracy": 0.7198069810867309, "step": 370 }, { "epoch": 0.06897185948133161, "grad_norm": 1.2757020291626893, "learning_rate": 4.995585881251076e-05, "loss": 1.0029, "mean_token_accuracy": 0.7165916681289672, "step": 375 }, { "epoch": 0.06989148427441604, "grad_norm": 1.2215136508098425, "learning_rate": 4.995147082066335e-05, "loss": 1.0071, "mean_token_accuracy": 0.7161303281784057, "step": 380 }, { "epoch": 0.07081110906750046, "grad_norm": 1.5100364277085054, "learning_rate": 4.9946875173255405e-05, "loss": 0.9808, "mean_token_accuracy": 0.7223702430725097, "step": 385 }, { "epoch": 0.07173073386058489, "grad_norm": 1.3193074150499653, "learning_rate": 4.9942071912792463e-05, "loss": 0.9692, "mean_token_accuracy": 0.7253165245056152, "step": 390 }, { "epoch": 0.0726503586536693, "grad_norm": 1.360795639773644, "learning_rate": 4.9937061083700286e-05, "loss": 0.9248, "mean_token_accuracy": 0.738149356842041, "step": 395 }, { "epoch": 0.07356998344675372, "grad_norm": 1.3934617241628962, "learning_rate": 4.993184273232445e-05, "loss": 1.0174, "mean_token_accuracy": 0.7140317440032959, "step": 400 }, { "epoch": 0.07448960823983815, "grad_norm": 1.3755761090465115, "learning_rate": 4.9926416906929954e-05, "loss": 0.9371, "mean_token_accuracy": 0.7347567915916443, "step": 405 }, { "epoch": 0.07540923303292257, "grad_norm": 1.3123084901189321, "learning_rate": 4.9920783657700685e-05, "loss": 1.0494, "mean_token_accuracy": 0.7046082258224488, "step": 410 }, { "epoch": 0.07632885782600698, "grad_norm": 1.26236320940822, "learning_rate": 4.9914943036739075e-05, "loss": 0.9813, "mean_token_accuracy": 0.7248732924461365, "step": 415 }, { "epoch": 0.07724848261909141, "grad_norm": 1.4072657383382854, "learning_rate": 4.99088950980655e-05, "loss": 1.0041, "mean_token_accuracy": 0.7161918520927429, "step": 420 }, { "epoch": 0.07816810741217584, "grad_norm": 1.4142932157820918, "learning_rate": 4.9902639897617876e-05, "loss": 1.0343, "mean_token_accuracy": 0.7073235511779785, "step": 425 }, { "epoch": 0.07908773220526025, "grad_norm": 1.2620775477382082, "learning_rate": 4.9896177493251065e-05, "loss": 0.9773, "mean_token_accuracy": 0.724228036403656, "step": 430 }, { "epoch": 0.08000735699834467, "grad_norm": 1.2299977431090294, "learning_rate": 4.9889507944736405e-05, "loss": 0.9921, "mean_token_accuracy": 0.7193984985351562, "step": 435 }, { "epoch": 0.0809269817914291, "grad_norm": 1.272005618491772, "learning_rate": 4.9882631313761116e-05, "loss": 1.0266, "mean_token_accuracy": 0.7106949806213378, "step": 440 }, { "epoch": 0.08184660658451352, "grad_norm": 1.3368998742271194, "learning_rate": 4.9875547663927744e-05, "loss": 0.9945, "mean_token_accuracy": 0.7178430318832397, "step": 445 }, { "epoch": 0.08276623137759793, "grad_norm": 1.2395804635484349, "learning_rate": 4.986825706075357e-05, "loss": 0.9614, "mean_token_accuracy": 0.7270126938819885, "step": 450 }, { "epoch": 0.08368585617068236, "grad_norm": 1.2355105682399337, "learning_rate": 4.9860759571669987e-05, "loss": 1.017, "mean_token_accuracy": 0.7113536357879638, "step": 455 }, { "epoch": 0.08460548096376679, "grad_norm": 1.2769471363849882, "learning_rate": 4.985305526602192e-05, "loss": 0.9841, "mean_token_accuracy": 0.7207873582839965, "step": 460 }, { "epoch": 0.08552510575685121, "grad_norm": 1.3105851965485462, "learning_rate": 4.984514421506715e-05, "loss": 1.0238, "mean_token_accuracy": 0.7113570213317871, "step": 465 }, { "epoch": 0.08644473054993562, "grad_norm": 1.2226583029739935, "learning_rate": 4.983702649197565e-05, "loss": 1.0026, "mean_token_accuracy": 0.7175478458404541, "step": 470 }, { "epoch": 0.08736435534302005, "grad_norm": 1.3032963672614144, "learning_rate": 4.982870217182893e-05, "loss": 1.0102, "mean_token_accuracy": 0.7142111778259277, "step": 475 }, { "epoch": 0.08828398013610447, "grad_norm": 1.276533355049304, "learning_rate": 4.9820171331619343e-05, "loss": 1.0175, "mean_token_accuracy": 0.7140154242515564, "step": 480 }, { "epoch": 0.08920360492918888, "grad_norm": 1.3275369586760475, "learning_rate": 4.981143405024936e-05, "loss": 0.9664, "mean_token_accuracy": 0.7251969814300537, "step": 485 }, { "epoch": 0.09012322972227331, "grad_norm": 1.322475452296982, "learning_rate": 4.980249040853081e-05, "loss": 0.9572, "mean_token_accuracy": 0.7284212589263916, "step": 490 }, { "epoch": 0.09104285451535774, "grad_norm": 1.2219967426964762, "learning_rate": 4.979334048918422e-05, "loss": 1.0265, "mean_token_accuracy": 0.7094637989997864, "step": 495 }, { "epoch": 0.09196247930844216, "grad_norm": 1.2500649142513325, "learning_rate": 4.978398437683797e-05, "loss": 0.9429, "mean_token_accuracy": 0.7309910893440247, "step": 500 }, { "epoch": 0.09288210410152657, "grad_norm": 1.2382649121413325, "learning_rate": 4.977442215802753e-05, "loss": 1.0142, "mean_token_accuracy": 0.7163145303726196, "step": 505 }, { "epoch": 0.093801728894611, "grad_norm": 1.2494735942714719, "learning_rate": 4.976465392119467e-05, "loss": 0.9711, "mean_token_accuracy": 0.7253948450088501, "step": 510 }, { "epoch": 0.09472135368769542, "grad_norm": 1.1320102641208292, "learning_rate": 4.9754679756686654e-05, "loss": 0.9754, "mean_token_accuracy": 0.7240365982055664, "step": 515 }, { "epoch": 0.09564097848077985, "grad_norm": 1.2636397583226155, "learning_rate": 4.974449975675538e-05, "loss": 0.9683, "mean_token_accuracy": 0.7268050789833069, "step": 520 }, { "epoch": 0.09656060327386426, "grad_norm": 1.2638605012202537, "learning_rate": 4.9734114015556506e-05, "loss": 0.994, "mean_token_accuracy": 0.7192271828651429, "step": 525 }, { "epoch": 0.09748022806694868, "grad_norm": 1.3539672940723328, "learning_rate": 4.972352262914867e-05, "loss": 1.0219, "mean_token_accuracy": 0.712011969089508, "step": 530 }, { "epoch": 0.09839985286003311, "grad_norm": 1.2622022574950933, "learning_rate": 4.971272569549246e-05, "loss": 0.9993, "mean_token_accuracy": 0.717021644115448, "step": 535 }, { "epoch": 0.09931947765311752, "grad_norm": 1.2498621609285703, "learning_rate": 4.970172331444968e-05, "loss": 0.9869, "mean_token_accuracy": 0.7201068043708801, "step": 540 }, { "epoch": 0.10023910244620195, "grad_norm": 1.2563183037951813, "learning_rate": 4.969051558778226e-05, "loss": 1.0328, "mean_token_accuracy": 0.7072706580162048, "step": 545 }, { "epoch": 0.10115872723928637, "grad_norm": 1.1583096373701225, "learning_rate": 4.967910261915142e-05, "loss": 1.0073, "mean_token_accuracy": 0.7176116108894348, "step": 550 }, { "epoch": 0.1020783520323708, "grad_norm": 1.2337310449325847, "learning_rate": 4.966748451411668e-05, "loss": 1.0075, "mean_token_accuracy": 0.7166797518730164, "step": 555 }, { "epoch": 0.10299797682545521, "grad_norm": 1.187463601840395, "learning_rate": 4.9655661380134874e-05, "loss": 0.9978, "mean_token_accuracy": 0.7187446594238281, "step": 560 }, { "epoch": 0.10391760161853963, "grad_norm": 1.1950175317081544, "learning_rate": 4.964363332655918e-05, "loss": 1.0127, "mean_token_accuracy": 0.7141183018684387, "step": 565 }, { "epoch": 0.10483722641162406, "grad_norm": 1.1797983108141703, "learning_rate": 4.9631400464638074e-05, "loss": 1.0058, "mean_token_accuracy": 0.7147095799446106, "step": 570 }, { "epoch": 0.10575685120470849, "grad_norm": 1.3194739883489515, "learning_rate": 4.961896290751434e-05, "loss": 1.0125, "mean_token_accuracy": 0.7156966686248779, "step": 575 }, { "epoch": 0.1066764759977929, "grad_norm": 1.232197096442626, "learning_rate": 4.960632077022402e-05, "loss": 1.0096, "mean_token_accuracy": 0.7136348843574524, "step": 580 }, { "epoch": 0.10759610079087732, "grad_norm": 1.1109964489025674, "learning_rate": 4.959347416969529e-05, "loss": 0.9782, "mean_token_accuracy": 0.7218139052391053, "step": 585 }, { "epoch": 0.10851572558396175, "grad_norm": 1.1118328480221105, "learning_rate": 4.958042322474747e-05, "loss": 0.9138, "mean_token_accuracy": 0.7406689524650574, "step": 590 }, { "epoch": 0.10943535037704616, "grad_norm": 1.1550688598895895, "learning_rate": 4.956716805608984e-05, "loss": 1.0123, "mean_token_accuracy": 0.7150320529937744, "step": 595 }, { "epoch": 0.11035497517013058, "grad_norm": 1.2400379075265455, "learning_rate": 4.955370878632058e-05, "loss": 0.9642, "mean_token_accuracy": 0.7274539470672607, "step": 600 }, { "epoch": 0.11127459996321501, "grad_norm": 1.1266451881904362, "learning_rate": 4.954004553992564e-05, "loss": 0.9597, "mean_token_accuracy": 0.7269688129425049, "step": 605 }, { "epoch": 0.11219422475629943, "grad_norm": 1.195410688726218, "learning_rate": 4.952617844327753e-05, "loss": 0.9667, "mean_token_accuracy": 0.7273669600486755, "step": 610 }, { "epoch": 0.11311384954938385, "grad_norm": 1.2168436664941074, "learning_rate": 4.951210762463421e-05, "loss": 0.981, "mean_token_accuracy": 0.7224032163619996, "step": 615 }, { "epoch": 0.11403347434246827, "grad_norm": 1.1158577605300688, "learning_rate": 4.949783321413787e-05, "loss": 1.0133, "mean_token_accuracy": 0.7140767455101014, "step": 620 }, { "epoch": 0.1149530991355527, "grad_norm": 1.2227500677211205, "learning_rate": 4.948335534381375e-05, "loss": 1.0178, "mean_token_accuracy": 0.7107774257659912, "step": 625 }, { "epoch": 0.11587272392863712, "grad_norm": 1.1733820093333545, "learning_rate": 4.9468674147568906e-05, "loss": 0.9496, "mean_token_accuracy": 0.7264823913574219, "step": 630 }, { "epoch": 0.11679234872172153, "grad_norm": 1.1456005644666878, "learning_rate": 4.945378976119096e-05, "loss": 1.0301, "mean_token_accuracy": 0.7111668229103089, "step": 635 }, { "epoch": 0.11771197351480596, "grad_norm": 1.176194033859284, "learning_rate": 4.943870232234688e-05, "loss": 0.9904, "mean_token_accuracy": 0.7183448076248169, "step": 640 }, { "epoch": 0.11863159830789038, "grad_norm": 1.1767555657667275, "learning_rate": 4.9423411970581656e-05, "loss": 0.9565, "mean_token_accuracy": 0.7282203912734986, "step": 645 }, { "epoch": 0.1195512231009748, "grad_norm": 1.1593918150017006, "learning_rate": 4.940791884731706e-05, "loss": 0.9629, "mean_token_accuracy": 0.7265506267547608, "step": 650 }, { "epoch": 0.12047084789405922, "grad_norm": 1.1809244906539653, "learning_rate": 4.939222309585029e-05, "loss": 0.9506, "mean_token_accuracy": 0.7299855709075928, "step": 655 }, { "epoch": 0.12139047268714365, "grad_norm": 1.187342482868558, "learning_rate": 4.93763248613527e-05, "loss": 0.9873, "mean_token_accuracy": 0.7208028793334961, "step": 660 }, { "epoch": 0.12231009748022807, "grad_norm": 1.1643370561641233, "learning_rate": 4.936022429086841e-05, "loss": 1.019, "mean_token_accuracy": 0.7111838817596435, "step": 665 }, { "epoch": 0.12322972227331248, "grad_norm": 1.1548281507110767, "learning_rate": 4.9343921533312955e-05, "loss": 0.949, "mean_token_accuracy": 0.7271883249282837, "step": 670 }, { "epoch": 0.12414934706639691, "grad_norm": 1.1323282418083014, "learning_rate": 4.9327416739471935e-05, "loss": 0.9269, "mean_token_accuracy": 0.737087082862854, "step": 675 }, { "epoch": 0.12506897185948132, "grad_norm": 1.2363897419233494, "learning_rate": 4.9310710061999575e-05, "loss": 1.0061, "mean_token_accuracy": 0.714658522605896, "step": 680 }, { "epoch": 0.12598859665256576, "grad_norm": 1.15808211817011, "learning_rate": 4.9293801655417366e-05, "loss": 0.9426, "mean_token_accuracy": 0.7324698209762573, "step": 685 }, { "epoch": 0.12690822144565017, "grad_norm": 1.168156282468429, "learning_rate": 4.927669167611259e-05, "loss": 0.9516, "mean_token_accuracy": 0.726858627796173, "step": 690 }, { "epoch": 0.12782784623873458, "grad_norm": 1.1708412963628498, "learning_rate": 4.92593802823369e-05, "loss": 0.9565, "mean_token_accuracy": 0.7281310319900512, "step": 695 }, { "epoch": 0.12874747103181902, "grad_norm": 1.150205433303024, "learning_rate": 4.924186763420486e-05, "loss": 0.9966, "mean_token_accuracy": 0.7196317195892334, "step": 700 }, { "epoch": 0.12966709582490343, "grad_norm": 1.1412449351652514, "learning_rate": 4.922415389369243e-05, "loss": 0.9393, "mean_token_accuracy": 0.7308167576789856, "step": 705 }, { "epoch": 0.13058672061798787, "grad_norm": 1.2590368311590696, "learning_rate": 4.9206239224635486e-05, "loss": 0.9961, "mean_token_accuracy": 0.7167337894439697, "step": 710 }, { "epoch": 0.13150634541107228, "grad_norm": 1.1862573902159457, "learning_rate": 4.9188123792728344e-05, "loss": 0.9991, "mean_token_accuracy": 0.71655353307724, "step": 715 }, { "epoch": 0.1324259702041567, "grad_norm": 1.1728642333915622, "learning_rate": 4.916980776552218e-05, "loss": 0.9354, "mean_token_accuracy": 0.734131133556366, "step": 720 }, { "epoch": 0.13334559499724113, "grad_norm": 1.208191683152181, "learning_rate": 4.915129131242345e-05, "loss": 0.9578, "mean_token_accuracy": 0.7278777837753296, "step": 725 }, { "epoch": 0.13426521979032555, "grad_norm": 1.138309077411327, "learning_rate": 4.913257460469243e-05, "loss": 0.9448, "mean_token_accuracy": 0.7303597450256347, "step": 730 }, { "epoch": 0.13518484458340996, "grad_norm": 1.1410024150973699, "learning_rate": 4.911365781544153e-05, "loss": 0.9765, "mean_token_accuracy": 0.7208934783935547, "step": 735 }, { "epoch": 0.1361044693764944, "grad_norm": 1.135207319109893, "learning_rate": 4.9094541119633756e-05, "loss": 0.9625, "mean_token_accuracy": 0.7279266119003296, "step": 740 }, { "epoch": 0.1370240941695788, "grad_norm": 1.1470179542343784, "learning_rate": 4.907522469408103e-05, "loss": 1.0099, "mean_token_accuracy": 0.7129136681556701, "step": 745 }, { "epoch": 0.13794371896266322, "grad_norm": 1.1186516076443083, "learning_rate": 4.905570871744262e-05, "loss": 0.9492, "mean_token_accuracy": 0.7295220971107483, "step": 750 }, { "epoch": 0.13886334375574766, "grad_norm": 1.188235501807293, "learning_rate": 4.903599337022345e-05, "loss": 0.9158, "mean_token_accuracy": 0.7392297148704529, "step": 755 }, { "epoch": 0.13978296854883207, "grad_norm": 1.156585568722138, "learning_rate": 4.9016078834772436e-05, "loss": 1.0069, "mean_token_accuracy": 0.7133058428764343, "step": 760 }, { "epoch": 0.1407025933419165, "grad_norm": 1.0550430464679208, "learning_rate": 4.899596529528083e-05, "loss": 0.9804, "mean_token_accuracy": 0.7237313628196717, "step": 765 }, { "epoch": 0.14162221813500092, "grad_norm": 1.0828080346302627, "learning_rate": 4.897565293778045e-05, "loss": 0.9398, "mean_token_accuracy": 0.7297361016273498, "step": 770 }, { "epoch": 0.14254184292808533, "grad_norm": 1.0748821988518662, "learning_rate": 4.895514195014201e-05, "loss": 0.9512, "mean_token_accuracy": 0.727254593372345, "step": 775 }, { "epoch": 0.14346146772116977, "grad_norm": 1.1000801031665166, "learning_rate": 4.893443252207339e-05, "loss": 0.96, "mean_token_accuracy": 0.7277865290641785, "step": 780 }, { "epoch": 0.14438109251425418, "grad_norm": 1.1979288214254857, "learning_rate": 4.891352484511783e-05, "loss": 0.9904, "mean_token_accuracy": 0.7203876137733459, "step": 785 }, { "epoch": 0.1453007173073386, "grad_norm": 1.0336978471065938, "learning_rate": 4.889241911265224e-05, "loss": 0.9512, "mean_token_accuracy": 0.7298694252967834, "step": 790 }, { "epoch": 0.14622034210042303, "grad_norm": 1.093196247221492, "learning_rate": 4.887111551988531e-05, "loss": 1.0404, "mean_token_accuracy": 0.7045328140258789, "step": 795 }, { "epoch": 0.14713996689350745, "grad_norm": 1.224732532168464, "learning_rate": 4.884961426385578e-05, "loss": 1.0189, "mean_token_accuracy": 0.7101276278495788, "step": 800 }, { "epoch": 0.14805959168659186, "grad_norm": 1.1751595598375444, "learning_rate": 4.8827915543430604e-05, "loss": 0.9166, "mean_token_accuracy": 0.7369141817092896, "step": 805 }, { "epoch": 0.1489792164796763, "grad_norm": 1.0711984590567727, "learning_rate": 4.880601955930308e-05, "loss": 0.9528, "mean_token_accuracy": 0.7275946021080018, "step": 810 }, { "epoch": 0.1498988412727607, "grad_norm": 1.1523849563074238, "learning_rate": 4.878392651399103e-05, "loss": 0.9724, "mean_token_accuracy": 0.72748943567276, "step": 815 }, { "epoch": 0.15081846606584515, "grad_norm": 1.1385592224893888, "learning_rate": 4.8761636611834906e-05, "loss": 0.9423, "mean_token_accuracy": 0.7338582873344421, "step": 820 }, { "epoch": 0.15173809085892956, "grad_norm": 1.171019568482894, "learning_rate": 4.873915005899591e-05, "loss": 0.9823, "mean_token_accuracy": 0.7215001463890076, "step": 825 }, { "epoch": 0.15265771565201397, "grad_norm": 1.1181637038875023, "learning_rate": 4.871646706345407e-05, "loss": 0.9696, "mean_token_accuracy": 0.7244228839874267, "step": 830 }, { "epoch": 0.1535773404450984, "grad_norm": 1.140111709793846, "learning_rate": 4.869358783500634e-05, "loss": 0.9691, "mean_token_accuracy": 0.7219241619110107, "step": 835 }, { "epoch": 0.15449696523818282, "grad_norm": 1.1035668632214553, "learning_rate": 4.867051258526466e-05, "loss": 0.9216, "mean_token_accuracy": 0.7362164258956909, "step": 840 }, { "epoch": 0.15541659003126723, "grad_norm": 1.0632498704772437, "learning_rate": 4.864724152765396e-05, "loss": 0.9319, "mean_token_accuracy": 0.7335481762886047, "step": 845 }, { "epoch": 0.15633621482435167, "grad_norm": 1.1360641167900578, "learning_rate": 4.8623774877410235e-05, "loss": 0.998, "mean_token_accuracy": 0.7165634036064148, "step": 850 }, { "epoch": 0.15725583961743608, "grad_norm": 1.1574648839544697, "learning_rate": 4.860011285157852e-05, "loss": 0.9983, "mean_token_accuracy": 0.7154228448867798, "step": 855 }, { "epoch": 0.1581754644105205, "grad_norm": 1.1103379240939366, "learning_rate": 4.857625566901091e-05, "loss": 0.9606, "mean_token_accuracy": 0.7255040884017945, "step": 860 }, { "epoch": 0.15909508920360493, "grad_norm": 1.3478355454379694, "learning_rate": 4.85522035503645e-05, "loss": 0.9643, "mean_token_accuracy": 0.7249020457267761, "step": 865 }, { "epoch": 0.16001471399668935, "grad_norm": 1.129020628766503, "learning_rate": 4.852795671809941e-05, "loss": 0.9341, "mean_token_accuracy": 0.7329063415527344, "step": 870 }, { "epoch": 0.16093433878977378, "grad_norm": 1.1322677948976352, "learning_rate": 4.850351539647661e-05, "loss": 0.9977, "mean_token_accuracy": 0.7172942876815795, "step": 875 }, { "epoch": 0.1618539635828582, "grad_norm": 1.120014190171844, "learning_rate": 4.8478879811555986e-05, "loss": 0.9283, "mean_token_accuracy": 0.7341889500617981, "step": 880 }, { "epoch": 0.1627735883759426, "grad_norm": 1.1336097713701254, "learning_rate": 4.845405019119414e-05, "loss": 1.0008, "mean_token_accuracy": 0.7151533484458923, "step": 885 }, { "epoch": 0.16369321316902705, "grad_norm": 0.9922793909516228, "learning_rate": 4.842902676504235e-05, "loss": 0.9039, "mean_token_accuracy": 0.7395052313804626, "step": 890 }, { "epoch": 0.16461283796211146, "grad_norm": 1.2309806920357915, "learning_rate": 4.840380976454441e-05, "loss": 0.9143, "mean_token_accuracy": 0.7372842311859131, "step": 895 }, { "epoch": 0.16553246275519587, "grad_norm": 1.058725560363019, "learning_rate": 4.837839942293449e-05, "loss": 1.0122, "mean_token_accuracy": 0.7113693952560425, "step": 900 }, { "epoch": 0.1664520875482803, "grad_norm": 1.1050666066281727, "learning_rate": 4.835279597523501e-05, "loss": 0.9691, "mean_token_accuracy": 0.7241552948951722, "step": 905 }, { "epoch": 0.16737171234136472, "grad_norm": 1.1281645078253164, "learning_rate": 4.832699965825443e-05, "loss": 0.9783, "mean_token_accuracy": 0.7210159540176392, "step": 910 }, { "epoch": 0.16829133713444913, "grad_norm": 1.1049918709083206, "learning_rate": 4.830101071058506e-05, "loss": 0.9529, "mean_token_accuracy": 0.726420772075653, "step": 915 }, { "epoch": 0.16921096192753357, "grad_norm": 1.1589903082257091, "learning_rate": 4.82748293726009e-05, "loss": 1.0162, "mean_token_accuracy": 0.7134600043296814, "step": 920 }, { "epoch": 0.17013058672061798, "grad_norm": 1.0648743038360364, "learning_rate": 4.824845588645538e-05, "loss": 0.931, "mean_token_accuracy": 0.7355116486549378, "step": 925 }, { "epoch": 0.17105021151370242, "grad_norm": 1.0563630156850699, "learning_rate": 4.822189049607909e-05, "loss": 0.9303, "mean_token_accuracy": 0.7332427501678467, "step": 930 }, { "epoch": 0.17196983630678683, "grad_norm": 1.0946637430016075, "learning_rate": 4.819513344717759e-05, "loss": 0.9805, "mean_token_accuracy": 0.7218296766281128, "step": 935 }, { "epoch": 0.17288946109987124, "grad_norm": 1.218450386345206, "learning_rate": 4.8168184987229104e-05, "loss": 1.0025, "mean_token_accuracy": 0.7138312220573425, "step": 940 }, { "epoch": 0.17380908589295568, "grad_norm": 1.1265660437743932, "learning_rate": 4.814104536548222e-05, "loss": 0.9901, "mean_token_accuracy": 0.7183592796325684, "step": 945 }, { "epoch": 0.1747287106860401, "grad_norm": 1.1519197604777511, "learning_rate": 4.811371483295361e-05, "loss": 0.9677, "mean_token_accuracy": 0.723106038570404, "step": 950 }, { "epoch": 0.1756483354791245, "grad_norm": 1.0668603888469903, "learning_rate": 4.808619364242569e-05, "loss": 0.9428, "mean_token_accuracy": 0.7298098564147949, "step": 955 }, { "epoch": 0.17656796027220895, "grad_norm": 1.0617094358031158, "learning_rate": 4.805848204844427e-05, "loss": 0.9794, "mean_token_accuracy": 0.7198897957801819, "step": 960 }, { "epoch": 0.17748758506529336, "grad_norm": 1.1638181916029056, "learning_rate": 4.803058030731627e-05, "loss": 1.0356, "mean_token_accuracy": 0.7055891275405883, "step": 965 }, { "epoch": 0.17840720985837777, "grad_norm": 1.0804274338945197, "learning_rate": 4.800248867710724e-05, "loss": 0.9551, "mean_token_accuracy": 0.7267025232315063, "step": 970 }, { "epoch": 0.1793268346514622, "grad_norm": 1.1002302515677742, "learning_rate": 4.797420741763906e-05, "loss": 0.9513, "mean_token_accuracy": 0.727520763874054, "step": 975 }, { "epoch": 0.18024645944454662, "grad_norm": 1.0807257658531308, "learning_rate": 4.794573679048751e-05, "loss": 0.9667, "mean_token_accuracy": 0.7254797458648682, "step": 980 }, { "epoch": 0.18116608423763106, "grad_norm": 1.1423934429361384, "learning_rate": 4.791707705897982e-05, "loss": 0.9289, "mean_token_accuracy": 0.7316087126731873, "step": 985 }, { "epoch": 0.18208570903071547, "grad_norm": 1.0732201976252709, "learning_rate": 4.7888228488192294e-05, "loss": 0.9826, "mean_token_accuracy": 0.7205982804298401, "step": 990 }, { "epoch": 0.18300533382379988, "grad_norm": 1.0026696776201605, "learning_rate": 4.7859191344947804e-05, "loss": 0.9289, "mean_token_accuracy": 0.7336562752723694, "step": 995 }, { "epoch": 0.18392495861688432, "grad_norm": 1.138379913644609, "learning_rate": 4.782996589781337e-05, "loss": 0.9497, "mean_token_accuracy": 0.729135024547577, "step": 1000 }, { "epoch": 0.18484458340996873, "grad_norm": 1.107580666472087, "learning_rate": 4.780055241709762e-05, "loss": 0.9048, "mean_token_accuracy": 0.7381602048873901, "step": 1005 }, { "epoch": 0.18576420820305314, "grad_norm": 1.0667620674465943, "learning_rate": 4.7770951174848335e-05, "loss": 0.9742, "mean_token_accuracy": 0.7205707669258118, "step": 1010 }, { "epoch": 0.18668383299613758, "grad_norm": 1.0940019385189808, "learning_rate": 4.774116244484993e-05, "loss": 0.9857, "mean_token_accuracy": 0.718968415260315, "step": 1015 }, { "epoch": 0.187603457789222, "grad_norm": 1.0279044112611866, "learning_rate": 4.7711186502620894e-05, "loss": 1.0084, "mean_token_accuracy": 0.7144084692001342, "step": 1020 }, { "epoch": 0.1885230825823064, "grad_norm": 1.0751882464256728, "learning_rate": 4.768102362541126e-05, "loss": 0.9353, "mean_token_accuracy": 0.7318849921226501, "step": 1025 }, { "epoch": 0.18944270737539085, "grad_norm": 1.1701748750390102, "learning_rate": 4.765067409220004e-05, "loss": 0.957, "mean_token_accuracy": 0.7275319814682006, "step": 1030 }, { "epoch": 0.19036233216847526, "grad_norm": 1.0512353267451773, "learning_rate": 4.762013818369266e-05, "loss": 0.9367, "mean_token_accuracy": 0.7317106485366821, "step": 1035 }, { "epoch": 0.1912819569615597, "grad_norm": 1.1085851412035923, "learning_rate": 4.7589416182318305e-05, "loss": 0.9416, "mean_token_accuracy": 0.7324359536170959, "step": 1040 }, { "epoch": 0.1922015817546441, "grad_norm": 1.094731274119514, "learning_rate": 4.755850837222739e-05, "loss": 0.9474, "mean_token_accuracy": 0.7309187650680542, "step": 1045 }, { "epoch": 0.19312120654772852, "grad_norm": 1.0610610405848808, "learning_rate": 4.7527415039288874e-05, "loss": 0.9638, "mean_token_accuracy": 0.7251871824264526, "step": 1050 }, { "epoch": 0.19404083134081296, "grad_norm": 1.0919916417692772, "learning_rate": 4.749613647108764e-05, "loss": 1.0008, "mean_token_accuracy": 0.7152180433273315, "step": 1055 }, { "epoch": 0.19496045613389737, "grad_norm": 1.0847298297852, "learning_rate": 4.7464672956921814e-05, "loss": 0.9366, "mean_token_accuracy": 0.7313546657562255, "step": 1060 }, { "epoch": 0.19588008092698178, "grad_norm": 1.0912787695821449, "learning_rate": 4.743302478780011e-05, "loss": 0.945, "mean_token_accuracy": 0.728658664226532, "step": 1065 }, { "epoch": 0.19679970572006622, "grad_norm": 1.052195400658314, "learning_rate": 4.7401192256439144e-05, "loss": 0.9793, "mean_token_accuracy": 0.7213846921920777, "step": 1070 }, { "epoch": 0.19771933051315063, "grad_norm": 1.1107870405998106, "learning_rate": 4.736917565726069e-05, "loss": 0.9313, "mean_token_accuracy": 0.735443937778473, "step": 1075 }, { "epoch": 0.19863895530623504, "grad_norm": 1.1399365300090571, "learning_rate": 4.7336975286389e-05, "loss": 0.9717, "mean_token_accuracy": 0.7237229943275452, "step": 1080 }, { "epoch": 0.19955858009931948, "grad_norm": 1.0983682734144682, "learning_rate": 4.730459144164802e-05, "loss": 0.9306, "mean_token_accuracy": 0.733622133731842, "step": 1085 }, { "epoch": 0.2004782048924039, "grad_norm": 1.1053704101564246, "learning_rate": 4.727202442255871e-05, "loss": 0.9936, "mean_token_accuracy": 0.718384611606598, "step": 1090 }, { "epoch": 0.20139782968548833, "grad_norm": 1.0858488860538602, "learning_rate": 4.723927453033619e-05, "loss": 0.9548, "mean_token_accuracy": 0.7286873102188111, "step": 1095 }, { "epoch": 0.20231745447857274, "grad_norm": 1.0232898856111519, "learning_rate": 4.720634206788697e-05, "loss": 0.9804, "mean_token_accuracy": 0.7218252301216126, "step": 1100 }, { "epoch": 0.20323707927165716, "grad_norm": 1.1548447631409977, "learning_rate": 4.717322733980622e-05, "loss": 0.931, "mean_token_accuracy": 0.7311301946640014, "step": 1105 }, { "epoch": 0.2041567040647416, "grad_norm": 1.1168183831474872, "learning_rate": 4.713993065237486e-05, "loss": 0.9718, "mean_token_accuracy": 0.7235833764076233, "step": 1110 }, { "epoch": 0.205076328857826, "grad_norm": 1.1111836320920656, "learning_rate": 4.710645231355678e-05, "loss": 0.9855, "mean_token_accuracy": 0.7195135593414307, "step": 1115 }, { "epoch": 0.20599595365091042, "grad_norm": 1.0024638729648838, "learning_rate": 4.707279263299598e-05, "loss": 0.9729, "mean_token_accuracy": 0.7219846963882446, "step": 1120 }, { "epoch": 0.20691557844399486, "grad_norm": 1.0121762272601764, "learning_rate": 4.703895192201372e-05, "loss": 0.9459, "mean_token_accuracy": 0.7269375443458557, "step": 1125 }, { "epoch": 0.20783520323707927, "grad_norm": 1.0470465876428376, "learning_rate": 4.7004930493605573e-05, "loss": 1.0105, "mean_token_accuracy": 0.7086774349212647, "step": 1130 }, { "epoch": 0.20875482803016368, "grad_norm": 1.0632837126367782, "learning_rate": 4.697072866243866e-05, "loss": 0.9412, "mean_token_accuracy": 0.7307331085205078, "step": 1135 }, { "epoch": 0.20967445282324812, "grad_norm": 1.0768863946202714, "learning_rate": 4.69363467448486e-05, "loss": 0.9674, "mean_token_accuracy": 0.7221316814422607, "step": 1140 }, { "epoch": 0.21059407761633253, "grad_norm": 1.1181930167961487, "learning_rate": 4.6901785058836675e-05, "loss": 0.955, "mean_token_accuracy": 0.725222361087799, "step": 1145 }, { "epoch": 0.21151370240941697, "grad_norm": 1.0688002319746086, "learning_rate": 4.686704392406685e-05, "loss": 0.9687, "mean_token_accuracy": 0.7218108892440795, "step": 1150 }, { "epoch": 0.21243332720250138, "grad_norm": 1.1052965038670703, "learning_rate": 4.6832123661862835e-05, "loss": 0.9516, "mean_token_accuracy": 0.7287932515144349, "step": 1155 }, { "epoch": 0.2133529519955858, "grad_norm": 1.0349887525202925, "learning_rate": 4.6797024595205104e-05, "loss": 0.9599, "mean_token_accuracy": 0.7228366494178772, "step": 1160 }, { "epoch": 0.21427257678867023, "grad_norm": 1.052123043795087, "learning_rate": 4.6761747048727907e-05, "loss": 0.9833, "mean_token_accuracy": 0.714729118347168, "step": 1165 }, { "epoch": 0.21519220158175464, "grad_norm": 1.0646750046566955, "learning_rate": 4.672629134871625e-05, "loss": 0.98, "mean_token_accuracy": 0.7194055676460266, "step": 1170 }, { "epoch": 0.21611182637483906, "grad_norm": 1.072675922430035, "learning_rate": 4.669065782310294e-05, "loss": 0.9661, "mean_token_accuracy": 0.7228956103324891, "step": 1175 }, { "epoch": 0.2170314511679235, "grad_norm": 1.0475965649186345, "learning_rate": 4.665484680146546e-05, "loss": 0.9168, "mean_token_accuracy": 0.7354954957962037, "step": 1180 }, { "epoch": 0.2179510759610079, "grad_norm": 1.0183550500547607, "learning_rate": 4.6618858615023e-05, "loss": 0.9268, "mean_token_accuracy": 0.731166672706604, "step": 1185 }, { "epoch": 0.21887070075409232, "grad_norm": 1.0894438583208028, "learning_rate": 4.658269359663336e-05, "loss": 0.9134, "mean_token_accuracy": 0.7400953650474549, "step": 1190 }, { "epoch": 0.21979032554717676, "grad_norm": 0.9962620966267176, "learning_rate": 4.6546352080789854e-05, "loss": 0.9472, "mean_token_accuracy": 0.7283522963523865, "step": 1195 }, { "epoch": 0.22070995034026117, "grad_norm": 1.0767144498287804, "learning_rate": 4.650983440361825e-05, "loss": 0.9798, "mean_token_accuracy": 0.7208079814910888, "step": 1200 }, { "epoch": 0.2216295751333456, "grad_norm": 1.0451151540293229, "learning_rate": 4.6473140902873666e-05, "loss": 0.9735, "mean_token_accuracy": 0.7223762154579163, "step": 1205 }, { "epoch": 0.22254919992643002, "grad_norm": 0.9904423090265289, "learning_rate": 4.643627191793737e-05, "loss": 0.9416, "mean_token_accuracy": 0.7333443641662598, "step": 1210 }, { "epoch": 0.22346882471951443, "grad_norm": 1.0324822073086444, "learning_rate": 4.639922778981377e-05, "loss": 0.9096, "mean_token_accuracy": 0.7366245865821839, "step": 1215 }, { "epoch": 0.22438844951259887, "grad_norm": 1.00961392870682, "learning_rate": 4.636200886112714e-05, "loss": 0.9647, "mean_token_accuracy": 0.7272518515586853, "step": 1220 }, { "epoch": 0.22530807430568328, "grad_norm": 1.041598639678359, "learning_rate": 4.63246154761185e-05, "loss": 0.982, "mean_token_accuracy": 0.7185810923576355, "step": 1225 }, { "epoch": 0.2262276990987677, "grad_norm": 1.0574278162856792, "learning_rate": 4.628704798064247e-05, "loss": 0.9442, "mean_token_accuracy": 0.7297179222106933, "step": 1230 }, { "epoch": 0.22714732389185213, "grad_norm": 1.060076765820854, "learning_rate": 4.624930672216399e-05, "loss": 0.9614, "mean_token_accuracy": 0.7244118571281433, "step": 1235 }, { "epoch": 0.22806694868493654, "grad_norm": 1.0123003105589568, "learning_rate": 4.621139204975516e-05, "loss": 0.9169, "mean_token_accuracy": 0.7362489700317383, "step": 1240 }, { "epoch": 0.22898657347802095, "grad_norm": 1.1490153575204947, "learning_rate": 4.617330431409201e-05, "loss": 0.9929, "mean_token_accuracy": 0.7166203141212464, "step": 1245 }, { "epoch": 0.2299061982711054, "grad_norm": 1.0270625785191527, "learning_rate": 4.6135043867451255e-05, "loss": 0.9325, "mean_token_accuracy": 0.7311270833015442, "step": 1250 }, { "epoch": 0.2308258230641898, "grad_norm": 1.030694744170465, "learning_rate": 4.609661106370701e-05, "loss": 0.9228, "mean_token_accuracy": 0.7355565190315246, "step": 1255 }, { "epoch": 0.23174544785727424, "grad_norm": 1.0190672056189127, "learning_rate": 4.605800625832753e-05, "loss": 0.9577, "mean_token_accuracy": 0.7273682594299317, "step": 1260 }, { "epoch": 0.23266507265035866, "grad_norm": 1.025832787786935, "learning_rate": 4.6019229808371945e-05, "loss": 0.9291, "mean_token_accuracy": 0.7325186491012573, "step": 1265 }, { "epoch": 0.23358469744344307, "grad_norm": 1.0254402284447273, "learning_rate": 4.598028207248693e-05, "loss": 0.9681, "mean_token_accuracy": 0.7215327501296998, "step": 1270 }, { "epoch": 0.2345043222365275, "grad_norm": 1.043519079594266, "learning_rate": 4.5941163410903406e-05, "loss": 0.9565, "mean_token_accuracy": 0.7248036026954651, "step": 1275 }, { "epoch": 0.23542394702961192, "grad_norm": 0.9811685630848649, "learning_rate": 4.590187418543321e-05, "loss": 0.9204, "mean_token_accuracy": 0.7338666915893555, "step": 1280 }, { "epoch": 0.23634357182269633, "grad_norm": 1.0355767679745649, "learning_rate": 4.586241475946571e-05, "loss": 0.9824, "mean_token_accuracy": 0.7212961316108704, "step": 1285 }, { "epoch": 0.23726319661578077, "grad_norm": 0.9995187864598916, "learning_rate": 4.582278549796448e-05, "loss": 0.914, "mean_token_accuracy": 0.7355898737907409, "step": 1290 }, { "epoch": 0.23818282140886518, "grad_norm": 1.0163621938165361, "learning_rate": 4.5782986767463946e-05, "loss": 0.9614, "mean_token_accuracy": 0.7241615772247314, "step": 1295 }, { "epoch": 0.2391024462019496, "grad_norm": 1.0913821743861445, "learning_rate": 4.574301893606594e-05, "loss": 0.8839, "mean_token_accuracy": 0.7434832811355591, "step": 1300 }, { "epoch": 0.24002207099503403, "grad_norm": 1.0399223484753735, "learning_rate": 4.570288237343632e-05, "loss": 0.9104, "mean_token_accuracy": 0.7378169417381286, "step": 1305 }, { "epoch": 0.24094169578811844, "grad_norm": 1.011671028641558, "learning_rate": 4.5662577450801576e-05, "loss": 0.9595, "mean_token_accuracy": 0.7230379819869995, "step": 1310 }, { "epoch": 0.24186132058120288, "grad_norm": 1.008990928095214, "learning_rate": 4.562210454094535e-05, "loss": 0.9363, "mean_token_accuracy": 0.7295035600662232, "step": 1315 }, { "epoch": 0.2427809453742873, "grad_norm": 1.059357744292348, "learning_rate": 4.558146401820502e-05, "loss": 0.9569, "mean_token_accuracy": 0.7264422059059144, "step": 1320 }, { "epoch": 0.2437005701673717, "grad_norm": 1.0224904321964083, "learning_rate": 4.554065625846825e-05, "loss": 0.9838, "mean_token_accuracy": 0.7178040146827698, "step": 1325 }, { "epoch": 0.24462019496045614, "grad_norm": 1.0737296876090594, "learning_rate": 4.549968163916946e-05, "loss": 0.976, "mean_token_accuracy": 0.7180652141571044, "step": 1330 }, { "epoch": 0.24553981975354056, "grad_norm": 1.0129242243093401, "learning_rate": 4.545854053928639e-05, "loss": 0.9394, "mean_token_accuracy": 0.7314478039741517, "step": 1335 }, { "epoch": 0.24645944454662497, "grad_norm": 0.9860304727584566, "learning_rate": 4.541723333933657e-05, "loss": 0.9595, "mean_token_accuracy": 0.7271197676658631, "step": 1340 }, { "epoch": 0.2473790693397094, "grad_norm": 1.0235437508308431, "learning_rate": 4.5375760421373796e-05, "loss": 0.9888, "mean_token_accuracy": 0.7178149104118348, "step": 1345 }, { "epoch": 0.24829869413279382, "grad_norm": 1.076473129213084, "learning_rate": 4.533412216898461e-05, "loss": 0.9374, "mean_token_accuracy": 0.7287054538726807, "step": 1350 }, { "epoch": 0.24921831892587823, "grad_norm": 1.027000741915809, "learning_rate": 4.529231896728474e-05, "loss": 0.9098, "mean_token_accuracy": 0.7352772355079651, "step": 1355 }, { "epoch": 0.25013794371896264, "grad_norm": 1.0980991489181584, "learning_rate": 4.525035120291557e-05, "loss": 0.9613, "mean_token_accuracy": 0.7250553727149963, "step": 1360 }, { "epoch": 0.2510575685120471, "grad_norm": 1.0105378261394609, "learning_rate": 4.520821926404049e-05, "loss": 0.9232, "mean_token_accuracy": 0.7339854836463928, "step": 1365 }, { "epoch": 0.2519771933051315, "grad_norm": 1.0465671126237865, "learning_rate": 4.516592354034138e-05, "loss": 0.9578, "mean_token_accuracy": 0.7243474960327149, "step": 1370 }, { "epoch": 0.2528968180982159, "grad_norm": 1.0721948067984564, "learning_rate": 4.512346442301501e-05, "loss": 0.9305, "mean_token_accuracy": 0.7290533304214477, "step": 1375 }, { "epoch": 0.25381644289130034, "grad_norm": 1.083352961545848, "learning_rate": 4.5080842304769345e-05, "loss": 0.9338, "mean_token_accuracy": 0.733627998828888, "step": 1380 }, { "epoch": 0.2547360676843848, "grad_norm": 0.979913773136715, "learning_rate": 4.503805757981997e-05, "loss": 0.9012, "mean_token_accuracy": 0.7409675002098084, "step": 1385 }, { "epoch": 0.25565569247746917, "grad_norm": 1.1174510417210128, "learning_rate": 4.499511064388645e-05, "loss": 0.8754, "mean_token_accuracy": 0.7447872519493103, "step": 1390 }, { "epoch": 0.2565753172705536, "grad_norm": 1.0562227070300527, "learning_rate": 4.495200189418864e-05, "loss": 0.9505, "mean_token_accuracy": 0.7265227913856507, "step": 1395 }, { "epoch": 0.25749494206363804, "grad_norm": 1.0550543313489833, "learning_rate": 4.490873172944303e-05, "loss": 0.9096, "mean_token_accuracy": 0.7342225193977356, "step": 1400 }, { "epoch": 0.2584145668567225, "grad_norm": 1.0844914008772555, "learning_rate": 4.486530054985905e-05, "loss": 0.9643, "mean_token_accuracy": 0.7227702975273133, "step": 1405 }, { "epoch": 0.25933419164980687, "grad_norm": 1.11030675175993, "learning_rate": 4.482170875713536e-05, "loss": 0.98, "mean_token_accuracy": 0.7210663437843323, "step": 1410 }, { "epoch": 0.2602538164428913, "grad_norm": 1.0678730599548856, "learning_rate": 4.477795675445616e-05, "loss": 0.9248, "mean_token_accuracy": 0.7327564835548401, "step": 1415 }, { "epoch": 0.26117344123597575, "grad_norm": 0.9866628204231362, "learning_rate": 4.473404494648744e-05, "loss": 0.9216, "mean_token_accuracy": 0.7343960881233216, "step": 1420 }, { "epoch": 0.26209306602906013, "grad_norm": 0.9895263110250994, "learning_rate": 4.4689973739373244e-05, "loss": 0.9123, "mean_token_accuracy": 0.7354090452194214, "step": 1425 }, { "epoch": 0.26301269082214457, "grad_norm": 0.9560958289104061, "learning_rate": 4.46457435407319e-05, "loss": 0.9494, "mean_token_accuracy": 0.725600802898407, "step": 1430 }, { "epoch": 0.263932315615229, "grad_norm": 1.0418751893863187, "learning_rate": 4.460135475965227e-05, "loss": 0.887, "mean_token_accuracy": 0.744392192363739, "step": 1435 }, { "epoch": 0.2648519404083134, "grad_norm": 1.0270767884123133, "learning_rate": 4.455680780668997e-05, "loss": 0.98, "mean_token_accuracy": 0.717594051361084, "step": 1440 }, { "epoch": 0.26577156520139783, "grad_norm": 1.0194372684867639, "learning_rate": 4.4512103093863555e-05, "loss": 0.9145, "mean_token_accuracy": 0.7369788885116577, "step": 1445 }, { "epoch": 0.26669118999448227, "grad_norm": 1.0981284825838393, "learning_rate": 4.44672410346507e-05, "loss": 0.9519, "mean_token_accuracy": 0.7260895729064941, "step": 1450 }, { "epoch": 0.26761081478756665, "grad_norm": 1.0207625075556366, "learning_rate": 4.442222204398441e-05, "loss": 0.9555, "mean_token_accuracy": 0.7227967500686645, "step": 1455 }, { "epoch": 0.2685304395806511, "grad_norm": 0.98393868791661, "learning_rate": 4.437704653824915e-05, "loss": 0.8831, "mean_token_accuracy": 0.7438354253768921, "step": 1460 }, { "epoch": 0.26945006437373553, "grad_norm": 0.9817630950075087, "learning_rate": 4.433171493527701e-05, "loss": 0.9404, "mean_token_accuracy": 0.728731095790863, "step": 1465 }, { "epoch": 0.2703696891668199, "grad_norm": 1.0298652072064594, "learning_rate": 4.428622765434383e-05, "loss": 0.9136, "mean_token_accuracy": 0.7356218695640564, "step": 1470 }, { "epoch": 0.27128931395990435, "grad_norm": 0.981553092264934, "learning_rate": 4.4240585116165334e-05, "loss": 0.8555, "mean_token_accuracy": 0.753374171257019, "step": 1475 }, { "epoch": 0.2722089387529888, "grad_norm": 1.172918257192198, "learning_rate": 4.419478774289325e-05, "loss": 0.998, "mean_token_accuracy": 0.713919198513031, "step": 1480 }, { "epoch": 0.2731285635460732, "grad_norm": 1.003409782978005, "learning_rate": 4.414883595811136e-05, "loss": 0.8782, "mean_token_accuracy": 0.7452871680259705, "step": 1485 }, { "epoch": 0.2740481883391576, "grad_norm": 1.0316918646250515, "learning_rate": 4.410273018683163e-05, "loss": 0.9242, "mean_token_accuracy": 0.7311699628829956, "step": 1490 }, { "epoch": 0.27496781313224206, "grad_norm": 0.978003437149563, "learning_rate": 4.405647085549025e-05, "loss": 0.9241, "mean_token_accuracy": 0.7328976273536683, "step": 1495 }, { "epoch": 0.27588743792532644, "grad_norm": 1.0070406181231344, "learning_rate": 4.40100583919437e-05, "loss": 0.9001, "mean_token_accuracy": 0.7395057559013367, "step": 1500 }, { "epoch": 0.2768070627184109, "grad_norm": 0.9873878935159346, "learning_rate": 4.3963493225464817e-05, "loss": 0.9258, "mean_token_accuracy": 0.7336387634277344, "step": 1505 }, { "epoch": 0.2777266875114953, "grad_norm": 0.9521695030248521, "learning_rate": 4.3916775786738754e-05, "loss": 0.914, "mean_token_accuracy": 0.7378314137458801, "step": 1510 }, { "epoch": 0.27864631230457976, "grad_norm": 0.9502896850196428, "learning_rate": 4.3869906507859096e-05, "loss": 0.8987, "mean_token_accuracy": 0.7417943596839904, "step": 1515 }, { "epoch": 0.27956593709766414, "grad_norm": 0.991426828614557, "learning_rate": 4.382288582232376e-05, "loss": 0.9106, "mean_token_accuracy": 0.7390964746475219, "step": 1520 }, { "epoch": 0.2804855618907486, "grad_norm": 1.0581857743606324, "learning_rate": 4.377571416503108e-05, "loss": 0.9179, "mean_token_accuracy": 0.7379998922348022, "step": 1525 }, { "epoch": 0.281405186683833, "grad_norm": 0.9872377385823925, "learning_rate": 4.372839197227571e-05, "loss": 0.8848, "mean_token_accuracy": 0.7446985721588135, "step": 1530 }, { "epoch": 0.2823248114769174, "grad_norm": 1.0976151495403408, "learning_rate": 4.368091968174463e-05, "loss": 0.9632, "mean_token_accuracy": 0.723613953590393, "step": 1535 }, { "epoch": 0.28324443627000184, "grad_norm": 1.013680671037777, "learning_rate": 4.363329773251309e-05, "loss": 0.866, "mean_token_accuracy": 0.750942587852478, "step": 1540 }, { "epoch": 0.2841640610630863, "grad_norm": 1.1182733077200029, "learning_rate": 4.3585526565040543e-05, "loss": 0.9995, "mean_token_accuracy": 0.7137303233146668, "step": 1545 }, { "epoch": 0.28508368585617067, "grad_norm": 0.9779737007515391, "learning_rate": 4.353760662116658e-05, "loss": 0.9369, "mean_token_accuracy": 0.7336580872535705, "step": 1550 }, { "epoch": 0.2860033106492551, "grad_norm": 1.0260468281394197, "learning_rate": 4.348953834410683e-05, "loss": 0.9678, "mean_token_accuracy": 0.7206373929977417, "step": 1555 }, { "epoch": 0.28692293544233954, "grad_norm": 1.0263096637333005, "learning_rate": 4.3441322178448856e-05, "loss": 0.9572, "mean_token_accuracy": 0.7260561943054199, "step": 1560 }, { "epoch": 0.2878425602354239, "grad_norm": 0.9619383230028783, "learning_rate": 4.339295857014809e-05, "loss": 0.9501, "mean_token_accuracy": 0.7264659523963928, "step": 1565 }, { "epoch": 0.28876218502850837, "grad_norm": 0.9946060524217067, "learning_rate": 4.3344447966523634e-05, "loss": 0.9887, "mean_token_accuracy": 0.7160560727119446, "step": 1570 }, { "epoch": 0.2896818098215928, "grad_norm": 1.0275376139203307, "learning_rate": 4.3295790816254195e-05, "loss": 0.9262, "mean_token_accuracy": 0.734666109085083, "step": 1575 }, { "epoch": 0.2906014346146772, "grad_norm": 1.1276042923218728, "learning_rate": 4.324698756937388e-05, "loss": 0.9378, "mean_token_accuracy": 0.7300173878669739, "step": 1580 }, { "epoch": 0.29152105940776163, "grad_norm": 0.9552400868458645, "learning_rate": 4.319803867726807e-05, "loss": 0.8879, "mean_token_accuracy": 0.7425481796264648, "step": 1585 }, { "epoch": 0.29244068420084607, "grad_norm": 0.9486514468425481, "learning_rate": 4.3148944592669234e-05, "loss": 0.9613, "mean_token_accuracy": 0.7219538450241089, "step": 1590 }, { "epoch": 0.29336030899393045, "grad_norm": 0.9567962674802902, "learning_rate": 4.30997057696527e-05, "loss": 0.8741, "mean_token_accuracy": 0.7477473855018616, "step": 1595 }, { "epoch": 0.2942799337870149, "grad_norm": 0.9667609260469084, "learning_rate": 4.3050322663632564e-05, "loss": 0.9568, "mean_token_accuracy": 0.7255883097648621, "step": 1600 }, { "epoch": 0.29519955858009933, "grad_norm": 0.9920073647296315, "learning_rate": 4.3000795731357333e-05, "loss": 0.9237, "mean_token_accuracy": 0.7383288621902466, "step": 1605 }, { "epoch": 0.2961191833731837, "grad_norm": 1.0604465170326072, "learning_rate": 4.295112543090584e-05, "loss": 0.9609, "mean_token_accuracy": 0.7225096940994262, "step": 1610 }, { "epoch": 0.29703880816626815, "grad_norm": 1.0688037490276023, "learning_rate": 4.290131222168289e-05, "loss": 1.0008, "mean_token_accuracy": 0.7138909697532654, "step": 1615 }, { "epoch": 0.2979584329593526, "grad_norm": 1.143629206489082, "learning_rate": 4.2851356564415086e-05, "loss": 0.9867, "mean_token_accuracy": 0.7165561437606811, "step": 1620 }, { "epoch": 0.29887805775243703, "grad_norm": 1.0438745750713756, "learning_rate": 4.280125892114656e-05, "loss": 0.9434, "mean_token_accuracy": 0.7298865675926208, "step": 1625 }, { "epoch": 0.2997976825455214, "grad_norm": 1.0251559106803514, "learning_rate": 4.2751019755234664e-05, "loss": 0.935, "mean_token_accuracy": 0.7299148678779602, "step": 1630 }, { "epoch": 0.30071730733860585, "grad_norm": 0.9900961445552091, "learning_rate": 4.27006395313457e-05, "loss": 0.9963, "mean_token_accuracy": 0.7131295561790466, "step": 1635 }, { "epoch": 0.3016369321316903, "grad_norm": 1.040210108998438, "learning_rate": 4.265011871545066e-05, "loss": 0.9412, "mean_token_accuracy": 0.7279941439628601, "step": 1640 }, { "epoch": 0.3025565569247747, "grad_norm": 1.0262950854145634, "learning_rate": 4.259945777482085e-05, "loss": 0.9239, "mean_token_accuracy": 0.7327239632606506, "step": 1645 }, { "epoch": 0.3034761817178591, "grad_norm": 0.9969469234100081, "learning_rate": 4.25486571780236e-05, "loss": 0.9462, "mean_token_accuracy": 0.7269651889801025, "step": 1650 }, { "epoch": 0.30439580651094356, "grad_norm": 1.0021703198417462, "learning_rate": 4.249771739491795e-05, "loss": 0.9003, "mean_token_accuracy": 0.7421126961708069, "step": 1655 }, { "epoch": 0.30531543130402794, "grad_norm": 1.0255704189414308, "learning_rate": 4.24466388966503e-05, "loss": 0.9249, "mean_token_accuracy": 0.7345858454704285, "step": 1660 }, { "epoch": 0.3062350560971124, "grad_norm": 0.9438771845720968, "learning_rate": 4.239542215565e-05, "loss": 0.9749, "mean_token_accuracy": 0.7182752847671509, "step": 1665 }, { "epoch": 0.3071546808901968, "grad_norm": 0.9878451650581643, "learning_rate": 4.2344067645625036e-05, "loss": 0.9455, "mean_token_accuracy": 0.7264060854911805, "step": 1670 }, { "epoch": 0.3080743056832812, "grad_norm": 1.1287364443586523, "learning_rate": 4.229257584155765e-05, "loss": 0.9218, "mean_token_accuracy": 0.7332573175430298, "step": 1675 }, { "epoch": 0.30899393047636564, "grad_norm": 0.971666072350275, "learning_rate": 4.2240947219699895e-05, "loss": 0.8756, "mean_token_accuracy": 0.7459922909736634, "step": 1680 }, { "epoch": 0.3099135552694501, "grad_norm": 0.9593974583897734, "learning_rate": 4.2189182257569285e-05, "loss": 0.9329, "mean_token_accuracy": 0.730040967464447, "step": 1685 }, { "epoch": 0.31083318006253446, "grad_norm": 0.943158273064518, "learning_rate": 4.213728143394436e-05, "loss": 0.8839, "mean_token_accuracy": 0.7458212971687317, "step": 1690 }, { "epoch": 0.3117528048556189, "grad_norm": 1.050902490407755, "learning_rate": 4.208524522886022e-05, "loss": 0.9443, "mean_token_accuracy": 0.7311147809028625, "step": 1695 }, { "epoch": 0.31267242964870334, "grad_norm": 1.0074348860409519, "learning_rate": 4.203307412360418e-05, "loss": 0.9201, "mean_token_accuracy": 0.7326057314872741, "step": 1700 }, { "epoch": 0.3135920544417877, "grad_norm": 1.0039288385867127, "learning_rate": 4.1980768600711194e-05, "loss": 0.9169, "mean_token_accuracy": 0.736884355545044, "step": 1705 }, { "epoch": 0.31451167923487217, "grad_norm": 0.9456279018137994, "learning_rate": 4.1928329143959506e-05, "loss": 0.9198, "mean_token_accuracy": 0.7341038465499878, "step": 1710 }, { "epoch": 0.3154313040279566, "grad_norm": 0.969219875361889, "learning_rate": 4.18757562383661e-05, "loss": 0.9586, "mean_token_accuracy": 0.7229322910308837, "step": 1715 }, { "epoch": 0.316350928821041, "grad_norm": 0.9823553221239351, "learning_rate": 4.182305037018224e-05, "loss": 0.8674, "mean_token_accuracy": 0.7455045938491821, "step": 1720 }, { "epoch": 0.31727055361412543, "grad_norm": 0.9614849491835867, "learning_rate": 4.1770212026888974e-05, "loss": 0.8978, "mean_token_accuracy": 0.7393216609954834, "step": 1725 }, { "epoch": 0.31819017840720987, "grad_norm": 1.0298443865011644, "learning_rate": 4.1717241697192636e-05, "loss": 0.9046, "mean_token_accuracy": 0.7390219569206238, "step": 1730 }, { "epoch": 0.3191098032002943, "grad_norm": 0.9675044814332657, "learning_rate": 4.166413987102031e-05, "loss": 0.9014, "mean_token_accuracy": 0.7412125468254089, "step": 1735 }, { "epoch": 0.3200294279933787, "grad_norm": 0.9558901216962499, "learning_rate": 4.161090703951528e-05, "loss": 0.8915, "mean_token_accuracy": 0.7442119359970093, "step": 1740 }, { "epoch": 0.32094905278646313, "grad_norm": 1.0231471726772243, "learning_rate": 4.155754369503254e-05, "loss": 0.9508, "mean_token_accuracy": 0.7272051572799683, "step": 1745 }, { "epoch": 0.32186867757954757, "grad_norm": 0.971225693001968, "learning_rate": 4.1504050331134186e-05, "loss": 0.9271, "mean_token_accuracy": 0.7334083676338196, "step": 1750 }, { "epoch": 0.32278830237263195, "grad_norm": 0.9487975621871125, "learning_rate": 4.1450427442584885e-05, "loss": 0.9231, "mean_token_accuracy": 0.7330006003379822, "step": 1755 }, { "epoch": 0.3237079271657164, "grad_norm": 1.080234485746019, "learning_rate": 4.13966755253473e-05, "loss": 0.8934, "mean_token_accuracy": 0.7371908903121949, "step": 1760 }, { "epoch": 0.32462755195880083, "grad_norm": 1.0042744657060512, "learning_rate": 4.134279507657746e-05, "loss": 0.9357, "mean_token_accuracy": 0.7307947874069214, "step": 1765 }, { "epoch": 0.3255471767518852, "grad_norm": 1.0167454318885076, "learning_rate": 4.1288786594620224e-05, "loss": 0.9522, "mean_token_accuracy": 0.7250777244567871, "step": 1770 }, { "epoch": 0.32646680154496965, "grad_norm": 1.0378785371682158, "learning_rate": 4.123465057900463e-05, "loss": 0.8991, "mean_token_accuracy": 0.7383182883262634, "step": 1775 }, { "epoch": 0.3273864263380541, "grad_norm": 0.975574798117687, "learning_rate": 4.118038753043927e-05, "loss": 0.8962, "mean_token_accuracy": 0.7391498327255249, "step": 1780 }, { "epoch": 0.3283060511311385, "grad_norm": 0.9785593634297269, "learning_rate": 4.112599795080771e-05, "loss": 0.8976, "mean_token_accuracy": 0.7406945347785949, "step": 1785 }, { "epoch": 0.3292256759242229, "grad_norm": 0.9506069452238485, "learning_rate": 4.107148234316378e-05, "loss": 0.9792, "mean_token_accuracy": 0.7183930397033691, "step": 1790 }, { "epoch": 0.33014530071730736, "grad_norm": 0.9568388159915644, "learning_rate": 4.101684121172696e-05, "loss": 0.9445, "mean_token_accuracy": 0.7280240654945374, "step": 1795 }, { "epoch": 0.33106492551039174, "grad_norm": 1.022357456314008, "learning_rate": 4.096207506187773e-05, "loss": 0.9394, "mean_token_accuracy": 0.7300898432731628, "step": 1800 }, { "epoch": 0.3319845503034762, "grad_norm": 0.993312074550177, "learning_rate": 4.090718440015285e-05, "loss": 0.8857, "mean_token_accuracy": 0.7397880554199219, "step": 1805 }, { "epoch": 0.3329041750965606, "grad_norm": 0.9393217165901138, "learning_rate": 4.0852169734240715e-05, "loss": 0.9055, "mean_token_accuracy": 0.7397056937217712, "step": 1810 }, { "epoch": 0.333823799889645, "grad_norm": 1.0286146516865022, "learning_rate": 4.0797031572976644e-05, "loss": 0.9486, "mean_token_accuracy": 0.7270653247833252, "step": 1815 }, { "epoch": 0.33474342468272944, "grad_norm": 1.0433673618214743, "learning_rate": 4.074177042633818e-05, "loss": 0.8654, "mean_token_accuracy": 0.7493741869926452, "step": 1820 }, { "epoch": 0.3356630494758139, "grad_norm": 0.9978374983290279, "learning_rate": 4.068638680544035e-05, "loss": 0.9434, "mean_token_accuracy": 0.7284141898155212, "step": 1825 }, { "epoch": 0.33658267426889826, "grad_norm": 0.9268570875914646, "learning_rate": 4.063088122253096e-05, "loss": 0.9323, "mean_token_accuracy": 0.7292568445205688, "step": 1830 }, { "epoch": 0.3375022990619827, "grad_norm": 1.0098370277606412, "learning_rate": 4.05752541909859e-05, "loss": 0.8831, "mean_token_accuracy": 0.7427129149436951, "step": 1835 }, { "epoch": 0.33842192385506714, "grad_norm": 0.9840521255378257, "learning_rate": 4.0519506225304266e-05, "loss": 0.9129, "mean_token_accuracy": 0.7376075983047485, "step": 1840 }, { "epoch": 0.3393415486481516, "grad_norm": 0.9706147022595509, "learning_rate": 4.046363784110375e-05, "loss": 0.8867, "mean_token_accuracy": 0.7421358585357666, "step": 1845 }, { "epoch": 0.34026117344123596, "grad_norm": 1.0544553608523015, "learning_rate": 4.040764955511577e-05, "loss": 0.9404, "mean_token_accuracy": 0.7300120830535889, "step": 1850 }, { "epoch": 0.3411807982343204, "grad_norm": 0.9771051625951763, "learning_rate": 4.035154188518076e-05, "loss": 0.92, "mean_token_accuracy": 0.7353024840354919, "step": 1855 }, { "epoch": 0.34210042302740484, "grad_norm": 0.9612601058837731, "learning_rate": 4.02953153502433e-05, "loss": 0.8822, "mean_token_accuracy": 0.7446259975433349, "step": 1860 }, { "epoch": 0.3430200478204892, "grad_norm": 1.0790844365415948, "learning_rate": 4.0238970470347404e-05, "loss": 0.9243, "mean_token_accuracy": 0.7315137147903442, "step": 1865 }, { "epoch": 0.34393967261357367, "grad_norm": 0.9988868690440261, "learning_rate": 4.018250776663164e-05, "loss": 0.8875, "mean_token_accuracy": 0.7421119809150696, "step": 1870 }, { "epoch": 0.3448592974066581, "grad_norm": 1.0571095915292046, "learning_rate": 4.012592776132435e-05, "loss": 0.9273, "mean_token_accuracy": 0.731085193157196, "step": 1875 }, { "epoch": 0.3457789221997425, "grad_norm": 1.135743652086019, "learning_rate": 4.0069230977738826e-05, "loss": 0.9534, "mean_token_accuracy": 0.7248372554779052, "step": 1880 }, { "epoch": 0.34669854699282693, "grad_norm": 0.9715071563775657, "learning_rate": 4.001241794026842e-05, "loss": 0.94, "mean_token_accuracy": 0.731473171710968, "step": 1885 }, { "epoch": 0.34761817178591137, "grad_norm": 0.9942342778662301, "learning_rate": 3.9955489174381746e-05, "loss": 0.9329, "mean_token_accuracy": 0.7310616850852967, "step": 1890 }, { "epoch": 0.34853779657899575, "grad_norm": 1.0075175249825896, "learning_rate": 3.989844520661779e-05, "loss": 0.9438, "mean_token_accuracy": 0.7262274742126464, "step": 1895 }, { "epoch": 0.3494574213720802, "grad_norm": 0.9753954477573876, "learning_rate": 3.984128656458106e-05, "loss": 0.9702, "mean_token_accuracy": 0.7193968415260314, "step": 1900 }, { "epoch": 0.35037704616516463, "grad_norm": 1.0133558076382343, "learning_rate": 3.978401377693669e-05, "loss": 0.873, "mean_token_accuracy": 0.7490906119346619, "step": 1905 }, { "epoch": 0.351296670958249, "grad_norm": 1.0343688728685794, "learning_rate": 3.9726627373405544e-05, "loss": 0.9308, "mean_token_accuracy": 0.7297749042510986, "step": 1910 }, { "epoch": 0.35221629575133345, "grad_norm": 0.9695668089988693, "learning_rate": 3.966912788475937e-05, "loss": 0.9028, "mean_token_accuracy": 0.7381954431533814, "step": 1915 }, { "epoch": 0.3531359205444179, "grad_norm": 0.9832664588504738, "learning_rate": 3.961151584281581e-05, "loss": 0.8815, "mean_token_accuracy": 0.7429476737976074, "step": 1920 }, { "epoch": 0.3540555453375023, "grad_norm": 0.963687599953708, "learning_rate": 3.955379178043352e-05, "loss": 0.9823, "mean_token_accuracy": 0.7177613019943238, "step": 1925 }, { "epoch": 0.3549751701305867, "grad_norm": 0.9479437389842555, "learning_rate": 3.9495956231507266e-05, "loss": 0.9274, "mean_token_accuracy": 0.7312801122665405, "step": 1930 }, { "epoch": 0.35589479492367115, "grad_norm": 0.938691928481946, "learning_rate": 3.943800973096296e-05, "loss": 0.9017, "mean_token_accuracy": 0.7394131779670715, "step": 1935 }, { "epoch": 0.35681441971675554, "grad_norm": 0.967769246759337, "learning_rate": 3.937995281475269e-05, "loss": 0.9216, "mean_token_accuracy": 0.7352214097976685, "step": 1940 }, { "epoch": 0.35773404450984, "grad_norm": 0.9613349378582403, "learning_rate": 3.932178601984982e-05, "loss": 0.8861, "mean_token_accuracy": 0.7429886102676392, "step": 1945 }, { "epoch": 0.3586536693029244, "grad_norm": 0.9739202222729397, "learning_rate": 3.926350988424397e-05, "loss": 0.8628, "mean_token_accuracy": 0.7480137705802917, "step": 1950 }, { "epoch": 0.35957329409600886, "grad_norm": 1.00417983410191, "learning_rate": 3.920512494693607e-05, "loss": 0.879, "mean_token_accuracy": 0.7440518856048584, "step": 1955 }, { "epoch": 0.36049291888909324, "grad_norm": 1.0098406374163094, "learning_rate": 3.9146631747933366e-05, "loss": 0.8329, "mean_token_accuracy": 0.759476363658905, "step": 1960 }, { "epoch": 0.3614125436821777, "grad_norm": 0.9962046099940254, "learning_rate": 3.908803082824441e-05, "loss": 0.8369, "mean_token_accuracy": 0.7543352007865906, "step": 1965 }, { "epoch": 0.3623321684752621, "grad_norm": 1.0229275697874085, "learning_rate": 3.9029322729874104e-05, "loss": 0.9319, "mean_token_accuracy": 0.7315138220787049, "step": 1970 }, { "epoch": 0.3632517932683465, "grad_norm": 0.9131833883898176, "learning_rate": 3.8970507995818636e-05, "loss": 0.8373, "mean_token_accuracy": 0.754296875, "step": 1975 }, { "epoch": 0.36417141806143094, "grad_norm": 0.9558351857573911, "learning_rate": 3.891158717006046e-05, "loss": 0.892, "mean_token_accuracy": 0.7430965900421143, "step": 1980 }, { "epoch": 0.3650910428545154, "grad_norm": 0.9446973659937214, "learning_rate": 3.885256079756331e-05, "loss": 0.9394, "mean_token_accuracy": 0.7250162839889527, "step": 1985 }, { "epoch": 0.36601066764759976, "grad_norm": 0.9202948815573198, "learning_rate": 3.879342942426711e-05, "loss": 0.9124, "mean_token_accuracy": 0.7363432049751282, "step": 1990 }, { "epoch": 0.3669302924406842, "grad_norm": 0.9507433703052857, "learning_rate": 3.8734193597082964e-05, "loss": 0.9265, "mean_token_accuracy": 0.7309059858322143, "step": 1995 }, { "epoch": 0.36784991723376864, "grad_norm": 0.9721403940210892, "learning_rate": 3.867485386388806e-05, "loss": 0.9368, "mean_token_accuracy": 0.7331580281257629, "step": 2000 }, { "epoch": 0.368769542026853, "grad_norm": 0.9405505899400793, "learning_rate": 3.8615410773520635e-05, "loss": 0.9138, "mean_token_accuracy": 0.7358463048934937, "step": 2005 }, { "epoch": 0.36968916681993746, "grad_norm": 0.963025470188593, "learning_rate": 3.8555864875774885e-05, "loss": 0.9019, "mean_token_accuracy": 0.7384212732315063, "step": 2010 }, { "epoch": 0.3706087916130219, "grad_norm": 0.9907971594256944, "learning_rate": 3.849621672139588e-05, "loss": 0.8763, "mean_token_accuracy": 0.7444020867347717, "step": 2015 }, { "epoch": 0.3715284164061063, "grad_norm": 0.981696155165083, "learning_rate": 3.843646686207445e-05, "loss": 0.9202, "mean_token_accuracy": 0.7325111865997315, "step": 2020 }, { "epoch": 0.3724480411991907, "grad_norm": 0.990078628199776, "learning_rate": 3.837661585044211e-05, "loss": 0.9045, "mean_token_accuracy": 0.7379343152046204, "step": 2025 }, { "epoch": 0.37336766599227517, "grad_norm": 0.9302652014201332, "learning_rate": 3.831666424006598e-05, "loss": 0.9145, "mean_token_accuracy": 0.7369246363639832, "step": 2030 }, { "epoch": 0.37428729078535955, "grad_norm": 1.0127134327540788, "learning_rate": 3.825661258544358e-05, "loss": 0.8949, "mean_token_accuracy": 0.740783178806305, "step": 2035 }, { "epoch": 0.375206915578444, "grad_norm": 0.9456025309406082, "learning_rate": 3.819646144199777e-05, "loss": 0.8635, "mean_token_accuracy": 0.749360203742981, "step": 2040 }, { "epoch": 0.37612654037152843, "grad_norm": 0.9458510607283644, "learning_rate": 3.813621136607157e-05, "loss": 0.9212, "mean_token_accuracy": 0.7321518301963806, "step": 2045 }, { "epoch": 0.3770461651646128, "grad_norm": 0.995792214246869, "learning_rate": 3.8075862914923074e-05, "loss": 0.9529, "mean_token_accuracy": 0.7222961544990539, "step": 2050 }, { "epoch": 0.37796578995769725, "grad_norm": 0.931780686224964, "learning_rate": 3.801541664672021e-05, "loss": 0.9068, "mean_token_accuracy": 0.7373356938362121, "step": 2055 }, { "epoch": 0.3788854147507817, "grad_norm": 1.032699719779323, "learning_rate": 3.795487312053566e-05, "loss": 0.8428, "mean_token_accuracy": 0.754009485244751, "step": 2060 }, { "epoch": 0.37980503954386613, "grad_norm": 1.0082536583803767, "learning_rate": 3.789423289634163e-05, "loss": 0.8877, "mean_token_accuracy": 0.7419803261756897, "step": 2065 }, { "epoch": 0.3807246643369505, "grad_norm": 0.9922794484448726, "learning_rate": 3.783349653500472e-05, "loss": 0.9549, "mean_token_accuracy": 0.7244602799415588, "step": 2070 }, { "epoch": 0.38164428913003495, "grad_norm": 0.9289765959162268, "learning_rate": 3.777266459828067e-05, "loss": 0.9049, "mean_token_accuracy": 0.7346539378166199, "step": 2075 }, { "epoch": 0.3825639139231194, "grad_norm": 0.9418822148176986, "learning_rate": 3.7711737648809255e-05, "loss": 0.8631, "mean_token_accuracy": 0.7498388290405273, "step": 2080 }, { "epoch": 0.3834835387162038, "grad_norm": 0.9739714347813362, "learning_rate": 3.765071625010899e-05, "loss": 0.8642, "mean_token_accuracy": 0.7496488690376282, "step": 2085 }, { "epoch": 0.3844031635092882, "grad_norm": 0.9876318304111896, "learning_rate": 3.758960096657197e-05, "loss": 0.9409, "mean_token_accuracy": 0.7231215476989746, "step": 2090 }, { "epoch": 0.38532278830237265, "grad_norm": 0.9391298182307426, "learning_rate": 3.752839236345866e-05, "loss": 0.9321, "mean_token_accuracy": 0.7299721479415894, "step": 2095 }, { "epoch": 0.38624241309545704, "grad_norm": 0.9975883406823954, "learning_rate": 3.746709100689263e-05, "loss": 0.9119, "mean_token_accuracy": 0.7372664332389831, "step": 2100 }, { "epoch": 0.3871620378885415, "grad_norm": 0.9585598143365737, "learning_rate": 3.740569746385531e-05, "loss": 0.9511, "mean_token_accuracy": 0.7252285242080688, "step": 2105 }, { "epoch": 0.3880816626816259, "grad_norm": 0.9708930878655039, "learning_rate": 3.7344212302180807e-05, "loss": 0.9021, "mean_token_accuracy": 0.7373741269111633, "step": 2110 }, { "epoch": 0.3890012874747103, "grad_norm": 0.9842480657825518, "learning_rate": 3.7282636090550613e-05, "loss": 0.9155, "mean_token_accuracy": 0.7346144676208496, "step": 2115 }, { "epoch": 0.38992091226779474, "grad_norm": 1.010319909401371, "learning_rate": 3.722096939848833e-05, "loss": 0.8251, "mean_token_accuracy": 0.7569172263145447, "step": 2120 }, { "epoch": 0.3908405370608792, "grad_norm": 1.0232782350312868, "learning_rate": 3.7159212796354425e-05, "loss": 0.9061, "mean_token_accuracy": 0.7363372683525086, "step": 2125 }, { "epoch": 0.39176016185396356, "grad_norm": 0.9853933308782586, "learning_rate": 3.7097366855340974e-05, "loss": 0.9281, "mean_token_accuracy": 0.7297635912895203, "step": 2130 }, { "epoch": 0.392679786647048, "grad_norm": 1.0085562594833883, "learning_rate": 3.703543214746632e-05, "loss": 0.9345, "mean_token_accuracy": 0.7267664670944214, "step": 2135 }, { "epoch": 0.39359941144013244, "grad_norm": 0.9907065624349415, "learning_rate": 3.6973409245569846e-05, "loss": 0.9017, "mean_token_accuracy": 0.7393394112586975, "step": 2140 }, { "epoch": 0.3945190362332168, "grad_norm": 0.9488707860528096, "learning_rate": 3.691129872330663e-05, "loss": 0.9373, "mean_token_accuracy": 0.728193199634552, "step": 2145 }, { "epoch": 0.39543866102630126, "grad_norm": 0.9103606197233259, "learning_rate": 3.684910115514218e-05, "loss": 0.897, "mean_token_accuracy": 0.7412585973739624, "step": 2150 }, { "epoch": 0.3963582858193857, "grad_norm": 0.965709462156266, "learning_rate": 3.678681711634708e-05, "loss": 0.8715, "mean_token_accuracy": 0.74575275182724, "step": 2155 }, { "epoch": 0.3972779106124701, "grad_norm": 1.0272326947622106, "learning_rate": 3.67244471829917e-05, "loss": 0.8789, "mean_token_accuracy": 0.7422020196914673, "step": 2160 }, { "epoch": 0.3981975354055545, "grad_norm": 0.9300588922771316, "learning_rate": 3.6661991931940856e-05, "loss": 0.8945, "mean_token_accuracy": 0.7385678648948669, "step": 2165 }, { "epoch": 0.39911716019863896, "grad_norm": 1.002757392159615, "learning_rate": 3.6599451940848446e-05, "loss": 0.8993, "mean_token_accuracy": 0.7361081838607788, "step": 2170 }, { "epoch": 0.4000367849917234, "grad_norm": 1.1036859227862066, "learning_rate": 3.6536827788152176e-05, "loss": 0.9308, "mean_token_accuracy": 0.7304606318473816, "step": 2175 }, { "epoch": 0.4009564097848078, "grad_norm": 0.9701793563305904, "learning_rate": 3.6474120053068164e-05, "loss": 0.8472, "mean_token_accuracy": 0.7498792171478271, "step": 2180 }, { "epoch": 0.4018760345778922, "grad_norm": 1.041733702997736, "learning_rate": 3.641132931558556e-05, "loss": 0.9581, "mean_token_accuracy": 0.7201631188392639, "step": 2185 }, { "epoch": 0.40279565937097667, "grad_norm": 1.0348942168040987, "learning_rate": 3.634845615646123e-05, "loss": 0.9393, "mean_token_accuracy": 0.7280836224555969, "step": 2190 }, { "epoch": 0.40371528416406105, "grad_norm": 1.0131734961320986, "learning_rate": 3.628550115721437e-05, "loss": 0.927, "mean_token_accuracy": 0.729682469367981, "step": 2195 }, { "epoch": 0.4046349089571455, "grad_norm": 1.025738826571974, "learning_rate": 3.622246490012111e-05, "loss": 0.9357, "mean_token_accuracy": 0.724788224697113, "step": 2200 }, { "epoch": 0.40555453375022993, "grad_norm": 0.9501914998942569, "learning_rate": 3.615934796820915e-05, "loss": 0.8978, "mean_token_accuracy": 0.7385434865951538, "step": 2205 }, { "epoch": 0.4064741585433143, "grad_norm": 1.0106650660729533, "learning_rate": 3.609615094525235e-05, "loss": 0.952, "mean_token_accuracy": 0.7243346452713013, "step": 2210 }, { "epoch": 0.40739378333639875, "grad_norm": 0.9301771755028939, "learning_rate": 3.6032874415765344e-05, "loss": 0.8633, "mean_token_accuracy": 0.7481309175491333, "step": 2215 }, { "epoch": 0.4083134081294832, "grad_norm": 0.9662316400458029, "learning_rate": 3.596951896499813e-05, "loss": 0.8931, "mean_token_accuracy": 0.7380975484848022, "step": 2220 }, { "epoch": 0.4092330329225676, "grad_norm": 0.9612362754674141, "learning_rate": 3.590608517893065e-05, "loss": 0.8787, "mean_token_accuracy": 0.743196439743042, "step": 2225 }, { "epoch": 0.410152657715652, "grad_norm": 0.9923328807528666, "learning_rate": 3.584257364426738e-05, "loss": 0.942, "mean_token_accuracy": 0.7252677202224731, "step": 2230 }, { "epoch": 0.41107228250873645, "grad_norm": 0.9797715702136052, "learning_rate": 3.577898494843191e-05, "loss": 0.9523, "mean_token_accuracy": 0.7244603157043457, "step": 2235 }, { "epoch": 0.41199190730182084, "grad_norm": 0.9048445218025765, "learning_rate": 3.571531967956147e-05, "loss": 0.9136, "mean_token_accuracy": 0.7320458292961121, "step": 2240 }, { "epoch": 0.4129115320949053, "grad_norm": 0.9649058945655278, "learning_rate": 3.565157842650154e-05, "loss": 0.9041, "mean_token_accuracy": 0.7362257719039917, "step": 2245 }, { "epoch": 0.4138311568879897, "grad_norm": 0.9147474250541198, "learning_rate": 3.55877617788004e-05, "loss": 0.9155, "mean_token_accuracy": 0.7333362221717834, "step": 2250 }, { "epoch": 0.4147507816810741, "grad_norm": 0.876619458906422, "learning_rate": 3.5523870326703635e-05, "loss": 0.8492, "mean_token_accuracy": 0.7528911828994751, "step": 2255 }, { "epoch": 0.41567040647415854, "grad_norm": 1.0036194468259731, "learning_rate": 3.545990466114871e-05, "loss": 0.9137, "mean_token_accuracy": 0.734946858882904, "step": 2260 }, { "epoch": 0.416590031267243, "grad_norm": 0.9978348158615458, "learning_rate": 3.5395865373759504e-05, "loss": 0.8815, "mean_token_accuracy": 0.742937445640564, "step": 2265 }, { "epoch": 0.41750965606032736, "grad_norm": 0.9799485166888982, "learning_rate": 3.533175305684081e-05, "loss": 0.8857, "mean_token_accuracy": 0.7412702798843384, "step": 2270 }, { "epoch": 0.4184292808534118, "grad_norm": 0.9766101000667111, "learning_rate": 3.5267568303372914e-05, "loss": 0.8934, "mean_token_accuracy": 0.7409379720687866, "step": 2275 }, { "epoch": 0.41934890564649624, "grad_norm": 0.9775807722195559, "learning_rate": 3.520331170700605e-05, "loss": 0.9067, "mean_token_accuracy": 0.7377767205238343, "step": 2280 }, { "epoch": 0.4202685304395807, "grad_norm": 0.9690742278243399, "learning_rate": 3.513898386205491e-05, "loss": 0.9032, "mean_token_accuracy": 0.7356434345245362, "step": 2285 }, { "epoch": 0.42118815523266506, "grad_norm": 0.965511424805927, "learning_rate": 3.507458536349323e-05, "loss": 0.9157, "mean_token_accuracy": 0.7343951106071472, "step": 2290 }, { "epoch": 0.4221077800257495, "grad_norm": 0.9486968791577164, "learning_rate": 3.5010116806948166e-05, "loss": 0.901, "mean_token_accuracy": 0.7399522423744201, "step": 2295 }, { "epoch": 0.42302740481883394, "grad_norm": 0.9414293890579761, "learning_rate": 3.4945578788694894e-05, "loss": 0.9179, "mean_token_accuracy": 0.7342228889465332, "step": 2300 }, { "epoch": 0.4239470296119183, "grad_norm": 0.9896377940060639, "learning_rate": 3.4880971905651016e-05, "loss": 0.8784, "mean_token_accuracy": 0.7457787752151489, "step": 2305 }, { "epoch": 0.42486665440500276, "grad_norm": 0.9655527131977069, "learning_rate": 3.481629675537108e-05, "loss": 0.863, "mean_token_accuracy": 0.7453173756599426, "step": 2310 }, { "epoch": 0.4257862791980872, "grad_norm": 0.8936296988219236, "learning_rate": 3.475155393604104e-05, "loss": 0.8856, "mean_token_accuracy": 0.7441475629806519, "step": 2315 }, { "epoch": 0.4267059039911716, "grad_norm": 0.9149916486904485, "learning_rate": 3.468674404647273e-05, "loss": 0.8532, "mean_token_accuracy": 0.7507219910621643, "step": 2320 }, { "epoch": 0.427625528784256, "grad_norm": 0.9750792604803812, "learning_rate": 3.462186768609834e-05, "loss": 0.863, "mean_token_accuracy": 0.7469933509826661, "step": 2325 }, { "epoch": 0.42854515357734047, "grad_norm": 0.980901247745682, "learning_rate": 3.455692545496483e-05, "loss": 0.837, "mean_token_accuracy": 0.7545093297958374, "step": 2330 }, { "epoch": 0.42946477837042485, "grad_norm": 0.9686839306544004, "learning_rate": 3.4491917953728396e-05, "loss": 0.8885, "mean_token_accuracy": 0.7428396463394165, "step": 2335 }, { "epoch": 0.4303844031635093, "grad_norm": 0.9388350160272184, "learning_rate": 3.442684578364897e-05, "loss": 0.8951, "mean_token_accuracy": 0.7408537268638611, "step": 2340 }, { "epoch": 0.4313040279565937, "grad_norm": 0.8933385447401438, "learning_rate": 3.4361709546584545e-05, "loss": 0.8689, "mean_token_accuracy": 0.7458449006080627, "step": 2345 }, { "epoch": 0.4322236527496781, "grad_norm": 0.9411177313363235, "learning_rate": 3.429650984498573e-05, "loss": 0.8417, "mean_token_accuracy": 0.7528134107589721, "step": 2350 }, { "epoch": 0.43314327754276255, "grad_norm": 0.9359109119006161, "learning_rate": 3.423124728189009e-05, "loss": 0.8737, "mean_token_accuracy": 0.7434362411499024, "step": 2355 }, { "epoch": 0.434062902335847, "grad_norm": 0.966957214742338, "learning_rate": 3.4165922460916635e-05, "loss": 0.8946, "mean_token_accuracy": 0.7397825956344605, "step": 2360 }, { "epoch": 0.4349825271289314, "grad_norm": 0.9950941777576424, "learning_rate": 3.410053598626016e-05, "loss": 0.8833, "mean_token_accuracy": 0.7447291493415833, "step": 2365 }, { "epoch": 0.4359021519220158, "grad_norm": 0.963560335329199, "learning_rate": 3.403508846268574e-05, "loss": 0.8675, "mean_token_accuracy": 0.7479366779327392, "step": 2370 }, { "epoch": 0.43682177671510025, "grad_norm": 0.9286384422364868, "learning_rate": 3.396958049552307e-05, "loss": 0.9171, "mean_token_accuracy": 0.7304298520088196, "step": 2375 }, { "epoch": 0.43774140150818464, "grad_norm": 0.9750119805406471, "learning_rate": 3.39040126906609e-05, "loss": 0.8858, "mean_token_accuracy": 0.742851734161377, "step": 2380 }, { "epoch": 0.4386610263012691, "grad_norm": 0.9160809046368507, "learning_rate": 3.383838565454144e-05, "loss": 0.9062, "mean_token_accuracy": 0.7335192441940308, "step": 2385 }, { "epoch": 0.4395806510943535, "grad_norm": 0.9668435486381742, "learning_rate": 3.37726999941547e-05, "loss": 0.9243, "mean_token_accuracy": 0.7276196122169495, "step": 2390 }, { "epoch": 0.4405002758874379, "grad_norm": 0.9935097247563913, "learning_rate": 3.3706956317032954e-05, "loss": 0.8678, "mean_token_accuracy": 0.7438644409179688, "step": 2395 }, { "epoch": 0.44141990068052234, "grad_norm": 0.9939894791042586, "learning_rate": 3.364115523124503e-05, "loss": 0.8904, "mean_token_accuracy": 0.7412869215011597, "step": 2400 }, { "epoch": 0.4423395254736068, "grad_norm": 0.9937645932689831, "learning_rate": 3.357529734539079e-05, "loss": 0.8455, "mean_token_accuracy": 0.7517339706420898, "step": 2405 }, { "epoch": 0.4432591502666912, "grad_norm": 0.9375114941684974, "learning_rate": 3.350938326859539e-05, "loss": 0.8468, "mean_token_accuracy": 0.7528372883796692, "step": 2410 }, { "epoch": 0.4441787750597756, "grad_norm": 0.8973960962242926, "learning_rate": 3.3443413610503735e-05, "loss": 0.878, "mean_token_accuracy": 0.7442919254302979, "step": 2415 }, { "epoch": 0.44509839985286004, "grad_norm": 1.0080330285869648, "learning_rate": 3.337738898127479e-05, "loss": 0.8785, "mean_token_accuracy": 0.7428927779197693, "step": 2420 }, { "epoch": 0.4460180246459445, "grad_norm": 0.8985281228115014, "learning_rate": 3.331130999157597e-05, "loss": 0.8644, "mean_token_accuracy": 0.7480224132537842, "step": 2425 }, { "epoch": 0.44693764943902886, "grad_norm": 0.9291069202904676, "learning_rate": 3.3245177252577454e-05, "loss": 0.8976, "mean_token_accuracy": 0.7383280873298645, "step": 2430 }, { "epoch": 0.4478572742321133, "grad_norm": 0.9623008963786942, "learning_rate": 3.317899137594656e-05, "loss": 0.9593, "mean_token_accuracy": 0.7246118664741517, "step": 2435 }, { "epoch": 0.44877689902519774, "grad_norm": 0.9234507163948065, "learning_rate": 3.311275297384208e-05, "loss": 0.8413, "mean_token_accuracy": 0.7528854846954346, "step": 2440 }, { "epoch": 0.4496965238182821, "grad_norm": 0.979267043456503, "learning_rate": 3.3046462658908636e-05, "loss": 0.845, "mean_token_accuracy": 0.7532721877098083, "step": 2445 }, { "epoch": 0.45061614861136656, "grad_norm": 0.9032231134895651, "learning_rate": 3.298012104427097e-05, "loss": 0.895, "mean_token_accuracy": 0.7396630644798279, "step": 2450 }, { "epoch": 0.451535773404451, "grad_norm": 0.9383158653652773, "learning_rate": 3.291372874352832e-05, "loss": 0.8943, "mean_token_accuracy": 0.73899405002594, "step": 2455 }, { "epoch": 0.4524553981975354, "grad_norm": 0.9664126873169693, "learning_rate": 3.284728637074869e-05, "loss": 0.869, "mean_token_accuracy": 0.746407687664032, "step": 2460 }, { "epoch": 0.4533750229906198, "grad_norm": 0.993853088939543, "learning_rate": 3.278079454046325e-05, "loss": 0.9011, "mean_token_accuracy": 0.7388368129730225, "step": 2465 }, { "epoch": 0.45429464778370426, "grad_norm": 0.8741206209918251, "learning_rate": 3.271425386766058e-05, "loss": 0.8388, "mean_token_accuracy": 0.7533232569694519, "step": 2470 }, { "epoch": 0.45521427257678865, "grad_norm": 0.9447835076472045, "learning_rate": 3.2647664967781035e-05, "loss": 0.8228, "mean_token_accuracy": 0.7583665132522583, "step": 2475 }, { "epoch": 0.4561338973698731, "grad_norm": 1.0045001891415821, "learning_rate": 3.258102845671097e-05, "loss": 0.8934, "mean_token_accuracy": 0.7414227366447449, "step": 2480 }, { "epoch": 0.4570535221629575, "grad_norm": 0.9475063098055461, "learning_rate": 3.251434495077716e-05, "loss": 0.9182, "mean_token_accuracy": 0.7303388476371765, "step": 2485 }, { "epoch": 0.4579731469560419, "grad_norm": 0.9775463234456495, "learning_rate": 3.2447615066741004e-05, "loss": 0.9361, "mean_token_accuracy": 0.7293364763259887, "step": 2490 }, { "epoch": 0.45889277174912635, "grad_norm": 0.9174334893241889, "learning_rate": 3.238083942179288e-05, "loss": 0.8474, "mean_token_accuracy": 0.7529029250144958, "step": 2495 }, { "epoch": 0.4598123965422108, "grad_norm": 0.9021239390235616, "learning_rate": 3.2314018633546375e-05, "loss": 0.8314, "mean_token_accuracy": 0.7585980296134949, "step": 2500 }, { "epoch": 0.46073202133529517, "grad_norm": 0.9231622515184421, "learning_rate": 3.224715332003265e-05, "loss": 0.8498, "mean_token_accuracy": 0.7502579808235168, "step": 2505 }, { "epoch": 0.4616516461283796, "grad_norm": 0.9279166556927757, "learning_rate": 3.218024409969468e-05, "loss": 0.899, "mean_token_accuracy": 0.7380064010620118, "step": 2510 }, { "epoch": 0.46257127092146405, "grad_norm": 0.9333611856920211, "learning_rate": 3.2113291591381516e-05, "loss": 0.9113, "mean_token_accuracy": 0.7354224920272827, "step": 2515 }, { "epoch": 0.4634908957145485, "grad_norm": 0.9585859302538061, "learning_rate": 3.204629641434259e-05, "loss": 0.912, "mean_token_accuracy": 0.7332522869110107, "step": 2520 }, { "epoch": 0.4644105205076329, "grad_norm": 1.0072945032594127, "learning_rate": 3.197925918822199e-05, "loss": 0.8615, "mean_token_accuracy": 0.7460902214050293, "step": 2525 }, { "epoch": 0.4653301453007173, "grad_norm": 0.9703474311506037, "learning_rate": 3.1912180533052716e-05, "loss": 0.9391, "mean_token_accuracy": 0.7272826433181763, "step": 2530 }, { "epoch": 0.46624977009380175, "grad_norm": 0.9701812144923739, "learning_rate": 3.184506106925094e-05, "loss": 0.8677, "mean_token_accuracy": 0.747051191329956, "step": 2535 }, { "epoch": 0.46716939488688614, "grad_norm": 0.9672451609696705, "learning_rate": 3.177790141761029e-05, "loss": 0.8627, "mean_token_accuracy": 0.7482078075408936, "step": 2540 }, { "epoch": 0.4680890196799706, "grad_norm": 0.9530973638849749, "learning_rate": 3.1710702199296085e-05, "loss": 0.8492, "mean_token_accuracy": 0.7528972029685974, "step": 2545 }, { "epoch": 0.469008644473055, "grad_norm": 0.9084239076489461, "learning_rate": 3.16434640358396e-05, "loss": 0.8653, "mean_token_accuracy": 0.746622622013092, "step": 2550 }, { "epoch": 0.4699282692661394, "grad_norm": 0.9998420571855022, "learning_rate": 3.157618754913233e-05, "loss": 0.8975, "mean_token_accuracy": 0.738722312450409, "step": 2555 }, { "epoch": 0.47084789405922384, "grad_norm": 0.9250250902872688, "learning_rate": 3.15088733614202e-05, "loss": 0.8551, "mean_token_accuracy": 0.750208032131195, "step": 2560 }, { "epoch": 0.4717675188523083, "grad_norm": 1.0106796436372896, "learning_rate": 3.144152209529786e-05, "loss": 0.9079, "mean_token_accuracy": 0.7350385189056396, "step": 2565 }, { "epoch": 0.47268714364539266, "grad_norm": 0.9619558970415346, "learning_rate": 3.137413437370289e-05, "loss": 0.91, "mean_token_accuracy": 0.7369326472282409, "step": 2570 }, { "epoch": 0.4736067684384771, "grad_norm": 1.0109885841238913, "learning_rate": 3.130671081991005e-05, "loss": 0.9084, "mean_token_accuracy": 0.7353306174278259, "step": 2575 }, { "epoch": 0.47452639323156154, "grad_norm": 0.9779190292756188, "learning_rate": 3.123925205752552e-05, "loss": 0.8556, "mean_token_accuracy": 0.7515247583389282, "step": 2580 }, { "epoch": 0.4754460180246459, "grad_norm": 0.9645840220644, "learning_rate": 3.1171758710481096e-05, "loss": 0.8755, "mean_token_accuracy": 0.7436783194541932, "step": 2585 }, { "epoch": 0.47636564281773036, "grad_norm": 1.001058541812525, "learning_rate": 3.110423140302852e-05, "loss": 0.9096, "mean_token_accuracy": 0.7341774582862854, "step": 2590 }, { "epoch": 0.4772852676108148, "grad_norm": 0.8974468409856537, "learning_rate": 3.103667075973356e-05, "loss": 0.9083, "mean_token_accuracy": 0.7359666705131531, "step": 2595 }, { "epoch": 0.4782048924038992, "grad_norm": 1.0374371477545201, "learning_rate": 3.096907740547036e-05, "loss": 0.9111, "mean_token_accuracy": 0.7324892163276673, "step": 2600 }, { "epoch": 0.4791245171969836, "grad_norm": 0.9405864234939062, "learning_rate": 3.0901451965415595e-05, "loss": 0.812, "mean_token_accuracy": 0.7602822542190552, "step": 2605 }, { "epoch": 0.48004414199006806, "grad_norm": 0.9654353230874346, "learning_rate": 3.08337950650427e-05, "loss": 0.8978, "mean_token_accuracy": 0.7364333510398865, "step": 2610 }, { "epoch": 0.48096376678315245, "grad_norm": 1.0011041381512356, "learning_rate": 3.076610733011609e-05, "loss": 0.9049, "mean_token_accuracy": 0.7363562822341919, "step": 2615 }, { "epoch": 0.4818833915762369, "grad_norm": 0.9686831090055986, "learning_rate": 3.069838938668538e-05, "loss": 0.8898, "mean_token_accuracy": 0.7398189902305603, "step": 2620 }, { "epoch": 0.4828030163693213, "grad_norm": 0.9318085356157495, "learning_rate": 3.063064186107957e-05, "loss": 0.8791, "mean_token_accuracy": 0.7449330806732177, "step": 2625 }, { "epoch": 0.48372264116240576, "grad_norm": 0.8934228857530689, "learning_rate": 3.056286537990129e-05, "loss": 0.8632, "mean_token_accuracy": 0.7459052681922913, "step": 2630 }, { "epoch": 0.48464226595549015, "grad_norm": 0.9725972260652284, "learning_rate": 3.049506057002098e-05, "loss": 0.8541, "mean_token_accuracy": 0.7478031516075134, "step": 2635 }, { "epoch": 0.4855618907485746, "grad_norm": 0.9452628770649284, "learning_rate": 3.042722805857106e-05, "loss": 0.8555, "mean_token_accuracy": 0.746888279914856, "step": 2640 }, { "epoch": 0.486481515541659, "grad_norm": 0.8806175124503305, "learning_rate": 3.0359368472940208e-05, "loss": 0.9035, "mean_token_accuracy": 0.7369076132774353, "step": 2645 }, { "epoch": 0.4874011403347434, "grad_norm": 0.8988265278259941, "learning_rate": 3.029148244076749e-05, "loss": 0.8643, "mean_token_accuracy": 0.7449605345726014, "step": 2650 }, { "epoch": 0.48832076512782785, "grad_norm": 0.9176861265880045, "learning_rate": 3.022357058993657e-05, "loss": 0.8643, "mean_token_accuracy": 0.7462789297103882, "step": 2655 }, { "epoch": 0.4892403899209123, "grad_norm": 0.9232400004776917, "learning_rate": 3.0155633548569955e-05, "loss": 0.903, "mean_token_accuracy": 0.7353234887123108, "step": 2660 }, { "epoch": 0.4901600147139967, "grad_norm": 0.9476269194909095, "learning_rate": 3.008767194502309e-05, "loss": 0.9035, "mean_token_accuracy": 0.7386479258537293, "step": 2665 }, { "epoch": 0.4910796395070811, "grad_norm": 0.931067111141978, "learning_rate": 3.0019686407878617e-05, "loss": 0.8883, "mean_token_accuracy": 0.7414939045906067, "step": 2670 }, { "epoch": 0.49199926430016555, "grad_norm": 0.9153445295986272, "learning_rate": 2.995167756594055e-05, "loss": 0.8625, "mean_token_accuracy": 0.7501867294311524, "step": 2675 }, { "epoch": 0.49291888909324993, "grad_norm": 0.9210143810764434, "learning_rate": 2.988364604822845e-05, "loss": 0.8972, "mean_token_accuracy": 0.7386625647544861, "step": 2680 }, { "epoch": 0.4938385138863344, "grad_norm": 0.9925053868796728, "learning_rate": 2.9815592483971584e-05, "loss": 0.8458, "mean_token_accuracy": 0.751643443107605, "step": 2685 }, { "epoch": 0.4947581386794188, "grad_norm": 1.006336852347141, "learning_rate": 2.9747517502603167e-05, "loss": 0.8721, "mean_token_accuracy": 0.7480525851249695, "step": 2690 }, { "epoch": 0.4956777634725032, "grad_norm": 0.9701598502406181, "learning_rate": 2.967942173375447e-05, "loss": 0.8818, "mean_token_accuracy": 0.740173089504242, "step": 2695 }, { "epoch": 0.49659738826558764, "grad_norm": 0.9431128523024928, "learning_rate": 2.9611305807249052e-05, "loss": 0.8344, "mean_token_accuracy": 0.7551051139831543, "step": 2700 }, { "epoch": 0.4975170130586721, "grad_norm": 0.9346714282194056, "learning_rate": 2.95431703530969e-05, "loss": 0.835, "mean_token_accuracy": 0.7544684171676636, "step": 2705 }, { "epoch": 0.49843663785175646, "grad_norm": 0.9358393411052466, "learning_rate": 2.9475016001488608e-05, "loss": 0.8906, "mean_token_accuracy": 0.7427068829536438, "step": 2710 }, { "epoch": 0.4993562626448409, "grad_norm": 0.8867163340537708, "learning_rate": 2.9406843382789583e-05, "loss": 0.8719, "mean_token_accuracy": 0.745942211151123, "step": 2715 }, { "epoch": 0.5002758874379253, "grad_norm": 0.9212664551640851, "learning_rate": 2.9338653127534148e-05, "loss": 0.8562, "mean_token_accuracy": 0.7497703909873963, "step": 2720 }, { "epoch": 0.5011955122310098, "grad_norm": 0.9432905808331339, "learning_rate": 2.9270445866419766e-05, "loss": 0.8741, "mean_token_accuracy": 0.7432116866111755, "step": 2725 }, { "epoch": 0.5021151370240942, "grad_norm": 0.9512906709412812, "learning_rate": 2.92022222303012e-05, "loss": 0.8818, "mean_token_accuracy": 0.7435823440551758, "step": 2730 }, { "epoch": 0.5030347618171785, "grad_norm": 0.9468765725989278, "learning_rate": 2.9133982850184645e-05, "loss": 0.8627, "mean_token_accuracy": 0.748947024345398, "step": 2735 }, { "epoch": 0.503954386610263, "grad_norm": 1.0112504748902342, "learning_rate": 2.9065728357221927e-05, "loss": 0.8508, "mean_token_accuracy": 0.7537087440490723, "step": 2740 }, { "epoch": 0.5048740114033474, "grad_norm": 0.9649262010355393, "learning_rate": 2.899745938270465e-05, "loss": 0.8819, "mean_token_accuracy": 0.7414289236068725, "step": 2745 }, { "epoch": 0.5057936361964318, "grad_norm": 0.9373961423715033, "learning_rate": 2.8929176558058352e-05, "loss": 0.8876, "mean_token_accuracy": 0.741254198551178, "step": 2750 }, { "epoch": 0.5067132609895163, "grad_norm": 0.9616567239953456, "learning_rate": 2.8860880514836687e-05, "loss": 0.8826, "mean_token_accuracy": 0.7436172485351562, "step": 2755 }, { "epoch": 0.5076328857826007, "grad_norm": 0.9367792403626876, "learning_rate": 2.8792571884715546e-05, "loss": 0.8482, "mean_token_accuracy": 0.7529447674751282, "step": 2760 }, { "epoch": 0.5085525105756851, "grad_norm": 0.9104599971108884, "learning_rate": 2.8724251299487263e-05, "loss": 0.8753, "mean_token_accuracy": 0.7427584528923035, "step": 2765 }, { "epoch": 0.5094721353687696, "grad_norm": 1.0105096627504964, "learning_rate": 2.8655919391054732e-05, "loss": 0.8641, "mean_token_accuracy": 0.7479874610900878, "step": 2770 }, { "epoch": 0.510391760161854, "grad_norm": 0.9279979512504474, "learning_rate": 2.8587576791425568e-05, "loss": 0.8317, "mean_token_accuracy": 0.7535252571105957, "step": 2775 }, { "epoch": 0.5113113849549383, "grad_norm": 0.9297465828114925, "learning_rate": 2.8519224132706297e-05, "loss": 0.8774, "mean_token_accuracy": 0.7402622103691101, "step": 2780 }, { "epoch": 0.5122310097480228, "grad_norm": 0.9452271860575534, "learning_rate": 2.845086204709645e-05, "loss": 0.8771, "mean_token_accuracy": 0.744519031047821, "step": 2785 }, { "epoch": 0.5131506345411072, "grad_norm": 0.9830981203343458, "learning_rate": 2.838249116688277e-05, "loss": 0.9289, "mean_token_accuracy": 0.7298115253448486, "step": 2790 }, { "epoch": 0.5140702593341917, "grad_norm": 1.041430018260559, "learning_rate": 2.8314112124433334e-05, "loss": 0.9045, "mean_token_accuracy": 0.7383831977844239, "step": 2795 }, { "epoch": 0.5149898841272761, "grad_norm": 0.9620402098071436, "learning_rate": 2.8245725552191703e-05, "loss": 0.8634, "mean_token_accuracy": 0.746962821483612, "step": 2800 }, { "epoch": 0.5159095089203605, "grad_norm": 0.9015921123510985, "learning_rate": 2.8177332082671117e-05, "loss": 0.853, "mean_token_accuracy": 0.7487654685974121, "step": 2805 }, { "epoch": 0.516829133713445, "grad_norm": 0.9007228615494444, "learning_rate": 2.8108932348448553e-05, "loss": 0.8428, "mean_token_accuracy": 0.7535581469535828, "step": 2810 }, { "epoch": 0.5177487585065293, "grad_norm": 0.9827577309973088, "learning_rate": 2.8040526982158993e-05, "loss": 0.8789, "mean_token_accuracy": 0.7432992815971374, "step": 2815 }, { "epoch": 0.5186683832996137, "grad_norm": 0.9633925171762643, "learning_rate": 2.7972116616489464e-05, "loss": 0.8397, "mean_token_accuracy": 0.752094304561615, "step": 2820 }, { "epoch": 0.5195880080926982, "grad_norm": 0.9281148435495344, "learning_rate": 2.790370188417324e-05, "loss": 0.8596, "mean_token_accuracy": 0.7485750317573547, "step": 2825 }, { "epoch": 0.5205076328857826, "grad_norm": 1.0029136932204825, "learning_rate": 2.7835283417984005e-05, "loss": 0.8718, "mean_token_accuracy": 0.7433583855628967, "step": 2830 }, { "epoch": 0.521427257678867, "grad_norm": 0.9621263162970809, "learning_rate": 2.7766861850729958e-05, "loss": 0.8955, "mean_token_accuracy": 0.7394774556159973, "step": 2835 }, { "epoch": 0.5223468824719515, "grad_norm": 0.9670299071015823, "learning_rate": 2.7698437815247995e-05, "loss": 0.8529, "mean_token_accuracy": 0.7500015497207642, "step": 2840 }, { "epoch": 0.5232665072650359, "grad_norm": 0.9398184622397476, "learning_rate": 2.763001194439782e-05, "loss": 0.8447, "mean_token_accuracy": 0.7504964828491211, "step": 2845 }, { "epoch": 0.5241861320581203, "grad_norm": 0.8869891271688453, "learning_rate": 2.756158487105613e-05, "loss": 0.8404, "mean_token_accuracy": 0.7549336075782775, "step": 2850 }, { "epoch": 0.5251057568512048, "grad_norm": 0.9965820824716972, "learning_rate": 2.749315722811073e-05, "loss": 0.9179, "mean_token_accuracy": 0.7317790746688843, "step": 2855 }, { "epoch": 0.5260253816442891, "grad_norm": 0.9304946857092635, "learning_rate": 2.7424729648454717e-05, "loss": 0.8874, "mean_token_accuracy": 0.7398088812828064, "step": 2860 }, { "epoch": 0.5269450064373735, "grad_norm": 0.9880649590404676, "learning_rate": 2.735630276498058e-05, "loss": 0.8738, "mean_token_accuracy": 0.7432942867279053, "step": 2865 }, { "epoch": 0.527864631230458, "grad_norm": 0.9350070938993663, "learning_rate": 2.728787721057437e-05, "loss": 0.8758, "mean_token_accuracy": 0.7431787729263306, "step": 2870 }, { "epoch": 0.5287842560235424, "grad_norm": 0.8997664568286488, "learning_rate": 2.7219453618109853e-05, "loss": 0.842, "mean_token_accuracy": 0.7523634552955627, "step": 2875 }, { "epoch": 0.5297038808166268, "grad_norm": 0.9519585493296138, "learning_rate": 2.715103262044265e-05, "loss": 0.8744, "mean_token_accuracy": 0.7417232871055603, "step": 2880 }, { "epoch": 0.5306235056097113, "grad_norm": 0.8836119550117293, "learning_rate": 2.708261485040439e-05, "loss": 0.856, "mean_token_accuracy": 0.7496297836303711, "step": 2885 }, { "epoch": 0.5315431304027957, "grad_norm": 0.9589883589041829, "learning_rate": 2.7014200940796824e-05, "loss": 0.8418, "mean_token_accuracy": 0.7520057439804078, "step": 2890 }, { "epoch": 0.53246275519588, "grad_norm": 0.9563207815434712, "learning_rate": 2.694579152438601e-05, "loss": 0.8936, "mean_token_accuracy": 0.7398610949516297, "step": 2895 }, { "epoch": 0.5333823799889645, "grad_norm": 0.9233468769288075, "learning_rate": 2.6877387233896472e-05, "loss": 0.8634, "mean_token_accuracy": 0.745741093158722, "step": 2900 }, { "epoch": 0.5343020047820489, "grad_norm": 0.9541286928919233, "learning_rate": 2.6808988702005285e-05, "loss": 0.868, "mean_token_accuracy": 0.7439489006996155, "step": 2905 }, { "epoch": 0.5352216295751333, "grad_norm": 0.9922987370495847, "learning_rate": 2.6740596561336275e-05, "loss": 0.8482, "mean_token_accuracy": 0.7504428863525391, "step": 2910 }, { "epoch": 0.5361412543682178, "grad_norm": 0.9722831543231532, "learning_rate": 2.667221144445418e-05, "loss": 0.8177, "mean_token_accuracy": 0.7608316302299499, "step": 2915 }, { "epoch": 0.5370608791613022, "grad_norm": 1.0275441684092577, "learning_rate": 2.6603833983858738e-05, "loss": 0.9398, "mean_token_accuracy": 0.7276052117347718, "step": 2920 }, { "epoch": 0.5379805039543866, "grad_norm": 1.0068511170391965, "learning_rate": 2.6535464811978894e-05, "loss": 0.8424, "mean_token_accuracy": 0.7531503081321717, "step": 2925 }, { "epoch": 0.5389001287474711, "grad_norm": 0.9554905959505885, "learning_rate": 2.6467104561166927e-05, "loss": 0.8671, "mean_token_accuracy": 0.7456499934196472, "step": 2930 }, { "epoch": 0.5398197535405554, "grad_norm": 0.9318421761107843, "learning_rate": 2.639875386369261e-05, "loss": 0.8674, "mean_token_accuracy": 0.7474814653396606, "step": 2935 }, { "epoch": 0.5407393783336398, "grad_norm": 0.9797586514540253, "learning_rate": 2.6330413351737336e-05, "loss": 0.893, "mean_token_accuracy": 0.7371798276901245, "step": 2940 }, { "epoch": 0.5416590031267243, "grad_norm": 0.9627863342351398, "learning_rate": 2.626208365738831e-05, "loss": 0.8662, "mean_token_accuracy": 0.7450501322746277, "step": 2945 }, { "epoch": 0.5425786279198087, "grad_norm": 0.9378560834404903, "learning_rate": 2.6193765412632677e-05, "loss": 0.8427, "mean_token_accuracy": 0.750009298324585, "step": 2950 }, { "epoch": 0.5434982527128931, "grad_norm": 0.9349477883280783, "learning_rate": 2.6125459249351697e-05, "loss": 0.8908, "mean_token_accuracy": 0.7386453747749329, "step": 2955 }, { "epoch": 0.5444178775059776, "grad_norm": 0.9298587181804499, "learning_rate": 2.6057165799314854e-05, "loss": 0.855, "mean_token_accuracy": 0.7491998553276062, "step": 2960 }, { "epoch": 0.545337502299062, "grad_norm": 0.9026144571758381, "learning_rate": 2.5988885694174085e-05, "loss": 0.8786, "mean_token_accuracy": 0.7437506198883057, "step": 2965 }, { "epoch": 0.5462571270921464, "grad_norm": 0.9408107824152944, "learning_rate": 2.5920619565457877e-05, "loss": 0.8758, "mean_token_accuracy": 0.7427832961082459, "step": 2970 }, { "epoch": 0.5471767518852308, "grad_norm": 0.9195819021761746, "learning_rate": 2.5852368044565452e-05, "loss": 0.9277, "mean_token_accuracy": 0.7323094010353088, "step": 2975 }, { "epoch": 0.5480963766783152, "grad_norm": 0.9586681296133412, "learning_rate": 2.5784131762760922e-05, "loss": 0.8334, "mean_token_accuracy": 0.7566598057746887, "step": 2980 }, { "epoch": 0.5490160014713996, "grad_norm": 0.9092467816987784, "learning_rate": 2.5715911351167465e-05, "loss": 0.9014, "mean_token_accuracy": 0.7390154361724853, "step": 2985 }, { "epoch": 0.5499356262644841, "grad_norm": 0.966449128998816, "learning_rate": 2.564770744076144e-05, "loss": 0.8959, "mean_token_accuracy": 0.7373208284378052, "step": 2990 }, { "epoch": 0.5508552510575685, "grad_norm": 1.0269176653506933, "learning_rate": 2.5579520662366618e-05, "loss": 0.8626, "mean_token_accuracy": 0.7471036791801453, "step": 2995 }, { "epoch": 0.5517748758506529, "grad_norm": 0.9705454615801481, "learning_rate": 2.5511351646648324e-05, "loss": 0.8761, "mean_token_accuracy": 0.7408113241195678, "step": 3000 }, { "epoch": 0.5526945006437374, "grad_norm": 0.9683019669667483, "learning_rate": 2.5443201024107537e-05, "loss": 0.8974, "mean_token_accuracy": 0.7345914959907531, "step": 3005 }, { "epoch": 0.5536141254368218, "grad_norm": 0.9328296833493311, "learning_rate": 2.5375069425075176e-05, "loss": 0.8629, "mean_token_accuracy": 0.7468894720077515, "step": 3010 }, { "epoch": 0.5545337502299063, "grad_norm": 0.9565417579373001, "learning_rate": 2.5306957479706196e-05, "loss": 0.8914, "mean_token_accuracy": 0.7373947501182556, "step": 3015 }, { "epoch": 0.5554533750229906, "grad_norm": 0.9439811181197841, "learning_rate": 2.5238865817973735e-05, "loss": 0.8264, "mean_token_accuracy": 0.7566876411437988, "step": 3020 }, { "epoch": 0.556372999816075, "grad_norm": 0.8918377804941932, "learning_rate": 2.5170795069663374e-05, "loss": 0.8384, "mean_token_accuracy": 0.7532538652420044, "step": 3025 }, { "epoch": 0.5572926246091595, "grad_norm": 0.9531681758263391, "learning_rate": 2.510274586436725e-05, "loss": 0.9137, "mean_token_accuracy": 0.7336269617080688, "step": 3030 }, { "epoch": 0.5582122494022439, "grad_norm": 0.9547809224031603, "learning_rate": 2.5034718831478236e-05, "loss": 0.8121, "mean_token_accuracy": 0.7607084512710571, "step": 3035 }, { "epoch": 0.5591318741953283, "grad_norm": 0.9101416039188879, "learning_rate": 2.496671460018414e-05, "loss": 0.8374, "mean_token_accuracy": 0.7512237310409546, "step": 3040 }, { "epoch": 0.5600514989884128, "grad_norm": 0.9591588974138807, "learning_rate": 2.4898733799461866e-05, "loss": 0.8691, "mean_token_accuracy": 0.7475574612617493, "step": 3045 }, { "epoch": 0.5609711237814972, "grad_norm": 0.9481182124754315, "learning_rate": 2.4830777058071623e-05, "loss": 0.8541, "mean_token_accuracy": 0.7470650672912598, "step": 3050 }, { "epoch": 0.5618907485745815, "grad_norm": 0.8991567391844545, "learning_rate": 2.4762845004551077e-05, "loss": 0.834, "mean_token_accuracy": 0.7513617157936097, "step": 3055 }, { "epoch": 0.562810373367666, "grad_norm": 0.8993594505060807, "learning_rate": 2.4694938267209567e-05, "loss": 0.8302, "mean_token_accuracy": 0.7539983510971069, "step": 3060 }, { "epoch": 0.5637299981607504, "grad_norm": 0.9212463554308379, "learning_rate": 2.4627057474122273e-05, "loss": 0.8598, "mean_token_accuracy": 0.747953188419342, "step": 3065 }, { "epoch": 0.5646496229538348, "grad_norm": 0.9155845020709076, "learning_rate": 2.4559203253124407e-05, "loss": 0.8728, "mean_token_accuracy": 0.7440886616706848, "step": 3070 }, { "epoch": 0.5655692477469193, "grad_norm": 0.9376543570110895, "learning_rate": 2.4491376231805428e-05, "loss": 0.8529, "mean_token_accuracy": 0.7518376111984253, "step": 3075 }, { "epoch": 0.5664888725400037, "grad_norm": 0.9720221730313491, "learning_rate": 2.442357703750322e-05, "loss": 0.8423, "mean_token_accuracy": 0.7525236487388611, "step": 3080 }, { "epoch": 0.5674084973330881, "grad_norm": 0.9013738631587733, "learning_rate": 2.4355806297298296e-05, "loss": 0.8422, "mean_token_accuracy": 0.7528858304023742, "step": 3085 }, { "epoch": 0.5683281221261726, "grad_norm": 0.9524358228393591, "learning_rate": 2.4288064638007974e-05, "loss": 0.8672, "mean_token_accuracy": 0.7468002319335938, "step": 3090 }, { "epoch": 0.569247746919257, "grad_norm": 0.9505409858129935, "learning_rate": 2.4220352686180613e-05, "loss": 0.8416, "mean_token_accuracy": 0.7486450433731079, "step": 3095 }, { "epoch": 0.5701673717123413, "grad_norm": 0.9615751645550065, "learning_rate": 2.415267106808983e-05, "loss": 0.803, "mean_token_accuracy": 0.7603586912155151, "step": 3100 }, { "epoch": 0.5710869965054258, "grad_norm": 0.9458073029155306, "learning_rate": 2.4085020409728633e-05, "loss": 0.8614, "mean_token_accuracy": 0.7483598232269287, "step": 3105 }, { "epoch": 0.5720066212985102, "grad_norm": 0.959427274017189, "learning_rate": 2.4017401336803713e-05, "loss": 0.8795, "mean_token_accuracy": 0.7383235573768616, "step": 3110 }, { "epoch": 0.5729262460915946, "grad_norm": 0.9688058239251538, "learning_rate": 2.394981447472963e-05, "loss": 0.8854, "mean_token_accuracy": 0.7413538813591003, "step": 3115 }, { "epoch": 0.5738458708846791, "grad_norm": 0.9543674760330169, "learning_rate": 2.3882260448623002e-05, "loss": 0.8924, "mean_token_accuracy": 0.739243483543396, "step": 3120 }, { "epoch": 0.5747654956777635, "grad_norm": 0.9565581088949338, "learning_rate": 2.381473988329675e-05, "loss": 0.8878, "mean_token_accuracy": 0.737128746509552, "step": 3125 }, { "epoch": 0.5756851204708479, "grad_norm": 0.9446263148140598, "learning_rate": 2.374725340325433e-05, "loss": 0.8771, "mean_token_accuracy": 0.7424870610237122, "step": 3130 }, { "epoch": 0.5766047452639324, "grad_norm": 0.9235345865848048, "learning_rate": 2.3679801632683927e-05, "loss": 0.8791, "mean_token_accuracy": 0.7413055062294006, "step": 3135 }, { "epoch": 0.5775243700570167, "grad_norm": 0.931358306977097, "learning_rate": 2.3612385195452687e-05, "loss": 0.8864, "mean_token_accuracy": 0.7415070414543152, "step": 3140 }, { "epoch": 0.5784439948501011, "grad_norm": 0.9366462545353926, "learning_rate": 2.3545004715100966e-05, "loss": 0.8791, "mean_token_accuracy": 0.7428970575332642, "step": 3145 }, { "epoch": 0.5793636196431856, "grad_norm": 0.9312216076414869, "learning_rate": 2.3477660814836562e-05, "loss": 0.8318, "mean_token_accuracy": 0.7540540814399719, "step": 3150 }, { "epoch": 0.58028324443627, "grad_norm": 0.9058432741408705, "learning_rate": 2.3410354117528904e-05, "loss": 0.9128, "mean_token_accuracy": 0.7328131318092346, "step": 3155 }, { "epoch": 0.5812028692293544, "grad_norm": 0.92693757568253, "learning_rate": 2.3343085245703373e-05, "loss": 0.8356, "mean_token_accuracy": 0.754761004447937, "step": 3160 }, { "epoch": 0.5821224940224389, "grad_norm": 0.9685552745916727, "learning_rate": 2.3275854821535476e-05, "loss": 0.8696, "mean_token_accuracy": 0.7423434615135193, "step": 3165 }, { "epoch": 0.5830421188155233, "grad_norm": 0.9530016316914325, "learning_rate": 2.3208663466845108e-05, "loss": 0.8239, "mean_token_accuracy": 0.7581414461135865, "step": 3170 }, { "epoch": 0.5839617436086076, "grad_norm": 0.9912981010776241, "learning_rate": 2.3141511803090815e-05, "loss": 0.8784, "mean_token_accuracy": 0.743216586112976, "step": 3175 }, { "epoch": 0.5848813684016921, "grad_norm": 0.8897494823501038, "learning_rate": 2.3074400451364048e-05, "loss": 0.8771, "mean_token_accuracy": 0.7422731041908264, "step": 3180 }, { "epoch": 0.5858009931947765, "grad_norm": 0.9087254524604537, "learning_rate": 2.300733003238339e-05, "loss": 0.8249, "mean_token_accuracy": 0.75495365858078, "step": 3185 }, { "epoch": 0.5867206179878609, "grad_norm": 0.9615326948623956, "learning_rate": 2.2940301166488846e-05, "loss": 0.7821, "mean_token_accuracy": 0.7687617659568786, "step": 3190 }, { "epoch": 0.5876402427809454, "grad_norm": 0.9239773147706558, "learning_rate": 2.28733144736361e-05, "loss": 0.8034, "mean_token_accuracy": 0.7630661010742188, "step": 3195 }, { "epoch": 0.5885598675740298, "grad_norm": 0.9271354944208791, "learning_rate": 2.2806370573390745e-05, "loss": 0.8377, "mean_token_accuracy": 0.7517584562301636, "step": 3200 }, { "epoch": 0.5894794923671142, "grad_norm": 0.9307261567222711, "learning_rate": 2.2739470084922608e-05, "loss": 0.9145, "mean_token_accuracy": 0.7307730317115784, "step": 3205 }, { "epoch": 0.5903991171601987, "grad_norm": 0.8708186634436479, "learning_rate": 2.2672613626999994e-05, "loss": 0.8495, "mean_token_accuracy": 0.7486128211021423, "step": 3210 }, { "epoch": 0.591318741953283, "grad_norm": 0.9473141853732495, "learning_rate": 2.2605801817983958e-05, "loss": 0.8341, "mean_token_accuracy": 0.7518749475479126, "step": 3215 }, { "epoch": 0.5922383667463674, "grad_norm": 0.9382593885727152, "learning_rate": 2.253903527582259e-05, "loss": 0.8447, "mean_token_accuracy": 0.7506359577178955, "step": 3220 }, { "epoch": 0.5931579915394519, "grad_norm": 0.9696123819996886, "learning_rate": 2.247231461804532e-05, "loss": 0.8266, "mean_token_accuracy": 0.7562480688095092, "step": 3225 }, { "epoch": 0.5940776163325363, "grad_norm": 0.8949351423802622, "learning_rate": 2.2405640461757176e-05, "loss": 0.814, "mean_token_accuracy": 0.7592174887657166, "step": 3230 }, { "epoch": 0.5949972411256208, "grad_norm": 0.9615311548799811, "learning_rate": 2.2339013423633083e-05, "loss": 0.8503, "mean_token_accuracy": 0.7499252796173096, "step": 3235 }, { "epoch": 0.5959168659187052, "grad_norm": 0.9086052926810453, "learning_rate": 2.2272434119912184e-05, "loss": 0.8754, "mean_token_accuracy": 0.7434251546859741, "step": 3240 }, { "epoch": 0.5968364907117896, "grad_norm": 0.9221742878259598, "learning_rate": 2.2205903166392113e-05, "loss": 0.8477, "mean_token_accuracy": 0.7485897660255432, "step": 3245 }, { "epoch": 0.5977561155048741, "grad_norm": 0.967041034869552, "learning_rate": 2.2139421178423307e-05, "loss": 0.8225, "mean_token_accuracy": 0.7570245742797852, "step": 3250 }, { "epoch": 0.5986757402979584, "grad_norm": 0.981067205830958, "learning_rate": 2.207298877090333e-05, "loss": 0.8701, "mean_token_accuracy": 0.7440281748771668, "step": 3255 }, { "epoch": 0.5995953650910428, "grad_norm": 0.989973298607582, "learning_rate": 2.2006606558271142e-05, "loss": 0.8713, "mean_token_accuracy": 0.7413482785224914, "step": 3260 }, { "epoch": 0.6005149898841273, "grad_norm": 0.8672144464089592, "learning_rate": 2.1940275154501482e-05, "loss": 0.87, "mean_token_accuracy": 0.743138313293457, "step": 3265 }, { "epoch": 0.6014346146772117, "grad_norm": 0.9653292378844739, "learning_rate": 2.187399517309914e-05, "loss": 0.8575, "mean_token_accuracy": 0.7464121103286743, "step": 3270 }, { "epoch": 0.6023542394702961, "grad_norm": 0.9239524199502155, "learning_rate": 2.1807767227093268e-05, "loss": 0.8236, "mean_token_accuracy": 0.7573307991027832, "step": 3275 }, { "epoch": 0.6032738642633806, "grad_norm": 0.9806975126747703, "learning_rate": 2.1741591929031795e-05, "loss": 0.878, "mean_token_accuracy": 0.7407856106758117, "step": 3280 }, { "epoch": 0.604193489056465, "grad_norm": 0.9640808408127749, "learning_rate": 2.167546989097566e-05, "loss": 0.8638, "mean_token_accuracy": 0.7459958910942077, "step": 3285 }, { "epoch": 0.6051131138495494, "grad_norm": 0.9656473527433518, "learning_rate": 2.16094017244932e-05, "loss": 0.8783, "mean_token_accuracy": 0.7419638872146607, "step": 3290 }, { "epoch": 0.6060327386426339, "grad_norm": 0.9930014003610543, "learning_rate": 2.154338804065451e-05, "loss": 0.8615, "mean_token_accuracy": 0.7456332087516785, "step": 3295 }, { "epoch": 0.6069523634357182, "grad_norm": 0.9330196848152268, "learning_rate": 2.1477429450025767e-05, "loss": 0.8352, "mean_token_accuracy": 0.7517044901847839, "step": 3300 }, { "epoch": 0.6078719882288026, "grad_norm": 0.8777553334567131, "learning_rate": 2.1411526562663554e-05, "loss": 0.8364, "mean_token_accuracy": 0.7501665949821472, "step": 3305 }, { "epoch": 0.6087916130218871, "grad_norm": 0.9315142599796349, "learning_rate": 2.1345679988109284e-05, "loss": 0.8378, "mean_token_accuracy": 0.7534802198410034, "step": 3310 }, { "epoch": 0.6097112378149715, "grad_norm": 0.9385962221597601, "learning_rate": 2.1279890335383534e-05, "loss": 0.8876, "mean_token_accuracy": 0.7398653388023376, "step": 3315 }, { "epoch": 0.6106308626080559, "grad_norm": 0.9451857651632474, "learning_rate": 2.1214158212980366e-05, "loss": 0.7988, "mean_token_accuracy": 0.7636669516563416, "step": 3320 }, { "epoch": 0.6115504874011404, "grad_norm": 0.9310680714278403, "learning_rate": 2.114848422886177e-05, "loss": 0.8417, "mean_token_accuracy": 0.7545873999595643, "step": 3325 }, { "epoch": 0.6124701121942248, "grad_norm": 0.9555284993925652, "learning_rate": 2.108286899045202e-05, "loss": 0.8906, "mean_token_accuracy": 0.7384588122367859, "step": 3330 }, { "epoch": 0.6133897369873091, "grad_norm": 0.9525478437560697, "learning_rate": 2.1017313104632003e-05, "loss": 0.844, "mean_token_accuracy": 0.7497392654418945, "step": 3335 }, { "epoch": 0.6143093617803936, "grad_norm": 0.9657934498214388, "learning_rate": 2.0951817177733684e-05, "loss": 0.8748, "mean_token_accuracy": 0.7426393389701843, "step": 3340 }, { "epoch": 0.615228986573478, "grad_norm": 0.9174407552166862, "learning_rate": 2.088638181553446e-05, "loss": 0.8727, "mean_token_accuracy": 0.742801570892334, "step": 3345 }, { "epoch": 0.6161486113665624, "grad_norm": 0.9106809477969502, "learning_rate": 2.0821007623251564e-05, "loss": 0.8227, "mean_token_accuracy": 0.7550573825836182, "step": 3350 }, { "epoch": 0.6170682361596469, "grad_norm": 0.8816231707997737, "learning_rate": 2.075569520553643e-05, "loss": 0.8066, "mean_token_accuracy": 0.7590124368667602, "step": 3355 }, { "epoch": 0.6179878609527313, "grad_norm": 0.9651791807712018, "learning_rate": 2.0690445166469158e-05, "loss": 0.8575, "mean_token_accuracy": 0.7481630921363831, "step": 3360 }, { "epoch": 0.6189074857458157, "grad_norm": 0.962161882798645, "learning_rate": 2.0625258109552926e-05, "loss": 0.8842, "mean_token_accuracy": 0.743985378742218, "step": 3365 }, { "epoch": 0.6198271105389002, "grad_norm": 0.955250281560398, "learning_rate": 2.0560134637708334e-05, "loss": 0.8413, "mean_token_accuracy": 0.7497357606887818, "step": 3370 }, { "epoch": 0.6207467353319845, "grad_norm": 1.0327175413319667, "learning_rate": 2.0495075353267913e-05, "loss": 0.8697, "mean_token_accuracy": 0.7445659875869751, "step": 3375 }, { "epoch": 0.6216663601250689, "grad_norm": 0.9525687098312168, "learning_rate": 2.043008085797052e-05, "loss": 0.8722, "mean_token_accuracy": 0.7410041093826294, "step": 3380 }, { "epoch": 0.6225859849181534, "grad_norm": 0.9275514977855014, "learning_rate": 2.036515175295574e-05, "loss": 0.8412, "mean_token_accuracy": 0.7507887959480286, "step": 3385 }, { "epoch": 0.6235056097112378, "grad_norm": 0.9493961658678648, "learning_rate": 2.03002886387584e-05, "loss": 0.8556, "mean_token_accuracy": 0.7469261646270752, "step": 3390 }, { "epoch": 0.6244252345043222, "grad_norm": 0.9292345545436532, "learning_rate": 2.0235492115302944e-05, "loss": 0.8301, "mean_token_accuracy": 0.7550871014595032, "step": 3395 }, { "epoch": 0.6253448592974067, "grad_norm": 0.9430411664378814, "learning_rate": 2.017076278189794e-05, "loss": 0.8321, "mean_token_accuracy": 0.7533326983451843, "step": 3400 }, { "epoch": 0.6262644840904911, "grad_norm": 0.8889521393845567, "learning_rate": 2.0106101237230455e-05, "loss": 0.8324, "mean_token_accuracy": 0.7539088129997253, "step": 3405 }, { "epoch": 0.6271841088835755, "grad_norm": 0.9180009901150891, "learning_rate": 2.0041508079360634e-05, "loss": 0.7898, "mean_token_accuracy": 0.761493980884552, "step": 3410 }, { "epoch": 0.62810373367666, "grad_norm": 0.9055995921329637, "learning_rate": 1.997698390571608e-05, "loss": 0.8419, "mean_token_accuracy": 0.7503387928009033, "step": 3415 }, { "epoch": 0.6290233584697443, "grad_norm": 0.9447591194939752, "learning_rate": 1.991252931308633e-05, "loss": 0.8692, "mean_token_accuracy": 0.7452242970466614, "step": 3420 }, { "epoch": 0.6299429832628287, "grad_norm": 0.9351426059072258, "learning_rate": 1.9848144897617417e-05, "loss": 0.8149, "mean_token_accuracy": 0.7568124055862426, "step": 3425 }, { "epoch": 0.6308626080559132, "grad_norm": 0.9168023134449134, "learning_rate": 1.9783831254806257e-05, "loss": 0.8157, "mean_token_accuracy": 0.7554953694343567, "step": 3430 }, { "epoch": 0.6317822328489976, "grad_norm": 1.027979530127791, "learning_rate": 1.971958897949518e-05, "loss": 0.8229, "mean_token_accuracy": 0.7550533413887024, "step": 3435 }, { "epoch": 0.632701857642082, "grad_norm": 0.8964633060914129, "learning_rate": 1.9655418665866465e-05, "loss": 0.7966, "mean_token_accuracy": 0.7639833688735962, "step": 3440 }, { "epoch": 0.6336214824351665, "grad_norm": 0.8702615238247585, "learning_rate": 1.9591320907436782e-05, "loss": 0.8502, "mean_token_accuracy": 0.74614177942276, "step": 3445 }, { "epoch": 0.6345411072282509, "grad_norm": 0.9157962896320851, "learning_rate": 1.9527296297051765e-05, "loss": 0.8026, "mean_token_accuracy": 0.758307683467865, "step": 3450 }, { "epoch": 0.6354607320213354, "grad_norm": 0.9465005665572019, "learning_rate": 1.9463345426880448e-05, "loss": 0.8036, "mean_token_accuracy": 0.7617629647254944, "step": 3455 }, { "epoch": 0.6363803568144197, "grad_norm": 0.9618417431183126, "learning_rate": 1.939946888840986e-05, "loss": 0.8819, "mean_token_accuracy": 0.7395693898200989, "step": 3460 }, { "epoch": 0.6372999816075041, "grad_norm": 0.9326022903907812, "learning_rate": 1.933566727243956e-05, "loss": 0.8384, "mean_token_accuracy": 0.7497618556022644, "step": 3465 }, { "epoch": 0.6382196064005886, "grad_norm": 0.942168299955769, "learning_rate": 1.927194116907608e-05, "loss": 0.8821, "mean_token_accuracy": 0.7422310829162597, "step": 3470 }, { "epoch": 0.639139231193673, "grad_norm": 0.930256851029374, "learning_rate": 1.9208291167727576e-05, "loss": 0.8293, "mean_token_accuracy": 0.7561385631561279, "step": 3475 }, { "epoch": 0.6400588559867574, "grad_norm": 0.8857746537604931, "learning_rate": 1.9144717857098328e-05, "loss": 0.8166, "mean_token_accuracy": 0.7583439826965332, "step": 3480 }, { "epoch": 0.6409784807798419, "grad_norm": 0.9519372824273006, "learning_rate": 1.908122182518326e-05, "loss": 0.8674, "mean_token_accuracy": 0.741856062412262, "step": 3485 }, { "epoch": 0.6418981055729263, "grad_norm": 0.9483959540274922, "learning_rate": 1.9017803659262583e-05, "loss": 0.8496, "mean_token_accuracy": 0.7491413950920105, "step": 3490 }, { "epoch": 0.6428177303660106, "grad_norm": 0.9729346329964175, "learning_rate": 1.8954463945896293e-05, "loss": 0.8554, "mean_token_accuracy": 0.7483752846717835, "step": 3495 }, { "epoch": 0.6437373551590951, "grad_norm": 0.910719020599245, "learning_rate": 1.889120327091879e-05, "loss": 0.8332, "mean_token_accuracy": 0.753311276435852, "step": 3500 }, { "epoch": 0.6446569799521795, "grad_norm": 0.8997078755147822, "learning_rate": 1.8828022219433413e-05, "loss": 0.8311, "mean_token_accuracy": 0.7538302779197693, "step": 3505 }, { "epoch": 0.6455766047452639, "grad_norm": 0.9097287217365273, "learning_rate": 1.8764921375807083e-05, "loss": 0.8573, "mean_token_accuracy": 0.74767564535141, "step": 3510 }, { "epoch": 0.6464962295383484, "grad_norm": 0.9420262116863728, "learning_rate": 1.8701901323664863e-05, "loss": 0.8551, "mean_token_accuracy": 0.7479906916618347, "step": 3515 }, { "epoch": 0.6474158543314328, "grad_norm": 0.9297816459092663, "learning_rate": 1.8638962645884565e-05, "loss": 0.8066, "mean_token_accuracy": 0.7580268263816834, "step": 3520 }, { "epoch": 0.6483354791245172, "grad_norm": 0.946031226164797, "learning_rate": 1.8576105924591357e-05, "loss": 0.8179, "mean_token_accuracy": 0.7542472004890441, "step": 3525 }, { "epoch": 0.6492551039176017, "grad_norm": 0.9036904422802344, "learning_rate": 1.8513331741152412e-05, "loss": 0.8261, "mean_token_accuracy": 0.7552783608436584, "step": 3530 }, { "epoch": 0.650174728710686, "grad_norm": 0.921905554132334, "learning_rate": 1.8450640676171472e-05, "loss": 0.8351, "mean_token_accuracy": 0.752598226070404, "step": 3535 }, { "epoch": 0.6510943535037704, "grad_norm": 1.0035005670649164, "learning_rate": 1.8388033309483522e-05, "loss": 0.8981, "mean_token_accuracy": 0.7371325850486755, "step": 3540 }, { "epoch": 0.6520139782968549, "grad_norm": 0.9724909600231612, "learning_rate": 1.8325510220149413e-05, "loss": 0.8327, "mean_token_accuracy": 0.751532518863678, "step": 3545 }, { "epoch": 0.6529336030899393, "grad_norm": 0.9664687506252672, "learning_rate": 1.8263071986450524e-05, "loss": 0.8336, "mean_token_accuracy": 0.7516280770301819, "step": 3550 }, { "epoch": 0.6538532278830237, "grad_norm": 0.9164445815967506, "learning_rate": 1.8200719185883358e-05, "loss": 0.8316, "mean_token_accuracy": 0.7544404864311218, "step": 3555 }, { "epoch": 0.6547728526761082, "grad_norm": 0.9293565126179983, "learning_rate": 1.813845239515427e-05, "loss": 0.8257, "mean_token_accuracy": 0.7552899837493896, "step": 3560 }, { "epoch": 0.6556924774691926, "grad_norm": 0.9010810987925738, "learning_rate": 1.8076272190174115e-05, "loss": 0.8201, "mean_token_accuracy": 0.7565722703933716, "step": 3565 }, { "epoch": 0.656612102262277, "grad_norm": 1.0075745989661558, "learning_rate": 1.801417914605286e-05, "loss": 0.869, "mean_token_accuracy": 0.7453143835067749, "step": 3570 }, { "epoch": 0.6575317270553614, "grad_norm": 0.935586367301874, "learning_rate": 1.795217383709437e-05, "loss": 0.8845, "mean_token_accuracy": 0.7403179168701172, "step": 3575 }, { "epoch": 0.6584513518484458, "grad_norm": 0.9872971011864189, "learning_rate": 1.7890256836791008e-05, "loss": 0.8052, "mean_token_accuracy": 0.7629344463348389, "step": 3580 }, { "epoch": 0.6593709766415302, "grad_norm": 0.9876503263464145, "learning_rate": 1.7828428717818353e-05, "loss": 0.8135, "mean_token_accuracy": 0.7590724229812622, "step": 3585 }, { "epoch": 0.6602906014346147, "grad_norm": 0.8811578706911977, "learning_rate": 1.7766690052029944e-05, "loss": 0.8221, "mean_token_accuracy": 0.7560603976249695, "step": 3590 }, { "epoch": 0.6612102262276991, "grad_norm": 0.9719326557742581, "learning_rate": 1.770504141045194e-05, "loss": 0.8342, "mean_token_accuracy": 0.7510559558868408, "step": 3595 }, { "epoch": 0.6621298510207835, "grad_norm": 1.0132470520749903, "learning_rate": 1.7643483363277874e-05, "loss": 0.8487, "mean_token_accuracy": 0.7500616908073425, "step": 3600 }, { "epoch": 0.663049475813868, "grad_norm": 1.0318932699213554, "learning_rate": 1.7582016479863327e-05, "loss": 0.8487, "mean_token_accuracy": 0.7490703582763671, "step": 3605 }, { "epoch": 0.6639691006069524, "grad_norm": 0.8658023921332224, "learning_rate": 1.7520641328720756e-05, "loss": 0.8238, "mean_token_accuracy": 0.7564070224761963, "step": 3610 }, { "epoch": 0.6648887254000367, "grad_norm": 0.9750052383478849, "learning_rate": 1.7459358477514122e-05, "loss": 0.8249, "mean_token_accuracy": 0.7549832344055176, "step": 3615 }, { "epoch": 0.6658083501931212, "grad_norm": 0.957114636285714, "learning_rate": 1.7398168493053723e-05, "loss": 0.7881, "mean_token_accuracy": 0.7615378856658935, "step": 3620 }, { "epoch": 0.6667279749862056, "grad_norm": 0.9148381033348181, "learning_rate": 1.7337071941290944e-05, "loss": 0.8196, "mean_token_accuracy": 0.7577734112739563, "step": 3625 }, { "epoch": 0.66764759977929, "grad_norm": 0.9583843198631806, "learning_rate": 1.7276069387312955e-05, "loss": 0.9, "mean_token_accuracy": 0.7367844343185425, "step": 3630 }, { "epoch": 0.6685672245723745, "grad_norm": 0.9525242256598431, "learning_rate": 1.7215161395337572e-05, "loss": 0.8351, "mean_token_accuracy": 0.7536734580993653, "step": 3635 }, { "epoch": 0.6694868493654589, "grad_norm": 0.9218486580963495, "learning_rate": 1.7154348528707992e-05, "loss": 0.8512, "mean_token_accuracy": 0.7513302564620972, "step": 3640 }, { "epoch": 0.6704064741585433, "grad_norm": 0.9497350819436411, "learning_rate": 1.709363134988757e-05, "loss": 0.8522, "mean_token_accuracy": 0.747953987121582, "step": 3645 }, { "epoch": 0.6713260989516278, "grad_norm": 0.9359833703344925, "learning_rate": 1.7033010420454655e-05, "loss": 0.8091, "mean_token_accuracy": 0.7576663970947266, "step": 3650 }, { "epoch": 0.6722457237447121, "grad_norm": 0.9884296155896105, "learning_rate": 1.6972486301097376e-05, "loss": 0.8185, "mean_token_accuracy": 0.7578543424606323, "step": 3655 }, { "epoch": 0.6731653485377965, "grad_norm": 0.885165473016121, "learning_rate": 1.691205955160845e-05, "loss": 0.8461, "mean_token_accuracy": 0.7491200208663941, "step": 3660 }, { "epoch": 0.674084973330881, "grad_norm": 0.9715821597591158, "learning_rate": 1.6851730730880012e-05, "loss": 0.8527, "mean_token_accuracy": 0.7483757376670838, "step": 3665 }, { "epoch": 0.6750045981239654, "grad_norm": 0.8871437133597592, "learning_rate": 1.679150039689846e-05, "loss": 0.8148, "mean_token_accuracy": 0.7578411340713501, "step": 3670 }, { "epoch": 0.6759242229170498, "grad_norm": 0.9530586600231223, "learning_rate": 1.673136910673926e-05, "loss": 0.8645, "mean_token_accuracy": 0.7451423764228821, "step": 3675 }, { "epoch": 0.6768438477101343, "grad_norm": 0.9427729850229866, "learning_rate": 1.6671337416561817e-05, "loss": 0.8432, "mean_token_accuracy": 0.7509079575538635, "step": 3680 }, { "epoch": 0.6777634725032187, "grad_norm": 0.9325142143827265, "learning_rate": 1.661140588160435e-05, "loss": 0.8347, "mean_token_accuracy": 0.7516968011856079, "step": 3685 }, { "epoch": 0.6786830972963032, "grad_norm": 0.9601757924065347, "learning_rate": 1.6551575056178695e-05, "loss": 0.8166, "mean_token_accuracy": 0.7589465737342834, "step": 3690 }, { "epoch": 0.6796027220893875, "grad_norm": 1.0086779966517565, "learning_rate": 1.649184549366525e-05, "loss": 0.8395, "mean_token_accuracy": 0.7520246505737305, "step": 3695 }, { "epoch": 0.6805223468824719, "grad_norm": 0.9707009645804029, "learning_rate": 1.6432217746507814e-05, "loss": 0.8382, "mean_token_accuracy": 0.7533354997634888, "step": 3700 }, { "epoch": 0.6814419716755564, "grad_norm": 0.9109669918450888, "learning_rate": 1.6372692366208476e-05, "loss": 0.8186, "mean_token_accuracy": 0.7560298204421997, "step": 3705 }, { "epoch": 0.6823615964686408, "grad_norm": 0.931556246223817, "learning_rate": 1.6313269903322536e-05, "loss": 0.8682, "mean_token_accuracy": 0.7464072823524475, "step": 3710 }, { "epoch": 0.6832812212617252, "grad_norm": 0.9316943141031991, "learning_rate": 1.6253950907453414e-05, "loss": 0.7891, "mean_token_accuracy": 0.7643645644187927, "step": 3715 }, { "epoch": 0.6842008460548097, "grad_norm": 0.9367407375514984, "learning_rate": 1.619473592724752e-05, "loss": 0.8489, "mean_token_accuracy": 0.7488224864006042, "step": 3720 }, { "epoch": 0.6851204708478941, "grad_norm": 0.96189736553831, "learning_rate": 1.613562551038925e-05, "loss": 0.7964, "mean_token_accuracy": 0.7625237464904785, "step": 3725 }, { "epoch": 0.6860400956409785, "grad_norm": 0.9170890141555628, "learning_rate": 1.607662020359587e-05, "loss": 0.8404, "mean_token_accuracy": 0.7529777765274048, "step": 3730 }, { "epoch": 0.686959720434063, "grad_norm": 0.9456438498787428, "learning_rate": 1.6017720552612462e-05, "loss": 0.8036, "mean_token_accuracy": 0.7614395618438721, "step": 3735 }, { "epoch": 0.6878793452271473, "grad_norm": 0.9544770877536788, "learning_rate": 1.595892710220691e-05, "loss": 0.8413, "mean_token_accuracy": 0.7519929647445679, "step": 3740 }, { "epoch": 0.6887989700202317, "grad_norm": 1.022115954707187, "learning_rate": 1.5900240396164835e-05, "loss": 0.8612, "mean_token_accuracy": 0.747264850139618, "step": 3745 }, { "epoch": 0.6897185948133162, "grad_norm": 0.9476824745559427, "learning_rate": 1.584166097728455e-05, "loss": 0.847, "mean_token_accuracy": 0.7491350531578064, "step": 3750 }, { "epoch": 0.6906382196064006, "grad_norm": 0.8827290010499629, "learning_rate": 1.578318938737209e-05, "loss": 0.8284, "mean_token_accuracy": 0.7547004818916321, "step": 3755 }, { "epoch": 0.691557844399485, "grad_norm": 0.9009975487421323, "learning_rate": 1.5724826167236146e-05, "loss": 0.8214, "mean_token_accuracy": 0.7568115711212158, "step": 3760 }, { "epoch": 0.6924774691925695, "grad_norm": 0.9187149873785133, "learning_rate": 1.5666571856683116e-05, "loss": 0.827, "mean_token_accuracy": 0.7550323009490967, "step": 3765 }, { "epoch": 0.6933970939856539, "grad_norm": 0.9280641474823987, "learning_rate": 1.560842699451204e-05, "loss": 0.7616, "mean_token_accuracy": 0.7714649677276612, "step": 3770 }, { "epoch": 0.6943167187787382, "grad_norm": 0.9038372482824055, "learning_rate": 1.5550392118509705e-05, "loss": 0.8028, "mean_token_accuracy": 0.760212504863739, "step": 3775 }, { "epoch": 0.6952363435718227, "grad_norm": 0.9201432901179558, "learning_rate": 1.5492467765445613e-05, "loss": 0.8241, "mean_token_accuracy": 0.754262363910675, "step": 3780 }, { "epoch": 0.6961559683649071, "grad_norm": 0.9031896471527984, "learning_rate": 1.5434654471067007e-05, "loss": 0.8078, "mean_token_accuracy": 0.7623116612434387, "step": 3785 }, { "epoch": 0.6970755931579915, "grad_norm": 0.928442088214151, "learning_rate": 1.537695277009396e-05, "loss": 0.8667, "mean_token_accuracy": 0.7442408680915833, "step": 3790 }, { "epoch": 0.697995217951076, "grad_norm": 0.9545685310758198, "learning_rate": 1.5319363196214427e-05, "loss": 0.8147, "mean_token_accuracy": 0.757679283618927, "step": 3795 }, { "epoch": 0.6989148427441604, "grad_norm": 0.957997913837239, "learning_rate": 1.526188628207924e-05, "loss": 0.8674, "mean_token_accuracy": 0.7406766414642334, "step": 3800 }, { "epoch": 0.6998344675372448, "grad_norm": 0.907233770113165, "learning_rate": 1.5204522559297275e-05, "loss": 0.8228, "mean_token_accuracy": 0.7550997257232666, "step": 3805 }, { "epoch": 0.7007540923303293, "grad_norm": 0.9753264400407652, "learning_rate": 1.5147272558430472e-05, "loss": 0.812, "mean_token_accuracy": 0.7584111213684082, "step": 3810 }, { "epoch": 0.7016737171234136, "grad_norm": 0.898583550613599, "learning_rate": 1.509013680898896e-05, "loss": 0.814, "mean_token_accuracy": 0.7574291110038758, "step": 3815 }, { "epoch": 0.702593341916498, "grad_norm": 0.9245046858803572, "learning_rate": 1.5033115839426127e-05, "loss": 0.8002, "mean_token_accuracy": 0.7631544828414917, "step": 3820 }, { "epoch": 0.7035129667095825, "grad_norm": 0.9501909113953771, "learning_rate": 1.4976210177133764e-05, "loss": 0.8284, "mean_token_accuracy": 0.7537835121154786, "step": 3825 }, { "epoch": 0.7044325915026669, "grad_norm": 0.9118736011138947, "learning_rate": 1.4919420348437189e-05, "loss": 0.8637, "mean_token_accuracy": 0.746515440940857, "step": 3830 }, { "epoch": 0.7053522162957513, "grad_norm": 0.9346208775326443, "learning_rate": 1.4862746878590329e-05, "loss": 0.8325, "mean_token_accuracy": 0.7536684751510621, "step": 3835 }, { "epoch": 0.7062718410888358, "grad_norm": 0.9644025251262837, "learning_rate": 1.4806190291770932e-05, "loss": 0.9199, "mean_token_accuracy": 0.728544807434082, "step": 3840 }, { "epoch": 0.7071914658819202, "grad_norm": 0.9316658230434494, "learning_rate": 1.4749751111075682e-05, "loss": 0.8478, "mean_token_accuracy": 0.7476451396942139, "step": 3845 }, { "epoch": 0.7081110906750046, "grad_norm": 0.8593875878005443, "learning_rate": 1.469342985851534e-05, "loss": 0.7931, "mean_token_accuracy": 0.7640434741973877, "step": 3850 }, { "epoch": 0.709030715468089, "grad_norm": 0.9379422901278587, "learning_rate": 1.4637227055009962e-05, "loss": 0.8228, "mean_token_accuracy": 0.7573190450668335, "step": 3855 }, { "epoch": 0.7099503402611734, "grad_norm": 0.9026485371540945, "learning_rate": 1.4581143220384047e-05, "loss": 0.82, "mean_token_accuracy": 0.756511640548706, "step": 3860 }, { "epoch": 0.7108699650542578, "grad_norm": 0.9796042273923296, "learning_rate": 1.4525178873361756e-05, "loss": 0.8242, "mean_token_accuracy": 0.7555618524551392, "step": 3865 }, { "epoch": 0.7117895898473423, "grad_norm": 0.9383990549827186, "learning_rate": 1.4469334531562067e-05, "loss": 0.8448, "mean_token_accuracy": 0.7482100129127502, "step": 3870 }, { "epoch": 0.7127092146404267, "grad_norm": 0.9602931261847705, "learning_rate": 1.4413610711494058e-05, "loss": 0.8365, "mean_token_accuracy": 0.7580392360687256, "step": 3875 }, { "epoch": 0.7136288394335111, "grad_norm": 0.943240285031073, "learning_rate": 1.4358007928552075e-05, "loss": 0.7861, "mean_token_accuracy": 0.7667181611061096, "step": 3880 }, { "epoch": 0.7145484642265956, "grad_norm": 0.9447898247986761, "learning_rate": 1.4302526697010964e-05, "loss": 0.8078, "mean_token_accuracy": 0.7595344543457031, "step": 3885 }, { "epoch": 0.71546808901968, "grad_norm": 0.9841983235190546, "learning_rate": 1.424716753002136e-05, "loss": 0.8597, "mean_token_accuracy": 0.7481236219406128, "step": 3890 }, { "epoch": 0.7163877138127643, "grad_norm": 0.9684153403690037, "learning_rate": 1.4191930939604908e-05, "loss": 0.8117, "mean_token_accuracy": 0.7613986849784851, "step": 3895 }, { "epoch": 0.7173073386058488, "grad_norm": 0.996877698893722, "learning_rate": 1.4136817436649502e-05, "loss": 0.8766, "mean_token_accuracy": 0.738961935043335, "step": 3900 }, { "epoch": 0.7182269633989332, "grad_norm": 0.9051545491177592, "learning_rate": 1.4081827530904624e-05, "loss": 0.8445, "mean_token_accuracy": 0.749999487400055, "step": 3905 }, { "epoch": 0.7191465881920177, "grad_norm": 0.9684927881965169, "learning_rate": 1.4026961730976584e-05, "loss": 0.8209, "mean_token_accuracy": 0.7576812863349914, "step": 3910 }, { "epoch": 0.7200662129851021, "grad_norm": 0.9610042841526357, "learning_rate": 1.3972220544323832e-05, "loss": 0.8131, "mean_token_accuracy": 0.7582221627235413, "step": 3915 }, { "epoch": 0.7209858377781865, "grad_norm": 0.9412320092723402, "learning_rate": 1.3917604477252238e-05, "loss": 0.7937, "mean_token_accuracy": 0.7617234110832214, "step": 3920 }, { "epoch": 0.721905462571271, "grad_norm": 0.9321659094215312, "learning_rate": 1.3863114034910452e-05, "loss": 0.8156, "mean_token_accuracy": 0.7598451256752015, "step": 3925 }, { "epoch": 0.7228250873643554, "grad_norm": 0.956577146254236, "learning_rate": 1.3808749721285214e-05, "loss": 0.8107, "mean_token_accuracy": 0.757847785949707, "step": 3930 }, { "epoch": 0.7237447121574397, "grad_norm": 0.9139917904820034, "learning_rate": 1.3754512039196658e-05, "loss": 0.8754, "mean_token_accuracy": 0.7391230940818787, "step": 3935 }, { "epoch": 0.7246643369505242, "grad_norm": 0.92757564731535, "learning_rate": 1.3700401490293718e-05, "loss": 0.8193, "mean_token_accuracy": 0.7570781588554383, "step": 3940 }, { "epoch": 0.7255839617436086, "grad_norm": 0.9533935473757719, "learning_rate": 1.3646418575049475e-05, "loss": 0.8244, "mean_token_accuracy": 0.756612241268158, "step": 3945 }, { "epoch": 0.726503586536693, "grad_norm": 0.9319033478082173, "learning_rate": 1.3592563792756468e-05, "loss": 0.7994, "mean_token_accuracy": 0.7616767644882202, "step": 3950 }, { "epoch": 0.7274232113297775, "grad_norm": 0.9659322616790049, "learning_rate": 1.3538837641522172e-05, "loss": 0.776, "mean_token_accuracy": 0.7666900753974915, "step": 3955 }, { "epoch": 0.7283428361228619, "grad_norm": 0.9715937702004781, "learning_rate": 1.3485240618264322e-05, "loss": 0.8707, "mean_token_accuracy": 0.742601501941681, "step": 3960 }, { "epoch": 0.7292624609159463, "grad_norm": 0.9279423695840053, "learning_rate": 1.3431773218706336e-05, "loss": 0.8435, "mean_token_accuracy": 0.7503429889678955, "step": 3965 }, { "epoch": 0.7301820857090308, "grad_norm": 0.9826978876425828, "learning_rate": 1.3378435937372729e-05, "loss": 0.8609, "mean_token_accuracy": 0.7491580963134765, "step": 3970 }, { "epoch": 0.7311017105021151, "grad_norm": 0.9333913123309906, "learning_rate": 1.3325229267584549e-05, "loss": 0.8771, "mean_token_accuracy": 0.7425579071044922, "step": 3975 }, { "epoch": 0.7320213352951995, "grad_norm": 0.9125063830711305, "learning_rate": 1.3272153701454809e-05, "loss": 0.8086, "mean_token_accuracy": 0.7603332042694092, "step": 3980 }, { "epoch": 0.732940960088284, "grad_norm": 0.9868481200984651, "learning_rate": 1.3219209729883918e-05, "loss": 0.7879, "mean_token_accuracy": 0.7675115823745727, "step": 3985 }, { "epoch": 0.7338605848813684, "grad_norm": 0.9006549103315062, "learning_rate": 1.3166397842555175e-05, "loss": 0.7923, "mean_token_accuracy": 0.7659124851226806, "step": 3990 }, { "epoch": 0.7347802096744528, "grad_norm": 0.9128416767290051, "learning_rate": 1.3113718527930214e-05, "loss": 0.8363, "mean_token_accuracy": 0.751650869846344, "step": 3995 }, { "epoch": 0.7356998344675373, "grad_norm": 0.93586974280188, "learning_rate": 1.3061172273244477e-05, "loss": 0.8634, "mean_token_accuracy": 0.7428792953491211, "step": 4000 }, { "epoch": 0.7366194592606217, "grad_norm": 0.9865948469992011, "learning_rate": 1.3008759564502742e-05, "loss": 0.8627, "mean_token_accuracy": 0.7454355955123901, "step": 4005 }, { "epoch": 0.737539084053706, "grad_norm": 0.9395366278250679, "learning_rate": 1.2956480886474609e-05, "loss": 0.8408, "mean_token_accuracy": 0.7488868713378907, "step": 4010 }, { "epoch": 0.7384587088467905, "grad_norm": 0.9259161411169768, "learning_rate": 1.2904336722690013e-05, "loss": 0.8474, "mean_token_accuracy": 0.7509873270988464, "step": 4015 }, { "epoch": 0.7393783336398749, "grad_norm": 0.8982963261004637, "learning_rate": 1.2852327555434743e-05, "loss": 0.8272, "mean_token_accuracy": 0.7562850832939148, "step": 4020 }, { "epoch": 0.7402979584329593, "grad_norm": 0.9145268063018638, "learning_rate": 1.280045386574601e-05, "loss": 0.7964, "mean_token_accuracy": 0.7601189255714417, "step": 4025 }, { "epoch": 0.7412175832260438, "grad_norm": 0.9417030319528836, "learning_rate": 1.2748716133407985e-05, "loss": 0.8243, "mean_token_accuracy": 0.7563821077346802, "step": 4030 }, { "epoch": 0.7421372080191282, "grad_norm": 0.9170391844634309, "learning_rate": 1.269711483694733e-05, "loss": 0.8071, "mean_token_accuracy": 0.7610970735549927, "step": 4035 }, { "epoch": 0.7430568328122126, "grad_norm": 0.927700931925603, "learning_rate": 1.264565045362883e-05, "loss": 0.83, "mean_token_accuracy": 0.7542360424995422, "step": 4040 }, { "epoch": 0.7439764576052971, "grad_norm": 0.902718257172033, "learning_rate": 1.259432345945094e-05, "loss": 0.8026, "mean_token_accuracy": 0.7602586507797241, "step": 4045 }, { "epoch": 0.7448960823983815, "grad_norm": 0.9732168765607019, "learning_rate": 1.2543134329141382e-05, "loss": 0.8166, "mean_token_accuracy": 0.7585108041763305, "step": 4050 }, { "epoch": 0.7458157071914658, "grad_norm": 0.9466993086607015, "learning_rate": 1.2492083536152772e-05, "loss": 0.8169, "mean_token_accuracy": 0.758376932144165, "step": 4055 }, { "epoch": 0.7467353319845503, "grad_norm": 0.9757475911083087, "learning_rate": 1.2441171552658228e-05, "loss": 0.8389, "mean_token_accuracy": 0.7498653650283813, "step": 4060 }, { "epoch": 0.7476549567776347, "grad_norm": 0.9151481291254611, "learning_rate": 1.2390398849547023e-05, "loss": 0.8006, "mean_token_accuracy": 0.7613858461380005, "step": 4065 }, { "epoch": 0.7485745815707191, "grad_norm": 0.8890653066533022, "learning_rate": 1.2339765896420178e-05, "loss": 0.8404, "mean_token_accuracy": 0.7510004043579102, "step": 4070 }, { "epoch": 0.7494942063638036, "grad_norm": 0.9533182704017102, "learning_rate": 1.2289273161586194e-05, "loss": 0.8234, "mean_token_accuracy": 0.7551814436912536, "step": 4075 }, { "epoch": 0.750413831156888, "grad_norm": 0.9407240854533703, "learning_rate": 1.2238921112056663e-05, "loss": 0.8635, "mean_token_accuracy": 0.7466271042823791, "step": 4080 }, { "epoch": 0.7513334559499724, "grad_norm": 0.8895247933273808, "learning_rate": 1.2188710213541957e-05, "loss": 0.8332, "mean_token_accuracy": 0.752234959602356, "step": 4085 }, { "epoch": 0.7522530807430569, "grad_norm": 0.9353802672482648, "learning_rate": 1.213864093044695e-05, "loss": 0.8448, "mean_token_accuracy": 0.7497453451156616, "step": 4090 }, { "epoch": 0.7531727055361412, "grad_norm": 0.946809122144392, "learning_rate": 1.2088713725866696e-05, "loss": 0.8088, "mean_token_accuracy": 0.758155906200409, "step": 4095 }, { "epoch": 0.7540923303292256, "grad_norm": 0.9340815348568988, "learning_rate": 1.203892906158214e-05, "loss": 0.8525, "mean_token_accuracy": 0.7470645427703857, "step": 4100 }, { "epoch": 0.7550119551223101, "grad_norm": 0.9903725518055015, "learning_rate": 1.1989287398055874e-05, "loss": 0.8406, "mean_token_accuracy": 0.7499817609786987, "step": 4105 }, { "epoch": 0.7559315799153945, "grad_norm": 0.9005006268013445, "learning_rate": 1.193978919442787e-05, "loss": 0.833, "mean_token_accuracy": 0.7508885979652404, "step": 4110 }, { "epoch": 0.7568512047084789, "grad_norm": 0.922000222155766, "learning_rate": 1.1890434908511212e-05, "loss": 0.8256, "mean_token_accuracy": 0.7544254660606384, "step": 4115 }, { "epoch": 0.7577708295015634, "grad_norm": 0.9147121717124462, "learning_rate": 1.1841224996787876e-05, "loss": 0.8119, "mean_token_accuracy": 0.7572540044784546, "step": 4120 }, { "epoch": 0.7586904542946478, "grad_norm": 0.9401032528457242, "learning_rate": 1.1792159914404518e-05, "loss": 0.8389, "mean_token_accuracy": 0.7547949194908142, "step": 4125 }, { "epoch": 0.7596100790877323, "grad_norm": 0.899746427074481, "learning_rate": 1.1743240115168262e-05, "loss": 0.8104, "mean_token_accuracy": 0.7588290691375732, "step": 4130 }, { "epoch": 0.7605297038808166, "grad_norm": 0.9377432106115406, "learning_rate": 1.1694466051542473e-05, "loss": 0.8155, "mean_token_accuracy": 0.7565756559371948, "step": 4135 }, { "epoch": 0.761449328673901, "grad_norm": 0.9436429623996605, "learning_rate": 1.1645838174642614e-05, "loss": 0.8167, "mean_token_accuracy": 0.7574901819229126, "step": 4140 }, { "epoch": 0.7623689534669855, "grad_norm": 0.9163014099905564, "learning_rate": 1.1597356934232053e-05, "loss": 0.8518, "mean_token_accuracy": 0.7465153455734252, "step": 4145 }, { "epoch": 0.7632885782600699, "grad_norm": 0.8716564591657281, "learning_rate": 1.1549022778717888e-05, "loss": 0.8572, "mean_token_accuracy": 0.7444779276847839, "step": 4150 }, { "epoch": 0.7642082030531543, "grad_norm": 0.9408396749893937, "learning_rate": 1.1500836155146839e-05, "loss": 0.83, "mean_token_accuracy": 0.7533326983451843, "step": 4155 }, { "epoch": 0.7651278278462388, "grad_norm": 0.9335839862612282, "learning_rate": 1.1452797509201083e-05, "loss": 0.8751, "mean_token_accuracy": 0.7398134231567383, "step": 4160 }, { "epoch": 0.7660474526393232, "grad_norm": 0.9850624435923674, "learning_rate": 1.1404907285194125e-05, "loss": 0.8523, "mean_token_accuracy": 0.7461954593658447, "step": 4165 }, { "epoch": 0.7669670774324076, "grad_norm": 0.9679449146346353, "learning_rate": 1.1357165926066716e-05, "loss": 0.7892, "mean_token_accuracy": 0.7605505466461182, "step": 4170 }, { "epoch": 0.767886702225492, "grad_norm": 0.9416265509404674, "learning_rate": 1.130957387338275e-05, "loss": 0.8221, "mean_token_accuracy": 0.7559242844581604, "step": 4175 }, { "epoch": 0.7688063270185764, "grad_norm": 0.909615601406411, "learning_rate": 1.1262131567325163e-05, "loss": 0.8357, "mean_token_accuracy": 0.7517993927001954, "step": 4180 }, { "epoch": 0.7697259518116608, "grad_norm": 0.9047722281799156, "learning_rate": 1.1214839446691869e-05, "loss": 0.8032, "mean_token_accuracy": 0.7601001501083374, "step": 4185 }, { "epoch": 0.7706455766047453, "grad_norm": 0.9246634008625312, "learning_rate": 1.1167697948891707e-05, "loss": 0.8249, "mean_token_accuracy": 0.7536085605621338, "step": 4190 }, { "epoch": 0.7715652013978297, "grad_norm": 0.9460638804791452, "learning_rate": 1.1120707509940403e-05, "loss": 0.8167, "mean_token_accuracy": 0.7593476176261902, "step": 4195 }, { "epoch": 0.7724848261909141, "grad_norm": 0.9221593736048895, "learning_rate": 1.1073868564456503e-05, "loss": 0.845, "mean_token_accuracy": 0.7480282187461853, "step": 4200 }, { "epoch": 0.7734044509839986, "grad_norm": 0.8888076192030434, "learning_rate": 1.1027181545657403e-05, "loss": 0.7794, "mean_token_accuracy": 0.76693354845047, "step": 4205 }, { "epoch": 0.774324075777083, "grad_norm": 0.8891810327123515, "learning_rate": 1.0980646885355313e-05, "loss": 0.7885, "mean_token_accuracy": 0.7628621697425843, "step": 4210 }, { "epoch": 0.7752437005701673, "grad_norm": 0.9743526817712896, "learning_rate": 1.0934265013953239e-05, "loss": 0.8478, "mean_token_accuracy": 0.7504450678825378, "step": 4215 }, { "epoch": 0.7761633253632518, "grad_norm": 0.9143999464853897, "learning_rate": 1.0888036360441066e-05, "loss": 0.8059, "mean_token_accuracy": 0.7603421926498413, "step": 4220 }, { "epoch": 0.7770829501563362, "grad_norm": 0.9734913517153475, "learning_rate": 1.0841961352391522e-05, "loss": 0.8159, "mean_token_accuracy": 0.7574024796485901, "step": 4225 }, { "epoch": 0.7780025749494206, "grad_norm": 0.935773373300799, "learning_rate": 1.079604041595628e-05, "loss": 0.8562, "mean_token_accuracy": 0.7468973875045777, "step": 4230 }, { "epoch": 0.7789221997425051, "grad_norm": 0.9031689337704597, "learning_rate": 1.075027397586198e-05, "loss": 0.8165, "mean_token_accuracy": 0.7566033601760864, "step": 4235 }, { "epoch": 0.7798418245355895, "grad_norm": 0.9138920947374664, "learning_rate": 1.0704662455406309e-05, "loss": 0.8137, "mean_token_accuracy": 0.7558243870735168, "step": 4240 }, { "epoch": 0.7807614493286739, "grad_norm": 0.942480721965923, "learning_rate": 1.06592062764541e-05, "loss": 0.8103, "mean_token_accuracy": 0.7595886349678039, "step": 4245 }, { "epoch": 0.7816810741217584, "grad_norm": 0.8995689595482391, "learning_rate": 1.0613905859433412e-05, "loss": 0.8158, "mean_token_accuracy": 0.7546827673912049, "step": 4250 }, { "epoch": 0.7826006989148427, "grad_norm": 0.8666864815369382, "learning_rate": 1.0568761623331642e-05, "loss": 0.8082, "mean_token_accuracy": 0.7590071558952332, "step": 4255 }, { "epoch": 0.7835203237079271, "grad_norm": 0.9696655409923509, "learning_rate": 1.0523773985691673e-05, "loss": 0.8556, "mean_token_accuracy": 0.7452132105827332, "step": 4260 }, { "epoch": 0.7844399485010116, "grad_norm": 0.9833829005536767, "learning_rate": 1.0478943362607984e-05, "loss": 0.8586, "mean_token_accuracy": 0.7462344169616699, "step": 4265 }, { "epoch": 0.785359573294096, "grad_norm": 0.9595206401213471, "learning_rate": 1.0434270168722813e-05, "loss": 0.8351, "mean_token_accuracy": 0.7498462796211243, "step": 4270 }, { "epoch": 0.7862791980871804, "grad_norm": 0.9261440611345254, "learning_rate": 1.0389754817222325e-05, "loss": 0.77, "mean_token_accuracy": 0.7716120958328248, "step": 4275 }, { "epoch": 0.7871988228802649, "grad_norm": 0.926036803637149, "learning_rate": 1.0345397719832791e-05, "loss": 0.8117, "mean_token_accuracy": 0.75774165391922, "step": 4280 }, { "epoch": 0.7881184476733493, "grad_norm": 0.9482199838406158, "learning_rate": 1.0301199286816768e-05, "loss": 0.7869, "mean_token_accuracy": 0.7647076845169067, "step": 4285 }, { "epoch": 0.7890380724664336, "grad_norm": 0.9249156078948935, "learning_rate": 1.0257159926969315e-05, "loss": 0.8379, "mean_token_accuracy": 0.7494875431060791, "step": 4290 }, { "epoch": 0.7899576972595181, "grad_norm": 0.9426764037549299, "learning_rate": 1.0213280047614224e-05, "loss": 0.8399, "mean_token_accuracy": 0.748091197013855, "step": 4295 }, { "epoch": 0.7908773220526025, "grad_norm": 0.9001227058548062, "learning_rate": 1.016956005460021e-05, "loss": 0.8151, "mean_token_accuracy": 0.7553766012191773, "step": 4300 }, { "epoch": 0.7917969468456869, "grad_norm": 0.9494070318147612, "learning_rate": 1.0126000352297207e-05, "loss": 0.8161, "mean_token_accuracy": 0.7553802728652954, "step": 4305 }, { "epoch": 0.7927165716387714, "grad_norm": 0.9634025237949015, "learning_rate": 1.0082601343592613e-05, "loss": 0.8375, "mean_token_accuracy": 0.7490672588348388, "step": 4310 }, { "epoch": 0.7936361964318558, "grad_norm": 0.918509774691625, "learning_rate": 1.0039363429887526e-05, "loss": 0.8027, "mean_token_accuracy": 0.7611651062965393, "step": 4315 }, { "epoch": 0.7945558212249402, "grad_norm": 0.9045021299622812, "learning_rate": 9.996287011093095e-06, "loss": 0.8194, "mean_token_accuracy": 0.7530111193656921, "step": 4320 }, { "epoch": 0.7954754460180247, "grad_norm": 0.9575102184844824, "learning_rate": 9.95337248562677e-06, "loss": 0.813, "mean_token_accuracy": 0.7606404304504395, "step": 4325 }, { "epoch": 0.796395070811109, "grad_norm": 0.9520723107616024, "learning_rate": 9.910620250408654e-06, "loss": 0.8219, "mean_token_accuracy": 0.7527819633483886, "step": 4330 }, { "epoch": 0.7973146956041934, "grad_norm": 0.9957772801943348, "learning_rate": 9.868030700857786e-06, "loss": 0.8527, "mean_token_accuracy": 0.7474417209625244, "step": 4335 }, { "epoch": 0.7982343203972779, "grad_norm": 0.9206334782903142, "learning_rate": 9.825604230888534e-06, "loss": 0.8013, "mean_token_accuracy": 0.7611706376075744, "step": 4340 }, { "epoch": 0.7991539451903623, "grad_norm": 0.9528692345244755, "learning_rate": 9.783341232906929e-06, "loss": 0.8452, "mean_token_accuracy": 0.7476886630058288, "step": 4345 }, { "epoch": 0.8000735699834468, "grad_norm": 0.9501814513029114, "learning_rate": 9.741242097807015e-06, "loss": 0.7998, "mean_token_accuracy": 0.7616806149482727, "step": 4350 }, { "epoch": 0.8009931947765312, "grad_norm": 0.9162860642484046, "learning_rate": 9.699307214967278e-06, "loss": 0.8154, "mean_token_accuracy": 0.7584839701652527, "step": 4355 }, { "epoch": 0.8019128195696156, "grad_norm": 1.0326738672670173, "learning_rate": 9.657536972247011e-06, "loss": 0.8364, "mean_token_accuracy": 0.7505152702331543, "step": 4360 }, { "epoch": 0.8028324443627001, "grad_norm": 0.9226495279325524, "learning_rate": 9.615931755982732e-06, "loss": 0.8249, "mean_token_accuracy": 0.7548305869102478, "step": 4365 }, { "epoch": 0.8037520691557845, "grad_norm": 0.9998522862414826, "learning_rate": 9.574491950984617e-06, "loss": 0.8713, "mean_token_accuracy": 0.7403565168380737, "step": 4370 }, { "epoch": 0.8046716939488688, "grad_norm": 0.9493513097435586, "learning_rate": 9.533217940532952e-06, "loss": 0.8295, "mean_token_accuracy": 0.7500657081604004, "step": 4375 }, { "epoch": 0.8055913187419533, "grad_norm": 0.9906056177459279, "learning_rate": 9.492110106374562e-06, "loss": 0.7962, "mean_token_accuracy": 0.7624237060546875, "step": 4380 }, { "epoch": 0.8065109435350377, "grad_norm": 0.9844968670498593, "learning_rate": 9.451168828719293e-06, "loss": 0.7978, "mean_token_accuracy": 0.7625670194625854, "step": 4385 }, { "epoch": 0.8074305683281221, "grad_norm": 0.9677134975970255, "learning_rate": 9.410394486236498e-06, "loss": 0.8635, "mean_token_accuracy": 0.7404338598251343, "step": 4390 }, { "epoch": 0.8083501931212066, "grad_norm": 0.9239280726012725, "learning_rate": 9.369787456051545e-06, "loss": 0.8134, "mean_token_accuracy": 0.75517338514328, "step": 4395 }, { "epoch": 0.809269817914291, "grad_norm": 0.9448230478695528, "learning_rate": 9.329348113742293e-06, "loss": 0.8304, "mean_token_accuracy": 0.7514260888099671, "step": 4400 }, { "epoch": 0.8101894427073754, "grad_norm": 0.9454127260499946, "learning_rate": 9.289076833335659e-06, "loss": 0.8097, "mean_token_accuracy": 0.7581054925918579, "step": 4405 }, { "epoch": 0.8111090675004599, "grad_norm": 0.9492270487120692, "learning_rate": 9.24897398730414e-06, "loss": 0.8527, "mean_token_accuracy": 0.7465508818626404, "step": 4410 }, { "epoch": 0.8120286922935442, "grad_norm": 0.9570757946856893, "learning_rate": 9.209039946562354e-06, "loss": 0.8267, "mean_token_accuracy": 0.755340301990509, "step": 4415 }, { "epoch": 0.8129483170866286, "grad_norm": 0.9284190475550864, "learning_rate": 9.169275080463641e-06, "loss": 0.7752, "mean_token_accuracy": 0.7686259269714355, "step": 4420 }, { "epoch": 0.8138679418797131, "grad_norm": 0.9501950391649288, "learning_rate": 9.129679756796622e-06, "loss": 0.8111, "mean_token_accuracy": 0.7585479974746704, "step": 4425 }, { "epoch": 0.8147875666727975, "grad_norm": 0.9046262111625721, "learning_rate": 9.090254341781824e-06, "loss": 0.802, "mean_token_accuracy": 0.7600291728973388, "step": 4430 }, { "epoch": 0.8157071914658819, "grad_norm": 0.9379329497256937, "learning_rate": 9.05099920006824e-06, "loss": 0.8206, "mean_token_accuracy": 0.754150140285492, "step": 4435 }, { "epoch": 0.8166268162589664, "grad_norm": 0.9034131325499937, "learning_rate": 9.011914694730014e-06, "loss": 0.7971, "mean_token_accuracy": 0.7597368478775024, "step": 4440 }, { "epoch": 0.8175464410520508, "grad_norm": 0.9338149471790205, "learning_rate": 8.973001187263069e-06, "loss": 0.8184, "mean_token_accuracy": 0.7545792698860169, "step": 4445 }, { "epoch": 0.8184660658451351, "grad_norm": 0.9541079918085381, "learning_rate": 8.934259037581725e-06, "loss": 0.8097, "mean_token_accuracy": 0.7586872816085816, "step": 4450 }, { "epoch": 0.8193856906382196, "grad_norm": 0.9233023020738409, "learning_rate": 8.895688604015418e-06, "loss": 0.8276, "mean_token_accuracy": 0.7541133642196656, "step": 4455 }, { "epoch": 0.820305315431304, "grad_norm": 0.9312024884427347, "learning_rate": 8.857290243305372e-06, "loss": 0.8242, "mean_token_accuracy": 0.7540480494499207, "step": 4460 }, { "epoch": 0.8212249402243884, "grad_norm": 0.9636521068626411, "learning_rate": 8.819064310601274e-06, "loss": 0.827, "mean_token_accuracy": 0.754251503944397, "step": 4465 }, { "epoch": 0.8221445650174729, "grad_norm": 0.9594804588793242, "learning_rate": 8.78101115945803e-06, "loss": 0.8195, "mean_token_accuracy": 0.7567231893539429, "step": 4470 }, { "epoch": 0.8230641898105573, "grad_norm": 0.946382911890805, "learning_rate": 8.743131141832466e-06, "loss": 0.8093, "mean_token_accuracy": 0.7608936429023743, "step": 4475 }, { "epoch": 0.8239838146036417, "grad_norm": 0.9662210178630657, "learning_rate": 8.705424608080091e-06, "loss": 0.845, "mean_token_accuracy": 0.7482501983642578, "step": 4480 }, { "epoch": 0.8249034393967262, "grad_norm": 1.0134277900865423, "learning_rate": 8.667891906951822e-06, "loss": 0.806, "mean_token_accuracy": 0.7607534885406494, "step": 4485 }, { "epoch": 0.8258230641898106, "grad_norm": 0.969259829449015, "learning_rate": 8.63053338559081e-06, "loss": 0.8301, "mean_token_accuracy": 0.7495483517646789, "step": 4490 }, { "epoch": 0.8267426889828949, "grad_norm": 0.973132836806053, "learning_rate": 8.593349389529194e-06, "loss": 0.8412, "mean_token_accuracy": 0.7499716639518738, "step": 4495 }, { "epoch": 0.8276623137759794, "grad_norm": 0.9074516956073079, "learning_rate": 8.556340262684901e-06, "loss": 0.8239, "mean_token_accuracy": 0.7554465770721436, "step": 4500 }, { "epoch": 0.8285819385690638, "grad_norm": 0.930234934487542, "learning_rate": 8.519506347358495e-06, "loss": 0.7947, "mean_token_accuracy": 0.7629730701446533, "step": 4505 }, { "epoch": 0.8295015633621482, "grad_norm": 0.8753133502304897, "learning_rate": 8.482847984229992e-06, "loss": 0.8461, "mean_token_accuracy": 0.747829282283783, "step": 4510 }, { "epoch": 0.8304211881552327, "grad_norm": 0.9490806269639048, "learning_rate": 8.446365512355697e-06, "loss": 0.809, "mean_token_accuracy": 0.7590258955955506, "step": 4515 }, { "epoch": 0.8313408129483171, "grad_norm": 0.945014272705201, "learning_rate": 8.410059269165094e-06, "loss": 0.858, "mean_token_accuracy": 0.7476967573165894, "step": 4520 }, { "epoch": 0.8322604377414015, "grad_norm": 0.9585805628825262, "learning_rate": 8.37392959045771e-06, "loss": 0.8276, "mean_token_accuracy": 0.7536361336708068, "step": 4525 }, { "epoch": 0.833180062534486, "grad_norm": 0.9798760065535969, "learning_rate": 8.337976810400024e-06, "loss": 0.8271, "mean_token_accuracy": 0.7538176774978638, "step": 4530 }, { "epoch": 0.8340996873275703, "grad_norm": 0.9885247811188054, "learning_rate": 8.30220126152233e-06, "loss": 0.8351, "mean_token_accuracy": 0.7511208415031433, "step": 4535 }, { "epoch": 0.8350193121206547, "grad_norm": 0.926636431875522, "learning_rate": 8.266603274715734e-06, "loss": 0.8536, "mean_token_accuracy": 0.7437230348587036, "step": 4540 }, { "epoch": 0.8359389369137392, "grad_norm": 0.9639989728106565, "learning_rate": 8.231183179229041e-06, "loss": 0.8337, "mean_token_accuracy": 0.749656867980957, "step": 4545 }, { "epoch": 0.8368585617068236, "grad_norm": 0.9810922714927505, "learning_rate": 8.19594130266571e-06, "loss": 0.8441, "mean_token_accuracy": 0.7471103310585022, "step": 4550 }, { "epoch": 0.837778186499908, "grad_norm": 0.940673214702186, "learning_rate": 8.16087797098086e-06, "loss": 0.8076, "mean_token_accuracy": 0.757796049118042, "step": 4555 }, { "epoch": 0.8386978112929925, "grad_norm": 0.9808241732647448, "learning_rate": 8.125993508478222e-06, "loss": 0.8107, "mean_token_accuracy": 0.7570709705352783, "step": 4560 }, { "epoch": 0.8396174360860769, "grad_norm": 0.9417309972023068, "learning_rate": 8.091288237807148e-06, "loss": 0.7918, "mean_token_accuracy": 0.7627918124198914, "step": 4565 }, { "epoch": 0.8405370608791614, "grad_norm": 0.9994759897340699, "learning_rate": 8.05676247995964e-06, "loss": 0.8308, "mean_token_accuracy": 0.7522749185562134, "step": 4570 }, { "epoch": 0.8414566856722457, "grad_norm": 0.9575333123064316, "learning_rate": 8.022416554267361e-06, "loss": 0.8249, "mean_token_accuracy": 0.7555456757545471, "step": 4575 }, { "epoch": 0.8423763104653301, "grad_norm": 0.9428369551875321, "learning_rate": 7.988250778398704e-06, "loss": 0.7799, "mean_token_accuracy": 0.7657583713531494, "step": 4580 }, { "epoch": 0.8432959352584146, "grad_norm": 0.9491493130691244, "learning_rate": 7.95426546835582e-06, "loss": 0.8463, "mean_token_accuracy": 0.7497212409973144, "step": 4585 }, { "epoch": 0.844215560051499, "grad_norm": 0.9279119840497574, "learning_rate": 7.92046093847173e-06, "loss": 0.7911, "mean_token_accuracy": 0.7641847729682922, "step": 4590 }, { "epoch": 0.8451351848445834, "grad_norm": 0.975196157389162, "learning_rate": 7.88683750140741e-06, "loss": 0.7829, "mean_token_accuracy": 0.76539067029953, "step": 4595 }, { "epoch": 0.8460548096376679, "grad_norm": 0.9630038826041202, "learning_rate": 7.853395468148877e-06, "loss": 0.8214, "mean_token_accuracy": 0.7576993346214295, "step": 4600 }, { "epoch": 0.8469744344307523, "grad_norm": 0.9547194790847711, "learning_rate": 7.82013514800434e-06, "loss": 0.8133, "mean_token_accuracy": 0.7594569325447083, "step": 4605 }, { "epoch": 0.8478940592238366, "grad_norm": 0.9804442806928446, "learning_rate": 7.787056848601327e-06, "loss": 0.826, "mean_token_accuracy": 0.7542958974838256, "step": 4610 }, { "epoch": 0.8488136840169211, "grad_norm": 0.987211519153664, "learning_rate": 7.754160875883835e-06, "loss": 0.859, "mean_token_accuracy": 0.7447464466094971, "step": 4615 }, { "epoch": 0.8497333088100055, "grad_norm": 0.9279113898182684, "learning_rate": 7.721447534109509e-06, "loss": 0.8318, "mean_token_accuracy": 0.7507144689559937, "step": 4620 }, { "epoch": 0.8506529336030899, "grad_norm": 0.9722340874170035, "learning_rate": 7.688917125846836e-06, "loss": 0.8354, "mean_token_accuracy": 0.7506987690925598, "step": 4625 }, { "epoch": 0.8515725583961744, "grad_norm": 0.9470559135859266, "learning_rate": 7.65656995197231e-06, "loss": 0.846, "mean_token_accuracy": 0.7494428992271424, "step": 4630 }, { "epoch": 0.8524921831892588, "grad_norm": 1.0085786438496558, "learning_rate": 7.6244063116676965e-06, "loss": 0.8048, "mean_token_accuracy": 0.7590271830558777, "step": 4635 }, { "epoch": 0.8534118079823432, "grad_norm": 0.9122173396588265, "learning_rate": 7.592426502417235e-06, "loss": 0.792, "mean_token_accuracy": 0.7632818222045898, "step": 4640 }, { "epoch": 0.8543314327754277, "grad_norm": 0.920428242471814, "learning_rate": 7.560630820004905e-06, "loss": 0.7682, "mean_token_accuracy": 0.768799901008606, "step": 4645 }, { "epoch": 0.855251057568512, "grad_norm": 0.9650658819203722, "learning_rate": 7.529019558511664e-06, "loss": 0.8591, "mean_token_accuracy": 0.7465671896934509, "step": 4650 }, { "epoch": 0.8561706823615964, "grad_norm": 0.941100631374564, "learning_rate": 7.4975930103127575e-06, "loss": 0.8133, "mean_token_accuracy": 0.7577845811843872, "step": 4655 }, { "epoch": 0.8570903071546809, "grad_norm": 0.911355294655365, "learning_rate": 7.466351466075003e-06, "loss": 0.776, "mean_token_accuracy": 0.7704600811004638, "step": 4660 }, { "epoch": 0.8580099319477653, "grad_norm": 0.9600196890925632, "learning_rate": 7.43529521475409e-06, "loss": 0.8356, "mean_token_accuracy": 0.752436888217926, "step": 4665 }, { "epoch": 0.8589295567408497, "grad_norm": 0.9096404947618868, "learning_rate": 7.404424543591926e-06, "loss": 0.8434, "mean_token_accuracy": 0.749167013168335, "step": 4670 }, { "epoch": 0.8598491815339342, "grad_norm": 0.9645413054824178, "learning_rate": 7.37373973811398e-06, "loss": 0.8422, "mean_token_accuracy": 0.7523573756217956, "step": 4675 }, { "epoch": 0.8607688063270186, "grad_norm": 0.9461536188211753, "learning_rate": 7.343241082126609e-06, "loss": 0.789, "mean_token_accuracy": 0.7644837021827697, "step": 4680 }, { "epoch": 0.861688431120103, "grad_norm": 0.9177981778366934, "learning_rate": 7.312928857714484e-06, "loss": 0.7912, "mean_token_accuracy": 0.7650796055793763, "step": 4685 }, { "epoch": 0.8626080559131875, "grad_norm": 0.9395263274096144, "learning_rate": 7.282803345237937e-06, "loss": 0.779, "mean_token_accuracy": 0.766014575958252, "step": 4690 }, { "epoch": 0.8635276807062718, "grad_norm": 0.974228845887035, "learning_rate": 7.252864823330397e-06, "loss": 0.8096, "mean_token_accuracy": 0.7609816431999207, "step": 4695 }, { "epoch": 0.8644473054993562, "grad_norm": 0.9138771854988429, "learning_rate": 7.223113568895791e-06, "loss": 0.8228, "mean_token_accuracy": 0.7533741354942322, "step": 4700 }, { "epoch": 0.8653669302924407, "grad_norm": 0.9230858356341091, "learning_rate": 7.193549857105998e-06, "loss": 0.7817, "mean_token_accuracy": 0.7645957589149475, "step": 4705 }, { "epoch": 0.8662865550855251, "grad_norm": 0.9248959407091435, "learning_rate": 7.164173961398307e-06, "loss": 0.8123, "mean_token_accuracy": 0.758608341217041, "step": 4710 }, { "epoch": 0.8672061798786095, "grad_norm": 0.920957739245226, "learning_rate": 7.134986153472864e-06, "loss": 0.8089, "mean_token_accuracy": 0.7574970960617066, "step": 4715 }, { "epoch": 0.868125804671694, "grad_norm": 0.9365387305302294, "learning_rate": 7.105986703290185e-06, "loss": 0.8207, "mean_token_accuracy": 0.7519280552864075, "step": 4720 }, { "epoch": 0.8690454294647784, "grad_norm": 0.9848472191309555, "learning_rate": 7.077175879068652e-06, "loss": 0.8318, "mean_token_accuracy": 0.7514313578605651, "step": 4725 }, { "epoch": 0.8699650542578627, "grad_norm": 0.9841439973977463, "learning_rate": 7.04855394728202e-06, "loss": 0.8254, "mean_token_accuracy": 0.7536401510238647, "step": 4730 }, { "epoch": 0.8708846790509472, "grad_norm": 0.9368690483918741, "learning_rate": 7.020121172656971e-06, "loss": 0.8079, "mean_token_accuracy": 0.7589451789855957, "step": 4735 }, { "epoch": 0.8718043038440316, "grad_norm": 0.9537367969880632, "learning_rate": 6.991877818170647e-06, "loss": 0.8105, "mean_token_accuracy": 0.7570921540260315, "step": 4740 }, { "epoch": 0.872723928637116, "grad_norm": 0.9771290706741976, "learning_rate": 6.963824145048245e-06, "loss": 0.8383, "mean_token_accuracy": 0.7482818961143494, "step": 4745 }, { "epoch": 0.8736435534302005, "grad_norm": 0.9167489506515816, "learning_rate": 6.935960412760554e-06, "loss": 0.7956, "mean_token_accuracy": 0.7615381121635437, "step": 4750 }, { "epoch": 0.8745631782232849, "grad_norm": 0.9509142520738616, "learning_rate": 6.908286879021611e-06, "loss": 0.8272, "mean_token_accuracy": 0.7538857817649841, "step": 4755 }, { "epoch": 0.8754828030163693, "grad_norm": 0.9492010037774332, "learning_rate": 6.880803799786282e-06, "loss": 0.8083, "mean_token_accuracy": 0.7596304178237915, "step": 4760 }, { "epoch": 0.8764024278094538, "grad_norm": 0.9879455089380224, "learning_rate": 6.853511429247891e-06, "loss": 0.8501, "mean_token_accuracy": 0.7443594694137573, "step": 4765 }, { "epoch": 0.8773220526025381, "grad_norm": 0.900884905164465, "learning_rate": 6.826410019835897e-06, "loss": 0.8388, "mean_token_accuracy": 0.75017911195755, "step": 4770 }, { "epoch": 0.8782416773956225, "grad_norm": 0.9347399353088925, "learning_rate": 6.7994998222135415e-06, "loss": 0.8338, "mean_token_accuracy": 0.7503747582435608, "step": 4775 }, { "epoch": 0.879161302188707, "grad_norm": 0.9313447849733553, "learning_rate": 6.77278108527552e-06, "loss": 0.8223, "mean_token_accuracy": 0.7531881928443909, "step": 4780 }, { "epoch": 0.8800809269817914, "grad_norm": 0.9749122247147805, "learning_rate": 6.7462540561457035e-06, "loss": 0.8078, "mean_token_accuracy": 0.7597910761833191, "step": 4785 }, { "epoch": 0.8810005517748758, "grad_norm": 0.9459726297921652, "learning_rate": 6.719918980174842e-06, "loss": 0.7735, "mean_token_accuracy": 0.7680148124694824, "step": 4790 }, { "epoch": 0.8819201765679603, "grad_norm": 0.9477334526426899, "learning_rate": 6.6937761009382816e-06, "loss": 0.8025, "mean_token_accuracy": 0.759226131439209, "step": 4795 }, { "epoch": 0.8828398013610447, "grad_norm": 0.9350684746914302, "learning_rate": 6.667825660233736e-06, "loss": 0.8141, "mean_token_accuracy": 0.7565145611763, "step": 4800 }, { "epoch": 0.8837594261541292, "grad_norm": 0.9492764392082258, "learning_rate": 6.642067898079038e-06, "loss": 0.8311, "mean_token_accuracy": 0.7527845025062561, "step": 4805 }, { "epoch": 0.8846790509472136, "grad_norm": 0.8598768439927121, "learning_rate": 6.616503052709914e-06, "loss": 0.7896, "mean_token_accuracy": 0.7648340344429017, "step": 4810 }, { "epoch": 0.8855986757402979, "grad_norm": 0.9446656437839204, "learning_rate": 6.591131360577795e-06, "loss": 0.8052, "mean_token_accuracy": 0.7575154542922974, "step": 4815 }, { "epoch": 0.8865183005333824, "grad_norm": 0.8652514268793213, "learning_rate": 6.565953056347608e-06, "loss": 0.7534, "mean_token_accuracy": 0.7725171089172364, "step": 4820 }, { "epoch": 0.8874379253264668, "grad_norm": 0.9422431334861092, "learning_rate": 6.540968372895634e-06, "loss": 0.7977, "mean_token_accuracy": 0.7611649394035339, "step": 4825 }, { "epoch": 0.8883575501195512, "grad_norm": 0.9384703132768932, "learning_rate": 6.516177541307333e-06, "loss": 0.7995, "mean_token_accuracy": 0.7624763369560241, "step": 4830 }, { "epoch": 0.8892771749126357, "grad_norm": 1.015847599195386, "learning_rate": 6.491580790875209e-06, "loss": 0.7916, "mean_token_accuracy": 0.7621793508529663, "step": 4835 }, { "epoch": 0.8901967997057201, "grad_norm": 0.9098096698494834, "learning_rate": 6.4671783490966945e-06, "loss": 0.8088, "mean_token_accuracy": 0.7614699125289917, "step": 4840 }, { "epoch": 0.8911164244988045, "grad_norm": 0.9558674059824713, "learning_rate": 6.442970441672051e-06, "loss": 0.8545, "mean_token_accuracy": 0.7470506310462952, "step": 4845 }, { "epoch": 0.892036049291889, "grad_norm": 0.9590352976202275, "learning_rate": 6.4189572925022655e-06, "loss": 0.8363, "mean_token_accuracy": 0.7472939848899841, "step": 4850 }, { "epoch": 0.8929556740849733, "grad_norm": 0.8982751392912057, "learning_rate": 6.3951391236869985e-06, "loss": 0.8259, "mean_token_accuracy": 0.7548177719116211, "step": 4855 }, { "epoch": 0.8938752988780577, "grad_norm": 0.9627549202883984, "learning_rate": 6.371516155522513e-06, "loss": 0.8035, "mean_token_accuracy": 0.7578222513198852, "step": 4860 }, { "epoch": 0.8947949236711422, "grad_norm": 0.962995623951893, "learning_rate": 6.3480886064996484e-06, "loss": 0.8119, "mean_token_accuracy": 0.7579006910324096, "step": 4865 }, { "epoch": 0.8957145484642266, "grad_norm": 0.99045632467858, "learning_rate": 6.3248566933017975e-06, "loss": 0.7942, "mean_token_accuracy": 0.75965256690979, "step": 4870 }, { "epoch": 0.896634173257311, "grad_norm": 0.9510071830298487, "learning_rate": 6.3018206308028975e-06, "loss": 0.8185, "mean_token_accuracy": 0.7584743499755859, "step": 4875 }, { "epoch": 0.8975537980503955, "grad_norm": 0.9703791789576997, "learning_rate": 6.2789806320654456e-06, "loss": 0.7816, "mean_token_accuracy": 0.7649904489517212, "step": 4880 }, { "epoch": 0.8984734228434799, "grad_norm": 0.9398378664335288, "learning_rate": 6.256336908338531e-06, "loss": 0.78, "mean_token_accuracy": 0.767956817150116, "step": 4885 }, { "epoch": 0.8993930476365642, "grad_norm": 0.987114293205303, "learning_rate": 6.233889669055878e-06, "loss": 0.8443, "mean_token_accuracy": 0.7497469425201416, "step": 4890 }, { "epoch": 0.9003126724296487, "grad_norm": 0.9343500174042304, "learning_rate": 6.211639121833912e-06, "loss": 0.7931, "mean_token_accuracy": 0.763602340221405, "step": 4895 }, { "epoch": 0.9012322972227331, "grad_norm": 0.9262644956755969, "learning_rate": 6.189585472469829e-06, "loss": 0.7792, "mean_token_accuracy": 0.7697998642921448, "step": 4900 }, { "epoch": 0.9021519220158175, "grad_norm": 0.9622834108867682, "learning_rate": 6.167728924939705e-06, "loss": 0.797, "mean_token_accuracy": 0.7625941157341003, "step": 4905 }, { "epoch": 0.903071546808902, "grad_norm": 0.9190192726730757, "learning_rate": 6.146069681396612e-06, "loss": 0.8253, "mean_token_accuracy": 0.7542304992675781, "step": 4910 }, { "epoch": 0.9039911716019864, "grad_norm": 0.9361246140345745, "learning_rate": 6.124607942168726e-06, "loss": 0.8031, "mean_token_accuracy": 0.7584469556808472, "step": 4915 }, { "epoch": 0.9049107963950708, "grad_norm": 0.9457716726884055, "learning_rate": 6.1033439057574965e-06, "loss": 0.8153, "mean_token_accuracy": 0.758701741695404, "step": 4920 }, { "epoch": 0.9058304211881553, "grad_norm": 0.8853750515926242, "learning_rate": 6.082277768835807e-06, "loss": 0.7921, "mean_token_accuracy": 0.763675856590271, "step": 4925 }, { "epoch": 0.9067500459812396, "grad_norm": 0.9702784866596219, "learning_rate": 6.061409726246143e-06, "loss": 0.7851, "mean_token_accuracy": 0.7646818399429322, "step": 4930 }, { "epoch": 0.907669670774324, "grad_norm": 0.9693421985103569, "learning_rate": 6.040739970998802e-06, "loss": 0.8346, "mean_token_accuracy": 0.7530786991119385, "step": 4935 }, { "epoch": 0.9085892955674085, "grad_norm": 0.8930655347204544, "learning_rate": 6.020268694270109e-06, "loss": 0.7966, "mean_token_accuracy": 0.7641753435134888, "step": 4940 }, { "epoch": 0.9095089203604929, "grad_norm": 0.908390221485836, "learning_rate": 5.999996085400643e-06, "loss": 0.7995, "mean_token_accuracy": 0.7642928123474121, "step": 4945 }, { "epoch": 0.9104285451535773, "grad_norm": 0.9291773666129768, "learning_rate": 5.9799223318934765e-06, "loss": 0.801, "mean_token_accuracy": 0.7588168382644653, "step": 4950 }, { "epoch": 0.9113481699466618, "grad_norm": 0.9290002720904244, "learning_rate": 5.9600476194124675e-06, "loss": 0.7973, "mean_token_accuracy": 0.763935673236847, "step": 4955 }, { "epoch": 0.9122677947397462, "grad_norm": 0.9446442087955222, "learning_rate": 5.9403721317805245e-06, "loss": 0.801, "mean_token_accuracy": 0.7578533172607422, "step": 4960 }, { "epoch": 0.9131874195328306, "grad_norm": 0.9568316679901518, "learning_rate": 5.920896050977891e-06, "loss": 0.8926, "mean_token_accuracy": 0.7361096501350403, "step": 4965 }, { "epoch": 0.914107044325915, "grad_norm": 0.9761363167639366, "learning_rate": 5.901619557140502e-06, "loss": 0.8302, "mean_token_accuracy": 0.7517902731895447, "step": 4970 }, { "epoch": 0.9150266691189994, "grad_norm": 0.9363921634925068, "learning_rate": 5.882542828558286e-06, "loss": 0.8066, "mean_token_accuracy": 0.7580497026443481, "step": 4975 }, { "epoch": 0.9159462939120838, "grad_norm": 0.9898749363112332, "learning_rate": 5.86366604167352e-06, "loss": 0.7785, "mean_token_accuracy": 0.7676722645759583, "step": 4980 }, { "epoch": 0.9168659187051683, "grad_norm": 0.9461120512925497, "learning_rate": 5.844989371079215e-06, "loss": 0.7655, "mean_token_accuracy": 0.7703205943107605, "step": 4985 }, { "epoch": 0.9177855434982527, "grad_norm": 0.9340964548547984, "learning_rate": 5.826512989517478e-06, "loss": 0.8243, "mean_token_accuracy": 0.7529069542884826, "step": 4990 }, { "epoch": 0.9187051682913371, "grad_norm": 0.9542091804584825, "learning_rate": 5.808237067877942e-06, "loss": 0.7869, "mean_token_accuracy": 0.7639023303985596, "step": 4995 }, { "epoch": 0.9196247930844216, "grad_norm": 0.9799469338180448, "learning_rate": 5.790161775196144e-06, "loss": 0.7942, "mean_token_accuracy": 0.7624092340469361, "step": 5000 }, { "epoch": 0.920544417877506, "grad_norm": 0.9533254080832144, "learning_rate": 5.772287278652012e-06, "loss": 0.8109, "mean_token_accuracy": 0.7598010182380677, "step": 5005 }, { "epoch": 0.9214640426705903, "grad_norm": 0.9311527277134242, "learning_rate": 5.754613743568279e-06, "loss": 0.7906, "mean_token_accuracy": 0.7638931751251221, "step": 5010 }, { "epoch": 0.9223836674636748, "grad_norm": 0.9812836116539834, "learning_rate": 5.737141333408972e-06, "loss": 0.8008, "mean_token_accuracy": 0.7612162590026855, "step": 5015 }, { "epoch": 0.9233032922567592, "grad_norm": 0.9745443553849291, "learning_rate": 5.719870209777896e-06, "loss": 0.8417, "mean_token_accuracy": 0.7509512066841125, "step": 5020 }, { "epoch": 0.9242229170498437, "grad_norm": 0.9530895065948418, "learning_rate": 5.702800532417144e-06, "loss": 0.7899, "mean_token_accuracy": 0.7625620007514954, "step": 5025 }, { "epoch": 0.9251425418429281, "grad_norm": 0.9106620317823355, "learning_rate": 5.685932459205606e-06, "loss": 0.8075, "mean_token_accuracy": 0.7597783088684082, "step": 5030 }, { "epoch": 0.9260621666360125, "grad_norm": 0.9016062622069709, "learning_rate": 5.669266146157527e-06, "loss": 0.7956, "mean_token_accuracy": 0.7618203997612, "step": 5035 }, { "epoch": 0.926981791429097, "grad_norm": 0.9311871037406105, "learning_rate": 5.652801747421053e-06, "loss": 0.7755, "mean_token_accuracy": 0.7672530770301819, "step": 5040 }, { "epoch": 0.9279014162221814, "grad_norm": 0.9289149914362874, "learning_rate": 5.636539415276807e-06, "loss": 0.7971, "mean_token_accuracy": 0.7606992840766906, "step": 5045 }, { "epoch": 0.9288210410152657, "grad_norm": 0.9265920738234094, "learning_rate": 5.620479300136475e-06, "loss": 0.7675, "mean_token_accuracy": 0.7715546011924743, "step": 5050 }, { "epoch": 0.9297406658083502, "grad_norm": 1.001963123510446, "learning_rate": 5.604621550541429e-06, "loss": 0.8426, "mean_token_accuracy": 0.7474547743797302, "step": 5055 }, { "epoch": 0.9306602906014346, "grad_norm": 0.9062392197653472, "learning_rate": 5.5889663131613465e-06, "loss": 0.8237, "mean_token_accuracy": 0.7512851595878601, "step": 5060 }, { "epoch": 0.931579915394519, "grad_norm": 0.9878466692235598, "learning_rate": 5.5735137327928384e-06, "loss": 0.8018, "mean_token_accuracy": 0.7595331549644471, "step": 5065 }, { "epoch": 0.9324995401876035, "grad_norm": 0.911756127989921, "learning_rate": 5.558263952358139e-06, "loss": 0.8146, "mean_token_accuracy": 0.7572713255882263, "step": 5070 }, { "epoch": 0.9334191649806879, "grad_norm": 0.9534452188147857, "learning_rate": 5.543217112903766e-06, "loss": 0.8092, "mean_token_accuracy": 0.7591339111328125, "step": 5075 }, { "epoch": 0.9343387897737723, "grad_norm": 0.94136690175154, "learning_rate": 5.528373353599207e-06, "loss": 0.7945, "mean_token_accuracy": 0.7594197154045105, "step": 5080 }, { "epoch": 0.9352584145668568, "grad_norm": 0.9367268234664168, "learning_rate": 5.513732811735657e-06, "loss": 0.8123, "mean_token_accuracy": 0.7594240307807922, "step": 5085 }, { "epoch": 0.9361780393599411, "grad_norm": 0.8975989192963018, "learning_rate": 5.4992956227247345e-06, "loss": 0.7715, "mean_token_accuracy": 0.7677939176559448, "step": 5090 }, { "epoch": 0.9370976641530255, "grad_norm": 0.9987125543689239, "learning_rate": 5.48506192009722e-06, "loss": 0.8051, "mean_token_accuracy": 0.7597865104675293, "step": 5095 }, { "epoch": 0.93801728894611, "grad_norm": 0.9396093256392507, "learning_rate": 5.4710318355018435e-06, "loss": 0.8248, "mean_token_accuracy": 0.7557710766792297, "step": 5100 }, { "epoch": 0.9389369137391944, "grad_norm": 0.907072734656757, "learning_rate": 5.457205498704046e-06, "loss": 0.8104, "mean_token_accuracy": 0.7568627595901489, "step": 5105 }, { "epoch": 0.9398565385322788, "grad_norm": 0.9498606808400206, "learning_rate": 5.443583037584792e-06, "loss": 0.829, "mean_token_accuracy": 0.7537372469902038, "step": 5110 }, { "epoch": 0.9407761633253633, "grad_norm": 0.9500188031150016, "learning_rate": 5.430164578139382e-06, "loss": 0.771, "mean_token_accuracy": 0.7692322492599487, "step": 5115 }, { "epoch": 0.9416957881184477, "grad_norm": 0.9133488515736051, "learning_rate": 5.4169502444762836e-06, "loss": 0.8203, "mean_token_accuracy": 0.7578924179077149, "step": 5120 }, { "epoch": 0.9426154129115321, "grad_norm": 0.9585342004886042, "learning_rate": 5.403940158815996e-06, "loss": 0.8209, "mean_token_accuracy": 0.7570155620574951, "step": 5125 }, { "epoch": 0.9435350377046166, "grad_norm": 0.9797939933864984, "learning_rate": 5.391134441489905e-06, "loss": 0.7937, "mean_token_accuracy": 0.7618912696838379, "step": 5130 }, { "epoch": 0.9444546624977009, "grad_norm": 0.9293935572688817, "learning_rate": 5.378533210939176e-06, "loss": 0.7948, "mean_token_accuracy": 0.7596281886100769, "step": 5135 }, { "epoch": 0.9453742872907853, "grad_norm": 0.9221042858985046, "learning_rate": 5.366136583713665e-06, "loss": 0.7717, "mean_token_accuracy": 0.7698543071746826, "step": 5140 }, { "epoch": 0.9462939120838698, "grad_norm": 1.025946124148099, "learning_rate": 5.353944674470823e-06, "loss": 0.8213, "mean_token_accuracy": 0.7552660465240478, "step": 5145 }, { "epoch": 0.9472135368769542, "grad_norm": 0.984504169212397, "learning_rate": 5.341957595974662e-06, "loss": 0.8392, "mean_token_accuracy": 0.7498656630516052, "step": 5150 }, { "epoch": 0.9481331616700386, "grad_norm": 0.9188252633726173, "learning_rate": 5.3301754590946824e-06, "loss": 0.8166, "mean_token_accuracy": 0.7552522420883179, "step": 5155 }, { "epoch": 0.9490527864631231, "grad_norm": 0.8673224532160614, "learning_rate": 5.318598372804873e-06, "loss": 0.7689, "mean_token_accuracy": 0.7689907431602478, "step": 5160 }, { "epoch": 0.9499724112562075, "grad_norm": 0.9392909148393203, "learning_rate": 5.307226444182686e-06, "loss": 0.7877, "mean_token_accuracy": 0.7654459595680236, "step": 5165 }, { "epoch": 0.9508920360492918, "grad_norm": 1.0092515399603914, "learning_rate": 5.296059778408057e-06, "loss": 0.8228, "mean_token_accuracy": 0.7547815799713135, "step": 5170 }, { "epoch": 0.9518116608423763, "grad_norm": 0.9724478118701938, "learning_rate": 5.2850984787624264e-06, "loss": 0.8068, "mean_token_accuracy": 0.757933521270752, "step": 5175 }, { "epoch": 0.9527312856354607, "grad_norm": 0.9595437776833703, "learning_rate": 5.274342646627783e-06, "loss": 0.8612, "mean_token_accuracy": 0.7451163768768311, "step": 5180 }, { "epoch": 0.9536509104285451, "grad_norm": 0.9035621461181421, "learning_rate": 5.263792381485733e-06, "loss": 0.7942, "mean_token_accuracy": 0.7612574458122253, "step": 5185 }, { "epoch": 0.9545705352216296, "grad_norm": 0.9369759529937411, "learning_rate": 5.253447780916577e-06, "loss": 0.8199, "mean_token_accuracy": 0.755517327785492, "step": 5190 }, { "epoch": 0.955490160014714, "grad_norm": 0.9223279306007958, "learning_rate": 5.2433089405984e-06, "loss": 0.7855, "mean_token_accuracy": 0.7672001838684082, "step": 5195 }, { "epoch": 0.9564097848077984, "grad_norm": 0.9093658718364905, "learning_rate": 5.233375954306199e-06, "loss": 0.7588, "mean_token_accuracy": 0.7701982975006103, "step": 5200 }, { "epoch": 0.9573294096008829, "grad_norm": 0.9756234794282658, "learning_rate": 5.22364891391101e-06, "loss": 0.8294, "mean_token_accuracy": 0.75344318151474, "step": 5205 }, { "epoch": 0.9582490343939672, "grad_norm": 0.910212786589889, "learning_rate": 5.2141279093790575e-06, "loss": 0.7894, "mean_token_accuracy": 0.7678821444511413, "step": 5210 }, { "epoch": 0.9591686591870516, "grad_norm": 0.9474929875705357, "learning_rate": 5.204813028770913e-06, "loss": 0.7891, "mean_token_accuracy": 0.7625754833221435, "step": 5215 }, { "epoch": 0.9600882839801361, "grad_norm": 0.9344552952746554, "learning_rate": 5.195704358240704e-06, "loss": 0.8059, "mean_token_accuracy": 0.759453558921814, "step": 5220 }, { "epoch": 0.9610079087732205, "grad_norm": 0.9060367178226402, "learning_rate": 5.186801982035298e-06, "loss": 0.7846, "mean_token_accuracy": 0.7654222846031189, "step": 5225 }, { "epoch": 0.9619275335663049, "grad_norm": 0.9799737312884412, "learning_rate": 5.178105982493528e-06, "loss": 0.813, "mean_token_accuracy": 0.7591325879096985, "step": 5230 }, { "epoch": 0.9628471583593894, "grad_norm": 0.9419373863409995, "learning_rate": 5.169616440045433e-06, "loss": 0.7933, "mean_token_accuracy": 0.7605907201766968, "step": 5235 }, { "epoch": 0.9637667831524738, "grad_norm": 0.904753211539841, "learning_rate": 5.16133343321151e-06, "loss": 0.796, "mean_token_accuracy": 0.7628448724746704, "step": 5240 }, { "epoch": 0.9646864079455583, "grad_norm": 0.9588441625989744, "learning_rate": 5.1532570386019944e-06, "loss": 0.7746, "mean_token_accuracy": 0.7675014138221741, "step": 5245 }, { "epoch": 0.9656060327386427, "grad_norm": 0.8875696215604679, "learning_rate": 5.145387330916144e-06, "loss": 0.7988, "mean_token_accuracy": 0.7614070296287536, "step": 5250 }, { "epoch": 0.966525657531727, "grad_norm": 0.9405630235157387, "learning_rate": 5.137724382941557e-06, "loss": 0.7918, "mean_token_accuracy": 0.7650785088539124, "step": 5255 }, { "epoch": 0.9674452823248115, "grad_norm": 0.9562043810312459, "learning_rate": 5.130268265553487e-06, "loss": 0.8144, "mean_token_accuracy": 0.7557086706161499, "step": 5260 }, { "epoch": 0.9683649071178959, "grad_norm": 0.9274811086930055, "learning_rate": 5.123019047714198e-06, "loss": 0.7576, "mean_token_accuracy": 0.7753474235534668, "step": 5265 }, { "epoch": 0.9692845319109803, "grad_norm": 0.9409745943869224, "learning_rate": 5.115976796472322e-06, "loss": 0.8328, "mean_token_accuracy": 0.7535906672477722, "step": 5270 }, { "epoch": 0.9702041567040648, "grad_norm": 0.919927159373234, "learning_rate": 5.109141576962239e-06, "loss": 0.7912, "mean_token_accuracy": 0.7655844688415527, "step": 5275 }, { "epoch": 0.9711237814971492, "grad_norm": 0.951329112362283, "learning_rate": 5.102513452403473e-06, "loss": 0.7683, "mean_token_accuracy": 0.7696467399597168, "step": 5280 }, { "epoch": 0.9720434062902336, "grad_norm": 0.9201946233258363, "learning_rate": 5.0960924841001155e-06, "loss": 0.7988, "mean_token_accuracy": 0.7610312700271606, "step": 5285 }, { "epoch": 0.972963031083318, "grad_norm": 1.0032717462292577, "learning_rate": 5.089878731440241e-06, "loss": 0.821, "mean_token_accuracy": 0.7543939590454102, "step": 5290 }, { "epoch": 0.9738826558764024, "grad_norm": 0.9429172545610519, "learning_rate": 5.0838722518953816e-06, "loss": 0.7989, "mean_token_accuracy": 0.7595749855041504, "step": 5295 }, { "epoch": 0.9748022806694868, "grad_norm": 0.9007616401314099, "learning_rate": 5.078073101019974e-06, "loss": 0.8083, "mean_token_accuracy": 0.7579713940620423, "step": 5300 }, { "epoch": 0.9757219054625713, "grad_norm": 0.8990406462252963, "learning_rate": 5.072481332450857e-06, "loss": 0.8114, "mean_token_accuracy": 0.7577333807945251, "step": 5305 }, { "epoch": 0.9766415302556557, "grad_norm": 0.9615340254243923, "learning_rate": 5.067096997906774e-06, "loss": 0.7715, "mean_token_accuracy": 0.7705414056777954, "step": 5310 }, { "epoch": 0.9775611550487401, "grad_norm": 0.8455749234692341, "learning_rate": 5.06192014718789e-06, "loss": 0.7642, "mean_token_accuracy": 0.7697661995887757, "step": 5315 }, { "epoch": 0.9784807798418246, "grad_norm": 0.9292612449999305, "learning_rate": 5.05695082817534e-06, "loss": 0.7789, "mean_token_accuracy": 0.7671653866767884, "step": 5320 }, { "epoch": 0.979400404634909, "grad_norm": 0.9275056123774931, "learning_rate": 5.052189086830779e-06, "loss": 0.8018, "mean_token_accuracy": 0.7623230576515198, "step": 5325 }, { "epoch": 0.9803200294279933, "grad_norm": 0.9703545231339168, "learning_rate": 5.047634967195952e-06, "loss": 0.7877, "mean_token_accuracy": 0.7638481616973877, "step": 5330 }, { "epoch": 0.9812396542210778, "grad_norm": 0.955542417327297, "learning_rate": 5.043288511392302e-06, "loss": 0.7891, "mean_token_accuracy": 0.7614734530448913, "step": 5335 }, { "epoch": 0.9821592790141622, "grad_norm": 0.9645172124378145, "learning_rate": 5.039149759620569e-06, "loss": 0.7624, "mean_token_accuracy": 0.7724639177322388, "step": 5340 }, { "epoch": 0.9830789038072466, "grad_norm": 0.9734387825498484, "learning_rate": 5.0352187501604155e-06, "loss": 0.8579, "mean_token_accuracy": 0.746760880947113, "step": 5345 }, { "epoch": 0.9839985286003311, "grad_norm": 0.9730228991663388, "learning_rate": 5.031495519370083e-06, "loss": 0.8102, "mean_token_accuracy": 0.758979082107544, "step": 5350 }, { "epoch": 0.9849181533934155, "grad_norm": 1.0013660074202417, "learning_rate": 5.027980101686053e-06, "loss": 0.8396, "mean_token_accuracy": 0.7509408593177795, "step": 5355 }, { "epoch": 0.9858377781864999, "grad_norm": 0.9817157587290055, "learning_rate": 5.024672529622717e-06, "loss": 0.7935, "mean_token_accuracy": 0.7596516370773315, "step": 5360 }, { "epoch": 0.9867574029795844, "grad_norm": 0.9800745490721745, "learning_rate": 5.0215728337720955e-06, "loss": 0.7491, "mean_token_accuracy": 0.7768563270568848, "step": 5365 }, { "epoch": 0.9876770277726687, "grad_norm": 0.99189390574119, "learning_rate": 5.018681042803533e-06, "loss": 0.7759, "mean_token_accuracy": 0.7670275330543518, "step": 5370 }, { "epoch": 0.9885966525657531, "grad_norm": 0.9673022649880465, "learning_rate": 5.0159971834634545e-06, "loss": 0.7867, "mean_token_accuracy": 0.764349353313446, "step": 5375 }, { "epoch": 0.9895162773588376, "grad_norm": 1.0182176113772272, "learning_rate": 5.013521280575099e-06, "loss": 0.799, "mean_token_accuracy": 0.7618956327438354, "step": 5380 }, { "epoch": 0.990435902151922, "grad_norm": 0.9959171759739962, "learning_rate": 5.011253357038306e-06, "loss": 0.8392, "mean_token_accuracy": 0.7527823686599732, "step": 5385 }, { "epoch": 0.9913555269450064, "grad_norm": 0.8997528487054468, "learning_rate": 5.0091934338292915e-06, "loss": 0.7615, "mean_token_accuracy": 0.7715205192565918, "step": 5390 }, { "epoch": 0.9922751517380909, "grad_norm": 0.919462849827096, "learning_rate": 5.00734153000046e-06, "loss": 0.7409, "mean_token_accuracy": 0.77668297290802, "step": 5395 }, { "epoch": 0.9931947765311753, "grad_norm": 0.984326555402561, "learning_rate": 5.005697662680227e-06, "loss": 0.7989, "mean_token_accuracy": 0.7626922607421875, "step": 5400 }, { "epoch": 0.9941144013242597, "grad_norm": 0.9499542228497883, "learning_rate": 5.004261847072863e-06, "loss": 0.8283, "mean_token_accuracy": 0.7542143225669861, "step": 5405 }, { "epoch": 0.9950340261173442, "grad_norm": 0.9585799297597308, "learning_rate": 5.003034096458347e-06, "loss": 0.835, "mean_token_accuracy": 0.7544377326965332, "step": 5410 }, { "epoch": 0.9959536509104285, "grad_norm": 0.9165677599227604, "learning_rate": 5.0020144221922466e-06, "loss": 0.8013, "mean_token_accuracy": 0.7582892417907715, "step": 5415 }, { "epoch": 0.9968732757035129, "grad_norm": 0.9449991405622632, "learning_rate": 5.001202833705621e-06, "loss": 0.8352, "mean_token_accuracy": 0.7502840042114258, "step": 5420 }, { "epoch": 0.9977929004965974, "grad_norm": 0.9827477783752422, "learning_rate": 5.000599338504916e-06, "loss": 0.7931, "mean_token_accuracy": 0.762959897518158, "step": 5425 }, { "epoch": 0.9987125252896818, "grad_norm": 0.9751233701044131, "learning_rate": 5.0002039421719105e-06, "loss": 0.7978, "mean_token_accuracy": 0.7619426846504211, "step": 5430 }, { "epoch": 0.9996321500827662, "grad_norm": 0.971614941671036, "learning_rate": 5.000016648363663e-06, "loss": 0.801, "mean_token_accuracy": 0.7594120621681213, "step": 5435 }, { "epoch": 1.0, "mean_token_accuracy": 0.779580146074295, "step": 5437, "total_flos": 77442066677760.0, "train_loss": 0.8871173100675843, "train_runtime": 5515.7519, "train_samples_per_second": 15.771, "train_steps_per_second": 0.986 } ], "logging_steps": 5, "max_steps": 5437, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 77442066677760.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }