diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8739 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5437, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009196247930844216, + "grad_norm": 4.087223679622462, + "learning_rate": 9.191176470588236e-07, + "loss": 1.3446, + "mean_token_accuracy": 0.6661458969116211, + "step": 5 + }, + { + "epoch": 0.0018392495861688431, + "grad_norm": 3.3376471514991324, + "learning_rate": 1.8382352941176471e-06, + "loss": 1.2534, + "mean_token_accuracy": 0.6856188654899598, + "step": 10 + }, + { + "epoch": 0.0027588743792532648, + "grad_norm": 3.1883807133419646, + "learning_rate": 2.7573529411764708e-06, + "loss": 1.2495, + "mean_token_accuracy": 0.6844112038612366, + "step": 15 + }, + { + "epoch": 0.0036784991723376862, + "grad_norm": 2.5757356327081826, + "learning_rate": 3.6764705882352942e-06, + "loss": 1.1962, + "mean_token_accuracy": 0.6918170928955079, + "step": 20 + }, + { + "epoch": 0.004598123965422108, + "grad_norm": 2.3971194855376092, + "learning_rate": 4.595588235294118e-06, + "loss": 1.2274, + "mean_token_accuracy": 0.6844529986381531, + "step": 25 + }, + { + "epoch": 0.0055177487585065296, + "grad_norm": 2.00434532423879, + "learning_rate": 5.5147058823529415e-06, + "loss": 1.1506, + "mean_token_accuracy": 0.697660756111145, + "step": 30 + }, + { + "epoch": 0.006437373551590951, + "grad_norm": 2.0663662496595543, + "learning_rate": 6.433823529411764e-06, + "loss": 1.1278, + "mean_token_accuracy": 0.6973050832748413, + "step": 35 + }, + { + "epoch": 0.0073569983446753725, + "grad_norm": 1.9519049901829761, + "learning_rate": 7.3529411764705884e-06, + "loss": 1.102, + "mean_token_accuracy": 0.7046478033065796, + "step": 40 + }, + { + "epoch": 0.008276623137759793, + "grad_norm": 1.8451875842176761, + "learning_rate": 8.272058823529413e-06, + "loss": 1.125, + "mean_token_accuracy": 0.6951346158981323, + "step": 45 + }, + { + "epoch": 0.009196247930844215, + "grad_norm": 2.000034845742239, + "learning_rate": 9.191176470588236e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.7154734015464783, + "step": 50 + }, + { + "epoch": 0.010115872723928637, + "grad_norm": 1.621484821283711, + "learning_rate": 1.011029411764706e-05, + "loss": 1.0762, + "mean_token_accuracy": 0.706468117237091, + "step": 55 + }, + { + "epoch": 0.011035497517013059, + "grad_norm": 1.753826025706781, + "learning_rate": 1.1029411764705883e-05, + "loss": 1.0394, + "mean_token_accuracy": 0.7156139016151428, + "step": 60 + }, + { + "epoch": 0.011955122310097481, + "grad_norm": 1.6505676536191385, + "learning_rate": 1.1948529411764707e-05, + "loss": 1.0338, + "mean_token_accuracy": 0.7132004976272583, + "step": 65 + }, + { + "epoch": 0.012874747103181901, + "grad_norm": 1.8513933357249144, + "learning_rate": 1.2867647058823528e-05, + "loss": 0.9804, + "mean_token_accuracy": 0.7274341702461242, + "step": 70 + }, + { + "epoch": 0.013794371896266323, + "grad_norm": 2.4070230665851993, + "learning_rate": 1.3786764705882355e-05, + "loss": 1.0398, + "mean_token_accuracy": 0.7116599082946777, + "step": 75 + }, + { + "epoch": 0.014713996689350745, + "grad_norm": 1.798866895809756, + "learning_rate": 1.4705882352941177e-05, + "loss": 0.9922, + "mean_token_accuracy": 0.720504081249237, + "step": 80 + }, + { + "epoch": 0.015633621482435165, + "grad_norm": 1.709611126629724, + "learning_rate": 1.5625e-05, + "loss": 0.9938, + "mean_token_accuracy": 0.7247263193130493, + "step": 85 + }, + { + "epoch": 0.016553246275519587, + "grad_norm": 1.7626425485303618, + "learning_rate": 1.6544117647058825e-05, + "loss": 1.0122, + "mean_token_accuracy": 0.717292582988739, + "step": 90 + }, + { + "epoch": 0.01747287106860401, + "grad_norm": 2.036503882503329, + "learning_rate": 1.7463235294117647e-05, + "loss": 1.0109, + "mean_token_accuracy": 0.7172105073928833, + "step": 95 + }, + { + "epoch": 0.01839249586168843, + "grad_norm": 1.927409741133158, + "learning_rate": 1.8382352941176472e-05, + "loss": 1.0434, + "mean_token_accuracy": 0.7078547954559327, + "step": 100 + }, + { + "epoch": 0.019312120654772853, + "grad_norm": 2.079665033278075, + "learning_rate": 1.9301470588235298e-05, + "loss": 0.9959, + "mean_token_accuracy": 0.7182355523109436, + "step": 105 + }, + { + "epoch": 0.020231745447857274, + "grad_norm": 1.8479982769163703, + "learning_rate": 2.022058823529412e-05, + "loss": 1.0194, + "mean_token_accuracy": 0.7173629522323608, + "step": 110 + }, + { + "epoch": 0.021151370240941696, + "grad_norm": 1.831806807070413, + "learning_rate": 2.113970588235294e-05, + "loss": 0.9569, + "mean_token_accuracy": 0.7312556385993958, + "step": 115 + }, + { + "epoch": 0.022070995034026118, + "grad_norm": 1.7952413093248756, + "learning_rate": 2.2058823529411766e-05, + "loss": 1.0149, + "mean_token_accuracy": 0.7192024111747741, + "step": 120 + }, + { + "epoch": 0.02299061982711054, + "grad_norm": 1.6441769080980864, + "learning_rate": 2.2977941176470588e-05, + "loss": 0.9668, + "mean_token_accuracy": 0.7280102610588074, + "step": 125 + }, + { + "epoch": 0.023910244620194962, + "grad_norm": 1.7182187182460715, + "learning_rate": 2.3897058823529413e-05, + "loss": 1.025, + "mean_token_accuracy": 0.7164386153221131, + "step": 130 + }, + { + "epoch": 0.02482986941327938, + "grad_norm": 1.7665031820505241, + "learning_rate": 2.4816176470588238e-05, + "loss": 0.9879, + "mean_token_accuracy": 0.7216517567634583, + "step": 135 + }, + { + "epoch": 0.025749494206363802, + "grad_norm": 1.65781753659198, + "learning_rate": 2.5735294117647057e-05, + "loss": 1.0204, + "mean_token_accuracy": 0.7183511853218079, + "step": 140 + }, + { + "epoch": 0.026669118999448224, + "grad_norm": 1.5947996494100198, + "learning_rate": 2.6654411764705882e-05, + "loss": 0.9915, + "mean_token_accuracy": 0.7210009098052979, + "step": 145 + }, + { + "epoch": 0.027588743792532646, + "grad_norm": 1.6195741488866147, + "learning_rate": 2.757352941176471e-05, + "loss": 0.9609, + "mean_token_accuracy": 0.7290344476699829, + "step": 150 + }, + { + "epoch": 0.028508368585617068, + "grad_norm": 1.700795937176488, + "learning_rate": 2.849264705882353e-05, + "loss": 1.0017, + "mean_token_accuracy": 0.7190845251083374, + "step": 155 + }, + { + "epoch": 0.02942799337870149, + "grad_norm": 1.6626957868958252, + "learning_rate": 2.9411764705882354e-05, + "loss": 0.9801, + "mean_token_accuracy": 0.7264268517494201, + "step": 160 + }, + { + "epoch": 0.03034761817178591, + "grad_norm": 1.646176772035618, + "learning_rate": 3.0330882352941176e-05, + "loss": 0.9819, + "mean_token_accuracy": 0.7258347868919373, + "step": 165 + }, + { + "epoch": 0.03126724296487033, + "grad_norm": 1.7051406597026453, + "learning_rate": 3.125e-05, + "loss": 1.0021, + "mean_token_accuracy": 0.7193678379058838, + "step": 170 + }, + { + "epoch": 0.032186867757954755, + "grad_norm": 1.6583599673202631, + "learning_rate": 3.2169117647058826e-05, + "loss": 0.9863, + "mean_token_accuracy": 0.7218608260154724, + "step": 175 + }, + { + "epoch": 0.033106492551039174, + "grad_norm": 1.6811054631655953, + "learning_rate": 3.308823529411765e-05, + "loss": 0.9776, + "mean_token_accuracy": 0.7252245903015136, + "step": 180 + }, + { + "epoch": 0.0340261173441236, + "grad_norm": 1.6005295960642778, + "learning_rate": 3.4007352941176476e-05, + "loss": 0.952, + "mean_token_accuracy": 0.7300998091697692, + "step": 185 + }, + { + "epoch": 0.03494574213720802, + "grad_norm": 1.884741061084924, + "learning_rate": 3.4926470588235294e-05, + "loss": 1.0216, + "mean_token_accuracy": 0.7144460439682007, + "step": 190 + }, + { + "epoch": 0.03586536693029244, + "grad_norm": 1.61333499821342, + "learning_rate": 3.584558823529412e-05, + "loss": 1.0067, + "mean_token_accuracy": 0.7160724878311158, + "step": 195 + }, + { + "epoch": 0.03678499172337686, + "grad_norm": 1.592957572722435, + "learning_rate": 3.6764705882352945e-05, + "loss": 0.9367, + "mean_token_accuracy": 0.7348474979400634, + "step": 200 + }, + { + "epoch": 0.03770461651646129, + "grad_norm": 1.7666690880786284, + "learning_rate": 3.768382352941176e-05, + "loss": 0.9545, + "mean_token_accuracy": 0.7297826528549194, + "step": 205 + }, + { + "epoch": 0.038624241309545705, + "grad_norm": 1.5696177739032589, + "learning_rate": 3.8602941176470595e-05, + "loss": 1.0076, + "mean_token_accuracy": 0.7160616636276245, + "step": 210 + }, + { + "epoch": 0.039543866102630124, + "grad_norm": 1.5375849975431441, + "learning_rate": 3.952205882352941e-05, + "loss": 1.0082, + "mean_token_accuracy": 0.7139402985572815, + "step": 215 + }, + { + "epoch": 0.04046349089571455, + "grad_norm": 1.6613621558577687, + "learning_rate": 4.044117647058824e-05, + "loss": 1.0047, + "mean_token_accuracy": 0.7157810091972351, + "step": 220 + }, + { + "epoch": 0.04138311568879897, + "grad_norm": 1.6712866586887962, + "learning_rate": 4.136029411764706e-05, + "loss": 0.9841, + "mean_token_accuracy": 0.7261144757270813, + "step": 225 + }, + { + "epoch": 0.04230274048188339, + "grad_norm": 1.5868739813391535, + "learning_rate": 4.227941176470588e-05, + "loss": 1.0063, + "mean_token_accuracy": 0.7146228194236756, + "step": 230 + }, + { + "epoch": 0.04322236527496781, + "grad_norm": 1.4745940440239442, + "learning_rate": 4.319852941176471e-05, + "loss": 0.9895, + "mean_token_accuracy": 0.7205227255821228, + "step": 235 + }, + { + "epoch": 0.044141990068052236, + "grad_norm": 1.565812920746474, + "learning_rate": 4.411764705882353e-05, + "loss": 0.9883, + "mean_token_accuracy": 0.7221224546432495, + "step": 240 + }, + { + "epoch": 0.045061614861136655, + "grad_norm": 1.579279007990175, + "learning_rate": 4.503676470588236e-05, + "loss": 1.0339, + "mean_token_accuracy": 0.7140692472457886, + "step": 245 + }, + { + "epoch": 0.04598123965422108, + "grad_norm": 1.550674625710887, + "learning_rate": 4.5955882352941176e-05, + "loss": 1.009, + "mean_token_accuracy": 0.717827045917511, + "step": 250 + }, + { + "epoch": 0.0469008644473055, + "grad_norm": 1.494069442893164, + "learning_rate": 4.6875e-05, + "loss": 1.0163, + "mean_token_accuracy": 0.7157993316650391, + "step": 255 + }, + { + "epoch": 0.047820489240389924, + "grad_norm": 1.585433590429472, + "learning_rate": 4.7794117647058826e-05, + "loss": 0.9662, + "mean_token_accuracy": 0.7260660767555237, + "step": 260 + }, + { + "epoch": 0.04874011403347434, + "grad_norm": 1.5561077784742092, + "learning_rate": 4.871323529411765e-05, + "loss": 1.0521, + "mean_token_accuracy": 0.7059531569480896, + "step": 265 + }, + { + "epoch": 0.04965973882655876, + "grad_norm": 1.3842507274813078, + "learning_rate": 4.9632352941176476e-05, + "loss": 0.96, + "mean_token_accuracy": 0.7317641496658325, + "step": 270 + }, + { + "epoch": 0.050579363619643186, + "grad_norm": 1.4379239878799341, + "learning_rate": 4.999996254118754e-05, + "loss": 0.972, + "mean_token_accuracy": 0.7297493696212769, + "step": 275 + }, + { + "epoch": 0.051498988412727605, + "grad_norm": 1.3761784967587591, + "learning_rate": 4.999973362667417e-05, + "loss": 0.9844, + "mean_token_accuracy": 0.724224853515625, + "step": 280 + }, + { + "epoch": 0.05241861320581203, + "grad_norm": 1.4249636066532947, + "learning_rate": 4.999929661021346e-05, + "loss": 0.9974, + "mean_token_accuracy": 0.7186186075210571, + "step": 285 + }, + { + "epoch": 0.05333823799889645, + "grad_norm": 1.6467747117004, + "learning_rate": 4.9998651495847435e-05, + "loss": 1.0296, + "mean_token_accuracy": 0.7110173583030701, + "step": 290 + }, + { + "epoch": 0.054257862791980874, + "grad_norm": 1.3761801455599358, + "learning_rate": 4.9997798289542816e-05, + "loss": 1.0209, + "mean_token_accuracy": 0.7124481081962586, + "step": 295 + }, + { + "epoch": 0.05517748758506529, + "grad_norm": 1.4585308096786376, + "learning_rate": 4.9996736999190965e-05, + "loss": 1.0248, + "mean_token_accuracy": 0.7100600242614746, + "step": 300 + }, + { + "epoch": 0.05609711237814972, + "grad_norm": 1.4301378065367794, + "learning_rate": 4.999546763460785e-05, + "loss": 0.9864, + "mean_token_accuracy": 0.7253738522529602, + "step": 305 + }, + { + "epoch": 0.057016737171234136, + "grad_norm": 1.4586102770676173, + "learning_rate": 4.999399020753393e-05, + "loss": 0.9541, + "mean_token_accuracy": 0.7308779239654541, + "step": 310 + }, + { + "epoch": 0.05793636196431856, + "grad_norm": 1.5007400960218442, + "learning_rate": 4.999230473163406e-05, + "loss": 1.0123, + "mean_token_accuracy": 0.7142405152320862, + "step": 315 + }, + { + "epoch": 0.05885598675740298, + "grad_norm": 1.4247385882584611, + "learning_rate": 4.999041122249735e-05, + "loss": 1.0097, + "mean_token_accuracy": 0.7164065957069397, + "step": 320 + }, + { + "epoch": 0.0597756115504874, + "grad_norm": 1.4338281584111965, + "learning_rate": 4.9988309697637025e-05, + "loss": 1.0381, + "mean_token_accuracy": 0.7093045115470886, + "step": 325 + }, + { + "epoch": 0.06069523634357182, + "grad_norm": 1.3206321897141915, + "learning_rate": 4.9986000176490264e-05, + "loss": 1.0378, + "mean_token_accuracy": 0.7081658363342285, + "step": 330 + }, + { + "epoch": 0.06161486113665624, + "grad_norm": 1.4771390057019052, + "learning_rate": 4.998348268041803e-05, + "loss": 1.0473, + "mean_token_accuracy": 0.7044042825698853, + "step": 335 + }, + { + "epoch": 0.06253448592974066, + "grad_norm": 1.410427294901373, + "learning_rate": 4.9980757232704836e-05, + "loss": 1.0476, + "mean_token_accuracy": 0.7044672727584839, + "step": 340 + }, + { + "epoch": 0.06345411072282509, + "grad_norm": 1.293731368317575, + "learning_rate": 4.997782385855862e-05, + "loss": 0.9809, + "mean_token_accuracy": 0.7207650065422058, + "step": 345 + }, + { + "epoch": 0.06437373551590951, + "grad_norm": 1.373213488697433, + "learning_rate": 4.9974682585110375e-05, + "loss": 1.0238, + "mean_token_accuracy": 0.713714337348938, + "step": 350 + }, + { + "epoch": 0.06529336030899394, + "grad_norm": 1.4173612737543944, + "learning_rate": 4.997133344141402e-05, + "loss": 0.9995, + "mean_token_accuracy": 0.7182128310203553, + "step": 355 + }, + { + "epoch": 0.06621298510207835, + "grad_norm": 1.4208487527297817, + "learning_rate": 4.9967776458446067e-05, + "loss": 1.0247, + "mean_token_accuracy": 0.7120985150337219, + "step": 360 + }, + { + "epoch": 0.06713260989516277, + "grad_norm": 1.3468936690832556, + "learning_rate": 4.996401166910535e-05, + "loss": 1.0257, + "mean_token_accuracy": 0.711448609828949, + "step": 365 + }, + { + "epoch": 0.0680522346882472, + "grad_norm": 1.3418384776624692, + "learning_rate": 4.996003910821273e-05, + "loss": 0.9908, + "mean_token_accuracy": 0.7198069810867309, + "step": 370 + }, + { + "epoch": 0.06897185948133161, + "grad_norm": 1.2757020291626893, + "learning_rate": 4.995585881251076e-05, + "loss": 1.0029, + "mean_token_accuracy": 0.7165916681289672, + "step": 375 + }, + { + "epoch": 0.06989148427441604, + "grad_norm": 1.2215136508098425, + "learning_rate": 4.995147082066335e-05, + "loss": 1.0071, + "mean_token_accuracy": 0.7161303281784057, + "step": 380 + }, + { + "epoch": 0.07081110906750046, + "grad_norm": 1.5100364277085054, + "learning_rate": 4.9946875173255405e-05, + "loss": 0.9808, + "mean_token_accuracy": 0.7223702430725097, + "step": 385 + }, + { + "epoch": 0.07173073386058489, + "grad_norm": 1.3193074150499653, + "learning_rate": 4.9942071912792463e-05, + "loss": 0.9692, + "mean_token_accuracy": 0.7253165245056152, + "step": 390 + }, + { + "epoch": 0.0726503586536693, + "grad_norm": 1.360795639773644, + "learning_rate": 4.9937061083700286e-05, + "loss": 0.9248, + "mean_token_accuracy": 0.738149356842041, + "step": 395 + }, + { + "epoch": 0.07356998344675372, + "grad_norm": 1.3934617241628962, + "learning_rate": 4.993184273232445e-05, + "loss": 1.0174, + "mean_token_accuracy": 0.7140317440032959, + "step": 400 + }, + { + "epoch": 0.07448960823983815, + "grad_norm": 1.3755761090465115, + "learning_rate": 4.9926416906929954e-05, + "loss": 0.9371, + "mean_token_accuracy": 0.7347567915916443, + "step": 405 + }, + { + "epoch": 0.07540923303292257, + "grad_norm": 1.3123084901189321, + "learning_rate": 4.9920783657700685e-05, + "loss": 1.0494, + "mean_token_accuracy": 0.7046082258224488, + "step": 410 + }, + { + "epoch": 0.07632885782600698, + "grad_norm": 1.26236320940822, + "learning_rate": 4.9914943036739075e-05, + "loss": 0.9813, + "mean_token_accuracy": 0.7248732924461365, + "step": 415 + }, + { + "epoch": 0.07724848261909141, + "grad_norm": 1.4072657383382854, + "learning_rate": 4.99088950980655e-05, + "loss": 1.0041, + "mean_token_accuracy": 0.7161918520927429, + "step": 420 + }, + { + "epoch": 0.07816810741217584, + "grad_norm": 1.4142932157820918, + "learning_rate": 4.9902639897617876e-05, + "loss": 1.0343, + "mean_token_accuracy": 0.7073235511779785, + "step": 425 + }, + { + "epoch": 0.07908773220526025, + "grad_norm": 1.2620775477382082, + "learning_rate": 4.9896177493251065e-05, + "loss": 0.9773, + "mean_token_accuracy": 0.724228036403656, + "step": 430 + }, + { + "epoch": 0.08000735699834467, + "grad_norm": 1.2299977431090294, + "learning_rate": 4.9889507944736405e-05, + "loss": 0.9921, + "mean_token_accuracy": 0.7193984985351562, + "step": 435 + }, + { + "epoch": 0.0809269817914291, + "grad_norm": 1.272005618491772, + "learning_rate": 4.9882631313761116e-05, + "loss": 1.0266, + "mean_token_accuracy": 0.7106949806213378, + "step": 440 + }, + { + "epoch": 0.08184660658451352, + "grad_norm": 1.3368998742271194, + "learning_rate": 4.9875547663927744e-05, + "loss": 0.9945, + "mean_token_accuracy": 0.7178430318832397, + "step": 445 + }, + { + "epoch": 0.08276623137759793, + "grad_norm": 1.2395804635484349, + "learning_rate": 4.986825706075357e-05, + "loss": 0.9614, + "mean_token_accuracy": 0.7270126938819885, + "step": 450 + }, + { + "epoch": 0.08368585617068236, + "grad_norm": 1.2355105682399337, + "learning_rate": 4.9860759571669987e-05, + "loss": 1.017, + "mean_token_accuracy": 0.7113536357879638, + "step": 455 + }, + { + "epoch": 0.08460548096376679, + "grad_norm": 1.2769471363849882, + "learning_rate": 4.985305526602192e-05, + "loss": 0.9841, + "mean_token_accuracy": 0.7207873582839965, + "step": 460 + }, + { + "epoch": 0.08552510575685121, + "grad_norm": 1.3105851965485462, + "learning_rate": 4.984514421506715e-05, + "loss": 1.0238, + "mean_token_accuracy": 0.7113570213317871, + "step": 465 + }, + { + "epoch": 0.08644473054993562, + "grad_norm": 1.2226583029739935, + "learning_rate": 4.983702649197565e-05, + "loss": 1.0026, + "mean_token_accuracy": 0.7175478458404541, + "step": 470 + }, + { + "epoch": 0.08736435534302005, + "grad_norm": 1.3032963672614144, + "learning_rate": 4.982870217182893e-05, + "loss": 1.0102, + "mean_token_accuracy": 0.7142111778259277, + "step": 475 + }, + { + "epoch": 0.08828398013610447, + "grad_norm": 1.276533355049304, + "learning_rate": 4.9820171331619343e-05, + "loss": 1.0175, + "mean_token_accuracy": 0.7140154242515564, + "step": 480 + }, + { + "epoch": 0.08920360492918888, + "grad_norm": 1.3275369586760475, + "learning_rate": 4.981143405024936e-05, + "loss": 0.9664, + "mean_token_accuracy": 0.7251969814300537, + "step": 485 + }, + { + "epoch": 0.09012322972227331, + "grad_norm": 1.322475452296982, + "learning_rate": 4.980249040853081e-05, + "loss": 0.9572, + "mean_token_accuracy": 0.7284212589263916, + "step": 490 + }, + { + "epoch": 0.09104285451535774, + "grad_norm": 1.2219967426964762, + "learning_rate": 4.979334048918422e-05, + "loss": 1.0265, + "mean_token_accuracy": 0.7094637989997864, + "step": 495 + }, + { + "epoch": 0.09196247930844216, + "grad_norm": 1.2500649142513325, + "learning_rate": 4.978398437683797e-05, + "loss": 0.9429, + "mean_token_accuracy": 0.7309910893440247, + "step": 500 + }, + { + "epoch": 0.09288210410152657, + "grad_norm": 1.2382649121413325, + "learning_rate": 4.977442215802753e-05, + "loss": 1.0142, + "mean_token_accuracy": 0.7163145303726196, + "step": 505 + }, + { + "epoch": 0.093801728894611, + "grad_norm": 1.2494735942714719, + "learning_rate": 4.976465392119467e-05, + "loss": 0.9711, + "mean_token_accuracy": 0.7253948450088501, + "step": 510 + }, + { + "epoch": 0.09472135368769542, + "grad_norm": 1.1320102641208292, + "learning_rate": 4.9754679756686654e-05, + "loss": 0.9754, + "mean_token_accuracy": 0.7240365982055664, + "step": 515 + }, + { + "epoch": 0.09564097848077985, + "grad_norm": 1.2636397583226155, + "learning_rate": 4.974449975675538e-05, + "loss": 0.9683, + "mean_token_accuracy": 0.7268050789833069, + "step": 520 + }, + { + "epoch": 0.09656060327386426, + "grad_norm": 1.2638605012202537, + "learning_rate": 4.9734114015556506e-05, + "loss": 0.994, + "mean_token_accuracy": 0.7192271828651429, + "step": 525 + }, + { + "epoch": 0.09748022806694868, + "grad_norm": 1.3539672940723328, + "learning_rate": 4.972352262914867e-05, + "loss": 1.0219, + "mean_token_accuracy": 0.712011969089508, + "step": 530 + }, + { + "epoch": 0.09839985286003311, + "grad_norm": 1.2622022574950933, + "learning_rate": 4.971272569549246e-05, + "loss": 0.9993, + "mean_token_accuracy": 0.717021644115448, + "step": 535 + }, + { + "epoch": 0.09931947765311752, + "grad_norm": 1.2498621609285703, + "learning_rate": 4.970172331444968e-05, + "loss": 0.9869, + "mean_token_accuracy": 0.7201068043708801, + "step": 540 + }, + { + "epoch": 0.10023910244620195, + "grad_norm": 1.2563183037951813, + "learning_rate": 4.969051558778226e-05, + "loss": 1.0328, + "mean_token_accuracy": 0.7072706580162048, + "step": 545 + }, + { + "epoch": 0.10115872723928637, + "grad_norm": 1.1583096373701225, + "learning_rate": 4.967910261915142e-05, + "loss": 1.0073, + "mean_token_accuracy": 0.7176116108894348, + "step": 550 + }, + { + "epoch": 0.1020783520323708, + "grad_norm": 1.2337310449325847, + "learning_rate": 4.966748451411668e-05, + "loss": 1.0075, + "mean_token_accuracy": 0.7166797518730164, + "step": 555 + }, + { + "epoch": 0.10299797682545521, + "grad_norm": 1.187463601840395, + "learning_rate": 4.9655661380134874e-05, + "loss": 0.9978, + "mean_token_accuracy": 0.7187446594238281, + "step": 560 + }, + { + "epoch": 0.10391760161853963, + "grad_norm": 1.1950175317081544, + "learning_rate": 4.964363332655918e-05, + "loss": 1.0127, + "mean_token_accuracy": 0.7141183018684387, + "step": 565 + }, + { + "epoch": 0.10483722641162406, + "grad_norm": 1.1797983108141703, + "learning_rate": 4.9631400464638074e-05, + "loss": 1.0058, + "mean_token_accuracy": 0.7147095799446106, + "step": 570 + }, + { + "epoch": 0.10575685120470849, + "grad_norm": 1.3194739883489515, + "learning_rate": 4.961896290751434e-05, + "loss": 1.0125, + "mean_token_accuracy": 0.7156966686248779, + "step": 575 + }, + { + "epoch": 0.1066764759977929, + "grad_norm": 1.232197096442626, + "learning_rate": 4.960632077022402e-05, + "loss": 1.0096, + "mean_token_accuracy": 0.7136348843574524, + "step": 580 + }, + { + "epoch": 0.10759610079087732, + "grad_norm": 1.1109964489025674, + "learning_rate": 4.959347416969529e-05, + "loss": 0.9782, + "mean_token_accuracy": 0.7218139052391053, + "step": 585 + }, + { + "epoch": 0.10851572558396175, + "grad_norm": 1.1118328480221105, + "learning_rate": 4.958042322474747e-05, + "loss": 0.9138, + "mean_token_accuracy": 0.7406689524650574, + "step": 590 + }, + { + "epoch": 0.10943535037704616, + "grad_norm": 1.1550688598895895, + "learning_rate": 4.956716805608984e-05, + "loss": 1.0123, + "mean_token_accuracy": 0.7150320529937744, + "step": 595 + }, + { + "epoch": 0.11035497517013058, + "grad_norm": 1.2400379075265455, + "learning_rate": 4.955370878632058e-05, + "loss": 0.9642, + "mean_token_accuracy": 0.7274539470672607, + "step": 600 + }, + { + "epoch": 0.11127459996321501, + "grad_norm": 1.1266451881904362, + "learning_rate": 4.954004553992564e-05, + "loss": 0.9597, + "mean_token_accuracy": 0.7269688129425049, + "step": 605 + }, + { + "epoch": 0.11219422475629943, + "grad_norm": 1.195410688726218, + "learning_rate": 4.952617844327753e-05, + "loss": 0.9667, + "mean_token_accuracy": 0.7273669600486755, + "step": 610 + }, + { + "epoch": 0.11311384954938385, + "grad_norm": 1.2168436664941074, + "learning_rate": 4.951210762463421e-05, + "loss": 0.981, + "mean_token_accuracy": 0.7224032163619996, + "step": 615 + }, + { + "epoch": 0.11403347434246827, + "grad_norm": 1.1158577605300688, + "learning_rate": 4.949783321413787e-05, + "loss": 1.0133, + "mean_token_accuracy": 0.7140767455101014, + "step": 620 + }, + { + "epoch": 0.1149530991355527, + "grad_norm": 1.2227500677211205, + "learning_rate": 4.948335534381375e-05, + "loss": 1.0178, + "mean_token_accuracy": 0.7107774257659912, + "step": 625 + }, + { + "epoch": 0.11587272392863712, + "grad_norm": 1.1733820093333545, + "learning_rate": 4.9468674147568906e-05, + "loss": 0.9496, + "mean_token_accuracy": 0.7264823913574219, + "step": 630 + }, + { + "epoch": 0.11679234872172153, + "grad_norm": 1.1456005644666878, + "learning_rate": 4.945378976119096e-05, + "loss": 1.0301, + "mean_token_accuracy": 0.7111668229103089, + "step": 635 + }, + { + "epoch": 0.11771197351480596, + "grad_norm": 1.176194033859284, + "learning_rate": 4.943870232234688e-05, + "loss": 0.9904, + "mean_token_accuracy": 0.7183448076248169, + "step": 640 + }, + { + "epoch": 0.11863159830789038, + "grad_norm": 1.1767555657667275, + "learning_rate": 4.9423411970581656e-05, + "loss": 0.9565, + "mean_token_accuracy": 0.7282203912734986, + "step": 645 + }, + { + "epoch": 0.1195512231009748, + "grad_norm": 1.1593918150017006, + "learning_rate": 4.940791884731706e-05, + "loss": 0.9629, + "mean_token_accuracy": 0.7265506267547608, + "step": 650 + }, + { + "epoch": 0.12047084789405922, + "grad_norm": 1.1809244906539653, + "learning_rate": 4.939222309585029e-05, + "loss": 0.9506, + "mean_token_accuracy": 0.7299855709075928, + "step": 655 + }, + { + "epoch": 0.12139047268714365, + "grad_norm": 1.187342482868558, + "learning_rate": 4.93763248613527e-05, + "loss": 0.9873, + "mean_token_accuracy": 0.7208028793334961, + "step": 660 + }, + { + "epoch": 0.12231009748022807, + "grad_norm": 1.1643370561641233, + "learning_rate": 4.936022429086841e-05, + "loss": 1.019, + "mean_token_accuracy": 0.7111838817596435, + "step": 665 + }, + { + "epoch": 0.12322972227331248, + "grad_norm": 1.1548281507110767, + "learning_rate": 4.9343921533312955e-05, + "loss": 0.949, + "mean_token_accuracy": 0.7271883249282837, + "step": 670 + }, + { + "epoch": 0.12414934706639691, + "grad_norm": 1.1323282418083014, + "learning_rate": 4.9327416739471935e-05, + "loss": 0.9269, + "mean_token_accuracy": 0.737087082862854, + "step": 675 + }, + { + "epoch": 0.12506897185948132, + "grad_norm": 1.2363897419233494, + "learning_rate": 4.9310710061999575e-05, + "loss": 1.0061, + "mean_token_accuracy": 0.714658522605896, + "step": 680 + }, + { + "epoch": 0.12598859665256576, + "grad_norm": 1.15808211817011, + "learning_rate": 4.9293801655417366e-05, + "loss": 0.9426, + "mean_token_accuracy": 0.7324698209762573, + "step": 685 + }, + { + "epoch": 0.12690822144565017, + "grad_norm": 1.168156282468429, + "learning_rate": 4.927669167611259e-05, + "loss": 0.9516, + "mean_token_accuracy": 0.726858627796173, + "step": 690 + }, + { + "epoch": 0.12782784623873458, + "grad_norm": 1.1708412963628498, + "learning_rate": 4.92593802823369e-05, + "loss": 0.9565, + "mean_token_accuracy": 0.7281310319900512, + "step": 695 + }, + { + "epoch": 0.12874747103181902, + "grad_norm": 1.150205433303024, + "learning_rate": 4.924186763420486e-05, + "loss": 0.9966, + "mean_token_accuracy": 0.7196317195892334, + "step": 700 + }, + { + "epoch": 0.12966709582490343, + "grad_norm": 1.1412449351652514, + "learning_rate": 4.922415389369243e-05, + "loss": 0.9393, + "mean_token_accuracy": 0.7308167576789856, + "step": 705 + }, + { + "epoch": 0.13058672061798787, + "grad_norm": 1.2590368311590696, + "learning_rate": 4.9206239224635486e-05, + "loss": 0.9961, + "mean_token_accuracy": 0.7167337894439697, + "step": 710 + }, + { + "epoch": 0.13150634541107228, + "grad_norm": 1.1862573902159457, + "learning_rate": 4.9188123792728344e-05, + "loss": 0.9991, + "mean_token_accuracy": 0.71655353307724, + "step": 715 + }, + { + "epoch": 0.1324259702041567, + "grad_norm": 1.1728642333915622, + "learning_rate": 4.916980776552218e-05, + "loss": 0.9354, + "mean_token_accuracy": 0.734131133556366, + "step": 720 + }, + { + "epoch": 0.13334559499724113, + "grad_norm": 1.208191683152181, + "learning_rate": 4.915129131242345e-05, + "loss": 0.9578, + "mean_token_accuracy": 0.7278777837753296, + "step": 725 + }, + { + "epoch": 0.13426521979032555, + "grad_norm": 1.138309077411327, + "learning_rate": 4.913257460469243e-05, + "loss": 0.9448, + "mean_token_accuracy": 0.7303597450256347, + "step": 730 + }, + { + "epoch": 0.13518484458340996, + "grad_norm": 1.1410024150973699, + "learning_rate": 4.911365781544153e-05, + "loss": 0.9765, + "mean_token_accuracy": 0.7208934783935547, + "step": 735 + }, + { + "epoch": 0.1361044693764944, + "grad_norm": 1.135207319109893, + "learning_rate": 4.9094541119633756e-05, + "loss": 0.9625, + "mean_token_accuracy": 0.7279266119003296, + "step": 740 + }, + { + "epoch": 0.1370240941695788, + "grad_norm": 1.1470179542343784, + "learning_rate": 4.907522469408103e-05, + "loss": 1.0099, + "mean_token_accuracy": 0.7129136681556701, + "step": 745 + }, + { + "epoch": 0.13794371896266322, + "grad_norm": 1.1186516076443083, + "learning_rate": 4.905570871744262e-05, + "loss": 0.9492, + "mean_token_accuracy": 0.7295220971107483, + "step": 750 + }, + { + "epoch": 0.13886334375574766, + "grad_norm": 1.188235501807293, + "learning_rate": 4.903599337022345e-05, + "loss": 0.9158, + "mean_token_accuracy": 0.7392297148704529, + "step": 755 + }, + { + "epoch": 0.13978296854883207, + "grad_norm": 1.156585568722138, + "learning_rate": 4.9016078834772436e-05, + "loss": 1.0069, + "mean_token_accuracy": 0.7133058428764343, + "step": 760 + }, + { + "epoch": 0.1407025933419165, + "grad_norm": 1.0550430464679208, + "learning_rate": 4.899596529528083e-05, + "loss": 0.9804, + "mean_token_accuracy": 0.7237313628196717, + "step": 765 + }, + { + "epoch": 0.14162221813500092, + "grad_norm": 1.0828080346302627, + "learning_rate": 4.897565293778045e-05, + "loss": 0.9398, + "mean_token_accuracy": 0.7297361016273498, + "step": 770 + }, + { + "epoch": 0.14254184292808533, + "grad_norm": 1.0748821988518662, + "learning_rate": 4.895514195014201e-05, + "loss": 0.9512, + "mean_token_accuracy": 0.727254593372345, + "step": 775 + }, + { + "epoch": 0.14346146772116977, + "grad_norm": 1.1000801031665166, + "learning_rate": 4.893443252207339e-05, + "loss": 0.96, + "mean_token_accuracy": 0.7277865290641785, + "step": 780 + }, + { + "epoch": 0.14438109251425418, + "grad_norm": 1.1979288214254857, + "learning_rate": 4.891352484511783e-05, + "loss": 0.9904, + "mean_token_accuracy": 0.7203876137733459, + "step": 785 + }, + { + "epoch": 0.1453007173073386, + "grad_norm": 1.0336978471065938, + "learning_rate": 4.889241911265224e-05, + "loss": 0.9512, + "mean_token_accuracy": 0.7298694252967834, + "step": 790 + }, + { + "epoch": 0.14622034210042303, + "grad_norm": 1.093196247221492, + "learning_rate": 4.887111551988531e-05, + "loss": 1.0404, + "mean_token_accuracy": 0.7045328140258789, + "step": 795 + }, + { + "epoch": 0.14713996689350745, + "grad_norm": 1.224732532168464, + "learning_rate": 4.884961426385578e-05, + "loss": 1.0189, + "mean_token_accuracy": 0.7101276278495788, + "step": 800 + }, + { + "epoch": 0.14805959168659186, + "grad_norm": 1.1751595598375444, + "learning_rate": 4.8827915543430604e-05, + "loss": 0.9166, + "mean_token_accuracy": 0.7369141817092896, + "step": 805 + }, + { + "epoch": 0.1489792164796763, + "grad_norm": 1.0711984590567727, + "learning_rate": 4.880601955930308e-05, + "loss": 0.9528, + "mean_token_accuracy": 0.7275946021080018, + "step": 810 + }, + { + "epoch": 0.1498988412727607, + "grad_norm": 1.1523849563074238, + "learning_rate": 4.878392651399103e-05, + "loss": 0.9724, + "mean_token_accuracy": 0.72748943567276, + "step": 815 + }, + { + "epoch": 0.15081846606584515, + "grad_norm": 1.1385592224893888, + "learning_rate": 4.8761636611834906e-05, + "loss": 0.9423, + "mean_token_accuracy": 0.7338582873344421, + "step": 820 + }, + { + "epoch": 0.15173809085892956, + "grad_norm": 1.171019568482894, + "learning_rate": 4.873915005899591e-05, + "loss": 0.9823, + "mean_token_accuracy": 0.7215001463890076, + "step": 825 + }, + { + "epoch": 0.15265771565201397, + "grad_norm": 1.1181637038875023, + "learning_rate": 4.871646706345407e-05, + "loss": 0.9696, + "mean_token_accuracy": 0.7244228839874267, + "step": 830 + }, + { + "epoch": 0.1535773404450984, + "grad_norm": 1.140111709793846, + "learning_rate": 4.869358783500634e-05, + "loss": 0.9691, + "mean_token_accuracy": 0.7219241619110107, + "step": 835 + }, + { + "epoch": 0.15449696523818282, + "grad_norm": 1.1035668632214553, + "learning_rate": 4.867051258526466e-05, + "loss": 0.9216, + "mean_token_accuracy": 0.7362164258956909, + "step": 840 + }, + { + "epoch": 0.15541659003126723, + "grad_norm": 1.0632498704772437, + "learning_rate": 4.864724152765396e-05, + "loss": 0.9319, + "mean_token_accuracy": 0.7335481762886047, + "step": 845 + }, + { + "epoch": 0.15633621482435167, + "grad_norm": 1.1360641167900578, + "learning_rate": 4.8623774877410235e-05, + "loss": 0.998, + "mean_token_accuracy": 0.7165634036064148, + "step": 850 + }, + { + "epoch": 0.15725583961743608, + "grad_norm": 1.1574648839544697, + "learning_rate": 4.860011285157852e-05, + "loss": 0.9983, + "mean_token_accuracy": 0.7154228448867798, + "step": 855 + }, + { + "epoch": 0.1581754644105205, + "grad_norm": 1.1103379240939366, + "learning_rate": 4.857625566901091e-05, + "loss": 0.9606, + "mean_token_accuracy": 0.7255040884017945, + "step": 860 + }, + { + "epoch": 0.15909508920360493, + "grad_norm": 1.3478355454379694, + "learning_rate": 4.85522035503645e-05, + "loss": 0.9643, + "mean_token_accuracy": 0.7249020457267761, + "step": 865 + }, + { + "epoch": 0.16001471399668935, + "grad_norm": 1.129020628766503, + "learning_rate": 4.852795671809941e-05, + "loss": 0.9341, + "mean_token_accuracy": 0.7329063415527344, + "step": 870 + }, + { + "epoch": 0.16093433878977378, + "grad_norm": 1.1322677948976352, + "learning_rate": 4.850351539647661e-05, + "loss": 0.9977, + "mean_token_accuracy": 0.7172942876815795, + "step": 875 + }, + { + "epoch": 0.1618539635828582, + "grad_norm": 1.120014190171844, + "learning_rate": 4.8478879811555986e-05, + "loss": 0.9283, + "mean_token_accuracy": 0.7341889500617981, + "step": 880 + }, + { + "epoch": 0.1627735883759426, + "grad_norm": 1.1336097713701254, + "learning_rate": 4.845405019119414e-05, + "loss": 1.0008, + "mean_token_accuracy": 0.7151533484458923, + "step": 885 + }, + { + "epoch": 0.16369321316902705, + "grad_norm": 0.9922793909516228, + "learning_rate": 4.842902676504235e-05, + "loss": 0.9039, + "mean_token_accuracy": 0.7395052313804626, + "step": 890 + }, + { + "epoch": 0.16461283796211146, + "grad_norm": 1.2309806920357915, + "learning_rate": 4.840380976454441e-05, + "loss": 0.9143, + "mean_token_accuracy": 0.7372842311859131, + "step": 895 + }, + { + "epoch": 0.16553246275519587, + "grad_norm": 1.058725560363019, + "learning_rate": 4.837839942293449e-05, + "loss": 1.0122, + "mean_token_accuracy": 0.7113693952560425, + "step": 900 + }, + { + "epoch": 0.1664520875482803, + "grad_norm": 1.1050666066281727, + "learning_rate": 4.835279597523501e-05, + "loss": 0.9691, + "mean_token_accuracy": 0.7241552948951722, + "step": 905 + }, + { + "epoch": 0.16737171234136472, + "grad_norm": 1.1281645078253164, + "learning_rate": 4.832699965825443e-05, + "loss": 0.9783, + "mean_token_accuracy": 0.7210159540176392, + "step": 910 + }, + { + "epoch": 0.16829133713444913, + "grad_norm": 1.1049918709083206, + "learning_rate": 4.830101071058506e-05, + "loss": 0.9529, + "mean_token_accuracy": 0.726420772075653, + "step": 915 + }, + { + "epoch": 0.16921096192753357, + "grad_norm": 1.1589903082257091, + "learning_rate": 4.82748293726009e-05, + "loss": 1.0162, + "mean_token_accuracy": 0.7134600043296814, + "step": 920 + }, + { + "epoch": 0.17013058672061798, + "grad_norm": 1.0648743038360364, + "learning_rate": 4.824845588645538e-05, + "loss": 0.931, + "mean_token_accuracy": 0.7355116486549378, + "step": 925 + }, + { + "epoch": 0.17105021151370242, + "grad_norm": 1.0563630156850699, + "learning_rate": 4.822189049607909e-05, + "loss": 0.9303, + "mean_token_accuracy": 0.7332427501678467, + "step": 930 + }, + { + "epoch": 0.17196983630678683, + "grad_norm": 1.0946637430016075, + "learning_rate": 4.819513344717759e-05, + "loss": 0.9805, + "mean_token_accuracy": 0.7218296766281128, + "step": 935 + }, + { + "epoch": 0.17288946109987124, + "grad_norm": 1.218450386345206, + "learning_rate": 4.8168184987229104e-05, + "loss": 1.0025, + "mean_token_accuracy": 0.7138312220573425, + "step": 940 + }, + { + "epoch": 0.17380908589295568, + "grad_norm": 1.1265660437743932, + "learning_rate": 4.814104536548222e-05, + "loss": 0.9901, + "mean_token_accuracy": 0.7183592796325684, + "step": 945 + }, + { + "epoch": 0.1747287106860401, + "grad_norm": 1.1519197604777511, + "learning_rate": 4.811371483295361e-05, + "loss": 0.9677, + "mean_token_accuracy": 0.723106038570404, + "step": 950 + }, + { + "epoch": 0.1756483354791245, + "grad_norm": 1.0668603888469903, + "learning_rate": 4.808619364242569e-05, + "loss": 0.9428, + "mean_token_accuracy": 0.7298098564147949, + "step": 955 + }, + { + "epoch": 0.17656796027220895, + "grad_norm": 1.0617094358031158, + "learning_rate": 4.805848204844427e-05, + "loss": 0.9794, + "mean_token_accuracy": 0.7198897957801819, + "step": 960 + }, + { + "epoch": 0.17748758506529336, + "grad_norm": 1.1638181916029056, + "learning_rate": 4.803058030731627e-05, + "loss": 1.0356, + "mean_token_accuracy": 0.7055891275405883, + "step": 965 + }, + { + "epoch": 0.17840720985837777, + "grad_norm": 1.0804274338945197, + "learning_rate": 4.800248867710724e-05, + "loss": 0.9551, + "mean_token_accuracy": 0.7267025232315063, + "step": 970 + }, + { + "epoch": 0.1793268346514622, + "grad_norm": 1.1002302515677742, + "learning_rate": 4.797420741763906e-05, + "loss": 0.9513, + "mean_token_accuracy": 0.727520763874054, + "step": 975 + }, + { + "epoch": 0.18024645944454662, + "grad_norm": 1.0807257658531308, + "learning_rate": 4.794573679048751e-05, + "loss": 0.9667, + "mean_token_accuracy": 0.7254797458648682, + "step": 980 + }, + { + "epoch": 0.18116608423763106, + "grad_norm": 1.1423934429361384, + "learning_rate": 4.791707705897982e-05, + "loss": 0.9289, + "mean_token_accuracy": 0.7316087126731873, + "step": 985 + }, + { + "epoch": 0.18208570903071547, + "grad_norm": 1.0732201976252709, + "learning_rate": 4.7888228488192294e-05, + "loss": 0.9826, + "mean_token_accuracy": 0.7205982804298401, + "step": 990 + }, + { + "epoch": 0.18300533382379988, + "grad_norm": 1.0026696776201605, + "learning_rate": 4.7859191344947804e-05, + "loss": 0.9289, + "mean_token_accuracy": 0.7336562752723694, + "step": 995 + }, + { + "epoch": 0.18392495861688432, + "grad_norm": 1.138379913644609, + "learning_rate": 4.782996589781337e-05, + "loss": 0.9497, + "mean_token_accuracy": 0.729135024547577, + "step": 1000 + }, + { + "epoch": 0.18484458340996873, + "grad_norm": 1.107580666472087, + "learning_rate": 4.780055241709762e-05, + "loss": 0.9048, + "mean_token_accuracy": 0.7381602048873901, + "step": 1005 + }, + { + "epoch": 0.18576420820305314, + "grad_norm": 1.0667620674465943, + "learning_rate": 4.7770951174848335e-05, + "loss": 0.9742, + "mean_token_accuracy": 0.7205707669258118, + "step": 1010 + }, + { + "epoch": 0.18668383299613758, + "grad_norm": 1.0940019385189808, + "learning_rate": 4.774116244484993e-05, + "loss": 0.9857, + "mean_token_accuracy": 0.718968415260315, + "step": 1015 + }, + { + "epoch": 0.187603457789222, + "grad_norm": 1.0279044112611866, + "learning_rate": 4.7711186502620894e-05, + "loss": 1.0084, + "mean_token_accuracy": 0.7144084692001342, + "step": 1020 + }, + { + "epoch": 0.1885230825823064, + "grad_norm": 1.0751882464256728, + "learning_rate": 4.768102362541126e-05, + "loss": 0.9353, + "mean_token_accuracy": 0.7318849921226501, + "step": 1025 + }, + { + "epoch": 0.18944270737539085, + "grad_norm": 1.1701748750390102, + "learning_rate": 4.765067409220004e-05, + "loss": 0.957, + "mean_token_accuracy": 0.7275319814682006, + "step": 1030 + }, + { + "epoch": 0.19036233216847526, + "grad_norm": 1.0512353267451773, + "learning_rate": 4.762013818369266e-05, + "loss": 0.9367, + "mean_token_accuracy": 0.7317106485366821, + "step": 1035 + }, + { + "epoch": 0.1912819569615597, + "grad_norm": 1.1085851412035923, + "learning_rate": 4.7589416182318305e-05, + "loss": 0.9416, + "mean_token_accuracy": 0.7324359536170959, + "step": 1040 + }, + { + "epoch": 0.1922015817546441, + "grad_norm": 1.094731274119514, + "learning_rate": 4.755850837222739e-05, + "loss": 0.9474, + "mean_token_accuracy": 0.7309187650680542, + "step": 1045 + }, + { + "epoch": 0.19312120654772852, + "grad_norm": 1.0610610405848808, + "learning_rate": 4.7527415039288874e-05, + "loss": 0.9638, + "mean_token_accuracy": 0.7251871824264526, + "step": 1050 + }, + { + "epoch": 0.19404083134081296, + "grad_norm": 1.0919916417692772, + "learning_rate": 4.749613647108764e-05, + "loss": 1.0008, + "mean_token_accuracy": 0.7152180433273315, + "step": 1055 + }, + { + "epoch": 0.19496045613389737, + "grad_norm": 1.0847298297852, + "learning_rate": 4.7464672956921814e-05, + "loss": 0.9366, + "mean_token_accuracy": 0.7313546657562255, + "step": 1060 + }, + { + "epoch": 0.19588008092698178, + "grad_norm": 1.0912787695821449, + "learning_rate": 4.743302478780011e-05, + "loss": 0.945, + "mean_token_accuracy": 0.728658664226532, + "step": 1065 + }, + { + "epoch": 0.19679970572006622, + "grad_norm": 1.052195400658314, + "learning_rate": 4.7401192256439144e-05, + "loss": 0.9793, + "mean_token_accuracy": 0.7213846921920777, + "step": 1070 + }, + { + "epoch": 0.19771933051315063, + "grad_norm": 1.1107870405998106, + "learning_rate": 4.736917565726069e-05, + "loss": 0.9313, + "mean_token_accuracy": 0.735443937778473, + "step": 1075 + }, + { + "epoch": 0.19863895530623504, + "grad_norm": 1.1399365300090571, + "learning_rate": 4.7336975286389e-05, + "loss": 0.9717, + "mean_token_accuracy": 0.7237229943275452, + "step": 1080 + }, + { + "epoch": 0.19955858009931948, + "grad_norm": 1.0983682734144682, + "learning_rate": 4.730459144164802e-05, + "loss": 0.9306, + "mean_token_accuracy": 0.733622133731842, + "step": 1085 + }, + { + "epoch": 0.2004782048924039, + "grad_norm": 1.1053704101564246, + "learning_rate": 4.727202442255871e-05, + "loss": 0.9936, + "mean_token_accuracy": 0.718384611606598, + "step": 1090 + }, + { + "epoch": 0.20139782968548833, + "grad_norm": 1.0858488860538602, + "learning_rate": 4.723927453033619e-05, + "loss": 0.9548, + "mean_token_accuracy": 0.7286873102188111, + "step": 1095 + }, + { + "epoch": 0.20231745447857274, + "grad_norm": 1.0232898856111519, + "learning_rate": 4.720634206788697e-05, + "loss": 0.9804, + "mean_token_accuracy": 0.7218252301216126, + "step": 1100 + }, + { + "epoch": 0.20323707927165716, + "grad_norm": 1.1548447631409977, + "learning_rate": 4.717322733980622e-05, + "loss": 0.931, + "mean_token_accuracy": 0.7311301946640014, + "step": 1105 + }, + { + "epoch": 0.2041567040647416, + "grad_norm": 1.1168183831474872, + "learning_rate": 4.713993065237486e-05, + "loss": 0.9718, + "mean_token_accuracy": 0.7235833764076233, + "step": 1110 + }, + { + "epoch": 0.205076328857826, + "grad_norm": 1.1111836320920656, + "learning_rate": 4.710645231355678e-05, + "loss": 0.9855, + "mean_token_accuracy": 0.7195135593414307, + "step": 1115 + }, + { + "epoch": 0.20599595365091042, + "grad_norm": 1.0024638729648838, + "learning_rate": 4.707279263299598e-05, + "loss": 0.9729, + "mean_token_accuracy": 0.7219846963882446, + "step": 1120 + }, + { + "epoch": 0.20691557844399486, + "grad_norm": 1.0121762272601764, + "learning_rate": 4.703895192201372e-05, + "loss": 0.9459, + "mean_token_accuracy": 0.7269375443458557, + "step": 1125 + }, + { + "epoch": 0.20783520323707927, + "grad_norm": 1.0470465876428376, + "learning_rate": 4.7004930493605573e-05, + "loss": 1.0105, + "mean_token_accuracy": 0.7086774349212647, + "step": 1130 + }, + { + "epoch": 0.20875482803016368, + "grad_norm": 1.0632837126367782, + "learning_rate": 4.697072866243866e-05, + "loss": 0.9412, + "mean_token_accuracy": 0.7307331085205078, + "step": 1135 + }, + { + "epoch": 0.20967445282324812, + "grad_norm": 1.0768863946202714, + "learning_rate": 4.69363467448486e-05, + "loss": 0.9674, + "mean_token_accuracy": 0.7221316814422607, + "step": 1140 + }, + { + "epoch": 0.21059407761633253, + "grad_norm": 1.1181930167961487, + "learning_rate": 4.6901785058836675e-05, + "loss": 0.955, + "mean_token_accuracy": 0.725222361087799, + "step": 1145 + }, + { + "epoch": 0.21151370240941697, + "grad_norm": 1.0688002319746086, + "learning_rate": 4.686704392406685e-05, + "loss": 0.9687, + "mean_token_accuracy": 0.7218108892440795, + "step": 1150 + }, + { + "epoch": 0.21243332720250138, + "grad_norm": 1.1052965038670703, + "learning_rate": 4.6832123661862835e-05, + "loss": 0.9516, + "mean_token_accuracy": 0.7287932515144349, + "step": 1155 + }, + { + "epoch": 0.2133529519955858, + "grad_norm": 1.0349887525202925, + "learning_rate": 4.6797024595205104e-05, + "loss": 0.9599, + "mean_token_accuracy": 0.7228366494178772, + "step": 1160 + }, + { + "epoch": 0.21427257678867023, + "grad_norm": 1.052123043795087, + "learning_rate": 4.6761747048727907e-05, + "loss": 0.9833, + "mean_token_accuracy": 0.714729118347168, + "step": 1165 + }, + { + "epoch": 0.21519220158175464, + "grad_norm": 1.0646750046566955, + "learning_rate": 4.672629134871625e-05, + "loss": 0.98, + "mean_token_accuracy": 0.7194055676460266, + "step": 1170 + }, + { + "epoch": 0.21611182637483906, + "grad_norm": 1.072675922430035, + "learning_rate": 4.669065782310294e-05, + "loss": 0.9661, + "mean_token_accuracy": 0.7228956103324891, + "step": 1175 + }, + { + "epoch": 0.2170314511679235, + "grad_norm": 1.0475965649186345, + "learning_rate": 4.665484680146546e-05, + "loss": 0.9168, + "mean_token_accuracy": 0.7354954957962037, + "step": 1180 + }, + { + "epoch": 0.2179510759610079, + "grad_norm": 1.0183550500547607, + "learning_rate": 4.6618858615023e-05, + "loss": 0.9268, + "mean_token_accuracy": 0.731166672706604, + "step": 1185 + }, + { + "epoch": 0.21887070075409232, + "grad_norm": 1.0894438583208028, + "learning_rate": 4.658269359663336e-05, + "loss": 0.9134, + "mean_token_accuracy": 0.7400953650474549, + "step": 1190 + }, + { + "epoch": 0.21979032554717676, + "grad_norm": 0.9962620966267176, + "learning_rate": 4.6546352080789854e-05, + "loss": 0.9472, + "mean_token_accuracy": 0.7283522963523865, + "step": 1195 + }, + { + "epoch": 0.22070995034026117, + "grad_norm": 1.0767144498287804, + "learning_rate": 4.650983440361825e-05, + "loss": 0.9798, + "mean_token_accuracy": 0.7208079814910888, + "step": 1200 + }, + { + "epoch": 0.2216295751333456, + "grad_norm": 1.0451151540293229, + "learning_rate": 4.6473140902873666e-05, + "loss": 0.9735, + "mean_token_accuracy": 0.7223762154579163, + "step": 1205 + }, + { + "epoch": 0.22254919992643002, + "grad_norm": 0.9904423090265289, + "learning_rate": 4.643627191793737e-05, + "loss": 0.9416, + "mean_token_accuracy": 0.7333443641662598, + "step": 1210 + }, + { + "epoch": 0.22346882471951443, + "grad_norm": 1.0324822073086444, + "learning_rate": 4.639922778981377e-05, + "loss": 0.9096, + "mean_token_accuracy": 0.7366245865821839, + "step": 1215 + }, + { + "epoch": 0.22438844951259887, + "grad_norm": 1.00961392870682, + "learning_rate": 4.636200886112714e-05, + "loss": 0.9647, + "mean_token_accuracy": 0.7272518515586853, + "step": 1220 + }, + { + "epoch": 0.22530807430568328, + "grad_norm": 1.041598639678359, + "learning_rate": 4.63246154761185e-05, + "loss": 0.982, + "mean_token_accuracy": 0.7185810923576355, + "step": 1225 + }, + { + "epoch": 0.2262276990987677, + "grad_norm": 1.0574278162856792, + "learning_rate": 4.628704798064247e-05, + "loss": 0.9442, + "mean_token_accuracy": 0.7297179222106933, + "step": 1230 + }, + { + "epoch": 0.22714732389185213, + "grad_norm": 1.060076765820854, + "learning_rate": 4.624930672216399e-05, + "loss": 0.9614, + "mean_token_accuracy": 0.7244118571281433, + "step": 1235 + }, + { + "epoch": 0.22806694868493654, + "grad_norm": 1.0123003105589568, + "learning_rate": 4.621139204975516e-05, + "loss": 0.9169, + "mean_token_accuracy": 0.7362489700317383, + "step": 1240 + }, + { + "epoch": 0.22898657347802095, + "grad_norm": 1.1490153575204947, + "learning_rate": 4.617330431409201e-05, + "loss": 0.9929, + "mean_token_accuracy": 0.7166203141212464, + "step": 1245 + }, + { + "epoch": 0.2299061982711054, + "grad_norm": 1.0270625785191527, + "learning_rate": 4.6135043867451255e-05, + "loss": 0.9325, + "mean_token_accuracy": 0.7311270833015442, + "step": 1250 + }, + { + "epoch": 0.2308258230641898, + "grad_norm": 1.030694744170465, + "learning_rate": 4.609661106370701e-05, + "loss": 0.9228, + "mean_token_accuracy": 0.7355565190315246, + "step": 1255 + }, + { + "epoch": 0.23174544785727424, + "grad_norm": 1.0190672056189127, + "learning_rate": 4.605800625832753e-05, + "loss": 0.9577, + "mean_token_accuracy": 0.7273682594299317, + "step": 1260 + }, + { + "epoch": 0.23266507265035866, + "grad_norm": 1.025832787786935, + "learning_rate": 4.6019229808371945e-05, + "loss": 0.9291, + "mean_token_accuracy": 0.7325186491012573, + "step": 1265 + }, + { + "epoch": 0.23358469744344307, + "grad_norm": 1.0254402284447273, + "learning_rate": 4.598028207248693e-05, + "loss": 0.9681, + "mean_token_accuracy": 0.7215327501296998, + "step": 1270 + }, + { + "epoch": 0.2345043222365275, + "grad_norm": 1.043519079594266, + "learning_rate": 4.5941163410903406e-05, + "loss": 0.9565, + "mean_token_accuracy": 0.7248036026954651, + "step": 1275 + }, + { + "epoch": 0.23542394702961192, + "grad_norm": 0.9811685630848649, + "learning_rate": 4.590187418543321e-05, + "loss": 0.9204, + "mean_token_accuracy": 0.7338666915893555, + "step": 1280 + }, + { + "epoch": 0.23634357182269633, + "grad_norm": 1.0355767679745649, + "learning_rate": 4.586241475946571e-05, + "loss": 0.9824, + "mean_token_accuracy": 0.7212961316108704, + "step": 1285 + }, + { + "epoch": 0.23726319661578077, + "grad_norm": 0.9995187864598916, + "learning_rate": 4.582278549796448e-05, + "loss": 0.914, + "mean_token_accuracy": 0.7355898737907409, + "step": 1290 + }, + { + "epoch": 0.23818282140886518, + "grad_norm": 1.0163621938165361, + "learning_rate": 4.5782986767463946e-05, + "loss": 0.9614, + "mean_token_accuracy": 0.7241615772247314, + "step": 1295 + }, + { + "epoch": 0.2391024462019496, + "grad_norm": 1.0913821743861445, + "learning_rate": 4.574301893606594e-05, + "loss": 0.8839, + "mean_token_accuracy": 0.7434832811355591, + "step": 1300 + }, + { + "epoch": 0.24002207099503403, + "grad_norm": 1.0399223484753735, + "learning_rate": 4.570288237343632e-05, + "loss": 0.9104, + "mean_token_accuracy": 0.7378169417381286, + "step": 1305 + }, + { + "epoch": 0.24094169578811844, + "grad_norm": 1.011671028641558, + "learning_rate": 4.5662577450801576e-05, + "loss": 0.9595, + "mean_token_accuracy": 0.7230379819869995, + "step": 1310 + }, + { + "epoch": 0.24186132058120288, + "grad_norm": 1.008990928095214, + "learning_rate": 4.562210454094535e-05, + "loss": 0.9363, + "mean_token_accuracy": 0.7295035600662232, + "step": 1315 + }, + { + "epoch": 0.2427809453742873, + "grad_norm": 1.059357744292348, + "learning_rate": 4.558146401820502e-05, + "loss": 0.9569, + "mean_token_accuracy": 0.7264422059059144, + "step": 1320 + }, + { + "epoch": 0.2437005701673717, + "grad_norm": 1.0224904321964083, + "learning_rate": 4.554065625846825e-05, + "loss": 0.9838, + "mean_token_accuracy": 0.7178040146827698, + "step": 1325 + }, + { + "epoch": 0.24462019496045614, + "grad_norm": 1.0737296876090594, + "learning_rate": 4.549968163916946e-05, + "loss": 0.976, + "mean_token_accuracy": 0.7180652141571044, + "step": 1330 + }, + { + "epoch": 0.24553981975354056, + "grad_norm": 1.0129242243093401, + "learning_rate": 4.545854053928639e-05, + "loss": 0.9394, + "mean_token_accuracy": 0.7314478039741517, + "step": 1335 + }, + { + "epoch": 0.24645944454662497, + "grad_norm": 0.9860304727584566, + "learning_rate": 4.541723333933657e-05, + "loss": 0.9595, + "mean_token_accuracy": 0.7271197676658631, + "step": 1340 + }, + { + "epoch": 0.2473790693397094, + "grad_norm": 1.0235437508308431, + "learning_rate": 4.5375760421373796e-05, + "loss": 0.9888, + "mean_token_accuracy": 0.7178149104118348, + "step": 1345 + }, + { + "epoch": 0.24829869413279382, + "grad_norm": 1.076473129213084, + "learning_rate": 4.533412216898461e-05, + "loss": 0.9374, + "mean_token_accuracy": 0.7287054538726807, + "step": 1350 + }, + { + "epoch": 0.24921831892587823, + "grad_norm": 1.027000741915809, + "learning_rate": 4.529231896728474e-05, + "loss": 0.9098, + "mean_token_accuracy": 0.7352772355079651, + "step": 1355 + }, + { + "epoch": 0.25013794371896264, + "grad_norm": 1.0980991489181584, + "learning_rate": 4.525035120291557e-05, + "loss": 0.9613, + "mean_token_accuracy": 0.7250553727149963, + "step": 1360 + }, + { + "epoch": 0.2510575685120471, + "grad_norm": 1.0105378261394609, + "learning_rate": 4.520821926404049e-05, + "loss": 0.9232, + "mean_token_accuracy": 0.7339854836463928, + "step": 1365 + }, + { + "epoch": 0.2519771933051315, + "grad_norm": 1.0465671126237865, + "learning_rate": 4.516592354034138e-05, + "loss": 0.9578, + "mean_token_accuracy": 0.7243474960327149, + "step": 1370 + }, + { + "epoch": 0.2528968180982159, + "grad_norm": 1.0721948067984564, + "learning_rate": 4.512346442301501e-05, + "loss": 0.9305, + "mean_token_accuracy": 0.7290533304214477, + "step": 1375 + }, + { + "epoch": 0.25381644289130034, + "grad_norm": 1.083352961545848, + "learning_rate": 4.5080842304769345e-05, + "loss": 0.9338, + "mean_token_accuracy": 0.733627998828888, + "step": 1380 + }, + { + "epoch": 0.2547360676843848, + "grad_norm": 0.979913773136715, + "learning_rate": 4.503805757981997e-05, + "loss": 0.9012, + "mean_token_accuracy": 0.7409675002098084, + "step": 1385 + }, + { + "epoch": 0.25565569247746917, + "grad_norm": 1.1174510417210128, + "learning_rate": 4.499511064388645e-05, + "loss": 0.8754, + "mean_token_accuracy": 0.7447872519493103, + "step": 1390 + }, + { + "epoch": 0.2565753172705536, + "grad_norm": 1.0562227070300527, + "learning_rate": 4.495200189418864e-05, + "loss": 0.9505, + "mean_token_accuracy": 0.7265227913856507, + "step": 1395 + }, + { + "epoch": 0.25749494206363804, + "grad_norm": 1.0550543313489833, + "learning_rate": 4.490873172944303e-05, + "loss": 0.9096, + "mean_token_accuracy": 0.7342225193977356, + "step": 1400 + }, + { + "epoch": 0.2584145668567225, + "grad_norm": 1.0844914008772555, + "learning_rate": 4.486530054985905e-05, + "loss": 0.9643, + "mean_token_accuracy": 0.7227702975273133, + "step": 1405 + }, + { + "epoch": 0.25933419164980687, + "grad_norm": 1.11030675175993, + "learning_rate": 4.482170875713536e-05, + "loss": 0.98, + "mean_token_accuracy": 0.7210663437843323, + "step": 1410 + }, + { + "epoch": 0.2602538164428913, + "grad_norm": 1.0678730599548856, + "learning_rate": 4.477795675445616e-05, + "loss": 0.9248, + "mean_token_accuracy": 0.7327564835548401, + "step": 1415 + }, + { + "epoch": 0.26117344123597575, + "grad_norm": 0.9866628204231362, + "learning_rate": 4.473404494648744e-05, + "loss": 0.9216, + "mean_token_accuracy": 0.7343960881233216, + "step": 1420 + }, + { + "epoch": 0.26209306602906013, + "grad_norm": 0.9895263110250994, + "learning_rate": 4.4689973739373244e-05, + "loss": 0.9123, + "mean_token_accuracy": 0.7354090452194214, + "step": 1425 + }, + { + "epoch": 0.26301269082214457, + "grad_norm": 0.9560958289104061, + "learning_rate": 4.46457435407319e-05, + "loss": 0.9494, + "mean_token_accuracy": 0.725600802898407, + "step": 1430 + }, + { + "epoch": 0.263932315615229, + "grad_norm": 1.0418751893863187, + "learning_rate": 4.460135475965227e-05, + "loss": 0.887, + "mean_token_accuracy": 0.744392192363739, + "step": 1435 + }, + { + "epoch": 0.2648519404083134, + "grad_norm": 1.0270767884123133, + "learning_rate": 4.455680780668997e-05, + "loss": 0.98, + "mean_token_accuracy": 0.717594051361084, + "step": 1440 + }, + { + "epoch": 0.26577156520139783, + "grad_norm": 1.0194372684867639, + "learning_rate": 4.4512103093863555e-05, + "loss": 0.9145, + "mean_token_accuracy": 0.7369788885116577, + "step": 1445 + }, + { + "epoch": 0.26669118999448227, + "grad_norm": 1.0981284825838393, + "learning_rate": 4.44672410346507e-05, + "loss": 0.9519, + "mean_token_accuracy": 0.7260895729064941, + "step": 1450 + }, + { + "epoch": 0.26761081478756665, + "grad_norm": 1.0207625075556366, + "learning_rate": 4.442222204398441e-05, + "loss": 0.9555, + "mean_token_accuracy": 0.7227967500686645, + "step": 1455 + }, + { + "epoch": 0.2685304395806511, + "grad_norm": 0.98393868791661, + "learning_rate": 4.437704653824915e-05, + "loss": 0.8831, + "mean_token_accuracy": 0.7438354253768921, + "step": 1460 + }, + { + "epoch": 0.26945006437373553, + "grad_norm": 0.9817630950075087, + "learning_rate": 4.433171493527701e-05, + "loss": 0.9404, + "mean_token_accuracy": 0.728731095790863, + "step": 1465 + }, + { + "epoch": 0.2703696891668199, + "grad_norm": 1.0298652072064594, + "learning_rate": 4.428622765434383e-05, + "loss": 0.9136, + "mean_token_accuracy": 0.7356218695640564, + "step": 1470 + }, + { + "epoch": 0.27128931395990435, + "grad_norm": 0.981553092264934, + "learning_rate": 4.4240585116165334e-05, + "loss": 0.8555, + "mean_token_accuracy": 0.753374171257019, + "step": 1475 + }, + { + "epoch": 0.2722089387529888, + "grad_norm": 1.172918257192198, + "learning_rate": 4.419478774289325e-05, + "loss": 0.998, + "mean_token_accuracy": 0.713919198513031, + "step": 1480 + }, + { + "epoch": 0.2731285635460732, + "grad_norm": 1.003409782978005, + "learning_rate": 4.414883595811136e-05, + "loss": 0.8782, + "mean_token_accuracy": 0.7452871680259705, + "step": 1485 + }, + { + "epoch": 0.2740481883391576, + "grad_norm": 1.0316918646250515, + "learning_rate": 4.410273018683163e-05, + "loss": 0.9242, + "mean_token_accuracy": 0.7311699628829956, + "step": 1490 + }, + { + "epoch": 0.27496781313224206, + "grad_norm": 0.978003437149563, + "learning_rate": 4.405647085549025e-05, + "loss": 0.9241, + "mean_token_accuracy": 0.7328976273536683, + "step": 1495 + }, + { + "epoch": 0.27588743792532644, + "grad_norm": 1.0070406181231344, + "learning_rate": 4.40100583919437e-05, + "loss": 0.9001, + "mean_token_accuracy": 0.7395057559013367, + "step": 1500 + }, + { + "epoch": 0.2768070627184109, + "grad_norm": 0.9873878935159346, + "learning_rate": 4.3963493225464817e-05, + "loss": 0.9258, + "mean_token_accuracy": 0.7336387634277344, + "step": 1505 + }, + { + "epoch": 0.2777266875114953, + "grad_norm": 0.9521695030248521, + "learning_rate": 4.3916775786738754e-05, + "loss": 0.914, + "mean_token_accuracy": 0.7378314137458801, + "step": 1510 + }, + { + "epoch": 0.27864631230457976, + "grad_norm": 0.9502896850196428, + "learning_rate": 4.3869906507859096e-05, + "loss": 0.8987, + "mean_token_accuracy": 0.7417943596839904, + "step": 1515 + }, + { + "epoch": 0.27956593709766414, + "grad_norm": 0.991426828614557, + "learning_rate": 4.382288582232376e-05, + "loss": 0.9106, + "mean_token_accuracy": 0.7390964746475219, + "step": 1520 + }, + { + "epoch": 0.2804855618907486, + "grad_norm": 1.0581857743606324, + "learning_rate": 4.377571416503108e-05, + "loss": 0.9179, + "mean_token_accuracy": 0.7379998922348022, + "step": 1525 + }, + { + "epoch": 0.281405186683833, + "grad_norm": 0.9872377385823925, + "learning_rate": 4.372839197227571e-05, + "loss": 0.8848, + "mean_token_accuracy": 0.7446985721588135, + "step": 1530 + }, + { + "epoch": 0.2823248114769174, + "grad_norm": 1.0976151495403408, + "learning_rate": 4.368091968174463e-05, + "loss": 0.9632, + "mean_token_accuracy": 0.723613953590393, + "step": 1535 + }, + { + "epoch": 0.28324443627000184, + "grad_norm": 1.013680671037777, + "learning_rate": 4.363329773251309e-05, + "loss": 0.866, + "mean_token_accuracy": 0.750942587852478, + "step": 1540 + }, + { + "epoch": 0.2841640610630863, + "grad_norm": 1.1182733077200029, + "learning_rate": 4.3585526565040543e-05, + "loss": 0.9995, + "mean_token_accuracy": 0.7137303233146668, + "step": 1545 + }, + { + "epoch": 0.28508368585617067, + "grad_norm": 0.9779737007515391, + "learning_rate": 4.353760662116658e-05, + "loss": 0.9369, + "mean_token_accuracy": 0.7336580872535705, + "step": 1550 + }, + { + "epoch": 0.2860033106492551, + "grad_norm": 1.0260468281394197, + "learning_rate": 4.348953834410683e-05, + "loss": 0.9678, + "mean_token_accuracy": 0.7206373929977417, + "step": 1555 + }, + { + "epoch": 0.28692293544233954, + "grad_norm": 1.0263096637333005, + "learning_rate": 4.3441322178448856e-05, + "loss": 0.9572, + "mean_token_accuracy": 0.7260561943054199, + "step": 1560 + }, + { + "epoch": 0.2878425602354239, + "grad_norm": 0.9619383230028783, + "learning_rate": 4.339295857014809e-05, + "loss": 0.9501, + "mean_token_accuracy": 0.7264659523963928, + "step": 1565 + }, + { + "epoch": 0.28876218502850837, + "grad_norm": 0.9946060524217067, + "learning_rate": 4.3344447966523634e-05, + "loss": 0.9887, + "mean_token_accuracy": 0.7160560727119446, + "step": 1570 + }, + { + "epoch": 0.2896818098215928, + "grad_norm": 1.0275376139203307, + "learning_rate": 4.3295790816254195e-05, + "loss": 0.9262, + "mean_token_accuracy": 0.734666109085083, + "step": 1575 + }, + { + "epoch": 0.2906014346146772, + "grad_norm": 1.1276042923218728, + "learning_rate": 4.324698756937388e-05, + "loss": 0.9378, + "mean_token_accuracy": 0.7300173878669739, + "step": 1580 + }, + { + "epoch": 0.29152105940776163, + "grad_norm": 0.9552400868458645, + "learning_rate": 4.319803867726807e-05, + "loss": 0.8879, + "mean_token_accuracy": 0.7425481796264648, + "step": 1585 + }, + { + "epoch": 0.29244068420084607, + "grad_norm": 0.9486514468425481, + "learning_rate": 4.3148944592669234e-05, + "loss": 0.9613, + "mean_token_accuracy": 0.7219538450241089, + "step": 1590 + }, + { + "epoch": 0.29336030899393045, + "grad_norm": 0.9567962674802902, + "learning_rate": 4.30997057696527e-05, + "loss": 0.8741, + "mean_token_accuracy": 0.7477473855018616, + "step": 1595 + }, + { + "epoch": 0.2942799337870149, + "grad_norm": 0.9667609260469084, + "learning_rate": 4.3050322663632564e-05, + "loss": 0.9568, + "mean_token_accuracy": 0.7255883097648621, + "step": 1600 + }, + { + "epoch": 0.29519955858009933, + "grad_norm": 0.9920073647296315, + "learning_rate": 4.3000795731357333e-05, + "loss": 0.9237, + "mean_token_accuracy": 0.7383288621902466, + "step": 1605 + }, + { + "epoch": 0.2961191833731837, + "grad_norm": 1.0604465170326072, + "learning_rate": 4.295112543090584e-05, + "loss": 0.9609, + "mean_token_accuracy": 0.7225096940994262, + "step": 1610 + }, + { + "epoch": 0.29703880816626815, + "grad_norm": 1.0688037490276023, + "learning_rate": 4.290131222168289e-05, + "loss": 1.0008, + "mean_token_accuracy": 0.7138909697532654, + "step": 1615 + }, + { + "epoch": 0.2979584329593526, + "grad_norm": 1.143629206489082, + "learning_rate": 4.2851356564415086e-05, + "loss": 0.9867, + "mean_token_accuracy": 0.7165561437606811, + "step": 1620 + }, + { + "epoch": 0.29887805775243703, + "grad_norm": 1.0438745750713756, + "learning_rate": 4.280125892114656e-05, + "loss": 0.9434, + "mean_token_accuracy": 0.7298865675926208, + "step": 1625 + }, + { + "epoch": 0.2997976825455214, + "grad_norm": 1.0251559106803514, + "learning_rate": 4.2751019755234664e-05, + "loss": 0.935, + "mean_token_accuracy": 0.7299148678779602, + "step": 1630 + }, + { + "epoch": 0.30071730733860585, + "grad_norm": 0.9900961445552091, + "learning_rate": 4.27006395313457e-05, + "loss": 0.9963, + "mean_token_accuracy": 0.7131295561790466, + "step": 1635 + }, + { + "epoch": 0.3016369321316903, + "grad_norm": 1.040210108998438, + "learning_rate": 4.265011871545066e-05, + "loss": 0.9412, + "mean_token_accuracy": 0.7279941439628601, + "step": 1640 + }, + { + "epoch": 0.3025565569247747, + "grad_norm": 1.0262950854145634, + "learning_rate": 4.259945777482085e-05, + "loss": 0.9239, + "mean_token_accuracy": 0.7327239632606506, + "step": 1645 + }, + { + "epoch": 0.3034761817178591, + "grad_norm": 0.9969469234100081, + "learning_rate": 4.25486571780236e-05, + "loss": 0.9462, + "mean_token_accuracy": 0.7269651889801025, + "step": 1650 + }, + { + "epoch": 0.30439580651094356, + "grad_norm": 1.0021703198417462, + "learning_rate": 4.249771739491795e-05, + "loss": 0.9003, + "mean_token_accuracy": 0.7421126961708069, + "step": 1655 + }, + { + "epoch": 0.30531543130402794, + "grad_norm": 1.0255704189414308, + "learning_rate": 4.24466388966503e-05, + "loss": 0.9249, + "mean_token_accuracy": 0.7345858454704285, + "step": 1660 + }, + { + "epoch": 0.3062350560971124, + "grad_norm": 0.9438771845720968, + "learning_rate": 4.239542215565e-05, + "loss": 0.9749, + "mean_token_accuracy": 0.7182752847671509, + "step": 1665 + }, + { + "epoch": 0.3071546808901968, + "grad_norm": 0.9878451650581643, + "learning_rate": 4.2344067645625036e-05, + "loss": 0.9455, + "mean_token_accuracy": 0.7264060854911805, + "step": 1670 + }, + { + "epoch": 0.3080743056832812, + "grad_norm": 1.1287364443586523, + "learning_rate": 4.229257584155765e-05, + "loss": 0.9218, + "mean_token_accuracy": 0.7332573175430298, + "step": 1675 + }, + { + "epoch": 0.30899393047636564, + "grad_norm": 0.971666072350275, + "learning_rate": 4.2240947219699895e-05, + "loss": 0.8756, + "mean_token_accuracy": 0.7459922909736634, + "step": 1680 + }, + { + "epoch": 0.3099135552694501, + "grad_norm": 0.9593974583897734, + "learning_rate": 4.2189182257569285e-05, + "loss": 0.9329, + "mean_token_accuracy": 0.730040967464447, + "step": 1685 + }, + { + "epoch": 0.31083318006253446, + "grad_norm": 0.943158273064518, + "learning_rate": 4.213728143394436e-05, + "loss": 0.8839, + "mean_token_accuracy": 0.7458212971687317, + "step": 1690 + }, + { + "epoch": 0.3117528048556189, + "grad_norm": 1.050902490407755, + "learning_rate": 4.208524522886022e-05, + "loss": 0.9443, + "mean_token_accuracy": 0.7311147809028625, + "step": 1695 + }, + { + "epoch": 0.31267242964870334, + "grad_norm": 1.0074348860409519, + "learning_rate": 4.203307412360418e-05, + "loss": 0.9201, + "mean_token_accuracy": 0.7326057314872741, + "step": 1700 + }, + { + "epoch": 0.3135920544417877, + "grad_norm": 1.0039288385867127, + "learning_rate": 4.1980768600711194e-05, + "loss": 0.9169, + "mean_token_accuracy": 0.736884355545044, + "step": 1705 + }, + { + "epoch": 0.31451167923487217, + "grad_norm": 0.9456279018137994, + "learning_rate": 4.1928329143959506e-05, + "loss": 0.9198, + "mean_token_accuracy": 0.7341038465499878, + "step": 1710 + }, + { + "epoch": 0.3154313040279566, + "grad_norm": 0.969219875361889, + "learning_rate": 4.18757562383661e-05, + "loss": 0.9586, + "mean_token_accuracy": 0.7229322910308837, + "step": 1715 + }, + { + "epoch": 0.316350928821041, + "grad_norm": 0.9823553221239351, + "learning_rate": 4.182305037018224e-05, + "loss": 0.8674, + "mean_token_accuracy": 0.7455045938491821, + "step": 1720 + }, + { + "epoch": 0.31727055361412543, + "grad_norm": 0.9614849491835867, + "learning_rate": 4.1770212026888974e-05, + "loss": 0.8978, + "mean_token_accuracy": 0.7393216609954834, + "step": 1725 + }, + { + "epoch": 0.31819017840720987, + "grad_norm": 1.0298443865011644, + "learning_rate": 4.1717241697192636e-05, + "loss": 0.9046, + "mean_token_accuracy": 0.7390219569206238, + "step": 1730 + }, + { + "epoch": 0.3191098032002943, + "grad_norm": 0.9675044814332657, + "learning_rate": 4.166413987102031e-05, + "loss": 0.9014, + "mean_token_accuracy": 0.7412125468254089, + "step": 1735 + }, + { + "epoch": 0.3200294279933787, + "grad_norm": 0.9558901216962499, + "learning_rate": 4.161090703951528e-05, + "loss": 0.8915, + "mean_token_accuracy": 0.7442119359970093, + "step": 1740 + }, + { + "epoch": 0.32094905278646313, + "grad_norm": 1.0231471726772243, + "learning_rate": 4.155754369503254e-05, + "loss": 0.9508, + "mean_token_accuracy": 0.7272051572799683, + "step": 1745 + }, + { + "epoch": 0.32186867757954757, + "grad_norm": 0.971225693001968, + "learning_rate": 4.1504050331134186e-05, + "loss": 0.9271, + "mean_token_accuracy": 0.7334083676338196, + "step": 1750 + }, + { + "epoch": 0.32278830237263195, + "grad_norm": 0.9487975621871125, + "learning_rate": 4.1450427442584885e-05, + "loss": 0.9231, + "mean_token_accuracy": 0.7330006003379822, + "step": 1755 + }, + { + "epoch": 0.3237079271657164, + "grad_norm": 1.080234485746019, + "learning_rate": 4.13966755253473e-05, + "loss": 0.8934, + "mean_token_accuracy": 0.7371908903121949, + "step": 1760 + }, + { + "epoch": 0.32462755195880083, + "grad_norm": 1.0042744657060512, + "learning_rate": 4.134279507657746e-05, + "loss": 0.9357, + "mean_token_accuracy": 0.7307947874069214, + "step": 1765 + }, + { + "epoch": 0.3255471767518852, + "grad_norm": 1.0167454318885076, + "learning_rate": 4.1288786594620224e-05, + "loss": 0.9522, + "mean_token_accuracy": 0.7250777244567871, + "step": 1770 + }, + { + "epoch": 0.32646680154496965, + "grad_norm": 1.0378785371682158, + "learning_rate": 4.123465057900463e-05, + "loss": 0.8991, + "mean_token_accuracy": 0.7383182883262634, + "step": 1775 + }, + { + "epoch": 0.3273864263380541, + "grad_norm": 0.975574798117687, + "learning_rate": 4.118038753043927e-05, + "loss": 0.8962, + "mean_token_accuracy": 0.7391498327255249, + "step": 1780 + }, + { + "epoch": 0.3283060511311385, + "grad_norm": 0.9785593634297269, + "learning_rate": 4.112599795080771e-05, + "loss": 0.8976, + "mean_token_accuracy": 0.7406945347785949, + "step": 1785 + }, + { + "epoch": 0.3292256759242229, + "grad_norm": 0.9506069452238485, + "learning_rate": 4.107148234316378e-05, + "loss": 0.9792, + "mean_token_accuracy": 0.7183930397033691, + "step": 1790 + }, + { + "epoch": 0.33014530071730736, + "grad_norm": 0.9568388159915644, + "learning_rate": 4.101684121172696e-05, + "loss": 0.9445, + "mean_token_accuracy": 0.7280240654945374, + "step": 1795 + }, + { + "epoch": 0.33106492551039174, + "grad_norm": 1.022357456314008, + "learning_rate": 4.096207506187773e-05, + "loss": 0.9394, + "mean_token_accuracy": 0.7300898432731628, + "step": 1800 + }, + { + "epoch": 0.3319845503034762, + "grad_norm": 0.993312074550177, + "learning_rate": 4.090718440015285e-05, + "loss": 0.8857, + "mean_token_accuracy": 0.7397880554199219, + "step": 1805 + }, + { + "epoch": 0.3329041750965606, + "grad_norm": 0.9393217165901138, + "learning_rate": 4.0852169734240715e-05, + "loss": 0.9055, + "mean_token_accuracy": 0.7397056937217712, + "step": 1810 + }, + { + "epoch": 0.333823799889645, + "grad_norm": 1.0286146516865022, + "learning_rate": 4.0797031572976644e-05, + "loss": 0.9486, + "mean_token_accuracy": 0.7270653247833252, + "step": 1815 + }, + { + "epoch": 0.33474342468272944, + "grad_norm": 1.0433673618214743, + "learning_rate": 4.074177042633818e-05, + "loss": 0.8654, + "mean_token_accuracy": 0.7493741869926452, + "step": 1820 + }, + { + "epoch": 0.3356630494758139, + "grad_norm": 0.9978374983290279, + "learning_rate": 4.068638680544035e-05, + "loss": 0.9434, + "mean_token_accuracy": 0.7284141898155212, + "step": 1825 + }, + { + "epoch": 0.33658267426889826, + "grad_norm": 0.9268570875914646, + "learning_rate": 4.063088122253096e-05, + "loss": 0.9323, + "mean_token_accuracy": 0.7292568445205688, + "step": 1830 + }, + { + "epoch": 0.3375022990619827, + "grad_norm": 1.0098370277606412, + "learning_rate": 4.05752541909859e-05, + "loss": 0.8831, + "mean_token_accuracy": 0.7427129149436951, + "step": 1835 + }, + { + "epoch": 0.33842192385506714, + "grad_norm": 0.9840521255378257, + "learning_rate": 4.0519506225304266e-05, + "loss": 0.9129, + "mean_token_accuracy": 0.7376075983047485, + "step": 1840 + }, + { + "epoch": 0.3393415486481516, + "grad_norm": 0.9706147022595509, + "learning_rate": 4.046363784110375e-05, + "loss": 0.8867, + "mean_token_accuracy": 0.7421358585357666, + "step": 1845 + }, + { + "epoch": 0.34026117344123596, + "grad_norm": 1.0544553608523015, + "learning_rate": 4.040764955511577e-05, + "loss": 0.9404, + "mean_token_accuracy": 0.7300120830535889, + "step": 1850 + }, + { + "epoch": 0.3411807982343204, + "grad_norm": 0.9771051625951763, + "learning_rate": 4.035154188518076e-05, + "loss": 0.92, + "mean_token_accuracy": 0.7353024840354919, + "step": 1855 + }, + { + "epoch": 0.34210042302740484, + "grad_norm": 0.9612601058837731, + "learning_rate": 4.02953153502433e-05, + "loss": 0.8822, + "mean_token_accuracy": 0.7446259975433349, + "step": 1860 + }, + { + "epoch": 0.3430200478204892, + "grad_norm": 1.0790844365415948, + "learning_rate": 4.0238970470347404e-05, + "loss": 0.9243, + "mean_token_accuracy": 0.7315137147903442, + "step": 1865 + }, + { + "epoch": 0.34393967261357367, + "grad_norm": 0.9988868690440261, + "learning_rate": 4.018250776663164e-05, + "loss": 0.8875, + "mean_token_accuracy": 0.7421119809150696, + "step": 1870 + }, + { + "epoch": 0.3448592974066581, + "grad_norm": 1.0571095915292046, + "learning_rate": 4.012592776132435e-05, + "loss": 0.9273, + "mean_token_accuracy": 0.731085193157196, + "step": 1875 + }, + { + "epoch": 0.3457789221997425, + "grad_norm": 1.135743652086019, + "learning_rate": 4.0069230977738826e-05, + "loss": 0.9534, + "mean_token_accuracy": 0.7248372554779052, + "step": 1880 + }, + { + "epoch": 0.34669854699282693, + "grad_norm": 0.9715071563775657, + "learning_rate": 4.001241794026842e-05, + "loss": 0.94, + "mean_token_accuracy": 0.731473171710968, + "step": 1885 + }, + { + "epoch": 0.34761817178591137, + "grad_norm": 0.9942342778662301, + "learning_rate": 3.9955489174381746e-05, + "loss": 0.9329, + "mean_token_accuracy": 0.7310616850852967, + "step": 1890 + }, + { + "epoch": 0.34853779657899575, + "grad_norm": 1.0075175249825896, + "learning_rate": 3.989844520661779e-05, + "loss": 0.9438, + "mean_token_accuracy": 0.7262274742126464, + "step": 1895 + }, + { + "epoch": 0.3494574213720802, + "grad_norm": 0.9753954477573876, + "learning_rate": 3.984128656458106e-05, + "loss": 0.9702, + "mean_token_accuracy": 0.7193968415260314, + "step": 1900 + }, + { + "epoch": 0.35037704616516463, + "grad_norm": 1.0133558076382343, + "learning_rate": 3.978401377693669e-05, + "loss": 0.873, + "mean_token_accuracy": 0.7490906119346619, + "step": 1905 + }, + { + "epoch": 0.351296670958249, + "grad_norm": 1.0343688728685794, + "learning_rate": 3.9726627373405544e-05, + "loss": 0.9308, + "mean_token_accuracy": 0.7297749042510986, + "step": 1910 + }, + { + "epoch": 0.35221629575133345, + "grad_norm": 0.9695668089988693, + "learning_rate": 3.966912788475937e-05, + "loss": 0.9028, + "mean_token_accuracy": 0.7381954431533814, + "step": 1915 + }, + { + "epoch": 0.3531359205444179, + "grad_norm": 0.9832664588504738, + "learning_rate": 3.961151584281581e-05, + "loss": 0.8815, + "mean_token_accuracy": 0.7429476737976074, + "step": 1920 + }, + { + "epoch": 0.3540555453375023, + "grad_norm": 0.963687599953708, + "learning_rate": 3.955379178043352e-05, + "loss": 0.9823, + "mean_token_accuracy": 0.7177613019943238, + "step": 1925 + }, + { + "epoch": 0.3549751701305867, + "grad_norm": 0.9479437389842555, + "learning_rate": 3.9495956231507266e-05, + "loss": 0.9274, + "mean_token_accuracy": 0.7312801122665405, + "step": 1930 + }, + { + "epoch": 0.35589479492367115, + "grad_norm": 0.938691928481946, + "learning_rate": 3.943800973096296e-05, + "loss": 0.9017, + "mean_token_accuracy": 0.7394131779670715, + "step": 1935 + }, + { + "epoch": 0.35681441971675554, + "grad_norm": 0.967769246759337, + "learning_rate": 3.937995281475269e-05, + "loss": 0.9216, + "mean_token_accuracy": 0.7352214097976685, + "step": 1940 + }, + { + "epoch": 0.35773404450984, + "grad_norm": 0.9613349378582403, + "learning_rate": 3.932178601984982e-05, + "loss": 0.8861, + "mean_token_accuracy": 0.7429886102676392, + "step": 1945 + }, + { + "epoch": 0.3586536693029244, + "grad_norm": 0.9739202222729397, + "learning_rate": 3.926350988424397e-05, + "loss": 0.8628, + "mean_token_accuracy": 0.7480137705802917, + "step": 1950 + }, + { + "epoch": 0.35957329409600886, + "grad_norm": 1.00417983410191, + "learning_rate": 3.920512494693607e-05, + "loss": 0.879, + "mean_token_accuracy": 0.7440518856048584, + "step": 1955 + }, + { + "epoch": 0.36049291888909324, + "grad_norm": 1.0098406374163094, + "learning_rate": 3.9146631747933366e-05, + "loss": 0.8329, + "mean_token_accuracy": 0.759476363658905, + "step": 1960 + }, + { + "epoch": 0.3614125436821777, + "grad_norm": 0.9962046099940254, + "learning_rate": 3.908803082824441e-05, + "loss": 0.8369, + "mean_token_accuracy": 0.7543352007865906, + "step": 1965 + }, + { + "epoch": 0.3623321684752621, + "grad_norm": 1.0229275697874085, + "learning_rate": 3.9029322729874104e-05, + "loss": 0.9319, + "mean_token_accuracy": 0.7315138220787049, + "step": 1970 + }, + { + "epoch": 0.3632517932683465, + "grad_norm": 0.9131833883898176, + "learning_rate": 3.8970507995818636e-05, + "loss": 0.8373, + "mean_token_accuracy": 0.754296875, + "step": 1975 + }, + { + "epoch": 0.36417141806143094, + "grad_norm": 0.9558351857573911, + "learning_rate": 3.891158717006046e-05, + "loss": 0.892, + "mean_token_accuracy": 0.7430965900421143, + "step": 1980 + }, + { + "epoch": 0.3650910428545154, + "grad_norm": 0.9446973659937214, + "learning_rate": 3.885256079756331e-05, + "loss": 0.9394, + "mean_token_accuracy": 0.7250162839889527, + "step": 1985 + }, + { + "epoch": 0.36601066764759976, + "grad_norm": 0.9202948815573198, + "learning_rate": 3.879342942426711e-05, + "loss": 0.9124, + "mean_token_accuracy": 0.7363432049751282, + "step": 1990 + }, + { + "epoch": 0.3669302924406842, + "grad_norm": 0.9507433703052857, + "learning_rate": 3.8734193597082964e-05, + "loss": 0.9265, + "mean_token_accuracy": 0.7309059858322143, + "step": 1995 + }, + { + "epoch": 0.36784991723376864, + "grad_norm": 0.9721403940210892, + "learning_rate": 3.867485386388806e-05, + "loss": 0.9368, + "mean_token_accuracy": 0.7331580281257629, + "step": 2000 + }, + { + "epoch": 0.368769542026853, + "grad_norm": 0.9405505899400793, + "learning_rate": 3.8615410773520635e-05, + "loss": 0.9138, + "mean_token_accuracy": 0.7358463048934937, + "step": 2005 + }, + { + "epoch": 0.36968916681993746, + "grad_norm": 0.963025470188593, + "learning_rate": 3.8555864875774885e-05, + "loss": 0.9019, + "mean_token_accuracy": 0.7384212732315063, + "step": 2010 + }, + { + "epoch": 0.3706087916130219, + "grad_norm": 0.9907971594256944, + "learning_rate": 3.849621672139588e-05, + "loss": 0.8763, + "mean_token_accuracy": 0.7444020867347717, + "step": 2015 + }, + { + "epoch": 0.3715284164061063, + "grad_norm": 0.981696155165083, + "learning_rate": 3.843646686207445e-05, + "loss": 0.9202, + "mean_token_accuracy": 0.7325111865997315, + "step": 2020 + }, + { + "epoch": 0.3724480411991907, + "grad_norm": 0.990078628199776, + "learning_rate": 3.837661585044211e-05, + "loss": 0.9045, + "mean_token_accuracy": 0.7379343152046204, + "step": 2025 + }, + { + "epoch": 0.37336766599227517, + "grad_norm": 0.9302652014201332, + "learning_rate": 3.831666424006598e-05, + "loss": 0.9145, + "mean_token_accuracy": 0.7369246363639832, + "step": 2030 + }, + { + "epoch": 0.37428729078535955, + "grad_norm": 1.0127134327540788, + "learning_rate": 3.825661258544358e-05, + "loss": 0.8949, + "mean_token_accuracy": 0.740783178806305, + "step": 2035 + }, + { + "epoch": 0.375206915578444, + "grad_norm": 0.9456025309406082, + "learning_rate": 3.819646144199777e-05, + "loss": 0.8635, + "mean_token_accuracy": 0.749360203742981, + "step": 2040 + }, + { + "epoch": 0.37612654037152843, + "grad_norm": 0.9458510607283644, + "learning_rate": 3.813621136607157e-05, + "loss": 0.9212, + "mean_token_accuracy": 0.7321518301963806, + "step": 2045 + }, + { + "epoch": 0.3770461651646128, + "grad_norm": 0.995792214246869, + "learning_rate": 3.8075862914923074e-05, + "loss": 0.9529, + "mean_token_accuracy": 0.7222961544990539, + "step": 2050 + }, + { + "epoch": 0.37796578995769725, + "grad_norm": 0.931780686224964, + "learning_rate": 3.801541664672021e-05, + "loss": 0.9068, + "mean_token_accuracy": 0.7373356938362121, + "step": 2055 + }, + { + "epoch": 0.3788854147507817, + "grad_norm": 1.032699719779323, + "learning_rate": 3.795487312053566e-05, + "loss": 0.8428, + "mean_token_accuracy": 0.754009485244751, + "step": 2060 + }, + { + "epoch": 0.37980503954386613, + "grad_norm": 1.0082536583803767, + "learning_rate": 3.789423289634163e-05, + "loss": 0.8877, + "mean_token_accuracy": 0.7419803261756897, + "step": 2065 + }, + { + "epoch": 0.3807246643369505, + "grad_norm": 0.9922794484448726, + "learning_rate": 3.783349653500472e-05, + "loss": 0.9549, + "mean_token_accuracy": 0.7244602799415588, + "step": 2070 + }, + { + "epoch": 0.38164428913003495, + "grad_norm": 0.9289765959162268, + "learning_rate": 3.777266459828067e-05, + "loss": 0.9049, + "mean_token_accuracy": 0.7346539378166199, + "step": 2075 + }, + { + "epoch": 0.3825639139231194, + "grad_norm": 0.9418822148176986, + "learning_rate": 3.7711737648809255e-05, + "loss": 0.8631, + "mean_token_accuracy": 0.7498388290405273, + "step": 2080 + }, + { + "epoch": 0.3834835387162038, + "grad_norm": 0.9739714347813362, + "learning_rate": 3.765071625010899e-05, + "loss": 0.8642, + "mean_token_accuracy": 0.7496488690376282, + "step": 2085 + }, + { + "epoch": 0.3844031635092882, + "grad_norm": 0.9876318304111896, + "learning_rate": 3.758960096657197e-05, + "loss": 0.9409, + "mean_token_accuracy": 0.7231215476989746, + "step": 2090 + }, + { + "epoch": 0.38532278830237265, + "grad_norm": 0.9391298182307426, + "learning_rate": 3.752839236345866e-05, + "loss": 0.9321, + "mean_token_accuracy": 0.7299721479415894, + "step": 2095 + }, + { + "epoch": 0.38624241309545704, + "grad_norm": 0.9975883406823954, + "learning_rate": 3.746709100689263e-05, + "loss": 0.9119, + "mean_token_accuracy": 0.7372664332389831, + "step": 2100 + }, + { + "epoch": 0.3871620378885415, + "grad_norm": 0.9585598143365737, + "learning_rate": 3.740569746385531e-05, + "loss": 0.9511, + "mean_token_accuracy": 0.7252285242080688, + "step": 2105 + }, + { + "epoch": 0.3880816626816259, + "grad_norm": 0.9708930878655039, + "learning_rate": 3.7344212302180807e-05, + "loss": 0.9021, + "mean_token_accuracy": 0.7373741269111633, + "step": 2110 + }, + { + "epoch": 0.3890012874747103, + "grad_norm": 0.9842480657825518, + "learning_rate": 3.7282636090550613e-05, + "loss": 0.9155, + "mean_token_accuracy": 0.7346144676208496, + "step": 2115 + }, + { + "epoch": 0.38992091226779474, + "grad_norm": 1.010319909401371, + "learning_rate": 3.722096939848833e-05, + "loss": 0.8251, + "mean_token_accuracy": 0.7569172263145447, + "step": 2120 + }, + { + "epoch": 0.3908405370608792, + "grad_norm": 1.0232782350312868, + "learning_rate": 3.7159212796354425e-05, + "loss": 0.9061, + "mean_token_accuracy": 0.7363372683525086, + "step": 2125 + }, + { + "epoch": 0.39176016185396356, + "grad_norm": 0.9853933308782586, + "learning_rate": 3.7097366855340974e-05, + "loss": 0.9281, + "mean_token_accuracy": 0.7297635912895203, + "step": 2130 + }, + { + "epoch": 0.392679786647048, + "grad_norm": 1.0085562594833883, + "learning_rate": 3.703543214746632e-05, + "loss": 0.9345, + "mean_token_accuracy": 0.7267664670944214, + "step": 2135 + }, + { + "epoch": 0.39359941144013244, + "grad_norm": 0.9907065624349415, + "learning_rate": 3.6973409245569846e-05, + "loss": 0.9017, + "mean_token_accuracy": 0.7393394112586975, + "step": 2140 + }, + { + "epoch": 0.3945190362332168, + "grad_norm": 0.9488707860528096, + "learning_rate": 3.691129872330663e-05, + "loss": 0.9373, + "mean_token_accuracy": 0.728193199634552, + "step": 2145 + }, + { + "epoch": 0.39543866102630126, + "grad_norm": 0.9103606197233259, + "learning_rate": 3.684910115514218e-05, + "loss": 0.897, + "mean_token_accuracy": 0.7412585973739624, + "step": 2150 + }, + { + "epoch": 0.3963582858193857, + "grad_norm": 0.965709462156266, + "learning_rate": 3.678681711634708e-05, + "loss": 0.8715, + "mean_token_accuracy": 0.74575275182724, + "step": 2155 + }, + { + "epoch": 0.3972779106124701, + "grad_norm": 1.0272326947622106, + "learning_rate": 3.67244471829917e-05, + "loss": 0.8789, + "mean_token_accuracy": 0.7422020196914673, + "step": 2160 + }, + { + "epoch": 0.3981975354055545, + "grad_norm": 0.9300588922771316, + "learning_rate": 3.6661991931940856e-05, + "loss": 0.8945, + "mean_token_accuracy": 0.7385678648948669, + "step": 2165 + }, + { + "epoch": 0.39911716019863896, + "grad_norm": 1.002757392159615, + "learning_rate": 3.6599451940848446e-05, + "loss": 0.8993, + "mean_token_accuracy": 0.7361081838607788, + "step": 2170 + }, + { + "epoch": 0.4000367849917234, + "grad_norm": 1.1036859227862066, + "learning_rate": 3.6536827788152176e-05, + "loss": 0.9308, + "mean_token_accuracy": 0.7304606318473816, + "step": 2175 + }, + { + "epoch": 0.4009564097848078, + "grad_norm": 0.9701793563305904, + "learning_rate": 3.6474120053068164e-05, + "loss": 0.8472, + "mean_token_accuracy": 0.7498792171478271, + "step": 2180 + }, + { + "epoch": 0.4018760345778922, + "grad_norm": 1.041733702997736, + "learning_rate": 3.641132931558556e-05, + "loss": 0.9581, + "mean_token_accuracy": 0.7201631188392639, + "step": 2185 + }, + { + "epoch": 0.40279565937097667, + "grad_norm": 1.0348942168040987, + "learning_rate": 3.634845615646123e-05, + "loss": 0.9393, + "mean_token_accuracy": 0.7280836224555969, + "step": 2190 + }, + { + "epoch": 0.40371528416406105, + "grad_norm": 1.0131734961320986, + "learning_rate": 3.628550115721437e-05, + "loss": 0.927, + "mean_token_accuracy": 0.729682469367981, + "step": 2195 + }, + { + "epoch": 0.4046349089571455, + "grad_norm": 1.025738826571974, + "learning_rate": 3.622246490012111e-05, + "loss": 0.9357, + "mean_token_accuracy": 0.724788224697113, + "step": 2200 + }, + { + "epoch": 0.40555453375022993, + "grad_norm": 0.9501914998942569, + "learning_rate": 3.615934796820915e-05, + "loss": 0.8978, + "mean_token_accuracy": 0.7385434865951538, + "step": 2205 + }, + { + "epoch": 0.4064741585433143, + "grad_norm": 1.0106650660729533, + "learning_rate": 3.609615094525235e-05, + "loss": 0.952, + "mean_token_accuracy": 0.7243346452713013, + "step": 2210 + }, + { + "epoch": 0.40739378333639875, + "grad_norm": 0.9301771755028939, + "learning_rate": 3.6032874415765344e-05, + "loss": 0.8633, + "mean_token_accuracy": 0.7481309175491333, + "step": 2215 + }, + { + "epoch": 0.4083134081294832, + "grad_norm": 0.9662316400458029, + "learning_rate": 3.596951896499813e-05, + "loss": 0.8931, + "mean_token_accuracy": 0.7380975484848022, + "step": 2220 + }, + { + "epoch": 0.4092330329225676, + "grad_norm": 0.9612362754674141, + "learning_rate": 3.590608517893065e-05, + "loss": 0.8787, + "mean_token_accuracy": 0.743196439743042, + "step": 2225 + }, + { + "epoch": 0.410152657715652, + "grad_norm": 0.9923328807528666, + "learning_rate": 3.584257364426738e-05, + "loss": 0.942, + "mean_token_accuracy": 0.7252677202224731, + "step": 2230 + }, + { + "epoch": 0.41107228250873645, + "grad_norm": 0.9797715702136052, + "learning_rate": 3.577898494843191e-05, + "loss": 0.9523, + "mean_token_accuracy": 0.7244603157043457, + "step": 2235 + }, + { + "epoch": 0.41199190730182084, + "grad_norm": 0.9048445218025765, + "learning_rate": 3.571531967956147e-05, + "loss": 0.9136, + "mean_token_accuracy": 0.7320458292961121, + "step": 2240 + }, + { + "epoch": 0.4129115320949053, + "grad_norm": 0.9649058945655278, + "learning_rate": 3.565157842650154e-05, + "loss": 0.9041, + "mean_token_accuracy": 0.7362257719039917, + "step": 2245 + }, + { + "epoch": 0.4138311568879897, + "grad_norm": 0.9147474250541198, + "learning_rate": 3.55877617788004e-05, + "loss": 0.9155, + "mean_token_accuracy": 0.7333362221717834, + "step": 2250 + }, + { + "epoch": 0.4147507816810741, + "grad_norm": 0.876619458906422, + "learning_rate": 3.5523870326703635e-05, + "loss": 0.8492, + "mean_token_accuracy": 0.7528911828994751, + "step": 2255 + }, + { + "epoch": 0.41567040647415854, + "grad_norm": 1.0036194468259731, + "learning_rate": 3.545990466114871e-05, + "loss": 0.9137, + "mean_token_accuracy": 0.734946858882904, + "step": 2260 + }, + { + "epoch": 0.416590031267243, + "grad_norm": 0.9978348158615458, + "learning_rate": 3.5395865373759504e-05, + "loss": 0.8815, + "mean_token_accuracy": 0.742937445640564, + "step": 2265 + }, + { + "epoch": 0.41750965606032736, + "grad_norm": 0.9799485166888982, + "learning_rate": 3.533175305684081e-05, + "loss": 0.8857, + "mean_token_accuracy": 0.7412702798843384, + "step": 2270 + }, + { + "epoch": 0.4184292808534118, + "grad_norm": 0.9766101000667111, + "learning_rate": 3.5267568303372914e-05, + "loss": 0.8934, + "mean_token_accuracy": 0.7409379720687866, + "step": 2275 + }, + { + "epoch": 0.41934890564649624, + "grad_norm": 0.9775807722195559, + "learning_rate": 3.520331170700605e-05, + "loss": 0.9067, + "mean_token_accuracy": 0.7377767205238343, + "step": 2280 + }, + { + "epoch": 0.4202685304395807, + "grad_norm": 0.9690742278243399, + "learning_rate": 3.513898386205491e-05, + "loss": 0.9032, + "mean_token_accuracy": 0.7356434345245362, + "step": 2285 + }, + { + "epoch": 0.42118815523266506, + "grad_norm": 0.965511424805927, + "learning_rate": 3.507458536349323e-05, + "loss": 0.9157, + "mean_token_accuracy": 0.7343951106071472, + "step": 2290 + }, + { + "epoch": 0.4221077800257495, + "grad_norm": 0.9486968791577164, + "learning_rate": 3.5010116806948166e-05, + "loss": 0.901, + "mean_token_accuracy": 0.7399522423744201, + "step": 2295 + }, + { + "epoch": 0.42302740481883394, + "grad_norm": 0.9414293890579761, + "learning_rate": 3.4945578788694894e-05, + "loss": 0.9179, + "mean_token_accuracy": 0.7342228889465332, + "step": 2300 + }, + { + "epoch": 0.4239470296119183, + "grad_norm": 0.9896377940060639, + "learning_rate": 3.4880971905651016e-05, + "loss": 0.8784, + "mean_token_accuracy": 0.7457787752151489, + "step": 2305 + }, + { + "epoch": 0.42486665440500276, + "grad_norm": 0.9655527131977069, + "learning_rate": 3.481629675537108e-05, + "loss": 0.863, + "mean_token_accuracy": 0.7453173756599426, + "step": 2310 + }, + { + "epoch": 0.4257862791980872, + "grad_norm": 0.8936296988219236, + "learning_rate": 3.475155393604104e-05, + "loss": 0.8856, + "mean_token_accuracy": 0.7441475629806519, + "step": 2315 + }, + { + "epoch": 0.4267059039911716, + "grad_norm": 0.9149916486904485, + "learning_rate": 3.468674404647273e-05, + "loss": 0.8532, + "mean_token_accuracy": 0.7507219910621643, + "step": 2320 + }, + { + "epoch": 0.427625528784256, + "grad_norm": 0.9750792604803812, + "learning_rate": 3.462186768609834e-05, + "loss": 0.863, + "mean_token_accuracy": 0.7469933509826661, + "step": 2325 + }, + { + "epoch": 0.42854515357734047, + "grad_norm": 0.980901247745682, + "learning_rate": 3.455692545496483e-05, + "loss": 0.837, + "mean_token_accuracy": 0.7545093297958374, + "step": 2330 + }, + { + "epoch": 0.42946477837042485, + "grad_norm": 0.9686839306544004, + "learning_rate": 3.4491917953728396e-05, + "loss": 0.8885, + "mean_token_accuracy": 0.7428396463394165, + "step": 2335 + }, + { + "epoch": 0.4303844031635093, + "grad_norm": 0.9388350160272184, + "learning_rate": 3.442684578364897e-05, + "loss": 0.8951, + "mean_token_accuracy": 0.7408537268638611, + "step": 2340 + }, + { + "epoch": 0.4313040279565937, + "grad_norm": 0.8933385447401438, + "learning_rate": 3.4361709546584545e-05, + "loss": 0.8689, + "mean_token_accuracy": 0.7458449006080627, + "step": 2345 + }, + { + "epoch": 0.4322236527496781, + "grad_norm": 0.9411177313363235, + "learning_rate": 3.429650984498573e-05, + "loss": 0.8417, + "mean_token_accuracy": 0.7528134107589721, + "step": 2350 + }, + { + "epoch": 0.43314327754276255, + "grad_norm": 0.9359109119006161, + "learning_rate": 3.423124728189009e-05, + "loss": 0.8737, + "mean_token_accuracy": 0.7434362411499024, + "step": 2355 + }, + { + "epoch": 0.434062902335847, + "grad_norm": 0.966957214742338, + "learning_rate": 3.4165922460916635e-05, + "loss": 0.8946, + "mean_token_accuracy": 0.7397825956344605, + "step": 2360 + }, + { + "epoch": 0.4349825271289314, + "grad_norm": 0.9950941777576424, + "learning_rate": 3.410053598626016e-05, + "loss": 0.8833, + "mean_token_accuracy": 0.7447291493415833, + "step": 2365 + }, + { + "epoch": 0.4359021519220158, + "grad_norm": 0.963560335329199, + "learning_rate": 3.403508846268574e-05, + "loss": 0.8675, + "mean_token_accuracy": 0.7479366779327392, + "step": 2370 + }, + { + "epoch": 0.43682177671510025, + "grad_norm": 0.9286384422364868, + "learning_rate": 3.396958049552307e-05, + "loss": 0.9171, + "mean_token_accuracy": 0.7304298520088196, + "step": 2375 + }, + { + "epoch": 0.43774140150818464, + "grad_norm": 0.9750119805406471, + "learning_rate": 3.39040126906609e-05, + "loss": 0.8858, + "mean_token_accuracy": 0.742851734161377, + "step": 2380 + }, + { + "epoch": 0.4386610263012691, + "grad_norm": 0.9160809046368507, + "learning_rate": 3.383838565454144e-05, + "loss": 0.9062, + "mean_token_accuracy": 0.7335192441940308, + "step": 2385 + }, + { + "epoch": 0.4395806510943535, + "grad_norm": 0.9668435486381742, + "learning_rate": 3.37726999941547e-05, + "loss": 0.9243, + "mean_token_accuracy": 0.7276196122169495, + "step": 2390 + }, + { + "epoch": 0.4405002758874379, + "grad_norm": 0.9935097247563913, + "learning_rate": 3.3706956317032954e-05, + "loss": 0.8678, + "mean_token_accuracy": 0.7438644409179688, + "step": 2395 + }, + { + "epoch": 0.44141990068052234, + "grad_norm": 0.9939894791042586, + "learning_rate": 3.364115523124503e-05, + "loss": 0.8904, + "mean_token_accuracy": 0.7412869215011597, + "step": 2400 + }, + { + "epoch": 0.4423395254736068, + "grad_norm": 0.9937645932689831, + "learning_rate": 3.357529734539079e-05, + "loss": 0.8455, + "mean_token_accuracy": 0.7517339706420898, + "step": 2405 + }, + { + "epoch": 0.4432591502666912, + "grad_norm": 0.9375114941684974, + "learning_rate": 3.350938326859539e-05, + "loss": 0.8468, + "mean_token_accuracy": 0.7528372883796692, + "step": 2410 + }, + { + "epoch": 0.4441787750597756, + "grad_norm": 0.8973960962242926, + "learning_rate": 3.3443413610503735e-05, + "loss": 0.878, + "mean_token_accuracy": 0.7442919254302979, + "step": 2415 + }, + { + "epoch": 0.44509839985286004, + "grad_norm": 1.0080330285869648, + "learning_rate": 3.337738898127479e-05, + "loss": 0.8785, + "mean_token_accuracy": 0.7428927779197693, + "step": 2420 + }, + { + "epoch": 0.4460180246459445, + "grad_norm": 0.8985281228115014, + "learning_rate": 3.331130999157597e-05, + "loss": 0.8644, + "mean_token_accuracy": 0.7480224132537842, + "step": 2425 + }, + { + "epoch": 0.44693764943902886, + "grad_norm": 0.9291069202904676, + "learning_rate": 3.3245177252577454e-05, + "loss": 0.8976, + "mean_token_accuracy": 0.7383280873298645, + "step": 2430 + }, + { + "epoch": 0.4478572742321133, + "grad_norm": 0.9623008963786942, + "learning_rate": 3.317899137594656e-05, + "loss": 0.9593, + "mean_token_accuracy": 0.7246118664741517, + "step": 2435 + }, + { + "epoch": 0.44877689902519774, + "grad_norm": 0.9234507163948065, + "learning_rate": 3.311275297384208e-05, + "loss": 0.8413, + "mean_token_accuracy": 0.7528854846954346, + "step": 2440 + }, + { + "epoch": 0.4496965238182821, + "grad_norm": 0.979267043456503, + "learning_rate": 3.3046462658908636e-05, + "loss": 0.845, + "mean_token_accuracy": 0.7532721877098083, + "step": 2445 + }, + { + "epoch": 0.45061614861136656, + "grad_norm": 0.9032231134895651, + "learning_rate": 3.298012104427097e-05, + "loss": 0.895, + "mean_token_accuracy": 0.7396630644798279, + "step": 2450 + }, + { + "epoch": 0.451535773404451, + "grad_norm": 0.9383158653652773, + "learning_rate": 3.291372874352832e-05, + "loss": 0.8943, + "mean_token_accuracy": 0.73899405002594, + "step": 2455 + }, + { + "epoch": 0.4524553981975354, + "grad_norm": 0.9664126873169693, + "learning_rate": 3.284728637074869e-05, + "loss": 0.869, + "mean_token_accuracy": 0.746407687664032, + "step": 2460 + }, + { + "epoch": 0.4533750229906198, + "grad_norm": 0.993853088939543, + "learning_rate": 3.278079454046325e-05, + "loss": 0.9011, + "mean_token_accuracy": 0.7388368129730225, + "step": 2465 + }, + { + "epoch": 0.45429464778370426, + "grad_norm": 0.8741206209918251, + "learning_rate": 3.271425386766058e-05, + "loss": 0.8388, + "mean_token_accuracy": 0.7533232569694519, + "step": 2470 + }, + { + "epoch": 0.45521427257678865, + "grad_norm": 0.9447835076472045, + "learning_rate": 3.2647664967781035e-05, + "loss": 0.8228, + "mean_token_accuracy": 0.7583665132522583, + "step": 2475 + }, + { + "epoch": 0.4561338973698731, + "grad_norm": 1.0045001891415821, + "learning_rate": 3.258102845671097e-05, + "loss": 0.8934, + "mean_token_accuracy": 0.7414227366447449, + "step": 2480 + }, + { + "epoch": 0.4570535221629575, + "grad_norm": 0.9475063098055461, + "learning_rate": 3.251434495077716e-05, + "loss": 0.9182, + "mean_token_accuracy": 0.7303388476371765, + "step": 2485 + }, + { + "epoch": 0.4579731469560419, + "grad_norm": 0.9775463234456495, + "learning_rate": 3.2447615066741004e-05, + "loss": 0.9361, + "mean_token_accuracy": 0.7293364763259887, + "step": 2490 + }, + { + "epoch": 0.45889277174912635, + "grad_norm": 0.9174334893241889, + "learning_rate": 3.238083942179288e-05, + "loss": 0.8474, + "mean_token_accuracy": 0.7529029250144958, + "step": 2495 + }, + { + "epoch": 0.4598123965422108, + "grad_norm": 0.9021239390235616, + "learning_rate": 3.2314018633546375e-05, + "loss": 0.8314, + "mean_token_accuracy": 0.7585980296134949, + "step": 2500 + }, + { + "epoch": 0.46073202133529517, + "grad_norm": 0.9231622515184421, + "learning_rate": 3.224715332003265e-05, + "loss": 0.8498, + "mean_token_accuracy": 0.7502579808235168, + "step": 2505 + }, + { + "epoch": 0.4616516461283796, + "grad_norm": 0.9279166556927757, + "learning_rate": 3.218024409969468e-05, + "loss": 0.899, + "mean_token_accuracy": 0.7380064010620118, + "step": 2510 + }, + { + "epoch": 0.46257127092146405, + "grad_norm": 0.9333611856920211, + "learning_rate": 3.2113291591381516e-05, + "loss": 0.9113, + "mean_token_accuracy": 0.7354224920272827, + "step": 2515 + }, + { + "epoch": 0.4634908957145485, + "grad_norm": 0.9585859302538061, + "learning_rate": 3.204629641434259e-05, + "loss": 0.912, + "mean_token_accuracy": 0.7332522869110107, + "step": 2520 + }, + { + "epoch": 0.4644105205076329, + "grad_norm": 1.0072945032594127, + "learning_rate": 3.197925918822199e-05, + "loss": 0.8615, + "mean_token_accuracy": 0.7460902214050293, + "step": 2525 + }, + { + "epoch": 0.4653301453007173, + "grad_norm": 0.9703474311506037, + "learning_rate": 3.1912180533052716e-05, + "loss": 0.9391, + "mean_token_accuracy": 0.7272826433181763, + "step": 2530 + }, + { + "epoch": 0.46624977009380175, + "grad_norm": 0.9701812144923739, + "learning_rate": 3.184506106925094e-05, + "loss": 0.8677, + "mean_token_accuracy": 0.747051191329956, + "step": 2535 + }, + { + "epoch": 0.46716939488688614, + "grad_norm": 0.9672451609696705, + "learning_rate": 3.177790141761029e-05, + "loss": 0.8627, + "mean_token_accuracy": 0.7482078075408936, + "step": 2540 + }, + { + "epoch": 0.4680890196799706, + "grad_norm": 0.9530973638849749, + "learning_rate": 3.1710702199296085e-05, + "loss": 0.8492, + "mean_token_accuracy": 0.7528972029685974, + "step": 2545 + }, + { + "epoch": 0.469008644473055, + "grad_norm": 0.9084239076489461, + "learning_rate": 3.16434640358396e-05, + "loss": 0.8653, + "mean_token_accuracy": 0.746622622013092, + "step": 2550 + }, + { + "epoch": 0.4699282692661394, + "grad_norm": 0.9998420571855022, + "learning_rate": 3.157618754913233e-05, + "loss": 0.8975, + "mean_token_accuracy": 0.738722312450409, + "step": 2555 + }, + { + "epoch": 0.47084789405922384, + "grad_norm": 0.9250250902872688, + "learning_rate": 3.15088733614202e-05, + "loss": 0.8551, + "mean_token_accuracy": 0.750208032131195, + "step": 2560 + }, + { + "epoch": 0.4717675188523083, + "grad_norm": 1.0106796436372896, + "learning_rate": 3.144152209529786e-05, + "loss": 0.9079, + "mean_token_accuracy": 0.7350385189056396, + "step": 2565 + }, + { + "epoch": 0.47268714364539266, + "grad_norm": 0.9619558970415346, + "learning_rate": 3.137413437370289e-05, + "loss": 0.91, + "mean_token_accuracy": 0.7369326472282409, + "step": 2570 + }, + { + "epoch": 0.4736067684384771, + "grad_norm": 1.0109885841238913, + "learning_rate": 3.130671081991005e-05, + "loss": 0.9084, + "mean_token_accuracy": 0.7353306174278259, + "step": 2575 + }, + { + "epoch": 0.47452639323156154, + "grad_norm": 0.9779190292756188, + "learning_rate": 3.123925205752552e-05, + "loss": 0.8556, + "mean_token_accuracy": 0.7515247583389282, + "step": 2580 + }, + { + "epoch": 0.4754460180246459, + "grad_norm": 0.9645840220644, + "learning_rate": 3.1171758710481096e-05, + "loss": 0.8755, + "mean_token_accuracy": 0.7436783194541932, + "step": 2585 + }, + { + "epoch": 0.47636564281773036, + "grad_norm": 1.001058541812525, + "learning_rate": 3.110423140302852e-05, + "loss": 0.9096, + "mean_token_accuracy": 0.7341774582862854, + "step": 2590 + }, + { + "epoch": 0.4772852676108148, + "grad_norm": 0.8974468409856537, + "learning_rate": 3.103667075973356e-05, + "loss": 0.9083, + "mean_token_accuracy": 0.7359666705131531, + "step": 2595 + }, + { + "epoch": 0.4782048924038992, + "grad_norm": 1.0374371477545201, + "learning_rate": 3.096907740547036e-05, + "loss": 0.9111, + "mean_token_accuracy": 0.7324892163276673, + "step": 2600 + }, + { + "epoch": 0.4791245171969836, + "grad_norm": 0.9405864234939062, + "learning_rate": 3.0901451965415595e-05, + "loss": 0.812, + "mean_token_accuracy": 0.7602822542190552, + "step": 2605 + }, + { + "epoch": 0.48004414199006806, + "grad_norm": 0.9654353230874346, + "learning_rate": 3.08337950650427e-05, + "loss": 0.8978, + "mean_token_accuracy": 0.7364333510398865, + "step": 2610 + }, + { + "epoch": 0.48096376678315245, + "grad_norm": 1.0011041381512356, + "learning_rate": 3.076610733011609e-05, + "loss": 0.9049, + "mean_token_accuracy": 0.7363562822341919, + "step": 2615 + }, + { + "epoch": 0.4818833915762369, + "grad_norm": 0.9686831090055986, + "learning_rate": 3.069838938668538e-05, + "loss": 0.8898, + "mean_token_accuracy": 0.7398189902305603, + "step": 2620 + }, + { + "epoch": 0.4828030163693213, + "grad_norm": 0.9318085356157495, + "learning_rate": 3.063064186107957e-05, + "loss": 0.8791, + "mean_token_accuracy": 0.7449330806732177, + "step": 2625 + }, + { + "epoch": 0.48372264116240576, + "grad_norm": 0.8934228857530689, + "learning_rate": 3.056286537990129e-05, + "loss": 0.8632, + "mean_token_accuracy": 0.7459052681922913, + "step": 2630 + }, + { + "epoch": 0.48464226595549015, + "grad_norm": 0.9725972260652284, + "learning_rate": 3.049506057002098e-05, + "loss": 0.8541, + "mean_token_accuracy": 0.7478031516075134, + "step": 2635 + }, + { + "epoch": 0.4855618907485746, + "grad_norm": 0.9452628770649284, + "learning_rate": 3.042722805857106e-05, + "loss": 0.8555, + "mean_token_accuracy": 0.746888279914856, + "step": 2640 + }, + { + "epoch": 0.486481515541659, + "grad_norm": 0.8806175124503305, + "learning_rate": 3.0359368472940208e-05, + "loss": 0.9035, + "mean_token_accuracy": 0.7369076132774353, + "step": 2645 + }, + { + "epoch": 0.4874011403347434, + "grad_norm": 0.8988265278259941, + "learning_rate": 3.029148244076749e-05, + "loss": 0.8643, + "mean_token_accuracy": 0.7449605345726014, + "step": 2650 + }, + { + "epoch": 0.48832076512782785, + "grad_norm": 0.9176861265880045, + "learning_rate": 3.022357058993657e-05, + "loss": 0.8643, + "mean_token_accuracy": 0.7462789297103882, + "step": 2655 + }, + { + "epoch": 0.4892403899209123, + "grad_norm": 0.9232400004776917, + "learning_rate": 3.0155633548569955e-05, + "loss": 0.903, + "mean_token_accuracy": 0.7353234887123108, + "step": 2660 + }, + { + "epoch": 0.4901600147139967, + "grad_norm": 0.9476269194909095, + "learning_rate": 3.008767194502309e-05, + "loss": 0.9035, + "mean_token_accuracy": 0.7386479258537293, + "step": 2665 + }, + { + "epoch": 0.4910796395070811, + "grad_norm": 0.931067111141978, + "learning_rate": 3.0019686407878617e-05, + "loss": 0.8883, + "mean_token_accuracy": 0.7414939045906067, + "step": 2670 + }, + { + "epoch": 0.49199926430016555, + "grad_norm": 0.9153445295986272, + "learning_rate": 2.995167756594055e-05, + "loss": 0.8625, + "mean_token_accuracy": 0.7501867294311524, + "step": 2675 + }, + { + "epoch": 0.49291888909324993, + "grad_norm": 0.9210143810764434, + "learning_rate": 2.988364604822845e-05, + "loss": 0.8972, + "mean_token_accuracy": 0.7386625647544861, + "step": 2680 + }, + { + "epoch": 0.4938385138863344, + "grad_norm": 0.9925053868796728, + "learning_rate": 2.9815592483971584e-05, + "loss": 0.8458, + "mean_token_accuracy": 0.751643443107605, + "step": 2685 + }, + { + "epoch": 0.4947581386794188, + "grad_norm": 1.006336852347141, + "learning_rate": 2.9747517502603167e-05, + "loss": 0.8721, + "mean_token_accuracy": 0.7480525851249695, + "step": 2690 + }, + { + "epoch": 0.4956777634725032, + "grad_norm": 0.9701598502406181, + "learning_rate": 2.967942173375447e-05, + "loss": 0.8818, + "mean_token_accuracy": 0.740173089504242, + "step": 2695 + }, + { + "epoch": 0.49659738826558764, + "grad_norm": 0.9431128523024928, + "learning_rate": 2.9611305807249052e-05, + "loss": 0.8344, + "mean_token_accuracy": 0.7551051139831543, + "step": 2700 + }, + { + "epoch": 0.4975170130586721, + "grad_norm": 0.9346714282194056, + "learning_rate": 2.95431703530969e-05, + "loss": 0.835, + "mean_token_accuracy": 0.7544684171676636, + "step": 2705 + }, + { + "epoch": 0.49843663785175646, + "grad_norm": 0.9358393411052466, + "learning_rate": 2.9475016001488608e-05, + "loss": 0.8906, + "mean_token_accuracy": 0.7427068829536438, + "step": 2710 + }, + { + "epoch": 0.4993562626448409, + "grad_norm": 0.8867163340537708, + "learning_rate": 2.9406843382789583e-05, + "loss": 0.8719, + "mean_token_accuracy": 0.745942211151123, + "step": 2715 + }, + { + "epoch": 0.5002758874379253, + "grad_norm": 0.9212664551640851, + "learning_rate": 2.9338653127534148e-05, + "loss": 0.8562, + "mean_token_accuracy": 0.7497703909873963, + "step": 2720 + }, + { + "epoch": 0.5011955122310098, + "grad_norm": 0.9432905808331339, + "learning_rate": 2.9270445866419766e-05, + "loss": 0.8741, + "mean_token_accuracy": 0.7432116866111755, + "step": 2725 + }, + { + "epoch": 0.5021151370240942, + "grad_norm": 0.9512906709412812, + "learning_rate": 2.92022222303012e-05, + "loss": 0.8818, + "mean_token_accuracy": 0.7435823440551758, + "step": 2730 + }, + { + "epoch": 0.5030347618171785, + "grad_norm": 0.9468765725989278, + "learning_rate": 2.9133982850184645e-05, + "loss": 0.8627, + "mean_token_accuracy": 0.748947024345398, + "step": 2735 + }, + { + "epoch": 0.503954386610263, + "grad_norm": 1.0112504748902342, + "learning_rate": 2.9065728357221927e-05, + "loss": 0.8508, + "mean_token_accuracy": 0.7537087440490723, + "step": 2740 + }, + { + "epoch": 0.5048740114033474, + "grad_norm": 0.9649262010355393, + "learning_rate": 2.899745938270465e-05, + "loss": 0.8819, + "mean_token_accuracy": 0.7414289236068725, + "step": 2745 + }, + { + "epoch": 0.5057936361964318, + "grad_norm": 0.9373961423715033, + "learning_rate": 2.8929176558058352e-05, + "loss": 0.8876, + "mean_token_accuracy": 0.741254198551178, + "step": 2750 + }, + { + "epoch": 0.5067132609895163, + "grad_norm": 0.9616567239953456, + "learning_rate": 2.8860880514836687e-05, + "loss": 0.8826, + "mean_token_accuracy": 0.7436172485351562, + "step": 2755 + }, + { + "epoch": 0.5076328857826007, + "grad_norm": 0.9367792403626876, + "learning_rate": 2.8792571884715546e-05, + "loss": 0.8482, + "mean_token_accuracy": 0.7529447674751282, + "step": 2760 + }, + { + "epoch": 0.5085525105756851, + "grad_norm": 0.9104599971108884, + "learning_rate": 2.8724251299487263e-05, + "loss": 0.8753, + "mean_token_accuracy": 0.7427584528923035, + "step": 2765 + }, + { + "epoch": 0.5094721353687696, + "grad_norm": 1.0105096627504964, + "learning_rate": 2.8655919391054732e-05, + "loss": 0.8641, + "mean_token_accuracy": 0.7479874610900878, + "step": 2770 + }, + { + "epoch": 0.510391760161854, + "grad_norm": 0.9279979512504474, + "learning_rate": 2.8587576791425568e-05, + "loss": 0.8317, + "mean_token_accuracy": 0.7535252571105957, + "step": 2775 + }, + { + "epoch": 0.5113113849549383, + "grad_norm": 0.9297465828114925, + "learning_rate": 2.8519224132706297e-05, + "loss": 0.8774, + "mean_token_accuracy": 0.7402622103691101, + "step": 2780 + }, + { + "epoch": 0.5122310097480228, + "grad_norm": 0.9452271860575534, + "learning_rate": 2.845086204709645e-05, + "loss": 0.8771, + "mean_token_accuracy": 0.744519031047821, + "step": 2785 + }, + { + "epoch": 0.5131506345411072, + "grad_norm": 0.9830981203343458, + "learning_rate": 2.838249116688277e-05, + "loss": 0.9289, + "mean_token_accuracy": 0.7298115253448486, + "step": 2790 + }, + { + "epoch": 0.5140702593341917, + "grad_norm": 1.041430018260559, + "learning_rate": 2.8314112124433334e-05, + "loss": 0.9045, + "mean_token_accuracy": 0.7383831977844239, + "step": 2795 + }, + { + "epoch": 0.5149898841272761, + "grad_norm": 0.9620402098071436, + "learning_rate": 2.8245725552191703e-05, + "loss": 0.8634, + "mean_token_accuracy": 0.746962821483612, + "step": 2800 + }, + { + "epoch": 0.5159095089203605, + "grad_norm": 0.9015921123510985, + "learning_rate": 2.8177332082671117e-05, + "loss": 0.853, + "mean_token_accuracy": 0.7487654685974121, + "step": 2805 + }, + { + "epoch": 0.516829133713445, + "grad_norm": 0.9007228615494444, + "learning_rate": 2.8108932348448553e-05, + "loss": 0.8428, + "mean_token_accuracy": 0.7535581469535828, + "step": 2810 + }, + { + "epoch": 0.5177487585065293, + "grad_norm": 0.9827577309973088, + "learning_rate": 2.8040526982158993e-05, + "loss": 0.8789, + "mean_token_accuracy": 0.7432992815971374, + "step": 2815 + }, + { + "epoch": 0.5186683832996137, + "grad_norm": 0.9633925171762643, + "learning_rate": 2.7972116616489464e-05, + "loss": 0.8397, + "mean_token_accuracy": 0.752094304561615, + "step": 2820 + }, + { + "epoch": 0.5195880080926982, + "grad_norm": 0.9281148435495344, + "learning_rate": 2.790370188417324e-05, + "loss": 0.8596, + "mean_token_accuracy": 0.7485750317573547, + "step": 2825 + }, + { + "epoch": 0.5205076328857826, + "grad_norm": 1.0029136932204825, + "learning_rate": 2.7835283417984005e-05, + "loss": 0.8718, + "mean_token_accuracy": 0.7433583855628967, + "step": 2830 + }, + { + "epoch": 0.521427257678867, + "grad_norm": 0.9621263162970809, + "learning_rate": 2.7766861850729958e-05, + "loss": 0.8955, + "mean_token_accuracy": 0.7394774556159973, + "step": 2835 + }, + { + "epoch": 0.5223468824719515, + "grad_norm": 0.9670299071015823, + "learning_rate": 2.7698437815247995e-05, + "loss": 0.8529, + "mean_token_accuracy": 0.7500015497207642, + "step": 2840 + }, + { + "epoch": 0.5232665072650359, + "grad_norm": 0.9398184622397476, + "learning_rate": 2.763001194439782e-05, + "loss": 0.8447, + "mean_token_accuracy": 0.7504964828491211, + "step": 2845 + }, + { + "epoch": 0.5241861320581203, + "grad_norm": 0.8869891271688453, + "learning_rate": 2.756158487105613e-05, + "loss": 0.8404, + "mean_token_accuracy": 0.7549336075782775, + "step": 2850 + }, + { + "epoch": 0.5251057568512048, + "grad_norm": 0.9965820824716972, + "learning_rate": 2.749315722811073e-05, + "loss": 0.9179, + "mean_token_accuracy": 0.7317790746688843, + "step": 2855 + }, + { + "epoch": 0.5260253816442891, + "grad_norm": 0.9304946857092635, + "learning_rate": 2.7424729648454717e-05, + "loss": 0.8874, + "mean_token_accuracy": 0.7398088812828064, + "step": 2860 + }, + { + "epoch": 0.5269450064373735, + "grad_norm": 0.9880649590404676, + "learning_rate": 2.735630276498058e-05, + "loss": 0.8738, + "mean_token_accuracy": 0.7432942867279053, + "step": 2865 + }, + { + "epoch": 0.527864631230458, + "grad_norm": 0.9350070938993663, + "learning_rate": 2.728787721057437e-05, + "loss": 0.8758, + "mean_token_accuracy": 0.7431787729263306, + "step": 2870 + }, + { + "epoch": 0.5287842560235424, + "grad_norm": 0.8997664568286488, + "learning_rate": 2.7219453618109853e-05, + "loss": 0.842, + "mean_token_accuracy": 0.7523634552955627, + "step": 2875 + }, + { + "epoch": 0.5297038808166268, + "grad_norm": 0.9519585493296138, + "learning_rate": 2.715103262044265e-05, + "loss": 0.8744, + "mean_token_accuracy": 0.7417232871055603, + "step": 2880 + }, + { + "epoch": 0.5306235056097113, + "grad_norm": 0.8836119550117293, + "learning_rate": 2.708261485040439e-05, + "loss": 0.856, + "mean_token_accuracy": 0.7496297836303711, + "step": 2885 + }, + { + "epoch": 0.5315431304027957, + "grad_norm": 0.9589883589041829, + "learning_rate": 2.7014200940796824e-05, + "loss": 0.8418, + "mean_token_accuracy": 0.7520057439804078, + "step": 2890 + }, + { + "epoch": 0.53246275519588, + "grad_norm": 0.9563207815434712, + "learning_rate": 2.694579152438601e-05, + "loss": 0.8936, + "mean_token_accuracy": 0.7398610949516297, + "step": 2895 + }, + { + "epoch": 0.5333823799889645, + "grad_norm": 0.9233468769288075, + "learning_rate": 2.6877387233896472e-05, + "loss": 0.8634, + "mean_token_accuracy": 0.745741093158722, + "step": 2900 + }, + { + "epoch": 0.5343020047820489, + "grad_norm": 0.9541286928919233, + "learning_rate": 2.6808988702005285e-05, + "loss": 0.868, + "mean_token_accuracy": 0.7439489006996155, + "step": 2905 + }, + { + "epoch": 0.5352216295751333, + "grad_norm": 0.9922987370495847, + "learning_rate": 2.6740596561336275e-05, + "loss": 0.8482, + "mean_token_accuracy": 0.7504428863525391, + "step": 2910 + }, + { + "epoch": 0.5361412543682178, + "grad_norm": 0.9722831543231532, + "learning_rate": 2.667221144445418e-05, + "loss": 0.8177, + "mean_token_accuracy": 0.7608316302299499, + "step": 2915 + }, + { + "epoch": 0.5370608791613022, + "grad_norm": 1.0275441684092577, + "learning_rate": 2.6603833983858738e-05, + "loss": 0.9398, + "mean_token_accuracy": 0.7276052117347718, + "step": 2920 + }, + { + "epoch": 0.5379805039543866, + "grad_norm": 1.0068511170391965, + "learning_rate": 2.6535464811978894e-05, + "loss": 0.8424, + "mean_token_accuracy": 0.7531503081321717, + "step": 2925 + }, + { + "epoch": 0.5389001287474711, + "grad_norm": 0.9554905959505885, + "learning_rate": 2.6467104561166927e-05, + "loss": 0.8671, + "mean_token_accuracy": 0.7456499934196472, + "step": 2930 + }, + { + "epoch": 0.5398197535405554, + "grad_norm": 0.9318421761107843, + "learning_rate": 2.639875386369261e-05, + "loss": 0.8674, + "mean_token_accuracy": 0.7474814653396606, + "step": 2935 + }, + { + "epoch": 0.5407393783336398, + "grad_norm": 0.9797586514540253, + "learning_rate": 2.6330413351737336e-05, + "loss": 0.893, + "mean_token_accuracy": 0.7371798276901245, + "step": 2940 + }, + { + "epoch": 0.5416590031267243, + "grad_norm": 0.9627863342351398, + "learning_rate": 2.626208365738831e-05, + "loss": 0.8662, + "mean_token_accuracy": 0.7450501322746277, + "step": 2945 + }, + { + "epoch": 0.5425786279198087, + "grad_norm": 0.9378560834404903, + "learning_rate": 2.6193765412632677e-05, + "loss": 0.8427, + "mean_token_accuracy": 0.750009298324585, + "step": 2950 + }, + { + "epoch": 0.5434982527128931, + "grad_norm": 0.9349477883280783, + "learning_rate": 2.6125459249351697e-05, + "loss": 0.8908, + "mean_token_accuracy": 0.7386453747749329, + "step": 2955 + }, + { + "epoch": 0.5444178775059776, + "grad_norm": 0.9298587181804499, + "learning_rate": 2.6057165799314854e-05, + "loss": 0.855, + "mean_token_accuracy": 0.7491998553276062, + "step": 2960 + }, + { + "epoch": 0.545337502299062, + "grad_norm": 0.9026144571758381, + "learning_rate": 2.5988885694174085e-05, + "loss": 0.8786, + "mean_token_accuracy": 0.7437506198883057, + "step": 2965 + }, + { + "epoch": 0.5462571270921464, + "grad_norm": 0.9408107824152944, + "learning_rate": 2.5920619565457877e-05, + "loss": 0.8758, + "mean_token_accuracy": 0.7427832961082459, + "step": 2970 + }, + { + "epoch": 0.5471767518852308, + "grad_norm": 0.9195819021761746, + "learning_rate": 2.5852368044565452e-05, + "loss": 0.9277, + "mean_token_accuracy": 0.7323094010353088, + "step": 2975 + }, + { + "epoch": 0.5480963766783152, + "grad_norm": 0.9586681296133412, + "learning_rate": 2.5784131762760922e-05, + "loss": 0.8334, + "mean_token_accuracy": 0.7566598057746887, + "step": 2980 + }, + { + "epoch": 0.5490160014713996, + "grad_norm": 0.9092467816987784, + "learning_rate": 2.5715911351167465e-05, + "loss": 0.9014, + "mean_token_accuracy": 0.7390154361724853, + "step": 2985 + }, + { + "epoch": 0.5499356262644841, + "grad_norm": 0.966449128998816, + "learning_rate": 2.564770744076144e-05, + "loss": 0.8959, + "mean_token_accuracy": 0.7373208284378052, + "step": 2990 + }, + { + "epoch": 0.5508552510575685, + "grad_norm": 1.0269176653506933, + "learning_rate": 2.5579520662366618e-05, + "loss": 0.8626, + "mean_token_accuracy": 0.7471036791801453, + "step": 2995 + }, + { + "epoch": 0.5517748758506529, + "grad_norm": 0.9705454615801481, + "learning_rate": 2.5511351646648324e-05, + "loss": 0.8761, + "mean_token_accuracy": 0.7408113241195678, + "step": 3000 + }, + { + "epoch": 0.5526945006437374, + "grad_norm": 0.9683019669667483, + "learning_rate": 2.5443201024107537e-05, + "loss": 0.8974, + "mean_token_accuracy": 0.7345914959907531, + "step": 3005 + }, + { + "epoch": 0.5536141254368218, + "grad_norm": 0.9328296833493311, + "learning_rate": 2.5375069425075176e-05, + "loss": 0.8629, + "mean_token_accuracy": 0.7468894720077515, + "step": 3010 + }, + { + "epoch": 0.5545337502299063, + "grad_norm": 0.9565417579373001, + "learning_rate": 2.5306957479706196e-05, + "loss": 0.8914, + "mean_token_accuracy": 0.7373947501182556, + "step": 3015 + }, + { + "epoch": 0.5554533750229906, + "grad_norm": 0.9439811181197841, + "learning_rate": 2.5238865817973735e-05, + "loss": 0.8264, + "mean_token_accuracy": 0.7566876411437988, + "step": 3020 + }, + { + "epoch": 0.556372999816075, + "grad_norm": 0.8918377804941932, + "learning_rate": 2.5170795069663374e-05, + "loss": 0.8384, + "mean_token_accuracy": 0.7532538652420044, + "step": 3025 + }, + { + "epoch": 0.5572926246091595, + "grad_norm": 0.9531681758263391, + "learning_rate": 2.510274586436725e-05, + "loss": 0.9137, + "mean_token_accuracy": 0.7336269617080688, + "step": 3030 + }, + { + "epoch": 0.5582122494022439, + "grad_norm": 0.9547809224031603, + "learning_rate": 2.5034718831478236e-05, + "loss": 0.8121, + "mean_token_accuracy": 0.7607084512710571, + "step": 3035 + }, + { + "epoch": 0.5591318741953283, + "grad_norm": 0.9101416039188879, + "learning_rate": 2.496671460018414e-05, + "loss": 0.8374, + "mean_token_accuracy": 0.7512237310409546, + "step": 3040 + }, + { + "epoch": 0.5600514989884128, + "grad_norm": 0.9591588974138807, + "learning_rate": 2.4898733799461866e-05, + "loss": 0.8691, + "mean_token_accuracy": 0.7475574612617493, + "step": 3045 + }, + { + "epoch": 0.5609711237814972, + "grad_norm": 0.9481182124754315, + "learning_rate": 2.4830777058071623e-05, + "loss": 0.8541, + "mean_token_accuracy": 0.7470650672912598, + "step": 3050 + }, + { + "epoch": 0.5618907485745815, + "grad_norm": 0.8991567391844545, + "learning_rate": 2.4762845004551077e-05, + "loss": 0.834, + "mean_token_accuracy": 0.7513617157936097, + "step": 3055 + }, + { + "epoch": 0.562810373367666, + "grad_norm": 0.8993594505060807, + "learning_rate": 2.4694938267209567e-05, + "loss": 0.8302, + "mean_token_accuracy": 0.7539983510971069, + "step": 3060 + }, + { + "epoch": 0.5637299981607504, + "grad_norm": 0.9212463554308379, + "learning_rate": 2.4627057474122273e-05, + "loss": 0.8598, + "mean_token_accuracy": 0.747953188419342, + "step": 3065 + }, + { + "epoch": 0.5646496229538348, + "grad_norm": 0.9155845020709076, + "learning_rate": 2.4559203253124407e-05, + "loss": 0.8728, + "mean_token_accuracy": 0.7440886616706848, + "step": 3070 + }, + { + "epoch": 0.5655692477469193, + "grad_norm": 0.9376543570110895, + "learning_rate": 2.4491376231805428e-05, + "loss": 0.8529, + "mean_token_accuracy": 0.7518376111984253, + "step": 3075 + }, + { + "epoch": 0.5664888725400037, + "grad_norm": 0.9720221730313491, + "learning_rate": 2.442357703750322e-05, + "loss": 0.8423, + "mean_token_accuracy": 0.7525236487388611, + "step": 3080 + }, + { + "epoch": 0.5674084973330881, + "grad_norm": 0.9013738631587733, + "learning_rate": 2.4355806297298296e-05, + "loss": 0.8422, + "mean_token_accuracy": 0.7528858304023742, + "step": 3085 + }, + { + "epoch": 0.5683281221261726, + "grad_norm": 0.9524358228393591, + "learning_rate": 2.4288064638007974e-05, + "loss": 0.8672, + "mean_token_accuracy": 0.7468002319335938, + "step": 3090 + }, + { + "epoch": 0.569247746919257, + "grad_norm": 0.9505409858129935, + "learning_rate": 2.4220352686180613e-05, + "loss": 0.8416, + "mean_token_accuracy": 0.7486450433731079, + "step": 3095 + }, + { + "epoch": 0.5701673717123413, + "grad_norm": 0.9615751645550065, + "learning_rate": 2.415267106808983e-05, + "loss": 0.803, + "mean_token_accuracy": 0.7603586912155151, + "step": 3100 + }, + { + "epoch": 0.5710869965054258, + "grad_norm": 0.9458073029155306, + "learning_rate": 2.4085020409728633e-05, + "loss": 0.8614, + "mean_token_accuracy": 0.7483598232269287, + "step": 3105 + }, + { + "epoch": 0.5720066212985102, + "grad_norm": 0.959427274017189, + "learning_rate": 2.4017401336803713e-05, + "loss": 0.8795, + "mean_token_accuracy": 0.7383235573768616, + "step": 3110 + }, + { + "epoch": 0.5729262460915946, + "grad_norm": 0.9688058239251538, + "learning_rate": 2.394981447472963e-05, + "loss": 0.8854, + "mean_token_accuracy": 0.7413538813591003, + "step": 3115 + }, + { + "epoch": 0.5738458708846791, + "grad_norm": 0.9543674760330169, + "learning_rate": 2.3882260448623002e-05, + "loss": 0.8924, + "mean_token_accuracy": 0.739243483543396, + "step": 3120 + }, + { + "epoch": 0.5747654956777635, + "grad_norm": 0.9565581088949338, + "learning_rate": 2.381473988329675e-05, + "loss": 0.8878, + "mean_token_accuracy": 0.737128746509552, + "step": 3125 + }, + { + "epoch": 0.5756851204708479, + "grad_norm": 0.9446263148140598, + "learning_rate": 2.374725340325433e-05, + "loss": 0.8771, + "mean_token_accuracy": 0.7424870610237122, + "step": 3130 + }, + { + "epoch": 0.5766047452639324, + "grad_norm": 0.9235345865848048, + "learning_rate": 2.3679801632683927e-05, + "loss": 0.8791, + "mean_token_accuracy": 0.7413055062294006, + "step": 3135 + }, + { + "epoch": 0.5775243700570167, + "grad_norm": 0.931358306977097, + "learning_rate": 2.3612385195452687e-05, + "loss": 0.8864, + "mean_token_accuracy": 0.7415070414543152, + "step": 3140 + }, + { + "epoch": 0.5784439948501011, + "grad_norm": 0.9366462545353926, + "learning_rate": 2.3545004715100966e-05, + "loss": 0.8791, + "mean_token_accuracy": 0.7428970575332642, + "step": 3145 + }, + { + "epoch": 0.5793636196431856, + "grad_norm": 0.9312216076414869, + "learning_rate": 2.3477660814836562e-05, + "loss": 0.8318, + "mean_token_accuracy": 0.7540540814399719, + "step": 3150 + }, + { + "epoch": 0.58028324443627, + "grad_norm": 0.9058432741408705, + "learning_rate": 2.3410354117528904e-05, + "loss": 0.9128, + "mean_token_accuracy": 0.7328131318092346, + "step": 3155 + }, + { + "epoch": 0.5812028692293544, + "grad_norm": 0.92693757568253, + "learning_rate": 2.3343085245703373e-05, + "loss": 0.8356, + "mean_token_accuracy": 0.754761004447937, + "step": 3160 + }, + { + "epoch": 0.5821224940224389, + "grad_norm": 0.9685552745916727, + "learning_rate": 2.3275854821535476e-05, + "loss": 0.8696, + "mean_token_accuracy": 0.7423434615135193, + "step": 3165 + }, + { + "epoch": 0.5830421188155233, + "grad_norm": 0.9530016316914325, + "learning_rate": 2.3208663466845108e-05, + "loss": 0.8239, + "mean_token_accuracy": 0.7581414461135865, + "step": 3170 + }, + { + "epoch": 0.5839617436086076, + "grad_norm": 0.9912981010776241, + "learning_rate": 2.3141511803090815e-05, + "loss": 0.8784, + "mean_token_accuracy": 0.743216586112976, + "step": 3175 + }, + { + "epoch": 0.5848813684016921, + "grad_norm": 0.8897494823501038, + "learning_rate": 2.3074400451364048e-05, + "loss": 0.8771, + "mean_token_accuracy": 0.7422731041908264, + "step": 3180 + }, + { + "epoch": 0.5858009931947765, + "grad_norm": 0.9087254524604537, + "learning_rate": 2.300733003238339e-05, + "loss": 0.8249, + "mean_token_accuracy": 0.75495365858078, + "step": 3185 + }, + { + "epoch": 0.5867206179878609, + "grad_norm": 0.9615326948623956, + "learning_rate": 2.2940301166488846e-05, + "loss": 0.7821, + "mean_token_accuracy": 0.7687617659568786, + "step": 3190 + }, + { + "epoch": 0.5876402427809454, + "grad_norm": 0.9239773147706558, + "learning_rate": 2.28733144736361e-05, + "loss": 0.8034, + "mean_token_accuracy": 0.7630661010742188, + "step": 3195 + }, + { + "epoch": 0.5885598675740298, + "grad_norm": 0.9271354944208791, + "learning_rate": 2.2806370573390745e-05, + "loss": 0.8377, + "mean_token_accuracy": 0.7517584562301636, + "step": 3200 + }, + { + "epoch": 0.5894794923671142, + "grad_norm": 0.9307261567222711, + "learning_rate": 2.2739470084922608e-05, + "loss": 0.9145, + "mean_token_accuracy": 0.7307730317115784, + "step": 3205 + }, + { + "epoch": 0.5903991171601987, + "grad_norm": 0.8708186634436479, + "learning_rate": 2.2672613626999994e-05, + "loss": 0.8495, + "mean_token_accuracy": 0.7486128211021423, + "step": 3210 + }, + { + "epoch": 0.591318741953283, + "grad_norm": 0.9473141853732495, + "learning_rate": 2.2605801817983958e-05, + "loss": 0.8341, + "mean_token_accuracy": 0.7518749475479126, + "step": 3215 + }, + { + "epoch": 0.5922383667463674, + "grad_norm": 0.9382593885727152, + "learning_rate": 2.253903527582259e-05, + "loss": 0.8447, + "mean_token_accuracy": 0.7506359577178955, + "step": 3220 + }, + { + "epoch": 0.5931579915394519, + "grad_norm": 0.9696123819996886, + "learning_rate": 2.247231461804532e-05, + "loss": 0.8266, + "mean_token_accuracy": 0.7562480688095092, + "step": 3225 + }, + { + "epoch": 0.5940776163325363, + "grad_norm": 0.8949351423802622, + "learning_rate": 2.2405640461757176e-05, + "loss": 0.814, + "mean_token_accuracy": 0.7592174887657166, + "step": 3230 + }, + { + "epoch": 0.5949972411256208, + "grad_norm": 0.9615311548799811, + "learning_rate": 2.2339013423633083e-05, + "loss": 0.8503, + "mean_token_accuracy": 0.7499252796173096, + "step": 3235 + }, + { + "epoch": 0.5959168659187052, + "grad_norm": 0.9086052926810453, + "learning_rate": 2.2272434119912184e-05, + "loss": 0.8754, + "mean_token_accuracy": 0.7434251546859741, + "step": 3240 + }, + { + "epoch": 0.5968364907117896, + "grad_norm": 0.9221742878259598, + "learning_rate": 2.2205903166392113e-05, + "loss": 0.8477, + "mean_token_accuracy": 0.7485897660255432, + "step": 3245 + }, + { + "epoch": 0.5977561155048741, + "grad_norm": 0.967041034869552, + "learning_rate": 2.2139421178423307e-05, + "loss": 0.8225, + "mean_token_accuracy": 0.7570245742797852, + "step": 3250 + }, + { + "epoch": 0.5986757402979584, + "grad_norm": 0.981067205830958, + "learning_rate": 2.207298877090333e-05, + "loss": 0.8701, + "mean_token_accuracy": 0.7440281748771668, + "step": 3255 + }, + { + "epoch": 0.5995953650910428, + "grad_norm": 0.989973298607582, + "learning_rate": 2.2006606558271142e-05, + "loss": 0.8713, + "mean_token_accuracy": 0.7413482785224914, + "step": 3260 + }, + { + "epoch": 0.6005149898841273, + "grad_norm": 0.8672144464089592, + "learning_rate": 2.1940275154501482e-05, + "loss": 0.87, + "mean_token_accuracy": 0.743138313293457, + "step": 3265 + }, + { + "epoch": 0.6014346146772117, + "grad_norm": 0.9653292378844739, + "learning_rate": 2.187399517309914e-05, + "loss": 0.8575, + "mean_token_accuracy": 0.7464121103286743, + "step": 3270 + }, + { + "epoch": 0.6023542394702961, + "grad_norm": 0.9239524199502155, + "learning_rate": 2.1807767227093268e-05, + "loss": 0.8236, + "mean_token_accuracy": 0.7573307991027832, + "step": 3275 + }, + { + "epoch": 0.6032738642633806, + "grad_norm": 0.9806975126747703, + "learning_rate": 2.1741591929031795e-05, + "loss": 0.878, + "mean_token_accuracy": 0.7407856106758117, + "step": 3280 + }, + { + "epoch": 0.604193489056465, + "grad_norm": 0.9640808408127749, + "learning_rate": 2.167546989097566e-05, + "loss": 0.8638, + "mean_token_accuracy": 0.7459958910942077, + "step": 3285 + }, + { + "epoch": 0.6051131138495494, + "grad_norm": 0.9656473527433518, + "learning_rate": 2.16094017244932e-05, + "loss": 0.8783, + "mean_token_accuracy": 0.7419638872146607, + "step": 3290 + }, + { + "epoch": 0.6060327386426339, + "grad_norm": 0.9930014003610543, + "learning_rate": 2.154338804065451e-05, + "loss": 0.8615, + "mean_token_accuracy": 0.7456332087516785, + "step": 3295 + }, + { + "epoch": 0.6069523634357182, + "grad_norm": 0.9330196848152268, + "learning_rate": 2.1477429450025767e-05, + "loss": 0.8352, + "mean_token_accuracy": 0.7517044901847839, + "step": 3300 + }, + { + "epoch": 0.6078719882288026, + "grad_norm": 0.8777553334567131, + "learning_rate": 2.1411526562663554e-05, + "loss": 0.8364, + "mean_token_accuracy": 0.7501665949821472, + "step": 3305 + }, + { + "epoch": 0.6087916130218871, + "grad_norm": 0.9315142599796349, + "learning_rate": 2.1345679988109284e-05, + "loss": 0.8378, + "mean_token_accuracy": 0.7534802198410034, + "step": 3310 + }, + { + "epoch": 0.6097112378149715, + "grad_norm": 0.9385962221597601, + "learning_rate": 2.1279890335383534e-05, + "loss": 0.8876, + "mean_token_accuracy": 0.7398653388023376, + "step": 3315 + }, + { + "epoch": 0.6106308626080559, + "grad_norm": 0.9451857651632474, + "learning_rate": 2.1214158212980366e-05, + "loss": 0.7988, + "mean_token_accuracy": 0.7636669516563416, + "step": 3320 + }, + { + "epoch": 0.6115504874011404, + "grad_norm": 0.9310680714278403, + "learning_rate": 2.114848422886177e-05, + "loss": 0.8417, + "mean_token_accuracy": 0.7545873999595643, + "step": 3325 + }, + { + "epoch": 0.6124701121942248, + "grad_norm": 0.9555284993925652, + "learning_rate": 2.108286899045202e-05, + "loss": 0.8906, + "mean_token_accuracy": 0.7384588122367859, + "step": 3330 + }, + { + "epoch": 0.6133897369873091, + "grad_norm": 0.9525478437560697, + "learning_rate": 2.1017313104632003e-05, + "loss": 0.844, + "mean_token_accuracy": 0.7497392654418945, + "step": 3335 + }, + { + "epoch": 0.6143093617803936, + "grad_norm": 0.9657934498214388, + "learning_rate": 2.0951817177733684e-05, + "loss": 0.8748, + "mean_token_accuracy": 0.7426393389701843, + "step": 3340 + }, + { + "epoch": 0.615228986573478, + "grad_norm": 0.9174407552166862, + "learning_rate": 2.088638181553446e-05, + "loss": 0.8727, + "mean_token_accuracy": 0.742801570892334, + "step": 3345 + }, + { + "epoch": 0.6161486113665624, + "grad_norm": 0.9106809477969502, + "learning_rate": 2.0821007623251564e-05, + "loss": 0.8227, + "mean_token_accuracy": 0.7550573825836182, + "step": 3350 + }, + { + "epoch": 0.6170682361596469, + "grad_norm": 0.8816231707997737, + "learning_rate": 2.075569520553643e-05, + "loss": 0.8066, + "mean_token_accuracy": 0.7590124368667602, + "step": 3355 + }, + { + "epoch": 0.6179878609527313, + "grad_norm": 0.9651791807712018, + "learning_rate": 2.0690445166469158e-05, + "loss": 0.8575, + "mean_token_accuracy": 0.7481630921363831, + "step": 3360 + }, + { + "epoch": 0.6189074857458157, + "grad_norm": 0.962161882798645, + "learning_rate": 2.0625258109552926e-05, + "loss": 0.8842, + "mean_token_accuracy": 0.743985378742218, + "step": 3365 + }, + { + "epoch": 0.6198271105389002, + "grad_norm": 0.955250281560398, + "learning_rate": 2.0560134637708334e-05, + "loss": 0.8413, + "mean_token_accuracy": 0.7497357606887818, + "step": 3370 + }, + { + "epoch": 0.6207467353319845, + "grad_norm": 1.0327175413319667, + "learning_rate": 2.0495075353267913e-05, + "loss": 0.8697, + "mean_token_accuracy": 0.7445659875869751, + "step": 3375 + }, + { + "epoch": 0.6216663601250689, + "grad_norm": 0.9525687098312168, + "learning_rate": 2.043008085797052e-05, + "loss": 0.8722, + "mean_token_accuracy": 0.7410041093826294, + "step": 3380 + }, + { + "epoch": 0.6225859849181534, + "grad_norm": 0.9275514977855014, + "learning_rate": 2.036515175295574e-05, + "loss": 0.8412, + "mean_token_accuracy": 0.7507887959480286, + "step": 3385 + }, + { + "epoch": 0.6235056097112378, + "grad_norm": 0.9493961658678648, + "learning_rate": 2.03002886387584e-05, + "loss": 0.8556, + "mean_token_accuracy": 0.7469261646270752, + "step": 3390 + }, + { + "epoch": 0.6244252345043222, + "grad_norm": 0.9292345545436532, + "learning_rate": 2.0235492115302944e-05, + "loss": 0.8301, + "mean_token_accuracy": 0.7550871014595032, + "step": 3395 + }, + { + "epoch": 0.6253448592974067, + "grad_norm": 0.9430411664378814, + "learning_rate": 2.017076278189794e-05, + "loss": 0.8321, + "mean_token_accuracy": 0.7533326983451843, + "step": 3400 + }, + { + "epoch": 0.6262644840904911, + "grad_norm": 0.8889521393845567, + "learning_rate": 2.0106101237230455e-05, + "loss": 0.8324, + "mean_token_accuracy": 0.7539088129997253, + "step": 3405 + }, + { + "epoch": 0.6271841088835755, + "grad_norm": 0.9180009901150891, + "learning_rate": 2.0041508079360634e-05, + "loss": 0.7898, + "mean_token_accuracy": 0.761493980884552, + "step": 3410 + }, + { + "epoch": 0.62810373367666, + "grad_norm": 0.9055995921329637, + "learning_rate": 1.997698390571608e-05, + "loss": 0.8419, + "mean_token_accuracy": 0.7503387928009033, + "step": 3415 + }, + { + "epoch": 0.6290233584697443, + "grad_norm": 0.9447591194939752, + "learning_rate": 1.991252931308633e-05, + "loss": 0.8692, + "mean_token_accuracy": 0.7452242970466614, + "step": 3420 + }, + { + "epoch": 0.6299429832628287, + "grad_norm": 0.9351426059072258, + "learning_rate": 1.9848144897617417e-05, + "loss": 0.8149, + "mean_token_accuracy": 0.7568124055862426, + "step": 3425 + }, + { + "epoch": 0.6308626080559132, + "grad_norm": 0.9168023134449134, + "learning_rate": 1.9783831254806257e-05, + "loss": 0.8157, + "mean_token_accuracy": 0.7554953694343567, + "step": 3430 + }, + { + "epoch": 0.6317822328489976, + "grad_norm": 1.027979530127791, + "learning_rate": 1.971958897949518e-05, + "loss": 0.8229, + "mean_token_accuracy": 0.7550533413887024, + "step": 3435 + }, + { + "epoch": 0.632701857642082, + "grad_norm": 0.8964633060914129, + "learning_rate": 1.9655418665866465e-05, + "loss": 0.7966, + "mean_token_accuracy": 0.7639833688735962, + "step": 3440 + }, + { + "epoch": 0.6336214824351665, + "grad_norm": 0.8702615238247585, + "learning_rate": 1.9591320907436782e-05, + "loss": 0.8502, + "mean_token_accuracy": 0.74614177942276, + "step": 3445 + }, + { + "epoch": 0.6345411072282509, + "grad_norm": 0.9157962896320851, + "learning_rate": 1.9527296297051765e-05, + "loss": 0.8026, + "mean_token_accuracy": 0.758307683467865, + "step": 3450 + }, + { + "epoch": 0.6354607320213354, + "grad_norm": 0.9465005665572019, + "learning_rate": 1.9463345426880448e-05, + "loss": 0.8036, + "mean_token_accuracy": 0.7617629647254944, + "step": 3455 + }, + { + "epoch": 0.6363803568144197, + "grad_norm": 0.9618417431183126, + "learning_rate": 1.939946888840986e-05, + "loss": 0.8819, + "mean_token_accuracy": 0.7395693898200989, + "step": 3460 + }, + { + "epoch": 0.6372999816075041, + "grad_norm": 0.9326022903907812, + "learning_rate": 1.933566727243956e-05, + "loss": 0.8384, + "mean_token_accuracy": 0.7497618556022644, + "step": 3465 + }, + { + "epoch": 0.6382196064005886, + "grad_norm": 0.942168299955769, + "learning_rate": 1.927194116907608e-05, + "loss": 0.8821, + "mean_token_accuracy": 0.7422310829162597, + "step": 3470 + }, + { + "epoch": 0.639139231193673, + "grad_norm": 0.930256851029374, + "learning_rate": 1.9208291167727576e-05, + "loss": 0.8293, + "mean_token_accuracy": 0.7561385631561279, + "step": 3475 + }, + { + "epoch": 0.6400588559867574, + "grad_norm": 0.8857746537604931, + "learning_rate": 1.9144717857098328e-05, + "loss": 0.8166, + "mean_token_accuracy": 0.7583439826965332, + "step": 3480 + }, + { + "epoch": 0.6409784807798419, + "grad_norm": 0.9519372824273006, + "learning_rate": 1.908122182518326e-05, + "loss": 0.8674, + "mean_token_accuracy": 0.741856062412262, + "step": 3485 + }, + { + "epoch": 0.6418981055729263, + "grad_norm": 0.9483959540274922, + "learning_rate": 1.9017803659262583e-05, + "loss": 0.8496, + "mean_token_accuracy": 0.7491413950920105, + "step": 3490 + }, + { + "epoch": 0.6428177303660106, + "grad_norm": 0.9729346329964175, + "learning_rate": 1.8954463945896293e-05, + "loss": 0.8554, + "mean_token_accuracy": 0.7483752846717835, + "step": 3495 + }, + { + "epoch": 0.6437373551590951, + "grad_norm": 0.910719020599245, + "learning_rate": 1.889120327091879e-05, + "loss": 0.8332, + "mean_token_accuracy": 0.753311276435852, + "step": 3500 + }, + { + "epoch": 0.6446569799521795, + "grad_norm": 0.8997078755147822, + "learning_rate": 1.8828022219433413e-05, + "loss": 0.8311, + "mean_token_accuracy": 0.7538302779197693, + "step": 3505 + }, + { + "epoch": 0.6455766047452639, + "grad_norm": 0.9097287217365273, + "learning_rate": 1.8764921375807083e-05, + "loss": 0.8573, + "mean_token_accuracy": 0.74767564535141, + "step": 3510 + }, + { + "epoch": 0.6464962295383484, + "grad_norm": 0.9420262116863728, + "learning_rate": 1.8701901323664863e-05, + "loss": 0.8551, + "mean_token_accuracy": 0.7479906916618347, + "step": 3515 + }, + { + "epoch": 0.6474158543314328, + "grad_norm": 0.9297816459092663, + "learning_rate": 1.8638962645884565e-05, + "loss": 0.8066, + "mean_token_accuracy": 0.7580268263816834, + "step": 3520 + }, + { + "epoch": 0.6483354791245172, + "grad_norm": 0.946031226164797, + "learning_rate": 1.8576105924591357e-05, + "loss": 0.8179, + "mean_token_accuracy": 0.7542472004890441, + "step": 3525 + }, + { + "epoch": 0.6492551039176017, + "grad_norm": 0.9036904422802344, + "learning_rate": 1.8513331741152412e-05, + "loss": 0.8261, + "mean_token_accuracy": 0.7552783608436584, + "step": 3530 + }, + { + "epoch": 0.650174728710686, + "grad_norm": 0.921905554132334, + "learning_rate": 1.8450640676171472e-05, + "loss": 0.8351, + "mean_token_accuracy": 0.752598226070404, + "step": 3535 + }, + { + "epoch": 0.6510943535037704, + "grad_norm": 1.0035005670649164, + "learning_rate": 1.8388033309483522e-05, + "loss": 0.8981, + "mean_token_accuracy": 0.7371325850486755, + "step": 3540 + }, + { + "epoch": 0.6520139782968549, + "grad_norm": 0.9724909600231612, + "learning_rate": 1.8325510220149413e-05, + "loss": 0.8327, + "mean_token_accuracy": 0.751532518863678, + "step": 3545 + }, + { + "epoch": 0.6529336030899393, + "grad_norm": 0.9664687506252672, + "learning_rate": 1.8263071986450524e-05, + "loss": 0.8336, + "mean_token_accuracy": 0.7516280770301819, + "step": 3550 + }, + { + "epoch": 0.6538532278830237, + "grad_norm": 0.9164445815967506, + "learning_rate": 1.8200719185883358e-05, + "loss": 0.8316, + "mean_token_accuracy": 0.7544404864311218, + "step": 3555 + }, + { + "epoch": 0.6547728526761082, + "grad_norm": 0.9293565126179983, + "learning_rate": 1.813845239515427e-05, + "loss": 0.8257, + "mean_token_accuracy": 0.7552899837493896, + "step": 3560 + }, + { + "epoch": 0.6556924774691926, + "grad_norm": 0.9010810987925738, + "learning_rate": 1.8076272190174115e-05, + "loss": 0.8201, + "mean_token_accuracy": 0.7565722703933716, + "step": 3565 + }, + { + "epoch": 0.656612102262277, + "grad_norm": 1.0075745989661558, + "learning_rate": 1.801417914605286e-05, + "loss": 0.869, + "mean_token_accuracy": 0.7453143835067749, + "step": 3570 + }, + { + "epoch": 0.6575317270553614, + "grad_norm": 0.935586367301874, + "learning_rate": 1.795217383709437e-05, + "loss": 0.8845, + "mean_token_accuracy": 0.7403179168701172, + "step": 3575 + }, + { + "epoch": 0.6584513518484458, + "grad_norm": 0.9872971011864189, + "learning_rate": 1.7890256836791008e-05, + "loss": 0.8052, + "mean_token_accuracy": 0.7629344463348389, + "step": 3580 + }, + { + "epoch": 0.6593709766415302, + "grad_norm": 0.9876503263464145, + "learning_rate": 1.7828428717818353e-05, + "loss": 0.8135, + "mean_token_accuracy": 0.7590724229812622, + "step": 3585 + }, + { + "epoch": 0.6602906014346147, + "grad_norm": 0.8811578706911977, + "learning_rate": 1.7766690052029944e-05, + "loss": 0.8221, + "mean_token_accuracy": 0.7560603976249695, + "step": 3590 + }, + { + "epoch": 0.6612102262276991, + "grad_norm": 0.9719326557742581, + "learning_rate": 1.770504141045194e-05, + "loss": 0.8342, + "mean_token_accuracy": 0.7510559558868408, + "step": 3595 + }, + { + "epoch": 0.6621298510207835, + "grad_norm": 1.0132470520749903, + "learning_rate": 1.7643483363277874e-05, + "loss": 0.8487, + "mean_token_accuracy": 0.7500616908073425, + "step": 3600 + }, + { + "epoch": 0.663049475813868, + "grad_norm": 1.0318932699213554, + "learning_rate": 1.7582016479863327e-05, + "loss": 0.8487, + "mean_token_accuracy": 0.7490703582763671, + "step": 3605 + }, + { + "epoch": 0.6639691006069524, + "grad_norm": 0.8658023921332224, + "learning_rate": 1.7520641328720756e-05, + "loss": 0.8238, + "mean_token_accuracy": 0.7564070224761963, + "step": 3610 + }, + { + "epoch": 0.6648887254000367, + "grad_norm": 0.9750052383478849, + "learning_rate": 1.7459358477514122e-05, + "loss": 0.8249, + "mean_token_accuracy": 0.7549832344055176, + "step": 3615 + }, + { + "epoch": 0.6658083501931212, + "grad_norm": 0.957114636285714, + "learning_rate": 1.7398168493053723e-05, + "loss": 0.7881, + "mean_token_accuracy": 0.7615378856658935, + "step": 3620 + }, + { + "epoch": 0.6667279749862056, + "grad_norm": 0.9148381033348181, + "learning_rate": 1.7337071941290944e-05, + "loss": 0.8196, + "mean_token_accuracy": 0.7577734112739563, + "step": 3625 + }, + { + "epoch": 0.66764759977929, + "grad_norm": 0.9583843198631806, + "learning_rate": 1.7276069387312955e-05, + "loss": 0.9, + "mean_token_accuracy": 0.7367844343185425, + "step": 3630 + }, + { + "epoch": 0.6685672245723745, + "grad_norm": 0.9525242256598431, + "learning_rate": 1.7215161395337572e-05, + "loss": 0.8351, + "mean_token_accuracy": 0.7536734580993653, + "step": 3635 + }, + { + "epoch": 0.6694868493654589, + "grad_norm": 0.9218486580963495, + "learning_rate": 1.7154348528707992e-05, + "loss": 0.8512, + "mean_token_accuracy": 0.7513302564620972, + "step": 3640 + }, + { + "epoch": 0.6704064741585433, + "grad_norm": 0.9497350819436411, + "learning_rate": 1.709363134988757e-05, + "loss": 0.8522, + "mean_token_accuracy": 0.747953987121582, + "step": 3645 + }, + { + "epoch": 0.6713260989516278, + "grad_norm": 0.9359833703344925, + "learning_rate": 1.7033010420454655e-05, + "loss": 0.8091, + "mean_token_accuracy": 0.7576663970947266, + "step": 3650 + }, + { + "epoch": 0.6722457237447121, + "grad_norm": 0.9884296155896105, + "learning_rate": 1.6972486301097376e-05, + "loss": 0.8185, + "mean_token_accuracy": 0.7578543424606323, + "step": 3655 + }, + { + "epoch": 0.6731653485377965, + "grad_norm": 0.885165473016121, + "learning_rate": 1.691205955160845e-05, + "loss": 0.8461, + "mean_token_accuracy": 0.7491200208663941, + "step": 3660 + }, + { + "epoch": 0.674084973330881, + "grad_norm": 0.9715821597591158, + "learning_rate": 1.6851730730880012e-05, + "loss": 0.8527, + "mean_token_accuracy": 0.7483757376670838, + "step": 3665 + }, + { + "epoch": 0.6750045981239654, + "grad_norm": 0.8871437133597592, + "learning_rate": 1.679150039689846e-05, + "loss": 0.8148, + "mean_token_accuracy": 0.7578411340713501, + "step": 3670 + }, + { + "epoch": 0.6759242229170498, + "grad_norm": 0.9530586600231223, + "learning_rate": 1.673136910673926e-05, + "loss": 0.8645, + "mean_token_accuracy": 0.7451423764228821, + "step": 3675 + }, + { + "epoch": 0.6768438477101343, + "grad_norm": 0.9427729850229866, + "learning_rate": 1.6671337416561817e-05, + "loss": 0.8432, + "mean_token_accuracy": 0.7509079575538635, + "step": 3680 + }, + { + "epoch": 0.6777634725032187, + "grad_norm": 0.9325142143827265, + "learning_rate": 1.661140588160435e-05, + "loss": 0.8347, + "mean_token_accuracy": 0.7516968011856079, + "step": 3685 + }, + { + "epoch": 0.6786830972963032, + "grad_norm": 0.9601757924065347, + "learning_rate": 1.6551575056178695e-05, + "loss": 0.8166, + "mean_token_accuracy": 0.7589465737342834, + "step": 3690 + }, + { + "epoch": 0.6796027220893875, + "grad_norm": 1.0086779966517565, + "learning_rate": 1.649184549366525e-05, + "loss": 0.8395, + "mean_token_accuracy": 0.7520246505737305, + "step": 3695 + }, + { + "epoch": 0.6805223468824719, + "grad_norm": 0.9707009645804029, + "learning_rate": 1.6432217746507814e-05, + "loss": 0.8382, + "mean_token_accuracy": 0.7533354997634888, + "step": 3700 + }, + { + "epoch": 0.6814419716755564, + "grad_norm": 0.9109669918450888, + "learning_rate": 1.6372692366208476e-05, + "loss": 0.8186, + "mean_token_accuracy": 0.7560298204421997, + "step": 3705 + }, + { + "epoch": 0.6823615964686408, + "grad_norm": 0.931556246223817, + "learning_rate": 1.6313269903322536e-05, + "loss": 0.8682, + "mean_token_accuracy": 0.7464072823524475, + "step": 3710 + }, + { + "epoch": 0.6832812212617252, + "grad_norm": 0.9316943141031991, + "learning_rate": 1.6253950907453414e-05, + "loss": 0.7891, + "mean_token_accuracy": 0.7643645644187927, + "step": 3715 + }, + { + "epoch": 0.6842008460548097, + "grad_norm": 0.9367407375514984, + "learning_rate": 1.619473592724752e-05, + "loss": 0.8489, + "mean_token_accuracy": 0.7488224864006042, + "step": 3720 + }, + { + "epoch": 0.6851204708478941, + "grad_norm": 0.96189736553831, + "learning_rate": 1.613562551038925e-05, + "loss": 0.7964, + "mean_token_accuracy": 0.7625237464904785, + "step": 3725 + }, + { + "epoch": 0.6860400956409785, + "grad_norm": 0.9170890141555628, + "learning_rate": 1.607662020359587e-05, + "loss": 0.8404, + "mean_token_accuracy": 0.7529777765274048, + "step": 3730 + }, + { + "epoch": 0.686959720434063, + "grad_norm": 0.9456438498787428, + "learning_rate": 1.6017720552612462e-05, + "loss": 0.8036, + "mean_token_accuracy": 0.7614395618438721, + "step": 3735 + }, + { + "epoch": 0.6878793452271473, + "grad_norm": 0.9544770877536788, + "learning_rate": 1.595892710220691e-05, + "loss": 0.8413, + "mean_token_accuracy": 0.7519929647445679, + "step": 3740 + }, + { + "epoch": 0.6887989700202317, + "grad_norm": 1.022115954707187, + "learning_rate": 1.5900240396164835e-05, + "loss": 0.8612, + "mean_token_accuracy": 0.747264850139618, + "step": 3745 + }, + { + "epoch": 0.6897185948133162, + "grad_norm": 0.9476824745559427, + "learning_rate": 1.584166097728455e-05, + "loss": 0.847, + "mean_token_accuracy": 0.7491350531578064, + "step": 3750 + }, + { + "epoch": 0.6906382196064006, + "grad_norm": 0.8827290010499629, + "learning_rate": 1.578318938737209e-05, + "loss": 0.8284, + "mean_token_accuracy": 0.7547004818916321, + "step": 3755 + }, + { + "epoch": 0.691557844399485, + "grad_norm": 0.9009975487421323, + "learning_rate": 1.5724826167236146e-05, + "loss": 0.8214, + "mean_token_accuracy": 0.7568115711212158, + "step": 3760 + }, + { + "epoch": 0.6924774691925695, + "grad_norm": 0.9187149873785133, + "learning_rate": 1.5666571856683116e-05, + "loss": 0.827, + "mean_token_accuracy": 0.7550323009490967, + "step": 3765 + }, + { + "epoch": 0.6933970939856539, + "grad_norm": 0.9280641474823987, + "learning_rate": 1.560842699451204e-05, + "loss": 0.7616, + "mean_token_accuracy": 0.7714649677276612, + "step": 3770 + }, + { + "epoch": 0.6943167187787382, + "grad_norm": 0.9038372482824055, + "learning_rate": 1.5550392118509705e-05, + "loss": 0.8028, + "mean_token_accuracy": 0.760212504863739, + "step": 3775 + }, + { + "epoch": 0.6952363435718227, + "grad_norm": 0.9201432901179558, + "learning_rate": 1.5492467765445613e-05, + "loss": 0.8241, + "mean_token_accuracy": 0.754262363910675, + "step": 3780 + }, + { + "epoch": 0.6961559683649071, + "grad_norm": 0.9031896471527984, + "learning_rate": 1.5434654471067007e-05, + "loss": 0.8078, + "mean_token_accuracy": 0.7623116612434387, + "step": 3785 + }, + { + "epoch": 0.6970755931579915, + "grad_norm": 0.928442088214151, + "learning_rate": 1.537695277009396e-05, + "loss": 0.8667, + "mean_token_accuracy": 0.7442408680915833, + "step": 3790 + }, + { + "epoch": 0.697995217951076, + "grad_norm": 0.9545685310758198, + "learning_rate": 1.5319363196214427e-05, + "loss": 0.8147, + "mean_token_accuracy": 0.757679283618927, + "step": 3795 + }, + { + "epoch": 0.6989148427441604, + "grad_norm": 0.957997913837239, + "learning_rate": 1.526188628207924e-05, + "loss": 0.8674, + "mean_token_accuracy": 0.7406766414642334, + "step": 3800 + }, + { + "epoch": 0.6998344675372448, + "grad_norm": 0.907233770113165, + "learning_rate": 1.5204522559297275e-05, + "loss": 0.8228, + "mean_token_accuracy": 0.7550997257232666, + "step": 3805 + }, + { + "epoch": 0.7007540923303293, + "grad_norm": 0.9753264400407652, + "learning_rate": 1.5147272558430472e-05, + "loss": 0.812, + "mean_token_accuracy": 0.7584111213684082, + "step": 3810 + }, + { + "epoch": 0.7016737171234136, + "grad_norm": 0.898583550613599, + "learning_rate": 1.509013680898896e-05, + "loss": 0.814, + "mean_token_accuracy": 0.7574291110038758, + "step": 3815 + }, + { + "epoch": 0.702593341916498, + "grad_norm": 0.9245046858803572, + "learning_rate": 1.5033115839426127e-05, + "loss": 0.8002, + "mean_token_accuracy": 0.7631544828414917, + "step": 3820 + }, + { + "epoch": 0.7035129667095825, + "grad_norm": 0.9501909113953771, + "learning_rate": 1.4976210177133764e-05, + "loss": 0.8284, + "mean_token_accuracy": 0.7537835121154786, + "step": 3825 + }, + { + "epoch": 0.7044325915026669, + "grad_norm": 0.9118736011138947, + "learning_rate": 1.4919420348437189e-05, + "loss": 0.8637, + "mean_token_accuracy": 0.746515440940857, + "step": 3830 + }, + { + "epoch": 0.7053522162957513, + "grad_norm": 0.9346208775326443, + "learning_rate": 1.4862746878590329e-05, + "loss": 0.8325, + "mean_token_accuracy": 0.7536684751510621, + "step": 3835 + }, + { + "epoch": 0.7062718410888358, + "grad_norm": 0.9644025251262837, + "learning_rate": 1.4806190291770932e-05, + "loss": 0.9199, + "mean_token_accuracy": 0.728544807434082, + "step": 3840 + }, + { + "epoch": 0.7071914658819202, + "grad_norm": 0.9316658230434494, + "learning_rate": 1.4749751111075682e-05, + "loss": 0.8478, + "mean_token_accuracy": 0.7476451396942139, + "step": 3845 + }, + { + "epoch": 0.7081110906750046, + "grad_norm": 0.8593875878005443, + "learning_rate": 1.469342985851534e-05, + "loss": 0.7931, + "mean_token_accuracy": 0.7640434741973877, + "step": 3850 + }, + { + "epoch": 0.709030715468089, + "grad_norm": 0.9379422901278587, + "learning_rate": 1.4637227055009962e-05, + "loss": 0.8228, + "mean_token_accuracy": 0.7573190450668335, + "step": 3855 + }, + { + "epoch": 0.7099503402611734, + "grad_norm": 0.9026485371540945, + "learning_rate": 1.4581143220384047e-05, + "loss": 0.82, + "mean_token_accuracy": 0.756511640548706, + "step": 3860 + }, + { + "epoch": 0.7108699650542578, + "grad_norm": 0.9796042273923296, + "learning_rate": 1.4525178873361756e-05, + "loss": 0.8242, + "mean_token_accuracy": 0.7555618524551392, + "step": 3865 + }, + { + "epoch": 0.7117895898473423, + "grad_norm": 0.9383990549827186, + "learning_rate": 1.4469334531562067e-05, + "loss": 0.8448, + "mean_token_accuracy": 0.7482100129127502, + "step": 3870 + }, + { + "epoch": 0.7127092146404267, + "grad_norm": 0.9602931261847705, + "learning_rate": 1.4413610711494058e-05, + "loss": 0.8365, + "mean_token_accuracy": 0.7580392360687256, + "step": 3875 + }, + { + "epoch": 0.7136288394335111, + "grad_norm": 0.943240285031073, + "learning_rate": 1.4358007928552075e-05, + "loss": 0.7861, + "mean_token_accuracy": 0.7667181611061096, + "step": 3880 + }, + { + "epoch": 0.7145484642265956, + "grad_norm": 0.9447898247986761, + "learning_rate": 1.4302526697010964e-05, + "loss": 0.8078, + "mean_token_accuracy": 0.7595344543457031, + "step": 3885 + }, + { + "epoch": 0.71546808901968, + "grad_norm": 0.9841983235190546, + "learning_rate": 1.424716753002136e-05, + "loss": 0.8597, + "mean_token_accuracy": 0.7481236219406128, + "step": 3890 + }, + { + "epoch": 0.7163877138127643, + "grad_norm": 0.9684153403690037, + "learning_rate": 1.4191930939604908e-05, + "loss": 0.8117, + "mean_token_accuracy": 0.7613986849784851, + "step": 3895 + }, + { + "epoch": 0.7173073386058488, + "grad_norm": 0.996877698893722, + "learning_rate": 1.4136817436649502e-05, + "loss": 0.8766, + "mean_token_accuracy": 0.738961935043335, + "step": 3900 + }, + { + "epoch": 0.7182269633989332, + "grad_norm": 0.9051545491177592, + "learning_rate": 1.4081827530904624e-05, + "loss": 0.8445, + "mean_token_accuracy": 0.749999487400055, + "step": 3905 + }, + { + "epoch": 0.7191465881920177, + "grad_norm": 0.9684927881965169, + "learning_rate": 1.4026961730976584e-05, + "loss": 0.8209, + "mean_token_accuracy": 0.7576812863349914, + "step": 3910 + }, + { + "epoch": 0.7200662129851021, + "grad_norm": 0.9610042841526357, + "learning_rate": 1.3972220544323832e-05, + "loss": 0.8131, + "mean_token_accuracy": 0.7582221627235413, + "step": 3915 + }, + { + "epoch": 0.7209858377781865, + "grad_norm": 0.9412320092723402, + "learning_rate": 1.3917604477252238e-05, + "loss": 0.7937, + "mean_token_accuracy": 0.7617234110832214, + "step": 3920 + }, + { + "epoch": 0.721905462571271, + "grad_norm": 0.9321659094215312, + "learning_rate": 1.3863114034910452e-05, + "loss": 0.8156, + "mean_token_accuracy": 0.7598451256752015, + "step": 3925 + }, + { + "epoch": 0.7228250873643554, + "grad_norm": 0.956577146254236, + "learning_rate": 1.3808749721285214e-05, + "loss": 0.8107, + "mean_token_accuracy": 0.757847785949707, + "step": 3930 + }, + { + "epoch": 0.7237447121574397, + "grad_norm": 0.9139917904820034, + "learning_rate": 1.3754512039196658e-05, + "loss": 0.8754, + "mean_token_accuracy": 0.7391230940818787, + "step": 3935 + }, + { + "epoch": 0.7246643369505242, + "grad_norm": 0.92757564731535, + "learning_rate": 1.3700401490293718e-05, + "loss": 0.8193, + "mean_token_accuracy": 0.7570781588554383, + "step": 3940 + }, + { + "epoch": 0.7255839617436086, + "grad_norm": 0.9533935473757719, + "learning_rate": 1.3646418575049475e-05, + "loss": 0.8244, + "mean_token_accuracy": 0.756612241268158, + "step": 3945 + }, + { + "epoch": 0.726503586536693, + "grad_norm": 0.9319033478082173, + "learning_rate": 1.3592563792756468e-05, + "loss": 0.7994, + "mean_token_accuracy": 0.7616767644882202, + "step": 3950 + }, + { + "epoch": 0.7274232113297775, + "grad_norm": 0.9659322616790049, + "learning_rate": 1.3538837641522172e-05, + "loss": 0.776, + "mean_token_accuracy": 0.7666900753974915, + "step": 3955 + }, + { + "epoch": 0.7283428361228619, + "grad_norm": 0.9715937702004781, + "learning_rate": 1.3485240618264322e-05, + "loss": 0.8707, + "mean_token_accuracy": 0.742601501941681, + "step": 3960 + }, + { + "epoch": 0.7292624609159463, + "grad_norm": 0.9279423695840053, + "learning_rate": 1.3431773218706336e-05, + "loss": 0.8435, + "mean_token_accuracy": 0.7503429889678955, + "step": 3965 + }, + { + "epoch": 0.7301820857090308, + "grad_norm": 0.9826978876425828, + "learning_rate": 1.3378435937372729e-05, + "loss": 0.8609, + "mean_token_accuracy": 0.7491580963134765, + "step": 3970 + }, + { + "epoch": 0.7311017105021151, + "grad_norm": 0.9333913123309906, + "learning_rate": 1.3325229267584549e-05, + "loss": 0.8771, + "mean_token_accuracy": 0.7425579071044922, + "step": 3975 + }, + { + "epoch": 0.7320213352951995, + "grad_norm": 0.9125063830711305, + "learning_rate": 1.3272153701454809e-05, + "loss": 0.8086, + "mean_token_accuracy": 0.7603332042694092, + "step": 3980 + }, + { + "epoch": 0.732940960088284, + "grad_norm": 0.9868481200984651, + "learning_rate": 1.3219209729883918e-05, + "loss": 0.7879, + "mean_token_accuracy": 0.7675115823745727, + "step": 3985 + }, + { + "epoch": 0.7338605848813684, + "grad_norm": 0.9006549103315062, + "learning_rate": 1.3166397842555175e-05, + "loss": 0.7923, + "mean_token_accuracy": 0.7659124851226806, + "step": 3990 + }, + { + "epoch": 0.7347802096744528, + "grad_norm": 0.9128416767290051, + "learning_rate": 1.3113718527930214e-05, + "loss": 0.8363, + "mean_token_accuracy": 0.751650869846344, + "step": 3995 + }, + { + "epoch": 0.7356998344675373, + "grad_norm": 0.93586974280188, + "learning_rate": 1.3061172273244477e-05, + "loss": 0.8634, + "mean_token_accuracy": 0.7428792953491211, + "step": 4000 + }, + { + "epoch": 0.7366194592606217, + "grad_norm": 0.9865948469992011, + "learning_rate": 1.3008759564502742e-05, + "loss": 0.8627, + "mean_token_accuracy": 0.7454355955123901, + "step": 4005 + }, + { + "epoch": 0.737539084053706, + "grad_norm": 0.9395366278250679, + "learning_rate": 1.2956480886474609e-05, + "loss": 0.8408, + "mean_token_accuracy": 0.7488868713378907, + "step": 4010 + }, + { + "epoch": 0.7384587088467905, + "grad_norm": 0.9259161411169768, + "learning_rate": 1.2904336722690013e-05, + "loss": 0.8474, + "mean_token_accuracy": 0.7509873270988464, + "step": 4015 + }, + { + "epoch": 0.7393783336398749, + "grad_norm": 0.8982963261004637, + "learning_rate": 1.2852327555434743e-05, + "loss": 0.8272, + "mean_token_accuracy": 0.7562850832939148, + "step": 4020 + }, + { + "epoch": 0.7402979584329593, + "grad_norm": 0.9145268063018638, + "learning_rate": 1.280045386574601e-05, + "loss": 0.7964, + "mean_token_accuracy": 0.7601189255714417, + "step": 4025 + }, + { + "epoch": 0.7412175832260438, + "grad_norm": 0.9417030319528836, + "learning_rate": 1.2748716133407985e-05, + "loss": 0.8243, + "mean_token_accuracy": 0.7563821077346802, + "step": 4030 + }, + { + "epoch": 0.7421372080191282, + "grad_norm": 0.9170391844634309, + "learning_rate": 1.269711483694733e-05, + "loss": 0.8071, + "mean_token_accuracy": 0.7610970735549927, + "step": 4035 + }, + { + "epoch": 0.7430568328122126, + "grad_norm": 0.927700931925603, + "learning_rate": 1.264565045362883e-05, + "loss": 0.83, + "mean_token_accuracy": 0.7542360424995422, + "step": 4040 + }, + { + "epoch": 0.7439764576052971, + "grad_norm": 0.902718257172033, + "learning_rate": 1.259432345945094e-05, + "loss": 0.8026, + "mean_token_accuracy": 0.7602586507797241, + "step": 4045 + }, + { + "epoch": 0.7448960823983815, + "grad_norm": 0.9732168765607019, + "learning_rate": 1.2543134329141382e-05, + "loss": 0.8166, + "mean_token_accuracy": 0.7585108041763305, + "step": 4050 + }, + { + "epoch": 0.7458157071914658, + "grad_norm": 0.9466993086607015, + "learning_rate": 1.2492083536152772e-05, + "loss": 0.8169, + "mean_token_accuracy": 0.758376932144165, + "step": 4055 + }, + { + "epoch": 0.7467353319845503, + "grad_norm": 0.9757475911083087, + "learning_rate": 1.2441171552658228e-05, + "loss": 0.8389, + "mean_token_accuracy": 0.7498653650283813, + "step": 4060 + }, + { + "epoch": 0.7476549567776347, + "grad_norm": 0.9151481291254611, + "learning_rate": 1.2390398849547023e-05, + "loss": 0.8006, + "mean_token_accuracy": 0.7613858461380005, + "step": 4065 + }, + { + "epoch": 0.7485745815707191, + "grad_norm": 0.8890653066533022, + "learning_rate": 1.2339765896420178e-05, + "loss": 0.8404, + "mean_token_accuracy": 0.7510004043579102, + "step": 4070 + }, + { + "epoch": 0.7494942063638036, + "grad_norm": 0.9533182704017102, + "learning_rate": 1.2289273161586194e-05, + "loss": 0.8234, + "mean_token_accuracy": 0.7551814436912536, + "step": 4075 + }, + { + "epoch": 0.750413831156888, + "grad_norm": 0.9407240854533703, + "learning_rate": 1.2238921112056663e-05, + "loss": 0.8635, + "mean_token_accuracy": 0.7466271042823791, + "step": 4080 + }, + { + "epoch": 0.7513334559499724, + "grad_norm": 0.8895247933273808, + "learning_rate": 1.2188710213541957e-05, + "loss": 0.8332, + "mean_token_accuracy": 0.752234959602356, + "step": 4085 + }, + { + "epoch": 0.7522530807430569, + "grad_norm": 0.9353802672482648, + "learning_rate": 1.213864093044695e-05, + "loss": 0.8448, + "mean_token_accuracy": 0.7497453451156616, + "step": 4090 + }, + { + "epoch": 0.7531727055361412, + "grad_norm": 0.946809122144392, + "learning_rate": 1.2088713725866696e-05, + "loss": 0.8088, + "mean_token_accuracy": 0.758155906200409, + "step": 4095 + }, + { + "epoch": 0.7540923303292256, + "grad_norm": 0.9340815348568988, + "learning_rate": 1.203892906158214e-05, + "loss": 0.8525, + "mean_token_accuracy": 0.7470645427703857, + "step": 4100 + }, + { + "epoch": 0.7550119551223101, + "grad_norm": 0.9903725518055015, + "learning_rate": 1.1989287398055874e-05, + "loss": 0.8406, + "mean_token_accuracy": 0.7499817609786987, + "step": 4105 + }, + { + "epoch": 0.7559315799153945, + "grad_norm": 0.9005006268013445, + "learning_rate": 1.193978919442787e-05, + "loss": 0.833, + "mean_token_accuracy": 0.7508885979652404, + "step": 4110 + }, + { + "epoch": 0.7568512047084789, + "grad_norm": 0.922000222155766, + "learning_rate": 1.1890434908511212e-05, + "loss": 0.8256, + "mean_token_accuracy": 0.7544254660606384, + "step": 4115 + }, + { + "epoch": 0.7577708295015634, + "grad_norm": 0.9147121717124462, + "learning_rate": 1.1841224996787876e-05, + "loss": 0.8119, + "mean_token_accuracy": 0.7572540044784546, + "step": 4120 + }, + { + "epoch": 0.7586904542946478, + "grad_norm": 0.9401032528457242, + "learning_rate": 1.1792159914404518e-05, + "loss": 0.8389, + "mean_token_accuracy": 0.7547949194908142, + "step": 4125 + }, + { + "epoch": 0.7596100790877323, + "grad_norm": 0.899746427074481, + "learning_rate": 1.1743240115168262e-05, + "loss": 0.8104, + "mean_token_accuracy": 0.7588290691375732, + "step": 4130 + }, + { + "epoch": 0.7605297038808166, + "grad_norm": 0.9377432106115406, + "learning_rate": 1.1694466051542473e-05, + "loss": 0.8155, + "mean_token_accuracy": 0.7565756559371948, + "step": 4135 + }, + { + "epoch": 0.761449328673901, + "grad_norm": 0.9436429623996605, + "learning_rate": 1.1645838174642614e-05, + "loss": 0.8167, + "mean_token_accuracy": 0.7574901819229126, + "step": 4140 + }, + { + "epoch": 0.7623689534669855, + "grad_norm": 0.9163014099905564, + "learning_rate": 1.1597356934232053e-05, + "loss": 0.8518, + "mean_token_accuracy": 0.7465153455734252, + "step": 4145 + }, + { + "epoch": 0.7632885782600699, + "grad_norm": 0.8716564591657281, + "learning_rate": 1.1549022778717888e-05, + "loss": 0.8572, + "mean_token_accuracy": 0.7444779276847839, + "step": 4150 + }, + { + "epoch": 0.7642082030531543, + "grad_norm": 0.9408396749893937, + "learning_rate": 1.1500836155146839e-05, + "loss": 0.83, + "mean_token_accuracy": 0.7533326983451843, + "step": 4155 + }, + { + "epoch": 0.7651278278462388, + "grad_norm": 0.9335839862612282, + "learning_rate": 1.1452797509201083e-05, + "loss": 0.8751, + "mean_token_accuracy": 0.7398134231567383, + "step": 4160 + }, + { + "epoch": 0.7660474526393232, + "grad_norm": 0.9850624435923674, + "learning_rate": 1.1404907285194125e-05, + "loss": 0.8523, + "mean_token_accuracy": 0.7461954593658447, + "step": 4165 + }, + { + "epoch": 0.7669670774324076, + "grad_norm": 0.9679449146346353, + "learning_rate": 1.1357165926066716e-05, + "loss": 0.7892, + "mean_token_accuracy": 0.7605505466461182, + "step": 4170 + }, + { + "epoch": 0.767886702225492, + "grad_norm": 0.9416265509404674, + "learning_rate": 1.130957387338275e-05, + "loss": 0.8221, + "mean_token_accuracy": 0.7559242844581604, + "step": 4175 + }, + { + "epoch": 0.7688063270185764, + "grad_norm": 0.909615601406411, + "learning_rate": 1.1262131567325163e-05, + "loss": 0.8357, + "mean_token_accuracy": 0.7517993927001954, + "step": 4180 + }, + { + "epoch": 0.7697259518116608, + "grad_norm": 0.9047722281799156, + "learning_rate": 1.1214839446691869e-05, + "loss": 0.8032, + "mean_token_accuracy": 0.7601001501083374, + "step": 4185 + }, + { + "epoch": 0.7706455766047453, + "grad_norm": 0.9246634008625312, + "learning_rate": 1.1167697948891707e-05, + "loss": 0.8249, + "mean_token_accuracy": 0.7536085605621338, + "step": 4190 + }, + { + "epoch": 0.7715652013978297, + "grad_norm": 0.9460638804791452, + "learning_rate": 1.1120707509940403e-05, + "loss": 0.8167, + "mean_token_accuracy": 0.7593476176261902, + "step": 4195 + }, + { + "epoch": 0.7724848261909141, + "grad_norm": 0.9221593736048895, + "learning_rate": 1.1073868564456503e-05, + "loss": 0.845, + "mean_token_accuracy": 0.7480282187461853, + "step": 4200 + }, + { + "epoch": 0.7734044509839986, + "grad_norm": 0.8888076192030434, + "learning_rate": 1.1027181545657403e-05, + "loss": 0.7794, + "mean_token_accuracy": 0.76693354845047, + "step": 4205 + }, + { + "epoch": 0.774324075777083, + "grad_norm": 0.8891810327123515, + "learning_rate": 1.0980646885355313e-05, + "loss": 0.7885, + "mean_token_accuracy": 0.7628621697425843, + "step": 4210 + }, + { + "epoch": 0.7752437005701673, + "grad_norm": 0.9743526817712896, + "learning_rate": 1.0934265013953239e-05, + "loss": 0.8478, + "mean_token_accuracy": 0.7504450678825378, + "step": 4215 + }, + { + "epoch": 0.7761633253632518, + "grad_norm": 0.9143999464853897, + "learning_rate": 1.0888036360441066e-05, + "loss": 0.8059, + "mean_token_accuracy": 0.7603421926498413, + "step": 4220 + }, + { + "epoch": 0.7770829501563362, + "grad_norm": 0.9734913517153475, + "learning_rate": 1.0841961352391522e-05, + "loss": 0.8159, + "mean_token_accuracy": 0.7574024796485901, + "step": 4225 + }, + { + "epoch": 0.7780025749494206, + "grad_norm": 0.935773373300799, + "learning_rate": 1.079604041595628e-05, + "loss": 0.8562, + "mean_token_accuracy": 0.7468973875045777, + "step": 4230 + }, + { + "epoch": 0.7789221997425051, + "grad_norm": 0.9031689337704597, + "learning_rate": 1.075027397586198e-05, + "loss": 0.8165, + "mean_token_accuracy": 0.7566033601760864, + "step": 4235 + }, + { + "epoch": 0.7798418245355895, + "grad_norm": 0.9138920947374664, + "learning_rate": 1.0704662455406309e-05, + "loss": 0.8137, + "mean_token_accuracy": 0.7558243870735168, + "step": 4240 + }, + { + "epoch": 0.7807614493286739, + "grad_norm": 0.942480721965923, + "learning_rate": 1.06592062764541e-05, + "loss": 0.8103, + "mean_token_accuracy": 0.7595886349678039, + "step": 4245 + }, + { + "epoch": 0.7816810741217584, + "grad_norm": 0.8995689595482391, + "learning_rate": 1.0613905859433412e-05, + "loss": 0.8158, + "mean_token_accuracy": 0.7546827673912049, + "step": 4250 + }, + { + "epoch": 0.7826006989148427, + "grad_norm": 0.8666864815369382, + "learning_rate": 1.0568761623331642e-05, + "loss": 0.8082, + "mean_token_accuracy": 0.7590071558952332, + "step": 4255 + }, + { + "epoch": 0.7835203237079271, + "grad_norm": 0.9696655409923509, + "learning_rate": 1.0523773985691673e-05, + "loss": 0.8556, + "mean_token_accuracy": 0.7452132105827332, + "step": 4260 + }, + { + "epoch": 0.7844399485010116, + "grad_norm": 0.9833829005536767, + "learning_rate": 1.0478943362607984e-05, + "loss": 0.8586, + "mean_token_accuracy": 0.7462344169616699, + "step": 4265 + }, + { + "epoch": 0.785359573294096, + "grad_norm": 0.9595206401213471, + "learning_rate": 1.0434270168722813e-05, + "loss": 0.8351, + "mean_token_accuracy": 0.7498462796211243, + "step": 4270 + }, + { + "epoch": 0.7862791980871804, + "grad_norm": 0.9261440611345254, + "learning_rate": 1.0389754817222325e-05, + "loss": 0.77, + "mean_token_accuracy": 0.7716120958328248, + "step": 4275 + }, + { + "epoch": 0.7871988228802649, + "grad_norm": 0.926036803637149, + "learning_rate": 1.0345397719832791e-05, + "loss": 0.8117, + "mean_token_accuracy": 0.75774165391922, + "step": 4280 + }, + { + "epoch": 0.7881184476733493, + "grad_norm": 0.9482199838406158, + "learning_rate": 1.0301199286816768e-05, + "loss": 0.7869, + "mean_token_accuracy": 0.7647076845169067, + "step": 4285 + }, + { + "epoch": 0.7890380724664336, + "grad_norm": 0.9249156078948935, + "learning_rate": 1.0257159926969315e-05, + "loss": 0.8379, + "mean_token_accuracy": 0.7494875431060791, + "step": 4290 + }, + { + "epoch": 0.7899576972595181, + "grad_norm": 0.9426764037549299, + "learning_rate": 1.0213280047614224e-05, + "loss": 0.8399, + "mean_token_accuracy": 0.748091197013855, + "step": 4295 + }, + { + "epoch": 0.7908773220526025, + "grad_norm": 0.9001227058548062, + "learning_rate": 1.016956005460021e-05, + "loss": 0.8151, + "mean_token_accuracy": 0.7553766012191773, + "step": 4300 + }, + { + "epoch": 0.7917969468456869, + "grad_norm": 0.9494070318147612, + "learning_rate": 1.0126000352297207e-05, + "loss": 0.8161, + "mean_token_accuracy": 0.7553802728652954, + "step": 4305 + }, + { + "epoch": 0.7927165716387714, + "grad_norm": 0.9634025237949015, + "learning_rate": 1.0082601343592613e-05, + "loss": 0.8375, + "mean_token_accuracy": 0.7490672588348388, + "step": 4310 + }, + { + "epoch": 0.7936361964318558, + "grad_norm": 0.918509774691625, + "learning_rate": 1.0039363429887526e-05, + "loss": 0.8027, + "mean_token_accuracy": 0.7611651062965393, + "step": 4315 + }, + { + "epoch": 0.7945558212249402, + "grad_norm": 0.9045021299622812, + "learning_rate": 9.996287011093095e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7530111193656921, + "step": 4320 + }, + { + "epoch": 0.7954754460180247, + "grad_norm": 0.9575102184844824, + "learning_rate": 9.95337248562677e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7606404304504395, + "step": 4325 + }, + { + "epoch": 0.796395070811109, + "grad_norm": 0.9520723107616024, + "learning_rate": 9.910620250408654e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7527819633483886, + "step": 4330 + }, + { + "epoch": 0.7973146956041934, + "grad_norm": 0.9957772801943348, + "learning_rate": 9.868030700857786e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7474417209625244, + "step": 4335 + }, + { + "epoch": 0.7982343203972779, + "grad_norm": 0.9206334782903142, + "learning_rate": 9.825604230888534e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7611706376075744, + "step": 4340 + }, + { + "epoch": 0.7991539451903623, + "grad_norm": 0.9528692345244755, + "learning_rate": 9.783341232906929e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7476886630058288, + "step": 4345 + }, + { + "epoch": 0.8000735699834468, + "grad_norm": 0.9501814513029114, + "learning_rate": 9.741242097807015e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7616806149482727, + "step": 4350 + }, + { + "epoch": 0.8009931947765312, + "grad_norm": 0.9162860642484046, + "learning_rate": 9.699307214967278e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7584839701652527, + "step": 4355 + }, + { + "epoch": 0.8019128195696156, + "grad_norm": 1.0326738672670173, + "learning_rate": 9.657536972247011e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7505152702331543, + "step": 4360 + }, + { + "epoch": 0.8028324443627001, + "grad_norm": 0.9226495279325524, + "learning_rate": 9.615931755982732e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7548305869102478, + "step": 4365 + }, + { + "epoch": 0.8037520691557845, + "grad_norm": 0.9998522862414826, + "learning_rate": 9.574491950984617e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7403565168380737, + "step": 4370 + }, + { + "epoch": 0.8046716939488688, + "grad_norm": 0.9493513097435586, + "learning_rate": 9.533217940532952e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7500657081604004, + "step": 4375 + }, + { + "epoch": 0.8055913187419533, + "grad_norm": 0.9906056177459279, + "learning_rate": 9.492110106374562e-06, + "loss": 0.7962, + "mean_token_accuracy": 0.7624237060546875, + "step": 4380 + }, + { + "epoch": 0.8065109435350377, + "grad_norm": 0.9844968670498593, + "learning_rate": 9.451168828719293e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7625670194625854, + "step": 4385 + }, + { + "epoch": 0.8074305683281221, + "grad_norm": 0.9677134975970255, + "learning_rate": 9.410394486236498e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7404338598251343, + "step": 4390 + }, + { + "epoch": 0.8083501931212066, + "grad_norm": 0.9239280726012725, + "learning_rate": 9.369787456051545e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.75517338514328, + "step": 4395 + }, + { + "epoch": 0.809269817914291, + "grad_norm": 0.9448230478695528, + "learning_rate": 9.329348113742293e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7514260888099671, + "step": 4400 + }, + { + "epoch": 0.8101894427073754, + "grad_norm": 0.9454127260499946, + "learning_rate": 9.289076833335659e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7581054925918579, + "step": 4405 + }, + { + "epoch": 0.8111090675004599, + "grad_norm": 0.9492270487120692, + "learning_rate": 9.24897398730414e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7465508818626404, + "step": 4410 + }, + { + "epoch": 0.8120286922935442, + "grad_norm": 0.9570757946856893, + "learning_rate": 9.209039946562354e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.755340301990509, + "step": 4415 + }, + { + "epoch": 0.8129483170866286, + "grad_norm": 0.9284190475550864, + "learning_rate": 9.169275080463641e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.7686259269714355, + "step": 4420 + }, + { + "epoch": 0.8138679418797131, + "grad_norm": 0.9501950391649288, + "learning_rate": 9.129679756796622e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7585479974746704, + "step": 4425 + }, + { + "epoch": 0.8147875666727975, + "grad_norm": 0.9046262111625721, + "learning_rate": 9.090254341781824e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7600291728973388, + "step": 4430 + }, + { + "epoch": 0.8157071914658819, + "grad_norm": 0.9379329497256937, + "learning_rate": 9.05099920006824e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.754150140285492, + "step": 4435 + }, + { + "epoch": 0.8166268162589664, + "grad_norm": 0.9034131325499937, + "learning_rate": 9.011914694730014e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7597368478775024, + "step": 4440 + }, + { + "epoch": 0.8175464410520508, + "grad_norm": 0.9338149471790205, + "learning_rate": 8.973001187263069e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7545792698860169, + "step": 4445 + }, + { + "epoch": 0.8184660658451351, + "grad_norm": 0.9541079918085381, + "learning_rate": 8.934259037581725e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7586872816085816, + "step": 4450 + }, + { + "epoch": 0.8193856906382196, + "grad_norm": 0.9233023020738409, + "learning_rate": 8.895688604015418e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7541133642196656, + "step": 4455 + }, + { + "epoch": 0.820305315431304, + "grad_norm": 0.9312024884427347, + "learning_rate": 8.857290243305372e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7540480494499207, + "step": 4460 + }, + { + "epoch": 0.8212249402243884, + "grad_norm": 0.9636521068626411, + "learning_rate": 8.819064310601274e-06, + "loss": 0.827, + "mean_token_accuracy": 0.754251503944397, + "step": 4465 + }, + { + "epoch": 0.8221445650174729, + "grad_norm": 0.9594804588793242, + "learning_rate": 8.78101115945803e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7567231893539429, + "step": 4470 + }, + { + "epoch": 0.8230641898105573, + "grad_norm": 0.946382911890805, + "learning_rate": 8.743131141832466e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7608936429023743, + "step": 4475 + }, + { + "epoch": 0.8239838146036417, + "grad_norm": 0.9662210178630657, + "learning_rate": 8.705424608080091e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7482501983642578, + "step": 4480 + }, + { + "epoch": 0.8249034393967262, + "grad_norm": 1.0134277900865423, + "learning_rate": 8.667891906951822e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7607534885406494, + "step": 4485 + }, + { + "epoch": 0.8258230641898106, + "grad_norm": 0.969259829449015, + "learning_rate": 8.63053338559081e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7495483517646789, + "step": 4490 + }, + { + "epoch": 0.8267426889828949, + "grad_norm": 0.973132836806053, + "learning_rate": 8.593349389529194e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7499716639518738, + "step": 4495 + }, + { + "epoch": 0.8276623137759794, + "grad_norm": 0.9074516956073079, + "learning_rate": 8.556340262684901e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7554465770721436, + "step": 4500 + }, + { + "epoch": 0.8285819385690638, + "grad_norm": 0.930234934487542, + "learning_rate": 8.519506347358495e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7629730701446533, + "step": 4505 + }, + { + "epoch": 0.8295015633621482, + "grad_norm": 0.8753133502304897, + "learning_rate": 8.482847984229992e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.747829282283783, + "step": 4510 + }, + { + "epoch": 0.8304211881552327, + "grad_norm": 0.9490806269639048, + "learning_rate": 8.446365512355697e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7590258955955506, + "step": 4515 + }, + { + "epoch": 0.8313408129483171, + "grad_norm": 0.945014272705201, + "learning_rate": 8.410059269165094e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7476967573165894, + "step": 4520 + }, + { + "epoch": 0.8322604377414015, + "grad_norm": 0.9585805628825262, + "learning_rate": 8.37392959045771e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7536361336708068, + "step": 4525 + }, + { + "epoch": 0.833180062534486, + "grad_norm": 0.9798760065535969, + "learning_rate": 8.337976810400024e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7538176774978638, + "step": 4530 + }, + { + "epoch": 0.8340996873275703, + "grad_norm": 0.9885247811188054, + "learning_rate": 8.30220126152233e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7511208415031433, + "step": 4535 + }, + { + "epoch": 0.8350193121206547, + "grad_norm": 0.926636431875522, + "learning_rate": 8.266603274715734e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7437230348587036, + "step": 4540 + }, + { + "epoch": 0.8359389369137392, + "grad_norm": 0.9639989728106565, + "learning_rate": 8.231183179229041e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.749656867980957, + "step": 4545 + }, + { + "epoch": 0.8368585617068236, + "grad_norm": 0.9810922714927505, + "learning_rate": 8.19594130266571e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7471103310585022, + "step": 4550 + }, + { + "epoch": 0.837778186499908, + "grad_norm": 0.940673214702186, + "learning_rate": 8.16087797098086e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.757796049118042, + "step": 4555 + }, + { + "epoch": 0.8386978112929925, + "grad_norm": 0.9808241732647448, + "learning_rate": 8.125993508478222e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7570709705352783, + "step": 4560 + }, + { + "epoch": 0.8396174360860769, + "grad_norm": 0.9417309972023068, + "learning_rate": 8.091288237807148e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7627918124198914, + "step": 4565 + }, + { + "epoch": 0.8405370608791614, + "grad_norm": 0.9994759897340699, + "learning_rate": 8.05676247995964e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7522749185562134, + "step": 4570 + }, + { + "epoch": 0.8414566856722457, + "grad_norm": 0.9575333123064316, + "learning_rate": 8.022416554267361e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7555456757545471, + "step": 4575 + }, + { + "epoch": 0.8423763104653301, + "grad_norm": 0.9428369551875321, + "learning_rate": 7.988250778398704e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7657583713531494, + "step": 4580 + }, + { + "epoch": 0.8432959352584146, + "grad_norm": 0.9491493130691244, + "learning_rate": 7.95426546835582e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7497212409973144, + "step": 4585 + }, + { + "epoch": 0.844215560051499, + "grad_norm": 0.9279119840497574, + "learning_rate": 7.92046093847173e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7641847729682922, + "step": 4590 + }, + { + "epoch": 0.8451351848445834, + "grad_norm": 0.975196157389162, + "learning_rate": 7.88683750140741e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.76539067029953, + "step": 4595 + }, + { + "epoch": 0.8460548096376679, + "grad_norm": 0.9630038826041202, + "learning_rate": 7.853395468148877e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7576993346214295, + "step": 4600 + }, + { + "epoch": 0.8469744344307523, + "grad_norm": 0.9547194790847711, + "learning_rate": 7.82013514800434e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7594569325447083, + "step": 4605 + }, + { + "epoch": 0.8478940592238366, + "grad_norm": 0.9804442806928446, + "learning_rate": 7.787056848601327e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7542958974838256, + "step": 4610 + }, + { + "epoch": 0.8488136840169211, + "grad_norm": 0.987211519153664, + "learning_rate": 7.754160875883835e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7447464466094971, + "step": 4615 + }, + { + "epoch": 0.8497333088100055, + "grad_norm": 0.9279113898182684, + "learning_rate": 7.721447534109509e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7507144689559937, + "step": 4620 + }, + { + "epoch": 0.8506529336030899, + "grad_norm": 0.9722340874170035, + "learning_rate": 7.688917125846836e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7506987690925598, + "step": 4625 + }, + { + "epoch": 0.8515725583961744, + "grad_norm": 0.9470559135859266, + "learning_rate": 7.65656995197231e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7494428992271424, + "step": 4630 + }, + { + "epoch": 0.8524921831892588, + "grad_norm": 1.0085786438496558, + "learning_rate": 7.6244063116676965e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7590271830558777, + "step": 4635 + }, + { + "epoch": 0.8534118079823432, + "grad_norm": 0.9122173396588265, + "learning_rate": 7.592426502417235e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7632818222045898, + "step": 4640 + }, + { + "epoch": 0.8543314327754277, + "grad_norm": 0.920428242471814, + "learning_rate": 7.560630820004905e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.768799901008606, + "step": 4645 + }, + { + "epoch": 0.855251057568512, + "grad_norm": 0.9650658819203722, + "learning_rate": 7.529019558511664e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7465671896934509, + "step": 4650 + }, + { + "epoch": 0.8561706823615964, + "grad_norm": 0.941100631374564, + "learning_rate": 7.4975930103127575e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7577845811843872, + "step": 4655 + }, + { + "epoch": 0.8570903071546809, + "grad_norm": 0.911355294655365, + "learning_rate": 7.466351466075003e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7704600811004638, + "step": 4660 + }, + { + "epoch": 0.8580099319477653, + "grad_norm": 0.9600196890925632, + "learning_rate": 7.43529521475409e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.752436888217926, + "step": 4665 + }, + { + "epoch": 0.8589295567408497, + "grad_norm": 0.9096404947618868, + "learning_rate": 7.404424543591926e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.749167013168335, + "step": 4670 + }, + { + "epoch": 0.8598491815339342, + "grad_norm": 0.9645413054824178, + "learning_rate": 7.37373973811398e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7523573756217956, + "step": 4675 + }, + { + "epoch": 0.8607688063270186, + "grad_norm": 0.9461536188211753, + "learning_rate": 7.343241082126609e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7644837021827697, + "step": 4680 + }, + { + "epoch": 0.861688431120103, + "grad_norm": 0.9177981778366934, + "learning_rate": 7.312928857714484e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7650796055793763, + "step": 4685 + }, + { + "epoch": 0.8626080559131875, + "grad_norm": 0.9395263274096144, + "learning_rate": 7.282803345237937e-06, + "loss": 0.779, + "mean_token_accuracy": 0.766014575958252, + "step": 4690 + }, + { + "epoch": 0.8635276807062718, + "grad_norm": 0.974228845887035, + "learning_rate": 7.252864823330397e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7609816431999207, + "step": 4695 + }, + { + "epoch": 0.8644473054993562, + "grad_norm": 0.9138771854988429, + "learning_rate": 7.223113568895791e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7533741354942322, + "step": 4700 + }, + { + "epoch": 0.8653669302924407, + "grad_norm": 0.9230858356341091, + "learning_rate": 7.193549857105998e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7645957589149475, + "step": 4705 + }, + { + "epoch": 0.8662865550855251, + "grad_norm": 0.9248959407091435, + "learning_rate": 7.164173961398307e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.758608341217041, + "step": 4710 + }, + { + "epoch": 0.8672061798786095, + "grad_norm": 0.920957739245226, + "learning_rate": 7.134986153472864e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7574970960617066, + "step": 4715 + }, + { + "epoch": 0.868125804671694, + "grad_norm": 0.9365387305302294, + "learning_rate": 7.105986703290185e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7519280552864075, + "step": 4720 + }, + { + "epoch": 0.8690454294647784, + "grad_norm": 0.9848472191309555, + "learning_rate": 7.077175879068652e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7514313578605651, + "step": 4725 + }, + { + "epoch": 0.8699650542578627, + "grad_norm": 0.9841439973977463, + "learning_rate": 7.04855394728202e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7536401510238647, + "step": 4730 + }, + { + "epoch": 0.8708846790509472, + "grad_norm": 0.9368690483918741, + "learning_rate": 7.020121172656971e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7589451789855957, + "step": 4735 + }, + { + "epoch": 0.8718043038440316, + "grad_norm": 0.9537367969880632, + "learning_rate": 6.991877818170647e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7570921540260315, + "step": 4740 + }, + { + "epoch": 0.872723928637116, + "grad_norm": 0.9771290706741976, + "learning_rate": 6.963824145048245e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7482818961143494, + "step": 4745 + }, + { + "epoch": 0.8736435534302005, + "grad_norm": 0.9167489506515816, + "learning_rate": 6.935960412760554e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7615381121635437, + "step": 4750 + }, + { + "epoch": 0.8745631782232849, + "grad_norm": 0.9509142520738616, + "learning_rate": 6.908286879021611e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7538857817649841, + "step": 4755 + }, + { + "epoch": 0.8754828030163693, + "grad_norm": 0.9492010037774332, + "learning_rate": 6.880803799786282e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7596304178237915, + "step": 4760 + }, + { + "epoch": 0.8764024278094538, + "grad_norm": 0.9879455089380224, + "learning_rate": 6.853511429247891e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7443594694137573, + "step": 4765 + }, + { + "epoch": 0.8773220526025381, + "grad_norm": 0.900884905164465, + "learning_rate": 6.826410019835897e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.75017911195755, + "step": 4770 + }, + { + "epoch": 0.8782416773956225, + "grad_norm": 0.9347399353088925, + "learning_rate": 6.7994998222135415e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7503747582435608, + "step": 4775 + }, + { + "epoch": 0.879161302188707, + "grad_norm": 0.9313447849733553, + "learning_rate": 6.77278108527552e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7531881928443909, + "step": 4780 + }, + { + "epoch": 0.8800809269817914, + "grad_norm": 0.9749122247147805, + "learning_rate": 6.7462540561457035e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7597910761833191, + "step": 4785 + }, + { + "epoch": 0.8810005517748758, + "grad_norm": 0.9459726297921652, + "learning_rate": 6.719918980174842e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7680148124694824, + "step": 4790 + }, + { + "epoch": 0.8819201765679603, + "grad_norm": 0.9477334526426899, + "learning_rate": 6.6937761009382816e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.759226131439209, + "step": 4795 + }, + { + "epoch": 0.8828398013610447, + "grad_norm": 0.9350684746914302, + "learning_rate": 6.667825660233736e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7565145611763, + "step": 4800 + }, + { + "epoch": 0.8837594261541292, + "grad_norm": 0.9492764392082258, + "learning_rate": 6.642067898079038e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7527845025062561, + "step": 4805 + }, + { + "epoch": 0.8846790509472136, + "grad_norm": 0.8598768439927121, + "learning_rate": 6.616503052709914e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7648340344429017, + "step": 4810 + }, + { + "epoch": 0.8855986757402979, + "grad_norm": 0.9446656437839204, + "learning_rate": 6.591131360577795e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7575154542922974, + "step": 4815 + }, + { + "epoch": 0.8865183005333824, + "grad_norm": 0.8652514268793213, + "learning_rate": 6.565953056347608e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7725171089172364, + "step": 4820 + }, + { + "epoch": 0.8874379253264668, + "grad_norm": 0.9422431334861092, + "learning_rate": 6.540968372895634e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7611649394035339, + "step": 4825 + }, + { + "epoch": 0.8883575501195512, + "grad_norm": 0.9384703132768932, + "learning_rate": 6.516177541307333e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7624763369560241, + "step": 4830 + }, + { + "epoch": 0.8892771749126357, + "grad_norm": 1.015847599195386, + "learning_rate": 6.491580790875209e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7621793508529663, + "step": 4835 + }, + { + "epoch": 0.8901967997057201, + "grad_norm": 0.9098096698494834, + "learning_rate": 6.4671783490966945e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7614699125289917, + "step": 4840 + }, + { + "epoch": 0.8911164244988045, + "grad_norm": 0.9558674059824713, + "learning_rate": 6.442970441672051e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7470506310462952, + "step": 4845 + }, + { + "epoch": 0.892036049291889, + "grad_norm": 0.9590352976202275, + "learning_rate": 6.4189572925022655e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7472939848899841, + "step": 4850 + }, + { + "epoch": 0.8929556740849733, + "grad_norm": 0.8982751392912057, + "learning_rate": 6.3951391236869985e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7548177719116211, + "step": 4855 + }, + { + "epoch": 0.8938752988780577, + "grad_norm": 0.9627549202883984, + "learning_rate": 6.371516155522513e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7578222513198852, + "step": 4860 + }, + { + "epoch": 0.8947949236711422, + "grad_norm": 0.962995623951893, + "learning_rate": 6.3480886064996484e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7579006910324096, + "step": 4865 + }, + { + "epoch": 0.8957145484642266, + "grad_norm": 0.99045632467858, + "learning_rate": 6.3248566933017975e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.75965256690979, + "step": 4870 + }, + { + "epoch": 0.896634173257311, + "grad_norm": 0.9510071830298487, + "learning_rate": 6.3018206308028975e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7584743499755859, + "step": 4875 + }, + { + "epoch": 0.8975537980503955, + "grad_norm": 0.9703791789576997, + "learning_rate": 6.2789806320654456e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.7649904489517212, + "step": 4880 + }, + { + "epoch": 0.8984734228434799, + "grad_norm": 0.9398378664335288, + "learning_rate": 6.256336908338531e-06, + "loss": 0.78, + "mean_token_accuracy": 0.767956817150116, + "step": 4885 + }, + { + "epoch": 0.8993930476365642, + "grad_norm": 0.987114293205303, + "learning_rate": 6.233889669055878e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7497469425201416, + "step": 4890 + }, + { + "epoch": 0.9003126724296487, + "grad_norm": 0.9343500174042304, + "learning_rate": 6.211639121833912e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.763602340221405, + "step": 4895 + }, + { + "epoch": 0.9012322972227331, + "grad_norm": 0.9262644956755969, + "learning_rate": 6.189585472469829e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7697998642921448, + "step": 4900 + }, + { + "epoch": 0.9021519220158175, + "grad_norm": 0.9622834108867682, + "learning_rate": 6.167728924939705e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7625941157341003, + "step": 4905 + }, + { + "epoch": 0.903071546808902, + "grad_norm": 0.9190192726730757, + "learning_rate": 6.146069681396612e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7542304992675781, + "step": 4910 + }, + { + "epoch": 0.9039911716019864, + "grad_norm": 0.9361246140345745, + "learning_rate": 6.124607942168726e-06, + "loss": 0.8031, + "mean_token_accuracy": 0.7584469556808472, + "step": 4915 + }, + { + "epoch": 0.9049107963950708, + "grad_norm": 0.9457716726884055, + "learning_rate": 6.1033439057574965e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.758701741695404, + "step": 4920 + }, + { + "epoch": 0.9058304211881553, + "grad_norm": 0.8853750515926242, + "learning_rate": 6.082277768835807e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.763675856590271, + "step": 4925 + }, + { + "epoch": 0.9067500459812396, + "grad_norm": 0.9702784866596219, + "learning_rate": 6.061409726246143e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7646818399429322, + "step": 4930 + }, + { + "epoch": 0.907669670774324, + "grad_norm": 0.9693421985103569, + "learning_rate": 6.040739970998802e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7530786991119385, + "step": 4935 + }, + { + "epoch": 0.9085892955674085, + "grad_norm": 0.8930655347204544, + "learning_rate": 6.020268694270109e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7641753435134888, + "step": 4940 + }, + { + "epoch": 0.9095089203604929, + "grad_norm": 0.908390221485836, + "learning_rate": 5.999996085400643e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7642928123474121, + "step": 4945 + }, + { + "epoch": 0.9104285451535773, + "grad_norm": 0.9291773666129768, + "learning_rate": 5.9799223318934765e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7588168382644653, + "step": 4950 + }, + { + "epoch": 0.9113481699466618, + "grad_norm": 0.9290002720904244, + "learning_rate": 5.9600476194124675e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.763935673236847, + "step": 4955 + }, + { + "epoch": 0.9122677947397462, + "grad_norm": 0.9446442087955222, + "learning_rate": 5.9403721317805245e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7578533172607422, + "step": 4960 + }, + { + "epoch": 0.9131874195328306, + "grad_norm": 0.9568316679901518, + "learning_rate": 5.920896050977891e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7361096501350403, + "step": 4965 + }, + { + "epoch": 0.914107044325915, + "grad_norm": 0.9761363167639366, + "learning_rate": 5.901619557140502e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7517902731895447, + "step": 4970 + }, + { + "epoch": 0.9150266691189994, + "grad_norm": 0.9363921634925068, + "learning_rate": 5.882542828558286e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7580497026443481, + "step": 4975 + }, + { + "epoch": 0.9159462939120838, + "grad_norm": 0.9898749363112332, + "learning_rate": 5.86366604167352e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7676722645759583, + "step": 4980 + }, + { + "epoch": 0.9168659187051683, + "grad_norm": 0.9461120512925497, + "learning_rate": 5.844989371079215e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.7703205943107605, + "step": 4985 + }, + { + "epoch": 0.9177855434982527, + "grad_norm": 0.9340964548547984, + "learning_rate": 5.826512989517478e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7529069542884826, + "step": 4990 + }, + { + "epoch": 0.9187051682913371, + "grad_norm": 0.9542091804584825, + "learning_rate": 5.808237067877942e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7639023303985596, + "step": 4995 + }, + { + "epoch": 0.9196247930844216, + "grad_norm": 0.9799469338180448, + "learning_rate": 5.790161775196144e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7624092340469361, + "step": 5000 + }, + { + "epoch": 0.920544417877506, + "grad_norm": 0.9533254080832144, + "learning_rate": 5.772287278652012e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7598010182380677, + "step": 5005 + }, + { + "epoch": 0.9214640426705903, + "grad_norm": 0.9311527277134242, + "learning_rate": 5.754613743568279e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7638931751251221, + "step": 5010 + }, + { + "epoch": 0.9223836674636748, + "grad_norm": 0.9812836116539834, + "learning_rate": 5.737141333408972e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.7612162590026855, + "step": 5015 + }, + { + "epoch": 0.9233032922567592, + "grad_norm": 0.9745443553849291, + "learning_rate": 5.719870209777896e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7509512066841125, + "step": 5020 + }, + { + "epoch": 0.9242229170498437, + "grad_norm": 0.9530895065948418, + "learning_rate": 5.702800532417144e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7625620007514954, + "step": 5025 + }, + { + "epoch": 0.9251425418429281, + "grad_norm": 0.9106620317823355, + "learning_rate": 5.685932459205606e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7597783088684082, + "step": 5030 + }, + { + "epoch": 0.9260621666360125, + "grad_norm": 0.9016062622069709, + "learning_rate": 5.669266146157527e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7618203997612, + "step": 5035 + }, + { + "epoch": 0.926981791429097, + "grad_norm": 0.9311871037406105, + "learning_rate": 5.652801747421053e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7672530770301819, + "step": 5040 + }, + { + "epoch": 0.9279014162221814, + "grad_norm": 0.9289149914362874, + "learning_rate": 5.636539415276807e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7606992840766906, + "step": 5045 + }, + { + "epoch": 0.9288210410152657, + "grad_norm": 0.9265920738234094, + "learning_rate": 5.620479300136475e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7715546011924743, + "step": 5050 + }, + { + "epoch": 0.9297406658083502, + "grad_norm": 1.001963123510446, + "learning_rate": 5.604621550541429e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7474547743797302, + "step": 5055 + }, + { + "epoch": 0.9306602906014346, + "grad_norm": 0.9062392197653472, + "learning_rate": 5.5889663131613465e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7512851595878601, + "step": 5060 + }, + { + "epoch": 0.931579915394519, + "grad_norm": 0.9878466692235598, + "learning_rate": 5.5735137327928384e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7595331549644471, + "step": 5065 + }, + { + "epoch": 0.9324995401876035, + "grad_norm": 0.911756127989921, + "learning_rate": 5.558263952358139e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7572713255882263, + "step": 5070 + }, + { + "epoch": 0.9334191649806879, + "grad_norm": 0.9534452188147857, + "learning_rate": 5.543217112903766e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7591339111328125, + "step": 5075 + }, + { + "epoch": 0.9343387897737723, + "grad_norm": 0.94136690175154, + "learning_rate": 5.528373353599207e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7594197154045105, + "step": 5080 + }, + { + "epoch": 0.9352584145668568, + "grad_norm": 0.9367268234664168, + "learning_rate": 5.513732811735657e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7594240307807922, + "step": 5085 + }, + { + "epoch": 0.9361780393599411, + "grad_norm": 0.8975989192963018, + "learning_rate": 5.4992956227247345e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7677939176559448, + "step": 5090 + }, + { + "epoch": 0.9370976641530255, + "grad_norm": 0.9987125543689239, + "learning_rate": 5.48506192009722e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7597865104675293, + "step": 5095 + }, + { + "epoch": 0.93801728894611, + "grad_norm": 0.9396093256392507, + "learning_rate": 5.4710318355018435e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7557710766792297, + "step": 5100 + }, + { + "epoch": 0.9389369137391944, + "grad_norm": 0.907072734656757, + "learning_rate": 5.457205498704046e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7568627595901489, + "step": 5105 + }, + { + "epoch": 0.9398565385322788, + "grad_norm": 0.9498606808400206, + "learning_rate": 5.443583037584792e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7537372469902038, + "step": 5110 + }, + { + "epoch": 0.9407761633253633, + "grad_norm": 0.9500188031150016, + "learning_rate": 5.430164578139382e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7692322492599487, + "step": 5115 + }, + { + "epoch": 0.9416957881184477, + "grad_norm": 0.9133488515736051, + "learning_rate": 5.4169502444762836e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7578924179077149, + "step": 5120 + }, + { + "epoch": 0.9426154129115321, + "grad_norm": 0.9585342004886042, + "learning_rate": 5.403940158815996e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7570155620574951, + "step": 5125 + }, + { + "epoch": 0.9435350377046166, + "grad_norm": 0.9797939933864984, + "learning_rate": 5.391134441489905e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7618912696838379, + "step": 5130 + }, + { + "epoch": 0.9444546624977009, + "grad_norm": 0.9293935572688817, + "learning_rate": 5.378533210939176e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.7596281886100769, + "step": 5135 + }, + { + "epoch": 0.9453742872907853, + "grad_norm": 0.9221042858985046, + "learning_rate": 5.366136583713665e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7698543071746826, + "step": 5140 + }, + { + "epoch": 0.9462939120838698, + "grad_norm": 1.025946124148099, + "learning_rate": 5.353944674470823e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7552660465240478, + "step": 5145 + }, + { + "epoch": 0.9472135368769542, + "grad_norm": 0.984504169212397, + "learning_rate": 5.341957595974662e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7498656630516052, + "step": 5150 + }, + { + "epoch": 0.9481331616700386, + "grad_norm": 0.9188252633726173, + "learning_rate": 5.3301754590946824e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7552522420883179, + "step": 5155 + }, + { + "epoch": 0.9490527864631231, + "grad_norm": 0.8673224532160614, + "learning_rate": 5.318598372804873e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7689907431602478, + "step": 5160 + }, + { + "epoch": 0.9499724112562075, + "grad_norm": 0.9392909148393203, + "learning_rate": 5.307226444182686e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7654459595680236, + "step": 5165 + }, + { + "epoch": 0.9508920360492918, + "grad_norm": 1.0092515399603914, + "learning_rate": 5.296059778408057e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7547815799713135, + "step": 5170 + }, + { + "epoch": 0.9518116608423763, + "grad_norm": 0.9724478118701938, + "learning_rate": 5.2850984787624264e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.757933521270752, + "step": 5175 + }, + { + "epoch": 0.9527312856354607, + "grad_norm": 0.9595437776833703, + "learning_rate": 5.274342646627783e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7451163768768311, + "step": 5180 + }, + { + "epoch": 0.9536509104285451, + "grad_norm": 0.9035621461181421, + "learning_rate": 5.263792381485733e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7612574458122253, + "step": 5185 + }, + { + "epoch": 0.9545705352216296, + "grad_norm": 0.9369759529937411, + "learning_rate": 5.253447780916577e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.755517327785492, + "step": 5190 + }, + { + "epoch": 0.955490160014714, + "grad_norm": 0.9223279306007958, + "learning_rate": 5.2433089405984e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7672001838684082, + "step": 5195 + }, + { + "epoch": 0.9564097848077984, + "grad_norm": 0.9093658718364905, + "learning_rate": 5.233375954306199e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7701982975006103, + "step": 5200 + }, + { + "epoch": 0.9573294096008829, + "grad_norm": 0.9756234794282658, + "learning_rate": 5.22364891391101e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.75344318151474, + "step": 5205 + }, + { + "epoch": 0.9582490343939672, + "grad_norm": 0.910212786589889, + "learning_rate": 5.2141279093790575e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7678821444511413, + "step": 5210 + }, + { + "epoch": 0.9591686591870516, + "grad_norm": 0.9474929875705357, + "learning_rate": 5.204813028770913e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7625754833221435, + "step": 5215 + }, + { + "epoch": 0.9600882839801361, + "grad_norm": 0.9344552952746554, + "learning_rate": 5.195704358240704e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.759453558921814, + "step": 5220 + }, + { + "epoch": 0.9610079087732205, + "grad_norm": 0.9060367178226402, + "learning_rate": 5.186801982035298e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7654222846031189, + "step": 5225 + }, + { + "epoch": 0.9619275335663049, + "grad_norm": 0.9799737312884412, + "learning_rate": 5.178105982493528e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7591325879096985, + "step": 5230 + }, + { + "epoch": 0.9628471583593894, + "grad_norm": 0.9419373863409995, + "learning_rate": 5.169616440045433e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7605907201766968, + "step": 5235 + }, + { + "epoch": 0.9637667831524738, + "grad_norm": 0.904753211539841, + "learning_rate": 5.16133343321151e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7628448724746704, + "step": 5240 + }, + { + "epoch": 0.9646864079455583, + "grad_norm": 0.9588441625989744, + "learning_rate": 5.1532570386019944e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7675014138221741, + "step": 5245 + }, + { + "epoch": 0.9656060327386427, + "grad_norm": 0.8875696215604679, + "learning_rate": 5.145387330916144e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7614070296287536, + "step": 5250 + }, + { + "epoch": 0.966525657531727, + "grad_norm": 0.9405630235157387, + "learning_rate": 5.137724382941557e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7650785088539124, + "step": 5255 + }, + { + "epoch": 0.9674452823248115, + "grad_norm": 0.9562043810312459, + "learning_rate": 5.130268265553487e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7557086706161499, + "step": 5260 + }, + { + "epoch": 0.9683649071178959, + "grad_norm": 0.9274811086930055, + "learning_rate": 5.123019047714198e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7753474235534668, + "step": 5265 + }, + { + "epoch": 0.9692845319109803, + "grad_norm": 0.9409745943869224, + "learning_rate": 5.115976796472322e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7535906672477722, + "step": 5270 + }, + { + "epoch": 0.9702041567040648, + "grad_norm": 0.919927159373234, + "learning_rate": 5.109141576962239e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7655844688415527, + "step": 5275 + }, + { + "epoch": 0.9711237814971492, + "grad_norm": 0.951329112362283, + "learning_rate": 5.102513452403473e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7696467399597168, + "step": 5280 + }, + { + "epoch": 0.9720434062902336, + "grad_norm": 0.9201946233258363, + "learning_rate": 5.0960924841001155e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7610312700271606, + "step": 5285 + }, + { + "epoch": 0.972963031083318, + "grad_norm": 1.0032717462292577, + "learning_rate": 5.089878731440241e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7543939590454102, + "step": 5290 + }, + { + "epoch": 0.9738826558764024, + "grad_norm": 0.9429172545610519, + "learning_rate": 5.0838722518953816e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7595749855041504, + "step": 5295 + }, + { + "epoch": 0.9748022806694868, + "grad_norm": 0.9007616401314099, + "learning_rate": 5.078073101019974e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7579713940620423, + "step": 5300 + }, + { + "epoch": 0.9757219054625713, + "grad_norm": 0.8990406462252963, + "learning_rate": 5.072481332450857e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.7577333807945251, + "step": 5305 + }, + { + "epoch": 0.9766415302556557, + "grad_norm": 0.9615340254243923, + "learning_rate": 5.067096997906774e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7705414056777954, + "step": 5310 + }, + { + "epoch": 0.9775611550487401, + "grad_norm": 0.8455749234692341, + "learning_rate": 5.06192014718789e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7697661995887757, + "step": 5315 + }, + { + "epoch": 0.9784807798418246, + "grad_norm": 0.9292612449999305, + "learning_rate": 5.05695082817534e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7671653866767884, + "step": 5320 + }, + { + "epoch": 0.979400404634909, + "grad_norm": 0.9275056123774931, + "learning_rate": 5.052189086830779e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7623230576515198, + "step": 5325 + }, + { + "epoch": 0.9803200294279933, + "grad_norm": 0.9703545231339168, + "learning_rate": 5.047634967195952e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7638481616973877, + "step": 5330 + }, + { + "epoch": 0.9812396542210778, + "grad_norm": 0.955542417327297, + "learning_rate": 5.043288511392302e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7614734530448913, + "step": 5335 + }, + { + "epoch": 0.9821592790141622, + "grad_norm": 0.9645172124378145, + "learning_rate": 5.039149759620569e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.7724639177322388, + "step": 5340 + }, + { + "epoch": 0.9830789038072466, + "grad_norm": 0.9734387825498484, + "learning_rate": 5.0352187501604155e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.746760880947113, + "step": 5345 + }, + { + "epoch": 0.9839985286003311, + "grad_norm": 0.9730228991663388, + "learning_rate": 5.031495519370083e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.758979082107544, + "step": 5350 + }, + { + "epoch": 0.9849181533934155, + "grad_norm": 1.0013660074202417, + "learning_rate": 5.027980101686053e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7509408593177795, + "step": 5355 + }, + { + "epoch": 0.9858377781864999, + "grad_norm": 0.9817157587290055, + "learning_rate": 5.024672529622717e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7596516370773315, + "step": 5360 + }, + { + "epoch": 0.9867574029795844, + "grad_norm": 0.9800745490721745, + "learning_rate": 5.0215728337720955e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7768563270568848, + "step": 5365 + }, + { + "epoch": 0.9876770277726687, + "grad_norm": 0.99189390574119, + "learning_rate": 5.018681042803533e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7670275330543518, + "step": 5370 + }, + { + "epoch": 0.9885966525657531, + "grad_norm": 0.9673022649880465, + "learning_rate": 5.0159971834634545e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.764349353313446, + "step": 5375 + }, + { + "epoch": 0.9895162773588376, + "grad_norm": 1.0182176113772272, + "learning_rate": 5.013521280575099e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7618956327438354, + "step": 5380 + }, + { + "epoch": 0.990435902151922, + "grad_norm": 0.9959171759739962, + "learning_rate": 5.011253357038306e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7527823686599732, + "step": 5385 + }, + { + "epoch": 0.9913555269450064, + "grad_norm": 0.8997528487054468, + "learning_rate": 5.0091934338292915e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.7715205192565918, + "step": 5390 + }, + { + "epoch": 0.9922751517380909, + "grad_norm": 0.919462849827096, + "learning_rate": 5.00734153000046e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.77668297290802, + "step": 5395 + }, + { + "epoch": 0.9931947765311753, + "grad_norm": 0.984326555402561, + "learning_rate": 5.005697662680227e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7626922607421875, + "step": 5400 + }, + { + "epoch": 0.9941144013242597, + "grad_norm": 0.9499542228497883, + "learning_rate": 5.004261847072863e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7542143225669861, + "step": 5405 + }, + { + "epoch": 0.9950340261173442, + "grad_norm": 0.9585799297597308, + "learning_rate": 5.003034096458347e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7544377326965332, + "step": 5410 + }, + { + "epoch": 0.9959536509104285, + "grad_norm": 0.9165677599227604, + "learning_rate": 5.0020144221922466e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7582892417907715, + "step": 5415 + }, + { + "epoch": 0.9968732757035129, + "grad_norm": 0.9449991405622632, + "learning_rate": 5.001202833705621e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7502840042114258, + "step": 5420 + }, + { + "epoch": 0.9977929004965974, + "grad_norm": 0.9827477783752422, + "learning_rate": 5.000599338504916e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.762959897518158, + "step": 5425 + }, + { + "epoch": 0.9987125252896818, + "grad_norm": 0.9751233701044131, + "learning_rate": 5.0002039421719105e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7619426846504211, + "step": 5430 + }, + { + "epoch": 0.9996321500827662, + "grad_norm": 0.971614941671036, + "learning_rate": 5.000016648363663e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7594120621681213, + "step": 5435 + }, + { + "epoch": 1.0, + "mean_token_accuracy": 0.779580146074295, + "step": 5437, + "total_flos": 77442066677760.0, + "train_loss": 0.8871173100675843, + "train_runtime": 5515.7519, + "train_samples_per_second": 15.771, + "train_steps_per_second": 0.986 + } + ], + "logging_steps": 5, + "max_steps": 5437, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 77442066677760.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}