{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 5437,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0009196247930844216,
      "grad_norm": 4.087223679622462,
      "learning_rate": 9.191176470588236e-07,
      "loss": 1.3446,
      "mean_token_accuracy": 0.6661458969116211,
      "step": 5
    },
    {
      "epoch": 0.0018392495861688431,
      "grad_norm": 3.3376471514991324,
      "learning_rate": 1.8382352941176471e-06,
      "loss": 1.2534,
      "mean_token_accuracy": 0.6856188654899598,
      "step": 10
    },
    {
      "epoch": 0.0027588743792532648,
      "grad_norm": 3.1883807133419646,
      "learning_rate": 2.7573529411764708e-06,
      "loss": 1.2495,
      "mean_token_accuracy": 0.6844112038612366,
      "step": 15
    },
    {
      "epoch": 0.0036784991723376862,
      "grad_norm": 2.5757356327081826,
      "learning_rate": 3.6764705882352942e-06,
      "loss": 1.1962,
      "mean_token_accuracy": 0.6918170928955079,
      "step": 20
    },
    {
      "epoch": 0.004598123965422108,
      "grad_norm": 2.3971194855376092,
      "learning_rate": 4.595588235294118e-06,
      "loss": 1.2274,
      "mean_token_accuracy": 0.6844529986381531,
      "step": 25
    },
    {
      "epoch": 0.0055177487585065296,
      "grad_norm": 2.00434532423879,
      "learning_rate": 5.5147058823529415e-06,
      "loss": 1.1506,
      "mean_token_accuracy": 0.697660756111145,
      "step": 30
    },
    {
      "epoch": 0.006437373551590951,
      "grad_norm": 2.0663662496595543,
      "learning_rate": 6.433823529411764e-06,
      "loss": 1.1278,
      "mean_token_accuracy": 0.6973050832748413,
      "step": 35
    },
    {
      "epoch": 0.0073569983446753725,
      "grad_norm": 1.9519049901829761,
      "learning_rate": 7.3529411764705884e-06,
      "loss": 1.102,
      "mean_token_accuracy": 0.7046478033065796,
      "step": 40
    },
    {
      "epoch": 0.008276623137759793,
      "grad_norm": 1.8451875842176761,
      "learning_rate": 8.272058823529413e-06,
      "loss": 1.125,
      "mean_token_accuracy": 0.6951346158981323,
      "step": 45
    },
    {
      "epoch": 0.009196247930844215,
      "grad_norm": 2.000034845742239,
      "learning_rate": 9.191176470588236e-06,
      "loss": 1.0295,
      "mean_token_accuracy": 0.7154734015464783,
      "step": 50
    },
    {
      "epoch": 0.010115872723928637,
      "grad_norm": 1.621484821283711,
      "learning_rate": 1.011029411764706e-05,
      "loss": 1.0762,
      "mean_token_accuracy": 0.706468117237091,
      "step": 55
    },
    {
      "epoch": 0.011035497517013059,
      "grad_norm": 1.753826025706781,
      "learning_rate": 1.1029411764705883e-05,
      "loss": 1.0394,
      "mean_token_accuracy": 0.7156139016151428,
      "step": 60
    },
    {
      "epoch": 0.011955122310097481,
      "grad_norm": 1.6505676536191385,
      "learning_rate": 1.1948529411764707e-05,
      "loss": 1.0338,
      "mean_token_accuracy": 0.7132004976272583,
      "step": 65
    },
    {
      "epoch": 0.012874747103181901,
      "grad_norm": 1.8513933357249144,
      "learning_rate": 1.2867647058823528e-05,
      "loss": 0.9804,
      "mean_token_accuracy": 0.7274341702461242,
      "step": 70
    },
    {
      "epoch": 0.013794371896266323,
      "grad_norm": 2.4070230665851993,
      "learning_rate": 1.3786764705882355e-05,
      "loss": 1.0398,
      "mean_token_accuracy": 0.7116599082946777,
      "step": 75
    },
    {
      "epoch": 0.014713996689350745,
      "grad_norm": 1.798866895809756,
      "learning_rate": 1.4705882352941177e-05,
      "loss": 0.9922,
      "mean_token_accuracy": 0.720504081249237,
      "step": 80
    },
    {
      "epoch": 0.015633621482435165,
      "grad_norm": 1.709611126629724,
      "learning_rate": 1.5625e-05,
      "loss": 0.9938,
      "mean_token_accuracy": 0.7247263193130493,
      "step": 85
    },
    {
      "epoch": 0.016553246275519587,
      "grad_norm": 1.7626425485303618,
      "learning_rate": 1.6544117647058825e-05,
      "loss": 1.0122,
      "mean_token_accuracy": 0.717292582988739,
      "step": 90
    },
    {
      "epoch": 0.01747287106860401,
      "grad_norm": 2.036503882503329,
      "learning_rate": 1.7463235294117647e-05,
      "loss": 1.0109,
      "mean_token_accuracy": 0.7172105073928833,
      "step": 95
    },
    {
      "epoch": 0.01839249586168843,
      "grad_norm": 1.927409741133158,
      "learning_rate": 1.8382352941176472e-05,
      "loss": 1.0434,
      "mean_token_accuracy": 0.7078547954559327,
      "step": 100
    },
    {
      "epoch": 0.019312120654772853,
      "grad_norm": 2.079665033278075,
      "learning_rate": 1.9301470588235298e-05,
      "loss": 0.9959,
      "mean_token_accuracy": 0.7182355523109436,
      "step": 105
    },
    {
      "epoch": 0.020231745447857274,
      "grad_norm": 1.8479982769163703,
      "learning_rate": 2.022058823529412e-05,
      "loss": 1.0194,
      "mean_token_accuracy": 0.7173629522323608,
      "step": 110
    },
    {
      "epoch": 0.021151370240941696,
      "grad_norm": 1.831806807070413,
      "learning_rate": 2.113970588235294e-05,
      "loss": 0.9569,
      "mean_token_accuracy": 0.7312556385993958,
      "step": 115
    },
    {
      "epoch": 0.022070995034026118,
      "grad_norm": 1.7952413093248756,
      "learning_rate": 2.2058823529411766e-05,
      "loss": 1.0149,
      "mean_token_accuracy": 0.7192024111747741,
      "step": 120
    },
    {
      "epoch": 0.02299061982711054,
      "grad_norm": 1.6441769080980864,
      "learning_rate": 2.2977941176470588e-05,
      "loss": 0.9668,
      "mean_token_accuracy": 0.7280102610588074,
      "step": 125
    },
    {
      "epoch": 0.023910244620194962,
      "grad_norm": 1.7182187182460715,
      "learning_rate": 2.3897058823529413e-05,
      "loss": 1.025,
      "mean_token_accuracy": 0.7164386153221131,
      "step": 130
    },
    {
      "epoch": 0.02482986941327938,
      "grad_norm": 1.7665031820505241,
      "learning_rate": 2.4816176470588238e-05,
      "loss": 0.9879,
      "mean_token_accuracy": 0.7216517567634583,
      "step": 135
    },
    {
      "epoch": 0.025749494206363802,
      "grad_norm": 1.65781753659198,
      "learning_rate": 2.5735294117647057e-05,
      "loss": 1.0204,
      "mean_token_accuracy": 0.7183511853218079,
      "step": 140
    },
    {
      "epoch": 0.026669118999448224,
      "grad_norm": 1.5947996494100198,
      "learning_rate": 2.6654411764705882e-05,
      "loss": 0.9915,
      "mean_token_accuracy": 0.7210009098052979,
      "step": 145
    },
    {
      "epoch": 0.027588743792532646,
      "grad_norm": 1.6195741488866147,
      "learning_rate": 2.757352941176471e-05,
      "loss": 0.9609,
      "mean_token_accuracy": 0.7290344476699829,
      "step": 150
    },
    {
      "epoch": 0.028508368585617068,
      "grad_norm": 1.700795937176488,
      "learning_rate": 2.849264705882353e-05,
      "loss": 1.0017,
      "mean_token_accuracy": 0.7190845251083374,
      "step": 155
    },
    {
      "epoch": 0.02942799337870149,
      "grad_norm": 1.6626957868958252,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 0.9801,
      "mean_token_accuracy": 0.7264268517494201,
      "step": 160
    },
    {
      "epoch": 0.03034761817178591,
      "grad_norm": 1.646176772035618,
      "learning_rate": 3.0330882352941176e-05,
      "loss": 0.9819,
      "mean_token_accuracy": 0.7258347868919373,
      "step": 165
    },
    {
      "epoch": 0.03126724296487033,
      "grad_norm": 1.7051406597026453,
      "learning_rate": 3.125e-05,
      "loss": 1.0021,
      "mean_token_accuracy": 0.7193678379058838,
      "step": 170
    },
    {
      "epoch": 0.032186867757954755,
      "grad_norm": 1.6583599673202631,
      "learning_rate": 3.2169117647058826e-05,
      "loss": 0.9863,
      "mean_token_accuracy": 0.7218608260154724,
      "step": 175
    },
    {
      "epoch": 0.033106492551039174,
      "grad_norm": 1.6811054631655953,
      "learning_rate": 3.308823529411765e-05,
      "loss": 0.9776,
      "mean_token_accuracy": 0.7252245903015136,
      "step": 180
    },
    {
      "epoch": 0.0340261173441236,
      "grad_norm": 1.6005295960642778,
      "learning_rate": 3.4007352941176476e-05,
      "loss": 0.952,
      "mean_token_accuracy": 0.7300998091697692,
      "step": 185
    },
    {
      "epoch": 0.03494574213720802,
      "grad_norm": 1.884741061084924,
      "learning_rate": 3.4926470588235294e-05,
      "loss": 1.0216,
      "mean_token_accuracy": 0.7144460439682007,
      "step": 190
    },
    {
      "epoch": 0.03586536693029244,
      "grad_norm": 1.61333499821342,
      "learning_rate": 3.584558823529412e-05,
      "loss": 1.0067,
      "mean_token_accuracy": 0.7160724878311158,
      "step": 195
    },
    {
      "epoch": 0.03678499172337686,
      "grad_norm": 1.592957572722435,
      "learning_rate": 3.6764705882352945e-05,
      "loss": 0.9367,
      "mean_token_accuracy": 0.7348474979400634,
      "step": 200
    },
    {
      "epoch": 0.03770461651646129,
      "grad_norm": 1.7666690880786284,
      "learning_rate": 3.768382352941176e-05,
      "loss": 0.9545,
      "mean_token_accuracy": 0.7297826528549194,
      "step": 205
    },
    {
      "epoch": 0.038624241309545705,
      "grad_norm": 1.5696177739032589,
      "learning_rate": 3.8602941176470595e-05,
      "loss": 1.0076,
      "mean_token_accuracy": 0.7160616636276245,
      "step": 210
    },
    {
      "epoch": 0.039543866102630124,
      "grad_norm": 1.5375849975431441,
      "learning_rate": 3.952205882352941e-05,
      "loss": 1.0082,
      "mean_token_accuracy": 0.7139402985572815,
      "step": 215
    },
    {
      "epoch": 0.04046349089571455,
      "grad_norm": 1.6613621558577687,
      "learning_rate": 4.044117647058824e-05,
      "loss": 1.0047,
      "mean_token_accuracy": 0.7157810091972351,
      "step": 220
    },
    {
      "epoch": 0.04138311568879897,
      "grad_norm": 1.6712866586887962,
      "learning_rate": 4.136029411764706e-05,
      "loss": 0.9841,
      "mean_token_accuracy": 0.7261144757270813,
      "step": 225
    },
    {
      "epoch": 0.04230274048188339,
      "grad_norm": 1.5868739813391535,
      "learning_rate": 4.227941176470588e-05,
      "loss": 1.0063,
      "mean_token_accuracy": 0.7146228194236756,
      "step": 230
    },
    {
      "epoch": 0.04322236527496781,
      "grad_norm": 1.4745940440239442,
      "learning_rate": 4.319852941176471e-05,
      "loss": 0.9895,
      "mean_token_accuracy": 0.7205227255821228,
      "step": 235
    },
    {
      "epoch": 0.044141990068052236,
      "grad_norm": 1.565812920746474,
      "learning_rate": 4.411764705882353e-05,
      "loss": 0.9883,
      "mean_token_accuracy": 0.7221224546432495,
      "step": 240
    },
    {
      "epoch": 0.045061614861136655,
      "grad_norm": 1.579279007990175,
      "learning_rate": 4.503676470588236e-05,
      "loss": 1.0339,
      "mean_token_accuracy": 0.7140692472457886,
      "step": 245
    },
    {
      "epoch": 0.04598123965422108,
      "grad_norm": 1.550674625710887,
      "learning_rate": 4.5955882352941176e-05,
      "loss": 1.009,
      "mean_token_accuracy": 0.717827045917511,
      "step": 250
    },
    {
      "epoch": 0.0469008644473055,
      "grad_norm": 1.494069442893164,
      "learning_rate": 4.6875e-05,
      "loss": 1.0163,
      "mean_token_accuracy": 0.7157993316650391,
      "step": 255
    },
    {
      "epoch": 0.047820489240389924,
      "grad_norm": 1.585433590429472,
      "learning_rate": 4.7794117647058826e-05,
      "loss": 0.9662,
      "mean_token_accuracy": 0.7260660767555237,
      "step": 260
    },
    {
      "epoch": 0.04874011403347434,
      "grad_norm": 1.5561077784742092,
      "learning_rate": 4.871323529411765e-05,
      "loss": 1.0521,
      "mean_token_accuracy": 0.7059531569480896,
      "step": 265
    },
    {
      "epoch": 0.04965973882655876,
      "grad_norm": 1.3842507274813078,
      "learning_rate": 4.9632352941176476e-05,
      "loss": 0.96,
      "mean_token_accuracy": 0.7317641496658325,
      "step": 270
    },
    {
      "epoch": 0.050579363619643186,
      "grad_norm": 1.4379239878799341,
      "learning_rate": 4.999996254118754e-05,
      "loss": 0.972,
      "mean_token_accuracy": 0.7297493696212769,
      "step": 275
    },
    {
      "epoch": 0.051498988412727605,
      "grad_norm": 1.3761784967587591,
      "learning_rate": 4.999973362667417e-05,
      "loss": 0.9844,
      "mean_token_accuracy": 0.724224853515625,
      "step": 280
    },
    {
      "epoch": 0.05241861320581203,
      "grad_norm": 1.4249636066532947,
      "learning_rate": 4.999929661021346e-05,
      "loss": 0.9974,
      "mean_token_accuracy": 0.7186186075210571,
      "step": 285
    },
    {
      "epoch": 0.05333823799889645,
      "grad_norm": 1.6467747117004,
      "learning_rate": 4.9998651495847435e-05,
      "loss": 1.0296,
      "mean_token_accuracy": 0.7110173583030701,
      "step": 290
    },
    {
      "epoch": 0.054257862791980874,
      "grad_norm": 1.3761801455599358,
      "learning_rate": 4.9997798289542816e-05,
      "loss": 1.0209,
      "mean_token_accuracy": 0.7124481081962586,
      "step": 295
    },
    {
      "epoch": 0.05517748758506529,
      "grad_norm": 1.4585308096786376,
      "learning_rate": 4.9996736999190965e-05,
      "loss": 1.0248,
      "mean_token_accuracy": 0.7100600242614746,
      "step": 300
    },
    {
      "epoch": 0.05609711237814972,
      "grad_norm": 1.4301378065367794,
      "learning_rate": 4.999546763460785e-05,
      "loss": 0.9864,
      "mean_token_accuracy": 0.7253738522529602,
      "step": 305
    },
    {
      "epoch": 0.057016737171234136,
      "grad_norm": 1.4586102770676173,
      "learning_rate": 4.999399020753393e-05,
      "loss": 0.9541,
      "mean_token_accuracy": 0.7308779239654541,
      "step": 310
    },
    {
      "epoch": 0.05793636196431856,
      "grad_norm": 1.5007400960218442,
      "learning_rate": 4.999230473163406e-05,
      "loss": 1.0123,
      "mean_token_accuracy": 0.7142405152320862,
      "step": 315
    },
    {
      "epoch": 0.05885598675740298,
      "grad_norm": 1.4247385882584611,
      "learning_rate": 4.999041122249735e-05,
      "loss": 1.0097,
      "mean_token_accuracy": 0.7164065957069397,
      "step": 320
    },
    {
      "epoch": 0.0597756115504874,
      "grad_norm": 1.4338281584111965,
      "learning_rate": 4.9988309697637025e-05,
      "loss": 1.0381,
      "mean_token_accuracy": 0.7093045115470886,
      "step": 325
    },
    {
      "epoch": 0.06069523634357182,
      "grad_norm": 1.3206321897141915,
      "learning_rate": 4.9986000176490264e-05,
      "loss": 1.0378,
      "mean_token_accuracy": 0.7081658363342285,
      "step": 330
    },
    {
      "epoch": 0.06161486113665624,
      "grad_norm": 1.4771390057019052,
      "learning_rate": 4.998348268041803e-05,
      "loss": 1.0473,
      "mean_token_accuracy": 0.7044042825698853,
      "step": 335
    },
    {
      "epoch": 0.06253448592974066,
      "grad_norm": 1.410427294901373,
      "learning_rate": 4.9980757232704836e-05,
      "loss": 1.0476,
      "mean_token_accuracy": 0.7044672727584839,
      "step": 340
    },
    {
      "epoch": 0.06345411072282509,
      "grad_norm": 1.293731368317575,
      "learning_rate": 4.997782385855862e-05,
      "loss": 0.9809,
      "mean_token_accuracy": 0.7207650065422058,
      "step": 345
    },
    {
      "epoch": 0.06437373551590951,
      "grad_norm": 1.373213488697433,
      "learning_rate": 4.9974682585110375e-05,
      "loss": 1.0238,
      "mean_token_accuracy": 0.713714337348938,
      "step": 350
    },
    {
      "epoch": 0.06529336030899394,
      "grad_norm": 1.4173612737543944,
      "learning_rate": 4.997133344141402e-05,
      "loss": 0.9995,
      "mean_token_accuracy": 0.7182128310203553,
      "step": 355
    },
    {
      "epoch": 0.06621298510207835,
      "grad_norm": 1.4208487527297817,
      "learning_rate": 4.9967776458446067e-05,
      "loss": 1.0247,
      "mean_token_accuracy": 0.7120985150337219,
      "step": 360
    },
    {
      "epoch": 0.06713260989516277,
      "grad_norm": 1.3468936690832556,
      "learning_rate": 4.996401166910535e-05,
      "loss": 1.0257,
      "mean_token_accuracy": 0.711448609828949,
      "step": 365
    },
    {
      "epoch": 0.0680522346882472,
      "grad_norm": 1.3418384776624692,
      "learning_rate": 4.996003910821273e-05,
      "loss": 0.9908,
      "mean_token_accuracy": 0.7198069810867309,
      "step": 370
    },
    {
      "epoch": 0.06897185948133161,
      "grad_norm": 1.2757020291626893,
      "learning_rate": 4.995585881251076e-05,
      "loss": 1.0029,
      "mean_token_accuracy": 0.7165916681289672,
      "step": 375
    },
    {
      "epoch": 0.06989148427441604,
      "grad_norm": 1.2215136508098425,
      "learning_rate": 4.995147082066335e-05,
      "loss": 1.0071,
      "mean_token_accuracy": 0.7161303281784057,
      "step": 380
    },
    {
      "epoch": 0.07081110906750046,
      "grad_norm": 1.5100364277085054,
      "learning_rate": 4.9946875173255405e-05,
      "loss": 0.9808,
      "mean_token_accuracy": 0.7223702430725097,
      "step": 385
    },
    {
      "epoch": 0.07173073386058489,
      "grad_norm": 1.3193074150499653,
      "learning_rate": 4.9942071912792463e-05,
      "loss": 0.9692,
      "mean_token_accuracy": 0.7253165245056152,
      "step": 390
    },
    {
      "epoch": 0.0726503586536693,
      "grad_norm": 1.360795639773644,
      "learning_rate": 4.9937061083700286e-05,
      "loss": 0.9248,
      "mean_token_accuracy": 0.738149356842041,
      "step": 395
    },
    {
      "epoch": 0.07356998344675372,
      "grad_norm": 1.3934617241628962,
      "learning_rate": 4.993184273232445e-05,
      "loss": 1.0174,
      "mean_token_accuracy": 0.7140317440032959,
      "step": 400
    },
    {
      "epoch": 0.07448960823983815,
      "grad_norm": 1.3755761090465115,
      "learning_rate": 4.9926416906929954e-05,
      "loss": 0.9371,
      "mean_token_accuracy": 0.7347567915916443,
      "step": 405
    },
    {
      "epoch": 0.07540923303292257,
      "grad_norm": 1.3123084901189321,
      "learning_rate": 4.9920783657700685e-05,
      "loss": 1.0494,
      "mean_token_accuracy": 0.7046082258224488,
      "step": 410
    },
    {
      "epoch": 0.07632885782600698,
      "grad_norm": 1.26236320940822,
      "learning_rate": 4.9914943036739075e-05,
      "loss": 0.9813,
      "mean_token_accuracy": 0.7248732924461365,
      "step": 415
    },
    {
      "epoch": 0.07724848261909141,
      "grad_norm": 1.4072657383382854,
      "learning_rate": 4.99088950980655e-05,
      "loss": 1.0041,
      "mean_token_accuracy": 0.7161918520927429,
      "step": 420
    },
    {
      "epoch": 0.07816810741217584,
      "grad_norm": 1.4142932157820918,
      "learning_rate": 4.9902639897617876e-05,
      "loss": 1.0343,
      "mean_token_accuracy": 0.7073235511779785,
      "step": 425
    },
    {
      "epoch": 0.07908773220526025,
      "grad_norm": 1.2620775477382082,
      "learning_rate": 4.9896177493251065e-05,
      "loss": 0.9773,
      "mean_token_accuracy": 0.724228036403656,
      "step": 430
    },
    {
      "epoch": 0.08000735699834467,
      "grad_norm": 1.2299977431090294,
      "learning_rate": 4.9889507944736405e-05,
      "loss": 0.9921,
      "mean_token_accuracy": 0.7193984985351562,
      "step": 435
    },
    {
      "epoch": 0.0809269817914291,
      "grad_norm": 1.272005618491772,
      "learning_rate": 4.9882631313761116e-05,
      "loss": 1.0266,
      "mean_token_accuracy": 0.7106949806213378,
      "step": 440
    },
    {
      "epoch": 0.08184660658451352,
      "grad_norm": 1.3368998742271194,
      "learning_rate": 4.9875547663927744e-05,
      "loss": 0.9945,
      "mean_token_accuracy": 0.7178430318832397,
      "step": 445
    },
    {
      "epoch": 0.08276623137759793,
      "grad_norm": 1.2395804635484349,
      "learning_rate": 4.986825706075357e-05,
      "loss": 0.9614,
      "mean_token_accuracy": 0.7270126938819885,
      "step": 450
    },
    {
      "epoch": 0.08368585617068236,
      "grad_norm": 1.2355105682399337,
      "learning_rate": 4.9860759571669987e-05,
      "loss": 1.017,
      "mean_token_accuracy": 0.7113536357879638,
      "step": 455
    },
    {
      "epoch": 0.08460548096376679,
      "grad_norm": 1.2769471363849882,
      "learning_rate": 4.985305526602192e-05,
      "loss": 0.9841,
      "mean_token_accuracy": 0.7207873582839965,
      "step": 460
    },
    {
      "epoch": 0.08552510575685121,
      "grad_norm": 1.3105851965485462,
      "learning_rate": 4.984514421506715e-05,
      "loss": 1.0238,
      "mean_token_accuracy": 0.7113570213317871,
      "step": 465
    },
    {
      "epoch": 0.08644473054993562,
      "grad_norm": 1.2226583029739935,
      "learning_rate": 4.983702649197565e-05,
      "loss": 1.0026,
      "mean_token_accuracy": 0.7175478458404541,
      "step": 470
    },
    {
      "epoch": 0.08736435534302005,
      "grad_norm": 1.3032963672614144,
      "learning_rate": 4.982870217182893e-05,
      "loss": 1.0102,
      "mean_token_accuracy": 0.7142111778259277,
      "step": 475
    },
    {
      "epoch": 0.08828398013610447,
      "grad_norm": 1.276533355049304,
      "learning_rate": 4.9820171331619343e-05,
      "loss": 1.0175,
      "mean_token_accuracy": 0.7140154242515564,
      "step": 480
    },
    {
      "epoch": 0.08920360492918888,
      "grad_norm": 1.3275369586760475,
      "learning_rate": 4.981143405024936e-05,
      "loss": 0.9664,
      "mean_token_accuracy": 0.7251969814300537,
      "step": 485
    },
    {
      "epoch": 0.09012322972227331,
      "grad_norm": 1.322475452296982,
      "learning_rate": 4.980249040853081e-05,
      "loss": 0.9572,
      "mean_token_accuracy": 0.7284212589263916,
      "step": 490
    },
    {
      "epoch": 0.09104285451535774,
      "grad_norm": 1.2219967426964762,
      "learning_rate": 4.979334048918422e-05,
      "loss": 1.0265,
      "mean_token_accuracy": 0.7094637989997864,
      "step": 495
    },
    {
      "epoch": 0.09196247930844216,
      "grad_norm": 1.2500649142513325,
      "learning_rate": 4.978398437683797e-05,
      "loss": 0.9429,
      "mean_token_accuracy": 0.7309910893440247,
      "step": 500
    },
    {
      "epoch": 0.09288210410152657,
      "grad_norm": 1.2382649121413325,
      "learning_rate": 4.977442215802753e-05,
      "loss": 1.0142,
      "mean_token_accuracy": 0.7163145303726196,
      "step": 505
    },
    {
      "epoch": 0.093801728894611,
      "grad_norm": 1.2494735942714719,
      "learning_rate": 4.976465392119467e-05,
      "loss": 0.9711,
      "mean_token_accuracy": 0.7253948450088501,
      "step": 510
    },
    {
      "epoch": 0.09472135368769542,
      "grad_norm": 1.1320102641208292,
      "learning_rate": 4.9754679756686654e-05,
      "loss": 0.9754,
      "mean_token_accuracy": 0.7240365982055664,
      "step": 515
    },
    {
      "epoch": 0.09564097848077985,
      "grad_norm": 1.2636397583226155,
      "learning_rate": 4.974449975675538e-05,
      "loss": 0.9683,
      "mean_token_accuracy": 0.7268050789833069,
      "step": 520
    },
    {
      "epoch": 0.09656060327386426,
      "grad_norm": 1.2638605012202537,
      "learning_rate": 4.9734114015556506e-05,
      "loss": 0.994,
      "mean_token_accuracy": 0.7192271828651429,
      "step": 525
    },
    {
      "epoch": 0.09748022806694868,
      "grad_norm": 1.3539672940723328,
      "learning_rate": 4.972352262914867e-05,
      "loss": 1.0219,
      "mean_token_accuracy": 0.712011969089508,
      "step": 530
    },
    {
      "epoch": 0.09839985286003311,
      "grad_norm": 1.2622022574950933,
      "learning_rate": 4.971272569549246e-05,
      "loss": 0.9993,
      "mean_token_accuracy": 0.717021644115448,
      "step": 535
    },
    {
      "epoch": 0.09931947765311752,
      "grad_norm": 1.2498621609285703,
      "learning_rate": 4.970172331444968e-05,
      "loss": 0.9869,
      "mean_token_accuracy": 0.7201068043708801,
      "step": 540
    },
    {
      "epoch": 0.10023910244620195,
      "grad_norm": 1.2563183037951813,
      "learning_rate": 4.969051558778226e-05,
      "loss": 1.0328,
      "mean_token_accuracy": 0.7072706580162048,
      "step": 545
    },
    {
      "epoch": 0.10115872723928637,
      "grad_norm": 1.1583096373701225,
      "learning_rate": 4.967910261915142e-05,
      "loss": 1.0073,
      "mean_token_accuracy": 0.7176116108894348,
      "step": 550
    },
    {
      "epoch": 0.1020783520323708,
      "grad_norm": 1.2337310449325847,
      "learning_rate": 4.966748451411668e-05,
      "loss": 1.0075,
      "mean_token_accuracy": 0.7166797518730164,
      "step": 555
    },
    {
      "epoch": 0.10299797682545521,
      "grad_norm": 1.187463601840395,
      "learning_rate": 4.9655661380134874e-05,
      "loss": 0.9978,
      "mean_token_accuracy": 0.7187446594238281,
      "step": 560
    },
    {
      "epoch": 0.10391760161853963,
      "grad_norm": 1.1950175317081544,
      "learning_rate": 4.964363332655918e-05,
      "loss": 1.0127,
      "mean_token_accuracy": 0.7141183018684387,
      "step": 565
    },
    {
      "epoch": 0.10483722641162406,
      "grad_norm": 1.1797983108141703,
      "learning_rate": 4.9631400464638074e-05,
      "loss": 1.0058,
      "mean_token_accuracy": 0.7147095799446106,
      "step": 570
    },
    {
      "epoch": 0.10575685120470849,
      "grad_norm": 1.3194739883489515,
      "learning_rate": 4.961896290751434e-05,
      "loss": 1.0125,
      "mean_token_accuracy": 0.7156966686248779,
      "step": 575
    },
    {
      "epoch": 0.1066764759977929,
      "grad_norm": 1.232197096442626,
      "learning_rate": 4.960632077022402e-05,
      "loss": 1.0096,
      "mean_token_accuracy": 0.7136348843574524,
      "step": 580
    },
    {
      "epoch": 0.10759610079087732,
      "grad_norm": 1.1109964489025674,
      "learning_rate": 4.959347416969529e-05,
      "loss": 0.9782,
      "mean_token_accuracy": 0.7218139052391053,
      "step": 585
    },
    {
      "epoch": 0.10851572558396175,
      "grad_norm": 1.1118328480221105,
      "learning_rate": 4.958042322474747e-05,
      "loss": 0.9138,
      "mean_token_accuracy": 0.7406689524650574,
      "step": 590
    },
    {
      "epoch": 0.10943535037704616,
      "grad_norm": 1.1550688598895895,
      "learning_rate": 4.956716805608984e-05,
      "loss": 1.0123,
      "mean_token_accuracy": 0.7150320529937744,
      "step": 595
    },
    {
      "epoch": 0.11035497517013058,
      "grad_norm": 1.2400379075265455,
      "learning_rate": 4.955370878632058e-05,
      "loss": 0.9642,
      "mean_token_accuracy": 0.7274539470672607,
      "step": 600
    },
    {
      "epoch": 0.11127459996321501,
      "grad_norm": 1.1266451881904362,
      "learning_rate": 4.954004553992564e-05,
      "loss": 0.9597,
      "mean_token_accuracy": 0.7269688129425049,
      "step": 605
    },
    {
      "epoch": 0.11219422475629943,
      "grad_norm": 1.195410688726218,
      "learning_rate": 4.952617844327753e-05,
      "loss": 0.9667,
      "mean_token_accuracy": 0.7273669600486755,
      "step": 610
    },
    {
      "epoch": 0.11311384954938385,
      "grad_norm": 1.2168436664941074,
      "learning_rate": 4.951210762463421e-05,
      "loss": 0.981,
      "mean_token_accuracy": 0.7224032163619996,
      "step": 615
    },
    {
      "epoch": 0.11403347434246827,
      "grad_norm": 1.1158577605300688,
      "learning_rate": 4.949783321413787e-05,
      "loss": 1.0133,
      "mean_token_accuracy": 0.7140767455101014,
      "step": 620
    },
    {
      "epoch": 0.1149530991355527,
      "grad_norm": 1.2227500677211205,
      "learning_rate": 4.948335534381375e-05,
      "loss": 1.0178,
      "mean_token_accuracy": 0.7107774257659912,
      "step": 625
    },
    {
      "epoch": 0.11587272392863712,
      "grad_norm": 1.1733820093333545,
      "learning_rate": 4.9468674147568906e-05,
      "loss": 0.9496,
      "mean_token_accuracy": 0.7264823913574219,
      "step": 630
    },
    {
      "epoch": 0.11679234872172153,
      "grad_norm": 1.1456005644666878,
      "learning_rate": 4.945378976119096e-05,
      "loss": 1.0301,
      "mean_token_accuracy": 0.7111668229103089,
      "step": 635
    },
    {
      "epoch": 0.11771197351480596,
      "grad_norm": 1.176194033859284,
      "learning_rate": 4.943870232234688e-05,
      "loss": 0.9904,
      "mean_token_accuracy": 0.7183448076248169,
      "step": 640
    },
    {
      "epoch": 0.11863159830789038,
      "grad_norm": 1.1767555657667275,
      "learning_rate": 4.9423411970581656e-05,
      "loss": 0.9565,
      "mean_token_accuracy": 0.7282203912734986,
      "step": 645
    },
    {
      "epoch": 0.1195512231009748,
      "grad_norm": 1.1593918150017006,
      "learning_rate": 4.940791884731706e-05,
      "loss": 0.9629,
      "mean_token_accuracy": 0.7265506267547608,
      "step": 650
    },
    {
      "epoch": 0.12047084789405922,
      "grad_norm": 1.1809244906539653,
      "learning_rate": 4.939222309585029e-05,
      "loss": 0.9506,
      "mean_token_accuracy": 0.7299855709075928,
      "step": 655
    },
    {
      "epoch": 0.12139047268714365,
      "grad_norm": 1.187342482868558,
      "learning_rate": 4.93763248613527e-05,
      "loss": 0.9873,
      "mean_token_accuracy": 0.7208028793334961,
      "step": 660
    },
    {
      "epoch": 0.12231009748022807,
      "grad_norm": 1.1643370561641233,
      "learning_rate": 4.936022429086841e-05,
      "loss": 1.019,
      "mean_token_accuracy": 0.7111838817596435,
      "step": 665
    },
    {
      "epoch": 0.12322972227331248,
      "grad_norm": 1.1548281507110767,
      "learning_rate": 4.9343921533312955e-05,
      "loss": 0.949,
      "mean_token_accuracy": 0.7271883249282837,
      "step": 670
    },
    {
      "epoch": 0.12414934706639691,
      "grad_norm": 1.1323282418083014,
      "learning_rate": 4.9327416739471935e-05,
      "loss": 0.9269,
      "mean_token_accuracy": 0.737087082862854,
      "step": 675
    },
    {
      "epoch": 0.12506897185948132,
      "grad_norm": 1.2363897419233494,
      "learning_rate": 4.9310710061999575e-05,
      "loss": 1.0061,
      "mean_token_accuracy": 0.714658522605896,
      "step": 680
    },
    {
      "epoch": 0.12598859665256576,
      "grad_norm": 1.15808211817011,
      "learning_rate": 4.9293801655417366e-05,
      "loss": 0.9426,
      "mean_token_accuracy": 0.7324698209762573,
      "step": 685
    },
    {
      "epoch": 0.12690822144565017,
      "grad_norm": 1.168156282468429,
      "learning_rate": 4.927669167611259e-05,
      "loss": 0.9516,
      "mean_token_accuracy": 0.726858627796173,
      "step": 690
    },
    {
      "epoch": 0.12782784623873458,
      "grad_norm": 1.1708412963628498,
      "learning_rate": 4.92593802823369e-05,
      "loss": 0.9565,
      "mean_token_accuracy": 0.7281310319900512,
      "step": 695
    },
    {
      "epoch": 0.12874747103181902,
      "grad_norm": 1.150205433303024,
      "learning_rate": 4.924186763420486e-05,
      "loss": 0.9966,
      "mean_token_accuracy": 0.7196317195892334,
      "step": 700
    },
    {
      "epoch": 0.12966709582490343,
      "grad_norm": 1.1412449351652514,
      "learning_rate": 4.922415389369243e-05,
      "loss": 0.9393,
      "mean_token_accuracy": 0.7308167576789856,
      "step": 705
    },
    {
      "epoch": 0.13058672061798787,
      "grad_norm": 1.2590368311590696,
      "learning_rate": 4.9206239224635486e-05,
      "loss": 0.9961,
      "mean_token_accuracy": 0.7167337894439697,
      "step": 710
    },
    {
      "epoch": 0.13150634541107228,
      "grad_norm": 1.1862573902159457,
      "learning_rate": 4.9188123792728344e-05,
      "loss": 0.9991,
      "mean_token_accuracy": 0.71655353307724,
      "step": 715
    },
    {
      "epoch": 0.1324259702041567,
      "grad_norm": 1.1728642333915622,
      "learning_rate": 4.916980776552218e-05,
      "loss": 0.9354,
      "mean_token_accuracy": 0.734131133556366,
      "step": 720
    },
    {
      "epoch": 0.13334559499724113,
      "grad_norm": 1.208191683152181,
      "learning_rate": 4.915129131242345e-05,
      "loss": 0.9578,
      "mean_token_accuracy": 0.7278777837753296,
      "step": 725
    },
    {
      "epoch": 0.13426521979032555,
      "grad_norm": 1.138309077411327,
      "learning_rate": 4.913257460469243e-05,
      "loss": 0.9448,
      "mean_token_accuracy": 0.7303597450256347,
      "step": 730
    },
    {
      "epoch": 0.13518484458340996,
      "grad_norm": 1.1410024150973699,
      "learning_rate": 4.911365781544153e-05,
      "loss": 0.9765,
      "mean_token_accuracy": 0.7208934783935547,
      "step": 735
    },
    {
      "epoch": 0.1361044693764944,
      "grad_norm": 1.135207319109893,
      "learning_rate": 4.9094541119633756e-05,
      "loss": 0.9625,
      "mean_token_accuracy": 0.7279266119003296,
      "step": 740
    },
    {
      "epoch": 0.1370240941695788,
      "grad_norm": 1.1470179542343784,
      "learning_rate": 4.907522469408103e-05,
      "loss": 1.0099,
      "mean_token_accuracy": 0.7129136681556701,
      "step": 745
    },
    {
      "epoch": 0.13794371896266322,
      "grad_norm": 1.1186516076443083,
      "learning_rate": 4.905570871744262e-05,
      "loss": 0.9492,
      "mean_token_accuracy": 0.7295220971107483,
      "step": 750
    },
    {
      "epoch": 0.13886334375574766,
      "grad_norm": 1.188235501807293,
      "learning_rate": 4.903599337022345e-05,
      "loss": 0.9158,
      "mean_token_accuracy": 0.7392297148704529,
      "step": 755
    },
    {
      "epoch": 0.13978296854883207,
      "grad_norm": 1.156585568722138,
      "learning_rate": 4.9016078834772436e-05,
      "loss": 1.0069,
      "mean_token_accuracy": 0.7133058428764343,
      "step": 760
    },
    {
      "epoch": 0.1407025933419165,
      "grad_norm": 1.0550430464679208,
      "learning_rate": 4.899596529528083e-05,
      "loss": 0.9804,
      "mean_token_accuracy": 0.7237313628196717,
      "step": 765
    },
    {
      "epoch": 0.14162221813500092,
      "grad_norm": 1.0828080346302627,
      "learning_rate": 4.897565293778045e-05,
      "loss": 0.9398,
      "mean_token_accuracy": 0.7297361016273498,
      "step": 770
    },
    {
      "epoch": 0.14254184292808533,
      "grad_norm": 1.0748821988518662,
      "learning_rate": 4.895514195014201e-05,
      "loss": 0.9512,
      "mean_token_accuracy": 0.727254593372345,
      "step": 775
    },
    {
      "epoch": 0.14346146772116977,
      "grad_norm": 1.1000801031665166,
      "learning_rate": 4.893443252207339e-05,
      "loss": 0.96,
      "mean_token_accuracy": 0.7277865290641785,
      "step": 780
    },
    {
      "epoch": 0.14438109251425418,
      "grad_norm": 1.1979288214254857,
      "learning_rate": 4.891352484511783e-05,
      "loss": 0.9904,
      "mean_token_accuracy": 0.7203876137733459,
      "step": 785
    },
    {
      "epoch": 0.1453007173073386,
      "grad_norm": 1.0336978471065938,
      "learning_rate": 4.889241911265224e-05,
      "loss": 0.9512,
      "mean_token_accuracy": 0.7298694252967834,
      "step": 790
    },
    {
      "epoch": 0.14622034210042303,
      "grad_norm": 1.093196247221492,
      "learning_rate": 4.887111551988531e-05,
      "loss": 1.0404,
      "mean_token_accuracy": 0.7045328140258789,
      "step": 795
    },
    {
      "epoch": 0.14713996689350745,
      "grad_norm": 1.224732532168464,
      "learning_rate": 4.884961426385578e-05,
      "loss": 1.0189,
      "mean_token_accuracy": 0.7101276278495788,
      "step": 800
    },
    {
      "epoch": 0.14805959168659186,
      "grad_norm": 1.1751595598375444,
      "learning_rate": 4.8827915543430604e-05,
      "loss": 0.9166,
      "mean_token_accuracy": 0.7369141817092896,
      "step": 805
    },
    {
      "epoch": 0.1489792164796763,
      "grad_norm": 1.0711984590567727,
      "learning_rate": 4.880601955930308e-05,
      "loss": 0.9528,
      "mean_token_accuracy": 0.7275946021080018,
      "step": 810
    },
    {
      "epoch": 0.1498988412727607,
      "grad_norm": 1.1523849563074238,
      "learning_rate": 4.878392651399103e-05,
      "loss": 0.9724,
      "mean_token_accuracy": 0.72748943567276,
      "step": 815
    },
    {
      "epoch": 0.15081846606584515,
      "grad_norm": 1.1385592224893888,
      "learning_rate": 4.8761636611834906e-05,
      "loss": 0.9423,
      "mean_token_accuracy": 0.7338582873344421,
      "step": 820
    },
    {
      "epoch": 0.15173809085892956,
      "grad_norm": 1.171019568482894,
      "learning_rate": 4.873915005899591e-05,
      "loss": 0.9823,
      "mean_token_accuracy": 0.7215001463890076,
      "step": 825
    },
    {
      "epoch": 0.15265771565201397,
      "grad_norm": 1.1181637038875023,
      "learning_rate": 4.871646706345407e-05,
      "loss": 0.9696,
      "mean_token_accuracy": 0.7244228839874267,
      "step": 830
    },
    {
      "epoch": 0.1535773404450984,
      "grad_norm": 1.140111709793846,
      "learning_rate": 4.869358783500634e-05,
      "loss": 0.9691,
      "mean_token_accuracy": 0.7219241619110107,
      "step": 835
    },
    {
      "epoch": 0.15449696523818282,
      "grad_norm": 1.1035668632214553,
      "learning_rate": 4.867051258526466e-05,
      "loss": 0.9216,
      "mean_token_accuracy": 0.7362164258956909,
      "step": 840
    },
    {
      "epoch": 0.15541659003126723,
      "grad_norm": 1.0632498704772437,
      "learning_rate": 4.864724152765396e-05,
      "loss": 0.9319,
      "mean_token_accuracy": 0.7335481762886047,
      "step": 845
    },
    {
      "epoch": 0.15633621482435167,
      "grad_norm": 1.1360641167900578,
      "learning_rate": 4.8623774877410235e-05,
      "loss": 0.998,
      "mean_token_accuracy": 0.7165634036064148,
      "step": 850
    },
    {
      "epoch": 0.15725583961743608,
      "grad_norm": 1.1574648839544697,
      "learning_rate": 4.860011285157852e-05,
      "loss": 0.9983,
      "mean_token_accuracy": 0.7154228448867798,
      "step": 855
    },
    {
      "epoch": 0.1581754644105205,
      "grad_norm": 1.1103379240939366,
      "learning_rate": 4.857625566901091e-05,
      "loss": 0.9606,
      "mean_token_accuracy": 0.7255040884017945,
      "step": 860
    },
    {
      "epoch": 0.15909508920360493,
      "grad_norm": 1.3478355454379694,
      "learning_rate": 4.85522035503645e-05,
      "loss": 0.9643,
      "mean_token_accuracy": 0.7249020457267761,
      "step": 865
    },
    {
      "epoch": 0.16001471399668935,
      "grad_norm": 1.129020628766503,
      "learning_rate": 4.852795671809941e-05,
      "loss": 0.9341,
      "mean_token_accuracy": 0.7329063415527344,
      "step": 870
    },
    {
      "epoch": 0.16093433878977378,
      "grad_norm": 1.1322677948976352,
      "learning_rate": 4.850351539647661e-05,
      "loss": 0.9977,
      "mean_token_accuracy": 0.7172942876815795,
      "step": 875
    },
    {
      "epoch": 0.1618539635828582,
      "grad_norm": 1.120014190171844,
      "learning_rate": 4.8478879811555986e-05,
      "loss": 0.9283,
      "mean_token_accuracy": 0.7341889500617981,
      "step": 880
    },
    {
      "epoch": 0.1627735883759426,
      "grad_norm": 1.1336097713701254,
      "learning_rate": 4.845405019119414e-05,
      "loss": 1.0008,
      "mean_token_accuracy": 0.7151533484458923,
      "step": 885
    },
    {
      "epoch": 0.16369321316902705,
      "grad_norm": 0.9922793909516228,
      "learning_rate": 4.842902676504235e-05,
      "loss": 0.9039,
      "mean_token_accuracy": 0.7395052313804626,
      "step": 890
    },
    {
      "epoch": 0.16461283796211146,
      "grad_norm": 1.2309806920357915,
      "learning_rate": 4.840380976454441e-05,
      "loss": 0.9143,
      "mean_token_accuracy": 0.7372842311859131,
      "step": 895
    },
    {
      "epoch": 0.16553246275519587,
      "grad_norm": 1.058725560363019,
      "learning_rate": 4.837839942293449e-05,
      "loss": 1.0122,
      "mean_token_accuracy": 0.7113693952560425,
      "step": 900
    },
    {
      "epoch": 0.1664520875482803,
      "grad_norm": 1.1050666066281727,
      "learning_rate": 4.835279597523501e-05,
      "loss": 0.9691,
      "mean_token_accuracy": 0.7241552948951722,
      "step": 905
    },
    {
      "epoch": 0.16737171234136472,
      "grad_norm": 1.1281645078253164,
      "learning_rate": 4.832699965825443e-05,
      "loss": 0.9783,
      "mean_token_accuracy": 0.7210159540176392,
      "step": 910
    },
    {
      "epoch": 0.16829133713444913,
      "grad_norm": 1.1049918709083206,
      "learning_rate": 4.830101071058506e-05,
      "loss": 0.9529,
      "mean_token_accuracy": 0.726420772075653,
      "step": 915
    },
    {
      "epoch": 0.16921096192753357,
      "grad_norm": 1.1589903082257091,
      "learning_rate": 4.82748293726009e-05,
      "loss": 1.0162,
      "mean_token_accuracy": 0.7134600043296814,
      "step": 920
    },
    {
      "epoch": 0.17013058672061798,
      "grad_norm": 1.0648743038360364,
      "learning_rate": 4.824845588645538e-05,
      "loss": 0.931,
      "mean_token_accuracy": 0.7355116486549378,
      "step": 925
    },
    {
      "epoch": 0.17105021151370242,
      "grad_norm": 1.0563630156850699,
      "learning_rate": 4.822189049607909e-05,
      "loss": 0.9303,
      "mean_token_accuracy": 0.7332427501678467,
      "step": 930
    },
    {
      "epoch": 0.17196983630678683,
      "grad_norm": 1.0946637430016075,
      "learning_rate": 4.819513344717759e-05,
      "loss": 0.9805,
      "mean_token_accuracy": 0.7218296766281128,
      "step": 935
    },
    {
      "epoch": 0.17288946109987124,
      "grad_norm": 1.218450386345206,
      "learning_rate": 4.8168184987229104e-05,
      "loss": 1.0025,
      "mean_token_accuracy": 0.7138312220573425,
      "step": 940
    },
    {
      "epoch": 0.17380908589295568,
      "grad_norm": 1.1265660437743932,
      "learning_rate": 4.814104536548222e-05,
      "loss": 0.9901,
      "mean_token_accuracy": 0.7183592796325684,
      "step": 945
    },
    {
      "epoch": 0.1747287106860401,
      "grad_norm": 1.1519197604777511,
      "learning_rate": 4.811371483295361e-05,
      "loss": 0.9677,
      "mean_token_accuracy": 0.723106038570404,
      "step": 950
    },
    {
      "epoch": 0.1756483354791245,
      "grad_norm": 1.0668603888469903,
      "learning_rate": 4.808619364242569e-05,
      "loss": 0.9428,
      "mean_token_accuracy": 0.7298098564147949,
      "step": 955
    },
    {
      "epoch": 0.17656796027220895,
      "grad_norm": 1.0617094358031158,
      "learning_rate": 4.805848204844427e-05,
      "loss": 0.9794,
      "mean_token_accuracy": 0.7198897957801819,
      "step": 960
    },
    {
      "epoch": 0.17748758506529336,
      "grad_norm": 1.1638181916029056,
      "learning_rate": 4.803058030731627e-05,
      "loss": 1.0356,
      "mean_token_accuracy": 0.7055891275405883,
      "step": 965
    },
    {
      "epoch": 0.17840720985837777,
      "grad_norm": 1.0804274338945197,
      "learning_rate": 4.800248867710724e-05,
      "loss": 0.9551,
      "mean_token_accuracy": 0.7267025232315063,
      "step": 970
    },
    {
      "epoch": 0.1793268346514622,
      "grad_norm": 1.1002302515677742,
      "learning_rate": 4.797420741763906e-05,
      "loss": 0.9513,
      "mean_token_accuracy": 0.727520763874054,
      "step": 975
    },
    {
      "epoch": 0.18024645944454662,
      "grad_norm": 1.0807257658531308,
      "learning_rate": 4.794573679048751e-05,
      "loss": 0.9667,
      "mean_token_accuracy": 0.7254797458648682,
      "step": 980
    },
    {
      "epoch": 0.18116608423763106,
      "grad_norm": 1.1423934429361384,
      "learning_rate": 4.791707705897982e-05,
      "loss": 0.9289,
      "mean_token_accuracy": 0.7316087126731873,
      "step": 985
    },
    {
      "epoch": 0.18208570903071547,
      "grad_norm": 1.0732201976252709,
      "learning_rate": 4.7888228488192294e-05,
      "loss": 0.9826,
      "mean_token_accuracy": 0.7205982804298401,
      "step": 990
    },
    {
      "epoch": 0.18300533382379988,
      "grad_norm": 1.0026696776201605,
      "learning_rate": 4.7859191344947804e-05,
      "loss": 0.9289,
      "mean_token_accuracy": 0.7336562752723694,
      "step": 995
    },
    {
      "epoch": 0.18392495861688432,
      "grad_norm": 1.138379913644609,
      "learning_rate": 4.782996589781337e-05,
      "loss": 0.9497,
      "mean_token_accuracy": 0.729135024547577,
      "step": 1000
    },
    {
      "epoch": 0.18484458340996873,
      "grad_norm": 1.107580666472087,
      "learning_rate": 4.780055241709762e-05,
      "loss": 0.9048,
      "mean_token_accuracy": 0.7381602048873901,
      "step": 1005
    },
    {
      "epoch": 0.18576420820305314,
      "grad_norm": 1.0667620674465943,
      "learning_rate": 4.7770951174848335e-05,
      "loss": 0.9742,
      "mean_token_accuracy": 0.7205707669258118,
      "step": 1010
    },
    {
      "epoch": 0.18668383299613758,
      "grad_norm": 1.0940019385189808,
      "learning_rate": 4.774116244484993e-05,
      "loss": 0.9857,
      "mean_token_accuracy": 0.718968415260315,
      "step": 1015
    },
    {
      "epoch": 0.187603457789222,
      "grad_norm": 1.0279044112611866,
      "learning_rate": 4.7711186502620894e-05,
      "loss": 1.0084,
      "mean_token_accuracy": 0.7144084692001342,
      "step": 1020
    },
    {
      "epoch": 0.1885230825823064,
      "grad_norm": 1.0751882464256728,
      "learning_rate": 4.768102362541126e-05,
      "loss": 0.9353,
      "mean_token_accuracy": 0.7318849921226501,
      "step": 1025
    },
    {
      "epoch": 0.18944270737539085,
      "grad_norm": 1.1701748750390102,
      "learning_rate": 4.765067409220004e-05,
      "loss": 0.957,
      "mean_token_accuracy": 0.7275319814682006,
      "step": 1030
    },
    {
      "epoch": 0.19036233216847526,
      "grad_norm": 1.0512353267451773,
      "learning_rate": 4.762013818369266e-05,
      "loss": 0.9367,
      "mean_token_accuracy": 0.7317106485366821,
      "step": 1035
    },
    {
      "epoch": 0.1912819569615597,
      "grad_norm": 1.1085851412035923,
      "learning_rate": 4.7589416182318305e-05,
      "loss": 0.9416,
      "mean_token_accuracy": 0.7324359536170959,
      "step": 1040
    },
    {
      "epoch": 0.1922015817546441,
      "grad_norm": 1.094731274119514,
      "learning_rate": 4.755850837222739e-05,
      "loss": 0.9474,
      "mean_token_accuracy": 0.7309187650680542,
      "step": 1045
    },
    {
      "epoch": 0.19312120654772852,
      "grad_norm": 1.0610610405848808,
      "learning_rate": 4.7527415039288874e-05,
      "loss": 0.9638,
      "mean_token_accuracy": 0.7251871824264526,
      "step": 1050
    },
    {
      "epoch": 0.19404083134081296,
      "grad_norm": 1.0919916417692772,
      "learning_rate": 4.749613647108764e-05,
      "loss": 1.0008,
      "mean_token_accuracy": 0.7152180433273315,
      "step": 1055
    },
    {
      "epoch": 0.19496045613389737,
      "grad_norm": 1.0847298297852,
      "learning_rate": 4.7464672956921814e-05,
      "loss": 0.9366,
      "mean_token_accuracy": 0.7313546657562255,
      "step": 1060
    },
    {
      "epoch": 0.19588008092698178,
      "grad_norm": 1.0912787695821449,
      "learning_rate": 4.743302478780011e-05,
      "loss": 0.945,
      "mean_token_accuracy": 0.728658664226532,
      "step": 1065
    },
    {
      "epoch": 0.19679970572006622,
      "grad_norm": 1.052195400658314,
      "learning_rate": 4.7401192256439144e-05,
      "loss": 0.9793,
      "mean_token_accuracy": 0.7213846921920777,
      "step": 1070
    },
    {
      "epoch": 0.19771933051315063,
      "grad_norm": 1.1107870405998106,
      "learning_rate": 4.736917565726069e-05,
      "loss": 0.9313,
      "mean_token_accuracy": 0.735443937778473,
      "step": 1075
    },
    {
      "epoch": 0.19863895530623504,
      "grad_norm": 1.1399365300090571,
      "learning_rate": 4.7336975286389e-05,
      "loss": 0.9717,
      "mean_token_accuracy": 0.7237229943275452,
      "step": 1080
    },
    {
      "epoch": 0.19955858009931948,
      "grad_norm": 1.0983682734144682,
      "learning_rate": 4.730459144164802e-05,
      "loss": 0.9306,
      "mean_token_accuracy": 0.733622133731842,
      "step": 1085
    },
    {
      "epoch": 0.2004782048924039,
      "grad_norm": 1.1053704101564246,
      "learning_rate": 4.727202442255871e-05,
      "loss": 0.9936,
      "mean_token_accuracy": 0.718384611606598,
      "step": 1090
    },
    {
      "epoch": 0.20139782968548833,
      "grad_norm": 1.0858488860538602,
      "learning_rate": 4.723927453033619e-05,
      "loss": 0.9548,
      "mean_token_accuracy": 0.7286873102188111,
      "step": 1095
    },
    {
      "epoch": 0.20231745447857274,
      "grad_norm": 1.0232898856111519,
      "learning_rate": 4.720634206788697e-05,
      "loss": 0.9804,
      "mean_token_accuracy": 0.7218252301216126,
      "step": 1100
    },
    {
      "epoch": 0.20323707927165716,
      "grad_norm": 1.1548447631409977,
      "learning_rate": 4.717322733980622e-05,
      "loss": 0.931,
      "mean_token_accuracy": 0.7311301946640014,
      "step": 1105
    },
    {
      "epoch": 0.2041567040647416,
      "grad_norm": 1.1168183831474872,
      "learning_rate": 4.713993065237486e-05,
      "loss": 0.9718,
      "mean_token_accuracy": 0.7235833764076233,
      "step": 1110
    },
    {
      "epoch": 0.205076328857826,
      "grad_norm": 1.1111836320920656,
      "learning_rate": 4.710645231355678e-05,
      "loss": 0.9855,
      "mean_token_accuracy": 0.7195135593414307,
      "step": 1115
    },
    {
      "epoch": 0.20599595365091042,
      "grad_norm": 1.0024638729648838,
      "learning_rate": 4.707279263299598e-05,
      "loss": 0.9729,
      "mean_token_accuracy": 0.7219846963882446,
      "step": 1120
    },
    {
      "epoch": 0.20691557844399486,
      "grad_norm": 1.0121762272601764,
      "learning_rate": 4.703895192201372e-05,
      "loss": 0.9459,
      "mean_token_accuracy": 0.7269375443458557,
      "step": 1125
    },
    {
      "epoch": 0.20783520323707927,
      "grad_norm": 1.0470465876428376,
      "learning_rate": 4.7004930493605573e-05,
      "loss": 1.0105,
      "mean_token_accuracy": 0.7086774349212647,
      "step": 1130
    },
    {
      "epoch": 0.20875482803016368,
      "grad_norm": 1.0632837126367782,
      "learning_rate": 4.697072866243866e-05,
      "loss": 0.9412,
      "mean_token_accuracy": 0.7307331085205078,
      "step": 1135
    },
    {
      "epoch": 0.20967445282324812,
      "grad_norm": 1.0768863946202714,
      "learning_rate": 4.69363467448486e-05,
      "loss": 0.9674,
      "mean_token_accuracy": 0.7221316814422607,
      "step": 1140
    },
    {
      "epoch": 0.21059407761633253,
      "grad_norm": 1.1181930167961487,
      "learning_rate": 4.6901785058836675e-05,
      "loss": 0.955,
      "mean_token_accuracy": 0.725222361087799,
      "step": 1145
    },
    {
      "epoch": 0.21151370240941697,
      "grad_norm": 1.0688002319746086,
      "learning_rate": 4.686704392406685e-05,
      "loss": 0.9687,
      "mean_token_accuracy": 0.7218108892440795,
      "step": 1150
    },
    {
      "epoch": 0.21243332720250138,
      "grad_norm": 1.1052965038670703,
      "learning_rate": 4.6832123661862835e-05,
      "loss": 0.9516,
      "mean_token_accuracy": 0.7287932515144349,
      "step": 1155
    },
    {
      "epoch": 0.2133529519955858,
      "grad_norm": 1.0349887525202925,
      "learning_rate": 4.6797024595205104e-05,
      "loss": 0.9599,
      "mean_token_accuracy": 0.7228366494178772,
      "step": 1160
    },
    {
      "epoch": 0.21427257678867023,
      "grad_norm": 1.052123043795087,
      "learning_rate": 4.6761747048727907e-05,
      "loss": 0.9833,
      "mean_token_accuracy": 0.714729118347168,
      "step": 1165
    },
    {
      "epoch": 0.21519220158175464,
      "grad_norm": 1.0646750046566955,
      "learning_rate": 4.672629134871625e-05,
      "loss": 0.98,
      "mean_token_accuracy": 0.7194055676460266,
      "step": 1170
    },
    {
      "epoch": 0.21611182637483906,
      "grad_norm": 1.072675922430035,
      "learning_rate": 4.669065782310294e-05,
      "loss": 0.9661,
      "mean_token_accuracy": 0.7228956103324891,
      "step": 1175
    },
    {
      "epoch": 0.2170314511679235,
      "grad_norm": 1.0475965649186345,
      "learning_rate": 4.665484680146546e-05,
      "loss": 0.9168,
      "mean_token_accuracy": 0.7354954957962037,
      "step": 1180
    },
    {
      "epoch": 0.2179510759610079,
      "grad_norm": 1.0183550500547607,
      "learning_rate": 4.6618858615023e-05,
      "loss": 0.9268,
      "mean_token_accuracy": 0.731166672706604,
      "step": 1185
    },
    {
      "epoch": 0.21887070075409232,
      "grad_norm": 1.0894438583208028,
      "learning_rate": 4.658269359663336e-05,
      "loss": 0.9134,
      "mean_token_accuracy": 0.7400953650474549,
      "step": 1190
    },
    {
      "epoch": 0.21979032554717676,
      "grad_norm": 0.9962620966267176,
      "learning_rate": 4.6546352080789854e-05,
      "loss": 0.9472,
      "mean_token_accuracy": 0.7283522963523865,
      "step": 1195
    },
    {
      "epoch": 0.22070995034026117,
      "grad_norm": 1.0767144498287804,
      "learning_rate": 4.650983440361825e-05,
      "loss": 0.9798,
      "mean_token_accuracy": 0.7208079814910888,
      "step": 1200
    },
    {
      "epoch": 0.2216295751333456,
      "grad_norm": 1.0451151540293229,
      "learning_rate": 4.6473140902873666e-05,
      "loss": 0.9735,
      "mean_token_accuracy": 0.7223762154579163,
      "step": 1205
    },
    {
      "epoch": 0.22254919992643002,
      "grad_norm": 0.9904423090265289,
      "learning_rate": 4.643627191793737e-05,
      "loss": 0.9416,
      "mean_token_accuracy": 0.7333443641662598,
      "step": 1210
    },
    {
      "epoch": 0.22346882471951443,
      "grad_norm": 1.0324822073086444,
      "learning_rate": 4.639922778981377e-05,
      "loss": 0.9096,
      "mean_token_accuracy": 0.7366245865821839,
      "step": 1215
    },
    {
      "epoch": 0.22438844951259887,
      "grad_norm": 1.00961392870682,
      "learning_rate": 4.636200886112714e-05,
      "loss": 0.9647,
      "mean_token_accuracy": 0.7272518515586853,
      "step": 1220
    },
    {
      "epoch": 0.22530807430568328,
      "grad_norm": 1.041598639678359,
      "learning_rate": 4.63246154761185e-05,
      "loss": 0.982,
      "mean_token_accuracy": 0.7185810923576355,
      "step": 1225
    },
    {
      "epoch": 0.2262276990987677,
      "grad_norm": 1.0574278162856792,
      "learning_rate": 4.628704798064247e-05,
      "loss": 0.9442,
      "mean_token_accuracy": 0.7297179222106933,
      "step": 1230
    },
    {
      "epoch": 0.22714732389185213,
      "grad_norm": 1.060076765820854,
      "learning_rate": 4.624930672216399e-05,
      "loss": 0.9614,
      "mean_token_accuracy": 0.7244118571281433,
      "step": 1235
    },
    {
      "epoch": 0.22806694868493654,
      "grad_norm": 1.0123003105589568,
      "learning_rate": 4.621139204975516e-05,
      "loss": 0.9169,
      "mean_token_accuracy": 0.7362489700317383,
      "step": 1240
    },
    {
      "epoch": 0.22898657347802095,
      "grad_norm": 1.1490153575204947,
      "learning_rate": 4.617330431409201e-05,
      "loss": 0.9929,
      "mean_token_accuracy": 0.7166203141212464,
      "step": 1245
    },
    {
      "epoch": 0.2299061982711054,
      "grad_norm": 1.0270625785191527,
      "learning_rate": 4.6135043867451255e-05,
      "loss": 0.9325,
      "mean_token_accuracy": 0.7311270833015442,
      "step": 1250
    },
    {
      "epoch": 0.2308258230641898,
      "grad_norm": 1.030694744170465,
      "learning_rate": 4.609661106370701e-05,
      "loss": 0.9228,
      "mean_token_accuracy": 0.7355565190315246,
      "step": 1255
    },
    {
      "epoch": 0.23174544785727424,
      "grad_norm": 1.0190672056189127,
      "learning_rate": 4.605800625832753e-05,
      "loss": 0.9577,
      "mean_token_accuracy": 0.7273682594299317,
      "step": 1260
    },
    {
      "epoch": 0.23266507265035866,
      "grad_norm": 1.025832787786935,
      "learning_rate": 4.6019229808371945e-05,
      "loss": 0.9291,
      "mean_token_accuracy": 0.7325186491012573,
      "step": 1265
    },
    {
      "epoch": 0.23358469744344307,
      "grad_norm": 1.0254402284447273,
      "learning_rate": 4.598028207248693e-05,
      "loss": 0.9681,
      "mean_token_accuracy": 0.7215327501296998,
      "step": 1270
    },
    {
      "epoch": 0.2345043222365275,
      "grad_norm": 1.043519079594266,
      "learning_rate": 4.5941163410903406e-05,
      "loss": 0.9565,
      "mean_token_accuracy": 0.7248036026954651,
      "step": 1275
    },
    {
      "epoch": 0.23542394702961192,
      "grad_norm": 0.9811685630848649,
      "learning_rate": 4.590187418543321e-05,
      "loss": 0.9204,
      "mean_token_accuracy": 0.7338666915893555,
      "step": 1280
    },
    {
      "epoch": 0.23634357182269633,
      "grad_norm": 1.0355767679745649,
      "learning_rate": 4.586241475946571e-05,
      "loss": 0.9824,
      "mean_token_accuracy": 0.7212961316108704,
      "step": 1285
    },
    {
      "epoch": 0.23726319661578077,
      "grad_norm": 0.9995187864598916,
      "learning_rate": 4.582278549796448e-05,
      "loss": 0.914,
      "mean_token_accuracy": 0.7355898737907409,
      "step": 1290
    },
    {
      "epoch": 0.23818282140886518,
      "grad_norm": 1.0163621938165361,
      "learning_rate": 4.5782986767463946e-05,
      "loss": 0.9614,
      "mean_token_accuracy": 0.7241615772247314,
      "step": 1295
    },
    {
      "epoch": 0.2391024462019496,
      "grad_norm": 1.0913821743861445,
      "learning_rate": 4.574301893606594e-05,
      "loss": 0.8839,
      "mean_token_accuracy": 0.7434832811355591,
      "step": 1300
    },
    {
      "epoch": 0.24002207099503403,
      "grad_norm": 1.0399223484753735,
      "learning_rate": 4.570288237343632e-05,
      "loss": 0.9104,
      "mean_token_accuracy": 0.7378169417381286,
      "step": 1305
    },
    {
      "epoch": 0.24094169578811844,
      "grad_norm": 1.011671028641558,
      "learning_rate": 4.5662577450801576e-05,
      "loss": 0.9595,
      "mean_token_accuracy": 0.7230379819869995,
      "step": 1310
    },
    {
      "epoch": 0.24186132058120288,
      "grad_norm": 1.008990928095214,
      "learning_rate": 4.562210454094535e-05,
      "loss": 0.9363,
      "mean_token_accuracy": 0.7295035600662232,
      "step": 1315
    },
    {
      "epoch": 0.2427809453742873,
      "grad_norm": 1.059357744292348,
      "learning_rate": 4.558146401820502e-05,
      "loss": 0.9569,
      "mean_token_accuracy": 0.7264422059059144,
      "step": 1320
    },
    {
      "epoch": 0.2437005701673717,
      "grad_norm": 1.0224904321964083,
      "learning_rate": 4.554065625846825e-05,
      "loss": 0.9838,
      "mean_token_accuracy": 0.7178040146827698,
      "step": 1325
    },
    {
      "epoch": 0.24462019496045614,
      "grad_norm": 1.0737296876090594,
| "learning_rate": 4.549968163916946e-05, | |
| "loss": 0.976, | |
| "mean_token_accuracy": 0.7180652141571044, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.24553981975354056, | |
| "grad_norm": 1.0129242243093401, | |
| "learning_rate": 4.545854053928639e-05, | |
| "loss": 0.9394, | |
| "mean_token_accuracy": 0.7314478039741517, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.24645944454662497, | |
| "grad_norm": 0.9860304727584566, | |
| "learning_rate": 4.541723333933657e-05, | |
| "loss": 0.9595, | |
| "mean_token_accuracy": 0.7271197676658631, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.2473790693397094, | |
| "grad_norm": 1.0235437508308431, | |
| "learning_rate": 4.5375760421373796e-05, | |
| "loss": 0.9888, | |
| "mean_token_accuracy": 0.7178149104118348, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.24829869413279382, | |
| "grad_norm": 1.076473129213084, | |
| "learning_rate": 4.533412216898461e-05, | |
| "loss": 0.9374, | |
| "mean_token_accuracy": 0.7287054538726807, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.24921831892587823, | |
| "grad_norm": 1.027000741915809, | |
| "learning_rate": 4.529231896728474e-05, | |
| "loss": 0.9098, | |
| "mean_token_accuracy": 0.7352772355079651, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.25013794371896264, | |
| "grad_norm": 1.0980991489181584, | |
| "learning_rate": 4.525035120291557e-05, | |
| "loss": 0.9613, | |
| "mean_token_accuracy": 0.7250553727149963, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.2510575685120471, | |
| "grad_norm": 1.0105378261394609, | |
| "learning_rate": 4.520821926404049e-05, | |
| "loss": 0.9232, | |
| "mean_token_accuracy": 0.7339854836463928, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.2519771933051315, | |
| "grad_norm": 1.0465671126237865, | |
| "learning_rate": 4.516592354034138e-05, | |
| "loss": 0.9578, | |
| "mean_token_accuracy": 0.7243474960327149, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.2528968180982159, | |
| "grad_norm": 1.0721948067984564, | |
| "learning_rate": 4.512346442301501e-05, | |
| "loss": 0.9305, | |
| "mean_token_accuracy": 0.7290533304214477, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.25381644289130034, | |
| "grad_norm": 1.083352961545848, | |
| "learning_rate": 4.5080842304769345e-05, | |
| "loss": 0.9338, | |
| "mean_token_accuracy": 0.733627998828888, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.2547360676843848, | |
| "grad_norm": 0.979913773136715, | |
| "learning_rate": 4.503805757981997e-05, | |
| "loss": 0.9012, | |
| "mean_token_accuracy": 0.7409675002098084, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.25565569247746917, | |
| "grad_norm": 1.1174510417210128, | |
| "learning_rate": 4.499511064388645e-05, | |
| "loss": 0.8754, | |
| "mean_token_accuracy": 0.7447872519493103, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.2565753172705536, | |
| "grad_norm": 1.0562227070300527, | |
| "learning_rate": 4.495200189418864e-05, | |
| "loss": 0.9505, | |
| "mean_token_accuracy": 0.7265227913856507, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.25749494206363804, | |
| "grad_norm": 1.0550543313489833, | |
| "learning_rate": 4.490873172944303e-05, | |
| "loss": 0.9096, | |
| "mean_token_accuracy": 0.7342225193977356, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.2584145668567225, | |
| "grad_norm": 1.0844914008772555, | |
| "learning_rate": 4.486530054985905e-05, | |
| "loss": 0.9643, | |
| "mean_token_accuracy": 0.7227702975273133, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.25933419164980687, | |
| "grad_norm": 1.11030675175993, | |
| "learning_rate": 4.482170875713536e-05, | |
| "loss": 0.98, | |
| "mean_token_accuracy": 0.7210663437843323, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.2602538164428913, | |
| "grad_norm": 1.0678730599548856, | |
| "learning_rate": 4.477795675445616e-05, | |
| "loss": 0.9248, | |
| "mean_token_accuracy": 0.7327564835548401, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.26117344123597575, | |
| "grad_norm": 0.9866628204231362, | |
| "learning_rate": 4.473404494648744e-05, | |
| "loss": 0.9216, | |
| "mean_token_accuracy": 0.7343960881233216, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.26209306602906013, | |
| "grad_norm": 0.9895263110250994, | |
| "learning_rate": 4.4689973739373244e-05, | |
| "loss": 0.9123, | |
| "mean_token_accuracy": 0.7354090452194214, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.26301269082214457, | |
| "grad_norm": 0.9560958289104061, | |
| "learning_rate": 4.46457435407319e-05, | |
| "loss": 0.9494, | |
| "mean_token_accuracy": 0.725600802898407, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.263932315615229, | |
| "grad_norm": 1.0418751893863187, | |
| "learning_rate": 4.460135475965227e-05, | |
| "loss": 0.887, | |
| "mean_token_accuracy": 0.744392192363739, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.2648519404083134, | |
| "grad_norm": 1.0270767884123133, | |
| "learning_rate": 4.455680780668997e-05, | |
| "loss": 0.98, | |
| "mean_token_accuracy": 0.717594051361084, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.26577156520139783, | |
| "grad_norm": 1.0194372684867639, | |
| "learning_rate": 4.4512103093863555e-05, | |
| "loss": 0.9145, | |
| "mean_token_accuracy": 0.7369788885116577, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.26669118999448227, | |
| "grad_norm": 1.0981284825838393, | |
| "learning_rate": 4.44672410346507e-05, | |
| "loss": 0.9519, | |
| "mean_token_accuracy": 0.7260895729064941, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.26761081478756665, | |
| "grad_norm": 1.0207625075556366, | |
| "learning_rate": 4.442222204398441e-05, | |
| "loss": 0.9555, | |
| "mean_token_accuracy": 0.7227967500686645, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.2685304395806511, | |
| "grad_norm": 0.98393868791661, | |
| "learning_rate": 4.437704653824915e-05, | |
| "loss": 0.8831, | |
| "mean_token_accuracy": 0.7438354253768921, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.26945006437373553, | |
| "grad_norm": 0.9817630950075087, | |
| "learning_rate": 4.433171493527701e-05, | |
| "loss": 0.9404, | |
| "mean_token_accuracy": 0.728731095790863, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.2703696891668199, | |
| "grad_norm": 1.0298652072064594, | |
| "learning_rate": 4.428622765434383e-05, | |
| "loss": 0.9136, | |
| "mean_token_accuracy": 0.7356218695640564, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.27128931395990435, | |
| "grad_norm": 0.981553092264934, | |
| "learning_rate": 4.4240585116165334e-05, | |
| "loss": 0.8555, | |
| "mean_token_accuracy": 0.753374171257019, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.2722089387529888, | |
| "grad_norm": 1.172918257192198, | |
| "learning_rate": 4.419478774289325e-05, | |
| "loss": 0.998, | |
| "mean_token_accuracy": 0.713919198513031, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.2731285635460732, | |
| "grad_norm": 1.003409782978005, | |
| "learning_rate": 4.414883595811136e-05, | |
| "loss": 0.8782, | |
| "mean_token_accuracy": 0.7452871680259705, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.2740481883391576, | |
| "grad_norm": 1.0316918646250515, | |
| "learning_rate": 4.410273018683163e-05, | |
| "loss": 0.9242, | |
| "mean_token_accuracy": 0.7311699628829956, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.27496781313224206, | |
| "grad_norm": 0.978003437149563, | |
| "learning_rate": 4.405647085549025e-05, | |
| "loss": 0.9241, | |
| "mean_token_accuracy": 0.7328976273536683, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.27588743792532644, | |
| "grad_norm": 1.0070406181231344, | |
| "learning_rate": 4.40100583919437e-05, | |
| "loss": 0.9001, | |
| "mean_token_accuracy": 0.7395057559013367, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.2768070627184109, | |
| "grad_norm": 0.9873878935159346, | |
| "learning_rate": 4.3963493225464817e-05, | |
| "loss": 0.9258, | |
| "mean_token_accuracy": 0.7336387634277344, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.2777266875114953, | |
| "grad_norm": 0.9521695030248521, | |
| "learning_rate": 4.3916775786738754e-05, | |
| "loss": 0.914, | |
| "mean_token_accuracy": 0.7378314137458801, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.27864631230457976, | |
| "grad_norm": 0.9502896850196428, | |
| "learning_rate": 4.3869906507859096e-05, | |
| "loss": 0.8987, | |
| "mean_token_accuracy": 0.7417943596839904, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.27956593709766414, | |
| "grad_norm": 0.991426828614557, | |
| "learning_rate": 4.382288582232376e-05, | |
| "loss": 0.9106, | |
| "mean_token_accuracy": 0.7390964746475219, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.2804855618907486, | |
| "grad_norm": 1.0581857743606324, | |
| "learning_rate": 4.377571416503108e-05, | |
| "loss": 0.9179, | |
| "mean_token_accuracy": 0.7379998922348022, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.281405186683833, | |
| "grad_norm": 0.9872377385823925, | |
| "learning_rate": 4.372839197227571e-05, | |
| "loss": 0.8848, | |
| "mean_token_accuracy": 0.7446985721588135, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.2823248114769174, | |
| "grad_norm": 1.0976151495403408, | |
| "learning_rate": 4.368091968174463e-05, | |
| "loss": 0.9632, | |
| "mean_token_accuracy": 0.723613953590393, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.28324443627000184, | |
| "grad_norm": 1.013680671037777, | |
| "learning_rate": 4.363329773251309e-05, | |
| "loss": 0.866, | |
| "mean_token_accuracy": 0.750942587852478, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.2841640610630863, | |
| "grad_norm": 1.1182733077200029, | |
| "learning_rate": 4.3585526565040543e-05, | |
| "loss": 0.9995, | |
| "mean_token_accuracy": 0.7137303233146668, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.28508368585617067, | |
| "grad_norm": 0.9779737007515391, | |
| "learning_rate": 4.353760662116658e-05, | |
| "loss": 0.9369, | |
| "mean_token_accuracy": 0.7336580872535705, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.2860033106492551, | |
| "grad_norm": 1.0260468281394197, | |
| "learning_rate": 4.348953834410683e-05, | |
| "loss": 0.9678, | |
| "mean_token_accuracy": 0.7206373929977417, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.28692293544233954, | |
| "grad_norm": 1.0263096637333005, | |
| "learning_rate": 4.3441322178448856e-05, | |
| "loss": 0.9572, | |
| "mean_token_accuracy": 0.7260561943054199, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.2878425602354239, | |
| "grad_norm": 0.9619383230028783, | |
| "learning_rate": 4.339295857014809e-05, | |
| "loss": 0.9501, | |
| "mean_token_accuracy": 0.7264659523963928, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.28876218502850837, | |
| "grad_norm": 0.9946060524217067, | |
| "learning_rate": 4.3344447966523634e-05, | |
| "loss": 0.9887, | |
| "mean_token_accuracy": 0.7160560727119446, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.2896818098215928, | |
| "grad_norm": 1.0275376139203307, | |
| "learning_rate": 4.3295790816254195e-05, | |
| "loss": 0.9262, | |
| "mean_token_accuracy": 0.734666109085083, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.2906014346146772, | |
| "grad_norm": 1.1276042923218728, | |
| "learning_rate": 4.324698756937388e-05, | |
| "loss": 0.9378, | |
| "mean_token_accuracy": 0.7300173878669739, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.29152105940776163, | |
| "grad_norm": 0.9552400868458645, | |
| "learning_rate": 4.319803867726807e-05, | |
| "loss": 0.8879, | |
| "mean_token_accuracy": 0.7425481796264648, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.29244068420084607, | |
| "grad_norm": 0.9486514468425481, | |
| "learning_rate": 4.3148944592669234e-05, | |
| "loss": 0.9613, | |
| "mean_token_accuracy": 0.7219538450241089, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.29336030899393045, | |
| "grad_norm": 0.9567962674802902, | |
| "learning_rate": 4.30997057696527e-05, | |
| "loss": 0.8741, | |
| "mean_token_accuracy": 0.7477473855018616, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.2942799337870149, | |
| "grad_norm": 0.9667609260469084, | |
| "learning_rate": 4.3050322663632564e-05, | |
| "loss": 0.9568, | |
| "mean_token_accuracy": 0.7255883097648621, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.29519955858009933, | |
| "grad_norm": 0.9920073647296315, | |
| "learning_rate": 4.3000795731357333e-05, | |
| "loss": 0.9237, | |
| "mean_token_accuracy": 0.7383288621902466, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.2961191833731837, | |
| "grad_norm": 1.0604465170326072, | |
| "learning_rate": 4.295112543090584e-05, | |
| "loss": 0.9609, | |
| "mean_token_accuracy": 0.7225096940994262, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.29703880816626815, | |
| "grad_norm": 1.0688037490276023, | |
| "learning_rate": 4.290131222168289e-05, | |
| "loss": 1.0008, | |
| "mean_token_accuracy": 0.7138909697532654, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.2979584329593526, | |
| "grad_norm": 1.143629206489082, | |
| "learning_rate": 4.2851356564415086e-05, | |
| "loss": 0.9867, | |
| "mean_token_accuracy": 0.7165561437606811, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.29887805775243703, | |
| "grad_norm": 1.0438745750713756, | |
| "learning_rate": 4.280125892114656e-05, | |
| "loss": 0.9434, | |
| "mean_token_accuracy": 0.7298865675926208, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.2997976825455214, | |
| "grad_norm": 1.0251559106803514, | |
| "learning_rate": 4.2751019755234664e-05, | |
| "loss": 0.935, | |
| "mean_token_accuracy": 0.7299148678779602, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.30071730733860585, | |
| "grad_norm": 0.9900961445552091, | |
| "learning_rate": 4.27006395313457e-05, | |
| "loss": 0.9963, | |
| "mean_token_accuracy": 0.7131295561790466, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.3016369321316903, | |
| "grad_norm": 1.040210108998438, | |
| "learning_rate": 4.265011871545066e-05, | |
| "loss": 0.9412, | |
| "mean_token_accuracy": 0.7279941439628601, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.3025565569247747, | |
| "grad_norm": 1.0262950854145634, | |
| "learning_rate": 4.259945777482085e-05, | |
| "loss": 0.9239, | |
| "mean_token_accuracy": 0.7327239632606506, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.3034761817178591, | |
| "grad_norm": 0.9969469234100081, | |
| "learning_rate": 4.25486571780236e-05, | |
| "loss": 0.9462, | |
| "mean_token_accuracy": 0.7269651889801025, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.30439580651094356, | |
| "grad_norm": 1.0021703198417462, | |
| "learning_rate": 4.249771739491795e-05, | |
| "loss": 0.9003, | |
| "mean_token_accuracy": 0.7421126961708069, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.30531543130402794, | |
| "grad_norm": 1.0255704189414308, | |
| "learning_rate": 4.24466388966503e-05, | |
| "loss": 0.9249, | |
| "mean_token_accuracy": 0.7345858454704285, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.3062350560971124, | |
| "grad_norm": 0.9438771845720968, | |
| "learning_rate": 4.239542215565e-05, | |
| "loss": 0.9749, | |
| "mean_token_accuracy": 0.7182752847671509, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.3071546808901968, | |
| "grad_norm": 0.9878451650581643, | |
| "learning_rate": 4.2344067645625036e-05, | |
| "loss": 0.9455, | |
| "mean_token_accuracy": 0.7264060854911805, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.3080743056832812, | |
| "grad_norm": 1.1287364443586523, | |
| "learning_rate": 4.229257584155765e-05, | |
| "loss": 0.9218, | |
| "mean_token_accuracy": 0.7332573175430298, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.30899393047636564, | |
| "grad_norm": 0.971666072350275, | |
| "learning_rate": 4.2240947219699895e-05, | |
| "loss": 0.8756, | |
| "mean_token_accuracy": 0.7459922909736634, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.3099135552694501, | |
| "grad_norm": 0.9593974583897734, | |
| "learning_rate": 4.2189182257569285e-05, | |
| "loss": 0.9329, | |
| "mean_token_accuracy": 0.730040967464447, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.31083318006253446, | |
| "grad_norm": 0.943158273064518, | |
| "learning_rate": 4.213728143394436e-05, | |
| "loss": 0.8839, | |
| "mean_token_accuracy": 0.7458212971687317, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.3117528048556189, | |
| "grad_norm": 1.050902490407755, | |
| "learning_rate": 4.208524522886022e-05, | |
| "loss": 0.9443, | |
| "mean_token_accuracy": 0.7311147809028625, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.31267242964870334, | |
| "grad_norm": 1.0074348860409519, | |
| "learning_rate": 4.203307412360418e-05, | |
| "loss": 0.9201, | |
| "mean_token_accuracy": 0.7326057314872741, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.3135920544417877, | |
| "grad_norm": 1.0039288385867127, | |
| "learning_rate": 4.1980768600711194e-05, | |
| "loss": 0.9169, | |
| "mean_token_accuracy": 0.736884355545044, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.31451167923487217, | |
| "grad_norm": 0.9456279018137994, | |
| "learning_rate": 4.1928329143959506e-05, | |
| "loss": 0.9198, | |
| "mean_token_accuracy": 0.7341038465499878, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.3154313040279566, | |
| "grad_norm": 0.969219875361889, | |
| "learning_rate": 4.18757562383661e-05, | |
| "loss": 0.9586, | |
| "mean_token_accuracy": 0.7229322910308837, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.316350928821041, | |
| "grad_norm": 0.9823553221239351, | |
| "learning_rate": 4.182305037018224e-05, | |
| "loss": 0.8674, | |
| "mean_token_accuracy": 0.7455045938491821, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.31727055361412543, | |
| "grad_norm": 0.9614849491835867, | |
| "learning_rate": 4.1770212026888974e-05, | |
| "loss": 0.8978, | |
| "mean_token_accuracy": 0.7393216609954834, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.31819017840720987, | |
| "grad_norm": 1.0298443865011644, | |
| "learning_rate": 4.1717241697192636e-05, | |
| "loss": 0.9046, | |
| "mean_token_accuracy": 0.7390219569206238, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.3191098032002943, | |
| "grad_norm": 0.9675044814332657, | |
| "learning_rate": 4.166413987102031e-05, | |
| "loss": 0.9014, | |
| "mean_token_accuracy": 0.7412125468254089, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.3200294279933787, | |
| "grad_norm": 0.9558901216962499, | |
| "learning_rate": 4.161090703951528e-05, | |
| "loss": 0.8915, | |
| "mean_token_accuracy": 0.7442119359970093, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.32094905278646313, | |
| "grad_norm": 1.0231471726772243, | |
| "learning_rate": 4.155754369503254e-05, | |
| "loss": 0.9508, | |
| "mean_token_accuracy": 0.7272051572799683, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.32186867757954757, | |
| "grad_norm": 0.971225693001968, | |
| "learning_rate": 4.1504050331134186e-05, | |
| "loss": 0.9271, | |
| "mean_token_accuracy": 0.7334083676338196, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.32278830237263195, | |
| "grad_norm": 0.9487975621871125, | |
| "learning_rate": 4.1450427442584885e-05, | |
| "loss": 0.9231, | |
| "mean_token_accuracy": 0.7330006003379822, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.3237079271657164, | |
| "grad_norm": 1.080234485746019, | |
| "learning_rate": 4.13966755253473e-05, | |
| "loss": 0.8934, | |
| "mean_token_accuracy": 0.7371908903121949, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.32462755195880083, | |
| "grad_norm": 1.0042744657060512, | |
| "learning_rate": 4.134279507657746e-05, | |
| "loss": 0.9357, | |
| "mean_token_accuracy": 0.7307947874069214, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.3255471767518852, | |
| "grad_norm": 1.0167454318885076, | |
| "learning_rate": 4.1288786594620224e-05, | |
| "loss": 0.9522, | |
| "mean_token_accuracy": 0.7250777244567871, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.32646680154496965, | |
| "grad_norm": 1.0378785371682158, | |
| "learning_rate": 4.123465057900463e-05, | |
| "loss": 0.8991, | |
| "mean_token_accuracy": 0.7383182883262634, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.3273864263380541, | |
| "grad_norm": 0.975574798117687, | |
| "learning_rate": 4.118038753043927e-05, | |
| "loss": 0.8962, | |
| "mean_token_accuracy": 0.7391498327255249, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.3283060511311385, | |
| "grad_norm": 0.9785593634297269, | |
| "learning_rate": 4.112599795080771e-05, | |
| "loss": 0.8976, | |
| "mean_token_accuracy": 0.7406945347785949, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.3292256759242229, | |
| "grad_norm": 0.9506069452238485, | |
| "learning_rate": 4.107148234316378e-05, | |
| "loss": 0.9792, | |
| "mean_token_accuracy": 0.7183930397033691, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.33014530071730736, | |
| "grad_norm": 0.9568388159915644, | |
| "learning_rate": 4.101684121172696e-05, | |
| "loss": 0.9445, | |
| "mean_token_accuracy": 0.7280240654945374, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.33106492551039174, | |
| "grad_norm": 1.022357456314008, | |
| "learning_rate": 4.096207506187773e-05, | |
| "loss": 0.9394, | |
| "mean_token_accuracy": 0.7300898432731628, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.3319845503034762, | |
| "grad_norm": 0.993312074550177, | |
| "learning_rate": 4.090718440015285e-05, | |
| "loss": 0.8857, | |
| "mean_token_accuracy": 0.7397880554199219, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.3329041750965606, | |
| "grad_norm": 0.9393217165901138, | |
| "learning_rate": 4.0852169734240715e-05, | |
| "loss": 0.9055, | |
| "mean_token_accuracy": 0.7397056937217712, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.333823799889645, | |
| "grad_norm": 1.0286146516865022, | |
| "learning_rate": 4.0797031572976644e-05, | |
| "loss": 0.9486, | |
| "mean_token_accuracy": 0.7270653247833252, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.33474342468272944, | |
| "grad_norm": 1.0433673618214743, | |
| "learning_rate": 4.074177042633818e-05, | |
| "loss": 0.8654, | |
| "mean_token_accuracy": 0.7493741869926452, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.3356630494758139, | |
| "grad_norm": 0.9978374983290279, | |
| "learning_rate": 4.068638680544035e-05, | |
| "loss": 0.9434, | |
| "mean_token_accuracy": 0.7284141898155212, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.33658267426889826, | |
| "grad_norm": 0.9268570875914646, | |
| "learning_rate": 4.063088122253096e-05, | |
| "loss": 0.9323, | |
| "mean_token_accuracy": 0.7292568445205688, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.3375022990619827, | |
| "grad_norm": 1.0098370277606412, | |
| "learning_rate": 4.05752541909859e-05, | |
| "loss": 0.8831, | |
| "mean_token_accuracy": 0.7427129149436951, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.33842192385506714, | |
| "grad_norm": 0.9840521255378257, | |
| "learning_rate": 4.0519506225304266e-05, | |
| "loss": 0.9129, | |
| "mean_token_accuracy": 0.7376075983047485, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.3393415486481516, | |
| "grad_norm": 0.9706147022595509, | |
| "learning_rate": 4.046363784110375e-05, | |
| "loss": 0.8867, | |
| "mean_token_accuracy": 0.7421358585357666, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.34026117344123596, | |
| "grad_norm": 1.0544553608523015, | |
| "learning_rate": 4.040764955511577e-05, | |
| "loss": 0.9404, | |
| "mean_token_accuracy": 0.7300120830535889, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.3411807982343204, | |
| "grad_norm": 0.9771051625951763, | |
| "learning_rate": 4.035154188518076e-05, | |
| "loss": 0.92, | |
| "mean_token_accuracy": 0.7353024840354919, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.34210042302740484, | |
| "grad_norm": 0.9612601058837731, | |
| "learning_rate": 4.02953153502433e-05, | |
| "loss": 0.8822, | |
| "mean_token_accuracy": 0.7446259975433349, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.3430200478204892, | |
| "grad_norm": 1.0790844365415948, | |
| "learning_rate": 4.0238970470347404e-05, | |
| "loss": 0.9243, | |
| "mean_token_accuracy": 0.7315137147903442, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.34393967261357367, | |
| "grad_norm": 0.9988868690440261, | |
| "learning_rate": 4.018250776663164e-05, | |
| "loss": 0.8875, | |
| "mean_token_accuracy": 0.7421119809150696, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.3448592974066581, | |
| "grad_norm": 1.0571095915292046, | |
| "learning_rate": 4.012592776132435e-05, | |
| "loss": 0.9273, | |
| "mean_token_accuracy": 0.731085193157196, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.3457789221997425, | |
| "grad_norm": 1.135743652086019, | |
| "learning_rate": 4.0069230977738826e-05, | |
| "loss": 0.9534, | |
| "mean_token_accuracy": 0.7248372554779052, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.34669854699282693, | |
| "grad_norm": 0.9715071563775657, | |
| "learning_rate": 4.001241794026842e-05, | |
| "loss": 0.94, | |
| "mean_token_accuracy": 0.731473171710968, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.34761817178591137, | |
| "grad_norm": 0.9942342778662301, | |
| "learning_rate": 3.9955489174381746e-05, | |
| "loss": 0.9329, | |
| "mean_token_accuracy": 0.7310616850852967, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.34853779657899575, | |
| "grad_norm": 1.0075175249825896, | |
| "learning_rate": 3.989844520661779e-05, | |
| "loss": 0.9438, | |
| "mean_token_accuracy": 0.7262274742126464, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.3494574213720802, | |
| "grad_norm": 0.9753954477573876, | |
| "learning_rate": 3.984128656458106e-05, | |
| "loss": 0.9702, | |
| "mean_token_accuracy": 0.7193968415260314, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.35037704616516463, | |
| "grad_norm": 1.0133558076382343, | |
| "learning_rate": 3.978401377693669e-05, | |
| "loss": 0.873, | |
| "mean_token_accuracy": 0.7490906119346619, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.351296670958249, | |
| "grad_norm": 1.0343688728685794, | |
| "learning_rate": 3.9726627373405544e-05, | |
| "loss": 0.9308, | |
| "mean_token_accuracy": 0.7297749042510986, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.35221629575133345, | |
| "grad_norm": 0.9695668089988693, | |
| "learning_rate": 3.966912788475937e-05, | |
| "loss": 0.9028, | |
| "mean_token_accuracy": 0.7381954431533814, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.3531359205444179, | |
| "grad_norm": 0.9832664588504738, | |
| "learning_rate": 3.961151584281581e-05, | |
| "loss": 0.8815, | |
| "mean_token_accuracy": 0.7429476737976074, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.3540555453375023, | |
| "grad_norm": 0.963687599953708, | |
| "learning_rate": 3.955379178043352e-05, | |
| "loss": 0.9823, | |
| "mean_token_accuracy": 0.7177613019943238, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.3549751701305867, | |
| "grad_norm": 0.9479437389842555, | |
| "learning_rate": 3.9495956231507266e-05, | |
| "loss": 0.9274, | |
| "mean_token_accuracy": 0.7312801122665405, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.35589479492367115, | |
| "grad_norm": 0.938691928481946, | |
| "learning_rate": 3.943800973096296e-05, | |
| "loss": 0.9017, | |
| "mean_token_accuracy": 0.7394131779670715, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.35681441971675554, | |
| "grad_norm": 0.967769246759337, | |
| "learning_rate": 3.937995281475269e-05, | |
| "loss": 0.9216, | |
| "mean_token_accuracy": 0.7352214097976685, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.35773404450984, | |
| "grad_norm": 0.9613349378582403, | |
| "learning_rate": 3.932178601984982e-05, | |
| "loss": 0.8861, | |
| "mean_token_accuracy": 0.7429886102676392, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.3586536693029244, | |
| "grad_norm": 0.9739202222729397, | |
| "learning_rate": 3.926350988424397e-05, | |
| "loss": 0.8628, | |
| "mean_token_accuracy": 0.7480137705802917, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.35957329409600886, | |
| "grad_norm": 1.00417983410191, | |
| "learning_rate": 3.920512494693607e-05, | |
| "loss": 0.879, | |
| "mean_token_accuracy": 0.7440518856048584, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.36049291888909324, | |
| "grad_norm": 1.0098406374163094, | |
| "learning_rate": 3.9146631747933366e-05, | |
| "loss": 0.8329, | |
| "mean_token_accuracy": 0.759476363658905, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.3614125436821777, | |
| "grad_norm": 0.9962046099940254, | |
| "learning_rate": 3.908803082824441e-05, | |
| "loss": 0.8369, | |
| "mean_token_accuracy": 0.7543352007865906, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.3623321684752621, | |
| "grad_norm": 1.0229275697874085, | |
| "learning_rate": 3.9029322729874104e-05, | |
| "loss": 0.9319, | |
| "mean_token_accuracy": 0.7315138220787049, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.3632517932683465, | |
| "grad_norm": 0.9131833883898176, | |
| "learning_rate": 3.8970507995818636e-05, | |
| "loss": 0.8373, | |
| "mean_token_accuracy": 0.754296875, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.36417141806143094, | |
| "grad_norm": 0.9558351857573911, | |
| "learning_rate": 3.891158717006046e-05, | |
| "loss": 0.892, | |
| "mean_token_accuracy": 0.7430965900421143, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.3650910428545154, | |
| "grad_norm": 0.9446973659937214, | |
| "learning_rate": 3.885256079756331e-05, | |
| "loss": 0.9394, | |
| "mean_token_accuracy": 0.7250162839889527, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.36601066764759976, | |
| "grad_norm": 0.9202948815573198, | |
| "learning_rate": 3.879342942426711e-05, | |
| "loss": 0.9124, | |
| "mean_token_accuracy": 0.7363432049751282, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.3669302924406842, | |
| "grad_norm": 0.9507433703052857, | |
| "learning_rate": 3.8734193597082964e-05, | |
| "loss": 0.9265, | |
| "mean_token_accuracy": 0.7309059858322143, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.36784991723376864, | |
| "grad_norm": 0.9721403940210892, | |
| "learning_rate": 3.867485386388806e-05, | |
| "loss": 0.9368, | |
| "mean_token_accuracy": 0.7331580281257629, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.368769542026853, | |
| "grad_norm": 0.9405505899400793, | |
| "learning_rate": 3.8615410773520635e-05, | |
| "loss": 0.9138, | |
| "mean_token_accuracy": 0.7358463048934937, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.36968916681993746, | |
| "grad_norm": 0.963025470188593, | |
| "learning_rate": 3.8555864875774885e-05, | |
| "loss": 0.9019, | |
| "mean_token_accuracy": 0.7384212732315063, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.3706087916130219, | |
| "grad_norm": 0.9907971594256944, | |
| "learning_rate": 3.849621672139588e-05, | |
| "loss": 0.8763, | |
| "mean_token_accuracy": 0.7444020867347717, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.3715284164061063, | |
| "grad_norm": 0.981696155165083, | |
| "learning_rate": 3.843646686207445e-05, | |
| "loss": 0.9202, | |
| "mean_token_accuracy": 0.7325111865997315, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.3724480411991907, | |
| "grad_norm": 0.990078628199776, | |
| "learning_rate": 3.837661585044211e-05, | |
| "loss": 0.9045, | |
| "mean_token_accuracy": 0.7379343152046204, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.37336766599227517, | |
| "grad_norm": 0.9302652014201332, | |
| "learning_rate": 3.831666424006598e-05, | |
| "loss": 0.9145, | |
| "mean_token_accuracy": 0.7369246363639832, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.37428729078535955, | |
| "grad_norm": 1.0127134327540788, | |
| "learning_rate": 3.825661258544358e-05, | |
| "loss": 0.8949, | |
| "mean_token_accuracy": 0.740783178806305, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.375206915578444, | |
| "grad_norm": 0.9456025309406082, | |
| "learning_rate": 3.819646144199777e-05, | |
| "loss": 0.8635, | |
| "mean_token_accuracy": 0.749360203742981, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.37612654037152843, | |
| "grad_norm": 0.9458510607283644, | |
| "learning_rate": 3.813621136607157e-05, | |
| "loss": 0.9212, | |
| "mean_token_accuracy": 0.7321518301963806, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.3770461651646128, | |
| "grad_norm": 0.995792214246869, | |
| "learning_rate": 3.8075862914923074e-05, | |
| "loss": 0.9529, | |
| "mean_token_accuracy": 0.7222961544990539, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.37796578995769725, | |
| "grad_norm": 0.931780686224964, | |
| "learning_rate": 3.801541664672021e-05, | |
| "loss": 0.9068, | |
| "mean_token_accuracy": 0.7373356938362121, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.3788854147507817, | |
| "grad_norm": 1.032699719779323, | |
| "learning_rate": 3.795487312053566e-05, | |
| "loss": 0.8428, | |
| "mean_token_accuracy": 0.754009485244751, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.37980503954386613, | |
| "grad_norm": 1.0082536583803767, | |
| "learning_rate": 3.789423289634163e-05, | |
| "loss": 0.8877, | |
| "mean_token_accuracy": 0.7419803261756897, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.3807246643369505, | |
| "grad_norm": 0.9922794484448726, | |
| "learning_rate": 3.783349653500472e-05, | |
| "loss": 0.9549, | |
| "mean_token_accuracy": 0.7244602799415588, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.38164428913003495, | |
| "grad_norm": 0.9289765959162268, | |
| "learning_rate": 3.777266459828067e-05, | |
| "loss": 0.9049, | |
| "mean_token_accuracy": 0.7346539378166199, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.3825639139231194, | |
| "grad_norm": 0.9418822148176986, | |
| "learning_rate": 3.7711737648809255e-05, | |
| "loss": 0.8631, | |
| "mean_token_accuracy": 0.7498388290405273, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.3834835387162038, | |
| "grad_norm": 0.9739714347813362, | |
| "learning_rate": 3.765071625010899e-05, | |
| "loss": 0.8642, | |
| "mean_token_accuracy": 0.7496488690376282, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.3844031635092882, | |
| "grad_norm": 0.9876318304111896, | |
| "learning_rate": 3.758960096657197e-05, | |
| "loss": 0.9409, | |
| "mean_token_accuracy": 0.7231215476989746, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.38532278830237265, | |
| "grad_norm": 0.9391298182307426, | |
| "learning_rate": 3.752839236345866e-05, | |
| "loss": 0.9321, | |
| "mean_token_accuracy": 0.7299721479415894, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.38624241309545704, | |
| "grad_norm": 0.9975883406823954, | |
| "learning_rate": 3.746709100689263e-05, | |
| "loss": 0.9119, | |
| "mean_token_accuracy": 0.7372664332389831, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3871620378885415, | |
| "grad_norm": 0.9585598143365737, | |
| "learning_rate": 3.740569746385531e-05, | |
| "loss": 0.9511, | |
| "mean_token_accuracy": 0.7252285242080688, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.3880816626816259, | |
| "grad_norm": 0.9708930878655039, | |
| "learning_rate": 3.7344212302180807e-05, | |
| "loss": 0.9021, | |
| "mean_token_accuracy": 0.7373741269111633, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.3890012874747103, | |
| "grad_norm": 0.9842480657825518, | |
| "learning_rate": 3.7282636090550613e-05, | |
| "loss": 0.9155, | |
| "mean_token_accuracy": 0.7346144676208496, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.38992091226779474, | |
| "grad_norm": 1.010319909401371, | |
| "learning_rate": 3.722096939848833e-05, | |
| "loss": 0.8251, | |
| "mean_token_accuracy": 0.7569172263145447, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.3908405370608792, | |
| "grad_norm": 1.0232782350312868, | |
| "learning_rate": 3.7159212796354425e-05, | |
| "loss": 0.9061, | |
| "mean_token_accuracy": 0.7363372683525086, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.39176016185396356, | |
| "grad_norm": 0.9853933308782586, | |
| "learning_rate": 3.7097366855340974e-05, | |
| "loss": 0.9281, | |
| "mean_token_accuracy": 0.7297635912895203, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.392679786647048, | |
| "grad_norm": 1.0085562594833883, | |
| "learning_rate": 3.703543214746632e-05, | |
| "loss": 0.9345, | |
| "mean_token_accuracy": 0.7267664670944214, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.39359941144013244, | |
| "grad_norm": 0.9907065624349415, | |
| "learning_rate": 3.6973409245569846e-05, | |
| "loss": 0.9017, | |
| "mean_token_accuracy": 0.7393394112586975, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.3945190362332168, | |
| "grad_norm": 0.9488707860528096, | |
| "learning_rate": 3.691129872330663e-05, | |
| "loss": 0.9373, | |
| "mean_token_accuracy": 0.728193199634552, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.39543866102630126, | |
| "grad_norm": 0.9103606197233259, | |
| "learning_rate": 3.684910115514218e-05, | |
| "loss": 0.897, | |
| "mean_token_accuracy": 0.7412585973739624, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.3963582858193857, | |
| "grad_norm": 0.965709462156266, | |
| "learning_rate": 3.678681711634708e-05, | |
| "loss": 0.8715, | |
| "mean_token_accuracy": 0.74575275182724, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.3972779106124701, | |
| "grad_norm": 1.0272326947622106, | |
| "learning_rate": 3.67244471829917e-05, | |
| "loss": 0.8789, | |
| "mean_token_accuracy": 0.7422020196914673, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.3981975354055545, | |
| "grad_norm": 0.9300588922771316, | |
| "learning_rate": 3.6661991931940856e-05, | |
| "loss": 0.8945, | |
| "mean_token_accuracy": 0.7385678648948669, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.39911716019863896, | |
| "grad_norm": 1.002757392159615, | |
| "learning_rate": 3.6599451940848446e-05, | |
| "loss": 0.8993, | |
| "mean_token_accuracy": 0.7361081838607788, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.4000367849917234, | |
| "grad_norm": 1.1036859227862066, | |
| "learning_rate": 3.6536827788152176e-05, | |
| "loss": 0.9308, | |
| "mean_token_accuracy": 0.7304606318473816, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.4009564097848078, | |
| "grad_norm": 0.9701793563305904, | |
| "learning_rate": 3.6474120053068164e-05, | |
| "loss": 0.8472, | |
| "mean_token_accuracy": 0.7498792171478271, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.4018760345778922, | |
| "grad_norm": 1.041733702997736, | |
| "learning_rate": 3.641132931558556e-05, | |
| "loss": 0.9581, | |
| "mean_token_accuracy": 0.7201631188392639, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.40279565937097667, | |
| "grad_norm": 1.0348942168040987, | |
| "learning_rate": 3.634845615646123e-05, | |
| "loss": 0.9393, | |
| "mean_token_accuracy": 0.7280836224555969, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.40371528416406105, | |
| "grad_norm": 1.0131734961320986, | |
| "learning_rate": 3.628550115721437e-05, | |
| "loss": 0.927, | |
| "mean_token_accuracy": 0.729682469367981, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.4046349089571455, | |
| "grad_norm": 1.025738826571974, | |
| "learning_rate": 3.622246490012111e-05, | |
| "loss": 0.9357, | |
| "mean_token_accuracy": 0.724788224697113, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.40555453375022993, | |
| "grad_norm": 0.9501914998942569, | |
| "learning_rate": 3.615934796820915e-05, | |
| "loss": 0.8978, | |
| "mean_token_accuracy": 0.7385434865951538, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.4064741585433143, | |
| "grad_norm": 1.0106650660729533, | |
| "learning_rate": 3.609615094525235e-05, | |
| "loss": 0.952, | |
| "mean_token_accuracy": 0.7243346452713013, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.40739378333639875, | |
| "grad_norm": 0.9301771755028939, | |
| "learning_rate": 3.6032874415765344e-05, | |
| "loss": 0.8633, | |
| "mean_token_accuracy": 0.7481309175491333, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.4083134081294832, | |
| "grad_norm": 0.9662316400458029, | |
| "learning_rate": 3.596951896499813e-05, | |
| "loss": 0.8931, | |
| "mean_token_accuracy": 0.7380975484848022, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.4092330329225676, | |
| "grad_norm": 0.9612362754674141, | |
| "learning_rate": 3.590608517893065e-05, | |
| "loss": 0.8787, | |
| "mean_token_accuracy": 0.743196439743042, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.410152657715652, | |
| "grad_norm": 0.9923328807528666, | |
| "learning_rate": 3.584257364426738e-05, | |
| "loss": 0.942, | |
| "mean_token_accuracy": 0.7252677202224731, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.41107228250873645, | |
| "grad_norm": 0.9797715702136052, | |
| "learning_rate": 3.577898494843191e-05, | |
| "loss": 0.9523, | |
| "mean_token_accuracy": 0.7244603157043457, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.41199190730182084, | |
| "grad_norm": 0.9048445218025765, | |
| "learning_rate": 3.571531967956147e-05, | |
| "loss": 0.9136, | |
| "mean_token_accuracy": 0.7320458292961121, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.4129115320949053, | |
| "grad_norm": 0.9649058945655278, | |
| "learning_rate": 3.565157842650154e-05, | |
| "loss": 0.9041, | |
| "mean_token_accuracy": 0.7362257719039917, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.4138311568879897, | |
| "grad_norm": 0.9147474250541198, | |
| "learning_rate": 3.55877617788004e-05, | |
| "loss": 0.9155, | |
| "mean_token_accuracy": 0.7333362221717834, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.4147507816810741, | |
| "grad_norm": 0.876619458906422, | |
| "learning_rate": 3.5523870326703635e-05, | |
| "loss": 0.8492, | |
| "mean_token_accuracy": 0.7528911828994751, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.41567040647415854, | |
| "grad_norm": 1.0036194468259731, | |
| "learning_rate": 3.545990466114871e-05, | |
| "loss": 0.9137, | |
| "mean_token_accuracy": 0.734946858882904, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.416590031267243, | |
| "grad_norm": 0.9978348158615458, | |
| "learning_rate": 3.5395865373759504e-05, | |
| "loss": 0.8815, | |
| "mean_token_accuracy": 0.742937445640564, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.41750965606032736, | |
| "grad_norm": 0.9799485166888982, | |
| "learning_rate": 3.533175305684081e-05, | |
| "loss": 0.8857, | |
| "mean_token_accuracy": 0.7412702798843384, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.4184292808534118, | |
| "grad_norm": 0.9766101000667111, | |
| "learning_rate": 3.5267568303372914e-05, | |
| "loss": 0.8934, | |
| "mean_token_accuracy": 0.7409379720687866, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.41934890564649624, | |
| "grad_norm": 0.9775807722195559, | |
| "learning_rate": 3.520331170700605e-05, | |
| "loss": 0.9067, | |
| "mean_token_accuracy": 0.7377767205238343, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.4202685304395807, | |
| "grad_norm": 0.9690742278243399, | |
| "learning_rate": 3.513898386205491e-05, | |
| "loss": 0.9032, | |
| "mean_token_accuracy": 0.7356434345245362, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 0.42118815523266506, | |
| "grad_norm": 0.965511424805927, | |
| "learning_rate": 3.507458536349323e-05, | |
| "loss": 0.9157, | |
| "mean_token_accuracy": 0.7343951106071472, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.4221077800257495, | |
| "grad_norm": 0.9486968791577164, | |
| "learning_rate": 3.5010116806948166e-05, | |
| "loss": 0.901, | |
| "mean_token_accuracy": 0.7399522423744201, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 0.42302740481883394, | |
| "grad_norm": 0.9414293890579761, | |
| "learning_rate": 3.4945578788694894e-05, | |
| "loss": 0.9179, | |
| "mean_token_accuracy": 0.7342228889465332, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.4239470296119183, | |
| "grad_norm": 0.9896377940060639, | |
| "learning_rate": 3.4880971905651016e-05, | |
| "loss": 0.8784, | |
| "mean_token_accuracy": 0.7457787752151489, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 0.42486665440500276, | |
| "grad_norm": 0.9655527131977069, | |
| "learning_rate": 3.481629675537108e-05, | |
| "loss": 0.863, | |
| "mean_token_accuracy": 0.7453173756599426, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.4257862791980872, | |
| "grad_norm": 0.8936296988219236, | |
| "learning_rate": 3.475155393604104e-05, | |
| "loss": 0.8856, | |
| "mean_token_accuracy": 0.7441475629806519, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 0.4267059039911716, | |
| "grad_norm": 0.9149916486904485, | |
| "learning_rate": 3.468674404647273e-05, | |
| "loss": 0.8532, | |
| "mean_token_accuracy": 0.7507219910621643, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.427625528784256, | |
| "grad_norm": 0.9750792604803812, | |
| "learning_rate": 3.462186768609834e-05, | |
| "loss": 0.863, | |
| "mean_token_accuracy": 0.7469933509826661, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.42854515357734047, | |
| "grad_norm": 0.980901247745682, | |
| "learning_rate": 3.455692545496483e-05, | |
| "loss": 0.837, | |
| "mean_token_accuracy": 0.7545093297958374, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.42946477837042485, | |
| "grad_norm": 0.9686839306544004, | |
| "learning_rate": 3.4491917953728396e-05, | |
| "loss": 0.8885, | |
| "mean_token_accuracy": 0.7428396463394165, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 0.4303844031635093, | |
| "grad_norm": 0.9388350160272184, | |
| "learning_rate": 3.442684578364897e-05, | |
| "loss": 0.8951, | |
| "mean_token_accuracy": 0.7408537268638611, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.4313040279565937, | |
| "grad_norm": 0.8933385447401438, | |
| "learning_rate": 3.4361709546584545e-05, | |
| "loss": 0.8689, | |
| "mean_token_accuracy": 0.7458449006080627, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 0.4322236527496781, | |
| "grad_norm": 0.9411177313363235, | |
| "learning_rate": 3.429650984498573e-05, | |
| "loss": 0.8417, | |
| "mean_token_accuracy": 0.7528134107589721, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.43314327754276255, | |
| "grad_norm": 0.9359109119006161, | |
| "learning_rate": 3.423124728189009e-05, | |
| "loss": 0.8737, | |
| "mean_token_accuracy": 0.7434362411499024, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 0.434062902335847, | |
| "grad_norm": 0.966957214742338, | |
| "learning_rate": 3.4165922460916635e-05, | |
| "loss": 0.8946, | |
| "mean_token_accuracy": 0.7397825956344605, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.4349825271289314, | |
| "grad_norm": 0.9950941777576424, | |
| "learning_rate": 3.410053598626016e-05, | |
| "loss": 0.8833, | |
| "mean_token_accuracy": 0.7447291493415833, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 0.4359021519220158, | |
| "grad_norm": 0.963560335329199, | |
| "learning_rate": 3.403508846268574e-05, | |
| "loss": 0.8675, | |
| "mean_token_accuracy": 0.7479366779327392, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.43682177671510025, | |
| "grad_norm": 0.9286384422364868, | |
| "learning_rate": 3.396958049552307e-05, | |
| "loss": 0.9171, | |
| "mean_token_accuracy": 0.7304298520088196, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.43774140150818464, | |
| "grad_norm": 0.9750119805406471, | |
| "learning_rate": 3.39040126906609e-05, | |
| "loss": 0.8858, | |
| "mean_token_accuracy": 0.742851734161377, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.4386610263012691, | |
| "grad_norm": 0.9160809046368507, | |
| "learning_rate": 3.383838565454144e-05, | |
| "loss": 0.9062, | |
| "mean_token_accuracy": 0.7335192441940308, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 0.4395806510943535, | |
| "grad_norm": 0.9668435486381742, | |
| "learning_rate": 3.37726999941547e-05, | |
| "loss": 0.9243, | |
| "mean_token_accuracy": 0.7276196122169495, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.4405002758874379, | |
| "grad_norm": 0.9935097247563913, | |
| "learning_rate": 3.3706956317032954e-05, | |
| "loss": 0.8678, | |
| "mean_token_accuracy": 0.7438644409179688, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 0.44141990068052234, | |
| "grad_norm": 0.9939894791042586, | |
| "learning_rate": 3.364115523124503e-05, | |
| "loss": 0.8904, | |
| "mean_token_accuracy": 0.7412869215011597, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.4423395254736068, | |
| "grad_norm": 0.9937645932689831, | |
| "learning_rate": 3.357529734539079e-05, | |
| "loss": 0.8455, | |
| "mean_token_accuracy": 0.7517339706420898, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 0.4432591502666912, | |
| "grad_norm": 0.9375114941684974, | |
| "learning_rate": 3.350938326859539e-05, | |
| "loss": 0.8468, | |
| "mean_token_accuracy": 0.7528372883796692, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.4441787750597756, | |
| "grad_norm": 0.8973960962242926, | |
| "learning_rate": 3.3443413610503735e-05, | |
| "loss": 0.878, | |
| "mean_token_accuracy": 0.7442919254302979, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 0.44509839985286004, | |
| "grad_norm": 1.0080330285869648, | |
| "learning_rate": 3.337738898127479e-05, | |
| "loss": 0.8785, | |
| "mean_token_accuracy": 0.7428927779197693, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.4460180246459445, | |
| "grad_norm": 0.8985281228115014, | |
| "learning_rate": 3.331130999157597e-05, | |
| "loss": 0.8644, | |
| "mean_token_accuracy": 0.7480224132537842, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.44693764943902886, | |
| "grad_norm": 0.9291069202904676, | |
| "learning_rate": 3.3245177252577454e-05, | |
| "loss": 0.8976, | |
| "mean_token_accuracy": 0.7383280873298645, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.4478572742321133, | |
| "grad_norm": 0.9623008963786942, | |
| "learning_rate": 3.317899137594656e-05, | |
| "loss": 0.9593, | |
| "mean_token_accuracy": 0.7246118664741517, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 0.44877689902519774, | |
| "grad_norm": 0.9234507163948065, | |
| "learning_rate": 3.311275297384208e-05, | |
| "loss": 0.8413, | |
| "mean_token_accuracy": 0.7528854846954346, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.4496965238182821, | |
| "grad_norm": 0.979267043456503, | |
| "learning_rate": 3.3046462658908636e-05, | |
| "loss": 0.845, | |
| "mean_token_accuracy": 0.7532721877098083, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 0.45061614861136656, | |
| "grad_norm": 0.9032231134895651, | |
| "learning_rate": 3.298012104427097e-05, | |
| "loss": 0.895, | |
| "mean_token_accuracy": 0.7396630644798279, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.451535773404451, | |
| "grad_norm": 0.9383158653652773, | |
| "learning_rate": 3.291372874352832e-05, | |
| "loss": 0.8943, | |
| "mean_token_accuracy": 0.73899405002594, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 0.4524553981975354, | |
| "grad_norm": 0.9664126873169693, | |
| "learning_rate": 3.284728637074869e-05, | |
| "loss": 0.869, | |
| "mean_token_accuracy": 0.746407687664032, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.4533750229906198, | |
| "grad_norm": 0.993853088939543, | |
| "learning_rate": 3.278079454046325e-05, | |
| "loss": 0.9011, | |
| "mean_token_accuracy": 0.7388368129730225, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 0.45429464778370426, | |
| "grad_norm": 0.8741206209918251, | |
| "learning_rate": 3.271425386766058e-05, | |
| "loss": 0.8388, | |
| "mean_token_accuracy": 0.7533232569694519, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.45521427257678865, | |
| "grad_norm": 0.9447835076472045, | |
| "learning_rate": 3.2647664967781035e-05, | |
| "loss": 0.8228, | |
| "mean_token_accuracy": 0.7583665132522583, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.4561338973698731, | |
| "grad_norm": 1.0045001891415821, | |
| "learning_rate": 3.258102845671097e-05, | |
| "loss": 0.8934, | |
| "mean_token_accuracy": 0.7414227366447449, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.4570535221629575, | |
| "grad_norm": 0.9475063098055461, | |
| "learning_rate": 3.251434495077716e-05, | |
| "loss": 0.9182, | |
| "mean_token_accuracy": 0.7303388476371765, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 0.4579731469560419, | |
| "grad_norm": 0.9775463234456495, | |
| "learning_rate": 3.2447615066741004e-05, | |
| "loss": 0.9361, | |
| "mean_token_accuracy": 0.7293364763259887, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.45889277174912635, | |
| "grad_norm": 0.9174334893241889, | |
| "learning_rate": 3.238083942179288e-05, | |
| "loss": 0.8474, | |
| "mean_token_accuracy": 0.7529029250144958, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 0.4598123965422108, | |
| "grad_norm": 0.9021239390235616, | |
| "learning_rate": 3.2314018633546375e-05, | |
| "loss": 0.8314, | |
| "mean_token_accuracy": 0.7585980296134949, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.46073202133529517, | |
| "grad_norm": 0.9231622515184421, | |
| "learning_rate": 3.224715332003265e-05, | |
| "loss": 0.8498, | |
| "mean_token_accuracy": 0.7502579808235168, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 0.4616516461283796, | |
| "grad_norm": 0.9279166556927757, | |
| "learning_rate": 3.218024409969468e-05, | |
| "loss": 0.899, | |
| "mean_token_accuracy": 0.7380064010620118, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.46257127092146405, | |
| "grad_norm": 0.9333611856920211, | |
| "learning_rate": 3.2113291591381516e-05, | |
| "loss": 0.9113, | |
| "mean_token_accuracy": 0.7354224920272827, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 0.4634908957145485, | |
| "grad_norm": 0.9585859302538061, | |
| "learning_rate": 3.204629641434259e-05, | |
| "loss": 0.912, | |
| "mean_token_accuracy": 0.7332522869110107, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.4644105205076329, | |
| "grad_norm": 1.0072945032594127, | |
| "learning_rate": 3.197925918822199e-05, | |
| "loss": 0.8615, | |
| "mean_token_accuracy": 0.7460902214050293, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.4653301453007173, | |
| "grad_norm": 0.9703474311506037, | |
| "learning_rate": 3.1912180533052716e-05, | |
| "loss": 0.9391, | |
| "mean_token_accuracy": 0.7272826433181763, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.46624977009380175, | |
| "grad_norm": 0.9701812144923739, | |
| "learning_rate": 3.184506106925094e-05, | |
| "loss": 0.8677, | |
| "mean_token_accuracy": 0.747051191329956, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 0.46716939488688614, | |
| "grad_norm": 0.9672451609696705, | |
| "learning_rate": 3.177790141761029e-05, | |
| "loss": 0.8627, | |
| "mean_token_accuracy": 0.7482078075408936, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.4680890196799706, | |
| "grad_norm": 0.9530973638849749, | |
| "learning_rate": 3.1710702199296085e-05, | |
| "loss": 0.8492, | |
| "mean_token_accuracy": 0.7528972029685974, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 0.469008644473055, | |
| "grad_norm": 0.9084239076489461, | |
| "learning_rate": 3.16434640358396e-05, | |
| "loss": 0.8653, | |
| "mean_token_accuracy": 0.746622622013092, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.4699282692661394, | |
| "grad_norm": 0.9998420571855022, | |
| "learning_rate": 3.157618754913233e-05, | |
| "loss": 0.8975, | |
| "mean_token_accuracy": 0.738722312450409, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 0.47084789405922384, | |
| "grad_norm": 0.9250250902872688, | |
| "learning_rate": 3.15088733614202e-05, | |
| "loss": 0.8551, | |
| "mean_token_accuracy": 0.750208032131195, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.4717675188523083, | |
| "grad_norm": 1.0106796436372896, | |
| "learning_rate": 3.144152209529786e-05, | |
| "loss": 0.9079, | |
| "mean_token_accuracy": 0.7350385189056396, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 0.47268714364539266, | |
| "grad_norm": 0.9619558970415346, | |
| "learning_rate": 3.137413437370289e-05, | |
| "loss": 0.91, | |
| "mean_token_accuracy": 0.7369326472282409, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.4736067684384771, | |
| "grad_norm": 1.0109885841238913, | |
| "learning_rate": 3.130671081991005e-05, | |
| "loss": 0.9084, | |
| "mean_token_accuracy": 0.7353306174278259, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.47452639323156154, | |
| "grad_norm": 0.9779190292756188, | |
| "learning_rate": 3.123925205752552e-05, | |
| "loss": 0.8556, | |
| "mean_token_accuracy": 0.7515247583389282, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.4754460180246459, | |
| "grad_norm": 0.9645840220644, | |
| "learning_rate": 3.1171758710481096e-05, | |
| "loss": 0.8755, | |
| "mean_token_accuracy": 0.7436783194541932, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 0.47636564281773036, | |
| "grad_norm": 1.001058541812525, | |
| "learning_rate": 3.110423140302852e-05, | |
| "loss": 0.9096, | |
| "mean_token_accuracy": 0.7341774582862854, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.4772852676108148, | |
| "grad_norm": 0.8974468409856537, | |
| "learning_rate": 3.103667075973356e-05, | |
| "loss": 0.9083, | |
| "mean_token_accuracy": 0.7359666705131531, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 0.4782048924038992, | |
| "grad_norm": 1.0374371477545201, | |
| "learning_rate": 3.096907740547036e-05, | |
| "loss": 0.9111, | |
| "mean_token_accuracy": 0.7324892163276673, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.4791245171969836, | |
| "grad_norm": 0.9405864234939062, | |
| "learning_rate": 3.0901451965415595e-05, | |
| "loss": 0.812, | |
| "mean_token_accuracy": 0.7602822542190552, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 0.48004414199006806, | |
| "grad_norm": 0.9654353230874346, | |
| "learning_rate": 3.08337950650427e-05, | |
| "loss": 0.8978, | |
| "mean_token_accuracy": 0.7364333510398865, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.48096376678315245, | |
| "grad_norm": 1.0011041381512356, | |
| "learning_rate": 3.076610733011609e-05, | |
| "loss": 0.9049, | |
| "mean_token_accuracy": 0.7363562822341919, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 0.4818833915762369, | |
| "grad_norm": 0.9686831090055986, | |
| "learning_rate": 3.069838938668538e-05, | |
| "loss": 0.8898, | |
| "mean_token_accuracy": 0.7398189902305603, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.4828030163693213, | |
| "grad_norm": 0.9318085356157495, | |
| "learning_rate": 3.063064186107957e-05, | |
| "loss": 0.8791, | |
| "mean_token_accuracy": 0.7449330806732177, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.48372264116240576, | |
| "grad_norm": 0.8934228857530689, | |
| "learning_rate": 3.056286537990129e-05, | |
| "loss": 0.8632, | |
| "mean_token_accuracy": 0.7459052681922913, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.48464226595549015, | |
| "grad_norm": 0.9725972260652284, | |
| "learning_rate": 3.049506057002098e-05, | |
| "loss": 0.8541, | |
| "mean_token_accuracy": 0.7478031516075134, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 0.4855618907485746, | |
| "grad_norm": 0.9452628770649284, | |
| "learning_rate": 3.042722805857106e-05, | |
| "loss": 0.8555, | |
| "mean_token_accuracy": 0.746888279914856, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.486481515541659, | |
| "grad_norm": 0.8806175124503305, | |
| "learning_rate": 3.0359368472940208e-05, | |
| "loss": 0.9035, | |
| "mean_token_accuracy": 0.7369076132774353, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 0.4874011403347434, | |
| "grad_norm": 0.8988265278259941, | |
| "learning_rate": 3.029148244076749e-05, | |
| "loss": 0.8643, | |
| "mean_token_accuracy": 0.7449605345726014, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.48832076512782785, | |
| "grad_norm": 0.9176861265880045, | |
| "learning_rate": 3.022357058993657e-05, | |
| "loss": 0.8643, | |
| "mean_token_accuracy": 0.7462789297103882, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 0.4892403899209123, | |
| "grad_norm": 0.9232400004776917, | |
| "learning_rate": 3.0155633548569955e-05, | |
| "loss": 0.903, | |
| "mean_token_accuracy": 0.7353234887123108, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.4901600147139967, | |
| "grad_norm": 0.9476269194909095, | |
| "learning_rate": 3.008767194502309e-05, | |
| "loss": 0.9035, | |
| "mean_token_accuracy": 0.7386479258537293, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 0.4910796395070811, | |
| "grad_norm": 0.931067111141978, | |
| "learning_rate": 3.0019686407878617e-05, | |
| "loss": 0.8883, | |
| "mean_token_accuracy": 0.7414939045906067, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.49199926430016555, | |
| "grad_norm": 0.9153445295986272, | |
| "learning_rate": 2.995167756594055e-05, | |
| "loss": 0.8625, | |
| "mean_token_accuracy": 0.7501867294311524, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.49291888909324993, | |
| "grad_norm": 0.9210143810764434, | |
| "learning_rate": 2.988364604822845e-05, | |
| "loss": 0.8972, | |
| "mean_token_accuracy": 0.7386625647544861, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.4938385138863344, | |
| "grad_norm": 0.9925053868796728, | |
| "learning_rate": 2.9815592483971584e-05, | |
| "loss": 0.8458, | |
| "mean_token_accuracy": 0.751643443107605, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 0.4947581386794188, | |
| "grad_norm": 1.006336852347141, | |
| "learning_rate": 2.9747517502603167e-05, | |
| "loss": 0.8721, | |
| "mean_token_accuracy": 0.7480525851249695, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.4956777634725032, | |
| "grad_norm": 0.9701598502406181, | |
| "learning_rate": 2.967942173375447e-05, | |
| "loss": 0.8818, | |
| "mean_token_accuracy": 0.740173089504242, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 0.49659738826558764, | |
| "grad_norm": 0.9431128523024928, | |
| "learning_rate": 2.9611305807249052e-05, | |
| "loss": 0.8344, | |
| "mean_token_accuracy": 0.7551051139831543, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.4975170130586721, | |
| "grad_norm": 0.9346714282194056, | |
| "learning_rate": 2.95431703530969e-05, | |
| "loss": 0.835, | |
| "mean_token_accuracy": 0.7544684171676636, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 0.49843663785175646, | |
| "grad_norm": 0.9358393411052466, | |
| "learning_rate": 2.9475016001488608e-05, | |
| "loss": 0.8906, | |
| "mean_token_accuracy": 0.7427068829536438, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.4993562626448409, | |
| "grad_norm": 0.8867163340537708, | |
| "learning_rate": 2.9406843382789583e-05, | |
| "loss": 0.8719, | |
| "mean_token_accuracy": 0.745942211151123, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 0.5002758874379253, | |
| "grad_norm": 0.9212664551640851, | |
| "learning_rate": 2.9338653127534148e-05, | |
| "loss": 0.8562, | |
| "mean_token_accuracy": 0.7497703909873963, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.5011955122310098, | |
| "grad_norm": 0.9432905808331339, | |
| "learning_rate": 2.9270445866419766e-05, | |
| "loss": 0.8741, | |
| "mean_token_accuracy": 0.7432116866111755, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.5021151370240942, | |
| "grad_norm": 0.9512906709412812, | |
| "learning_rate": 2.92022222303012e-05, | |
| "loss": 0.8818, | |
| "mean_token_accuracy": 0.7435823440551758, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.5030347618171785, | |
| "grad_norm": 0.9468765725989278, | |
| "learning_rate": 2.9133982850184645e-05, | |
| "loss": 0.8627, | |
| "mean_token_accuracy": 0.748947024345398, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 0.503954386610263, | |
| "grad_norm": 1.0112504748902342, | |
| "learning_rate": 2.9065728357221927e-05, | |
| "loss": 0.8508, | |
| "mean_token_accuracy": 0.7537087440490723, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.5048740114033474, | |
| "grad_norm": 0.9649262010355393, | |
| "learning_rate": 2.899745938270465e-05, | |
| "loss": 0.8819, | |
| "mean_token_accuracy": 0.7414289236068725, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 0.5057936361964318, | |
| "grad_norm": 0.9373961423715033, | |
| "learning_rate": 2.8929176558058352e-05, | |
| "loss": 0.8876, | |
| "mean_token_accuracy": 0.741254198551178, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.5067132609895163, | |
| "grad_norm": 0.9616567239953456, | |
| "learning_rate": 2.8860880514836687e-05, | |
| "loss": 0.8826, | |
| "mean_token_accuracy": 0.7436172485351562, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 0.5076328857826007, | |
| "grad_norm": 0.9367792403626876, | |
| "learning_rate": 2.8792571884715546e-05, | |
| "loss": 0.8482, | |
| "mean_token_accuracy": 0.7529447674751282, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.5085525105756851, | |
| "grad_norm": 0.9104599971108884, | |
| "learning_rate": 2.8724251299487263e-05, | |
| "loss": 0.8753, | |
| "mean_token_accuracy": 0.7427584528923035, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 0.5094721353687696, | |
| "grad_norm": 1.0105096627504964, | |
| "learning_rate": 2.8655919391054732e-05, | |
| "loss": 0.8641, | |
| "mean_token_accuracy": 0.7479874610900878, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.510391760161854, | |
| "grad_norm": 0.9279979512504474, | |
| "learning_rate": 2.8587576791425568e-05, | |
| "loss": 0.8317, | |
| "mean_token_accuracy": 0.7535252571105957, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.5113113849549383, | |
| "grad_norm": 0.9297465828114925, | |
| "learning_rate": 2.8519224132706297e-05, | |
| "loss": 0.8774, | |
| "mean_token_accuracy": 0.7402622103691101, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.5122310097480228, | |
| "grad_norm": 0.9452271860575534, | |
| "learning_rate": 2.845086204709645e-05, | |
| "loss": 0.8771, | |
| "mean_token_accuracy": 0.744519031047821, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 0.5131506345411072, | |
| "grad_norm": 0.9830981203343458, | |
| "learning_rate": 2.838249116688277e-05, | |
| "loss": 0.9289, | |
| "mean_token_accuracy": 0.7298115253448486, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.5140702593341917, | |
| "grad_norm": 1.041430018260559, | |
| "learning_rate": 2.8314112124433334e-05, | |
| "loss": 0.9045, | |
| "mean_token_accuracy": 0.7383831977844239, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 0.5149898841272761, | |
| "grad_norm": 0.9620402098071436, | |
| "learning_rate": 2.8245725552191703e-05, | |
| "loss": 0.8634, | |
| "mean_token_accuracy": 0.746962821483612, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.5159095089203605, | |
| "grad_norm": 0.9015921123510985, | |
| "learning_rate": 2.8177332082671117e-05, | |
| "loss": 0.853, | |
| "mean_token_accuracy": 0.7487654685974121, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 0.516829133713445, | |
| "grad_norm": 0.9007228615494444, | |
| "learning_rate": 2.8108932348448553e-05, | |
| "loss": 0.8428, | |
| "mean_token_accuracy": 0.7535581469535828, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.5177487585065293, | |
| "grad_norm": 0.9827577309973088, | |
| "learning_rate": 2.8040526982158993e-05, | |
| "loss": 0.8789, | |
| "mean_token_accuracy": 0.7432992815971374, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 0.5186683832996137, | |
| "grad_norm": 0.9633925171762643, | |
| "learning_rate": 2.7972116616489464e-05, | |
| "loss": 0.8397, | |
| "mean_token_accuracy": 0.752094304561615, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.5195880080926982, | |
| "grad_norm": 0.9281148435495344, | |
| "learning_rate": 2.790370188417324e-05, | |
| "loss": 0.8596, | |
| "mean_token_accuracy": 0.7485750317573547, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.5205076328857826, | |
| "grad_norm": 1.0029136932204825, | |
| "learning_rate": 2.7835283417984005e-05, | |
| "loss": 0.8718, | |
| "mean_token_accuracy": 0.7433583855628967, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.521427257678867, | |
| "grad_norm": 0.9621263162970809, | |
| "learning_rate": 2.7766861850729958e-05, | |
| "loss": 0.8955, | |
| "mean_token_accuracy": 0.7394774556159973, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 0.5223468824719515, | |
| "grad_norm": 0.9670299071015823, | |
| "learning_rate": 2.7698437815247995e-05, | |
| "loss": 0.8529, | |
| "mean_token_accuracy": 0.7500015497207642, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.5232665072650359, | |
| "grad_norm": 0.9398184622397476, | |
| "learning_rate": 2.763001194439782e-05, | |
| "loss": 0.8447, | |
| "mean_token_accuracy": 0.7504964828491211, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 0.5241861320581203, | |
| "grad_norm": 0.8869891271688453, | |
| "learning_rate": 2.756158487105613e-05, | |
| "loss": 0.8404, | |
| "mean_token_accuracy": 0.7549336075782775, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.5251057568512048, | |
| "grad_norm": 0.9965820824716972, | |
| "learning_rate": 2.749315722811073e-05, | |
| "loss": 0.9179, | |
| "mean_token_accuracy": 0.7317790746688843, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 0.5260253816442891, | |
| "grad_norm": 0.9304946857092635, | |
| "learning_rate": 2.7424729648454717e-05, | |
| "loss": 0.8874, | |
| "mean_token_accuracy": 0.7398088812828064, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.5269450064373735, | |
| "grad_norm": 0.9880649590404676, | |
| "learning_rate": 2.735630276498058e-05, | |
| "loss": 0.8738, | |
| "mean_token_accuracy": 0.7432942867279053, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 0.527864631230458, | |
| "grad_norm": 0.9350070938993663, | |
| "learning_rate": 2.728787721057437e-05, | |
| "loss": 0.8758, | |
| "mean_token_accuracy": 0.7431787729263306, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.5287842560235424, | |
| "grad_norm": 0.8997664568286488, | |
| "learning_rate": 2.7219453618109853e-05, | |
| "loss": 0.842, | |
| "mean_token_accuracy": 0.7523634552955627, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.5297038808166268, | |
| "grad_norm": 0.9519585493296138, | |
| "learning_rate": 2.715103262044265e-05, | |
| "loss": 0.8744, | |
| "mean_token_accuracy": 0.7417232871055603, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.5306235056097113, | |
| "grad_norm": 0.8836119550117293, | |
| "learning_rate": 2.708261485040439e-05, | |
| "loss": 0.856, | |
| "mean_token_accuracy": 0.7496297836303711, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 0.5315431304027957, | |
| "grad_norm": 0.9589883589041829, | |
| "learning_rate": 2.7014200940796824e-05, | |
| "loss": 0.8418, | |
| "mean_token_accuracy": 0.7520057439804078, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.53246275519588, | |
| "grad_norm": 0.9563207815434712, | |
| "learning_rate": 2.694579152438601e-05, | |
| "loss": 0.8936, | |
| "mean_token_accuracy": 0.7398610949516297, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 0.5333823799889645, | |
| "grad_norm": 0.9233468769288075, | |
| "learning_rate": 2.6877387233896472e-05, | |
| "loss": 0.8634, | |
| "mean_token_accuracy": 0.745741093158722, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.5343020047820489, | |
| "grad_norm": 0.9541286928919233, | |
| "learning_rate": 2.6808988702005285e-05, | |
| "loss": 0.868, | |
| "mean_token_accuracy": 0.7439489006996155, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 0.5352216295751333, | |
| "grad_norm": 0.9922987370495847, | |
| "learning_rate": 2.6740596561336275e-05, | |
| "loss": 0.8482, | |
| "mean_token_accuracy": 0.7504428863525391, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.5361412543682178, | |
| "grad_norm": 0.9722831543231532, | |
| "learning_rate": 2.667221144445418e-05, | |
| "loss": 0.8177, | |
| "mean_token_accuracy": 0.7608316302299499, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 0.5370608791613022, | |
| "grad_norm": 1.0275441684092577, | |
| "learning_rate": 2.6603833983858738e-05, | |
| "loss": 0.9398, | |
| "mean_token_accuracy": 0.7276052117347718, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.5379805039543866, | |
| "grad_norm": 1.0068511170391965, | |
| "learning_rate": 2.6535464811978894e-05, | |
| "loss": 0.8424, | |
| "mean_token_accuracy": 0.7531503081321717, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.5389001287474711, | |
| "grad_norm": 0.9554905959505885, | |
| "learning_rate": 2.6467104561166927e-05, | |
| "loss": 0.8671, | |
| "mean_token_accuracy": 0.7456499934196472, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.5398197535405554, | |
| "grad_norm": 0.9318421761107843, | |
| "learning_rate": 2.639875386369261e-05, | |
| "loss": 0.8674, | |
| "mean_token_accuracy": 0.7474814653396606, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 0.5407393783336398, | |
| "grad_norm": 0.9797586514540253, | |
| "learning_rate": 2.6330413351737336e-05, | |
| "loss": 0.893, | |
| "mean_token_accuracy": 0.7371798276901245, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.5416590031267243, | |
| "grad_norm": 0.9627863342351398, | |
| "learning_rate": 2.626208365738831e-05, | |
| "loss": 0.8662, | |
| "mean_token_accuracy": 0.7450501322746277, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 0.5425786279198087, | |
| "grad_norm": 0.9378560834404903, | |
| "learning_rate": 2.6193765412632677e-05, | |
| "loss": 0.8427, | |
| "mean_token_accuracy": 0.750009298324585, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.5434982527128931, | |
| "grad_norm": 0.9349477883280783, | |
| "learning_rate": 2.6125459249351697e-05, | |
| "loss": 0.8908, | |
| "mean_token_accuracy": 0.7386453747749329, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 0.5444178775059776, | |
| "grad_norm": 0.9298587181804499, | |
| "learning_rate": 2.6057165799314854e-05, | |
| "loss": 0.855, | |
| "mean_token_accuracy": 0.7491998553276062, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.545337502299062, | |
| "grad_norm": 0.9026144571758381, | |
| "learning_rate": 2.5988885694174085e-05, | |
| "loss": 0.8786, | |
| "mean_token_accuracy": 0.7437506198883057, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 0.5462571270921464, | |
| "grad_norm": 0.9408107824152944, | |
| "learning_rate": 2.5920619565457877e-05, | |
| "loss": 0.8758, | |
| "mean_token_accuracy": 0.7427832961082459, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.5471767518852308, | |
| "grad_norm": 0.9195819021761746, | |
| "learning_rate": 2.5852368044565452e-05, | |
| "loss": 0.9277, | |
| "mean_token_accuracy": 0.7323094010353088, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.5480963766783152, | |
| "grad_norm": 0.9586681296133412, | |
| "learning_rate": 2.5784131762760922e-05, | |
| "loss": 0.8334, | |
| "mean_token_accuracy": 0.7566598057746887, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.5490160014713996, | |
| "grad_norm": 0.9092467816987784, | |
| "learning_rate": 2.5715911351167465e-05, | |
| "loss": 0.9014, | |
| "mean_token_accuracy": 0.7390154361724853, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 0.5499356262644841, | |
| "grad_norm": 0.966449128998816, | |
| "learning_rate": 2.564770744076144e-05, | |
| "loss": 0.8959, | |
| "mean_token_accuracy": 0.7373208284378052, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.5508552510575685, | |
| "grad_norm": 1.0269176653506933, | |
| "learning_rate": 2.5579520662366618e-05, | |
| "loss": 0.8626, | |
| "mean_token_accuracy": 0.7471036791801453, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 0.5517748758506529, | |
| "grad_norm": 0.9705454615801481, | |
| "learning_rate": 2.5511351646648324e-05, | |
| "loss": 0.8761, | |
| "mean_token_accuracy": 0.7408113241195678, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5526945006437374, | |
| "grad_norm": 0.9683019669667483, | |
| "learning_rate": 2.5443201024107537e-05, | |
| "loss": 0.8974, | |
| "mean_token_accuracy": 0.7345914959907531, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 0.5536141254368218, | |
| "grad_norm": 0.9328296833493311, | |
| "learning_rate": 2.5375069425075176e-05, | |
| "loss": 0.8629, | |
| "mean_token_accuracy": 0.7468894720077515, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.5545337502299063, | |
| "grad_norm": 0.9565417579373001, | |
| "learning_rate": 2.5306957479706196e-05, | |
| "loss": 0.8914, | |
| "mean_token_accuracy": 0.7373947501182556, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 0.5554533750229906, | |
| "grad_norm": 0.9439811181197841, | |
| "learning_rate": 2.5238865817973735e-05, | |
| "loss": 0.8264, | |
| "mean_token_accuracy": 0.7566876411437988, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.556372999816075, | |
| "grad_norm": 0.8918377804941932, | |
| "learning_rate": 2.5170795069663374e-05, | |
| "loss": 0.8384, | |
| "mean_token_accuracy": 0.7532538652420044, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.5572926246091595, | |
| "grad_norm": 0.9531681758263391, | |
| "learning_rate": 2.510274586436725e-05, | |
| "loss": 0.9137, | |
| "mean_token_accuracy": 0.7336269617080688, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.5582122494022439, | |
| "grad_norm": 0.9547809224031603, | |
| "learning_rate": 2.5034718831478236e-05, | |
| "loss": 0.8121, | |
| "mean_token_accuracy": 0.7607084512710571, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 0.5591318741953283, | |
| "grad_norm": 0.9101416039188879, | |
| "learning_rate": 2.496671460018414e-05, | |
| "loss": 0.8374, | |
| "mean_token_accuracy": 0.7512237310409546, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.5600514989884128, | |
| "grad_norm": 0.9591588974138807, | |
| "learning_rate": 2.4898733799461866e-05, | |
| "loss": 0.8691, | |
| "mean_token_accuracy": 0.7475574612617493, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 0.5609711237814972, | |
| "grad_norm": 0.9481182124754315, | |
| "learning_rate": 2.4830777058071623e-05, | |
| "loss": 0.8541, | |
| "mean_token_accuracy": 0.7470650672912598, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.5618907485745815, | |
| "grad_norm": 0.8991567391844545, | |
| "learning_rate": 2.4762845004551077e-05, | |
| "loss": 0.834, | |
| "mean_token_accuracy": 0.7513617157936097, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 0.562810373367666, | |
| "grad_norm": 0.8993594505060807, | |
| "learning_rate": 2.4694938267209567e-05, | |
| "loss": 0.8302, | |
| "mean_token_accuracy": 0.7539983510971069, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.5637299981607504, | |
| "grad_norm": 0.9212463554308379, | |
| "learning_rate": 2.4627057474122273e-05, | |
| "loss": 0.8598, | |
| "mean_token_accuracy": 0.747953188419342, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 0.5646496229538348, | |
| "grad_norm": 0.9155845020709076, | |
| "learning_rate": 2.4559203253124407e-05, | |
| "loss": 0.8728, | |
| "mean_token_accuracy": 0.7440886616706848, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.5655692477469193, | |
| "grad_norm": 0.9376543570110895, | |
| "learning_rate": 2.4491376231805428e-05, | |
| "loss": 0.8529, | |
| "mean_token_accuracy": 0.7518376111984253, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.5664888725400037, | |
| "grad_norm": 0.9720221730313491, | |
| "learning_rate": 2.442357703750322e-05, | |
| "loss": 0.8423, | |
| "mean_token_accuracy": 0.7525236487388611, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.5674084973330881, | |
| "grad_norm": 0.9013738631587733, | |
| "learning_rate": 2.4355806297298296e-05, | |
| "loss": 0.8422, | |
| "mean_token_accuracy": 0.7528858304023742, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 0.5683281221261726, | |
| "grad_norm": 0.9524358228393591, | |
| "learning_rate": 2.4288064638007974e-05, | |
| "loss": 0.8672, | |
| "mean_token_accuracy": 0.7468002319335938, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.569247746919257, | |
| "grad_norm": 0.9505409858129935, | |
| "learning_rate": 2.4220352686180613e-05, | |
| "loss": 0.8416, | |
| "mean_token_accuracy": 0.7486450433731079, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 0.5701673717123413, | |
| "grad_norm": 0.9615751645550065, | |
| "learning_rate": 2.415267106808983e-05, | |
| "loss": 0.803, | |
| "mean_token_accuracy": 0.7603586912155151, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.5710869965054258, | |
| "grad_norm": 0.9458073029155306, | |
| "learning_rate": 2.4085020409728633e-05, | |
| "loss": 0.8614, | |
| "mean_token_accuracy": 0.7483598232269287, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 0.5720066212985102, | |
| "grad_norm": 0.959427274017189, | |
| "learning_rate": 2.4017401336803713e-05, | |
| "loss": 0.8795, | |
| "mean_token_accuracy": 0.7383235573768616, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.5729262460915946, | |
| "grad_norm": 0.9688058239251538, | |
| "learning_rate": 2.394981447472963e-05, | |
| "loss": 0.8854, | |
| "mean_token_accuracy": 0.7413538813591003, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 0.5738458708846791, | |
| "grad_norm": 0.9543674760330169, | |
| "learning_rate": 2.3882260448623002e-05, | |
| "loss": 0.8924, | |
| "mean_token_accuracy": 0.739243483543396, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.5747654956777635, | |
| "grad_norm": 0.9565581088949338, | |
| "learning_rate": 2.381473988329675e-05, | |
| "loss": 0.8878, | |
| "mean_token_accuracy": 0.737128746509552, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.5756851204708479, | |
| "grad_norm": 0.9446263148140598, | |
| "learning_rate": 2.374725340325433e-05, | |
| "loss": 0.8771, | |
| "mean_token_accuracy": 0.7424870610237122, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.5766047452639324, | |
| "grad_norm": 0.9235345865848048, | |
| "learning_rate": 2.3679801632683927e-05, | |
| "loss": 0.8791, | |
| "mean_token_accuracy": 0.7413055062294006, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 0.5775243700570167, | |
| "grad_norm": 0.931358306977097, | |
| "learning_rate": 2.3612385195452687e-05, | |
| "loss": 0.8864, | |
| "mean_token_accuracy": 0.7415070414543152, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.5784439948501011, | |
| "grad_norm": 0.9366462545353926, | |
| "learning_rate": 2.3545004715100966e-05, | |
| "loss": 0.8791, | |
| "mean_token_accuracy": 0.7428970575332642, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 0.5793636196431856, | |
| "grad_norm": 0.9312216076414869, | |
| "learning_rate": 2.3477660814836562e-05, | |
| "loss": 0.8318, | |
| "mean_token_accuracy": 0.7540540814399719, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.58028324443627, | |
| "grad_norm": 0.9058432741408705, | |
| "learning_rate": 2.3410354117528904e-05, | |
| "loss": 0.9128, | |
| "mean_token_accuracy": 0.7328131318092346, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 0.5812028692293544, | |
| "grad_norm": 0.92693757568253, | |
| "learning_rate": 2.3343085245703373e-05, | |
| "loss": 0.8356, | |
| "mean_token_accuracy": 0.754761004447937, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.5821224940224389, | |
| "grad_norm": 0.9685552745916727, | |
| "learning_rate": 2.3275854821535476e-05, | |
| "loss": 0.8696, | |
| "mean_token_accuracy": 0.7423434615135193, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 0.5830421188155233, | |
| "grad_norm": 0.9530016316914325, | |
| "learning_rate": 2.3208663466845108e-05, | |
| "loss": 0.8239, | |
| "mean_token_accuracy": 0.7581414461135865, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.5839617436086076, | |
| "grad_norm": 0.9912981010776241, | |
| "learning_rate": 2.3141511803090815e-05, | |
| "loss": 0.8784, | |
| "mean_token_accuracy": 0.743216586112976, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.5848813684016921, | |
| "grad_norm": 0.8897494823501038, | |
| "learning_rate": 2.3074400451364048e-05, | |
| "loss": 0.8771, | |
| "mean_token_accuracy": 0.7422731041908264, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.5858009931947765, | |
| "grad_norm": 0.9087254524604537, | |
| "learning_rate": 2.300733003238339e-05, | |
| "loss": 0.8249, | |
| "mean_token_accuracy": 0.75495365858078, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 0.5867206179878609, | |
| "grad_norm": 0.9615326948623956, | |
| "learning_rate": 2.2940301166488846e-05, | |
| "loss": 0.7821, | |
| "mean_token_accuracy": 0.7687617659568786, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.5876402427809454, | |
| "grad_norm": 0.9239773147706558, | |
| "learning_rate": 2.28733144736361e-05, | |
| "loss": 0.8034, | |
| "mean_token_accuracy": 0.7630661010742188, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 0.5885598675740298, | |
| "grad_norm": 0.9271354944208791, | |
| "learning_rate": 2.2806370573390745e-05, | |
| "loss": 0.8377, | |
| "mean_token_accuracy": 0.7517584562301636, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5894794923671142, | |
| "grad_norm": 0.9307261567222711, | |
| "learning_rate": 2.2739470084922608e-05, | |
| "loss": 0.9145, | |
| "mean_token_accuracy": 0.7307730317115784, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 0.5903991171601987, | |
| "grad_norm": 0.8708186634436479, | |
| "learning_rate": 2.2672613626999994e-05, | |
| "loss": 0.8495, | |
| "mean_token_accuracy": 0.7486128211021423, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.591318741953283, | |
| "grad_norm": 0.9473141853732495, | |
| "learning_rate": 2.2605801817983958e-05, | |
| "loss": 0.8341, | |
| "mean_token_accuracy": 0.7518749475479126, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 0.5922383667463674, | |
| "grad_norm": 0.9382593885727152, | |
| "learning_rate": 2.253903527582259e-05, | |
| "loss": 0.8447, | |
| "mean_token_accuracy": 0.7506359577178955, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.5931579915394519, | |
| "grad_norm": 0.9696123819996886, | |
| "learning_rate": 2.247231461804532e-05, | |
| "loss": 0.8266, | |
| "mean_token_accuracy": 0.7562480688095092, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.5940776163325363, | |
| "grad_norm": 0.8949351423802622, | |
| "learning_rate": 2.2405640461757176e-05, | |
| "loss": 0.814, | |
| "mean_token_accuracy": 0.7592174887657166, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.5949972411256208, | |
| "grad_norm": 0.9615311548799811, | |
| "learning_rate": 2.2339013423633083e-05, | |
| "loss": 0.8503, | |
| "mean_token_accuracy": 0.7499252796173096, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 0.5959168659187052, | |
| "grad_norm": 0.9086052926810453, | |
| "learning_rate": 2.2272434119912184e-05, | |
| "loss": 0.8754, | |
| "mean_token_accuracy": 0.7434251546859741, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.5968364907117896, | |
| "grad_norm": 0.9221742878259598, | |
| "learning_rate": 2.2205903166392113e-05, | |
| "loss": 0.8477, | |
| "mean_token_accuracy": 0.7485897660255432, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 0.5977561155048741, | |
| "grad_norm": 0.967041034869552, | |
| "learning_rate": 2.2139421178423307e-05, | |
| "loss": 0.8225, | |
| "mean_token_accuracy": 0.7570245742797852, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.5986757402979584, | |
| "grad_norm": 0.981067205830958, | |
| "learning_rate": 2.207298877090333e-05, | |
| "loss": 0.8701, | |
| "mean_token_accuracy": 0.7440281748771668, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 0.5995953650910428, | |
| "grad_norm": 0.989973298607582, | |
| "learning_rate": 2.2006606558271142e-05, | |
| "loss": 0.8713, | |
| "mean_token_accuracy": 0.7413482785224914, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.6005149898841273, | |
| "grad_norm": 0.8672144464089592, | |
| "learning_rate": 2.1940275154501482e-05, | |
| "loss": 0.87, | |
| "mean_token_accuracy": 0.743138313293457, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 0.6014346146772117, | |
| "grad_norm": 0.9653292378844739, | |
| "learning_rate": 2.187399517309914e-05, | |
| "loss": 0.8575, | |
| "mean_token_accuracy": 0.7464121103286743, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.6023542394702961, | |
| "grad_norm": 0.9239524199502155, | |
| "learning_rate": 2.1807767227093268e-05, | |
| "loss": 0.8236, | |
| "mean_token_accuracy": 0.7573307991027832, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.6032738642633806, | |
| "grad_norm": 0.9806975126747703, | |
| "learning_rate": 2.1741591929031795e-05, | |
| "loss": 0.878, | |
| "mean_token_accuracy": 0.7407856106758117, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.604193489056465, | |
| "grad_norm": 0.9640808408127749, | |
| "learning_rate": 2.167546989097566e-05, | |
| "loss": 0.8638, | |
| "mean_token_accuracy": 0.7459958910942077, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 0.6051131138495494, | |
| "grad_norm": 0.9656473527433518, | |
| "learning_rate": 2.16094017244932e-05, | |
| "loss": 0.8783, | |
| "mean_token_accuracy": 0.7419638872146607, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.6060327386426339, | |
| "grad_norm": 0.9930014003610543, | |
| "learning_rate": 2.154338804065451e-05, | |
| "loss": 0.8615, | |
| "mean_token_accuracy": 0.7456332087516785, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 0.6069523634357182, | |
| "grad_norm": 0.9330196848152268, | |
| "learning_rate": 2.1477429450025767e-05, | |
| "loss": 0.8352, | |
| "mean_token_accuracy": 0.7517044901847839, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.6078719882288026, | |
| "grad_norm": 0.8777553334567131, | |
| "learning_rate": 2.1411526562663554e-05, | |
| "loss": 0.8364, | |
| "mean_token_accuracy": 0.7501665949821472, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 0.6087916130218871, | |
| "grad_norm": 0.9315142599796349, | |
| "learning_rate": 2.1345679988109284e-05, | |
| "loss": 0.8378, | |
| "mean_token_accuracy": 0.7534802198410034, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.6097112378149715, | |
| "grad_norm": 0.9385962221597601, | |
| "learning_rate": 2.1279890335383534e-05, | |
| "loss": 0.8876, | |
| "mean_token_accuracy": 0.7398653388023376, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 0.6106308626080559, | |
| "grad_norm": 0.9451857651632474, | |
| "learning_rate": 2.1214158212980366e-05, | |
| "loss": 0.7988, | |
| "mean_token_accuracy": 0.7636669516563416, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.6115504874011404, | |
| "grad_norm": 0.9310680714278403, | |
| "learning_rate": 2.114848422886177e-05, | |
| "loss": 0.8417, | |
| "mean_token_accuracy": 0.7545873999595643, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.6124701121942248, | |
| "grad_norm": 0.9555284993925652, | |
| "learning_rate": 2.108286899045202e-05, | |
| "loss": 0.8906, | |
| "mean_token_accuracy": 0.7384588122367859, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.6133897369873091, | |
| "grad_norm": 0.9525478437560697, | |
| "learning_rate": 2.1017313104632003e-05, | |
| "loss": 0.844, | |
| "mean_token_accuracy": 0.7497392654418945, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 0.6143093617803936, | |
| "grad_norm": 0.9657934498214388, | |
| "learning_rate": 2.0951817177733684e-05, | |
| "loss": 0.8748, | |
| "mean_token_accuracy": 0.7426393389701843, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.615228986573478, | |
| "grad_norm": 0.9174407552166862, | |
| "learning_rate": 2.088638181553446e-05, | |
| "loss": 0.8727, | |
| "mean_token_accuracy": 0.742801570892334, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 0.6161486113665624, | |
| "grad_norm": 0.9106809477969502, | |
| "learning_rate": 2.0821007623251564e-05, | |
| "loss": 0.8227, | |
| "mean_token_accuracy": 0.7550573825836182, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.6170682361596469, | |
| "grad_norm": 0.8816231707997737, | |
| "learning_rate": 2.075569520553643e-05, | |
| "loss": 0.8066, | |
| "mean_token_accuracy": 0.7590124368667602, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 0.6179878609527313, | |
| "grad_norm": 0.9651791807712018, | |
| "learning_rate": 2.0690445166469158e-05, | |
| "loss": 0.8575, | |
| "mean_token_accuracy": 0.7481630921363831, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.6189074857458157, | |
| "grad_norm": 0.962161882798645, | |
| "learning_rate": 2.0625258109552926e-05, | |
| "loss": 0.8842, | |
| "mean_token_accuracy": 0.743985378742218, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 0.6198271105389002, | |
| "grad_norm": 0.955250281560398, | |
| "learning_rate": 2.0560134637708334e-05, | |
| "loss": 0.8413, | |
| "mean_token_accuracy": 0.7497357606887818, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.6207467353319845, | |
| "grad_norm": 1.0327175413319667, | |
| "learning_rate": 2.0495075353267913e-05, | |
| "loss": 0.8697, | |
| "mean_token_accuracy": 0.7445659875869751, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.6216663601250689, | |
| "grad_norm": 0.9525687098312168, | |
| "learning_rate": 2.043008085797052e-05, | |
| "loss": 0.8722, | |
| "mean_token_accuracy": 0.7410041093826294, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.6225859849181534, | |
| "grad_norm": 0.9275514977855014, | |
| "learning_rate": 2.036515175295574e-05, | |
| "loss": 0.8412, | |
| "mean_token_accuracy": 0.7507887959480286, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 0.6235056097112378, | |
| "grad_norm": 0.9493961658678648, | |
| "learning_rate": 2.03002886387584e-05, | |
| "loss": 0.8556, | |
| "mean_token_accuracy": 0.7469261646270752, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.6244252345043222, | |
| "grad_norm": 0.9292345545436532, | |
| "learning_rate": 2.0235492115302944e-05, | |
| "loss": 0.8301, | |
| "mean_token_accuracy": 0.7550871014595032, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 0.6253448592974067, | |
| "grad_norm": 0.9430411664378814, | |
| "learning_rate": 2.017076278189794e-05, | |
| "loss": 0.8321, | |
| "mean_token_accuracy": 0.7533326983451843, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.6262644840904911, | |
| "grad_norm": 0.8889521393845567, | |
| "learning_rate": 2.0106101237230455e-05, | |
| "loss": 0.8324, | |
| "mean_token_accuracy": 0.7539088129997253, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 0.6271841088835755, | |
| "grad_norm": 0.9180009901150891, | |
| "learning_rate": 2.0041508079360634e-05, | |
| "loss": 0.7898, | |
| "mean_token_accuracy": 0.761493980884552, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.62810373367666, | |
| "grad_norm": 0.9055995921329637, | |
| "learning_rate": 1.997698390571608e-05, | |
| "loss": 0.8419, | |
| "mean_token_accuracy": 0.7503387928009033, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 0.6290233584697443, | |
| "grad_norm": 0.9447591194939752, | |
| "learning_rate": 1.991252931308633e-05, | |
| "loss": 0.8692, | |
| "mean_token_accuracy": 0.7452242970466614, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.6299429832628287, | |
| "grad_norm": 0.9351426059072258, | |
| "learning_rate": 1.9848144897617417e-05, | |
| "loss": 0.8149, | |
| "mean_token_accuracy": 0.7568124055862426, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.6308626080559132, | |
| "grad_norm": 0.9168023134449134, | |
| "learning_rate": 1.9783831254806257e-05, | |
| "loss": 0.8157, | |
| "mean_token_accuracy": 0.7554953694343567, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.6317822328489976, | |
| "grad_norm": 1.027979530127791, | |
| "learning_rate": 1.971958897949518e-05, | |
| "loss": 0.8229, | |
| "mean_token_accuracy": 0.7550533413887024, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 0.632701857642082, | |
| "grad_norm": 0.8964633060914129, | |
| "learning_rate": 1.9655418665866465e-05, | |
| "loss": 0.7966, | |
| "mean_token_accuracy": 0.7639833688735962, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.6336214824351665, | |
| "grad_norm": 0.8702615238247585, | |
| "learning_rate": 1.9591320907436782e-05, | |
| "loss": 0.8502, | |
| "mean_token_accuracy": 0.74614177942276, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 0.6345411072282509, | |
| "grad_norm": 0.9157962896320851, | |
| "learning_rate": 1.9527296297051765e-05, | |
| "loss": 0.8026, | |
| "mean_token_accuracy": 0.758307683467865, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.6354607320213354, | |
| "grad_norm": 0.9465005665572019, | |
| "learning_rate": 1.9463345426880448e-05, | |
| "loss": 0.8036, | |
| "mean_token_accuracy": 0.7617629647254944, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 0.6363803568144197, | |
| "grad_norm": 0.9618417431183126, | |
| "learning_rate": 1.939946888840986e-05, | |
| "loss": 0.8819, | |
| "mean_token_accuracy": 0.7395693898200989, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.6372999816075041, | |
| "grad_norm": 0.9326022903907812, | |
| "learning_rate": 1.933566727243956e-05, | |
| "loss": 0.8384, | |
| "mean_token_accuracy": 0.7497618556022644, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 0.6382196064005886, | |
| "grad_norm": 0.942168299955769, | |
| "learning_rate": 1.927194116907608e-05, | |
| "loss": 0.8821, | |
| "mean_token_accuracy": 0.7422310829162597, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.639139231193673, | |
| "grad_norm": 0.930256851029374, | |
| "learning_rate": 1.9208291167727576e-05, | |
| "loss": 0.8293, | |
| "mean_token_accuracy": 0.7561385631561279, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.6400588559867574, | |
| "grad_norm": 0.8857746537604931, | |
| "learning_rate": 1.9144717857098328e-05, | |
| "loss": 0.8166, | |
| "mean_token_accuracy": 0.7583439826965332, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.6409784807798419, | |
| "grad_norm": 0.9519372824273006, | |
| "learning_rate": 1.908122182518326e-05, | |
| "loss": 0.8674, | |
| "mean_token_accuracy": 0.741856062412262, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 0.6418981055729263, | |
| "grad_norm": 0.9483959540274922, | |
| "learning_rate": 1.9017803659262583e-05, | |
| "loss": 0.8496, | |
| "mean_token_accuracy": 0.7491413950920105, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.6428177303660106, | |
| "grad_norm": 0.9729346329964175, | |
| "learning_rate": 1.8954463945896293e-05, | |
| "loss": 0.8554, | |
| "mean_token_accuracy": 0.7483752846717835, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 0.6437373551590951, | |
| "grad_norm": 0.910719020599245, | |
| "learning_rate": 1.889120327091879e-05, | |
| "loss": 0.8332, | |
| "mean_token_accuracy": 0.753311276435852, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.6446569799521795, | |
| "grad_norm": 0.8997078755147822, | |
| "learning_rate": 1.8828022219433413e-05, | |
| "loss": 0.8311, | |
| "mean_token_accuracy": 0.7538302779197693, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 0.6455766047452639, | |
| "grad_norm": 0.9097287217365273, | |
| "learning_rate": 1.8764921375807083e-05, | |
| "loss": 0.8573, | |
| "mean_token_accuracy": 0.74767564535141, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.6464962295383484, | |
| "grad_norm": 0.9420262116863728, | |
| "learning_rate": 1.8701901323664863e-05, | |
| "loss": 0.8551, | |
| "mean_token_accuracy": 0.7479906916618347, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 0.6474158543314328, | |
| "grad_norm": 0.9297816459092663, | |
| "learning_rate": 1.8638962645884565e-05, | |
| "loss": 0.8066, | |
| "mean_token_accuracy": 0.7580268263816834, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.6483354791245172, | |
| "grad_norm": 0.946031226164797, | |
| "learning_rate": 1.8576105924591357e-05, | |
| "loss": 0.8179, | |
| "mean_token_accuracy": 0.7542472004890441, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.6492551039176017, | |
| "grad_norm": 0.9036904422802344, | |
| "learning_rate": 1.8513331741152412e-05, | |
| "loss": 0.8261, | |
| "mean_token_accuracy": 0.7552783608436584, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.650174728710686, | |
| "grad_norm": 0.921905554132334, | |
| "learning_rate": 1.8450640676171472e-05, | |
| "loss": 0.8351, | |
| "mean_token_accuracy": 0.752598226070404, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 0.6510943535037704, | |
| "grad_norm": 1.0035005670649164, | |
| "learning_rate": 1.8388033309483522e-05, | |
| "loss": 0.8981, | |
| "mean_token_accuracy": 0.7371325850486755, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.6520139782968549, | |
| "grad_norm": 0.9724909600231612, | |
| "learning_rate": 1.8325510220149413e-05, | |
| "loss": 0.8327, | |
| "mean_token_accuracy": 0.751532518863678, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 0.6529336030899393, | |
| "grad_norm": 0.9664687506252672, | |
| "learning_rate": 1.8263071986450524e-05, | |
| "loss": 0.8336, | |
| "mean_token_accuracy": 0.7516280770301819, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.6538532278830237, | |
| "grad_norm": 0.9164445815967506, | |
| "learning_rate": 1.8200719185883358e-05, | |
| "loss": 0.8316, | |
| "mean_token_accuracy": 0.7544404864311218, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 0.6547728526761082, | |
| "grad_norm": 0.9293565126179983, | |
| "learning_rate": 1.813845239515427e-05, | |
| "loss": 0.8257, | |
| "mean_token_accuracy": 0.7552899837493896, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.6556924774691926, | |
| "grad_norm": 0.9010810987925738, | |
| "learning_rate": 1.8076272190174115e-05, | |
| "loss": 0.8201, | |
| "mean_token_accuracy": 0.7565722703933716, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 0.656612102262277, | |
| "grad_norm": 1.0075745989661558, | |
| "learning_rate": 1.801417914605286e-05, | |
| "loss": 0.869, | |
| "mean_token_accuracy": 0.7453143835067749, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.6575317270553614, | |
| "grad_norm": 0.935586367301874, | |
| "learning_rate": 1.795217383709437e-05, | |
| "loss": 0.8845, | |
| "mean_token_accuracy": 0.7403179168701172, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.6584513518484458, | |
| "grad_norm": 0.9872971011864189, | |
| "learning_rate": 1.7890256836791008e-05, | |
| "loss": 0.8052, | |
| "mean_token_accuracy": 0.7629344463348389, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.6593709766415302, | |
| "grad_norm": 0.9876503263464145, | |
| "learning_rate": 1.7828428717818353e-05, | |
| "loss": 0.8135, | |
| "mean_token_accuracy": 0.7590724229812622, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 0.6602906014346147, | |
| "grad_norm": 0.8811578706911977, | |
| "learning_rate": 1.7766690052029944e-05, | |
| "loss": 0.8221, | |
| "mean_token_accuracy": 0.7560603976249695, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.6612102262276991, | |
| "grad_norm": 0.9719326557742581, | |
| "learning_rate": 1.770504141045194e-05, | |
| "loss": 0.8342, | |
| "mean_token_accuracy": 0.7510559558868408, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 0.6621298510207835, | |
| "grad_norm": 1.0132470520749903, | |
| "learning_rate": 1.7643483363277874e-05, | |
| "loss": 0.8487, | |
| "mean_token_accuracy": 0.7500616908073425, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.663049475813868, | |
| "grad_norm": 1.0318932699213554, | |
| "learning_rate": 1.7582016479863327e-05, | |
| "loss": 0.8487, | |
| "mean_token_accuracy": 0.7490703582763671, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 0.6639691006069524, | |
| "grad_norm": 0.8658023921332224, | |
| "learning_rate": 1.7520641328720756e-05, | |
| "loss": 0.8238, | |
| "mean_token_accuracy": 0.7564070224761963, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.6648887254000367, | |
| "grad_norm": 0.9750052383478849, | |
| "learning_rate": 1.7459358477514122e-05, | |
| "loss": 0.8249, | |
| "mean_token_accuracy": 0.7549832344055176, | |
| "step": 3615 | |
| }, | |
| { | |
| "epoch": 0.6658083501931212, | |
| "grad_norm": 0.957114636285714, | |
| "learning_rate": 1.7398168493053723e-05, | |
| "loss": 0.7881, | |
| "mean_token_accuracy": 0.7615378856658935, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.6667279749862056, | |
| "grad_norm": 0.9148381033348181, | |
| "learning_rate": 1.7337071941290944e-05, | |
| "loss": 0.8196, | |
| "mean_token_accuracy": 0.7577734112739563, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.66764759977929, | |
| "grad_norm": 0.9583843198631806, | |
| "learning_rate": 1.7276069387312955e-05, | |
| "loss": 0.9, | |
| "mean_token_accuracy": 0.7367844343185425, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.6685672245723745, | |
| "grad_norm": 0.9525242256598431, | |
| "learning_rate": 1.7215161395337572e-05, | |
| "loss": 0.8351, | |
| "mean_token_accuracy": 0.7536734580993653, | |
| "step": 3635 | |
| }, | |
| { | |
| "epoch": 0.6694868493654589, | |
| "grad_norm": 0.9218486580963495, | |
| "learning_rate": 1.7154348528707992e-05, | |
| "loss": 0.8512, | |
| "mean_token_accuracy": 0.7513302564620972, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.6704064741585433, | |
| "grad_norm": 0.9497350819436411, | |
| "learning_rate": 1.709363134988757e-05, | |
| "loss": 0.8522, | |
| "mean_token_accuracy": 0.747953987121582, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 0.6713260989516278, | |
| "grad_norm": 0.9359833703344925, | |
| "learning_rate": 1.7033010420454655e-05, | |
| "loss": 0.8091, | |
| "mean_token_accuracy": 0.7576663970947266, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.6722457237447121, | |
| "grad_norm": 0.9884296155896105, | |
| "learning_rate": 1.6972486301097376e-05, | |
| "loss": 0.8185, | |
| "mean_token_accuracy": 0.7578543424606323, | |
| "step": 3655 | |
| }, | |
| { | |
| "epoch": 0.6731653485377965, | |
| "grad_norm": 0.885165473016121, | |
| "learning_rate": 1.691205955160845e-05, | |
| "loss": 0.8461, | |
| "mean_token_accuracy": 0.7491200208663941, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.674084973330881, | |
| "grad_norm": 0.9715821597591158, | |
| "learning_rate": 1.6851730730880012e-05, | |
| "loss": 0.8527, | |
| "mean_token_accuracy": 0.7483757376670838, | |
| "step": 3665 | |
| }, | |
| { | |
| "epoch": 0.6750045981239654, | |
| "grad_norm": 0.8871437133597592, | |
| "learning_rate": 1.679150039689846e-05, | |
| "loss": 0.8148, | |
| "mean_token_accuracy": 0.7578411340713501, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.6759242229170498, | |
| "grad_norm": 0.9530586600231223, | |
| "learning_rate": 1.673136910673926e-05, | |
| "loss": 0.8645, | |
| "mean_token_accuracy": 0.7451423764228821, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 0.6768438477101343, | |
| "grad_norm": 0.9427729850229866, | |
| "learning_rate": 1.6671337416561817e-05, | |
| "loss": 0.8432, | |
| "mean_token_accuracy": 0.7509079575538635, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.6777634725032187, | |
| "grad_norm": 0.9325142143827265, | |
| "learning_rate": 1.661140588160435e-05, | |
| "loss": 0.8347, | |
| "mean_token_accuracy": 0.7516968011856079, | |
| "step": 3685 | |
| }, | |
| { | |
| "epoch": 0.6786830972963032, | |
| "grad_norm": 0.9601757924065347, | |
| "learning_rate": 1.6551575056178695e-05, | |
| "loss": 0.8166, | |
| "mean_token_accuracy": 0.7589465737342834, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.6796027220893875, | |
| "grad_norm": 1.0086779966517565, | |
| "learning_rate": 1.649184549366525e-05, | |
| "loss": 0.8395, | |
| "mean_token_accuracy": 0.7520246505737305, | |
| "step": 3695 | |
| }, | |
| { | |
| "epoch": 0.6805223468824719, | |
| "grad_norm": 0.9707009645804029, | |
| "learning_rate": 1.6432217746507814e-05, | |
| "loss": 0.8382, | |
| "mean_token_accuracy": 0.7533354997634888, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.6814419716755564, | |
| "grad_norm": 0.9109669918450888, | |
| "learning_rate": 1.6372692366208476e-05, | |
| "loss": 0.8186, | |
| "mean_token_accuracy": 0.7560298204421997, | |
| "step": 3705 | |
| }, | |
| { | |
| "epoch": 0.6823615964686408, | |
| "grad_norm": 0.931556246223817, | |
| "learning_rate": 1.6313269903322536e-05, | |
| "loss": 0.8682, | |
| "mean_token_accuracy": 0.7464072823524475, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.6832812212617252, | |
| "grad_norm": 0.9316943141031991, | |
| "learning_rate": 1.6253950907453414e-05, | |
| "loss": 0.7891, | |
| "mean_token_accuracy": 0.7643645644187927, | |
| "step": 3715 | |
| }, | |
| { | |
| "epoch": 0.6842008460548097, | |
| "grad_norm": 0.9367407375514984, | |
| "learning_rate": 1.619473592724752e-05, | |
| "loss": 0.8489, | |
| "mean_token_accuracy": 0.7488224864006042, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.6851204708478941, | |
| "grad_norm": 0.96189736553831, | |
| "learning_rate": 1.613562551038925e-05, | |
| "loss": 0.7964, | |
| "mean_token_accuracy": 0.7625237464904785, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 0.6860400956409785, | |
| "grad_norm": 0.9170890141555628, | |
| "learning_rate": 1.607662020359587e-05, | |
| "loss": 0.8404, | |
| "mean_token_accuracy": 0.7529777765274048, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.686959720434063, | |
| "grad_norm": 0.9456438498787428, | |
| "learning_rate": 1.6017720552612462e-05, | |
| "loss": 0.8036, | |
| "mean_token_accuracy": 0.7614395618438721, | |
| "step": 3735 | |
| }, | |
| { | |
| "epoch": 0.6878793452271473, | |
| "grad_norm": 0.9544770877536788, | |
| "learning_rate": 1.595892710220691e-05, | |
| "loss": 0.8413, | |
| "mean_token_accuracy": 0.7519929647445679, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.6887989700202317, | |
| "grad_norm": 1.022115954707187, | |
| "learning_rate": 1.5900240396164835e-05, | |
| "loss": 0.8612, | |
| "mean_token_accuracy": 0.747264850139618, | |
| "step": 3745 | |
| }, | |
| { | |
| "epoch": 0.6897185948133162, | |
| "grad_norm": 0.9476824745559427, | |
| "learning_rate": 1.584166097728455e-05, | |
| "loss": 0.847, | |
| "mean_token_accuracy": 0.7491350531578064, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.6906382196064006, | |
| "grad_norm": 0.8827290010499629, | |
| "learning_rate": 1.578318938737209e-05, | |
| "loss": 0.8284, | |
| "mean_token_accuracy": 0.7547004818916321, | |
| "step": 3755 | |
| }, | |
| { | |
| "epoch": 0.691557844399485, | |
| "grad_norm": 0.9009975487421323, | |
| "learning_rate": 1.5724826167236146e-05, | |
| "loss": 0.8214, | |
| "mean_token_accuracy": 0.7568115711212158, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.6924774691925695, | |
| "grad_norm": 0.9187149873785133, | |
| "learning_rate": 1.5666571856683116e-05, | |
| "loss": 0.827, | |
| "mean_token_accuracy": 0.7550323009490967, | |
| "step": 3765 | |
| }, | |
| { | |
| "epoch": 0.6933970939856539, | |
| "grad_norm": 0.9280641474823987, | |
| "learning_rate": 1.560842699451204e-05, | |
| "loss": 0.7616, | |
| "mean_token_accuracy": 0.7714649677276612, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.6943167187787382, | |
| "grad_norm": 0.9038372482824055, | |
| "learning_rate": 1.5550392118509705e-05, | |
| "loss": 0.8028, | |
| "mean_token_accuracy": 0.760212504863739, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 0.6952363435718227, | |
| "grad_norm": 0.9201432901179558, | |
| "learning_rate": 1.5492467765445613e-05, | |
| "loss": 0.8241, | |
| "mean_token_accuracy": 0.754262363910675, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.6961559683649071, | |
| "grad_norm": 0.9031896471527984, | |
| "learning_rate": 1.5434654471067007e-05, | |
| "loss": 0.8078, | |
| "mean_token_accuracy": 0.7623116612434387, | |
| "step": 3785 | |
| }, | |
| { | |
| "epoch": 0.6970755931579915, | |
| "grad_norm": 0.928442088214151, | |
| "learning_rate": 1.537695277009396e-05, | |
| "loss": 0.8667, | |
| "mean_token_accuracy": 0.7442408680915833, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.697995217951076, | |
| "grad_norm": 0.9545685310758198, | |
| "learning_rate": 1.5319363196214427e-05, | |
| "loss": 0.8147, | |
| "mean_token_accuracy": 0.757679283618927, | |
| "step": 3795 | |
| }, | |
| { | |
| "epoch": 0.6989148427441604, | |
| "grad_norm": 0.957997913837239, | |
| "learning_rate": 1.526188628207924e-05, | |
| "loss": 0.8674, | |
| "mean_token_accuracy": 0.7406766414642334, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6998344675372448, | |
| "grad_norm": 0.907233770113165, | |
| "learning_rate": 1.5204522559297275e-05, | |
| "loss": 0.8228, | |
| "mean_token_accuracy": 0.7550997257232666, | |
| "step": 3805 | |
| }, | |
| { | |
| "epoch": 0.7007540923303293, | |
| "grad_norm": 0.9753264400407652, | |
| "learning_rate": 1.5147272558430472e-05, | |
| "loss": 0.812, | |
| "mean_token_accuracy": 0.7584111213684082, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.7016737171234136, | |
| "grad_norm": 0.898583550613599, | |
| "learning_rate": 1.509013680898896e-05, | |
| "loss": 0.814, | |
| "mean_token_accuracy": 0.7574291110038758, | |
| "step": 3815 | |
| }, | |
| { | |
| "epoch": 0.702593341916498, | |
| "grad_norm": 0.9245046858803572, | |
| "learning_rate": 1.5033115839426127e-05, | |
| "loss": 0.8002, | |
| "mean_token_accuracy": 0.7631544828414917, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.7035129667095825, | |
| "grad_norm": 0.9501909113953771, | |
| "learning_rate": 1.4976210177133764e-05, | |
| "loss": 0.8284, | |
| "mean_token_accuracy": 0.7537835121154786, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 0.7044325915026669, | |
| "grad_norm": 0.9118736011138947, | |
| "learning_rate": 1.4919420348437189e-05, | |
| "loss": 0.8637, | |
| "mean_token_accuracy": 0.746515440940857, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.7053522162957513, | |
| "grad_norm": 0.9346208775326443, | |
| "learning_rate": 1.4862746878590329e-05, | |
| "loss": 0.8325, | |
| "mean_token_accuracy": 0.7536684751510621, | |
| "step": 3835 | |
| }, | |
| { | |
| "epoch": 0.7062718410888358, | |
| "grad_norm": 0.9644025251262837, | |
| "learning_rate": 1.4806190291770932e-05, | |
| "loss": 0.9199, | |
| "mean_token_accuracy": 0.728544807434082, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.7071914658819202, | |
| "grad_norm": 0.9316658230434494, | |
| "learning_rate": 1.4749751111075682e-05, | |
| "loss": 0.8478, | |
| "mean_token_accuracy": 0.7476451396942139, | |
| "step": 3845 | |
| }, | |
| { | |
| "epoch": 0.7081110906750046, | |
| "grad_norm": 0.8593875878005443, | |
| "learning_rate": 1.469342985851534e-05, | |
| "loss": 0.7931, | |
| "mean_token_accuracy": 0.7640434741973877, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.709030715468089, | |
| "grad_norm": 0.9379422901278587, | |
| "learning_rate": 1.4637227055009962e-05, | |
| "loss": 0.8228, | |
| "mean_token_accuracy": 0.7573190450668335, | |
| "step": 3855 | |
| }, | |
| { | |
| "epoch": 0.7099503402611734, | |
| "grad_norm": 0.9026485371540945, | |
| "learning_rate": 1.4581143220384047e-05, | |
| "loss": 0.82, | |
| "mean_token_accuracy": 0.756511640548706, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.7108699650542578, | |
| "grad_norm": 0.9796042273923296, | |
| "learning_rate": 1.4525178873361756e-05, | |
| "loss": 0.8242, | |
| "mean_token_accuracy": 0.7555618524551392, | |
| "step": 3865 | |
| }, | |
| { | |
| "epoch": 0.7117895898473423, | |
| "grad_norm": 0.9383990549827186, | |
| "learning_rate": 1.4469334531562067e-05, | |
| "loss": 0.8448, | |
| "mean_token_accuracy": 0.7482100129127502, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.7127092146404267, | |
| "grad_norm": 0.9602931261847705, | |
| "learning_rate": 1.4413610711494058e-05, | |
| "loss": 0.8365, | |
| "mean_token_accuracy": 0.7580392360687256, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 0.7136288394335111, | |
| "grad_norm": 0.943240285031073, | |
| "learning_rate": 1.4358007928552075e-05, | |
| "loss": 0.7861, | |
| "mean_token_accuracy": 0.7667181611061096, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.7145484642265956, | |
| "grad_norm": 0.9447898247986761, | |
| "learning_rate": 1.4302526697010964e-05, | |
| "loss": 0.8078, | |
| "mean_token_accuracy": 0.7595344543457031, | |
| "step": 3885 | |
| }, | |
| { | |
| "epoch": 0.71546808901968, | |
| "grad_norm": 0.9841983235190546, | |
| "learning_rate": 1.424716753002136e-05, | |
| "loss": 0.8597, | |
| "mean_token_accuracy": 0.7481236219406128, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.7163877138127643, | |
| "grad_norm": 0.9684153403690037, | |
| "learning_rate": 1.4191930939604908e-05, | |
| "loss": 0.8117, | |
| "mean_token_accuracy": 0.7613986849784851, | |
| "step": 3895 | |
| }, | |
| { | |
| "epoch": 0.7173073386058488, | |
| "grad_norm": 0.996877698893722, | |
| "learning_rate": 1.4136817436649502e-05, | |
| "loss": 0.8766, | |
| "mean_token_accuracy": 0.738961935043335, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.7182269633989332, | |
| "grad_norm": 0.9051545491177592, | |
| "learning_rate": 1.4081827530904624e-05, | |
| "loss": 0.8445, | |
| "mean_token_accuracy": 0.749999487400055, | |
| "step": 3905 | |
| }, | |
| { | |
| "epoch": 0.7191465881920177, | |
| "grad_norm": 0.9684927881965169, | |
| "learning_rate": 1.4026961730976584e-05, | |
| "loss": 0.8209, | |
| "mean_token_accuracy": 0.7576812863349914, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.7200662129851021, | |
| "grad_norm": 0.9610042841526357, | |
| "learning_rate": 1.3972220544323832e-05, | |
| "loss": 0.8131, | |
| "mean_token_accuracy": 0.7582221627235413, | |
| "step": 3915 | |
| }, | |
| { | |
| "epoch": 0.7209858377781865, | |
| "grad_norm": 0.9412320092723402, | |
| "learning_rate": 1.3917604477252238e-05, | |
| "loss": 0.7937, | |
| "mean_token_accuracy": 0.7617234110832214, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.721905462571271, | |
| "grad_norm": 0.9321659094215312, | |
| "learning_rate": 1.3863114034910452e-05, | |
| "loss": 0.8156, | |
| "mean_token_accuracy": 0.7598451256752015, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 0.7228250873643554, | |
| "grad_norm": 0.956577146254236, | |
| "learning_rate": 1.3808749721285214e-05, | |
| "loss": 0.8107, | |
| "mean_token_accuracy": 0.757847785949707, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.7237447121574397, | |
| "grad_norm": 0.9139917904820034, | |
| "learning_rate": 1.3754512039196658e-05, | |
| "loss": 0.8754, | |
| "mean_token_accuracy": 0.7391230940818787, | |
| "step": 3935 | |
| }, | |
| { | |
| "epoch": 0.7246643369505242, | |
| "grad_norm": 0.92757564731535, | |
| "learning_rate": 1.3700401490293718e-05, | |
| "loss": 0.8193, | |
| "mean_token_accuracy": 0.7570781588554383, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.7255839617436086, | |
| "grad_norm": 0.9533935473757719, | |
| "learning_rate": 1.3646418575049475e-05, | |
| "loss": 0.8244, | |
| "mean_token_accuracy": 0.756612241268158, | |
| "step": 3945 | |
| }, | |
| { | |
| "epoch": 0.726503586536693, | |
| "grad_norm": 0.9319033478082173, | |
| "learning_rate": 1.3592563792756468e-05, | |
| "loss": 0.7994, | |
| "mean_token_accuracy": 0.7616767644882202, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.7274232113297775, | |
| "grad_norm": 0.9659322616790049, | |
| "learning_rate": 1.3538837641522172e-05, | |
| "loss": 0.776, | |
| "mean_token_accuracy": 0.7666900753974915, | |
| "step": 3955 | |
| }, | |
| { | |
| "epoch": 0.7283428361228619, | |
| "grad_norm": 0.9715937702004781, | |
| "learning_rate": 1.3485240618264322e-05, | |
| "loss": 0.8707, | |
| "mean_token_accuracy": 0.742601501941681, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.7292624609159463, | |
| "grad_norm": 0.9279423695840053, | |
| "learning_rate": 1.3431773218706336e-05, | |
| "loss": 0.8435, | |
| "mean_token_accuracy": 0.7503429889678955, | |
| "step": 3965 | |
| }, | |
| { | |
| "epoch": 0.7301820857090308, | |
| "grad_norm": 0.9826978876425828, | |
| "learning_rate": 1.3378435937372729e-05, | |
| "loss": 0.8609, | |
| "mean_token_accuracy": 0.7491580963134765, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.7311017105021151, | |
| "grad_norm": 0.9333913123309906, | |
| "learning_rate": 1.3325229267584549e-05, | |
| "loss": 0.8771, | |
| "mean_token_accuracy": 0.7425579071044922, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 0.7320213352951995, | |
| "grad_norm": 0.9125063830711305, | |
| "learning_rate": 1.3272153701454809e-05, | |
| "loss": 0.8086, | |
| "mean_token_accuracy": 0.7603332042694092, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.732940960088284, | |
| "grad_norm": 0.9868481200984651, | |
| "learning_rate": 1.3219209729883918e-05, | |
| "loss": 0.7879, | |
| "mean_token_accuracy": 0.7675115823745727, | |
| "step": 3985 | |
| }, | |
| { | |
| "epoch": 0.7338605848813684, | |
| "grad_norm": 0.9006549103315062, | |
| "learning_rate": 1.3166397842555175e-05, | |
| "loss": 0.7923, | |
| "mean_token_accuracy": 0.7659124851226806, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.7347802096744528, | |
| "grad_norm": 0.9128416767290051, | |
| "learning_rate": 1.3113718527930214e-05, | |
| "loss": 0.8363, | |
| "mean_token_accuracy": 0.751650869846344, | |
| "step": 3995 | |
| }, | |
| { | |
| "epoch": 0.7356998344675373, | |
| "grad_norm": 0.93586974280188, | |
| "learning_rate": 1.3061172273244477e-05, | |
| "loss": 0.8634, | |
| "mean_token_accuracy": 0.7428792953491211, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.7366194592606217, | |
| "grad_norm": 0.9865948469992011, | |
| "learning_rate": 1.3008759564502742e-05, | |
| "loss": 0.8627, | |
| "mean_token_accuracy": 0.7454355955123901, | |
| "step": 4005 | |
| }, | |
| { | |
| "epoch": 0.737539084053706, | |
| "grad_norm": 0.9395366278250679, | |
| "learning_rate": 1.2956480886474609e-05, | |
| "loss": 0.8408, | |
| "mean_token_accuracy": 0.7488868713378907, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.7384587088467905, | |
| "grad_norm": 0.9259161411169768, | |
| "learning_rate": 1.2904336722690013e-05, | |
| "loss": 0.8474, | |
| "mean_token_accuracy": 0.7509873270988464, | |
| "step": 4015 | |
| }, | |
| { | |
| "epoch": 0.7393783336398749, | |
| "grad_norm": 0.8982963261004637, | |
| "learning_rate": 1.2852327555434743e-05, | |
| "loss": 0.8272, | |
| "mean_token_accuracy": 0.7562850832939148, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.7402979584329593, | |
| "grad_norm": 0.9145268063018638, | |
| "learning_rate": 1.280045386574601e-05, | |
| "loss": 0.7964, | |
| "mean_token_accuracy": 0.7601189255714417, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 0.7412175832260438, | |
| "grad_norm": 0.9417030319528836, | |
| "learning_rate": 1.2748716133407985e-05, | |
| "loss": 0.8243, | |
| "mean_token_accuracy": 0.7563821077346802, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.7421372080191282, | |
| "grad_norm": 0.9170391844634309, | |
| "learning_rate": 1.269711483694733e-05, | |
| "loss": 0.8071, | |
| "mean_token_accuracy": 0.7610970735549927, | |
| "step": 4035 | |
| }, | |
| { | |
| "epoch": 0.7430568328122126, | |
| "grad_norm": 0.927700931925603, | |
| "learning_rate": 1.264565045362883e-05, | |
| "loss": 0.83, | |
| "mean_token_accuracy": 0.7542360424995422, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.7439764576052971, | |
| "grad_norm": 0.902718257172033, | |
| "learning_rate": 1.259432345945094e-05, | |
| "loss": 0.8026, | |
| "mean_token_accuracy": 0.7602586507797241, | |
| "step": 4045 | |
| }, | |
| { | |
| "epoch": 0.7448960823983815, | |
| "grad_norm": 0.9732168765607019, | |
| "learning_rate": 1.2543134329141382e-05, | |
| "loss": 0.8166, | |
| "mean_token_accuracy": 0.7585108041763305, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.7458157071914658, | |
| "grad_norm": 0.9466993086607015, | |
| "learning_rate": 1.2492083536152772e-05, | |
| "loss": 0.8169, | |
| "mean_token_accuracy": 0.758376932144165, | |
| "step": 4055 | |
| }, | |
| { | |
| "epoch": 0.7467353319845503, | |
| "grad_norm": 0.9757475911083087, | |
| "learning_rate": 1.2441171552658228e-05, | |
| "loss": 0.8389, | |
| "mean_token_accuracy": 0.7498653650283813, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.7476549567776347, | |
| "grad_norm": 0.9151481291254611, | |
| "learning_rate": 1.2390398849547023e-05, | |
| "loss": 0.8006, | |
| "mean_token_accuracy": 0.7613858461380005, | |
| "step": 4065 | |
| }, | |
| { | |
| "epoch": 0.7485745815707191, | |
| "grad_norm": 0.8890653066533022, | |
| "learning_rate": 1.2339765896420178e-05, | |
| "loss": 0.8404, | |
| "mean_token_accuracy": 0.7510004043579102, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.7494942063638036, | |
| "grad_norm": 0.9533182704017102, | |
| "learning_rate": 1.2289273161586194e-05, | |
| "loss": 0.8234, | |
| "mean_token_accuracy": 0.7551814436912536, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 0.750413831156888, | |
| "grad_norm": 0.9407240854533703, | |
| "learning_rate": 1.2238921112056663e-05, | |
| "loss": 0.8635, | |
| "mean_token_accuracy": 0.7466271042823791, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.7513334559499724, | |
| "grad_norm": 0.8895247933273808, | |
| "learning_rate": 1.2188710213541957e-05, | |
| "loss": 0.8332, | |
| "mean_token_accuracy": 0.752234959602356, | |
| "step": 4085 | |
| }, | |
| { | |
| "epoch": 0.7522530807430569, | |
| "grad_norm": 0.9353802672482648, | |
| "learning_rate": 1.213864093044695e-05, | |
| "loss": 0.8448, | |
| "mean_token_accuracy": 0.7497453451156616, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.7531727055361412, | |
| "grad_norm": 0.946809122144392, | |
| "learning_rate": 1.2088713725866696e-05, | |
| "loss": 0.8088, | |
| "mean_token_accuracy": 0.758155906200409, | |
| "step": 4095 | |
| }, | |
| { | |
| "epoch": 0.7540923303292256, | |
| "grad_norm": 0.9340815348568988, | |
| "learning_rate": 1.203892906158214e-05, | |
| "loss": 0.8525, | |
| "mean_token_accuracy": 0.7470645427703857, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.7550119551223101, | |
| "grad_norm": 0.9903725518055015, | |
| "learning_rate": 1.1989287398055874e-05, | |
| "loss": 0.8406, | |
| "mean_token_accuracy": 0.7499817609786987, | |
| "step": 4105 | |
| }, | |
| { | |
| "epoch": 0.7559315799153945, | |
| "grad_norm": 0.9005006268013445, | |
| "learning_rate": 1.193978919442787e-05, | |
| "loss": 0.833, | |
| "mean_token_accuracy": 0.7508885979652404, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.7568512047084789, | |
| "grad_norm": 0.922000222155766, | |
| "learning_rate": 1.1890434908511212e-05, | |
| "loss": 0.8256, | |
| "mean_token_accuracy": 0.7544254660606384, | |
| "step": 4115 | |
| }, | |
| { | |
| "epoch": 0.7577708295015634, | |
| "grad_norm": 0.9147121717124462, | |
| "learning_rate": 1.1841224996787876e-05, | |
| "loss": 0.8119, | |
| "mean_token_accuracy": 0.7572540044784546, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.7586904542946478, | |
| "grad_norm": 0.9401032528457242, | |
| "learning_rate": 1.1792159914404518e-05, | |
| "loss": 0.8389, | |
| "mean_token_accuracy": 0.7547949194908142, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 0.7596100790877323, | |
| "grad_norm": 0.899746427074481, | |
| "learning_rate": 1.1743240115168262e-05, | |
| "loss": 0.8104, | |
| "mean_token_accuracy": 0.7588290691375732, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.7605297038808166, | |
| "grad_norm": 0.9377432106115406, | |
| "learning_rate": 1.1694466051542473e-05, | |
| "loss": 0.8155, | |
| "mean_token_accuracy": 0.7565756559371948, | |
| "step": 4135 | |
| }, | |
| { | |
| "epoch": 0.761449328673901, | |
| "grad_norm": 0.9436429623996605, | |
| "learning_rate": 1.1645838174642614e-05, | |
| "loss": 0.8167, | |
| "mean_token_accuracy": 0.7574901819229126, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.7623689534669855, | |
| "grad_norm": 0.9163014099905564, | |
| "learning_rate": 1.1597356934232053e-05, | |
| "loss": 0.8518, | |
| "mean_token_accuracy": 0.7465153455734252, | |
| "step": 4145 | |
| }, | |
| { | |
| "epoch": 0.7632885782600699, | |
| "grad_norm": 0.8716564591657281, | |
| "learning_rate": 1.1549022778717888e-05, | |
| "loss": 0.8572, | |
| "mean_token_accuracy": 0.7444779276847839, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.7642082030531543, | |
| "grad_norm": 0.9408396749893937, | |
| "learning_rate": 1.1500836155146839e-05, | |
| "loss": 0.83, | |
| "mean_token_accuracy": 0.7533326983451843, | |
| "step": 4155 | |
| }, | |
| { | |
| "epoch": 0.7651278278462388, | |
| "grad_norm": 0.9335839862612282, | |
| "learning_rate": 1.1452797509201083e-05, | |
| "loss": 0.8751, | |
| "mean_token_accuracy": 0.7398134231567383, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.7660474526393232, | |
| "grad_norm": 0.9850624435923674, | |
| "learning_rate": 1.1404907285194125e-05, | |
| "loss": 0.8523, | |
| "mean_token_accuracy": 0.7461954593658447, | |
| "step": 4165 | |
| }, | |
| { | |
| "epoch": 0.7669670774324076, | |
| "grad_norm": 0.9679449146346353, | |
| "learning_rate": 1.1357165926066716e-05, | |
| "loss": 0.7892, | |
| "mean_token_accuracy": 0.7605505466461182, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.767886702225492, | |
| "grad_norm": 0.9416265509404674, | |
| "learning_rate": 1.130957387338275e-05, | |
| "loss": 0.8221, | |
| "mean_token_accuracy": 0.7559242844581604, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 0.7688063270185764, | |
| "grad_norm": 0.909615601406411, | |
| "learning_rate": 1.1262131567325163e-05, | |
| "loss": 0.8357, | |
| "mean_token_accuracy": 0.7517993927001954, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.7697259518116608, | |
| "grad_norm": 0.9047722281799156, | |
| "learning_rate": 1.1214839446691869e-05, | |
| "loss": 0.8032, | |
| "mean_token_accuracy": 0.7601001501083374, | |
| "step": 4185 | |
| }, | |
| { | |
| "epoch": 0.7706455766047453, | |
| "grad_norm": 0.9246634008625312, | |
| "learning_rate": 1.1167697948891707e-05, | |
| "loss": 0.8249, | |
| "mean_token_accuracy": 0.7536085605621338, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.7715652013978297, | |
| "grad_norm": 0.9460638804791452, | |
| "learning_rate": 1.1120707509940403e-05, | |
| "loss": 0.8167, | |
| "mean_token_accuracy": 0.7593476176261902, | |
| "step": 4195 | |
| }, | |
| { | |
| "epoch": 0.7724848261909141, | |
| "grad_norm": 0.9221593736048895, | |
| "learning_rate": 1.1073868564456503e-05, | |
| "loss": 0.845, | |
| "mean_token_accuracy": 0.7480282187461853, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.7734044509839986, | |
| "grad_norm": 0.8888076192030434, | |
| "learning_rate": 1.1027181545657403e-05, | |
| "loss": 0.7794, | |
| "mean_token_accuracy": 0.76693354845047, | |
| "step": 4205 | |
| }, | |
| { | |
| "epoch": 0.774324075777083, | |
| "grad_norm": 0.8891810327123515, | |
| "learning_rate": 1.0980646885355313e-05, | |
| "loss": 0.7885, | |
| "mean_token_accuracy": 0.7628621697425843, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.7752437005701673, | |
| "grad_norm": 0.9743526817712896, | |
| "learning_rate": 1.0934265013953239e-05, | |
| "loss": 0.8478, | |
| "mean_token_accuracy": 0.7504450678825378, | |
| "step": 4215 | |
| }, | |
| { | |
| "epoch": 0.7761633253632518, | |
| "grad_norm": 0.9143999464853897, | |
| "learning_rate": 1.0888036360441066e-05, | |
| "loss": 0.8059, | |
| "mean_token_accuracy": 0.7603421926498413, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.7770829501563362, | |
| "grad_norm": 0.9734913517153475, | |
| "learning_rate": 1.0841961352391522e-05, | |
| "loss": 0.8159, | |
| "mean_token_accuracy": 0.7574024796485901, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 0.7780025749494206, | |
| "grad_norm": 0.935773373300799, | |
| "learning_rate": 1.079604041595628e-05, | |
| "loss": 0.8562, | |
| "mean_token_accuracy": 0.7468973875045777, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.7789221997425051, | |
| "grad_norm": 0.9031689337704597, | |
| "learning_rate": 1.075027397586198e-05, | |
| "loss": 0.8165, | |
| "mean_token_accuracy": 0.7566033601760864, | |
| "step": 4235 | |
| }, | |
| { | |
| "epoch": 0.7798418245355895, | |
| "grad_norm": 0.9138920947374664, | |
| "learning_rate": 1.0704662455406309e-05, | |
| "loss": 0.8137, | |
| "mean_token_accuracy": 0.7558243870735168, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.7807614493286739, | |
| "grad_norm": 0.942480721965923, | |
| "learning_rate": 1.06592062764541e-05, | |
| "loss": 0.8103, | |
| "mean_token_accuracy": 0.7595886349678039, | |
| "step": 4245 | |
| }, | |
| { | |
| "epoch": 0.7816810741217584, | |
| "grad_norm": 0.8995689595482391, | |
| "learning_rate": 1.0613905859433412e-05, | |
| "loss": 0.8158, | |
| "mean_token_accuracy": 0.7546827673912049, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.7826006989148427, | |
| "grad_norm": 0.8666864815369382, | |
| "learning_rate": 1.0568761623331642e-05, | |
| "loss": 0.8082, | |
| "mean_token_accuracy": 0.7590071558952332, | |
| "step": 4255 | |
| }, | |
| { | |
| "epoch": 0.7835203237079271, | |
| "grad_norm": 0.9696655409923509, | |
| "learning_rate": 1.0523773985691673e-05, | |
| "loss": 0.8556, | |
| "mean_token_accuracy": 0.7452132105827332, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.7844399485010116, | |
| "grad_norm": 0.9833829005536767, | |
| "learning_rate": 1.0478943362607984e-05, | |
| "loss": 0.8586, | |
| "mean_token_accuracy": 0.7462344169616699, | |
| "step": 4265 | |
| }, | |
| { | |
| "epoch": 0.785359573294096, | |
| "grad_norm": 0.9595206401213471, | |
| "learning_rate": 1.0434270168722813e-05, | |
| "loss": 0.8351, | |
| "mean_token_accuracy": 0.7498462796211243, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.7862791980871804, | |
| "grad_norm": 0.9261440611345254, | |
| "learning_rate": 1.0389754817222325e-05, | |
| "loss": 0.77, | |
| "mean_token_accuracy": 0.7716120958328248, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 0.7871988228802649, | |
| "grad_norm": 0.926036803637149, | |
| "learning_rate": 1.0345397719832791e-05, | |
| "loss": 0.8117, | |
| "mean_token_accuracy": 0.75774165391922, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.7881184476733493, | |
| "grad_norm": 0.9482199838406158, | |
| "learning_rate": 1.0301199286816768e-05, | |
| "loss": 0.7869, | |
| "mean_token_accuracy": 0.7647076845169067, | |
| "step": 4285 | |
| }, | |
| { | |
| "epoch": 0.7890380724664336, | |
| "grad_norm": 0.9249156078948935, | |
| "learning_rate": 1.0257159926969315e-05, | |
| "loss": 0.8379, | |
| "mean_token_accuracy": 0.7494875431060791, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.7899576972595181, | |
| "grad_norm": 0.9426764037549299, | |
| "learning_rate": 1.0213280047614224e-05, | |
| "loss": 0.8399, | |
| "mean_token_accuracy": 0.748091197013855, | |
| "step": 4295 | |
| }, | |
| { | |
| "epoch": 0.7908773220526025, | |
| "grad_norm": 0.9001227058548062, | |
| "learning_rate": 1.016956005460021e-05, | |
| "loss": 0.8151, | |
| "mean_token_accuracy": 0.7553766012191773, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.7917969468456869, | |
| "grad_norm": 0.9494070318147612, | |
| "learning_rate": 1.0126000352297207e-05, | |
| "loss": 0.8161, | |
| "mean_token_accuracy": 0.7553802728652954, | |
| "step": 4305 | |
| }, | |
| { | |
| "epoch": 0.7927165716387714, | |
| "grad_norm": 0.9634025237949015, | |
| "learning_rate": 1.0082601343592613e-05, | |
| "loss": 0.8375, | |
| "mean_token_accuracy": 0.7490672588348388, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.7936361964318558, | |
| "grad_norm": 0.918509774691625, | |
| "learning_rate": 1.0039363429887526e-05, | |
| "loss": 0.8027, | |
| "mean_token_accuracy": 0.7611651062965393, | |
| "step": 4315 | |
| }, | |
| { | |
| "epoch": 0.7945558212249402, | |
| "grad_norm": 0.9045021299622812, | |
| "learning_rate": 9.996287011093095e-06, | |
| "loss": 0.8194, | |
| "mean_token_accuracy": 0.7530111193656921, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.7954754460180247, | |
| "grad_norm": 0.9575102184844824, | |
| "learning_rate": 9.95337248562677e-06, | |
| "loss": 0.813, | |
| "mean_token_accuracy": 0.7606404304504395, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 0.796395070811109, | |
| "grad_norm": 0.9520723107616024, | |
| "learning_rate": 9.910620250408654e-06, | |
| "loss": 0.8219, | |
| "mean_token_accuracy": 0.7527819633483886, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.7973146956041934, | |
| "grad_norm": 0.9957772801943348, | |
| "learning_rate": 9.868030700857786e-06, | |
| "loss": 0.8527, | |
| "mean_token_accuracy": 0.7474417209625244, | |
| "step": 4335 | |
| }, | |
| { | |
| "epoch": 0.7982343203972779, | |
| "grad_norm": 0.9206334782903142, | |
| "learning_rate": 9.825604230888534e-06, | |
| "loss": 0.8013, | |
| "mean_token_accuracy": 0.7611706376075744, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.7991539451903623, | |
| "grad_norm": 0.9528692345244755, | |
| "learning_rate": 9.783341232906929e-06, | |
| "loss": 0.8452, | |
| "mean_token_accuracy": 0.7476886630058288, | |
| "step": 4345 | |
| }, | |
| { | |
| "epoch": 0.8000735699834468, | |
| "grad_norm": 0.9501814513029114, | |
| "learning_rate": 9.741242097807015e-06, | |
| "loss": 0.7998, | |
| "mean_token_accuracy": 0.7616806149482727, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.8009931947765312, | |
| "grad_norm": 0.9162860642484046, | |
| "learning_rate": 9.699307214967278e-06, | |
| "loss": 0.8154, | |
| "mean_token_accuracy": 0.7584839701652527, | |
| "step": 4355 | |
| }, | |
| { | |
| "epoch": 0.8019128195696156, | |
| "grad_norm": 1.0326738672670173, | |
| "learning_rate": 9.657536972247011e-06, | |
| "loss": 0.8364, | |
| "mean_token_accuracy": 0.7505152702331543, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.8028324443627001, | |
| "grad_norm": 0.9226495279325524, | |
| "learning_rate": 9.615931755982732e-06, | |
| "loss": 0.8249, | |
| "mean_token_accuracy": 0.7548305869102478, | |
| "step": 4365 | |
| }, | |
| { | |
| "epoch": 0.8037520691557845, | |
| "grad_norm": 0.9998522862414826, | |
| "learning_rate": 9.574491950984617e-06, | |
| "loss": 0.8713, | |
| "mean_token_accuracy": 0.7403565168380737, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.8046716939488688, | |
| "grad_norm": 0.9493513097435586, | |
| "learning_rate": 9.533217940532952e-06, | |
| "loss": 0.8295, | |
| "mean_token_accuracy": 0.7500657081604004, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 0.8055913187419533, | |
| "grad_norm": 0.9906056177459279, | |
| "learning_rate": 9.492110106374562e-06, | |
| "loss": 0.7962, | |
| "mean_token_accuracy": 0.7624237060546875, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.8065109435350377, | |
| "grad_norm": 0.9844968670498593, | |
| "learning_rate": 9.451168828719293e-06, | |
| "loss": 0.7978, | |
| "mean_token_accuracy": 0.7625670194625854, | |
| "step": 4385 | |
| }, | |
| { | |
| "epoch": 0.8074305683281221, | |
| "grad_norm": 0.9677134975970255, | |
| "learning_rate": 9.410394486236498e-06, | |
| "loss": 0.8635, | |
| "mean_token_accuracy": 0.7404338598251343, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.8083501931212066, | |
| "grad_norm": 0.9239280726012725, | |
| "learning_rate": 9.369787456051545e-06, | |
| "loss": 0.8134, | |
| "mean_token_accuracy": 0.75517338514328, | |
| "step": 4395 | |
| }, | |
| { | |
| "epoch": 0.809269817914291, | |
| "grad_norm": 0.9448230478695528, | |
| "learning_rate": 9.329348113742293e-06, | |
| "loss": 0.8304, | |
| "mean_token_accuracy": 0.7514260888099671, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.8101894427073754, | |
| "grad_norm": 0.9454127260499946, | |
| "learning_rate": 9.289076833335659e-06, | |
| "loss": 0.8097, | |
| "mean_token_accuracy": 0.7581054925918579, | |
| "step": 4405 | |
| }, | |
| { | |
| "epoch": 0.8111090675004599, | |
| "grad_norm": 0.9492270487120692, | |
| "learning_rate": 9.24897398730414e-06, | |
| "loss": 0.8527, | |
| "mean_token_accuracy": 0.7465508818626404, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.8120286922935442, | |
| "grad_norm": 0.9570757946856893, | |
| "learning_rate": 9.209039946562354e-06, | |
| "loss": 0.8267, | |
| "mean_token_accuracy": 0.755340301990509, | |
| "step": 4415 | |
| }, | |
| { | |
| "epoch": 0.8129483170866286, | |
| "grad_norm": 0.9284190475550864, | |
| "learning_rate": 9.169275080463641e-06, | |
| "loss": 0.7752, | |
| "mean_token_accuracy": 0.7686259269714355, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.8138679418797131, | |
| "grad_norm": 0.9501950391649288, | |
| "learning_rate": 9.129679756796622e-06, | |
| "loss": 0.8111, | |
| "mean_token_accuracy": 0.7585479974746704, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 0.8147875666727975, | |
| "grad_norm": 0.9046262111625721, | |
| "learning_rate": 9.090254341781824e-06, | |
| "loss": 0.802, | |
| "mean_token_accuracy": 0.7600291728973388, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.8157071914658819, | |
| "grad_norm": 0.9379329497256937, | |
| "learning_rate": 9.05099920006824e-06, | |
| "loss": 0.8206, | |
| "mean_token_accuracy": 0.754150140285492, | |
| "step": 4435 | |
| }, | |
| { | |
| "epoch": 0.8166268162589664, | |
| "grad_norm": 0.9034131325499937, | |
| "learning_rate": 9.011914694730014e-06, | |
| "loss": 0.7971, | |
| "mean_token_accuracy": 0.7597368478775024, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.8175464410520508, | |
| "grad_norm": 0.9338149471790205, | |
| "learning_rate": 8.973001187263069e-06, | |
| "loss": 0.8184, | |
| "mean_token_accuracy": 0.7545792698860169, | |
| "step": 4445 | |
| }, | |
| { | |
| "epoch": 0.8184660658451351, | |
| "grad_norm": 0.9541079918085381, | |
| "learning_rate": 8.934259037581725e-06, | |
| "loss": 0.8097, | |
| "mean_token_accuracy": 0.7586872816085816, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.8193856906382196, | |
| "grad_norm": 0.9233023020738409, | |
| "learning_rate": 8.895688604015418e-06, | |
| "loss": 0.8276, | |
| "mean_token_accuracy": 0.7541133642196656, | |
| "step": 4455 | |
| }, | |
| { | |
| "epoch": 0.820305315431304, | |
| "grad_norm": 0.9312024884427347, | |
| "learning_rate": 8.857290243305372e-06, | |
| "loss": 0.8242, | |
| "mean_token_accuracy": 0.7540480494499207, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.8212249402243884, | |
| "grad_norm": 0.9636521068626411, | |
| "learning_rate": 8.819064310601274e-06, | |
| "loss": 0.827, | |
| "mean_token_accuracy": 0.754251503944397, | |
| "step": 4465 | |
| }, | |
| { | |
| "epoch": 0.8221445650174729, | |
| "grad_norm": 0.9594804588793242, | |
| "learning_rate": 8.78101115945803e-06, | |
| "loss": 0.8195, | |
| "mean_token_accuracy": 0.7567231893539429, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.8230641898105573, | |
| "grad_norm": 0.946382911890805, | |
| "learning_rate": 8.743131141832466e-06, | |
| "loss": 0.8093, | |
| "mean_token_accuracy": 0.7608936429023743, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 0.8239838146036417, | |
| "grad_norm": 0.9662210178630657, | |
| "learning_rate": 8.705424608080091e-06, | |
| "loss": 0.845, | |
| "mean_token_accuracy": 0.7482501983642578, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.8249034393967262, | |
| "grad_norm": 1.0134277900865423, | |
| "learning_rate": 8.667891906951822e-06, | |
| "loss": 0.806, | |
| "mean_token_accuracy": 0.7607534885406494, | |
| "step": 4485 | |
| }, | |
| { | |
| "epoch": 0.8258230641898106, | |
| "grad_norm": 0.969259829449015, | |
| "learning_rate": 8.63053338559081e-06, | |
| "loss": 0.8301, | |
| "mean_token_accuracy": 0.7495483517646789, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.8267426889828949, | |
| "grad_norm": 0.973132836806053, | |
| "learning_rate": 8.593349389529194e-06, | |
| "loss": 0.8412, | |
| "mean_token_accuracy": 0.7499716639518738, | |
| "step": 4495 | |
| }, | |
| { | |
| "epoch": 0.8276623137759794, | |
| "grad_norm": 0.9074516956073079, | |
| "learning_rate": 8.556340262684901e-06, | |
| "loss": 0.8239, | |
| "mean_token_accuracy": 0.7554465770721436, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.8285819385690638, | |
| "grad_norm": 0.930234934487542, | |
| "learning_rate": 8.519506347358495e-06, | |
| "loss": 0.7947, | |
| "mean_token_accuracy": 0.7629730701446533, | |
| "step": 4505 | |
| }, | |
| { | |
| "epoch": 0.8295015633621482, | |
| "grad_norm": 0.8753133502304897, | |
| "learning_rate": 8.482847984229992e-06, | |
| "loss": 0.8461, | |
| "mean_token_accuracy": 0.747829282283783, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.8304211881552327, | |
| "grad_norm": 0.9490806269639048, | |
| "learning_rate": 8.446365512355697e-06, | |
| "loss": 0.809, | |
| "mean_token_accuracy": 0.7590258955955506, | |
| "step": 4515 | |
| }, | |
| { | |
| "epoch": 0.8313408129483171, | |
| "grad_norm": 0.945014272705201, | |
| "learning_rate": 8.410059269165094e-06, | |
| "loss": 0.858, | |
| "mean_token_accuracy": 0.7476967573165894, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.8322604377414015, | |
| "grad_norm": 0.9585805628825262, | |
| "learning_rate": 8.37392959045771e-06, | |
| "loss": 0.8276, | |
| "mean_token_accuracy": 0.7536361336708068, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 0.833180062534486, | |
| "grad_norm": 0.9798760065535969, | |
| "learning_rate": 8.337976810400024e-06, | |
| "loss": 0.8271, | |
| "mean_token_accuracy": 0.7538176774978638, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.8340996873275703, | |
| "grad_norm": 0.9885247811188054, | |
| "learning_rate": 8.30220126152233e-06, | |
| "loss": 0.8351, | |
| "mean_token_accuracy": 0.7511208415031433, | |
| "step": 4535 | |
| }, | |
| { | |
| "epoch": 0.8350193121206547, | |
| "grad_norm": 0.926636431875522, | |
| "learning_rate": 8.266603274715734e-06, | |
| "loss": 0.8536, | |
| "mean_token_accuracy": 0.7437230348587036, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.8359389369137392, | |
| "grad_norm": 0.9639989728106565, | |
| "learning_rate": 8.231183179229041e-06, | |
| "loss": 0.8337, | |
| "mean_token_accuracy": 0.749656867980957, | |
| "step": 4545 | |
| }, | |
| { | |
| "epoch": 0.8368585617068236, | |
| "grad_norm": 0.9810922714927505, | |
| "learning_rate": 8.19594130266571e-06, | |
| "loss": 0.8441, | |
| "mean_token_accuracy": 0.7471103310585022, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.837778186499908, | |
| "grad_norm": 0.940673214702186, | |
| "learning_rate": 8.16087797098086e-06, | |
| "loss": 0.8076, | |
| "mean_token_accuracy": 0.757796049118042, | |
| "step": 4555 | |
| }, | |
| { | |
| "epoch": 0.8386978112929925, | |
| "grad_norm": 0.9808241732647448, | |
| "learning_rate": 8.125993508478222e-06, | |
| "loss": 0.8107, | |
| "mean_token_accuracy": 0.7570709705352783, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.8396174360860769, | |
| "grad_norm": 0.9417309972023068, | |
| "learning_rate": 8.091288237807148e-06, | |
| "loss": 0.7918, | |
| "mean_token_accuracy": 0.7627918124198914, | |
| "step": 4565 | |
| }, | |
| { | |
| "epoch": 0.8405370608791614, | |
| "grad_norm": 0.9994759897340699, | |
| "learning_rate": 8.05676247995964e-06, | |
| "loss": 0.8308, | |
| "mean_token_accuracy": 0.7522749185562134, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.8414566856722457, | |
| "grad_norm": 0.9575333123064316, | |
| "learning_rate": 8.022416554267361e-06, | |
| "loss": 0.8249, | |
| "mean_token_accuracy": 0.7555456757545471, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 0.8423763104653301, | |
| "grad_norm": 0.9428369551875321, | |
| "learning_rate": 7.988250778398704e-06, | |
| "loss": 0.7799, | |
| "mean_token_accuracy": 0.7657583713531494, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.8432959352584146, | |
| "grad_norm": 0.9491493130691244, | |
| "learning_rate": 7.95426546835582e-06, | |
| "loss": 0.8463, | |
| "mean_token_accuracy": 0.7497212409973144, | |
| "step": 4585 | |
| }, | |
| { | |
| "epoch": 0.844215560051499, | |
| "grad_norm": 0.9279119840497574, | |
| "learning_rate": 7.92046093847173e-06, | |
| "loss": 0.7911, | |
| "mean_token_accuracy": 0.7641847729682922, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.8451351848445834, | |
| "grad_norm": 0.975196157389162, | |
| "learning_rate": 7.88683750140741e-06, | |
| "loss": 0.7829, | |
| "mean_token_accuracy": 0.76539067029953, | |
| "step": 4595 | |
| }, | |
| { | |
| "epoch": 0.8460548096376679, | |
| "grad_norm": 0.9630038826041202, | |
| "learning_rate": 7.853395468148877e-06, | |
| "loss": 0.8214, | |
| "mean_token_accuracy": 0.7576993346214295, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.8469744344307523, | |
| "grad_norm": 0.9547194790847711, | |
| "learning_rate": 7.82013514800434e-06, | |
| "loss": 0.8133, | |
| "mean_token_accuracy": 0.7594569325447083, | |
| "step": 4605 | |
| }, | |
| { | |
| "epoch": 0.8478940592238366, | |
| "grad_norm": 0.9804442806928446, | |
| "learning_rate": 7.787056848601327e-06, | |
| "loss": 0.826, | |
| "mean_token_accuracy": 0.7542958974838256, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.8488136840169211, | |
| "grad_norm": 0.987211519153664, | |
| "learning_rate": 7.754160875883835e-06, | |
| "loss": 0.859, | |
| "mean_token_accuracy": 0.7447464466094971, | |
| "step": 4615 | |
| }, | |
| { | |
| "epoch": 0.8497333088100055, | |
| "grad_norm": 0.9279113898182684, | |
| "learning_rate": 7.721447534109509e-06, | |
| "loss": 0.8318, | |
| "mean_token_accuracy": 0.7507144689559937, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.8506529336030899, | |
| "grad_norm": 0.9722340874170035, | |
| "learning_rate": 7.688917125846836e-06, | |
| "loss": 0.8354, | |
| "mean_token_accuracy": 0.7506987690925598, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 0.8515725583961744, | |
| "grad_norm": 0.9470559135859266, | |
| "learning_rate": 7.65656995197231e-06, | |
| "loss": 0.846, | |
| "mean_token_accuracy": 0.7494428992271424, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.8524921831892588, | |
| "grad_norm": 1.0085786438496558, | |
| "learning_rate": 7.6244063116676965e-06, | |
| "loss": 0.8048, | |
| "mean_token_accuracy": 0.7590271830558777, | |
| "step": 4635 | |
| }, | |
| { | |
| "epoch": 0.8534118079823432, | |
| "grad_norm": 0.9122173396588265, | |
| "learning_rate": 7.592426502417235e-06, | |
| "loss": 0.792, | |
| "mean_token_accuracy": 0.7632818222045898, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.8543314327754277, | |
| "grad_norm": 0.920428242471814, | |
| "learning_rate": 7.560630820004905e-06, | |
| "loss": 0.7682, | |
| "mean_token_accuracy": 0.768799901008606, | |
| "step": 4645 | |
| }, | |
| { | |
| "epoch": 0.855251057568512, | |
| "grad_norm": 0.9650658819203722, | |
| "learning_rate": 7.529019558511664e-06, | |
| "loss": 0.8591, | |
| "mean_token_accuracy": 0.7465671896934509, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.8561706823615964, | |
| "grad_norm": 0.941100631374564, | |
| "learning_rate": 7.4975930103127575e-06, | |
| "loss": 0.8133, | |
| "mean_token_accuracy": 0.7577845811843872, | |
| "step": 4655 | |
| }, | |
| { | |
| "epoch": 0.8570903071546809, | |
| "grad_norm": 0.911355294655365, | |
| "learning_rate": 7.466351466075003e-06, | |
| "loss": 0.776, | |
| "mean_token_accuracy": 0.7704600811004638, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.8580099319477653, | |
| "grad_norm": 0.9600196890925632, | |
| "learning_rate": 7.43529521475409e-06, | |
| "loss": 0.8356, | |
| "mean_token_accuracy": 0.752436888217926, | |
| "step": 4665 | |
| }, | |
| { | |
| "epoch": 0.8589295567408497, | |
| "grad_norm": 0.9096404947618868, | |
| "learning_rate": 7.404424543591926e-06, | |
| "loss": 0.8434, | |
| "mean_token_accuracy": 0.749167013168335, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.8598491815339342, | |
| "grad_norm": 0.9645413054824178, | |
| "learning_rate": 7.37373973811398e-06, | |
| "loss": 0.8422, | |
| "mean_token_accuracy": 0.7523573756217956, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 0.8607688063270186, | |
| "grad_norm": 0.9461536188211753, | |
| "learning_rate": 7.343241082126609e-06, | |
| "loss": 0.789, | |
| "mean_token_accuracy": 0.7644837021827697, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.861688431120103, | |
| "grad_norm": 0.9177981778366934, | |
| "learning_rate": 7.312928857714484e-06, | |
| "loss": 0.7912, | |
| "mean_token_accuracy": 0.7650796055793763, | |
| "step": 4685 | |
| }, | |
| { | |
| "epoch": 0.8626080559131875, | |
| "grad_norm": 0.9395263274096144, | |
| "learning_rate": 7.282803345237937e-06, | |
| "loss": 0.779, | |
| "mean_token_accuracy": 0.766014575958252, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.8635276807062718, | |
| "grad_norm": 0.974228845887035, | |
| "learning_rate": 7.252864823330397e-06, | |
| "loss": 0.8096, | |
| "mean_token_accuracy": 0.7609816431999207, | |
| "step": 4695 | |
| }, | |
| { | |
| "epoch": 0.8644473054993562, | |
| "grad_norm": 0.9138771854988429, | |
| "learning_rate": 7.223113568895791e-06, | |
| "loss": 0.8228, | |
| "mean_token_accuracy": 0.7533741354942322, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.8653669302924407, | |
| "grad_norm": 0.9230858356341091, | |
| "learning_rate": 7.193549857105998e-06, | |
| "loss": 0.7817, | |
| "mean_token_accuracy": 0.7645957589149475, | |
| "step": 4705 | |
| }, | |
| { | |
| "epoch": 0.8662865550855251, | |
| "grad_norm": 0.9248959407091435, | |
| "learning_rate": 7.164173961398307e-06, | |
| "loss": 0.8123, | |
| "mean_token_accuracy": 0.758608341217041, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.8672061798786095, | |
| "grad_norm": 0.920957739245226, | |
| "learning_rate": 7.134986153472864e-06, | |
| "loss": 0.8089, | |
| "mean_token_accuracy": 0.7574970960617066, | |
| "step": 4715 | |
| }, | |
| { | |
| "epoch": 0.868125804671694, | |
| "grad_norm": 0.9365387305302294, | |
| "learning_rate": 7.105986703290185e-06, | |
| "loss": 0.8207, | |
| "mean_token_accuracy": 0.7519280552864075, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.8690454294647784, | |
| "grad_norm": 0.9848472191309555, | |
| "learning_rate": 7.077175879068652e-06, | |
| "loss": 0.8318, | |
| "mean_token_accuracy": 0.7514313578605651, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 0.8699650542578627, | |
| "grad_norm": 0.9841439973977463, | |
| "learning_rate": 7.04855394728202e-06, | |
| "loss": 0.8254, | |
| "mean_token_accuracy": 0.7536401510238647, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.8708846790509472, | |
| "grad_norm": 0.9368690483918741, | |
| "learning_rate": 7.020121172656971e-06, | |
| "loss": 0.8079, | |
| "mean_token_accuracy": 0.7589451789855957, | |
| "step": 4735 | |
| }, | |
| { | |
| "epoch": 0.8718043038440316, | |
| "grad_norm": 0.9537367969880632, | |
| "learning_rate": 6.991877818170647e-06, | |
| "loss": 0.8105, | |
| "mean_token_accuracy": 0.7570921540260315, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.872723928637116, | |
| "grad_norm": 0.9771290706741976, | |
| "learning_rate": 6.963824145048245e-06, | |
| "loss": 0.8383, | |
| "mean_token_accuracy": 0.7482818961143494, | |
| "step": 4745 | |
| }, | |
| { | |
| "epoch": 0.8736435534302005, | |
| "grad_norm": 0.9167489506515816, | |
| "learning_rate": 6.935960412760554e-06, | |
| "loss": 0.7956, | |
| "mean_token_accuracy": 0.7615381121635437, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.8745631782232849, | |
| "grad_norm": 0.9509142520738616, | |
| "learning_rate": 6.908286879021611e-06, | |
| "loss": 0.8272, | |
| "mean_token_accuracy": 0.7538857817649841, | |
| "step": 4755 | |
| }, | |
| { | |
| "epoch": 0.8754828030163693, | |
| "grad_norm": 0.9492010037774332, | |
| "learning_rate": 6.880803799786282e-06, | |
| "loss": 0.8083, | |
| "mean_token_accuracy": 0.7596304178237915, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.8764024278094538, | |
| "grad_norm": 0.9879455089380224, | |
| "learning_rate": 6.853511429247891e-06, | |
| "loss": 0.8501, | |
| "mean_token_accuracy": 0.7443594694137573, | |
| "step": 4765 | |
| }, | |
| { | |
| "epoch": 0.8773220526025381, | |
| "grad_norm": 0.900884905164465, | |
| "learning_rate": 6.826410019835897e-06, | |
| "loss": 0.8388, | |
| "mean_token_accuracy": 0.75017911195755, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.8782416773956225, | |
| "grad_norm": 0.9347399353088925, | |
| "learning_rate": 6.7994998222135415e-06, | |
| "loss": 0.8338, | |
| "mean_token_accuracy": 0.7503747582435608, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 0.879161302188707, | |
| "grad_norm": 0.9313447849733553, | |
| "learning_rate": 6.77278108527552e-06, | |
| "loss": 0.8223, | |
| "mean_token_accuracy": 0.7531881928443909, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.8800809269817914, | |
| "grad_norm": 0.9749122247147805, | |
| "learning_rate": 6.7462540561457035e-06, | |
| "loss": 0.8078, | |
| "mean_token_accuracy": 0.7597910761833191, | |
| "step": 4785 | |
| }, | |
| { | |
| "epoch": 0.8810005517748758, | |
| "grad_norm": 0.9459726297921652, | |
| "learning_rate": 6.719918980174842e-06, | |
| "loss": 0.7735, | |
| "mean_token_accuracy": 0.7680148124694824, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.8819201765679603, | |
| "grad_norm": 0.9477334526426899, | |
| "learning_rate": 6.6937761009382816e-06, | |
| "loss": 0.8025, | |
| "mean_token_accuracy": 0.759226131439209, | |
| "step": 4795 | |
| }, | |
| { | |
| "epoch": 0.8828398013610447, | |
| "grad_norm": 0.9350684746914302, | |
| "learning_rate": 6.667825660233736e-06, | |
| "loss": 0.8141, | |
| "mean_token_accuracy": 0.7565145611763, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.8837594261541292, | |
| "grad_norm": 0.9492764392082258, | |
| "learning_rate": 6.642067898079038e-06, | |
| "loss": 0.8311, | |
| "mean_token_accuracy": 0.7527845025062561, | |
| "step": 4805 | |
| }, | |
| { | |
| "epoch": 0.8846790509472136, | |
| "grad_norm": 0.8598768439927121, | |
| "learning_rate": 6.616503052709914e-06, | |
| "loss": 0.7896, | |
| "mean_token_accuracy": 0.7648340344429017, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.8855986757402979, | |
| "grad_norm": 0.9446656437839204, | |
| "learning_rate": 6.591131360577795e-06, | |
| "loss": 0.8052, | |
| "mean_token_accuracy": 0.7575154542922974, | |
| "step": 4815 | |
| }, | |
| { | |
| "epoch": 0.8865183005333824, | |
| "grad_norm": 0.8652514268793213, | |
| "learning_rate": 6.565953056347608e-06, | |
| "loss": 0.7534, | |
| "mean_token_accuracy": 0.7725171089172364, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.8874379253264668, | |
| "grad_norm": 0.9422431334861092, | |
| "learning_rate": 6.540968372895634e-06, | |
| "loss": 0.7977, | |
| "mean_token_accuracy": 0.7611649394035339, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 0.8883575501195512, | |
| "grad_norm": 0.9384703132768932, | |
| "learning_rate": 6.516177541307333e-06, | |
| "loss": 0.7995, | |
| "mean_token_accuracy": 0.7624763369560241, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.8892771749126357, | |
| "grad_norm": 1.015847599195386, | |
| "learning_rate": 6.491580790875209e-06, | |
| "loss": 0.7916, | |
| "mean_token_accuracy": 0.7621793508529663, | |
| "step": 4835 | |
| }, | |
| { | |
| "epoch": 0.8901967997057201, | |
| "grad_norm": 0.9098096698494834, | |
| "learning_rate": 6.4671783490966945e-06, | |
| "loss": 0.8088, | |
| "mean_token_accuracy": 0.7614699125289917, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.8911164244988045, | |
| "grad_norm": 0.9558674059824713, | |
| "learning_rate": 6.442970441672051e-06, | |
| "loss": 0.8545, | |
| "mean_token_accuracy": 0.7470506310462952, | |
| "step": 4845 | |
| }, | |
| { | |
| "epoch": 0.892036049291889, | |
| "grad_norm": 0.9590352976202275, | |
| "learning_rate": 6.4189572925022655e-06, | |
| "loss": 0.8363, | |
| "mean_token_accuracy": 0.7472939848899841, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.8929556740849733, | |
| "grad_norm": 0.8982751392912057, | |
| "learning_rate": 6.3951391236869985e-06, | |
| "loss": 0.8259, | |
| "mean_token_accuracy": 0.7548177719116211, | |
| "step": 4855 | |
| }, | |
| { | |
| "epoch": 0.8938752988780577, | |
| "grad_norm": 0.9627549202883984, | |
| "learning_rate": 6.371516155522513e-06, | |
| "loss": 0.8035, | |
| "mean_token_accuracy": 0.7578222513198852, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.8947949236711422, | |
| "grad_norm": 0.962995623951893, | |
| "learning_rate": 6.3480886064996484e-06, | |
| "loss": 0.8119, | |
| "mean_token_accuracy": 0.7579006910324096, | |
| "step": 4865 | |
| }, | |
| { | |
| "epoch": 0.8957145484642266, | |
| "grad_norm": 0.99045632467858, | |
| "learning_rate": 6.3248566933017975e-06, | |
| "loss": 0.7942, | |
| "mean_token_accuracy": 0.75965256690979, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.896634173257311, | |
| "grad_norm": 0.9510071830298487, | |
| "learning_rate": 6.3018206308028975e-06, | |
| "loss": 0.8185, | |
| "mean_token_accuracy": 0.7584743499755859, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 0.8975537980503955, | |
| "grad_norm": 0.9703791789576997, | |
| "learning_rate": 6.2789806320654456e-06, | |
| "loss": 0.7816, | |
| "mean_token_accuracy": 0.7649904489517212, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.8984734228434799, | |
| "grad_norm": 0.9398378664335288, | |
| "learning_rate": 6.256336908338531e-06, | |
| "loss": 0.78, | |
| "mean_token_accuracy": 0.767956817150116, | |
| "step": 4885 | |
| }, | |
| { | |
| "epoch": 0.8993930476365642, | |
| "grad_norm": 0.987114293205303, | |
| "learning_rate": 6.233889669055878e-06, | |
| "loss": 0.8443, | |
| "mean_token_accuracy": 0.7497469425201416, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.9003126724296487, | |
| "grad_norm": 0.9343500174042304, | |
| "learning_rate": 6.211639121833912e-06, | |
| "loss": 0.7931, | |
| "mean_token_accuracy": 0.763602340221405, | |
| "step": 4895 | |
| }, | |
| { | |
| "epoch": 0.9012322972227331, | |
| "grad_norm": 0.9262644956755969, | |
| "learning_rate": 6.189585472469829e-06, | |
| "loss": 0.7792, | |
| "mean_token_accuracy": 0.7697998642921448, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.9021519220158175, | |
| "grad_norm": 0.9622834108867682, | |
| "learning_rate": 6.167728924939705e-06, | |
| "loss": 0.797, | |
| "mean_token_accuracy": 0.7625941157341003, | |
| "step": 4905 | |
| }, | |
| { | |
| "epoch": 0.903071546808902, | |
| "grad_norm": 0.9190192726730757, | |
| "learning_rate": 6.146069681396612e-06, | |
| "loss": 0.8253, | |
| "mean_token_accuracy": 0.7542304992675781, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.9039911716019864, | |
| "grad_norm": 0.9361246140345745, | |
| "learning_rate": 6.124607942168726e-06, | |
| "loss": 0.8031, | |
| "mean_token_accuracy": 0.7584469556808472, | |
| "step": 4915 | |
| }, | |
| { | |
| "epoch": 0.9049107963950708, | |
| "grad_norm": 0.9457716726884055, | |
| "learning_rate": 6.1033439057574965e-06, | |
| "loss": 0.8153, | |
| "mean_token_accuracy": 0.758701741695404, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.9058304211881553, | |
| "grad_norm": 0.8853750515926242, | |
| "learning_rate": 6.082277768835807e-06, | |
| "loss": 0.7921, | |
| "mean_token_accuracy": 0.763675856590271, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 0.9067500459812396, | |
| "grad_norm": 0.9702784866596219, | |
| "learning_rate": 6.061409726246143e-06, | |
| "loss": 0.7851, | |
| "mean_token_accuracy": 0.7646818399429322, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.907669670774324, | |
| "grad_norm": 0.9693421985103569, | |
| "learning_rate": 6.040739970998802e-06, | |
| "loss": 0.8346, | |
| "mean_token_accuracy": 0.7530786991119385, | |
| "step": 4935 | |
| }, | |
| { | |
| "epoch": 0.9085892955674085, | |
| "grad_norm": 0.8930655347204544, | |
| "learning_rate": 6.020268694270109e-06, | |
| "loss": 0.7966, | |
| "mean_token_accuracy": 0.7641753435134888, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.9095089203604929, | |
| "grad_norm": 0.908390221485836, | |
| "learning_rate": 5.999996085400643e-06, | |
| "loss": 0.7995, | |
| "mean_token_accuracy": 0.7642928123474121, | |
| "step": 4945 | |
| }, | |
| { | |
| "epoch": 0.9104285451535773, | |
| "grad_norm": 0.9291773666129768, | |
| "learning_rate": 5.9799223318934765e-06, | |
| "loss": 0.801, | |
| "mean_token_accuracy": 0.7588168382644653, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.9113481699466618, | |
| "grad_norm": 0.9290002720904244, | |
| "learning_rate": 5.9600476194124675e-06, | |
| "loss": 0.7973, | |
| "mean_token_accuracy": 0.763935673236847, | |
| "step": 4955 | |
| }, | |
| { | |
| "epoch": 0.9122677947397462, | |
| "grad_norm": 0.9446442087955222, | |
| "learning_rate": 5.9403721317805245e-06, | |
| "loss": 0.801, | |
| "mean_token_accuracy": 0.7578533172607422, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.9131874195328306, | |
| "grad_norm": 0.9568316679901518, | |
| "learning_rate": 5.920896050977891e-06, | |
| "loss": 0.8926, | |
| "mean_token_accuracy": 0.7361096501350403, | |
| "step": 4965 | |
| }, | |
| { | |
| "epoch": 0.914107044325915, | |
| "grad_norm": 0.9761363167639366, | |
| "learning_rate": 5.901619557140502e-06, | |
| "loss": 0.8302, | |
| "mean_token_accuracy": 0.7517902731895447, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.9150266691189994, | |
| "grad_norm": 0.9363921634925068, | |
| "learning_rate": 5.882542828558286e-06, | |
| "loss": 0.8066, | |
| "mean_token_accuracy": 0.7580497026443481, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 0.9159462939120838, | |
| "grad_norm": 0.9898749363112332, | |
| "learning_rate": 5.86366604167352e-06, | |
| "loss": 0.7785, | |
| "mean_token_accuracy": 0.7676722645759583, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.9168659187051683, | |
| "grad_norm": 0.9461120512925497, | |
| "learning_rate": 5.844989371079215e-06, | |
| "loss": 0.7655, | |
| "mean_token_accuracy": 0.7703205943107605, | |
| "step": 4985 | |
| }, | |
| { | |
| "epoch": 0.9177855434982527, | |
| "grad_norm": 0.9340964548547984, | |
| "learning_rate": 5.826512989517478e-06, | |
| "loss": 0.8243, | |
| "mean_token_accuracy": 0.7529069542884826, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.9187051682913371, | |
| "grad_norm": 0.9542091804584825, | |
| "learning_rate": 5.808237067877942e-06, | |
| "loss": 0.7869, | |
| "mean_token_accuracy": 0.7639023303985596, | |
| "step": 4995 | |
| }, | |
| { | |
| "epoch": 0.9196247930844216, | |
| "grad_norm": 0.9799469338180448, | |
| "learning_rate": 5.790161775196144e-06, | |
| "loss": 0.7942, | |
| "mean_token_accuracy": 0.7624092340469361, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.920544417877506, | |
| "grad_norm": 0.9533254080832144, | |
| "learning_rate": 5.772287278652012e-06, | |
| "loss": 0.8109, | |
| "mean_token_accuracy": 0.7598010182380677, | |
| "step": 5005 | |
| }, | |
| { | |
| "epoch": 0.9214640426705903, | |
| "grad_norm": 0.9311527277134242, | |
| "learning_rate": 5.754613743568279e-06, | |
| "loss": 0.7906, | |
| "mean_token_accuracy": 0.7638931751251221, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.9223836674636748, | |
| "grad_norm": 0.9812836116539834, | |
| "learning_rate": 5.737141333408972e-06, | |
| "loss": 0.8008, | |
| "mean_token_accuracy": 0.7612162590026855, | |
| "step": 5015 | |
| }, | |
| { | |
| "epoch": 0.9233032922567592, | |
| "grad_norm": 0.9745443553849291, | |
| "learning_rate": 5.719870209777896e-06, | |
| "loss": 0.8417, | |
| "mean_token_accuracy": 0.7509512066841125, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.9242229170498437, | |
| "grad_norm": 0.9530895065948418, | |
| "learning_rate": 5.702800532417144e-06, | |
| "loss": 0.7899, | |
| "mean_token_accuracy": 0.7625620007514954, | |
| "step": 5025 | |
| }, | |
| { | |
| "epoch": 0.9251425418429281, | |
| "grad_norm": 0.9106620317823355, | |
| "learning_rate": 5.685932459205606e-06, | |
| "loss": 0.8075, | |
| "mean_token_accuracy": 0.7597783088684082, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.9260621666360125, | |
| "grad_norm": 0.9016062622069709, | |
| "learning_rate": 5.669266146157527e-06, | |
| "loss": 0.7956, | |
| "mean_token_accuracy": 0.7618203997612, | |
| "step": 5035 | |
| }, | |
| { | |
| "epoch": 0.926981791429097, | |
| "grad_norm": 0.9311871037406105, | |
| "learning_rate": 5.652801747421053e-06, | |
| "loss": 0.7755, | |
| "mean_token_accuracy": 0.7672530770301819, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.9279014162221814, | |
| "grad_norm": 0.9289149914362874, | |
| "learning_rate": 5.636539415276807e-06, | |
| "loss": 0.7971, | |
| "mean_token_accuracy": 0.7606992840766906, | |
| "step": 5045 | |
| }, | |
| { | |
| "epoch": 0.9288210410152657, | |
| "grad_norm": 0.9265920738234094, | |
| "learning_rate": 5.620479300136475e-06, | |
| "loss": 0.7675, | |
| "mean_token_accuracy": 0.7715546011924743, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.9297406658083502, | |
| "grad_norm": 1.001963123510446, | |
| "learning_rate": 5.604621550541429e-06, | |
| "loss": 0.8426, | |
| "mean_token_accuracy": 0.7474547743797302, | |
| "step": 5055 | |
| }, | |
| { | |
| "epoch": 0.9306602906014346, | |
| "grad_norm": 0.9062392197653472, | |
| "learning_rate": 5.5889663131613465e-06, | |
| "loss": 0.8237, | |
| "mean_token_accuracy": 0.7512851595878601, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.931579915394519, | |
| "grad_norm": 0.9878466692235598, | |
| "learning_rate": 5.5735137327928384e-06, | |
| "loss": 0.8018, | |
| "mean_token_accuracy": 0.7595331549644471, | |
| "step": 5065 | |
| }, | |
| { | |
| "epoch": 0.9324995401876035, | |
| "grad_norm": 0.911756127989921, | |
| "learning_rate": 5.558263952358139e-06, | |
| "loss": 0.8146, | |
| "mean_token_accuracy": 0.7572713255882263, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.9334191649806879, | |
| "grad_norm": 0.9534452188147857, | |
| "learning_rate": 5.543217112903766e-06, | |
| "loss": 0.8092, | |
| "mean_token_accuracy": 0.7591339111328125, | |
| "step": 5075 | |
| }, | |
| { | |
| "epoch": 0.9343387897737723, | |
| "grad_norm": 0.94136690175154, | |
| "learning_rate": 5.528373353599207e-06, | |
| "loss": 0.7945, | |
| "mean_token_accuracy": 0.7594197154045105, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.9352584145668568, | |
| "grad_norm": 0.9367268234664168, | |
| "learning_rate": 5.513732811735657e-06, | |
| "loss": 0.8123, | |
| "mean_token_accuracy": 0.7594240307807922, | |
| "step": 5085 | |
| }, | |
| { | |
| "epoch": 0.9361780393599411, | |
| "grad_norm": 0.8975989192963018, | |
| "learning_rate": 5.4992956227247345e-06, | |
| "loss": 0.7715, | |
| "mean_token_accuracy": 0.7677939176559448, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.9370976641530255, | |
| "grad_norm": 0.9987125543689239, | |
| "learning_rate": 5.48506192009722e-06, | |
| "loss": 0.8051, | |
| "mean_token_accuracy": 0.7597865104675293, | |
| "step": 5095 | |
| }, | |
| { | |
| "epoch": 0.93801728894611, | |
| "grad_norm": 0.9396093256392507, | |
| "learning_rate": 5.4710318355018435e-06, | |
| "loss": 0.8248, | |
| "mean_token_accuracy": 0.7557710766792297, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.9389369137391944, | |
| "grad_norm": 0.907072734656757, | |
| "learning_rate": 5.457205498704046e-06, | |
| "loss": 0.8104, | |
| "mean_token_accuracy": 0.7568627595901489, | |
| "step": 5105 | |
| }, | |
| { | |
| "epoch": 0.9398565385322788, | |
| "grad_norm": 0.9498606808400206, | |
| "learning_rate": 5.443583037584792e-06, | |
| "loss": 0.829, | |
| "mean_token_accuracy": 0.7537372469902038, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.9407761633253633, | |
| "grad_norm": 0.9500188031150016, | |
| "learning_rate": 5.430164578139382e-06, | |
| "loss": 0.771, | |
| "mean_token_accuracy": 0.7692322492599487, | |
| "step": 5115 | |
| }, | |
| { | |
| "epoch": 0.9416957881184477, | |
| "grad_norm": 0.9133488515736051, | |
| "learning_rate": 5.4169502444762836e-06, | |
| "loss": 0.8203, | |
| "mean_token_accuracy": 0.7578924179077149, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.9426154129115321, | |
| "grad_norm": 0.9585342004886042, | |
| "learning_rate": 5.403940158815996e-06, | |
| "loss": 0.8209, | |
| "mean_token_accuracy": 0.7570155620574951, | |
| "step": 5125 | |
| }, | |
| { | |
| "epoch": 0.9435350377046166, | |
| "grad_norm": 0.9797939933864984, | |
| "learning_rate": 5.391134441489905e-06, | |
| "loss": 0.7937, | |
| "mean_token_accuracy": 0.7618912696838379, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.9444546624977009, | |
| "grad_norm": 0.9293935572688817, | |
| "learning_rate": 5.378533210939176e-06, | |
| "loss": 0.7948, | |
| "mean_token_accuracy": 0.7596281886100769, | |
| "step": 5135 | |
| }, | |
| { | |
| "epoch": 0.9453742872907853, | |
| "grad_norm": 0.9221042858985046, | |
| "learning_rate": 5.366136583713665e-06, | |
| "loss": 0.7717, | |
| "mean_token_accuracy": 0.7698543071746826, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.9462939120838698, | |
| "grad_norm": 1.025946124148099, | |
| "learning_rate": 5.353944674470823e-06, | |
| "loss": 0.8213, | |
| "mean_token_accuracy": 0.7552660465240478, | |
| "step": 5145 | |
| }, | |
| { | |
| "epoch": 0.9472135368769542, | |
| "grad_norm": 0.984504169212397, | |
| "learning_rate": 5.341957595974662e-06, | |
| "loss": 0.8392, | |
| "mean_token_accuracy": 0.7498656630516052, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.9481331616700386, | |
| "grad_norm": 0.9188252633726173, | |
| "learning_rate": 5.3301754590946824e-06, | |
| "loss": 0.8166, | |
| "mean_token_accuracy": 0.7552522420883179, | |
| "step": 5155 | |
| }, | |
| { | |
| "epoch": 0.9490527864631231, | |
| "grad_norm": 0.8673224532160614, | |
| "learning_rate": 5.318598372804873e-06, | |
| "loss": 0.7689, | |
| "mean_token_accuracy": 0.7689907431602478, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.9499724112562075, | |
| "grad_norm": 0.9392909148393203, | |
| "learning_rate": 5.307226444182686e-06, | |
| "loss": 0.7877, | |
| "mean_token_accuracy": 0.7654459595680236, | |
| "step": 5165 | |
| }, | |
| { | |
| "epoch": 0.9508920360492918, | |
| "grad_norm": 1.0092515399603914, | |
| "learning_rate": 5.296059778408057e-06, | |
| "loss": 0.8228, | |
| "mean_token_accuracy": 0.7547815799713135, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.9518116608423763, | |
| "grad_norm": 0.9724478118701938, | |
| "learning_rate": 5.2850984787624264e-06, | |
| "loss": 0.8068, | |
| "mean_token_accuracy": 0.757933521270752, | |
| "step": 5175 | |
| }, | |
| { | |
| "epoch": 0.9527312856354607, | |
| "grad_norm": 0.9595437776833703, | |
| "learning_rate": 5.274342646627783e-06, | |
| "loss": 0.8612, | |
| "mean_token_accuracy": 0.7451163768768311, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.9536509104285451, | |
| "grad_norm": 0.9035621461181421, | |
| "learning_rate": 5.263792381485733e-06, | |
| "loss": 0.7942, | |
| "mean_token_accuracy": 0.7612574458122253, | |
| "step": 5185 | |
| }, | |
| { | |
| "epoch": 0.9545705352216296, | |
| "grad_norm": 0.9369759529937411, | |
| "learning_rate": 5.253447780916577e-06, | |
| "loss": 0.8199, | |
| "mean_token_accuracy": 0.755517327785492, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.955490160014714, | |
| "grad_norm": 0.9223279306007958, | |
| "learning_rate": 5.2433089405984e-06, | |
| "loss": 0.7855, | |
| "mean_token_accuracy": 0.7672001838684082, | |
| "step": 5195 | |
| }, | |
| { | |
| "epoch": 0.9564097848077984, | |
| "grad_norm": 0.9093658718364905, | |
| "learning_rate": 5.233375954306199e-06, | |
| "loss": 0.7588, | |
| "mean_token_accuracy": 0.7701982975006103, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.9573294096008829, | |
| "grad_norm": 0.9756234794282658, | |
| "learning_rate": 5.22364891391101e-06, | |
| "loss": 0.8294, | |
| "mean_token_accuracy": 0.75344318151474, | |
| "step": 5205 | |
| }, | |
| { | |
| "epoch": 0.9582490343939672, | |
| "grad_norm": 0.910212786589889, | |
| "learning_rate": 5.2141279093790575e-06, | |
| "loss": 0.7894, | |
| "mean_token_accuracy": 0.7678821444511413, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.9591686591870516, | |
| "grad_norm": 0.9474929875705357, | |
| "learning_rate": 5.204813028770913e-06, | |
| "loss": 0.7891, | |
| "mean_token_accuracy": 0.7625754833221435, | |
| "step": 5215 | |
| }, | |
| { | |
| "epoch": 0.9600882839801361, | |
| "grad_norm": 0.9344552952746554, | |
| "learning_rate": 5.195704358240704e-06, | |
| "loss": 0.8059, | |
| "mean_token_accuracy": 0.759453558921814, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.9610079087732205, | |
| "grad_norm": 0.9060367178226402, | |
| "learning_rate": 5.186801982035298e-06, | |
| "loss": 0.7846, | |
| "mean_token_accuracy": 0.7654222846031189, | |
| "step": 5225 | |
| }, | |
| { | |
| "epoch": 0.9619275335663049, | |
| "grad_norm": 0.9799737312884412, | |
| "learning_rate": 5.178105982493528e-06, | |
| "loss": 0.813, | |
| "mean_token_accuracy": 0.7591325879096985, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.9628471583593894, | |
| "grad_norm": 0.9419373863409995, | |
| "learning_rate": 5.169616440045433e-06, | |
| "loss": 0.7933, | |
| "mean_token_accuracy": 0.7605907201766968, | |
| "step": 5235 | |
| }, | |
| { | |
| "epoch": 0.9637667831524738, | |
| "grad_norm": 0.904753211539841, | |
| "learning_rate": 5.16133343321151e-06, | |
| "loss": 0.796, | |
| "mean_token_accuracy": 0.7628448724746704, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.9646864079455583, | |
| "grad_norm": 0.9588441625989744, | |
| "learning_rate": 5.1532570386019944e-06, | |
| "loss": 0.7746, | |
| "mean_token_accuracy": 0.7675014138221741, | |
| "step": 5245 | |
| }, | |
| { | |
| "epoch": 0.9656060327386427, | |
| "grad_norm": 0.8875696215604679, | |
| "learning_rate": 5.145387330916144e-06, | |
| "loss": 0.7988, | |
| "mean_token_accuracy": 0.7614070296287536, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.966525657531727, | |
| "grad_norm": 0.9405630235157387, | |
| "learning_rate": 5.137724382941557e-06, | |
| "loss": 0.7918, | |
| "mean_token_accuracy": 0.7650785088539124, | |
| "step": 5255 | |
| }, | |
| { | |
| "epoch": 0.9674452823248115, | |
| "grad_norm": 0.9562043810312459, | |
| "learning_rate": 5.130268265553487e-06, | |
| "loss": 0.8144, | |
| "mean_token_accuracy": 0.7557086706161499, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.9683649071178959, | |
| "grad_norm": 0.9274811086930055, | |
| "learning_rate": 5.123019047714198e-06, | |
| "loss": 0.7576, | |
| "mean_token_accuracy": 0.7753474235534668, | |
| "step": 5265 | |
| }, | |
| { | |
| "epoch": 0.9692845319109803, | |
| "grad_norm": 0.9409745943869224, | |
| "learning_rate": 5.115976796472322e-06, | |
| "loss": 0.8328, | |
| "mean_token_accuracy": 0.7535906672477722, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.9702041567040648, | |
| "grad_norm": 0.919927159373234, | |
| "learning_rate": 5.109141576962239e-06, | |
| "loss": 0.7912, | |
| "mean_token_accuracy": 0.7655844688415527, | |
| "step": 5275 | |
| }, | |
| { | |
| "epoch": 0.9711237814971492, | |
| "grad_norm": 0.951329112362283, | |
| "learning_rate": 5.102513452403473e-06, | |
| "loss": 0.7683, | |
| "mean_token_accuracy": 0.7696467399597168, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.9720434062902336, | |
| "grad_norm": 0.9201946233258363, | |
| "learning_rate": 5.0960924841001155e-06, | |
| "loss": 0.7988, | |
| "mean_token_accuracy": 0.7610312700271606, | |
| "step": 5285 | |
| }, | |
| { | |
| "epoch": 0.972963031083318, | |
| "grad_norm": 1.0032717462292577, | |
| "learning_rate": 5.089878731440241e-06, | |
| "loss": 0.821, | |
| "mean_token_accuracy": 0.7543939590454102, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.9738826558764024, | |
| "grad_norm": 0.9429172545610519, | |
| "learning_rate": 5.0838722518953816e-06, | |
| "loss": 0.7989, | |
| "mean_token_accuracy": 0.7595749855041504, | |
| "step": 5295 | |
| }, | |
| { | |
| "epoch": 0.9748022806694868, | |
| "grad_norm": 0.9007616401314099, | |
| "learning_rate": 5.078073101019974e-06, | |
| "loss": 0.8083, | |
| "mean_token_accuracy": 0.7579713940620423, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.9757219054625713, | |
| "grad_norm": 0.8990406462252963, | |
| "learning_rate": 5.072481332450857e-06, | |
| "loss": 0.8114, | |
| "mean_token_accuracy": 0.7577333807945251, | |
| "step": 5305 | |
| }, | |
| { | |
| "epoch": 0.9766415302556557, | |
| "grad_norm": 0.9615340254243923, | |
| "learning_rate": 5.067096997906774e-06, | |
| "loss": 0.7715, | |
| "mean_token_accuracy": 0.7705414056777954, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.9775611550487401, | |
| "grad_norm": 0.8455749234692341, | |
| "learning_rate": 5.06192014718789e-06, | |
| "loss": 0.7642, | |
| "mean_token_accuracy": 0.7697661995887757, | |
| "step": 5315 | |
| }, | |
| { | |
| "epoch": 0.9784807798418246, | |
| "grad_norm": 0.9292612449999305, | |
| "learning_rate": 5.05695082817534e-06, | |
| "loss": 0.7789, | |
| "mean_token_accuracy": 0.7671653866767884, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.979400404634909, | |
| "grad_norm": 0.9275056123774931, | |
| "learning_rate": 5.052189086830779e-06, | |
| "loss": 0.8018, | |
| "mean_token_accuracy": 0.7623230576515198, | |
| "step": 5325 | |
| }, | |
| { | |
| "epoch": 0.9803200294279933, | |
| "grad_norm": 0.9703545231339168, | |
| "learning_rate": 5.047634967195952e-06, | |
| "loss": 0.7877, | |
| "mean_token_accuracy": 0.7638481616973877, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.9812396542210778, | |
| "grad_norm": 0.955542417327297, | |
| "learning_rate": 5.043288511392302e-06, | |
| "loss": 0.7891, | |
| "mean_token_accuracy": 0.7614734530448913, | |
| "step": 5335 | |
| }, | |
| { | |
| "epoch": 0.9821592790141622, | |
| "grad_norm": 0.9645172124378145, | |
| "learning_rate": 5.039149759620569e-06, | |
| "loss": 0.7624, | |
| "mean_token_accuracy": 0.7724639177322388, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.9830789038072466, | |
| "grad_norm": 0.9734387825498484, | |
| "learning_rate": 5.0352187501604155e-06, | |
| "loss": 0.8579, | |
| "mean_token_accuracy": 0.746760880947113, | |
| "step": 5345 | |
| }, | |
| { | |
| "epoch": 0.9839985286003311, | |
| "grad_norm": 0.9730228991663388, | |
| "learning_rate": 5.031495519370083e-06, | |
| "loss": 0.8102, | |
| "mean_token_accuracy": 0.758979082107544, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.9849181533934155, | |
| "grad_norm": 1.0013660074202417, | |
| "learning_rate": 5.027980101686053e-06, | |
| "loss": 0.8396, | |
| "mean_token_accuracy": 0.7509408593177795, | |
| "step": 5355 | |
| }, | |
| { | |
| "epoch": 0.9858377781864999, | |
| "grad_norm": 0.9817157587290055, | |
| "learning_rate": 5.024672529622717e-06, | |
| "loss": 0.7935, | |
| "mean_token_accuracy": 0.7596516370773315, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.9867574029795844, | |
| "grad_norm": 0.9800745490721745, | |
| "learning_rate": 5.0215728337720955e-06, | |
| "loss": 0.7491, | |
| "mean_token_accuracy": 0.7768563270568848, | |
| "step": 5365 | |
| }, | |
| { | |
| "epoch": 0.9876770277726687, | |
| "grad_norm": 0.99189390574119, | |
| "learning_rate": 5.018681042803533e-06, | |
| "loss": 0.7759, | |
| "mean_token_accuracy": 0.7670275330543518, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.9885966525657531, | |
| "grad_norm": 0.9673022649880465, | |
| "learning_rate": 5.0159971834634545e-06, | |
| "loss": 0.7867, | |
| "mean_token_accuracy": 0.764349353313446, | |
| "step": 5375 | |
| }, | |
| { | |
| "epoch": 0.9895162773588376, | |
| "grad_norm": 1.0182176113772272, | |
| "learning_rate": 5.013521280575099e-06, | |
| "loss": 0.799, | |
| "mean_token_accuracy": 0.7618956327438354, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.990435902151922, | |
| "grad_norm": 0.9959171759739962, | |
| "learning_rate": 5.011253357038306e-06, | |
| "loss": 0.8392, | |
| "mean_token_accuracy": 0.7527823686599732, | |
| "step": 5385 | |
| }, | |
| { | |
| "epoch": 0.9913555269450064, | |
| "grad_norm": 0.8997528487054468, | |
| "learning_rate": 5.0091934338292915e-06, | |
| "loss": 0.7615, | |
| "mean_token_accuracy": 0.7715205192565918, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.9922751517380909, | |
| "grad_norm": 0.919462849827096, | |
| "learning_rate": 5.00734153000046e-06, | |
| "loss": 0.7409, | |
| "mean_token_accuracy": 0.77668297290802, | |
| "step": 5395 | |
| }, | |
| { | |
| "epoch": 0.9931947765311753, | |
| "grad_norm": 0.984326555402561, | |
| "learning_rate": 5.005697662680227e-06, | |
| "loss": 0.7989, | |
| "mean_token_accuracy": 0.7626922607421875, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.9941144013242597, | |
| "grad_norm": 0.9499542228497883, | |
| "learning_rate": 5.004261847072863e-06, | |
| "loss": 0.8283, | |
| "mean_token_accuracy": 0.7542143225669861, | |
| "step": 5405 | |
| }, | |
| { | |
| "epoch": 0.9950340261173442, | |
| "grad_norm": 0.9585799297597308, | |
| "learning_rate": 5.003034096458347e-06, | |
| "loss": 0.835, | |
| "mean_token_accuracy": 0.7544377326965332, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.9959536509104285, | |
| "grad_norm": 0.9165677599227604, | |
| "learning_rate": 5.0020144221922466e-06, | |
| "loss": 0.8013, | |
| "mean_token_accuracy": 0.7582892417907715, | |
| "step": 5415 | |
| }, | |
| { | |
| "epoch": 0.9968732757035129, | |
| "grad_norm": 0.9449991405622632, | |
| "learning_rate": 5.001202833705621e-06, | |
| "loss": 0.8352, | |
| "mean_token_accuracy": 0.7502840042114258, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.9977929004965974, | |
| "grad_norm": 0.9827477783752422, | |
| "learning_rate": 5.000599338504916e-06, | |
| "loss": 0.7931, | |
| "mean_token_accuracy": 0.762959897518158, | |
| "step": 5425 | |
| }, | |
| { | |
| "epoch": 0.9987125252896818, | |
| "grad_norm": 0.9751233701044131, | |
| "learning_rate": 5.0002039421719105e-06, | |
| "loss": 0.7978, | |
| "mean_token_accuracy": 0.7619426846504211, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.9996321500827662, | |
| "grad_norm": 0.971614941671036, | |
| "learning_rate": 5.000016648363663e-06, | |
| "loss": 0.801, | |
| "mean_token_accuracy": 0.7594120621681213, | |
| "step": 5435 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "mean_token_accuracy": 0.779580146074295, | |
| "step": 5437, | |
| "total_flos": 77442066677760.0, | |
| "train_loss": 0.8871173100675843, | |
| "train_runtime": 5515.7519, | |
| "train_samples_per_second": 15.771, | |
| "train_steps_per_second": 0.986 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 5437, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 77442066677760.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
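
The file above is the `trainer_state.json` state written by the Hugging Face `transformers` `Trainer`: `log_history` holds one record per `logging_steps` interval (here every 5 optimizer steps), and the final record carries the run-level aggregates (`train_loss`, `train_runtime`, samples and steps per second). A minimal sketch of how such a state can be inspected, assuming the JSON is saved locally as `trainer_state.json` and that `matplotlib` is available (both the paths and the plotting choice are assumptions for this example, not part of the file itself):

```python
import json

import matplotlib.pyplot as plt

# Load the serialized Trainer state; the path is an assumption for this sketch.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-interval logging records. The closing epoch-1.0 record
# reports run-level aggregates and has no "loss" key, so filter on that.
logs = [rec for rec in state["log_history"] if "loss" in rec]

steps = [rec["step"] for rec in logs]
loss = [rec["loss"] for rec in logs]
acc = [rec["mean_token_accuracy"] for rec in logs]
lr = [rec["learning_rate"] for rec in logs]

fig, (ax_top, ax_bot) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))

# Loss and token accuracy share a panel since both lie in roughly [0, 1.5].
ax_top.plot(steps, loss, label="train loss")
ax_top.plot(steps, acc, label="mean token accuracy")
ax_top.set_ylabel("loss / accuracy")
ax_top.legend()

# The learning-rate schedule gets its own panel.
ax_bot.plot(steps, lr, color="tab:green")
ax_bot.set_ylabel("learning rate")
ax_bot.set_xlabel("optimizer step")

fig.tight_layout()
fig.savefig("training_curves.png", dpi=150)  # output filename is also assumed
```

Plotted this way, the tail of the log is easy to read: the learning rate flattens toward 5.0e-06 as `step` approaches `max_steps` (5437), which is consistent with a decay schedule that has a minimum-lr floor rather than one annealing to zero.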