{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 37.127628326416016, "learning_rate": 1e-05, "loss": 15.0452, "mean_token_accuracy": 0.41070467978715897, "step": 1 }, { "epoch": 0.016, "grad_norm": 31.609861373901367, "learning_rate": 2e-05, "loss": 14.4012, "mean_token_accuracy": 0.4362717792391777, "step": 2 }, { "epoch": 0.024, "grad_norm": 28.683244705200195, "learning_rate": 3e-05, "loss": 13.963, "mean_token_accuracy": 0.4400983825325966, "step": 3 }, { "epoch": 0.032, "grad_norm": 22.16831398010254, "learning_rate": 4e-05, "loss": 13.3495, "mean_token_accuracy": 0.4725849777460098, "step": 4 }, { "epoch": 0.04, "grad_norm": 19.798992156982422, "learning_rate": 5e-05, "loss": 11.6453, "mean_token_accuracy": 0.5295183658599854, "step": 5 }, { "epoch": 0.048, "grad_norm": 17.516647338867188, "learning_rate": 4.98989898989899e-05, "loss": 11.3724, "mean_token_accuracy": 0.5440531671047211, "step": 6 }, { "epoch": 0.056, "grad_norm": 18.01726531982422, "learning_rate": 4.97979797979798e-05, "loss": 10.463, "mean_token_accuracy": 0.5659109503030777, "step": 7 }, { "epoch": 0.064, "grad_norm": 16.347856521606445, "learning_rate": 4.9696969696969694e-05, "loss": 9.83, "mean_token_accuracy": 0.5818765759468079, "step": 8 }, { "epoch": 0.072, "grad_norm": 17.102680206298828, "learning_rate": 4.9595959595959594e-05, "loss": 9.21, "mean_token_accuracy": 0.6048124134540558, "step": 9 }, { "epoch": 0.08, "grad_norm": 13.707213401794434, "learning_rate": 4.94949494949495e-05, "loss": 9.0634, "mean_token_accuracy": 0.6290689557790756, "step": 10 }, { "epoch": 0.088, "grad_norm": 12.317888259887695, "learning_rate": 4.93939393939394e-05, "loss": 8.7963, "mean_token_accuracy": 0.6293386816978455, "step": 11 }, { "epoch": 0.096, "grad_norm": 12.71199893951416, "learning_rate": 4.92929292929293e-05, "loss": 7.9924, "mean_token_accuracy": 0.6720790416002274, "step": 12 }, { "epoch": 0.104, "grad_norm": 11.422212600708008, "learning_rate": 4.919191919191919e-05, "loss": 8.0788, "mean_token_accuracy": 0.6582284867763519, "step": 13 }, { "epoch": 0.112, "grad_norm": 10.79257869720459, "learning_rate": 4.909090909090909e-05, "loss": 7.5785, "mean_token_accuracy": 0.673900306224823, "step": 14 }, { "epoch": 0.12, "grad_norm": 10.639641761779785, "learning_rate": 4.898989898989899e-05, "loss": 7.3491, "mean_token_accuracy": 0.6823591589927673, "step": 15 }, { "epoch": 0.128, "grad_norm": 11.140637397766113, "learning_rate": 4.888888888888889e-05, "loss": 7.6078, "mean_token_accuracy": 0.6652624905109406, "step": 16 }, { "epoch": 0.136, "grad_norm": 10.391518592834473, "learning_rate": 4.878787878787879e-05, "loss": 7.1951, "mean_token_accuracy": 0.66854228079319, "step": 17 }, { "epoch": 0.144, "grad_norm": 10.393790245056152, "learning_rate": 4.868686868686869e-05, "loss": 7.1549, "mean_token_accuracy": 0.6921204030513763, "step": 18 }, { "epoch": 0.152, "grad_norm": 9.204154014587402, "learning_rate": 4.858585858585859e-05, "loss": 7.3606, "mean_token_accuracy": 0.6926577091217041, "step": 19 }, { "epoch": 0.16, "grad_norm": 10.39026927947998, "learning_rate": 4.848484848484849e-05, "loss": 7.0433, "mean_token_accuracy": 0.6972606927156448, "step": 20 }, { "epoch": 0.168, "grad_norm": 10.690828323364258, "learning_rate": 4.838383838383839e-05, "loss": 7.0197, "mean_token_accuracy": 0.6896309554576874, "step": 21 }, { "epoch": 0.176, "grad_norm": 10.476662635803223, "learning_rate": 4.828282828282829e-05, "loss": 6.2416, "mean_token_accuracy": 0.7330080419778824, "step": 22 }, { "epoch": 0.184, "grad_norm": 9.495307922363281, "learning_rate": 4.8181818181818186e-05, "loss": 6.7721, "mean_token_accuracy": 0.6969246119260788, "step": 23 }, { "epoch": 0.192, "grad_norm": 9.304316520690918, "learning_rate": 4.808080808080808e-05, "loss": 6.4576, "mean_token_accuracy": 0.7197146117687225, "step": 24 }, { "epoch": 0.2, "grad_norm": 9.24569034576416, "learning_rate": 4.797979797979798e-05, "loss": 5.9051, "mean_token_accuracy": 0.7319369614124298, "step": 25 }, { "epoch": 0.208, "grad_norm": 8.99447250366211, "learning_rate": 4.787878787878788e-05, "loss": 6.113, "mean_token_accuracy": 0.7273330986499786, "step": 26 }, { "epoch": 0.216, "grad_norm": 9.245104789733887, "learning_rate": 4.7777777777777784e-05, "loss": 6.5231, "mean_token_accuracy": 0.7232353389263153, "step": 27 }, { "epoch": 0.224, "grad_norm": 9.193451881408691, "learning_rate": 4.7676767676767684e-05, "loss": 6.0893, "mean_token_accuracy": 0.7589968591928482, "step": 28 }, { "epoch": 0.232, "grad_norm": 9.738916397094727, "learning_rate": 4.7575757575757576e-05, "loss": 6.2512, "mean_token_accuracy": 0.7394937723875046, "step": 29 }, { "epoch": 0.24, "grad_norm": 9.301070213317871, "learning_rate": 4.7474747474747476e-05, "loss": 6.4148, "mean_token_accuracy": 0.7273970693349838, "step": 30 }, { "epoch": 0.248, "grad_norm": 9.732291221618652, "learning_rate": 4.7373737373737375e-05, "loss": 6.4581, "mean_token_accuracy": 0.7421613037586212, "step": 31 }, { "epoch": 0.256, "grad_norm": 9.543816566467285, "learning_rate": 4.7272727272727275e-05, "loss": 5.7812, "mean_token_accuracy": 0.7518225610256195, "step": 32 }, { "epoch": 0.264, "grad_norm": 8.442834854125977, "learning_rate": 4.7171717171717174e-05, "loss": 5.7968, "mean_token_accuracy": 0.7630135715007782, "step": 33 }, { "epoch": 0.272, "grad_norm": 7.911731243133545, "learning_rate": 4.7070707070707074e-05, "loss": 5.9594, "mean_token_accuracy": 0.7520367801189423, "step": 34 }, { "epoch": 0.28, "grad_norm": 8.111591339111328, "learning_rate": 4.696969696969697e-05, "loss": 6.1152, "mean_token_accuracy": 0.7495731711387634, "step": 35 }, { "epoch": 0.288, "grad_norm": 7.632225036621094, "learning_rate": 4.686868686868687e-05, "loss": 5.978, "mean_token_accuracy": 0.7462944090366364, "step": 36 }, { "epoch": 0.296, "grad_norm": 7.140631675720215, "learning_rate": 4.676767676767677e-05, "loss": 5.4284, "mean_token_accuracy": 0.7744311541318893, "step": 37 }, { "epoch": 0.304, "grad_norm": 7.504866600036621, "learning_rate": 4.666666666666667e-05, "loss": 5.4771, "mean_token_accuracy": 0.7578010857105255, "step": 38 }, { "epoch": 0.312, "grad_norm": 6.958098411560059, "learning_rate": 4.656565656565657e-05, "loss": 6.0761, "mean_token_accuracy": 0.7286622673273087, "step": 39 }, { "epoch": 0.32, "grad_norm": 7.4256696701049805, "learning_rate": 4.6464646464646464e-05, "loss": 5.9964, "mean_token_accuracy": 0.7376932203769684, "step": 40 }, { "epoch": 0.328, "grad_norm": 6.8756608963012695, "learning_rate": 4.636363636363636e-05, "loss": 5.2819, "mean_token_accuracy": 0.7755442261695862, "step": 41 }, { "epoch": 0.336, "grad_norm": 6.877252101898193, "learning_rate": 4.626262626262626e-05, "loss": 5.6559, "mean_token_accuracy": 0.7578635513782501, "step": 42 }, { "epoch": 0.344, "grad_norm": 7.236316680908203, "learning_rate": 4.616161616161616e-05, "loss": 5.3482, "mean_token_accuracy": 0.758240357041359, "step": 43 }, { "epoch": 0.352, "grad_norm": 7.695366859436035, "learning_rate": 4.606060606060607e-05, "loss": 5.3286, "mean_token_accuracy": 0.769702136516571, "step": 44 }, { "epoch": 0.36, "grad_norm": 7.2449164390563965, "learning_rate": 4.595959595959596e-05, "loss": 5.3446, "mean_token_accuracy": 0.750788614153862, "step": 45 }, { "epoch": 0.368, "grad_norm": 6.902708530426025, "learning_rate": 4.585858585858586e-05, "loss": 4.5337, "mean_token_accuracy": 0.7888868898153305, "step": 46 }, { "epoch": 0.376, "grad_norm": 7.465998649597168, "learning_rate": 4.575757575757576e-05, "loss": 5.2185, "mean_token_accuracy": 0.7631554305553436, "step": 47 }, { "epoch": 0.384, "grad_norm": 6.581245422363281, "learning_rate": 4.565656565656566e-05, "loss": 5.6984, "mean_token_accuracy": 0.7439078986644745, "step": 48 }, { "epoch": 0.392, "grad_norm": 6.623785495758057, "learning_rate": 4.555555555555556e-05, "loss": 5.5339, "mean_token_accuracy": 0.7492197006940842, "step": 49 }, { "epoch": 0.4, "grad_norm": 7.161900997161865, "learning_rate": 4.545454545454546e-05, "loss": 6.0623, "mean_token_accuracy": 0.7385092377662659, "step": 50 }, { "epoch": 0.408, "grad_norm": 7.114354133605957, "learning_rate": 4.535353535353535e-05, "loss": 5.0984, "mean_token_accuracy": 0.7639837712049484, "step": 51 }, { "epoch": 0.416, "grad_norm": 6.865662097930908, "learning_rate": 4.525252525252526e-05, "loss": 4.5515, "mean_token_accuracy": 0.7843715995550156, "step": 52 }, { "epoch": 0.424, "grad_norm": 6.6706109046936035, "learning_rate": 4.515151515151516e-05, "loss": 5.7281, "mean_token_accuracy": 0.7444053590297699, "step": 53 }, { "epoch": 0.432, "grad_norm": 7.425290107727051, "learning_rate": 4.5050505050505056e-05, "loss": 5.5055, "mean_token_accuracy": 0.7495678812265396, "step": 54 }, { "epoch": 0.44, "grad_norm": 7.8452277183532715, "learning_rate": 4.494949494949495e-05, "loss": 5.5757, "mean_token_accuracy": 0.7629143446683884, "step": 55 }, { "epoch": 0.448, "grad_norm": 7.112468242645264, "learning_rate": 4.484848484848485e-05, "loss": 4.6575, "mean_token_accuracy": 0.7812573164701462, "step": 56 }, { "epoch": 0.456, "grad_norm": 6.542463302612305, "learning_rate": 4.474747474747475e-05, "loss": 5.373, "mean_token_accuracy": 0.7609668523073196, "step": 57 }, { "epoch": 0.464, "grad_norm": 6.580011367797852, "learning_rate": 4.464646464646465e-05, "loss": 4.5268, "mean_token_accuracy": 0.7902613431215286, "step": 58 }, { "epoch": 0.472, "grad_norm": 6.447749614715576, "learning_rate": 4.454545454545455e-05, "loss": 4.948, "mean_token_accuracy": 0.7766893953084946, "step": 59 }, { "epoch": 0.48, "grad_norm": 6.1748199462890625, "learning_rate": 4.4444444444444447e-05, "loss": 4.8793, "mean_token_accuracy": 0.78531713783741, "step": 60 }, { "epoch": 0.488, "grad_norm": 7.2349114418029785, "learning_rate": 4.4343434343434346e-05, "loss": 5.4005, "mean_token_accuracy": 0.7713855803012848, "step": 61 }, { "epoch": 0.496, "grad_norm": 7.137668132781982, "learning_rate": 4.4242424242424246e-05, "loss": 4.7612, "mean_token_accuracy": 0.7853883504867554, "step": 62 }, { "epoch": 0.504, "grad_norm": 6.94537353515625, "learning_rate": 4.4141414141414145e-05, "loss": 5.0952, "mean_token_accuracy": 0.7686220556497574, "step": 63 }, { "epoch": 0.512, "grad_norm": 7.088626861572266, "learning_rate": 4.4040404040404044e-05, "loss": 5.107, "mean_token_accuracy": 0.7601823508739471, "step": 64 }, { "epoch": 0.52, "grad_norm": 6.948323726654053, "learning_rate": 4.3939393939393944e-05, "loss": 4.9925, "mean_token_accuracy": 0.7808158993721008, "step": 65 }, { "epoch": 0.528, "grad_norm": 6.151853084564209, "learning_rate": 4.383838383838384e-05, "loss": 4.7631, "mean_token_accuracy": 0.7758619040250778, "step": 66 }, { "epoch": 0.536, "grad_norm": 6.796617031097412, "learning_rate": 4.3737373737373736e-05, "loss": 5.1669, "mean_token_accuracy": 0.7829089462757111, "step": 67 }, { "epoch": 0.544, "grad_norm": 6.727445602416992, "learning_rate": 4.3636363636363636e-05, "loss": 5.9112, "mean_token_accuracy": 0.7398245632648468, "step": 68 }, { "epoch": 0.552, "grad_norm": 6.673349380493164, "learning_rate": 4.3535353535353535e-05, "loss": 5.5631, "mean_token_accuracy": 0.7459984719753265, "step": 69 }, { "epoch": 0.56, "grad_norm": 6.960887432098389, "learning_rate": 4.343434343434344e-05, "loss": 4.8842, "mean_token_accuracy": 0.7681605517864227, "step": 70 }, { "epoch": 0.568, "grad_norm": 6.884949684143066, "learning_rate": 4.3333333333333334e-05, "loss": 4.9392, "mean_token_accuracy": 0.7818175554275513, "step": 71 }, { "epoch": 0.576, "grad_norm": 6.305865287780762, "learning_rate": 4.3232323232323234e-05, "loss": 4.3928, "mean_token_accuracy": 0.7899681925773621, "step": 72 }, { "epoch": 0.584, "grad_norm": 6.652983665466309, "learning_rate": 4.313131313131313e-05, "loss": 5.6515, "mean_token_accuracy": 0.7437300831079483, "step": 73 }, { "epoch": 0.592, "grad_norm": 5.960973262786865, "learning_rate": 4.303030303030303e-05, "loss": 4.2172, "mean_token_accuracy": 0.8126555383205414, "step": 74 }, { "epoch": 0.6, "grad_norm": 6.401238918304443, "learning_rate": 4.292929292929293e-05, "loss": 4.6772, "mean_token_accuracy": 0.7884032130241394, "step": 75 }, { "epoch": 0.608, "grad_norm": 6.02972412109375, "learning_rate": 4.282828282828283e-05, "loss": 4.7759, "mean_token_accuracy": 0.7719396352767944, "step": 76 }, { "epoch": 0.616, "grad_norm": 5.91736364364624, "learning_rate": 4.2727272727272724e-05, "loss": 4.5561, "mean_token_accuracy": 0.8052177727222443, "step": 77 }, { "epoch": 0.624, "grad_norm": 6.276677131652832, "learning_rate": 4.262626262626263e-05, "loss": 4.5458, "mean_token_accuracy": 0.7864794135093689, "step": 78 }, { "epoch": 0.632, "grad_norm": 6.683462142944336, "learning_rate": 4.252525252525253e-05, "loss": 4.7156, "mean_token_accuracy": 0.7799786478281021, "step": 79 }, { "epoch": 0.64, "grad_norm": 6.73935079574585, "learning_rate": 4.242424242424243e-05, "loss": 4.6576, "mean_token_accuracy": 0.7870494276285172, "step": 80 }, { "epoch": 0.648, "grad_norm": 6.009264945983887, "learning_rate": 4.232323232323233e-05, "loss": 4.3781, "mean_token_accuracy": 0.8029303699731827, "step": 81 }, { "epoch": 0.656, "grad_norm": 5.8821797370910645, "learning_rate": 4.222222222222222e-05, "loss": 4.6497, "mean_token_accuracy": 0.7830232828855515, "step": 82 }, { "epoch": 0.664, "grad_norm": 6.2794904708862305, "learning_rate": 4.212121212121212e-05, "loss": 4.563, "mean_token_accuracy": 0.7883824855089188, "step": 83 }, { "epoch": 0.672, "grad_norm": 6.4131364822387695, "learning_rate": 4.202020202020202e-05, "loss": 4.86, "mean_token_accuracy": 0.7777279317378998, "step": 84 }, { "epoch": 0.68, "grad_norm": 6.024682998657227, "learning_rate": 4.191919191919192e-05, "loss": 4.5301, "mean_token_accuracy": 0.8029981702566147, "step": 85 }, { "epoch": 0.688, "grad_norm": 6.400689125061035, "learning_rate": 4.181818181818182e-05, "loss": 4.5095, "mean_token_accuracy": 0.7834673821926117, "step": 86 }, { "epoch": 0.696, "grad_norm": 6.309573173522949, "learning_rate": 4.171717171717172e-05, "loss": 4.9511, "mean_token_accuracy": 0.7793049812316895, "step": 87 }, { "epoch": 0.704, "grad_norm": 6.098147869110107, "learning_rate": 4.161616161616162e-05, "loss": 4.8193, "mean_token_accuracy": 0.7891133576631546, "step": 88 }, { "epoch": 0.712, "grad_norm": 6.305475234985352, "learning_rate": 4.151515151515152e-05, "loss": 5.2867, "mean_token_accuracy": 0.763544350862503, "step": 89 }, { "epoch": 0.72, "grad_norm": 5.927306175231934, "learning_rate": 4.141414141414142e-05, "loss": 4.6925, "mean_token_accuracy": 0.7907081097364426, "step": 90 }, { "epoch": 0.728, "grad_norm": 6.21611213684082, "learning_rate": 4.131313131313132e-05, "loss": 4.1949, "mean_token_accuracy": 0.7960730195045471, "step": 91 }, { "epoch": 0.736, "grad_norm": 5.893104076385498, "learning_rate": 4.1212121212121216e-05, "loss": 4.3789, "mean_token_accuracy": 0.8057213127613068, "step": 92 }, { "epoch": 0.744, "grad_norm": 6.116573333740234, "learning_rate": 4.111111111111111e-05, "loss": 4.7077, "mean_token_accuracy": 0.7898276895284653, "step": 93 }, { "epoch": 0.752, "grad_norm": 5.917790412902832, "learning_rate": 4.101010101010101e-05, "loss": 4.3604, "mean_token_accuracy": 0.7916617542505264, "step": 94 }, { "epoch": 0.76, "grad_norm": 6.027205944061279, "learning_rate": 4.0909090909090915e-05, "loss": 4.3654, "mean_token_accuracy": 0.7899818271398544, "step": 95 }, { "epoch": 0.768, "grad_norm": 6.685177326202393, "learning_rate": 4.0808080808080814e-05, "loss": 4.3956, "mean_token_accuracy": 0.7992375791072845, "step": 96 }, { "epoch": 0.776, "grad_norm": 6.8115081787109375, "learning_rate": 4.070707070707071e-05, "loss": 4.4709, "mean_token_accuracy": 0.7787137180566788, "step": 97 }, { "epoch": 0.784, "grad_norm": 6.106410980224609, "learning_rate": 4.0606060606060606e-05, "loss": 4.2777, "mean_token_accuracy": 0.8080956637859344, "step": 98 }, { "epoch": 0.792, "grad_norm": 6.435486793518066, "learning_rate": 4.0505050505050506e-05, "loss": 4.7062, "mean_token_accuracy": 0.7773082256317139, "step": 99 }, { "epoch": 0.8, "grad_norm": 6.885725498199463, "learning_rate": 4.0404040404040405e-05, "loss": 4.7163, "mean_token_accuracy": 0.7818236798048019, "step": 100 }, { "epoch": 0.808, "grad_norm": 5.755829811096191, "learning_rate": 4.0303030303030305e-05, "loss": 4.4307, "mean_token_accuracy": 0.7950419485569, "step": 101 }, { "epoch": 0.816, "grad_norm": 6.709305763244629, "learning_rate": 4.0202020202020204e-05, "loss": 4.2539, "mean_token_accuracy": 0.8147921562194824, "step": 102 }, { "epoch": 0.824, "grad_norm": 6.947343826293945, "learning_rate": 4.01010101010101e-05, "loss": 4.3526, "mean_token_accuracy": 0.7921736389398575, "step": 103 }, { "epoch": 0.832, "grad_norm": 6.356295108795166, "learning_rate": 4e-05, "loss": 4.8007, "mean_token_accuracy": 0.780563622713089, "step": 104 }, { "epoch": 0.84, "grad_norm": 5.941018104553223, "learning_rate": 3.98989898989899e-05, "loss": 4.3688, "mean_token_accuracy": 0.7866577059030533, "step": 105 }, { "epoch": 0.848, "grad_norm": 6.366528034210205, "learning_rate": 3.97979797979798e-05, "loss": 4.252, "mean_token_accuracy": 0.799225702881813, "step": 106 }, { "epoch": 0.856, "grad_norm": 6.697314739227295, "learning_rate": 3.96969696969697e-05, "loss": 4.8091, "mean_token_accuracy": 0.7797738015651703, "step": 107 }, { "epoch": 0.864, "grad_norm": 6.910213947296143, "learning_rate": 3.9595959595959594e-05, "loss": 4.8069, "mean_token_accuracy": 0.7732720673084259, "step": 108 }, { "epoch": 0.872, "grad_norm": 5.771172523498535, "learning_rate": 3.9494949494949494e-05, "loss": 4.2787, "mean_token_accuracy": 0.7975706905126572, "step": 109 }, { "epoch": 0.88, "grad_norm": 5.44094181060791, "learning_rate": 3.939393939393939e-05, "loss": 4.0384, "mean_token_accuracy": 0.8148421049118042, "step": 110 }, { "epoch": 0.888, "grad_norm": 6.703949451446533, "learning_rate": 3.929292929292929e-05, "loss": 4.7667, "mean_token_accuracy": 0.7871679961681366, "step": 111 }, { "epoch": 0.896, "grad_norm": 6.386756896972656, "learning_rate": 3.91919191919192e-05, "loss": 4.2787, "mean_token_accuracy": 0.7989258021116257, "step": 112 }, { "epoch": 0.904, "grad_norm": 5.399852275848389, "learning_rate": 3.909090909090909e-05, "loss": 3.9899, "mean_token_accuracy": 0.8192960321903229, "step": 113 }, { "epoch": 0.912, "grad_norm": 5.751123905181885, "learning_rate": 3.898989898989899e-05, "loss": 4.292, "mean_token_accuracy": 0.8131812363862991, "step": 114 }, { "epoch": 0.92, "grad_norm": 5.929274559020996, "learning_rate": 3.888888888888889e-05, "loss": 4.5909, "mean_token_accuracy": 0.7861783355474472, "step": 115 }, { "epoch": 0.928, "grad_norm": 6.452223777770996, "learning_rate": 3.878787878787879e-05, "loss": 4.3544, "mean_token_accuracy": 0.7967888861894608, "step": 116 }, { "epoch": 0.936, "grad_norm": 5.868208408355713, "learning_rate": 3.868686868686869e-05, "loss": 4.9455, "mean_token_accuracy": 0.7758602350950241, "step": 117 }, { "epoch": 0.944, "grad_norm": 6.712337493896484, "learning_rate": 3.858585858585859e-05, "loss": 4.275, "mean_token_accuracy": 0.8086762726306915, "step": 118 }, { "epoch": 0.952, "grad_norm": 5.891403675079346, "learning_rate": 3.848484848484848e-05, "loss": 3.9505, "mean_token_accuracy": 0.8316792696714401, "step": 119 }, { "epoch": 0.96, "grad_norm": 6.26602840423584, "learning_rate": 3.838383838383838e-05, "loss": 4.4454, "mean_token_accuracy": 0.7933251559734344, "step": 120 }, { "epoch": 0.968, "grad_norm": 5.930867671966553, "learning_rate": 3.828282828282829e-05, "loss": 4.346, "mean_token_accuracy": 0.7956322878599167, "step": 121 }, { "epoch": 0.976, "grad_norm": 6.317086219787598, "learning_rate": 3.818181818181819e-05, "loss": 3.9196, "mean_token_accuracy": 0.8177185207605362, "step": 122 }, { "epoch": 0.984, "grad_norm": 6.2951178550720215, "learning_rate": 3.8080808080808087e-05, "loss": 4.3412, "mean_token_accuracy": 0.8002329915761948, "step": 123 }, { "epoch": 0.992, "grad_norm": 5.812499523162842, "learning_rate": 3.797979797979798e-05, "loss": 4.2135, "mean_token_accuracy": 0.8055464327335358, "step": 124 }, { "epoch": 1.0, "grad_norm": 6.186211109161377, "learning_rate": 3.787878787878788e-05, "loss": 4.2378, "mean_token_accuracy": 0.7995122522115707, "step": 125 }, { "epoch": 1.008, "grad_norm": 5.81443452835083, "learning_rate": 3.777777777777778e-05, "loss": 4.1608, "mean_token_accuracy": 0.80469611287117, "step": 126 }, { "epoch": 1.016, "grad_norm": 5.983344078063965, "learning_rate": 3.767676767676768e-05, "loss": 3.996, "mean_token_accuracy": 0.7908979803323746, "step": 127 }, { "epoch": 1.024, "grad_norm": 5.305781841278076, "learning_rate": 3.757575757575758e-05, "loss": 4.4283, "mean_token_accuracy": 0.7888950854539871, "step": 128 }, { "epoch": 1.032, "grad_norm": 5.880563259124756, "learning_rate": 3.747474747474748e-05, "loss": 4.2988, "mean_token_accuracy": 0.7900789678096771, "step": 129 }, { "epoch": 1.04, "grad_norm": 5.588067054748535, "learning_rate": 3.7373737373737376e-05, "loss": 3.5971, "mean_token_accuracy": 0.829292356967926, "step": 130 }, { "epoch": 1.048, "grad_norm": 5.570093631744385, "learning_rate": 3.7272727272727276e-05, "loss": 3.9778, "mean_token_accuracy": 0.814909428358078, "step": 131 }, { "epoch": 1.056, "grad_norm": 5.66859245300293, "learning_rate": 3.7171717171717175e-05, "loss": 3.8962, "mean_token_accuracy": 0.8082331418991089, "step": 132 }, { "epoch": 1.064, "grad_norm": 5.667624473571777, "learning_rate": 3.7070707070707075e-05, "loss": 3.3282, "mean_token_accuracy": 0.8463505655527115, "step": 133 }, { "epoch": 1.072, "grad_norm": 5.888784408569336, "learning_rate": 3.6969696969696974e-05, "loss": 4.236, "mean_token_accuracy": 0.7955542504787445, "step": 134 }, { "epoch": 1.08, "grad_norm": 6.280521392822266, "learning_rate": 3.686868686868687e-05, "loss": 3.8402, "mean_token_accuracy": 0.8112962692975998, "step": 135 }, { "epoch": 1.088, "grad_norm": 5.922068119049072, "learning_rate": 3.6767676767676766e-05, "loss": 3.8136, "mean_token_accuracy": 0.8134618252515793, "step": 136 }, { "epoch": 1.096, "grad_norm": 5.7377824783325195, "learning_rate": 3.6666666666666666e-05, "loss": 4.2849, "mean_token_accuracy": 0.7926695197820663, "step": 137 }, { "epoch": 1.104, "grad_norm": 5.895854949951172, "learning_rate": 3.656565656565657e-05, "loss": 3.9685, "mean_token_accuracy": 0.8166492581367493, "step": 138 }, { "epoch": 1.112, "grad_norm": 6.496169567108154, "learning_rate": 3.6464646464646465e-05, "loss": 4.2935, "mean_token_accuracy": 0.798405259847641, "step": 139 }, { "epoch": 1.12, "grad_norm": 5.925403594970703, "learning_rate": 3.6363636363636364e-05, "loss": 3.5502, "mean_token_accuracy": 0.8250329345464706, "step": 140 }, { "epoch": 1.1280000000000001, "grad_norm": 6.757926940917969, "learning_rate": 3.6262626262626264e-05, "loss": 4.0215, "mean_token_accuracy": 0.8052553087472916, "step": 141 }, { "epoch": 1.1360000000000001, "grad_norm": 5.971244812011719, "learning_rate": 3.616161616161616e-05, "loss": 4.1611, "mean_token_accuracy": 0.7966379076242447, "step": 142 }, { "epoch": 1.144, "grad_norm": 6.020002841949463, "learning_rate": 3.606060606060606e-05, "loss": 3.7227, "mean_token_accuracy": 0.8071051388978958, "step": 143 }, { "epoch": 1.152, "grad_norm": 5.251856327056885, "learning_rate": 3.595959595959596e-05, "loss": 3.8428, "mean_token_accuracy": 0.8186961710453033, "step": 144 }, { "epoch": 1.16, "grad_norm": 5.363802909851074, "learning_rate": 3.5858585858585855e-05, "loss": 3.1516, "mean_token_accuracy": 0.8421822488307953, "step": 145 }, { "epoch": 1.168, "grad_norm": 5.687267780303955, "learning_rate": 3.575757575757576e-05, "loss": 3.8707, "mean_token_accuracy": 0.8026652336120605, "step": 146 }, { "epoch": 1.176, "grad_norm": 6.096588134765625, "learning_rate": 3.565656565656566e-05, "loss": 3.6084, "mean_token_accuracy": 0.8240565657615662, "step": 147 }, { "epoch": 1.184, "grad_norm": 7.285717487335205, "learning_rate": 3.555555555555556e-05, "loss": 4.0248, "mean_token_accuracy": 0.8036085069179535, "step": 148 }, { "epoch": 1.192, "grad_norm": 5.360130310058594, "learning_rate": 3.545454545454546e-05, "loss": 3.7427, "mean_token_accuracy": 0.8152587413787842, "step": 149 }, { "epoch": 1.2, "grad_norm": 5.485192775726318, "learning_rate": 3.535353535353535e-05, "loss": 3.4779, "mean_token_accuracy": 0.822798877954483, "step": 150 }, { "epoch": 1.208, "grad_norm": 6.339715003967285, "learning_rate": 3.525252525252525e-05, "loss": 4.64, "mean_token_accuracy": 0.7758079469203949, "step": 151 }, { "epoch": 1.216, "grad_norm": 5.6540422439575195, "learning_rate": 3.515151515151515e-05, "loss": 3.5393, "mean_token_accuracy": 0.8202964663505554, "step": 152 }, { "epoch": 1.224, "grad_norm": 6.171947479248047, "learning_rate": 3.505050505050505e-05, "loss": 3.9077, "mean_token_accuracy": 0.8147079646587372, "step": 153 }, { "epoch": 1.232, "grad_norm": 5.978511810302734, "learning_rate": 3.494949494949495e-05, "loss": 3.9126, "mean_token_accuracy": 0.8115392625331879, "step": 154 }, { "epoch": 1.24, "grad_norm": 6.21269416809082, "learning_rate": 3.484848484848485e-05, "loss": 3.5828, "mean_token_accuracy": 0.8218794912099838, "step": 155 }, { "epoch": 1.248, "grad_norm": 5.694769859313965, "learning_rate": 3.474747474747475e-05, "loss": 3.4313, "mean_token_accuracy": 0.8210793286561966, "step": 156 }, { "epoch": 1.256, "grad_norm": 5.795802116394043, "learning_rate": 3.464646464646465e-05, "loss": 3.601, "mean_token_accuracy": 0.8306063264608383, "step": 157 }, { "epoch": 1.264, "grad_norm": 6.824512004852295, "learning_rate": 3.454545454545455e-05, "loss": 4.2305, "mean_token_accuracy": 0.8084036558866501, "step": 158 }, { "epoch": 1.272, "grad_norm": 5.593602180480957, "learning_rate": 3.444444444444445e-05, "loss": 4.0785, "mean_token_accuracy": 0.8063912093639374, "step": 159 }, { "epoch": 1.28, "grad_norm": 6.345674991607666, "learning_rate": 3.434343434343435e-05, "loss": 3.7281, "mean_token_accuracy": 0.8293210566043854, "step": 160 }, { "epoch": 1.288, "grad_norm": 5.772035598754883, "learning_rate": 3.424242424242424e-05, "loss": 3.9053, "mean_token_accuracy": 0.8084607124328613, "step": 161 }, { "epoch": 1.296, "grad_norm": 5.2538042068481445, "learning_rate": 3.414141414141414e-05, "loss": 3.5784, "mean_token_accuracy": 0.8259472250938416, "step": 162 }, { "epoch": 1.304, "grad_norm": 6.4647016525268555, "learning_rate": 3.4040404040404045e-05, "loss": 3.6876, "mean_token_accuracy": 0.8131130933761597, "step": 163 }, { "epoch": 1.312, "grad_norm": 5.747474670410156, "learning_rate": 3.3939393939393945e-05, "loss": 3.4403, "mean_token_accuracy": 0.8292675763368607, "step": 164 }, { "epoch": 1.32, "grad_norm": 5.909630298614502, "learning_rate": 3.3838383838383844e-05, "loss": 4.1141, "mean_token_accuracy": 0.7911320775747299, "step": 165 }, { "epoch": 1.328, "grad_norm": 5.7181572914123535, "learning_rate": 3.373737373737374e-05, "loss": 3.5109, "mean_token_accuracy": 0.8262276649475098, "step": 166 }, { "epoch": 1.336, "grad_norm": 5.75642728805542, "learning_rate": 3.3636363636363636e-05, "loss": 3.8356, "mean_token_accuracy": 0.8160548806190491, "step": 167 }, { "epoch": 1.3439999999999999, "grad_norm": 5.626729965209961, "learning_rate": 3.3535353535353536e-05, "loss": 3.7526, "mean_token_accuracy": 0.8073613941669464, "step": 168 }, { "epoch": 1.3519999999999999, "grad_norm": 5.47194766998291, "learning_rate": 3.3434343434343435e-05, "loss": 3.3927, "mean_token_accuracy": 0.8345044106245041, "step": 169 }, { "epoch": 1.3599999999999999, "grad_norm": 5.935661792755127, "learning_rate": 3.3333333333333335e-05, "loss": 3.8877, "mean_token_accuracy": 0.8028298169374466, "step": 170 }, { "epoch": 1.3679999999999999, "grad_norm": 6.1031107902526855, "learning_rate": 3.3232323232323234e-05, "loss": 3.159, "mean_token_accuracy": 0.8418579548597336, "step": 171 }, { "epoch": 1.376, "grad_norm": 5.29210090637207, "learning_rate": 3.3131313131313134e-05, "loss": 3.4754, "mean_token_accuracy": 0.8293817788362503, "step": 172 }, { "epoch": 1.384, "grad_norm": 5.36245059967041, "learning_rate": 3.303030303030303e-05, "loss": 3.5582, "mean_token_accuracy": 0.8308709412813187, "step": 173 }, { "epoch": 1.392, "grad_norm": 5.807570457458496, "learning_rate": 3.292929292929293e-05, "loss": 3.6899, "mean_token_accuracy": 0.809835895895958, "step": 174 }, { "epoch": 1.4, "grad_norm": 6.17417573928833, "learning_rate": 3.282828282828283e-05, "loss": 4.438, "mean_token_accuracy": 0.7785096615552902, "step": 175 }, { "epoch": 1.408, "grad_norm": 5.818863868713379, "learning_rate": 3.272727272727273e-05, "loss": 4.1754, "mean_token_accuracy": 0.7955872565507889, "step": 176 }, { "epoch": 1.416, "grad_norm": 7.145393371582031, "learning_rate": 3.2626262626262624e-05, "loss": 3.9973, "mean_token_accuracy": 0.7989909946918488, "step": 177 }, { "epoch": 1.424, "grad_norm": 5.308239459991455, "learning_rate": 3.2525252525252524e-05, "loss": 3.8291, "mean_token_accuracy": 0.8216778337955475, "step": 178 }, { "epoch": 1.432, "grad_norm": 7.332181930541992, "learning_rate": 3.2424242424242423e-05, "loss": 3.9109, "mean_token_accuracy": 0.8071533888578415, "step": 179 }, { "epoch": 1.44, "grad_norm": 5.38962984085083, "learning_rate": 3.232323232323233e-05, "loss": 3.9099, "mean_token_accuracy": 0.8167336881160736, "step": 180 }, { "epoch": 1.448, "grad_norm": 6.646029949188232, "learning_rate": 3.222222222222223e-05, "loss": 4.041, "mean_token_accuracy": 0.8054783195257187, "step": 181 }, { "epoch": 1.456, "grad_norm": 5.449068069458008, "learning_rate": 3.212121212121212e-05, "loss": 3.8969, "mean_token_accuracy": 0.8104897290468216, "step": 182 }, { "epoch": 1.464, "grad_norm": 6.279653072357178, "learning_rate": 3.202020202020202e-05, "loss": 3.8059, "mean_token_accuracy": 0.8105663061141968, "step": 183 }, { "epoch": 1.472, "grad_norm": 6.5596537590026855, "learning_rate": 3.191919191919192e-05, "loss": 4.1534, "mean_token_accuracy": 0.7909954190254211, "step": 184 }, { "epoch": 1.48, "grad_norm": 5.6107354164123535, "learning_rate": 3.181818181818182e-05, "loss": 3.7084, "mean_token_accuracy": 0.8198148310184479, "step": 185 }, { "epoch": 1.488, "grad_norm": 5.935013294219971, "learning_rate": 3.171717171717172e-05, "loss": 3.6008, "mean_token_accuracy": 0.8179962188005447, "step": 186 }, { "epoch": 1.496, "grad_norm": 6.108509540557861, "learning_rate": 3.161616161616161e-05, "loss": 4.2591, "mean_token_accuracy": 0.802385538816452, "step": 187 }, { "epoch": 1.504, "grad_norm": 5.757572650909424, "learning_rate": 3.151515151515151e-05, "loss": 3.1575, "mean_token_accuracy": 0.8446398079395294, "step": 188 }, { "epoch": 1.512, "grad_norm": 5.682815074920654, "learning_rate": 3.141414141414142e-05, "loss": 3.8183, "mean_token_accuracy": 0.8149935752153397, "step": 189 }, { "epoch": 1.52, "grad_norm": 6.077356338500977, "learning_rate": 3.131313131313132e-05, "loss": 3.6911, "mean_token_accuracy": 0.8199481070041656, "step": 190 }, { "epoch": 1.528, "grad_norm": 5.448940277099609, "learning_rate": 3.121212121212122e-05, "loss": 3.5303, "mean_token_accuracy": 0.8298965841531754, "step": 191 }, { "epoch": 1.536, "grad_norm": 5.414105415344238, "learning_rate": 3.111111111111111e-05, "loss": 3.8164, "mean_token_accuracy": 0.8245872855186462, "step": 192 }, { "epoch": 1.544, "grad_norm": 5.119867324829102, "learning_rate": 3.101010101010101e-05, "loss": 3.3991, "mean_token_accuracy": 0.8331074118614197, "step": 193 }, { "epoch": 1.552, "grad_norm": 5.911476135253906, "learning_rate": 3.090909090909091e-05, "loss": 3.6991, "mean_token_accuracy": 0.8288981467485428, "step": 194 }, { "epoch": 1.56, "grad_norm": 6.588876247406006, "learning_rate": 3.080808080808081e-05, "loss": 3.9679, "mean_token_accuracy": 0.8007670193910599, "step": 195 }, { "epoch": 1.568, "grad_norm": 6.243876934051514, "learning_rate": 3.070707070707071e-05, "loss": 3.2798, "mean_token_accuracy": 0.8313741534948349, "step": 196 }, { "epoch": 1.576, "grad_norm": 5.5765180587768555, "learning_rate": 3.060606060606061e-05, "loss": 3.5246, "mean_token_accuracy": 0.8275876641273499, "step": 197 }, { "epoch": 1.584, "grad_norm": 5.727989673614502, "learning_rate": 3.050505050505051e-05, "loss": 3.577, "mean_token_accuracy": 0.8309095203876495, "step": 198 }, { "epoch": 1.592, "grad_norm": 6.035944938659668, "learning_rate": 3.0404040404040406e-05, "loss": 3.9191, "mean_token_accuracy": 0.8098112493753433, "step": 199 }, { "epoch": 1.6, "grad_norm": 6.832411289215088, "learning_rate": 3.0303030303030306e-05, "loss": 4.0711, "mean_token_accuracy": 0.7989842146635056, "step": 200 }, { "epoch": 1.608, "grad_norm": 6.298995494842529, "learning_rate": 3.0202020202020205e-05, "loss": 3.8327, "mean_token_accuracy": 0.8034229874610901, "step": 201 }, { "epoch": 1.616, "grad_norm": 6.169890880584717, "learning_rate": 3.01010101010101e-05, "loss": 3.4308, "mean_token_accuracy": 0.8316022306680679, "step": 202 }, { "epoch": 1.624, "grad_norm": 6.228781700134277, "learning_rate": 3e-05, "loss": 4.0449, "mean_token_accuracy": 0.8119410276412964, "step": 203 }, { "epoch": 1.6320000000000001, "grad_norm": 5.267385959625244, "learning_rate": 2.98989898989899e-05, "loss": 3.3364, "mean_token_accuracy": 0.8189917951822281, "step": 204 }, { "epoch": 1.6400000000000001, "grad_norm": 6.143648624420166, "learning_rate": 2.9797979797979796e-05, "loss": 3.6751, "mean_token_accuracy": 0.8097221404314041, "step": 205 }, { "epoch": 1.6480000000000001, "grad_norm": 5.887429714202881, "learning_rate": 2.96969696969697e-05, "loss": 3.2366, "mean_token_accuracy": 0.8330790549516678, "step": 206 }, { "epoch": 1.6560000000000001, "grad_norm": 6.628138065338135, "learning_rate": 2.95959595959596e-05, "loss": 3.4484, "mean_token_accuracy": 0.825881227850914, "step": 207 }, { "epoch": 1.6640000000000001, "grad_norm": 6.397181034088135, "learning_rate": 2.9494949494949498e-05, "loss": 4.1198, "mean_token_accuracy": 0.801177367568016, "step": 208 }, { "epoch": 1.6720000000000002, "grad_norm": 6.744088649749756, "learning_rate": 2.9393939393939394e-05, "loss": 3.7321, "mean_token_accuracy": 0.8189053982496262, "step": 209 }, { "epoch": 1.6800000000000002, "grad_norm": 6.219143390655518, "learning_rate": 2.9292929292929294e-05, "loss": 3.8731, "mean_token_accuracy": 0.807484045624733, "step": 210 }, { "epoch": 1.688, "grad_norm": 5.3475751876831055, "learning_rate": 2.9191919191919193e-05, "loss": 3.3026, "mean_token_accuracy": 0.8284660577774048, "step": 211 }, { "epoch": 1.696, "grad_norm": 6.080757141113281, "learning_rate": 2.909090909090909e-05, "loss": 3.9731, "mean_token_accuracy": 0.8116171061992645, "step": 212 }, { "epoch": 1.704, "grad_norm": 5.819918155670166, "learning_rate": 2.898989898989899e-05, "loss": 3.146, "mean_token_accuracy": 0.83076611161232, "step": 213 }, { "epoch": 1.712, "grad_norm": 6.442020416259766, "learning_rate": 2.8888888888888888e-05, "loss": 3.1441, "mean_token_accuracy": 0.834155797958374, "step": 214 }, { "epoch": 1.72, "grad_norm": 6.770529270172119, "learning_rate": 2.878787878787879e-05, "loss": 3.701, "mean_token_accuracy": 0.8147666454315186, "step": 215 }, { "epoch": 1.728, "grad_norm": 6.345605373382568, "learning_rate": 2.868686868686869e-05, "loss": 3.4011, "mean_token_accuracy": 0.8231780230998993, "step": 216 }, { "epoch": 1.736, "grad_norm": 6.269248008728027, "learning_rate": 2.8585858585858587e-05, "loss": 3.3734, "mean_token_accuracy": 0.8338766843080521, "step": 217 }, { "epoch": 1.744, "grad_norm": 6.861104965209961, "learning_rate": 2.8484848484848486e-05, "loss": 3.6933, "mean_token_accuracy": 0.8223045021295547, "step": 218 }, { "epoch": 1.752, "grad_norm": 5.622055530548096, "learning_rate": 2.8383838383838386e-05, "loss": 3.6327, "mean_token_accuracy": 0.8273986279964447, "step": 219 }, { "epoch": 1.76, "grad_norm": 6.05256462097168, "learning_rate": 2.8282828282828282e-05, "loss": 3.9795, "mean_token_accuracy": 0.8149670958518982, "step": 220 }, { "epoch": 1.768, "grad_norm": 6.203214645385742, "learning_rate": 2.818181818181818e-05, "loss": 3.8234, "mean_token_accuracy": 0.8010188341140747, "step": 221 }, { "epoch": 1.776, "grad_norm": 5.9735798835754395, "learning_rate": 2.808080808080808e-05, "loss": 3.6441, "mean_token_accuracy": 0.8297218382358551, "step": 222 }, { "epoch": 1.784, "grad_norm": 5.534307956695557, "learning_rate": 2.7979797979797984e-05, "loss": 3.9049, "mean_token_accuracy": 0.808393806219101, "step": 223 }, { "epoch": 1.792, "grad_norm": 5.976230144500732, "learning_rate": 2.7878787878787883e-05, "loss": 3.1354, "mean_token_accuracy": 0.8418057858943939, "step": 224 }, { "epoch": 1.8, "grad_norm": 6.141560077667236, "learning_rate": 2.777777777777778e-05, "loss": 3.9247, "mean_token_accuracy": 0.8154790848493576, "step": 225 }, { "epoch": 1.808, "grad_norm": 6.0011773109436035, "learning_rate": 2.767676767676768e-05, "loss": 3.9032, "mean_token_accuracy": 0.8127383142709732, "step": 226 }, { "epoch": 1.8159999999999998, "grad_norm": 6.12231969833374, "learning_rate": 2.7575757575757578e-05, "loss": 3.1854, "mean_token_accuracy": 0.8354455977678299, "step": 227 }, { "epoch": 1.8239999999999998, "grad_norm": 5.492841720581055, "learning_rate": 2.7474747474747474e-05, "loss": 3.2232, "mean_token_accuracy": 0.8330144584178925, "step": 228 }, { "epoch": 1.8319999999999999, "grad_norm": 5.566941738128662, "learning_rate": 2.7373737373737374e-05, "loss": 3.4351, "mean_token_accuracy": 0.8365740329027176, "step": 229 }, { "epoch": 1.8399999999999999, "grad_norm": 6.055516719818115, "learning_rate": 2.7272727272727273e-05, "loss": 3.5741, "mean_token_accuracy": 0.819349929690361, "step": 230 }, { "epoch": 1.8479999999999999, "grad_norm": 5.578907489776611, "learning_rate": 2.717171717171717e-05, "loss": 3.4948, "mean_token_accuracy": 0.8307986855506897, "step": 231 }, { "epoch": 1.8559999999999999, "grad_norm": 6.451387882232666, "learning_rate": 2.7070707070707075e-05, "loss": 3.7951, "mean_token_accuracy": 0.8153550177812576, "step": 232 }, { "epoch": 1.8639999999999999, "grad_norm": 6.489166736602783, "learning_rate": 2.696969696969697e-05, "loss": 4.1551, "mean_token_accuracy": 0.8079245835542679, "step": 233 }, { "epoch": 1.8719999999999999, "grad_norm": 6.350606441497803, "learning_rate": 2.686868686868687e-05, "loss": 4.2089, "mean_token_accuracy": 0.7974926680326462, "step": 234 }, { "epoch": 1.88, "grad_norm": 6.2156877517700195, "learning_rate": 2.676767676767677e-05, "loss": 3.4177, "mean_token_accuracy": 0.8269297033548355, "step": 235 }, { "epoch": 1.888, "grad_norm": 6.369142532348633, "learning_rate": 2.6666666666666667e-05, "loss": 3.7993, "mean_token_accuracy": 0.8191967755556107, "step": 236 }, { "epoch": 1.896, "grad_norm": 5.774569988250732, "learning_rate": 2.6565656565656566e-05, "loss": 3.7034, "mean_token_accuracy": 0.8201487958431244, "step": 237 }, { "epoch": 1.904, "grad_norm": 7.003572940826416, "learning_rate": 2.6464646464646466e-05, "loss": 3.7289, "mean_token_accuracy": 0.8157116621732712, "step": 238 }, { "epoch": 1.912, "grad_norm": 5.5132036209106445, "learning_rate": 2.636363636363636e-05, "loss": 3.0822, "mean_token_accuracy": 0.8483001440763474, "step": 239 }, { "epoch": 1.92, "grad_norm": 5.702081680297852, "learning_rate": 2.6262626262626268e-05, "loss": 3.3978, "mean_token_accuracy": 0.8383439779281616, "step": 240 }, { "epoch": 1.928, "grad_norm": 5.952939987182617, "learning_rate": 2.6161616161616164e-05, "loss": 3.7917, "mean_token_accuracy": 0.8186378180980682, "step": 241 }, { "epoch": 1.936, "grad_norm": 5.806432247161865, "learning_rate": 2.6060606060606063e-05, "loss": 3.3405, "mean_token_accuracy": 0.8411876261234283, "step": 242 }, { "epoch": 1.944, "grad_norm": 5.565011501312256, "learning_rate": 2.5959595959595963e-05, "loss": 3.7156, "mean_token_accuracy": 0.8047986626625061, "step": 243 }, { "epoch": 1.952, "grad_norm": 6.6874494552612305, "learning_rate": 2.585858585858586e-05, "loss": 4.1427, "mean_token_accuracy": 0.7969915717840195, "step": 244 }, { "epoch": 1.96, "grad_norm": 6.274991989135742, "learning_rate": 2.575757575757576e-05, "loss": 3.5603, "mean_token_accuracy": 0.8269449025392532, "step": 245 }, { "epoch": 1.968, "grad_norm": 5.758999824523926, "learning_rate": 2.5656565656565658e-05, "loss": 2.9123, "mean_token_accuracy": 0.8435051888227463, "step": 246 }, { "epoch": 1.976, "grad_norm": 6.024221420288086, "learning_rate": 2.5555555555555554e-05, "loss": 3.9897, "mean_token_accuracy": 0.8267959505319595, "step": 247 }, { "epoch": 1.984, "grad_norm": 5.789820671081543, "learning_rate": 2.5454545454545454e-05, "loss": 3.3965, "mean_token_accuracy": 0.8389037996530533, "step": 248 }, { "epoch": 1.992, "grad_norm": 5.86129903793335, "learning_rate": 2.5353535353535356e-05, "loss": 3.9343, "mean_token_accuracy": 0.8126579076051712, "step": 249 }, { "epoch": 2.0, "grad_norm": 6.172614097595215, "learning_rate": 2.5252525252525256e-05, "loss": 3.2032, "mean_token_accuracy": 0.8359029293060303, "step": 250 }, { "epoch": 2.008, "grad_norm": 6.1252899169921875, "learning_rate": 2.5151515151515155e-05, "loss": 3.006, "mean_token_accuracy": 0.8488074690103531, "step": 251 }, { "epoch": 2.016, "grad_norm": 5.982184410095215, "learning_rate": 2.505050505050505e-05, "loss": 3.2548, "mean_token_accuracy": 0.8324020653963089, "step": 252 }, { "epoch": 2.024, "grad_norm": 5.628816604614258, "learning_rate": 2.494949494949495e-05, "loss": 3.321, "mean_token_accuracy": 0.8334167748689651, "step": 253 }, { "epoch": 2.032, "grad_norm": 5.734603404998779, "learning_rate": 2.4848484848484847e-05, "loss": 2.9638, "mean_token_accuracy": 0.8483208119869232, "step": 254 }, { "epoch": 2.04, "grad_norm": 5.817469120025635, "learning_rate": 2.474747474747475e-05, "loss": 3.0251, "mean_token_accuracy": 0.837944746017456, "step": 255 }, { "epoch": 2.048, "grad_norm": 5.197110652923584, "learning_rate": 2.464646464646465e-05, "loss": 3.1505, "mean_token_accuracy": 0.8443950265645981, "step": 256 }, { "epoch": 2.056, "grad_norm": 5.90143346786499, "learning_rate": 2.4545454545454545e-05, "loss": 3.456, "mean_token_accuracy": 0.8270305395126343, "step": 257 }, { "epoch": 2.064, "grad_norm": 6.32747745513916, "learning_rate": 2.4444444444444445e-05, "loss": 3.0387, "mean_token_accuracy": 0.8455974459648132, "step": 258 }, { "epoch": 2.072, "grad_norm": 5.210789203643799, "learning_rate": 2.4343434343434344e-05, "loss": 3.1112, "mean_token_accuracy": 0.847496286034584, "step": 259 }, { "epoch": 2.08, "grad_norm": 5.2880167961120605, "learning_rate": 2.4242424242424244e-05, "loss": 2.7432, "mean_token_accuracy": 0.8593413680791855, "step": 260 }, { "epoch": 2.088, "grad_norm": 5.868934154510498, "learning_rate": 2.4141414141414143e-05, "loss": 3.1459, "mean_token_accuracy": 0.8433663100004196, "step": 261 }, { "epoch": 2.096, "grad_norm": 5.824329376220703, "learning_rate": 2.404040404040404e-05, "loss": 3.5007, "mean_token_accuracy": 0.8235083371400833, "step": 262 }, { "epoch": 2.104, "grad_norm": 5.528572082519531, "learning_rate": 2.393939393939394e-05, "loss": 2.854, "mean_token_accuracy": 0.8565276563167572, "step": 263 }, { "epoch": 2.112, "grad_norm": 6.147289752960205, "learning_rate": 2.3838383838383842e-05, "loss": 2.9618, "mean_token_accuracy": 0.8526737242937088, "step": 264 }, { "epoch": 2.12, "grad_norm": 5.65183687210083, "learning_rate": 2.3737373737373738e-05, "loss": 3.0466, "mean_token_accuracy": 0.8418086171150208, "step": 265 }, { "epoch": 2.128, "grad_norm": 6.315326690673828, "learning_rate": 2.3636363636363637e-05, "loss": 3.4835, "mean_token_accuracy": 0.8310810178518295, "step": 266 }, { "epoch": 2.136, "grad_norm": 5.648726463317871, "learning_rate": 2.3535353535353537e-05, "loss": 3.6856, "mean_token_accuracy": 0.8161235004663467, "step": 267 }, { "epoch": 2.144, "grad_norm": 5.972370624542236, "learning_rate": 2.3434343434343436e-05, "loss": 3.0239, "mean_token_accuracy": 0.8378438502550125, "step": 268 }, { "epoch": 2.152, "grad_norm": 5.596478462219238, "learning_rate": 2.3333333333333336e-05, "loss": 3.0652, "mean_token_accuracy": 0.8487441837787628, "step": 269 }, { "epoch": 2.16, "grad_norm": 6.5623908042907715, "learning_rate": 2.3232323232323232e-05, "loss": 3.1893, "mean_token_accuracy": 0.834345743060112, "step": 270 }, { "epoch": 2.168, "grad_norm": 6.067134380340576, "learning_rate": 2.313131313131313e-05, "loss": 3.0515, "mean_token_accuracy": 0.8387996703386307, "step": 271 }, { "epoch": 2.176, "grad_norm": 6.862771987915039, "learning_rate": 2.3030303030303034e-05, "loss": 3.2861, "mean_token_accuracy": 0.8361453711986542, "step": 272 }, { "epoch": 2.184, "grad_norm": 6.166702747344971, "learning_rate": 2.292929292929293e-05, "loss": 3.6596, "mean_token_accuracy": 0.813729852437973, "step": 273 }, { "epoch": 2.192, "grad_norm": 5.386834621429443, "learning_rate": 2.282828282828283e-05, "loss": 3.4533, "mean_token_accuracy": 0.8274287581443787, "step": 274 }, { "epoch": 2.2, "grad_norm": 5.261857986450195, "learning_rate": 2.272727272727273e-05, "loss": 2.7643, "mean_token_accuracy": 0.8519167453050613, "step": 275 }, { "epoch": 2.208, "grad_norm": 5.241461753845215, "learning_rate": 2.262626262626263e-05, "loss": 3.051, "mean_token_accuracy": 0.8460838198661804, "step": 276 }, { "epoch": 2.216, "grad_norm": 5.825733184814453, "learning_rate": 2.2525252525252528e-05, "loss": 3.2424, "mean_token_accuracy": 0.8403096050024033, "step": 277 }, { "epoch": 2.224, "grad_norm": 5.554264545440674, "learning_rate": 2.2424242424242424e-05, "loss": 3.1676, "mean_token_accuracy": 0.8293324261903763, "step": 278 }, { "epoch": 2.232, "grad_norm": 5.81028413772583, "learning_rate": 2.2323232323232324e-05, "loss": 3.184, "mean_token_accuracy": 0.8394870609045029, "step": 279 }, { "epoch": 2.24, "grad_norm": 6.055649280548096, "learning_rate": 2.2222222222222223e-05, "loss": 3.0117, "mean_token_accuracy": 0.850115180015564, "step": 280 }, { "epoch": 2.248, "grad_norm": 5.307283401489258, "learning_rate": 2.2121212121212123e-05, "loss": 2.9076, "mean_token_accuracy": 0.8414607346057892, "step": 281 }, { "epoch": 2.2560000000000002, "grad_norm": 5.909352779388428, "learning_rate": 2.2020202020202022e-05, "loss": 3.4122, "mean_token_accuracy": 0.8330782055854797, "step": 282 }, { "epoch": 2.2640000000000002, "grad_norm": 5.730319499969482, "learning_rate": 2.191919191919192e-05, "loss": 3.2416, "mean_token_accuracy": 0.83427894115448, "step": 283 }, { "epoch": 2.2720000000000002, "grad_norm": 6.638869762420654, "learning_rate": 2.1818181818181818e-05, "loss": 2.8864, "mean_token_accuracy": 0.8467890173196793, "step": 284 }, { "epoch": 2.2800000000000002, "grad_norm": 6.180927753448486, "learning_rate": 2.171717171717172e-05, "loss": 3.3418, "mean_token_accuracy": 0.8295275717973709, "step": 285 }, { "epoch": 2.288, "grad_norm": 5.50770902633667, "learning_rate": 2.1616161616161617e-05, "loss": 3.3557, "mean_token_accuracy": 0.8279571235179901, "step": 286 }, { "epoch": 2.296, "grad_norm": 6.085552215576172, "learning_rate": 2.1515151515151516e-05, "loss": 2.7873, "mean_token_accuracy": 0.8584895879030228, "step": 287 }, { "epoch": 2.304, "grad_norm": 5.311261177062988, "learning_rate": 2.1414141414141416e-05, "loss": 2.786, "mean_token_accuracy": 0.8529232293367386, "step": 288 }, { "epoch": 2.312, "grad_norm": 6.482555389404297, "learning_rate": 2.1313131313131315e-05, "loss": 3.3625, "mean_token_accuracy": 0.8343013226985931, "step": 289 }, { "epoch": 2.32, "grad_norm": 5.101843357086182, "learning_rate": 2.1212121212121215e-05, "loss": 3.0018, "mean_token_accuracy": 0.8565692156553268, "step": 290 }, { "epoch": 2.328, "grad_norm": 6.496617317199707, "learning_rate": 2.111111111111111e-05, "loss": 3.5081, "mean_token_accuracy": 0.8281321227550507, "step": 291 }, { "epoch": 2.336, "grad_norm": 6.459492206573486, "learning_rate": 2.101010101010101e-05, "loss": 3.2684, "mean_token_accuracy": 0.8464506566524506, "step": 292 }, { "epoch": 2.344, "grad_norm": 6.296543121337891, "learning_rate": 2.090909090909091e-05, "loss": 2.9787, "mean_token_accuracy": 0.8391922265291214, "step": 293 }, { "epoch": 2.352, "grad_norm": 5.939156532287598, "learning_rate": 2.080808080808081e-05, "loss": 3.2943, "mean_token_accuracy": 0.8347084373235703, "step": 294 }, { "epoch": 2.36, "grad_norm": 5.988732814788818, "learning_rate": 2.070707070707071e-05, "loss": 3.1281, "mean_token_accuracy": 0.8483212292194366, "step": 295 }, { "epoch": 2.368, "grad_norm": 7.110536098480225, "learning_rate": 2.0606060606060608e-05, "loss": 3.1449, "mean_token_accuracy": 0.8350488543510437, "step": 296 }, { "epoch": 2.376, "grad_norm": 6.519949436187744, "learning_rate": 2.0505050505050504e-05, "loss": 2.6449, "mean_token_accuracy": 0.8593619167804718, "step": 297 }, { "epoch": 2.384, "grad_norm": 5.815298557281494, "learning_rate": 2.0404040404040407e-05, "loss": 3.0911, "mean_token_accuracy": 0.837956115603447, "step": 298 }, { "epoch": 2.392, "grad_norm": 5.741540908813477, "learning_rate": 2.0303030303030303e-05, "loss": 3.1501, "mean_token_accuracy": 0.8409707248210907, "step": 299 }, { "epoch": 2.4, "grad_norm": 5.991777420043945, "learning_rate": 2.0202020202020203e-05, "loss": 2.9707, "mean_token_accuracy": 0.858299732208252, "step": 300 }, { "epoch": 2.408, "grad_norm": 6.3398847579956055, "learning_rate": 2.0101010101010102e-05, "loss": 3.4141, "mean_token_accuracy": 0.830435261130333, "step": 301 }, { "epoch": 2.416, "grad_norm": 6.837295055389404, "learning_rate": 2e-05, "loss": 3.3923, "mean_token_accuracy": 0.8256559520959854, "step": 302 }, { "epoch": 2.424, "grad_norm": 5.505060195922852, "learning_rate": 1.98989898989899e-05, "loss": 3.0673, "mean_token_accuracy": 0.8478615581989288, "step": 303 }, { "epoch": 2.432, "grad_norm": 6.913103103637695, "learning_rate": 1.9797979797979797e-05, "loss": 3.5381, "mean_token_accuracy": 0.8223972916603088, "step": 304 }, { "epoch": 2.44, "grad_norm": 6.902682304382324, "learning_rate": 1.9696969696969697e-05, "loss": 3.598, "mean_token_accuracy": 0.8203122019767761, "step": 305 }, { "epoch": 2.448, "grad_norm": 5.36390495300293, "learning_rate": 1.95959595959596e-05, "loss": 3.1101, "mean_token_accuracy": 0.8461343050003052, "step": 306 }, { "epoch": 2.456, "grad_norm": 6.57292366027832, "learning_rate": 1.9494949494949496e-05, "loss": 3.6492, "mean_token_accuracy": 0.8245173096656799, "step": 307 }, { "epoch": 2.464, "grad_norm": 5.893022537231445, "learning_rate": 1.9393939393939395e-05, "loss": 2.7866, "mean_token_accuracy": 0.8550339192152023, "step": 308 }, { "epoch": 2.472, "grad_norm": 5.711099624633789, "learning_rate": 1.9292929292929295e-05, "loss": 3.0009, "mean_token_accuracy": 0.8481545150279999, "step": 309 }, { "epoch": 2.48, "grad_norm": 6.395712375640869, "learning_rate": 1.919191919191919e-05, "loss": 2.8936, "mean_token_accuracy": 0.8416745364665985, "step": 310 }, { "epoch": 2.488, "grad_norm": 5.5052289962768555, "learning_rate": 1.9090909090909094e-05, "loss": 2.9872, "mean_token_accuracy": 0.8463147729635239, "step": 311 }, { "epoch": 2.496, "grad_norm": 6.273165702819824, "learning_rate": 1.898989898989899e-05, "loss": 2.7584, "mean_token_accuracy": 0.8467618376016617, "step": 312 }, { "epoch": 2.504, "grad_norm": 5.292394161224365, "learning_rate": 1.888888888888889e-05, "loss": 2.9572, "mean_token_accuracy": 0.8431327790021896, "step": 313 }, { "epoch": 2.512, "grad_norm": 5.991566181182861, "learning_rate": 1.878787878787879e-05, "loss": 2.9398, "mean_token_accuracy": 0.8492739796638489, "step": 314 }, { "epoch": 2.52, "grad_norm": 5.635786533355713, "learning_rate": 1.8686868686868688e-05, "loss": 3.1451, "mean_token_accuracy": 0.854804664850235, "step": 315 }, { "epoch": 2.528, "grad_norm": 6.714171886444092, "learning_rate": 1.8585858585858588e-05, "loss": 3.0965, "mean_token_accuracy": 0.8423555940389633, "step": 316 }, { "epoch": 2.536, "grad_norm": 5.751944065093994, "learning_rate": 1.8484848484848487e-05, "loss": 3.0769, "mean_token_accuracy": 0.8551096171140671, "step": 317 }, { "epoch": 2.544, "grad_norm": 6.406241416931152, "learning_rate": 1.8383838383838383e-05, "loss": 2.8228, "mean_token_accuracy": 0.861114576458931, "step": 318 }, { "epoch": 2.552, "grad_norm": 6.585404396057129, "learning_rate": 1.8282828282828286e-05, "loss": 3.2246, "mean_token_accuracy": 0.8409701138734818, "step": 319 }, { "epoch": 2.56, "grad_norm": 6.6673264503479, "learning_rate": 1.8181818181818182e-05, "loss": 3.0054, "mean_token_accuracy": 0.8525452762842178, "step": 320 }, { "epoch": 2.568, "grad_norm": 6.316901206970215, "learning_rate": 1.808080808080808e-05, "loss": 3.4642, "mean_token_accuracy": 0.8289849609136581, "step": 321 }, { "epoch": 2.576, "grad_norm": 7.191961288452148, "learning_rate": 1.797979797979798e-05, "loss": 3.276, "mean_token_accuracy": 0.8262393325567245, "step": 322 }, { "epoch": 2.584, "grad_norm": 5.743154525756836, "learning_rate": 1.787878787878788e-05, "loss": 2.6852, "mean_token_accuracy": 0.8636340796947479, "step": 323 }, { "epoch": 2.592, "grad_norm": 5.567448616027832, "learning_rate": 1.777777777777778e-05, "loss": 3.0029, "mean_token_accuracy": 0.8480563163757324, "step": 324 }, { "epoch": 2.6, "grad_norm": 6.558039665222168, "learning_rate": 1.7676767676767676e-05, "loss": 2.8954, "mean_token_accuracy": 0.8490820229053497, "step": 325 }, { "epoch": 2.608, "grad_norm": 6.237626075744629, "learning_rate": 1.7575757575757576e-05, "loss": 3.2673, "mean_token_accuracy": 0.8418715596199036, "step": 326 }, { "epoch": 2.616, "grad_norm": 6.388242721557617, "learning_rate": 1.7474747474747475e-05, "loss": 3.5492, "mean_token_accuracy": 0.8205874115228653, "step": 327 }, { "epoch": 2.624, "grad_norm": 6.261191368103027, "learning_rate": 1.7373737373737375e-05, "loss": 3.784, "mean_token_accuracy": 0.8178833723068237, "step": 328 }, { "epoch": 2.632, "grad_norm": 5.890114784240723, "learning_rate": 1.7272727272727274e-05, "loss": 2.7545, "mean_token_accuracy": 0.8488318920135498, "step": 329 }, { "epoch": 2.64, "grad_norm": 6.187424182891846, "learning_rate": 1.7171717171717173e-05, "loss": 3.5939, "mean_token_accuracy": 0.827822282910347, "step": 330 }, { "epoch": 2.648, "grad_norm": 6.662416458129883, "learning_rate": 1.707070707070707e-05, "loss": 3.2883, "mean_token_accuracy": 0.8301158398389816, "step": 331 }, { "epoch": 2.656, "grad_norm": 6.358303546905518, "learning_rate": 1.6969696969696972e-05, "loss": 3.1986, "mean_token_accuracy": 0.8457056134939194, "step": 332 }, { "epoch": 2.664, "grad_norm": 5.967710971832275, "learning_rate": 1.686868686868687e-05, "loss": 2.9175, "mean_token_accuracy": 0.8557797521352768, "step": 333 }, { "epoch": 2.672, "grad_norm": 5.508559703826904, "learning_rate": 1.6767676767676768e-05, "loss": 3.1204, "mean_token_accuracy": 0.8435854762792587, "step": 334 }, { "epoch": 2.68, "grad_norm": 6.057641506195068, "learning_rate": 1.6666666666666667e-05, "loss": 3.5826, "mean_token_accuracy": 0.8250070810317993, "step": 335 }, { "epoch": 2.6879999999999997, "grad_norm": 5.788311004638672, "learning_rate": 1.6565656565656567e-05, "loss": 2.9172, "mean_token_accuracy": 0.8496552407741547, "step": 336 }, { "epoch": 2.6959999999999997, "grad_norm": 7.819416522979736, "learning_rate": 1.6464646464646466e-05, "loss": 3.1982, "mean_token_accuracy": 0.8477791100740433, "step": 337 }, { "epoch": 2.7039999999999997, "grad_norm": 6.290533065795898, "learning_rate": 1.6363636363636366e-05, "loss": 3.289, "mean_token_accuracy": 0.8374428898096085, "step": 338 }, { "epoch": 2.7119999999999997, "grad_norm": 6.164359092712402, "learning_rate": 1.6262626262626262e-05, "loss": 3.0955, "mean_token_accuracy": 0.8332884609699249, "step": 339 }, { "epoch": 2.7199999999999998, "grad_norm": 6.6753339767456055, "learning_rate": 1.6161616161616165e-05, "loss": 2.9626, "mean_token_accuracy": 0.8437196165323257, "step": 340 }, { "epoch": 2.7279999999999998, "grad_norm": 5.542717933654785, "learning_rate": 1.606060606060606e-05, "loss": 3.2322, "mean_token_accuracy": 0.8421852588653564, "step": 341 }, { "epoch": 2.7359999999999998, "grad_norm": 5.837277889251709, "learning_rate": 1.595959595959596e-05, "loss": 3.1454, "mean_token_accuracy": 0.8469538539648056, "step": 342 }, { "epoch": 2.7439999999999998, "grad_norm": 5.96569299697876, "learning_rate": 1.585858585858586e-05, "loss": 2.8061, "mean_token_accuracy": 0.8568727970123291, "step": 343 }, { "epoch": 2.752, "grad_norm": 6.756287574768066, "learning_rate": 1.5757575757575756e-05, "loss": 2.792, "mean_token_accuracy": 0.8376488238573074, "step": 344 }, { "epoch": 2.76, "grad_norm": 6.800328254699707, "learning_rate": 1.565656565656566e-05, "loss": 3.3131, "mean_token_accuracy": 0.8389277309179306, "step": 345 }, { "epoch": 2.768, "grad_norm": 5.755626201629639, "learning_rate": 1.5555555555555555e-05, "loss": 2.9014, "mean_token_accuracy": 0.8483488708734512, "step": 346 }, { "epoch": 2.776, "grad_norm": 5.480167865753174, "learning_rate": 1.5454545454545454e-05, "loss": 2.8085, "mean_token_accuracy": 0.8566093593835831, "step": 347 }, { "epoch": 2.784, "grad_norm": 6.205705642700195, "learning_rate": 1.5353535353535354e-05, "loss": 3.0061, "mean_token_accuracy": 0.8438935428857803, "step": 348 }, { "epoch": 2.792, "grad_norm": 6.319244384765625, "learning_rate": 1.5252525252525255e-05, "loss": 2.9846, "mean_token_accuracy": 0.8455973863601685, "step": 349 }, { "epoch": 2.8, "grad_norm": 5.499710559844971, "learning_rate": 1.5151515151515153e-05, "loss": 3.0732, "mean_token_accuracy": 0.8468042612075806, "step": 350 }, { "epoch": 2.808, "grad_norm": 5.381649494171143, "learning_rate": 1.505050505050505e-05, "loss": 3.4026, "mean_token_accuracy": 0.840883806347847, "step": 351 }, { "epoch": 2.816, "grad_norm": 5.975292205810547, "learning_rate": 1.494949494949495e-05, "loss": 3.3919, "mean_token_accuracy": 0.8322191834449768, "step": 352 }, { "epoch": 2.824, "grad_norm": 6.1554646492004395, "learning_rate": 1.484848484848485e-05, "loss": 2.9314, "mean_token_accuracy": 0.8548919409513474, "step": 353 }, { "epoch": 2.832, "grad_norm": 5.832248210906982, "learning_rate": 1.4747474747474749e-05, "loss": 2.3767, "mean_token_accuracy": 0.8693199008703232, "step": 354 }, { "epoch": 2.84, "grad_norm": 5.911506175994873, "learning_rate": 1.4646464646464647e-05, "loss": 2.7502, "mean_token_accuracy": 0.8517532050609589, "step": 355 }, { "epoch": 2.848, "grad_norm": 6.765252113342285, "learning_rate": 1.4545454545454545e-05, "loss": 3.6221, "mean_token_accuracy": 0.8170515298843384, "step": 356 }, { "epoch": 2.856, "grad_norm": 6.607264518737793, "learning_rate": 1.4444444444444444e-05, "loss": 3.1797, "mean_token_accuracy": 0.8341539800167084, "step": 357 }, { "epoch": 2.864, "grad_norm": 6.446462154388428, "learning_rate": 1.4343434343434345e-05, "loss": 2.9366, "mean_token_accuracy": 0.8581055998802185, "step": 358 }, { "epoch": 2.872, "grad_norm": 6.300421714782715, "learning_rate": 1.4242424242424243e-05, "loss": 3.3633, "mean_token_accuracy": 0.8499312251806259, "step": 359 }, { "epoch": 2.88, "grad_norm": 5.600719451904297, "learning_rate": 1.4141414141414141e-05, "loss": 3.088, "mean_token_accuracy": 0.8488198220729828, "step": 360 }, { "epoch": 2.888, "grad_norm": 5.980778217315674, "learning_rate": 1.404040404040404e-05, "loss": 2.9933, "mean_token_accuracy": 0.8443039804697037, "step": 361 }, { "epoch": 2.896, "grad_norm": 6.331740856170654, "learning_rate": 1.3939393939393942e-05, "loss": 2.9548, "mean_token_accuracy": 0.8464246839284897, "step": 362 }, { "epoch": 2.904, "grad_norm": 6.095213890075684, "learning_rate": 1.383838383838384e-05, "loss": 3.4251, "mean_token_accuracy": 0.8375514298677444, "step": 363 }, { "epoch": 2.912, "grad_norm": 6.7452239990234375, "learning_rate": 1.3737373737373737e-05, "loss": 3.3106, "mean_token_accuracy": 0.8348345905542374, "step": 364 }, { "epoch": 2.92, "grad_norm": 5.934120178222656, "learning_rate": 1.3636363636363637e-05, "loss": 3.1045, "mean_token_accuracy": 0.8491770774126053, "step": 365 }, { "epoch": 2.928, "grad_norm": 5.6627912521362305, "learning_rate": 1.3535353535353538e-05, "loss": 2.9255, "mean_token_accuracy": 0.8593737334012985, "step": 366 }, { "epoch": 2.936, "grad_norm": 6.51497745513916, "learning_rate": 1.3434343434343436e-05, "loss": 3.2813, "mean_token_accuracy": 0.8357561677694321, "step": 367 }, { "epoch": 2.944, "grad_norm": 6.286316394805908, "learning_rate": 1.3333333333333333e-05, "loss": 2.8042, "mean_token_accuracy": 0.8635920882225037, "step": 368 }, { "epoch": 2.952, "grad_norm": 6.30144739151001, "learning_rate": 1.3232323232323233e-05, "loss": 2.7913, "mean_token_accuracy": 0.8532185405492783, "step": 369 }, { "epoch": 2.96, "grad_norm": 5.6404314041137695, "learning_rate": 1.3131313131313134e-05, "loss": 2.8048, "mean_token_accuracy": 0.8557418137788773, "step": 370 }, { "epoch": 2.968, "grad_norm": 6.2682342529296875, "learning_rate": 1.3030303030303032e-05, "loss": 3.3507, "mean_token_accuracy": 0.8299184590578079, "step": 371 }, { "epoch": 2.976, "grad_norm": 5.7995219230651855, "learning_rate": 1.292929292929293e-05, "loss": 2.9806, "mean_token_accuracy": 0.8536931127309799, "step": 372 }, { "epoch": 2.984, "grad_norm": 6.288918972015381, "learning_rate": 1.2828282828282829e-05, "loss": 3.1404, "mean_token_accuracy": 0.8438131213188171, "step": 373 }, { "epoch": 2.992, "grad_norm": 6.12652587890625, "learning_rate": 1.2727272727272727e-05, "loss": 3.1072, "mean_token_accuracy": 0.8395776003599167, "step": 374 }, { "epoch": 3.0, "grad_norm": 6.006952285766602, "learning_rate": 1.2626262626262628e-05, "loss": 3.0308, "mean_token_accuracy": 0.8461359292268753, "step": 375 }, { "epoch": 3.008, "grad_norm": 5.877007961273193, "learning_rate": 1.2525252525252526e-05, "loss": 2.7955, "mean_token_accuracy": 0.8688687086105347, "step": 376 }, { "epoch": 3.016, "grad_norm": 5.722151756286621, "learning_rate": 1.2424242424242424e-05, "loss": 2.4909, "mean_token_accuracy": 0.8692611008882523, "step": 377 }, { "epoch": 3.024, "grad_norm": 5.512819290161133, "learning_rate": 1.2323232323232325e-05, "loss": 2.5139, "mean_token_accuracy": 0.8634871691465378, "step": 378 }, { "epoch": 3.032, "grad_norm": 5.516348838806152, "learning_rate": 1.2222222222222222e-05, "loss": 2.8559, "mean_token_accuracy": 0.8596174418926239, "step": 379 }, { "epoch": 3.04, "grad_norm": 5.508405685424805, "learning_rate": 1.2121212121212122e-05, "loss": 2.5545, "mean_token_accuracy": 0.8677777200937271, "step": 380 }, { "epoch": 3.048, "grad_norm": 6.354642868041992, "learning_rate": 1.202020202020202e-05, "loss": 3.0228, "mean_token_accuracy": 0.8556893318891525, "step": 381 }, { "epoch": 3.056, "grad_norm": 6.423473834991455, "learning_rate": 1.1919191919191921e-05, "loss": 3.072, "mean_token_accuracy": 0.8429125100374222, "step": 382 }, { "epoch": 3.064, "grad_norm": 5.950673580169678, "learning_rate": 1.1818181818181819e-05, "loss": 3.4449, "mean_token_accuracy": 0.8237967044115067, "step": 383 }, { "epoch": 3.072, "grad_norm": 6.789544582366943, "learning_rate": 1.1717171717171718e-05, "loss": 2.8032, "mean_token_accuracy": 0.8490213006734848, "step": 384 }, { "epoch": 3.08, "grad_norm": 5.8454413414001465, "learning_rate": 1.1616161616161616e-05, "loss": 2.7207, "mean_token_accuracy": 0.8605297058820724, "step": 385 }, { "epoch": 3.088, "grad_norm": 6.446214199066162, "learning_rate": 1.1515151515151517e-05, "loss": 3.2414, "mean_token_accuracy": 0.8328104317188263, "step": 386 }, { "epoch": 3.096, "grad_norm": 5.578056335449219, "learning_rate": 1.1414141414141415e-05, "loss": 2.8079, "mean_token_accuracy": 0.8580958545207977, "step": 387 }, { "epoch": 3.104, "grad_norm": 5.806094169616699, "learning_rate": 1.1313131313131314e-05, "loss": 2.6591, "mean_token_accuracy": 0.8651483207941055, "step": 388 }, { "epoch": 3.112, "grad_norm": 4.987120151519775, "learning_rate": 1.1212121212121212e-05, "loss": 2.2249, "mean_token_accuracy": 0.8830093741416931, "step": 389 }, { "epoch": 3.12, "grad_norm": 6.069543361663818, "learning_rate": 1.1111111111111112e-05, "loss": 2.8345, "mean_token_accuracy": 0.8573679178953171, "step": 390 }, { "epoch": 3.128, "grad_norm": 6.578948974609375, "learning_rate": 1.1010101010101011e-05, "loss": 3.4663, "mean_token_accuracy": 0.8265634626150131, "step": 391 }, { "epoch": 3.136, "grad_norm": 6.3034772872924805, "learning_rate": 1.0909090909090909e-05, "loss": 2.5849, "mean_token_accuracy": 0.8636100441217422, "step": 392 }, { "epoch": 3.144, "grad_norm": 5.5531206130981445, "learning_rate": 1.0808080808080808e-05, "loss": 2.8643, "mean_token_accuracy": 0.8591476529836655, "step": 393 }, { "epoch": 3.152, "grad_norm": 5.342994689941406, "learning_rate": 1.0707070707070708e-05, "loss": 2.4832, "mean_token_accuracy": 0.868215799331665, "step": 394 }, { "epoch": 3.16, "grad_norm": 6.020049095153809, "learning_rate": 1.0606060606060607e-05, "loss": 2.7882, "mean_token_accuracy": 0.860662654042244, "step": 395 }, { "epoch": 3.168, "grad_norm": 6.296903610229492, "learning_rate": 1.0505050505050505e-05, "loss": 2.6621, "mean_token_accuracy": 0.8558385968208313, "step": 396 }, { "epoch": 3.176, "grad_norm": 5.6435980796813965, "learning_rate": 1.0404040404040405e-05, "loss": 2.5996, "mean_token_accuracy": 0.8638162761926651, "step": 397 }, { "epoch": 3.184, "grad_norm": 5.6095123291015625, "learning_rate": 1.0303030303030304e-05, "loss": 2.7824, "mean_token_accuracy": 0.8573920726776123, "step": 398 }, { "epoch": 3.192, "grad_norm": 5.77184534072876, "learning_rate": 1.0202020202020204e-05, "loss": 3.2982, "mean_token_accuracy": 0.8455280959606171, "step": 399 }, { "epoch": 3.2, "grad_norm": 6.490416049957275, "learning_rate": 1.0101010101010101e-05, "loss": 2.8005, "mean_token_accuracy": 0.8535754829645157, "step": 400 }, { "epoch": 3.208, "grad_norm": 5.768113613128662, "learning_rate": 1e-05, "loss": 2.5534, "mean_token_accuracy": 0.8649332523345947, "step": 401 }, { "epoch": 3.216, "grad_norm": 6.118169784545898, "learning_rate": 9.898989898989899e-06, "loss": 2.8782, "mean_token_accuracy": 0.8546314835548401, "step": 402 }, { "epoch": 3.224, "grad_norm": 5.9188456535339355, "learning_rate": 9.7979797979798e-06, "loss": 2.8495, "mean_token_accuracy": 0.8487619459629059, "step": 403 }, { "epoch": 3.232, "grad_norm": 6.8953986167907715, "learning_rate": 9.696969696969698e-06, "loss": 3.2653, "mean_token_accuracy": 0.842998206615448, "step": 404 }, { "epoch": 3.24, "grad_norm": 6.311713218688965, "learning_rate": 9.595959595959595e-06, "loss": 3.0002, "mean_token_accuracy": 0.8414562940597534, "step": 405 }, { "epoch": 3.248, "grad_norm": 6.360631465911865, "learning_rate": 9.494949494949495e-06, "loss": 2.5226, "mean_token_accuracy": 0.8662195056676865, "step": 406 }, { "epoch": 3.2560000000000002, "grad_norm": 6.032021999359131, "learning_rate": 9.393939393939394e-06, "loss": 2.4289, "mean_token_accuracy": 0.8762623816728592, "step": 407 }, { "epoch": 3.2640000000000002, "grad_norm": 6.2928948402404785, "learning_rate": 9.292929292929294e-06, "loss": 2.4797, "mean_token_accuracy": 0.8561984449625015, "step": 408 }, { "epoch": 3.2720000000000002, "grad_norm": 5.483267784118652, "learning_rate": 9.191919191919192e-06, "loss": 2.8567, "mean_token_accuracy": 0.8590980023145676, "step": 409 }, { "epoch": 3.2800000000000002, "grad_norm": 6.180997848510742, "learning_rate": 9.090909090909091e-06, "loss": 2.6063, "mean_token_accuracy": 0.8648645430803299, "step": 410 }, { "epoch": 3.288, "grad_norm": 6.12001371383667, "learning_rate": 8.98989898989899e-06, "loss": 2.6823, "mean_token_accuracy": 0.8513016700744629, "step": 411 }, { "epoch": 3.296, "grad_norm": 5.99603796005249, "learning_rate": 8.88888888888889e-06, "loss": 3.0047, "mean_token_accuracy": 0.8510095775127411, "step": 412 }, { "epoch": 3.304, "grad_norm": 5.749976634979248, "learning_rate": 8.787878787878788e-06, "loss": 2.9216, "mean_token_accuracy": 0.8540640473365784, "step": 413 }, { "epoch": 3.312, "grad_norm": 6.270049095153809, "learning_rate": 8.686868686868687e-06, "loss": 3.0965, "mean_token_accuracy": 0.8414539396762848, "step": 414 }, { "epoch": 3.32, "grad_norm": 5.532293319702148, "learning_rate": 8.585858585858587e-06, "loss": 3.0462, "mean_token_accuracy": 0.8508045822381973, "step": 415 }, { "epoch": 3.328, "grad_norm": 6.463992118835449, "learning_rate": 8.484848484848486e-06, "loss": 2.7394, "mean_token_accuracy": 0.85701984167099, "step": 416 }, { "epoch": 3.336, "grad_norm": 5.6121134757995605, "learning_rate": 8.383838383838384e-06, "loss": 2.7231, "mean_token_accuracy": 0.8531108349561691, "step": 417 }, { "epoch": 3.344, "grad_norm": 6.132601737976074, "learning_rate": 8.282828282828283e-06, "loss": 3.1036, "mean_token_accuracy": 0.8383132815361023, "step": 418 }, { "epoch": 3.352, "grad_norm": 6.347646713256836, "learning_rate": 8.181818181818183e-06, "loss": 2.5049, "mean_token_accuracy": 0.8660729080438614, "step": 419 }, { "epoch": 3.36, "grad_norm": 5.851092338562012, "learning_rate": 8.080808080808082e-06, "loss": 2.5479, "mean_token_accuracy": 0.8598389625549316, "step": 420 }, { "epoch": 3.368, "grad_norm": 5.76607084274292, "learning_rate": 7.97979797979798e-06, "loss": 3.1152, "mean_token_accuracy": 0.8485762178897858, "step": 421 }, { "epoch": 3.376, "grad_norm": 6.1592607498168945, "learning_rate": 7.878787878787878e-06, "loss": 2.514, "mean_token_accuracy": 0.8623416870832443, "step": 422 }, { "epoch": 3.384, "grad_norm": 6.193600654602051, "learning_rate": 7.777777777777777e-06, "loss": 2.9589, "mean_token_accuracy": 0.8406593501567841, "step": 423 }, { "epoch": 3.392, "grad_norm": 5.8620100021362305, "learning_rate": 7.676767676767677e-06, "loss": 2.6917, "mean_token_accuracy": 0.8631896525621414, "step": 424 }, { "epoch": 3.4, "grad_norm": 5.631361961364746, "learning_rate": 7.5757575757575764e-06, "loss": 2.6014, "mean_token_accuracy": 0.8725499212741852, "step": 425 }, { "epoch": 3.408, "grad_norm": 6.087384223937988, "learning_rate": 7.474747474747475e-06, "loss": 2.5906, "mean_token_accuracy": 0.8593233972787857, "step": 426 }, { "epoch": 3.416, "grad_norm": 6.2712016105651855, "learning_rate": 7.3737373737373745e-06, "loss": 2.7093, "mean_token_accuracy": 0.8598934262990952, "step": 427 }, { "epoch": 3.424, "grad_norm": 5.651885986328125, "learning_rate": 7.272727272727272e-06, "loss": 2.6062, "mean_token_accuracy": 0.8617094457149506, "step": 428 }, { "epoch": 3.432, "grad_norm": 5.506243705749512, "learning_rate": 7.171717171717173e-06, "loss": 2.5589, "mean_token_accuracy": 0.8667820692062378, "step": 429 }, { "epoch": 3.44, "grad_norm": 6.425995349884033, "learning_rate": 7.0707070707070704e-06, "loss": 3.1032, "mean_token_accuracy": 0.8365529775619507, "step": 430 }, { "epoch": 3.448, "grad_norm": 5.543388366699219, "learning_rate": 6.969696969696971e-06, "loss": 2.6756, "mean_token_accuracy": 0.8701014816761017, "step": 431 }, { "epoch": 3.456, "grad_norm": 6.339065074920654, "learning_rate": 6.8686868686868685e-06, "loss": 2.5654, "mean_token_accuracy": 0.8716783672571182, "step": 432 }, { "epoch": 3.464, "grad_norm": 6.019214153289795, "learning_rate": 6.767676767676769e-06, "loss": 3.3469, "mean_token_accuracy": 0.8329573720693588, "step": 433 }, { "epoch": 3.472, "grad_norm": 6.126465797424316, "learning_rate": 6.666666666666667e-06, "loss": 2.5695, "mean_token_accuracy": 0.8654420524835587, "step": 434 }, { "epoch": 3.48, "grad_norm": 5.879609107971191, "learning_rate": 6.565656565656567e-06, "loss": 2.5542, "mean_token_accuracy": 0.861924409866333, "step": 435 }, { "epoch": 3.488, "grad_norm": 6.507324695587158, "learning_rate": 6.464646464646465e-06, "loss": 2.8441, "mean_token_accuracy": 0.84620201587677, "step": 436 }, { "epoch": 3.496, "grad_norm": 6.121249675750732, "learning_rate": 6.363636363636363e-06, "loss": 2.2379, "mean_token_accuracy": 0.8786091357469559, "step": 437 }, { "epoch": 3.504, "grad_norm": 5.940850734710693, "learning_rate": 6.262626262626263e-06, "loss": 2.5193, "mean_token_accuracy": 0.8697729557752609, "step": 438 }, { "epoch": 3.512, "grad_norm": 6.618006229400635, "learning_rate": 6.161616161616162e-06, "loss": 3.0884, "mean_token_accuracy": 0.8440293669700623, "step": 439 }, { "epoch": 3.52, "grad_norm": 6.300545692443848, "learning_rate": 6.060606060606061e-06, "loss": 2.9795, "mean_token_accuracy": 0.8421255350112915, "step": 440 }, { "epoch": 3.528, "grad_norm": 6.084777355194092, "learning_rate": 5.9595959595959605e-06, "loss": 2.8443, "mean_token_accuracy": 0.8482642769813538, "step": 441 }, { "epoch": 3.536, "grad_norm": 6.6309685707092285, "learning_rate": 5.858585858585859e-06, "loss": 2.7261, "mean_token_accuracy": 0.8591820150613785, "step": 442 }, { "epoch": 3.544, "grad_norm": 5.60848331451416, "learning_rate": 5.7575757575757586e-06, "loss": 2.4302, "mean_token_accuracy": 0.8700147867202759, "step": 443 }, { "epoch": 3.552, "grad_norm": 6.067654132843018, "learning_rate": 5.656565656565657e-06, "loss": 2.5163, "mean_token_accuracy": 0.8643026798963547, "step": 444 }, { "epoch": 3.56, "grad_norm": 5.975160121917725, "learning_rate": 5.555555555555556e-06, "loss": 2.8837, "mean_token_accuracy": 0.8568368703126907, "step": 445 }, { "epoch": 3.568, "grad_norm": 6.040992736816406, "learning_rate": 5.4545454545454545e-06, "loss": 2.3984, "mean_token_accuracy": 0.867580771446228, "step": 446 }, { "epoch": 3.576, "grad_norm": 5.610777378082275, "learning_rate": 5.353535353535354e-06, "loss": 2.5171, "mean_token_accuracy": 0.8687434643507004, "step": 447 }, { "epoch": 3.584, "grad_norm": 6.74979829788208, "learning_rate": 5.2525252525252526e-06, "loss": 2.5815, "mean_token_accuracy": 0.866498276591301, "step": 448 }, { "epoch": 3.592, "grad_norm": 5.904205322265625, "learning_rate": 5.151515151515152e-06, "loss": 2.7649, "mean_token_accuracy": 0.8657421469688416, "step": 449 }, { "epoch": 3.6, "grad_norm": 5.820155620574951, "learning_rate": 5.050505050505051e-06, "loss": 2.7525, "mean_token_accuracy": 0.8619928807020187, "step": 450 }, { "epoch": 3.608, "grad_norm": 6.589330196380615, "learning_rate": 4.949494949494949e-06, "loss": 2.287, "mean_token_accuracy": 0.8719237148761749, "step": 451 }, { "epoch": 3.616, "grad_norm": 6.643290996551514, "learning_rate": 4.848484848484849e-06, "loss": 3.4465, "mean_token_accuracy": 0.841964989900589, "step": 452 }, { "epoch": 3.624, "grad_norm": 6.711119174957275, "learning_rate": 4.747474747474747e-06, "loss": 2.8413, "mean_token_accuracy": 0.856516107916832, "step": 453 }, { "epoch": 3.632, "grad_norm": 5.4684343338012695, "learning_rate": 4.646464646464647e-06, "loss": 2.4144, "mean_token_accuracy": 0.8840565979480743, "step": 454 }, { "epoch": 3.64, "grad_norm": 5.740659236907959, "learning_rate": 4.5454545454545455e-06, "loss": 2.6323, "mean_token_accuracy": 0.8655981123447418, "step": 455 }, { "epoch": 3.648, "grad_norm": 6.271495819091797, "learning_rate": 4.444444444444445e-06, "loss": 3.0839, "mean_token_accuracy": 0.8419709354639053, "step": 456 }, { "epoch": 3.656, "grad_norm": 5.493051052093506, "learning_rate": 4.343434343434344e-06, "loss": 2.1636, "mean_token_accuracy": 0.8905516117811203, "step": 457 }, { "epoch": 3.664, "grad_norm": 6.861065864562988, "learning_rate": 4.242424242424243e-06, "loss": 2.9378, "mean_token_accuracy": 0.8521191477775574, "step": 458 }, { "epoch": 3.672, "grad_norm": 6.059913158416748, "learning_rate": 4.141414141414142e-06, "loss": 2.7574, "mean_token_accuracy": 0.8573954999446869, "step": 459 }, { "epoch": 3.68, "grad_norm": 6.36575174331665, "learning_rate": 4.040404040404041e-06, "loss": 2.8077, "mean_token_accuracy": 0.8544747680425644, "step": 460 }, { "epoch": 3.6879999999999997, "grad_norm": 6.40785026550293, "learning_rate": 3.939393939393939e-06, "loss": 2.9, "mean_token_accuracy": 0.8473234623670578, "step": 461 }, { "epoch": 3.6959999999999997, "grad_norm": 6.139939785003662, "learning_rate": 3.8383838383838385e-06, "loss": 2.4719, "mean_token_accuracy": 0.8682427853345871, "step": 462 }, { "epoch": 3.7039999999999997, "grad_norm": 6.098149299621582, "learning_rate": 3.7373737373737375e-06, "loss": 2.7453, "mean_token_accuracy": 0.8645330965518951, "step": 463 }, { "epoch": 3.7119999999999997, "grad_norm": 6.3011884689331055, "learning_rate": 3.636363636363636e-06, "loss": 2.7872, "mean_token_accuracy": 0.8514862060546875, "step": 464 }, { "epoch": 3.7199999999999998, "grad_norm": 6.186530590057373, "learning_rate": 3.5353535353535352e-06, "loss": 3.1428, "mean_token_accuracy": 0.8448829352855682, "step": 465 }, { "epoch": 3.7279999999999998, "grad_norm": 7.000403881072998, "learning_rate": 3.4343434343434343e-06, "loss": 3.5395, "mean_token_accuracy": 0.8234449177980423, "step": 466 }, { "epoch": 3.7359999999999998, "grad_norm": 6.087681770324707, "learning_rate": 3.3333333333333333e-06, "loss": 3.0334, "mean_token_accuracy": 0.8479331731796265, "step": 467 }, { "epoch": 3.7439999999999998, "grad_norm": 5.989120960235596, "learning_rate": 3.2323232323232324e-06, "loss": 2.7067, "mean_token_accuracy": 0.866983637213707, "step": 468 }, { "epoch": 3.752, "grad_norm": 6.254819869995117, "learning_rate": 3.1313131313131314e-06, "loss": 2.3856, "mean_token_accuracy": 0.8707826137542725, "step": 469 }, { "epoch": 3.76, "grad_norm": 6.099485874176025, "learning_rate": 3.0303030303030305e-06, "loss": 2.8972, "mean_token_accuracy": 0.8589896708726883, "step": 470 }, { "epoch": 3.768, "grad_norm": 6.424018383026123, "learning_rate": 2.9292929292929295e-06, "loss": 2.7964, "mean_token_accuracy": 0.8504993915557861, "step": 471 }, { "epoch": 3.776, "grad_norm": 5.645679950714111, "learning_rate": 2.8282828282828286e-06, "loss": 2.4383, "mean_token_accuracy": 0.8685364574193954, "step": 472 }, { "epoch": 3.784, "grad_norm": 6.7141523361206055, "learning_rate": 2.7272727272727272e-06, "loss": 2.8149, "mean_token_accuracy": 0.8539319187402725, "step": 473 }, { "epoch": 3.792, "grad_norm": 6.887962818145752, "learning_rate": 2.6262626262626263e-06, "loss": 2.6289, "mean_token_accuracy": 0.8702027946710587, "step": 474 }, { "epoch": 3.8, "grad_norm": 6.338537216186523, "learning_rate": 2.5252525252525253e-06, "loss": 3.0126, "mean_token_accuracy": 0.8451626300811768, "step": 475 }, { "epoch": 3.808, "grad_norm": 6.215012550354004, "learning_rate": 2.4242424242424244e-06, "loss": 3.0838, "mean_token_accuracy": 0.853987067937851, "step": 476 }, { "epoch": 3.816, "grad_norm": 6.2042999267578125, "learning_rate": 2.3232323232323234e-06, "loss": 2.962, "mean_token_accuracy": 0.8471638411283493, "step": 477 }, { "epoch": 3.824, "grad_norm": 5.406651020050049, "learning_rate": 2.2222222222222225e-06, "loss": 2.4559, "mean_token_accuracy": 0.872399777173996, "step": 478 }, { "epoch": 3.832, "grad_norm": 6.384584426879883, "learning_rate": 2.1212121212121216e-06, "loss": 2.7063, "mean_token_accuracy": 0.8628278374671936, "step": 479 }, { "epoch": 3.84, "grad_norm": 6.349820137023926, "learning_rate": 2.0202020202020206e-06, "loss": 2.3293, "mean_token_accuracy": 0.8819480836391449, "step": 480 }, { "epoch": 3.848, "grad_norm": 5.694690227508545, "learning_rate": 1.9191919191919192e-06, "loss": 2.554, "mean_token_accuracy": 0.8645013719797134, "step": 481 }, { "epoch": 3.856, "grad_norm": 6.480876445770264, "learning_rate": 1.818181818181818e-06, "loss": 3.0091, "mean_token_accuracy": 0.8551051169633865, "step": 482 }, { "epoch": 3.864, "grad_norm": 6.149241924285889, "learning_rate": 1.7171717171717171e-06, "loss": 3.0215, "mean_token_accuracy": 0.8434359133243561, "step": 483 }, { "epoch": 3.872, "grad_norm": 5.84627103805542, "learning_rate": 1.6161616161616162e-06, "loss": 2.7796, "mean_token_accuracy": 0.8535165041685104, "step": 484 }, { "epoch": 3.88, "grad_norm": 6.468245506286621, "learning_rate": 1.5151515151515152e-06, "loss": 2.8329, "mean_token_accuracy": 0.8529188930988312, "step": 485 }, { "epoch": 3.888, "grad_norm": 6.755955219268799, "learning_rate": 1.4141414141414143e-06, "loss": 3.3661, "mean_token_accuracy": 0.831203356385231, "step": 486 }, { "epoch": 3.896, "grad_norm": 6.618368148803711, "learning_rate": 1.3131313131313131e-06, "loss": 2.9949, "mean_token_accuracy": 0.8433407545089722, "step": 487 }, { "epoch": 3.904, "grad_norm": 6.448158264160156, "learning_rate": 1.2121212121212122e-06, "loss": 2.6009, "mean_token_accuracy": 0.8493270874023438, "step": 488 }, { "epoch": 3.912, "grad_norm": 6.494684219360352, "learning_rate": 1.1111111111111112e-06, "loss": 2.8263, "mean_token_accuracy": 0.847339928150177, "step": 489 }, { "epoch": 3.92, "grad_norm": 6.943580150604248, "learning_rate": 1.0101010101010103e-06, "loss": 3.0355, "mean_token_accuracy": 0.8504604995250702, "step": 490 }, { "epoch": 3.928, "grad_norm": 5.839914798736572, "learning_rate": 9.09090909090909e-07, "loss": 2.6543, "mean_token_accuracy": 0.8537915647029877, "step": 491 }, { "epoch": 3.936, "grad_norm": 5.800342082977295, "learning_rate": 8.080808080808081e-07, "loss": 2.4919, "mean_token_accuracy": 0.8684473484754562, "step": 492 }, { "epoch": 3.944, "grad_norm": 6.721318244934082, "learning_rate": 7.070707070707071e-07, "loss": 2.7316, "mean_token_accuracy": 0.8631936460733414, "step": 493 }, { "epoch": 3.952, "grad_norm": 5.557168960571289, "learning_rate": 6.060606060606061e-07, "loss": 2.7026, "mean_token_accuracy": 0.8652904033660889, "step": 494 }, { "epoch": 3.96, "grad_norm": 6.060300350189209, "learning_rate": 5.050505050505052e-07, "loss": 2.8709, "mean_token_accuracy": 0.854452446103096, "step": 495 }, { "epoch": 3.968, "grad_norm": 5.997809886932373, "learning_rate": 4.0404040404040405e-07, "loss": 2.5419, "mean_token_accuracy": 0.8659024238586426, "step": 496 }, { "epoch": 3.976, "grad_norm": 5.681960105895996, "learning_rate": 3.0303030303030305e-07, "loss": 2.7244, "mean_token_accuracy": 0.8663401901721954, "step": 497 }, { "epoch": 3.984, "grad_norm": 8.17302417755127, "learning_rate": 2.0202020202020202e-07, "loss": 3.1581, "mean_token_accuracy": 0.8333937376737595, "step": 498 }, { "epoch": 3.992, "grad_norm": 6.531320095062256, "learning_rate": 1.0101010101010101e-07, "loss": 3.0756, "mean_token_accuracy": 0.8531923592090607, "step": 499 }, { "epoch": 4.0, "grad_norm": 6.187903881072998, "learning_rate": 0.0, "loss": 2.5067, "mean_token_accuracy": 0.8733052164316177, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2610056134656000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }