{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 34.44804382324219, "learning_rate": 1e-05, "loss": 13.0101, "mean_token_accuracy": 0.4696590006351471, "step": 1 }, { "epoch": 0.016, "grad_norm": 30.779788970947266, "learning_rate": 2e-05, "loss": 12.3851, "mean_token_accuracy": 0.47303473204374313, "step": 2 }, { "epoch": 0.024, "grad_norm": 29.67559242248535, "learning_rate": 3e-05, "loss": 12.3488, "mean_token_accuracy": 0.49709559231996536, "step": 3 }, { "epoch": 0.032, "grad_norm": 26.862010955810547, "learning_rate": 4e-05, "loss": 11.6596, "mean_token_accuracy": 0.5584611147642136, "step": 4 }, { "epoch": 0.04, "grad_norm": 22.10072135925293, "learning_rate": 5e-05, "loss": 10.1384, "mean_token_accuracy": 0.5924926251173019, "step": 5 }, { "epoch": 0.048, "grad_norm": 20.171361923217773, "learning_rate": 4.98989898989899e-05, "loss": 9.5421, "mean_token_accuracy": 0.5888276249170303, "step": 6 }, { "epoch": 0.056, "grad_norm": 16.452842712402344, "learning_rate": 4.97979797979798e-05, "loss": 8.4344, "mean_token_accuracy": 0.632336363196373, "step": 7 }, { "epoch": 0.064, "grad_norm": 12.62137222290039, "learning_rate": 4.9696969696969694e-05, "loss": 7.7758, "mean_token_accuracy": 0.6633019298315048, "step": 8 }, { "epoch": 0.072, "grad_norm": 11.533007621765137, "learning_rate": 4.9595959595959594e-05, "loss": 7.5523, "mean_token_accuracy": 0.6721473336219788, "step": 9 }, { "epoch": 0.08, "grad_norm": 9.251978874206543, "learning_rate": 4.94949494949495e-05, "loss": 7.5504, "mean_token_accuracy": 0.6924590468406677, "step": 10 }, { "epoch": 0.088, "grad_norm": 9.099457740783691, "learning_rate": 4.93939393939394e-05, "loss": 7.1883, "mean_token_accuracy": 0.694612979888916, "step": 11 }, { "epoch": 0.096, "grad_norm": 9.096614837646484, "learning_rate": 4.92929292929293e-05, "loss": 6.7714, "mean_token_accuracy": 0.7158520817756653, "step": 12 }, { "epoch": 0.104, "grad_norm": 8.9071044921875, "learning_rate": 4.919191919191919e-05, "loss": 6.7582, "mean_token_accuracy": 0.7046481072902679, "step": 13 }, { "epoch": 0.112, "grad_norm": 8.336892127990723, "learning_rate": 4.909090909090909e-05, "loss": 6.639, "mean_token_accuracy": 0.6894638538360596, "step": 14 }, { "epoch": 0.12, "grad_norm": 7.760809421539307, "learning_rate": 4.898989898989899e-05, "loss": 6.1283, "mean_token_accuracy": 0.7323453426361084, "step": 15 }, { "epoch": 0.128, "grad_norm": 10.2608003616333, "learning_rate": 4.888888888888889e-05, "loss": 6.3731, "mean_token_accuracy": 0.7066609710454941, "step": 16 }, { "epoch": 0.136, "grad_norm": 9.326302528381348, "learning_rate": 4.878787878787879e-05, "loss": 6.0298, "mean_token_accuracy": 0.7226346284151077, "step": 17 }, { "epoch": 0.144, "grad_norm": 9.35575008392334, "learning_rate": 4.868686868686869e-05, "loss": 6.1363, "mean_token_accuracy": 0.7228655517101288, "step": 18 }, { "epoch": 0.152, "grad_norm": 7.2700018882751465, "learning_rate": 4.858585858585859e-05, "loss": 6.3529, "mean_token_accuracy": 0.7191445082426071, "step": 19 }, { "epoch": 0.16, "grad_norm": 7.965806484222412, "learning_rate": 4.848484848484849e-05, "loss": 6.1908, "mean_token_accuracy": 0.7225745767354965, "step": 20 }, { "epoch": 0.168, "grad_norm": 9.064255714416504, "learning_rate": 4.838383838383839e-05, "loss": 6.0033, "mean_token_accuracy": 0.7359140962362289, "step": 21 }, { "epoch": 0.176, "grad_norm": 6.869060039520264, "learning_rate": 4.828282828282829e-05, "loss": 5.1424, "mean_token_accuracy": 0.7650310397148132, "step": 22 }, { "epoch": 0.184, "grad_norm": 7.401363372802734, "learning_rate": 4.8181818181818186e-05, "loss": 6.2128, "mean_token_accuracy": 0.7163093239068985, "step": 23 }, { "epoch": 0.192, "grad_norm": 7.535747051239014, "learning_rate": 4.808080808080808e-05, "loss": 5.561, "mean_token_accuracy": 0.7451221346855164, "step": 24 }, { "epoch": 0.2, "grad_norm": 8.211135864257812, "learning_rate": 4.797979797979798e-05, "loss": 5.265, "mean_token_accuracy": 0.7609298378229141, "step": 25 }, { "epoch": 0.208, "grad_norm": 6.720511436462402, "learning_rate": 4.787878787878788e-05, "loss": 5.3244, "mean_token_accuracy": 0.7525999993085861, "step": 26 }, { "epoch": 0.216, "grad_norm": 9.114042282104492, "learning_rate": 4.7777777777777784e-05, "loss": 5.6173, "mean_token_accuracy": 0.7520534992218018, "step": 27 }, { "epoch": 0.224, "grad_norm": 7.770754337310791, "learning_rate": 4.7676767676767684e-05, "loss": 5.5093, "mean_token_accuracy": 0.7558625787496567, "step": 28 }, { "epoch": 0.232, "grad_norm": 7.749612808227539, "learning_rate": 4.7575757575757576e-05, "loss": 5.5386, "mean_token_accuracy": 0.7553819268941879, "step": 29 }, { "epoch": 0.24, "grad_norm": 7.486739635467529, "learning_rate": 4.7474747474747476e-05, "loss": 5.73, "mean_token_accuracy": 0.732807844877243, "step": 30 }, { "epoch": 0.248, "grad_norm": 7.474062919616699, "learning_rate": 4.7373737373737375e-05, "loss": 6.0216, "mean_token_accuracy": 0.742279127240181, "step": 31 }, { "epoch": 0.256, "grad_norm": 7.018927574157715, "learning_rate": 4.7272727272727275e-05, "loss": 5.0907, "mean_token_accuracy": 0.7746435850858688, "step": 32 }, { "epoch": 0.264, "grad_norm": 6.719446659088135, "learning_rate": 4.7171717171717174e-05, "loss": 4.9967, "mean_token_accuracy": 0.7717019468545914, "step": 33 }, { "epoch": 0.272, "grad_norm": 6.464803695678711, "learning_rate": 4.7070707070707074e-05, "loss": 5.2966, "mean_token_accuracy": 0.7656355202198029, "step": 34 }, { "epoch": 0.28, "grad_norm": 7.2371320724487305, "learning_rate": 4.696969696969697e-05, "loss": 5.6333, "mean_token_accuracy": 0.7543937265872955, "step": 35 }, { "epoch": 0.288, "grad_norm": 6.18785285949707, "learning_rate": 4.686868686868687e-05, "loss": 5.3846, "mean_token_accuracy": 0.7626298367977142, "step": 36 }, { "epoch": 0.296, "grad_norm": 5.636417865753174, "learning_rate": 4.676767676767677e-05, "loss": 4.7779, "mean_token_accuracy": 0.796937882900238, "step": 37 }, { "epoch": 0.304, "grad_norm": 6.951849460601807, "learning_rate": 4.666666666666667e-05, "loss": 5.3062, "mean_token_accuracy": 0.7514433115720749, "step": 38 }, { "epoch": 0.312, "grad_norm": 7.596467018127441, "learning_rate": 4.656565656565657e-05, "loss": 5.4916, "mean_token_accuracy": 0.738706648349762, "step": 39 }, { "epoch": 0.32, "grad_norm": 7.036402225494385, "learning_rate": 4.6464646464646464e-05, "loss": 5.4346, "mean_token_accuracy": 0.7636198997497559, "step": 40 }, { "epoch": 0.328, "grad_norm": 7.065268039703369, "learning_rate": 4.636363636363636e-05, "loss": 4.9636, "mean_token_accuracy": 0.76950503885746, "step": 41 }, { "epoch": 0.336, "grad_norm": 5.93984842300415, "learning_rate": 4.626262626262626e-05, "loss": 4.9883, "mean_token_accuracy": 0.7728704810142517, "step": 42 }, { "epoch": 0.344, "grad_norm": 6.929603576660156, "learning_rate": 4.616161616161616e-05, "loss": 4.6927, "mean_token_accuracy": 0.7756397873163223, "step": 43 }, { "epoch": 0.352, "grad_norm": 7.2813496589660645, "learning_rate": 4.606060606060607e-05, "loss": 4.761, "mean_token_accuracy": 0.784713864326477, "step": 44 }, { "epoch": 0.36, "grad_norm": 6.584592342376709, "learning_rate": 4.595959595959596e-05, "loss": 4.7331, "mean_token_accuracy": 0.787657305598259, "step": 45 }, { "epoch": 0.368, "grad_norm": 7.5701751708984375, "learning_rate": 4.585858585858586e-05, "loss": 4.1914, "mean_token_accuracy": 0.796615332365036, "step": 46 }, { "epoch": 0.376, "grad_norm": 6.217738628387451, "learning_rate": 4.575757575757576e-05, "loss": 4.6948, "mean_token_accuracy": 0.7924428284168243, "step": 47 }, { "epoch": 0.384, "grad_norm": 6.909704685211182, "learning_rate": 4.565656565656566e-05, "loss": 5.0747, "mean_token_accuracy": 0.7648352980613708, "step": 48 }, { "epoch": 0.392, "grad_norm": 7.039934158325195, "learning_rate": 4.555555555555556e-05, "loss": 5.168, "mean_token_accuracy": 0.7628057301044464, "step": 49 }, { "epoch": 0.4, "grad_norm": 7.007484436035156, "learning_rate": 4.545454545454546e-05, "loss": 5.5139, "mean_token_accuracy": 0.7507320195436478, "step": 50 }, { "epoch": 0.408, "grad_norm": 7.163206100463867, "learning_rate": 4.535353535353535e-05, "loss": 4.6769, "mean_token_accuracy": 0.789847657084465, "step": 51 }, { "epoch": 0.416, "grad_norm": 6.482788562774658, "learning_rate": 4.525252525252526e-05, "loss": 4.0523, "mean_token_accuracy": 0.8091136366128922, "step": 52 }, { "epoch": 0.424, "grad_norm": 6.3435564041137695, "learning_rate": 4.515151515151516e-05, "loss": 5.0074, "mean_token_accuracy": 0.7676153928041458, "step": 53 }, { "epoch": 0.432, "grad_norm": 6.715310096740723, "learning_rate": 4.5050505050505056e-05, "loss": 4.7588, "mean_token_accuracy": 0.7704059928655624, "step": 54 }, { "epoch": 0.44, "grad_norm": 6.287456512451172, "learning_rate": 4.494949494949495e-05, "loss": 5.0595, "mean_token_accuracy": 0.7789322882890701, "step": 55 }, { "epoch": 0.448, "grad_norm": 6.842898845672607, "learning_rate": 4.484848484848485e-05, "loss": 4.3176, "mean_token_accuracy": 0.7993331402540207, "step": 56 }, { "epoch": 0.456, "grad_norm": 5.992519378662109, "learning_rate": 4.474747474747475e-05, "loss": 5.0297, "mean_token_accuracy": 0.779092013835907, "step": 57 }, { "epoch": 0.464, "grad_norm": 6.67594051361084, "learning_rate": 4.464646464646465e-05, "loss": 4.1185, "mean_token_accuracy": 0.7971874326467514, "step": 58 }, { "epoch": 0.472, "grad_norm": 6.712916374206543, "learning_rate": 4.454545454545455e-05, "loss": 4.6345, "mean_token_accuracy": 0.7811515778303146, "step": 59 }, { "epoch": 0.48, "grad_norm": 6.204624652862549, "learning_rate": 4.4444444444444447e-05, "loss": 4.5869, "mean_token_accuracy": 0.8028187602758408, "step": 60 }, { "epoch": 0.488, "grad_norm": 7.798375129699707, "learning_rate": 4.4343434343434346e-05, "loss": 5.2535, "mean_token_accuracy": 0.7713681906461716, "step": 61 }, { "epoch": 0.496, "grad_norm": 6.647665500640869, "learning_rate": 4.4242424242424246e-05, "loss": 4.1662, "mean_token_accuracy": 0.809452161192894, "step": 62 }, { "epoch": 0.504, "grad_norm": 7.158292770385742, "learning_rate": 4.4141414141414145e-05, "loss": 4.5907, "mean_token_accuracy": 0.7843180149793625, "step": 63 }, { "epoch": 0.512, "grad_norm": 6.763492107391357, "learning_rate": 4.4040404040404044e-05, "loss": 4.6913, "mean_token_accuracy": 0.7733302861452103, "step": 64 }, { "epoch": 0.52, "grad_norm": 6.745001792907715, "learning_rate": 4.3939393939393944e-05, "loss": 4.7438, "mean_token_accuracy": 0.7817106246948242, "step": 65 }, { "epoch": 0.528, "grad_norm": 5.92717981338501, "learning_rate": 4.383838383838384e-05, "loss": 4.3055, "mean_token_accuracy": 0.7931149005889893, "step": 66 }, { "epoch": 0.536, "grad_norm": 7.291048049926758, "learning_rate": 4.3737373737373736e-05, "loss": 4.7281, "mean_token_accuracy": 0.7924042791128159, "step": 67 }, { "epoch": 0.544, "grad_norm": 6.6944899559021, "learning_rate": 4.3636363636363636e-05, "loss": 5.4566, "mean_token_accuracy": 0.7465656846761703, "step": 68 }, { "epoch": 0.552, "grad_norm": 6.43382453918457, "learning_rate": 4.3535353535353535e-05, "loss": 5.1924, "mean_token_accuracy": 0.765620231628418, "step": 69 }, { "epoch": 0.56, "grad_norm": 7.896528244018555, "learning_rate": 4.343434343434344e-05, "loss": 4.535, "mean_token_accuracy": 0.780782625079155, "step": 70 }, { "epoch": 0.568, "grad_norm": 7.0929059982299805, "learning_rate": 4.3333333333333334e-05, "loss": 4.7798, "mean_token_accuracy": 0.7812370210886002, "step": 71 }, { "epoch": 0.576, "grad_norm": 6.4265031814575195, "learning_rate": 4.3232323232323234e-05, "loss": 4.0221, "mean_token_accuracy": 0.8055804222822189, "step": 72 }, { "epoch": 0.584, "grad_norm": 6.464663982391357, "learning_rate": 4.313131313131313e-05, "loss": 5.3328, "mean_token_accuracy": 0.7507916688919067, "step": 73 }, { "epoch": 0.592, "grad_norm": 6.163589000701904, "learning_rate": 4.303030303030303e-05, "loss": 3.8647, "mean_token_accuracy": 0.8181672990322113, "step": 74 }, { "epoch": 0.6, "grad_norm": 6.442709445953369, "learning_rate": 4.292929292929293e-05, "loss": 4.1589, "mean_token_accuracy": 0.7894054055213928, "step": 75 }, { "epoch": 0.608, "grad_norm": 6.7371673583984375, "learning_rate": 4.282828282828283e-05, "loss": 4.4978, "mean_token_accuracy": 0.7749454975128174, "step": 76 }, { "epoch": 0.616, "grad_norm": 6.681241035461426, "learning_rate": 4.2727272727272724e-05, "loss": 4.3404, "mean_token_accuracy": 0.8082751482725143, "step": 77 }, { "epoch": 0.624, "grad_norm": 6.194783687591553, "learning_rate": 4.262626262626263e-05, "loss": 4.1043, "mean_token_accuracy": 0.7980503439903259, "step": 78 }, { "epoch": 0.632, "grad_norm": 6.758218288421631, "learning_rate": 4.252525252525253e-05, "loss": 4.4278, "mean_token_accuracy": 0.7885774075984955, "step": 79 }, { "epoch": 0.64, "grad_norm": 7.5866546630859375, "learning_rate": 4.242424242424243e-05, "loss": 4.227, "mean_token_accuracy": 0.7932698577642441, "step": 80 }, { "epoch": 0.648, "grad_norm": 7.192435264587402, "learning_rate": 4.232323232323233e-05, "loss": 4.0345, "mean_token_accuracy": 0.8199726790189743, "step": 81 }, { "epoch": 0.656, "grad_norm": 5.501450061798096, "learning_rate": 4.222222222222222e-05, "loss": 4.2629, "mean_token_accuracy": 0.7981265485286713, "step": 82 }, { "epoch": 0.664, "grad_norm": 6.11761474609375, "learning_rate": 4.212121212121212e-05, "loss": 4.2, "mean_token_accuracy": 0.8049240857362747, "step": 83 }, { "epoch": 0.672, "grad_norm": 6.632277011871338, "learning_rate": 4.202020202020202e-05, "loss": 4.5717, "mean_token_accuracy": 0.784144937992096, "step": 84 }, { "epoch": 0.68, "grad_norm": 5.618972301483154, "learning_rate": 4.191919191919192e-05, "loss": 4.1424, "mean_token_accuracy": 0.8174345791339874, "step": 85 }, { "epoch": 0.688, "grad_norm": 6.348287105560303, "learning_rate": 4.181818181818182e-05, "loss": 3.9289, "mean_token_accuracy": 0.8103939890861511, "step": 86 }, { "epoch": 0.696, "grad_norm": 6.326577186584473, "learning_rate": 4.171717171717172e-05, "loss": 4.4158, "mean_token_accuracy": 0.7934867739677429, "step": 87 }, { "epoch": 0.704, "grad_norm": 5.938681602478027, "learning_rate": 4.161616161616162e-05, "loss": 4.3607, "mean_token_accuracy": 0.8023031949996948, "step": 88 }, { "epoch": 0.712, "grad_norm": 6.043774604797363, "learning_rate": 4.151515151515152e-05, "loss": 4.8988, "mean_token_accuracy": 0.7769377380609512, "step": 89 }, { "epoch": 0.72, "grad_norm": 6.513429641723633, "learning_rate": 4.141414141414142e-05, "loss": 4.469, "mean_token_accuracy": 0.7987037748098373, "step": 90 }, { "epoch": 0.728, "grad_norm": 6.306874752044678, "learning_rate": 4.131313131313132e-05, "loss": 3.8623, "mean_token_accuracy": 0.8089967370033264, "step": 91 }, { "epoch": 0.736, "grad_norm": 6.43092155456543, "learning_rate": 4.1212121212121216e-05, "loss": 4.1141, "mean_token_accuracy": 0.8077200204133987, "step": 92 }, { "epoch": 0.744, "grad_norm": 5.928807735443115, "learning_rate": 4.111111111111111e-05, "loss": 4.379, "mean_token_accuracy": 0.7994053959846497, "step": 93 }, { "epoch": 0.752, "grad_norm": 6.193979740142822, "learning_rate": 4.101010101010101e-05, "loss": 4.0719, "mean_token_accuracy": 0.7951067984104156, "step": 94 }, { "epoch": 0.76, "grad_norm": 5.613356113433838, "learning_rate": 4.0909090909090915e-05, "loss": 3.9982, "mean_token_accuracy": 0.8044578582048416, "step": 95 }, { "epoch": 0.768, "grad_norm": 5.828372478485107, "learning_rate": 4.0808080808080814e-05, "loss": 3.9207, "mean_token_accuracy": 0.804252415895462, "step": 96 }, { "epoch": 0.776, "grad_norm": 6.8737993240356445, "learning_rate": 4.070707070707071e-05, "loss": 4.018, "mean_token_accuracy": 0.7971819192171097, "step": 97 }, { "epoch": 0.784, "grad_norm": 6.014230251312256, "learning_rate": 4.0606060606060606e-05, "loss": 3.9449, "mean_token_accuracy": 0.8023800402879715, "step": 98 }, { "epoch": 0.792, "grad_norm": 6.424180030822754, "learning_rate": 4.0505050505050506e-05, "loss": 4.133, "mean_token_accuracy": 0.7898548692464828, "step": 99 }, { "epoch": 0.8, "grad_norm": 5.941647529602051, "learning_rate": 4.0404040404040405e-05, "loss": 4.4152, "mean_token_accuracy": 0.7944744974374771, "step": 100 }, { "epoch": 0.808, "grad_norm": 5.532174110412598, "learning_rate": 4.0303030303030305e-05, "loss": 3.9543, "mean_token_accuracy": 0.8119575679302216, "step": 101 }, { "epoch": 0.816, "grad_norm": 6.766019344329834, "learning_rate": 4.0202020202020204e-05, "loss": 3.8357, "mean_token_accuracy": 0.8240242004394531, "step": 102 }, { "epoch": 0.824, "grad_norm": 5.829742908477783, "learning_rate": 4.01010101010101e-05, "loss": 4.1291, "mean_token_accuracy": 0.798798069357872, "step": 103 }, { "epoch": 0.832, "grad_norm": 6.3326416015625, "learning_rate": 4e-05, "loss": 4.5605, "mean_token_accuracy": 0.7854688614606857, "step": 104 }, { "epoch": 0.84, "grad_norm": 5.9824042320251465, "learning_rate": 3.98989898989899e-05, "loss": 4.0588, "mean_token_accuracy": 0.8019276112318039, "step": 105 }, { "epoch": 0.848, "grad_norm": 6.245088577270508, "learning_rate": 3.97979797979798e-05, "loss": 3.9216, "mean_token_accuracy": 0.8176022469997406, "step": 106 }, { "epoch": 0.856, "grad_norm": 6.343690395355225, "learning_rate": 3.96969696969697e-05, "loss": 4.5661, "mean_token_accuracy": 0.7900134176015854, "step": 107 }, { "epoch": 0.864, "grad_norm": 6.176941394805908, "learning_rate": 3.9595959595959594e-05, "loss": 4.3509, "mean_token_accuracy": 0.7881277054548264, "step": 108 }, { "epoch": 0.872, "grad_norm": 5.415067672729492, "learning_rate": 3.9494949494949494e-05, "loss": 3.9209, "mean_token_accuracy": 0.8084693402051926, "step": 109 }, { "epoch": 0.88, "grad_norm": 6.951507568359375, "learning_rate": 3.939393939393939e-05, "loss": 3.8214, "mean_token_accuracy": 0.8184851258993149, "step": 110 }, { "epoch": 0.888, "grad_norm": 6.3500213623046875, "learning_rate": 3.929292929292929e-05, "loss": 4.445, "mean_token_accuracy": 0.7920354455709457, "step": 111 }, { "epoch": 0.896, "grad_norm": 6.108852386474609, "learning_rate": 3.91919191919192e-05, "loss": 3.9563, "mean_token_accuracy": 0.8090476542711258, "step": 112 }, { "epoch": 0.904, "grad_norm": 5.749330043792725, "learning_rate": 3.909090909090909e-05, "loss": 3.7184, "mean_token_accuracy": 0.8214946985244751, "step": 113 }, { "epoch": 0.912, "grad_norm": 6.137975692749023, "learning_rate": 3.898989898989899e-05, "loss": 3.8174, "mean_token_accuracy": 0.8351726979017258, "step": 114 }, { "epoch": 0.92, "grad_norm": 5.827854156494141, "learning_rate": 3.888888888888889e-05, "loss": 4.1516, "mean_token_accuracy": 0.8008674532175064, "step": 115 }, { "epoch": 0.928, "grad_norm": 6.231594562530518, "learning_rate": 3.878787878787879e-05, "loss": 3.9401, "mean_token_accuracy": 0.8067153543233871, "step": 116 }, { "epoch": 0.936, "grad_norm": 6.3449578285217285, "learning_rate": 3.868686868686869e-05, "loss": 4.5777, "mean_token_accuracy": 0.7864043414592743, "step": 117 }, { "epoch": 0.944, "grad_norm": 5.758909702301025, "learning_rate": 3.858585858585859e-05, "loss": 3.9313, "mean_token_accuracy": 0.8202812224626541, "step": 118 }, { "epoch": 0.952, "grad_norm": 5.801164627075195, "learning_rate": 3.848484848484848e-05, "loss": 3.4612, "mean_token_accuracy": 0.8463838249444962, "step": 119 }, { "epoch": 0.96, "grad_norm": 6.331445217132568, "learning_rate": 3.838383838383838e-05, "loss": 4.1784, "mean_token_accuracy": 0.8084011971950531, "step": 120 }, { "epoch": 0.968, "grad_norm": 5.821787357330322, "learning_rate": 3.828282828282829e-05, "loss": 4.0625, "mean_token_accuracy": 0.8006434291601181, "step": 121 }, { "epoch": 0.976, "grad_norm": 6.507475852966309, "learning_rate": 3.818181818181819e-05, "loss": 3.7078, "mean_token_accuracy": 0.8223373293876648, "step": 122 }, { "epoch": 0.984, "grad_norm": 5.8519511222839355, "learning_rate": 3.8080808080808087e-05, "loss": 3.8093, "mean_token_accuracy": 0.8161063343286514, "step": 123 }, { "epoch": 0.992, "grad_norm": 6.515493869781494, "learning_rate": 3.797979797979798e-05, "loss": 3.8881, "mean_token_accuracy": 0.8130738884210587, "step": 124 }, { "epoch": 1.0, "grad_norm": 5.974099159240723, "learning_rate": 3.787878787878788e-05, "loss": 3.8128, "mean_token_accuracy": 0.817224845290184, "step": 125 }, { "epoch": 1.008, "grad_norm": 5.158608436584473, "learning_rate": 3.777777777777778e-05, "loss": 3.5407, "mean_token_accuracy": 0.8298159688711166, "step": 126 }, { "epoch": 1.016, "grad_norm": 5.91457462310791, "learning_rate": 3.767676767676768e-05, "loss": 3.6346, "mean_token_accuracy": 0.8155378848314285, "step": 127 }, { "epoch": 1.024, "grad_norm": 5.550519943237305, "learning_rate": 3.757575757575758e-05, "loss": 3.8566, "mean_token_accuracy": 0.8141728788614273, "step": 128 }, { "epoch": 1.032, "grad_norm": 5.610849380493164, "learning_rate": 3.747474747474748e-05, "loss": 3.785, "mean_token_accuracy": 0.811878427863121, "step": 129 }, { "epoch": 1.04, "grad_norm": 6.2908854484558105, "learning_rate": 3.7373737373737376e-05, "loss": 3.0106, "mean_token_accuracy": 0.8458054810762405, "step": 130 }, { "epoch": 1.048, "grad_norm": 5.267597675323486, "learning_rate": 3.7272727272727276e-05, "loss": 3.4967, "mean_token_accuracy": 0.8339269608259201, "step": 131 }, { "epoch": 1.056, "grad_norm": 5.37116003036499, "learning_rate": 3.7171717171717175e-05, "loss": 3.4903, "mean_token_accuracy": 0.8220857381820679, "step": 132 }, { "epoch": 1.064, "grad_norm": 5.198507785797119, "learning_rate": 3.7070707070707075e-05, "loss": 2.9143, "mean_token_accuracy": 0.8564625233411789, "step": 133 }, { "epoch": 1.072, "grad_norm": 5.638608932495117, "learning_rate": 3.6969696969696974e-05, "loss": 3.6283, "mean_token_accuracy": 0.8230338096618652, "step": 134 }, { "epoch": 1.08, "grad_norm": 5.533100605010986, "learning_rate": 3.686868686868687e-05, "loss": 3.3163, "mean_token_accuracy": 0.8364241421222687, "step": 135 }, { "epoch": 1.088, "grad_norm": 5.794349670410156, "learning_rate": 3.6767676767676766e-05, "loss": 3.4776, "mean_token_accuracy": 0.822014644742012, "step": 136 }, { "epoch": 1.096, "grad_norm": 6.230714797973633, "learning_rate": 3.6666666666666666e-05, "loss": 3.9834, "mean_token_accuracy": 0.808947280049324, "step": 137 }, { "epoch": 1.104, "grad_norm": 5.761725902557373, "learning_rate": 3.656565656565657e-05, "loss": 3.5205, "mean_token_accuracy": 0.8236398249864578, "step": 138 }, { "epoch": 1.112, "grad_norm": 5.620357990264893, "learning_rate": 3.6464646464646465e-05, "loss": 3.7957, "mean_token_accuracy": 0.8184810876846313, "step": 139 }, { "epoch": 1.12, "grad_norm": 5.732111930847168, "learning_rate": 3.6363636363636364e-05, "loss": 3.3031, "mean_token_accuracy": 0.8268382251262665, "step": 140 }, { "epoch": 1.1280000000000001, "grad_norm": 5.512340545654297, "learning_rate": 3.6262626262626264e-05, "loss": 3.4623, "mean_token_accuracy": 0.8173917531967163, "step": 141 }, { "epoch": 1.1360000000000001, "grad_norm": 5.278458118438721, "learning_rate": 3.616161616161616e-05, "loss": 3.6187, "mean_token_accuracy": 0.8212647438049316, "step": 142 }, { "epoch": 1.144, "grad_norm": 6.000738620758057, "learning_rate": 3.606060606060606e-05, "loss": 3.4098, "mean_token_accuracy": 0.8118827193975449, "step": 143 }, { "epoch": 1.152, "grad_norm": 5.230849266052246, "learning_rate": 3.595959595959596e-05, "loss": 3.3312, "mean_token_accuracy": 0.8291827887296677, "step": 144 }, { "epoch": 1.16, "grad_norm": 6.520547389984131, "learning_rate": 3.5858585858585855e-05, "loss": 2.9547, "mean_token_accuracy": 0.8410928547382355, "step": 145 }, { "epoch": 1.168, "grad_norm": 6.905513286590576, "learning_rate": 3.575757575757576e-05, "loss": 3.3843, "mean_token_accuracy": 0.8292149305343628, "step": 146 }, { "epoch": 1.176, "grad_norm": 5.25789213180542, "learning_rate": 3.565656565656566e-05, "loss": 3.0306, "mean_token_accuracy": 0.8420234769582748, "step": 147 }, { "epoch": 1.184, "grad_norm": 6.301482677459717, "learning_rate": 3.555555555555556e-05, "loss": 3.563, "mean_token_accuracy": 0.816611647605896, "step": 148 }, { "epoch": 1.192, "grad_norm": 5.18954610824585, "learning_rate": 3.545454545454546e-05, "loss": 3.3712, "mean_token_accuracy": 0.8429348319768906, "step": 149 }, { "epoch": 1.2, "grad_norm": 5.924179553985596, "learning_rate": 3.535353535353535e-05, "loss": 3.0173, "mean_token_accuracy": 0.8484609872102737, "step": 150 }, { "epoch": 1.208, "grad_norm": 6.125277996063232, "learning_rate": 3.525252525252525e-05, "loss": 4.017, "mean_token_accuracy": 0.8024349808692932, "step": 151 }, { "epoch": 1.216, "grad_norm": 6.013985633850098, "learning_rate": 3.515151515151515e-05, "loss": 3.1488, "mean_token_accuracy": 0.8332614004611969, "step": 152 }, { "epoch": 1.224, "grad_norm": 5.863841533660889, "learning_rate": 3.505050505050505e-05, "loss": 3.4748, "mean_token_accuracy": 0.8309570550918579, "step": 153 }, { "epoch": 1.232, "grad_norm": 5.565553665161133, "learning_rate": 3.494949494949495e-05, "loss": 3.4478, "mean_token_accuracy": 0.8285814672708511, "step": 154 }, { "epoch": 1.24, "grad_norm": 5.7123565673828125, "learning_rate": 3.484848484848485e-05, "loss": 3.1585, "mean_token_accuracy": 0.8400322198867798, "step": 155 }, { "epoch": 1.248, "grad_norm": 5.499763488769531, "learning_rate": 3.474747474747475e-05, "loss": 2.9475, "mean_token_accuracy": 0.8378583043813705, "step": 156 }, { "epoch": 1.256, "grad_norm": 5.909451961517334, "learning_rate": 3.464646464646465e-05, "loss": 3.0639, "mean_token_accuracy": 0.8469367325305939, "step": 157 }, { "epoch": 1.264, "grad_norm": 6.213925838470459, "learning_rate": 3.454545454545455e-05, "loss": 3.7884, "mean_token_accuracy": 0.8231126517057419, "step": 158 }, { "epoch": 1.272, "grad_norm": 5.21215295791626, "learning_rate": 3.444444444444445e-05, "loss": 3.4636, "mean_token_accuracy": 0.8168640285730362, "step": 159 }, { "epoch": 1.28, "grad_norm": 5.759207725524902, "learning_rate": 3.434343434343435e-05, "loss": 3.2428, "mean_token_accuracy": 0.8419955372810364, "step": 160 }, { "epoch": 1.288, "grad_norm": 5.337326526641846, "learning_rate": 3.424242424242424e-05, "loss": 3.5007, "mean_token_accuracy": 0.826006218791008, "step": 161 }, { "epoch": 1.296, "grad_norm": 5.429465293884277, "learning_rate": 3.414141414141414e-05, "loss": 3.1902, "mean_token_accuracy": 0.8436289280653, "step": 162 }, { "epoch": 1.304, "grad_norm": 5.5671234130859375, "learning_rate": 3.4040404040404045e-05, "loss": 3.1885, "mean_token_accuracy": 0.837678074836731, "step": 163 }, { "epoch": 1.312, "grad_norm": 5.217177391052246, "learning_rate": 3.3939393939393945e-05, "loss": 3.0344, "mean_token_accuracy": 0.8463016897439957, "step": 164 }, { "epoch": 1.32, "grad_norm": 6.01118803024292, "learning_rate": 3.3838383838383844e-05, "loss": 3.6496, "mean_token_accuracy": 0.8142215311527252, "step": 165 }, { "epoch": 1.328, "grad_norm": 5.723564147949219, "learning_rate": 3.373737373737374e-05, "loss": 3.1859, "mean_token_accuracy": 0.8320183008909225, "step": 166 }, { "epoch": 1.336, "grad_norm": 5.838883399963379, "learning_rate": 3.3636363636363636e-05, "loss": 3.4536, "mean_token_accuracy": 0.8264990895986557, "step": 167 }, { "epoch": 1.3439999999999999, "grad_norm": 5.59492301940918, "learning_rate": 3.3535353535353536e-05, "loss": 3.1706, "mean_token_accuracy": 0.8360193520784378, "step": 168 }, { "epoch": 1.3519999999999999, "grad_norm": 5.386695861816406, "learning_rate": 3.3434343434343435e-05, "loss": 2.8613, "mean_token_accuracy": 0.8595142513513565, "step": 169 }, { "epoch": 1.3599999999999999, "grad_norm": 5.8718791007995605, "learning_rate": 3.3333333333333335e-05, "loss": 3.405, "mean_token_accuracy": 0.8177822679281235, "step": 170 }, { "epoch": 1.3679999999999999, "grad_norm": 5.533784866333008, "learning_rate": 3.3232323232323234e-05, "loss": 2.7535, "mean_token_accuracy": 0.847559779882431, "step": 171 }, { "epoch": 1.376, "grad_norm": 6.221394062042236, "learning_rate": 3.3131313131313134e-05, "loss": 3.2758, "mean_token_accuracy": 0.8370038121938705, "step": 172 }, { "epoch": 1.384, "grad_norm": 5.768967628479004, "learning_rate": 3.303030303030303e-05, "loss": 3.144, "mean_token_accuracy": 0.8520988523960114, "step": 173 }, { "epoch": 1.392, "grad_norm": 5.794501781463623, "learning_rate": 3.292929292929293e-05, "loss": 3.294, "mean_token_accuracy": 0.8296584039926529, "step": 174 }, { "epoch": 1.4, "grad_norm": 6.306114196777344, "learning_rate": 3.282828282828283e-05, "loss": 3.8122, "mean_token_accuracy": 0.8015681505203247, "step": 175 }, { "epoch": 1.408, "grad_norm": 5.309399127960205, "learning_rate": 3.272727272727273e-05, "loss": 3.5656, "mean_token_accuracy": 0.8148599863052368, "step": 176 }, { "epoch": 1.416, "grad_norm": 6.384012699127197, "learning_rate": 3.2626262626262624e-05, "loss": 3.3465, "mean_token_accuracy": 0.8153296113014221, "step": 177 }, { "epoch": 1.424, "grad_norm": 5.76847505569458, "learning_rate": 3.2525252525252524e-05, "loss": 3.5117, "mean_token_accuracy": 0.8243328183889389, "step": 178 }, { "epoch": 1.432, "grad_norm": 5.956698417663574, "learning_rate": 3.2424242424242423e-05, "loss": 3.3246, "mean_token_accuracy": 0.8309469819068909, "step": 179 }, { "epoch": 1.44, "grad_norm": 5.436334133148193, "learning_rate": 3.232323232323233e-05, "loss": 3.3948, "mean_token_accuracy": 0.8304746299982071, "step": 180 }, { "epoch": 1.448, "grad_norm": 6.713822364807129, "learning_rate": 3.222222222222223e-05, "loss": 3.7249, "mean_token_accuracy": 0.8197664022445679, "step": 181 }, { "epoch": 1.456, "grad_norm": 5.69707727432251, "learning_rate": 3.212121212121212e-05, "loss": 3.4366, "mean_token_accuracy": 0.8290866911411285, "step": 182 }, { "epoch": 1.464, "grad_norm": 5.674050807952881, "learning_rate": 3.202020202020202e-05, "loss": 3.4487, "mean_token_accuracy": 0.8316894471645355, "step": 183 }, { "epoch": 1.472, "grad_norm": 6.27432918548584, "learning_rate": 3.191919191919192e-05, "loss": 3.7579, "mean_token_accuracy": 0.8203246891498566, "step": 184 }, { "epoch": 1.48, "grad_norm": 6.963736534118652, "learning_rate": 3.181818181818182e-05, "loss": 3.3907, "mean_token_accuracy": 0.8244215101003647, "step": 185 }, { "epoch": 1.488, "grad_norm": 6.205438137054443, "learning_rate": 3.171717171717172e-05, "loss": 3.2918, "mean_token_accuracy": 0.8321795910596848, "step": 186 }, { "epoch": 1.496, "grad_norm": 6.026015281677246, "learning_rate": 3.161616161616161e-05, "loss": 3.7184, "mean_token_accuracy": 0.8272889703512192, "step": 187 }, { "epoch": 1.504, "grad_norm": 5.637466907501221, "learning_rate": 3.151515151515151e-05, "loss": 2.5835, "mean_token_accuracy": 0.8698715269565582, "step": 188 }, { "epoch": 1.512, "grad_norm": 6.474348545074463, "learning_rate": 3.141414141414142e-05, "loss": 3.2364, "mean_token_accuracy": 0.8249330222606659, "step": 189 }, { "epoch": 1.52, "grad_norm": 6.4876885414123535, "learning_rate": 3.131313131313132e-05, "loss": 3.3135, "mean_token_accuracy": 0.8382684588432312, "step": 190 }, { "epoch": 1.528, "grad_norm": 5.597368240356445, "learning_rate": 3.121212121212122e-05, "loss": 3.2368, "mean_token_accuracy": 0.8341715931892395, "step": 191 }, { "epoch": 1.536, "grad_norm": 6.087445259094238, "learning_rate": 3.111111111111111e-05, "loss": 3.4584, "mean_token_accuracy": 0.8352274149656296, "step": 192 }, { "epoch": 1.544, "grad_norm": 5.523479461669922, "learning_rate": 3.101010101010101e-05, "loss": 3.0476, "mean_token_accuracy": 0.8508585840463638, "step": 193 }, { "epoch": 1.552, "grad_norm": 5.6211442947387695, "learning_rate": 3.090909090909091e-05, "loss": 3.2642, "mean_token_accuracy": 0.8426928371191025, "step": 194 }, { "epoch": 1.56, "grad_norm": 5.739730358123779, "learning_rate": 3.080808080808081e-05, "loss": 3.6835, "mean_token_accuracy": 0.8226543515920639, "step": 195 }, { "epoch": 1.568, "grad_norm": 6.00140380859375, "learning_rate": 3.070707070707071e-05, "loss": 2.9485, "mean_token_accuracy": 0.8533317595720291, "step": 196 }, { "epoch": 1.576, "grad_norm": 5.541549205780029, "learning_rate": 3.060606060606061e-05, "loss": 3.0892, "mean_token_accuracy": 0.8474195152521133, "step": 197 }, { "epoch": 1.584, "grad_norm": 5.923431873321533, "learning_rate": 3.050505050505051e-05, "loss": 3.018, "mean_token_accuracy": 0.84869784116745, "step": 198 }, { "epoch": 1.592, "grad_norm": 6.204696178436279, "learning_rate": 3.0404040404040406e-05, "loss": 3.5126, "mean_token_accuracy": 0.8261184245347977, "step": 199 }, { "epoch": 1.6, "grad_norm": 6.3025712966918945, "learning_rate": 3.0303030303030306e-05, "loss": 3.4185, "mean_token_accuracy": 0.8210860043764114, "step": 200 }, { "epoch": 1.608, "grad_norm": 5.689709663391113, "learning_rate": 3.0202020202020205e-05, "loss": 3.5322, "mean_token_accuracy": 0.8191230297088623, "step": 201 }, { "epoch": 1.616, "grad_norm": 5.707929611206055, "learning_rate": 3.01010101010101e-05, "loss": 3.024, "mean_token_accuracy": 0.8488472253084183, "step": 202 }, { "epoch": 1.624, "grad_norm": 5.80771017074585, "learning_rate": 3e-05, "loss": 3.5556, "mean_token_accuracy": 0.8266987651586533, "step": 203 }, { "epoch": 1.6320000000000001, "grad_norm": 6.03700590133667, "learning_rate": 2.98989898989899e-05, "loss": 3.1062, "mean_token_accuracy": 0.8308413475751877, "step": 204 }, { "epoch": 1.6400000000000001, "grad_norm": 6.733646392822266, "learning_rate": 2.9797979797979796e-05, "loss": 3.1995, "mean_token_accuracy": 0.8294665813446045, "step": 205 }, { "epoch": 1.6480000000000001, "grad_norm": 5.883334636688232, "learning_rate": 2.96969696969697e-05, "loss": 2.8124, "mean_token_accuracy": 0.8439156115055084, "step": 206 }, { "epoch": 1.6560000000000001, "grad_norm": 5.941483497619629, "learning_rate": 2.95959595959596e-05, "loss": 3.0179, "mean_token_accuracy": 0.8416165858507156, "step": 207 }, { "epoch": 1.6640000000000001, "grad_norm": 6.391957759857178, "learning_rate": 2.9494949494949498e-05, "loss": 3.7068, "mean_token_accuracy": 0.8122821152210236, "step": 208 }, { "epoch": 1.6720000000000002, "grad_norm": 6.237335205078125, "learning_rate": 2.9393939393939394e-05, "loss": 3.32, "mean_token_accuracy": 0.8313788622617722, "step": 209 }, { "epoch": 1.6800000000000002, "grad_norm": 6.5731987953186035, "learning_rate": 2.9292929292929294e-05, "loss": 3.5619, "mean_token_accuracy": 0.816206768155098, "step": 210 }, { "epoch": 1.688, "grad_norm": 5.061030387878418, "learning_rate": 2.9191919191919193e-05, "loss": 2.8566, "mean_token_accuracy": 0.8475048393011093, "step": 211 }, { "epoch": 1.696, "grad_norm": 5.921192646026611, "learning_rate": 2.909090909090909e-05, "loss": 3.3349, "mean_token_accuracy": 0.827596977353096, "step": 212 }, { "epoch": 1.704, "grad_norm": 5.721929550170898, "learning_rate": 2.898989898989899e-05, "loss": 2.7363, "mean_token_accuracy": 0.8517429679632187, "step": 213 }, { "epoch": 1.712, "grad_norm": 6.763914585113525, "learning_rate": 2.8888888888888888e-05, "loss": 2.8292, "mean_token_accuracy": 0.8498516529798508, "step": 214 }, { "epoch": 1.72, "grad_norm": 6.817018032073975, "learning_rate": 2.878787878787879e-05, "loss": 3.2958, "mean_token_accuracy": 0.8348537087440491, "step": 215 }, { "epoch": 1.728, "grad_norm": 5.659510612487793, "learning_rate": 2.868686868686869e-05, "loss": 2.8683, "mean_token_accuracy": 0.8435689210891724, "step": 216 }, { "epoch": 1.736, "grad_norm": 6.099102020263672, "learning_rate": 2.8585858585858587e-05, "loss": 2.9825, "mean_token_accuracy": 0.8394478559494019, "step": 217 }, { "epoch": 1.744, "grad_norm": 6.094569206237793, "learning_rate": 2.8484848484848486e-05, "loss": 3.2325, "mean_token_accuracy": 0.8246908932924271, "step": 218 }, { "epoch": 1.752, "grad_norm": 6.866254806518555, "learning_rate": 2.8383838383838386e-05, "loss": 3.1284, "mean_token_accuracy": 0.8342044651508331, "step": 219 }, { "epoch": 1.76, "grad_norm": 6.149699687957764, "learning_rate": 2.8282828282828282e-05, "loss": 3.6781, "mean_token_accuracy": 0.8211702555418015, "step": 220 }, { "epoch": 1.768, "grad_norm": 5.862130641937256, "learning_rate": 2.818181818181818e-05, "loss": 3.3624, "mean_token_accuracy": 0.8324427157640457, "step": 221 }, { "epoch": 1.776, "grad_norm": 5.395320415496826, "learning_rate": 2.808080808080808e-05, "loss": 3.3338, "mean_token_accuracy": 0.8399553894996643, "step": 222 }, { "epoch": 1.784, "grad_norm": 5.550381183624268, "learning_rate": 2.7979797979797984e-05, "loss": 3.4036, "mean_token_accuracy": 0.8223441988229752, "step": 223 }, { "epoch": 1.792, "grad_norm": 5.406087875366211, "learning_rate": 2.7878787878787883e-05, "loss": 2.6777, "mean_token_accuracy": 0.8600380420684814, "step": 224 }, { "epoch": 1.8, "grad_norm": 5.7635698318481445, "learning_rate": 2.777777777777778e-05, "loss": 3.5319, "mean_token_accuracy": 0.8255642205476761, "step": 225 }, { "epoch": 1.808, "grad_norm": 5.934004783630371, "learning_rate": 2.767676767676768e-05, "loss": 3.2659, "mean_token_accuracy": 0.8305630534887314, "step": 226 }, { "epoch": 1.8159999999999998, "grad_norm": 6.8721184730529785, "learning_rate": 2.7575757575757578e-05, "loss": 3.1587, "mean_token_accuracy": 0.8299548774957657, "step": 227 }, { "epoch": 1.8239999999999998, "grad_norm": 5.556332588195801, "learning_rate": 2.7474747474747474e-05, "loss": 2.8912, "mean_token_accuracy": 0.8541744500398636, "step": 228 }, { "epoch": 1.8319999999999999, "grad_norm": 5.591052532196045, "learning_rate": 2.7373737373737374e-05, "loss": 3.1768, "mean_token_accuracy": 0.840098574757576, "step": 229 }, { "epoch": 1.8399999999999999, "grad_norm": 7.923620700836182, "learning_rate": 2.7272727272727273e-05, "loss": 3.1165, "mean_token_accuracy": 0.8416318744421005, "step": 230 }, { "epoch": 1.8479999999999999, "grad_norm": 6.180293083190918, "learning_rate": 2.717171717171717e-05, "loss": 3.1878, "mean_token_accuracy": 0.8414299786090851, "step": 231 }, { "epoch": 1.8559999999999999, "grad_norm": 6.479142189025879, "learning_rate": 2.7070707070707075e-05, "loss": 3.519, "mean_token_accuracy": 0.8237949758768082, "step": 232 }, { "epoch": 1.8639999999999999, "grad_norm": 6.211218357086182, "learning_rate": 2.696969696969697e-05, "loss": 3.6602, "mean_token_accuracy": 0.8112812489271164, "step": 233 }, { "epoch": 1.8719999999999999, "grad_norm": 6.262876033782959, "learning_rate": 2.686868686868687e-05, "loss": 3.8658, "mean_token_accuracy": 0.8071164190769196, "step": 234 }, { "epoch": 1.88, "grad_norm": 5.474344730377197, "learning_rate": 2.676767676767677e-05, "loss": 2.971, "mean_token_accuracy": 0.8377246409654617, "step": 235 }, { "epoch": 1.888, "grad_norm": 6.193055629730225, "learning_rate": 2.6666666666666667e-05, "loss": 3.478, "mean_token_accuracy": 0.823166161775589, "step": 236 }, { "epoch": 1.896, "grad_norm": 5.627188682556152, "learning_rate": 2.6565656565656566e-05, "loss": 3.1995, "mean_token_accuracy": 0.8456520289182663, "step": 237 }, { "epoch": 1.904, "grad_norm": 6.110049247741699, "learning_rate": 2.6464646464646466e-05, "loss": 3.0544, "mean_token_accuracy": 0.8367758989334106, "step": 238 }, { "epoch": 1.912, "grad_norm": 5.3701982498168945, "learning_rate": 2.636363636363636e-05, "loss": 2.6659, "mean_token_accuracy": 0.8609540313482285, "step": 239 }, { "epoch": 1.92, "grad_norm": 5.352710247039795, "learning_rate": 2.6262626262626268e-05, "loss": 3.082, "mean_token_accuracy": 0.842663049697876, "step": 240 }, { "epoch": 1.928, "grad_norm": 6.08484411239624, "learning_rate": 2.6161616161616164e-05, "loss": 3.3737, "mean_token_accuracy": 0.8245035409927368, "step": 241 }, { "epoch": 1.936, "grad_norm": 5.956201076507568, "learning_rate": 2.6060606060606063e-05, "loss": 2.9072, "mean_token_accuracy": 0.8548275828361511, "step": 242 }, { "epoch": 1.944, "grad_norm": 5.499819278717041, "learning_rate": 2.5959595959595963e-05, "loss": 3.355, "mean_token_accuracy": 0.820908397436142, "step": 243 }, { "epoch": 1.952, "grad_norm": 6.338292598724365, "learning_rate": 2.585858585858586e-05, "loss": 3.7158, "mean_token_accuracy": 0.8252478241920471, "step": 244 }, { "epoch": 1.96, "grad_norm": 6.105760097503662, "learning_rate": 2.575757575757576e-05, "loss": 3.1973, "mean_token_accuracy": 0.8429799377918243, "step": 245 }, { "epoch": 1.968, "grad_norm": 5.636971950531006, "learning_rate": 2.5656565656565658e-05, "loss": 2.6206, "mean_token_accuracy": 0.8573070019483566, "step": 246 }, { "epoch": 1.976, "grad_norm": 6.419624328613281, "learning_rate": 2.5555555555555554e-05, "loss": 3.5778, "mean_token_accuracy": 0.8265507072210312, "step": 247 }, { "epoch": 1.984, "grad_norm": 5.554307460784912, "learning_rate": 2.5454545454545454e-05, "loss": 2.9344, "mean_token_accuracy": 0.8515568971633911, "step": 248 }, { "epoch": 1.992, "grad_norm": 6.059617519378662, "learning_rate": 2.5353535353535356e-05, "loss": 3.4458, "mean_token_accuracy": 0.8348643332719803, "step": 249 }, { "epoch": 2.0, "grad_norm": 5.995134353637695, "learning_rate": 2.5252525252525256e-05, "loss": 2.9078, "mean_token_accuracy": 0.8390886038541794, "step": 250 }, { "epoch": 2.008, "grad_norm": 5.29079532623291, "learning_rate": 2.5151515151515155e-05, "loss": 2.5449, "mean_token_accuracy": 0.8623387068510056, "step": 251 }, { "epoch": 2.016, "grad_norm": 6.307847499847412, "learning_rate": 2.505050505050505e-05, "loss": 2.6746, "mean_token_accuracy": 0.8547452688217163, "step": 252 }, { "epoch": 2.024, "grad_norm": 5.258662700653076, "learning_rate": 2.494949494949495e-05, "loss": 2.8106, "mean_token_accuracy": 0.8465935438871384, "step": 253 }, { "epoch": 2.032, "grad_norm": 5.160598278045654, "learning_rate": 2.4848484848484847e-05, "loss": 2.5327, "mean_token_accuracy": 0.8596736937761307, "step": 254 }, { "epoch": 2.04, "grad_norm": 5.5139055252075195, "learning_rate": 2.474747474747475e-05, "loss": 2.5406, "mean_token_accuracy": 0.8599353432655334, "step": 255 }, { "epoch": 2.048, "grad_norm": 5.596591949462891, "learning_rate": 2.464646464646465e-05, "loss": 2.5955, "mean_token_accuracy": 0.8636805862188339, "step": 256 }, { "epoch": 2.056, "grad_norm": 5.260463237762451, "learning_rate": 2.4545454545454545e-05, "loss": 2.8003, "mean_token_accuracy": 0.8472322225570679, "step": 257 }, { "epoch": 2.064, "grad_norm": 5.485147476196289, "learning_rate": 2.4444444444444445e-05, "loss": 2.4907, "mean_token_accuracy": 0.8619499355554581, "step": 258 }, { "epoch": 2.072, "grad_norm": 5.589687824249268, "learning_rate": 2.4343434343434344e-05, "loss": 2.563, "mean_token_accuracy": 0.8651385009288788, "step": 259 }, { "epoch": 2.08, "grad_norm": 4.876934051513672, "learning_rate": 2.4242424242424244e-05, "loss": 2.0974, "mean_token_accuracy": 0.8819858431816101, "step": 260 }, { "epoch": 2.088, "grad_norm": 6.266694068908691, "learning_rate": 2.4141414141414143e-05, "loss": 2.6164, "mean_token_accuracy": 0.8617752641439438, "step": 261 }, { "epoch": 2.096, "grad_norm": 5.54473352432251, "learning_rate": 2.404040404040404e-05, "loss": 2.9005, "mean_token_accuracy": 0.8471231460571289, "step": 262 }, { "epoch": 2.104, "grad_norm": 5.299022197723389, "learning_rate": 2.393939393939394e-05, "loss": 2.3437, "mean_token_accuracy": 0.872315376996994, "step": 263 }, { "epoch": 2.112, "grad_norm": 5.8907856941223145, "learning_rate": 2.3838383838383842e-05, "loss": 2.5687, "mean_token_accuracy": 0.8683241903781891, "step": 264 }, { "epoch": 2.12, "grad_norm": 5.84760046005249, "learning_rate": 2.3737373737373738e-05, "loss": 2.6167, "mean_token_accuracy": 0.8569298684597015, "step": 265 }, { "epoch": 2.128, "grad_norm": 5.831202507019043, "learning_rate": 2.3636363636363637e-05, "loss": 3.0151, "mean_token_accuracy": 0.8395980596542358, "step": 266 }, { "epoch": 2.136, "grad_norm": 5.698826313018799, "learning_rate": 2.3535353535353537e-05, "loss": 3.1071, "mean_token_accuracy": 0.850052535533905, "step": 267 }, { "epoch": 2.144, "grad_norm": 6.112237930297852, "learning_rate": 2.3434343434343436e-05, "loss": 2.624, "mean_token_accuracy": 0.8661712259054184, "step": 268 }, { "epoch": 2.152, "grad_norm": 6.229471683502197, "learning_rate": 2.3333333333333336e-05, "loss": 2.5656, "mean_token_accuracy": 0.8691338896751404, "step": 269 }, { "epoch": 2.16, "grad_norm": 6.126317501068115, "learning_rate": 2.3232323232323232e-05, "loss": 2.6405, "mean_token_accuracy": 0.8534891903400421, "step": 270 }, { "epoch": 2.168, "grad_norm": 6.28452205657959, "learning_rate": 2.313131313131313e-05, "loss": 2.6796, "mean_token_accuracy": 0.8531413078308105, "step": 271 }, { "epoch": 2.176, "grad_norm": 5.863641262054443, "learning_rate": 2.3030303030303034e-05, "loss": 2.7015, "mean_token_accuracy": 0.8519039303064346, "step": 272 }, { "epoch": 2.184, "grad_norm": 6.535420894622803, "learning_rate": 2.292929292929293e-05, "loss": 3.0103, "mean_token_accuracy": 0.8415365815162659, "step": 273 }, { "epoch": 2.192, "grad_norm": 5.875096321105957, "learning_rate": 2.282828282828283e-05, "loss": 2.8179, "mean_token_accuracy": 0.8494950979948044, "step": 274 }, { "epoch": 2.2, "grad_norm": 5.35692834854126, "learning_rate": 2.272727272727273e-05, "loss": 2.1563, "mean_token_accuracy": 0.8810548335313797, "step": 275 }, { "epoch": 2.208, "grad_norm": 6.05448579788208, "learning_rate": 2.262626262626263e-05, "loss": 2.4902, "mean_token_accuracy": 0.8656753301620483, "step": 276 }, { "epoch": 2.216, "grad_norm": 5.888792037963867, "learning_rate": 2.2525252525252528e-05, "loss": 2.5769, "mean_token_accuracy": 0.8682566732168198, "step": 277 }, { "epoch": 2.224, "grad_norm": 5.912112712860107, "learning_rate": 2.2424242424242424e-05, "loss": 2.5715, "mean_token_accuracy": 0.8543179333209991, "step": 278 }, { "epoch": 2.232, "grad_norm": 6.485673427581787, "learning_rate": 2.2323232323232324e-05, "loss": 2.6801, "mean_token_accuracy": 0.8597021549940109, "step": 279 }, { "epoch": 2.24, "grad_norm": 5.757596969604492, "learning_rate": 2.2222222222222223e-05, "loss": 2.5495, "mean_token_accuracy": 0.8788398951292038, "step": 280 }, { "epoch": 2.248, "grad_norm": 5.577345848083496, "learning_rate": 2.2121212121212123e-05, "loss": 2.4548, "mean_token_accuracy": 0.8728667497634888, "step": 281 }, { "epoch": 2.2560000000000002, "grad_norm": 8.356124877929688, "learning_rate": 2.2020202020202022e-05, "loss": 2.9846, "mean_token_accuracy": 0.8483117371797562, "step": 282 }, { "epoch": 2.2640000000000002, "grad_norm": 5.887564659118652, "learning_rate": 2.191919191919192e-05, "loss": 2.7833, "mean_token_accuracy": 0.8488295823335648, "step": 283 }, { "epoch": 2.2720000000000002, "grad_norm": 6.558447360992432, "learning_rate": 2.1818181818181818e-05, "loss": 2.3944, "mean_token_accuracy": 0.8682046681642532, "step": 284 }, { "epoch": 2.2800000000000002, "grad_norm": 6.828135013580322, "learning_rate": 2.171717171717172e-05, "loss": 2.8403, "mean_token_accuracy": 0.8527731895446777, "step": 285 }, { "epoch": 2.288, "grad_norm": 5.772212505340576, "learning_rate": 2.1616161616161617e-05, "loss": 2.8966, "mean_token_accuracy": 0.8490441888570786, "step": 286 }, { "epoch": 2.296, "grad_norm": 5.538363933563232, "learning_rate": 2.1515151515151516e-05, "loss": 2.4595, "mean_token_accuracy": 0.8743099421262741, "step": 287 }, { "epoch": 2.304, "grad_norm": 6.172807216644287, "learning_rate": 2.1414141414141416e-05, "loss": 2.3328, "mean_token_accuracy": 0.8704792261123657, "step": 288 }, { "epoch": 2.312, "grad_norm": 6.232668399810791, "learning_rate": 2.1313131313131315e-05, "loss": 2.9026, "mean_token_accuracy": 0.8546594232320786, "step": 289 }, { "epoch": 2.32, "grad_norm": 6.070257663726807, "learning_rate": 2.1212121212121215e-05, "loss": 2.4686, "mean_token_accuracy": 0.8643316328525543, "step": 290 }, { "epoch": 2.328, "grad_norm": 5.737977504730225, "learning_rate": 2.111111111111111e-05, "loss": 2.9917, "mean_token_accuracy": 0.8514521420001984, "step": 291 }, { "epoch": 2.336, "grad_norm": 6.409512519836426, "learning_rate": 2.101010101010101e-05, "loss": 2.6572, "mean_token_accuracy": 0.8566225320100784, "step": 292 }, { "epoch": 2.344, "grad_norm": 6.57427978515625, "learning_rate": 2.090909090909091e-05, "loss": 2.466, "mean_token_accuracy": 0.853950634598732, "step": 293 }, { "epoch": 2.352, "grad_norm": 6.017867088317871, "learning_rate": 2.080808080808081e-05, "loss": 2.8763, "mean_token_accuracy": 0.8532497733831406, "step": 294 }, { "epoch": 2.36, "grad_norm": 5.640237808227539, "learning_rate": 2.070707070707071e-05, "loss": 2.6013, "mean_token_accuracy": 0.8739284723997116, "step": 295 }, { "epoch": 2.368, "grad_norm": 6.341038703918457, "learning_rate": 2.0606060606060608e-05, "loss": 2.6485, "mean_token_accuracy": 0.8521928191184998, "step": 296 }, { "epoch": 2.376, "grad_norm": 6.142149925231934, "learning_rate": 2.0505050505050504e-05, "loss": 2.3397, "mean_token_accuracy": 0.8850863575935364, "step": 297 }, { "epoch": 2.384, "grad_norm": 6.308354377746582, "learning_rate": 2.0404040404040407e-05, "loss": 2.5977, "mean_token_accuracy": 0.8648134768009186, "step": 298 }, { "epoch": 2.392, "grad_norm": 5.566152572631836, "learning_rate": 2.0303030303030303e-05, "loss": 2.7123, "mean_token_accuracy": 0.8633100241422653, "step": 299 }, { "epoch": 2.4, "grad_norm": 5.725775241851807, "learning_rate": 2.0202020202020203e-05, "loss": 2.6108, "mean_token_accuracy": 0.8738928139209747, "step": 300 }, { "epoch": 2.408, "grad_norm": 6.435664176940918, "learning_rate": 2.0101010101010102e-05, "loss": 2.7976, "mean_token_accuracy": 0.8510608673095703, "step": 301 }, { "epoch": 2.416, "grad_norm": 7.4542622566223145, "learning_rate": 2e-05, "loss": 2.8097, "mean_token_accuracy": 0.8516113609075546, "step": 302 }, { "epoch": 2.424, "grad_norm": 5.655482769012451, "learning_rate": 1.98989898989899e-05, "loss": 2.3856, "mean_token_accuracy": 0.8649388700723648, "step": 303 }, { "epoch": 2.432, "grad_norm": 6.412247180938721, "learning_rate": 1.9797979797979797e-05, "loss": 2.8675, "mean_token_accuracy": 0.8470883667469025, "step": 304 }, { "epoch": 2.44, "grad_norm": 7.186456680297852, "learning_rate": 1.9696969696969697e-05, "loss": 3.1156, "mean_token_accuracy": 0.8328486531972885, "step": 305 }, { "epoch": 2.448, "grad_norm": 5.615805625915527, "learning_rate": 1.95959595959596e-05, "loss": 2.6565, "mean_token_accuracy": 0.8666907250881195, "step": 306 }, { "epoch": 2.456, "grad_norm": 6.449917793273926, "learning_rate": 1.9494949494949496e-05, "loss": 2.9544, "mean_token_accuracy": 0.8483339697122574, "step": 307 }, { "epoch": 2.464, "grad_norm": 5.747985363006592, "learning_rate": 1.9393939393939395e-05, "loss": 2.1422, "mean_token_accuracy": 0.8723134845495224, "step": 308 }, { "epoch": 2.472, "grad_norm": 6.027318000793457, "learning_rate": 1.9292929292929295e-05, "loss": 2.5305, "mean_token_accuracy": 0.8660032451152802, "step": 309 }, { "epoch": 2.48, "grad_norm": 7.454213619232178, "learning_rate": 1.919191919191919e-05, "loss": 2.5679, "mean_token_accuracy": 0.8533085733652115, "step": 310 }, { "epoch": 2.488, "grad_norm": 5.4801716804504395, "learning_rate": 1.9090909090909094e-05, "loss": 2.6006, "mean_token_accuracy": 0.8550811111927032, "step": 311 }, { "epoch": 2.496, "grad_norm": 6.020359039306641, "learning_rate": 1.898989898989899e-05, "loss": 2.2521, "mean_token_accuracy": 0.8731967061758041, "step": 312 }, { "epoch": 2.504, "grad_norm": 5.78825569152832, "learning_rate": 1.888888888888889e-05, "loss": 2.4461, "mean_token_accuracy": 0.8596260696649551, "step": 313 }, { "epoch": 2.512, "grad_norm": 5.771088123321533, "learning_rate": 1.878787878787879e-05, "loss": 2.3915, "mean_token_accuracy": 0.8663473874330521, "step": 314 }, { "epoch": 2.52, "grad_norm": 5.939653396606445, "learning_rate": 1.8686868686868688e-05, "loss": 2.7159, "mean_token_accuracy": 0.8722144514322281, "step": 315 }, { "epoch": 2.528, "grad_norm": 6.489267349243164, "learning_rate": 1.8585858585858588e-05, "loss": 2.6137, "mean_token_accuracy": 0.8580323159694672, "step": 316 }, { "epoch": 2.536, "grad_norm": 5.97398042678833, "learning_rate": 1.8484848484848487e-05, "loss": 2.7352, "mean_token_accuracy": 0.8655538409948349, "step": 317 }, { "epoch": 2.544, "grad_norm": 5.858009338378906, "learning_rate": 1.8383838383838383e-05, "loss": 2.2991, "mean_token_accuracy": 0.8808925747871399, "step": 318 }, { "epoch": 2.552, "grad_norm": 5.720334053039551, "learning_rate": 1.8282828282828286e-05, "loss": 2.6282, "mean_token_accuracy": 0.8669092357158661, "step": 319 }, { "epoch": 2.56, "grad_norm": 6.569767475128174, "learning_rate": 1.8181818181818182e-05, "loss": 2.5663, "mean_token_accuracy": 0.8692184388637543, "step": 320 }, { "epoch": 2.568, "grad_norm": 6.509884357452393, "learning_rate": 1.808080808080808e-05, "loss": 2.8698, "mean_token_accuracy": 0.8457562029361725, "step": 321 }, { "epoch": 2.576, "grad_norm": 5.976521968841553, "learning_rate": 1.797979797979798e-05, "loss": 2.7501, "mean_token_accuracy": 0.846638560295105, "step": 322 }, { "epoch": 2.584, "grad_norm": 5.878550052642822, "learning_rate": 1.787878787878788e-05, "loss": 2.2021, "mean_token_accuracy": 0.8834634870290756, "step": 323 }, { "epoch": 2.592, "grad_norm": 5.905157566070557, "learning_rate": 1.777777777777778e-05, "loss": 2.5808, "mean_token_accuracy": 0.8616163432598114, "step": 324 }, { "epoch": 2.6, "grad_norm": 5.510688304901123, "learning_rate": 1.7676767676767676e-05, "loss": 2.265, "mean_token_accuracy": 0.8738918304443359, "step": 325 }, { "epoch": 2.608, "grad_norm": 5.952402591705322, "learning_rate": 1.7575757575757576e-05, "loss": 2.7743, "mean_token_accuracy": 0.85427226126194, "step": 326 }, { "epoch": 2.616, "grad_norm": 6.647154808044434, "learning_rate": 1.7474747474747475e-05, "loss": 2.9181, "mean_token_accuracy": 0.8349238932132721, "step": 327 }, { "epoch": 2.624, "grad_norm": 7.24924898147583, "learning_rate": 1.7373737373737375e-05, "loss": 3.1174, "mean_token_accuracy": 0.8411069959402084, "step": 328 }, { "epoch": 2.632, "grad_norm": 5.238758563995361, "learning_rate": 1.7272727272727274e-05, "loss": 2.4068, "mean_token_accuracy": 0.8669510632753372, "step": 329 }, { "epoch": 2.64, "grad_norm": 5.997193336486816, "learning_rate": 1.7171717171717173e-05, "loss": 3.0963, "mean_token_accuracy": 0.8448814451694489, "step": 330 }, { "epoch": 2.648, "grad_norm": 6.348751544952393, "learning_rate": 1.707070707070707e-05, "loss": 2.8593, "mean_token_accuracy": 0.8491098284721375, "step": 331 }, { "epoch": 2.656, "grad_norm": 5.824342250823975, "learning_rate": 1.6969696969696972e-05, "loss": 2.5448, "mean_token_accuracy": 0.8616123348474503, "step": 332 }, { "epoch": 2.664, "grad_norm": 7.513331890106201, "learning_rate": 1.686868686868687e-05, "loss": 2.5108, "mean_token_accuracy": 0.8665321916341782, "step": 333 }, { "epoch": 2.672, "grad_norm": 5.547114372253418, "learning_rate": 1.6767676767676768e-05, "loss": 2.674, "mean_token_accuracy": 0.8699924349784851, "step": 334 }, { "epoch": 2.68, "grad_norm": 6.60705041885376, "learning_rate": 1.6666666666666667e-05, "loss": 2.9442, "mean_token_accuracy": 0.8435302972793579, "step": 335 }, { "epoch": 2.6879999999999997, "grad_norm": 6.116333961486816, "learning_rate": 1.6565656565656567e-05, "loss": 2.3428, "mean_token_accuracy": 0.8697264492511749, "step": 336 }, { "epoch": 2.6959999999999997, "grad_norm": 6.642714500427246, "learning_rate": 1.6464646464646466e-05, "loss": 2.7629, "mean_token_accuracy": 0.863296702504158, "step": 337 }, { "epoch": 2.7039999999999997, "grad_norm": 6.764316558837891, "learning_rate": 1.6363636363636366e-05, "loss": 3.0041, "mean_token_accuracy": 0.8556036204099655, "step": 338 }, { "epoch": 2.7119999999999997, "grad_norm": 5.682474613189697, "learning_rate": 1.6262626262626262e-05, "loss": 2.4677, "mean_token_accuracy": 0.8652613461017609, "step": 339 }, { "epoch": 2.7199999999999998, "grad_norm": 6.347187519073486, "learning_rate": 1.6161616161616165e-05, "loss": 2.4674, "mean_token_accuracy": 0.8690385818481445, "step": 340 }, { "epoch": 2.7279999999999998, "grad_norm": 5.507145404815674, "learning_rate": 1.606060606060606e-05, "loss": 2.692, "mean_token_accuracy": 0.8610135316848755, "step": 341 }, { "epoch": 2.7359999999999998, "grad_norm": 6.108325481414795, "learning_rate": 1.595959595959596e-05, "loss": 2.4909, "mean_token_accuracy": 0.8648791164159775, "step": 342 }, { "epoch": 2.7439999999999998, "grad_norm": 6.017334938049316, "learning_rate": 1.585858585858586e-05, "loss": 2.2442, "mean_token_accuracy": 0.8753398060798645, "step": 343 }, { "epoch": 2.752, "grad_norm": 6.193384170532227, "learning_rate": 1.5757575757575756e-05, "loss": 2.3729, "mean_token_accuracy": 0.8614612221717834, "step": 344 }, { "epoch": 2.76, "grad_norm": 6.495743751525879, "learning_rate": 1.565656565656566e-05, "loss": 2.8584, "mean_token_accuracy": 0.8486870527267456, "step": 345 }, { "epoch": 2.768, "grad_norm": 5.537295818328857, "learning_rate": 1.5555555555555555e-05, "loss": 2.3311, "mean_token_accuracy": 0.8707073032855988, "step": 346 }, { "epoch": 2.776, "grad_norm": 5.175790309906006, "learning_rate": 1.5454545454545454e-05, "loss": 2.4548, "mean_token_accuracy": 0.8665929436683655, "step": 347 }, { "epoch": 2.784, "grad_norm": 5.646080017089844, "learning_rate": 1.5353535353535354e-05, "loss": 2.4042, "mean_token_accuracy": 0.8688449859619141, "step": 348 }, { "epoch": 2.792, "grad_norm": 5.514770984649658, "learning_rate": 1.5252525252525255e-05, "loss": 2.3592, "mean_token_accuracy": 0.868006706237793, "step": 349 }, { "epoch": 2.8, "grad_norm": 5.443539619445801, "learning_rate": 1.5151515151515153e-05, "loss": 2.7749, "mean_token_accuracy": 0.8517529368400574, "step": 350 }, { "epoch": 2.808, "grad_norm": 6.177525043487549, "learning_rate": 1.505050505050505e-05, "loss": 2.7422, "mean_token_accuracy": 0.853731244802475, "step": 351 }, { "epoch": 2.816, "grad_norm": 5.842599868774414, "learning_rate": 1.494949494949495e-05, "loss": 2.7828, "mean_token_accuracy": 0.8474886268377304, "step": 352 }, { "epoch": 2.824, "grad_norm": 6.956360816955566, "learning_rate": 1.484848484848485e-05, "loss": 2.6584, "mean_token_accuracy": 0.8608374446630478, "step": 353 }, { "epoch": 2.832, "grad_norm": 6.075902938842773, "learning_rate": 1.4747474747474749e-05, "loss": 1.7885, "mean_token_accuracy": 0.8908968865871429, "step": 354 }, { "epoch": 2.84, "grad_norm": 5.676952838897705, "learning_rate": 1.4646464646464647e-05, "loss": 2.2233, "mean_token_accuracy": 0.8666234165430069, "step": 355 }, { "epoch": 2.848, "grad_norm": 6.072289943695068, "learning_rate": 1.4545454545454545e-05, "loss": 2.9788, "mean_token_accuracy": 0.8446744084358215, "step": 356 }, { "epoch": 2.856, "grad_norm": 6.316305637359619, "learning_rate": 1.4444444444444444e-05, "loss": 2.6826, "mean_token_accuracy": 0.8572750687599182, "step": 357 }, { "epoch": 2.864, "grad_norm": 6.896461486816406, "learning_rate": 1.4343434343434345e-05, "loss": 2.3163, "mean_token_accuracy": 0.874391183257103, "step": 358 }, { "epoch": 2.872, "grad_norm": 6.410128116607666, "learning_rate": 1.4242424242424243e-05, "loss": 2.66, "mean_token_accuracy": 0.8778804391622543, "step": 359 }, { "epoch": 2.88, "grad_norm": 5.7101664543151855, "learning_rate": 1.4141414141414141e-05, "loss": 2.5463, "mean_token_accuracy": 0.8713642507791519, "step": 360 }, { "epoch": 2.888, "grad_norm": 5.941382884979248, "learning_rate": 1.404040404040404e-05, "loss": 2.4611, "mean_token_accuracy": 0.8689820915460587, "step": 361 }, { "epoch": 2.896, "grad_norm": 5.766918182373047, "learning_rate": 1.3939393939393942e-05, "loss": 2.3588, "mean_token_accuracy": 0.8715604543685913, "step": 362 }, { "epoch": 2.904, "grad_norm": 6.412060260772705, "learning_rate": 1.383838383838384e-05, "loss": 2.9418, "mean_token_accuracy": 0.8422341346740723, "step": 363 }, { "epoch": 2.912, "grad_norm": 6.339536666870117, "learning_rate": 1.3737373737373737e-05, "loss": 2.8499, "mean_token_accuracy": 0.8519201278686523, "step": 364 }, { "epoch": 2.92, "grad_norm": 5.80408239364624, "learning_rate": 1.3636363636363637e-05, "loss": 2.6526, "mean_token_accuracy": 0.8615650832653046, "step": 365 }, { "epoch": 2.928, "grad_norm": 5.643406391143799, "learning_rate": 1.3535353535353538e-05, "loss": 2.4131, "mean_token_accuracy": 0.8788148909807205, "step": 366 }, { "epoch": 2.936, "grad_norm": 6.2641167640686035, "learning_rate": 1.3434343434343436e-05, "loss": 2.7667, "mean_token_accuracy": 0.8569764792919159, "step": 367 }, { "epoch": 2.944, "grad_norm": 6.191479682922363, "learning_rate": 1.3333333333333333e-05, "loss": 2.3207, "mean_token_accuracy": 0.8694847077131271, "step": 368 }, { "epoch": 2.952, "grad_norm": 5.668253421783447, "learning_rate": 1.3232323232323233e-05, "loss": 2.4538, "mean_token_accuracy": 0.8613507598638535, "step": 369 }, { "epoch": 2.96, "grad_norm": 5.68297004699707, "learning_rate": 1.3131313131313134e-05, "loss": 2.4336, "mean_token_accuracy": 0.8691912293434143, "step": 370 }, { "epoch": 2.968, "grad_norm": 6.308213710784912, "learning_rate": 1.3030303030303032e-05, "loss": 2.9029, "mean_token_accuracy": 0.8455093801021576, "step": 371 }, { "epoch": 2.976, "grad_norm": 5.907562255859375, "learning_rate": 1.292929292929293e-05, "loss": 2.4898, "mean_token_accuracy": 0.8742183148860931, "step": 372 }, { "epoch": 2.984, "grad_norm": 6.077236652374268, "learning_rate": 1.2828282828282829e-05, "loss": 2.5998, "mean_token_accuracy": 0.8597747683525085, "step": 373 }, { "epoch": 2.992, "grad_norm": 5.781510829925537, "learning_rate": 1.2727272727272727e-05, "loss": 2.5208, "mean_token_accuracy": 0.8634484708309174, "step": 374 }, { "epoch": 3.0, "grad_norm": 6.007801532745361, "learning_rate": 1.2626262626262628e-05, "loss": 2.5073, "mean_token_accuracy": 0.8571873605251312, "step": 375 }, { "epoch": 3.008, "grad_norm": 5.24554443359375, "learning_rate": 1.2525252525252526e-05, "loss": 2.189, "mean_token_accuracy": 0.8963351398706436, "step": 376 }, { "epoch": 3.016, "grad_norm": 5.7106614112854, "learning_rate": 1.2424242424242424e-05, "loss": 2.1298, "mean_token_accuracy": 0.8783598244190216, "step": 377 }, { "epoch": 3.024, "grad_norm": 5.2301025390625, "learning_rate": 1.2323232323232325e-05, "loss": 1.9574, "mean_token_accuracy": 0.8882095962762833, "step": 378 }, { "epoch": 3.032, "grad_norm": 5.160372257232666, "learning_rate": 1.2222222222222222e-05, "loss": 2.315, "mean_token_accuracy": 0.8809339702129364, "step": 379 }, { "epoch": 3.04, "grad_norm": 5.574481010437012, "learning_rate": 1.2121212121212122e-05, "loss": 2.1501, "mean_token_accuracy": 0.8889680802822113, "step": 380 }, { "epoch": 3.048, "grad_norm": 6.913398265838623, "learning_rate": 1.202020202020202e-05, "loss": 2.3129, "mean_token_accuracy": 0.8849718272686005, "step": 381 }, { "epoch": 3.056, "grad_norm": 6.229403972625732, "learning_rate": 1.1919191919191921e-05, "loss": 2.3757, "mean_token_accuracy": 0.8662046045064926, "step": 382 }, { "epoch": 3.064, "grad_norm": 5.60806131362915, "learning_rate": 1.1818181818181819e-05, "loss": 2.6174, "mean_token_accuracy": 0.8517654091119766, "step": 383 }, { "epoch": 3.072, "grad_norm": 5.600075721740723, "learning_rate": 1.1717171717171718e-05, "loss": 2.3305, "mean_token_accuracy": 0.8705442994832993, "step": 384 }, { "epoch": 3.08, "grad_norm": 6.062666416168213, "learning_rate": 1.1616161616161616e-05, "loss": 2.2109, "mean_token_accuracy": 0.8755911886692047, "step": 385 }, { "epoch": 3.088, "grad_norm": 7.154458999633789, "learning_rate": 1.1515151515151517e-05, "loss": 2.6082, "mean_token_accuracy": 0.860401377081871, "step": 386 }, { "epoch": 3.096, "grad_norm": 6.247750759124756, "learning_rate": 1.1414141414141415e-05, "loss": 2.2877, "mean_token_accuracy": 0.880204901099205, "step": 387 }, { "epoch": 3.104, "grad_norm": 5.955714702606201, "learning_rate": 1.1313131313131314e-05, "loss": 1.8738, "mean_token_accuracy": 0.9012208431959152, "step": 388 }, { "epoch": 3.112, "grad_norm": 5.796663284301758, "learning_rate": 1.1212121212121212e-05, "loss": 1.6587, "mean_token_accuracy": 0.9029347151517868, "step": 389 }, { "epoch": 3.12, "grad_norm": 5.392340660095215, "learning_rate": 1.1111111111111112e-05, "loss": 2.3056, "mean_token_accuracy": 0.8753447830677032, "step": 390 }, { "epoch": 3.128, "grad_norm": 7.139640808105469, "learning_rate": 1.1010101010101011e-05, "loss": 2.8439, "mean_token_accuracy": 0.8585825264453888, "step": 391 }, { "epoch": 3.136, "grad_norm": 5.9656829833984375, "learning_rate": 1.0909090909090909e-05, "loss": 2.1093, "mean_token_accuracy": 0.8788597285747528, "step": 392 }, { "epoch": 3.144, "grad_norm": 5.839116096496582, "learning_rate": 1.0808080808080808e-05, "loss": 2.2813, "mean_token_accuracy": 0.885636180639267, "step": 393 }, { "epoch": 3.152, "grad_norm": 5.032593727111816, "learning_rate": 1.0707070707070708e-05, "loss": 1.9325, "mean_token_accuracy": 0.8939605355262756, "step": 394 }, { "epoch": 3.16, "grad_norm": 6.447021961212158, "learning_rate": 1.0606060606060607e-05, "loss": 2.0899, "mean_token_accuracy": 0.8907236605882645, "step": 395 }, { "epoch": 3.168, "grad_norm": 6.839473724365234, "learning_rate": 1.0505050505050505e-05, "loss": 1.8443, "mean_token_accuracy": 0.8959701657295227, "step": 396 }, { "epoch": 3.176, "grad_norm": 6.076663017272949, "learning_rate": 1.0404040404040405e-05, "loss": 2.0503, "mean_token_accuracy": 0.8750788569450378, "step": 397 }, { "epoch": 3.184, "grad_norm": 5.917912483215332, "learning_rate": 1.0303030303030304e-05, "loss": 2.1561, "mean_token_accuracy": 0.8795887529850006, "step": 398 }, { "epoch": 3.192, "grad_norm": 6.047354698181152, "learning_rate": 1.0202020202020204e-05, "loss": 2.4048, "mean_token_accuracy": 0.8771449476480484, "step": 399 }, { "epoch": 3.2, "grad_norm": 6.2761125564575195, "learning_rate": 1.0101010101010101e-05, "loss": 2.0975, "mean_token_accuracy": 0.8842770159244537, "step": 400 }, { "epoch": 3.208, "grad_norm": 6.07623291015625, "learning_rate": 1e-05, "loss": 1.8668, "mean_token_accuracy": 0.8863209933042526, "step": 401 }, { "epoch": 3.216, "grad_norm": 6.246818542480469, "learning_rate": 9.898989898989899e-06, "loss": 2.245, "mean_token_accuracy": 0.8780923783779144, "step": 402 }, { "epoch": 3.224, "grad_norm": 6.323997974395752, "learning_rate": 9.7979797979798e-06, "loss": 2.1369, "mean_token_accuracy": 0.8779752850532532, "step": 403 }, { "epoch": 3.232, "grad_norm": 7.1778411865234375, "learning_rate": 9.696969696969698e-06, "loss": 2.5985, "mean_token_accuracy": 0.8683747202157974, "step": 404 }, { "epoch": 3.24, "grad_norm": 6.361847877502441, "learning_rate": 9.595959595959595e-06, "loss": 2.3375, "mean_token_accuracy": 0.8720128089189529, "step": 405 }, { "epoch": 3.248, "grad_norm": 6.138617992401123, "learning_rate": 9.494949494949495e-06, "loss": 2.0752, "mean_token_accuracy": 0.8752965927124023, "step": 406 }, { "epoch": 3.2560000000000002, "grad_norm": 5.688358783721924, "learning_rate": 9.393939393939394e-06, "loss": 1.9569, "mean_token_accuracy": 0.8975227028131485, "step": 407 }, { "epoch": 3.2640000000000002, "grad_norm": 5.894192218780518, "learning_rate": 9.292929292929294e-06, "loss": 2.0029, "mean_token_accuracy": 0.886854737997055, "step": 408 }, { "epoch": 3.2720000000000002, "grad_norm": 6.036264419555664, "learning_rate": 9.191919191919192e-06, "loss": 2.2071, "mean_token_accuracy": 0.8780567795038223, "step": 409 }, { "epoch": 3.2800000000000002, "grad_norm": 5.998513221740723, "learning_rate": 9.090909090909091e-06, "loss": 1.9301, "mean_token_accuracy": 0.8917412608861923, "step": 410 }, { "epoch": 3.288, "grad_norm": 6.909607887268066, "learning_rate": 8.98989898989899e-06, "loss": 2.1684, "mean_token_accuracy": 0.871549054980278, "step": 411 }, { "epoch": 3.296, "grad_norm": 6.868885040283203, "learning_rate": 8.88888888888889e-06, "loss": 2.3904, "mean_token_accuracy": 0.8834272921085358, "step": 412 }, { "epoch": 3.304, "grad_norm": 6.619902610778809, "learning_rate": 8.787878787878788e-06, "loss": 2.3145, "mean_token_accuracy": 0.8664613366127014, "step": 413 }, { "epoch": 3.312, "grad_norm": 7.675636291503906, "learning_rate": 8.686868686868687e-06, "loss": 2.4144, "mean_token_accuracy": 0.8673148602247238, "step": 414 }, { "epoch": 3.32, "grad_norm": 6.839119911193848, "learning_rate": 8.585858585858587e-06, "loss": 2.445, "mean_token_accuracy": 0.8668957501649857, "step": 415 }, { "epoch": 3.328, "grad_norm": 6.37800407409668, "learning_rate": 8.484848484848486e-06, "loss": 2.0239, "mean_token_accuracy": 0.8818509876728058, "step": 416 }, { "epoch": 3.336, "grad_norm": 6.840362071990967, "learning_rate": 8.383838383838384e-06, "loss": 2.2702, "mean_token_accuracy": 0.8701023757457733, "step": 417 }, { "epoch": 3.344, "grad_norm": 6.7885003089904785, "learning_rate": 8.282828282828283e-06, "loss": 2.4609, "mean_token_accuracy": 0.8644936680793762, "step": 418 }, { "epoch": 3.352, "grad_norm": 7.857728481292725, "learning_rate": 8.181818181818183e-06, "loss": 1.8772, "mean_token_accuracy": 0.8961075842380524, "step": 419 }, { "epoch": 3.36, "grad_norm": 6.570309162139893, "learning_rate": 8.080808080808082e-06, "loss": 1.8877, "mean_token_accuracy": 0.8875200748443604, "step": 420 }, { "epoch": 3.368, "grad_norm": 6.203190803527832, "learning_rate": 7.97979797979798e-06, "loss": 2.529, "mean_token_accuracy": 0.8654111176729202, "step": 421 }, { "epoch": 3.376, "grad_norm": 5.97314977645874, "learning_rate": 7.878787878787878e-06, "loss": 1.9645, "mean_token_accuracy": 0.8833965063095093, "step": 422 }, { "epoch": 3.384, "grad_norm": 6.25310754776001, "learning_rate": 7.777777777777777e-06, "loss": 2.2233, "mean_token_accuracy": 0.8773595839738846, "step": 423 }, { "epoch": 3.392, "grad_norm": 7.28585958480835, "learning_rate": 7.676767676767677e-06, "loss": 2.0955, "mean_token_accuracy": 0.8743046373128891, "step": 424 }, { "epoch": 3.4, "grad_norm": 5.4623260498046875, "learning_rate": 7.5757575757575764e-06, "loss": 2.0769, "mean_token_accuracy": 0.886017695069313, "step": 425 }, { "epoch": 3.408, "grad_norm": 6.525033950805664, "learning_rate": 7.474747474747475e-06, "loss": 1.8359, "mean_token_accuracy": 0.8860061317682266, "step": 426 }, { "epoch": 3.416, "grad_norm": 6.582717418670654, "learning_rate": 7.3737373737373745e-06, "loss": 1.9729, "mean_token_accuracy": 0.8987778276205063, "step": 427 }, { "epoch": 3.424, "grad_norm": 6.132490634918213, "learning_rate": 7.272727272727272e-06, "loss": 1.9637, "mean_token_accuracy": 0.883312463760376, "step": 428 }, { "epoch": 3.432, "grad_norm": 5.695188045501709, "learning_rate": 7.171717171717173e-06, "loss": 2.0213, "mean_token_accuracy": 0.8891171365976334, "step": 429 }, { "epoch": 3.44, "grad_norm": 8.365019798278809, "learning_rate": 7.0707070707070704e-06, "loss": 2.4396, "mean_token_accuracy": 0.8708128333091736, "step": 430 }, { "epoch": 3.448, "grad_norm": 5.691287040710449, "learning_rate": 6.969696969696971e-06, "loss": 2.0038, "mean_token_accuracy": 0.9007010161876678, "step": 431 }, { "epoch": 3.456, "grad_norm": 6.807380676269531, "learning_rate": 6.8686868686868685e-06, "loss": 1.9772, "mean_token_accuracy": 0.8897037208080292, "step": 432 }, { "epoch": 3.464, "grad_norm": 6.571257591247559, "learning_rate": 6.767676767676769e-06, "loss": 2.5581, "mean_token_accuracy": 0.8635726571083069, "step": 433 }, { "epoch": 3.472, "grad_norm": 5.491265296936035, "learning_rate": 6.666666666666667e-06, "loss": 1.9038, "mean_token_accuracy": 0.9004794210195541, "step": 434 }, { "epoch": 3.48, "grad_norm": 5.837332725524902, "learning_rate": 6.565656565656567e-06, "loss": 1.8962, "mean_token_accuracy": 0.890899047255516, "step": 435 }, { "epoch": 3.488, "grad_norm": 6.126434803009033, "learning_rate": 6.464646464646465e-06, "loss": 2.0859, "mean_token_accuracy": 0.8814673125743866, "step": 436 }, { "epoch": 3.496, "grad_norm": 5.65269660949707, "learning_rate": 6.363636363636363e-06, "loss": 1.7333, "mean_token_accuracy": 0.900140568614006, "step": 437 }, { "epoch": 3.504, "grad_norm": 6.0961079597473145, "learning_rate": 6.262626262626263e-06, "loss": 1.9624, "mean_token_accuracy": 0.8910833448171616, "step": 438 }, { "epoch": 3.512, "grad_norm": 6.404435157775879, "learning_rate": 6.161616161616162e-06, "loss": 2.3729, "mean_token_accuracy": 0.8687343299388885, "step": 439 }, { "epoch": 3.52, "grad_norm": 6.7053093910217285, "learning_rate": 6.060606060606061e-06, "loss": 2.2333, "mean_token_accuracy": 0.8874952048063278, "step": 440 }, { "epoch": 3.528, "grad_norm": 6.296670436859131, "learning_rate": 5.9595959595959605e-06, "loss": 2.1395, "mean_token_accuracy": 0.8732293993234634, "step": 441 }, { "epoch": 3.536, "grad_norm": 6.392675399780273, "learning_rate": 5.858585858585859e-06, "loss": 2.0524, "mean_token_accuracy": 0.8836774080991745, "step": 442 }, { "epoch": 3.544, "grad_norm": 6.301563262939453, "learning_rate": 5.7575757575757586e-06, "loss": 1.8739, "mean_token_accuracy": 0.8968744426965714, "step": 443 }, { "epoch": 3.552, "grad_norm": 6.225803852081299, "learning_rate": 5.656565656565657e-06, "loss": 2.0003, "mean_token_accuracy": 0.8827953040599823, "step": 444 }, { "epoch": 3.56, "grad_norm": 6.4414801597595215, "learning_rate": 5.555555555555556e-06, "loss": 2.1538, "mean_token_accuracy": 0.8831271827220917, "step": 445 }, { "epoch": 3.568, "grad_norm": 8.136292457580566, "learning_rate": 5.4545454545454545e-06, "loss": 1.7717, "mean_token_accuracy": 0.8966427743434906, "step": 446 }, { "epoch": 3.576, "grad_norm": 6.468795299530029, "learning_rate": 5.353535353535354e-06, "loss": 1.9399, "mean_token_accuracy": 0.8909775465726852, "step": 447 }, { "epoch": 3.584, "grad_norm": 6.0381083488464355, "learning_rate": 5.2525252525252526e-06, "loss": 2.0466, "mean_token_accuracy": 0.8934204578399658, "step": 448 }, { "epoch": 3.592, "grad_norm": 6.083271026611328, "learning_rate": 5.151515151515152e-06, "loss": 2.1554, "mean_token_accuracy": 0.88754041492939, "step": 449 }, { "epoch": 3.6, "grad_norm": 6.717824935913086, "learning_rate": 5.050505050505051e-06, "loss": 2.1326, "mean_token_accuracy": 0.8911218792200089, "step": 450 }, { "epoch": 3.608, "grad_norm": 6.14173698425293, "learning_rate": 4.949494949494949e-06, "loss": 1.6302, "mean_token_accuracy": 0.9065367430448532, "step": 451 }, { "epoch": 3.616, "grad_norm": 6.943065643310547, "learning_rate": 4.848484848484849e-06, "loss": 2.6283, "mean_token_accuracy": 0.8691755533218384, "step": 452 }, { "epoch": 3.624, "grad_norm": 6.845998287200928, "learning_rate": 4.747474747474747e-06, "loss": 2.1922, "mean_token_accuracy": 0.8834270387887955, "step": 453 }, { "epoch": 3.632, "grad_norm": 6.045809268951416, "learning_rate": 4.646464646464647e-06, "loss": 1.8713, "mean_token_accuracy": 0.9000080078840256, "step": 454 }, { "epoch": 3.64, "grad_norm": 7.274755477905273, "learning_rate": 4.5454545454545455e-06, "loss": 2.0781, "mean_token_accuracy": 0.8945260941982269, "step": 455 }, { "epoch": 3.648, "grad_norm": 6.384909152984619, "learning_rate": 4.444444444444445e-06, "loss": 2.1529, "mean_token_accuracy": 0.8765928000211716, "step": 456 }, { "epoch": 3.656, "grad_norm": 5.835945129394531, "learning_rate": 4.343434343434344e-06, "loss": 1.626, "mean_token_accuracy": 0.9043529778718948, "step": 457 }, { "epoch": 3.664, "grad_norm": 6.979379177093506, "learning_rate": 4.242424242424243e-06, "loss": 2.3105, "mean_token_accuracy": 0.8783639371395111, "step": 458 }, { "epoch": 3.672, "grad_norm": 7.117347240447998, "learning_rate": 4.141414141414142e-06, "loss": 2.1868, "mean_token_accuracy": 0.884697675704956, "step": 459 }, { "epoch": 3.68, "grad_norm": 6.831372261047363, "learning_rate": 4.040404040404041e-06, "loss": 2.1852, "mean_token_accuracy": 0.8815398663282394, "step": 460 }, { "epoch": 3.6879999999999997, "grad_norm": 6.203273773193359, "learning_rate": 3.939393939393939e-06, "loss": 2.1329, "mean_token_accuracy": 0.8797174096107483, "step": 461 }, { "epoch": 3.6959999999999997, "grad_norm": 5.709746837615967, "learning_rate": 3.8383838383838385e-06, "loss": 2.0121, "mean_token_accuracy": 0.8906229883432388, "step": 462 }, { "epoch": 3.7039999999999997, "grad_norm": 6.700322151184082, "learning_rate": 3.7373737373737375e-06, "loss": 2.2537, "mean_token_accuracy": 0.8837107121944427, "step": 463 }, { "epoch": 3.7119999999999997, "grad_norm": 5.748446464538574, "learning_rate": 3.636363636363636e-06, "loss": 2.199, "mean_token_accuracy": 0.8685683310031891, "step": 464 }, { "epoch": 3.7199999999999998, "grad_norm": 6.6395697593688965, "learning_rate": 3.5353535353535352e-06, "loss": 2.4631, "mean_token_accuracy": 0.8770331889390945, "step": 465 }, { "epoch": 3.7279999999999998, "grad_norm": 6.521454334259033, "learning_rate": 3.4343434343434343e-06, "loss": 2.7614, "mean_token_accuracy": 0.8615328371524811, "step": 466 }, { "epoch": 3.7359999999999998, "grad_norm": 5.956634998321533, "learning_rate": 3.3333333333333333e-06, "loss": 2.2834, "mean_token_accuracy": 0.8763253539800644, "step": 467 }, { "epoch": 3.7439999999999998, "grad_norm": 7.035250186920166, "learning_rate": 3.2323232323232324e-06, "loss": 2.033, "mean_token_accuracy": 0.8936846852302551, "step": 468 }, { "epoch": 3.752, "grad_norm": 6.001895904541016, "learning_rate": 3.1313131313131314e-06, "loss": 2.005, "mean_token_accuracy": 0.8896369785070419, "step": 469 }, { "epoch": 3.76, "grad_norm": 6.189558506011963, "learning_rate": 3.0303030303030305e-06, "loss": 2.1698, "mean_token_accuracy": 0.8766046166419983, "step": 470 }, { "epoch": 3.768, "grad_norm": 6.994729995727539, "learning_rate": 2.9292929292929295e-06, "loss": 2.108, "mean_token_accuracy": 0.8722548186779022, "step": 471 }, { "epoch": 3.776, "grad_norm": 5.766332626342773, "learning_rate": 2.8282828282828286e-06, "loss": 1.7626, "mean_token_accuracy": 0.8968187123537064, "step": 472 }, { "epoch": 3.784, "grad_norm": 6.6420392990112305, "learning_rate": 2.7272727272727272e-06, "loss": 2.1263, "mean_token_accuracy": 0.8799799233675003, "step": 473 }, { "epoch": 3.792, "grad_norm": 6.921932220458984, "learning_rate": 2.6262626262626263e-06, "loss": 1.8222, "mean_token_accuracy": 0.905672699213028, "step": 474 }, { "epoch": 3.8, "grad_norm": 6.345583915710449, "learning_rate": 2.5252525252525253e-06, "loss": 2.3984, "mean_token_accuracy": 0.8654076457023621, "step": 475 }, { "epoch": 3.808, "grad_norm": 6.0304484367370605, "learning_rate": 2.4242424242424244e-06, "loss": 2.5167, "mean_token_accuracy": 0.8724007159471512, "step": 476 }, { "epoch": 3.816, "grad_norm": 6.487928867340088, "learning_rate": 2.3232323232323234e-06, "loss": 2.5901, "mean_token_accuracy": 0.8569000661373138, "step": 477 }, { "epoch": 3.824, "grad_norm": 6.210375785827637, "learning_rate": 2.2222222222222225e-06, "loss": 1.7313, "mean_token_accuracy": 0.8982816338539124, "step": 478 }, { "epoch": 3.832, "grad_norm": 6.7894182205200195, "learning_rate": 2.1212121212121216e-06, "loss": 2.121, "mean_token_accuracy": 0.8915885388851166, "step": 479 }, { "epoch": 3.84, "grad_norm": 6.732741355895996, "learning_rate": 2.0202020202020206e-06, "loss": 2.0609, "mean_token_accuracy": 0.8849688917398453, "step": 480 }, { "epoch": 3.848, "grad_norm": 6.280490875244141, "learning_rate": 1.9191919191919192e-06, "loss": 2.0463, "mean_token_accuracy": 0.8867028504610062, "step": 481 }, { "epoch": 3.856, "grad_norm": 6.721777439117432, "learning_rate": 1.818181818181818e-06, "loss": 2.2023, "mean_token_accuracy": 0.8840513229370117, "step": 482 }, { "epoch": 3.864, "grad_norm": 6.34207010269165, "learning_rate": 1.7171717171717171e-06, "loss": 2.2238, "mean_token_accuracy": 0.8773442953824997, "step": 483 }, { "epoch": 3.872, "grad_norm": 6.753549098968506, "learning_rate": 1.6161616161616162e-06, "loss": 2.1247, "mean_token_accuracy": 0.8759674429893494, "step": 484 }, { "epoch": 3.88, "grad_norm": 6.267600059509277, "learning_rate": 1.5151515151515152e-06, "loss": 2.1506, "mean_token_accuracy": 0.8819513469934464, "step": 485 }, { "epoch": 3.888, "grad_norm": 6.509019374847412, "learning_rate": 1.4141414141414143e-06, "loss": 2.6317, "mean_token_accuracy": 0.8672285228967667, "step": 486 }, { "epoch": 3.896, "grad_norm": 6.893660068511963, "learning_rate": 1.3131313131313131e-06, "loss": 2.1853, "mean_token_accuracy": 0.8704394996166229, "step": 487 }, { "epoch": 3.904, "grad_norm": 6.314718723297119, "learning_rate": 1.2121212121212122e-06, "loss": 1.7886, "mean_token_accuracy": 0.8912533521652222, "step": 488 }, { "epoch": 3.912, "grad_norm": 7.181535243988037, "learning_rate": 1.1111111111111112e-06, "loss": 2.2651, "mean_token_accuracy": 0.8664899617433548, "step": 489 }, { "epoch": 3.92, "grad_norm": 6.317063331604004, "learning_rate": 1.0101010101010103e-06, "loss": 2.3394, "mean_token_accuracy": 0.8771510571241379, "step": 490 }, { "epoch": 3.928, "grad_norm": 6.195638656616211, "learning_rate": 9.09090909090909e-07, "loss": 2.0807, "mean_token_accuracy": 0.8755189925432205, "step": 491 }, { "epoch": 3.936, "grad_norm": 6.2105817794799805, "learning_rate": 8.080808080808081e-07, "loss": 1.7974, "mean_token_accuracy": 0.9031495600938797, "step": 492 }, { "epoch": 3.944, "grad_norm": 6.697377681732178, "learning_rate": 7.070707070707071e-07, "loss": 2.0917, "mean_token_accuracy": 0.8885557353496552, "step": 493 }, { "epoch": 3.952, "grad_norm": 6.320727825164795, "learning_rate": 6.060606060606061e-07, "loss": 2.0905, "mean_token_accuracy": 0.8876091539859772, "step": 494 }, { "epoch": 3.96, "grad_norm": 5.805496692657471, "learning_rate": 5.050505050505052e-07, "loss": 2.1719, "mean_token_accuracy": 0.8817218542098999, "step": 495 }, { "epoch": 3.968, "grad_norm": 6.263917446136475, "learning_rate": 4.0404040404040405e-07, "loss": 1.7863, "mean_token_accuracy": 0.8985198885202408, "step": 496 }, { "epoch": 3.976, "grad_norm": 5.801756858825684, "learning_rate": 3.0303030303030305e-07, "loss": 2.0907, "mean_token_accuracy": 0.8869260847568512, "step": 497 }, { "epoch": 3.984, "grad_norm": 7.1110310554504395, "learning_rate": 2.0202020202020202e-07, "loss": 2.3715, "mean_token_accuracy": 0.8689504116773605, "step": 498 }, { "epoch": 3.992, "grad_norm": 7.1602911949157715, "learning_rate": 1.0101010101010101e-07, "loss": 2.4322, "mean_token_accuracy": 0.8692635595798492, "step": 499 }, { "epoch": 4.0, "grad_norm": 6.362170696258545, "learning_rate": 0.0, "loss": 1.9344, "mean_token_accuracy": 0.8930138498544693, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7732474675200000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }