diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 13.284444444444444, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002962962962962963, + "grad_norm": 8.772912979125977, + "learning_rate": 0.0, + "loss": 1.443, + "mean_token_accuracy": 0.6539813578128815, + "num_tokens": 3673.0, + "step": 1 + }, + { + "epoch": 0.005925925925925926, + "grad_norm": 10.403252601623535, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.5363, + "mean_token_accuracy": 0.6431898474693298, + "num_tokens": 7100.0, + "step": 2 + }, + { + "epoch": 0.008888888888888889, + "grad_norm": 7.703908443450928, + "learning_rate": 4.000000000000001e-06, + "loss": 1.257, + "mean_token_accuracy": 0.6699334979057312, + "num_tokens": 11286.0, + "step": 3 + }, + { + "epoch": 0.011851851851851851, + "grad_norm": 7.7078704833984375, + "learning_rate": 6e-06, + "loss": 1.4003, + "mean_token_accuracy": 0.6438529789447784, + "num_tokens": 14768.0, + "step": 4 + }, + { + "epoch": 0.014814814814814815, + "grad_norm": 9.674022674560547, + "learning_rate": 8.000000000000001e-06, + "loss": 1.7093, + "mean_token_accuracy": 0.6057405769824982, + "num_tokens": 17528.0, + "step": 5 + }, + { + "epoch": 0.017777777777777778, + "grad_norm": 4.503542423248291, + "learning_rate": 1e-05, + "loss": 1.1562, + "mean_token_accuracy": 0.6828956007957458, + "num_tokens": 21214.0, + "step": 6 + }, + { + "epoch": 0.02074074074074074, + "grad_norm": 3.6897921562194824, + "learning_rate": 9.97979797979798e-06, + "loss": 1.1789, + "mean_token_accuracy": 0.6797761619091034, + "num_tokens": 24324.0, + "step": 7 + }, + { + "epoch": 0.023703703703703703, + "grad_norm": 2.946655750274658, + "learning_rate": 9.95959595959596e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6878251135349274, + "num_tokens": 27624.0, + "step": 8 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 2.5357279777526855, + "learning_rate": 9.939393939393939e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7190627455711365, + "num_tokens": 31325.0, + "step": 9 + }, + { + "epoch": 0.02962962962962963, + "grad_norm": 2.0331485271453857, + "learning_rate": 9.91919191919192e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7283298671245575, + "num_tokens": 34978.0, + "step": 10 + }, + { + "epoch": 0.03259259259259259, + "grad_norm": 2.128148317337036, + "learning_rate": 9.8989898989899e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7284091711044312, + "num_tokens": 38558.0, + "step": 11 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 1.68574857711792, + "learning_rate": 9.87878787878788e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.735737144947052, + "num_tokens": 42179.0, + "step": 12 + }, + { + "epoch": 0.03851851851851852, + "grad_norm": 1.9066306352615356, + "learning_rate": 9.85858585858586e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7306854426860809, + "num_tokens": 45330.0, + "step": 13 + }, + { + "epoch": 0.04148148148148148, + "grad_norm": 1.6364827156066895, + "learning_rate": 9.838383838383839e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7543825805187225, + "num_tokens": 49191.0, + "step": 14 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 1.7865906953811646, + "learning_rate": 9.81818181818182e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7224741578102112, + "num_tokens": 53441.0, + "step": 15 + }, + { + "epoch": 0.047407407407407405, + "grad_norm": 1.7012970447540283, + "learning_rate": 9.797979797979798e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7562681436538696, + "num_tokens": 57278.0, + "step": 16 + }, + { + "epoch": 0.05037037037037037, + "grad_norm": 1.5783880949020386, + "learning_rate": 9.777777777777779e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.74215167760849, + "num_tokens": 60962.0, + "step": 17 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 1.5642509460449219, + "learning_rate": 9.757575757575758e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7534424364566803, + "num_tokens": 64637.0, + "step": 18 + }, + { + "epoch": 0.056296296296296296, + "grad_norm": 1.295188546180725, + "learning_rate": 9.737373737373738e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7553088665008545, + "num_tokens": 68546.0, + "step": 19 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 1.3624945878982544, + "learning_rate": 9.717171717171719e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7777272164821625, + "num_tokens": 72390.0, + "step": 20 + }, + { + "epoch": 0.06222222222222222, + "grad_norm": 1.506975769996643, + "learning_rate": 9.696969696969698e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7464078962802887, + "num_tokens": 75733.0, + "step": 21 + }, + { + "epoch": 0.06518518518518518, + "grad_norm": 1.281867504119873, + "learning_rate": 9.676767676767678e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7741674780845642, + "num_tokens": 79827.0, + "step": 22 + }, + { + "epoch": 0.06814814814814815, + "grad_norm": 1.4096407890319824, + "learning_rate": 9.656565656565657e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7645211815834045, + "num_tokens": 83332.0, + "step": 23 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 1.7312240600585938, + "learning_rate": 9.636363636363638e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7544649243354797, + "num_tokens": 86138.0, + "step": 24 + }, + { + "epoch": 0.07407407407407407, + "grad_norm": 1.3980112075805664, + "learning_rate": 9.616161616161616e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7725013494491577, + "num_tokens": 89667.0, + "step": 25 + }, + { + "epoch": 0.07703703703703704, + "grad_norm": 1.3854331970214844, + "learning_rate": 9.595959595959597e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7772875130176544, + "num_tokens": 93562.0, + "step": 26 + }, + { + "epoch": 0.08, + "grad_norm": 1.369156837463379, + "learning_rate": 9.575757575757576e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.7752570509910583, + "num_tokens": 97798.0, + "step": 27 + }, + { + "epoch": 0.08296296296296296, + "grad_norm": 1.448936939239502, + "learning_rate": 9.555555555555556e-06, + "loss": 0.679, + "mean_token_accuracy": 0.7755326628684998, + "num_tokens": 101271.0, + "step": 28 + }, + { + "epoch": 0.08592592592592592, + "grad_norm": 1.7893205881118774, + "learning_rate": 9.535353535353537e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7425324320793152, + "num_tokens": 104761.0, + "step": 29 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 1.9558690786361694, + "learning_rate": 9.515151515151516e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7522033154964447, + "num_tokens": 107675.0, + "step": 30 + }, + { + "epoch": 0.09185185185185185, + "grad_norm": 1.8256072998046875, + "learning_rate": 9.494949494949497e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7518804371356964, + "num_tokens": 110557.0, + "step": 31 + }, + { + "epoch": 0.09481481481481481, + "grad_norm": 1.3364187479019165, + "learning_rate": 9.474747474747475e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7841249108314514, + "num_tokens": 114117.0, + "step": 32 + }, + { + "epoch": 0.09777777777777778, + "grad_norm": 1.2807625532150269, + "learning_rate": 9.454545454545456e-06, + "loss": 0.6728, + "mean_token_accuracy": 0.7709018290042877, + "num_tokens": 118138.0, + "step": 33 + }, + { + "epoch": 0.10074074074074074, + "grad_norm": 1.4543238878250122, + "learning_rate": 9.434343434343435e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7700174748897552, + "num_tokens": 121496.0, + "step": 34 + }, + { + "epoch": 0.1037037037037037, + "grad_norm": 1.5930938720703125, + "learning_rate": 9.414141414141414e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.8093555867671967, + "num_tokens": 125505.0, + "step": 35 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 1.4539469480514526, + "learning_rate": 9.393939393939396e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7853818237781525, + "num_tokens": 128738.0, + "step": 36 + }, + { + "epoch": 0.10962962962962963, + "grad_norm": 1.4270331859588623, + "learning_rate": 9.373737373737375e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7508721649646759, + "num_tokens": 132296.0, + "step": 37 + }, + { + "epoch": 0.11259259259259259, + "grad_norm": 1.6544485092163086, + "learning_rate": 9.353535353535354e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7520561814308167, + "num_tokens": 135257.0, + "step": 38 + }, + { + "epoch": 0.11555555555555555, + "grad_norm": 1.6830586194992065, + "learning_rate": 9.333333333333334e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.75847128033638, + "num_tokens": 138078.0, + "step": 39 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 1.4704310894012451, + "learning_rate": 9.313131313131313e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7721151411533356, + "num_tokens": 141851.0, + "step": 40 + }, + { + "epoch": 0.12148148148148148, + "grad_norm": 1.6560653448104858, + "learning_rate": 9.292929292929294e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7641234695911407, + "num_tokens": 144922.0, + "step": 41 + }, + { + "epoch": 0.12444444444444444, + "grad_norm": 1.4824740886688232, + "learning_rate": 9.272727272727273e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7708763182163239, + "num_tokens": 148340.0, + "step": 42 + }, + { + "epoch": 0.1274074074074074, + "grad_norm": 1.433552861213684, + "learning_rate": 9.252525252525253e-06, + "loss": 0.6639, + "mean_token_accuracy": 0.7866450250148773, + "num_tokens": 152359.0, + "step": 43 + }, + { + "epoch": 0.13037037037037036, + "grad_norm": 1.4344393014907837, + "learning_rate": 9.232323232323232e-06, + "loss": 0.6274, + "mean_token_accuracy": 0.7916350960731506, + "num_tokens": 155836.0, + "step": 44 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.3556673526763916, + "learning_rate": 9.212121212121213e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.7881527841091156, + "num_tokens": 159751.0, + "step": 45 + }, + { + "epoch": 0.1362962962962963, + "grad_norm": 1.5045182704925537, + "learning_rate": 9.191919191919193e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.7934019863605499, + "num_tokens": 162726.0, + "step": 46 + }, + { + "epoch": 0.13925925925925925, + "grad_norm": 1.456202745437622, + "learning_rate": 9.171717171717172e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7751224040985107, + "num_tokens": 166249.0, + "step": 47 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 1.376373529434204, + "learning_rate": 9.151515151515153e-06, + "loss": 0.6014, + "mean_token_accuracy": 0.7941091656684875, + "num_tokens": 170193.0, + "step": 48 + }, + { + "epoch": 0.1451851851851852, + "grad_norm": 1.7317931652069092, + "learning_rate": 9.131313131313132e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7754814326763153, + "num_tokens": 173542.0, + "step": 49 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 1.283530831336975, + "learning_rate": 9.111111111111112e-06, + "loss": 0.6392, + "mean_token_accuracy": 0.7888539731502533, + "num_tokens": 177567.0, + "step": 50 + }, + { + "epoch": 0.1511111111111111, + "grad_norm": 1.3664168119430542, + "learning_rate": 9.090909090909091e-06, + "loss": 0.6297, + "mean_token_accuracy": 0.7923276424407959, + "num_tokens": 181147.0, + "step": 51 + }, + { + "epoch": 0.15407407407407409, + "grad_norm": 1.4568012952804565, + "learning_rate": 9.070707070707072e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7747292518615723, + "num_tokens": 184682.0, + "step": 52 + }, + { + "epoch": 0.15703703703703703, + "grad_norm": 1.499325156211853, + "learning_rate": 9.050505050505052e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7736046016216278, + "num_tokens": 188065.0, + "step": 53 + }, + { + "epoch": 0.16, + "grad_norm": 1.2658172845840454, + "learning_rate": 9.030303030303031e-06, + "loss": 0.6239, + "mean_token_accuracy": 0.7896979749202728, + "num_tokens": 192062.0, + "step": 54 + }, + { + "epoch": 0.16296296296296298, + "grad_norm": 1.395758032798767, + "learning_rate": 9.010101010101012e-06, + "loss": 0.6518, + "mean_token_accuracy": 0.7921416461467743, + "num_tokens": 195563.0, + "step": 55 + }, + { + "epoch": 0.16592592592592592, + "grad_norm": 1.319954514503479, + "learning_rate": 8.98989898989899e-06, + "loss": 0.5821, + "mean_token_accuracy": 0.8026478588581085, + "num_tokens": 199074.0, + "step": 56 + }, + { + "epoch": 0.1688888888888889, + "grad_norm": 1.3816076517105103, + "learning_rate": 8.969696969696971e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7794812917709351, + "num_tokens": 202521.0, + "step": 57 + }, + { + "epoch": 0.17185185185185184, + "grad_norm": 1.1893161535263062, + "learning_rate": 8.94949494949495e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.791596919298172, + "num_tokens": 206622.0, + "step": 58 + }, + { + "epoch": 0.1748148148148148, + "grad_norm": 1.431281328201294, + "learning_rate": 8.92929292929293e-06, + "loss": 0.6606, + "mean_token_accuracy": 0.7729869782924652, + "num_tokens": 210140.0, + "step": 59 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 1.3009824752807617, + "learning_rate": 8.90909090909091e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.7884209454059601, + "num_tokens": 213885.0, + "step": 60 + }, + { + "epoch": 0.18074074074074073, + "grad_norm": 1.5704373121261597, + "learning_rate": 8.888888888888888e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7790744304656982, + "num_tokens": 217144.0, + "step": 61 + }, + { + "epoch": 0.1837037037037037, + "grad_norm": 1.6178092956542969, + "learning_rate": 8.86868686868687e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7749516069889069, + "num_tokens": 219799.0, + "step": 62 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 1.5708521604537964, + "learning_rate": 8.84848484848485e-06, + "loss": 0.6423, + "mean_token_accuracy": 0.7913980185985565, + "num_tokens": 222635.0, + "step": 63 + }, + { + "epoch": 0.18962962962962962, + "grad_norm": 1.9118913412094116, + "learning_rate": 8.82828282828283e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7410552203655243, + "num_tokens": 225338.0, + "step": 64 + }, + { + "epoch": 0.1925925925925926, + "grad_norm": 1.4103525876998901, + "learning_rate": 8.808080808080809e-06, + "loss": 0.6598, + "mean_token_accuracy": 0.7971398234367371, + "num_tokens": 229194.0, + "step": 65 + }, + { + "epoch": 0.19555555555555557, + "grad_norm": 1.1924285888671875, + "learning_rate": 8.787878787878788e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.8194809556007385, + "num_tokens": 233640.0, + "step": 66 + }, + { + "epoch": 0.1985185185185185, + "grad_norm": 1.7252451181411743, + "learning_rate": 8.767676767676768e-06, + "loss": 0.6639, + "mean_token_accuracy": 0.7796667218208313, + "num_tokens": 236550.0, + "step": 67 + }, + { + "epoch": 0.20148148148148148, + "grad_norm": 1.6755399703979492, + "learning_rate": 8.747474747474747e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.8030109703540802, + "num_tokens": 239391.0, + "step": 68 + }, + { + "epoch": 0.20444444444444446, + "grad_norm": 1.1719599962234497, + "learning_rate": 8.727272727272728e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.8049156665802002, + "num_tokens": 243557.0, + "step": 69 + }, + { + "epoch": 0.2074074074074074, + "grad_norm": 1.352806568145752, + "learning_rate": 8.707070707070707e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.8049381971359253, + "num_tokens": 247026.0, + "step": 70 + }, + { + "epoch": 0.21037037037037037, + "grad_norm": 1.2742316722869873, + "learning_rate": 8.686868686868687e-06, + "loss": 0.6057, + "mean_token_accuracy": 0.7976597547531128, + "num_tokens": 251053.0, + "step": 71 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 1.7672648429870605, + "learning_rate": 8.666666666666668e-06, + "loss": 0.6688, + "mean_token_accuracy": 0.7863489985466003, + "num_tokens": 253536.0, + "step": 72 + }, + { + "epoch": 0.2162962962962963, + "grad_norm": 1.3268659114837646, + "learning_rate": 8.646464646464647e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8095948398113251, + "num_tokens": 257887.0, + "step": 73 + }, + { + "epoch": 0.21925925925925926, + "grad_norm": 1.2447559833526611, + "learning_rate": 8.626262626262627e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.8086831569671631, + "num_tokens": 261683.0, + "step": 74 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 1.4702627658843994, + "learning_rate": 8.606060606060606e-06, + "loss": 0.6284, + "mean_token_accuracy": 0.7973065972328186, + "num_tokens": 265542.0, + "step": 75 + }, + { + "epoch": 0.22518518518518518, + "grad_norm": 1.4920425415039062, + "learning_rate": 8.585858585858587e-06, + "loss": 0.607, + "mean_token_accuracy": 0.8030296266078949, + "num_tokens": 269151.0, + "step": 76 + }, + { + "epoch": 0.22814814814814816, + "grad_norm": 1.751456379890442, + "learning_rate": 8.565656565656566e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7510209381580353, + "num_tokens": 271923.0, + "step": 77 + }, + { + "epoch": 0.2311111111111111, + "grad_norm": 1.4484258890151978, + "learning_rate": 8.545454545454546e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.8068382143974304, + "num_tokens": 275380.0, + "step": 78 + }, + { + "epoch": 0.23407407407407407, + "grad_norm": 1.5630091428756714, + "learning_rate": 8.525252525252527e-06, + "loss": 0.5718, + "mean_token_accuracy": 0.8142485320568085, + "num_tokens": 278552.0, + "step": 79 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 1.3946943283081055, + "learning_rate": 8.505050505050506e-06, + "loss": 0.6588, + "mean_token_accuracy": 0.7752138078212738, + "num_tokens": 282017.0, + "step": 80 + }, + { + "epoch": 0.24, + "grad_norm": 1.2121405601501465, + "learning_rate": 8.484848484848486e-06, + "loss": 0.557, + "mean_token_accuracy": 0.8139599561691284, + "num_tokens": 286420.0, + "step": 81 + }, + { + "epoch": 0.24296296296296296, + "grad_norm": 1.181118369102478, + "learning_rate": 8.464646464646465e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8217185437679291, + "num_tokens": 291160.0, + "step": 82 + }, + { + "epoch": 0.24592592592592594, + "grad_norm": 1.204434871673584, + "learning_rate": 8.444444444444446e-06, + "loss": 0.5509, + "mean_token_accuracy": 0.8164167702198029, + "num_tokens": 296037.0, + "step": 83 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 1.2375670671463013, + "learning_rate": 8.424242424242425e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.8033479452133179, + "num_tokens": 300062.0, + "step": 84 + }, + { + "epoch": 0.2518518518518518, + "grad_norm": 1.2679692506790161, + "learning_rate": 8.404040404040405e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.8225648999214172, + "num_tokens": 304346.0, + "step": 85 + }, + { + "epoch": 0.2548148148148148, + "grad_norm": 1.4768755435943604, + "learning_rate": 8.383838383838384e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8099740743637085, + "num_tokens": 307609.0, + "step": 86 + }, + { + "epoch": 0.2577777777777778, + "grad_norm": 1.4789756536483765, + "learning_rate": 8.363636363636365e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.8028825223445892, + "num_tokens": 310792.0, + "step": 87 + }, + { + "epoch": 0.2607407407407407, + "grad_norm": 1.468558669090271, + "learning_rate": 8.343434343434345e-06, + "loss": 0.6459, + "mean_token_accuracy": 0.7962141931056976, + "num_tokens": 313930.0, + "step": 88 + }, + { + "epoch": 0.2637037037037037, + "grad_norm": 1.667181134223938, + "learning_rate": 8.323232323232324e-06, + "loss": 0.6622, + "mean_token_accuracy": 0.7884489595890045, + "num_tokens": 316880.0, + "step": 89 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.514397382736206, + "learning_rate": 8.303030303030305e-06, + "loss": 0.593, + "mean_token_accuracy": 0.7914770543575287, + "num_tokens": 320452.0, + "step": 90 + }, + { + "epoch": 0.2696296296296296, + "grad_norm": 1.3031214475631714, + "learning_rate": 8.282828282828283e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.7927478551864624, + "num_tokens": 324166.0, + "step": 91 + }, + { + "epoch": 0.2725925925925926, + "grad_norm": 1.290567398071289, + "learning_rate": 8.262626262626264e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.8007905781269073, + "num_tokens": 328173.0, + "step": 92 + }, + { + "epoch": 0.27555555555555555, + "grad_norm": 1.9205256700515747, + "learning_rate": 8.242424242424243e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.7856946289539337, + "num_tokens": 331579.0, + "step": 93 + }, + { + "epoch": 0.2785185185185185, + "grad_norm": 1.3299251794815063, + "learning_rate": 8.222222222222222e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.8204686045646667, + "num_tokens": 335255.0, + "step": 94 + }, + { + "epoch": 0.2814814814814815, + "grad_norm": 1.5957735776901245, + "learning_rate": 8.202020202020202e-06, + "loss": 0.5718, + "mean_token_accuracy": 0.810562789440155, + "num_tokens": 338419.0, + "step": 95 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 1.2185496091842651, + "learning_rate": 8.181818181818183e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.8130707144737244, + "num_tokens": 342741.0, + "step": 96 + }, + { + "epoch": 0.2874074074074074, + "grad_norm": 1.5280882120132446, + "learning_rate": 8.161616161616162e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.80326247215271, + "num_tokens": 346019.0, + "step": 97 + }, + { + "epoch": 0.2903703703703704, + "grad_norm": 1.5355949401855469, + "learning_rate": 8.141414141414142e-06, + "loss": 0.664, + "mean_token_accuracy": 0.7727339267730713, + "num_tokens": 349348.0, + "step": 98 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 1.5812368392944336, + "learning_rate": 8.121212121212121e-06, + "loss": 0.6277, + "mean_token_accuracy": 0.7974686026573181, + "num_tokens": 352524.0, + "step": 99 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 1.4538582563400269, + "learning_rate": 8.101010101010102e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.8171866536140442, + "num_tokens": 355833.0, + "step": 100 + }, + { + "epoch": 0.2992592592592593, + "grad_norm": 1.5794340372085571, + "learning_rate": 8.08080808080808e-06, + "loss": 0.6183, + "mean_token_accuracy": 0.788432389497757, + "num_tokens": 358996.0, + "step": 101 + }, + { + "epoch": 0.3022222222222222, + "grad_norm": 1.6119457483291626, + "learning_rate": 8.060606060606061e-06, + "loss": 0.6228, + "mean_token_accuracy": 0.7918921113014221, + "num_tokens": 361844.0, + "step": 102 + }, + { + "epoch": 0.30518518518518517, + "grad_norm": 1.625422716140747, + "learning_rate": 8.04040404040404e-06, + "loss": 0.6682, + "mean_token_accuracy": 0.7703355550765991, + "num_tokens": 364951.0, + "step": 103 + }, + { + "epoch": 0.30814814814814817, + "grad_norm": 1.4044547080993652, + "learning_rate": 8.02020202020202e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.7982601523399353, + "num_tokens": 368375.0, + "step": 104 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 1.2490246295928955, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.8049896657466888, + "num_tokens": 372533.0, + "step": 105 + }, + { + "epoch": 0.31407407407407406, + "grad_norm": 1.544236660003662, + "learning_rate": 7.97979797979798e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8232283592224121, + "num_tokens": 375748.0, + "step": 106 + }, + { + "epoch": 0.31703703703703706, + "grad_norm": 1.474335789680481, + "learning_rate": 7.95959595959596e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.8042398691177368, + "num_tokens": 379160.0, + "step": 107 + }, + { + "epoch": 0.32, + "grad_norm": 1.4716367721557617, + "learning_rate": 7.93939393939394e-06, + "loss": 0.5897, + "mean_token_accuracy": 0.81321781873703, + "num_tokens": 382472.0, + "step": 108 + }, + { + "epoch": 0.32296296296296295, + "grad_norm": 1.6919196844100952, + "learning_rate": 7.91919191919192e-06, + "loss": 0.6646, + "mean_token_accuracy": 0.7750830352306366, + "num_tokens": 385378.0, + "step": 109 + }, + { + "epoch": 0.32592592592592595, + "grad_norm": 1.5173804759979248, + "learning_rate": 7.898989898989899e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.781123161315918, + "num_tokens": 388738.0, + "step": 110 + }, + { + "epoch": 0.3288888888888889, + "grad_norm": 1.2750955820083618, + "learning_rate": 7.87878787878788e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8296970427036285, + "num_tokens": 392546.0, + "step": 111 + }, + { + "epoch": 0.33185185185185184, + "grad_norm": 1.5602514743804932, + "learning_rate": 7.858585858585859e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.804762214422226, + "num_tokens": 395589.0, + "step": 112 + }, + { + "epoch": 0.3348148148148148, + "grad_norm": 1.3706358671188354, + "learning_rate": 7.838383838383839e-06, + "loss": 0.5535, + "mean_token_accuracy": 0.8147208392620087, + "num_tokens": 399209.0, + "step": 113 + }, + { + "epoch": 0.3377777777777778, + "grad_norm": 1.4131827354431152, + "learning_rate": 7.81818181818182e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8179905712604523, + "num_tokens": 402747.0, + "step": 114 + }, + { + "epoch": 0.34074074074074073, + "grad_norm": 1.3775626420974731, + "learning_rate": 7.797979797979799e-06, + "loss": 0.592, + "mean_token_accuracy": 0.7967976629734039, + "num_tokens": 406261.0, + "step": 115 + }, + { + "epoch": 0.3437037037037037, + "grad_norm": 1.3306676149368286, + "learning_rate": 7.77777777777778e-06, + "loss": 0.6667, + "mean_token_accuracy": 0.786788135766983, + "num_tokens": 410218.0, + "step": 116 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 1.2516112327575684, + "learning_rate": 7.757575757575758e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8270311653614044, + "num_tokens": 414317.0, + "step": 117 + }, + { + "epoch": 0.3496296296296296, + "grad_norm": 1.3731434345245361, + "learning_rate": 7.737373737373739e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.8068479895591736, + "num_tokens": 418109.0, + "step": 118 + }, + { + "epoch": 0.35259259259259257, + "grad_norm": 1.3524179458618164, + "learning_rate": 7.717171717171717e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8223755657672882, + "num_tokens": 421670.0, + "step": 119 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 1.779089093208313, + "learning_rate": 7.696969696969696e-06, + "loss": 0.616, + "mean_token_accuracy": 0.7928546369075775, + "num_tokens": 424789.0, + "step": 120 + }, + { + "epoch": 0.3585185185185185, + "grad_norm": 1.6166177988052368, + "learning_rate": 7.676767676767677e-06, + "loss": 0.6226, + "mean_token_accuracy": 0.7925732135772705, + "num_tokens": 427818.0, + "step": 121 + }, + { + "epoch": 0.36148148148148146, + "grad_norm": 1.272436261177063, + "learning_rate": 7.656565656565658e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8285199403762817, + "num_tokens": 432137.0, + "step": 122 + }, + { + "epoch": 0.36444444444444446, + "grad_norm": 1.5963072776794434, + "learning_rate": 7.636363636363638e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.7772507071495056, + "num_tokens": 435535.0, + "step": 123 + }, + { + "epoch": 0.3674074074074074, + "grad_norm": 1.5523314476013184, + "learning_rate": 7.616161616161617e-06, + "loss": 0.6656, + "mean_token_accuracy": 0.7861550450325012, + "num_tokens": 438951.0, + "step": 124 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 1.3267731666564941, + "learning_rate": 7.595959595959597e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.7991159856319427, + "num_tokens": 443116.0, + "step": 125 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 1.6102476119995117, + "learning_rate": 7.5757575757575764e-06, + "loss": 0.5613, + "mean_token_accuracy": 0.8104711472988129, + "num_tokens": 446274.0, + "step": 126 + }, + { + "epoch": 0.3762962962962963, + "grad_norm": 1.5453342199325562, + "learning_rate": 7.555555555555556e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.7892000675201416, + "num_tokens": 449697.0, + "step": 127 + }, + { + "epoch": 0.37925925925925924, + "grad_norm": 1.4418869018554688, + "learning_rate": 7.535353535353536e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8220471143722534, + "num_tokens": 453230.0, + "step": 128 + }, + { + "epoch": 0.38222222222222224, + "grad_norm": 1.4159401655197144, + "learning_rate": 7.515151515151516e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8074868321418762, + "num_tokens": 457079.0, + "step": 129 + }, + { + "epoch": 0.3851851851851852, + "grad_norm": 1.3482673168182373, + "learning_rate": 7.494949494949496e-06, + "loss": 0.5812, + "mean_token_accuracy": 0.8075011372566223, + "num_tokens": 460755.0, + "step": 130 + }, + { + "epoch": 0.38814814814814813, + "grad_norm": 1.2653297185897827, + "learning_rate": 7.474747474747476e-06, + "loss": 0.5586, + "mean_token_accuracy": 0.8131665289402008, + "num_tokens": 465203.0, + "step": 131 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 1.488782525062561, + "learning_rate": 7.454545454545456e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8179747760295868, + "num_tokens": 468750.0, + "step": 132 + }, + { + "epoch": 0.3940740740740741, + "grad_norm": 1.3600029945373535, + "learning_rate": 7.434343434343435e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.822299063205719, + "num_tokens": 472332.0, + "step": 133 + }, + { + "epoch": 0.397037037037037, + "grad_norm": 1.549268364906311, + "learning_rate": 7.414141414141415e-06, + "loss": 0.5433, + "mean_token_accuracy": 0.8189083337783813, + "num_tokens": 475866.0, + "step": 134 + }, + { + "epoch": 0.4, + "grad_norm": 1.3664052486419678, + "learning_rate": 7.393939393939395e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.8033970594406128, + "num_tokens": 479846.0, + "step": 135 + }, + { + "epoch": 0.40296296296296297, + "grad_norm": 1.4477136135101318, + "learning_rate": 7.373737373737374e-06, + "loss": 0.568, + "mean_token_accuracy": 0.811283677816391, + "num_tokens": 483803.0, + "step": 136 + }, + { + "epoch": 0.4059259259259259, + "grad_norm": 1.6781620979309082, + "learning_rate": 7.353535353535353e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.7736048996448517, + "num_tokens": 486840.0, + "step": 137 + }, + { + "epoch": 0.4088888888888889, + "grad_norm": 1.4147554636001587, + "learning_rate": 7.333333333333333e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.8107582926750183, + "num_tokens": 490739.0, + "step": 138 + }, + { + "epoch": 0.41185185185185186, + "grad_norm": 1.7689573764801025, + "learning_rate": 7.3131313131313146e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7686804234981537, + "num_tokens": 493586.0, + "step": 139 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 1.3570775985717773, + "learning_rate": 7.2929292929292934e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8123593330383301, + "num_tokens": 497478.0, + "step": 140 + }, + { + "epoch": 0.4177777777777778, + "grad_norm": 1.696730613708496, + "learning_rate": 7.272727272727273e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.7887285649776459, + "num_tokens": 500300.0, + "step": 141 + }, + { + "epoch": 0.42074074074074075, + "grad_norm": 1.391579508781433, + "learning_rate": 7.252525252525253e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.8132326900959015, + "num_tokens": 503678.0, + "step": 142 + }, + { + "epoch": 0.4237037037037037, + "grad_norm": 1.7409371137619019, + "learning_rate": 7.232323232323233e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.8220958113670349, + "num_tokens": 506435.0, + "step": 143 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 1.4844486713409424, + "learning_rate": 7.212121212121212e-06, + "loss": 0.564, + "mean_token_accuracy": 0.812475174665451, + "num_tokens": 509731.0, + "step": 144 + }, + { + "epoch": 0.42962962962962964, + "grad_norm": 1.6001708507537842, + "learning_rate": 7.191919191919192e-06, + "loss": 0.6277, + "mean_token_accuracy": 0.7975054979324341, + "num_tokens": 512901.0, + "step": 145 + }, + { + "epoch": 0.4325925925925926, + "grad_norm": 1.4961379766464233, + "learning_rate": 7.171717171717172e-06, + "loss": 0.5995, + "mean_token_accuracy": 0.7981494963169098, + "num_tokens": 516427.0, + "step": 146 + }, + { + "epoch": 0.43555555555555553, + "grad_norm": 1.4953283071517944, + "learning_rate": 7.151515151515152e-06, + "loss": 0.5753, + "mean_token_accuracy": 0.8087860941886902, + "num_tokens": 520298.0, + "step": 147 + }, + { + "epoch": 0.43851851851851853, + "grad_norm": 1.4786593914031982, + "learning_rate": 7.131313131313132e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.8053753077983856, + "num_tokens": 523679.0, + "step": 148 + }, + { + "epoch": 0.4414814814814815, + "grad_norm": 1.50172758102417, + "learning_rate": 7.111111111111112e-06, + "loss": 0.6087, + "mean_token_accuracy": 0.8032037019729614, + "num_tokens": 527140.0, + "step": 149 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.48976469039917, + "learning_rate": 7.0909090909090916e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8339570760726929, + "num_tokens": 530634.0, + "step": 150 + }, + { + "epoch": 0.4474074074074074, + "grad_norm": 1.3774471282958984, + "learning_rate": 7.070707070707071e-06, + "loss": 0.5582, + "mean_token_accuracy": 0.8163467943668365, + "num_tokens": 534704.0, + "step": 151 + }, + { + "epoch": 0.45037037037037037, + "grad_norm": 1.7946031093597412, + "learning_rate": 7.050505050505051e-06, + "loss": 0.6562, + "mean_token_accuracy": 0.7851580083370209, + "num_tokens": 537626.0, + "step": 152 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 1.5375752449035645, + "learning_rate": 7.030303030303031e-06, + "loss": 0.6052, + "mean_token_accuracy": 0.7998016178607941, + "num_tokens": 541111.0, + "step": 153 + }, + { + "epoch": 0.4562962962962963, + "grad_norm": 1.447189211845398, + "learning_rate": 7.0101010101010105e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8036025762557983, + "num_tokens": 544999.0, + "step": 154 + }, + { + "epoch": 0.45925925925925926, + "grad_norm": 1.4655578136444092, + "learning_rate": 6.98989898989899e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.7951805293560028, + "num_tokens": 548714.0, + "step": 155 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 1.488142728805542, + "learning_rate": 6.969696969696971e-06, + "loss": 0.5767, + "mean_token_accuracy": 0.7991268336772919, + "num_tokens": 552222.0, + "step": 156 + }, + { + "epoch": 0.4651851851851852, + "grad_norm": 1.2677358388900757, + "learning_rate": 6.9494949494949505e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8355545401573181, + "num_tokens": 556159.0, + "step": 157 + }, + { + "epoch": 0.46814814814814815, + "grad_norm": 1.2496588230133057, + "learning_rate": 6.92929292929293e-06, + "loss": 0.5504, + "mean_token_accuracy": 0.8112179338932037, + "num_tokens": 560824.0, + "step": 158 + }, + { + "epoch": 0.4711111111111111, + "grad_norm": 1.5725500583648682, + "learning_rate": 6.90909090909091e-06, + "loss": 0.6268, + "mean_token_accuracy": 0.7929348051548004, + "num_tokens": 564410.0, + "step": 159 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 1.5075668096542358, + "learning_rate": 6.88888888888889e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.8283835649490356, + "num_tokens": 567695.0, + "step": 160 + }, + { + "epoch": 0.47703703703703704, + "grad_norm": 1.4889564514160156, + "learning_rate": 6.868686868686869e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.8116940259933472, + "num_tokens": 571418.0, + "step": 161 + }, + { + "epoch": 0.48, + "grad_norm": 1.3995633125305176, + "learning_rate": 6.848484848484849e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8361697196960449, + "num_tokens": 574990.0, + "step": 162 + }, + { + "epoch": 0.482962962962963, + "grad_norm": 1.4445774555206299, + "learning_rate": 6.828282828282828e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.8065415620803833, + "num_tokens": 578715.0, + "step": 163 + }, + { + "epoch": 0.48592592592592593, + "grad_norm": 1.8358018398284912, + "learning_rate": 6.808080808080809e-06, + "loss": 0.6612, + "mean_token_accuracy": 0.7867823839187622, + "num_tokens": 581633.0, + "step": 164 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 1.6168558597564697, + "learning_rate": 6.787878787878789e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.802934855222702, + "num_tokens": 584805.0, + "step": 165 + }, + { + "epoch": 0.4918518518518519, + "grad_norm": 1.6819730997085571, + "learning_rate": 6.767676767676769e-06, + "loss": 0.6643, + "mean_token_accuracy": 0.777470052242279, + "num_tokens": 587697.0, + "step": 166 + }, + { + "epoch": 0.4948148148148148, + "grad_norm": 1.507273554801941, + "learning_rate": 6.747474747474749e-06, + "loss": 0.576, + "mean_token_accuracy": 0.8129746615886688, + "num_tokens": 591407.0, + "step": 167 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 1.4918867349624634, + "learning_rate": 6.7272727272727275e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8223305642604828, + "num_tokens": 595003.0, + "step": 168 + }, + { + "epoch": 0.5007407407407407, + "grad_norm": 1.444341778755188, + "learning_rate": 6.707070707070707e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.8108565807342529, + "num_tokens": 598742.0, + "step": 169 + }, + { + "epoch": 0.5037037037037037, + "grad_norm": 1.353681206703186, + "learning_rate": 6.686868686868687e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8098458349704742, + "num_tokens": 602962.0, + "step": 170 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 1.6338601112365723, + "learning_rate": 6.666666666666667e-06, + "loss": 0.5836, + "mean_token_accuracy": 0.8077074587345123, + "num_tokens": 606001.0, + "step": 171 + }, + { + "epoch": 0.5096296296296297, + "grad_norm": 1.5632859468460083, + "learning_rate": 6.646464646464646e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8030338883399963, + "num_tokens": 609705.0, + "step": 172 + }, + { + "epoch": 0.5125925925925926, + "grad_norm": 1.6990123987197876, + "learning_rate": 6.626262626262627e-06, + "loss": 0.6516, + "mean_token_accuracy": 0.7909416854381561, + "num_tokens": 613102.0, + "step": 173 + }, + { + "epoch": 0.5155555555555555, + "grad_norm": 1.6014174222946167, + "learning_rate": 6.606060606060607e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.795735239982605, + "num_tokens": 616460.0, + "step": 174 + }, + { + "epoch": 0.5185185185185185, + "grad_norm": 1.3934897184371948, + "learning_rate": 6.585858585858586e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.814742773771286, + "num_tokens": 620641.0, + "step": 175 + }, + { + "epoch": 0.5214814814814814, + "grad_norm": 1.637742519378662, + "learning_rate": 6.565656565656566e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.7971774041652679, + "num_tokens": 623918.0, + "step": 176 + }, + { + "epoch": 0.5244444444444445, + "grad_norm": 1.4732917547225952, + "learning_rate": 6.545454545454546e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.8108367323875427, + "num_tokens": 627624.0, + "step": 177 + }, + { + "epoch": 0.5274074074074074, + "grad_norm": 1.5997675657272339, + "learning_rate": 6.525252525252526e-06, + "loss": 0.6201, + "mean_token_accuracy": 0.8024126589298248, + "num_tokens": 631005.0, + "step": 178 + }, + { + "epoch": 0.5303703703703704, + "grad_norm": 1.4993646144866943, + "learning_rate": 6.505050505050505e-06, + "loss": 0.5665, + "mean_token_accuracy": 0.7992716729640961, + "num_tokens": 634688.0, + "step": 179 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 1.444858431816101, + "learning_rate": 6.484848484848485e-06, + "loss": 0.56, + "mean_token_accuracy": 0.8167417347431183, + "num_tokens": 638471.0, + "step": 180 + }, + { + "epoch": 0.5362962962962963, + "grad_norm": 1.5364516973495483, + "learning_rate": 6.464646464646466e-06, + "loss": 0.5811, + "mean_token_accuracy": 0.8004305064678192, + "num_tokens": 642036.0, + "step": 181 + }, + { + "epoch": 0.5392592592592592, + "grad_norm": 1.6283938884735107, + "learning_rate": 6.444444444444445e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.8070067763328552, + "num_tokens": 645280.0, + "step": 182 + }, + { + "epoch": 0.5422222222222223, + "grad_norm": 1.3253967761993408, + "learning_rate": 6.424242424242425e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8250679969787598, + "num_tokens": 649383.0, + "step": 183 + }, + { + "epoch": 0.5451851851851852, + "grad_norm": 1.5438027381896973, + "learning_rate": 6.404040404040405e-06, + "loss": 0.587, + "mean_token_accuracy": 0.810188502073288, + "num_tokens": 652707.0, + "step": 184 + }, + { + "epoch": 0.5481481481481482, + "grad_norm": 1.3087868690490723, + "learning_rate": 6.3838383838383845e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.822411298751831, + "num_tokens": 657083.0, + "step": 185 + }, + { + "epoch": 0.5511111111111111, + "grad_norm": 1.6901150941848755, + "learning_rate": 6.363636363636364e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8315849602222443, + "num_tokens": 660003.0, + "step": 186 + }, + { + "epoch": 0.554074074074074, + "grad_norm": 1.3695402145385742, + "learning_rate": 6.343434343434344e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8279611766338348, + "num_tokens": 664161.0, + "step": 187 + }, + { + "epoch": 0.557037037037037, + "grad_norm": 1.2661864757537842, + "learning_rate": 6.323232323232324e-06, + "loss": 0.55, + "mean_token_accuracy": 0.8103662729263306, + "num_tokens": 668766.0, + "step": 188 + }, + { + "epoch": 0.56, + "grad_norm": 1.45142662525177, + "learning_rate": 6.303030303030303e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8103845417499542, + "num_tokens": 672648.0, + "step": 189 + }, + { + "epoch": 0.562962962962963, + "grad_norm": 1.5805292129516602, + "learning_rate": 6.282828282828284e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8113139867782593, + "num_tokens": 675928.0, + "step": 190 + }, + { + "epoch": 0.5659259259259259, + "grad_norm": 1.44484543800354, + "learning_rate": 6.262626262626264e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.8005532026290894, + "num_tokens": 679648.0, + "step": 191 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 1.4975072145462036, + "learning_rate": 6.2424242424242434e-06, + "loss": 0.6072, + "mean_token_accuracy": 0.7957307696342468, + "num_tokens": 683308.0, + "step": 192 + }, + { + "epoch": 0.5718518518518518, + "grad_norm": 1.4652924537658691, + "learning_rate": 6.222222222222223e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.8080338537693024, + "num_tokens": 687160.0, + "step": 193 + }, + { + "epoch": 0.5748148148148148, + "grad_norm": 1.5434601306915283, + "learning_rate": 6.202020202020203e-06, + "loss": 0.5861, + "mean_token_accuracy": 0.8029443323612213, + "num_tokens": 690885.0, + "step": 194 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 1.521317720413208, + "learning_rate": 6.181818181818182e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.8022967278957367, + "num_tokens": 694758.0, + "step": 195 + }, + { + "epoch": 0.5807407407407408, + "grad_norm": 1.500725269317627, + "learning_rate": 6.1616161616161615e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8195858001708984, + "num_tokens": 698577.0, + "step": 196 + }, + { + "epoch": 0.5837037037037037, + "grad_norm": 1.601712942123413, + "learning_rate": 6.141414141414141e-06, + "loss": 0.584, + "mean_token_accuracy": 0.8032855689525604, + "num_tokens": 702092.0, + "step": 197 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 1.498335361480713, + "learning_rate": 6.121212121212121e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8234370648860931, + "num_tokens": 705489.0, + "step": 198 + }, + { + "epoch": 0.5896296296296296, + "grad_norm": 1.8601118326187134, + "learning_rate": 6.1010101010101015e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7875588238239288, + "num_tokens": 708188.0, + "step": 199 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 1.6213523149490356, + "learning_rate": 6.080808080808081e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.8046819567680359, + "num_tokens": 711430.0, + "step": 200 + }, + { + "epoch": 0.5955555555555555, + "grad_norm": 1.4652053117752075, + "learning_rate": 6.060606060606061e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8242162466049194, + "num_tokens": 715000.0, + "step": 201 + }, + { + "epoch": 0.5985185185185186, + "grad_norm": 1.550500512123108, + "learning_rate": 6.040404040404041e-06, + "loss": 0.6461, + "mean_token_accuracy": 0.7844513952732086, + "num_tokens": 718745.0, + "step": 202 + }, + { + "epoch": 0.6014814814814815, + "grad_norm": 1.8714395761489868, + "learning_rate": 6.0202020202020204e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.8035549521446228, + "num_tokens": 721542.0, + "step": 203 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 1.4112492799758911, + "learning_rate": 6e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.8142783641815186, + "num_tokens": 725297.0, + "step": 204 + }, + { + "epoch": 0.6074074074074074, + "grad_norm": 1.8889070749282837, + "learning_rate": 5.97979797979798e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.7618487775325775, + "num_tokens": 728135.0, + "step": 205 + }, + { + "epoch": 0.6103703703703703, + "grad_norm": 1.4843196868896484, + "learning_rate": 5.95959595959596e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8324712216854095, + "num_tokens": 731817.0, + "step": 206 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 1.5266814231872559, + "learning_rate": 5.93939393939394e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8324710428714752, + "num_tokens": 735261.0, + "step": 207 + }, + { + "epoch": 0.6162962962962963, + "grad_norm": 1.413236379623413, + "learning_rate": 5.91919191919192e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8100711405277252, + "num_tokens": 739167.0, + "step": 208 + }, + { + "epoch": 0.6192592592592593, + "grad_norm": 1.5599700212478638, + "learning_rate": 5.8989898989899e-06, + "loss": 0.518, + "mean_token_accuracy": 0.8236251473426819, + "num_tokens": 742440.0, + "step": 209 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 1.26742684841156, + "learning_rate": 5.878787878787879e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8374475538730621, + "num_tokens": 746608.0, + "step": 210 + }, + { + "epoch": 0.6251851851851852, + "grad_norm": 1.7720143795013428, + "learning_rate": 5.858585858585859e-06, + "loss": 0.6195, + "mean_token_accuracy": 0.8017608523368835, + "num_tokens": 749669.0, + "step": 211 + }, + { + "epoch": 0.6281481481481481, + "grad_norm": 1.5050828456878662, + "learning_rate": 5.838383838383839e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8172882497310638, + "num_tokens": 753321.0, + "step": 212 + }, + { + "epoch": 0.6311111111111111, + "grad_norm": 1.842973232269287, + "learning_rate": 5.8181818181818185e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8186375796794891, + "num_tokens": 756194.0, + "step": 213 + }, + { + "epoch": 0.6340740740740741, + "grad_norm": 1.794044017791748, + "learning_rate": 5.797979797979798e-06, + "loss": 0.6279, + "mean_token_accuracy": 0.7934879958629608, + "num_tokens": 759401.0, + "step": 214 + }, + { + "epoch": 0.6370370370370371, + "grad_norm": 1.579003930091858, + "learning_rate": 5.777777777777778e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.7970623672008514, + "num_tokens": 763365.0, + "step": 215 + }, + { + "epoch": 0.64, + "grad_norm": 1.5435208082199097, + "learning_rate": 5.7575757575757586e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8127906620502472, + "num_tokens": 767460.0, + "step": 216 + }, + { + "epoch": 0.642962962962963, + "grad_norm": 1.7103557586669922, + "learning_rate": 5.737373737373738e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8113856315612793, + "num_tokens": 770777.0, + "step": 217 + }, + { + "epoch": 0.6459259259259259, + "grad_norm": 1.3665575981140137, + "learning_rate": 5.717171717171718e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.814469188451767, + "num_tokens": 775010.0, + "step": 218 + }, + { + "epoch": 0.6488888888888888, + "grad_norm": 1.4849032163619995, + "learning_rate": 5.696969696969698e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.8189038634300232, + "num_tokens": 778600.0, + "step": 219 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 1.5816186666488647, + "learning_rate": 5.6767676767676775e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8194137215614319, + "num_tokens": 782208.0, + "step": 220 + }, + { + "epoch": 0.6548148148148148, + "grad_norm": 1.5780069828033447, + "learning_rate": 5.656565656565657e-06, + "loss": 0.6092, + "mean_token_accuracy": 0.8015795350074768, + "num_tokens": 785748.0, + "step": 221 + }, + { + "epoch": 0.6577777777777778, + "grad_norm": 1.5095868110656738, + "learning_rate": 5.636363636363636e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8157745003700256, + "num_tokens": 789487.0, + "step": 222 + }, + { + "epoch": 0.6607407407407407, + "grad_norm": 1.737778663635254, + "learning_rate": 5.616161616161616e-06, + "loss": 0.6388, + "mean_token_accuracy": 0.7943673133850098, + "num_tokens": 792804.0, + "step": 223 + }, + { + "epoch": 0.6637037037037037, + "grad_norm": 1.7924089431762695, + "learning_rate": 5.595959595959597e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.8109574615955353, + "num_tokens": 795615.0, + "step": 224 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.817681074142456, + "learning_rate": 5.575757575757577e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.8096210360527039, + "num_tokens": 798567.0, + "step": 225 + }, + { + "epoch": 0.6696296296296296, + "grad_norm": 1.887503981590271, + "learning_rate": 5.555555555555557e-06, + "loss": 0.6246, + "mean_token_accuracy": 0.7909146547317505, + "num_tokens": 801365.0, + "step": 226 + }, + { + "epoch": 0.6725925925925926, + "grad_norm": 1.4171687364578247, + "learning_rate": 5.5353535353535355e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.8115169107913971, + "num_tokens": 805718.0, + "step": 227 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 1.5414133071899414, + "learning_rate": 5.515151515151515e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.8272335529327393, + "num_tokens": 809262.0, + "step": 228 + }, + { + "epoch": 0.6785185185185185, + "grad_norm": 1.386720895767212, + "learning_rate": 5.494949494949495e-06, + "loss": 0.5522, + "mean_token_accuracy": 0.8235159814357758, + "num_tokens": 813431.0, + "step": 229 + }, + { + "epoch": 0.6814814814814815, + "grad_norm": 2.3421661853790283, + "learning_rate": 5.474747474747475e-06, + "loss": 0.6771, + "mean_token_accuracy": 0.7778838276863098, + "num_tokens": 815609.0, + "step": 230 + }, + { + "epoch": 0.6844444444444444, + "grad_norm": 1.4562745094299316, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8238994777202606, + "num_tokens": 819510.0, + "step": 231 + }, + { + "epoch": 0.6874074074074074, + "grad_norm": 1.2899240255355835, + "learning_rate": 5.434343434343434e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8367083072662354, + "num_tokens": 824222.0, + "step": 232 + }, + { + "epoch": 0.6903703703703704, + "grad_norm": 1.5816051959991455, + "learning_rate": 5.414141414141415e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8175100982189178, + "num_tokens": 827998.0, + "step": 233 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 1.6603777408599854, + "learning_rate": 5.3939393939393945e-06, + "loss": 0.551, + "mean_token_accuracy": 0.8155083060264587, + "num_tokens": 831483.0, + "step": 234 + }, + { + "epoch": 0.6962962962962963, + "grad_norm": 1.4165617227554321, + "learning_rate": 5.373737373737374e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8127353191375732, + "num_tokens": 835698.0, + "step": 235 + }, + { + "epoch": 0.6992592592592592, + "grad_norm": 1.5943632125854492, + "learning_rate": 5.353535353535354e-06, + "loss": 0.546, + "mean_token_accuracy": 0.8185691237449646, + "num_tokens": 839279.0, + "step": 236 + }, + { + "epoch": 0.7022222222222222, + "grad_norm": 1.6768890619277954, + "learning_rate": 5.333333333333334e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.8021369576454163, + "num_tokens": 842625.0, + "step": 237 + }, + { + "epoch": 0.7051851851851851, + "grad_norm": 1.746807336807251, + "learning_rate": 5.313131313131313e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.8058859407901764, + "num_tokens": 845835.0, + "step": 238 + }, + { + "epoch": 0.7081481481481482, + "grad_norm": 1.7297693490982056, + "learning_rate": 5.292929292929293e-06, + "loss": 0.6564, + "mean_token_accuracy": 0.7869347929954529, + "num_tokens": 849109.0, + "step": 239 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 1.5339139699935913, + "learning_rate": 5.272727272727273e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8289228081703186, + "num_tokens": 852647.0, + "step": 240 + }, + { + "epoch": 0.7140740740740741, + "grad_norm": 1.5021756887435913, + "learning_rate": 5.252525252525253e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8174805045127869, + "num_tokens": 856179.0, + "step": 241 + }, + { + "epoch": 0.717037037037037, + "grad_norm": 1.7352502346038818, + "learning_rate": 5.232323232323233e-06, + "loss": 0.575, + "mean_token_accuracy": 0.8106747269630432, + "num_tokens": 859288.0, + "step": 242 + }, + { + "epoch": 0.72, + "grad_norm": 1.5095127820968628, + "learning_rate": 5.212121212121213e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8312416672706604, + "num_tokens": 862701.0, + "step": 243 + }, + { + "epoch": 0.7229629629629629, + "grad_norm": 1.6179436445236206, + "learning_rate": 5.191919191919193e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8022513091564178, + "num_tokens": 866174.0, + "step": 244 + }, + { + "epoch": 0.725925925925926, + "grad_norm": 1.481798768043518, + "learning_rate": 5.171717171717172e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8222281336784363, + "num_tokens": 870072.0, + "step": 245 + }, + { + "epoch": 0.7288888888888889, + "grad_norm": 1.360124111175537, + "learning_rate": 5.151515151515152e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8409126102924347, + "num_tokens": 874256.0, + "step": 246 + }, + { + "epoch": 0.7318518518518519, + "grad_norm": 1.4909288883209229, + "learning_rate": 5.131313131313132e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.8216418325901031, + "num_tokens": 878097.0, + "step": 247 + }, + { + "epoch": 0.7348148148148148, + "grad_norm": 1.3830221891403198, + "learning_rate": 5.1111111111111115e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8262554705142975, + "num_tokens": 882230.0, + "step": 248 + }, + { + "epoch": 0.7377777777777778, + "grad_norm": 1.5149977207183838, + "learning_rate": 5.090909090909091e-06, + "loss": 0.5497, + "mean_token_accuracy": 0.821267694234848, + "num_tokens": 886024.0, + "step": 249 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 1.5606229305267334, + "learning_rate": 5.070707070707072e-06, + "loss": 0.6064, + "mean_token_accuracy": 0.8006150126457214, + "num_tokens": 889709.0, + "step": 250 + }, + { + "epoch": 0.7437037037037038, + "grad_norm": 1.7571200132369995, + "learning_rate": 5.0505050505050515e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.7919616401195526, + "num_tokens": 892821.0, + "step": 251 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 1.449567198753357, + "learning_rate": 5.030303030303031e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8278501927852631, + "num_tokens": 896491.0, + "step": 252 + }, + { + "epoch": 0.7496296296296296, + "grad_norm": 1.6625317335128784, + "learning_rate": 5.010101010101011e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8050054013729095, + "num_tokens": 899831.0, + "step": 253 + }, + { + "epoch": 0.7525925925925926, + "grad_norm": 1.3138338327407837, + "learning_rate": 4.98989898989899e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8254255056381226, + "num_tokens": 904575.0, + "step": 254 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 1.8576364517211914, + "learning_rate": 4.9696969696969696e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.8111565709114075, + "num_tokens": 907415.0, + "step": 255 + }, + { + "epoch": 0.7585185185185185, + "grad_norm": 1.4178019762039185, + "learning_rate": 4.94949494949495e-06, + "loss": 0.5282, + "mean_token_accuracy": 0.8179458677768707, + "num_tokens": 911266.0, + "step": 256 + }, + { + "epoch": 0.7614814814814815, + "grad_norm": 1.646183729171753, + "learning_rate": 4.92929292929293e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8059698641300201, + "num_tokens": 914713.0, + "step": 257 + }, + { + "epoch": 0.7644444444444445, + "grad_norm": 1.460288166999817, + "learning_rate": 4.90909090909091e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8073983788490295, + "num_tokens": 918499.0, + "step": 258 + }, + { + "epoch": 0.7674074074074074, + "grad_norm": 1.5938327312469482, + "learning_rate": 4.888888888888889e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8212731182575226, + "num_tokens": 922020.0, + "step": 259 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 1.810258388519287, + "learning_rate": 4.868686868686869e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.7912078201770782, + "num_tokens": 925296.0, + "step": 260 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 1.3986088037490845, + "learning_rate": 4.848484848484849e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8284522294998169, + "num_tokens": 929425.0, + "step": 261 + }, + { + "epoch": 0.7762962962962963, + "grad_norm": 1.6466894149780273, + "learning_rate": 4.8282828282828285e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8191911578178406, + "num_tokens": 932774.0, + "step": 262 + }, + { + "epoch": 0.7792592592592592, + "grad_norm": 1.4643137454986572, + "learning_rate": 4.808080808080808e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.836432158946991, + "num_tokens": 936580.0, + "step": 263 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 1.5742167234420776, + "learning_rate": 4.787878787878788e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8256834149360657, + "num_tokens": 940188.0, + "step": 264 + }, + { + "epoch": 0.7851851851851852, + "grad_norm": 1.6398093700408936, + "learning_rate": 4.7676767676767685e-06, + "loss": 0.5576, + "mean_token_accuracy": 0.8210432827472687, + "num_tokens": 943702.0, + "step": 265 + }, + { + "epoch": 0.7881481481481482, + "grad_norm": 1.4668092727661133, + "learning_rate": 4.747474747474748e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8323331475257874, + "num_tokens": 947368.0, + "step": 266 + }, + { + "epoch": 0.7911111111111111, + "grad_norm": 1.7432724237442017, + "learning_rate": 4.727272727272728e-06, + "loss": 0.5624, + "mean_token_accuracy": 0.8172522783279419, + "num_tokens": 950560.0, + "step": 267 + }, + { + "epoch": 0.794074074074074, + "grad_norm": 1.52858567237854, + "learning_rate": 4.707070707070707e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8274672031402588, + "num_tokens": 954167.0, + "step": 268 + }, + { + "epoch": 0.797037037037037, + "grad_norm": 2.062519073486328, + "learning_rate": 4.6868686868686874e-06, + "loss": 0.6239, + "mean_token_accuracy": 0.7954976558685303, + "num_tokens": 957007.0, + "step": 269 + }, + { + "epoch": 0.8, + "grad_norm": 2.1808712482452393, + "learning_rate": 4.666666666666667e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.7946015298366547, + "num_tokens": 959434.0, + "step": 270 + }, + { + "epoch": 0.802962962962963, + "grad_norm": 1.746964693069458, + "learning_rate": 4.646464646464647e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.8066003024578094, + "num_tokens": 962704.0, + "step": 271 + }, + { + "epoch": 0.8059259259259259, + "grad_norm": 1.377026915550232, + "learning_rate": 4.626262626262627e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8214571177959442, + "num_tokens": 967354.0, + "step": 272 + }, + { + "epoch": 0.8088888888888889, + "grad_norm": 1.4898761510849, + "learning_rate": 4.606060606060606e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8236507475376129, + "num_tokens": 971253.0, + "step": 273 + }, + { + "epoch": 0.8118518518518518, + "grad_norm": 1.3284251689910889, + "learning_rate": 4.585858585858586e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.8214887976646423, + "num_tokens": 975766.0, + "step": 274 + }, + { + "epoch": 0.8148148148148148, + "grad_norm": 1.6390351057052612, + "learning_rate": 4.565656565656566e-06, + "loss": 0.6198, + "mean_token_accuracy": 0.8090437650680542, + "num_tokens": 979202.0, + "step": 275 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 1.3614585399627686, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.8211559057235718, + "num_tokens": 983733.0, + "step": 276 + }, + { + "epoch": 0.8207407407407408, + "grad_norm": 1.9103572368621826, + "learning_rate": 4.525252525252526e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.8051064014434814, + "num_tokens": 986713.0, + "step": 277 + }, + { + "epoch": 0.8237037037037037, + "grad_norm": 1.6311554908752441, + "learning_rate": 4.505050505050506e-06, + "loss": 0.5523, + "mean_token_accuracy": 0.8132398724555969, + "num_tokens": 990061.0, + "step": 278 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 1.430684208869934, + "learning_rate": 4.4848484848484855e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.822590172290802, + "num_tokens": 993984.0, + "step": 279 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 1.650658130645752, + "learning_rate": 4.464646464646465e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.8105242550373077, + "num_tokens": 997507.0, + "step": 280 + }, + { + "epoch": 0.8325925925925926, + "grad_norm": 1.4831931591033936, + "learning_rate": 4.444444444444444e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.821378231048584, + "num_tokens": 1001582.0, + "step": 281 + }, + { + "epoch": 0.8355555555555556, + "grad_norm": 1.6502623558044434, + "learning_rate": 4.424242424242425e-06, + "loss": 0.572, + "mean_token_accuracy": 0.8153172433376312, + "num_tokens": 1004927.0, + "step": 282 + }, + { + "epoch": 0.8385185185185186, + "grad_norm": 1.7103201150894165, + "learning_rate": 4.4040404040404044e-06, + "loss": 0.5907, + "mean_token_accuracy": 0.8058517277240753, + "num_tokens": 1008289.0, + "step": 283 + }, + { + "epoch": 0.8414814814814815, + "grad_norm": 1.408506155014038, + "learning_rate": 4.383838383838384e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8221867978572845, + "num_tokens": 1012697.0, + "step": 284 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 1.5832841396331787, + "learning_rate": 4.363636363636364e-06, + "loss": 0.542, + "mean_token_accuracy": 0.8219292461872101, + "num_tokens": 1016103.0, + "step": 285 + }, + { + "epoch": 0.8474074074074074, + "grad_norm": 1.741313099861145, + "learning_rate": 4.343434343434344e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8415253162384033, + "num_tokens": 1019273.0, + "step": 286 + }, + { + "epoch": 0.8503703703703703, + "grad_norm": 1.6206964254379272, + "learning_rate": 4.323232323232323e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8371073305606842, + "num_tokens": 1022496.0, + "step": 287 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 1.8968009948730469, + "learning_rate": 4.303030303030303e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.8203877210617065, + "num_tokens": 1025281.0, + "step": 288 + }, + { + "epoch": 0.8562962962962963, + "grad_norm": 1.6340306997299194, + "learning_rate": 4.282828282828283e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.8140796720981598, + "num_tokens": 1028774.0, + "step": 289 + }, + { + "epoch": 0.8592592592592593, + "grad_norm": 1.5761096477508545, + "learning_rate": 4.262626262626263e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.8144352436065674, + "num_tokens": 1032483.0, + "step": 290 + }, + { + "epoch": 0.8622222222222222, + "grad_norm": 1.6122000217437744, + "learning_rate": 4.242424242424243e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.7831108570098877, + "num_tokens": 1035928.0, + "step": 291 + }, + { + "epoch": 0.8651851851851852, + "grad_norm": 1.6059738397598267, + "learning_rate": 4.222222222222223e-06, + "loss": 0.5459, + "mean_token_accuracy": 0.8186386823654175, + "num_tokens": 1039463.0, + "step": 292 + }, + { + "epoch": 0.8681481481481481, + "grad_norm": 1.8089956045150757, + "learning_rate": 4.2020202020202026e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.8020069003105164, + "num_tokens": 1042510.0, + "step": 293 + }, + { + "epoch": 0.8711111111111111, + "grad_norm": 1.615954041481018, + "learning_rate": 4.181818181818182e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.825139582157135, + "num_tokens": 1046121.0, + "step": 294 + }, + { + "epoch": 0.8740740740740741, + "grad_norm": 1.6538552045822144, + "learning_rate": 4.161616161616162e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.8040241301059723, + "num_tokens": 1049733.0, + "step": 295 + }, + { + "epoch": 0.8770370370370371, + "grad_norm": 1.6505142450332642, + "learning_rate": 4.141414141414142e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.7971898913383484, + "num_tokens": 1053401.0, + "step": 296 + }, + { + "epoch": 0.88, + "grad_norm": 1.8934201002120972, + "learning_rate": 4.1212121212121215e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.7825890779495239, + "num_tokens": 1056438.0, + "step": 297 + }, + { + "epoch": 0.882962962962963, + "grad_norm": 1.433295726776123, + "learning_rate": 4.101010101010101e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8208615779876709, + "num_tokens": 1060611.0, + "step": 298 + }, + { + "epoch": 0.8859259259259259, + "grad_norm": 1.8147464990615845, + "learning_rate": 4.080808080808081e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.8081979751586914, + "num_tokens": 1063818.0, + "step": 299 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.6561836004257202, + "learning_rate": 4.060606060606061e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8106991350650787, + "num_tokens": 1067245.0, + "step": 300 + }, + { + "epoch": 0.8918518518518519, + "grad_norm": 1.9350473880767822, + "learning_rate": 4.04040404040404e-06, + "loss": 0.6696, + "mean_token_accuracy": 0.7699568271636963, + "num_tokens": 1070277.0, + "step": 301 + }, + { + "epoch": 0.8948148148148148, + "grad_norm": 1.834263801574707, + "learning_rate": 4.02020202020202e-06, + "loss": 0.5557, + "mean_token_accuracy": 0.8140117526054382, + "num_tokens": 1073356.0, + "step": 302 + }, + { + "epoch": 0.8977777777777778, + "grad_norm": 1.5959419012069702, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6108, + "mean_token_accuracy": 0.8041948974132538, + "num_tokens": 1077424.0, + "step": 303 + }, + { + "epoch": 0.9007407407407407, + "grad_norm": 1.7203707695007324, + "learning_rate": 3.97979797979798e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8332377076148987, + "num_tokens": 1080856.0, + "step": 304 + }, + { + "epoch": 0.9037037037037037, + "grad_norm": 1.492654800415039, + "learning_rate": 3.95959595959596e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8251007497310638, + "num_tokens": 1085105.0, + "step": 305 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 1.4445887804031372, + "learning_rate": 3.93939393939394e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8359389007091522, + "num_tokens": 1088829.0, + "step": 306 + }, + { + "epoch": 0.9096296296296297, + "grad_norm": 1.5416429042816162, + "learning_rate": 3.9191919191919196e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8351666629314423, + "num_tokens": 1092475.0, + "step": 307 + }, + { + "epoch": 0.9125925925925926, + "grad_norm": 1.5634393692016602, + "learning_rate": 3.898989898989899e-06, + "loss": 0.583, + "mean_token_accuracy": 0.8078629970550537, + "num_tokens": 1096256.0, + "step": 308 + }, + { + "epoch": 0.9155555555555556, + "grad_norm": 1.8946349620819092, + "learning_rate": 3.878787878787879e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.8006861805915833, + "num_tokens": 1099425.0, + "step": 309 + }, + { + "epoch": 0.9185185185185185, + "grad_norm": 1.3436765670776367, + "learning_rate": 3.858585858585859e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8171330690383911, + "num_tokens": 1104274.0, + "step": 310 + }, + { + "epoch": 0.9214814814814815, + "grad_norm": 1.549047589302063, + "learning_rate": 3.8383838383838385e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8313369750976562, + "num_tokens": 1107762.0, + "step": 311 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 1.615659475326538, + "learning_rate": 3.818181818181819e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8152271211147308, + "num_tokens": 1111480.0, + "step": 312 + }, + { + "epoch": 0.9274074074074075, + "grad_norm": 1.7447831630706787, + "learning_rate": 3.7979797979797984e-06, + "loss": 0.6074, + "mean_token_accuracy": 0.805528312921524, + "num_tokens": 1114504.0, + "step": 313 + }, + { + "epoch": 0.9303703703703704, + "grad_norm": 1.858133316040039, + "learning_rate": 3.777777777777778e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8056531846523285, + "num_tokens": 1117584.0, + "step": 314 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 1.4916445016860962, + "learning_rate": 3.757575757575758e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.822784811258316, + "num_tokens": 1121166.0, + "step": 315 + }, + { + "epoch": 0.9362962962962963, + "grad_norm": 1.4517661333084106, + "learning_rate": 3.737373737373738e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8186366260051727, + "num_tokens": 1125121.0, + "step": 316 + }, + { + "epoch": 0.9392592592592592, + "grad_norm": 1.6084564924240112, + "learning_rate": 3.7171717171717177e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.8142254650592804, + "num_tokens": 1128655.0, + "step": 317 + }, + { + "epoch": 0.9422222222222222, + "grad_norm": 1.4573696851730347, + "learning_rate": 3.6969696969696974e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8219133615493774, + "num_tokens": 1132817.0, + "step": 318 + }, + { + "epoch": 0.9451851851851852, + "grad_norm": 1.465296745300293, + "learning_rate": 3.6767676767676767e-06, + "loss": 0.5675, + "mean_token_accuracy": 0.8081372678279877, + "num_tokens": 1137373.0, + "step": 319 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 1.5735323429107666, + "learning_rate": 3.6565656565656573e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.8291351199150085, + "num_tokens": 1140956.0, + "step": 320 + }, + { + "epoch": 0.9511111111111111, + "grad_norm": 1.3979496955871582, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8407446444034576, + "num_tokens": 1145175.0, + "step": 321 + }, + { + "epoch": 0.9540740740740741, + "grad_norm": 1.7717021703720093, + "learning_rate": 3.6161616161616163e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8134226202964783, + "num_tokens": 1148394.0, + "step": 322 + }, + { + "epoch": 0.957037037037037, + "grad_norm": 1.5911054611206055, + "learning_rate": 3.595959595959596e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.826337069272995, + "num_tokens": 1152083.0, + "step": 323 + }, + { + "epoch": 0.96, + "grad_norm": 1.968131422996521, + "learning_rate": 3.575757575757576e-06, + "loss": 0.6518, + "mean_token_accuracy": 0.7950144112110138, + "num_tokens": 1154917.0, + "step": 324 + }, + { + "epoch": 0.9629629629629629, + "grad_norm": 2.050241231918335, + "learning_rate": 3.555555555555556e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.7744756639003754, + "num_tokens": 1157523.0, + "step": 325 + }, + { + "epoch": 0.965925925925926, + "grad_norm": 1.7777398824691772, + "learning_rate": 3.5353535353535356e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8203760385513306, + "num_tokens": 1160892.0, + "step": 326 + }, + { + "epoch": 0.9688888888888889, + "grad_norm": 1.6309396028518677, + "learning_rate": 3.5151515151515154e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8261799514293671, + "num_tokens": 1164679.0, + "step": 327 + }, + { + "epoch": 0.9718518518518519, + "grad_norm": 1.5073151588439941, + "learning_rate": 3.494949494949495e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.8077302277088165, + "num_tokens": 1168609.0, + "step": 328 + }, + { + "epoch": 0.9748148148148148, + "grad_norm": 1.6433923244476318, + "learning_rate": 3.4747474747474752e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.810727059841156, + "num_tokens": 1172511.0, + "step": 329 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 1.561825156211853, + "learning_rate": 3.454545454545455e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8353243470191956, + "num_tokens": 1175944.0, + "step": 330 + }, + { + "epoch": 0.9807407407407407, + "grad_norm": 1.7079589366912842, + "learning_rate": 3.4343434343434347e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.8003055453300476, + "num_tokens": 1179490.0, + "step": 331 + }, + { + "epoch": 0.9837037037037037, + "grad_norm": 1.429787278175354, + "learning_rate": 3.414141414141414e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.8218268752098083, + "num_tokens": 1183733.0, + "step": 332 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 1.4076861143112183, + "learning_rate": 3.3939393939393946e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8280346989631653, + "num_tokens": 1188062.0, + "step": 333 + }, + { + "epoch": 0.9896296296296296, + "grad_norm": 1.474229335784912, + "learning_rate": 3.3737373737373743e-06, + "loss": 0.541, + "mean_token_accuracy": 0.8166138529777527, + "num_tokens": 1192330.0, + "step": 334 + }, + { + "epoch": 0.9925925925925926, + "grad_norm": 1.6596734523773193, + "learning_rate": 3.3535353535353536e-06, + "loss": 0.6018, + "mean_token_accuracy": 0.7998766899108887, + "num_tokens": 1196033.0, + "step": 335 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 1.757028579711914, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.8176335990428925, + "num_tokens": 1199325.0, + "step": 336 + }, + { + "epoch": 0.9985185185185185, + "grad_norm": 1.8239721059799194, + "learning_rate": 3.3131313131313135e-06, + "loss": 0.5643, + "mean_token_accuracy": 0.8093422055244446, + "num_tokens": 1202430.0, + "step": 337 + }, + { + "epoch": 1.0, + "grad_norm": 2.642312526702881, + "learning_rate": 3.292929292929293e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.8093146085739136, + "num_tokens": 1203906.0, + "step": 338 + }, + { + "epoch": 1.002962962962963, + "grad_norm": 1.578357458114624, + "learning_rate": 3.272727272727273e-06, + "loss": 0.5663, + "mean_token_accuracy": 0.8166315853595734, + "num_tokens": 1207582.0, + "step": 339 + }, + { + "epoch": 1.005925925925926, + "grad_norm": 1.4901695251464844, + "learning_rate": 3.2525252525252527e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8372379839420319, + "num_tokens": 1211672.0, + "step": 340 + }, + { + "epoch": 1.008888888888889, + "grad_norm": 1.5155280828475952, + "learning_rate": 3.232323232323233e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8326481878757477, + "num_tokens": 1215812.0, + "step": 341 + }, + { + "epoch": 1.0118518518518518, + "grad_norm": 1.6035300493240356, + "learning_rate": 3.2121212121212125e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.8124500215053558, + "num_tokens": 1219371.0, + "step": 342 + }, + { + "epoch": 1.0148148148148148, + "grad_norm": 1.8786622285842896, + "learning_rate": 3.1919191919191923e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.7993294298648834, + "num_tokens": 1222448.0, + "step": 343 + }, + { + "epoch": 1.0177777777777777, + "grad_norm": 1.6415557861328125, + "learning_rate": 3.171717171717172e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.8083369433879852, + "num_tokens": 1225933.0, + "step": 344 + }, + { + "epoch": 1.0207407407407407, + "grad_norm": 1.6128138303756714, + "learning_rate": 3.1515151515151517e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.8134556710720062, + "num_tokens": 1229450.0, + "step": 345 + }, + { + "epoch": 1.0237037037037038, + "grad_norm": 1.8448843955993652, + "learning_rate": 3.131313131313132e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.7991564273834229, + "num_tokens": 1232530.0, + "step": 346 + }, + { + "epoch": 1.0266666666666666, + "grad_norm": 1.905211091041565, + "learning_rate": 3.1111111111111116e-06, + "loss": 0.6112, + "mean_token_accuracy": 0.7874984443187714, + "num_tokens": 1235498.0, + "step": 347 + }, + { + "epoch": 1.0296296296296297, + "grad_norm": 1.7022649049758911, + "learning_rate": 3.090909090909091e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.824236124753952, + "num_tokens": 1239019.0, + "step": 348 + }, + { + "epoch": 1.0325925925925925, + "grad_norm": 1.4742542505264282, + "learning_rate": 3.0707070707070706e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8214419186115265, + "num_tokens": 1243107.0, + "step": 349 + }, + { + "epoch": 1.0355555555555556, + "grad_norm": 1.4447436332702637, + "learning_rate": 3.0505050505050508e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.8106407523155212, + "num_tokens": 1247395.0, + "step": 350 + }, + { + "epoch": 1.0385185185185186, + "grad_norm": 1.666168212890625, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.820197731256485, + "num_tokens": 1250878.0, + "step": 351 + }, + { + "epoch": 1.0414814814814815, + "grad_norm": 1.6129275560379028, + "learning_rate": 3.0101010101010102e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8269371390342712, + "num_tokens": 1254248.0, + "step": 352 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 1.6680711507797241, + "learning_rate": 2.98989898989899e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.8158087134361267, + "num_tokens": 1257572.0, + "step": 353 + }, + { + "epoch": 1.0474074074074073, + "grad_norm": 1.7332454919815063, + "learning_rate": 2.96969696969697e-06, + "loss": 0.5522, + "mean_token_accuracy": 0.8155358135700226, + "num_tokens": 1260878.0, + "step": 354 + }, + { + "epoch": 1.0503703703703704, + "grad_norm": 1.7129002809524536, + "learning_rate": 2.94949494949495e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8043730854988098, + "num_tokens": 1264398.0, + "step": 355 + }, + { + "epoch": 1.0533333333333332, + "grad_norm": 1.6838005781173706, + "learning_rate": 2.9292929292929295e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.813912570476532, + "num_tokens": 1267902.0, + "step": 356 + }, + { + "epoch": 1.0562962962962963, + "grad_norm": 1.4188517332077026, + "learning_rate": 2.9090909090909093e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8319959044456482, + "num_tokens": 1272076.0, + "step": 357 + }, + { + "epoch": 1.0592592592592593, + "grad_norm": 1.4845267534255981, + "learning_rate": 2.888888888888889e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8287602066993713, + "num_tokens": 1276507.0, + "step": 358 + }, + { + "epoch": 1.0622222222222222, + "grad_norm": 1.464799165725708, + "learning_rate": 2.868686868686869e-06, + "loss": 0.539, + "mean_token_accuracy": 0.8125880360603333, + "num_tokens": 1280892.0, + "step": 359 + }, + { + "epoch": 1.0651851851851852, + "grad_norm": 1.654840350151062, + "learning_rate": 2.848484848484849e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8072237074375153, + "num_tokens": 1284467.0, + "step": 360 + }, + { + "epoch": 1.068148148148148, + "grad_norm": 2.310239553451538, + "learning_rate": 2.8282828282828286e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7771076261997223, + "num_tokens": 1286981.0, + "step": 361 + }, + { + "epoch": 1.0711111111111111, + "grad_norm": 1.561045527458191, + "learning_rate": 2.808080808080808e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8212715685367584, + "num_tokens": 1290849.0, + "step": 362 + }, + { + "epoch": 1.074074074074074, + "grad_norm": 1.6020928621292114, + "learning_rate": 2.7878787878787885e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8208242654800415, + "num_tokens": 1294670.0, + "step": 363 + }, + { + "epoch": 1.077037037037037, + "grad_norm": 1.726387619972229, + "learning_rate": 2.7676767676767678e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.8148184418678284, + "num_tokens": 1297856.0, + "step": 364 + }, + { + "epoch": 1.08, + "grad_norm": 1.4136912822723389, + "learning_rate": 2.7474747474747475e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.844176173210144, + "num_tokens": 1302070.0, + "step": 365 + }, + { + "epoch": 1.082962962962963, + "grad_norm": 1.941740870475769, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.562, + "mean_token_accuracy": 0.7918556928634644, + "num_tokens": 1305010.0, + "step": 366 + }, + { + "epoch": 1.085925925925926, + "grad_norm": 1.4563864469528198, + "learning_rate": 2.7070707070707074e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8302809000015259, + "num_tokens": 1309023.0, + "step": 367 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 1.5809407234191895, + "learning_rate": 2.686868686868687e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8184447884559631, + "num_tokens": 1312678.0, + "step": 368 + }, + { + "epoch": 1.0918518518518519, + "grad_norm": 1.555533766746521, + "learning_rate": 2.666666666666667e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8194683492183685, + "num_tokens": 1316785.0, + "step": 369 + }, + { + "epoch": 1.094814814814815, + "grad_norm": 1.5707939863204956, + "learning_rate": 2.6464646464646466e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8213657438755035, + "num_tokens": 1320733.0, + "step": 370 + }, + { + "epoch": 1.0977777777777777, + "grad_norm": 1.88181734085083, + "learning_rate": 2.6262626262626267e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8116715550422668, + "num_tokens": 1323667.0, + "step": 371 + }, + { + "epoch": 1.1007407407407408, + "grad_norm": 1.944959282875061, + "learning_rate": 2.6060606060606064e-06, + "loss": 0.5563, + "mean_token_accuracy": 0.8119111061096191, + "num_tokens": 1326554.0, + "step": 372 + }, + { + "epoch": 1.1037037037037036, + "grad_norm": 1.6744418144226074, + "learning_rate": 2.585858585858586e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.8059849739074707, + "num_tokens": 1330163.0, + "step": 373 + }, + { + "epoch": 1.1066666666666667, + "grad_norm": 1.6038318872451782, + "learning_rate": 2.565656565656566e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.8079316020011902, + "num_tokens": 1334164.0, + "step": 374 + }, + { + "epoch": 1.1096296296296297, + "grad_norm": 1.65394926071167, + "learning_rate": 2.5454545454545456e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8273237347602844, + "num_tokens": 1337515.0, + "step": 375 + }, + { + "epoch": 1.1125925925925926, + "grad_norm": 1.5841422080993652, + "learning_rate": 2.5252525252525258e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8296765685081482, + "num_tokens": 1341191.0, + "step": 376 + }, + { + "epoch": 1.1155555555555556, + "grad_norm": 1.6318837404251099, + "learning_rate": 2.5050505050505055e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.808883011341095, + "num_tokens": 1344906.0, + "step": 377 + }, + { + "epoch": 1.1185185185185185, + "grad_norm": 1.4715019464492798, + "learning_rate": 2.4848484848484848e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.838902086019516, + "num_tokens": 1349053.0, + "step": 378 + }, + { + "epoch": 1.1214814814814815, + "grad_norm": 1.3896701335906982, + "learning_rate": 2.464646464646465e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8318624198436737, + "num_tokens": 1353286.0, + "step": 379 + }, + { + "epoch": 1.1244444444444444, + "grad_norm": 1.6245249509811401, + "learning_rate": 2.4444444444444447e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8300711512565613, + "num_tokens": 1356985.0, + "step": 380 + }, + { + "epoch": 1.1274074074074074, + "grad_norm": 1.7144895792007446, + "learning_rate": 2.4242424242424244e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.8217493295669556, + "num_tokens": 1360494.0, + "step": 381 + }, + { + "epoch": 1.1303703703703705, + "grad_norm": 1.5089396238327026, + "learning_rate": 2.404040404040404e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8147092461585999, + "num_tokens": 1364638.0, + "step": 382 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 1.3192235231399536, + "learning_rate": 2.3838383838383843e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.8234310150146484, + "num_tokens": 1369771.0, + "step": 383 + }, + { + "epoch": 1.1362962962962964, + "grad_norm": 1.3969296216964722, + "learning_rate": 2.363636363636364e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.822689414024353, + "num_tokens": 1374289.0, + "step": 384 + }, + { + "epoch": 1.1392592592592592, + "grad_norm": 1.5321006774902344, + "learning_rate": 2.3434343434343437e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8339057862758636, + "num_tokens": 1378011.0, + "step": 385 + }, + { + "epoch": 1.1422222222222222, + "grad_norm": 1.740943193435669, + "learning_rate": 2.3232323232323234e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8320908546447754, + "num_tokens": 1381543.0, + "step": 386 + }, + { + "epoch": 1.145185185185185, + "grad_norm": 1.6941492557525635, + "learning_rate": 2.303030303030303e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8285880982875824, + "num_tokens": 1384850.0, + "step": 387 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 1.682213306427002, + "learning_rate": 2.282828282828283e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8547782003879547, + "num_tokens": 1388426.0, + "step": 388 + }, + { + "epoch": 1.1511111111111112, + "grad_norm": 1.5372880697250366, + "learning_rate": 2.262626262626263e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.8182636797428131, + "num_tokens": 1392160.0, + "step": 389 + }, + { + "epoch": 1.154074074074074, + "grad_norm": 1.4516013860702515, + "learning_rate": 2.2424242424242428e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8334946036338806, + "num_tokens": 1396401.0, + "step": 390 + }, + { + "epoch": 1.157037037037037, + "grad_norm": 1.7130506038665771, + "learning_rate": 2.222222222222222e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8241472840309143, + "num_tokens": 1400017.0, + "step": 391 + }, + { + "epoch": 1.16, + "grad_norm": 1.366657018661499, + "learning_rate": 2.2020202020202022e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8447684645652771, + "num_tokens": 1404496.0, + "step": 392 + }, + { + "epoch": 1.162962962962963, + "grad_norm": 1.8387781381607056, + "learning_rate": 2.181818181818182e-06, + "loss": 0.5627, + "mean_token_accuracy": 0.8059518933296204, + "num_tokens": 1407575.0, + "step": 393 + }, + { + "epoch": 1.1659259259259258, + "grad_norm": 1.5123631954193115, + "learning_rate": 2.1616161616161617e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8212185502052307, + "num_tokens": 1411632.0, + "step": 394 + }, + { + "epoch": 1.1688888888888889, + "grad_norm": 1.732013463973999, + "learning_rate": 2.1414141414141414e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8018167614936829, + "num_tokens": 1415169.0, + "step": 395 + }, + { + "epoch": 1.171851851851852, + "grad_norm": 1.785217523574829, + "learning_rate": 2.1212121212121216e-06, + "loss": 0.6113, + "mean_token_accuracy": 0.7885331511497498, + "num_tokens": 1418584.0, + "step": 396 + }, + { + "epoch": 1.1748148148148148, + "grad_norm": 1.7562960386276245, + "learning_rate": 2.1010101010101013e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8363031446933746, + "num_tokens": 1421687.0, + "step": 397 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 1.5766472816467285, + "learning_rate": 2.080808080808081e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8138737380504608, + "num_tokens": 1425691.0, + "step": 398 + }, + { + "epoch": 1.1807407407407406, + "grad_norm": 1.6172001361846924, + "learning_rate": 2.0606060606060607e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.8216177523136139, + "num_tokens": 1429557.0, + "step": 399 + }, + { + "epoch": 1.1837037037037037, + "grad_norm": 1.6814985275268555, + "learning_rate": 2.0404040404040405e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.8085346221923828, + "num_tokens": 1433267.0, + "step": 400 + }, + { + "epoch": 1.1866666666666668, + "grad_norm": 1.4000880718231201, + "learning_rate": 2.02020202020202e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8277598023414612, + "num_tokens": 1437667.0, + "step": 401 + }, + { + "epoch": 1.1896296296296296, + "grad_norm": 1.8423353433609009, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.7976607382297516, + "num_tokens": 1440966.0, + "step": 402 + }, + { + "epoch": 1.1925925925925926, + "grad_norm": 1.9194600582122803, + "learning_rate": 1.97979797979798e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.7985728085041046, + "num_tokens": 1444008.0, + "step": 403 + }, + { + "epoch": 1.1955555555555555, + "grad_norm": 2.046607494354248, + "learning_rate": 1.9595959595959598e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.8077805638313293, + "num_tokens": 1446800.0, + "step": 404 + }, + { + "epoch": 1.1985185185185185, + "grad_norm": 1.8536856174468994, + "learning_rate": 1.9393939393939395e-06, + "loss": 0.6245, + "mean_token_accuracy": 0.8012503981590271, + "num_tokens": 1449783.0, + "step": 405 + }, + { + "epoch": 1.2014814814814816, + "grad_norm": 1.4639760255813599, + "learning_rate": 1.9191919191919192e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8369546234607697, + "num_tokens": 1453788.0, + "step": 406 + }, + { + "epoch": 1.2044444444444444, + "grad_norm": 1.7938032150268555, + "learning_rate": 1.8989898989898992e-06, + "loss": 0.554, + "mean_token_accuracy": 0.8145140409469604, + "num_tokens": 1457236.0, + "step": 407 + }, + { + "epoch": 1.2074074074074075, + "grad_norm": 1.8429512977600098, + "learning_rate": 1.878787878787879e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8476223647594452, + "num_tokens": 1460115.0, + "step": 408 + }, + { + "epoch": 1.2103703703703703, + "grad_norm": 1.6122170686721802, + "learning_rate": 1.8585858585858588e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.8167700469493866, + "num_tokens": 1464031.0, + "step": 409 + }, + { + "epoch": 1.2133333333333334, + "grad_norm": 1.449187159538269, + "learning_rate": 1.8383838383838384e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8162829875946045, + "num_tokens": 1468592.0, + "step": 410 + }, + { + "epoch": 1.2162962962962962, + "grad_norm": 1.7037770748138428, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8312564194202423, + "num_tokens": 1472103.0, + "step": 411 + }, + { + "epoch": 1.2192592592592593, + "grad_norm": 1.9440031051635742, + "learning_rate": 1.797979797979798e-06, + "loss": 0.59, + "mean_token_accuracy": 0.7937948405742645, + "num_tokens": 1475240.0, + "step": 412 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 1.8598569631576538, + "learning_rate": 1.777777777777778e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.823639988899231, + "num_tokens": 1478232.0, + "step": 413 + }, + { + "epoch": 1.2251851851851852, + "grad_norm": 1.4698619842529297, + "learning_rate": 1.7575757575757577e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8282549977302551, + "num_tokens": 1482354.0, + "step": 414 + }, + { + "epoch": 1.2281481481481482, + "grad_norm": 1.5878387689590454, + "learning_rate": 1.7373737373737376e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8321788609027863, + "num_tokens": 1486030.0, + "step": 415 + }, + { + "epoch": 1.231111111111111, + "grad_norm": 1.606638789176941, + "learning_rate": 1.7171717171717173e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8201124370098114, + "num_tokens": 1489655.0, + "step": 416 + }, + { + "epoch": 1.234074074074074, + "grad_norm": 1.625203251838684, + "learning_rate": 1.6969696969696973e-06, + "loss": 0.5145, + "mean_token_accuracy": 0.824636310338974, + "num_tokens": 1493377.0, + "step": 417 + }, + { + "epoch": 1.237037037037037, + "grad_norm": 1.5984927415847778, + "learning_rate": 1.6767676767676768e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8223057687282562, + "num_tokens": 1496909.0, + "step": 418 + }, + { + "epoch": 1.24, + "grad_norm": 1.592211365699768, + "learning_rate": 1.6565656565656567e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.8070831596851349, + "num_tokens": 1500733.0, + "step": 419 + }, + { + "epoch": 1.242962962962963, + "grad_norm": 1.6033225059509277, + "learning_rate": 1.6363636363636365e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8257551491260529, + "num_tokens": 1504538.0, + "step": 420 + }, + { + "epoch": 1.2459259259259259, + "grad_norm": 1.623600721359253, + "learning_rate": 1.6161616161616164e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.8141489923000336, + "num_tokens": 1508099.0, + "step": 421 + }, + { + "epoch": 1.248888888888889, + "grad_norm": 1.5946520566940308, + "learning_rate": 1.5959595959595961e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8114981055259705, + "num_tokens": 1511962.0, + "step": 422 + }, + { + "epoch": 1.2518518518518518, + "grad_norm": 1.8489376306533813, + "learning_rate": 1.5757575757575759e-06, + "loss": 0.569, + "mean_token_accuracy": 0.8056513369083405, + "num_tokens": 1514996.0, + "step": 423 + }, + { + "epoch": 1.2548148148148148, + "grad_norm": 1.5455330610275269, + "learning_rate": 1.5555555555555558e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8286541998386383, + "num_tokens": 1518687.0, + "step": 424 + }, + { + "epoch": 1.2577777777777777, + "grad_norm": 1.881600260734558, + "learning_rate": 1.5353535353535353e-06, + "loss": 0.5961, + "mean_token_accuracy": 0.812826544046402, + "num_tokens": 1521819.0, + "step": 425 + }, + { + "epoch": 1.2607407407407407, + "grad_norm": 1.5988526344299316, + "learning_rate": 1.5151515151515152e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8221748471260071, + "num_tokens": 1525899.0, + "step": 426 + }, + { + "epoch": 1.2637037037037038, + "grad_norm": 1.8061450719833374, + "learning_rate": 1.494949494949495e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.7981002628803253, + "num_tokens": 1529481.0, + "step": 427 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 1.843762755393982, + "learning_rate": 1.474747474747475e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8306773006916046, + "num_tokens": 1532486.0, + "step": 428 + }, + { + "epoch": 1.2696296296296297, + "grad_norm": 1.448974370956421, + "learning_rate": 1.4545454545454546e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8318825960159302, + "num_tokens": 1536682.0, + "step": 429 + }, + { + "epoch": 1.2725925925925927, + "grad_norm": 1.8719877004623413, + "learning_rate": 1.4343434343434346e-06, + "loss": 0.61, + "mean_token_accuracy": 0.7988592088222504, + "num_tokens": 1539997.0, + "step": 430 + }, + { + "epoch": 1.2755555555555556, + "grad_norm": 1.5998164415359497, + "learning_rate": 1.4141414141414143e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8334919810295105, + "num_tokens": 1543781.0, + "step": 431 + }, + { + "epoch": 1.2785185185185184, + "grad_norm": 2.038449287414551, + "learning_rate": 1.3939393939393942e-06, + "loss": 0.6057, + "mean_token_accuracy": 0.8053812682628632, + "num_tokens": 1546470.0, + "step": 432 + }, + { + "epoch": 1.2814814814814814, + "grad_norm": 1.956413745880127, + "learning_rate": 1.3737373737373738e-06, + "loss": 0.6533, + "mean_token_accuracy": 0.7844628691673279, + "num_tokens": 1549331.0, + "step": 433 + }, + { + "epoch": 1.2844444444444445, + "grad_norm": 1.8449971675872803, + "learning_rate": 1.3535353535353537e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.8119227588176727, + "num_tokens": 1552568.0, + "step": 434 + }, + { + "epoch": 1.2874074074074073, + "grad_norm": 1.7006678581237793, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.8144896030426025, + "num_tokens": 1556178.0, + "step": 435 + }, + { + "epoch": 1.2903703703703704, + "grad_norm": 1.7452467679977417, + "learning_rate": 1.3131313131313134e-06, + "loss": 0.5616, + "mean_token_accuracy": 0.8174393177032471, + "num_tokens": 1559510.0, + "step": 436 + }, + { + "epoch": 1.2933333333333334, + "grad_norm": 1.5783942937850952, + "learning_rate": 1.292929292929293e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.841576099395752, + "num_tokens": 1563292.0, + "step": 437 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 1.646907925605774, + "learning_rate": 1.2727272727272728e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.8231407999992371, + "num_tokens": 1566911.0, + "step": 438 + }, + { + "epoch": 1.2992592592592593, + "grad_norm": 1.7039787769317627, + "learning_rate": 1.2525252525252527e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8283706903457642, + "num_tokens": 1570397.0, + "step": 439 + }, + { + "epoch": 1.3022222222222222, + "grad_norm": 1.801865816116333, + "learning_rate": 1.2323232323232325e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.805145651102066, + "num_tokens": 1573871.0, + "step": 440 + }, + { + "epoch": 1.3051851851851852, + "grad_norm": 1.705989956855774, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.8106194138526917, + "num_tokens": 1577599.0, + "step": 441 + }, + { + "epoch": 1.308148148148148, + "grad_norm": 1.5632222890853882, + "learning_rate": 1.1919191919191921e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8252464532852173, + "num_tokens": 1581321.0, + "step": 442 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 1.8951255083084106, + "learning_rate": 1.1717171717171719e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.8146626353263855, + "num_tokens": 1584490.0, + "step": 443 + }, + { + "epoch": 1.3140740740740742, + "grad_norm": 1.5391404628753662, + "learning_rate": 1.1515151515151516e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.820237547159195, + "num_tokens": 1588627.0, + "step": 444 + }, + { + "epoch": 1.317037037037037, + "grad_norm": 1.3899142742156982, + "learning_rate": 1.1313131313131315e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8290388286113739, + "num_tokens": 1593168.0, + "step": 445 + }, + { + "epoch": 1.32, + "grad_norm": 1.835131049156189, + "learning_rate": 1.111111111111111e-06, + "loss": 0.5365, + "mean_token_accuracy": 0.820201575756073, + "num_tokens": 1596431.0, + "step": 446 + }, + { + "epoch": 1.322962962962963, + "grad_norm": 1.577429175376892, + "learning_rate": 1.090909090909091e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8084814250469208, + "num_tokens": 1600653.0, + "step": 447 + }, + { + "epoch": 1.325925925925926, + "grad_norm": 1.498453140258789, + "learning_rate": 1.0707070707070707e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8429514765739441, + "num_tokens": 1604553.0, + "step": 448 + }, + { + "epoch": 1.3288888888888888, + "grad_norm": 1.6362348794937134, + "learning_rate": 1.0505050505050506e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.8162236511707306, + "num_tokens": 1608166.0, + "step": 449 + }, + { + "epoch": 1.3318518518518518, + "grad_norm": 1.626116394996643, + "learning_rate": 1.0303030303030304e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8315063714981079, + "num_tokens": 1611868.0, + "step": 450 + }, + { + "epoch": 1.334814814814815, + "grad_norm": 1.8012502193450928, + "learning_rate": 1.01010101010101e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.8172584474086761, + "num_tokens": 1615419.0, + "step": 451 + }, + { + "epoch": 1.3377777777777777, + "grad_norm": 1.8158141374588013, + "learning_rate": 9.8989898989899e-07, + "loss": 0.5462, + "mean_token_accuracy": 0.8111367225646973, + "num_tokens": 1618830.0, + "step": 452 + }, + { + "epoch": 1.3407407407407408, + "grad_norm": 1.5807775259017944, + "learning_rate": 9.696969696969698e-07, + "loss": 0.5181, + "mean_token_accuracy": 0.8227434456348419, + "num_tokens": 1623137.0, + "step": 453 + }, + { + "epoch": 1.3437037037037036, + "grad_norm": 1.4234119653701782, + "learning_rate": 9.494949494949496e-07, + "loss": 0.4497, + "mean_token_accuracy": 0.8470558524131775, + "num_tokens": 1627519.0, + "step": 454 + }, + { + "epoch": 1.3466666666666667, + "grad_norm": 2.125579595565796, + "learning_rate": 9.292929292929294e-07, + "loss": 0.6188, + "mean_token_accuracy": 0.7967137098312378, + "num_tokens": 1630108.0, + "step": 455 + }, + { + "epoch": 1.3496296296296295, + "grad_norm": 1.6869717836380005, + "learning_rate": 9.090909090909091e-07, + "loss": 0.5394, + "mean_token_accuracy": 0.8161383271217346, + "num_tokens": 1633891.0, + "step": 456 + }, + { + "epoch": 1.3525925925925926, + "grad_norm": 1.3703079223632812, + "learning_rate": 8.88888888888889e-07, + "loss": 0.5447, + "mean_token_accuracy": 0.8110777735710144, + "num_tokens": 1639382.0, + "step": 457 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 1.872101902961731, + "learning_rate": 8.686868686868688e-07, + "loss": 0.565, + "mean_token_accuracy": 0.8095068037509918, + "num_tokens": 1642500.0, + "step": 458 + }, + { + "epoch": 1.3585185185185185, + "grad_norm": 2.045640230178833, + "learning_rate": 8.484848484848486e-07, + "loss": 0.5779, + "mean_token_accuracy": 0.7941328585147858, + "num_tokens": 1645321.0, + "step": 459 + }, + { + "epoch": 1.3614814814814815, + "grad_norm": 1.4400309324264526, + "learning_rate": 8.282828282828284e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.8225707709789276, + "num_tokens": 1649696.0, + "step": 460 + }, + { + "epoch": 1.3644444444444446, + "grad_norm": 1.8450188636779785, + "learning_rate": 8.080808080808082e-07, + "loss": 0.553, + "mean_token_accuracy": 0.8141945004463196, + "num_tokens": 1652752.0, + "step": 461 + }, + { + "epoch": 1.3674074074074074, + "grad_norm": 1.9214624166488647, + "learning_rate": 7.878787878787879e-07, + "loss": 0.6201, + "mean_token_accuracy": 0.7922622263431549, + "num_tokens": 1655876.0, + "step": 462 + }, + { + "epoch": 1.3703703703703702, + "grad_norm": 1.7893153429031372, + "learning_rate": 7.676767676767677e-07, + "loss": 0.5826, + "mean_token_accuracy": 0.8077681362628937, + "num_tokens": 1659173.0, + "step": 463 + }, + { + "epoch": 1.3733333333333333, + "grad_norm": 1.5689014196395874, + "learning_rate": 7.474747474747475e-07, + "loss": 0.5291, + "mean_token_accuracy": 0.8216508030891418, + "num_tokens": 1663190.0, + "step": 464 + }, + { + "epoch": 1.3762962962962964, + "grad_norm": 1.4236626625061035, + "learning_rate": 7.272727272727273e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8321132957935333, + "num_tokens": 1667930.0, + "step": 465 + }, + { + "epoch": 1.3792592592592592, + "grad_norm": 1.42567777633667, + "learning_rate": 7.070707070707071e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.8296580612659454, + "num_tokens": 1672840.0, + "step": 466 + }, + { + "epoch": 1.3822222222222222, + "grad_norm": 1.6564109325408936, + "learning_rate": 6.868686868686869e-07, + "loss": 0.5556, + "mean_token_accuracy": 0.8186525702476501, + "num_tokens": 1676358.0, + "step": 467 + }, + { + "epoch": 1.3851851851851853, + "grad_norm": 1.726884126663208, + "learning_rate": 6.666666666666667e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.825296014547348, + "num_tokens": 1679865.0, + "step": 468 + }, + { + "epoch": 1.3881481481481481, + "grad_norm": 1.5112507343292236, + "learning_rate": 6.464646464646465e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8372526466846466, + "num_tokens": 1683729.0, + "step": 469 + }, + { + "epoch": 1.3911111111111112, + "grad_norm": 2.0546481609344482, + "learning_rate": 6.262626262626264e-07, + "loss": 0.5827, + "mean_token_accuracy": 0.8113296926021576, + "num_tokens": 1686576.0, + "step": 470 + }, + { + "epoch": 1.394074074074074, + "grad_norm": 1.7979618310928345, + "learning_rate": 6.060606060606061e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.8221349120140076, + "num_tokens": 1689624.0, + "step": 471 + }, + { + "epoch": 1.397037037037037, + "grad_norm": 1.6469945907592773, + "learning_rate": 5.858585858585859e-07, + "loss": 0.5531, + "mean_token_accuracy": 0.8208528757095337, + "num_tokens": 1693565.0, + "step": 472 + }, + { + "epoch": 1.4, + "grad_norm": 1.6764768362045288, + "learning_rate": 5.656565656565658e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8198188543319702, + "num_tokens": 1697123.0, + "step": 473 + }, + { + "epoch": 1.402962962962963, + "grad_norm": 1.4936809539794922, + "learning_rate": 5.454545454545455e-07, + "loss": 0.5111, + "mean_token_accuracy": 0.8270980417728424, + "num_tokens": 1701265.0, + "step": 474 + }, + { + "epoch": 1.405925925925926, + "grad_norm": 1.7262264490127563, + "learning_rate": 5.252525252525253e-07, + "loss": 0.5424, + "mean_token_accuracy": 0.8204884827136993, + "num_tokens": 1704710.0, + "step": 475 + }, + { + "epoch": 1.4088888888888889, + "grad_norm": 1.4793908596038818, + "learning_rate": 5.05050505050505e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8427509069442749, + "num_tokens": 1708624.0, + "step": 476 + }, + { + "epoch": 1.411851851851852, + "grad_norm": 1.798835039138794, + "learning_rate": 4.848484848484849e-07, + "loss": 0.6204, + "mean_token_accuracy": 0.7952524423599243, + "num_tokens": 1712005.0, + "step": 477 + }, + { + "epoch": 1.4148148148148147, + "grad_norm": 1.8856531381607056, + "learning_rate": 4.646464646464647e-07, + "loss": 0.5602, + "mean_token_accuracy": 0.8076030910015106, + "num_tokens": 1715073.0, + "step": 478 + }, + { + "epoch": 1.4177777777777778, + "grad_norm": 1.6599233150482178, + "learning_rate": 4.444444444444445e-07, + "loss": 0.5426, + "mean_token_accuracy": 0.8160195648670197, + "num_tokens": 1719100.0, + "step": 479 + }, + { + "epoch": 1.4207407407407406, + "grad_norm": 1.5088729858398438, + "learning_rate": 4.242424242424243e-07, + "loss": 0.5005, + "mean_token_accuracy": 0.8253285586833954, + "num_tokens": 1723826.0, + "step": 480 + }, + { + "epoch": 1.4237037037037037, + "grad_norm": 1.7026400566101074, + "learning_rate": 4.040404040404041e-07, + "loss": 0.554, + "mean_token_accuracy": 0.8092281818389893, + "num_tokens": 1727687.0, + "step": 481 + }, + { + "epoch": 1.4266666666666667, + "grad_norm": 1.795376181602478, + "learning_rate": 3.838383838383838e-07, + "loss": 0.6058, + "mean_token_accuracy": 0.7978301644325256, + "num_tokens": 1731329.0, + "step": 482 + }, + { + "epoch": 1.4296296296296296, + "grad_norm": 1.553145408630371, + "learning_rate": 3.6363636363636366e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.8289152979850769, + "num_tokens": 1735231.0, + "step": 483 + }, + { + "epoch": 1.4325925925925926, + "grad_norm": 1.7523322105407715, + "learning_rate": 3.4343434343434344e-07, + "loss": 0.499, + "mean_token_accuracy": 0.8325841426849365, + "num_tokens": 1738462.0, + "step": 484 + }, + { + "epoch": 1.4355555555555555, + "grad_norm": 1.89793062210083, + "learning_rate": 3.2323232323232327e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8249283134937286, + "num_tokens": 1741266.0, + "step": 485 + }, + { + "epoch": 1.4385185185185185, + "grad_norm": 1.8411691188812256, + "learning_rate": 3.0303030303030305e-07, + "loss": 0.5408, + "mean_token_accuracy": 0.8220325112342834, + "num_tokens": 1744400.0, + "step": 486 + }, + { + "epoch": 1.4414814814814814, + "grad_norm": 1.693596601486206, + "learning_rate": 2.828282828282829e-07, + "loss": 0.5408, + "mean_token_accuracy": 0.8160210251808167, + "num_tokens": 1748015.0, + "step": 487 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 1.9168052673339844, + "learning_rate": 2.6262626262626266e-07, + "loss": 0.6008, + "mean_token_accuracy": 0.7967769503593445, + "num_tokens": 1751065.0, + "step": 488 + }, + { + "epoch": 1.4474074074074075, + "grad_norm": 1.6740949153900146, + "learning_rate": 2.4242424242424244e-07, + "loss": 0.5708, + "mean_token_accuracy": 0.8047958016395569, + "num_tokens": 1754519.0, + "step": 489 + }, + { + "epoch": 1.4503703703703703, + "grad_norm": 1.4756853580474854, + "learning_rate": 2.2222222222222224e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8352658748626709, + "num_tokens": 1758468.0, + "step": 490 + }, + { + "epoch": 1.4533333333333334, + "grad_norm": 1.4283018112182617, + "learning_rate": 2.0202020202020205e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8323183953762054, + "num_tokens": 1762896.0, + "step": 491 + }, + { + "epoch": 1.4562962962962964, + "grad_norm": 1.57586669921875, + "learning_rate": 1.8181818181818183e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8177944123744965, + "num_tokens": 1766655.0, + "step": 492 + }, + { + "epoch": 1.4592592592592593, + "grad_norm": 1.7949517965316772, + "learning_rate": 1.6161616161616163e-07, + "loss": 0.5526, + "mean_token_accuracy": 0.8148461282253265, + "num_tokens": 1769847.0, + "step": 493 + }, + { + "epoch": 1.462222222222222, + "grad_norm": 1.894729733467102, + "learning_rate": 1.4141414141414144e-07, + "loss": 0.5393, + "mean_token_accuracy": 0.822737455368042, + "num_tokens": 1773129.0, + "step": 494 + }, + { + "epoch": 1.4651851851851851, + "grad_norm": 1.5462119579315186, + "learning_rate": 1.2121212121212122e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.8403027057647705, + "num_tokens": 1777102.0, + "step": 495 + }, + { + "epoch": 1.4681481481481482, + "grad_norm": 1.7450637817382812, + "learning_rate": 1.0101010101010103e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.7947291433811188, + "num_tokens": 1780696.0, + "step": 496 + }, + { + "epoch": 1.471111111111111, + "grad_norm": 1.5586693286895752, + "learning_rate": 8.080808080808082e-08, + "loss": 0.4837, + "mean_token_accuracy": 0.8422993123531342, + "num_tokens": 1784544.0, + "step": 497 + }, + { + "epoch": 1.474074074074074, + "grad_norm": 1.6921535730361938, + "learning_rate": 6.060606060606061e-08, + "loss": 0.5049, + "mean_token_accuracy": 0.8294592499732971, + "num_tokens": 1788074.0, + "step": 498 + }, + { + "epoch": 1.4770370370370371, + "grad_norm": 1.726887583732605, + "learning_rate": 4.040404040404041e-08, + "loss": 0.599, + "mean_token_accuracy": 0.7944865524768829, + "num_tokens": 1792035.0, + "step": 499 + }, + { + "epoch": 1.48, + "grad_norm": 1.856404423713684, + "learning_rate": 2.0202020202020204e-08, + "loss": 0.5403, + "mean_token_accuracy": 0.8191438913345337, + "num_tokens": 1795099.0, + "step": 500 + }, + { + "epoch": 4.435555555555555, + "grad_norm": 4.225268363952637, + "learning_rate": 0.0, + "loss": 1.3339, + "mean_token_accuracy": 0.6172648668289185, + "num_tokens": 1519.0, + "step": 501 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 4.488350868225098, + "learning_rate": 6.682274247491639e-06, + "loss": 1.396, + "mean_token_accuracy": 0.6025722920894623, + "num_tokens": 3034.0, + "step": 502 + }, + { + "epoch": 4.453333333333333, + "grad_norm": 4.018293857574463, + "learning_rate": 6.675585284280937e-06, + "loss": 1.2248, + "mean_token_accuracy": 0.6472580134868622, + "num_tokens": 4612.0, + "step": 503 + }, + { + "epoch": 4.4622222222222225, + "grad_norm": 4.143852710723877, + "learning_rate": 6.6688963210702354e-06, + "loss": 1.2655, + "mean_token_accuracy": 0.6227790415287018, + "num_tokens": 6183.0, + "step": 504 + }, + { + "epoch": 4.471111111111111, + "grad_norm": 4.326348781585693, + "learning_rate": 6.6622073578595324e-06, + "loss": 1.428, + "mean_token_accuracy": 0.6165547370910645, + "num_tokens": 7687.0, + "step": 505 + }, + { + "epoch": 4.48, + "grad_norm": 4.497056484222412, + "learning_rate": 6.65551839464883e-06, + "loss": 1.34, + "mean_token_accuracy": 0.5999650657176971, + "num_tokens": 9168.0, + "step": 506 + }, + { + "epoch": 4.488888888888889, + "grad_norm": 3.9834299087524414, + "learning_rate": 6.648829431438127e-06, + "loss": 1.1945, + "mean_token_accuracy": 0.6574497520923615, + "num_tokens": 10673.0, + "step": 507 + }, + { + "epoch": 4.497777777777777, + "grad_norm": 4.128340721130371, + "learning_rate": 6.642140468227425e-06, + "loss": 1.2272, + "mean_token_accuracy": 0.6588640809059143, + "num_tokens": 12211.0, + "step": 508 + }, + { + "epoch": 4.506666666666667, + "grad_norm": 3.9665286540985107, + "learning_rate": 6.635451505016724e-06, + "loss": 1.1703, + "mean_token_accuracy": 0.6437129378318787, + "num_tokens": 13748.0, + "step": 509 + }, + { + "epoch": 4.515555555555555, + "grad_norm": 4.362207889556885, + "learning_rate": 6.628762541806021e-06, + "loss": 1.3545, + "mean_token_accuracy": 0.6177742779254913, + "num_tokens": 15319.0, + "step": 510 + }, + { + "epoch": 4.524444444444445, + "grad_norm": 4.330372333526611, + "learning_rate": 6.622073578595319e-06, + "loss": 1.3055, + "mean_token_accuracy": 0.6244651079177856, + "num_tokens": 16883.0, + "step": 511 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 4.363888263702393, + "learning_rate": 6.615384615384616e-06, + "loss": 1.3835, + "mean_token_accuracy": 0.6012668609619141, + "num_tokens": 18451.0, + "step": 512 + }, + { + "epoch": 4.542222222222223, + "grad_norm": 4.555056095123291, + "learning_rate": 6.6086956521739135e-06, + "loss": 1.4118, + "mean_token_accuracy": 0.6006550788879395, + "num_tokens": 20016.0, + "step": 513 + }, + { + "epoch": 4.551111111111111, + "grad_norm": 4.659957408905029, + "learning_rate": 6.602006688963211e-06, + "loss": 1.4178, + "mean_token_accuracy": 0.6133290529251099, + "num_tokens": 21525.0, + "step": 514 + }, + { + "epoch": 4.5600000000000005, + "grad_norm": 3.9378740787506104, + "learning_rate": 6.595317725752509e-06, + "loss": 1.2496, + "mean_token_accuracy": 0.6227707862854004, + "num_tokens": 23091.0, + "step": 515 + }, + { + "epoch": 4.568888888888889, + "grad_norm": 4.1683669090271, + "learning_rate": 6.588628762541807e-06, + "loss": 1.3073, + "mean_token_accuracy": 0.617749035358429, + "num_tokens": 24615.0, + "step": 516 + }, + { + "epoch": 4.5777777777777775, + "grad_norm": 4.403312683105469, + "learning_rate": 6.581939799331104e-06, + "loss": 1.2819, + "mean_token_accuracy": 0.6140061914920807, + "num_tokens": 26177.0, + "step": 517 + }, + { + "epoch": 4.586666666666667, + "grad_norm": 4.714698791503906, + "learning_rate": 6.575250836120402e-06, + "loss": 1.3392, + "mean_token_accuracy": 0.6056069731712341, + "num_tokens": 27746.0, + "step": 518 + }, + { + "epoch": 4.595555555555555, + "grad_norm": 4.839066505432129, + "learning_rate": 6.5685618729097e-06, + "loss": 1.3556, + "mean_token_accuracy": 0.6084996163845062, + "num_tokens": 29291.0, + "step": 519 + }, + { + "epoch": 4.604444444444445, + "grad_norm": 4.281559467315674, + "learning_rate": 6.5618729096989975e-06, + "loss": 1.2868, + "mean_token_accuracy": 0.602731853723526, + "num_tokens": 30868.0, + "step": 520 + }, + { + "epoch": 4.613333333333333, + "grad_norm": 3.7515485286712646, + "learning_rate": 6.5551839464882945e-06, + "loss": 1.2452, + "mean_token_accuracy": 0.6293786466121674, + "num_tokens": 32434.0, + "step": 521 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 3.9423084259033203, + "learning_rate": 6.548494983277592e-06, + "loss": 1.2597, + "mean_token_accuracy": 0.6278244853019714, + "num_tokens": 33960.0, + "step": 522 + }, + { + "epoch": 4.631111111111111, + "grad_norm": 5.6481828689575195, + "learning_rate": 6.54180602006689e-06, + "loss": 1.3823, + "mean_token_accuracy": 0.6131255626678467, + "num_tokens": 35481.0, + "step": 523 + }, + { + "epoch": 4.64, + "grad_norm": 4.2653069496154785, + "learning_rate": 6.535117056856188e-06, + "loss": 1.3628, + "mean_token_accuracy": 0.6240188479423523, + "num_tokens": 37087.0, + "step": 524 + }, + { + "epoch": 4.648888888888889, + "grad_norm": 4.646631717681885, + "learning_rate": 6.528428093645486e-06, + "loss": 1.3275, + "mean_token_accuracy": 0.6222825944423676, + "num_tokens": 38607.0, + "step": 525 + }, + { + "epoch": 4.657777777777778, + "grad_norm": 4.104828834533691, + "learning_rate": 6.521739130434783e-06, + "loss": 1.2694, + "mean_token_accuracy": 0.6106247305870056, + "num_tokens": 40115.0, + "step": 526 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 4.830842018127441, + "learning_rate": 6.515050167224081e-06, + "loss": 1.3452, + "mean_token_accuracy": 0.6059794127941132, + "num_tokens": 41708.0, + "step": 527 + }, + { + "epoch": 4.6755555555555555, + "grad_norm": 4.9096808433532715, + "learning_rate": 6.508361204013378e-06, + "loss": 1.4248, + "mean_token_accuracy": 0.6112937331199646, + "num_tokens": 43291.0, + "step": 528 + }, + { + "epoch": 4.684444444444445, + "grad_norm": 4.433243274688721, + "learning_rate": 6.501672240802676e-06, + "loss": 1.3692, + "mean_token_accuracy": 0.6176974773406982, + "num_tokens": 44821.0, + "step": 529 + }, + { + "epoch": 4.693333333333333, + "grad_norm": 3.8135907649993896, + "learning_rate": 6.494983277591974e-06, + "loss": 1.3265, + "mean_token_accuracy": 0.6157056093215942, + "num_tokens": 46384.0, + "step": 530 + }, + { + "epoch": 4.702222222222222, + "grad_norm": 4.359870910644531, + "learning_rate": 6.488294314381271e-06, + "loss": 1.2509, + "mean_token_accuracy": 0.6303499042987823, + "num_tokens": 47908.0, + "step": 531 + }, + { + "epoch": 4.711111111111111, + "grad_norm": 3.892988920211792, + "learning_rate": 6.481605351170569e-06, + "loss": 1.2433, + "mean_token_accuracy": 0.6552886962890625, + "num_tokens": 49489.0, + "step": 532 + }, + { + "epoch": 4.72, + "grad_norm": 4.532709121704102, + "learning_rate": 6.474916387959866e-06, + "loss": 1.3169, + "mean_token_accuracy": 0.6119561195373535, + "num_tokens": 51000.0, + "step": 533 + }, + { + "epoch": 4.728888888888889, + "grad_norm": 4.435358047485352, + "learning_rate": 6.468227424749164e-06, + "loss": 1.3567, + "mean_token_accuracy": 0.6293449103832245, + "num_tokens": 52547.0, + "step": 534 + }, + { + "epoch": 4.737777777777778, + "grad_norm": 4.117359161376953, + "learning_rate": 6.461538461538463e-06, + "loss": 1.3599, + "mean_token_accuracy": 0.6110989451408386, + "num_tokens": 54077.0, + "step": 535 + }, + { + "epoch": 4.746666666666667, + "grad_norm": 4.48085880279541, + "learning_rate": 6.45484949832776e-06, + "loss": 1.2819, + "mean_token_accuracy": 0.6180529892444611, + "num_tokens": 55588.0, + "step": 536 + }, + { + "epoch": 4.7555555555555555, + "grad_norm": 4.002811431884766, + "learning_rate": 6.4481605351170575e-06, + "loss": 1.2773, + "mean_token_accuracy": 0.6259198486804962, + "num_tokens": 57145.0, + "step": 537 + }, + { + "epoch": 4.764444444444445, + "grad_norm": 4.070973873138428, + "learning_rate": 6.4414715719063544e-06, + "loss": 1.2792, + "mean_token_accuracy": 0.6446845531463623, + "num_tokens": 58743.0, + "step": 538 + }, + { + "epoch": 4.773333333333333, + "grad_norm": 4.156578540802002, + "learning_rate": 6.434782608695652e-06, + "loss": 1.2109, + "mean_token_accuracy": 0.6407094597816467, + "num_tokens": 60275.0, + "step": 539 + }, + { + "epoch": 4.782222222222222, + "grad_norm": 4.197425365447998, + "learning_rate": 6.428093645484951e-06, + "loss": 1.2825, + "mean_token_accuracy": 0.6370415985584259, + "num_tokens": 61840.0, + "step": 540 + }, + { + "epoch": 4.791111111111111, + "grad_norm": 3.9540748596191406, + "learning_rate": 6.421404682274248e-06, + "loss": 1.2509, + "mean_token_accuracy": 0.645992249250412, + "num_tokens": 63434.0, + "step": 541 + }, + { + "epoch": 4.8, + "grad_norm": 3.9202404022216797, + "learning_rate": 6.414715719063546e-06, + "loss": 1.3379, + "mean_token_accuracy": 0.6166598200798035, + "num_tokens": 64997.0, + "step": 542 + }, + { + "epoch": 4.808888888888889, + "grad_norm": 4.10667610168457, + "learning_rate": 6.408026755852843e-06, + "loss": 1.2359, + "mean_token_accuracy": 0.6515249311923981, + "num_tokens": 66511.0, + "step": 543 + }, + { + "epoch": 4.817777777777778, + "grad_norm": NaN, + "learning_rate": 6.401337792642141e-06, + "loss": 1.2951, + "mean_token_accuracy": 0.6413204073905945, + "num_tokens": 68093.0, + "step": 544 + }, + { + "epoch": 4.826666666666666, + "grad_norm": 4.252730846405029, + "learning_rate": 6.401337792642141e-06, + "loss": 1.2457, + "mean_token_accuracy": 0.6290826797485352, + "num_tokens": 69628.0, + "step": 545 + }, + { + "epoch": 4.835555555555556, + "grad_norm": 4.309604644775391, + "learning_rate": 6.394648829431439e-06, + "loss": 1.2835, + "mean_token_accuracy": 0.6459629237651825, + "num_tokens": 71223.0, + "step": 546 + }, + { + "epoch": 4.844444444444444, + "grad_norm": 3.9162492752075195, + "learning_rate": 6.387959866220736e-06, + "loss": 1.2173, + "mean_token_accuracy": 0.639301598072052, + "num_tokens": 72796.0, + "step": 547 + }, + { + "epoch": 4.8533333333333335, + "grad_norm": 4.038804531097412, + "learning_rate": 6.381270903010034e-06, + "loss": 1.1941, + "mean_token_accuracy": 0.6442736089229584, + "num_tokens": 74387.0, + "step": 548 + }, + { + "epoch": 4.862222222222222, + "grad_norm": 4.134418487548828, + "learning_rate": 6.374581939799331e-06, + "loss": 1.399, + "mean_token_accuracy": 0.6052152514457703, + "num_tokens": 75990.0, + "step": 549 + }, + { + "epoch": 4.871111111111111, + "grad_norm": 4.522721290588379, + "learning_rate": 6.367892976588629e-06, + "loss": 1.3014, + "mean_token_accuracy": 0.6219512224197388, + "num_tokens": 77525.0, + "step": 550 + }, + { + "epoch": 4.88, + "grad_norm": 4.368188858032227, + "learning_rate": 6.361204013377928e-06, + "loss": 1.1843, + "mean_token_accuracy": 0.629572182893753, + "num_tokens": 79057.0, + "step": 551 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 4.035810947418213, + "learning_rate": 6.354515050167225e-06, + "loss": 1.1227, + "mean_token_accuracy": 0.6539424061775208, + "num_tokens": 80619.0, + "step": 552 + }, + { + "epoch": 4.897777777777778, + "grad_norm": 4.665794849395752, + "learning_rate": 6.3478260869565225e-06, + "loss": 1.2825, + "mean_token_accuracy": 0.6447575390338898, + "num_tokens": 82121.0, + "step": 553 + }, + { + "epoch": 4.906666666666666, + "grad_norm": 3.937727212905884, + "learning_rate": 6.3411371237458195e-06, + "loss": 1.245, + "mean_token_accuracy": 0.6499392986297607, + "num_tokens": 83652.0, + "step": 554 + }, + { + "epoch": 4.915555555555556, + "grad_norm": 4.216758728027344, + "learning_rate": 6.334448160535117e-06, + "loss": 1.3351, + "mean_token_accuracy": 0.6096144616603851, + "num_tokens": 85158.0, + "step": 555 + }, + { + "epoch": 4.924444444444444, + "grad_norm": 4.27169132232666, + "learning_rate": 6.327759197324416e-06, + "loss": 1.2812, + "mean_token_accuracy": 0.6336115896701813, + "num_tokens": 86699.0, + "step": 556 + }, + { + "epoch": 4.933333333333334, + "grad_norm": 4.5901055335998535, + "learning_rate": 6.321070234113713e-06, + "loss": 1.36, + "mean_token_accuracy": 0.6174565851688385, + "num_tokens": 88227.0, + "step": 557 + }, + { + "epoch": 4.942222222222222, + "grad_norm": 4.002968788146973, + "learning_rate": 6.314381270903011e-06, + "loss": 1.2906, + "mean_token_accuracy": 0.6195584237575531, + "num_tokens": 89788.0, + "step": 558 + }, + { + "epoch": 4.9511111111111115, + "grad_norm": 3.9454307556152344, + "learning_rate": 6.307692307692308e-06, + "loss": 1.3148, + "mean_token_accuracy": 0.6103806495666504, + "num_tokens": 91309.0, + "step": 559 + }, + { + "epoch": 4.96, + "grad_norm": 4.503467082977295, + "learning_rate": 6.301003344481606e-06, + "loss": 1.3124, + "mean_token_accuracy": 0.6284722089767456, + "num_tokens": 92824.0, + "step": 560 + }, + { + "epoch": 4.968888888888889, + "grad_norm": 3.7083399295806885, + "learning_rate": 6.294314381270904e-06, + "loss": 1.2111, + "mean_token_accuracy": 0.6747935116291046, + "num_tokens": 94390.0, + "step": 561 + }, + { + "epoch": 4.977777777777778, + "grad_norm": 4.690550804138184, + "learning_rate": 6.287625418060201e-06, + "loss": 1.3954, + "mean_token_accuracy": 0.6270782053470612, + "num_tokens": 95853.0, + "step": 562 + }, + { + "epoch": 4.986666666666666, + "grad_norm": 4.372628688812256, + "learning_rate": 6.280936454849499e-06, + "loss": 1.3963, + "mean_token_accuracy": 0.6323529481887817, + "num_tokens": 97327.0, + "step": 563 + }, + { + "epoch": 4.995555555555556, + "grad_norm": 3.991077423095703, + "learning_rate": 6.274247491638796e-06, + "loss": 1.3497, + "mean_token_accuracy": 0.623641312122345, + "num_tokens": 98882.0, + "step": 564 + }, + { + "epoch": 5.0088888888888885, + "grad_norm": 7.295304298400879, + "learning_rate": 6.267558528428094e-06, + "loss": 2.5627, + "mean_token_accuracy": 0.6528763373692831, + "num_tokens": 101210.0, + "step": 565 + }, + { + "epoch": 5.017777777777778, + "grad_norm": 4.190359115600586, + "learning_rate": 6.260869565217392e-06, + "loss": 1.2468, + "mean_token_accuracy": 0.629037469625473, + "num_tokens": 102764.0, + "step": 566 + }, + { + "epoch": 5.026666666666666, + "grad_norm": 4.111153602600098, + "learning_rate": 6.25418060200669e-06, + "loss": 1.2374, + "mean_token_accuracy": 0.6295638978481293, + "num_tokens": 104295.0, + "step": 567 + }, + { + "epoch": 5.035555555555556, + "grad_norm": 3.803464412689209, + "learning_rate": 6.247491638795987e-06, + "loss": 1.1998, + "mean_token_accuracy": 0.6541315317153931, + "num_tokens": 105887.0, + "step": 568 + }, + { + "epoch": 5.044444444444444, + "grad_norm": 3.8923354148864746, + "learning_rate": 6.240802675585285e-06, + "loss": 1.3511, + "mean_token_accuracy": 0.6365979909896851, + "num_tokens": 107459.0, + "step": 569 + }, + { + "epoch": 5.053333333333334, + "grad_norm": 3.7917885780334473, + "learning_rate": 6.2341137123745825e-06, + "loss": 1.2269, + "mean_token_accuracy": 0.6486060917377472, + "num_tokens": 109007.0, + "step": 570 + }, + { + "epoch": 5.062222222222222, + "grad_norm": 3.799825668334961, + "learning_rate": 6.22742474916388e-06, + "loss": 1.1432, + "mean_token_accuracy": 0.6583755016326904, + "num_tokens": 110560.0, + "step": 571 + }, + { + "epoch": 5.071111111111111, + "grad_norm": 4.5347700119018555, + "learning_rate": 6.220735785953178e-06, + "loss": 1.3539, + "mean_token_accuracy": 0.6043514013290405, + "num_tokens": 112038.0, + "step": 572 + }, + { + "epoch": 5.08, + "grad_norm": 4.88681697845459, + "learning_rate": 6.214046822742475e-06, + "loss": 1.3022, + "mean_token_accuracy": 0.6238621771335602, + "num_tokens": 113543.0, + "step": 573 + }, + { + "epoch": 5.088888888888889, + "grad_norm": 4.141240119934082, + "learning_rate": 6.207357859531773e-06, + "loss": 1.2743, + "mean_token_accuracy": 0.6353206932544708, + "num_tokens": 115050.0, + "step": 574 + }, + { + "epoch": 5.097777777777778, + "grad_norm": 3.584052801132202, + "learning_rate": 6.20066889632107e-06, + "loss": 1.193, + "mean_token_accuracy": 0.6459736824035645, + "num_tokens": 116661.0, + "step": 575 + }, + { + "epoch": 5.1066666666666665, + "grad_norm": 4.8705525398254395, + "learning_rate": 6.193979933110369e-06, + "loss": 1.2227, + "mean_token_accuracy": 0.6656882464885712, + "num_tokens": 118188.0, + "step": 576 + }, + { + "epoch": 5.115555555555556, + "grad_norm": 3.86112380027771, + "learning_rate": 6.1872909698996665e-06, + "loss": 1.2507, + "mean_token_accuracy": 0.6458339691162109, + "num_tokens": 119727.0, + "step": 577 + }, + { + "epoch": 5.124444444444444, + "grad_norm": 4.1094794273376465, + "learning_rate": 6.1806020066889635e-06, + "loss": 1.1176, + "mean_token_accuracy": 0.6778306365013123, + "num_tokens": 121287.0, + "step": 578 + }, + { + "epoch": 5.133333333333334, + "grad_norm": 4.067919731140137, + "learning_rate": 6.173913043478261e-06, + "loss": 1.2438, + "mean_token_accuracy": 0.6293447315692902, + "num_tokens": 122829.0, + "step": 579 + }, + { + "epoch": 5.142222222222222, + "grad_norm": 5.456110000610352, + "learning_rate": 6.167224080267558e-06, + "loss": 1.391, + "mean_token_accuracy": 0.6298533082008362, + "num_tokens": 124346.0, + "step": 580 + }, + { + "epoch": 5.151111111111111, + "grad_norm": 4.137439727783203, + "learning_rate": 6.160535117056856e-06, + "loss": 1.2543, + "mean_token_accuracy": 0.6336300075054169, + "num_tokens": 125853.0, + "step": 581 + }, + { + "epoch": 5.16, + "grad_norm": 4.414316654205322, + "learning_rate": 6.153846153846155e-06, + "loss": 1.2596, + "mean_token_accuracy": 0.6324091553688049, + "num_tokens": 127400.0, + "step": 582 + }, + { + "epoch": 5.168888888888889, + "grad_norm": 4.356647491455078, + "learning_rate": 6.147157190635452e-06, + "loss": 1.3783, + "mean_token_accuracy": 0.6197916865348816, + "num_tokens": 128948.0, + "step": 583 + }, + { + "epoch": 5.177777777777778, + "grad_norm": 4.111170768737793, + "learning_rate": 6.14046822742475e-06, + "loss": 1.2125, + "mean_token_accuracy": 0.6237273514270782, + "num_tokens": 130509.0, + "step": 584 + }, + { + "epoch": 5.1866666666666665, + "grad_norm": 4.040274143218994, + "learning_rate": 6.133779264214047e-06, + "loss": 1.214, + "mean_token_accuracy": 0.6504157781600952, + "num_tokens": 132033.0, + "step": 585 + }, + { + "epoch": 5.195555555555556, + "grad_norm": 4.420424461364746, + "learning_rate": 6.1270903010033445e-06, + "loss": 1.1915, + "mean_token_accuracy": 0.6408601999282837, + "num_tokens": 133572.0, + "step": 586 + }, + { + "epoch": 5.204444444444444, + "grad_norm": 4.098907470703125, + "learning_rate": 6.120401337792643e-06, + "loss": 1.2254, + "mean_token_accuracy": 0.642585277557373, + "num_tokens": 135115.0, + "step": 587 + }, + { + "epoch": 5.213333333333333, + "grad_norm": 4.087829113006592, + "learning_rate": 6.11371237458194e-06, + "loss": 1.3225, + "mean_token_accuracy": 0.6249795258045197, + "num_tokens": 136683.0, + "step": 588 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 3.90519118309021, + "learning_rate": 6.107023411371238e-06, + "loss": 1.1818, + "mean_token_accuracy": 0.6413995921611786, + "num_tokens": 138232.0, + "step": 589 + }, + { + "epoch": 5.231111111111111, + "grad_norm": 4.1083455085754395, + "learning_rate": 6.100334448160535e-06, + "loss": 1.3061, + "mean_token_accuracy": 0.6174591481685638, + "num_tokens": 139827.0, + "step": 590 + }, + { + "epoch": 5.24, + "grad_norm": 4.230836391448975, + "learning_rate": 6.093645484949833e-06, + "loss": 1.3293, + "mean_token_accuracy": 0.6246418058872223, + "num_tokens": 141419.0, + "step": 591 + }, + { + "epoch": 5.248888888888889, + "grad_norm": 4.106869697570801, + "learning_rate": 6.086956521739132e-06, + "loss": 1.2407, + "mean_token_accuracy": 0.6637173891067505, + "num_tokens": 142910.0, + "step": 592 + }, + { + "epoch": 5.257777777777778, + "grad_norm": 4.120201587677002, + "learning_rate": 6.080267558528429e-06, + "loss": 1.2608, + "mean_token_accuracy": 0.6491403579711914, + "num_tokens": 144457.0, + "step": 593 + }, + { + "epoch": 5.266666666666667, + "grad_norm": 4.03704309463501, + "learning_rate": 6.0735785953177264e-06, + "loss": 1.178, + "mean_token_accuracy": 0.6414922773838043, + "num_tokens": 145973.0, + "step": 594 + }, + { + "epoch": 5.275555555555556, + "grad_norm": 4.515911102294922, + "learning_rate": 6.0668896321070234e-06, + "loss": 1.2299, + "mean_token_accuracy": 0.6331446170806885, + "num_tokens": 147512.0, + "step": 595 + }, + { + "epoch": 5.2844444444444445, + "grad_norm": 4.2795844078063965, + "learning_rate": 6.060200668896321e-06, + "loss": 1.3121, + "mean_token_accuracy": 0.6276523768901825, + "num_tokens": 149037.0, + "step": 596 + }, + { + "epoch": 5.293333333333333, + "grad_norm": 4.322146415710449, + "learning_rate": 6.05351170568562e-06, + "loss": 1.2658, + "mean_token_accuracy": 0.6295925080776215, + "num_tokens": 150548.0, + "step": 597 + }, + { + "epoch": 5.302222222222222, + "grad_norm": 4.351261138916016, + "learning_rate": 6.046822742474917e-06, + "loss": 1.2351, + "mean_token_accuracy": 0.6303271353244781, + "num_tokens": 152059.0, + "step": 598 + }, + { + "epoch": 5.311111111111111, + "grad_norm": 4.588160514831543, + "learning_rate": 6.040133779264215e-06, + "loss": 1.1391, + "mean_token_accuracy": 0.6651667952537537, + "num_tokens": 153701.0, + "step": 599 + }, + { + "epoch": 5.32, + "grad_norm": 4.943595886230469, + "learning_rate": 6.033444816053512e-06, + "loss": 1.1975, + "mean_token_accuracy": 0.6209216415882111, + "num_tokens": 155217.0, + "step": 600 + }, + { + "epoch": 5.328888888888889, + "grad_norm": 4.235236644744873, + "learning_rate": 6.02675585284281e-06, + "loss": 1.2048, + "mean_token_accuracy": 0.6525147557258606, + "num_tokens": 156785.0, + "step": 601 + }, + { + "epoch": 5.337777777777778, + "grad_norm": 4.236960411071777, + "learning_rate": 6.020066889632108e-06, + "loss": 1.2781, + "mean_token_accuracy": 0.6383876800537109, + "num_tokens": 158365.0, + "step": 602 + }, + { + "epoch": 5.346666666666667, + "grad_norm": 4.114459037780762, + "learning_rate": 6.013377926421405e-06, + "loss": 1.2313, + "mean_token_accuracy": 0.6331909596920013, + "num_tokens": 159944.0, + "step": 603 + }, + { + "epoch": 5.355555555555555, + "grad_norm": 4.426380634307861, + "learning_rate": 6.006688963210703e-06, + "loss": 1.2827, + "mean_token_accuracy": 0.6368323564529419, + "num_tokens": 161502.0, + "step": 604 + }, + { + "epoch": 5.364444444444445, + "grad_norm": 4.068392753601074, + "learning_rate": 6e-06, + "loss": 1.1406, + "mean_token_accuracy": 0.6722408533096313, + "num_tokens": 163036.0, + "step": 605 + }, + { + "epoch": 5.373333333333333, + "grad_norm": 4.264810085296631, + "learning_rate": 5.993311036789298e-06, + "loss": 1.2388, + "mean_token_accuracy": 0.6601577997207642, + "num_tokens": 164570.0, + "step": 606 + }, + { + "epoch": 5.3822222222222225, + "grad_norm": 3.9750025272369385, + "learning_rate": 5.986622073578597e-06, + "loss": 1.2014, + "mean_token_accuracy": 0.6377952992916107, + "num_tokens": 166156.0, + "step": 607 + }, + { + "epoch": 5.391111111111111, + "grad_norm": 3.9109015464782715, + "learning_rate": 5.979933110367894e-06, + "loss": 1.1777, + "mean_token_accuracy": 0.6708051264286041, + "num_tokens": 167759.0, + "step": 608 + }, + { + "epoch": 5.4, + "grad_norm": 4.409512996673584, + "learning_rate": 5.9732441471571915e-06, + "loss": 1.1556, + "mean_token_accuracy": 0.6488099992275238, + "num_tokens": 169339.0, + "step": 609 + }, + { + "epoch": 5.408888888888889, + "grad_norm": 4.582766532897949, + "learning_rate": 5.9665551839464885e-06, + "loss": 1.2372, + "mean_token_accuracy": 0.6393229067325592, + "num_tokens": 170874.0, + "step": 610 + }, + { + "epoch": 5.417777777777777, + "grad_norm": 4.071080207824707, + "learning_rate": 5.959866220735786e-06, + "loss": 1.1096, + "mean_token_accuracy": 0.6595380008220673, + "num_tokens": 172454.0, + "step": 611 + }, + { + "epoch": 5.426666666666667, + "grad_norm": 4.4920654296875, + "learning_rate": 5.953177257525084e-06, + "loss": 1.2282, + "mean_token_accuracy": 0.6174688637256622, + "num_tokens": 173983.0, + "step": 612 + }, + { + "epoch": 5.435555555555555, + "grad_norm": 4.128040790557861, + "learning_rate": 5.946488294314382e-06, + "loss": 1.22, + "mean_token_accuracy": 0.6388939321041107, + "num_tokens": 175542.0, + "step": 613 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 4.672006607055664, + "learning_rate": 5.939799331103679e-06, + "loss": 1.2463, + "mean_token_accuracy": 0.6465741395950317, + "num_tokens": 177117.0, + "step": 614 + }, + { + "epoch": 5.453333333333333, + "grad_norm": 4.382131576538086, + "learning_rate": 5.933110367892977e-06, + "loss": 1.337, + "mean_token_accuracy": 0.615509420633316, + "num_tokens": 178669.0, + "step": 615 + }, + { + "epoch": 5.4622222222222225, + "grad_norm": 4.524114608764648, + "learning_rate": 5.926421404682275e-06, + "loss": 1.2395, + "mean_token_accuracy": 0.6426366567611694, + "num_tokens": 180228.0, + "step": 616 + }, + { + "epoch": 5.471111111111111, + "grad_norm": 4.475776195526123, + "learning_rate": 5.9197324414715726e-06, + "loss": 1.1837, + "mean_token_accuracy": 0.6443540751934052, + "num_tokens": 181760.0, + "step": 617 + }, + { + "epoch": 5.48, + "grad_norm": 4.360227584838867, + "learning_rate": 5.91304347826087e-06, + "loss": 1.2527, + "mean_token_accuracy": 0.6177384257316589, + "num_tokens": 183243.0, + "step": 618 + }, + { + "epoch": 5.488888888888889, + "grad_norm": 4.36024284362793, + "learning_rate": 5.906354515050167e-06, + "loss": 1.2085, + "mean_token_accuracy": 0.6363393366336823, + "num_tokens": 184756.0, + "step": 619 + }, + { + "epoch": 5.497777777777777, + "grad_norm": 4.680853366851807, + "learning_rate": 5.899665551839465e-06, + "loss": 1.2881, + "mean_token_accuracy": 0.6234374642372131, + "num_tokens": 186274.0, + "step": 620 + }, + { + "epoch": 5.506666666666667, + "grad_norm": 4.460485458374023, + "learning_rate": 5.892976588628762e-06, + "loss": 1.1701, + "mean_token_accuracy": 0.6644229888916016, + "num_tokens": 187800.0, + "step": 621 + }, + { + "epoch": 5.515555555555555, + "grad_norm": 3.931915044784546, + "learning_rate": 5.886287625418061e-06, + "loss": 1.1491, + "mean_token_accuracy": 0.6583350300788879, + "num_tokens": 189334.0, + "step": 622 + }, + { + "epoch": 5.524444444444445, + "grad_norm": 4.149975299835205, + "learning_rate": 5.879598662207359e-06, + "loss": 1.297, + "mean_token_accuracy": 0.6326058208942413, + "num_tokens": 190897.0, + "step": 623 + }, + { + "epoch": 5.533333333333333, + "grad_norm": 4.198488712310791, + "learning_rate": 5.872909698996656e-06, + "loss": 1.1743, + "mean_token_accuracy": 0.6488493978977203, + "num_tokens": 192439.0, + "step": 624 + }, + { + "epoch": 5.542222222222223, + "grad_norm": 4.188075065612793, + "learning_rate": 5.866220735785954e-06, + "loss": 1.2826, + "mean_token_accuracy": 0.6318497955799103, + "num_tokens": 193994.0, + "step": 625 + }, + { + "epoch": 5.551111111111111, + "grad_norm": 4.831582069396973, + "learning_rate": 5.859531772575251e-06, + "loss": 1.2571, + "mean_token_accuracy": 0.6433433890342712, + "num_tokens": 195513.0, + "step": 626 + }, + { + "epoch": 5.5600000000000005, + "grad_norm": 3.9233953952789307, + "learning_rate": 5.852842809364549e-06, + "loss": 1.1908, + "mean_token_accuracy": 0.664311558008194, + "num_tokens": 197024.0, + "step": 627 + }, + { + "epoch": 5.568888888888889, + "grad_norm": 4.321683406829834, + "learning_rate": 5.846153846153847e-06, + "loss": 1.2502, + "mean_token_accuracy": 0.6159590482711792, + "num_tokens": 198569.0, + "step": 628 + }, + { + "epoch": 5.5777777777777775, + "grad_norm": 4.1210761070251465, + "learning_rate": 5.839464882943144e-06, + "loss": 1.21, + "mean_token_accuracy": 0.6426295936107635, + "num_tokens": 200157.0, + "step": 629 + }, + { + "epoch": 5.586666666666667, + "grad_norm": 3.8421363830566406, + "learning_rate": 5.832775919732442e-06, + "loss": 1.0951, + "mean_token_accuracy": 0.6531431972980499, + "num_tokens": 201779.0, + "step": 630 + }, + { + "epoch": 5.595555555555555, + "grad_norm": 4.534753799438477, + "learning_rate": 5.826086956521739e-06, + "loss": 1.2102, + "mean_token_accuracy": 0.6474778056144714, + "num_tokens": 203406.0, + "step": 631 + }, + { + "epoch": 5.604444444444445, + "grad_norm": 4.2284321784973145, + "learning_rate": 5.819397993311037e-06, + "loss": 1.107, + "mean_token_accuracy": 0.6628619730472565, + "num_tokens": 204991.0, + "step": 632 + }, + { + "epoch": 5.613333333333333, + "grad_norm": 4.557707786560059, + "learning_rate": 5.8127090301003355e-06, + "loss": 1.2902, + "mean_token_accuracy": 0.6160265803337097, + "num_tokens": 206527.0, + "step": 633 + }, + { + "epoch": 5.622222222222222, + "grad_norm": 4.258055686950684, + "learning_rate": 5.8060200668896325e-06, + "loss": 1.2132, + "mean_token_accuracy": 0.6164488196372986, + "num_tokens": 208046.0, + "step": 634 + }, + { + "epoch": 5.631111111111111, + "grad_norm": 4.232338905334473, + "learning_rate": 5.79933110367893e-06, + "loss": 1.1602, + "mean_token_accuracy": 0.6611541509628296, + "num_tokens": 209639.0, + "step": 635 + }, + { + "epoch": 5.64, + "grad_norm": 3.9534757137298584, + "learning_rate": 5.792642140468227e-06, + "loss": 1.1348, + "mean_token_accuracy": 0.6628827452659607, + "num_tokens": 211194.0, + "step": 636 + }, + { + "epoch": 5.648888888888889, + "grad_norm": 5.0057878494262695, + "learning_rate": 5.785953177257525e-06, + "loss": 1.1817, + "mean_token_accuracy": 0.672100841999054, + "num_tokens": 212667.0, + "step": 637 + }, + { + "epoch": 5.657777777777778, + "grad_norm": 4.1638922691345215, + "learning_rate": 5.779264214046824e-06, + "loss": 1.093, + "mean_token_accuracy": 0.6614958941936493, + "num_tokens": 214202.0, + "step": 638 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 4.188668727874756, + "learning_rate": 5.772575250836121e-06, + "loss": 1.262, + "mean_token_accuracy": 0.6617647111415863, + "num_tokens": 215837.0, + "step": 639 + }, + { + "epoch": 5.6755555555555555, + "grad_norm": 4.181295871734619, + "learning_rate": 5.765886287625419e-06, + "loss": 1.1686, + "mean_token_accuracy": 0.6633462011814117, + "num_tokens": 217397.0, + "step": 640 + }, + { + "epoch": 5.684444444444445, + "grad_norm": 3.8542087078094482, + "learning_rate": 5.759197324414716e-06, + "loss": 1.1666, + "mean_token_accuracy": 0.6421281099319458, + "num_tokens": 218930.0, + "step": 641 + }, + { + "epoch": 5.693333333333333, + "grad_norm": 4.476701736450195, + "learning_rate": 5.7525083612040135e-06, + "loss": 1.3186, + "mean_token_accuracy": 0.6084637641906738, + "num_tokens": 220447.0, + "step": 642 + }, + { + "epoch": 5.702222222222222, + "grad_norm": 4.52183723449707, + "learning_rate": 5.745819397993312e-06, + "loss": 1.3288, + "mean_token_accuracy": 0.6371418237686157, + "num_tokens": 221958.0, + "step": 643 + }, + { + "epoch": 5.711111111111111, + "grad_norm": 4.62246036529541, + "learning_rate": 5.739130434782609e-06, + "loss": 1.2808, + "mean_token_accuracy": 0.6417193412780762, + "num_tokens": 223498.0, + "step": 644 + }, + { + "epoch": 5.72, + "grad_norm": 4.085021495819092, + "learning_rate": 5.732441471571907e-06, + "loss": 1.2044, + "mean_token_accuracy": 0.6430547833442688, + "num_tokens": 225073.0, + "step": 645 + }, + { + "epoch": 5.728888888888889, + "grad_norm": 4.501692295074463, + "learning_rate": 5.725752508361204e-06, + "loss": 1.1555, + "mean_token_accuracy": 0.6684712469577789, + "num_tokens": 226609.0, + "step": 646 + }, + { + "epoch": 5.737777777777778, + "grad_norm": 4.564600467681885, + "learning_rate": 5.719063545150502e-06, + "loss": 1.2416, + "mean_token_accuracy": 0.6314437389373779, + "num_tokens": 228121.0, + "step": 647 + }, + { + "epoch": 5.746666666666667, + "grad_norm": 4.364714622497559, + "learning_rate": 5.712374581939801e-06, + "loss": 1.3068, + "mean_token_accuracy": 0.6219372451305389, + "num_tokens": 229657.0, + "step": 648 + }, + { + "epoch": 5.7555555555555555, + "grad_norm": 4.248340129852295, + "learning_rate": 5.7056856187290976e-06, + "loss": 1.2671, + "mean_token_accuracy": 0.6305710971355438, + "num_tokens": 231184.0, + "step": 649 + }, + { + "epoch": 5.764444444444445, + "grad_norm": 3.9247777462005615, + "learning_rate": 5.698996655518395e-06, + "loss": 1.252, + "mean_token_accuracy": 0.6101377308368683, + "num_tokens": 232689.0, + "step": 650 + }, + { + "epoch": 5.773333333333333, + "grad_norm": 4.409743309020996, + "learning_rate": 5.692307692307692e-06, + "loss": 1.1701, + "mean_token_accuracy": 0.6548322141170502, + "num_tokens": 234284.0, + "step": 651 + }, + { + "epoch": 5.782222222222222, + "grad_norm": 4.686461925506592, + "learning_rate": 5.68561872909699e-06, + "loss": 1.2203, + "mean_token_accuracy": 0.619014173746109, + "num_tokens": 235874.0, + "step": 652 + }, + { + "epoch": 5.791111111111111, + "grad_norm": 4.42437744140625, + "learning_rate": 5.678929765886289e-06, + "loss": 1.1638, + "mean_token_accuracy": 0.6503566205501556, + "num_tokens": 237389.0, + "step": 653 + }, + { + "epoch": 5.8, + "grad_norm": 4.1741485595703125, + "learning_rate": 5.672240802675586e-06, + "loss": 1.1604, + "mean_token_accuracy": 0.6509364247322083, + "num_tokens": 238931.0, + "step": 654 + }, + { + "epoch": 5.808888888888889, + "grad_norm": 4.342599868774414, + "learning_rate": 5.665551839464884e-06, + "loss": 1.1447, + "mean_token_accuracy": 0.6514750421047211, + "num_tokens": 240415.0, + "step": 655 + }, + { + "epoch": 5.817777777777778, + "grad_norm": 4.110108852386475, + "learning_rate": 5.658862876254181e-06, + "loss": 1.1451, + "mean_token_accuracy": 0.6781052052974701, + "num_tokens": 241944.0, + "step": 656 + }, + { + "epoch": 5.826666666666666, + "grad_norm": 4.355100631713867, + "learning_rate": 5.652173913043479e-06, + "loss": 1.188, + "mean_token_accuracy": 0.6333755850791931, + "num_tokens": 243487.0, + "step": 657 + }, + { + "epoch": 5.835555555555556, + "grad_norm": 4.403768062591553, + "learning_rate": 5.6454849498327765e-06, + "loss": 1.2466, + "mean_token_accuracy": 0.6301897466182709, + "num_tokens": 245010.0, + "step": 658 + }, + { + "epoch": 5.844444444444444, + "grad_norm": 4.161632061004639, + "learning_rate": 5.638795986622074e-06, + "loss": 1.1966, + "mean_token_accuracy": 0.6503784954547882, + "num_tokens": 246541.0, + "step": 659 + }, + { + "epoch": 5.8533333333333335, + "grad_norm": 4.5803937911987305, + "learning_rate": 5.632107023411372e-06, + "loss": 1.2214, + "mean_token_accuracy": 0.6395663917064667, + "num_tokens": 248080.0, + "step": 660 + }, + { + "epoch": 5.862222222222222, + "grad_norm": 4.173155784606934, + "learning_rate": 5.625418060200669e-06, + "loss": 1.1929, + "mean_token_accuracy": 0.6691378355026245, + "num_tokens": 249636.0, + "step": 661 + }, + { + "epoch": 5.871111111111111, + "grad_norm": 4.352691173553467, + "learning_rate": 5.618729096989967e-06, + "loss": 1.2175, + "mean_token_accuracy": 0.6241460144519806, + "num_tokens": 251210.0, + "step": 662 + }, + { + "epoch": 5.88, + "grad_norm": 4.927140712738037, + "learning_rate": 5.612040133779265e-06, + "loss": 1.2145, + "mean_token_accuracy": 0.647426426410675, + "num_tokens": 252689.0, + "step": 663 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 4.043613433837891, + "learning_rate": 5.605351170568563e-06, + "loss": 1.179, + "mean_token_accuracy": 0.6637146174907684, + "num_tokens": 254246.0, + "step": 664 + }, + { + "epoch": 5.897777777777778, + "grad_norm": 4.199939250946045, + "learning_rate": 5.59866220735786e-06, + "loss": 1.1967, + "mean_token_accuracy": 0.6548939347267151, + "num_tokens": 255769.0, + "step": 665 + }, + { + "epoch": 5.906666666666666, + "grad_norm": 4.14693021774292, + "learning_rate": 5.5919732441471575e-06, + "loss": 1.1661, + "mean_token_accuracy": 0.6620028913021088, + "num_tokens": 257339.0, + "step": 666 + }, + { + "epoch": 5.915555555555556, + "grad_norm": 4.456506252288818, + "learning_rate": 5.585284280936455e-06, + "loss": 1.1093, + "mean_token_accuracy": 0.6862540245056152, + "num_tokens": 258860.0, + "step": 667 + }, + { + "epoch": 5.924444444444444, + "grad_norm": 4.283112049102783, + "learning_rate": 5.578595317725753e-06, + "loss": 1.1933, + "mean_token_accuracy": 0.6490034759044647, + "num_tokens": 260422.0, + "step": 668 + }, + { + "epoch": 5.933333333333334, + "grad_norm": 4.610235214233398, + "learning_rate": 5.571906354515051e-06, + "loss": 1.2086, + "mean_token_accuracy": 0.6420519351959229, + "num_tokens": 262001.0, + "step": 669 + }, + { + "epoch": 5.942222222222222, + "grad_norm": 4.3906145095825195, + "learning_rate": 5.565217391304348e-06, + "loss": 1.2567, + "mean_token_accuracy": 0.622428297996521, + "num_tokens": 263499.0, + "step": 670 + }, + { + "epoch": 5.9511111111111115, + "grad_norm": 4.747217655181885, + "learning_rate": 5.558528428093646e-06, + "loss": 1.1363, + "mean_token_accuracy": 0.6563035249710083, + "num_tokens": 264986.0, + "step": 671 + }, + { + "epoch": 5.96, + "grad_norm": 4.199408054351807, + "learning_rate": 5.551839464882943e-06, + "loss": 1.2, + "mean_token_accuracy": 0.6685831844806671, + "num_tokens": 266508.0, + "step": 672 + }, + { + "epoch": 5.968888888888889, + "grad_norm": 5.767821311950684, + "learning_rate": 5.5451505016722415e-06, + "loss": 1.257, + "mean_token_accuracy": 0.6445925235748291, + "num_tokens": 268051.0, + "step": 673 + }, + { + "epoch": 5.977777777777778, + "grad_norm": 4.904174327850342, + "learning_rate": 5.538461538461539e-06, + "loss": 1.2187, + "mean_token_accuracy": 0.6507235467433929, + "num_tokens": 269570.0, + "step": 674 + }, + { + "epoch": 5.986666666666666, + "grad_norm": 4.289644241333008, + "learning_rate": 5.531772575250836e-06, + "loss": 1.106, + "mean_token_accuracy": 0.6613070070743561, + "num_tokens": 271095.0, + "step": 675 + }, + { + "epoch": 5.995555555555556, + "grad_norm": 4.620111465454102, + "learning_rate": 5.525083612040134e-06, + "loss": 1.2303, + "mean_token_accuracy": 0.6419282853603363, + "num_tokens": 272608.0, + "step": 676 + }, + { + "epoch": 6.0, + "grad_norm": 6.861684322357178, + "learning_rate": 5.518394648829431e-06, + "loss": 1.2369, + "mean_token_accuracy": 0.6349693536758423, + "num_tokens": 273321.0, + "step": 677 + }, + { + "epoch": 6.0088888888888885, + "grad_norm": 4.33583927154541, + "learning_rate": 5.51170568561873e-06, + "loss": 1.1206, + "mean_token_accuracy": 0.6615579426288605, + "num_tokens": 274832.0, + "step": 678 + }, + { + "epoch": 6.017777777777778, + "grad_norm": 4.612186431884766, + "learning_rate": 5.505016722408028e-06, + "loss": 1.1967, + "mean_token_accuracy": 0.6452172100543976, + "num_tokens": 276386.0, + "step": 679 + }, + { + "epoch": 6.026666666666666, + "grad_norm": 4.441816329956055, + "learning_rate": 5.498327759197325e-06, + "loss": 1.3114, + "mean_token_accuracy": 0.6355763375759125, + "num_tokens": 277896.0, + "step": 680 + }, + { + "epoch": 6.035555555555556, + "grad_norm": 4.62359619140625, + "learning_rate": 5.491638795986623e-06, + "loss": 1.1578, + "mean_token_accuracy": 0.640273928642273, + "num_tokens": 279384.0, + "step": 681 + }, + { + "epoch": 6.044444444444444, + "grad_norm": 4.2924675941467285, + "learning_rate": 5.48494983277592e-06, + "loss": 1.1838, + "mean_token_accuracy": 0.6701782941818237, + "num_tokens": 280907.0, + "step": 682 + }, + { + "epoch": 6.053333333333334, + "grad_norm": 4.137370586395264, + "learning_rate": 5.478260869565217e-06, + "loss": 1.1644, + "mean_token_accuracy": 0.6513003408908844, + "num_tokens": 282490.0, + "step": 683 + }, + { + "epoch": 6.062222222222222, + "grad_norm": 4.4451584815979, + "learning_rate": 5.471571906354516e-06, + "loss": 1.2093, + "mean_token_accuracy": 0.6403338313102722, + "num_tokens": 284025.0, + "step": 684 + }, + { + "epoch": 6.071111111111111, + "grad_norm": 4.592154026031494, + "learning_rate": 5.464882943143813e-06, + "loss": 1.2078, + "mean_token_accuracy": 0.6282210052013397, + "num_tokens": 285556.0, + "step": 685 + }, + { + "epoch": 6.08, + "grad_norm": 4.92061710357666, + "learning_rate": 5.458193979933111e-06, + "loss": 1.1976, + "mean_token_accuracy": 0.6441406607627869, + "num_tokens": 287106.0, + "step": 686 + }, + { + "epoch": 6.088888888888889, + "grad_norm": 4.2925705909729, + "learning_rate": 5.451505016722408e-06, + "loss": 1.2471, + "mean_token_accuracy": 0.6352337300777435, + "num_tokens": 288660.0, + "step": 687 + }, + { + "epoch": 6.097777777777778, + "grad_norm": 4.945430755615234, + "learning_rate": 5.444816053511706e-06, + "loss": 1.2283, + "mean_token_accuracy": 0.6231025457382202, + "num_tokens": 290195.0, + "step": 688 + }, + { + "epoch": 6.1066666666666665, + "grad_norm": 4.435353755950928, + "learning_rate": 5.4381270903010045e-06, + "loss": 1.1725, + "mean_token_accuracy": 0.6580179035663605, + "num_tokens": 291724.0, + "step": 689 + }, + { + "epoch": 6.115555555555556, + "grad_norm": 4.419246673583984, + "learning_rate": 5.4314381270903015e-06, + "loss": 1.1222, + "mean_token_accuracy": 0.6618131697177887, + "num_tokens": 293232.0, + "step": 690 + }, + { + "epoch": 6.124444444444444, + "grad_norm": 4.438714504241943, + "learning_rate": 5.424749163879599e-06, + "loss": 1.1212, + "mean_token_accuracy": 0.6635273993015289, + "num_tokens": 294756.0, + "step": 691 + }, + { + "epoch": 6.133333333333334, + "grad_norm": 4.837860584259033, + "learning_rate": 5.418060200668896e-06, + "loss": 1.2358, + "mean_token_accuracy": 0.6516101658344269, + "num_tokens": 296351.0, + "step": 692 + }, + { + "epoch": 6.142222222222222, + "grad_norm": 4.669703960418701, + "learning_rate": 5.411371237458194e-06, + "loss": 1.254, + "mean_token_accuracy": 0.6333885788917542, + "num_tokens": 297877.0, + "step": 693 + }, + { + "epoch": 6.151111111111111, + "grad_norm": 4.55410623550415, + "learning_rate": 5.404682274247493e-06, + "loss": 1.1861, + "mean_token_accuracy": 0.6504701972007751, + "num_tokens": 299388.0, + "step": 694 + }, + { + "epoch": 6.16, + "grad_norm": 4.345452308654785, + "learning_rate": 5.39799331103679e-06, + "loss": 1.1769, + "mean_token_accuracy": 0.6571119129657745, + "num_tokens": 300910.0, + "step": 695 + }, + { + "epoch": 6.168888888888889, + "grad_norm": 4.2362141609191895, + "learning_rate": 5.391304347826088e-06, + "loss": 1.2114, + "mean_token_accuracy": 0.6493647694587708, + "num_tokens": 302438.0, + "step": 696 + }, + { + "epoch": 6.177777777777778, + "grad_norm": 4.663865566253662, + "learning_rate": 5.384615384615385e-06, + "loss": 1.2973, + "mean_token_accuracy": 0.6381103694438934, + "num_tokens": 304012.0, + "step": 697 + }, + { + "epoch": 6.1866666666666665, + "grad_norm": 4.230501174926758, + "learning_rate": 5.3779264214046825e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6913765370845795, + "num_tokens": 305564.0, + "step": 698 + }, + { + "epoch": 6.195555555555556, + "grad_norm": 4.471275329589844, + "learning_rate": 5.371237458193981e-06, + "loss": 1.2286, + "mean_token_accuracy": 0.641329288482666, + "num_tokens": 307083.0, + "step": 699 + }, + { + "epoch": 6.204444444444444, + "grad_norm": 4.411542892456055, + "learning_rate": 5.364548494983278e-06, + "loss": 1.1575, + "mean_token_accuracy": 0.6555075943470001, + "num_tokens": 308611.0, + "step": 700 + }, + { + "epoch": 6.213333333333333, + "grad_norm": 3.9822497367858887, + "learning_rate": 5.357859531772576e-06, + "loss": 1.1521, + "mean_token_accuracy": 0.6389129757881165, + "num_tokens": 310180.0, + "step": 701 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 4.320405006408691, + "learning_rate": 5.351170568561873e-06, + "loss": 1.1731, + "mean_token_accuracy": 0.6548811197280884, + "num_tokens": 311715.0, + "step": 702 + }, + { + "epoch": 6.231111111111111, + "grad_norm": 4.436611175537109, + "learning_rate": 5.344481605351171e-06, + "loss": 1.1129, + "mean_token_accuracy": 0.6702597439289093, + "num_tokens": 313292.0, + "step": 703 + }, + { + "epoch": 6.24, + "grad_norm": 5.164817810058594, + "learning_rate": 5.337792642140469e-06, + "loss": 1.1789, + "mean_token_accuracy": 0.6648919880390167, + "num_tokens": 314821.0, + "step": 704 + }, + { + "epoch": 6.248888888888889, + "grad_norm": 4.132184028625488, + "learning_rate": 5.3311036789297666e-06, + "loss": 1.1531, + "mean_token_accuracy": 0.6697083413600922, + "num_tokens": 316330.0, + "step": 705 + }, + { + "epoch": 6.257777777777778, + "grad_norm": 4.565079212188721, + "learning_rate": 5.324414715719064e-06, + "loss": 1.1172, + "mean_token_accuracy": 0.6742570996284485, + "num_tokens": 317867.0, + "step": 706 + }, + { + "epoch": 6.266666666666667, + "grad_norm": 5.1129255294799805, + "learning_rate": 5.317725752508361e-06, + "loss": 1.2677, + "mean_token_accuracy": 0.6145256757736206, + "num_tokens": 319349.0, + "step": 707 + }, + { + "epoch": 6.275555555555556, + "grad_norm": 4.7777276039123535, + "learning_rate": 5.311036789297659e-06, + "loss": 1.2887, + "mean_token_accuracy": 0.6110173463821411, + "num_tokens": 320877.0, + "step": 708 + }, + { + "epoch": 6.2844444444444445, + "grad_norm": 4.717259883880615, + "learning_rate": 5.304347826086957e-06, + "loss": 1.2111, + "mean_token_accuracy": 0.6434732377529144, + "num_tokens": 322384.0, + "step": 709 + }, + { + "epoch": 6.293333333333333, + "grad_norm": 4.740537643432617, + "learning_rate": 5.297658862876255e-06, + "loss": 1.1147, + "mean_token_accuracy": 0.6747602224349976, + "num_tokens": 323937.0, + "step": 710 + }, + { + "epoch": 6.302222222222222, + "grad_norm": 4.441349029541016, + "learning_rate": 5.290969899665552e-06, + "loss": 1.2029, + "mean_token_accuracy": 0.6509633362293243, + "num_tokens": 325500.0, + "step": 711 + }, + { + "epoch": 6.311111111111111, + "grad_norm": 4.91001033782959, + "learning_rate": 5.28428093645485e-06, + "loss": 1.1438, + "mean_token_accuracy": 0.6630813479423523, + "num_tokens": 327050.0, + "step": 712 + }, + { + "epoch": 6.32, + "grad_norm": 4.40542459487915, + "learning_rate": 5.277591973244148e-06, + "loss": 1.1391, + "mean_token_accuracy": 0.6568433344364166, + "num_tokens": 328542.0, + "step": 713 + }, + { + "epoch": 6.328888888888889, + "grad_norm": 5.083597183227539, + "learning_rate": 5.2709030100334454e-06, + "loss": 1.2937, + "mean_token_accuracy": 0.6237309575080872, + "num_tokens": 330044.0, + "step": 714 + }, + { + "epoch": 6.337777777777778, + "grad_norm": 4.431397438049316, + "learning_rate": 5.264214046822743e-06, + "loss": 1.154, + "mean_token_accuracy": 0.6710568964481354, + "num_tokens": 331567.0, + "step": 715 + }, + { + "epoch": 6.346666666666667, + "grad_norm": 4.679903984069824, + "learning_rate": 5.25752508361204e-06, + "loss": 1.2591, + "mean_token_accuracy": 0.6364468038082123, + "num_tokens": 333160.0, + "step": 716 + }, + { + "epoch": 6.355555555555555, + "grad_norm": 4.453300476074219, + "learning_rate": 5.250836120401338e-06, + "loss": 1.1238, + "mean_token_accuracy": 0.6860308051109314, + "num_tokens": 334716.0, + "step": 717 + }, + { + "epoch": 6.364444444444445, + "grad_norm": 4.235513687133789, + "learning_rate": 5.244147157190635e-06, + "loss": 1.1819, + "mean_token_accuracy": 0.6361779868602753, + "num_tokens": 336256.0, + "step": 718 + }, + { + "epoch": 6.373333333333333, + "grad_norm": 4.488588333129883, + "learning_rate": 5.237458193979934e-06, + "loss": 1.2059, + "mean_token_accuracy": 0.6572637259960175, + "num_tokens": 337824.0, + "step": 719 + }, + { + "epoch": 6.3822222222222225, + "grad_norm": 4.299254894256592, + "learning_rate": 5.230769230769232e-06, + "loss": 1.1022, + "mean_token_accuracy": 0.6514302492141724, + "num_tokens": 339397.0, + "step": 720 + }, + { + "epoch": 6.391111111111111, + "grad_norm": 4.424536228179932, + "learning_rate": 5.224080267558529e-06, + "loss": 1.1912, + "mean_token_accuracy": 0.6588321328163147, + "num_tokens": 340891.0, + "step": 721 + }, + { + "epoch": 6.4, + "grad_norm": 4.400254249572754, + "learning_rate": 5.2173913043478265e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6789169013500214, + "num_tokens": 342492.0, + "step": 722 + }, + { + "epoch": 6.408888888888889, + "grad_norm": 4.184225082397461, + "learning_rate": 5.2107023411371235e-06, + "loss": 1.1392, + "mean_token_accuracy": 0.6490438878536224, + "num_tokens": 343998.0, + "step": 723 + }, + { + "epoch": 6.417777777777777, + "grad_norm": 4.579216957092285, + "learning_rate": 5.204013377926422e-06, + "loss": 1.1945, + "mean_token_accuracy": 0.6473316550254822, + "num_tokens": 345527.0, + "step": 724 + }, + { + "epoch": 6.426666666666667, + "grad_norm": 4.637413501739502, + "learning_rate": 5.19732441471572e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6627026498317719, + "num_tokens": 347101.0, + "step": 725 + }, + { + "epoch": 6.435555555555555, + "grad_norm": 4.445194721221924, + "learning_rate": 5.190635451505017e-06, + "loss": 1.2057, + "mean_token_accuracy": 0.6304130852222443, + "num_tokens": 348605.0, + "step": 726 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 4.261415481567383, + "learning_rate": 5.183946488294315e-06, + "loss": 1.1103, + "mean_token_accuracy": 0.6645594835281372, + "num_tokens": 350193.0, + "step": 727 + }, + { + "epoch": 6.453333333333333, + "grad_norm": 4.703232288360596, + "learning_rate": 5.177257525083612e-06, + "loss": 1.2506, + "mean_token_accuracy": 0.6516353785991669, + "num_tokens": 351721.0, + "step": 728 + }, + { + "epoch": 6.4622222222222225, + "grad_norm": 5.103692531585693, + "learning_rate": 5.1705685618729105e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6963317394256592, + "num_tokens": 353176.0, + "step": 729 + }, + { + "epoch": 6.471111111111111, + "grad_norm": 4.698752403259277, + "learning_rate": 5.163879598662208e-06, + "loss": 1.168, + "mean_token_accuracy": 0.6384732127189636, + "num_tokens": 354748.0, + "step": 730 + }, + { + "epoch": 6.48, + "grad_norm": 5.011492729187012, + "learning_rate": 5.157190635451505e-06, + "loss": 1.113, + "mean_token_accuracy": 0.675061047077179, + "num_tokens": 356264.0, + "step": 731 + }, + { + "epoch": 6.488888888888889, + "grad_norm": 5.266847610473633, + "learning_rate": 5.150501672240803e-06, + "loss": 1.212, + "mean_token_accuracy": 0.6491404175758362, + "num_tokens": 357798.0, + "step": 732 + }, + { + "epoch": 6.497777777777777, + "grad_norm": 5.363094329833984, + "learning_rate": 5.1438127090301e-06, + "loss": 1.1811, + "mean_token_accuracy": 0.640788733959198, + "num_tokens": 359303.0, + "step": 733 + }, + { + "epoch": 6.506666666666667, + "grad_norm": 5.235933780670166, + "learning_rate": 5.137123745819398e-06, + "loss": 1.1161, + "mean_token_accuracy": 0.6741294860839844, + "num_tokens": 360914.0, + "step": 734 + }, + { + "epoch": 6.515555555555555, + "grad_norm": 4.5590434074401855, + "learning_rate": 5.130434782608697e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.6580885648727417, + "num_tokens": 362414.0, + "step": 735 + }, + { + "epoch": 6.524444444444445, + "grad_norm": 4.886003017425537, + "learning_rate": 5.123745819397994e-06, + "loss": 1.2197, + "mean_token_accuracy": 0.6403879523277283, + "num_tokens": 364001.0, + "step": 736 + }, + { + "epoch": 6.533333333333333, + "grad_norm": 4.724174499511719, + "learning_rate": 5.1170568561872916e-06, + "loss": 1.1172, + "mean_token_accuracy": 0.6844411790370941, + "num_tokens": 365515.0, + "step": 737 + }, + { + "epoch": 6.542222222222223, + "grad_norm": 5.063113212585449, + "learning_rate": 5.1103678929765886e-06, + "loss": 1.2381, + "mean_token_accuracy": 0.6328524649143219, + "num_tokens": 367039.0, + "step": 738 + }, + { + "epoch": 6.551111111111111, + "grad_norm": 5.1623711585998535, + "learning_rate": 5.103678929765886e-06, + "loss": 1.2277, + "mean_token_accuracy": 0.6545201241970062, + "num_tokens": 368571.0, + "step": 739 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 4.488753795623779, + "learning_rate": 5.096989966555185e-06, + "loss": 1.1577, + "mean_token_accuracy": 0.6652779877185822, + "num_tokens": 370078.0, + "step": 740 + }, + { + "epoch": 6.568888888888889, + "grad_norm": 5.227181911468506, + "learning_rate": 5.090301003344482e-06, + "loss": 1.2633, + "mean_token_accuracy": 0.633996844291687, + "num_tokens": 371567.0, + "step": 741 + }, + { + "epoch": 6.5777777777777775, + "grad_norm": 4.520918846130371, + "learning_rate": 5.08361204013378e-06, + "loss": 1.1256, + "mean_token_accuracy": 0.683332622051239, + "num_tokens": 373085.0, + "step": 742 + }, + { + "epoch": 6.586666666666667, + "grad_norm": 4.61983585357666, + "learning_rate": 5.076923076923077e-06, + "loss": 1.1536, + "mean_token_accuracy": 0.641717940568924, + "num_tokens": 374578.0, + "step": 743 + }, + { + "epoch": 6.595555555555555, + "grad_norm": 4.451761245727539, + "learning_rate": 5.070234113712375e-06, + "loss": 1.1153, + "mean_token_accuracy": 0.6684437990188599, + "num_tokens": 376086.0, + "step": 744 + }, + { + "epoch": 6.604444444444445, + "grad_norm": 5.2474212646484375, + "learning_rate": 5.0635451505016735e-06, + "loss": 1.1945, + "mean_token_accuracy": 0.670863926410675, + "num_tokens": 377649.0, + "step": 745 + }, + { + "epoch": 6.613333333333333, + "grad_norm": 4.630691051483154, + "learning_rate": 5.0568561872909704e-06, + "loss": 1.1528, + "mean_token_accuracy": 0.6571419537067413, + "num_tokens": 379225.0, + "step": 746 + }, + { + "epoch": 6.622222222222222, + "grad_norm": 4.767591953277588, + "learning_rate": 5.050167224080268e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6682968437671661, + "num_tokens": 380753.0, + "step": 747 + }, + { + "epoch": 6.631111111111111, + "grad_norm": 4.507701873779297, + "learning_rate": 5.043478260869565e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6710818111896515, + "num_tokens": 382321.0, + "step": 748 + }, + { + "epoch": 6.64, + "grad_norm": 4.915844917297363, + "learning_rate": 5.036789297658863e-06, + "loss": 1.1316, + "mean_token_accuracy": 0.6446280777454376, + "num_tokens": 383825.0, + "step": 749 + }, + { + "epoch": 6.648888888888889, + "grad_norm": 4.822279453277588, + "learning_rate": 5.030100334448161e-06, + "loss": 1.1983, + "mean_token_accuracy": 0.6473309397697449, + "num_tokens": 385369.0, + "step": 750 + }, + { + "epoch": 6.657777777777778, + "grad_norm": 4.585779190063477, + "learning_rate": 5.023411371237459e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6787306666374207, + "num_tokens": 386966.0, + "step": 751 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 5.084322929382324, + "learning_rate": 5.016722408026757e-06, + "loss": 1.2536, + "mean_token_accuracy": 0.6180808842182159, + "num_tokens": 388442.0, + "step": 752 + }, + { + "epoch": 6.6755555555555555, + "grad_norm": 4.926344871520996, + "learning_rate": 5.010033444816054e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6696269512176514, + "num_tokens": 389997.0, + "step": 753 + }, + { + "epoch": 6.684444444444445, + "grad_norm": 4.653207778930664, + "learning_rate": 5.0033444816053515e-06, + "loss": 1.1334, + "mean_token_accuracy": 0.6817471086978912, + "num_tokens": 391523.0, + "step": 754 + }, + { + "epoch": 6.693333333333333, + "grad_norm": 4.806591033935547, + "learning_rate": 4.996655518394649e-06, + "loss": 1.1597, + "mean_token_accuracy": 0.6599297821521759, + "num_tokens": 393096.0, + "step": 755 + }, + { + "epoch": 6.702222222222222, + "grad_norm": 4.988717079162598, + "learning_rate": 4.989966555183947e-06, + "loss": 1.16, + "mean_token_accuracy": 0.6670804023742676, + "num_tokens": 394639.0, + "step": 756 + }, + { + "epoch": 6.711111111111111, + "grad_norm": 4.501280307769775, + "learning_rate": 4.983277591973244e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7011318802833557, + "num_tokens": 396235.0, + "step": 757 + }, + { + "epoch": 6.72, + "grad_norm": 4.68208646774292, + "learning_rate": 4.976588628762542e-06, + "loss": 1.0807, + "mean_token_accuracy": 0.6642267405986786, + "num_tokens": 397728.0, + "step": 758 + }, + { + "epoch": 6.728888888888889, + "grad_norm": 5.056075572967529, + "learning_rate": 4.96989966555184e-06, + "loss": 1.1719, + "mean_token_accuracy": 0.6560480296611786, + "num_tokens": 399223.0, + "step": 759 + }, + { + "epoch": 6.737777777777778, + "grad_norm": 5.287487983703613, + "learning_rate": 4.963210702341138e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6819011867046356, + "num_tokens": 400749.0, + "step": 760 + }, + { + "epoch": 6.746666666666667, + "grad_norm": 4.856434345245361, + "learning_rate": 4.9565217391304355e-06, + "loss": 1.2041, + "mean_token_accuracy": 0.6568415760993958, + "num_tokens": 402312.0, + "step": 761 + }, + { + "epoch": 6.7555555555555555, + "grad_norm": 4.789765357971191, + "learning_rate": 4.9498327759197325e-06, + "loss": 1.1548, + "mean_token_accuracy": 0.6776265799999237, + "num_tokens": 403812.0, + "step": 762 + }, + { + "epoch": 6.764444444444445, + "grad_norm": 4.778657913208008, + "learning_rate": 4.94314381270903e-06, + "loss": 1.1461, + "mean_token_accuracy": 0.6648118793964386, + "num_tokens": 405341.0, + "step": 763 + }, + { + "epoch": 6.773333333333333, + "grad_norm": 4.769364356994629, + "learning_rate": 4.936454849498328e-06, + "loss": 1.0987, + "mean_token_accuracy": 0.6716397106647491, + "num_tokens": 406901.0, + "step": 764 + }, + { + "epoch": 6.782222222222222, + "grad_norm": 4.781235694885254, + "learning_rate": 4.929765886287626e-06, + "loss": 1.2039, + "mean_token_accuracy": 0.6387063264846802, + "num_tokens": 408458.0, + "step": 765 + }, + { + "epoch": 6.791111111111111, + "grad_norm": 4.616097927093506, + "learning_rate": 4.923076923076924e-06, + "loss": 1.0781, + "mean_token_accuracy": 0.689809650182724, + "num_tokens": 409967.0, + "step": 766 + }, + { + "epoch": 6.8, + "grad_norm": 4.806463718414307, + "learning_rate": 4.916387959866221e-06, + "loss": 1.1162, + "mean_token_accuracy": 0.6813206076622009, + "num_tokens": 411480.0, + "step": 767 + }, + { + "epoch": 6.808888888888889, + "grad_norm": 4.712216377258301, + "learning_rate": 4.909698996655519e-06, + "loss": 1.18, + "mean_token_accuracy": 0.6471198201179504, + "num_tokens": 413028.0, + "step": 768 + }, + { + "epoch": 6.817777777777778, + "grad_norm": 4.705795764923096, + "learning_rate": 4.9030100334448166e-06, + "loss": 1.122, + "mean_token_accuracy": 0.6501418948173523, + "num_tokens": 414592.0, + "step": 769 + }, + { + "epoch": 6.826666666666666, + "grad_norm": 4.957315921783447, + "learning_rate": 4.896321070234114e-06, + "loss": 1.1423, + "mean_token_accuracy": 0.6794840395450592, + "num_tokens": 416152.0, + "step": 770 + }, + { + "epoch": 6.835555555555556, + "grad_norm": 4.869872093200684, + "learning_rate": 4.889632107023411e-06, + "loss": 1.1703, + "mean_token_accuracy": 0.6651597917079926, + "num_tokens": 417756.0, + "step": 771 + }, + { + "epoch": 6.844444444444444, + "grad_norm": 5.3918070793151855, + "learning_rate": 4.882943143812709e-06, + "loss": 1.3262, + "mean_token_accuracy": 0.6076781451702118, + "num_tokens": 419285.0, + "step": 772 + }, + { + "epoch": 6.8533333333333335, + "grad_norm": 4.708622932434082, + "learning_rate": 4.876254180602007e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6709949672222137, + "num_tokens": 420873.0, + "step": 773 + }, + { + "epoch": 6.862222222222222, + "grad_norm": 5.08357048034668, + "learning_rate": 4.869565217391305e-06, + "loss": 1.0975, + "mean_token_accuracy": 0.6804331243038177, + "num_tokens": 422434.0, + "step": 774 + }, + { + "epoch": 6.871111111111111, + "grad_norm": 5.003035545349121, + "learning_rate": 4.862876254180603e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.6827142536640167, + "num_tokens": 424020.0, + "step": 775 + }, + { + "epoch": 6.88, + "grad_norm": 4.372951984405518, + "learning_rate": 4.8561872909699e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.7075498998165131, + "num_tokens": 425587.0, + "step": 776 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 4.540833473205566, + "learning_rate": 4.849498327759198e-06, + "loss": 1.113, + "mean_token_accuracy": 0.6746594905853271, + "num_tokens": 427103.0, + "step": 777 + }, + { + "epoch": 6.897777777777778, + "grad_norm": 5.006860733032227, + "learning_rate": 4.8428093645484955e-06, + "loss": 1.1008, + "mean_token_accuracy": 0.6714694797992706, + "num_tokens": 428635.0, + "step": 778 + }, + { + "epoch": 6.906666666666666, + "grad_norm": 4.806639671325684, + "learning_rate": 4.836120401337793e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6855470240116119, + "num_tokens": 430165.0, + "step": 779 + }, + { + "epoch": 6.915555555555556, + "grad_norm": 4.436448097229004, + "learning_rate": 4.82943143812709e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.6945804059505463, + "num_tokens": 431712.0, + "step": 780 + }, + { + "epoch": 6.924444444444444, + "grad_norm": 4.592042446136475, + "learning_rate": 4.822742474916388e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.7016887366771698, + "num_tokens": 433281.0, + "step": 781 + }, + { + "epoch": 6.933333333333334, + "grad_norm": 4.449767589569092, + "learning_rate": 4.816053511705686e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.7014623582363129, + "num_tokens": 434796.0, + "step": 782 + }, + { + "epoch": 6.942222222222222, + "grad_norm": 5.081116199493408, + "learning_rate": 4.809364548494984e-06, + "loss": 1.1519, + "mean_token_accuracy": 0.6471051275730133, + "num_tokens": 436366.0, + "step": 783 + }, + { + "epoch": 6.9511111111111115, + "grad_norm": 4.829915523529053, + "learning_rate": 4.802675585284282e-06, + "loss": 1.0921, + "mean_token_accuracy": 0.6669144928455353, + "num_tokens": 437891.0, + "step": 784 + }, + { + "epoch": 6.96, + "grad_norm": 5.671187400817871, + "learning_rate": 4.795986622073579e-06, + "loss": 1.2612, + "mean_token_accuracy": 0.6515063345432281, + "num_tokens": 439448.0, + "step": 785 + }, + { + "epoch": 6.968888888888889, + "grad_norm": 4.963412761688232, + "learning_rate": 4.7892976588628765e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6779084205627441, + "num_tokens": 441033.0, + "step": 786 + }, + { + "epoch": 6.977777777777778, + "grad_norm": 5.4057793617248535, + "learning_rate": 4.782608695652174e-06, + "loss": 1.2167, + "mean_token_accuracy": 0.645590215921402, + "num_tokens": 442609.0, + "step": 787 + }, + { + "epoch": 6.986666666666666, + "grad_norm": 5.4399943351745605, + "learning_rate": 4.775919732441472e-06, + "loss": 1.2728, + "mean_token_accuracy": 0.6314329206943512, + "num_tokens": 444152.0, + "step": 788 + }, + { + "epoch": 6.995555555555556, + "grad_norm": 4.750269412994385, + "learning_rate": 4.76923076923077e-06, + "loss": 1.14, + "mean_token_accuracy": 0.6701004505157471, + "num_tokens": 445675.0, + "step": 789 + }, + { + "epoch": 7.0, + "grad_norm": 6.860434532165527, + "learning_rate": 4.762541806020067e-06, + "loss": 1.1372, + "mean_token_accuracy": 0.6571428775787354, + "num_tokens": 446441.0, + "step": 790 + }, + { + "epoch": 7.0088888888888885, + "grad_norm": 4.965756416320801, + "learning_rate": 4.755852842809365e-06, + "loss": 1.1062, + "mean_token_accuracy": 0.6704318225383759, + "num_tokens": 448022.0, + "step": 791 + }, + { + "epoch": 7.017777777777778, + "grad_norm": 5.081864833831787, + "learning_rate": 4.749163879598663e-06, + "loss": 1.1165, + "mean_token_accuracy": 0.6706746518611908, + "num_tokens": 449549.0, + "step": 792 + }, + { + "epoch": 7.026666666666666, + "grad_norm": 4.8764166831970215, + "learning_rate": 4.7424749163879605e-06, + "loss": 1.1227, + "mean_token_accuracy": 0.6908602118492126, + "num_tokens": 451101.0, + "step": 793 + }, + { + "epoch": 7.035555555555556, + "grad_norm": 5.3034210205078125, + "learning_rate": 4.7357859531772575e-06, + "loss": 1.1562, + "mean_token_accuracy": 0.6468882858753204, + "num_tokens": 452631.0, + "step": 794 + }, + { + "epoch": 7.044444444444444, + "grad_norm": 4.8361945152282715, + "learning_rate": 4.729096989966555e-06, + "loss": 1.1521, + "mean_token_accuracy": 0.6653201878070831, + "num_tokens": 454158.0, + "step": 795 + }, + { + "epoch": 7.053333333333334, + "grad_norm": 4.605897426605225, + "learning_rate": 4.722408026755853e-06, + "loss": 1.115, + "mean_token_accuracy": 0.6745364665985107, + "num_tokens": 455693.0, + "step": 796 + }, + { + "epoch": 7.062222222222222, + "grad_norm": 5.250539302825928, + "learning_rate": 4.715719063545151e-06, + "loss": 1.1686, + "mean_token_accuracy": 0.6480132341384888, + "num_tokens": 457210.0, + "step": 797 + }, + { + "epoch": 7.071111111111111, + "grad_norm": 4.881685733795166, + "learning_rate": 4.709030100334449e-06, + "loss": 1.1477, + "mean_token_accuracy": 0.6660578548908234, + "num_tokens": 458742.0, + "step": 798 + }, + { + "epoch": 7.08, + "grad_norm": 4.9617533683776855, + "learning_rate": 4.702341137123746e-06, + "loss": 1.0788, + "mean_token_accuracy": 0.6666490435600281, + "num_tokens": 460317.0, + "step": 799 + }, + { + "epoch": 7.088888888888889, + "grad_norm": 5.095200061798096, + "learning_rate": 4.695652173913044e-06, + "loss": 1.1311, + "mean_token_accuracy": 0.6378019452095032, + "num_tokens": 461893.0, + "step": 800 + }, + { + "epoch": 7.097777777777778, + "grad_norm": 4.970204830169678, + "learning_rate": 4.688963210702342e-06, + "loss": 1.1918, + "mean_token_accuracy": 0.6388539373874664, + "num_tokens": 463394.0, + "step": 801 + }, + { + "epoch": 7.1066666666666665, + "grad_norm": 4.652441024780273, + "learning_rate": 4.6822742474916394e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6657119989395142, + "num_tokens": 464975.0, + "step": 802 + }, + { + "epoch": 7.115555555555556, + "grad_norm": 4.983111381530762, + "learning_rate": 4.675585284280936e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.6800532042980194, + "num_tokens": 466551.0, + "step": 803 + }, + { + "epoch": 7.124444444444444, + "grad_norm": 5.1220784187316895, + "learning_rate": 4.668896321070234e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6691431403160095, + "num_tokens": 468100.0, + "step": 804 + }, + { + "epoch": 7.133333333333334, + "grad_norm": 5.042264938354492, + "learning_rate": 4.662207357859532e-06, + "loss": 1.0955, + "mean_token_accuracy": 0.6806917190551758, + "num_tokens": 469624.0, + "step": 805 + }, + { + "epoch": 7.142222222222222, + "grad_norm": 5.108158111572266, + "learning_rate": 4.65551839464883e-06, + "loss": 1.1752, + "mean_token_accuracy": 0.6473954021930695, + "num_tokens": 471178.0, + "step": 806 + }, + { + "epoch": 7.151111111111111, + "grad_norm": 5.127073287963867, + "learning_rate": 4.648829431438128e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6867430210113525, + "num_tokens": 472715.0, + "step": 807 + }, + { + "epoch": 7.16, + "grad_norm": 4.905478477478027, + "learning_rate": 4.642140468227425e-06, + "loss": 1.1311, + "mean_token_accuracy": 0.6759839355945587, + "num_tokens": 474280.0, + "step": 808 + }, + { + "epoch": 7.168888888888889, + "grad_norm": 4.752647876739502, + "learning_rate": 4.635451505016723e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6817436218261719, + "num_tokens": 475841.0, + "step": 809 + }, + { + "epoch": 7.177777777777778, + "grad_norm": 4.907838821411133, + "learning_rate": 4.6287625418060205e-06, + "loss": 1.1068, + "mean_token_accuracy": 0.6646086275577545, + "num_tokens": 477354.0, + "step": 810 + }, + { + "epoch": 7.1866666666666665, + "grad_norm": 5.139395236968994, + "learning_rate": 4.622073578595318e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6775968670845032, + "num_tokens": 478882.0, + "step": 811 + }, + { + "epoch": 7.195555555555556, + "grad_norm": 5.534365653991699, + "learning_rate": 4.615384615384616e-06, + "loss": 1.2055, + "mean_token_accuracy": 0.6239286661148071, + "num_tokens": 480491.0, + "step": 812 + }, + { + "epoch": 7.204444444444444, + "grad_norm": 4.87898063659668, + "learning_rate": 4.608695652173913e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6747311949729919, + "num_tokens": 482062.0, + "step": 813 + }, + { + "epoch": 7.213333333333333, + "grad_norm": 4.996034145355225, + "learning_rate": 4.602006688963211e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.6727764308452606, + "num_tokens": 483613.0, + "step": 814 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 5.057877063751221, + "learning_rate": 4.595317725752509e-06, + "loss": 1.126, + "mean_token_accuracy": 0.6610957086086273, + "num_tokens": 485187.0, + "step": 815 + }, + { + "epoch": 7.231111111111111, + "grad_norm": 5.159642219543457, + "learning_rate": 4.588628762541807e-06, + "loss": 1.1236, + "mean_token_accuracy": 0.6579912006855011, + "num_tokens": 486760.0, + "step": 816 + }, + { + "epoch": 7.24, + "grad_norm": 5.221027851104736, + "learning_rate": 4.581939799331104e-06, + "loss": 1.1366, + "mean_token_accuracy": 0.6620135605335236, + "num_tokens": 488352.0, + "step": 817 + }, + { + "epoch": 7.248888888888889, + "grad_norm": 4.7972612380981445, + "learning_rate": 4.5752508361204015e-06, + "loss": 1.093, + "mean_token_accuracy": 0.6596427857875824, + "num_tokens": 489918.0, + "step": 818 + }, + { + "epoch": 7.257777777777778, + "grad_norm": 4.837949752807617, + "learning_rate": 4.568561872909699e-06, + "loss": 1.094, + "mean_token_accuracy": 0.6713178753852844, + "num_tokens": 491455.0, + "step": 819 + }, + { + "epoch": 7.266666666666667, + "grad_norm": 4.915107250213623, + "learning_rate": 4.561872909698997e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.678695410490036, + "num_tokens": 493003.0, + "step": 820 + }, + { + "epoch": 7.275555555555556, + "grad_norm": 4.963907241821289, + "learning_rate": 4.555183946488295e-06, + "loss": 1.1096, + "mean_token_accuracy": 0.6531919240951538, + "num_tokens": 494561.0, + "step": 821 + }, + { + "epoch": 7.2844444444444445, + "grad_norm": 5.253054141998291, + "learning_rate": 4.548494983277592e-06, + "loss": 1.1647, + "mean_token_accuracy": 0.6553089320659637, + "num_tokens": 496077.0, + "step": 822 + }, + { + "epoch": 7.293333333333333, + "grad_norm": 4.672346115112305, + "learning_rate": 4.54180602006689e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7008708417415619, + "num_tokens": 497677.0, + "step": 823 + }, + { + "epoch": 7.302222222222222, + "grad_norm": 4.6910200119018555, + "learning_rate": 4.535117056856188e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6844610869884491, + "num_tokens": 499206.0, + "step": 824 + }, + { + "epoch": 7.311111111111111, + "grad_norm": 5.06258487701416, + "learning_rate": 4.5284280936454856e-06, + "loss": 1.1766, + "mean_token_accuracy": 0.6332085132598877, + "num_tokens": 500757.0, + "step": 825 + }, + { + "epoch": 7.32, + "grad_norm": 5.221124172210693, + "learning_rate": 4.5217391304347826e-06, + "loss": 1.1613, + "mean_token_accuracy": 0.6643394529819489, + "num_tokens": 502327.0, + "step": 826 + }, + { + "epoch": 7.328888888888889, + "grad_norm": 4.98481559753418, + "learning_rate": 4.51505016722408e-06, + "loss": 1.176, + "mean_token_accuracy": 0.634590208530426, + "num_tokens": 503865.0, + "step": 827 + }, + { + "epoch": 7.337777777777778, + "grad_norm": 5.355236530303955, + "learning_rate": 4.508361204013378e-06, + "loss": 1.1152, + "mean_token_accuracy": 0.6839827001094818, + "num_tokens": 505405.0, + "step": 828 + }, + { + "epoch": 7.346666666666667, + "grad_norm": 5.237786769866943, + "learning_rate": 4.501672240802676e-06, + "loss": 1.0962, + "mean_token_accuracy": 0.6735359132289886, + "num_tokens": 506969.0, + "step": 829 + }, + { + "epoch": 7.355555555555555, + "grad_norm": 5.832341194152832, + "learning_rate": 4.494983277591974e-06, + "loss": 1.1082, + "mean_token_accuracy": 0.6676413118839264, + "num_tokens": 508442.0, + "step": 830 + }, + { + "epoch": 7.364444444444445, + "grad_norm": 5.237814426422119, + "learning_rate": 4.488294314381271e-06, + "loss": 1.1523, + "mean_token_accuracy": 0.6546165943145752, + "num_tokens": 509967.0, + "step": 831 + }, + { + "epoch": 7.373333333333333, + "grad_norm": 5.053372859954834, + "learning_rate": 4.481605351170569e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6775301694869995, + "num_tokens": 511525.0, + "step": 832 + }, + { + "epoch": 7.3822222222222225, + "grad_norm": 5.421652793884277, + "learning_rate": 4.474916387959867e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6759259402751923, + "num_tokens": 513080.0, + "step": 833 + }, + { + "epoch": 7.391111111111111, + "grad_norm": 5.246650218963623, + "learning_rate": 4.4682274247491644e-06, + "loss": 1.1455, + "mean_token_accuracy": 0.6497251391410828, + "num_tokens": 514582.0, + "step": 834 + }, + { + "epoch": 7.4, + "grad_norm": 5.782858371734619, + "learning_rate": 4.461538461538462e-06, + "loss": 1.1372, + "mean_token_accuracy": 0.6657707393169403, + "num_tokens": 516106.0, + "step": 835 + }, + { + "epoch": 7.408888888888889, + "grad_norm": 5.068062782287598, + "learning_rate": 4.454849498327759e-06, + "loss": 1.0998, + "mean_token_accuracy": 0.6968516409397125, + "num_tokens": 517620.0, + "step": 836 + }, + { + "epoch": 7.417777777777777, + "grad_norm": 5.271042346954346, + "learning_rate": 4.448160535117057e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6802702844142914, + "num_tokens": 519166.0, + "step": 837 + }, + { + "epoch": 7.426666666666667, + "grad_norm": 5.751661777496338, + "learning_rate": 4.441471571906355e-06, + "loss": 1.2133, + "mean_token_accuracy": 0.6435826420783997, + "num_tokens": 520688.0, + "step": 838 + }, + { + "epoch": 7.435555555555555, + "grad_norm": 5.209582805633545, + "learning_rate": 4.434782608695653e-06, + "loss": 1.067, + "mean_token_accuracy": 0.6820244193077087, + "num_tokens": 522195.0, + "step": 839 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 5.173656463623047, + "learning_rate": 4.428093645484951e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6729772984981537, + "num_tokens": 523815.0, + "step": 840 + }, + { + "epoch": 7.453333333333333, + "grad_norm": 5.213850975036621, + "learning_rate": 4.421404682274248e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6957837045192719, + "num_tokens": 525363.0, + "step": 841 + }, + { + "epoch": 7.4622222222222225, + "grad_norm": 5.766268730163574, + "learning_rate": 4.4147157190635455e-06, + "loss": 1.1252, + "mean_token_accuracy": 0.6593060493469238, + "num_tokens": 526842.0, + "step": 842 + }, + { + "epoch": 7.471111111111111, + "grad_norm": 5.768586158752441, + "learning_rate": 4.408026755852843e-06, + "loss": 1.1674, + "mean_token_accuracy": 0.6498867273330688, + "num_tokens": 528381.0, + "step": 843 + }, + { + "epoch": 7.48, + "grad_norm": 6.050495147705078, + "learning_rate": 4.401337792642141e-06, + "loss": 1.1203, + "mean_token_accuracy": 0.6873879730701447, + "num_tokens": 529906.0, + "step": 844 + }, + { + "epoch": 7.488888888888889, + "grad_norm": 5.316295146942139, + "learning_rate": 4.394648829431438e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6799847781658173, + "num_tokens": 531421.0, + "step": 845 + }, + { + "epoch": 7.497777777777777, + "grad_norm": 5.562624454498291, + "learning_rate": 4.387959866220736e-06, + "loss": 1.118, + "mean_token_accuracy": 0.6705078780651093, + "num_tokens": 532947.0, + "step": 846 + }, + { + "epoch": 7.506666666666667, + "grad_norm": 5.198367595672607, + "learning_rate": 4.381270903010034e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6822245419025421, + "num_tokens": 534461.0, + "step": 847 + }, + { + "epoch": 7.515555555555555, + "grad_norm": 5.307058334350586, + "learning_rate": 4.374581939799332e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6818973124027252, + "num_tokens": 536045.0, + "step": 848 + }, + { + "epoch": 7.524444444444445, + "grad_norm": 5.2752532958984375, + "learning_rate": 4.367892976588629e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6859411001205444, + "num_tokens": 537624.0, + "step": 849 + }, + { + "epoch": 7.533333333333333, + "grad_norm": 5.355432987213135, + "learning_rate": 4.3612040133779265e-06, + "loss": 1.0758, + "mean_token_accuracy": 0.6870485842227936, + "num_tokens": 539147.0, + "step": 850 + }, + { + "epoch": 7.542222222222223, + "grad_norm": 5.21906852722168, + "learning_rate": 4.354515050167224e-06, + "loss": 1.116, + "mean_token_accuracy": 0.6499757170677185, + "num_tokens": 540661.0, + "step": 851 + }, + { + "epoch": 7.551111111111111, + "grad_norm": 5.896401882171631, + "learning_rate": 4.347826086956522e-06, + "loss": 1.2131, + "mean_token_accuracy": 0.6473860144615173, + "num_tokens": 542145.0, + "step": 852 + }, + { + "epoch": 7.5600000000000005, + "grad_norm": 5.899242401123047, + "learning_rate": 4.34113712374582e-06, + "loss": 1.0986, + "mean_token_accuracy": 0.6826995611190796, + "num_tokens": 543737.0, + "step": 853 + }, + { + "epoch": 7.568888888888889, + "grad_norm": 5.382544040679932, + "learning_rate": 4.334448160535117e-06, + "loss": 1.0652, + "mean_token_accuracy": 0.6944388151168823, + "num_tokens": 545346.0, + "step": 854 + }, + { + "epoch": 7.5777777777777775, + "grad_norm": 5.309659957885742, + "learning_rate": 4.327759197324415e-06, + "loss": 1.0879, + "mean_token_accuracy": 0.673027753829956, + "num_tokens": 546930.0, + "step": 855 + }, + { + "epoch": 7.586666666666667, + "grad_norm": 5.420487880706787, + "learning_rate": 4.321070234113713e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6797861158847809, + "num_tokens": 548454.0, + "step": 856 + }, + { + "epoch": 7.595555555555555, + "grad_norm": 5.470422744750977, + "learning_rate": 4.3143812709030106e-06, + "loss": 1.138, + "mean_token_accuracy": 0.6494588851928711, + "num_tokens": 549986.0, + "step": 857 + }, + { + "epoch": 7.604444444444445, + "grad_norm": 5.255749702453613, + "learning_rate": 4.307692307692308e-06, + "loss": 1.107, + "mean_token_accuracy": 0.6578380763530731, + "num_tokens": 551538.0, + "step": 858 + }, + { + "epoch": 7.613333333333333, + "grad_norm": 5.558720111846924, + "learning_rate": 4.301003344481605e-06, + "loss": 1.1225, + "mean_token_accuracy": 0.6527324020862579, + "num_tokens": 553067.0, + "step": 859 + }, + { + "epoch": 7.622222222222222, + "grad_norm": 5.80465030670166, + "learning_rate": 4.294314381270903e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.674045205116272, + "num_tokens": 554572.0, + "step": 860 + }, + { + "epoch": 7.631111111111111, + "grad_norm": 4.932501792907715, + "learning_rate": 4.287625418060201e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.67153000831604, + "num_tokens": 556149.0, + "step": 861 + }, + { + "epoch": 7.64, + "grad_norm": 5.582308292388916, + "learning_rate": 4.280936454849499e-06, + "loss": 1.1622, + "mean_token_accuracy": 0.652256190776825, + "num_tokens": 557695.0, + "step": 862 + }, + { + "epoch": 7.648888888888889, + "grad_norm": 5.571846961975098, + "learning_rate": 4.274247491638797e-06, + "loss": 1.1842, + "mean_token_accuracy": 0.6774209141731262, + "num_tokens": 559241.0, + "step": 863 + }, + { + "epoch": 7.657777777777778, + "grad_norm": 5.913292407989502, + "learning_rate": 4.267558528428094e-06, + "loss": 1.1343, + "mean_token_accuracy": 0.6666834354400635, + "num_tokens": 560786.0, + "step": 864 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 5.50427770614624, + "learning_rate": 4.260869565217392e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6892899572849274, + "num_tokens": 562316.0, + "step": 865 + }, + { + "epoch": 7.6755555555555555, + "grad_norm": 5.476339817047119, + "learning_rate": 4.2541806020066895e-06, + "loss": 1.1455, + "mean_token_accuracy": 0.6631506383419037, + "num_tokens": 563775.0, + "step": 866 + }, + { + "epoch": 7.684444444444445, + "grad_norm": 5.270669460296631, + "learning_rate": 4.247491638795987e-06, + "loss": 1.1175, + "mean_token_accuracy": 0.661677747964859, + "num_tokens": 565378.0, + "step": 867 + }, + { + "epoch": 7.693333333333333, + "grad_norm": 5.447734355926514, + "learning_rate": 4.240802675585284e-06, + "loss": 1.1643, + "mean_token_accuracy": 0.6277681887149811, + "num_tokens": 566914.0, + "step": 868 + }, + { + "epoch": 7.702222222222222, + "grad_norm": 5.398086071014404, + "learning_rate": 4.234113712374582e-06, + "loss": 1.1199, + "mean_token_accuracy": 0.6719484329223633, + "num_tokens": 568434.0, + "step": 869 + }, + { + "epoch": 7.711111111111111, + "grad_norm": 5.813198089599609, + "learning_rate": 4.22742474916388e-06, + "loss": 1.1885, + "mean_token_accuracy": 0.6275551021099091, + "num_tokens": 569936.0, + "step": 870 + }, + { + "epoch": 7.72, + "grad_norm": 5.76833963394165, + "learning_rate": 4.220735785953178e-06, + "loss": 1.078, + "mean_token_accuracy": 0.6729241013526917, + "num_tokens": 571449.0, + "step": 871 + }, + { + "epoch": 7.728888888888889, + "grad_norm": 5.4217753410339355, + "learning_rate": 4.214046822742475e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.675014466047287, + "num_tokens": 572935.0, + "step": 872 + }, + { + "epoch": 7.737777777777778, + "grad_norm": 5.16042947769165, + "learning_rate": 4.207357859531773e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6860963106155396, + "num_tokens": 574513.0, + "step": 873 + }, + { + "epoch": 7.746666666666667, + "grad_norm": 5.511298179626465, + "learning_rate": 4.2006688963210705e-06, + "loss": 1.1285, + "mean_token_accuracy": 0.6621744930744171, + "num_tokens": 576102.0, + "step": 874 + }, + { + "epoch": 7.7555555555555555, + "grad_norm": 5.41949987411499, + "learning_rate": 4.193979933110368e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7015989720821381, + "num_tokens": 577591.0, + "step": 875 + }, + { + "epoch": 7.764444444444445, + "grad_norm": 5.362825393676758, + "learning_rate": 4.187290969899666e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6902236640453339, + "num_tokens": 579162.0, + "step": 876 + }, + { + "epoch": 7.773333333333333, + "grad_norm": 5.219437122344971, + "learning_rate": 4.180602006688963e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6877565383911133, + "num_tokens": 580721.0, + "step": 877 + }, + { + "epoch": 7.782222222222222, + "grad_norm": 5.75254487991333, + "learning_rate": 4.173913043478261e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.707065612077713, + "num_tokens": 582217.0, + "step": 878 + }, + { + "epoch": 7.791111111111111, + "grad_norm": 5.665853500366211, + "learning_rate": 4.167224080267559e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6803156137466431, + "num_tokens": 583732.0, + "step": 879 + }, + { + "epoch": 7.8, + "grad_norm": 5.3993000984191895, + "learning_rate": 4.160535117056857e-06, + "loss": 1.1064, + "mean_token_accuracy": 0.6648494005203247, + "num_tokens": 585300.0, + "step": 880 + }, + { + "epoch": 7.808888888888889, + "grad_norm": 6.3109636306762695, + "learning_rate": 4.1538461538461545e-06, + "loss": 1.1448, + "mean_token_accuracy": 0.6615542769432068, + "num_tokens": 586860.0, + "step": 881 + }, + { + "epoch": 7.817777777777778, + "grad_norm": 5.708662509918213, + "learning_rate": 4.1471571906354515e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6654264628887177, + "num_tokens": 588423.0, + "step": 882 + }, + { + "epoch": 7.826666666666666, + "grad_norm": 5.324418544769287, + "learning_rate": 4.140468227424749e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6788971424102783, + "num_tokens": 589980.0, + "step": 883 + }, + { + "epoch": 7.835555555555556, + "grad_norm": 6.183442115783691, + "learning_rate": 4.133779264214047e-06, + "loss": 1.2065, + "mean_token_accuracy": 0.6498978734016418, + "num_tokens": 591523.0, + "step": 884 + }, + { + "epoch": 7.844444444444444, + "grad_norm": 6.225346088409424, + "learning_rate": 4.127090301003345e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.7024379670619965, + "num_tokens": 592990.0, + "step": 885 + }, + { + "epoch": 7.8533333333333335, + "grad_norm": 5.479620933532715, + "learning_rate": 4.120401337792643e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6687215566635132, + "num_tokens": 594521.0, + "step": 886 + }, + { + "epoch": 7.862222222222222, + "grad_norm": 5.474368095397949, + "learning_rate": 4.11371237458194e-06, + "loss": 1.1066, + "mean_token_accuracy": 0.6771464049816132, + "num_tokens": 596093.0, + "step": 887 + }, + { + "epoch": 7.871111111111111, + "grad_norm": 5.470034122467041, + "learning_rate": 4.107023411371238e-06, + "loss": 1.0995, + "mean_token_accuracy": 0.6573123335838318, + "num_tokens": 597671.0, + "step": 888 + }, + { + "epoch": 7.88, + "grad_norm": 5.707530975341797, + "learning_rate": 4.100334448160536e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6746935546398163, + "num_tokens": 599203.0, + "step": 889 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 5.892587661743164, + "learning_rate": 4.093645484949833e-06, + "loss": 1.1916, + "mean_token_accuracy": 0.6432776749134064, + "num_tokens": 600802.0, + "step": 890 + }, + { + "epoch": 7.897777777777778, + "grad_norm": 5.981845855712891, + "learning_rate": 4.086956521739131e-06, + "loss": 1.1828, + "mean_token_accuracy": 0.651203453540802, + "num_tokens": 602357.0, + "step": 891 + }, + { + "epoch": 7.906666666666666, + "grad_norm": 5.971845626831055, + "learning_rate": 4.080267558528428e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6776088178157806, + "num_tokens": 603862.0, + "step": 892 + }, + { + "epoch": 7.915555555555556, + "grad_norm": 5.596317768096924, + "learning_rate": 4.073578595317726e-06, + "loss": 1.1196, + "mean_token_accuracy": 0.6720840036869049, + "num_tokens": 605431.0, + "step": 893 + }, + { + "epoch": 7.924444444444444, + "grad_norm": 5.453043460845947, + "learning_rate": 4.066889632107024e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6812250912189484, + "num_tokens": 606948.0, + "step": 894 + }, + { + "epoch": 7.933333333333334, + "grad_norm": 5.850295543670654, + "learning_rate": 4.060200668896322e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.6786240041255951, + "num_tokens": 608478.0, + "step": 895 + }, + { + "epoch": 7.942222222222222, + "grad_norm": 5.7047119140625, + "learning_rate": 4.053511705685619e-06, + "loss": 1.1097, + "mean_token_accuracy": 0.6731551587581635, + "num_tokens": 610022.0, + "step": 896 + }, + { + "epoch": 7.9511111111111115, + "grad_norm": 5.332679271697998, + "learning_rate": 4.046822742474917e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6878829598426819, + "num_tokens": 611599.0, + "step": 897 + }, + { + "epoch": 7.96, + "grad_norm": 5.336120128631592, + "learning_rate": 4.0401337792642145e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6844577491283417, + "num_tokens": 613166.0, + "step": 898 + }, + { + "epoch": 7.968888888888889, + "grad_norm": 5.45760440826416, + "learning_rate": 4.033444816053512e-06, + "loss": 1.139, + "mean_token_accuracy": 0.6659426689147949, + "num_tokens": 614754.0, + "step": 899 + }, + { + "epoch": 7.977777777777778, + "grad_norm": 5.652373313903809, + "learning_rate": 4.026755852842809e-06, + "loss": 1.1314, + "mean_token_accuracy": 0.6639488339424133, + "num_tokens": 616296.0, + "step": 900 + }, + { + "epoch": 7.986666666666666, + "grad_norm": 5.775835990905762, + "learning_rate": 4.020066889632107e-06, + "loss": 1.1067, + "mean_token_accuracy": 0.6707667112350464, + "num_tokens": 617825.0, + "step": 901 + }, + { + "epoch": 7.995555555555556, + "grad_norm": 5.378000736236572, + "learning_rate": 4.013377926421405e-06, + "loss": 1.0585, + "mean_token_accuracy": 0.6742046177387238, + "num_tokens": 619380.0, + "step": 902 + }, + { + "epoch": 8.0, + "grad_norm": 7.907252788543701, + "learning_rate": 4.006688963210703e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.6804733872413635, + "num_tokens": 620106.0, + "step": 903 + }, + { + "epoch": 8.008888888888889, + "grad_norm": 5.468036651611328, + "learning_rate": 4.000000000000001e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.7012869715690613, + "num_tokens": 621653.0, + "step": 904 + }, + { + "epoch": 8.017777777777777, + "grad_norm": 5.645960330963135, + "learning_rate": 3.993311036789298e-06, + "loss": 1.0958, + "mean_token_accuracy": 0.6785929501056671, + "num_tokens": 623208.0, + "step": 905 + }, + { + "epoch": 8.026666666666667, + "grad_norm": 5.695517063140869, + "learning_rate": 3.9866220735785955e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7016069889068604, + "num_tokens": 624747.0, + "step": 906 + }, + { + "epoch": 8.035555555555556, + "grad_norm": 5.841511249542236, + "learning_rate": 3.979933110367893e-06, + "loss": 1.1213, + "mean_token_accuracy": 0.6541284620761871, + "num_tokens": 626331.0, + "step": 907 + }, + { + "epoch": 8.044444444444444, + "grad_norm": 5.562108993530273, + "learning_rate": 3.973244147157191e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6952144503593445, + "num_tokens": 627887.0, + "step": 908 + }, + { + "epoch": 8.053333333333333, + "grad_norm": 5.662504196166992, + "learning_rate": 3.966555183946489e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6854623854160309, + "num_tokens": 629452.0, + "step": 909 + }, + { + "epoch": 8.062222222222223, + "grad_norm": 5.873632431030273, + "learning_rate": 3.959866220735786e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6880045533180237, + "num_tokens": 630971.0, + "step": 910 + }, + { + "epoch": 8.071111111111112, + "grad_norm": 5.85434627532959, + "learning_rate": 3.953177257525084e-06, + "loss": 1.1383, + "mean_token_accuracy": 0.6709563732147217, + "num_tokens": 632551.0, + "step": 911 + }, + { + "epoch": 8.08, + "grad_norm": 5.4959869384765625, + "learning_rate": 3.946488294314382e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7016617953777313, + "num_tokens": 634126.0, + "step": 912 + }, + { + "epoch": 8.088888888888889, + "grad_norm": 6.136938571929932, + "learning_rate": 3.9397993311036795e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6812466084957123, + "num_tokens": 635688.0, + "step": 913 + }, + { + "epoch": 8.097777777777777, + "grad_norm": 5.422168254852295, + "learning_rate": 3.933110367892977e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7077834904193878, + "num_tokens": 637175.0, + "step": 914 + }, + { + "epoch": 8.106666666666667, + "grad_norm": 5.24487829208374, + "learning_rate": 3.926421404682274e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6929806470870972, + "num_tokens": 638728.0, + "step": 915 + }, + { + "epoch": 8.115555555555556, + "grad_norm": 5.922311305999756, + "learning_rate": 3.919732441471572e-06, + "loss": 1.1002, + "mean_token_accuracy": 0.6781790256500244, + "num_tokens": 640261.0, + "step": 916 + }, + { + "epoch": 8.124444444444444, + "grad_norm": 6.581790924072266, + "learning_rate": 3.91304347826087e-06, + "loss": 1.1233, + "mean_token_accuracy": 0.6532779335975647, + "num_tokens": 641788.0, + "step": 917 + }, + { + "epoch": 8.133333333333333, + "grad_norm": 5.554086685180664, + "learning_rate": 3.906354515050168e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6845808625221252, + "num_tokens": 643415.0, + "step": 918 + }, + { + "epoch": 8.142222222222221, + "grad_norm": 6.113454818725586, + "learning_rate": 3.899665551839465e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6979126632213593, + "num_tokens": 644941.0, + "step": 919 + }, + { + "epoch": 8.151111111111112, + "grad_norm": 5.653061389923096, + "learning_rate": 3.892976588628763e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.695422500371933, + "num_tokens": 646493.0, + "step": 920 + }, + { + "epoch": 8.16, + "grad_norm": 6.188579559326172, + "learning_rate": 3.886287625418061e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.675423264503479, + "num_tokens": 648042.0, + "step": 921 + }, + { + "epoch": 8.168888888888889, + "grad_norm": 5.901784420013428, + "learning_rate": 3.8795986622073584e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.690513402223587, + "num_tokens": 649582.0, + "step": 922 + }, + { + "epoch": 8.177777777777777, + "grad_norm": 5.366498947143555, + "learning_rate": 3.8729096989966554e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6984874606132507, + "num_tokens": 651162.0, + "step": 923 + }, + { + "epoch": 8.186666666666667, + "grad_norm": 6.116403102874756, + "learning_rate": 3.866220735785953e-06, + "loss": 1.1429, + "mean_token_accuracy": 0.6260418891906738, + "num_tokens": 652721.0, + "step": 924 + }, + { + "epoch": 8.195555555555556, + "grad_norm": 6.210691928863525, + "learning_rate": 3.859531772575251e-06, + "loss": 1.1115, + "mean_token_accuracy": 0.7016407549381256, + "num_tokens": 654306.0, + "step": 925 + }, + { + "epoch": 8.204444444444444, + "grad_norm": 5.694217681884766, + "learning_rate": 3.852842809364549e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7237739861011505, + "num_tokens": 655831.0, + "step": 926 + }, + { + "epoch": 8.213333333333333, + "grad_norm": 5.572376251220703, + "learning_rate": 3.846153846153847e-06, + "loss": 1.0877, + "mean_token_accuracy": 0.6681804955005646, + "num_tokens": 657391.0, + "step": 927 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 5.760652542114258, + "learning_rate": 3.839464882943144e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.6816069483757019, + "num_tokens": 658911.0, + "step": 928 + }, + { + "epoch": 8.231111111111112, + "grad_norm": 5.3832597732543945, + "learning_rate": 3.832775919732442e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.7018404006958008, + "num_tokens": 660421.0, + "step": 929 + }, + { + "epoch": 8.24, + "grad_norm": 5.437999248504639, + "learning_rate": 3.8260869565217395e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6898455917835236, + "num_tokens": 661963.0, + "step": 930 + }, + { + "epoch": 8.248888888888889, + "grad_norm": 5.449160575866699, + "learning_rate": 3.819397993311037e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7264066934585571, + "num_tokens": 663500.0, + "step": 931 + }, + { + "epoch": 8.257777777777777, + "grad_norm": 5.735660552978516, + "learning_rate": 3.812709030100335e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6813046634197235, + "num_tokens": 665039.0, + "step": 932 + }, + { + "epoch": 8.266666666666667, + "grad_norm": 6.236545085906982, + "learning_rate": 3.8060200668896326e-06, + "loss": 1.0822, + "mean_token_accuracy": 0.6644236445426941, + "num_tokens": 666569.0, + "step": 933 + }, + { + "epoch": 8.275555555555556, + "grad_norm": 6.074221134185791, + "learning_rate": 3.79933110367893e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6636908054351807, + "num_tokens": 668125.0, + "step": 934 + }, + { + "epoch": 8.284444444444444, + "grad_norm": 6.069758415222168, + "learning_rate": 3.792642140468228e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6893440186977386, + "num_tokens": 669639.0, + "step": 935 + }, + { + "epoch": 8.293333333333333, + "grad_norm": 6.023536205291748, + "learning_rate": 3.7859531772575253e-06, + "loss": 1.0568, + "mean_token_accuracy": 0.6813647449016571, + "num_tokens": 671135.0, + "step": 936 + }, + { + "epoch": 8.302222222222222, + "grad_norm": 5.801281452178955, + "learning_rate": 3.7792642140468235e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.6971921622753143, + "num_tokens": 672701.0, + "step": 937 + }, + { + "epoch": 8.311111111111112, + "grad_norm": 5.723294258117676, + "learning_rate": 3.772575250836121e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6772040724754333, + "num_tokens": 674269.0, + "step": 938 + }, + { + "epoch": 8.32, + "grad_norm": 5.829345226287842, + "learning_rate": 3.7658862876254184e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6933946311473846, + "num_tokens": 675804.0, + "step": 939 + }, + { + "epoch": 8.328888888888889, + "grad_norm": 6.187466621398926, + "learning_rate": 3.759197324414716e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7091684937477112, + "num_tokens": 677352.0, + "step": 940 + }, + { + "epoch": 8.337777777777777, + "grad_norm": 6.085653305053711, + "learning_rate": 3.7525083612040136e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.6711534559726715, + "num_tokens": 678902.0, + "step": 941 + }, + { + "epoch": 8.346666666666668, + "grad_norm": 6.733924865722656, + "learning_rate": 3.745819397993311e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.667328804731369, + "num_tokens": 680404.0, + "step": 942 + }, + { + "epoch": 8.355555555555556, + "grad_norm": 6.405935764312744, + "learning_rate": 3.739130434782609e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6760315001010895, + "num_tokens": 681930.0, + "step": 943 + }, + { + "epoch": 8.364444444444445, + "grad_norm": 6.480041027069092, + "learning_rate": 3.7324414715719063e-06, + "loss": 1.0723, + "mean_token_accuracy": 0.6703530848026276, + "num_tokens": 683502.0, + "step": 944 + }, + { + "epoch": 8.373333333333333, + "grad_norm": 6.6260480880737305, + "learning_rate": 3.7257525083612046e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7034675180912018, + "num_tokens": 685041.0, + "step": 945 + }, + { + "epoch": 8.382222222222222, + "grad_norm": 6.365560531616211, + "learning_rate": 3.719063545150502e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6870715916156769, + "num_tokens": 686577.0, + "step": 946 + }, + { + "epoch": 8.391111111111112, + "grad_norm": 6.911205768585205, + "learning_rate": 3.7123745819397994e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.6994909644126892, + "num_tokens": 688106.0, + "step": 947 + }, + { + "epoch": 8.4, + "grad_norm": 6.3452982902526855, + "learning_rate": 3.7056856187290972e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6929118633270264, + "num_tokens": 689650.0, + "step": 948 + }, + { + "epoch": 8.408888888888889, + "grad_norm": 6.5572991371154785, + "learning_rate": 3.6989966555183947e-06, + "loss": 1.1072, + "mean_token_accuracy": 0.6556223630905151, + "num_tokens": 691196.0, + "step": 949 + }, + { + "epoch": 8.417777777777777, + "grad_norm": 6.221085071563721, + "learning_rate": 3.692307692307693e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6876726448535919, + "num_tokens": 692760.0, + "step": 950 + }, + { + "epoch": 8.426666666666666, + "grad_norm": 6.218048095703125, + "learning_rate": 3.6856187290969903e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6954516470432281, + "num_tokens": 694253.0, + "step": 951 + }, + { + "epoch": 8.435555555555556, + "grad_norm": 6.507298469543457, + "learning_rate": 3.6789297658862878e-06, + "loss": 1.1231, + "mean_token_accuracy": 0.6779280304908752, + "num_tokens": 695784.0, + "step": 952 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 6.065165996551514, + "learning_rate": 3.6722408026755856e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.6734541356563568, + "num_tokens": 697371.0, + "step": 953 + }, + { + "epoch": 8.453333333333333, + "grad_norm": 6.742241382598877, + "learning_rate": 3.665551839464883e-06, + "loss": 1.027, + "mean_token_accuracy": 0.7033563554286957, + "num_tokens": 698899.0, + "step": 954 + }, + { + "epoch": 8.462222222222222, + "grad_norm": 6.056994915008545, + "learning_rate": 3.6588628762541813e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6718437671661377, + "num_tokens": 700491.0, + "step": 955 + }, + { + "epoch": 8.471111111111112, + "grad_norm": 6.605247974395752, + "learning_rate": 3.6521739130434787e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6813630163669586, + "num_tokens": 702007.0, + "step": 956 + }, + { + "epoch": 8.48, + "grad_norm": 5.856255531311035, + "learning_rate": 3.645484949832776e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7033153176307678, + "num_tokens": 703570.0, + "step": 957 + }, + { + "epoch": 8.488888888888889, + "grad_norm": 6.791911602020264, + "learning_rate": 3.638795986622074e-06, + "loss": 1.1523, + "mean_token_accuracy": 0.6678631007671356, + "num_tokens": 705066.0, + "step": 958 + }, + { + "epoch": 8.497777777777777, + "grad_norm": 6.336922645568848, + "learning_rate": 3.6321070234113714e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.698070764541626, + "num_tokens": 706575.0, + "step": 959 + }, + { + "epoch": 8.506666666666666, + "grad_norm": 6.691784381866455, + "learning_rate": 3.6254180602006696e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6644406616687775, + "num_tokens": 708074.0, + "step": 960 + }, + { + "epoch": 8.515555555555556, + "grad_norm": 6.075328350067139, + "learning_rate": 3.618729096989967e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7022949755191803, + "num_tokens": 709583.0, + "step": 961 + }, + { + "epoch": 8.524444444444445, + "grad_norm": 5.8315935134887695, + "learning_rate": 3.6120401337792645e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7152966558933258, + "num_tokens": 711143.0, + "step": 962 + }, + { + "epoch": 8.533333333333333, + "grad_norm": 6.13075065612793, + "learning_rate": 3.6053511705685623e-06, + "loss": 1.1237, + "mean_token_accuracy": 0.6510510444641113, + "num_tokens": 712636.0, + "step": 963 + }, + { + "epoch": 8.542222222222222, + "grad_norm": 5.962428569793701, + "learning_rate": 3.5986622073578597e-06, + "loss": 1.0864, + "mean_token_accuracy": 0.6623203456401825, + "num_tokens": 714213.0, + "step": 964 + }, + { + "epoch": 8.551111111111112, + "grad_norm": 6.176811695098877, + "learning_rate": 3.5919732441471576e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6801503300666809, + "num_tokens": 715705.0, + "step": 965 + }, + { + "epoch": 8.56, + "grad_norm": 6.383599281311035, + "learning_rate": 3.585284280936455e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6885103583335876, + "num_tokens": 717197.0, + "step": 966 + }, + { + "epoch": 8.568888888888889, + "grad_norm": 5.940642356872559, + "learning_rate": 3.578595317725753e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7222094237804413, + "num_tokens": 718787.0, + "step": 967 + }, + { + "epoch": 8.577777777777778, + "grad_norm": 6.427318572998047, + "learning_rate": 3.5719063545150507e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7075932621955872, + "num_tokens": 720314.0, + "step": 968 + }, + { + "epoch": 8.586666666666666, + "grad_norm": 6.300614356994629, + "learning_rate": 3.565217391304348e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7019940316677094, + "num_tokens": 721905.0, + "step": 969 + }, + { + "epoch": 8.595555555555556, + "grad_norm": 6.25466251373291, + "learning_rate": 3.5585284280936455e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7043630182743073, + "num_tokens": 723453.0, + "step": 970 + }, + { + "epoch": 8.604444444444445, + "grad_norm": 6.768106460571289, + "learning_rate": 3.5518394648829434e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6801336109638214, + "num_tokens": 724991.0, + "step": 971 + }, + { + "epoch": 8.613333333333333, + "grad_norm": 6.334679126739502, + "learning_rate": 3.5451505016722408e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6815243065357208, + "num_tokens": 726566.0, + "step": 972 + }, + { + "epoch": 8.622222222222222, + "grad_norm": 6.935222148895264, + "learning_rate": 3.538461538461539e-06, + "loss": 1.1826, + "mean_token_accuracy": 0.642726331949234, + "num_tokens": 728087.0, + "step": 973 + }, + { + "epoch": 8.63111111111111, + "grad_norm": 6.195850849151611, + "learning_rate": 3.5317725752508365e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.681325227022171, + "num_tokens": 729646.0, + "step": 974 + }, + { + "epoch": 8.64, + "grad_norm": 5.939260005950928, + "learning_rate": 3.525083612040134e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6942110359668732, + "num_tokens": 731255.0, + "step": 975 + }, + { + "epoch": 8.648888888888889, + "grad_norm": 6.667236804962158, + "learning_rate": 3.5183946488294317e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6624611616134644, + "num_tokens": 732774.0, + "step": 976 + }, + { + "epoch": 8.657777777777778, + "grad_norm": 7.3711323738098145, + "learning_rate": 3.511705685618729e-06, + "loss": 1.1434, + "mean_token_accuracy": 0.6538436412811279, + "num_tokens": 734270.0, + "step": 977 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 6.097768783569336, + "learning_rate": 3.5050167224080274e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6774771213531494, + "num_tokens": 735769.0, + "step": 978 + }, + { + "epoch": 8.675555555555556, + "grad_norm": 6.924376964569092, + "learning_rate": 3.498327759197325e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6878836154937744, + "num_tokens": 737355.0, + "step": 979 + }, + { + "epoch": 8.684444444444445, + "grad_norm": 6.257560729980469, + "learning_rate": 3.4916387959866222e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7175524532794952, + "num_tokens": 738859.0, + "step": 980 + }, + { + "epoch": 8.693333333333333, + "grad_norm": 6.315246105194092, + "learning_rate": 3.48494983277592e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6984696388244629, + "num_tokens": 740351.0, + "step": 981 + }, + { + "epoch": 8.702222222222222, + "grad_norm": 6.431005477905273, + "learning_rate": 3.4782608695652175e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.700090765953064, + "num_tokens": 741882.0, + "step": 982 + }, + { + "epoch": 8.71111111111111, + "grad_norm": 6.321691036224365, + "learning_rate": 3.4715719063545158e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6974213123321533, + "num_tokens": 743432.0, + "step": 983 + }, + { + "epoch": 8.72, + "grad_norm": 7.2131028175354, + "learning_rate": 3.464882943143813e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.7091863751411438, + "num_tokens": 744929.0, + "step": 984 + }, + { + "epoch": 8.72888888888889, + "grad_norm": 6.784447193145752, + "learning_rate": 3.4581939799331106e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6907787919044495, + "num_tokens": 746437.0, + "step": 985 + }, + { + "epoch": 8.737777777777778, + "grad_norm": 6.193514347076416, + "learning_rate": 3.4515050167224085e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6818574965000153, + "num_tokens": 747959.0, + "step": 986 + }, + { + "epoch": 8.746666666666666, + "grad_norm": 5.993904113769531, + "learning_rate": 3.444816053511706e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7186124324798584, + "num_tokens": 749548.0, + "step": 987 + }, + { + "epoch": 8.755555555555556, + "grad_norm": 6.577064514160156, + "learning_rate": 3.4381270903010037e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.6834734082221985, + "num_tokens": 751071.0, + "step": 988 + }, + { + "epoch": 8.764444444444445, + "grad_norm": 5.983057022094727, + "learning_rate": 3.431438127090301e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.6955296695232391, + "num_tokens": 752652.0, + "step": 989 + }, + { + "epoch": 8.773333333333333, + "grad_norm": 6.326080799102783, + "learning_rate": 3.424749163879599e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7226007580757141, + "num_tokens": 754175.0, + "step": 990 + }, + { + "epoch": 8.782222222222222, + "grad_norm": 7.065449237823486, + "learning_rate": 3.418060200668897e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7023067772388458, + "num_tokens": 755723.0, + "step": 991 + }, + { + "epoch": 8.79111111111111, + "grad_norm": 6.616161346435547, + "learning_rate": 3.4113712374581942e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7136098742485046, + "num_tokens": 757247.0, + "step": 992 + }, + { + "epoch": 8.8, + "grad_norm": 6.490316867828369, + "learning_rate": 3.4046822742474917e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6961342096328735, + "num_tokens": 758802.0, + "step": 993 + }, + { + "epoch": 8.80888888888889, + "grad_norm": 6.603043556213379, + "learning_rate": 3.3979933110367895e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7105428874492645, + "num_tokens": 760381.0, + "step": 994 + }, + { + "epoch": 8.817777777777778, + "grad_norm": 7.626179218292236, + "learning_rate": 3.391304347826087e-06, + "loss": 1.0951, + "mean_token_accuracy": 0.6785582304000854, + "num_tokens": 761850.0, + "step": 995 + }, + { + "epoch": 8.826666666666666, + "grad_norm": 7.58528470993042, + "learning_rate": 3.384615384615385e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.6882131397724152, + "num_tokens": 763359.0, + "step": 996 + }, + { + "epoch": 8.835555555555555, + "grad_norm": 8.419365882873535, + "learning_rate": 3.3779264214046826e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7298109233379364, + "num_tokens": 764877.0, + "step": 997 + }, + { + "epoch": 8.844444444444445, + "grad_norm": 6.878218650817871, + "learning_rate": 3.37123745819398e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.698095828294754, + "num_tokens": 766385.0, + "step": 998 + }, + { + "epoch": 8.853333333333333, + "grad_norm": 7.882478713989258, + "learning_rate": 3.364548494983278e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6738704442977905, + "num_tokens": 767898.0, + "step": 999 + }, + { + "epoch": 8.862222222222222, + "grad_norm": 7.762391090393066, + "learning_rate": 3.3578595317725753e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7127435803413391, + "num_tokens": 769477.0, + "step": 1000 + }, + { + "epoch": 8.87111111111111, + "grad_norm": 6.794309139251709, + "learning_rate": 3.3511705685618735e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6749875843524933, + "num_tokens": 771089.0, + "step": 1001 + }, + { + "epoch": 8.88, + "grad_norm": 6.67686653137207, + "learning_rate": 3.344481605351171e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7026183307170868, + "num_tokens": 772622.0, + "step": 1002 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 7.100803852081299, + "learning_rate": 3.3377926421404684e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6974572837352753, + "num_tokens": 774205.0, + "step": 1003 + }, + { + "epoch": 8.897777777777778, + "grad_norm": 7.320077896118164, + "learning_rate": 3.3311036789297662e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6934449076652527, + "num_tokens": 775749.0, + "step": 1004 + }, + { + "epoch": 8.906666666666666, + "grad_norm": 6.816684246063232, + "learning_rate": 3.3244147157190636e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.6990039646625519, + "num_tokens": 777305.0, + "step": 1005 + }, + { + "epoch": 8.915555555555555, + "grad_norm": 6.338404655456543, + "learning_rate": 3.317725752508362e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.6953420341014862, + "num_tokens": 778865.0, + "step": 1006 + }, + { + "epoch": 8.924444444444445, + "grad_norm": 6.688970565795898, + "learning_rate": 3.3110367892976593e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6720951199531555, + "num_tokens": 780407.0, + "step": 1007 + }, + { + "epoch": 8.933333333333334, + "grad_norm": 6.222789287567139, + "learning_rate": 3.3043478260869567e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7111324667930603, + "num_tokens": 781935.0, + "step": 1008 + }, + { + "epoch": 8.942222222222222, + "grad_norm": 6.006635665893555, + "learning_rate": 3.2976588628762546e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7061764001846313, + "num_tokens": 783494.0, + "step": 1009 + }, + { + "epoch": 8.95111111111111, + "grad_norm": 6.638212203979492, + "learning_rate": 3.290969899665552e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.672920823097229, + "num_tokens": 785045.0, + "step": 1010 + }, + { + "epoch": 8.96, + "grad_norm": 6.107909202575684, + "learning_rate": 3.28428093645485e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6988838016986847, + "num_tokens": 786614.0, + "step": 1011 + }, + { + "epoch": 8.96888888888889, + "grad_norm": 6.867550373077393, + "learning_rate": 3.2775919732441473e-06, + "loss": 1.0593, + "mean_token_accuracy": 0.6862790584564209, + "num_tokens": 788138.0, + "step": 1012 + }, + { + "epoch": 8.977777777777778, + "grad_norm": 7.0037455558776855, + "learning_rate": 3.270903010033445e-06, + "loss": 1.1278, + "mean_token_accuracy": 0.6648448705673218, + "num_tokens": 789686.0, + "step": 1013 + }, + { + "epoch": 8.986666666666666, + "grad_norm": 6.66884183883667, + "learning_rate": 3.264214046822743e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6889189183712006, + "num_tokens": 791311.0, + "step": 1014 + }, + { + "epoch": 8.995555555555555, + "grad_norm": 6.210651397705078, + "learning_rate": 3.2575250836120404e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6996380686759949, + "num_tokens": 792896.0, + "step": 1015 + }, + { + "epoch": 9.0, + "grad_norm": 8.813923835754395, + "learning_rate": 3.250836120401338e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7277628183364868, + "num_tokens": 793664.0, + "step": 1016 + }, + { + "epoch": 9.008888888888889, + "grad_norm": 7.017031192779541, + "learning_rate": 3.2441471571906356e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6850097477436066, + "num_tokens": 795169.0, + "step": 1017 + }, + { + "epoch": 9.017777777777777, + "grad_norm": 6.505417346954346, + "learning_rate": 3.237458193979933e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6710526347160339, + "num_tokens": 796739.0, + "step": 1018 + }, + { + "epoch": 9.026666666666667, + "grad_norm": 6.589199542999268, + "learning_rate": 3.2307692307692313e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7095823585987091, + "num_tokens": 798301.0, + "step": 1019 + }, + { + "epoch": 9.035555555555556, + "grad_norm": 6.258679389953613, + "learning_rate": 3.2240802675585287e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7020219564437866, + "num_tokens": 799926.0, + "step": 1020 + }, + { + "epoch": 9.044444444444444, + "grad_norm": 6.970437049865723, + "learning_rate": 3.217391304347826e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6801057159900665, + "num_tokens": 801420.0, + "step": 1021 + }, + { + "epoch": 9.053333333333333, + "grad_norm": 6.7259931564331055, + "learning_rate": 3.210702341137124e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.705773264169693, + "num_tokens": 802981.0, + "step": 1022 + }, + { + "epoch": 9.062222222222223, + "grad_norm": 6.553651332855225, + "learning_rate": 3.2040133779264214e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7216697037220001, + "num_tokens": 804498.0, + "step": 1023 + }, + { + "epoch": 9.071111111111112, + "grad_norm": 6.5840277671813965, + "learning_rate": 3.1973244147157197e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7113925814628601, + "num_tokens": 806041.0, + "step": 1024 + }, + { + "epoch": 9.08, + "grad_norm": 6.5796074867248535, + "learning_rate": 3.190635451505017e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7270649969577789, + "num_tokens": 807576.0, + "step": 1025 + }, + { + "epoch": 9.088888888888889, + "grad_norm": 6.998542785644531, + "learning_rate": 3.1839464882943145e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6865200400352478, + "num_tokens": 809133.0, + "step": 1026 + }, + { + "epoch": 9.097777777777777, + "grad_norm": 6.307586193084717, + "learning_rate": 3.1772575250836123e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.6806622445583344, + "num_tokens": 810689.0, + "step": 1027 + }, + { + "epoch": 9.106666666666667, + "grad_norm": 6.716704368591309, + "learning_rate": 3.1705685618729098e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.6849233508110046, + "num_tokens": 812234.0, + "step": 1028 + }, + { + "epoch": 9.115555555555556, + "grad_norm": 6.906090259552002, + "learning_rate": 3.163879598662208e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7084031105041504, + "num_tokens": 813829.0, + "step": 1029 + }, + { + "epoch": 9.124444444444444, + "grad_norm": 7.696595191955566, + "learning_rate": 3.1571906354515055e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7153899669647217, + "num_tokens": 815397.0, + "step": 1030 + }, + { + "epoch": 9.133333333333333, + "grad_norm": 7.4056172370910645, + "learning_rate": 3.150501672240803e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7049383223056793, + "num_tokens": 816939.0, + "step": 1031 + }, + { + "epoch": 9.142222222222221, + "grad_norm": 8.645524978637695, + "learning_rate": 3.1438127090301007e-06, + "loss": 1.0693, + "mean_token_accuracy": 0.6822735071182251, + "num_tokens": 818459.0, + "step": 1032 + }, + { + "epoch": 9.151111111111112, + "grad_norm": 7.273892879486084, + "learning_rate": 3.137123745819398e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7010431587696075, + "num_tokens": 819996.0, + "step": 1033 + }, + { + "epoch": 9.16, + "grad_norm": 7.414793491363525, + "learning_rate": 3.130434782608696e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7118965089321136, + "num_tokens": 821493.0, + "step": 1034 + }, + { + "epoch": 9.168888888888889, + "grad_norm": 7.9450578689575195, + "learning_rate": 3.1237458193979934e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.6825365424156189, + "num_tokens": 823041.0, + "step": 1035 + }, + { + "epoch": 9.177777777777777, + "grad_norm": 7.435962677001953, + "learning_rate": 3.1170568561872912e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6761629581451416, + "num_tokens": 824603.0, + "step": 1036 + }, + { + "epoch": 9.186666666666667, + "grad_norm": 7.091315269470215, + "learning_rate": 3.110367892976589e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.699446827173233, + "num_tokens": 826195.0, + "step": 1037 + }, + { + "epoch": 9.195555555555556, + "grad_norm": 7.373175144195557, + "learning_rate": 3.1036789297658865e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7247863411903381, + "num_tokens": 827726.0, + "step": 1038 + }, + { + "epoch": 9.204444444444444, + "grad_norm": 7.488717079162598, + "learning_rate": 3.0969899665551843e-06, + "loss": 1.1321, + "mean_token_accuracy": 0.6597996354103088, + "num_tokens": 829277.0, + "step": 1039 + }, + { + "epoch": 9.213333333333333, + "grad_norm": 7.354495525360107, + "learning_rate": 3.0903010033444818e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.732792466878891, + "num_tokens": 830831.0, + "step": 1040 + }, + { + "epoch": 9.222222222222221, + "grad_norm": 6.9890899658203125, + "learning_rate": 3.083612040133779e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7435739636421204, + "num_tokens": 832378.0, + "step": 1041 + }, + { + "epoch": 9.231111111111112, + "grad_norm": 7.310318470001221, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.714740127325058, + "num_tokens": 833916.0, + "step": 1042 + }, + { + "epoch": 9.24, + "grad_norm": 7.045800685882568, + "learning_rate": 3.070234113712375e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7156940698623657, + "num_tokens": 835465.0, + "step": 1043 + }, + { + "epoch": 9.248888888888889, + "grad_norm": 7.140171051025391, + "learning_rate": 3.0635451505016723e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.6952777802944183, + "num_tokens": 836988.0, + "step": 1044 + }, + { + "epoch": 9.257777777777777, + "grad_norm": 7.7405195236206055, + "learning_rate": 3.05685618729097e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7019312679767609, + "num_tokens": 838519.0, + "step": 1045 + }, + { + "epoch": 9.266666666666667, + "grad_norm": 7.700501918792725, + "learning_rate": 3.0501672240802675e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7104335129261017, + "num_tokens": 840048.0, + "step": 1046 + }, + { + "epoch": 9.275555555555556, + "grad_norm": 7.23869514465332, + "learning_rate": 3.043478260869566e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6953783929347992, + "num_tokens": 841615.0, + "step": 1047 + }, + { + "epoch": 9.284444444444444, + "grad_norm": 7.534677028656006, + "learning_rate": 3.0367892976588632e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7028279006481171, + "num_tokens": 843203.0, + "step": 1048 + }, + { + "epoch": 9.293333333333333, + "grad_norm": 7.981372356414795, + "learning_rate": 3.0301003344481606e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6728443205356598, + "num_tokens": 844832.0, + "step": 1049 + }, + { + "epoch": 9.302222222222222, + "grad_norm": 7.877739429473877, + "learning_rate": 3.0234113712374585e-06, + "loss": 1.0928, + "mean_token_accuracy": 0.6671358048915863, + "num_tokens": 846339.0, + "step": 1050 + }, + { + "epoch": 9.311111111111112, + "grad_norm": 7.4183125495910645, + "learning_rate": 3.016722408026756e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7167645692825317, + "num_tokens": 847888.0, + "step": 1051 + }, + { + "epoch": 9.32, + "grad_norm": 6.944304943084717, + "learning_rate": 3.010033444816054e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7264457643032074, + "num_tokens": 849436.0, + "step": 1052 + }, + { + "epoch": 9.328888888888889, + "grad_norm": 7.041371822357178, + "learning_rate": 3.0033444816053516e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.6815483868122101, + "num_tokens": 850950.0, + "step": 1053 + }, + { + "epoch": 9.337777777777777, + "grad_norm": 6.785553932189941, + "learning_rate": 2.996655518394649e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.6975142955780029, + "num_tokens": 852531.0, + "step": 1054 + }, + { + "epoch": 9.346666666666668, + "grad_norm": 6.817051887512207, + "learning_rate": 2.989966555183947e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6933909058570862, + "num_tokens": 854070.0, + "step": 1055 + }, + { + "epoch": 9.355555555555556, + "grad_norm": 6.753493309020996, + "learning_rate": 2.9832775919732443e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7168819308280945, + "num_tokens": 855614.0, + "step": 1056 + }, + { + "epoch": 9.364444444444445, + "grad_norm": 6.881538391113281, + "learning_rate": 2.976588628762542e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7146275043487549, + "num_tokens": 857145.0, + "step": 1057 + }, + { + "epoch": 9.373333333333333, + "grad_norm": 7.589474201202393, + "learning_rate": 2.9698996655518395e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6974684298038483, + "num_tokens": 858685.0, + "step": 1058 + }, + { + "epoch": 9.382222222222222, + "grad_norm": 7.396028518676758, + "learning_rate": 2.9632107023411374e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7101724743843079, + "num_tokens": 860240.0, + "step": 1059 + }, + { + "epoch": 9.391111111111112, + "grad_norm": 7.9415059089660645, + "learning_rate": 2.956521739130435e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6832516193389893, + "num_tokens": 861746.0, + "step": 1060 + }, + { + "epoch": 9.4, + "grad_norm": 7.231890678405762, + "learning_rate": 2.9498327759197326e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7255527973175049, + "num_tokens": 863260.0, + "step": 1061 + }, + { + "epoch": 9.408888888888889, + "grad_norm": 7.095097064971924, + "learning_rate": 2.9431438127090305e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.6953908801078796, + "num_tokens": 864790.0, + "step": 1062 + }, + { + "epoch": 9.417777777777777, + "grad_norm": 8.562793731689453, + "learning_rate": 2.936454849498328e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6639785170555115, + "num_tokens": 866326.0, + "step": 1063 + }, + { + "epoch": 9.426666666666666, + "grad_norm": 7.325282096862793, + "learning_rate": 2.9297658862876253e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7024668753147125, + "num_tokens": 867891.0, + "step": 1064 + }, + { + "epoch": 9.435555555555556, + "grad_norm": 7.891602993011475, + "learning_rate": 2.9230769230769236e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6713910400867462, + "num_tokens": 869452.0, + "step": 1065 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 7.602499008178711, + "learning_rate": 2.916387959866221e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6921903491020203, + "num_tokens": 871018.0, + "step": 1066 + }, + { + "epoch": 9.453333333333333, + "grad_norm": 7.239400386810303, + "learning_rate": 2.9096989966555184e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7048648595809937, + "num_tokens": 872547.0, + "step": 1067 + }, + { + "epoch": 9.462222222222222, + "grad_norm": 7.503430366516113, + "learning_rate": 2.9030100334448162e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7398989200592041, + "num_tokens": 874109.0, + "step": 1068 + }, + { + "epoch": 9.471111111111112, + "grad_norm": 7.326026916503906, + "learning_rate": 2.8963210702341137e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7010915279388428, + "num_tokens": 875661.0, + "step": 1069 + }, + { + "epoch": 9.48, + "grad_norm": 7.553050518035889, + "learning_rate": 2.889632107023412e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7152423560619354, + "num_tokens": 877230.0, + "step": 1070 + }, + { + "epoch": 9.488888888888889, + "grad_norm": 7.922276973724365, + "learning_rate": 2.8829431438127093e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6910598576068878, + "num_tokens": 878758.0, + "step": 1071 + }, + { + "epoch": 9.497777777777777, + "grad_norm": 7.106056213378906, + "learning_rate": 2.8762541806020068e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.6931600868701935, + "num_tokens": 880348.0, + "step": 1072 + }, + { + "epoch": 9.506666666666666, + "grad_norm": 6.510905742645264, + "learning_rate": 2.8695652173913046e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7370401918888092, + "num_tokens": 881884.0, + "step": 1073 + }, + { + "epoch": 9.515555555555556, + "grad_norm": 7.510615348815918, + "learning_rate": 2.862876254180602e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7144144177436829, + "num_tokens": 883415.0, + "step": 1074 + }, + { + "epoch": 9.524444444444445, + "grad_norm": 7.206902503967285, + "learning_rate": 2.8561872909699003e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7002752423286438, + "num_tokens": 884963.0, + "step": 1075 + }, + { + "epoch": 9.533333333333333, + "grad_norm": 7.168454647064209, + "learning_rate": 2.8494983277591977e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.6953886449337006, + "num_tokens": 886515.0, + "step": 1076 + }, + { + "epoch": 9.542222222222222, + "grad_norm": 7.769477844238281, + "learning_rate": 2.842809364548495e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6819023787975311, + "num_tokens": 888019.0, + "step": 1077 + }, + { + "epoch": 9.551111111111112, + "grad_norm": 7.892253875732422, + "learning_rate": 2.836120401337793e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6725123524665833, + "num_tokens": 889591.0, + "step": 1078 + }, + { + "epoch": 9.56, + "grad_norm": 7.1552205085754395, + "learning_rate": 2.8294314381270904e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7231752574443817, + "num_tokens": 891168.0, + "step": 1079 + }, + { + "epoch": 9.568888888888889, + "grad_norm": 7.7475690841674805, + "learning_rate": 2.8227424749163882e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.686979353427887, + "num_tokens": 892668.0, + "step": 1080 + }, + { + "epoch": 9.577777777777778, + "grad_norm": 7.754464626312256, + "learning_rate": 2.816053511705686e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7015874683856964, + "num_tokens": 894279.0, + "step": 1081 + }, + { + "epoch": 9.586666666666666, + "grad_norm": 8.41580867767334, + "learning_rate": 2.8093645484949835e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7079631686210632, + "num_tokens": 895841.0, + "step": 1082 + }, + { + "epoch": 9.595555555555556, + "grad_norm": 7.38240909576416, + "learning_rate": 2.8026755852842813e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6916173100471497, + "num_tokens": 897375.0, + "step": 1083 + }, + { + "epoch": 9.604444444444445, + "grad_norm": 7.442349910736084, + "learning_rate": 2.7959866220735787e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7134892046451569, + "num_tokens": 898939.0, + "step": 1084 + }, + { + "epoch": 9.613333333333333, + "grad_norm": 7.7187957763671875, + "learning_rate": 2.7892976588628766e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6741228103637695, + "num_tokens": 900516.0, + "step": 1085 + }, + { + "epoch": 9.622222222222222, + "grad_norm": 7.576633930206299, + "learning_rate": 2.782608695652174e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6900342106819153, + "num_tokens": 901990.0, + "step": 1086 + }, + { + "epoch": 9.63111111111111, + "grad_norm": 7.197464942932129, + "learning_rate": 2.7759197324414714e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7199681401252747, + "num_tokens": 903520.0, + "step": 1087 + }, + { + "epoch": 9.64, + "grad_norm": 8.106382369995117, + "learning_rate": 2.7692307692307697e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7060422003269196, + "num_tokens": 905097.0, + "step": 1088 + }, + { + "epoch": 9.648888888888889, + "grad_norm": 8.363383293151855, + "learning_rate": 2.762541806020067e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7160404026508331, + "num_tokens": 906611.0, + "step": 1089 + }, + { + "epoch": 9.657777777777778, + "grad_norm": 6.9902215003967285, + "learning_rate": 2.755852842809365e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7038565576076508, + "num_tokens": 908163.0, + "step": 1090 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 7.325692176818848, + "learning_rate": 2.7491638795986624e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.725650429725647, + "num_tokens": 909707.0, + "step": 1091 + }, + { + "epoch": 9.675555555555556, + "grad_norm": 7.5955705642700195, + "learning_rate": 2.74247491638796e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7353417277336121, + "num_tokens": 911249.0, + "step": 1092 + }, + { + "epoch": 9.684444444444445, + "grad_norm": 8.205609321594238, + "learning_rate": 2.735785953177258e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7183323502540588, + "num_tokens": 912754.0, + "step": 1093 + }, + { + "epoch": 9.693333333333333, + "grad_norm": 7.832437992095947, + "learning_rate": 2.7290969899665555e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6860167682170868, + "num_tokens": 914285.0, + "step": 1094 + }, + { + "epoch": 9.702222222222222, + "grad_norm": 7.534722805023193, + "learning_rate": 2.722408026755853e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6702898442745209, + "num_tokens": 915909.0, + "step": 1095 + }, + { + "epoch": 9.71111111111111, + "grad_norm": 7.805872440338135, + "learning_rate": 2.7157190635451507e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6805555522441864, + "num_tokens": 917401.0, + "step": 1096 + }, + { + "epoch": 9.72, + "grad_norm": 8.188508987426758, + "learning_rate": 2.709030100334448e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6886340379714966, + "num_tokens": 918938.0, + "step": 1097 + }, + { + "epoch": 9.72888888888889, + "grad_norm": 7.655539035797119, + "learning_rate": 2.7023411371237464e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7287272810935974, + "num_tokens": 920475.0, + "step": 1098 + }, + { + "epoch": 9.737777777777778, + "grad_norm": 7.540700435638428, + "learning_rate": 2.695652173913044e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.709308385848999, + "num_tokens": 922053.0, + "step": 1099 + }, + { + "epoch": 9.746666666666666, + "grad_norm": 8.335047721862793, + "learning_rate": 2.6889632107023413e-06, + "loss": 1.1259, + "mean_token_accuracy": 0.682058572769165, + "num_tokens": 923561.0, + "step": 1100 + }, + { + "epoch": 9.755555555555556, + "grad_norm": 7.959589004516602, + "learning_rate": 2.682274247491639e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7027527689933777, + "num_tokens": 925084.0, + "step": 1101 + }, + { + "epoch": 9.764444444444445, + "grad_norm": 7.213424205780029, + "learning_rate": 2.6755852842809365e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.6938560307025909, + "num_tokens": 926678.0, + "step": 1102 + }, + { + "epoch": 9.773333333333333, + "grad_norm": 7.589350700378418, + "learning_rate": 2.6688963210702344e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.6909720301628113, + "num_tokens": 928192.0, + "step": 1103 + }, + { + "epoch": 9.782222222222222, + "grad_norm": 7.717529773712158, + "learning_rate": 2.662207357859532e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7051864862442017, + "num_tokens": 929705.0, + "step": 1104 + }, + { + "epoch": 9.79111111111111, + "grad_norm": 8.09391975402832, + "learning_rate": 2.6555183946488296e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.6986806094646454, + "num_tokens": 931234.0, + "step": 1105 + }, + { + "epoch": 9.8, + "grad_norm": 7.379924297332764, + "learning_rate": 2.6488294314381275e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7153298258781433, + "num_tokens": 932825.0, + "step": 1106 + }, + { + "epoch": 9.80888888888889, + "grad_norm": 7.178143501281738, + "learning_rate": 2.642140468227425e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6795435547828674, + "num_tokens": 934393.0, + "step": 1107 + }, + { + "epoch": 9.817777777777778, + "grad_norm": 7.333486557006836, + "learning_rate": 2.6354515050167227e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7157953381538391, + "num_tokens": 935916.0, + "step": 1108 + }, + { + "epoch": 9.826666666666666, + "grad_norm": 11.751126289367676, + "learning_rate": 2.62876254180602e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7208646535873413, + "num_tokens": 937493.0, + "step": 1109 + }, + { + "epoch": 9.835555555555555, + "grad_norm": 7.191551685333252, + "learning_rate": 2.6220735785953176e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7038331627845764, + "num_tokens": 939050.0, + "step": 1110 + }, + { + "epoch": 9.844444444444445, + "grad_norm": 8.14315128326416, + "learning_rate": 2.615384615384616e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6974965631961823, + "num_tokens": 940594.0, + "step": 1111 + }, + { + "epoch": 9.853333333333333, + "grad_norm": 7.563806533813477, + "learning_rate": 2.6086956521739132e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6890034377574921, + "num_tokens": 942126.0, + "step": 1112 + }, + { + "epoch": 9.862222222222222, + "grad_norm": 8.39829158782959, + "learning_rate": 2.602006688963211e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6709831058979034, + "num_tokens": 943640.0, + "step": 1113 + }, + { + "epoch": 9.87111111111111, + "grad_norm": 8.10810375213623, + "learning_rate": 2.5953177257525085e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7128537595272064, + "num_tokens": 945187.0, + "step": 1114 + }, + { + "epoch": 9.88, + "grad_norm": 7.686197280883789, + "learning_rate": 2.588628762541806e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7309402823448181, + "num_tokens": 946710.0, + "step": 1115 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 7.863304138183594, + "learning_rate": 2.581939799331104e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7190088331699371, + "num_tokens": 948287.0, + "step": 1116 + }, + { + "epoch": 9.897777777777778, + "grad_norm": 7.664231777191162, + "learning_rate": 2.5752508361204016e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7492275536060333, + "num_tokens": 949806.0, + "step": 1117 + }, + { + "epoch": 9.906666666666666, + "grad_norm": 8.542102813720703, + "learning_rate": 2.568561872909699e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7213290333747864, + "num_tokens": 951303.0, + "step": 1118 + }, + { + "epoch": 9.915555555555555, + "grad_norm": 7.8826680183410645, + "learning_rate": 2.561872909698997e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7111823260784149, + "num_tokens": 952848.0, + "step": 1119 + }, + { + "epoch": 9.924444444444445, + "grad_norm": 7.539241790771484, + "learning_rate": 2.5551839464882943e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.733702689409256, + "num_tokens": 954361.0, + "step": 1120 + }, + { + "epoch": 9.933333333333334, + "grad_norm": 9.744601249694824, + "learning_rate": 2.5484949832775925e-06, + "loss": 1.1254, + "mean_token_accuracy": 0.6495727598667145, + "num_tokens": 955896.0, + "step": 1121 + }, + { + "epoch": 9.942222222222222, + "grad_norm": 8.241456985473633, + "learning_rate": 2.54180602006689e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7213788032531738, + "num_tokens": 957464.0, + "step": 1122 + }, + { + "epoch": 9.95111111111111, + "grad_norm": 8.084714889526367, + "learning_rate": 2.5351170568561874e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.712028443813324, + "num_tokens": 959026.0, + "step": 1123 + }, + { + "epoch": 9.96, + "grad_norm": 7.583651542663574, + "learning_rate": 2.5284280936454852e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7052400708198547, + "num_tokens": 960642.0, + "step": 1124 + }, + { + "epoch": 9.96888888888889, + "grad_norm": 7.885745048522949, + "learning_rate": 2.5217391304347826e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.727026104927063, + "num_tokens": 962228.0, + "step": 1125 + }, + { + "epoch": 9.977777777777778, + "grad_norm": 8.236817359924316, + "learning_rate": 2.5150501672240805e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7088866531848907, + "num_tokens": 963780.0, + "step": 1126 + }, + { + "epoch": 9.986666666666666, + "grad_norm": 8.461657524108887, + "learning_rate": 2.5083612040133783e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7065580487251282, + "num_tokens": 965319.0, + "step": 1127 + }, + { + "epoch": 9.995555555555555, + "grad_norm": 8.181718826293945, + "learning_rate": 2.5016722408026757e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6786901652812958, + "num_tokens": 966927.0, + "step": 1128 + }, + { + "epoch": 10.0, + "grad_norm": 12.545977592468262, + "learning_rate": 2.4949832775919736e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7124682068824768, + "num_tokens": 967716.0, + "step": 1129 + }, + { + "epoch": 10.008888888888889, + "grad_norm": 8.807211875915527, + "learning_rate": 2.488294314381271e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7009295225143433, + "num_tokens": 969233.0, + "step": 1130 + }, + { + "epoch": 10.017777777777777, + "grad_norm": 7.570104598999023, + "learning_rate": 2.481605351170569e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7288297414779663, + "num_tokens": 970802.0, + "step": 1131 + }, + { + "epoch": 10.026666666666667, + "grad_norm": 7.99111795425415, + "learning_rate": 2.4749163879598663e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7274698913097382, + "num_tokens": 972353.0, + "step": 1132 + }, + { + "epoch": 10.035555555555556, + "grad_norm": 7.928145885467529, + "learning_rate": 2.468227424749164e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7249955534934998, + "num_tokens": 973891.0, + "step": 1133 + }, + { + "epoch": 10.044444444444444, + "grad_norm": 8.425963401794434, + "learning_rate": 2.461538461538462e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7077345848083496, + "num_tokens": 975387.0, + "step": 1134 + }, + { + "epoch": 10.053333333333333, + "grad_norm": 8.072123527526855, + "learning_rate": 2.4548494983277594e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7360780835151672, + "num_tokens": 976918.0, + "step": 1135 + }, + { + "epoch": 10.062222222222223, + "grad_norm": 7.622961521148682, + "learning_rate": 2.448160535117057e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.726354569196701, + "num_tokens": 978455.0, + "step": 1136 + }, + { + "epoch": 10.071111111111112, + "grad_norm": 7.8295207023620605, + "learning_rate": 2.4414715719063546e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.6977002620697021, + "num_tokens": 979989.0, + "step": 1137 + }, + { + "epoch": 10.08, + "grad_norm": 8.656242370605469, + "learning_rate": 2.4347826086956525e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6767398715019226, + "num_tokens": 981469.0, + "step": 1138 + }, + { + "epoch": 10.088888888888889, + "grad_norm": 8.98353099822998, + "learning_rate": 2.42809364548495e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7058785259723663, + "num_tokens": 983001.0, + "step": 1139 + }, + { + "epoch": 10.097777777777777, + "grad_norm": 7.904594421386719, + "learning_rate": 2.4214046822742477e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7197617888450623, + "num_tokens": 984491.0, + "step": 1140 + }, + { + "epoch": 10.106666666666667, + "grad_norm": 7.937580108642578, + "learning_rate": 2.414715719063545e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6898120939731598, + "num_tokens": 986074.0, + "step": 1141 + }, + { + "epoch": 10.115555555555556, + "grad_norm": 8.64685344696045, + "learning_rate": 2.408026755852843e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7190959751605988, + "num_tokens": 987610.0, + "step": 1142 + }, + { + "epoch": 10.124444444444444, + "grad_norm": 7.571821689605713, + "learning_rate": 2.401337792642141e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7469573020935059, + "num_tokens": 989128.0, + "step": 1143 + }, + { + "epoch": 10.133333333333333, + "grad_norm": 8.27009105682373, + "learning_rate": 2.3946488294314382e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7442129552364349, + "num_tokens": 990633.0, + "step": 1144 + }, + { + "epoch": 10.142222222222221, + "grad_norm": 8.289285659790039, + "learning_rate": 2.387959866220736e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7064669132232666, + "num_tokens": 992192.0, + "step": 1145 + }, + { + "epoch": 10.151111111111112, + "grad_norm": 8.250883102416992, + "learning_rate": 2.3812709030100335e-06, + "loss": 0.945, + "mean_token_accuracy": 0.6870531737804413, + "num_tokens": 993735.0, + "step": 1146 + }, + { + "epoch": 10.16, + "grad_norm": 8.29701042175293, + "learning_rate": 2.3745819397993314e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7231054604053497, + "num_tokens": 995218.0, + "step": 1147 + }, + { + "epoch": 10.168888888888889, + "grad_norm": 8.8833646774292, + "learning_rate": 2.3678929765886288e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7027332186698914, + "num_tokens": 996704.0, + "step": 1148 + }, + { + "epoch": 10.177777777777777, + "grad_norm": 8.010129928588867, + "learning_rate": 2.3612040133779266e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6945264637470245, + "num_tokens": 998239.0, + "step": 1149 + }, + { + "epoch": 10.186666666666667, + "grad_norm": 7.930617332458496, + "learning_rate": 2.3545150501672245e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7229310870170593, + "num_tokens": 999775.0, + "step": 1150 + }, + { + "epoch": 10.195555555555556, + "grad_norm": 8.43835735321045, + "learning_rate": 2.347826086956522e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7493226826190948, + "num_tokens": 1001333.0, + "step": 1151 + }, + { + "epoch": 10.204444444444444, + "grad_norm": 9.688827514648438, + "learning_rate": 2.3411371237458197e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.6913759708404541, + "num_tokens": 1002805.0, + "step": 1152 + }, + { + "epoch": 10.213333333333333, + "grad_norm": 9.071305274963379, + "learning_rate": 2.334448160535117e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6971546113491058, + "num_tokens": 1004408.0, + "step": 1153 + }, + { + "epoch": 10.222222222222221, + "grad_norm": NaN, + "learning_rate": 2.327759197324415e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7348439991474152, + "num_tokens": 1005897.0, + "step": 1154 + }, + { + "epoch": 10.231111111111112, + "grad_norm": 8.965614318847656, + "learning_rate": 2.327759197324415e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7272486388683319, + "num_tokens": 1007429.0, + "step": 1155 + }, + { + "epoch": 10.24, + "grad_norm": 8.638874053955078, + "learning_rate": 2.3210702341137124e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7175429165363312, + "num_tokens": 1009011.0, + "step": 1156 + }, + { + "epoch": 10.248888888888889, + "grad_norm": 8.152356147766113, + "learning_rate": 2.3143812709030102e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7296044230461121, + "num_tokens": 1010599.0, + "step": 1157 + }, + { + "epoch": 10.257777777777777, + "grad_norm": 8.888917922973633, + "learning_rate": 2.307692307692308e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7109507620334625, + "num_tokens": 1012184.0, + "step": 1158 + }, + { + "epoch": 10.266666666666667, + "grad_norm": 8.584786415100098, + "learning_rate": 2.3010033444816055e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7557413280010223, + "num_tokens": 1013707.0, + "step": 1159 + }, + { + "epoch": 10.275555555555556, + "grad_norm": 9.76318645477295, + "learning_rate": 2.2943143812709033e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7334924936294556, + "num_tokens": 1015314.0, + "step": 1160 + }, + { + "epoch": 10.284444444444444, + "grad_norm": 9.037351608276367, + "learning_rate": 2.2876254180602008e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7136805951595306, + "num_tokens": 1016866.0, + "step": 1161 + }, + { + "epoch": 10.293333333333333, + "grad_norm": 10.209254264831543, + "learning_rate": 2.2809364548494986e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.6911448240280151, + "num_tokens": 1018396.0, + "step": 1162 + }, + { + "epoch": 10.302222222222222, + "grad_norm": 10.558046340942383, + "learning_rate": 2.274247491638796e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6711841225624084, + "num_tokens": 1019954.0, + "step": 1163 + }, + { + "epoch": 10.311111111111112, + "grad_norm": 9.165814399719238, + "learning_rate": 2.267558528428094e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7337560653686523, + "num_tokens": 1021455.0, + "step": 1164 + }, + { + "epoch": 10.32, + "grad_norm": 9.73916244506836, + "learning_rate": 2.2608695652173913e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6959030628204346, + "num_tokens": 1022991.0, + "step": 1165 + }, + { + "epoch": 10.328888888888889, + "grad_norm": 9.443109512329102, + "learning_rate": 2.254180602006689e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.6874563694000244, + "num_tokens": 1024518.0, + "step": 1166 + }, + { + "epoch": 10.337777777777777, + "grad_norm": 8.762104988098145, + "learning_rate": 2.247491638795987e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7114595174789429, + "num_tokens": 1026003.0, + "step": 1167 + }, + { + "epoch": 10.346666666666668, + "grad_norm": 9.014803886413574, + "learning_rate": 2.2408026755852844e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.7107007503509521, + "num_tokens": 1027542.0, + "step": 1168 + }, + { + "epoch": 10.355555555555556, + "grad_norm": 8.868739128112793, + "learning_rate": 2.2341137123745822e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7135984301567078, + "num_tokens": 1029052.0, + "step": 1169 + }, + { + "epoch": 10.364444444444445, + "grad_norm": 11.203288078308105, + "learning_rate": 2.2274247491638796e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6849428117275238, + "num_tokens": 1030607.0, + "step": 1170 + }, + { + "epoch": 10.373333333333333, + "grad_norm": 8.221288681030273, + "learning_rate": 2.2207357859531775e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7107290029525757, + "num_tokens": 1032151.0, + "step": 1171 + }, + { + "epoch": 10.382222222222222, + "grad_norm": 8.208683967590332, + "learning_rate": 2.2140468227424753e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7205279171466827, + "num_tokens": 1033745.0, + "step": 1172 + }, + { + "epoch": 10.391111111111112, + "grad_norm": 8.299823760986328, + "learning_rate": 2.2073578595317727e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7202318608760834, + "num_tokens": 1035248.0, + "step": 1173 + }, + { + "epoch": 10.4, + "grad_norm": 8.656328201293945, + "learning_rate": 2.2006688963210706e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7260493040084839, + "num_tokens": 1036812.0, + "step": 1174 + }, + { + "epoch": 10.408888888888889, + "grad_norm": 9.027706146240234, + "learning_rate": 2.193979933110368e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7022848725318909, + "num_tokens": 1038333.0, + "step": 1175 + }, + { + "epoch": 10.417777777777777, + "grad_norm": 8.575098037719727, + "learning_rate": 2.187290969899666e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7127075493335724, + "num_tokens": 1039858.0, + "step": 1176 + }, + { + "epoch": 10.426666666666666, + "grad_norm": 8.244449615478516, + "learning_rate": 2.1806020066889633e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7200237512588501, + "num_tokens": 1041397.0, + "step": 1177 + }, + { + "epoch": 10.435555555555556, + "grad_norm": 8.872578620910645, + "learning_rate": 2.173913043478261e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7321111261844635, + "num_tokens": 1042897.0, + "step": 1178 + }, + { + "epoch": 10.444444444444445, + "grad_norm": 9.207344055175781, + "learning_rate": 2.1672240802675585e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7269322872161865, + "num_tokens": 1044447.0, + "step": 1179 + }, + { + "epoch": 10.453333333333333, + "grad_norm": 8.85890007019043, + "learning_rate": 2.1605351170568564e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7011830806732178, + "num_tokens": 1045969.0, + "step": 1180 + }, + { + "epoch": 10.462222222222222, + "grad_norm": 8.814640998840332, + "learning_rate": 2.153846153846154e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.716943085193634, + "num_tokens": 1047525.0, + "step": 1181 + }, + { + "epoch": 10.471111111111112, + "grad_norm": 8.165874481201172, + "learning_rate": 2.1471571906354516e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.717018187046051, + "num_tokens": 1049116.0, + "step": 1182 + }, + { + "epoch": 10.48, + "grad_norm": 8.150487899780273, + "learning_rate": 2.1404682274247495e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7317810654640198, + "num_tokens": 1050751.0, + "step": 1183 + }, + { + "epoch": 10.488888888888889, + "grad_norm": 8.577346801757812, + "learning_rate": 2.133779264214047e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7205539047718048, + "num_tokens": 1052271.0, + "step": 1184 + }, + { + "epoch": 10.497777777777777, + "grad_norm": 9.297968864440918, + "learning_rate": 2.1270903010033447e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7091435194015503, + "num_tokens": 1053827.0, + "step": 1185 + }, + { + "epoch": 10.506666666666666, + "grad_norm": 9.853944778442383, + "learning_rate": 2.120401337792642e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7016641199588776, + "num_tokens": 1055351.0, + "step": 1186 + }, + { + "epoch": 10.515555555555556, + "grad_norm": 9.04289722442627, + "learning_rate": 2.11371237458194e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7026011645793915, + "num_tokens": 1056826.0, + "step": 1187 + }, + { + "epoch": 10.524444444444445, + "grad_norm": 9.558469772338867, + "learning_rate": 2.1070234113712374e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7452720999717712, + "num_tokens": 1058322.0, + "step": 1188 + }, + { + "epoch": 10.533333333333333, + "grad_norm": 8.776966094970703, + "learning_rate": 2.1003344481605352e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7420634925365448, + "num_tokens": 1059843.0, + "step": 1189 + }, + { + "epoch": 10.542222222222222, + "grad_norm": 8.179105758666992, + "learning_rate": 2.093645484949833e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7576799392700195, + "num_tokens": 1061424.0, + "step": 1190 + }, + { + "epoch": 10.551111111111112, + "grad_norm": 8.988635063171387, + "learning_rate": 2.0869565217391305e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7489105463027954, + "num_tokens": 1062895.0, + "step": 1191 + }, + { + "epoch": 10.56, + "grad_norm": 8.707941055297852, + "learning_rate": 2.0802675585284283e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.726638674736023, + "num_tokens": 1064517.0, + "step": 1192 + }, + { + "epoch": 10.568888888888889, + "grad_norm": 9.591484069824219, + "learning_rate": 2.0735785953177258e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7290910482406616, + "num_tokens": 1066025.0, + "step": 1193 + }, + { + "epoch": 10.577777777777778, + "grad_norm": 8.911201477050781, + "learning_rate": 2.0668896321070236e-06, + "loss": 0.8087, + "mean_token_accuracy": 0.7478257715702057, + "num_tokens": 1067556.0, + "step": 1194 + }, + { + "epoch": 10.586666666666666, + "grad_norm": 9.220439910888672, + "learning_rate": 2.0602006688963215e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7016622424125671, + "num_tokens": 1069059.0, + "step": 1195 + }, + { + "epoch": 10.595555555555556, + "grad_norm": 8.35219955444336, + "learning_rate": 2.053511705685619e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7373862862586975, + "num_tokens": 1070664.0, + "step": 1196 + }, + { + "epoch": 10.604444444444445, + "grad_norm": 8.986740112304688, + "learning_rate": 2.0468227424749167e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7249921262264252, + "num_tokens": 1072207.0, + "step": 1197 + }, + { + "epoch": 10.613333333333333, + "grad_norm": 9.114724159240723, + "learning_rate": 2.040133779264214e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6784088909626007, + "num_tokens": 1073694.0, + "step": 1198 + }, + { + "epoch": 10.622222222222222, + "grad_norm": 9.227952003479004, + "learning_rate": 2.033444816053512e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7185415327548981, + "num_tokens": 1075235.0, + "step": 1199 + }, + { + "epoch": 10.63111111111111, + "grad_norm": 9.150552749633789, + "learning_rate": 2.0267558528428094e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7219127118587494, + "num_tokens": 1076846.0, + "step": 1200 + }, + { + "epoch": 10.64, + "grad_norm": 8.589648246765137, + "learning_rate": 2.0200668896321072e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7348284721374512, + "num_tokens": 1078430.0, + "step": 1201 + }, + { + "epoch": 10.648888888888889, + "grad_norm": 10.16896915435791, + "learning_rate": 2.0133779264214046e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.702299177646637, + "num_tokens": 1079958.0, + "step": 1202 + }, + { + "epoch": 10.657777777777778, + "grad_norm": 9.370407104492188, + "learning_rate": 2.0066889632107025e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7124338448047638, + "num_tokens": 1081537.0, + "step": 1203 + }, + { + "epoch": 10.666666666666666, + "grad_norm": 10.115323066711426, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7161619961261749, + "num_tokens": 1083058.0, + "step": 1204 + }, + { + "epoch": 10.675555555555556, + "grad_norm": 10.125144958496094, + "learning_rate": 1.9933110367892978e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6946381330490112, + "num_tokens": 1084614.0, + "step": 1205 + }, + { + "epoch": 10.684444444444445, + "grad_norm": 9.331127166748047, + "learning_rate": 1.9866220735785956e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7241120338439941, + "num_tokens": 1086204.0, + "step": 1206 + }, + { + "epoch": 10.693333333333333, + "grad_norm": 9.497456550598145, + "learning_rate": 1.979933110367893e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7166876196861267, + "num_tokens": 1087803.0, + "step": 1207 + }, + { + "epoch": 10.702222222222222, + "grad_norm": 9.382795333862305, + "learning_rate": 1.973244147157191e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7208240330219269, + "num_tokens": 1089342.0, + "step": 1208 + }, + { + "epoch": 10.71111111111111, + "grad_norm": 8.92259407043457, + "learning_rate": 1.9665551839464887e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7398748099803925, + "num_tokens": 1090892.0, + "step": 1209 + }, + { + "epoch": 10.72, + "grad_norm": 8.456918716430664, + "learning_rate": 1.959866220735786e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7439115941524506, + "num_tokens": 1092423.0, + "step": 1210 + }, + { + "epoch": 10.72888888888889, + "grad_norm": 8.657918930053711, + "learning_rate": 1.953177257525084e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7281069457530975, + "num_tokens": 1093960.0, + "step": 1211 + }, + { + "epoch": 10.737777777777778, + "grad_norm": 8.761609077453613, + "learning_rate": 1.9464882943143814e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.724135547876358, + "num_tokens": 1095538.0, + "step": 1212 + }, + { + "epoch": 10.746666666666666, + "grad_norm": 8.74243450164795, + "learning_rate": 1.9397993311036792e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7348692417144775, + "num_tokens": 1097015.0, + "step": 1213 + }, + { + "epoch": 10.755555555555556, + "grad_norm": 9.902844429016113, + "learning_rate": 1.9331103678929766e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7169565260410309, + "num_tokens": 1098587.0, + "step": 1214 + }, + { + "epoch": 10.764444444444445, + "grad_norm": 10.011761665344238, + "learning_rate": 1.9264214046822745e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7076692581176758, + "num_tokens": 1100096.0, + "step": 1215 + }, + { + "epoch": 10.773333333333333, + "grad_norm": 10.179853439331055, + "learning_rate": 1.919732441471572e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6836839020252228, + "num_tokens": 1101634.0, + "step": 1216 + }, + { + "epoch": 10.782222222222222, + "grad_norm": 8.37536907196045, + "learning_rate": 1.9130434782608697e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7353242337703705, + "num_tokens": 1103174.0, + "step": 1217 + }, + { + "epoch": 10.79111111111111, + "grad_norm": 9.145805358886719, + "learning_rate": 1.9063545150501676e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7100694477558136, + "num_tokens": 1104755.0, + "step": 1218 + }, + { + "epoch": 10.8, + "grad_norm": 9.395395278930664, + "learning_rate": 1.899665551839465e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7300034463405609, + "num_tokens": 1106275.0, + "step": 1219 + }, + { + "epoch": 10.80888888888889, + "grad_norm": 9.520753860473633, + "learning_rate": 1.8929765886287626e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7243862450122833, + "num_tokens": 1107755.0, + "step": 1220 + }, + { + "epoch": 10.817777777777778, + "grad_norm": 9.470590591430664, + "learning_rate": 1.8862876254180605e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7329618036746979, + "num_tokens": 1109306.0, + "step": 1221 + }, + { + "epoch": 10.826666666666666, + "grad_norm": 9.438454627990723, + "learning_rate": 1.879598662207358e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7189265489578247, + "num_tokens": 1110815.0, + "step": 1222 + }, + { + "epoch": 10.835555555555555, + "grad_norm": 10.700515747070312, + "learning_rate": 1.8729096989966555e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7257115840911865, + "num_tokens": 1112347.0, + "step": 1223 + }, + { + "epoch": 10.844444444444445, + "grad_norm": 9.501360893249512, + "learning_rate": 1.8662207357859531e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.738349199295044, + "num_tokens": 1113867.0, + "step": 1224 + }, + { + "epoch": 10.853333333333333, + "grad_norm": 9.49864387512207, + "learning_rate": 1.859531772575251e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7389309406280518, + "num_tokens": 1115368.0, + "step": 1225 + }, + { + "epoch": 10.862222222222222, + "grad_norm": 8.811186790466309, + "learning_rate": 1.8528428093645486e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7312729954719543, + "num_tokens": 1116956.0, + "step": 1226 + }, + { + "epoch": 10.87111111111111, + "grad_norm": 9.287123680114746, + "learning_rate": 1.8461538461538465e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7251039147377014, + "num_tokens": 1118514.0, + "step": 1227 + }, + { + "epoch": 10.88, + "grad_norm": 9.864175796508789, + "learning_rate": 1.8394648829431439e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7289265096187592, + "num_tokens": 1120038.0, + "step": 1228 + }, + { + "epoch": 10.88888888888889, + "grad_norm": 9.1536865234375, + "learning_rate": 1.8327759197324415e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7586621642112732, + "num_tokens": 1121562.0, + "step": 1229 + }, + { + "epoch": 10.897777777777778, + "grad_norm": 9.437410354614258, + "learning_rate": 1.8260869565217394e-06, + "loss": 0.858, + "mean_token_accuracy": 0.731667548418045, + "num_tokens": 1123125.0, + "step": 1230 + }, + { + "epoch": 10.906666666666666, + "grad_norm": 9.41873836517334, + "learning_rate": 1.819397993311037e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7081202566623688, + "num_tokens": 1124660.0, + "step": 1231 + }, + { + "epoch": 10.915555555555555, + "grad_norm": 9.111668586730957, + "learning_rate": 1.8127090301003348e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7155178487300873, + "num_tokens": 1126198.0, + "step": 1232 + }, + { + "epoch": 10.924444444444445, + "grad_norm": 8.775675773620605, + "learning_rate": 1.8060200668896322e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.6985008120536804, + "num_tokens": 1127740.0, + "step": 1233 + }, + { + "epoch": 10.933333333333334, + "grad_norm": 8.874909400939941, + "learning_rate": 1.7993311036789299e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7493126690387726, + "num_tokens": 1129316.0, + "step": 1234 + }, + { + "epoch": 10.942222222222222, + "grad_norm": 9.81312084197998, + "learning_rate": 1.7926421404682275e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7354390919208527, + "num_tokens": 1130814.0, + "step": 1235 + }, + { + "epoch": 10.95111111111111, + "grad_norm": 9.950927734375, + "learning_rate": 1.7859531772575253e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7304711639881134, + "num_tokens": 1132385.0, + "step": 1236 + }, + { + "epoch": 10.96, + "grad_norm": 10.69039535522461, + "learning_rate": 1.7792642140468228e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.6832348108291626, + "num_tokens": 1133926.0, + "step": 1237 + }, + { + "epoch": 10.96888888888889, + "grad_norm": 10.74488639831543, + "learning_rate": 1.7725752508361204e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7784852683544159, + "num_tokens": 1135442.0, + "step": 1238 + }, + { + "epoch": 10.977777777777778, + "grad_norm": 9.97143268585205, + "learning_rate": 1.7658862876254182e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7335735559463501, + "num_tokens": 1136969.0, + "step": 1239 + }, + { + "epoch": 10.986666666666666, + "grad_norm": 10.192907333374023, + "learning_rate": 1.7591973244147159e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7500674426555634, + "num_tokens": 1138487.0, + "step": 1240 + }, + { + "epoch": 10.995555555555555, + "grad_norm": 9.691726684570312, + "learning_rate": 1.7525083612040137e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7214130163192749, + "num_tokens": 1140010.0, + "step": 1241 + }, + { + "epoch": 11.0, + "grad_norm": 14.803547859191895, + "learning_rate": 1.7458193979933111e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7022472023963928, + "num_tokens": 1140743.0, + "step": 1242 + }, + { + "epoch": 11.008888888888889, + "grad_norm": 9.426106452941895, + "learning_rate": 1.7391304347826088e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7495324015617371, + "num_tokens": 1142305.0, + "step": 1243 + }, + { + "epoch": 11.017777777777777, + "grad_norm": 10.356644630432129, + "learning_rate": 1.7324414715719066e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.6773306429386139, + "num_tokens": 1143865.0, + "step": 1244 + }, + { + "epoch": 11.026666666666667, + "grad_norm": 9.611288070678711, + "learning_rate": 1.7257525083612042e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7428846955299377, + "num_tokens": 1145405.0, + "step": 1245 + }, + { + "epoch": 11.035555555555556, + "grad_norm": 8.847456932067871, + "learning_rate": 1.7190635451505019e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7524968385696411, + "num_tokens": 1146974.0, + "step": 1246 + }, + { + "epoch": 11.044444444444444, + "grad_norm": 9.132477760314941, + "learning_rate": 1.7123745819397995e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7442333996295929, + "num_tokens": 1148524.0, + "step": 1247 + }, + { + "epoch": 11.053333333333333, + "grad_norm": 12.014487266540527, + "learning_rate": 1.7056856187290971e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7456110417842865, + "num_tokens": 1150033.0, + "step": 1248 + }, + { + "epoch": 11.062222222222223, + "grad_norm": 10.503037452697754, + "learning_rate": 1.6989966555183947e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7642403542995453, + "num_tokens": 1151540.0, + "step": 1249 + }, + { + "epoch": 11.071111111111112, + "grad_norm": 9.17188549041748, + "learning_rate": 1.6923076923076926e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7654966413974762, + "num_tokens": 1153118.0, + "step": 1250 + }, + { + "epoch": 11.08, + "grad_norm": 8.896985054016113, + "learning_rate": 1.68561872909699e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7337823808193207, + "num_tokens": 1154662.0, + "step": 1251 + }, + { + "epoch": 11.088888888888889, + "grad_norm": 9.703432083129883, + "learning_rate": 1.6789297658862876e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7098216712474823, + "num_tokens": 1156245.0, + "step": 1252 + }, + { + "epoch": 11.097777777777777, + "grad_norm": 10.221484184265137, + "learning_rate": 1.6722408026755855e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7289004921913147, + "num_tokens": 1157780.0, + "step": 1253 + }, + { + "epoch": 11.106666666666667, + "grad_norm": 9.041837692260742, + "learning_rate": 1.6655518394648831e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7599569857120514, + "num_tokens": 1159358.0, + "step": 1254 + }, + { + "epoch": 11.115555555555556, + "grad_norm": 8.830787658691406, + "learning_rate": 1.658862876254181e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7451253533363342, + "num_tokens": 1160885.0, + "step": 1255 + }, + { + "epoch": 11.124444444444444, + "grad_norm": 10.50179672241211, + "learning_rate": 1.6521739130434784e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6972069442272186, + "num_tokens": 1162439.0, + "step": 1256 + }, + { + "epoch": 11.133333333333333, + "grad_norm": 9.393866539001465, + "learning_rate": 1.645484949832776e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7274407744407654, + "num_tokens": 1163986.0, + "step": 1257 + }, + { + "epoch": 11.142222222222221, + "grad_norm": 8.099898338317871, + "learning_rate": 1.6387959866220736e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7301158308982849, + "num_tokens": 1165526.0, + "step": 1258 + }, + { + "epoch": 11.151111111111112, + "grad_norm": 9.189102172851562, + "learning_rate": 1.6321070234113715e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7427593171596527, + "num_tokens": 1167074.0, + "step": 1259 + }, + { + "epoch": 11.16, + "grad_norm": 8.817166328430176, + "learning_rate": 1.625418060200669e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7474271655082703, + "num_tokens": 1168587.0, + "step": 1260 + }, + { + "epoch": 11.168888888888889, + "grad_norm": 9.122915267944336, + "learning_rate": 1.6187290969899665e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7379071414470673, + "num_tokens": 1170227.0, + "step": 1261 + }, + { + "epoch": 11.177777777777777, + "grad_norm": 10.05552864074707, + "learning_rate": 1.6120401337792644e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7526738047599792, + "num_tokens": 1171770.0, + "step": 1262 + }, + { + "epoch": 11.186666666666667, + "grad_norm": 11.068049430847168, + "learning_rate": 1.605351170568562e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7252320051193237, + "num_tokens": 1173292.0, + "step": 1263 + }, + { + "epoch": 11.195555555555556, + "grad_norm": 10.239787101745605, + "learning_rate": 1.5986622073578598e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7152619063854218, + "num_tokens": 1174800.0, + "step": 1264 + }, + { + "epoch": 11.204444444444444, + "grad_norm": 10.203516006469727, + "learning_rate": 1.5919732441471573e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7535128593444824, + "num_tokens": 1176338.0, + "step": 1265 + }, + { + "epoch": 11.213333333333333, + "grad_norm": 8.80499267578125, + "learning_rate": 1.5852842809364549e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7711828649044037, + "num_tokens": 1177919.0, + "step": 1266 + }, + { + "epoch": 11.222222222222221, + "grad_norm": 12.050443649291992, + "learning_rate": 1.5785953177257527e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7158454954624176, + "num_tokens": 1179443.0, + "step": 1267 + }, + { + "epoch": 11.231111111111112, + "grad_norm": 10.994086265563965, + "learning_rate": 1.5719063545150504e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6741707623004913, + "num_tokens": 1180946.0, + "step": 1268 + }, + { + "epoch": 11.24, + "grad_norm": 10.186769485473633, + "learning_rate": 1.565217391304348e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7389181554317474, + "num_tokens": 1182474.0, + "step": 1269 + }, + { + "epoch": 11.248888888888889, + "grad_norm": 9.398778915405273, + "learning_rate": 1.5585284280936456e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7416382431983948, + "num_tokens": 1184033.0, + "step": 1270 + }, + { + "epoch": 11.257777777777777, + "grad_norm": 10.37645149230957, + "learning_rate": 1.5518394648829432e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7285986840724945, + "num_tokens": 1185577.0, + "step": 1271 + }, + { + "epoch": 11.266666666666667, + "grad_norm": 10.098435401916504, + "learning_rate": 1.5451505016722409e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7467935085296631, + "num_tokens": 1187082.0, + "step": 1272 + }, + { + "epoch": 11.275555555555556, + "grad_norm": 9.915616989135742, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7425287663936615, + "num_tokens": 1188657.0, + "step": 1273 + }, + { + "epoch": 11.284444444444444, + "grad_norm": 9.656970977783203, + "learning_rate": 1.5317725752508361e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7414161264896393, + "num_tokens": 1190230.0, + "step": 1274 + }, + { + "epoch": 11.293333333333333, + "grad_norm": 9.647955894470215, + "learning_rate": 1.5250836120401338e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7316479682922363, + "num_tokens": 1191756.0, + "step": 1275 + }, + { + "epoch": 11.302222222222222, + "grad_norm": 10.82711410522461, + "learning_rate": 1.5183946488294316e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7284018993377686, + "num_tokens": 1193225.0, + "step": 1276 + }, + { + "epoch": 11.311111111111112, + "grad_norm": 11.43072509765625, + "learning_rate": 1.5117056856187292e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7073327302932739, + "num_tokens": 1194775.0, + "step": 1277 + }, + { + "epoch": 11.32, + "grad_norm": 10.278733253479004, + "learning_rate": 1.505016722408027e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.765709638595581, + "num_tokens": 1196267.0, + "step": 1278 + }, + { + "epoch": 11.328888888888889, + "grad_norm": 10.071756362915039, + "learning_rate": 1.4983277591973245e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7428359687328339, + "num_tokens": 1197803.0, + "step": 1279 + }, + { + "epoch": 11.337777777777777, + "grad_norm": 10.58071231842041, + "learning_rate": 1.4916387959866221e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7167567610740662, + "num_tokens": 1199338.0, + "step": 1280 + }, + { + "epoch": 11.346666666666668, + "grad_norm": 9.963506698608398, + "learning_rate": 1.4849498327759198e-06, + "loss": 0.8245, + "mean_token_accuracy": 0.7314527332782745, + "num_tokens": 1200873.0, + "step": 1281 + }, + { + "epoch": 11.355555555555556, + "grad_norm": 11.868447303771973, + "learning_rate": 1.4782608695652176e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7209419906139374, + "num_tokens": 1202387.0, + "step": 1282 + }, + { + "epoch": 11.364444444444445, + "grad_norm": 9.845200538635254, + "learning_rate": 1.4715719063545152e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7301340699195862, + "num_tokens": 1203907.0, + "step": 1283 + }, + { + "epoch": 11.373333333333333, + "grad_norm": 10.199912071228027, + "learning_rate": 1.4648829431438126e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7293265759944916, + "num_tokens": 1205493.0, + "step": 1284 + }, + { + "epoch": 11.382222222222222, + "grad_norm": 11.473793029785156, + "learning_rate": 1.4581939799331105e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7049669027328491, + "num_tokens": 1207123.0, + "step": 1285 + }, + { + "epoch": 11.391111111111112, + "grad_norm": 9.913331031799316, + "learning_rate": 1.4515050167224081e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7525153756141663, + "num_tokens": 1208657.0, + "step": 1286 + }, + { + "epoch": 11.4, + "grad_norm": 9.227319717407227, + "learning_rate": 1.444816053511706e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7375368177890778, + "num_tokens": 1210169.0, + "step": 1287 + }, + { + "epoch": 11.408888888888889, + "grad_norm": 9.736603736877441, + "learning_rate": 1.4381270903010034e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7417746186256409, + "num_tokens": 1211749.0, + "step": 1288 + }, + { + "epoch": 11.417777777777777, + "grad_norm": 9.877978324890137, + "learning_rate": 1.431438127090301e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.745613306760788, + "num_tokens": 1213265.0, + "step": 1289 + }, + { + "epoch": 11.426666666666666, + "grad_norm": 9.483874320983887, + "learning_rate": 1.4247491638795989e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7481759488582611, + "num_tokens": 1214824.0, + "step": 1290 + }, + { + "epoch": 11.435555555555556, + "grad_norm": 9.994118690490723, + "learning_rate": 1.4180602006688965e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.6899648904800415, + "num_tokens": 1216364.0, + "step": 1291 + }, + { + "epoch": 11.444444444444445, + "grad_norm": 9.605559349060059, + "learning_rate": 1.4113712374581941e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7389807105064392, + "num_tokens": 1217904.0, + "step": 1292 + }, + { + "epoch": 11.453333333333333, + "grad_norm": 9.763737678527832, + "learning_rate": 1.4046822742474917e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7436649799346924, + "num_tokens": 1219417.0, + "step": 1293 + }, + { + "epoch": 11.462222222222222, + "grad_norm": 10.235578536987305, + "learning_rate": 1.3979933110367894e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7638567686080933, + "num_tokens": 1220973.0, + "step": 1294 + }, + { + "epoch": 11.471111111111112, + "grad_norm": 10.566251754760742, + "learning_rate": 1.391304347826087e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7424421906471252, + "num_tokens": 1222519.0, + "step": 1295 + }, + { + "epoch": 11.48, + "grad_norm": 9.725247383117676, + "learning_rate": 1.3846153846153848e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7333829998970032, + "num_tokens": 1224033.0, + "step": 1296 + }, + { + "epoch": 11.488888888888889, + "grad_norm": 10.112412452697754, + "learning_rate": 1.3779264214046825e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.777823805809021, + "num_tokens": 1225553.0, + "step": 1297 + }, + { + "epoch": 11.497777777777777, + "grad_norm": 12.48869514465332, + "learning_rate": 1.37123745819398e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.6983858942985535, + "num_tokens": 1227040.0, + "step": 1298 + }, + { + "epoch": 11.506666666666666, + "grad_norm": 10.945323944091797, + "learning_rate": 1.3645484949832777e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7533878684043884, + "num_tokens": 1228606.0, + "step": 1299 + }, + { + "epoch": 11.515555555555556, + "grad_norm": 10.367324829101562, + "learning_rate": 1.3578595317725754e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7605359554290771, + "num_tokens": 1230166.0, + "step": 1300 + }, + { + "epoch": 11.524444444444445, + "grad_norm": 10.754217147827148, + "learning_rate": 1.3511705685618732e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7673335671424866, + "num_tokens": 1231681.0, + "step": 1301 + }, + { + "epoch": 11.533333333333333, + "grad_norm": 10.28548526763916, + "learning_rate": 1.3444816053511706e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7334432303905487, + "num_tokens": 1233225.0, + "step": 1302 + }, + { + "epoch": 11.542222222222222, + "grad_norm": 12.382189750671387, + "learning_rate": 1.3377926421404683e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.719428300857544, + "num_tokens": 1234742.0, + "step": 1303 + }, + { + "epoch": 11.551111111111112, + "grad_norm": 13.96876049041748, + "learning_rate": 1.331103678929766e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7012206614017487, + "num_tokens": 1236214.0, + "step": 1304 + }, + { + "epoch": 11.56, + "grad_norm": 11.204610824584961, + "learning_rate": 1.3244147157190637e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7313401103019714, + "num_tokens": 1237765.0, + "step": 1305 + }, + { + "epoch": 11.568888888888889, + "grad_norm": 11.702737808227539, + "learning_rate": 1.3177257525083614e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7349664866924286, + "num_tokens": 1239262.0, + "step": 1306 + }, + { + "epoch": 11.577777777777778, + "grad_norm": 10.65979290008545, + "learning_rate": 1.3110367892976588e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7151464819908142, + "num_tokens": 1240735.0, + "step": 1307 + }, + { + "epoch": 11.586666666666666, + "grad_norm": 10.745150566101074, + "learning_rate": 1.3043478260869566e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7620976269245148, + "num_tokens": 1242279.0, + "step": 1308 + }, + { + "epoch": 11.595555555555556, + "grad_norm": 9.68779182434082, + "learning_rate": 1.2976588628762542e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.752841979265213, + "num_tokens": 1243851.0, + "step": 1309 + }, + { + "epoch": 11.604444444444445, + "grad_norm": 10.154099464416504, + "learning_rate": 1.290969899665552e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7413276433944702, + "num_tokens": 1245374.0, + "step": 1310 + }, + { + "epoch": 11.613333333333333, + "grad_norm": 9.835077285766602, + "learning_rate": 1.2842809364548495e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7356804311275482, + "num_tokens": 1246921.0, + "step": 1311 + }, + { + "epoch": 11.622222222222222, + "grad_norm": 11.166604042053223, + "learning_rate": 1.2775919732441471e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7008466124534607, + "num_tokens": 1248426.0, + "step": 1312 + }, + { + "epoch": 11.63111111111111, + "grad_norm": 12.781075477600098, + "learning_rate": 1.270903010033445e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.7529019713401794, + "num_tokens": 1249994.0, + "step": 1313 + }, + { + "epoch": 11.64, + "grad_norm": 10.562948226928711, + "learning_rate": 1.2642140468227426e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7390634417533875, + "num_tokens": 1251553.0, + "step": 1314 + }, + { + "epoch": 11.648888888888889, + "grad_norm": 9.466314315795898, + "learning_rate": 1.2575250836120402e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7510727941989899, + "num_tokens": 1253162.0, + "step": 1315 + }, + { + "epoch": 11.657777777777778, + "grad_norm": 10.16841983795166, + "learning_rate": 1.2508361204013379e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7552642226219177, + "num_tokens": 1254701.0, + "step": 1316 + }, + { + "epoch": 11.666666666666666, + "grad_norm": 10.048696517944336, + "learning_rate": 1.2441471571906355e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.744918704032898, + "num_tokens": 1256220.0, + "step": 1317 + }, + { + "epoch": 11.675555555555556, + "grad_norm": 10.12587833404541, + "learning_rate": 1.2374581939799331e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7411422431468964, + "num_tokens": 1257762.0, + "step": 1318 + }, + { + "epoch": 11.684444444444445, + "grad_norm": 11.20529556274414, + "learning_rate": 1.230769230769231e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7191186547279358, + "num_tokens": 1259329.0, + "step": 1319 + }, + { + "epoch": 11.693333333333333, + "grad_norm": 9.212910652160645, + "learning_rate": 1.2240802675585286e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7477546334266663, + "num_tokens": 1260940.0, + "step": 1320 + }, + { + "epoch": 11.702222222222222, + "grad_norm": 11.681827545166016, + "learning_rate": 1.2173913043478262e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7102537751197815, + "num_tokens": 1262455.0, + "step": 1321 + }, + { + "epoch": 11.71111111111111, + "grad_norm": 10.266230583190918, + "learning_rate": 1.2107023411371239e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7559337019920349, + "num_tokens": 1264039.0, + "step": 1322 + }, + { + "epoch": 11.72, + "grad_norm": 10.404054641723633, + "learning_rate": 1.2040133779264215e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.727322518825531, + "num_tokens": 1265607.0, + "step": 1323 + }, + { + "epoch": 11.72888888888889, + "grad_norm": 11.820804595947266, + "learning_rate": 1.1973244147157191e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7324827611446381, + "num_tokens": 1267221.0, + "step": 1324 + }, + { + "epoch": 11.737777777777778, + "grad_norm": 12.820998191833496, + "learning_rate": 1.1906354515050168e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7003381252288818, + "num_tokens": 1268745.0, + "step": 1325 + }, + { + "epoch": 11.746666666666666, + "grad_norm": 11.24789047241211, + "learning_rate": 1.1839464882943144e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7292083203792572, + "num_tokens": 1270271.0, + "step": 1326 + }, + { + "epoch": 11.755555555555556, + "grad_norm": 10.463576316833496, + "learning_rate": 1.1772575250836122e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7706163823604584, + "num_tokens": 1271812.0, + "step": 1327 + }, + { + "epoch": 11.764444444444445, + "grad_norm": 10.55777359008789, + "learning_rate": 1.1705685618729099e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7448605298995972, + "num_tokens": 1273422.0, + "step": 1328 + }, + { + "epoch": 11.773333333333333, + "grad_norm": 9.933938980102539, + "learning_rate": 1.1638795986622075e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7308960556983948, + "num_tokens": 1275013.0, + "step": 1329 + }, + { + "epoch": 11.782222222222222, + "grad_norm": 10.806668281555176, + "learning_rate": 1.1571906354515051e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7102511525154114, + "num_tokens": 1276593.0, + "step": 1330 + }, + { + "epoch": 11.79111111111111, + "grad_norm": 10.983059883117676, + "learning_rate": 1.1505016722408027e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7398191392421722, + "num_tokens": 1278158.0, + "step": 1331 + }, + { + "epoch": 11.8, + "grad_norm": 10.983460426330566, + "learning_rate": 1.1438127090301004e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7161470651626587, + "num_tokens": 1279630.0, + "step": 1332 + }, + { + "epoch": 11.80888888888889, + "grad_norm": 10.257384300231934, + "learning_rate": 1.137123745819398e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7453327775001526, + "num_tokens": 1281163.0, + "step": 1333 + }, + { + "epoch": 11.817777777777778, + "grad_norm": 10.935276985168457, + "learning_rate": 1.1304347826086956e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7039744555950165, + "num_tokens": 1282660.0, + "step": 1334 + }, + { + "epoch": 11.826666666666666, + "grad_norm": 11.620217323303223, + "learning_rate": 1.1237458193979935e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7362974584102631, + "num_tokens": 1284250.0, + "step": 1335 + }, + { + "epoch": 11.835555555555555, + "grad_norm": 10.642646789550781, + "learning_rate": 1.1170568561872911e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.6967872679233551, + "num_tokens": 1285792.0, + "step": 1336 + }, + { + "epoch": 11.844444444444445, + "grad_norm": 11.10942268371582, + "learning_rate": 1.1103678929765887e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7384042739868164, + "num_tokens": 1287293.0, + "step": 1337 + }, + { + "epoch": 11.853333333333333, + "grad_norm": 10.504341125488281, + "learning_rate": 1.1036789297658864e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7801645994186401, + "num_tokens": 1288855.0, + "step": 1338 + }, + { + "epoch": 11.862222222222222, + "grad_norm": 11.431352615356445, + "learning_rate": 1.096989966555184e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7341761291027069, + "num_tokens": 1290370.0, + "step": 1339 + }, + { + "epoch": 11.87111111111111, + "grad_norm": 10.578594207763672, + "learning_rate": 1.0903010033444816e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7224303483963013, + "num_tokens": 1291981.0, + "step": 1340 + }, + { + "epoch": 11.88, + "grad_norm": 9.791159629821777, + "learning_rate": 1.0836120401337793e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7679729759693146, + "num_tokens": 1293560.0, + "step": 1341 + }, + { + "epoch": 11.88888888888889, + "grad_norm": 11.115952491760254, + "learning_rate": 1.076923076923077e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7187186181545258, + "num_tokens": 1295063.0, + "step": 1342 + }, + { + "epoch": 11.897777777777778, + "grad_norm": 10.795344352722168, + "learning_rate": 1.0702341137123747e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7404151558876038, + "num_tokens": 1296640.0, + "step": 1343 + }, + { + "epoch": 11.906666666666666, + "grad_norm": 11.657620429992676, + "learning_rate": 1.0635451505016724e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7328090667724609, + "num_tokens": 1298157.0, + "step": 1344 + }, + { + "epoch": 11.915555555555555, + "grad_norm": 10.503164291381836, + "learning_rate": 1.05685618729097e-06, + "loss": 0.6844, + "mean_token_accuracy": 0.7819617390632629, + "num_tokens": 1299671.0, + "step": 1345 + }, + { + "epoch": 11.924444444444445, + "grad_norm": 9.782106399536133, + "learning_rate": 1.0501672240802676e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7555056810379028, + "num_tokens": 1301236.0, + "step": 1346 + }, + { + "epoch": 11.933333333333334, + "grad_norm": 10.800812721252441, + "learning_rate": 1.0434782608695653e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7799651920795441, + "num_tokens": 1302793.0, + "step": 1347 + }, + { + "epoch": 11.942222222222222, + "grad_norm": 10.409107208251953, + "learning_rate": 1.0367892976588629e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7560741901397705, + "num_tokens": 1304395.0, + "step": 1348 + }, + { + "epoch": 11.95111111111111, + "grad_norm": 10.702130317687988, + "learning_rate": 1.0301003344481607e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.7918002307415009, + "num_tokens": 1305934.0, + "step": 1349 + }, + { + "epoch": 11.96, + "grad_norm": 11.875544548034668, + "learning_rate": 1.0234113712374584e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7206161022186279, + "num_tokens": 1307459.0, + "step": 1350 + }, + { + "epoch": 11.96888888888889, + "grad_norm": 11.350446701049805, + "learning_rate": 1.016722408026756e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.717968612909317, + "num_tokens": 1308980.0, + "step": 1351 + }, + { + "epoch": 11.977777777777778, + "grad_norm": 11.702728271484375, + "learning_rate": 1.0100334448160536e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.6972176730632782, + "num_tokens": 1310538.0, + "step": 1352 + }, + { + "epoch": 11.986666666666666, + "grad_norm": 11.754011154174805, + "learning_rate": 1.0033444816053512e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7406153976917267, + "num_tokens": 1312088.0, + "step": 1353 + }, + { + "epoch": 11.995555555555555, + "grad_norm": 10.950407981872559, + "learning_rate": 9.966555183946489e-07, + "loss": 0.8691, + "mean_token_accuracy": 0.7280450761318207, + "num_tokens": 1313703.0, + "step": 1354 + }, + { + "epoch": 12.0, + "grad_norm": 15.17965316772461, + "learning_rate": 9.899665551839465e-07, + "loss": 0.9118, + "mean_token_accuracy": 0.7063291072845459, + "num_tokens": 1314504.0, + "step": 1355 + }, + { + "epoch": 12.008888888888889, + "grad_norm": 11.19985580444336, + "learning_rate": 9.832775919732443e-07, + "loss": 0.7856, + "mean_token_accuracy": 0.7488973140716553, + "num_tokens": 1316070.0, + "step": 1356 + }, + { + "epoch": 12.017777777777777, + "grad_norm": 11.47360610961914, + "learning_rate": 9.76588628762542e-07, + "loss": 0.7395, + "mean_token_accuracy": 0.7574574053287506, + "num_tokens": 1317552.0, + "step": 1357 + }, + { + "epoch": 12.026666666666667, + "grad_norm": 10.085973739624023, + "learning_rate": 9.698996655518396e-07, + "loss": 0.8479, + "mean_token_accuracy": 0.739927738904953, + "num_tokens": 1319105.0, + "step": 1358 + }, + { + "epoch": 12.035555555555556, + "grad_norm": 10.451904296875, + "learning_rate": 9.632107023411372e-07, + "loss": 0.7412, + "mean_token_accuracy": 0.7783210575580597, + "num_tokens": 1320673.0, + "step": 1359 + }, + { + "epoch": 12.044444444444444, + "grad_norm": 11.603410720825195, + "learning_rate": 9.565217391304349e-07, + "loss": 0.8734, + "mean_token_accuracy": 0.7277852296829224, + "num_tokens": 1322229.0, + "step": 1360 + }, + { + "epoch": 12.053333333333333, + "grad_norm": 11.71084213256836, + "learning_rate": 9.498327759197325e-07, + "loss": 0.7343, + "mean_token_accuracy": 0.7503126561641693, + "num_tokens": 1323824.0, + "step": 1361 + }, + { + "epoch": 12.062222222222223, + "grad_norm": 10.450995445251465, + "learning_rate": 9.431438127090302e-07, + "loss": 0.8466, + "mean_token_accuracy": 0.7504589557647705, + "num_tokens": 1325345.0, + "step": 1362 + }, + { + "epoch": 12.071111111111112, + "grad_norm": 11.271384239196777, + "learning_rate": 9.364548494983278e-07, + "loss": 0.7492, + "mean_token_accuracy": 0.7569797933101654, + "num_tokens": 1326896.0, + "step": 1363 + }, + { + "epoch": 12.08, + "grad_norm": 11.015911102294922, + "learning_rate": 9.297658862876255e-07, + "loss": 0.7734, + "mean_token_accuracy": 0.7461153268814087, + "num_tokens": 1328437.0, + "step": 1364 + }, + { + "epoch": 12.088888888888889, + "grad_norm": 11.115043640136719, + "learning_rate": 9.230769230769232e-07, + "loss": 0.8714, + "mean_token_accuracy": 0.7363775968551636, + "num_tokens": 1329997.0, + "step": 1365 + }, + { + "epoch": 12.097777777777777, + "grad_norm": 10.791339874267578, + "learning_rate": 9.163879598662208e-07, + "loss": 0.873, + "mean_token_accuracy": 0.7471760511398315, + "num_tokens": 1331529.0, + "step": 1366 + }, + { + "epoch": 12.106666666666667, + "grad_norm": 11.784246444702148, + "learning_rate": 9.096989966555185e-07, + "loss": 0.8861, + "mean_token_accuracy": 0.7247993350028992, + "num_tokens": 1333061.0, + "step": 1367 + }, + { + "epoch": 12.115555555555556, + "grad_norm": 10.800557136535645, + "learning_rate": 9.030100334448161e-07, + "loss": 0.7845, + "mean_token_accuracy": 0.7613691687583923, + "num_tokens": 1334621.0, + "step": 1368 + }, + { + "epoch": 12.124444444444444, + "grad_norm": 11.521154403686523, + "learning_rate": 8.963210702341138e-07, + "loss": 0.8424, + "mean_token_accuracy": 0.7419287860393524, + "num_tokens": 1336146.0, + "step": 1369 + }, + { + "epoch": 12.133333333333333, + "grad_norm": 10.115452766418457, + "learning_rate": 8.896321070234114e-07, + "loss": 0.6908, + "mean_token_accuracy": 0.7758003175258636, + "num_tokens": 1337727.0, + "step": 1370 + }, + { + "epoch": 12.142222222222221, + "grad_norm": 10.107575416564941, + "learning_rate": 8.829431438127091e-07, + "loss": 0.7941, + "mean_token_accuracy": 0.7503213286399841, + "num_tokens": 1339314.0, + "step": 1371 + }, + { + "epoch": 12.151111111111112, + "grad_norm": 11.163673400878906, + "learning_rate": 8.762541806020069e-07, + "loss": 0.7382, + "mean_token_accuracy": 0.7746748626232147, + "num_tokens": 1340898.0, + "step": 1372 + }, + { + "epoch": 12.16, + "grad_norm": 12.687785148620605, + "learning_rate": 8.695652173913044e-07, + "loss": 0.948, + "mean_token_accuracy": 0.7144178152084351, + "num_tokens": 1342438.0, + "step": 1373 + }, + { + "epoch": 12.168888888888889, + "grad_norm": 11.149510383605957, + "learning_rate": 8.628762541806021e-07, + "loss": 0.8243, + "mean_token_accuracy": 0.7559310495853424, + "num_tokens": 1343975.0, + "step": 1374 + }, + { + "epoch": 12.177777777777777, + "grad_norm": 11.115209579467773, + "learning_rate": 8.561872909698997e-07, + "loss": 0.7495, + "mean_token_accuracy": 0.7407161891460419, + "num_tokens": 1345479.0, + "step": 1375 + }, + { + "epoch": 12.186666666666667, + "grad_norm": 13.132133483886719, + "learning_rate": 8.494983277591974e-07, + "loss": 0.7995, + "mean_token_accuracy": 0.7394975423812866, + "num_tokens": 1346986.0, + "step": 1376 + }, + { + "epoch": 12.195555555555556, + "grad_norm": 11.011582374572754, + "learning_rate": 8.42809364548495e-07, + "loss": 0.8427, + "mean_token_accuracy": 0.7303920686244965, + "num_tokens": 1348513.0, + "step": 1377 + }, + { + "epoch": 12.204444444444444, + "grad_norm": 13.303811073303223, + "learning_rate": 8.361204013377927e-07, + "loss": 0.8565, + "mean_token_accuracy": 0.735723465681076, + "num_tokens": 1350014.0, + "step": 1378 + }, + { + "epoch": 12.213333333333333, + "grad_norm": 11.410381317138672, + "learning_rate": 8.294314381270905e-07, + "loss": 0.8691, + "mean_token_accuracy": 0.7279907166957855, + "num_tokens": 1351605.0, + "step": 1379 + }, + { + "epoch": 12.222222222222221, + "grad_norm": 13.394940376281738, + "learning_rate": 8.22742474916388e-07, + "loss": 0.9146, + "mean_token_accuracy": 0.7259432971477509, + "num_tokens": 1353134.0, + "step": 1380 + }, + { + "epoch": 12.231111111111112, + "grad_norm": 14.60746955871582, + "learning_rate": 8.160535117056857e-07, + "loss": 0.9115, + "mean_token_accuracy": 0.7078434824943542, + "num_tokens": 1354739.0, + "step": 1381 + }, + { + "epoch": 12.24, + "grad_norm": 12.181891441345215, + "learning_rate": 8.093645484949833e-07, + "loss": 0.873, + "mean_token_accuracy": 0.7215684652328491, + "num_tokens": 1356269.0, + "step": 1382 + }, + { + "epoch": 12.248888888888889, + "grad_norm": 13.190144538879395, + "learning_rate": 8.02675585284281e-07, + "loss": 0.8553, + "mean_token_accuracy": 0.7164647877216339, + "num_tokens": 1357799.0, + "step": 1383 + }, + { + "epoch": 12.257777777777777, + "grad_norm": 11.18969440460205, + "learning_rate": 7.959866220735786e-07, + "loss": 0.8517, + "mean_token_accuracy": 0.7479919791221619, + "num_tokens": 1359362.0, + "step": 1384 + }, + { + "epoch": 12.266666666666667, + "grad_norm": 11.302090644836426, + "learning_rate": 7.892976588628764e-07, + "loss": 0.8954, + "mean_token_accuracy": 0.7263404428958893, + "num_tokens": 1360911.0, + "step": 1385 + }, + { + "epoch": 12.275555555555556, + "grad_norm": 12.10798168182373, + "learning_rate": 7.82608695652174e-07, + "loss": 0.9042, + "mean_token_accuracy": 0.7121402025222778, + "num_tokens": 1362437.0, + "step": 1386 + }, + { + "epoch": 12.284444444444444, + "grad_norm": 13.394943237304688, + "learning_rate": 7.759197324414716e-07, + "loss": 0.8464, + "mean_token_accuracy": 0.723204642534256, + "num_tokens": 1363951.0, + "step": 1387 + }, + { + "epoch": 12.293333333333333, + "grad_norm": 11.187024116516113, + "learning_rate": 7.692307692307694e-07, + "loss": 0.8246, + "mean_token_accuracy": 0.7538046538829803, + "num_tokens": 1365439.0, + "step": 1388 + }, + { + "epoch": 12.302222222222222, + "grad_norm": 11.444994926452637, + "learning_rate": 7.625418060200669e-07, + "loss": 0.8602, + "mean_token_accuracy": 0.7231144905090332, + "num_tokens": 1366958.0, + "step": 1389 + }, + { + "epoch": 12.311111111111112, + "grad_norm": 10.64663314819336, + "learning_rate": 7.558528428093646e-07, + "loss": 0.8256, + "mean_token_accuracy": 0.7380082309246063, + "num_tokens": 1368535.0, + "step": 1390 + }, + { + "epoch": 12.32, + "grad_norm": 11.3572998046875, + "learning_rate": 7.491638795986622e-07, + "loss": 0.8264, + "mean_token_accuracy": 0.752658873796463, + "num_tokens": 1370088.0, + "step": 1391 + }, + { + "epoch": 12.328888888888889, + "grad_norm": 11.421731948852539, + "learning_rate": 7.424749163879599e-07, + "loss": 0.8283, + "mean_token_accuracy": 0.7526355683803558, + "num_tokens": 1371621.0, + "step": 1392 + }, + { + "epoch": 12.337777777777777, + "grad_norm": 11.156582832336426, + "learning_rate": 7.357859531772576e-07, + "loss": 0.8818, + "mean_token_accuracy": 0.7278747260570526, + "num_tokens": 1373147.0, + "step": 1393 + }, + { + "epoch": 12.346666666666668, + "grad_norm": 11.013882637023926, + "learning_rate": 7.290969899665552e-07, + "loss": 0.7917, + "mean_token_accuracy": 0.7407235205173492, + "num_tokens": 1374636.0, + "step": 1394 + }, + { + "epoch": 12.355555555555556, + "grad_norm": 11.782276153564453, + "learning_rate": 7.22408026755853e-07, + "loss": 0.8019, + "mean_token_accuracy": 0.7332910299301147, + "num_tokens": 1376164.0, + "step": 1395 + }, + { + "epoch": 12.364444444444445, + "grad_norm": 10.177056312561035, + "learning_rate": 7.157190635451505e-07, + "loss": 0.8104, + "mean_token_accuracy": 0.7447313070297241, + "num_tokens": 1377701.0, + "step": 1396 + }, + { + "epoch": 12.373333333333333, + "grad_norm": 11.28663444519043, + "learning_rate": 7.090301003344482e-07, + "loss": 0.8352, + "mean_token_accuracy": 0.7181954681873322, + "num_tokens": 1379234.0, + "step": 1397 + }, + { + "epoch": 12.382222222222222, + "grad_norm": 10.805381774902344, + "learning_rate": 7.023411371237459e-07, + "loss": 0.7587, + "mean_token_accuracy": 0.7621866166591644, + "num_tokens": 1380758.0, + "step": 1398 + }, + { + "epoch": 12.391111111111112, + "grad_norm": 10.939835548400879, + "learning_rate": 6.956521739130435e-07, + "loss": 0.867, + "mean_token_accuracy": 0.7197368443012238, + "num_tokens": 1382336.0, + "step": 1399 + }, + { + "epoch": 12.4, + "grad_norm": 11.682055473327637, + "learning_rate": 6.889632107023412e-07, + "loss": 0.8479, + "mean_token_accuracy": 0.7068181931972504, + "num_tokens": 1383880.0, + "step": 1400 + }, + { + "epoch": 12.408888888888889, + "grad_norm": 10.942253112792969, + "learning_rate": 6.822742474916389e-07, + "loss": 0.7747, + "mean_token_accuracy": 0.750813752412796, + "num_tokens": 1385410.0, + "step": 1401 + }, + { + "epoch": 12.417777777777777, + "grad_norm": 11.189682006835938, + "learning_rate": 6.755852842809366e-07, + "loss": 0.9807, + "mean_token_accuracy": 0.7228166460990906, + "num_tokens": 1386957.0, + "step": 1402 + }, + { + "epoch": 12.426666666666666, + "grad_norm": 10.950565338134766, + "learning_rate": 6.688963210702341e-07, + "loss": 0.7857, + "mean_token_accuracy": 0.757340669631958, + "num_tokens": 1388480.0, + "step": 1403 + }, + { + "epoch": 12.435555555555556, + "grad_norm": 11.337262153625488, + "learning_rate": 6.622073578595319e-07, + "loss": 0.8658, + "mean_token_accuracy": 0.7306015193462372, + "num_tokens": 1390038.0, + "step": 1404 + }, + { + "epoch": 12.444444444444445, + "grad_norm": 10.33862590789795, + "learning_rate": 6.555183946488294e-07, + "loss": 0.8286, + "mean_token_accuracy": 0.7430371344089508, + "num_tokens": 1391566.0, + "step": 1405 + }, + { + "epoch": 12.453333333333333, + "grad_norm": 10.075733184814453, + "learning_rate": 6.488294314381271e-07, + "loss": 0.7298, + "mean_token_accuracy": 0.7688927054405212, + "num_tokens": 1393107.0, + "step": 1406 + }, + { + "epoch": 12.462222222222222, + "grad_norm": 10.388235092163086, + "learning_rate": 6.421404682274248e-07, + "loss": 0.757, + "mean_token_accuracy": 0.7737459242343903, + "num_tokens": 1394696.0, + "step": 1407 + }, + { + "epoch": 12.471111111111112, + "grad_norm": 11.990020751953125, + "learning_rate": 6.354515050167225e-07, + "loss": 0.82, + "mean_token_accuracy": 0.7470935583114624, + "num_tokens": 1396210.0, + "step": 1408 + }, + { + "epoch": 12.48, + "grad_norm": 11.000653266906738, + "learning_rate": 6.287625418060201e-07, + "loss": 0.8718, + "mean_token_accuracy": 0.7183686792850494, + "num_tokens": 1397732.0, + "step": 1409 + }, + { + "epoch": 12.488888888888889, + "grad_norm": 11.26606559753418, + "learning_rate": 6.220735785953178e-07, + "loss": 0.8874, + "mean_token_accuracy": 0.7228727042675018, + "num_tokens": 1399318.0, + "step": 1410 + }, + { + "epoch": 12.497777777777777, + "grad_norm": 10.929518699645996, + "learning_rate": 6.153846153846155e-07, + "loss": 0.7068, + "mean_token_accuracy": 0.7871188819408417, + "num_tokens": 1400874.0, + "step": 1411 + }, + { + "epoch": 12.506666666666666, + "grad_norm": 14.554587364196777, + "learning_rate": 6.086956521739131e-07, + "loss": 0.7904, + "mean_token_accuracy": 0.7480728030204773, + "num_tokens": 1402400.0, + "step": 1412 + }, + { + "epoch": 12.515555555555556, + "grad_norm": 11.641924858093262, + "learning_rate": 6.020066889632107e-07, + "loss": 0.9172, + "mean_token_accuracy": 0.7081720232963562, + "num_tokens": 1403938.0, + "step": 1413 + }, + { + "epoch": 12.524444444444445, + "grad_norm": 10.826407432556152, + "learning_rate": 5.953177257525084e-07, + "loss": 0.7831, + "mean_token_accuracy": 0.7453969419002533, + "num_tokens": 1405486.0, + "step": 1414 + }, + { + "epoch": 12.533333333333333, + "grad_norm": 10.776083946228027, + "learning_rate": 5.886287625418061e-07, + "loss": 0.8236, + "mean_token_accuracy": 0.7545139789581299, + "num_tokens": 1407018.0, + "step": 1415 + }, + { + "epoch": 12.542222222222222, + "grad_norm": 11.47647476196289, + "learning_rate": 5.819397993311037e-07, + "loss": 0.8395, + "mean_token_accuracy": 0.733754426240921, + "num_tokens": 1408562.0, + "step": 1416 + }, + { + "epoch": 12.551111111111112, + "grad_norm": 10.963071823120117, + "learning_rate": 5.752508361204014e-07, + "loss": 0.8115, + "mean_token_accuracy": 0.7456030249595642, + "num_tokens": 1410163.0, + "step": 1417 + }, + { + "epoch": 12.56, + "grad_norm": 11.764537811279297, + "learning_rate": 5.68561872909699e-07, + "loss": 0.7523, + "mean_token_accuracy": 0.777882307767868, + "num_tokens": 1411644.0, + "step": 1418 + }, + { + "epoch": 12.568888888888889, + "grad_norm": 12.224940299987793, + "learning_rate": 5.618729096989967e-07, + "loss": 0.9225, + "mean_token_accuracy": 0.7277089655399323, + "num_tokens": 1413156.0, + "step": 1419 + }, + { + "epoch": 12.577777777777778, + "grad_norm": 11.274248123168945, + "learning_rate": 5.551839464882944e-07, + "loss": 0.7321, + "mean_token_accuracy": 0.7558932304382324, + "num_tokens": 1414677.0, + "step": 1420 + }, + { + "epoch": 12.586666666666666, + "grad_norm": 13.055071830749512, + "learning_rate": 5.48494983277592e-07, + "loss": 0.8835, + "mean_token_accuracy": 0.7215878665447235, + "num_tokens": 1416178.0, + "step": 1421 + }, + { + "epoch": 12.595555555555556, + "grad_norm": 11.788314819335938, + "learning_rate": 5.418060200668896e-07, + "loss": 0.8563, + "mean_token_accuracy": 0.7316378355026245, + "num_tokens": 1417732.0, + "step": 1422 + }, + { + "epoch": 12.604444444444445, + "grad_norm": 10.4442138671875, + "learning_rate": 5.351170568561874e-07, + "loss": 0.7793, + "mean_token_accuracy": 0.7476752400398254, + "num_tokens": 1419317.0, + "step": 1423 + }, + { + "epoch": 12.613333333333333, + "grad_norm": 11.516166687011719, + "learning_rate": 5.28428093645485e-07, + "loss": 0.8034, + "mean_token_accuracy": 0.7438822686672211, + "num_tokens": 1420868.0, + "step": 1424 + }, + { + "epoch": 12.622222222222222, + "grad_norm": 11.070847511291504, + "learning_rate": 5.217391304347826e-07, + "loss": 0.7984, + "mean_token_accuracy": 0.7478469610214233, + "num_tokens": 1422404.0, + "step": 1425 + }, + { + "epoch": 12.63111111111111, + "grad_norm": 11.268789291381836, + "learning_rate": 5.150501672240804e-07, + "loss": 0.7984, + "mean_token_accuracy": 0.7557925879955292, + "num_tokens": 1423987.0, + "step": 1426 + }, + { + "epoch": 12.64, + "grad_norm": 12.173585891723633, + "learning_rate": 5.08361204013378e-07, + "loss": 0.8994, + "mean_token_accuracy": 0.7108751833438873, + "num_tokens": 1425527.0, + "step": 1427 + }, + { + "epoch": 12.648888888888889, + "grad_norm": 11.112841606140137, + "learning_rate": 5.016722408026756e-07, + "loss": 0.753, + "mean_token_accuracy": 0.7562646269798279, + "num_tokens": 1427012.0, + "step": 1428 + }, + { + "epoch": 12.657777777777778, + "grad_norm": 11.890487670898438, + "learning_rate": 4.949832775919733e-07, + "loss": 0.7693, + "mean_token_accuracy": 0.7662787735462189, + "num_tokens": 1428575.0, + "step": 1429 + }, + { + "epoch": 12.666666666666666, + "grad_norm": 13.777972221374512, + "learning_rate": 4.88294314381271e-07, + "loss": 0.8536, + "mean_token_accuracy": 0.7295635342597961, + "num_tokens": 1430155.0, + "step": 1430 + }, + { + "epoch": 12.675555555555556, + "grad_norm": 11.279742240905762, + "learning_rate": 4.816053511705686e-07, + "loss": 0.7514, + "mean_token_accuracy": 0.7535927295684814, + "num_tokens": 1431671.0, + "step": 1431 + }, + { + "epoch": 12.684444444444445, + "grad_norm": 10.853391647338867, + "learning_rate": 4.7491638795986625e-07, + "loss": 0.6991, + "mean_token_accuracy": 0.7797865271568298, + "num_tokens": 1433218.0, + "step": 1432 + }, + { + "epoch": 12.693333333333333, + "grad_norm": 11.915665626525879, + "learning_rate": 4.682274247491639e-07, + "loss": 0.665, + "mean_token_accuracy": 0.7856169641017914, + "num_tokens": 1434792.0, + "step": 1433 + }, + { + "epoch": 12.702222222222222, + "grad_norm": 11.719280242919922, + "learning_rate": 4.615384615384616e-07, + "loss": 0.8293, + "mean_token_accuracy": 0.7231181859970093, + "num_tokens": 1436324.0, + "step": 1434 + }, + { + "epoch": 12.71111111111111, + "grad_norm": 10.97822093963623, + "learning_rate": 4.5484949832775925e-07, + "loss": 0.7525, + "mean_token_accuracy": 0.7646740674972534, + "num_tokens": 1437874.0, + "step": 1435 + }, + { + "epoch": 12.72, + "grad_norm": 11.214217185974121, + "learning_rate": 4.481605351170569e-07, + "loss": 0.7875, + "mean_token_accuracy": 0.7630895674228668, + "num_tokens": 1439440.0, + "step": 1436 + }, + { + "epoch": 12.72888888888889, + "grad_norm": 11.627196311950684, + "learning_rate": 4.4147157190635456e-07, + "loss": 0.8197, + "mean_token_accuracy": 0.7388998866081238, + "num_tokens": 1440975.0, + "step": 1437 + }, + { + "epoch": 12.737777777777778, + "grad_norm": 11.210291862487793, + "learning_rate": 4.347826086956522e-07, + "loss": 0.8298, + "mean_token_accuracy": 0.741605281829834, + "num_tokens": 1442488.0, + "step": 1438 + }, + { + "epoch": 12.746666666666666, + "grad_norm": 12.083922386169434, + "learning_rate": 4.2809364548494987e-07, + "loss": 0.7364, + "mean_token_accuracy": 0.7560800611972809, + "num_tokens": 1444049.0, + "step": 1439 + }, + { + "epoch": 12.755555555555556, + "grad_norm": 12.62346076965332, + "learning_rate": 4.214046822742475e-07, + "loss": 0.7714, + "mean_token_accuracy": 0.7384555339813232, + "num_tokens": 1445583.0, + "step": 1440 + }, + { + "epoch": 12.764444444444445, + "grad_norm": 12.882533073425293, + "learning_rate": 4.1471571906354524e-07, + "loss": 0.8842, + "mean_token_accuracy": 0.7299583256244659, + "num_tokens": 1447149.0, + "step": 1441 + }, + { + "epoch": 12.773333333333333, + "grad_norm": 11.43833065032959, + "learning_rate": 4.0802675585284287e-07, + "loss": 0.6554, + "mean_token_accuracy": 0.7745546400547028, + "num_tokens": 1448736.0, + "step": 1442 + }, + { + "epoch": 12.782222222222222, + "grad_norm": 12.209686279296875, + "learning_rate": 4.013377926421405e-07, + "loss": 0.8031, + "mean_token_accuracy": 0.741134375333786, + "num_tokens": 1450270.0, + "step": 1443 + }, + { + "epoch": 12.79111111111111, + "grad_norm": 12.68696117401123, + "learning_rate": 3.946488294314382e-07, + "loss": 0.8772, + "mean_token_accuracy": 0.7299082279205322, + "num_tokens": 1451793.0, + "step": 1444 + }, + { + "epoch": 12.8, + "grad_norm": 11.887773513793945, + "learning_rate": 3.879598662207358e-07, + "loss": 0.7527, + "mean_token_accuracy": 0.7635610699653625, + "num_tokens": 1453299.0, + "step": 1445 + }, + { + "epoch": 12.80888888888889, + "grad_norm": 12.384306907653809, + "learning_rate": 3.8127090301003344e-07, + "loss": 0.8461, + "mean_token_accuracy": 0.7613052129745483, + "num_tokens": 1454829.0, + "step": 1446 + }, + { + "epoch": 12.817777777777778, + "grad_norm": 12.942500114440918, + "learning_rate": 3.745819397993311e-07, + "loss": 0.7907, + "mean_token_accuracy": 0.7535550594329834, + "num_tokens": 1456366.0, + "step": 1447 + }, + { + "epoch": 12.826666666666666, + "grad_norm": 11.554134368896484, + "learning_rate": 3.678929765886288e-07, + "loss": 0.7304, + "mean_token_accuracy": 0.7709911465644836, + "num_tokens": 1457893.0, + "step": 1448 + }, + { + "epoch": 12.835555555555555, + "grad_norm": 10.760127067565918, + "learning_rate": 3.612040133779265e-07, + "loss": 0.7288, + "mean_token_accuracy": 0.7915098667144775, + "num_tokens": 1459432.0, + "step": 1449 + }, + { + "epoch": 12.844444444444445, + "grad_norm": 11.202887535095215, + "learning_rate": 3.545150501672241e-07, + "loss": 0.7942, + "mean_token_accuracy": 0.7399070858955383, + "num_tokens": 1460978.0, + "step": 1450 + }, + { + "epoch": 12.853333333333333, + "grad_norm": 11.224979400634766, + "learning_rate": 3.4782608695652175e-07, + "loss": 0.7687, + "mean_token_accuracy": 0.7546682953834534, + "num_tokens": 1462536.0, + "step": 1451 + }, + { + "epoch": 12.862222222222222, + "grad_norm": 11.693232536315918, + "learning_rate": 3.4113712374581943e-07, + "loss": 0.8305, + "mean_token_accuracy": 0.7397772669792175, + "num_tokens": 1464065.0, + "step": 1452 + }, + { + "epoch": 12.87111111111111, + "grad_norm": 11.743040084838867, + "learning_rate": 3.3444816053511706e-07, + "loss": 0.8179, + "mean_token_accuracy": 0.7369158267974854, + "num_tokens": 1465581.0, + "step": 1453 + }, + { + "epoch": 12.88, + "grad_norm": 10.687796592712402, + "learning_rate": 3.277591973244147e-07, + "loss": 0.7807, + "mean_token_accuracy": 0.7535022497177124, + "num_tokens": 1467111.0, + "step": 1454 + }, + { + "epoch": 12.88888888888889, + "grad_norm": 11.326873779296875, + "learning_rate": 3.210702341137124e-07, + "loss": 0.6794, + "mean_token_accuracy": 0.7946988940238953, + "num_tokens": 1468692.0, + "step": 1455 + }, + { + "epoch": 12.897777777777778, + "grad_norm": 16.22199249267578, + "learning_rate": 3.1438127090301006e-07, + "loss": 0.8996, + "mean_token_accuracy": 0.7187929749488831, + "num_tokens": 1470237.0, + "step": 1456 + }, + { + "epoch": 12.906666666666666, + "grad_norm": 10.63743782043457, + "learning_rate": 3.0769230769230774e-07, + "loss": 0.6605, + "mean_token_accuracy": 0.781734824180603, + "num_tokens": 1471781.0, + "step": 1457 + }, + { + "epoch": 12.915555555555555, + "grad_norm": 10.886502265930176, + "learning_rate": 3.010033444816054e-07, + "loss": 0.6815, + "mean_token_accuracy": 0.7747696042060852, + "num_tokens": 1473304.0, + "step": 1458 + }, + { + "epoch": 12.924444444444445, + "grad_norm": 11.612689018249512, + "learning_rate": 2.9431438127090306e-07, + "loss": 0.7364, + "mean_token_accuracy": 0.7680708169937134, + "num_tokens": 1474826.0, + "step": 1459 + }, + { + "epoch": 12.933333333333334, + "grad_norm": 13.100022315979004, + "learning_rate": 2.876254180602007e-07, + "loss": 0.8053, + "mean_token_accuracy": 0.7443003356456757, + "num_tokens": 1476335.0, + "step": 1460 + }, + { + "epoch": 12.942222222222222, + "grad_norm": 12.578561782836914, + "learning_rate": 2.8093645484949837e-07, + "loss": 0.8174, + "mean_token_accuracy": 0.7382814884185791, + "num_tokens": 1477898.0, + "step": 1461 + }, + { + "epoch": 12.95111111111111, + "grad_norm": 10.931857109069824, + "learning_rate": 2.74247491638796e-07, + "loss": 0.778, + "mean_token_accuracy": 0.7571633160114288, + "num_tokens": 1479444.0, + "step": 1462 + }, + { + "epoch": 12.96, + "grad_norm": 12.038164138793945, + "learning_rate": 2.675585284280937e-07, + "loss": 0.945, + "mean_token_accuracy": 0.7019413113594055, + "num_tokens": 1480965.0, + "step": 1463 + }, + { + "epoch": 12.96888888888889, + "grad_norm": 11.326685905456543, + "learning_rate": 2.608695652173913e-07, + "loss": 0.7849, + "mean_token_accuracy": 0.7315789461135864, + "num_tokens": 1482544.0, + "step": 1464 + }, + { + "epoch": 12.977777777777778, + "grad_norm": 12.041080474853516, + "learning_rate": 2.54180602006689e-07, + "loss": 0.8476, + "mean_token_accuracy": 0.7517750561237335, + "num_tokens": 1484028.0, + "step": 1465 + }, + { + "epoch": 12.986666666666666, + "grad_norm": 12.060532569885254, + "learning_rate": 2.474916387959866e-07, + "loss": 0.8549, + "mean_token_accuracy": 0.7479470670223236, + "num_tokens": 1485615.0, + "step": 1466 + }, + { + "epoch": 12.995555555555555, + "grad_norm": 11.087488174438477, + "learning_rate": 2.408026755852843e-07, + "loss": 0.753, + "mean_token_accuracy": 0.7583780288696289, + "num_tokens": 1487146.0, + "step": 1467 + }, + { + "epoch": 13.0, + "grad_norm": 16.713895797729492, + "learning_rate": 2.3411371237458194e-07, + "loss": 0.9231, + "mean_token_accuracy": 0.7050938606262207, + "num_tokens": 1487915.0, + "step": 1468 + }, + { + "epoch": 13.008888888888889, + "grad_norm": 12.660602569580078, + "learning_rate": 2.2742474916387962e-07, + "loss": 0.8619, + "mean_token_accuracy": 0.7404457032680511, + "num_tokens": 1489468.0, + "step": 1469 + }, + { + "epoch": 13.017777777777777, + "grad_norm": 10.490103721618652, + "learning_rate": 2.2073578595317728e-07, + "loss": 0.7791, + "mean_token_accuracy": 0.7618842422962189, + "num_tokens": 1491007.0, + "step": 1470 + }, + { + "epoch": 13.026666666666667, + "grad_norm": 11.956686973571777, + "learning_rate": 2.1404682274247494e-07, + "loss": 0.8506, + "mean_token_accuracy": 0.7486215531826019, + "num_tokens": 1492506.0, + "step": 1471 + }, + { + "epoch": 13.035555555555556, + "grad_norm": 12.304571151733398, + "learning_rate": 2.0735785953177262e-07, + "loss": 0.7599, + "mean_token_accuracy": 0.7559034824371338, + "num_tokens": 1494083.0, + "step": 1472 + }, + { + "epoch": 13.044444444444444, + "grad_norm": 12.120339393615723, + "learning_rate": 2.0066889632107025e-07, + "loss": 0.8537, + "mean_token_accuracy": 0.7309532463550568, + "num_tokens": 1495598.0, + "step": 1473 + }, + { + "epoch": 13.053333333333333, + "grad_norm": 11.369709968566895, + "learning_rate": 1.939799331103679e-07, + "loss": 0.7824, + "mean_token_accuracy": 0.7448587715625763, + "num_tokens": 1497165.0, + "step": 1474 + }, + { + "epoch": 13.062222222222223, + "grad_norm": 12.115415573120117, + "learning_rate": 1.8729096989966556e-07, + "loss": 0.7695, + "mean_token_accuracy": 0.7558363974094391, + "num_tokens": 1498683.0, + "step": 1475 + }, + { + "epoch": 13.071111111111112, + "grad_norm": 11.831680297851562, + "learning_rate": 1.8060200668896325e-07, + "loss": 0.8152, + "mean_token_accuracy": 0.7389887273311615, + "num_tokens": 1500183.0, + "step": 1476 + }, + { + "epoch": 13.08, + "grad_norm": 11.319001197814941, + "learning_rate": 1.7391304347826088e-07, + "loss": 0.8912, + "mean_token_accuracy": 0.715111643075943, + "num_tokens": 1501747.0, + "step": 1477 + }, + { + "epoch": 13.088888888888889, + "grad_norm": 11.821799278259277, + "learning_rate": 1.6722408026755853e-07, + "loss": 0.7891, + "mean_token_accuracy": 0.753851979970932, + "num_tokens": 1503351.0, + "step": 1478 + }, + { + "epoch": 13.097777777777777, + "grad_norm": 11.7422513961792, + "learning_rate": 1.605351170568562e-07, + "loss": 0.7886, + "mean_token_accuracy": 0.7362039089202881, + "num_tokens": 1504880.0, + "step": 1479 + }, + { + "epoch": 13.106666666666667, + "grad_norm": 11.365352630615234, + "learning_rate": 1.5384615384615387e-07, + "loss": 0.7284, + "mean_token_accuracy": 0.783903956413269, + "num_tokens": 1506444.0, + "step": 1480 + }, + { + "epoch": 13.115555555555556, + "grad_norm": 13.659729957580566, + "learning_rate": 1.4715719063545153e-07, + "loss": 0.8448, + "mean_token_accuracy": 0.7234861850738525, + "num_tokens": 1507944.0, + "step": 1481 + }, + { + "epoch": 13.124444444444444, + "grad_norm": 10.933953285217285, + "learning_rate": 1.4046822742474918e-07, + "loss": 0.7511, + "mean_token_accuracy": 0.762059360742569, + "num_tokens": 1509487.0, + "step": 1482 + }, + { + "epoch": 13.133333333333333, + "grad_norm": 12.030712127685547, + "learning_rate": 1.3377926421404684e-07, + "loss": 0.872, + "mean_token_accuracy": 0.7245631515979767, + "num_tokens": 1511000.0, + "step": 1483 + }, + { + "epoch": 13.142222222222221, + "grad_norm": 11.445247650146484, + "learning_rate": 1.270903010033445e-07, + "loss": 0.7538, + "mean_token_accuracy": 0.7527328133583069, + "num_tokens": 1512537.0, + "step": 1484 + }, + { + "epoch": 13.151111111111112, + "grad_norm": 11.598963737487793, + "learning_rate": 1.2040133779264215e-07, + "loss": 0.78, + "mean_token_accuracy": 0.749574601650238, + "num_tokens": 1514011.0, + "step": 1485 + }, + { + "epoch": 13.16, + "grad_norm": 11.013884544372559, + "learning_rate": 1.1371237458193981e-07, + "loss": 0.7158, + "mean_token_accuracy": 0.7696661651134491, + "num_tokens": 1515586.0, + "step": 1486 + }, + { + "epoch": 13.168888888888889, + "grad_norm": 13.129805564880371, + "learning_rate": 1.0702341137123747e-07, + "loss": 0.7892, + "mean_token_accuracy": 0.7477328777313232, + "num_tokens": 1517071.0, + "step": 1487 + }, + { + "epoch": 13.177777777777777, + "grad_norm": 10.285709381103516, + "learning_rate": 1.0033444816053512e-07, + "loss": 0.7053, + "mean_token_accuracy": 0.7833385467529297, + "num_tokens": 1518621.0, + "step": 1488 + }, + { + "epoch": 13.186666666666667, + "grad_norm": 11.17088508605957, + "learning_rate": 9.364548494983278e-08, + "loss": 0.7803, + "mean_token_accuracy": 0.741111695766449, + "num_tokens": 1520139.0, + "step": 1489 + }, + { + "epoch": 13.195555555555556, + "grad_norm": 11.94490909576416, + "learning_rate": 8.695652173913044e-08, + "loss": 0.8333, + "mean_token_accuracy": 0.7364652752876282, + "num_tokens": 1521673.0, + "step": 1490 + }, + { + "epoch": 13.204444444444444, + "grad_norm": 12.052721977233887, + "learning_rate": 8.02675585284281e-08, + "loss": 0.7493, + "mean_token_accuracy": 0.7717856764793396, + "num_tokens": 1523242.0, + "step": 1491 + }, + { + "epoch": 13.213333333333333, + "grad_norm": 12.116615295410156, + "learning_rate": 7.357859531772576e-08, + "loss": 0.8534, + "mean_token_accuracy": 0.7492619454860687, + "num_tokens": 1524809.0, + "step": 1492 + }, + { + "epoch": 13.222222222222221, + "grad_norm": 12.866549491882324, + "learning_rate": 6.688963210702342e-08, + "loss": 0.8472, + "mean_token_accuracy": 0.7299552261829376, + "num_tokens": 1526326.0, + "step": 1493 + }, + { + "epoch": 13.231111111111112, + "grad_norm": 11.005880355834961, + "learning_rate": 6.020066889632108e-08, + "loss": 0.8127, + "mean_token_accuracy": 0.7413433492183685, + "num_tokens": 1527949.0, + "step": 1494 + }, + { + "epoch": 13.24, + "grad_norm": 11.637360572814941, + "learning_rate": 5.3511705685618734e-08, + "loss": 0.7777, + "mean_token_accuracy": 0.7507373988628387, + "num_tokens": 1529463.0, + "step": 1495 + }, + { + "epoch": 13.248888888888889, + "grad_norm": 11.939549446105957, + "learning_rate": 4.682274247491639e-08, + "loss": 0.7509, + "mean_token_accuracy": 0.7558735013008118, + "num_tokens": 1531020.0, + "step": 1496 + }, + { + "epoch": 13.257777777777777, + "grad_norm": 12.295382499694824, + "learning_rate": 4.013377926421405e-08, + "loss": 0.8211, + "mean_token_accuracy": 0.7522831857204437, + "num_tokens": 1532526.0, + "step": 1497 + }, + { + "epoch": 13.266666666666667, + "grad_norm": 11.98880386352539, + "learning_rate": 3.344481605351171e-08, + "loss": 0.9039, + "mean_token_accuracy": 0.7274755835533142, + "num_tokens": 1534113.0, + "step": 1498 + }, + { + "epoch": 13.275555555555556, + "grad_norm": 11.381470680236816, + "learning_rate": 2.6755852842809367e-08, + "loss": 0.8339, + "mean_token_accuracy": 0.7509444653987885, + "num_tokens": 1535625.0, + "step": 1499 + }, + { + "epoch": 13.284444444444444, + "grad_norm": 12.14786434173584, + "learning_rate": 2.0066889632107024e-08, + "loss": 0.6878, + "mean_token_accuracy": 0.7737761735916138, + "num_tokens": 1537144.0, + "step": 1500 + } + ], + "logging_steps": 1, + "max_steps": 1500, + "num_input_tokens_seen": 0, + "num_train_epochs": 14, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.8248939938237645e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}