{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 466, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002145922746781116, "grad_norm": 76.18501490873153, "learning_rate": 4.999943188496862e-06, "loss": 1.6093, "step": 1 }, { "epoch": 0.004291845493562232, "grad_norm": 14.009670404971743, "learning_rate": 4.999772756569482e-06, "loss": 0.4008, "step": 2 }, { "epoch": 0.006437768240343348, "grad_norm": 6.1214346980998515, "learning_rate": 4.999488711963857e-06, "loss": 0.3077, "step": 3 }, { "epoch": 0.008583690987124463, "grad_norm": 14.429243581164625, "learning_rate": 4.999091067589587e-06, "loss": 0.4675, "step": 4 }, { "epoch": 0.01072961373390558, "grad_norm": 7.580155180829176, "learning_rate": 4.998579841519292e-06, "loss": 0.4538, "step": 5 }, { "epoch": 0.012875536480686695, "grad_norm": 7.832206035781183, "learning_rate": 4.99795505698779e-06, "loss": 0.2233, "step": 6 }, { "epoch": 0.015021459227467811, "grad_norm": 6.259035686547294, "learning_rate": 4.997216742391038e-06, "loss": 0.2383, "step": 7 }, { "epoch": 0.017167381974248927, "grad_norm": 6.2512799288622, "learning_rate": 4.996364931284847e-06, "loss": 0.2912, "step": 8 }, { "epoch": 0.019313304721030045, "grad_norm": 7.589384412275137, "learning_rate": 4.995399662383352e-06, "loss": 0.2463, "step": 9 }, { "epoch": 0.02145922746781116, "grad_norm": 10.710971905656887, "learning_rate": 4.994320979557256e-06, "loss": 0.3021, "step": 10 }, { "epoch": 0.023605150214592276, "grad_norm": 9.561570863346544, "learning_rate": 4.99312893183183e-06, "loss": 0.3147, "step": 11 }, { "epoch": 0.02575107296137339, "grad_norm": 38.042377389708484, "learning_rate": 4.991823573384695e-06, "loss": 0.8302, "step": 12 }, { "epoch": 0.027896995708154508, "grad_norm": 8.86173699044075, "learning_rate": 4.990404963543352e-06, "loss": 0.4076, "step": 13 }, { "epoch": 0.030042918454935622, "grad_norm": 21.537285072501607, "learning_rate": 4.988873166782485e-06, "loss": 0.7611, "step": 14 }, { "epoch": 0.032188841201716736, "grad_norm": 6.047109119735588, "learning_rate": 4.987228252721037e-06, "loss": 0.3053, "step": 15 }, { "epoch": 0.034334763948497854, "grad_norm": 14.447362163381339, "learning_rate": 4.985470296119038e-06, "loss": 0.5285, "step": 16 }, { "epoch": 0.03648068669527897, "grad_norm": 7.74290518703786, "learning_rate": 4.983599376874216e-06, "loss": 0.2236, "step": 17 }, { "epoch": 0.03862660944206009, "grad_norm": 5.727455325582489, "learning_rate": 4.981615580018358e-06, "loss": 0.204, "step": 18 }, { "epoch": 0.0407725321888412, "grad_norm": 16.691846221989785, "learning_rate": 4.979518995713448e-06, "loss": 0.4196, "step": 19 }, { "epoch": 0.04291845493562232, "grad_norm": 16.720958919193354, "learning_rate": 4.977309719247571e-06, "loss": 0.7494, "step": 20 }, { "epoch": 0.045064377682403435, "grad_norm": 6.612052621566363, "learning_rate": 4.974987851030581e-06, "loss": 0.3287, "step": 21 }, { "epoch": 0.04721030042918455, "grad_norm": 7.3652166724204315, "learning_rate": 4.972553496589537e-06, "loss": 0.3473, "step": 22 }, { "epoch": 0.04935622317596566, "grad_norm": 5.025111473292376, "learning_rate": 4.970006766563906e-06, "loss": 0.3725, "step": 23 }, { "epoch": 0.05150214592274678, "grad_norm": 11.718331155399566, "learning_rate": 4.967347776700538e-06, "loss": 0.6273, "step": 24 }, { "epoch": 0.0536480686695279, "grad_norm": 4.873321533780134, "learning_rate": 4.964576647848401e-06, "loss": 0.5175, "step": 25 }, { "epoch": 0.055793991416309016, "grad_norm": 4.204769507730203, "learning_rate": 4.9616935059530915e-06, "loss": 0.1414, "step": 26 }, { "epoch": 0.05793991416309013, "grad_norm": 7.206330363974099, "learning_rate": 4.958698482051109e-06, "loss": 0.2285, "step": 27 }, { "epoch": 0.060085836909871244, "grad_norm": 4.7858940585296095, "learning_rate": 4.955591712263901e-06, "loss": 0.2037, "step": 28 }, { "epoch": 0.06223175965665236, "grad_norm": 12.477235363576892, "learning_rate": 4.952373337791678e-06, "loss": 0.5673, "step": 29 }, { "epoch": 0.06437768240343347, "grad_norm": 3.797571893331321, "learning_rate": 4.9490435049069925e-06, "loss": 0.2233, "step": 30 }, { "epoch": 0.06652360515021459, "grad_norm": 3.3156309080116158, "learning_rate": 4.9456023649480935e-06, "loss": 0.1981, "step": 31 }, { "epoch": 0.06866952789699571, "grad_norm": 4.531878367746364, "learning_rate": 4.942050074312048e-06, "loss": 0.1946, "step": 32 }, { "epoch": 0.07081545064377683, "grad_norm": 5.78159856239121, "learning_rate": 4.9383867944476325e-06, "loss": 0.2749, "step": 33 }, { "epoch": 0.07296137339055794, "grad_norm": 3.7408734670014794, "learning_rate": 4.934612691847995e-06, "loss": 0.3626, "step": 34 }, { "epoch": 0.07510729613733906, "grad_norm": 15.815816851091464, "learning_rate": 4.930727938043091e-06, "loss": 0.4734, "step": 35 }, { "epoch": 0.07725321888412018, "grad_norm": 5.5369981240766935, "learning_rate": 4.926732709591879e-06, "loss": 0.2659, "step": 36 }, { "epoch": 0.07939914163090128, "grad_norm": 12.910997188700815, "learning_rate": 4.9226271880743086e-06, "loss": 0.4348, "step": 37 }, { "epoch": 0.0815450643776824, "grad_norm": 8.020574277619785, "learning_rate": 4.918411560083058e-06, "loss": 0.3308, "step": 38 }, { "epoch": 0.08369098712446352, "grad_norm": 6.453487445532046, "learning_rate": 4.914086017215059e-06, "loss": 0.4054, "step": 39 }, { "epoch": 0.08583690987124463, "grad_norm": 6.115766672764467, "learning_rate": 4.909650756062782e-06, "loss": 0.332, "step": 40 }, { "epoch": 0.08798283261802575, "grad_norm": 6.319229087749166, "learning_rate": 4.9051059782053125e-06, "loss": 0.2929, "step": 41 }, { "epoch": 0.09012875536480687, "grad_norm": 14.438350682047952, "learning_rate": 4.900451890199179e-06, "loss": 0.7799, "step": 42 }, { "epoch": 0.09227467811158799, "grad_norm": 6.79883162868288, "learning_rate": 4.895688703568968e-06, "loss": 0.3752, "step": 43 }, { "epoch": 0.0944206008583691, "grad_norm": 5.690387611301978, "learning_rate": 4.890816634797716e-06, "loss": 0.2059, "step": 44 }, { "epoch": 0.09656652360515021, "grad_norm": 4.431043352022045, "learning_rate": 4.885835905317061e-06, "loss": 0.2755, "step": 45 }, { "epoch": 0.09871244635193133, "grad_norm": 6.846760154412704, "learning_rate": 4.880746741497187e-06, "loss": 0.319, "step": 46 }, { "epoch": 0.10085836909871244, "grad_norm": 5.075521155502137, "learning_rate": 4.87554937463653e-06, "loss": 0.2944, "step": 47 }, { "epoch": 0.10300429184549356, "grad_norm": 5.000407636367204, "learning_rate": 4.87024404095127e-06, "loss": 0.244, "step": 48 }, { "epoch": 0.10515021459227468, "grad_norm": 5.5213729215970515, "learning_rate": 4.8648309815645915e-06, "loss": 0.3535, "step": 49 }, { "epoch": 0.1072961373390558, "grad_norm": 5.162325783820257, "learning_rate": 4.8593104424957275e-06, "loss": 0.3763, "step": 50 }, { "epoch": 0.10944206008583691, "grad_norm": 5.241667905913422, "learning_rate": 4.853682674648775e-06, "loss": 0.256, "step": 51 }, { "epoch": 0.11158798283261803, "grad_norm": 5.009026381629991, "learning_rate": 4.847947933801296e-06, "loss": 0.3455, "step": 52 }, { "epoch": 0.11373390557939914, "grad_norm": 12.563430161222314, "learning_rate": 4.842106480592687e-06, "loss": 0.6362, "step": 53 }, { "epoch": 0.11587982832618025, "grad_norm": 6.315768827715915, "learning_rate": 4.836158580512339e-06, "loss": 0.2889, "step": 54 }, { "epoch": 0.11802575107296137, "grad_norm": 5.218075678965081, "learning_rate": 4.8301045038875665e-06, "loss": 0.1951, "step": 55 }, { "epoch": 0.12017167381974249, "grad_norm": 7.3960507355002605, "learning_rate": 4.823944525871324e-06, "loss": 0.3214, "step": 56 }, { "epoch": 0.1223175965665236, "grad_norm": 7.4185340069371986, "learning_rate": 4.817678926429702e-06, "loss": 0.3852, "step": 57 }, { "epoch": 0.12446351931330472, "grad_norm": 4.9490761074888825, "learning_rate": 4.8113079903291955e-06, "loss": 0.4695, "step": 58 }, { "epoch": 0.12660944206008584, "grad_norm": 5.408957891954199, "learning_rate": 4.804832007123771e-06, "loss": 0.3385, "step": 59 }, { "epoch": 0.12875536480686695, "grad_norm": 12.766577641000062, "learning_rate": 4.7982512711416995e-06, "loss": 0.4539, "step": 60 }, { "epoch": 0.13090128755364808, "grad_norm": 6.338330185457858, "learning_rate": 4.791566081472185e-06, "loss": 0.4034, "step": 61 }, { "epoch": 0.13304721030042918, "grad_norm": 6.151730476584575, "learning_rate": 4.784776741951766e-06, "loss": 0.3683, "step": 62 }, { "epoch": 0.1351931330472103, "grad_norm": 4.345928683775522, "learning_rate": 4.777883561150508e-06, "loss": 0.2088, "step": 63 }, { "epoch": 0.13733905579399142, "grad_norm": 6.444874963490508, "learning_rate": 4.770886852357983e-06, "loss": 0.2917, "step": 64 }, { "epoch": 0.13948497854077252, "grad_norm": 5.531328541781978, "learning_rate": 4.763786933569025e-06, "loss": 0.279, "step": 65 }, { "epoch": 0.14163090128755365, "grad_norm": 4.080884948693169, "learning_rate": 4.75658412746928e-06, "loss": 0.195, "step": 66 }, { "epoch": 0.14377682403433475, "grad_norm": 10.700949555936232, "learning_rate": 4.7492787614205425e-06, "loss": 0.4479, "step": 67 }, { "epoch": 0.1459227467811159, "grad_norm": 5.481209106182085, "learning_rate": 4.7418711674458735e-06, "loss": 0.2758, "step": 68 }, { "epoch": 0.148068669527897, "grad_norm": 6.189118615859581, "learning_rate": 4.734361682214511e-06, "loss": 0.2218, "step": 69 }, { "epoch": 0.15021459227467812, "grad_norm": 6.34535278449602, "learning_rate": 4.726750647026569e-06, "loss": 0.4419, "step": 70 }, { "epoch": 0.15236051502145923, "grad_norm": 5.239918053202764, "learning_rate": 4.719038407797529e-06, "loss": 0.2557, "step": 71 }, { "epoch": 0.15450643776824036, "grad_norm": 6.26390385146301, "learning_rate": 4.711225315042513e-06, "loss": 0.5612, "step": 72 }, { "epoch": 0.15665236051502146, "grad_norm": 6.195157191377425, "learning_rate": 4.703311723860356e-06, "loss": 0.3057, "step": 73 }, { "epoch": 0.15879828326180256, "grad_norm": 16.357650722017635, "learning_rate": 4.695297993917465e-06, "loss": 0.8227, "step": 74 }, { "epoch": 0.1609442060085837, "grad_norm": 5.339299574193258, "learning_rate": 4.687184489431476e-06, "loss": 0.2878, "step": 75 }, { "epoch": 0.1630901287553648, "grad_norm": 14.30043548697042, "learning_rate": 4.678971579154698e-06, "loss": 0.8272, "step": 76 }, { "epoch": 0.16523605150214593, "grad_norm": 6.315372884620644, "learning_rate": 4.670659636357352e-06, "loss": 0.2812, "step": 77 }, { "epoch": 0.16738197424892703, "grad_norm": 4.0158091796680075, "learning_rate": 4.66224903881061e-06, "loss": 0.3212, "step": 78 }, { "epoch": 0.16952789699570817, "grad_norm": 4.741126690857027, "learning_rate": 4.653740168769424e-06, "loss": 0.2627, "step": 79 }, { "epoch": 0.17167381974248927, "grad_norm": 4.654686079147113, "learning_rate": 4.64513341295515e-06, "loss": 0.1007, "step": 80 }, { "epoch": 0.17381974248927037, "grad_norm": 2.858109471127793, "learning_rate": 4.6364291625379785e-06, "loss": 0.1819, "step": 81 }, { "epoch": 0.1759656652360515, "grad_norm": 13.540162342975938, "learning_rate": 4.627627813119147e-06, "loss": 0.5442, "step": 82 }, { "epoch": 0.1781115879828326, "grad_norm": 14.029611827699513, "learning_rate": 4.618729764712969e-06, "loss": 0.9108, "step": 83 }, { "epoch": 0.18025751072961374, "grad_norm": 4.457304810520309, "learning_rate": 4.609735421728647e-06, "loss": 0.2638, "step": 84 }, { "epoch": 0.18240343347639484, "grad_norm": 5.935629670933359, "learning_rate": 4.600645192951898e-06, "loss": 0.2403, "step": 85 }, { "epoch": 0.18454935622317598, "grad_norm": 5.136446320474782, "learning_rate": 4.591459491526371e-06, "loss": 0.246, "step": 86 }, { "epoch": 0.18669527896995708, "grad_norm": 5.684377700354834, "learning_rate": 4.582178734934869e-06, "loss": 0.2751, "step": 87 }, { "epoch": 0.1888412017167382, "grad_norm": 15.146436017458708, "learning_rate": 4.572803344980378e-06, "loss": 0.8044, "step": 88 }, { "epoch": 0.19098712446351931, "grad_norm": 4.636355059101864, "learning_rate": 4.563333747766896e-06, "loss": 0.2279, "step": 89 }, { "epoch": 0.19313304721030042, "grad_norm": 4.963353306284944, "learning_rate": 4.553770373680062e-06, "loss": 0.1785, "step": 90 }, { "epoch": 0.19527896995708155, "grad_norm": 3.887562311073866, "learning_rate": 4.544113657367604e-06, "loss": 0.1408, "step": 91 }, { "epoch": 0.19742489270386265, "grad_norm": 4.996113376607759, "learning_rate": 4.5343640377195766e-06, "loss": 0.2715, "step": 92 }, { "epoch": 0.19957081545064378, "grad_norm": 5.484264961569098, "learning_rate": 4.524521957848416e-06, "loss": 0.3287, "step": 93 }, { "epoch": 0.2017167381974249, "grad_norm": 5.08682917163341, "learning_rate": 4.514587865068806e-06, "loss": 0.366, "step": 94 }, { "epoch": 0.20386266094420602, "grad_norm": 6.8043618981585725, "learning_rate": 4.504562210877338e-06, "loss": 0.3089, "step": 95 }, { "epoch": 0.20600858369098712, "grad_norm": 14.009371009062516, "learning_rate": 4.494445450932003e-06, "loss": 0.5487, "step": 96 }, { "epoch": 0.20815450643776823, "grad_norm": 13.77182300710328, "learning_rate": 4.484238045031471e-06, "loss": 0.6681, "step": 97 }, { "epoch": 0.21030042918454936, "grad_norm": 5.373599093252736, "learning_rate": 4.473940457094199e-06, "loss": 0.2824, "step": 98 }, { "epoch": 0.21244635193133046, "grad_norm": 7.7339638883523145, "learning_rate": 4.463553155137348e-06, "loss": 0.3773, "step": 99 }, { "epoch": 0.2145922746781116, "grad_norm": 4.358486041621687, "learning_rate": 4.453076611255507e-06, "loss": 0.1968, "step": 100 }, { "epoch": 0.2145922746781116, "eval_loss": 1.61643385887146, "eval_runtime": 0.5589, "eval_samples_per_second": 3.578, "eval_steps_per_second": 1.789, "step": 100 }, { "epoch": 0.2167381974248927, "grad_norm": 10.51123783356939, "learning_rate": 4.442511301599241e-06, "loss": 0.6631, "step": 101 }, { "epoch": 0.21888412017167383, "grad_norm": 6.204285891841481, "learning_rate": 4.431857706353449e-06, "loss": 0.1981, "step": 102 }, { "epoch": 0.22103004291845493, "grad_norm": 3.509886696140534, "learning_rate": 4.4211163097155375e-06, "loss": 0.2514, "step": 103 }, { "epoch": 0.22317596566523606, "grad_norm": 5.794240091360383, "learning_rate": 4.4102875998734176e-06, "loss": 0.4743, "step": 104 }, { "epoch": 0.22532188841201717, "grad_norm": 7.1780216249839475, "learning_rate": 4.399372068983317e-06, "loss": 0.3754, "step": 105 }, { "epoch": 0.22746781115879827, "grad_norm": 3.4853873904835835, "learning_rate": 4.388370213147409e-06, "loss": 0.1217, "step": 106 }, { "epoch": 0.2296137339055794, "grad_norm": 5.282583393202969, "learning_rate": 4.377282532391267e-06, "loss": 0.2422, "step": 107 }, { "epoch": 0.2317596566523605, "grad_norm": 5.917106273735147, "learning_rate": 4.36610953064114e-06, "loss": 0.3188, "step": 108 }, { "epoch": 0.23390557939914164, "grad_norm": 5.49762193840351, "learning_rate": 4.354851715701046e-06, "loss": 0.2603, "step": 109 }, { "epoch": 0.23605150214592274, "grad_norm": 5.113114203550232, "learning_rate": 4.343509599229697e-06, "loss": 0.2025, "step": 110 }, { "epoch": 0.23819742489270387, "grad_norm": 3.9665916022215733, "learning_rate": 4.332083696717242e-06, "loss": 0.2608, "step": 111 }, { "epoch": 0.24034334763948498, "grad_norm": 5.001407701927511, "learning_rate": 4.3205745274618365e-06, "loss": 0.3543, "step": 112 }, { "epoch": 0.24248927038626608, "grad_norm": 4.783668888273014, "learning_rate": 4.308982614546045e-06, "loss": 0.1367, "step": 113 }, { "epoch": 0.2446351931330472, "grad_norm": 13.339939405425257, "learning_rate": 4.297308484813067e-06, "loss": 0.4563, "step": 114 }, { "epoch": 0.24678111587982832, "grad_norm": 6.774593211018968, "learning_rate": 4.2855526688427875e-06, "loss": 0.2344, "step": 115 }, { "epoch": 0.24892703862660945, "grad_norm": 6.032144655648605, "learning_rate": 4.273715700927666e-06, "loss": 0.3559, "step": 116 }, { "epoch": 0.2510729613733906, "grad_norm": 11.663752172897125, "learning_rate": 4.261798119048456e-06, "loss": 0.5593, "step": 117 }, { "epoch": 0.2532188841201717, "grad_norm": 11.248638669710262, "learning_rate": 4.249800464849751e-06, "loss": 0.4252, "step": 118 }, { "epoch": 0.2553648068669528, "grad_norm": 5.542239291058805, "learning_rate": 4.2377232836153635e-06, "loss": 0.2671, "step": 119 }, { "epoch": 0.2575107296137339, "grad_norm": 5.6163735157751935, "learning_rate": 4.22556712424355e-06, "loss": 0.2948, "step": 120 }, { "epoch": 0.259656652360515, "grad_norm": 7.094248362569725, "learning_rate": 4.21333253922206e-06, "loss": 0.3536, "step": 121 }, { "epoch": 0.26180257510729615, "grad_norm": 6.177984494937913, "learning_rate": 4.201020084603027e-06, "loss": 0.3318, "step": 122 }, { "epoch": 0.26394849785407726, "grad_norm": 5.442893696217635, "learning_rate": 4.1886303199776924e-06, "loss": 0.3083, "step": 123 }, { "epoch": 0.26609442060085836, "grad_norm": 5.3971104766895355, "learning_rate": 4.176163808450978e-06, "loss": 0.2955, "step": 124 }, { "epoch": 0.26824034334763946, "grad_norm": 4.524019146701118, "learning_rate": 4.163621116615892e-06, "loss": 0.3251, "step": 125 }, { "epoch": 0.2703862660944206, "grad_norm": 5.615564703579776, "learning_rate": 4.151002814527774e-06, "loss": 0.3306, "step": 126 }, { "epoch": 0.27253218884120173, "grad_norm": 5.126082022354096, "learning_rate": 4.138309475678393e-06, "loss": 0.2977, "step": 127 }, { "epoch": 0.27467811158798283, "grad_norm": 11.215215705069811, "learning_rate": 4.125541676969876e-06, "loss": 0.4561, "step": 128 }, { "epoch": 0.27682403433476394, "grad_norm": 6.185141044554984, "learning_rate": 4.112699998688492e-06, "loss": 0.3652, "step": 129 }, { "epoch": 0.27896995708154504, "grad_norm": 14.23422453207559, "learning_rate": 4.099785024478276e-06, "loss": 0.5595, "step": 130 }, { "epoch": 0.2811158798283262, "grad_norm": 4.138921969139266, "learning_rate": 4.086797341314509e-06, "loss": 0.2577, "step": 131 }, { "epoch": 0.2832618025751073, "grad_norm": 4.766155830963139, "learning_rate": 4.073737539477033e-06, "loss": 0.266, "step": 132 }, { "epoch": 0.2854077253218884, "grad_norm": 16.59914611751421, "learning_rate": 4.060606212523425e-06, "loss": 0.7351, "step": 133 }, { "epoch": 0.2875536480686695, "grad_norm": 12.70203249759127, "learning_rate": 4.047403957262024e-06, "loss": 0.7393, "step": 134 }, { "epoch": 0.28969957081545067, "grad_norm": 5.854857495039337, "learning_rate": 4.034131373724802e-06, "loss": 0.3114, "step": 135 }, { "epoch": 0.2918454935622318, "grad_norm": 7.054371064646026, "learning_rate": 4.020789065140097e-06, "loss": 0.3954, "step": 136 }, { "epoch": 0.2939914163090129, "grad_norm": 5.777740234720859, "learning_rate": 4.0073776379051945e-06, "loss": 0.3613, "step": 137 }, { "epoch": 0.296137339055794, "grad_norm": 6.807856815251271, "learning_rate": 3.993897701558764e-06, "loss": 0.2781, "step": 138 }, { "epoch": 0.2982832618025751, "grad_norm": 4.274362381043211, "learning_rate": 3.980349868753166e-06, "loss": 0.2904, "step": 139 }, { "epoch": 0.30042918454935624, "grad_norm": 4.166342301745076, "learning_rate": 3.9667347552265945e-06, "loss": 0.2234, "step": 140 }, { "epoch": 0.30257510729613735, "grad_norm": 4.702643904412771, "learning_rate": 3.953052979775103e-06, "loss": 0.2765, "step": 141 }, { "epoch": 0.30472103004291845, "grad_norm": 15.066466935467592, "learning_rate": 3.939305164224474e-06, "loss": 0.7204, "step": 142 }, { "epoch": 0.30686695278969955, "grad_norm": 4.817588986196399, "learning_rate": 3.925491933401961e-06, "loss": 0.2919, "step": 143 }, { "epoch": 0.3090128755364807, "grad_norm": 4.841054379645271, "learning_rate": 3.911613915107888e-06, "loss": 0.2624, "step": 144 }, { "epoch": 0.3111587982832618, "grad_norm": 5.54460036441131, "learning_rate": 3.89767174008712e-06, "loss": 0.3137, "step": 145 }, { "epoch": 0.3133047210300429, "grad_norm": 4.6296008910638164, "learning_rate": 3.883666042000392e-06, "loss": 0.2172, "step": 146 }, { "epoch": 0.315450643776824, "grad_norm": 5.738911770360598, "learning_rate": 3.869597457395514e-06, "loss": 0.2998, "step": 147 }, { "epoch": 0.31759656652360513, "grad_norm": 4.284288974195585, "learning_rate": 3.855466625678435e-06, "loss": 0.2037, "step": 148 }, { "epoch": 0.3197424892703863, "grad_norm": 15.839881512200966, "learning_rate": 3.841274189084189e-06, "loss": 0.7527, "step": 149 }, { "epoch": 0.3218884120171674, "grad_norm": 6.689386040699, "learning_rate": 3.8270207926477e-06, "loss": 0.3388, "step": 150 }, { "epoch": 0.3240343347639485, "grad_norm": 5.131159476750089, "learning_rate": 3.8127070841744695e-06, "loss": 0.21, "step": 151 }, { "epoch": 0.3261802575107296, "grad_norm": 12.20701029064903, "learning_rate": 3.798333714211132e-06, "loss": 0.5333, "step": 152 }, { "epoch": 0.3283261802575107, "grad_norm": 2.8374089434590966, "learning_rate": 3.7839013360158904e-06, "loss": 0.2125, "step": 153 }, { "epoch": 0.33047210300429186, "grad_norm": 13.285466340975823, "learning_rate": 3.769410605528824e-06, "loss": 0.5934, "step": 154 }, { "epoch": 0.33261802575107297, "grad_norm": 5.3446998985506875, "learning_rate": 3.7548621813420765e-06, "loss": 0.3581, "step": 155 }, { "epoch": 0.33476394849785407, "grad_norm": 7.274618884484683, "learning_rate": 3.7402567246699257e-06, "loss": 0.3321, "step": 156 }, { "epoch": 0.3369098712446352, "grad_norm": 6.497928899578661, "learning_rate": 3.72559489931873e-06, "loss": 0.4336, "step": 157 }, { "epoch": 0.33905579399141633, "grad_norm": 5.931060877134416, "learning_rate": 3.710877371656757e-06, "loss": 0.2499, "step": 158 }, { "epoch": 0.34120171673819744, "grad_norm": 11.662557566869323, "learning_rate": 3.696104810583904e-06, "loss": 0.521, "step": 159 }, { "epoch": 0.34334763948497854, "grad_norm": 10.738868357386822, "learning_rate": 3.68127788750129e-06, "loss": 0.3935, "step": 160 }, { "epoch": 0.34549356223175964, "grad_norm": 10.27557863180008, "learning_rate": 3.6663972762807453e-06, "loss": 0.3763, "step": 161 }, { "epoch": 0.34763948497854075, "grad_norm": 3.4237960233644866, "learning_rate": 3.6514636532341825e-06, "loss": 0.2778, "step": 162 }, { "epoch": 0.3497854077253219, "grad_norm": 3.6998517744689763, "learning_rate": 3.6364776970828586e-06, "loss": 0.1564, "step": 163 }, { "epoch": 0.351931330472103, "grad_norm": 4.614921333239657, "learning_rate": 3.621440088926531e-06, "loss": 0.272, "step": 164 }, { "epoch": 0.3540772532188841, "grad_norm": 6.050087330401235, "learning_rate": 3.6063515122124975e-06, "loss": 0.3652, "step": 165 }, { "epoch": 0.3562231759656652, "grad_norm": 4.387527359728692, "learning_rate": 3.5912126527045368e-06, "loss": 0.2172, "step": 166 }, { "epoch": 0.3583690987124464, "grad_norm": 15.020253643431802, "learning_rate": 3.5760241984517397e-06, "loss": 0.8165, "step": 167 }, { "epoch": 0.3605150214592275, "grad_norm": 4.553283039366036, "learning_rate": 3.560786839757242e-06, "loss": 0.3647, "step": 168 }, { "epoch": 0.3626609442060086, "grad_norm": 7.402637796397458, "learning_rate": 3.5455012691468417e-06, "loss": 0.3558, "step": 169 }, { "epoch": 0.3648068669527897, "grad_norm": 9.457334627147352, "learning_rate": 3.5301681813375343e-06, "loss": 0.3833, "step": 170 }, { "epoch": 0.3669527896995708, "grad_norm": 4.692028452001265, "learning_rate": 3.5147882732059323e-06, "loss": 0.2337, "step": 171 }, { "epoch": 0.36909871244635195, "grad_norm": 8.372452946921305, "learning_rate": 3.4993622437565955e-06, "loss": 0.2836, "step": 172 }, { "epoch": 0.37124463519313305, "grad_norm": 3.3319166006121783, "learning_rate": 3.4838907940902607e-06, "loss": 0.1544, "step": 173 }, { "epoch": 0.37339055793991416, "grad_norm": 13.281904749983129, "learning_rate": 3.4683746273719754e-06, "loss": 0.4684, "step": 174 }, { "epoch": 0.37553648068669526, "grad_norm": 4.672588539349313, "learning_rate": 3.4528144487991448e-06, "loss": 0.2533, "step": 175 }, { "epoch": 0.3776824034334764, "grad_norm": 9.892923641996738, "learning_rate": 3.437210965569475e-06, "loss": 0.2724, "step": 176 }, { "epoch": 0.3798283261802575, "grad_norm": 11.942875136773626, "learning_rate": 3.421564886848835e-06, "loss": 0.5145, "step": 177 }, { "epoch": 0.38197424892703863, "grad_norm": 5.004292816588128, "learning_rate": 3.4058769237390254e-06, "loss": 0.2421, "step": 178 }, { "epoch": 0.38412017167381973, "grad_norm": 6.348610825705306, "learning_rate": 3.3901477892454583e-06, "loss": 0.5266, "step": 179 }, { "epoch": 0.38626609442060084, "grad_norm": 11.476145919220956, "learning_rate": 3.3743781982447533e-06, "loss": 0.5593, "step": 180 }, { "epoch": 0.388412017167382, "grad_norm": 4.384950541640652, "learning_rate": 3.3585688674522438e-06, "loss": 0.1745, "step": 181 }, { "epoch": 0.3905579399141631, "grad_norm": 12.738421839026058, "learning_rate": 3.3427205153894088e-06, "loss": 0.8569, "step": 182 }, { "epoch": 0.3927038626609442, "grad_norm": 4.770469502184926, "learning_rate": 3.3268338623512094e-06, "loss": 0.3608, "step": 183 }, { "epoch": 0.3948497854077253, "grad_norm": 5.608829333428552, "learning_rate": 3.3109096303733564e-06, "loss": 0.2229, "step": 184 }, { "epoch": 0.3969957081545064, "grad_norm": 14.271945687755156, "learning_rate": 3.2949485431994945e-06, "loss": 0.7949, "step": 185 }, { "epoch": 0.39914163090128757, "grad_norm": 17.320126868800134, "learning_rate": 3.2789513262483053e-06, "loss": 0.7768, "step": 186 }, { "epoch": 0.4012875536480687, "grad_norm": 6.304445282235124, "learning_rate": 3.2629187065805445e-06, "loss": 0.3341, "step": 187 }, { "epoch": 0.4034334763948498, "grad_norm": 4.772846096021666, "learning_rate": 3.2468514128659884e-06, "loss": 0.2169, "step": 188 }, { "epoch": 0.4055793991416309, "grad_norm": 12.095944359895759, "learning_rate": 3.230750175350324e-06, "loss": 0.5972, "step": 189 }, { "epoch": 0.40772532188841204, "grad_norm": 17.046644577375776, "learning_rate": 3.2146157258219534e-06, "loss": 0.8746, "step": 190 }, { "epoch": 0.40987124463519314, "grad_norm": 5.155144213224812, "learning_rate": 3.1984487975787433e-06, "loss": 0.403, "step": 191 }, { "epoch": 0.41201716738197425, "grad_norm": 3.2493116945473273, "learning_rate": 3.1822501253946875e-06, "loss": 0.2046, "step": 192 }, { "epoch": 0.41416309012875535, "grad_norm": 4.232233097969361, "learning_rate": 3.1660204454865203e-06, "loss": 0.3811, "step": 193 }, { "epoch": 0.41630901287553645, "grad_norm": 4.72756957325817, "learning_rate": 3.1497604954802485e-06, "loss": 0.3, "step": 194 }, { "epoch": 0.4184549356223176, "grad_norm": 12.946178614407032, "learning_rate": 3.1334710143776346e-06, "loss": 0.5872, "step": 195 }, { "epoch": 0.4206008583690987, "grad_norm": 4.61301032231011, "learning_rate": 3.1171527425226027e-06, "loss": 0.2081, "step": 196 }, { "epoch": 0.4227467811158798, "grad_norm": 12.907938814433235, "learning_rate": 3.100806421567596e-06, "loss": 0.6287, "step": 197 }, { "epoch": 0.4248927038626609, "grad_norm": 5.546582248819232, "learning_rate": 3.084432794439865e-06, "loss": 0.3105, "step": 198 }, { "epoch": 0.4270386266094421, "grad_norm": 7.245512728978741, "learning_rate": 3.0680326053077047e-06, "loss": 0.3016, "step": 199 }, { "epoch": 0.4291845493562232, "grad_norm": 10.84493687480769, "learning_rate": 3.0516065995466336e-06, "loss": 0.3219, "step": 200 }, { "epoch": 0.4291845493562232, "eval_loss": 1.4745992422103882, "eval_runtime": 0.5572, "eval_samples_per_second": 3.59, "eval_steps_per_second": 1.795, "step": 200 }, { "epoch": 0.4313304721030043, "grad_norm": 4.10172843012159, "learning_rate": 3.0351555237055135e-06, "loss": 0.1799, "step": 201 }, { "epoch": 0.4334763948497854, "grad_norm": 5.306456302714684, "learning_rate": 3.0186801254726213e-06, "loss": 0.2365, "step": 202 }, { "epoch": 0.4356223175965665, "grad_norm": 5.299147474065381, "learning_rate": 3.0021811536416676e-06, "loss": 0.2406, "step": 203 }, { "epoch": 0.43776824034334766, "grad_norm": 4.14896225307621, "learning_rate": 2.985659358077765e-06, "loss": 0.1931, "step": 204 }, { "epoch": 0.43991416309012876, "grad_norm": 13.03761590039539, "learning_rate": 2.9691154896833454e-06, "loss": 0.325, "step": 205 }, { "epoch": 0.44206008583690987, "grad_norm": 10.573666701230737, "learning_rate": 2.9525503003640336e-06, "loss": 0.324, "step": 206 }, { "epoch": 0.44420600858369097, "grad_norm": 11.926884819820218, "learning_rate": 2.935964542994475e-06, "loss": 0.5481, "step": 207 }, { "epoch": 0.44635193133047213, "grad_norm": 10.151629428230573, "learning_rate": 2.9193589713841132e-06, "loss": 0.5082, "step": 208 }, { "epoch": 0.44849785407725323, "grad_norm": 12.486051719027907, "learning_rate": 2.902734340242937e-06, "loss": 0.3599, "step": 209 }, { "epoch": 0.45064377682403434, "grad_norm": 15.529513845799773, "learning_rate": 2.8860914051471722e-06, "loss": 0.7298, "step": 210 }, { "epoch": 0.45278969957081544, "grad_norm": 5.461786086907061, "learning_rate": 2.869430922504947e-06, "loss": 0.2674, "step": 211 }, { "epoch": 0.45493562231759654, "grad_norm": 5.376224720954328, "learning_rate": 2.852753649521911e-06, "loss": 0.3557, "step": 212 }, { "epoch": 0.4570815450643777, "grad_norm": 17.151457129512085, "learning_rate": 2.836060344166821e-06, "loss": 0.6405, "step": 213 }, { "epoch": 0.4592274678111588, "grad_norm": 4.757476534660833, "learning_rate": 2.8193517651370934e-06, "loss": 0.3124, "step": 214 }, { "epoch": 0.4613733905579399, "grad_norm": 5.64585393975328, "learning_rate": 2.80262867182432e-06, "loss": 0.2888, "step": 215 }, { "epoch": 0.463519313304721, "grad_norm": 13.765994790690012, "learning_rate": 2.785891824279755e-06, "loss": 0.5319, "step": 216 }, { "epoch": 0.4656652360515021, "grad_norm": 3.4645720688637702, "learning_rate": 2.7691419831797724e-06, "loss": 0.1354, "step": 217 }, { "epoch": 0.4678111587982833, "grad_norm": 4.728651796593724, "learning_rate": 2.7523799097912905e-06, "loss": 0.2457, "step": 218 }, { "epoch": 0.4699570815450644, "grad_norm": 5.579222939328203, "learning_rate": 2.73560636593718e-06, "loss": 0.3542, "step": 219 }, { "epoch": 0.4721030042918455, "grad_norm": 5.988497269511321, "learning_rate": 2.7188221139616303e-06, "loss": 0.4184, "step": 220 }, { "epoch": 0.4742489270386266, "grad_norm": 6.257959157029455, "learning_rate": 2.70202791669551e-06, "loss": 0.2553, "step": 221 }, { "epoch": 0.47639484978540775, "grad_norm": 12.558388707787643, "learning_rate": 2.68522453742169e-06, "loss": 0.541, "step": 222 }, { "epoch": 0.47854077253218885, "grad_norm": 5.81966073574648, "learning_rate": 2.66841273984036e-06, "loss": 0.3147, "step": 223 }, { "epoch": 0.48068669527896996, "grad_norm": 12.408285990600806, "learning_rate": 2.6515932880343103e-06, "loss": 0.4052, "step": 224 }, { "epoch": 0.48283261802575106, "grad_norm": 6.652286326696977, "learning_rate": 2.634766946434214e-06, "loss": 0.2372, "step": 225 }, { "epoch": 0.48497854077253216, "grad_norm": 11.13575728119692, "learning_rate": 2.6179344797838775e-06, "loss": 0.3067, "step": 226 }, { "epoch": 0.4871244635193133, "grad_norm": 11.134631212750397, "learning_rate": 2.6010966531054852e-06, "loss": 0.8026, "step": 227 }, { "epoch": 0.4892703862660944, "grad_norm": 3.8667307700150446, "learning_rate": 2.5842542316648333e-06, "loss": 0.2596, "step": 228 }, { "epoch": 0.49141630901287553, "grad_norm": 6.443730023062394, "learning_rate": 2.5674079809365443e-06, "loss": 0.288, "step": 229 }, { "epoch": 0.49356223175965663, "grad_norm": 2.9440291648892987, "learning_rate": 2.550558666569279e-06, "loss": 0.1493, "step": 230 }, { "epoch": 0.4957081545064378, "grad_norm": 4.462532793208141, "learning_rate": 2.533707054350938e-06, "loss": 0.1839, "step": 231 }, { "epoch": 0.4978540772532189, "grad_norm": 4.9063569155392015, "learning_rate": 2.5168539101738576e-06, "loss": 0.2802, "step": 232 }, { "epoch": 0.5, "grad_norm": 10.022637557306455, "learning_rate": 2.5e-06, "loss": 0.3349, "step": 233 }, { "epoch": 0.5021459227467812, "grad_norm": 4.3433358033272, "learning_rate": 2.4831460898261428e-06, "loss": 0.2626, "step": 234 }, { "epoch": 0.5042918454935622, "grad_norm": 9.069133083700263, "learning_rate": 2.4662929456490633e-06, "loss": 0.2204, "step": 235 }, { "epoch": 0.5064377682403434, "grad_norm": 4.611418200226192, "learning_rate": 2.449441333430722e-06, "loss": 0.1723, "step": 236 }, { "epoch": 0.5085836909871244, "grad_norm": 4.915343701063927, "learning_rate": 2.432592019063456e-06, "loss": 0.28, "step": 237 }, { "epoch": 0.5107296137339056, "grad_norm": 4.878529367689273, "learning_rate": 2.415745768335167e-06, "loss": 0.3336, "step": 238 }, { "epoch": 0.5128755364806867, "grad_norm": 5.589922591964495, "learning_rate": 2.398903346894515e-06, "loss": 0.2363, "step": 239 }, { "epoch": 0.5150214592274678, "grad_norm": 4.07202657981208, "learning_rate": 2.3820655202161237e-06, "loss": 0.2744, "step": 240 }, { "epoch": 0.5171673819742489, "grad_norm": 3.9798028023766703, "learning_rate": 2.365233053565787e-06, "loss": 0.2538, "step": 241 }, { "epoch": 0.51931330472103, "grad_norm": 4.2713689569349, "learning_rate": 2.3484067119656905e-06, "loss": 0.2072, "step": 242 }, { "epoch": 0.5214592274678111, "grad_norm": 4.132159038986101, "learning_rate": 2.331587260159641e-06, "loss": 0.2537, "step": 243 }, { "epoch": 0.5236051502145923, "grad_norm": 4.7509182744825695, "learning_rate": 2.31477546257831e-06, "loss": 0.2475, "step": 244 }, { "epoch": 0.5257510729613734, "grad_norm": 12.573882879618065, "learning_rate": 2.297972083304491e-06, "loss": 0.5451, "step": 245 }, { "epoch": 0.5278969957081545, "grad_norm": 4.265194868971971, "learning_rate": 2.28117788603837e-06, "loss": 0.2038, "step": 246 }, { "epoch": 0.5300429184549357, "grad_norm": 5.327237193336404, "learning_rate": 2.2643936340628205e-06, "loss": 0.2953, "step": 247 }, { "epoch": 0.5321888412017167, "grad_norm": 3.830903874010895, "learning_rate": 2.24762009020871e-06, "loss": 0.2118, "step": 248 }, { "epoch": 0.5343347639484979, "grad_norm": 3.692771359304331, "learning_rate": 2.2308580168202284e-06, "loss": 0.2862, "step": 249 }, { "epoch": 0.5364806866952789, "grad_norm": 4.46004020199315, "learning_rate": 2.214108175720246e-06, "loss": 0.1485, "step": 250 }, { "epoch": 0.5386266094420601, "grad_norm": 4.194077533120119, "learning_rate": 2.197371328175681e-06, "loss": 0.18, "step": 251 }, { "epoch": 0.5407725321888412, "grad_norm": 4.670466175503201, "learning_rate": 2.1806482348629065e-06, "loss": 0.2563, "step": 252 }, { "epoch": 0.5429184549356223, "grad_norm": 11.620363098035735, "learning_rate": 2.1639396558331794e-06, "loss": 0.4257, "step": 253 }, { "epoch": 0.5450643776824035, "grad_norm": 5.143937972622028, "learning_rate": 2.1472463504780893e-06, "loss": 0.2771, "step": 254 }, { "epoch": 0.5472103004291845, "grad_norm": 4.790083924982343, "learning_rate": 2.1305690774950543e-06, "loss": 0.3618, "step": 255 }, { "epoch": 0.5493562231759657, "grad_norm": 4.122103689477043, "learning_rate": 2.1139085948528286e-06, "loss": 0.1102, "step": 256 }, { "epoch": 0.5515021459227468, "grad_norm": 4.807538148349604, "learning_rate": 2.097265659757064e-06, "loss": 0.2713, "step": 257 }, { "epoch": 0.5536480686695279, "grad_norm": 3.2834156049564003, "learning_rate": 2.080641028615888e-06, "loss": 0.1608, "step": 258 }, { "epoch": 0.555793991416309, "grad_norm": 4.843114090474319, "learning_rate": 2.064035457005526e-06, "loss": 0.2053, "step": 259 }, { "epoch": 0.5579399141630901, "grad_norm": 4.187255961249243, "learning_rate": 2.0474496996359676e-06, "loss": 0.3847, "step": 260 }, { "epoch": 0.5600858369098712, "grad_norm": 4.835958039304007, "learning_rate": 2.0308845103166555e-06, "loss": 0.2324, "step": 261 }, { "epoch": 0.5622317596566524, "grad_norm": 4.612281021599447, "learning_rate": 2.0143406419222354e-06, "loss": 0.1589, "step": 262 }, { "epoch": 0.5643776824034334, "grad_norm": 4.679769267914521, "learning_rate": 1.997818846358333e-06, "loss": 0.2734, "step": 263 }, { "epoch": 0.5665236051502146, "grad_norm": 15.988595800455592, "learning_rate": 1.98131987452738e-06, "loss": 0.5932, "step": 264 }, { "epoch": 0.5686695278969958, "grad_norm": 6.13615019112239, "learning_rate": 1.964844476294487e-06, "loss": 0.3048, "step": 265 }, { "epoch": 0.5708154506437768, "grad_norm": 5.642706295976752, "learning_rate": 1.948393400453367e-06, "loss": 0.2217, "step": 266 }, { "epoch": 0.572961373390558, "grad_norm": 4.00696562989142, "learning_rate": 1.9319673946922953e-06, "loss": 0.142, "step": 267 }, { "epoch": 0.575107296137339, "grad_norm": 4.431532041750428, "learning_rate": 1.9155672055601364e-06, "loss": 0.2385, "step": 268 }, { "epoch": 0.5772532188841202, "grad_norm": 4.7544298296222305, "learning_rate": 1.8991935784324048e-06, "loss": 0.2229, "step": 269 }, { "epoch": 0.5793991416309013, "grad_norm": 14.806103600453497, "learning_rate": 1.882847257477398e-06, "loss": 0.6235, "step": 270 }, { "epoch": 0.5815450643776824, "grad_norm": 3.608004765245334, "learning_rate": 1.8665289856223662e-06, "loss": 0.1837, "step": 271 }, { "epoch": 0.5836909871244635, "grad_norm": 4.269748144395558, "learning_rate": 1.8502395045197522e-06, "loss": 0.1686, "step": 272 }, { "epoch": 0.5858369098712446, "grad_norm": 3.4279537261830897, "learning_rate": 1.8339795545134814e-06, "loss": 0.176, "step": 273 }, { "epoch": 0.5879828326180258, "grad_norm": 3.693149204798503, "learning_rate": 1.8177498746053129e-06, "loss": 0.2255, "step": 274 }, { "epoch": 0.5901287553648069, "grad_norm": 7.0891066905956714, "learning_rate": 1.8015512024212573e-06, "loss": 0.3132, "step": 275 }, { "epoch": 0.592274678111588, "grad_norm": 4.371558241354916, "learning_rate": 1.7853842741780474e-06, "loss": 0.2161, "step": 276 }, { "epoch": 0.5944206008583691, "grad_norm": 3.7439703795742423, "learning_rate": 1.769249824649677e-06, "loss": 0.1681, "step": 277 }, { "epoch": 0.5965665236051502, "grad_norm": 4.954905160767698, "learning_rate": 1.7531485871340122e-06, "loss": 0.2169, "step": 278 }, { "epoch": 0.5987124463519313, "grad_norm": 4.509338306319588, "learning_rate": 1.7370812934194565e-06, "loss": 0.3037, "step": 279 }, { "epoch": 0.6008583690987125, "grad_norm": 3.5630209392348315, "learning_rate": 1.7210486737516947e-06, "loss": 0.1491, "step": 280 }, { "epoch": 0.6030042918454935, "grad_norm": 4.721486742009494, "learning_rate": 1.7050514568005072e-06, "loss": 0.2978, "step": 281 }, { "epoch": 0.6051502145922747, "grad_norm": 4.438705208811552, "learning_rate": 1.6890903696266447e-06, "loss": 0.2172, "step": 282 }, { "epoch": 0.6072961373390557, "grad_norm": 10.633440687114604, "learning_rate": 1.6731661376487923e-06, "loss": 0.2509, "step": 283 }, { "epoch": 0.6094420600858369, "grad_norm": 9.519241352604435, "learning_rate": 1.6572794846105919e-06, "loss": 0.3126, "step": 284 }, { "epoch": 0.6115879828326181, "grad_norm": 6.236116887316222, "learning_rate": 1.6414311325477567e-06, "loss": 0.3433, "step": 285 }, { "epoch": 0.6137339055793991, "grad_norm": 6.652929433249715, "learning_rate": 1.6256218017552484e-06, "loss": 0.3924, "step": 286 }, { "epoch": 0.6158798283261803, "grad_norm": 4.230578383590287, "learning_rate": 1.6098522107545426e-06, "loss": 0.1969, "step": 287 }, { "epoch": 0.6180257510729614, "grad_norm": 4.584849008876823, "learning_rate": 1.594123076260975e-06, "loss": 0.1952, "step": 288 }, { "epoch": 0.6201716738197425, "grad_norm": 10.774923368712571, "learning_rate": 1.578435113151166e-06, "loss": 0.3867, "step": 289 }, { "epoch": 0.6223175965665236, "grad_norm": 3.7956827550236603, "learning_rate": 1.5627890344305256e-06, "loss": 0.1921, "step": 290 }, { "epoch": 0.6244635193133047, "grad_norm": 2.7587416399153897, "learning_rate": 1.547185551200856e-06, "loss": 0.1014, "step": 291 }, { "epoch": 0.6266094420600858, "grad_norm": 5.27854274924106, "learning_rate": 1.531625372628025e-06, "loss": 0.2008, "step": 292 }, { "epoch": 0.628755364806867, "grad_norm": 8.635037982539608, "learning_rate": 1.5161092059097399e-06, "loss": 0.5101, "step": 293 }, { "epoch": 0.630901287553648, "grad_norm": 3.3848075411641076, "learning_rate": 1.500637756243405e-06, "loss": 0.1936, "step": 294 }, { "epoch": 0.6330472103004292, "grad_norm": 10.323449388666509, "learning_rate": 1.485211726794068e-06, "loss": 0.3156, "step": 295 }, { "epoch": 0.6351931330472103, "grad_norm": 3.3898443020378823, "learning_rate": 1.469831818662467e-06, "loss": 0.4251, "step": 296 }, { "epoch": 0.6373390557939914, "grad_norm": 5.429823091284252, "learning_rate": 1.4544987308531594e-06, "loss": 0.2863, "step": 297 }, { "epoch": 0.6394849785407726, "grad_norm": 6.489533944140564, "learning_rate": 1.439213160242759e-06, "loss": 0.2321, "step": 298 }, { "epoch": 0.6416309012875536, "grad_norm": 5.491165521890711, "learning_rate": 1.4239758015482607e-06, "loss": 0.5229, "step": 299 }, { "epoch": 0.6437768240343348, "grad_norm": 10.960007456788807, "learning_rate": 1.4087873472954638e-06, "loss": 0.3474, "step": 300 }, { "epoch": 0.6437768240343348, "eval_loss": 1.4280835390090942, "eval_runtime": 0.5582, "eval_samples_per_second": 3.583, "eval_steps_per_second": 1.792, "step": 300 }, { "epoch": 0.6459227467811158, "grad_norm": 4.097125958239756, "learning_rate": 1.393648487787504e-06, "loss": 0.175, "step": 301 }, { "epoch": 0.648068669527897, "grad_norm": 9.06788832499044, "learning_rate": 1.37855991107347e-06, "loss": 0.3413, "step": 302 }, { "epoch": 0.6502145922746781, "grad_norm": 4.058418163873224, "learning_rate": 1.3635223029171418e-06, "loss": 0.1829, "step": 303 }, { "epoch": 0.6523605150214592, "grad_norm": 148.2865695887899, "learning_rate": 1.3485363467658186e-06, "loss": 6.7839, "step": 304 }, { "epoch": 0.6545064377682404, "grad_norm": 5.592326984498724, "learning_rate": 1.3336027237192551e-06, "loss": 0.2715, "step": 305 }, { "epoch": 0.6566523605150214, "grad_norm": 11.675146672194217, "learning_rate": 1.3187221124987107e-06, "loss": 0.3927, "step": 306 }, { "epoch": 0.6587982832618026, "grad_norm": 6.303487956163143, "learning_rate": 1.3038951894160962e-06, "loss": 0.3058, "step": 307 }, { "epoch": 0.6609442060085837, "grad_norm": 3.5714075946161583, "learning_rate": 1.289122628343244e-06, "loss": 0.1626, "step": 308 }, { "epoch": 0.6630901287553648, "grad_norm": 2.8795739712912747, "learning_rate": 1.2744051006812712e-06, "loss": 0.1164, "step": 309 }, { "epoch": 0.6652360515021459, "grad_norm": 3.784768504884893, "learning_rate": 1.2597432753300753e-06, "loss": 0.2305, "step": 310 }, { "epoch": 0.6673819742489271, "grad_norm": 5.42666962884753, "learning_rate": 1.245137818657924e-06, "loss": 0.2721, "step": 311 }, { "epoch": 0.6695278969957081, "grad_norm": 5.933064715860253, "learning_rate": 1.2305893944711773e-06, "loss": 0.2627, "step": 312 }, { "epoch": 0.6716738197424893, "grad_norm": 4.451323659006045, "learning_rate": 1.21609866398411e-06, "loss": 0.2969, "step": 313 }, { "epoch": 0.6738197424892703, "grad_norm": 4.851157313459204, "learning_rate": 1.201666285788869e-06, "loss": 0.2446, "step": 314 }, { "epoch": 0.6759656652360515, "grad_norm": 4.25532850589281, "learning_rate": 1.187292915825531e-06, "loss": 0.2488, "step": 315 }, { "epoch": 0.6781115879828327, "grad_norm": 4.400762031022185, "learning_rate": 1.1729792073523e-06, "loss": 0.2463, "step": 316 }, { "epoch": 0.6802575107296137, "grad_norm": 4.1661170844892785, "learning_rate": 1.1587258109158114e-06, "loss": 0.2479, "step": 317 }, { "epoch": 0.6824034334763949, "grad_norm": 4.315876128019499, "learning_rate": 1.1445333743215648e-06, "loss": 0.3109, "step": 318 }, { "epoch": 0.6845493562231759, "grad_norm": 4.507385647592385, "learning_rate": 1.1304025426044869e-06, "loss": 0.2711, "step": 319 }, { "epoch": 0.6866952789699571, "grad_norm": 4.922745847528059, "learning_rate": 1.116333957999608e-06, "loss": 0.2441, "step": 320 }, { "epoch": 0.6888412017167382, "grad_norm": 4.076334007452991, "learning_rate": 1.1023282599128797e-06, "loss": 0.2134, "step": 321 }, { "epoch": 0.6909871244635193, "grad_norm": 4.671754190487534, "learning_rate": 1.0883860848921122e-06, "loss": 0.2048, "step": 322 }, { "epoch": 0.6931330472103004, "grad_norm": 9.92777165374971, "learning_rate": 1.07450806659804e-06, "loss": 0.4585, "step": 323 }, { "epoch": 0.6952789699570815, "grad_norm": 5.249124657109493, "learning_rate": 1.060694835775527e-06, "loss": 0.2441, "step": 324 }, { "epoch": 0.6974248927038627, "grad_norm": 3.576374367164296, "learning_rate": 1.0469470202248976e-06, "loss": 0.2092, "step": 325 }, { "epoch": 0.6995708154506438, "grad_norm": 4.1766033624138315, "learning_rate": 1.0332652447734057e-06, "loss": 0.2437, "step": 326 }, { "epoch": 0.7017167381974249, "grad_norm": 4.5911217940771865, "learning_rate": 1.019650131246835e-06, "loss": 0.3378, "step": 327 }, { "epoch": 0.703862660944206, "grad_norm": 5.049668031588922, "learning_rate": 1.006102298441236e-06, "loss": 0.276, "step": 328 }, { "epoch": 0.7060085836909872, "grad_norm": 3.2474445516283366, "learning_rate": 9.926223620948061e-07, "loss": 0.1107, "step": 329 }, { "epoch": 0.7081545064377682, "grad_norm": 12.87037417509136, "learning_rate": 9.792109348599036e-07, "loss": 0.3857, "step": 330 }, { "epoch": 0.7103004291845494, "grad_norm": 5.089010214872758, "learning_rate": 9.65868626275198e-07, "loss": 0.3078, "step": 331 }, { "epoch": 0.7124463519313304, "grad_norm": 12.594828602227423, "learning_rate": 9.525960427379772e-07, "loss": 0.3576, "step": 332 }, { "epoch": 0.7145922746781116, "grad_norm": 13.530682640951623, "learning_rate": 9.393937874765754e-07, "loss": 0.5429, "step": 333 }, { "epoch": 0.7167381974248928, "grad_norm": 3.963378157061749, "learning_rate": 9.262624605229673e-07, "loss": 0.2409, "step": 334 }, { "epoch": 0.7188841201716738, "grad_norm": 3.7920088399022287, "learning_rate": 9.132026586854909e-07, "loss": 0.249, "step": 335 }, { "epoch": 0.721030042918455, "grad_norm": 13.220884914625854, "learning_rate": 9.002149755217246e-07, "loss": 0.654, "step": 336 }, { "epoch": 0.723175965665236, "grad_norm": 3.585564797999746, "learning_rate": 8.873000013115099e-07, "loss": 0.1466, "step": 337 }, { "epoch": 0.7253218884120172, "grad_norm": 4.837559263261418, "learning_rate": 8.744583230301248e-07, "loss": 0.2767, "step": 338 }, { "epoch": 0.7274678111587983, "grad_norm": 5.252715623197888, "learning_rate": 8.61690524321607e-07, "loss": 0.4616, "step": 339 }, { "epoch": 0.7296137339055794, "grad_norm": 4.394456008160045, "learning_rate": 8.48997185472226e-07, "loss": 0.3365, "step": 340 }, { "epoch": 0.7317596566523605, "grad_norm": 4.731413577745255, "learning_rate": 8.363788833841083e-07, "loss": 0.2681, "step": 341 }, { "epoch": 0.7339055793991416, "grad_norm": 4.23884351278645, "learning_rate": 8.238361915490226e-07, "loss": 0.2179, "step": 342 }, { "epoch": 0.7360515021459227, "grad_norm": 4.991832411605434, "learning_rate": 8.113696800223084e-07, "loss": 0.3074, "step": 343 }, { "epoch": 0.7381974248927039, "grad_norm": 11.78247542725481, "learning_rate": 7.989799153969735e-07, "loss": 0.7113, "step": 344 }, { "epoch": 0.740343347639485, "grad_norm": 2.677869163204128, "learning_rate": 7.866674607779401e-07, "loss": 0.1147, "step": 345 }, { "epoch": 0.7424892703862661, "grad_norm": 15.72663219186909, "learning_rate": 7.744328757564501e-07, "loss": 0.8535, "step": 346 }, { "epoch": 0.7446351931330472, "grad_norm": 5.679767488995843, "learning_rate": 7.622767163846376e-07, "loss": 0.2275, "step": 347 }, { "epoch": 0.7467811158798283, "grad_norm": 4.066844515227611, "learning_rate": 7.501995351502497e-07, "loss": 0.3943, "step": 348 }, { "epoch": 0.7489270386266095, "grad_norm": 5.645157777694975, "learning_rate": 7.38201880951544e-07, "loss": 0.4427, "step": 349 }, { "epoch": 0.7510729613733905, "grad_norm": 4.956779129225836, "learning_rate": 7.26284299072334e-07, "loss": 0.202, "step": 350 }, { "epoch": 0.7532188841201717, "grad_norm": 2.5935658433153566, "learning_rate": 7.144473311572136e-07, "loss": 0.1596, "step": 351 }, { "epoch": 0.7553648068669528, "grad_norm": 4.01470514282122, "learning_rate": 7.026915151869335e-07, "loss": 0.1781, "step": 352 }, { "epoch": 0.7575107296137339, "grad_norm": 4.6123159242382075, "learning_rate": 6.910173854539551e-07, "loss": 0.25, "step": 353 }, { "epoch": 0.759656652360515, "grad_norm": 3.0887466278904703, "learning_rate": 6.794254725381641e-07, "loss": 0.2184, "step": 354 }, { "epoch": 0.7618025751072961, "grad_norm": 4.1284793392534995, "learning_rate": 6.679163032827593e-07, "loss": 0.2396, "step": 355 }, { "epoch": 0.7639484978540773, "grad_norm": 4.382346488786086, "learning_rate": 6.564904007703032e-07, "loss": 0.2136, "step": 356 }, { "epoch": 0.7660944206008584, "grad_norm": 10.798704798244964, "learning_rate": 6.45148284298954e-07, "loss": 0.6572, "step": 357 }, { "epoch": 0.7682403433476395, "grad_norm": 4.878129288407048, "learning_rate": 6.33890469358861e-07, "loss": 0.3525, "step": 358 }, { "epoch": 0.7703862660944206, "grad_norm": 5.030026237930303, "learning_rate": 6.227174676087333e-07, "loss": 0.2312, "step": 359 }, { "epoch": 0.7725321888412017, "grad_norm": 17.098271284659155, "learning_rate": 6.11629786852592e-07, "loss": 0.6886, "step": 360 }, { "epoch": 0.7746781115879828, "grad_norm": 4.264109038299607, "learning_rate": 6.006279310166835e-07, "loss": 0.2899, "step": 361 }, { "epoch": 0.776824034334764, "grad_norm": 4.9567417333092605, "learning_rate": 5.897124001265822e-07, "loss": 0.2594, "step": 362 }, { "epoch": 0.778969957081545, "grad_norm": 4.793597034829966, "learning_rate": 5.788836902844633e-07, "loss": 0.2406, "step": 363 }, { "epoch": 0.7811158798283262, "grad_norm": 5.338193032183525, "learning_rate": 5.681422936465522e-07, "loss": 0.3718, "step": 364 }, { "epoch": 0.7832618025751072, "grad_norm": 4.677431437894929, "learning_rate": 5.574886984007602e-07, "loss": 0.1582, "step": 365 }, { "epoch": 0.7854077253218884, "grad_norm": 5.490552521240752, "learning_rate": 5.469233887444941e-07, "loss": 0.3061, "step": 366 }, { "epoch": 0.7875536480686696, "grad_norm": 3.324520040683454, "learning_rate": 5.36446844862653e-07, "loss": 0.416, "step": 367 }, { "epoch": 0.7896995708154506, "grad_norm": 4.170197573866644, "learning_rate": 5.260595429058021e-07, "loss": 0.182, "step": 368 }, { "epoch": 0.7918454935622318, "grad_norm": 15.271343898793598, "learning_rate": 5.1576195496853e-07, "loss": 0.971, "step": 369 }, { "epoch": 0.7939914163090128, "grad_norm": 5.773601400585028, "learning_rate": 5.055545490679981e-07, "loss": 0.373, "step": 370 }, { "epoch": 0.796137339055794, "grad_norm": 5.969373195356161, "learning_rate": 4.954377891226623e-07, "loss": 0.1698, "step": 371 }, { "epoch": 0.7982832618025751, "grad_norm": 4.932322793806067, "learning_rate": 4.854121349311949e-07, "loss": 0.3279, "step": 372 }, { "epoch": 0.8004291845493562, "grad_norm": 4.577118395812275, "learning_rate": 4.7547804215158476e-07, "loss": 0.2336, "step": 373 }, { "epoch": 0.8025751072961373, "grad_norm": 4.347265571148098, "learning_rate": 4.6563596228042433e-07, "loss": 0.2708, "step": 374 }, { "epoch": 0.8047210300429185, "grad_norm": 5.494640250591727, "learning_rate": 4.558863426323962e-07, "loss": 0.1907, "step": 375 }, { "epoch": 0.8068669527896996, "grad_norm": 3.3572786766867844, "learning_rate": 4.462296263199381e-07, "loss": 0.2496, "step": 376 }, { "epoch": 0.8090128755364807, "grad_norm": 5.900778507191783, "learning_rate": 4.366662522331053e-07, "loss": 0.348, "step": 377 }, { "epoch": 0.8111587982832618, "grad_norm": 9.530668791644544, "learning_rate": 4.27196655019623e-07, "loss": 0.31, "step": 378 }, { "epoch": 0.8133047210300429, "grad_norm": 11.51101766547589, "learning_rate": 4.1782126506513196e-07, "loss": 0.4048, "step": 379 }, { "epoch": 0.8154506437768241, "grad_norm": 4.265212244249617, "learning_rate": 4.0854050847362966e-07, "loss": 0.3379, "step": 380 }, { "epoch": 0.8175965665236051, "grad_norm": 4.266290446089351, "learning_rate": 3.9935480704810237e-07, "loss": 0.2869, "step": 381 }, { "epoch": 0.8197424892703863, "grad_norm": 10.069859696254095, "learning_rate": 3.9026457827135324e-07, "loss": 0.3259, "step": 382 }, { "epoch": 0.8218884120171673, "grad_norm": 4.789626593387076, "learning_rate": 3.812702352870321e-07, "loss": 0.2795, "step": 383 }, { "epoch": 0.8240343347639485, "grad_norm": 7.071090188512615, "learning_rate": 3.723721868808533e-07, "loss": 0.3904, "step": 384 }, { "epoch": 0.8261802575107297, "grad_norm": 6.396013480233416, "learning_rate": 3.6357083746202173e-07, "loss": 0.3435, "step": 385 }, { "epoch": 0.8283261802575107, "grad_norm": 6.912201776722396, "learning_rate": 3.5486658704484977e-07, "loss": 0.1841, "step": 386 }, { "epoch": 0.8304721030042919, "grad_norm": 3.7884598131328793, "learning_rate": 3.4625983123057624e-07, "loss": 0.1977, "step": 387 }, { "epoch": 0.8326180257510729, "grad_norm": 4.539195321014333, "learning_rate": 3.3775096118939033e-07, "loss": 0.2573, "step": 388 }, { "epoch": 0.8347639484978541, "grad_norm": 13.401579924542975, "learning_rate": 3.2934036364264845e-07, "loss": 0.4396, "step": 389 }, { "epoch": 0.8369098712446352, "grad_norm": 3.4147384980259816, "learning_rate": 3.2102842084530293e-07, "loss": 0.1677, "step": 390 }, { "epoch": 0.8390557939914163, "grad_norm": 12.280073964483028, "learning_rate": 3.128155105685243e-07, "loss": 0.6472, "step": 391 }, { "epoch": 0.8412017167381974, "grad_norm": 14.537818479540036, "learning_rate": 3.0470200608253594e-07, "loss": 0.6278, "step": 392 }, { "epoch": 0.8433476394849786, "grad_norm": 4.602724381634226, "learning_rate": 2.96688276139645e-07, "loss": 0.2403, "step": 393 }, { "epoch": 0.8454935622317596, "grad_norm": 5.352137114560932, "learning_rate": 2.887746849574877e-07, "loss": 0.3369, "step": 394 }, { "epoch": 0.8476394849785408, "grad_norm": 6.023327004803226, "learning_rate": 2.809615922024711e-07, "loss": 0.2251, "step": 395 }, { "epoch": 0.8497854077253219, "grad_norm": 5.227321763313939, "learning_rate": 2.7324935297343146e-07, "loss": 0.3855, "step": 396 }, { "epoch": 0.851931330472103, "grad_norm": 3.301210094777184, "learning_rate": 2.6563831778549015e-07, "loss": 0.1753, "step": 397 }, { "epoch": 0.8540772532188842, "grad_norm": 3.8623722681440125, "learning_rate": 2.5812883255412704e-07, "loss": 0.1972, "step": 398 }, { "epoch": 0.8562231759656652, "grad_norm": 11.977134679582875, "learning_rate": 2.5072123857945773e-07, "loss": 0.4452, "step": 399 }, { "epoch": 0.8583690987124464, "grad_norm": 11.01382449808334, "learning_rate": 2.4341587253072035e-07, "loss": 0.365, "step": 400 }, { "epoch": 0.8583690987124464, "eval_loss": 1.3694226741790771, "eval_runtime": 0.5595, "eval_samples_per_second": 3.575, "eval_steps_per_second": 1.787, "step": 400 }, { "epoch": 0.8605150214592274, "grad_norm": 5.5777813351540315, "learning_rate": 2.3621306643097613e-07, "loss": 0.2782, "step": 401 }, { "epoch": 0.8626609442060086, "grad_norm": 4.6116367037678785, "learning_rate": 2.2911314764201775e-07, "loss": 0.3513, "step": 402 }, { "epoch": 0.8648068669527897, "grad_norm": 4.483486905044895, "learning_rate": 2.221164388494923e-07, "loss": 0.4313, "step": 403 }, { "epoch": 0.8669527896995708, "grad_norm": 4.272277428586251, "learning_rate": 2.1522325804823496e-07, "loss": 0.2204, "step": 404 }, { "epoch": 0.869098712446352, "grad_norm": 3.896674815860478, "learning_rate": 2.0843391852781558e-07, "loss": 0.2286, "step": 405 }, { "epoch": 0.871244635193133, "grad_norm": 11.404612671959441, "learning_rate": 2.0174872885830117e-07, "loss": 0.5745, "step": 406 }, { "epoch": 0.8733905579399142, "grad_norm": 5.761128589810944, "learning_rate": 1.9516799287622984e-07, "loss": 0.2869, "step": 407 }, { "epoch": 0.8755364806866953, "grad_norm": 3.268320125330955, "learning_rate": 1.8869200967080503e-07, "loss": 0.1103, "step": 408 }, { "epoch": 0.8776824034334764, "grad_norm": 4.38842912004494, "learning_rate": 1.8232107357029877e-07, "loss": 0.2272, "step": 409 }, { "epoch": 0.8798283261802575, "grad_norm": 4.643796217859636, "learning_rate": 1.7605547412867574e-07, "loss": 0.184, "step": 410 }, { "epoch": 0.8819742489270386, "grad_norm": 4.90732488089375, "learning_rate": 1.6989549611243412e-07, "loss": 0.3627, "step": 411 }, { "epoch": 0.8841201716738197, "grad_norm": 12.17579771573891, "learning_rate": 1.638414194876617e-07, "loss": 0.5054, "step": 412 }, { "epoch": 0.8862660944206009, "grad_norm": 7.752186615318354, "learning_rate": 1.5789351940731334e-07, "loss": 0.363, "step": 413 }, { "epoch": 0.8884120171673819, "grad_norm": 4.5450019220648, "learning_rate": 1.520520661987049e-07, "loss": 0.3259, "step": 414 }, { "epoch": 0.8905579399141631, "grad_norm": 5.488305915443275, "learning_rate": 1.463173253512251e-07, "loss": 0.3395, "step": 415 }, { "epoch": 0.8927038626609443, "grad_norm": 7.094268562494252, "learning_rate": 1.406895575042727e-07, "loss": 0.295, "step": 416 }, { "epoch": 0.8948497854077253, "grad_norm": 4.7154597836682965, "learning_rate": 1.3516901843540876e-07, "loss": 0.2653, "step": 417 }, { "epoch": 0.8969957081545065, "grad_norm": 11.321571989329978, "learning_rate": 1.2975595904873073e-07, "loss": 0.4694, "step": 418 }, { "epoch": 0.8991416309012875, "grad_norm": 4.452790240381536, "learning_rate": 1.2445062536347057e-07, "loss": 0.3792, "step": 419 }, { "epoch": 0.9012875536480687, "grad_norm": 4.044178243939312, "learning_rate": 1.1925325850281416e-07, "loss": 0.2857, "step": 420 }, { "epoch": 0.9034334763948498, "grad_norm": 5.509954869782006, "learning_rate": 1.1416409468293977e-07, "loss": 0.3078, "step": 421 }, { "epoch": 0.9055793991416309, "grad_norm": 4.69558598631356, "learning_rate": 1.0918336520228474e-07, "loss": 0.335, "step": 422 }, { "epoch": 0.907725321888412, "grad_norm": 4.4538465310945385, "learning_rate": 1.0431129643103193e-07, "loss": 0.2351, "step": 423 }, { "epoch": 0.9098712446351931, "grad_norm": 5.374815587193687, "learning_rate": 9.954810980082191e-08, "loss": 0.2546, "step": 424 }, { "epoch": 0.9120171673819742, "grad_norm": 11.52228630231847, "learning_rate": 9.489402179468754e-08, "loss": 0.7185, "step": 425 }, { "epoch": 0.9141630901287554, "grad_norm": 3.884298143154326, "learning_rate": 9.034924393721778e-08, "loss": 0.2261, "step": 426 }, { "epoch": 0.9163090128755365, "grad_norm": 12.582223724723782, "learning_rate": 8.5913982784942e-08, "loss": 0.5048, "step": 427 }, { "epoch": 0.9184549356223176, "grad_norm": 4.909917760044249, "learning_rate": 8.15884399169417e-08, "loss": 0.1981, "step": 428 }, { "epoch": 0.9206008583690987, "grad_norm": 4.336475429929087, "learning_rate": 7.737281192569169e-08, "loss": 0.2551, "step": 429 }, { "epoch": 0.9227467811158798, "grad_norm": 3.489835127648909, "learning_rate": 7.326729040812136e-08, "loss": 0.1921, "step": 430 }, { "epoch": 0.924892703862661, "grad_norm": 4.877570803828318, "learning_rate": 6.927206195691039e-08, "loss": 0.2177, "step": 431 }, { "epoch": 0.927038626609442, "grad_norm": 4.717548019415604, "learning_rate": 6.538730815200483e-08, "loss": 0.147, "step": 432 }, { "epoch": 0.9291845493562232, "grad_norm": 14.494510709152369, "learning_rate": 6.1613205552368e-08, "loss": 0.6247, "step": 433 }, { "epoch": 0.9313304721030042, "grad_norm": 3.5697709692346677, "learning_rate": 5.79499256879526e-08, "loss": 0.1588, "step": 434 }, { "epoch": 0.9334763948497854, "grad_norm": 5.135679478772209, "learning_rate": 5.4397635051907093e-08, "loss": 0.4196, "step": 435 }, { "epoch": 0.9356223175965666, "grad_norm": 5.329997658405359, "learning_rate": 5.095649509300804e-08, "loss": 0.2701, "step": 436 }, { "epoch": 0.9377682403433476, "grad_norm": 3.760523480520758, "learning_rate": 4.7626662208322405e-08, "loss": 0.0939, "step": 437 }, { "epoch": 0.9399141630901288, "grad_norm": 5.291669505677406, "learning_rate": 4.4408287736099344e-08, "loss": 0.3128, "step": 438 }, { "epoch": 0.9420600858369099, "grad_norm": 11.680854749305373, "learning_rate": 4.130151794889181e-08, "loss": 0.3982, "step": 439 }, { "epoch": 0.944206008583691, "grad_norm": 12.506267427980399, "learning_rate": 3.830649404690939e-08, "loss": 0.6572, "step": 440 }, { "epoch": 0.9463519313304721, "grad_norm": 4.533062512569462, "learning_rate": 3.5423352151599534e-08, "loss": 0.2257, "step": 441 }, { "epoch": 0.9484978540772532, "grad_norm": 3.8785495849634173, "learning_rate": 3.2652223299462214e-08, "loss": 0.217, "step": 442 }, { "epoch": 0.9506437768240343, "grad_norm": 6.192440279403632, "learning_rate": 2.9993233436093895e-08, "loss": 0.362, "step": 443 }, { "epoch": 0.9527896995708155, "grad_norm": 4.267433754596404, "learning_rate": 2.7446503410463178e-08, "loss": 0.2044, "step": 444 }, { "epoch": 0.9549356223175965, "grad_norm": 5.592239506957767, "learning_rate": 2.5012148969419113e-08, "loss": 0.3607, "step": 445 }, { "epoch": 0.9570815450643777, "grad_norm": 12.403767416603346, "learning_rate": 2.2690280752429293e-08, "loss": 0.5623, "step": 446 }, { "epoch": 0.9592274678111588, "grad_norm": 4.281323531138287, "learning_rate": 2.0481004286552753e-08, "loss": 0.2902, "step": 447 }, { "epoch": 0.9613733905579399, "grad_norm": 2.903934981764517, "learning_rate": 1.8384419981642698e-08, "loss": 0.1355, "step": 448 }, { "epoch": 0.9635193133047211, "grad_norm": 8.89203528557412, "learning_rate": 1.6400623125784053e-08, "loss": 0.3686, "step": 449 }, { "epoch": 0.9656652360515021, "grad_norm": 4.562592593974174, "learning_rate": 1.452970388096192e-08, "loss": 0.1926, "step": 450 }, { "epoch": 0.9678111587982833, "grad_norm": 4.534339307707502, "learning_rate": 1.2771747278963464e-08, "loss": 0.3514, "step": 451 }, { "epoch": 0.9699570815450643, "grad_norm": 4.556632192151679, "learning_rate": 1.1126833217514898e-08, "loss": 0.2418, "step": 452 }, { "epoch": 0.9721030042918455, "grad_norm": 9.909433789309341, "learning_rate": 9.595036456648277e-09, "loss": 0.3205, "step": 453 }, { "epoch": 0.9742489270386266, "grad_norm": 3.9689055222382374, "learning_rate": 8.176426615304767e-09, "loss": 0.1813, "step": 454 }, { "epoch": 0.9763948497854077, "grad_norm": 5.3691676448201155, "learning_rate": 6.871068168170237e-09, "loss": 0.2227, "step": 455 }, { "epoch": 0.9785407725321889, "grad_norm": 3.1063248198160824, "learning_rate": 5.679020442745098e-09, "loss": 0.1993, "step": 456 }, { "epoch": 0.98068669527897, "grad_norm": 3.8439436507436415, "learning_rate": 4.600337616648131e-09, "loss": 0.2025, "step": 457 }, { "epoch": 0.9828326180257511, "grad_norm": 3.4608789139316882, "learning_rate": 3.6350687151531782e-09, "loss": 0.1106, "step": 458 }, { "epoch": 0.9849785407725322, "grad_norm": 4.432907889303436, "learning_rate": 2.7832576089623086e-09, "loss": 0.181, "step": 459 }, { "epoch": 0.9871244635193133, "grad_norm": 5.815988851277571, "learning_rate": 2.044943012210754e-09, "loss": 0.3236, "step": 460 }, { "epoch": 0.9892703862660944, "grad_norm": 4.622989507343908, "learning_rate": 1.4201584807083113e-09, "loss": 0.2848, "step": 461 }, { "epoch": 0.9914163090128756, "grad_norm": 4.812640379426997, "learning_rate": 9.08932410413621e-10, "loss": 0.2791, "step": 462 }, { "epoch": 0.9935622317596566, "grad_norm": 11.627724270489814, "learning_rate": 5.112880361438088e-10, "loss": 0.7668, "step": 463 }, { "epoch": 0.9957081545064378, "grad_norm": 3.8724681206038953, "learning_rate": 2.2724343051866438e-10, "loss": 0.1976, "step": 464 }, { "epoch": 0.9978540772532188, "grad_norm": 6.521739917424491, "learning_rate": 5.681150313907591e-11, "loss": 0.2869, "step": 465 }, { "epoch": 1.0, "grad_norm": 3.883799871571989, "learning_rate": 0.0, "loss": 0.1588, "step": 466 }, { "epoch": 1.0, "step": 466, "total_flos": 4281709953024.0, "train_loss": 0.34951822290220996, "train_runtime": 959.7413, "train_samples_per_second": 1.942, "train_steps_per_second": 0.486 } ], "logging_steps": 1, "max_steps": 466, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4281709953024.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }