{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9987570287067182, "eval_steps": 500, "global_step": 6333, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00047351287363125186, "grad_norm": 32.22987365722656, "learning_rate": 1.0526315789473685e-07, "loss": 4.8137, "step": 1 }, { "epoch": 0.0009470257472625037, "grad_norm": 31.049623489379883, "learning_rate": 2.105263157894737e-07, "loss": 4.6949, "step": 2 }, { "epoch": 0.0014205386208937555, "grad_norm": 30.38652801513672, "learning_rate": 3.1578947368421055e-07, "loss": 4.7183, "step": 3 }, { "epoch": 0.0018940514945250074, "grad_norm": 30.45220184326172, "learning_rate": 4.210526315789474e-07, "loss": 4.6404, "step": 4 }, { "epoch": 0.0023675643681562593, "grad_norm": 29.60318946838379, "learning_rate": 5.263157894736843e-07, "loss": 4.6377, "step": 5 }, { "epoch": 0.002841077241787511, "grad_norm": 30.485383987426758, "learning_rate": 6.315789473684211e-07, "loss": 4.7493, "step": 6 }, { "epoch": 0.003314590115418763, "grad_norm": 29.985885620117188, "learning_rate": 7.368421052631579e-07, "loss": 4.6343, "step": 7 }, { "epoch": 0.003788102989050015, "grad_norm": 29.9451904296875, "learning_rate": 8.421052631578948e-07, "loss": 4.6325, "step": 8 }, { "epoch": 0.0042616158626812665, "grad_norm": 28.334569931030273, "learning_rate": 9.473684210526317e-07, "loss": 4.4123, "step": 9 }, { "epoch": 0.004735128736312519, "grad_norm": 30.083621978759766, "learning_rate": 1.0526315789473685e-06, "loss": 4.4909, "step": 10 }, { "epoch": 0.00520864160994377, "grad_norm": 29.243478775024414, "learning_rate": 1.1578947368421053e-06, "loss": 4.4436, "step": 11 }, { "epoch": 0.005682154483575022, "grad_norm": 27.903270721435547, "learning_rate": 1.2631578947368422e-06, "loss": 4.4001, "step": 12 }, { "epoch": 0.006155667357206274, "grad_norm": 28.32964515686035, "learning_rate": 1.3684210526315791e-06, "loss": 4.2836, "step": 13 }, { "epoch": 0.006629180230837526, "grad_norm": 28.63888931274414, "learning_rate": 1.4736842105263159e-06, "loss": 4.2648, "step": 14 }, { "epoch": 0.007102693104468778, "grad_norm": 27.617847442626953, "learning_rate": 1.5789473684210526e-06, "loss": 4.1683, "step": 15 }, { "epoch": 0.00757620597810003, "grad_norm": 27.17360496520996, "learning_rate": 1.6842105263157895e-06, "loss": 4.1395, "step": 16 }, { "epoch": 0.008049718851731282, "grad_norm": 27.059186935424805, "learning_rate": 1.7894736842105265e-06, "loss": 3.9835, "step": 17 }, { "epoch": 0.008523231725362533, "grad_norm": 27.937824249267578, "learning_rate": 1.8947368421052634e-06, "loss": 3.9632, "step": 18 }, { "epoch": 0.008996744598993784, "grad_norm": 27.05862045288086, "learning_rate": 2.0000000000000003e-06, "loss": 3.8811, "step": 19 }, { "epoch": 0.009470257472625037, "grad_norm": 26.196979522705078, "learning_rate": 2.105263157894737e-06, "loss": 3.8312, "step": 20 }, { "epoch": 0.009943770346256289, "grad_norm": 26.135305404663086, "learning_rate": 2.2105263157894738e-06, "loss": 3.7232, "step": 21 }, { "epoch": 0.01041728321988754, "grad_norm": 25.997011184692383, "learning_rate": 2.3157894736842105e-06, "loss": 3.6288, "step": 22 }, { "epoch": 0.010890796093518793, "grad_norm": 26.397438049316406, "learning_rate": 2.4210526315789477e-06, "loss": 3.5437, "step": 23 }, { "epoch": 0.011364308967150044, "grad_norm": 24.695541381835938, "learning_rate": 2.5263157894736844e-06, "loss": 3.4521, "step": 24 }, { "epoch": 0.011837821840781295, "grad_norm": 24.346065521240234, "learning_rate": 2.631578947368421e-06, "loss": 3.269, "step": 25 }, { "epoch": 0.012311334714412548, "grad_norm": 24.483036041259766, "learning_rate": 2.7368421052631583e-06, "loss": 3.2028, "step": 26 }, { "epoch": 0.0127848475880438, "grad_norm": 24.411731719970703, "learning_rate": 2.842105263157895e-06, "loss": 3.1983, "step": 27 }, { "epoch": 0.013258360461675053, "grad_norm": 24.398752212524414, "learning_rate": 2.9473684210526317e-06, "loss": 3.0967, "step": 28 }, { "epoch": 0.013731873335306304, "grad_norm": 24.130815505981445, "learning_rate": 3.052631578947369e-06, "loss": 2.9756, "step": 29 }, { "epoch": 0.014205386208937555, "grad_norm": 22.618980407714844, "learning_rate": 3.157894736842105e-06, "loss": 2.7865, "step": 30 }, { "epoch": 0.014678899082568808, "grad_norm": 22.387496948242188, "learning_rate": 3.2631578947368423e-06, "loss": 2.7595, "step": 31 }, { "epoch": 0.01515241195620006, "grad_norm": 21.85611915588379, "learning_rate": 3.368421052631579e-06, "loss": 2.6221, "step": 32 }, { "epoch": 0.01562592482983131, "grad_norm": 21.454317092895508, "learning_rate": 3.473684210526316e-06, "loss": 2.4785, "step": 33 }, { "epoch": 0.016099437703462564, "grad_norm": 20.223222732543945, "learning_rate": 3.578947368421053e-06, "loss": 2.4204, "step": 34 }, { "epoch": 0.016572950577093813, "grad_norm": 20.16985511779785, "learning_rate": 3.6842105263157896e-06, "loss": 2.4438, "step": 35 }, { "epoch": 0.017046463450725066, "grad_norm": 18.412492752075195, "learning_rate": 3.789473684210527e-06, "loss": 2.2141, "step": 36 }, { "epoch": 0.01751997632435632, "grad_norm": 17.698265075683594, "learning_rate": 3.894736842105263e-06, "loss": 2.1291, "step": 37 }, { "epoch": 0.01799348919798757, "grad_norm": 15.409119606018066, "learning_rate": 4.000000000000001e-06, "loss": 2.0709, "step": 38 }, { "epoch": 0.01846700207161882, "grad_norm": 14.957233428955078, "learning_rate": 4.105263157894737e-06, "loss": 1.9438, "step": 39 }, { "epoch": 0.018940514945250075, "grad_norm": 14.200984001159668, "learning_rate": 4.210526315789474e-06, "loss": 1.8377, "step": 40 }, { "epoch": 0.019414027818881324, "grad_norm": 11.768697738647461, "learning_rate": 4.315789473684211e-06, "loss": 1.8675, "step": 41 }, { "epoch": 0.019887540692512577, "grad_norm": 11.194828987121582, "learning_rate": 4.4210526315789476e-06, "loss": 1.7639, "step": 42 }, { "epoch": 0.02036105356614383, "grad_norm": 10.777433395385742, "learning_rate": 4.526315789473685e-06, "loss": 1.6869, "step": 43 }, { "epoch": 0.02083456643977508, "grad_norm": 9.706337928771973, "learning_rate": 4.631578947368421e-06, "loss": 1.7424, "step": 44 }, { "epoch": 0.021308079313406333, "grad_norm": 9.26910400390625, "learning_rate": 4.736842105263158e-06, "loss": 1.5467, "step": 45 }, { "epoch": 0.021781592187037586, "grad_norm": 8.306986808776855, "learning_rate": 4.842105263157895e-06, "loss": 1.5561, "step": 46 }, { "epoch": 0.022255105060668835, "grad_norm": 7.666769027709961, "learning_rate": 4.947368421052632e-06, "loss": 1.4493, "step": 47 }, { "epoch": 0.022728617934300088, "grad_norm": 7.483455181121826, "learning_rate": 5.052631578947369e-06, "loss": 1.383, "step": 48 }, { "epoch": 0.02320213080793134, "grad_norm": 7.80124044418335, "learning_rate": 5.157894736842106e-06, "loss": 1.4092, "step": 49 }, { "epoch": 0.02367564368156259, "grad_norm": 7.231191158294678, "learning_rate": 5.263157894736842e-06, "loss": 1.3909, "step": 50 }, { "epoch": 0.024149156555193844, "grad_norm": 6.258241176605225, "learning_rate": 5.36842105263158e-06, "loss": 1.3075, "step": 51 }, { "epoch": 0.024622669428825097, "grad_norm": 6.165030479431152, "learning_rate": 5.4736842105263165e-06, "loss": 1.2565, "step": 52 }, { "epoch": 0.02509618230245635, "grad_norm": 6.1708455085754395, "learning_rate": 5.578947368421052e-06, "loss": 1.3199, "step": 53 }, { "epoch": 0.0255696951760876, "grad_norm": 5.459502696990967, "learning_rate": 5.68421052631579e-06, "loss": 1.1997, "step": 54 }, { "epoch": 0.026043208049718852, "grad_norm": 5.5988640785217285, "learning_rate": 5.789473684210527e-06, "loss": 1.0775, "step": 55 }, { "epoch": 0.026516720923350105, "grad_norm": 4.691328048706055, "learning_rate": 5.8947368421052634e-06, "loss": 1.185, "step": 56 }, { "epoch": 0.026990233796981355, "grad_norm": 4.2112507820129395, "learning_rate": 6e-06, "loss": 1.099, "step": 57 }, { "epoch": 0.027463746670612608, "grad_norm": 4.1589674949646, "learning_rate": 6.105263157894738e-06, "loss": 1.1201, "step": 58 }, { "epoch": 0.02793725954424386, "grad_norm": 4.95751428604126, "learning_rate": 6.2105263157894745e-06, "loss": 1.1349, "step": 59 }, { "epoch": 0.02841077241787511, "grad_norm": 3.543083906173706, "learning_rate": 6.31578947368421e-06, "loss": 1.0243, "step": 60 }, { "epoch": 0.028884285291506363, "grad_norm": 2.737982749938965, "learning_rate": 6.421052631578948e-06, "loss": 0.9596, "step": 61 }, { "epoch": 0.029357798165137616, "grad_norm": 3.6427838802337646, "learning_rate": 6.526315789473685e-06, "loss": 0.9429, "step": 62 }, { "epoch": 0.029831311038768866, "grad_norm": 3.1644389629364014, "learning_rate": 6.631578947368421e-06, "loss": 1.0354, "step": 63 }, { "epoch": 0.03030482391240012, "grad_norm": 2.886333465576172, "learning_rate": 6.736842105263158e-06, "loss": 0.9896, "step": 64 }, { "epoch": 0.030778336786031372, "grad_norm": 2.797492265701294, "learning_rate": 6.842105263157896e-06, "loss": 0.9985, "step": 65 }, { "epoch": 0.03125184965966262, "grad_norm": 2.594764471054077, "learning_rate": 6.947368421052632e-06, "loss": 0.9611, "step": 66 }, { "epoch": 0.031725362533293874, "grad_norm": 3.788191795349121, "learning_rate": 7.052631578947369e-06, "loss": 0.9094, "step": 67 }, { "epoch": 0.03219887540692513, "grad_norm": 2.9197521209716797, "learning_rate": 7.157894736842106e-06, "loss": 0.8584, "step": 68 }, { "epoch": 0.03267238828055638, "grad_norm": 3.339698314666748, "learning_rate": 7.263157894736843e-06, "loss": 0.8872, "step": 69 }, { "epoch": 0.033145901154187626, "grad_norm": 3.415419578552246, "learning_rate": 7.368421052631579e-06, "loss": 0.8177, "step": 70 }, { "epoch": 0.03361941402781888, "grad_norm": 2.3660242557525635, "learning_rate": 7.473684210526316e-06, "loss": 0.9576, "step": 71 }, { "epoch": 0.03409292690145013, "grad_norm": 2.605135202407837, "learning_rate": 7.578947368421054e-06, "loss": 0.8901, "step": 72 }, { "epoch": 0.034566439775081385, "grad_norm": 3.861926317214966, "learning_rate": 7.68421052631579e-06, "loss": 0.8733, "step": 73 }, { "epoch": 0.03503995264871264, "grad_norm": 3.650552272796631, "learning_rate": 7.789473684210526e-06, "loss": 0.9273, "step": 74 }, { "epoch": 0.03551346552234389, "grad_norm": 3.098768949508667, "learning_rate": 7.894736842105265e-06, "loss": 0.9519, "step": 75 }, { "epoch": 0.03598697839597514, "grad_norm": 3.1536178588867188, "learning_rate": 8.000000000000001e-06, "loss": 0.8491, "step": 76 }, { "epoch": 0.03646049126960639, "grad_norm": 1.9899249076843262, "learning_rate": 8.105263157894736e-06, "loss": 0.8351, "step": 77 }, { "epoch": 0.03693400414323764, "grad_norm": 2.1144354343414307, "learning_rate": 8.210526315789475e-06, "loss": 0.8503, "step": 78 }, { "epoch": 0.037407517016868896, "grad_norm": 2.7250900268554688, "learning_rate": 8.315789473684212e-06, "loss": 0.8294, "step": 79 }, { "epoch": 0.03788102989050015, "grad_norm": 2.3137624263763428, "learning_rate": 8.421052631578948e-06, "loss": 0.78, "step": 80 }, { "epoch": 0.0383545427641314, "grad_norm": 2.3112668991088867, "learning_rate": 8.526315789473685e-06, "loss": 0.7696, "step": 81 }, { "epoch": 0.03882805563776265, "grad_norm": 2.882868766784668, "learning_rate": 8.631578947368422e-06, "loss": 0.829, "step": 82 }, { "epoch": 0.0393015685113939, "grad_norm": 2.7895877361297607, "learning_rate": 8.736842105263158e-06, "loss": 0.8318, "step": 83 }, { "epoch": 0.039775081385025154, "grad_norm": 2.179732322692871, "learning_rate": 8.842105263157895e-06, "loss": 0.7615, "step": 84 }, { "epoch": 0.04024859425865641, "grad_norm": 2.3356051445007324, "learning_rate": 8.947368421052632e-06, "loss": 0.6886, "step": 85 }, { "epoch": 0.04072210713228766, "grad_norm": 2.664323568344116, "learning_rate": 9.05263157894737e-06, "loss": 0.7625, "step": 86 }, { "epoch": 0.04119562000591891, "grad_norm": 2.517026424407959, "learning_rate": 9.157894736842105e-06, "loss": 0.6589, "step": 87 }, { "epoch": 0.04166913287955016, "grad_norm": 1.9179812669754028, "learning_rate": 9.263157894736842e-06, "loss": 0.695, "step": 88 }, { "epoch": 0.04214264575318141, "grad_norm": 2.2245266437530518, "learning_rate": 9.36842105263158e-06, "loss": 0.6799, "step": 89 }, { "epoch": 0.042616158626812665, "grad_norm": 2.003324270248413, "learning_rate": 9.473684210526315e-06, "loss": 0.6828, "step": 90 }, { "epoch": 0.04308967150044392, "grad_norm": 2.43106746673584, "learning_rate": 9.578947368421054e-06, "loss": 0.6584, "step": 91 }, { "epoch": 0.04356318437407517, "grad_norm": 2.4158437252044678, "learning_rate": 9.68421052631579e-06, "loss": 0.6645, "step": 92 }, { "epoch": 0.044036697247706424, "grad_norm": 2.5698916912078857, "learning_rate": 9.789473684210527e-06, "loss": 0.6451, "step": 93 }, { "epoch": 0.04451021012133767, "grad_norm": 1.9317587614059448, "learning_rate": 9.894736842105264e-06, "loss": 0.6401, "step": 94 }, { "epoch": 0.044983722994968924, "grad_norm": 1.8760066032409668, "learning_rate": 1e-05, "loss": 0.6576, "step": 95 }, { "epoch": 0.045457235868600177, "grad_norm": 2.3672170639038086, "learning_rate": 1.0105263157894738e-05, "loss": 0.7306, "step": 96 }, { "epoch": 0.04593074874223143, "grad_norm": 1.790613055229187, "learning_rate": 1.0210526315789476e-05, "loss": 0.6165, "step": 97 }, { "epoch": 0.04640426161586268, "grad_norm": 1.7723586559295654, "learning_rate": 1.0315789473684213e-05, "loss": 0.5753, "step": 98 }, { "epoch": 0.046877774489493935, "grad_norm": 2.3334202766418457, "learning_rate": 1.0421052631578948e-05, "loss": 0.6944, "step": 99 }, { "epoch": 0.04735128736312518, "grad_norm": 1.6184226274490356, "learning_rate": 1.0526315789473684e-05, "loss": 0.5287, "step": 100 }, { "epoch": 0.047824800236756435, "grad_norm": 1.6825703382492065, "learning_rate": 1.0631578947368421e-05, "loss": 0.5426, "step": 101 }, { "epoch": 0.04829831311038769, "grad_norm": 1.4492383003234863, "learning_rate": 1.073684210526316e-05, "loss": 0.53, "step": 102 }, { "epoch": 0.04877182598401894, "grad_norm": 2.225109577178955, "learning_rate": 1.0842105263157896e-05, "loss": 0.5375, "step": 103 }, { "epoch": 0.049245338857650194, "grad_norm": 1.645599365234375, "learning_rate": 1.0947368421052633e-05, "loss": 0.5288, "step": 104 }, { "epoch": 0.049718851731281447, "grad_norm": 1.912824034690857, "learning_rate": 1.105263157894737e-05, "loss": 0.5346, "step": 105 }, { "epoch": 0.0501923646049127, "grad_norm": 2.340646505355835, "learning_rate": 1.1157894736842105e-05, "loss": 0.5581, "step": 106 }, { "epoch": 0.050665877478543946, "grad_norm": 1.8115869760513306, "learning_rate": 1.1263157894736843e-05, "loss": 0.5986, "step": 107 }, { "epoch": 0.0511393903521752, "grad_norm": 2.0887653827667236, "learning_rate": 1.136842105263158e-05, "loss": 0.5353, "step": 108 }, { "epoch": 0.05161290322580645, "grad_norm": 1.7228904962539673, "learning_rate": 1.1473684210526317e-05, "loss": 0.5273, "step": 109 }, { "epoch": 0.052086416099437705, "grad_norm": 2.6061673164367676, "learning_rate": 1.1578947368421053e-05, "loss": 0.5476, "step": 110 }, { "epoch": 0.05255992897306896, "grad_norm": 1.8213406801223755, "learning_rate": 1.1684210526315792e-05, "loss": 0.611, "step": 111 }, { "epoch": 0.05303344184670021, "grad_norm": 1.6525737047195435, "learning_rate": 1.1789473684210527e-05, "loss": 0.5687, "step": 112 }, { "epoch": 0.05350695472033146, "grad_norm": 1.7281478643417358, "learning_rate": 1.1894736842105264e-05, "loss": 0.5377, "step": 113 }, { "epoch": 0.05398046759396271, "grad_norm": 1.835943341255188, "learning_rate": 1.2e-05, "loss": 0.4596, "step": 114 }, { "epoch": 0.05445398046759396, "grad_norm": 1.6497899293899536, "learning_rate": 1.2105263157894737e-05, "loss": 0.544, "step": 115 }, { "epoch": 0.054927493341225216, "grad_norm": 1.4863883256912231, "learning_rate": 1.2210526315789475e-05, "loss": 0.4536, "step": 116 }, { "epoch": 0.05540100621485647, "grad_norm": 1.4068655967712402, "learning_rate": 1.2315789473684212e-05, "loss": 0.4522, "step": 117 }, { "epoch": 0.05587451908848772, "grad_norm": 1.8424599170684814, "learning_rate": 1.2421052631578949e-05, "loss": 0.5377, "step": 118 }, { "epoch": 0.05634803196211897, "grad_norm": 1.6451683044433594, "learning_rate": 1.2526315789473684e-05, "loss": 0.5254, "step": 119 }, { "epoch": 0.05682154483575022, "grad_norm": 1.8392103910446167, "learning_rate": 1.263157894736842e-05, "loss": 0.4758, "step": 120 }, { "epoch": 0.057295057709381474, "grad_norm": 1.9140554666519165, "learning_rate": 1.2736842105263159e-05, "loss": 0.5243, "step": 121 }, { "epoch": 0.05776857058301273, "grad_norm": 2.6478939056396484, "learning_rate": 1.2842105263157896e-05, "loss": 0.4688, "step": 122 }, { "epoch": 0.05824208345664398, "grad_norm": 1.8704174757003784, "learning_rate": 1.2947368421052633e-05, "loss": 0.5192, "step": 123 }, { "epoch": 0.05871559633027523, "grad_norm": 1.791478157043457, "learning_rate": 1.305263157894737e-05, "loss": 0.5216, "step": 124 }, { "epoch": 0.05918910920390648, "grad_norm": 2.0871686935424805, "learning_rate": 1.3157894736842108e-05, "loss": 0.4937, "step": 125 }, { "epoch": 0.05966262207753773, "grad_norm": 1.6168326139450073, "learning_rate": 1.3263157894736843e-05, "loss": 0.5167, "step": 126 }, { "epoch": 0.060136134951168985, "grad_norm": 1.9244468212127686, "learning_rate": 1.336842105263158e-05, "loss": 0.482, "step": 127 }, { "epoch": 0.06060964782480024, "grad_norm": 1.7854818105697632, "learning_rate": 1.3473684210526316e-05, "loss": 0.5376, "step": 128 }, { "epoch": 0.06108316069843149, "grad_norm": 1.5491472482681274, "learning_rate": 1.3578947368421055e-05, "loss": 0.4578, "step": 129 }, { "epoch": 0.061556673572062744, "grad_norm": 2.987952470779419, "learning_rate": 1.3684210526315791e-05, "loss": 0.4415, "step": 130 }, { "epoch": 0.06203018644569399, "grad_norm": 2.423494338989258, "learning_rate": 1.3789473684210528e-05, "loss": 0.4908, "step": 131 }, { "epoch": 0.06250369931932524, "grad_norm": 1.8803611993789673, "learning_rate": 1.3894736842105265e-05, "loss": 0.4588, "step": 132 }, { "epoch": 0.0629772121929565, "grad_norm": 2.069321393966675, "learning_rate": 1.4e-05, "loss": 0.5236, "step": 133 }, { "epoch": 0.06345072506658775, "grad_norm": 2.4028356075286865, "learning_rate": 1.4105263157894738e-05, "loss": 0.5207, "step": 134 }, { "epoch": 0.063924237940219, "grad_norm": 1.9155303239822388, "learning_rate": 1.4210526315789475e-05, "loss": 0.4975, "step": 135 }, { "epoch": 0.06439775081385025, "grad_norm": 3.001650810241699, "learning_rate": 1.4315789473684212e-05, "loss": 0.4302, "step": 136 }, { "epoch": 0.0648712636874815, "grad_norm": 2.335148334503174, "learning_rate": 1.4421052631578948e-05, "loss": 0.5123, "step": 137 }, { "epoch": 0.06534477656111276, "grad_norm": 1.7075388431549072, "learning_rate": 1.4526315789473687e-05, "loss": 0.456, "step": 138 }, { "epoch": 0.065818289434744, "grad_norm": 2.3079159259796143, "learning_rate": 1.4631578947368424e-05, "loss": 0.486, "step": 139 }, { "epoch": 0.06629180230837525, "grad_norm": 2.429774522781372, "learning_rate": 1.4736842105263159e-05, "loss": 0.4889, "step": 140 }, { "epoch": 0.06676531518200651, "grad_norm": 2.658094644546509, "learning_rate": 1.4842105263157895e-05, "loss": 0.4289, "step": 141 }, { "epoch": 0.06723882805563776, "grad_norm": 1.507424235343933, "learning_rate": 1.4947368421052632e-05, "loss": 0.4725, "step": 142 }, { "epoch": 0.06771234092926902, "grad_norm": 1.9254406690597534, "learning_rate": 1.505263157894737e-05, "loss": 0.456, "step": 143 }, { "epoch": 0.06818585380290026, "grad_norm": 2.3014895915985107, "learning_rate": 1.5157894736842107e-05, "loss": 0.4627, "step": 144 }, { "epoch": 0.06865936667653152, "grad_norm": 2.4393584728240967, "learning_rate": 1.5263157894736846e-05, "loss": 0.5304, "step": 145 }, { "epoch": 0.06913287955016277, "grad_norm": 1.627350926399231, "learning_rate": 1.536842105263158e-05, "loss": 0.4897, "step": 146 }, { "epoch": 0.06960639242379402, "grad_norm": 1.4329842329025269, "learning_rate": 1.5473684210526316e-05, "loss": 0.4581, "step": 147 }, { "epoch": 0.07007990529742528, "grad_norm": 1.653219223022461, "learning_rate": 1.5578947368421052e-05, "loss": 0.4961, "step": 148 }, { "epoch": 0.07055341817105652, "grad_norm": 1.8321658372879028, "learning_rate": 1.568421052631579e-05, "loss": 0.4684, "step": 149 }, { "epoch": 0.07102693104468778, "grad_norm": 1.8446546792984009, "learning_rate": 1.578947368421053e-05, "loss": 0.4637, "step": 150 }, { "epoch": 0.07150044391831903, "grad_norm": 1.486007809638977, "learning_rate": 1.5894736842105266e-05, "loss": 0.3637, "step": 151 }, { "epoch": 0.07197395679195027, "grad_norm": 1.6993294954299927, "learning_rate": 1.6000000000000003e-05, "loss": 0.4323, "step": 152 }, { "epoch": 0.07244746966558153, "grad_norm": 1.391952633857727, "learning_rate": 1.6105263157894736e-05, "loss": 0.4072, "step": 153 }, { "epoch": 0.07292098253921278, "grad_norm": 2.0186688899993896, "learning_rate": 1.6210526315789473e-05, "loss": 0.4412, "step": 154 }, { "epoch": 0.07339449541284404, "grad_norm": 2.2258450984954834, "learning_rate": 1.6315789473684213e-05, "loss": 0.4302, "step": 155 }, { "epoch": 0.07386800828647529, "grad_norm": 1.21897554397583, "learning_rate": 1.642105263157895e-05, "loss": 0.4197, "step": 156 }, { "epoch": 0.07434152116010655, "grad_norm": 2.815114974975586, "learning_rate": 1.6526315789473686e-05, "loss": 0.4093, "step": 157 }, { "epoch": 0.07481503403373779, "grad_norm": 1.9448319673538208, "learning_rate": 1.6631578947368423e-05, "loss": 0.3686, "step": 158 }, { "epoch": 0.07528854690736904, "grad_norm": 1.9755760431289673, "learning_rate": 1.673684210526316e-05, "loss": 0.4371, "step": 159 }, { "epoch": 0.0757620597810003, "grad_norm": 2.202780246734619, "learning_rate": 1.6842105263157896e-05, "loss": 0.41, "step": 160 }, { "epoch": 0.07623557265463154, "grad_norm": 1.373448133468628, "learning_rate": 1.6947368421052633e-05, "loss": 0.3839, "step": 161 }, { "epoch": 0.0767090855282628, "grad_norm": 2.2185139656066895, "learning_rate": 1.705263157894737e-05, "loss": 0.4207, "step": 162 }, { "epoch": 0.07718259840189405, "grad_norm": 1.905508279800415, "learning_rate": 1.7157894736842107e-05, "loss": 0.3555, "step": 163 }, { "epoch": 0.0776561112755253, "grad_norm": 1.355228304862976, "learning_rate": 1.7263157894736843e-05, "loss": 0.4047, "step": 164 }, { "epoch": 0.07812962414915656, "grad_norm": 1.822799563407898, "learning_rate": 1.736842105263158e-05, "loss": 0.4355, "step": 165 }, { "epoch": 0.0786031370227878, "grad_norm": 1.541835069656372, "learning_rate": 1.7473684210526317e-05, "loss": 0.3955, "step": 166 }, { "epoch": 0.07907664989641906, "grad_norm": 1.8151495456695557, "learning_rate": 1.7578947368421054e-05, "loss": 0.4162, "step": 167 }, { "epoch": 0.07955016277005031, "grad_norm": 1.320173978805542, "learning_rate": 1.768421052631579e-05, "loss": 0.4257, "step": 168 }, { "epoch": 0.08002367564368157, "grad_norm": 2.0332558155059814, "learning_rate": 1.7789473684210527e-05, "loss": 0.3654, "step": 169 }, { "epoch": 0.08049718851731281, "grad_norm": 1.4273725748062134, "learning_rate": 1.7894736842105264e-05, "loss": 0.4193, "step": 170 }, { "epoch": 0.08097070139094406, "grad_norm": 1.5815318822860718, "learning_rate": 1.8e-05, "loss": 0.4495, "step": 171 }, { "epoch": 0.08144421426457532, "grad_norm": 1.5733940601348877, "learning_rate": 1.810526315789474e-05, "loss": 0.4453, "step": 172 }, { "epoch": 0.08191772713820657, "grad_norm": 1.554313063621521, "learning_rate": 1.8210526315789477e-05, "loss": 0.4122, "step": 173 }, { "epoch": 0.08239124001183783, "grad_norm": 1.6655805110931396, "learning_rate": 1.831578947368421e-05, "loss": 0.4201, "step": 174 }, { "epoch": 0.08286475288546907, "grad_norm": 2.0391829013824463, "learning_rate": 1.8421052631578947e-05, "loss": 0.3712, "step": 175 }, { "epoch": 0.08333826575910032, "grad_norm": 1.7739325761795044, "learning_rate": 1.8526315789473684e-05, "loss": 0.3718, "step": 176 }, { "epoch": 0.08381177863273158, "grad_norm": 1.7949507236480713, "learning_rate": 1.8631578947368424e-05, "loss": 0.4053, "step": 177 }, { "epoch": 0.08428529150636282, "grad_norm": 1.8814252614974976, "learning_rate": 1.873684210526316e-05, "loss": 0.3998, "step": 178 }, { "epoch": 0.08475880437999408, "grad_norm": 1.8132871389389038, "learning_rate": 1.8842105263157898e-05, "loss": 0.4553, "step": 179 }, { "epoch": 0.08523231725362533, "grad_norm": 1.2668806314468384, "learning_rate": 1.894736842105263e-05, "loss": 0.3793, "step": 180 }, { "epoch": 0.08570583012725659, "grad_norm": 1.6622042655944824, "learning_rate": 1.9052631578947368e-05, "loss": 0.398, "step": 181 }, { "epoch": 0.08617934300088784, "grad_norm": 2.258330821990967, "learning_rate": 1.9157894736842108e-05, "loss": 0.3751, "step": 182 }, { "epoch": 0.08665285587451908, "grad_norm": 2.4360086917877197, "learning_rate": 1.9263157894736845e-05, "loss": 0.3793, "step": 183 }, { "epoch": 0.08712636874815034, "grad_norm": 1.8272238969802856, "learning_rate": 1.936842105263158e-05, "loss": 0.4108, "step": 184 }, { "epoch": 0.08759988162178159, "grad_norm": 1.8764162063598633, "learning_rate": 1.9473684210526318e-05, "loss": 0.3656, "step": 185 }, { "epoch": 0.08807339449541285, "grad_norm": 1.8359413146972656, "learning_rate": 1.9578947368421055e-05, "loss": 0.4059, "step": 186 }, { "epoch": 0.0885469073690441, "grad_norm": 1.6942843198776245, "learning_rate": 1.968421052631579e-05, "loss": 0.4086, "step": 187 }, { "epoch": 0.08902042024267534, "grad_norm": 1.4842171669006348, "learning_rate": 1.9789473684210528e-05, "loss": 0.3665, "step": 188 }, { "epoch": 0.0894939331163066, "grad_norm": 2.0384953022003174, "learning_rate": 1.9894736842105265e-05, "loss": 0.4014, "step": 189 }, { "epoch": 0.08996744598993785, "grad_norm": 2.0458626747131348, "learning_rate": 2e-05, "loss": 0.3963, "step": 190 }, { "epoch": 0.09044095886356911, "grad_norm": 1.6842604875564575, "learning_rate": 1.999999869229824e-05, "loss": 0.3383, "step": 191 }, { "epoch": 0.09091447173720035, "grad_norm": 1.3942151069641113, "learning_rate": 1.9999994769193288e-05, "loss": 0.3629, "step": 192 }, { "epoch": 0.09138798461083161, "grad_norm": 1.8772989511489868, "learning_rate": 1.9999988230686176e-05, "loss": 0.3963, "step": 193 }, { "epoch": 0.09186149748446286, "grad_norm": 1.479197382926941, "learning_rate": 1.9999979076778615e-05, "loss": 0.409, "step": 194 }, { "epoch": 0.0923350103580941, "grad_norm": 1.848061442375183, "learning_rate": 1.9999967307473e-05, "loss": 0.3716, "step": 195 }, { "epoch": 0.09280852323172536, "grad_norm": 1.4783906936645508, "learning_rate": 1.9999952922772404e-05, "loss": 0.3995, "step": 196 }, { "epoch": 0.09328203610535661, "grad_norm": 1.4269295930862427, "learning_rate": 1.9999935922680593e-05, "loss": 0.428, "step": 197 }, { "epoch": 0.09375554897898787, "grad_norm": 2.0845489501953125, "learning_rate": 1.9999916307202013e-05, "loss": 0.3909, "step": 198 }, { "epoch": 0.09422906185261912, "grad_norm": 1.550615668296814, "learning_rate": 1.9999894076341794e-05, "loss": 0.3585, "step": 199 }, { "epoch": 0.09470257472625036, "grad_norm": 1.9279413223266602, "learning_rate": 1.999986923010575e-05, "loss": 0.3696, "step": 200 }, { "epoch": 0.09517608759988162, "grad_norm": 1.6810389757156372, "learning_rate": 1.999984176850038e-05, "loss": 0.4183, "step": 201 }, { "epoch": 0.09564960047351287, "grad_norm": 1.6918199062347412, "learning_rate": 1.9999811691532865e-05, "loss": 0.4065, "step": 202 }, { "epoch": 0.09612311334714413, "grad_norm": 1.8900259733200073, "learning_rate": 1.999977899921107e-05, "loss": 0.3825, "step": 203 }, { "epoch": 0.09659662622077538, "grad_norm": 1.9546706676483154, "learning_rate": 1.999974369154355e-05, "loss": 0.3864, "step": 204 }, { "epoch": 0.09707013909440664, "grad_norm": 1.354948878288269, "learning_rate": 1.9999705768539537e-05, "loss": 0.3947, "step": 205 }, { "epoch": 0.09754365196803788, "grad_norm": 1.7710624933242798, "learning_rate": 1.9999665230208947e-05, "loss": 0.4115, "step": 206 }, { "epoch": 0.09801716484166913, "grad_norm": 1.6053873300552368, "learning_rate": 1.9999622076562387e-05, "loss": 0.3892, "step": 207 }, { "epoch": 0.09849067771530039, "grad_norm": 1.1621034145355225, "learning_rate": 1.999957630761114e-05, "loss": 0.3298, "step": 208 }, { "epoch": 0.09896419058893163, "grad_norm": 1.2299578189849854, "learning_rate": 1.9999527923367175e-05, "loss": 0.3176, "step": 209 }, { "epoch": 0.09943770346256289, "grad_norm": 1.7731809616088867, "learning_rate": 1.999947692384315e-05, "loss": 0.3745, "step": 210 }, { "epoch": 0.09991121633619414, "grad_norm": 1.3102498054504395, "learning_rate": 1.9999423309052405e-05, "loss": 0.3688, "step": 211 }, { "epoch": 0.1003847292098254, "grad_norm": 1.3057738542556763, "learning_rate": 1.9999367079008957e-05, "loss": 0.3865, "step": 212 }, { "epoch": 0.10085824208345665, "grad_norm": 1.6710134744644165, "learning_rate": 1.9999308233727516e-05, "loss": 0.3523, "step": 213 }, { "epoch": 0.10133175495708789, "grad_norm": 1.6011284589767456, "learning_rate": 1.9999246773223468e-05, "loss": 0.3286, "step": 214 }, { "epoch": 0.10180526783071915, "grad_norm": 1.5414258241653442, "learning_rate": 1.9999182697512896e-05, "loss": 0.3608, "step": 215 }, { "epoch": 0.1022787807043504, "grad_norm": 2.0255424976348877, "learning_rate": 1.9999116006612553e-05, "loss": 0.3466, "step": 216 }, { "epoch": 0.10275229357798166, "grad_norm": 1.9602148532867432, "learning_rate": 1.999904670053988e-05, "loss": 0.3827, "step": 217 }, { "epoch": 0.1032258064516129, "grad_norm": 2.306156635284424, "learning_rate": 1.9998974779313004e-05, "loss": 0.371, "step": 218 }, { "epoch": 0.10369931932524415, "grad_norm": 1.572983980178833, "learning_rate": 1.9998900242950736e-05, "loss": 0.3279, "step": 219 }, { "epoch": 0.10417283219887541, "grad_norm": 1.3669328689575195, "learning_rate": 1.9998823091472574e-05, "loss": 0.3717, "step": 220 }, { "epoch": 0.10464634507250666, "grad_norm": 1.9657361507415771, "learning_rate": 1.9998743324898687e-05, "loss": 0.3583, "step": 221 }, { "epoch": 0.10511985794613792, "grad_norm": 1.8538391590118408, "learning_rate": 1.9998660943249947e-05, "loss": 0.3816, "step": 222 }, { "epoch": 0.10559337081976916, "grad_norm": 1.4254624843597412, "learning_rate": 1.9998575946547897e-05, "loss": 0.3319, "step": 223 }, { "epoch": 0.10606688369340042, "grad_norm": 1.865258812904358, "learning_rate": 1.9998488334814766e-05, "loss": 0.3527, "step": 224 }, { "epoch": 0.10654039656703167, "grad_norm": 1.446964979171753, "learning_rate": 1.9998398108073465e-05, "loss": 0.4055, "step": 225 }, { "epoch": 0.10701390944066291, "grad_norm": 1.7752127647399902, "learning_rate": 1.9998305266347598e-05, "loss": 0.3875, "step": 226 }, { "epoch": 0.10748742231429417, "grad_norm": 1.8876196146011353, "learning_rate": 1.9998209809661443e-05, "loss": 0.3439, "step": 227 }, { "epoch": 0.10796093518792542, "grad_norm": 1.2993323802947998, "learning_rate": 1.9998111738039964e-05, "loss": 0.3372, "step": 228 }, { "epoch": 0.10843444806155668, "grad_norm": 1.4371949434280396, "learning_rate": 1.9998011051508816e-05, "loss": 0.3672, "step": 229 }, { "epoch": 0.10890796093518793, "grad_norm": 1.5482254028320312, "learning_rate": 1.9997907750094332e-05, "loss": 0.3554, "step": 230 }, { "epoch": 0.10938147380881917, "grad_norm": 1.4659943580627441, "learning_rate": 1.9997801833823526e-05, "loss": 0.3691, "step": 231 }, { "epoch": 0.10985498668245043, "grad_norm": 1.5168931484222412, "learning_rate": 1.99976933027241e-05, "loss": 0.3559, "step": 232 }, { "epoch": 0.11032849955608168, "grad_norm": 1.4778302907943726, "learning_rate": 1.999758215682444e-05, "loss": 0.3201, "step": 233 }, { "epoch": 0.11080201242971294, "grad_norm": 2.5164334774017334, "learning_rate": 1.9997468396153615e-05, "loss": 0.3711, "step": 234 }, { "epoch": 0.11127552530334418, "grad_norm": 1.5941665172576904, "learning_rate": 1.999735202074138e-05, "loss": 0.3489, "step": 235 }, { "epoch": 0.11174903817697544, "grad_norm": 1.5780380964279175, "learning_rate": 1.9997233030618167e-05, "loss": 0.3282, "step": 236 }, { "epoch": 0.11222255105060669, "grad_norm": 1.8493186235427856, "learning_rate": 1.9997111425815103e-05, "loss": 0.3516, "step": 237 }, { "epoch": 0.11269606392423794, "grad_norm": 1.4640581607818604, "learning_rate": 1.999698720636399e-05, "loss": 0.3542, "step": 238 }, { "epoch": 0.1131695767978692, "grad_norm": 2.8939545154571533, "learning_rate": 1.9996860372297312e-05, "loss": 0.3596, "step": 239 }, { "epoch": 0.11364308967150044, "grad_norm": 1.7447127103805542, "learning_rate": 1.999673092364825e-05, "loss": 0.3593, "step": 240 }, { "epoch": 0.1141166025451317, "grad_norm": 1.753167986869812, "learning_rate": 1.9996598860450653e-05, "loss": 0.3278, "step": 241 }, { "epoch": 0.11459011541876295, "grad_norm": 1.8249387741088867, "learning_rate": 1.9996464182739063e-05, "loss": 0.3373, "step": 242 }, { "epoch": 0.1150636282923942, "grad_norm": 1.9170994758605957, "learning_rate": 1.99963268905487e-05, "loss": 0.3284, "step": 243 }, { "epoch": 0.11553714116602545, "grad_norm": 2.2071142196655273, "learning_rate": 1.999618698391548e-05, "loss": 0.3633, "step": 244 }, { "epoch": 0.1160106540396567, "grad_norm": 1.6293482780456543, "learning_rate": 1.9996044462875984e-05, "loss": 0.3624, "step": 245 }, { "epoch": 0.11648416691328796, "grad_norm": 2.3833389282226562, "learning_rate": 1.9995899327467498e-05, "loss": 0.3329, "step": 246 }, { "epoch": 0.1169576797869192, "grad_norm": 1.9268333911895752, "learning_rate": 1.999575157772797e-05, "loss": 0.3451, "step": 247 }, { "epoch": 0.11743119266055047, "grad_norm": 1.3293579816818237, "learning_rate": 1.9995601213696053e-05, "loss": 0.3676, "step": 248 }, { "epoch": 0.11790470553418171, "grad_norm": 1.651096224784851, "learning_rate": 1.9995448235411062e-05, "loss": 0.3527, "step": 249 }, { "epoch": 0.11837821840781296, "grad_norm": 1.949661135673523, "learning_rate": 1.999529264291302e-05, "loss": 0.4063, "step": 250 }, { "epoch": 0.11885173128144422, "grad_norm": 1.3306670188903809, "learning_rate": 1.9995134436242606e-05, "loss": 0.3607, "step": 251 }, { "epoch": 0.11932524415507546, "grad_norm": 1.5161584615707397, "learning_rate": 1.9994973615441207e-05, "loss": 0.3825, "step": 252 }, { "epoch": 0.11979875702870672, "grad_norm": 2.071152687072754, "learning_rate": 1.9994810180550884e-05, "loss": 0.3306, "step": 253 }, { "epoch": 0.12027226990233797, "grad_norm": 1.514334797859192, "learning_rate": 1.9994644131614382e-05, "loss": 0.3254, "step": 254 }, { "epoch": 0.12074578277596922, "grad_norm": 1.9304691553115845, "learning_rate": 1.9994475468675122e-05, "loss": 0.3863, "step": 255 }, { "epoch": 0.12121929564960048, "grad_norm": 1.6955091953277588, "learning_rate": 1.9994304191777228e-05, "loss": 0.3559, "step": 256 }, { "epoch": 0.12169280852323172, "grad_norm": 1.3693172931671143, "learning_rate": 1.9994130300965485e-05, "loss": 0.3393, "step": 257 }, { "epoch": 0.12216632139686298, "grad_norm": 1.4945056438446045, "learning_rate": 1.9993953796285377e-05, "loss": 0.3051, "step": 258 }, { "epoch": 0.12263983427049423, "grad_norm": 1.3563364744186401, "learning_rate": 1.999377467778307e-05, "loss": 0.3369, "step": 259 }, { "epoch": 0.12311334714412549, "grad_norm": 1.2931690216064453, "learning_rate": 1.9993592945505402e-05, "loss": 0.3297, "step": 260 }, { "epoch": 0.12358686001775673, "grad_norm": 1.5800117254257202, "learning_rate": 1.9993408599499914e-05, "loss": 0.3287, "step": 261 }, { "epoch": 0.12406037289138798, "grad_norm": 1.3716331720352173, "learning_rate": 1.999322163981481e-05, "loss": 0.3092, "step": 262 }, { "epoch": 0.12453388576501924, "grad_norm": 1.6586685180664062, "learning_rate": 1.9993032066499e-05, "loss": 0.4093, "step": 263 }, { "epoch": 0.12500739863865049, "grad_norm": 1.4457738399505615, "learning_rate": 1.999283987960205e-05, "loss": 0.3679, "step": 264 }, { "epoch": 0.12548091151228175, "grad_norm": 1.7291022539138794, "learning_rate": 1.9992645079174235e-05, "loss": 0.3708, "step": 265 }, { "epoch": 0.125954424385913, "grad_norm": 1.6154969930648804, "learning_rate": 1.99924476652665e-05, "loss": 0.3684, "step": 266 }, { "epoch": 0.12642793725954424, "grad_norm": 1.2971596717834473, "learning_rate": 1.999224763793048e-05, "loss": 0.3378, "step": 267 }, { "epoch": 0.1269014501331755, "grad_norm": 2.0271036624908447, "learning_rate": 1.9992044997218484e-05, "loss": 0.3062, "step": 268 }, { "epoch": 0.12737496300680676, "grad_norm": 1.4624063968658447, "learning_rate": 1.9991839743183514e-05, "loss": 0.3105, "step": 269 }, { "epoch": 0.127848475880438, "grad_norm": 1.4940061569213867, "learning_rate": 1.9991631875879252e-05, "loss": 0.3523, "step": 270 }, { "epoch": 0.12832198875406925, "grad_norm": 1.7870043516159058, "learning_rate": 1.9991421395360066e-05, "loss": 0.3551, "step": 271 }, { "epoch": 0.1287955016277005, "grad_norm": 1.2716615200042725, "learning_rate": 1.9991208301681e-05, "loss": 0.3073, "step": 272 }, { "epoch": 0.12926901450133174, "grad_norm": 1.2899311780929565, "learning_rate": 1.9990992594897792e-05, "loss": 0.3495, "step": 273 }, { "epoch": 0.129742527374963, "grad_norm": 1.1452953815460205, "learning_rate": 1.9990774275066856e-05, "loss": 0.3054, "step": 274 }, { "epoch": 0.13021604024859426, "grad_norm": 1.4126183986663818, "learning_rate": 1.9990553342245288e-05, "loss": 0.3209, "step": 275 }, { "epoch": 0.13068955312222552, "grad_norm": 1.9172788858413696, "learning_rate": 1.9990329796490878e-05, "loss": 0.326, "step": 276 }, { "epoch": 0.13116306599585675, "grad_norm": 1.8034718036651611, "learning_rate": 1.9990103637862086e-05, "loss": 0.3302, "step": 277 }, { "epoch": 0.131636578869488, "grad_norm": 2.3256547451019287, "learning_rate": 1.998987486641806e-05, "loss": 0.36, "step": 278 }, { "epoch": 0.13211009174311927, "grad_norm": 2.554792642593384, "learning_rate": 1.9989643482218642e-05, "loss": 0.3144, "step": 279 }, { "epoch": 0.1325836046167505, "grad_norm": 1.872496485710144, "learning_rate": 1.9989409485324342e-05, "loss": 0.3312, "step": 280 }, { "epoch": 0.13305711749038177, "grad_norm": 1.47379469871521, "learning_rate": 1.998917287579636e-05, "loss": 0.359, "step": 281 }, { "epoch": 0.13353063036401303, "grad_norm": 1.6940183639526367, "learning_rate": 1.998893365369658e-05, "loss": 0.3364, "step": 282 }, { "epoch": 0.13400414323764429, "grad_norm": 2.1470634937286377, "learning_rate": 1.9988691819087568e-05, "loss": 0.3076, "step": 283 }, { "epoch": 0.13447765611127552, "grad_norm": 1.484626054763794, "learning_rate": 1.9988447372032573e-05, "loss": 0.3061, "step": 284 }, { "epoch": 0.13495116898490678, "grad_norm": 2.2221710681915283, "learning_rate": 1.9988200312595527e-05, "loss": 0.3271, "step": 285 }, { "epoch": 0.13542468185853804, "grad_norm": 1.8547298908233643, "learning_rate": 1.998795064084105e-05, "loss": 0.3321, "step": 286 }, { "epoch": 0.13589819473216927, "grad_norm": 1.4910904169082642, "learning_rate": 1.9987698356834438e-05, "loss": 0.3319, "step": 287 }, { "epoch": 0.13637170760580053, "grad_norm": 1.8652129173278809, "learning_rate": 1.9987443460641672e-05, "loss": 0.3279, "step": 288 }, { "epoch": 0.1368452204794318, "grad_norm": 1.7757213115692139, "learning_rate": 1.998718595232942e-05, "loss": 0.3662, "step": 289 }, { "epoch": 0.13731873335306305, "grad_norm": 1.7252709865570068, "learning_rate": 1.998692583196503e-05, "loss": 0.3394, "step": 290 }, { "epoch": 0.13779224622669428, "grad_norm": 1.5977404117584229, "learning_rate": 1.998666309961653e-05, "loss": 0.3187, "step": 291 }, { "epoch": 0.13826575910032554, "grad_norm": 2.18762469291687, "learning_rate": 1.9986397755352643e-05, "loss": 0.3153, "step": 292 }, { "epoch": 0.1387392719739568, "grad_norm": 1.6634726524353027, "learning_rate": 1.9986129799242767e-05, "loss": 0.3312, "step": 293 }, { "epoch": 0.13921278484758803, "grad_norm": 1.709532618522644, "learning_rate": 1.9985859231356976e-05, "loss": 0.3461, "step": 294 }, { "epoch": 0.1396862977212193, "grad_norm": 1.9468045234680176, "learning_rate": 1.998558605176604e-05, "loss": 0.3441, "step": 295 }, { "epoch": 0.14015981059485055, "grad_norm": 1.3585155010223389, "learning_rate": 1.9985310260541403e-05, "loss": 0.332, "step": 296 }, { "epoch": 0.14063332346848179, "grad_norm": 2.272162437438965, "learning_rate": 1.99850318577552e-05, "loss": 0.3187, "step": 297 }, { "epoch": 0.14110683634211305, "grad_norm": 1.5794426202774048, "learning_rate": 1.998475084348024e-05, "loss": 0.2903, "step": 298 }, { "epoch": 0.1415803492157443, "grad_norm": 1.7926369905471802, "learning_rate": 1.998446721779002e-05, "loss": 0.3271, "step": 299 }, { "epoch": 0.14205386208937557, "grad_norm": 2.8953635692596436, "learning_rate": 1.9984180980758724e-05, "loss": 0.3276, "step": 300 }, { "epoch": 0.1425273749630068, "grad_norm": 2.217665910720825, "learning_rate": 1.998389213246121e-05, "loss": 0.3039, "step": 301 }, { "epoch": 0.14300088783663806, "grad_norm": 1.6621983051300049, "learning_rate": 1.9983600672973026e-05, "loss": 0.3323, "step": 302 }, { "epoch": 0.14347440071026932, "grad_norm": 2.2490603923797607, "learning_rate": 1.99833066023704e-05, "loss": 0.3002, "step": 303 }, { "epoch": 0.14394791358390055, "grad_norm": 1.3009684085845947, "learning_rate": 1.9983009920730244e-05, "loss": 0.3074, "step": 304 }, { "epoch": 0.1444214264575318, "grad_norm": 1.508434534072876, "learning_rate": 1.998271062813015e-05, "loss": 0.3917, "step": 305 }, { "epoch": 0.14489493933116307, "grad_norm": 1.421549677848816, "learning_rate": 1.998240872464839e-05, "loss": 0.3269, "step": 306 }, { "epoch": 0.14536845220479433, "grad_norm": 1.5330595970153809, "learning_rate": 1.9982104210363936e-05, "loss": 0.3122, "step": 307 }, { "epoch": 0.14584196507842556, "grad_norm": 1.7626796960830688, "learning_rate": 1.9981797085356426e-05, "loss": 0.3176, "step": 308 }, { "epoch": 0.14631547795205682, "grad_norm": 1.6426533460617065, "learning_rate": 1.998148734970618e-05, "loss": 0.3441, "step": 309 }, { "epoch": 0.14678899082568808, "grad_norm": 1.3211804628372192, "learning_rate": 1.9981175003494217e-05, "loss": 0.2941, "step": 310 }, { "epoch": 0.1472625036993193, "grad_norm": 1.625162124633789, "learning_rate": 1.9980860046802214e-05, "loss": 0.3284, "step": 311 }, { "epoch": 0.14773601657295057, "grad_norm": 1.1030138731002808, "learning_rate": 1.998054247971256e-05, "loss": 0.3086, "step": 312 }, { "epoch": 0.14820952944658183, "grad_norm": 1.5524837970733643, "learning_rate": 1.9980222302308297e-05, "loss": 0.3069, "step": 313 }, { "epoch": 0.1486830423202131, "grad_norm": 1.973054051399231, "learning_rate": 1.997989951467318e-05, "loss": 0.3185, "step": 314 }, { "epoch": 0.14915655519384433, "grad_norm": 1.6762253046035767, "learning_rate": 1.9979574116891617e-05, "loss": 0.3271, "step": 315 }, { "epoch": 0.14963006806747559, "grad_norm": 1.4578509330749512, "learning_rate": 1.997924610904872e-05, "loss": 0.3082, "step": 316 }, { "epoch": 0.15010358094110685, "grad_norm": 1.9425774812698364, "learning_rate": 1.9978915491230277e-05, "loss": 0.3229, "step": 317 }, { "epoch": 0.15057709381473808, "grad_norm": 1.977746605873108, "learning_rate": 1.997858226352275e-05, "loss": 0.3359, "step": 318 }, { "epoch": 0.15105060668836934, "grad_norm": 1.842903971672058, "learning_rate": 1.9978246426013304e-05, "loss": 0.3891, "step": 319 }, { "epoch": 0.1515241195620006, "grad_norm": 1.3752530813217163, "learning_rate": 1.9977907978789762e-05, "loss": 0.3273, "step": 320 }, { "epoch": 0.15199763243563186, "grad_norm": 1.6148236989974976, "learning_rate": 1.997756692194065e-05, "loss": 0.3787, "step": 321 }, { "epoch": 0.1524711453092631, "grad_norm": 1.5016567707061768, "learning_rate": 1.997722325555516e-05, "loss": 0.2908, "step": 322 }, { "epoch": 0.15294465818289435, "grad_norm": 1.3146488666534424, "learning_rate": 1.9976876979723185e-05, "loss": 0.2753, "step": 323 }, { "epoch": 0.1534181710565256, "grad_norm": 1.3233283758163452, "learning_rate": 1.9976528094535285e-05, "loss": 0.3403, "step": 324 }, { "epoch": 0.15389168393015684, "grad_norm": 1.2820795774459839, "learning_rate": 1.9976176600082702e-05, "loss": 0.3536, "step": 325 }, { "epoch": 0.1543651968037881, "grad_norm": 1.5473982095718384, "learning_rate": 1.9975822496457377e-05, "loss": 0.3327, "step": 326 }, { "epoch": 0.15483870967741936, "grad_norm": 2.2213194370269775, "learning_rate": 1.9975465783751908e-05, "loss": 0.3157, "step": 327 }, { "epoch": 0.1553122225510506, "grad_norm": 2.196427345275879, "learning_rate": 1.9975106462059604e-05, "loss": 0.2918, "step": 328 }, { "epoch": 0.15578573542468185, "grad_norm": 1.3942145109176636, "learning_rate": 1.997474453147444e-05, "loss": 0.3043, "step": 329 }, { "epoch": 0.1562592482983131, "grad_norm": 1.2132917642593384, "learning_rate": 1.9974379992091065e-05, "loss": 0.2957, "step": 330 }, { "epoch": 0.15673276117194437, "grad_norm": 1.470230221748352, "learning_rate": 1.997401284400483e-05, "loss": 0.3122, "step": 331 }, { "epoch": 0.1572062740455756, "grad_norm": 2.0138354301452637, "learning_rate": 1.9973643087311755e-05, "loss": 0.3466, "step": 332 }, { "epoch": 0.15767978691920687, "grad_norm": 1.662995457649231, "learning_rate": 1.9973270722108553e-05, "loss": 0.3182, "step": 333 }, { "epoch": 0.15815329979283813, "grad_norm": 1.7974313497543335, "learning_rate": 1.9972895748492603e-05, "loss": 0.3393, "step": 334 }, { "epoch": 0.15862681266646936, "grad_norm": 1.5902550220489502, "learning_rate": 1.9972518166561983e-05, "loss": 0.304, "step": 335 }, { "epoch": 0.15910032554010062, "grad_norm": 1.5947198867797852, "learning_rate": 1.997213797641544e-05, "loss": 0.3209, "step": 336 }, { "epoch": 0.15957383841373188, "grad_norm": 1.3484359979629517, "learning_rate": 1.9971755178152416e-05, "loss": 0.3136, "step": 337 }, { "epoch": 0.16004735128736314, "grad_norm": 1.3645367622375488, "learning_rate": 1.9971369771873022e-05, "loss": 0.3269, "step": 338 }, { "epoch": 0.16052086416099437, "grad_norm": 1.8705230951309204, "learning_rate": 1.997098175767806e-05, "loss": 0.2756, "step": 339 }, { "epoch": 0.16099437703462563, "grad_norm": 1.3941823244094849, "learning_rate": 1.997059113566901e-05, "loss": 0.291, "step": 340 }, { "epoch": 0.1614678899082569, "grad_norm": 1.3297516107559204, "learning_rate": 1.9970197905948035e-05, "loss": 0.3159, "step": 341 }, { "epoch": 0.16194140278188812, "grad_norm": 1.6793006658554077, "learning_rate": 1.996980206861799e-05, "loss": 0.3268, "step": 342 }, { "epoch": 0.16241491565551938, "grad_norm": 1.1595239639282227, "learning_rate": 1.9969403623782385e-05, "loss": 0.2932, "step": 343 }, { "epoch": 0.16288842852915064, "grad_norm": 2.3051133155822754, "learning_rate": 1.9969002571545442e-05, "loss": 0.3267, "step": 344 }, { "epoch": 0.1633619414027819, "grad_norm": 2.491314172744751, "learning_rate": 1.996859891201205e-05, "loss": 0.3367, "step": 345 }, { "epoch": 0.16383545427641313, "grad_norm": 1.5665141344070435, "learning_rate": 1.996819264528778e-05, "loss": 0.3012, "step": 346 }, { "epoch": 0.1643089671500444, "grad_norm": 2.1546201705932617, "learning_rate": 1.996778377147889e-05, "loss": 0.3249, "step": 347 }, { "epoch": 0.16478248002367565, "grad_norm": 2.4622466564178467, "learning_rate": 1.9967372290692314e-05, "loss": 0.3302, "step": 348 }, { "epoch": 0.16525599289730689, "grad_norm": 1.5641582012176514, "learning_rate": 1.9966958203035673e-05, "loss": 0.3561, "step": 349 }, { "epoch": 0.16572950577093815, "grad_norm": 1.4802383184432983, "learning_rate": 1.996654150861727e-05, "loss": 0.2927, "step": 350 }, { "epoch": 0.1662030186445694, "grad_norm": 2.025378942489624, "learning_rate": 1.996612220754608e-05, "loss": 0.3213, "step": 351 }, { "epoch": 0.16667653151820064, "grad_norm": 2.12949275970459, "learning_rate": 1.9965700299931772e-05, "loss": 0.3446, "step": 352 }, { "epoch": 0.1671500443918319, "grad_norm": 1.340847373008728, "learning_rate": 1.9965275785884692e-05, "loss": 0.308, "step": 353 }, { "epoch": 0.16762355726546316, "grad_norm": 1.11890709400177, "learning_rate": 1.9964848665515867e-05, "loss": 0.2796, "step": 354 }, { "epoch": 0.16809707013909442, "grad_norm": 1.3249599933624268, "learning_rate": 1.9964418938937005e-05, "loss": 0.3066, "step": 355 }, { "epoch": 0.16857058301272565, "grad_norm": 2.209138870239258, "learning_rate": 1.99639866062605e-05, "loss": 0.3221, "step": 356 }, { "epoch": 0.1690440958863569, "grad_norm": 1.4605731964111328, "learning_rate": 1.9963551667599425e-05, "loss": 0.3104, "step": 357 }, { "epoch": 0.16951760875998817, "grad_norm": 1.4312578439712524, "learning_rate": 1.9963114123067525e-05, "loss": 0.3303, "step": 358 }, { "epoch": 0.1699911216336194, "grad_norm": 1.3438875675201416, "learning_rate": 1.9962673972779244e-05, "loss": 0.327, "step": 359 }, { "epoch": 0.17046463450725066, "grad_norm": 1.717059850692749, "learning_rate": 1.99622312168497e-05, "loss": 0.3151, "step": 360 }, { "epoch": 0.17093814738088192, "grad_norm": 1.7237285375595093, "learning_rate": 1.9961785855394685e-05, "loss": 0.288, "step": 361 }, { "epoch": 0.17141166025451318, "grad_norm": 1.342538833618164, "learning_rate": 1.9961337888530686e-05, "loss": 0.2842, "step": 362 }, { "epoch": 0.17188517312814441, "grad_norm": 1.9628273248672485, "learning_rate": 1.996088731637486e-05, "loss": 0.3327, "step": 363 }, { "epoch": 0.17235868600177567, "grad_norm": 1.262096643447876, "learning_rate": 1.996043413904505e-05, "loss": 0.2896, "step": 364 }, { "epoch": 0.17283219887540693, "grad_norm": 1.4459971189498901, "learning_rate": 1.995997835665978e-05, "loss": 0.3512, "step": 365 }, { "epoch": 0.17330571174903817, "grad_norm": 1.829930067062378, "learning_rate": 1.9959519969338257e-05, "loss": 0.3222, "step": 366 }, { "epoch": 0.17377922462266943, "grad_norm": 1.5174282789230347, "learning_rate": 1.9959058977200368e-05, "loss": 0.3152, "step": 367 }, { "epoch": 0.17425273749630069, "grad_norm": 2.4811291694641113, "learning_rate": 1.9958595380366683e-05, "loss": 0.2855, "step": 368 }, { "epoch": 0.17472625036993195, "grad_norm": 1.502210259437561, "learning_rate": 1.995812917895844e-05, "loss": 0.3019, "step": 369 }, { "epoch": 0.17519976324356318, "grad_norm": 2.2557857036590576, "learning_rate": 1.9957660373097587e-05, "loss": 0.3081, "step": 370 }, { "epoch": 0.17567327611719444, "grad_norm": 1.5600831508636475, "learning_rate": 1.9957188962906722e-05, "loss": 0.2764, "step": 371 }, { "epoch": 0.1761467889908257, "grad_norm": 1.6627660989761353, "learning_rate": 1.9956714948509144e-05, "loss": 0.2831, "step": 372 }, { "epoch": 0.17662030186445693, "grad_norm": 1.7966787815093994, "learning_rate": 1.9956238330028825e-05, "loss": 0.3213, "step": 373 }, { "epoch": 0.1770938147380882, "grad_norm": 1.3147883415222168, "learning_rate": 1.9955759107590424e-05, "loss": 0.2842, "step": 374 }, { "epoch": 0.17756732761171945, "grad_norm": 1.6301828622817993, "learning_rate": 1.9955277281319265e-05, "loss": 0.288, "step": 375 }, { "epoch": 0.17804084048535068, "grad_norm": 1.4622524976730347, "learning_rate": 1.995479285134138e-05, "loss": 0.3361, "step": 376 }, { "epoch": 0.17851435335898194, "grad_norm": 1.8074785470962524, "learning_rate": 1.9954305817783456e-05, "loss": 0.3038, "step": 377 }, { "epoch": 0.1789878662326132, "grad_norm": 1.4103055000305176, "learning_rate": 1.995381618077288e-05, "loss": 0.2992, "step": 378 }, { "epoch": 0.17946137910624446, "grad_norm": 1.4178228378295898, "learning_rate": 1.9953323940437707e-05, "loss": 0.3074, "step": 379 }, { "epoch": 0.1799348919798757, "grad_norm": 2.322828769683838, "learning_rate": 1.9952829096906677e-05, "loss": 0.3414, "step": 380 }, { "epoch": 0.18040840485350695, "grad_norm": 1.5784039497375488, "learning_rate": 1.9952331650309217e-05, "loss": 0.3, "step": 381 }, { "epoch": 0.18088191772713821, "grad_norm": 1.6129640340805054, "learning_rate": 1.9951831600775423e-05, "loss": 0.3698, "step": 382 }, { "epoch": 0.18135543060076945, "grad_norm": 1.7583578824996948, "learning_rate": 1.995132894843608e-05, "loss": 0.312, "step": 383 }, { "epoch": 0.1818289434744007, "grad_norm": 1.4428211450576782, "learning_rate": 1.9950823693422653e-05, "loss": 0.3401, "step": 384 }, { "epoch": 0.18230245634803197, "grad_norm": 1.8162686824798584, "learning_rate": 1.995031583586729e-05, "loss": 0.2884, "step": 385 }, { "epoch": 0.18277596922166323, "grad_norm": 1.871068000793457, "learning_rate": 1.9949805375902807e-05, "loss": 0.2974, "step": 386 }, { "epoch": 0.18324948209529446, "grad_norm": 1.9816604852676392, "learning_rate": 1.994929231366272e-05, "loss": 0.269, "step": 387 }, { "epoch": 0.18372299496892572, "grad_norm": 1.8843307495117188, "learning_rate": 1.994877664928121e-05, "loss": 0.3071, "step": 388 }, { "epoch": 0.18419650784255698, "grad_norm": 1.7556949853897095, "learning_rate": 1.994825838289314e-05, "loss": 0.3656, "step": 389 }, { "epoch": 0.1846700207161882, "grad_norm": 2.0573790073394775, "learning_rate": 1.9947737514634068e-05, "loss": 0.3047, "step": 390 }, { "epoch": 0.18514353358981947, "grad_norm": 1.355928897857666, "learning_rate": 1.9947214044640215e-05, "loss": 0.3033, "step": 391 }, { "epoch": 0.18561704646345073, "grad_norm": 1.6918185949325562, "learning_rate": 1.9946687973048493e-05, "loss": 0.2985, "step": 392 }, { "epoch": 0.186090559337082, "grad_norm": 1.477439045906067, "learning_rate": 1.9946159299996485e-05, "loss": 0.3319, "step": 393 }, { "epoch": 0.18656407221071322, "grad_norm": 1.5000855922698975, "learning_rate": 1.9945628025622466e-05, "loss": 0.2956, "step": 394 }, { "epoch": 0.18703758508434448, "grad_norm": 1.4173663854599, "learning_rate": 1.9945094150065385e-05, "loss": 0.361, "step": 395 }, { "epoch": 0.18751109795797574, "grad_norm": 1.6704801321029663, "learning_rate": 1.9944557673464873e-05, "loss": 0.3124, "step": 396 }, { "epoch": 0.18798461083160697, "grad_norm": 1.5634368658065796, "learning_rate": 1.9944018595961235e-05, "loss": 0.294, "step": 397 }, { "epoch": 0.18845812370523823, "grad_norm": 1.3400732278823853, "learning_rate": 1.9943476917695465e-05, "loss": 0.3401, "step": 398 }, { "epoch": 0.1889316365788695, "grad_norm": 1.7150495052337646, "learning_rate": 1.9942932638809233e-05, "loss": 0.319, "step": 399 }, { "epoch": 0.18940514945250073, "grad_norm": 1.299709439277649, "learning_rate": 1.9942385759444892e-05, "loss": 0.3081, "step": 400 }, { "epoch": 0.189878662326132, "grad_norm": 1.4215167760849, "learning_rate": 1.9941836279745473e-05, "loss": 0.3435, "step": 401 }, { "epoch": 0.19035217519976325, "grad_norm": 1.525974154472351, "learning_rate": 1.9941284199854684e-05, "loss": 0.2943, "step": 402 }, { "epoch": 0.1908256880733945, "grad_norm": 1.4975392818450928, "learning_rate": 1.994072951991692e-05, "loss": 0.2931, "step": 403 }, { "epoch": 0.19129920094702574, "grad_norm": 2.3207857608795166, "learning_rate": 1.9940172240077248e-05, "loss": 0.2704, "step": 404 }, { "epoch": 0.191772713820657, "grad_norm": 1.4448798894882202, "learning_rate": 1.993961236048142e-05, "loss": 0.3027, "step": 405 }, { "epoch": 0.19224622669428826, "grad_norm": 1.5526810884475708, "learning_rate": 1.9939049881275868e-05, "loss": 0.3239, "step": 406 }, { "epoch": 0.1927197395679195, "grad_norm": 1.3078432083129883, "learning_rate": 1.9938484802607704e-05, "loss": 0.2749, "step": 407 }, { "epoch": 0.19319325244155075, "grad_norm": 1.5397154092788696, "learning_rate": 1.993791712462472e-05, "loss": 0.3503, "step": 408 }, { "epoch": 0.193666765315182, "grad_norm": 1.3340824842453003, "learning_rate": 1.9937346847475382e-05, "loss": 0.3272, "step": 409 }, { "epoch": 0.19414027818881327, "grad_norm": 1.463257074356079, "learning_rate": 1.9936773971308847e-05, "loss": 0.3026, "step": 410 }, { "epoch": 0.1946137910624445, "grad_norm": 1.3551300764083862, "learning_rate": 1.993619849627494e-05, "loss": 0.2841, "step": 411 }, { "epoch": 0.19508730393607576, "grad_norm": 1.297788143157959, "learning_rate": 1.9935620422524172e-05, "loss": 0.2927, "step": 412 }, { "epoch": 0.19556081680970702, "grad_norm": 1.5475691556930542, "learning_rate": 1.993503975020773e-05, "loss": 0.2562, "step": 413 }, { "epoch": 0.19603432968333825, "grad_norm": 1.5862221717834473, "learning_rate": 1.993445647947749e-05, "loss": 0.2706, "step": 414 }, { "epoch": 0.19650784255696951, "grad_norm": 1.496739387512207, "learning_rate": 1.9933870610486e-05, "loss": 0.2786, "step": 415 }, { "epoch": 0.19698135543060077, "grad_norm": 1.2570983171463013, "learning_rate": 1.9933282143386478e-05, "loss": 0.2779, "step": 416 }, { "epoch": 0.19745486830423203, "grad_norm": 1.8604596853256226, "learning_rate": 1.9932691078332843e-05, "loss": 0.307, "step": 417 }, { "epoch": 0.19792838117786327, "grad_norm": 1.6916229724884033, "learning_rate": 1.9932097415479683e-05, "loss": 0.2797, "step": 418 }, { "epoch": 0.19840189405149453, "grad_norm": 1.1430197954177856, "learning_rate": 1.993150115498226e-05, "loss": 0.2891, "step": 419 }, { "epoch": 0.19887540692512579, "grad_norm": 1.5780760049819946, "learning_rate": 1.9930902296996516e-05, "loss": 0.2891, "step": 420 }, { "epoch": 0.19934891979875702, "grad_norm": 1.7553094625473022, "learning_rate": 1.993030084167908e-05, "loss": 0.3078, "step": 421 }, { "epoch": 0.19982243267238828, "grad_norm": 1.4407962560653687, "learning_rate": 1.9929696789187264e-05, "loss": 0.3151, "step": 422 }, { "epoch": 0.20029594554601954, "grad_norm": 1.4529813528060913, "learning_rate": 1.9929090139679045e-05, "loss": 0.3176, "step": 423 }, { "epoch": 0.2007694584196508, "grad_norm": 1.5530153512954712, "learning_rate": 1.9928480893313082e-05, "loss": 0.2996, "step": 424 }, { "epoch": 0.20124297129328203, "grad_norm": 1.2986822128295898, "learning_rate": 1.992786905024873e-05, "loss": 0.3124, "step": 425 }, { "epoch": 0.2017164841669133, "grad_norm": 1.4415348768234253, "learning_rate": 1.9927254610646e-05, "loss": 0.2665, "step": 426 }, { "epoch": 0.20218999704054455, "grad_norm": 1.6581053733825684, "learning_rate": 1.9926637574665598e-05, "loss": 0.2962, "step": 427 }, { "epoch": 0.20266350991417578, "grad_norm": 1.351175308227539, "learning_rate": 1.9926017942468903e-05, "loss": 0.299, "step": 428 }, { "epoch": 0.20313702278780704, "grad_norm": 1.107627511024475, "learning_rate": 1.992539571421797e-05, "loss": 0.2441, "step": 429 }, { "epoch": 0.2036105356614383, "grad_norm": 1.7610833644866943, "learning_rate": 1.9924770890075544e-05, "loss": 0.2864, "step": 430 }, { "epoch": 0.20408404853506953, "grad_norm": 1.2733440399169922, "learning_rate": 1.9924143470205034e-05, "loss": 0.2957, "step": 431 }, { "epoch": 0.2045575614087008, "grad_norm": 1.452659010887146, "learning_rate": 1.992351345477054e-05, "loss": 0.2657, "step": 432 }, { "epoch": 0.20503107428233205, "grad_norm": 1.7691302299499512, "learning_rate": 1.992288084393684e-05, "loss": 0.3139, "step": 433 }, { "epoch": 0.20550458715596331, "grad_norm": 1.0048437118530273, "learning_rate": 1.9922245637869376e-05, "loss": 0.2768, "step": 434 }, { "epoch": 0.20597810002959455, "grad_norm": 1.4264775514602661, "learning_rate": 1.9921607836734292e-05, "loss": 0.3009, "step": 435 }, { "epoch": 0.2064516129032258, "grad_norm": 1.2522767782211304, "learning_rate": 1.9920967440698392e-05, "loss": 0.3091, "step": 436 }, { "epoch": 0.20692512577685707, "grad_norm": 1.4616775512695312, "learning_rate": 1.992032444992917e-05, "loss": 0.2709, "step": 437 }, { "epoch": 0.2073986386504883, "grad_norm": 2.048379421234131, "learning_rate": 1.9919678864594788e-05, "loss": 0.2922, "step": 438 }, { "epoch": 0.20787215152411956, "grad_norm": 2.135870933532715, "learning_rate": 1.9919030684864097e-05, "loss": 0.3139, "step": 439 }, { "epoch": 0.20834566439775082, "grad_norm": 1.5550521612167358, "learning_rate": 1.991837991090662e-05, "loss": 0.2967, "step": 440 }, { "epoch": 0.20881917727138208, "grad_norm": 1.4404891729354858, "learning_rate": 1.9917726542892562e-05, "loss": 0.2625, "step": 441 }, { "epoch": 0.2092926901450133, "grad_norm": 1.4406466484069824, "learning_rate": 1.9917070580992805e-05, "loss": 0.2846, "step": 442 }, { "epoch": 0.20976620301864457, "grad_norm": 2.054635524749756, "learning_rate": 1.9916412025378907e-05, "loss": 0.3299, "step": 443 }, { "epoch": 0.21023971589227583, "grad_norm": 1.5401420593261719, "learning_rate": 1.9915750876223112e-05, "loss": 0.2995, "step": 444 }, { "epoch": 0.21071322876590706, "grad_norm": 1.5220952033996582, "learning_rate": 1.9915087133698333e-05, "loss": 0.2638, "step": 445 }, { "epoch": 0.21118674163953832, "grad_norm": 1.1745985746383667, "learning_rate": 1.9914420797978167e-05, "loss": 0.2603, "step": 446 }, { "epoch": 0.21166025451316958, "grad_norm": 1.6358566284179688, "learning_rate": 1.9913751869236888e-05, "loss": 0.2847, "step": 447 }, { "epoch": 0.21213376738680084, "grad_norm": 1.0750114917755127, "learning_rate": 1.9913080347649446e-05, "loss": 0.2783, "step": 448 }, { "epoch": 0.21260728026043207, "grad_norm": 2.263500928878784, "learning_rate": 1.9912406233391474e-05, "loss": 0.2818, "step": 449 }, { "epoch": 0.21308079313406333, "grad_norm": 1.2337026596069336, "learning_rate": 1.991172952663928e-05, "loss": 0.2995, "step": 450 }, { "epoch": 0.2135543060076946, "grad_norm": 1.5734847784042358, "learning_rate": 1.9911050227569845e-05, "loss": 0.2681, "step": 451 }, { "epoch": 0.21402781888132583, "grad_norm": 1.5128898620605469, "learning_rate": 1.9910368336360836e-05, "loss": 0.3128, "step": 452 }, { "epoch": 0.2145013317549571, "grad_norm": 2.963315486907959, "learning_rate": 1.99096838531906e-05, "loss": 0.3157, "step": 453 }, { "epoch": 0.21497484462858835, "grad_norm": 1.357264757156372, "learning_rate": 1.990899677823815e-05, "loss": 0.3173, "step": 454 }, { "epoch": 0.21544835750221958, "grad_norm": 1.2650480270385742, "learning_rate": 1.9908307111683184e-05, "loss": 0.2686, "step": 455 }, { "epoch": 0.21592187037585084, "grad_norm": 1.538608193397522, "learning_rate": 1.990761485370608e-05, "loss": 0.3223, "step": 456 }, { "epoch": 0.2163953832494821, "grad_norm": 1.675292730331421, "learning_rate": 1.9906920004487894e-05, "loss": 0.292, "step": 457 }, { "epoch": 0.21686889612311336, "grad_norm": 1.2939106225967407, "learning_rate": 1.9906222564210353e-05, "loss": 0.2695, "step": 458 }, { "epoch": 0.2173424089967446, "grad_norm": 1.1748974323272705, "learning_rate": 1.990552253305587e-05, "loss": 0.3129, "step": 459 }, { "epoch": 0.21781592187037585, "grad_norm": 1.7201486825942993, "learning_rate": 1.9904819911207526e-05, "loss": 0.3042, "step": 460 }, { "epoch": 0.2182894347440071, "grad_norm": 1.790076732635498, "learning_rate": 1.990411469884909e-05, "loss": 0.3384, "step": 461 }, { "epoch": 0.21876294761763834, "grad_norm": 1.5797431468963623, "learning_rate": 1.9903406896165e-05, "loss": 0.3244, "step": 462 }, { "epoch": 0.2192364604912696, "grad_norm": 1.390303373336792, "learning_rate": 1.9902696503340378e-05, "loss": 0.2862, "step": 463 }, { "epoch": 0.21970997336490086, "grad_norm": 1.6508777141571045, "learning_rate": 1.990198352056102e-05, "loss": 0.313, "step": 464 }, { "epoch": 0.22018348623853212, "grad_norm": 1.5535106658935547, "learning_rate": 1.990126794801339e-05, "loss": 0.313, "step": 465 }, { "epoch": 0.22065699911216335, "grad_norm": 1.5495903491973877, "learning_rate": 1.9900549785884654e-05, "loss": 0.2866, "step": 466 }, { "epoch": 0.22113051198579461, "grad_norm": 1.5182076692581177, "learning_rate": 1.989982903436263e-05, "loss": 0.2786, "step": 467 }, { "epoch": 0.22160402485942587, "grad_norm": 1.5054259300231934, "learning_rate": 1.989910569363583e-05, "loss": 0.2599, "step": 468 }, { "epoch": 0.2220775377330571, "grad_norm": 1.793799877166748, "learning_rate": 1.989837976389344e-05, "loss": 0.3061, "step": 469 }, { "epoch": 0.22255105060668837, "grad_norm": 1.3098176717758179, "learning_rate": 1.9897651245325306e-05, "loss": 0.268, "step": 470 }, { "epoch": 0.22302456348031963, "grad_norm": 1.5548096895217896, "learning_rate": 1.9896920138121977e-05, "loss": 0.2858, "step": 471 }, { "epoch": 0.2234980763539509, "grad_norm": 1.5963671207427979, "learning_rate": 1.989618644247466e-05, "loss": 0.3073, "step": 472 }, { "epoch": 0.22397158922758212, "grad_norm": 1.266221046447754, "learning_rate": 1.989545015857525e-05, "loss": 0.2709, "step": 473 }, { "epoch": 0.22444510210121338, "grad_norm": 1.2214033603668213, "learning_rate": 1.9894711286616313e-05, "loss": 0.2725, "step": 474 }, { "epoch": 0.22491861497484464, "grad_norm": 1.5835678577423096, "learning_rate": 1.98939698267911e-05, "loss": 0.2411, "step": 475 }, { "epoch": 0.22539212784847587, "grad_norm": 1.4112712144851685, "learning_rate": 1.989322577929352e-05, "loss": 0.2913, "step": 476 }, { "epoch": 0.22586564072210713, "grad_norm": 1.335115909576416, "learning_rate": 1.9892479144318187e-05, "loss": 0.2936, "step": 477 }, { "epoch": 0.2263391535957384, "grad_norm": 1.4088594913482666, "learning_rate": 1.989172992206036e-05, "loss": 0.3129, "step": 478 }, { "epoch": 0.22681266646936962, "grad_norm": 1.2447527647018433, "learning_rate": 1.989097811271601e-05, "loss": 0.3029, "step": 479 }, { "epoch": 0.22728617934300088, "grad_norm": 1.2852405309677124, "learning_rate": 1.9890223716481746e-05, "loss": 0.2788, "step": 480 }, { "epoch": 0.22775969221663214, "grad_norm": 2.38966703414917, "learning_rate": 1.9889466733554883e-05, "loss": 0.3573, "step": 481 }, { "epoch": 0.2282332050902634, "grad_norm": 1.3226877450942993, "learning_rate": 1.9888707164133403e-05, "loss": 0.2997, "step": 482 }, { "epoch": 0.22870671796389463, "grad_norm": 1.2726070880889893, "learning_rate": 1.988794500841596e-05, "loss": 0.27, "step": 483 }, { "epoch": 0.2291802308375259, "grad_norm": 1.3730125427246094, "learning_rate": 1.9887180266601892e-05, "loss": 0.2917, "step": 484 }, { "epoch": 0.22965374371115715, "grad_norm": 1.4895507097244263, "learning_rate": 1.988641293889121e-05, "loss": 0.3041, "step": 485 }, { "epoch": 0.2301272565847884, "grad_norm": 1.5664935111999512, "learning_rate": 1.9885643025484598e-05, "loss": 0.2987, "step": 486 }, { "epoch": 0.23060076945841965, "grad_norm": 1.38105046749115, "learning_rate": 1.988487052658342e-05, "loss": 0.2739, "step": 487 }, { "epoch": 0.2310742823320509, "grad_norm": 1.3974148035049438, "learning_rate": 1.988409544238972e-05, "loss": 0.294, "step": 488 }, { "epoch": 0.23154779520568217, "grad_norm": 1.1836129426956177, "learning_rate": 1.988331777310621e-05, "loss": 0.2847, "step": 489 }, { "epoch": 0.2320213080793134, "grad_norm": 1.9731239080429077, "learning_rate": 1.9882537518936283e-05, "loss": 0.2773, "step": 490 }, { "epoch": 0.23249482095294466, "grad_norm": 1.8344943523406982, "learning_rate": 1.9881754680084e-05, "loss": 0.3044, "step": 491 }, { "epoch": 0.23296833382657592, "grad_norm": 1.9124189615249634, "learning_rate": 1.988096925675412e-05, "loss": 0.2658, "step": 492 }, { "epoch": 0.23344184670020715, "grad_norm": 1.1877110004425049, "learning_rate": 1.988018124915205e-05, "loss": 0.2978, "step": 493 }, { "epoch": 0.2339153595738384, "grad_norm": 3.0704288482666016, "learning_rate": 1.987939065748389e-05, "loss": 0.2754, "step": 494 }, { "epoch": 0.23438887244746967, "grad_norm": 2.0718958377838135, "learning_rate": 1.9878597481956416e-05, "loss": 0.2838, "step": 495 }, { "epoch": 0.23486238532110093, "grad_norm": 1.83311927318573, "learning_rate": 1.9877801722777064e-05, "loss": 0.2823, "step": 496 }, { "epoch": 0.23533589819473216, "grad_norm": 1.3887392282485962, "learning_rate": 1.9877003380153968e-05, "loss": 0.2805, "step": 497 }, { "epoch": 0.23580941106836342, "grad_norm": 1.7678110599517822, "learning_rate": 1.9876202454295926e-05, "loss": 0.2892, "step": 498 }, { "epoch": 0.23628292394199468, "grad_norm": 1.2601211071014404, "learning_rate": 1.9875398945412403e-05, "loss": 0.3009, "step": 499 }, { "epoch": 0.23675643681562591, "grad_norm": 1.4841456413269043, "learning_rate": 1.987459285371356e-05, "loss": 0.2835, "step": 500 }, { "epoch": 0.23722994968925717, "grad_norm": 1.53849196434021, "learning_rate": 1.9873784179410217e-05, "loss": 0.2825, "step": 501 }, { "epoch": 0.23770346256288843, "grad_norm": 1.1709768772125244, "learning_rate": 1.9872972922713875e-05, "loss": 0.2877, "step": 502 }, { "epoch": 0.23817697543651967, "grad_norm": 1.4577317237854004, "learning_rate": 1.9872159083836713e-05, "loss": 0.2704, "step": 503 }, { "epoch": 0.23865048831015093, "grad_norm": 1.207236886024475, "learning_rate": 1.9871342662991582e-05, "loss": 0.3042, "step": 504 }, { "epoch": 0.2391240011837822, "grad_norm": 1.409219741821289, "learning_rate": 1.9870523660392004e-05, "loss": 0.2925, "step": 505 }, { "epoch": 0.23959751405741345, "grad_norm": 2.00093412399292, "learning_rate": 1.986970207625219e-05, "loss": 0.3005, "step": 506 }, { "epoch": 0.24007102693104468, "grad_norm": 1.3477380275726318, "learning_rate": 1.986887791078701e-05, "loss": 0.2754, "step": 507 }, { "epoch": 0.24054453980467594, "grad_norm": 1.0551843643188477, "learning_rate": 1.9868051164212017e-05, "loss": 0.3066, "step": 508 }, { "epoch": 0.2410180526783072, "grad_norm": 1.1477186679840088, "learning_rate": 1.986722183674344e-05, "loss": 0.2924, "step": 509 }, { "epoch": 0.24149156555193843, "grad_norm": 1.6527049541473389, "learning_rate": 1.9866389928598188e-05, "loss": 0.3204, "step": 510 }, { "epoch": 0.2419650784255697, "grad_norm": 1.1355421543121338, "learning_rate": 1.986555543999383e-05, "loss": 0.2904, "step": 511 }, { "epoch": 0.24243859129920095, "grad_norm": 1.9797145128250122, "learning_rate": 1.9864718371148623e-05, "loss": 0.2834, "step": 512 }, { "epoch": 0.2429121041728322, "grad_norm": 2.0345993041992188, "learning_rate": 1.9863878722281492e-05, "loss": 0.2688, "step": 513 }, { "epoch": 0.24338561704646344, "grad_norm": 2.046318769454956, "learning_rate": 1.986303649361204e-05, "loss": 0.2846, "step": 514 }, { "epoch": 0.2438591299200947, "grad_norm": 1.3903858661651611, "learning_rate": 1.986219168536054e-05, "loss": 0.2915, "step": 515 }, { "epoch": 0.24433264279372596, "grad_norm": 1.3947668075561523, "learning_rate": 1.986134429774795e-05, "loss": 0.3142, "step": 516 }, { "epoch": 0.2448061556673572, "grad_norm": 1.6972023248672485, "learning_rate": 1.9860494330995892e-05, "loss": 0.2862, "step": 517 }, { "epoch": 0.24527966854098845, "grad_norm": 1.431135892868042, "learning_rate": 1.9859641785326672e-05, "loss": 0.2691, "step": 518 }, { "epoch": 0.24575318141461971, "grad_norm": 1.653490424156189, "learning_rate": 1.9858786660963253e-05, "loss": 0.2368, "step": 519 }, { "epoch": 0.24622669428825097, "grad_norm": 2.2051680088043213, "learning_rate": 1.98579289581293e-05, "loss": 0.294, "step": 520 }, { "epoch": 0.2467002071618822, "grad_norm": 2.165895938873291, "learning_rate": 1.9857068677049124e-05, "loss": 0.2745, "step": 521 }, { "epoch": 0.24717372003551347, "grad_norm": 2.177927255630493, "learning_rate": 1.9856205817947728e-05, "loss": 0.3092, "step": 522 }, { "epoch": 0.24764723290914473, "grad_norm": 1.3167831897735596, "learning_rate": 1.9855340381050787e-05, "loss": 0.2428, "step": 523 }, { "epoch": 0.24812074578277596, "grad_norm": 1.1702346801757812, "learning_rate": 1.9854472366584646e-05, "loss": 0.2834, "step": 524 }, { "epoch": 0.24859425865640722, "grad_norm": 1.4927124977111816, "learning_rate": 1.9853601774776322e-05, "loss": 0.2695, "step": 525 }, { "epoch": 0.24906777153003848, "grad_norm": 2.9464986324310303, "learning_rate": 1.9852728605853516e-05, "loss": 0.278, "step": 526 }, { "epoch": 0.24954128440366974, "grad_norm": 1.3696672916412354, "learning_rate": 1.9851852860044594e-05, "loss": 0.2743, "step": 527 }, { "epoch": 0.25001479727730097, "grad_norm": 1.6753240823745728, "learning_rate": 1.9850974537578597e-05, "loss": 0.2999, "step": 528 }, { "epoch": 0.2504883101509322, "grad_norm": 1.2681008577346802, "learning_rate": 1.9850093638685247e-05, "loss": 0.2763, "step": 529 }, { "epoch": 0.2509618230245635, "grad_norm": 1.4322853088378906, "learning_rate": 1.984921016359493e-05, "loss": 0.301, "step": 530 }, { "epoch": 0.2514353358981947, "grad_norm": 1.3877912759780884, "learning_rate": 1.984832411253871e-05, "loss": 0.2866, "step": 531 }, { "epoch": 0.251908848771826, "grad_norm": 1.4901669025421143, "learning_rate": 1.9847435485748328e-05, "loss": 0.3069, "step": 532 }, { "epoch": 0.25238236164545724, "grad_norm": 1.5434002876281738, "learning_rate": 1.9846544283456195e-05, "loss": 0.2884, "step": 533 }, { "epoch": 0.2528558745190885, "grad_norm": 1.3284276723861694, "learning_rate": 1.9845650505895397e-05, "loss": 0.2709, "step": 534 }, { "epoch": 0.25332938739271976, "grad_norm": 1.1716914176940918, "learning_rate": 1.9844754153299686e-05, "loss": 0.273, "step": 535 }, { "epoch": 0.253802900266351, "grad_norm": 1.208626389503479, "learning_rate": 1.984385522590351e-05, "loss": 0.2616, "step": 536 }, { "epoch": 0.2542764131399822, "grad_norm": 1.6577744483947754, "learning_rate": 1.9842953723941954e-05, "loss": 0.2702, "step": 537 }, { "epoch": 0.2547499260136135, "grad_norm": 1.4658805131912231, "learning_rate": 1.9842049647650815e-05, "loss": 0.2848, "step": 538 }, { "epoch": 0.25522343888724475, "grad_norm": 1.4830657243728638, "learning_rate": 1.984114299726654e-05, "loss": 0.2536, "step": 539 }, { "epoch": 0.255696951760876, "grad_norm": 1.5815433263778687, "learning_rate": 1.984023377302625e-05, "loss": 0.2853, "step": 540 }, { "epoch": 0.25617046463450727, "grad_norm": 1.2159545421600342, "learning_rate": 1.9839321975167747e-05, "loss": 0.3015, "step": 541 }, { "epoch": 0.2566439775081385, "grad_norm": 1.711661696434021, "learning_rate": 1.9838407603929503e-05, "loss": 0.2924, "step": 542 }, { "epoch": 0.25711749038176973, "grad_norm": 1.3681540489196777, "learning_rate": 1.9837490659550665e-05, "loss": 0.2999, "step": 543 }, { "epoch": 0.257591003255401, "grad_norm": 1.9190740585327148, "learning_rate": 1.983657114227105e-05, "loss": 0.3154, "step": 544 }, { "epoch": 0.25806451612903225, "grad_norm": 1.5599863529205322, "learning_rate": 1.9835649052331143e-05, "loss": 0.2775, "step": 545 }, { "epoch": 0.2585380290026635, "grad_norm": 1.3822370767593384, "learning_rate": 1.9834724389972116e-05, "loss": 0.2659, "step": 546 }, { "epoch": 0.25901154187629477, "grad_norm": 1.2202012538909912, "learning_rate": 1.98337971554358e-05, "loss": 0.2835, "step": 547 }, { "epoch": 0.259485054749926, "grad_norm": 1.225102186203003, "learning_rate": 1.9832867348964707e-05, "loss": 0.3081, "step": 548 }, { "epoch": 0.2599585676235573, "grad_norm": 1.9328206777572632, "learning_rate": 1.983193497080202e-05, "loss": 0.2807, "step": 549 }, { "epoch": 0.2604320804971885, "grad_norm": 3.2432518005371094, "learning_rate": 1.9831000021191592e-05, "loss": 0.3038, "step": 550 }, { "epoch": 0.26090559337081975, "grad_norm": 1.950760841369629, "learning_rate": 1.9830062500377945e-05, "loss": 0.3174, "step": 551 }, { "epoch": 0.26137910624445104, "grad_norm": 1.1726529598236084, "learning_rate": 1.9829122408606288e-05, "loss": 0.2757, "step": 552 }, { "epoch": 0.2618526191180823, "grad_norm": 1.5435218811035156, "learning_rate": 1.9828179746122487e-05, "loss": 0.301, "step": 553 }, { "epoch": 0.2623261319917135, "grad_norm": 2.2376794815063477, "learning_rate": 1.9827234513173087e-05, "loss": 0.2955, "step": 554 }, { "epoch": 0.2627996448653448, "grad_norm": 2.062591791152954, "learning_rate": 1.9826286710005305e-05, "loss": 0.255, "step": 555 }, { "epoch": 0.263273157738976, "grad_norm": 1.1453508138656616, "learning_rate": 1.982533633686703e-05, "loss": 0.2597, "step": 556 }, { "epoch": 0.26374667061260726, "grad_norm": 1.465489387512207, "learning_rate": 1.9824383394006825e-05, "loss": 0.2905, "step": 557 }, { "epoch": 0.26422018348623855, "grad_norm": 1.4714313745498657, "learning_rate": 1.9823427881673916e-05, "loss": 0.2584, "step": 558 }, { "epoch": 0.2646936963598698, "grad_norm": 1.2892956733703613, "learning_rate": 1.9822469800118215e-05, "loss": 0.2792, "step": 559 }, { "epoch": 0.265167209233501, "grad_norm": 1.3988784551620483, "learning_rate": 1.98215091495903e-05, "loss": 0.2749, "step": 560 }, { "epoch": 0.2656407221071323, "grad_norm": 1.2862416505813599, "learning_rate": 1.9820545930341413e-05, "loss": 0.2765, "step": 561 }, { "epoch": 0.26611423498076353, "grad_norm": 1.5326262712478638, "learning_rate": 1.981958014262348e-05, "loss": 0.3028, "step": 562 }, { "epoch": 0.2665877478543948, "grad_norm": 0.968665361404419, "learning_rate": 1.981861178668909e-05, "loss": 0.255, "step": 563 }, { "epoch": 0.26706126072802605, "grad_norm": 1.944764494895935, "learning_rate": 1.981764086279151e-05, "loss": 0.292, "step": 564 }, { "epoch": 0.2675347736016573, "grad_norm": 1.2996660470962524, "learning_rate": 1.9816667371184677e-05, "loss": 0.2615, "step": 565 }, { "epoch": 0.26800828647528857, "grad_norm": 1.362474799156189, "learning_rate": 1.9815691312123194e-05, "loss": 0.2889, "step": 566 }, { "epoch": 0.2684817993489198, "grad_norm": 1.6865662336349487, "learning_rate": 1.9814712685862342e-05, "loss": 0.2745, "step": 567 }, { "epoch": 0.26895531222255104, "grad_norm": 1.5344867706298828, "learning_rate": 1.9813731492658073e-05, "loss": 0.2901, "step": 568 }, { "epoch": 0.2694288250961823, "grad_norm": 1.4177885055541992, "learning_rate": 1.9812747732767006e-05, "loss": 0.2864, "step": 569 }, { "epoch": 0.26990233796981355, "grad_norm": 1.1646602153778076, "learning_rate": 1.981176140644643e-05, "loss": 0.2668, "step": 570 }, { "epoch": 0.2703758508434448, "grad_norm": 1.417022943496704, "learning_rate": 1.981077251395432e-05, "loss": 0.2761, "step": 571 }, { "epoch": 0.2708493637170761, "grad_norm": 1.7936999797821045, "learning_rate": 1.9809781055549308e-05, "loss": 0.2899, "step": 572 }, { "epoch": 0.2713228765907073, "grad_norm": 2.242428779602051, "learning_rate": 1.980878703149069e-05, "loss": 0.2971, "step": 573 }, { "epoch": 0.27179638946433854, "grad_norm": 1.8133538961410522, "learning_rate": 1.980779044203845e-05, "loss": 0.2789, "step": 574 }, { "epoch": 0.2722699023379698, "grad_norm": 1.2497389316558838, "learning_rate": 1.9806791287453245e-05, "loss": 0.2885, "step": 575 }, { "epoch": 0.27274341521160106, "grad_norm": 1.5333000421524048, "learning_rate": 1.9805789567996384e-05, "loss": 0.2917, "step": 576 }, { "epoch": 0.2732169280852323, "grad_norm": 2.611602306365967, "learning_rate": 1.9804785283929858e-05, "loss": 0.2752, "step": 577 }, { "epoch": 0.2736904409588636, "grad_norm": 1.3517531156539917, "learning_rate": 1.9803778435516327e-05, "loss": 0.2619, "step": 578 }, { "epoch": 0.2741639538324948, "grad_norm": 1.452409267425537, "learning_rate": 1.9802769023019128e-05, "loss": 0.2735, "step": 579 }, { "epoch": 0.2746374667061261, "grad_norm": 1.5018271207809448, "learning_rate": 1.980175704670226e-05, "loss": 0.3296, "step": 580 }, { "epoch": 0.27511097957975733, "grad_norm": 2.2614331245422363, "learning_rate": 1.9800742506830394e-05, "loss": 0.2969, "step": 581 }, { "epoch": 0.27558449245338856, "grad_norm": 1.6071243286132812, "learning_rate": 1.979972540366888e-05, "loss": 0.3061, "step": 582 }, { "epoch": 0.27605800532701985, "grad_norm": 1.25821852684021, "learning_rate": 1.979870573748372e-05, "loss": 0.2715, "step": 583 }, { "epoch": 0.2765315182006511, "grad_norm": 1.5205860137939453, "learning_rate": 1.9797683508541606e-05, "loss": 0.3016, "step": 584 }, { "epoch": 0.2770050310742823, "grad_norm": 1.3796861171722412, "learning_rate": 1.9796658717109892e-05, "loss": 0.2868, "step": 585 }, { "epoch": 0.2774785439479136, "grad_norm": 1.5328634977340698, "learning_rate": 1.97956313634566e-05, "loss": 0.2787, "step": 586 }, { "epoch": 0.27795205682154484, "grad_norm": 1.6029144525527954, "learning_rate": 1.9794601447850424e-05, "loss": 0.2508, "step": 587 }, { "epoch": 0.27842556969517607, "grad_norm": 1.3855677843093872, "learning_rate": 1.9793568970560728e-05, "loss": 0.2723, "step": 588 }, { "epoch": 0.27889908256880735, "grad_norm": 1.2493582963943481, "learning_rate": 1.979253393185755e-05, "loss": 0.2589, "step": 589 }, { "epoch": 0.2793725954424386, "grad_norm": 1.4009218215942383, "learning_rate": 1.9791496332011593e-05, "loss": 0.2998, "step": 590 }, { "epoch": 0.2798461083160698, "grad_norm": 2.099388837814331, "learning_rate": 1.979045617129423e-05, "loss": 0.292, "step": 591 }, { "epoch": 0.2803196211897011, "grad_norm": 1.3639984130859375, "learning_rate": 1.9789413449977505e-05, "loss": 0.2717, "step": 592 }, { "epoch": 0.28079313406333234, "grad_norm": 1.2905452251434326, "learning_rate": 1.9788368168334135e-05, "loss": 0.245, "step": 593 }, { "epoch": 0.28126664693696357, "grad_norm": 1.319740653038025, "learning_rate": 1.9787320326637494e-05, "loss": 0.259, "step": 594 }, { "epoch": 0.28174015981059486, "grad_norm": 1.6142657995224, "learning_rate": 1.9786269925161646e-05, "loss": 0.2819, "step": 595 }, { "epoch": 0.2822136726842261, "grad_norm": 1.273977279663086, "learning_rate": 1.978521696418131e-05, "loss": 0.2574, "step": 596 }, { "epoch": 0.2826871855578574, "grad_norm": 1.7523685693740845, "learning_rate": 1.9784161443971878e-05, "loss": 0.2703, "step": 597 }, { "epoch": 0.2831606984314886, "grad_norm": 1.5006474256515503, "learning_rate": 1.9783103364809405e-05, "loss": 0.3043, "step": 598 }, { "epoch": 0.28363421130511984, "grad_norm": 1.1372324228286743, "learning_rate": 1.978204272697063e-05, "loss": 0.2793, "step": 599 }, { "epoch": 0.28410772417875113, "grad_norm": 1.5428109169006348, "learning_rate": 1.9780979530732947e-05, "loss": 0.2455, "step": 600 }, { "epoch": 0.28458123705238236, "grad_norm": 1.6741421222686768, "learning_rate": 1.9779913776374427e-05, "loss": 0.2767, "step": 601 }, { "epoch": 0.2850547499260136, "grad_norm": 1.1833195686340332, "learning_rate": 1.9778845464173805e-05, "loss": 0.2639, "step": 602 }, { "epoch": 0.2855282627996449, "grad_norm": 1.1689391136169434, "learning_rate": 1.9777774594410495e-05, "loss": 0.2601, "step": 603 }, { "epoch": 0.2860017756732761, "grad_norm": 1.502760887145996, "learning_rate": 1.9776701167364565e-05, "loss": 0.3045, "step": 604 }, { "epoch": 0.28647528854690735, "grad_norm": 1.974056601524353, "learning_rate": 1.977562518331676e-05, "loss": 0.2785, "step": 605 }, { "epoch": 0.28694880142053864, "grad_norm": 1.409839391708374, "learning_rate": 1.97745466425485e-05, "loss": 0.2732, "step": 606 }, { "epoch": 0.28742231429416987, "grad_norm": 1.0581002235412598, "learning_rate": 1.9773465545341855e-05, "loss": 0.2569, "step": 607 }, { "epoch": 0.2878958271678011, "grad_norm": 1.3603756427764893, "learning_rate": 1.977238189197959e-05, "loss": 0.2536, "step": 608 }, { "epoch": 0.2883693400414324, "grad_norm": 1.2487221956253052, "learning_rate": 1.9771295682745115e-05, "loss": 0.2998, "step": 609 }, { "epoch": 0.2888428529150636, "grad_norm": 1.1602859497070312, "learning_rate": 1.977020691792252e-05, "loss": 0.2476, "step": 610 }, { "epoch": 0.2893163657886949, "grad_norm": 1.3260509967803955, "learning_rate": 1.976911559779656e-05, "loss": 0.2972, "step": 611 }, { "epoch": 0.28978987866232614, "grad_norm": 1.3510822057724, "learning_rate": 1.976802172265266e-05, "loss": 0.2658, "step": 612 }, { "epoch": 0.29026339153595737, "grad_norm": 2.0700032711029053, "learning_rate": 1.976692529277691e-05, "loss": 0.2471, "step": 613 }, { "epoch": 0.29073690440958866, "grad_norm": 1.5847715139389038, "learning_rate": 1.9765826308456075e-05, "loss": 0.2757, "step": 614 }, { "epoch": 0.2912104172832199, "grad_norm": 1.1467682123184204, "learning_rate": 1.976472476997758e-05, "loss": 0.2733, "step": 615 }, { "epoch": 0.2916839301568511, "grad_norm": 1.1180078983306885, "learning_rate": 1.9763620677629525e-05, "loss": 0.2494, "step": 616 }, { "epoch": 0.2921574430304824, "grad_norm": 2.1236302852630615, "learning_rate": 1.9762514031700673e-05, "loss": 0.2934, "step": 617 }, { "epoch": 0.29263095590411364, "grad_norm": 1.3528096675872803, "learning_rate": 1.9761404832480455e-05, "loss": 0.2811, "step": 618 }, { "epoch": 0.2931044687777449, "grad_norm": 1.4439085721969604, "learning_rate": 1.9760293080258976e-05, "loss": 0.2829, "step": 619 }, { "epoch": 0.29357798165137616, "grad_norm": 1.2082427740097046, "learning_rate": 1.9759178775327e-05, "loss": 0.2781, "step": 620 }, { "epoch": 0.2940514945250074, "grad_norm": 2.3843281269073486, "learning_rate": 1.975806191797596e-05, "loss": 0.2968, "step": 621 }, { "epoch": 0.2945250073986386, "grad_norm": 1.1950457096099854, "learning_rate": 1.9756942508497967e-05, "loss": 0.2816, "step": 622 }, { "epoch": 0.2949985202722699, "grad_norm": 1.4412981271743774, "learning_rate": 1.9755820547185787e-05, "loss": 0.2838, "step": 623 }, { "epoch": 0.29547203314590115, "grad_norm": 1.1725237369537354, "learning_rate": 1.975469603433286e-05, "loss": 0.2596, "step": 624 }, { "epoch": 0.2959455460195324, "grad_norm": 1.1337695121765137, "learning_rate": 1.975356897023329e-05, "loss": 0.2647, "step": 625 }, { "epoch": 0.29641905889316367, "grad_norm": 1.444916009902954, "learning_rate": 1.9752439355181848e-05, "loss": 0.2679, "step": 626 }, { "epoch": 0.2968925717667949, "grad_norm": 1.7320390939712524, "learning_rate": 1.975130718947398e-05, "loss": 0.2317, "step": 627 }, { "epoch": 0.2973660846404262, "grad_norm": 2.060805320739746, "learning_rate": 1.9750172473405785e-05, "loss": 0.269, "step": 628 }, { "epoch": 0.2978395975140574, "grad_norm": 1.348092794418335, "learning_rate": 1.9749035207274044e-05, "loss": 0.2658, "step": 629 }, { "epoch": 0.29831311038768865, "grad_norm": 1.3302925825119019, "learning_rate": 1.9747895391376192e-05, "loss": 0.2504, "step": 630 }, { "epoch": 0.29878662326131994, "grad_norm": 1.1984549760818481, "learning_rate": 1.9746753026010342e-05, "loss": 0.2713, "step": 631 }, { "epoch": 0.29926013613495117, "grad_norm": 1.1172891855239868, "learning_rate": 1.9745608111475267e-05, "loss": 0.2636, "step": 632 }, { "epoch": 0.2997336490085824, "grad_norm": 1.5490065813064575, "learning_rate": 1.9744460648070408e-05, "loss": 0.2618, "step": 633 }, { "epoch": 0.3002071618822137, "grad_norm": 1.167188048362732, "learning_rate": 1.974331063609587e-05, "loss": 0.2601, "step": 634 }, { "epoch": 0.3006806747558449, "grad_norm": 1.5427215099334717, "learning_rate": 1.9742158075852435e-05, "loss": 0.281, "step": 635 }, { "epoch": 0.30115418762947616, "grad_norm": 1.17001211643219, "learning_rate": 1.9741002967641537e-05, "loss": 0.2592, "step": 636 }, { "epoch": 0.30162770050310744, "grad_norm": 1.1877490282058716, "learning_rate": 1.9739845311765286e-05, "loss": 0.3065, "step": 637 }, { "epoch": 0.3021012133767387, "grad_norm": 1.3651347160339355, "learning_rate": 1.9738685108526456e-05, "loss": 0.2801, "step": 638 }, { "epoch": 0.3025747262503699, "grad_norm": 1.283866047859192, "learning_rate": 1.9737522358228487e-05, "loss": 0.2939, "step": 639 }, { "epoch": 0.3030482391240012, "grad_norm": 1.2590893507003784, "learning_rate": 1.9736357061175483e-05, "loss": 0.2657, "step": 640 }, { "epoch": 0.3035217519976324, "grad_norm": 1.178700566291809, "learning_rate": 1.973518921767222e-05, "loss": 0.2618, "step": 641 }, { "epoch": 0.3039952648712637, "grad_norm": 1.7493693828582764, "learning_rate": 1.9734018828024136e-05, "loss": 0.2763, "step": 642 }, { "epoch": 0.30446877774489495, "grad_norm": 1.9666167497634888, "learning_rate": 1.973284589253733e-05, "loss": 0.2826, "step": 643 }, { "epoch": 0.3049422906185262, "grad_norm": 1.3451874256134033, "learning_rate": 1.9731670411518578e-05, "loss": 0.2642, "step": 644 }, { "epoch": 0.30541580349215747, "grad_norm": 1.3448307514190674, "learning_rate": 1.973049238527531e-05, "loss": 0.2672, "step": 645 }, { "epoch": 0.3058893163657887, "grad_norm": 1.249316692352295, "learning_rate": 1.9729311814115632e-05, "loss": 0.2564, "step": 646 }, { "epoch": 0.30636282923941993, "grad_norm": 1.4523890018463135, "learning_rate": 1.972812869834831e-05, "loss": 0.2831, "step": 647 }, { "epoch": 0.3068363421130512, "grad_norm": 1.7998663187026978, "learning_rate": 1.9726943038282772e-05, "loss": 0.2962, "step": 648 }, { "epoch": 0.30730985498668245, "grad_norm": 1.393811583518982, "learning_rate": 1.9725754834229124e-05, "loss": 0.2553, "step": 649 }, { "epoch": 0.3077833678603137, "grad_norm": 1.6593152284622192, "learning_rate": 1.9724564086498123e-05, "loss": 0.3102, "step": 650 }, { "epoch": 0.30825688073394497, "grad_norm": 1.6688569784164429, "learning_rate": 1.97233707954012e-05, "loss": 0.2706, "step": 651 }, { "epoch": 0.3087303936075762, "grad_norm": 2.265216588973999, "learning_rate": 1.972217496125045e-05, "loss": 0.2884, "step": 652 }, { "epoch": 0.30920390648120744, "grad_norm": 1.522386908531189, "learning_rate": 1.972097658435863e-05, "loss": 0.2754, "step": 653 }, { "epoch": 0.3096774193548387, "grad_norm": 1.4247666597366333, "learning_rate": 1.9719775665039162e-05, "loss": 0.2557, "step": 654 }, { "epoch": 0.31015093222846996, "grad_norm": 1.1649177074432373, "learning_rate": 1.971857220360614e-05, "loss": 0.2608, "step": 655 }, { "epoch": 0.3106244451021012, "grad_norm": 2.187192678451538, "learning_rate": 1.9717366200374313e-05, "loss": 0.2989, "step": 656 }, { "epoch": 0.3110979579757325, "grad_norm": 1.4517701864242554, "learning_rate": 1.9716157655659102e-05, "loss": 0.2681, "step": 657 }, { "epoch": 0.3115714708493637, "grad_norm": 1.8214770555496216, "learning_rate": 1.971494656977659e-05, "loss": 0.241, "step": 658 }, { "epoch": 0.312044983722995, "grad_norm": 1.289170742034912, "learning_rate": 1.9713732943043524e-05, "loss": 0.2659, "step": 659 }, { "epoch": 0.3125184965966262, "grad_norm": 1.3986639976501465, "learning_rate": 1.9712516775777315e-05, "loss": 0.2649, "step": 660 }, { "epoch": 0.31299200947025746, "grad_norm": 1.7374229431152344, "learning_rate": 1.9711298068296046e-05, "loss": 0.2902, "step": 661 }, { "epoch": 0.31346552234388875, "grad_norm": 1.802095890045166, "learning_rate": 1.9710076820918448e-05, "loss": 0.2515, "step": 662 }, { "epoch": 0.31393903521752, "grad_norm": 1.2375832796096802, "learning_rate": 1.9708853033963936e-05, "loss": 0.267, "step": 663 }, { "epoch": 0.3144125480911512, "grad_norm": 1.2813341617584229, "learning_rate": 1.9707626707752574e-05, "loss": 0.2743, "step": 664 }, { "epoch": 0.3148860609647825, "grad_norm": 1.1700478792190552, "learning_rate": 1.97063978426051e-05, "loss": 0.2724, "step": 665 }, { "epoch": 0.31535957383841373, "grad_norm": 2.465041160583496, "learning_rate": 1.9705166438842907e-05, "loss": 0.2598, "step": 666 }, { "epoch": 0.31583308671204496, "grad_norm": 1.6531705856323242, "learning_rate": 1.9703932496788063e-05, "loss": 0.2889, "step": 667 }, { "epoch": 0.31630659958567625, "grad_norm": 1.5581059455871582, "learning_rate": 1.9702696016763286e-05, "loss": 0.2985, "step": 668 }, { "epoch": 0.3167801124593075, "grad_norm": 1.4005812406539917, "learning_rate": 1.9701456999091974e-05, "loss": 0.2475, "step": 669 }, { "epoch": 0.3172536253329387, "grad_norm": 1.6382842063903809, "learning_rate": 1.970021544409817e-05, "loss": 0.2683, "step": 670 }, { "epoch": 0.31772713820657, "grad_norm": 1.9983477592468262, "learning_rate": 1.96989713521066e-05, "loss": 0.2836, "step": 671 }, { "epoch": 0.31820065108020124, "grad_norm": 1.6255732774734497, "learning_rate": 1.9697724723442643e-05, "loss": 0.2627, "step": 672 }, { "epoch": 0.31867416395383247, "grad_norm": 1.2218822240829468, "learning_rate": 1.9696475558432334e-05, "loss": 0.2735, "step": 673 }, { "epoch": 0.31914767682746376, "grad_norm": 1.356476902961731, "learning_rate": 1.969522385740239e-05, "loss": 0.2905, "step": 674 }, { "epoch": 0.319621189701095, "grad_norm": 1.8103471994400024, "learning_rate": 1.9693969620680177e-05, "loss": 0.2909, "step": 675 }, { "epoch": 0.3200947025747263, "grad_norm": 1.908302903175354, "learning_rate": 1.9692712848593726e-05, "loss": 0.2738, "step": 676 }, { "epoch": 0.3205682154483575, "grad_norm": 1.2977486848831177, "learning_rate": 1.969145354147174e-05, "loss": 0.2518, "step": 677 }, { "epoch": 0.32104172832198874, "grad_norm": 1.6458895206451416, "learning_rate": 1.9690191699643575e-05, "loss": 0.2707, "step": 678 }, { "epoch": 0.32151524119562, "grad_norm": 2.108949661254883, "learning_rate": 1.9688927323439254e-05, "loss": 0.2894, "step": 679 }, { "epoch": 0.32198875406925126, "grad_norm": 1.1589833498001099, "learning_rate": 1.9687660413189463e-05, "loss": 0.255, "step": 680 }, { "epoch": 0.3224622669428825, "grad_norm": 1.281614899635315, "learning_rate": 1.9686390969225545e-05, "loss": 0.2743, "step": 681 }, { "epoch": 0.3229357798165138, "grad_norm": 1.3841043710708618, "learning_rate": 1.968511899187952e-05, "loss": 0.2668, "step": 682 }, { "epoch": 0.323409292690145, "grad_norm": 1.8176034688949585, "learning_rate": 1.9683844481484054e-05, "loss": 0.2286, "step": 683 }, { "epoch": 0.32388280556377624, "grad_norm": 2.2643625736236572, "learning_rate": 1.9682567438372486e-05, "loss": 0.2619, "step": 684 }, { "epoch": 0.32435631843740753, "grad_norm": 2.1710567474365234, "learning_rate": 1.9681287862878818e-05, "loss": 0.2898, "step": 685 }, { "epoch": 0.32482983131103876, "grad_norm": 1.4091668128967285, "learning_rate": 1.9680005755337702e-05, "loss": 0.2515, "step": 686 }, { "epoch": 0.32530334418467, "grad_norm": 1.261533260345459, "learning_rate": 1.9678721116084465e-05, "loss": 0.2553, "step": 687 }, { "epoch": 0.3257768570583013, "grad_norm": 1.412940263748169, "learning_rate": 1.9677433945455092e-05, "loss": 0.3054, "step": 688 }, { "epoch": 0.3262503699319325, "grad_norm": 1.3808501958847046, "learning_rate": 1.9676144243786236e-05, "loss": 0.2611, "step": 689 }, { "epoch": 0.3267238828055638, "grad_norm": 1.1639057397842407, "learning_rate": 1.9674852011415194e-05, "loss": 0.2882, "step": 690 }, { "epoch": 0.32719739567919504, "grad_norm": 1.1630046367645264, "learning_rate": 1.9673557248679945e-05, "loss": 0.2231, "step": 691 }, { "epoch": 0.32767090855282627, "grad_norm": 1.795854091644287, "learning_rate": 1.9672259955919123e-05, "loss": 0.3062, "step": 692 }, { "epoch": 0.32814442142645756, "grad_norm": 1.4758808612823486, "learning_rate": 1.967096013347202e-05, "loss": 0.277, "step": 693 }, { "epoch": 0.3286179343000888, "grad_norm": 1.5420145988464355, "learning_rate": 1.9669657781678587e-05, "loss": 0.2952, "step": 694 }, { "epoch": 0.32909144717372, "grad_norm": 1.7470226287841797, "learning_rate": 1.9668352900879447e-05, "loss": 0.2645, "step": 695 }, { "epoch": 0.3295649600473513, "grad_norm": 1.6357449293136597, "learning_rate": 1.9667045491415878e-05, "loss": 0.2714, "step": 696 }, { "epoch": 0.33003847292098254, "grad_norm": 2.116548776626587, "learning_rate": 1.9665735553629824e-05, "loss": 0.2769, "step": 697 }, { "epoch": 0.33051198579461377, "grad_norm": 2.229929208755493, "learning_rate": 1.966442308786388e-05, "loss": 0.2505, "step": 698 }, { "epoch": 0.33098549866824506, "grad_norm": 1.3335548639297485, "learning_rate": 1.966310809446131e-05, "loss": 0.2516, "step": 699 }, { "epoch": 0.3314590115418763, "grad_norm": 1.3416866064071655, "learning_rate": 1.9661790573766046e-05, "loss": 0.2563, "step": 700 }, { "epoch": 0.3319325244155075, "grad_norm": 1.5719635486602783, "learning_rate": 1.966047052612266e-05, "loss": 0.2644, "step": 701 }, { "epoch": 0.3324060372891388, "grad_norm": 1.4301633834838867, "learning_rate": 1.9659147951876407e-05, "loss": 0.2599, "step": 702 }, { "epoch": 0.33287955016277004, "grad_norm": 1.6425291299819946, "learning_rate": 1.965782285137319e-05, "loss": 0.2422, "step": 703 }, { "epoch": 0.3333530630364013, "grad_norm": 1.3709927797317505, "learning_rate": 1.9656495224959578e-05, "loss": 0.2653, "step": 704 }, { "epoch": 0.33382657591003256, "grad_norm": 2.5719616413116455, "learning_rate": 1.9655165072982797e-05, "loss": 0.2581, "step": 705 }, { "epoch": 0.3343000887836638, "grad_norm": 2.397643804550171, "learning_rate": 1.9653832395790733e-05, "loss": 0.2737, "step": 706 }, { "epoch": 0.3347736016572951, "grad_norm": 1.7374242544174194, "learning_rate": 1.965249719373194e-05, "loss": 0.2835, "step": 707 }, { "epoch": 0.3352471145309263, "grad_norm": 1.2840518951416016, "learning_rate": 1.965115946715563e-05, "loss": 0.2673, "step": 708 }, { "epoch": 0.33572062740455755, "grad_norm": 1.8667082786560059, "learning_rate": 1.964981921641166e-05, "loss": 0.2596, "step": 709 }, { "epoch": 0.33619414027818884, "grad_norm": 1.0804299116134644, "learning_rate": 1.9648476441850574e-05, "loss": 0.2432, "step": 710 }, { "epoch": 0.33666765315182007, "grad_norm": 1.5073543787002563, "learning_rate": 1.964713114382355e-05, "loss": 0.2594, "step": 711 }, { "epoch": 0.3371411660254513, "grad_norm": 1.724390983581543, "learning_rate": 1.9645783322682447e-05, "loss": 0.251, "step": 712 }, { "epoch": 0.3376146788990826, "grad_norm": 1.4915387630462646, "learning_rate": 1.964443297877977e-05, "loss": 0.2637, "step": 713 }, { "epoch": 0.3380881917727138, "grad_norm": 1.8173447847366333, "learning_rate": 1.9643080112468683e-05, "loss": 0.2878, "step": 714 }, { "epoch": 0.33856170464634505, "grad_norm": 1.6663117408752441, "learning_rate": 1.9641724724103026e-05, "loss": 0.2929, "step": 715 }, { "epoch": 0.33903521751997634, "grad_norm": 1.1947954893112183, "learning_rate": 1.9640366814037283e-05, "loss": 0.2699, "step": 716 }, { "epoch": 0.33950873039360757, "grad_norm": 1.2085504531860352, "learning_rate": 1.96390063826266e-05, "loss": 0.258, "step": 717 }, { "epoch": 0.3399822432672388, "grad_norm": 1.3036900758743286, "learning_rate": 1.963764343022679e-05, "loss": 0.283, "step": 718 }, { "epoch": 0.3404557561408701, "grad_norm": 2.1460154056549072, "learning_rate": 1.9636277957194316e-05, "loss": 0.2811, "step": 719 }, { "epoch": 0.3409292690145013, "grad_norm": 2.7816526889801025, "learning_rate": 1.9634909963886304e-05, "loss": 0.2436, "step": 720 }, { "epoch": 0.34140278188813256, "grad_norm": 1.8817825317382812, "learning_rate": 1.963353945066054e-05, "loss": 0.2644, "step": 721 }, { "epoch": 0.34187629476176384, "grad_norm": 1.5300520658493042, "learning_rate": 1.963216641787547e-05, "loss": 0.2729, "step": 722 }, { "epoch": 0.3423498076353951, "grad_norm": 1.8480312824249268, "learning_rate": 1.9630790865890196e-05, "loss": 0.3109, "step": 723 }, { "epoch": 0.34282332050902636, "grad_norm": 2.683523178100586, "learning_rate": 1.9629412795064482e-05, "loss": 0.2516, "step": 724 }, { "epoch": 0.3432968333826576, "grad_norm": 1.2851934432983398, "learning_rate": 1.9628032205758746e-05, "loss": 0.271, "step": 725 }, { "epoch": 0.34377034625628883, "grad_norm": 1.8185439109802246, "learning_rate": 1.962664909833407e-05, "loss": 0.2953, "step": 726 }, { "epoch": 0.3442438591299201, "grad_norm": 1.4006390571594238, "learning_rate": 1.9625263473152193e-05, "loss": 0.2656, "step": 727 }, { "epoch": 0.34471737200355135, "grad_norm": 1.353481650352478, "learning_rate": 1.962387533057551e-05, "loss": 0.2615, "step": 728 }, { "epoch": 0.3451908848771826, "grad_norm": 1.9150645732879639, "learning_rate": 1.9622484670967083e-05, "loss": 0.2629, "step": 729 }, { "epoch": 0.34566439775081387, "grad_norm": 1.6772758960723877, "learning_rate": 1.9621091494690616e-05, "loss": 0.2406, "step": 730 }, { "epoch": 0.3461379106244451, "grad_norm": 1.7029880285263062, "learning_rate": 1.9619695802110485e-05, "loss": 0.2548, "step": 731 }, { "epoch": 0.34661142349807633, "grad_norm": 1.2326855659484863, "learning_rate": 1.961829759359172e-05, "loss": 0.2976, "step": 732 }, { "epoch": 0.3470849363717076, "grad_norm": 1.2379788160324097, "learning_rate": 1.961689686950001e-05, "loss": 0.2472, "step": 733 }, { "epoch": 0.34755844924533885, "grad_norm": 2.5410702228546143, "learning_rate": 1.9615493630201694e-05, "loss": 0.2827, "step": 734 }, { "epoch": 0.3480319621189701, "grad_norm": 2.657710075378418, "learning_rate": 1.961408787606379e-05, "loss": 0.2616, "step": 735 }, { "epoch": 0.34850547499260137, "grad_norm": 1.4291493892669678, "learning_rate": 1.9612679607453942e-05, "loss": 0.2978, "step": 736 }, { "epoch": 0.3489789878662326, "grad_norm": 1.5034074783325195, "learning_rate": 1.9611268824740482e-05, "loss": 0.2616, "step": 737 }, { "epoch": 0.3494525007398639, "grad_norm": 1.3025211095809937, "learning_rate": 1.9609855528292386e-05, "loss": 0.2704, "step": 738 }, { "epoch": 0.3499260136134951, "grad_norm": 2.3469293117523193, "learning_rate": 1.960843971847928e-05, "loss": 0.3042, "step": 739 }, { "epoch": 0.35039952648712636, "grad_norm": 2.3904573917388916, "learning_rate": 1.960702139567146e-05, "loss": 0.2687, "step": 740 }, { "epoch": 0.35087303936075764, "grad_norm": 1.4793574810028076, "learning_rate": 1.9605600560239874e-05, "loss": 0.2495, "step": 741 }, { "epoch": 0.3513465522343889, "grad_norm": 1.2374844551086426, "learning_rate": 1.960417721255613e-05, "loss": 0.2791, "step": 742 }, { "epoch": 0.3518200651080201, "grad_norm": 2.1964030265808105, "learning_rate": 1.960275135299249e-05, "loss": 0.2738, "step": 743 }, { "epoch": 0.3522935779816514, "grad_norm": 1.7759153842926025, "learning_rate": 1.9601322981921872e-05, "loss": 0.2436, "step": 744 }, { "epoch": 0.35276709085528263, "grad_norm": 1.5018988847732544, "learning_rate": 1.959989209971785e-05, "loss": 0.2935, "step": 745 }, { "epoch": 0.35324060372891386, "grad_norm": 1.1463077068328857, "learning_rate": 1.959845870675467e-05, "loss": 0.2613, "step": 746 }, { "epoch": 0.35371411660254515, "grad_norm": 1.4144256114959717, "learning_rate": 1.9597022803407206e-05, "loss": 0.2337, "step": 747 }, { "epoch": 0.3541876294761764, "grad_norm": 1.2923469543457031, "learning_rate": 1.9595584390051014e-05, "loss": 0.2449, "step": 748 }, { "epoch": 0.3546611423498076, "grad_norm": 2.338681936264038, "learning_rate": 1.9594143467062295e-05, "loss": 0.2694, "step": 749 }, { "epoch": 0.3551346552234389, "grad_norm": 2.0817441940307617, "learning_rate": 1.9592700034817906e-05, "loss": 0.3, "step": 750 }, { "epoch": 0.35560816809707013, "grad_norm": 1.2521568536758423, "learning_rate": 1.959125409369537e-05, "loss": 0.2664, "step": 751 }, { "epoch": 0.35608168097070136, "grad_norm": 1.5073604583740234, "learning_rate": 1.958980564407285e-05, "loss": 0.2696, "step": 752 }, { "epoch": 0.35655519384433265, "grad_norm": 1.10438871383667, "learning_rate": 1.9588354686329182e-05, "loss": 0.2712, "step": 753 }, { "epoch": 0.3570287067179639, "grad_norm": 1.8521367311477661, "learning_rate": 1.9586901220843844e-05, "loss": 0.2802, "step": 754 }, { "epoch": 0.35750221959159517, "grad_norm": 2.0266237258911133, "learning_rate": 1.958544524799698e-05, "loss": 0.2608, "step": 755 }, { "epoch": 0.3579757324652264, "grad_norm": 1.1580361127853394, "learning_rate": 1.958398676816938e-05, "loss": 0.2582, "step": 756 }, { "epoch": 0.35844924533885764, "grad_norm": 1.5954866409301758, "learning_rate": 1.9582525781742502e-05, "loss": 0.2436, "step": 757 }, { "epoch": 0.3589227582124889, "grad_norm": 1.2600346803665161, "learning_rate": 1.9581062289098448e-05, "loss": 0.2658, "step": 758 }, { "epoch": 0.35939627108612016, "grad_norm": 1.5517022609710693, "learning_rate": 1.9579596290619986e-05, "loss": 0.2886, "step": 759 }, { "epoch": 0.3598697839597514, "grad_norm": 1.9510260820388794, "learning_rate": 1.9578127786690532e-05, "loss": 0.2839, "step": 760 }, { "epoch": 0.3603432968333827, "grad_norm": 1.2476470470428467, "learning_rate": 1.957665677769415e-05, "loss": 0.2668, "step": 761 }, { "epoch": 0.3608168097070139, "grad_norm": 1.4977624416351318, "learning_rate": 1.9575183264015577e-05, "loss": 0.2613, "step": 762 }, { "epoch": 0.36129032258064514, "grad_norm": 1.4652796983718872, "learning_rate": 1.95737072460402e-05, "loss": 0.2766, "step": 763 }, { "epoch": 0.36176383545427643, "grad_norm": 1.8096346855163574, "learning_rate": 1.957222872415405e-05, "loss": 0.2817, "step": 764 }, { "epoch": 0.36223734832790766, "grad_norm": 1.9074580669403076, "learning_rate": 1.9570747698743818e-05, "loss": 0.2689, "step": 765 }, { "epoch": 0.3627108612015389, "grad_norm": 1.3230053186416626, "learning_rate": 1.956926417019686e-05, "loss": 0.2504, "step": 766 }, { "epoch": 0.3631843740751702, "grad_norm": 1.6510530710220337, "learning_rate": 1.9567778138901175e-05, "loss": 0.2582, "step": 767 }, { "epoch": 0.3636578869488014, "grad_norm": 1.1170092821121216, "learning_rate": 1.9566289605245416e-05, "loss": 0.2573, "step": 768 }, { "epoch": 0.3641313998224327, "grad_norm": 1.1853854656219482, "learning_rate": 1.95647985696189e-05, "loss": 0.2889, "step": 769 }, { "epoch": 0.36460491269606393, "grad_norm": 1.359598159790039, "learning_rate": 1.9563305032411594e-05, "loss": 0.259, "step": 770 }, { "epoch": 0.36507842556969516, "grad_norm": 1.3403595685958862, "learning_rate": 1.956180899401411e-05, "loss": 0.2468, "step": 771 }, { "epoch": 0.36555193844332645, "grad_norm": 1.6090962886810303, "learning_rate": 1.9560310454817736e-05, "loss": 0.2536, "step": 772 }, { "epoch": 0.3660254513169577, "grad_norm": 1.543067455291748, "learning_rate": 1.9558809415214386e-05, "loss": 0.2331, "step": 773 }, { "epoch": 0.3664989641905889, "grad_norm": 1.650692343711853, "learning_rate": 1.955730587559665e-05, "loss": 0.2705, "step": 774 }, { "epoch": 0.3669724770642202, "grad_norm": 1.7598607540130615, "learning_rate": 1.9555799836357765e-05, "loss": 0.2612, "step": 775 }, { "epoch": 0.36744598993785144, "grad_norm": 1.1476891040802002, "learning_rate": 1.955429129789162e-05, "loss": 0.2401, "step": 776 }, { "epoch": 0.36791950281148267, "grad_norm": 2.1535274982452393, "learning_rate": 1.9552780260592755e-05, "loss": 0.2797, "step": 777 }, { "epoch": 0.36839301568511396, "grad_norm": 1.3766757249832153, "learning_rate": 1.955126672485637e-05, "loss": 0.2579, "step": 778 }, { "epoch": 0.3688665285587452, "grad_norm": 1.8468455076217651, "learning_rate": 1.954975069107832e-05, "loss": 0.2515, "step": 779 }, { "epoch": 0.3693400414323764, "grad_norm": 1.970403790473938, "learning_rate": 1.95482321596551e-05, "loss": 0.2513, "step": 780 }, { "epoch": 0.3698135543060077, "grad_norm": 1.3336784839630127, "learning_rate": 1.9546711130983874e-05, "loss": 0.2741, "step": 781 }, { "epoch": 0.37028706717963894, "grad_norm": 2.5626380443573, "learning_rate": 1.954518760546245e-05, "loss": 0.2537, "step": 782 }, { "epoch": 0.3707605800532702, "grad_norm": 3.228623628616333, "learning_rate": 1.9543661583489295e-05, "loss": 0.2756, "step": 783 }, { "epoch": 0.37123409292690146, "grad_norm": 2.226210832595825, "learning_rate": 1.9542133065463518e-05, "loss": 0.278, "step": 784 }, { "epoch": 0.3717076058005327, "grad_norm": 1.0971438884735107, "learning_rate": 1.9540602051784897e-05, "loss": 0.2334, "step": 785 }, { "epoch": 0.372181118674164, "grad_norm": 1.3254427909851074, "learning_rate": 1.9539068542853844e-05, "loss": 0.2461, "step": 786 }, { "epoch": 0.3726546315477952, "grad_norm": 1.6858649253845215, "learning_rate": 1.953753253907144e-05, "loss": 0.272, "step": 787 }, { "epoch": 0.37312814442142644, "grad_norm": 1.6952732801437378, "learning_rate": 1.9535994040839413e-05, "loss": 0.2775, "step": 788 }, { "epoch": 0.37360165729505773, "grad_norm": 1.41287100315094, "learning_rate": 1.9534453048560137e-05, "loss": 0.2415, "step": 789 }, { "epoch": 0.37407517016868896, "grad_norm": 1.457585096359253, "learning_rate": 1.953290956263665e-05, "loss": 0.2778, "step": 790 }, { "epoch": 0.3745486830423202, "grad_norm": 1.6097171306610107, "learning_rate": 1.9531363583472628e-05, "loss": 0.2522, "step": 791 }, { "epoch": 0.3750221959159515, "grad_norm": 1.8246599435806274, "learning_rate": 1.9529815111472414e-05, "loss": 0.27, "step": 792 }, { "epoch": 0.3754957087895827, "grad_norm": 2.544400691986084, "learning_rate": 1.9528264147040995e-05, "loss": 0.2704, "step": 793 }, { "epoch": 0.37596922166321395, "grad_norm": 1.3660106658935547, "learning_rate": 1.9526710690584005e-05, "loss": 0.2608, "step": 794 }, { "epoch": 0.37644273453684524, "grad_norm": 1.1534936428070068, "learning_rate": 1.9525154742507745e-05, "loss": 0.2565, "step": 795 }, { "epoch": 0.37691624741047647, "grad_norm": 1.5819586515426636, "learning_rate": 1.9523596303219146e-05, "loss": 0.2806, "step": 796 }, { "epoch": 0.3773897602841077, "grad_norm": 1.9623225927352905, "learning_rate": 1.9522035373125816e-05, "loss": 0.2916, "step": 797 }, { "epoch": 0.377863273157739, "grad_norm": 1.619255542755127, "learning_rate": 1.9520471952635992e-05, "loss": 0.2641, "step": 798 }, { "epoch": 0.3783367860313702, "grad_norm": 1.587944746017456, "learning_rate": 1.9518906042158575e-05, "loss": 0.2707, "step": 799 }, { "epoch": 0.37881029890500145, "grad_norm": 1.1555267572402954, "learning_rate": 1.9517337642103116e-05, "loss": 0.2526, "step": 800 }, { "epoch": 0.37928381177863274, "grad_norm": 1.6961590051651, "learning_rate": 1.9515766752879808e-05, "loss": 0.2276, "step": 801 }, { "epoch": 0.379757324652264, "grad_norm": 1.6464869976043701, "learning_rate": 1.9514193374899508e-05, "loss": 0.2662, "step": 802 }, { "epoch": 0.38023083752589526, "grad_norm": 2.7278549671173096, "learning_rate": 1.9512617508573713e-05, "loss": 0.2696, "step": 803 }, { "epoch": 0.3807043503995265, "grad_norm": 1.3315246105194092, "learning_rate": 1.951103915431458e-05, "loss": 0.2777, "step": 804 }, { "epoch": 0.3811778632731577, "grad_norm": 1.1722429990768433, "learning_rate": 1.9509458312534912e-05, "loss": 0.2657, "step": 805 }, { "epoch": 0.381651376146789, "grad_norm": 2.3142430782318115, "learning_rate": 1.9507874983648163e-05, "loss": 0.2829, "step": 806 }, { "epoch": 0.38212488902042024, "grad_norm": 2.0772149562835693, "learning_rate": 1.9506289168068433e-05, "loss": 0.2589, "step": 807 }, { "epoch": 0.3825984018940515, "grad_norm": 1.3622729778289795, "learning_rate": 1.950470086621048e-05, "loss": 0.2629, "step": 808 }, { "epoch": 0.38307191476768276, "grad_norm": 1.4933375120162964, "learning_rate": 1.9503110078489712e-05, "loss": 0.2594, "step": 809 }, { "epoch": 0.383545427641314, "grad_norm": 1.4791429042816162, "learning_rate": 1.950151680532218e-05, "loss": 0.2544, "step": 810 }, { "epoch": 0.38401894051494523, "grad_norm": 1.3858115673065186, "learning_rate": 1.9499921047124587e-05, "loss": 0.2611, "step": 811 }, { "epoch": 0.3844924533885765, "grad_norm": 1.273097038269043, "learning_rate": 1.9498322804314297e-05, "loss": 0.2752, "step": 812 }, { "epoch": 0.38496596626220775, "grad_norm": 1.3330796957015991, "learning_rate": 1.9496722077309306e-05, "loss": 0.2363, "step": 813 }, { "epoch": 0.385439479135839, "grad_norm": 1.091861605644226, "learning_rate": 1.949511886652827e-05, "loss": 0.2528, "step": 814 }, { "epoch": 0.38591299200947027, "grad_norm": 1.2032009363174438, "learning_rate": 1.9493513172390498e-05, "loss": 0.2528, "step": 815 }, { "epoch": 0.3863865048831015, "grad_norm": 2.2817001342773438, "learning_rate": 1.949190499531594e-05, "loss": 0.2587, "step": 816 }, { "epoch": 0.3868600177567328, "grad_norm": 2.36639666557312, "learning_rate": 1.9490294335725204e-05, "loss": 0.2474, "step": 817 }, { "epoch": 0.387333530630364, "grad_norm": 1.1809403896331787, "learning_rate": 1.9488681194039537e-05, "loss": 0.2738, "step": 818 }, { "epoch": 0.38780704350399525, "grad_norm": 1.3689755201339722, "learning_rate": 1.9487065570680845e-05, "loss": 0.2547, "step": 819 }, { "epoch": 0.38828055637762654, "grad_norm": 1.8941787481307983, "learning_rate": 1.948544746607167e-05, "loss": 0.2823, "step": 820 }, { "epoch": 0.38875406925125777, "grad_norm": 1.0949586629867554, "learning_rate": 1.9483826880635225e-05, "loss": 0.2658, "step": 821 }, { "epoch": 0.389227582124889, "grad_norm": 2.2636001110076904, "learning_rate": 1.9482203814795344e-05, "loss": 0.2608, "step": 822 }, { "epoch": 0.3897010949985203, "grad_norm": 1.2517188787460327, "learning_rate": 1.9480578268976536e-05, "loss": 0.2892, "step": 823 }, { "epoch": 0.3901746078721515, "grad_norm": 1.6371468305587769, "learning_rate": 1.9478950243603946e-05, "loss": 0.2999, "step": 824 }, { "epoch": 0.39064812074578276, "grad_norm": 1.488568663597107, "learning_rate": 1.947731973910336e-05, "loss": 0.2885, "step": 825 }, { "epoch": 0.39112163361941404, "grad_norm": 1.5922267436981201, "learning_rate": 1.9475686755901227e-05, "loss": 0.2553, "step": 826 }, { "epoch": 0.3915951464930453, "grad_norm": 2.3070576190948486, "learning_rate": 1.9474051294424634e-05, "loss": 0.2591, "step": 827 }, { "epoch": 0.3920686593666765, "grad_norm": 2.676198959350586, "learning_rate": 1.9472413355101327e-05, "loss": 0.2867, "step": 828 }, { "epoch": 0.3925421722403078, "grad_norm": 1.9517337083816528, "learning_rate": 1.9470772938359687e-05, "loss": 0.2684, "step": 829 }, { "epoch": 0.39301568511393903, "grad_norm": 1.6799813508987427, "learning_rate": 1.946913004462875e-05, "loss": 0.2631, "step": 830 }, { "epoch": 0.39348919798757026, "grad_norm": 1.206333041191101, "learning_rate": 1.9467484674338202e-05, "loss": 0.2628, "step": 831 }, { "epoch": 0.39396271086120155, "grad_norm": 1.4563113451004028, "learning_rate": 1.9465836827918373e-05, "loss": 0.2684, "step": 832 }, { "epoch": 0.3944362237348328, "grad_norm": 2.2227954864501953, "learning_rate": 1.9464186505800236e-05, "loss": 0.2905, "step": 833 }, { "epoch": 0.39490973660846407, "grad_norm": 1.46279776096344, "learning_rate": 1.9462533708415425e-05, "loss": 0.2615, "step": 834 }, { "epoch": 0.3953832494820953, "grad_norm": 2.0220069885253906, "learning_rate": 1.9460878436196206e-05, "loss": 0.2427, "step": 835 }, { "epoch": 0.39585676235572653, "grad_norm": 1.3017007112503052, "learning_rate": 1.9459220689575505e-05, "loss": 0.2563, "step": 836 }, { "epoch": 0.3963302752293578, "grad_norm": 1.700585961341858, "learning_rate": 1.9457560468986888e-05, "loss": 0.2656, "step": 837 }, { "epoch": 0.39680378810298905, "grad_norm": 1.8619630336761475, "learning_rate": 1.9455897774864567e-05, "loss": 0.2653, "step": 838 }, { "epoch": 0.3972773009766203, "grad_norm": 1.1475433111190796, "learning_rate": 1.9454232607643406e-05, "loss": 0.2612, "step": 839 }, { "epoch": 0.39775081385025157, "grad_norm": 1.736665964126587, "learning_rate": 1.9452564967758912e-05, "loss": 0.2655, "step": 840 }, { "epoch": 0.3982243267238828, "grad_norm": 1.1102855205535889, "learning_rate": 1.9450894855647246e-05, "loss": 0.2437, "step": 841 }, { "epoch": 0.39869783959751404, "grad_norm": 1.2545884847640991, "learning_rate": 1.9449222271745202e-05, "loss": 0.2401, "step": 842 }, { "epoch": 0.3991713524711453, "grad_norm": 1.2088849544525146, "learning_rate": 1.944754721649023e-05, "loss": 0.2497, "step": 843 }, { "epoch": 0.39964486534477656, "grad_norm": 1.708852767944336, "learning_rate": 1.9445869690320425e-05, "loss": 0.2513, "step": 844 }, { "epoch": 0.4001183782184078, "grad_norm": 1.3181136846542358, "learning_rate": 1.9444189693674528e-05, "loss": 0.2814, "step": 845 }, { "epoch": 0.4005918910920391, "grad_norm": 1.6149542331695557, "learning_rate": 1.944250722699193e-05, "loss": 0.2475, "step": 846 }, { "epoch": 0.4010654039656703, "grad_norm": 1.4610974788665771, "learning_rate": 1.944082229071266e-05, "loss": 0.2571, "step": 847 }, { "epoch": 0.4015389168393016, "grad_norm": 1.5411815643310547, "learning_rate": 1.9439134885277394e-05, "loss": 0.2255, "step": 848 }, { "epoch": 0.40201242971293283, "grad_norm": 1.6833422183990479, "learning_rate": 1.9437445011127463e-05, "loss": 0.2551, "step": 849 }, { "epoch": 0.40248594258656406, "grad_norm": 1.65785813331604, "learning_rate": 1.943575266870483e-05, "loss": 0.246, "step": 850 }, { "epoch": 0.40295945546019535, "grad_norm": 1.3264704942703247, "learning_rate": 1.9434057858452117e-05, "loss": 0.2935, "step": 851 }, { "epoch": 0.4034329683338266, "grad_norm": 1.166812539100647, "learning_rate": 1.9432360580812583e-05, "loss": 0.2528, "step": 852 }, { "epoch": 0.4039064812074578, "grad_norm": 2.272095203399658, "learning_rate": 1.9430660836230134e-05, "loss": 0.2852, "step": 853 }, { "epoch": 0.4043799940810891, "grad_norm": 1.7517335414886475, "learning_rate": 1.9428958625149324e-05, "loss": 0.2755, "step": 854 }, { "epoch": 0.40485350695472033, "grad_norm": 1.2015048265457153, "learning_rate": 1.942725394801535e-05, "loss": 0.2628, "step": 855 }, { "epoch": 0.40532701982835156, "grad_norm": 1.3004090785980225, "learning_rate": 1.9425546805274048e-05, "loss": 0.2415, "step": 856 }, { "epoch": 0.40580053270198285, "grad_norm": 1.5480315685272217, "learning_rate": 1.942383719737191e-05, "loss": 0.3167, "step": 857 }, { "epoch": 0.4062740455756141, "grad_norm": 2.4538869857788086, "learning_rate": 1.9422125124756068e-05, "loss": 0.2415, "step": 858 }, { "epoch": 0.4067475584492453, "grad_norm": 1.163549542427063, "learning_rate": 1.9420410587874295e-05, "loss": 0.2615, "step": 859 }, { "epoch": 0.4072210713228766, "grad_norm": 1.151456594467163, "learning_rate": 1.941869358717501e-05, "loss": 0.2391, "step": 860 }, { "epoch": 0.40769458419650784, "grad_norm": 1.4595973491668701, "learning_rate": 1.9416974123107287e-05, "loss": 0.2871, "step": 861 }, { "epoch": 0.40816809707013907, "grad_norm": 1.4766554832458496, "learning_rate": 1.941525219612083e-05, "loss": 0.2879, "step": 862 }, { "epoch": 0.40864160994377036, "grad_norm": 1.770892858505249, "learning_rate": 1.941352780666599e-05, "loss": 0.2809, "step": 863 }, { "epoch": 0.4091151228174016, "grad_norm": 1.4286603927612305, "learning_rate": 1.9411800955193762e-05, "loss": 0.2793, "step": 864 }, { "epoch": 0.4095886356910329, "grad_norm": 1.2635271549224854, "learning_rate": 1.9410071642155796e-05, "loss": 0.267, "step": 865 }, { "epoch": 0.4100621485646641, "grad_norm": 2.1183278560638428, "learning_rate": 1.940833986800437e-05, "loss": 0.2578, "step": 866 }, { "epoch": 0.41053566143829534, "grad_norm": 1.1549620628356934, "learning_rate": 1.9406605633192414e-05, "loss": 0.2852, "step": 867 }, { "epoch": 0.41100917431192663, "grad_norm": 1.3597477674484253, "learning_rate": 1.9404868938173503e-05, "loss": 0.2711, "step": 868 }, { "epoch": 0.41148268718555786, "grad_norm": 1.2947059869766235, "learning_rate": 1.9403129783401854e-05, "loss": 0.2838, "step": 869 }, { "epoch": 0.4119562000591891, "grad_norm": 1.0901488065719604, "learning_rate": 1.9401388169332322e-05, "loss": 0.2467, "step": 870 }, { "epoch": 0.4124297129328204, "grad_norm": 1.7122608423233032, "learning_rate": 1.939964409642041e-05, "loss": 0.265, "step": 871 }, { "epoch": 0.4129032258064516, "grad_norm": 1.7495481967926025, "learning_rate": 1.9397897565122267e-05, "loss": 0.2824, "step": 872 }, { "epoch": 0.41337673868008284, "grad_norm": 1.2452020645141602, "learning_rate": 1.939614857589468e-05, "loss": 0.2695, "step": 873 }, { "epoch": 0.41385025155371413, "grad_norm": 1.2135512828826904, "learning_rate": 1.9394397129195076e-05, "loss": 0.2606, "step": 874 }, { "epoch": 0.41432376442734536, "grad_norm": 1.4891951084136963, "learning_rate": 1.9392643225481535e-05, "loss": 0.2587, "step": 875 }, { "epoch": 0.4147972773009766, "grad_norm": 1.4017555713653564, "learning_rate": 1.9390886865212767e-05, "loss": 0.2776, "step": 876 }, { "epoch": 0.4152707901746079, "grad_norm": 1.8024797439575195, "learning_rate": 1.9389128048848136e-05, "loss": 0.265, "step": 877 }, { "epoch": 0.4157443030482391, "grad_norm": 1.2021361589431763, "learning_rate": 1.9387366776847645e-05, "loss": 0.2712, "step": 878 }, { "epoch": 0.41621781592187035, "grad_norm": 1.449995756149292, "learning_rate": 1.9385603049671934e-05, "loss": 0.2887, "step": 879 }, { "epoch": 0.41669132879550164, "grad_norm": 1.4844348430633545, "learning_rate": 1.9383836867782287e-05, "loss": 0.2384, "step": 880 }, { "epoch": 0.41716484166913287, "grad_norm": 1.3519595861434937, "learning_rate": 1.938206823164064e-05, "loss": 0.2376, "step": 881 }, { "epoch": 0.41763835454276416, "grad_norm": 1.2284427881240845, "learning_rate": 1.938029714170955e-05, "loss": 0.2525, "step": 882 }, { "epoch": 0.4181118674163954, "grad_norm": 1.362317442893982, "learning_rate": 1.937852359845224e-05, "loss": 0.2695, "step": 883 }, { "epoch": 0.4185853802900266, "grad_norm": 1.533306360244751, "learning_rate": 1.937674760233256e-05, "loss": 0.2334, "step": 884 }, { "epoch": 0.4190588931636579, "grad_norm": 2.291046380996704, "learning_rate": 1.9374969153815005e-05, "loss": 0.2578, "step": 885 }, { "epoch": 0.41953240603728914, "grad_norm": 1.2593145370483398, "learning_rate": 1.937318825336471e-05, "loss": 0.2854, "step": 886 }, { "epoch": 0.4200059189109204, "grad_norm": 1.5685346126556396, "learning_rate": 1.9371404901447445e-05, "loss": 0.2777, "step": 887 }, { "epoch": 0.42047943178455166, "grad_norm": 1.7853624820709229, "learning_rate": 1.936961909852964e-05, "loss": 0.2817, "step": 888 }, { "epoch": 0.4209529446581829, "grad_norm": 1.8879746198654175, "learning_rate": 1.9367830845078354e-05, "loss": 0.2515, "step": 889 }, { "epoch": 0.4214264575318141, "grad_norm": 1.719527244567871, "learning_rate": 1.936604014156128e-05, "loss": 0.2583, "step": 890 }, { "epoch": 0.4218999704054454, "grad_norm": 1.735648274421692, "learning_rate": 1.936424698844676e-05, "loss": 0.2788, "step": 891 }, { "epoch": 0.42237348327907664, "grad_norm": 1.6810283660888672, "learning_rate": 1.9362451386203784e-05, "loss": 0.2548, "step": 892 }, { "epoch": 0.4228469961527079, "grad_norm": 1.0706638097763062, "learning_rate": 1.9360653335301964e-05, "loss": 0.2553, "step": 893 }, { "epoch": 0.42332050902633916, "grad_norm": 1.1451069116592407, "learning_rate": 1.9358852836211573e-05, "loss": 0.2566, "step": 894 }, { "epoch": 0.4237940218999704, "grad_norm": 1.3580163717269897, "learning_rate": 1.9357049889403506e-05, "loss": 0.2676, "step": 895 }, { "epoch": 0.4242675347736017, "grad_norm": 1.5814894437789917, "learning_rate": 1.9355244495349307e-05, "loss": 0.2997, "step": 896 }, { "epoch": 0.4247410476472329, "grad_norm": 2.3685078620910645, "learning_rate": 1.9353436654521168e-05, "loss": 0.261, "step": 897 }, { "epoch": 0.42521456052086415, "grad_norm": 1.4430562257766724, "learning_rate": 1.9351626367391902e-05, "loss": 0.2477, "step": 898 }, { "epoch": 0.42568807339449544, "grad_norm": 1.7693711519241333, "learning_rate": 1.9349813634434977e-05, "loss": 0.2577, "step": 899 }, { "epoch": 0.42616158626812667, "grad_norm": 1.3744468688964844, "learning_rate": 1.9347998456124497e-05, "loss": 0.2736, "step": 900 }, { "epoch": 0.4266350991417579, "grad_norm": 1.5893174409866333, "learning_rate": 1.9346180832935202e-05, "loss": 0.2518, "step": 901 }, { "epoch": 0.4271086120153892, "grad_norm": 1.6329647302627563, "learning_rate": 1.9344360765342472e-05, "loss": 0.2574, "step": 902 }, { "epoch": 0.4275821248890204, "grad_norm": 1.2065149545669556, "learning_rate": 1.9342538253822334e-05, "loss": 0.2522, "step": 903 }, { "epoch": 0.42805563776265165, "grad_norm": 1.2911807298660278, "learning_rate": 1.934071329885144e-05, "loss": 0.2441, "step": 904 }, { "epoch": 0.42852915063628294, "grad_norm": 1.260495901107788, "learning_rate": 1.93388859009071e-05, "loss": 0.2505, "step": 905 }, { "epoch": 0.4290026635099142, "grad_norm": 1.6986874341964722, "learning_rate": 1.9337056060467244e-05, "loss": 0.2942, "step": 906 }, { "epoch": 0.4294761763835454, "grad_norm": 1.6441107988357544, "learning_rate": 1.933522377801045e-05, "loss": 0.2577, "step": 907 }, { "epoch": 0.4299496892571767, "grad_norm": 1.4719241857528687, "learning_rate": 1.9333389054015935e-05, "loss": 0.2602, "step": 908 }, { "epoch": 0.4304232021308079, "grad_norm": 1.37874436378479, "learning_rate": 1.9331551888963557e-05, "loss": 0.2792, "step": 909 }, { "epoch": 0.43089671500443916, "grad_norm": 1.7454997301101685, "learning_rate": 1.93297122833338e-05, "loss": 0.2456, "step": 910 }, { "epoch": 0.43137022787807044, "grad_norm": 1.5132381916046143, "learning_rate": 1.9327870237607805e-05, "loss": 0.2826, "step": 911 }, { "epoch": 0.4318437407517017, "grad_norm": 1.309891939163208, "learning_rate": 1.9326025752267338e-05, "loss": 0.2695, "step": 912 }, { "epoch": 0.43231725362533296, "grad_norm": 1.375044822692871, "learning_rate": 1.9324178827794803e-05, "loss": 0.252, "step": 913 }, { "epoch": 0.4327907664989642, "grad_norm": 1.6480258703231812, "learning_rate": 1.9322329464673248e-05, "loss": 0.2797, "step": 914 }, { "epoch": 0.43326427937259543, "grad_norm": 1.1525005102157593, "learning_rate": 1.9320477663386358e-05, "loss": 0.2449, "step": 915 }, { "epoch": 0.4337377922462267, "grad_norm": 1.945312261581421, "learning_rate": 1.9318623424418446e-05, "loss": 0.2355, "step": 916 }, { "epoch": 0.43421130511985795, "grad_norm": 1.427280068397522, "learning_rate": 1.9316766748254477e-05, "loss": 0.2888, "step": 917 }, { "epoch": 0.4346848179934892, "grad_norm": 1.4059946537017822, "learning_rate": 1.931490763538005e-05, "loss": 0.2321, "step": 918 }, { "epoch": 0.43515833086712047, "grad_norm": 1.3967366218566895, "learning_rate": 1.931304608628139e-05, "loss": 0.2585, "step": 919 }, { "epoch": 0.4356318437407517, "grad_norm": 1.3935580253601074, "learning_rate": 1.931118210144537e-05, "loss": 0.2663, "step": 920 }, { "epoch": 0.43610535661438293, "grad_norm": 1.2772636413574219, "learning_rate": 1.93093156813595e-05, "loss": 0.2607, "step": 921 }, { "epoch": 0.4365788694880142, "grad_norm": 1.2750356197357178, "learning_rate": 1.930744682651192e-05, "loss": 0.2518, "step": 922 }, { "epoch": 0.43705238236164545, "grad_norm": 1.2921040058135986, "learning_rate": 1.9305575537391416e-05, "loss": 0.2623, "step": 923 }, { "epoch": 0.4375258952352767, "grad_norm": 1.6561219692230225, "learning_rate": 1.9303701814487403e-05, "loss": 0.3298, "step": 924 }, { "epoch": 0.437999408108908, "grad_norm": 1.4498628377914429, "learning_rate": 1.930182565828993e-05, "loss": 0.2543, "step": 925 }, { "epoch": 0.4384729209825392, "grad_norm": 1.2270756959915161, "learning_rate": 1.9299947069289694e-05, "loss": 0.2553, "step": 926 }, { "epoch": 0.43894643385617044, "grad_norm": 1.31882905960083, "learning_rate": 1.9298066047978024e-05, "loss": 0.2574, "step": 927 }, { "epoch": 0.4394199467298017, "grad_norm": 1.993991494178772, "learning_rate": 1.9296182594846876e-05, "loss": 0.2692, "step": 928 }, { "epoch": 0.43989345960343296, "grad_norm": 1.425070881843567, "learning_rate": 1.9294296710388852e-05, "loss": 0.2553, "step": 929 }, { "epoch": 0.44036697247706424, "grad_norm": 1.2672945261001587, "learning_rate": 1.9292408395097187e-05, "loss": 0.2814, "step": 930 }, { "epoch": 0.4408404853506955, "grad_norm": 1.2957855463027954, "learning_rate": 1.9290517649465756e-05, "loss": 0.2714, "step": 931 }, { "epoch": 0.4413139982243267, "grad_norm": 1.4948066473007202, "learning_rate": 1.9288624473989055e-05, "loss": 0.2353, "step": 932 }, { "epoch": 0.441787511097958, "grad_norm": 1.6086140871047974, "learning_rate": 1.9286728869162235e-05, "loss": 0.2507, "step": 933 }, { "epoch": 0.44226102397158923, "grad_norm": 1.5332534313201904, "learning_rate": 1.928483083548107e-05, "loss": 0.2744, "step": 934 }, { "epoch": 0.44273453684522046, "grad_norm": 1.8860116004943848, "learning_rate": 1.928293037344197e-05, "loss": 0.2706, "step": 935 }, { "epoch": 0.44320804971885175, "grad_norm": 1.3716895580291748, "learning_rate": 1.9281027483541986e-05, "loss": 0.2565, "step": 936 }, { "epoch": 0.443681562592483, "grad_norm": 2.0549087524414062, "learning_rate": 1.9279122166278798e-05, "loss": 0.2524, "step": 937 }, { "epoch": 0.4441550754661142, "grad_norm": 1.379072666168213, "learning_rate": 1.927721442215073e-05, "loss": 0.2694, "step": 938 }, { "epoch": 0.4446285883397455, "grad_norm": 1.1916866302490234, "learning_rate": 1.9275304251656723e-05, "loss": 0.2386, "step": 939 }, { "epoch": 0.44510210121337673, "grad_norm": 2.893798589706421, "learning_rate": 1.9273391655296373e-05, "loss": 0.2682, "step": 940 }, { "epoch": 0.44557561408700797, "grad_norm": 2.7330830097198486, "learning_rate": 1.9271476633569895e-05, "loss": 0.2632, "step": 941 }, { "epoch": 0.44604912696063925, "grad_norm": 1.7794554233551025, "learning_rate": 1.926955918697815e-05, "loss": 0.2447, "step": 942 }, { "epoch": 0.4465226398342705, "grad_norm": 1.3762879371643066, "learning_rate": 1.926763931602262e-05, "loss": 0.2525, "step": 943 }, { "epoch": 0.4469961527079018, "grad_norm": 1.5030863285064697, "learning_rate": 1.9265717021205437e-05, "loss": 0.248, "step": 944 }, { "epoch": 0.447469665581533, "grad_norm": 1.604885220527649, "learning_rate": 1.9263792303029355e-05, "loss": 0.2527, "step": 945 }, { "epoch": 0.44794317845516424, "grad_norm": 1.0252493619918823, "learning_rate": 1.9261865161997765e-05, "loss": 0.2084, "step": 946 }, { "epoch": 0.4484166913287955, "grad_norm": 1.32304048538208, "learning_rate": 1.925993559861469e-05, "loss": 0.2385, "step": 947 }, { "epoch": 0.44889020420242676, "grad_norm": 1.2492711544036865, "learning_rate": 1.9258003613384793e-05, "loss": 0.2862, "step": 948 }, { "epoch": 0.449363717076058, "grad_norm": 1.9659324884414673, "learning_rate": 1.925606920681337e-05, "loss": 0.2729, "step": 949 }, { "epoch": 0.4498372299496893, "grad_norm": 1.930130958557129, "learning_rate": 1.9254132379406335e-05, "loss": 0.2429, "step": 950 }, { "epoch": 0.4503107428233205, "grad_norm": 1.9109481573104858, "learning_rate": 1.925219313167025e-05, "loss": 0.2579, "step": 951 }, { "epoch": 0.45078425569695174, "grad_norm": 1.461925983428955, "learning_rate": 1.9250251464112313e-05, "loss": 0.3025, "step": 952 }, { "epoch": 0.45125776857058303, "grad_norm": 1.1421688795089722, "learning_rate": 1.9248307377240346e-05, "loss": 0.2611, "step": 953 }, { "epoch": 0.45173128144421426, "grad_norm": 1.3895161151885986, "learning_rate": 1.92463608715628e-05, "loss": 0.2854, "step": 954 }, { "epoch": 0.4522047943178455, "grad_norm": 1.081526517868042, "learning_rate": 1.9244411947588774e-05, "loss": 0.2495, "step": 955 }, { "epoch": 0.4526783071914768, "grad_norm": 1.5755951404571533, "learning_rate": 1.924246060582798e-05, "loss": 0.2645, "step": 956 }, { "epoch": 0.453151820065108, "grad_norm": 1.318973183631897, "learning_rate": 1.9240506846790784e-05, "loss": 0.2215, "step": 957 }, { "epoch": 0.45362533293873925, "grad_norm": 1.0379008054733276, "learning_rate": 1.9238550670988166e-05, "loss": 0.2594, "step": 958 }, { "epoch": 0.45409884581237053, "grad_norm": 1.1944754123687744, "learning_rate": 1.923659207893174e-05, "loss": 0.2458, "step": 959 }, { "epoch": 0.45457235868600177, "grad_norm": 1.7871370315551758, "learning_rate": 1.9234631071133768e-05, "loss": 0.2622, "step": 960 }, { "epoch": 0.45504587155963305, "grad_norm": 1.2604117393493652, "learning_rate": 1.9232667648107127e-05, "loss": 0.2823, "step": 961 }, { "epoch": 0.4555193844332643, "grad_norm": 1.3337626457214355, "learning_rate": 1.923070181036533e-05, "loss": 0.2737, "step": 962 }, { "epoch": 0.4559928973068955, "grad_norm": 1.0898325443267822, "learning_rate": 1.9228733558422525e-05, "loss": 0.2567, "step": 963 }, { "epoch": 0.4564664101805268, "grad_norm": 1.1792700290679932, "learning_rate": 1.9226762892793492e-05, "loss": 0.2511, "step": 964 }, { "epoch": 0.45693992305415804, "grad_norm": 1.4441752433776855, "learning_rate": 1.922478981399363e-05, "loss": 0.2526, "step": 965 }, { "epoch": 0.45741343592778927, "grad_norm": 1.9730075597763062, "learning_rate": 1.9222814322538993e-05, "loss": 0.3023, "step": 966 }, { "epoch": 0.45788694880142056, "grad_norm": 1.2090874910354614, "learning_rate": 1.922083641894624e-05, "loss": 0.2561, "step": 967 }, { "epoch": 0.4583604616750518, "grad_norm": 1.5436848402023315, "learning_rate": 1.9218856103732675e-05, "loss": 0.2539, "step": 968 }, { "epoch": 0.458833974548683, "grad_norm": 1.0943453311920166, "learning_rate": 1.9216873377416236e-05, "loss": 0.2568, "step": 969 }, { "epoch": 0.4593074874223143, "grad_norm": 1.5270817279815674, "learning_rate": 1.9214888240515478e-05, "loss": 0.2381, "step": 970 }, { "epoch": 0.45978100029594554, "grad_norm": 1.254157543182373, "learning_rate": 1.9212900693549602e-05, "loss": 0.2476, "step": 971 }, { "epoch": 0.4602545131695768, "grad_norm": 1.5397264957427979, "learning_rate": 1.9210910737038424e-05, "loss": 0.2651, "step": 972 }, { "epoch": 0.46072802604320806, "grad_norm": 1.4279857873916626, "learning_rate": 1.9208918371502404e-05, "loss": 0.249, "step": 973 }, { "epoch": 0.4612015389168393, "grad_norm": 1.1333208084106445, "learning_rate": 1.9206923597462625e-05, "loss": 0.2391, "step": 974 }, { "epoch": 0.4616750517904706, "grad_norm": 1.3642656803131104, "learning_rate": 1.9204926415440798e-05, "loss": 0.2462, "step": 975 }, { "epoch": 0.4621485646641018, "grad_norm": 1.0775152444839478, "learning_rate": 1.920292682595927e-05, "loss": 0.2345, "step": 976 }, { "epoch": 0.46262207753773305, "grad_norm": 1.583609938621521, "learning_rate": 1.9200924829541012e-05, "loss": 0.2341, "step": 977 }, { "epoch": 0.46309559041136433, "grad_norm": 1.214303731918335, "learning_rate": 1.919892042670963e-05, "loss": 0.2515, "step": 978 }, { "epoch": 0.46356910328499557, "grad_norm": 1.1225674152374268, "learning_rate": 1.919691361798935e-05, "loss": 0.2615, "step": 979 }, { "epoch": 0.4640426161586268, "grad_norm": 1.8481762409210205, "learning_rate": 1.9194904403905038e-05, "loss": 0.2602, "step": 980 }, { "epoch": 0.4645161290322581, "grad_norm": 1.6956995725631714, "learning_rate": 1.9192892784982185e-05, "loss": 0.2413, "step": 981 }, { "epoch": 0.4649896419058893, "grad_norm": 1.4857017993927002, "learning_rate": 1.919087876174691e-05, "loss": 0.271, "step": 982 }, { "epoch": 0.46546315477952055, "grad_norm": 1.868812918663025, "learning_rate": 1.918886233472596e-05, "loss": 0.2553, "step": 983 }, { "epoch": 0.46593666765315184, "grad_norm": 2.422353982925415, "learning_rate": 1.9186843504446716e-05, "loss": 0.2641, "step": 984 }, { "epoch": 0.46641018052678307, "grad_norm": 1.8645198345184326, "learning_rate": 1.9184822271437176e-05, "loss": 0.2752, "step": 985 }, { "epoch": 0.4668836934004143, "grad_norm": 1.2485929727554321, "learning_rate": 1.9182798636225983e-05, "loss": 0.2439, "step": 986 }, { "epoch": 0.4673572062740456, "grad_norm": 1.1110594272613525, "learning_rate": 1.918077259934239e-05, "loss": 0.2504, "step": 987 }, { "epoch": 0.4678307191476768, "grad_norm": 1.529255747795105, "learning_rate": 1.9178744161316297e-05, "loss": 0.2643, "step": 988 }, { "epoch": 0.46830423202130805, "grad_norm": 1.2772247791290283, "learning_rate": 1.9176713322678212e-05, "loss": 0.2207, "step": 989 }, { "epoch": 0.46877774489493934, "grad_norm": 1.129361867904663, "learning_rate": 1.917468008395929e-05, "loss": 0.2468, "step": 990 }, { "epoch": 0.4692512577685706, "grad_norm": 1.451148271560669, "learning_rate": 1.9172644445691305e-05, "loss": 0.2493, "step": 991 }, { "epoch": 0.46972477064220186, "grad_norm": 1.5785322189331055, "learning_rate": 1.9170606408406648e-05, "loss": 0.2314, "step": 992 }, { "epoch": 0.4701982835158331, "grad_norm": 1.7173629999160767, "learning_rate": 1.916856597263836e-05, "loss": 0.2629, "step": 993 }, { "epoch": 0.4706717963894643, "grad_norm": 1.471721887588501, "learning_rate": 1.916652313892009e-05, "loss": 0.2752, "step": 994 }, { "epoch": 0.4711453092630956, "grad_norm": 1.9189519882202148, "learning_rate": 1.9164477907786128e-05, "loss": 0.2714, "step": 995 }, { "epoch": 0.47161882213672685, "grad_norm": 1.542122721672058, "learning_rate": 1.9162430279771378e-05, "loss": 0.2648, "step": 996 }, { "epoch": 0.4720923350103581, "grad_norm": 1.2591853141784668, "learning_rate": 1.916038025541138e-05, "loss": 0.2792, "step": 997 }, { "epoch": 0.47256584788398937, "grad_norm": 1.6615357398986816, "learning_rate": 1.9158327835242296e-05, "loss": 0.2639, "step": 998 }, { "epoch": 0.4730393607576206, "grad_norm": 1.494132161140442, "learning_rate": 1.915627301980092e-05, "loss": 0.2994, "step": 999 }, { "epoch": 0.47351287363125183, "grad_norm": 1.3454561233520508, "learning_rate": 1.915421580962467e-05, "loss": 0.2394, "step": 1000 }, { "epoch": 0.4739863865048831, "grad_norm": 1.6147300004959106, "learning_rate": 1.9152156205251583e-05, "loss": 0.2415, "step": 1001 }, { "epoch": 0.47445989937851435, "grad_norm": 1.5121729373931885, "learning_rate": 1.9150094207220338e-05, "loss": 0.2285, "step": 1002 }, { "epoch": 0.4749334122521456, "grad_norm": 2.2810189723968506, "learning_rate": 1.9148029816070223e-05, "loss": 0.2352, "step": 1003 }, { "epoch": 0.47540692512577687, "grad_norm": 1.55019211769104, "learning_rate": 1.9145963032341163e-05, "loss": 0.273, "step": 1004 }, { "epoch": 0.4758804379994081, "grad_norm": 1.1222288608551025, "learning_rate": 1.9143893856573702e-05, "loss": 0.2655, "step": 1005 }, { "epoch": 0.47635395087303933, "grad_norm": 2.17794132232666, "learning_rate": 1.9141822289309016e-05, "loss": 0.2886, "step": 1006 }, { "epoch": 0.4768274637466706, "grad_norm": 1.7431869506835938, "learning_rate": 1.9139748331088906e-05, "loss": 0.2651, "step": 1007 }, { "epoch": 0.47730097662030185, "grad_norm": 1.3544774055480957, "learning_rate": 1.913767198245579e-05, "loss": 0.2441, "step": 1008 }, { "epoch": 0.47777448949393314, "grad_norm": 1.2691947221755981, "learning_rate": 1.9135593243952724e-05, "loss": 0.222, "step": 1009 }, { "epoch": 0.4782480023675644, "grad_norm": 1.9088751077651978, "learning_rate": 1.913351211612337e-05, "loss": 0.2371, "step": 1010 }, { "epoch": 0.4787215152411956, "grad_norm": 1.6651183366775513, "learning_rate": 1.9131428599512042e-05, "loss": 0.2516, "step": 1011 }, { "epoch": 0.4791950281148269, "grad_norm": 1.7003988027572632, "learning_rate": 1.9129342694663655e-05, "loss": 0.2557, "step": 1012 }, { "epoch": 0.4796685409884581, "grad_norm": 1.6492764949798584, "learning_rate": 1.9127254402123755e-05, "loss": 0.277, "step": 1013 }, { "epoch": 0.48014205386208936, "grad_norm": 1.4109059572219849, "learning_rate": 1.912516372243852e-05, "loss": 0.2222, "step": 1014 }, { "epoch": 0.48061556673572065, "grad_norm": 1.9010688066482544, "learning_rate": 1.9123070656154748e-05, "loss": 0.2614, "step": 1015 }, { "epoch": 0.4810890796093519, "grad_norm": 1.4977048635482788, "learning_rate": 1.9120975203819855e-05, "loss": 0.2262, "step": 1016 }, { "epoch": 0.4815625924829831, "grad_norm": 1.206935167312622, "learning_rate": 1.9118877365981887e-05, "loss": 0.2591, "step": 1017 }, { "epoch": 0.4820361053566144, "grad_norm": 1.7453712224960327, "learning_rate": 1.9116777143189517e-05, "loss": 0.2233, "step": 1018 }, { "epoch": 0.48250961823024563, "grad_norm": 1.1337833404541016, "learning_rate": 1.9114674535992038e-05, "loss": 0.2608, "step": 1019 }, { "epoch": 0.48298313110387686, "grad_norm": 2.339742422103882, "learning_rate": 1.9112569544939364e-05, "loss": 0.2628, "step": 1020 }, { "epoch": 0.48345664397750815, "grad_norm": 1.3370864391326904, "learning_rate": 1.9110462170582036e-05, "loss": 0.2664, "step": 1021 }, { "epoch": 0.4839301568511394, "grad_norm": 1.41420578956604, "learning_rate": 1.9108352413471215e-05, "loss": 0.2721, "step": 1022 }, { "epoch": 0.48440366972477067, "grad_norm": 1.787891149520874, "learning_rate": 1.9106240274158693e-05, "loss": 0.2511, "step": 1023 }, { "epoch": 0.4848771825984019, "grad_norm": 1.4634735584259033, "learning_rate": 1.9104125753196876e-05, "loss": 0.2523, "step": 1024 }, { "epoch": 0.48535069547203313, "grad_norm": 1.4561212062835693, "learning_rate": 1.9102008851138797e-05, "loss": 0.2913, "step": 1025 }, { "epoch": 0.4858242083456644, "grad_norm": 1.1067405939102173, "learning_rate": 1.9099889568538113e-05, "loss": 0.2476, "step": 1026 }, { "epoch": 0.48629772121929565, "grad_norm": 1.2803436517715454, "learning_rate": 1.90977679059491e-05, "loss": 0.2446, "step": 1027 }, { "epoch": 0.4867712340929269, "grad_norm": 1.3610336780548096, "learning_rate": 1.909564386392666e-05, "loss": 0.2365, "step": 1028 }, { "epoch": 0.4872447469665582, "grad_norm": 1.3339933156967163, "learning_rate": 1.909351744302631e-05, "loss": 0.2395, "step": 1029 }, { "epoch": 0.4877182598401894, "grad_norm": 2.5949342250823975, "learning_rate": 1.9091388643804202e-05, "loss": 0.2646, "step": 1030 }, { "epoch": 0.48819177271382064, "grad_norm": 1.3655803203582764, "learning_rate": 1.9089257466817102e-05, "loss": 0.2368, "step": 1031 }, { "epoch": 0.4886652855874519, "grad_norm": 1.8703043460845947, "learning_rate": 1.9087123912622397e-05, "loss": 0.301, "step": 1032 }, { "epoch": 0.48913879846108316, "grad_norm": 1.272451400756836, "learning_rate": 1.9084987981778097e-05, "loss": 0.2422, "step": 1033 }, { "epoch": 0.4896123113347144, "grad_norm": 1.2157567739486694, "learning_rate": 1.9082849674842835e-05, "loss": 0.2515, "step": 1034 }, { "epoch": 0.4900858242083457, "grad_norm": 1.857765555381775, "learning_rate": 1.9080708992375863e-05, "loss": 0.2427, "step": 1035 }, { "epoch": 0.4905593370819769, "grad_norm": 1.2299634218215942, "learning_rate": 1.907856593493706e-05, "loss": 0.2659, "step": 1036 }, { "epoch": 0.49103284995560814, "grad_norm": 1.3312522172927856, "learning_rate": 1.9076420503086915e-05, "loss": 0.2546, "step": 1037 }, { "epoch": 0.49150636282923943, "grad_norm": 2.030163049697876, "learning_rate": 1.9074272697386554e-05, "loss": 0.2592, "step": 1038 }, { "epoch": 0.49197987570287066, "grad_norm": 1.262022852897644, "learning_rate": 1.9072122518397706e-05, "loss": 0.2468, "step": 1039 }, { "epoch": 0.49245338857650195, "grad_norm": 1.5466980934143066, "learning_rate": 1.9069969966682738e-05, "loss": 0.2692, "step": 1040 }, { "epoch": 0.4929269014501332, "grad_norm": 1.136094093322754, "learning_rate": 1.9067815042804622e-05, "loss": 0.2811, "step": 1041 }, { "epoch": 0.4934004143237644, "grad_norm": 3.164823532104492, "learning_rate": 1.906565774732696e-05, "loss": 0.252, "step": 1042 }, { "epoch": 0.4938739271973957, "grad_norm": 1.530435562133789, "learning_rate": 1.9063498080813973e-05, "loss": 0.2797, "step": 1043 }, { "epoch": 0.49434744007102693, "grad_norm": 1.6616190671920776, "learning_rate": 1.9061336043830498e-05, "loss": 0.2717, "step": 1044 }, { "epoch": 0.49482095294465817, "grad_norm": 1.4890795946121216, "learning_rate": 1.9059171636942e-05, "loss": 0.2734, "step": 1045 }, { "epoch": 0.49529446581828945, "grad_norm": 2.622288703918457, "learning_rate": 1.905700486071455e-05, "loss": 0.2473, "step": 1046 }, { "epoch": 0.4957679786919207, "grad_norm": 1.4621379375457764, "learning_rate": 1.905483571571486e-05, "loss": 0.2511, "step": 1047 }, { "epoch": 0.4962414915655519, "grad_norm": 1.3425689935684204, "learning_rate": 1.905266420251024e-05, "loss": 0.2709, "step": 1048 }, { "epoch": 0.4967150044391832, "grad_norm": 1.391316294670105, "learning_rate": 1.905049032166863e-05, "loss": 0.2401, "step": 1049 }, { "epoch": 0.49718851731281444, "grad_norm": 1.435872197151184, "learning_rate": 1.9048314073758586e-05, "loss": 0.2355, "step": 1050 }, { "epoch": 0.49766203018644567, "grad_norm": 1.4960476160049438, "learning_rate": 1.9046135459349287e-05, "loss": 0.2619, "step": 1051 }, { "epoch": 0.49813554306007696, "grad_norm": 2.056912899017334, "learning_rate": 1.9043954479010532e-05, "loss": 0.258, "step": 1052 }, { "epoch": 0.4986090559337082, "grad_norm": 1.3434464931488037, "learning_rate": 1.9041771133312732e-05, "loss": 0.2469, "step": 1053 }, { "epoch": 0.4990825688073395, "grad_norm": 1.5715394020080566, "learning_rate": 1.9039585422826916e-05, "loss": 0.2293, "step": 1054 }, { "epoch": 0.4995560816809707, "grad_norm": 1.3329315185546875, "learning_rate": 1.903739734812474e-05, "loss": 0.2552, "step": 1055 }, { "epoch": 0.5000295945546019, "grad_norm": 1.1879801750183105, "learning_rate": 1.9035206909778475e-05, "loss": 0.2491, "step": 1056 }, { "epoch": 0.5005031074282332, "grad_norm": 1.3756626844406128, "learning_rate": 1.9033014108361003e-05, "loss": 0.2371, "step": 1057 }, { "epoch": 0.5009766203018644, "grad_norm": 1.3430827856063843, "learning_rate": 1.9030818944445836e-05, "loss": 0.2535, "step": 1058 }, { "epoch": 0.5014501331754957, "grad_norm": 1.2272851467132568, "learning_rate": 1.9028621418607095e-05, "loss": 0.2473, "step": 1059 }, { "epoch": 0.501923646049127, "grad_norm": 1.979054570198059, "learning_rate": 1.9026421531419522e-05, "loss": 0.2675, "step": 1060 }, { "epoch": 0.5023971589227582, "grad_norm": 1.8961822986602783, "learning_rate": 1.902421928345848e-05, "loss": 0.2752, "step": 1061 }, { "epoch": 0.5028706717963894, "grad_norm": 1.5014052391052246, "learning_rate": 1.902201467529994e-05, "loss": 0.2348, "step": 1062 }, { "epoch": 0.5033441846700207, "grad_norm": 1.413966178894043, "learning_rate": 1.90198077075205e-05, "loss": 0.2568, "step": 1063 }, { "epoch": 0.503817697543652, "grad_norm": 1.097167730331421, "learning_rate": 1.901759838069737e-05, "loss": 0.2602, "step": 1064 }, { "epoch": 0.5042912104172832, "grad_norm": 2.0718507766723633, "learning_rate": 1.9015386695408377e-05, "loss": 0.2524, "step": 1065 }, { "epoch": 0.5047647232909145, "grad_norm": 1.8669925928115845, "learning_rate": 1.9013172652231967e-05, "loss": 0.2333, "step": 1066 }, { "epoch": 0.5052382361645458, "grad_norm": 1.3957126140594482, "learning_rate": 1.9010956251747202e-05, "loss": 0.266, "step": 1067 }, { "epoch": 0.505711749038177, "grad_norm": 2.0462698936462402, "learning_rate": 1.9008737494533757e-05, "loss": 0.2846, "step": 1068 }, { "epoch": 0.5061852619118082, "grad_norm": 1.195210576057434, "learning_rate": 1.9006516381171933e-05, "loss": 0.2851, "step": 1069 }, { "epoch": 0.5066587747854395, "grad_norm": 1.3141087293624878, "learning_rate": 1.9004292912242634e-05, "loss": 0.2393, "step": 1070 }, { "epoch": 0.5071322876590707, "grad_norm": 1.6447049379348755, "learning_rate": 1.900206708832739e-05, "loss": 0.2711, "step": 1071 }, { "epoch": 0.507605800532702, "grad_norm": 2.366652727127075, "learning_rate": 1.8999838910008347e-05, "loss": 0.2233, "step": 1072 }, { "epoch": 0.5080793134063333, "grad_norm": 1.6027371883392334, "learning_rate": 1.8997608377868256e-05, "loss": 0.2689, "step": 1073 }, { "epoch": 0.5085528262799645, "grad_norm": 1.4096899032592773, "learning_rate": 1.8995375492490495e-05, "loss": 0.2539, "step": 1074 }, { "epoch": 0.5090263391535957, "grad_norm": 1.179916501045227, "learning_rate": 1.8993140254459057e-05, "loss": 0.2667, "step": 1075 }, { "epoch": 0.509499852027227, "grad_norm": 1.141992449760437, "learning_rate": 1.8990902664358542e-05, "loss": 0.2461, "step": 1076 }, { "epoch": 0.5099733649008582, "grad_norm": 1.3046503067016602, "learning_rate": 1.8988662722774172e-05, "loss": 0.2607, "step": 1077 }, { "epoch": 0.5104468777744895, "grad_norm": 1.1678529977798462, "learning_rate": 1.898642043029178e-05, "loss": 0.2352, "step": 1078 }, { "epoch": 0.5109203906481208, "grad_norm": 1.4803732633590698, "learning_rate": 1.8984175787497822e-05, "loss": 0.2612, "step": 1079 }, { "epoch": 0.511393903521752, "grad_norm": 1.7487013339996338, "learning_rate": 1.898192879497936e-05, "loss": 0.2348, "step": 1080 }, { "epoch": 0.5118674163953832, "grad_norm": 1.448327898979187, "learning_rate": 1.8979679453324068e-05, "loss": 0.261, "step": 1081 }, { "epoch": 0.5123409292690145, "grad_norm": 1.1041336059570312, "learning_rate": 1.8977427763120242e-05, "loss": 0.2754, "step": 1082 }, { "epoch": 0.5128144421426457, "grad_norm": 1.948614239692688, "learning_rate": 1.8975173724956794e-05, "loss": 0.2594, "step": 1083 }, { "epoch": 0.513287955016277, "grad_norm": 1.1922783851623535, "learning_rate": 1.897291733942324e-05, "loss": 0.2457, "step": 1084 }, { "epoch": 0.5137614678899083, "grad_norm": 1.4861289262771606, "learning_rate": 1.8970658607109723e-05, "loss": 0.2459, "step": 1085 }, { "epoch": 0.5142349807635395, "grad_norm": 1.497668981552124, "learning_rate": 1.896839752860699e-05, "loss": 0.2828, "step": 1086 }, { "epoch": 0.5147084936371707, "grad_norm": 1.194231390953064, "learning_rate": 1.89661341045064e-05, "loss": 0.2524, "step": 1087 }, { "epoch": 0.515182006510802, "grad_norm": 1.5421984195709229, "learning_rate": 1.8963868335399933e-05, "loss": 0.2355, "step": 1088 }, { "epoch": 0.5156555193844332, "grad_norm": 1.1698225736618042, "learning_rate": 1.8961600221880177e-05, "loss": 0.253, "step": 1089 }, { "epoch": 0.5161290322580645, "grad_norm": 1.2847546339035034, "learning_rate": 1.8959329764540343e-05, "loss": 0.2463, "step": 1090 }, { "epoch": 0.5166025451316958, "grad_norm": 1.8415346145629883, "learning_rate": 1.895705696397424e-05, "loss": 0.2602, "step": 1091 }, { "epoch": 0.517076058005327, "grad_norm": 1.1953470706939697, "learning_rate": 1.8954781820776293e-05, "loss": 0.2544, "step": 1092 }, { "epoch": 0.5175495708789583, "grad_norm": 1.2167423963546753, "learning_rate": 1.8952504335541554e-05, "loss": 0.2353, "step": 1093 }, { "epoch": 0.5180230837525895, "grad_norm": 1.259513258934021, "learning_rate": 1.8950224508865667e-05, "loss": 0.2395, "step": 1094 }, { "epoch": 0.5184965966262208, "grad_norm": 1.6243139505386353, "learning_rate": 1.894794234134491e-05, "loss": 0.2362, "step": 1095 }, { "epoch": 0.518970109499852, "grad_norm": 1.0280331373214722, "learning_rate": 1.8945657833576155e-05, "loss": 0.2476, "step": 1096 }, { "epoch": 0.5194436223734833, "grad_norm": 1.332411289215088, "learning_rate": 1.894337098615689e-05, "loss": 0.2551, "step": 1097 }, { "epoch": 0.5199171352471146, "grad_norm": 1.2964811325073242, "learning_rate": 1.8941081799685227e-05, "loss": 0.2598, "step": 1098 }, { "epoch": 0.5203906481207458, "grad_norm": 1.6705334186553955, "learning_rate": 1.893879027475987e-05, "loss": 0.2639, "step": 1099 }, { "epoch": 0.520864160994377, "grad_norm": 1.3009783029556274, "learning_rate": 1.8936496411980156e-05, "loss": 0.2803, "step": 1100 }, { "epoch": 0.5213376738680083, "grad_norm": 1.179648995399475, "learning_rate": 1.8934200211946013e-05, "loss": 0.2561, "step": 1101 }, { "epoch": 0.5218111867416395, "grad_norm": 1.4020172357559204, "learning_rate": 1.8931901675257996e-05, "loss": 0.2533, "step": 1102 }, { "epoch": 0.5222846996152708, "grad_norm": 1.20439875125885, "learning_rate": 1.8929600802517262e-05, "loss": 0.2684, "step": 1103 }, { "epoch": 0.5227582124889021, "grad_norm": 1.7954668998718262, "learning_rate": 1.8927297594325583e-05, "loss": 0.2242, "step": 1104 }, { "epoch": 0.5232317253625333, "grad_norm": 1.4614356756210327, "learning_rate": 1.8924992051285345e-05, "loss": 0.2541, "step": 1105 }, { "epoch": 0.5237052382361645, "grad_norm": 1.306846022605896, "learning_rate": 1.8922684173999538e-05, "loss": 0.2382, "step": 1106 }, { "epoch": 0.5241787511097958, "grad_norm": 1.7092262506484985, "learning_rate": 1.8920373963071757e-05, "loss": 0.2512, "step": 1107 }, { "epoch": 0.524652263983427, "grad_norm": 1.5031977891921997, "learning_rate": 1.8918061419106228e-05, "loss": 0.2581, "step": 1108 }, { "epoch": 0.5251257768570583, "grad_norm": 1.5629005432128906, "learning_rate": 1.8915746542707767e-05, "loss": 0.2439, "step": 1109 }, { "epoch": 0.5255992897306896, "grad_norm": 1.4823302030563354, "learning_rate": 1.891342933448181e-05, "loss": 0.2734, "step": 1110 }, { "epoch": 0.5260728026043208, "grad_norm": 1.880656123161316, "learning_rate": 1.89111097950344e-05, "loss": 0.2425, "step": 1111 }, { "epoch": 0.526546315477952, "grad_norm": 1.1813774108886719, "learning_rate": 1.8908787924972188e-05, "loss": 0.2375, "step": 1112 }, { "epoch": 0.5270198283515833, "grad_norm": 1.9654037952423096, "learning_rate": 1.8906463724902443e-05, "loss": 0.2817, "step": 1113 }, { "epoch": 0.5274933412252145, "grad_norm": 1.7562179565429688, "learning_rate": 1.890413719543303e-05, "loss": 0.256, "step": 1114 }, { "epoch": 0.5279668540988458, "grad_norm": 1.140051245689392, "learning_rate": 1.8901808337172433e-05, "loss": 0.2455, "step": 1115 }, { "epoch": 0.5284403669724771, "grad_norm": 1.8479341268539429, "learning_rate": 1.8899477150729745e-05, "loss": 0.2571, "step": 1116 }, { "epoch": 0.5289138798461083, "grad_norm": 1.7184622287750244, "learning_rate": 1.889714363671466e-05, "loss": 0.2463, "step": 1117 }, { "epoch": 0.5293873927197396, "grad_norm": 1.3186061382293701, "learning_rate": 1.8894807795737492e-05, "loss": 0.2657, "step": 1118 }, { "epoch": 0.5298609055933708, "grad_norm": 1.8647300004959106, "learning_rate": 1.8892469628409157e-05, "loss": 0.2505, "step": 1119 }, { "epoch": 0.530334418467002, "grad_norm": 2.4725730419158936, "learning_rate": 1.889012913534117e-05, "loss": 0.2475, "step": 1120 }, { "epoch": 0.5308079313406333, "grad_norm": 1.2804913520812988, "learning_rate": 1.8887786317145677e-05, "loss": 0.2607, "step": 1121 }, { "epoch": 0.5312814442142646, "grad_norm": 1.6455892324447632, "learning_rate": 1.8885441174435417e-05, "loss": 0.2273, "step": 1122 }, { "epoch": 0.5317549570878958, "grad_norm": 1.4230307340621948, "learning_rate": 1.8883093707823733e-05, "loss": 0.2571, "step": 1123 }, { "epoch": 0.5322284699615271, "grad_norm": 2.230576992034912, "learning_rate": 1.8880743917924585e-05, "loss": 0.2418, "step": 1124 }, { "epoch": 0.5327019828351583, "grad_norm": 1.6521943807601929, "learning_rate": 1.8878391805352544e-05, "loss": 0.2537, "step": 1125 }, { "epoch": 0.5331754957087896, "grad_norm": 1.336004376411438, "learning_rate": 1.887603737072278e-05, "loss": 0.243, "step": 1126 }, { "epoch": 0.5336490085824208, "grad_norm": 1.650602102279663, "learning_rate": 1.8873680614651065e-05, "loss": 0.2544, "step": 1127 }, { "epoch": 0.5341225214560521, "grad_norm": 1.5167171955108643, "learning_rate": 1.8871321537753792e-05, "loss": 0.2596, "step": 1128 }, { "epoch": 0.5345960343296834, "grad_norm": 1.3937066793441772, "learning_rate": 1.8868960140647953e-05, "loss": 0.2398, "step": 1129 }, { "epoch": 0.5350695472033146, "grad_norm": 1.1855547428131104, "learning_rate": 1.886659642395115e-05, "loss": 0.2312, "step": 1130 }, { "epoch": 0.5355430600769459, "grad_norm": 1.1568256616592407, "learning_rate": 1.8864230388281594e-05, "loss": 0.2481, "step": 1131 }, { "epoch": 0.5360165729505771, "grad_norm": 1.2066378593444824, "learning_rate": 1.886186203425809e-05, "loss": 0.2306, "step": 1132 }, { "epoch": 0.5364900858242083, "grad_norm": 1.1401264667510986, "learning_rate": 1.8859491362500066e-05, "loss": 0.2538, "step": 1133 }, { "epoch": 0.5369635986978396, "grad_norm": 1.3635470867156982, "learning_rate": 1.885711837362754e-05, "loss": 0.247, "step": 1134 }, { "epoch": 0.5374371115714709, "grad_norm": 1.619167685508728, "learning_rate": 1.8854743068261154e-05, "loss": 0.2123, "step": 1135 }, { "epoch": 0.5379106244451021, "grad_norm": 1.432413101196289, "learning_rate": 1.885236544702214e-05, "loss": 0.2377, "step": 1136 }, { "epoch": 0.5383841373187334, "grad_norm": 2.2878897190093994, "learning_rate": 1.8849985510532348e-05, "loss": 0.2466, "step": 1137 }, { "epoch": 0.5388576501923646, "grad_norm": 1.848306655883789, "learning_rate": 1.8847603259414215e-05, "loss": 0.2458, "step": 1138 }, { "epoch": 0.5393311630659958, "grad_norm": 1.1554219722747803, "learning_rate": 1.884521869429081e-05, "loss": 0.2542, "step": 1139 }, { "epoch": 0.5398046759396271, "grad_norm": 1.239094614982605, "learning_rate": 1.8842831815785783e-05, "loss": 0.2569, "step": 1140 }, { "epoch": 0.5402781888132584, "grad_norm": 1.240552544593811, "learning_rate": 1.8840442624523408e-05, "loss": 0.2391, "step": 1141 }, { "epoch": 0.5407517016868896, "grad_norm": 1.8577840328216553, "learning_rate": 1.8838051121128545e-05, "loss": 0.224, "step": 1142 }, { "epoch": 0.5412252145605209, "grad_norm": 1.2552670240402222, "learning_rate": 1.883565730622668e-05, "loss": 0.2271, "step": 1143 }, { "epoch": 0.5416987274341521, "grad_norm": 1.31777822971344, "learning_rate": 1.8833261180443877e-05, "loss": 0.2579, "step": 1144 }, { "epoch": 0.5421722403077833, "grad_norm": 1.180493950843811, "learning_rate": 1.8830862744406834e-05, "loss": 0.2437, "step": 1145 }, { "epoch": 0.5426457531814146, "grad_norm": 1.399220585823059, "learning_rate": 1.8828461998742827e-05, "loss": 0.2548, "step": 1146 }, { "epoch": 0.5431192660550459, "grad_norm": 1.4404245615005493, "learning_rate": 1.8826058944079763e-05, "loss": 0.2314, "step": 1147 }, { "epoch": 0.5435927789286771, "grad_norm": 1.5792372226715088, "learning_rate": 1.8823653581046122e-05, "loss": 0.2487, "step": 1148 }, { "epoch": 0.5440662918023084, "grad_norm": 1.5916982889175415, "learning_rate": 1.8821245910271013e-05, "loss": 0.2647, "step": 1149 }, { "epoch": 0.5445398046759397, "grad_norm": 1.326357364654541, "learning_rate": 1.8818835932384133e-05, "loss": 0.2418, "step": 1150 }, { "epoch": 0.5450133175495708, "grad_norm": 1.949175238609314, "learning_rate": 1.8816423648015795e-05, "loss": 0.252, "step": 1151 }, { "epoch": 0.5454868304232021, "grad_norm": 1.1373693943023682, "learning_rate": 1.88140090577969e-05, "loss": 0.2407, "step": 1152 }, { "epoch": 0.5459603432968334, "grad_norm": 1.1092511415481567, "learning_rate": 1.8811592162358977e-05, "loss": 0.2749, "step": 1153 }, { "epoch": 0.5464338561704646, "grad_norm": 2.095144510269165, "learning_rate": 1.8809172962334124e-05, "loss": 0.2357, "step": 1154 }, { "epoch": 0.5469073690440959, "grad_norm": 2.180050849914551, "learning_rate": 1.8806751458355064e-05, "loss": 0.2326, "step": 1155 }, { "epoch": 0.5473808819177272, "grad_norm": 1.321502923965454, "learning_rate": 1.8804327651055123e-05, "loss": 0.2304, "step": 1156 }, { "epoch": 0.5478543947913584, "grad_norm": 1.3724721670150757, "learning_rate": 1.8801901541068224e-05, "loss": 0.2372, "step": 1157 }, { "epoch": 0.5483279076649896, "grad_norm": 1.1256623268127441, "learning_rate": 1.8799473129028886e-05, "loss": 0.2449, "step": 1158 }, { "epoch": 0.5488014205386209, "grad_norm": 1.4590364694595337, "learning_rate": 1.8797042415572244e-05, "loss": 0.2559, "step": 1159 }, { "epoch": 0.5492749334122522, "grad_norm": 1.8025602102279663, "learning_rate": 1.879460940133402e-05, "loss": 0.2778, "step": 1160 }, { "epoch": 0.5497484462858834, "grad_norm": 1.1516093015670776, "learning_rate": 1.879217408695056e-05, "loss": 0.2249, "step": 1161 }, { "epoch": 0.5502219591595147, "grad_norm": 1.4225125312805176, "learning_rate": 1.8789736473058776e-05, "loss": 0.256, "step": 1162 }, { "epoch": 0.550695472033146, "grad_norm": 1.2126541137695312, "learning_rate": 1.8787296560296223e-05, "loss": 0.2269, "step": 1163 }, { "epoch": 0.5511689849067771, "grad_norm": 1.0485234260559082, "learning_rate": 1.8784854349301023e-05, "loss": 0.231, "step": 1164 }, { "epoch": 0.5516424977804084, "grad_norm": 2.125662326812744, "learning_rate": 1.8782409840711916e-05, "loss": 0.2419, "step": 1165 }, { "epoch": 0.5521160106540397, "grad_norm": 2.186833620071411, "learning_rate": 1.877996303516824e-05, "loss": 0.2685, "step": 1166 }, { "epoch": 0.5525895235276709, "grad_norm": 1.3703869581222534, "learning_rate": 1.8777513933309938e-05, "loss": 0.2663, "step": 1167 }, { "epoch": 0.5530630364013022, "grad_norm": 1.2371742725372314, "learning_rate": 1.877506253577754e-05, "loss": 0.2626, "step": 1168 }, { "epoch": 0.5535365492749335, "grad_norm": 1.1112666130065918, "learning_rate": 1.877260884321219e-05, "loss": 0.2318, "step": 1169 }, { "epoch": 0.5540100621485646, "grad_norm": 1.446602463722229, "learning_rate": 1.8770152856255636e-05, "loss": 0.2662, "step": 1170 }, { "epoch": 0.5544835750221959, "grad_norm": 1.32243013381958, "learning_rate": 1.87676945755502e-05, "loss": 0.2765, "step": 1171 }, { "epoch": 0.5549570878958272, "grad_norm": 1.6575604677200317, "learning_rate": 1.8765234001738838e-05, "loss": 0.2783, "step": 1172 }, { "epoch": 0.5554306007694584, "grad_norm": 1.1545252799987793, "learning_rate": 1.8762771135465078e-05, "loss": 0.2414, "step": 1173 }, { "epoch": 0.5559041136430897, "grad_norm": 1.1764464378356934, "learning_rate": 1.8760305977373067e-05, "loss": 0.2403, "step": 1174 }, { "epoch": 0.556377626516721, "grad_norm": 1.4649169445037842, "learning_rate": 1.875783852810754e-05, "loss": 0.2466, "step": 1175 }, { "epoch": 0.5568511393903521, "grad_norm": 1.1785379648208618, "learning_rate": 1.8755368788313834e-05, "loss": 0.2363, "step": 1176 }, { "epoch": 0.5573246522639834, "grad_norm": 1.8420625925064087, "learning_rate": 1.8752896758637884e-05, "loss": 0.2261, "step": 1177 }, { "epoch": 0.5577981651376147, "grad_norm": 1.241428256034851, "learning_rate": 1.8750422439726227e-05, "loss": 0.2642, "step": 1178 }, { "epoch": 0.5582716780112459, "grad_norm": 2.547609567642212, "learning_rate": 1.8747945832226e-05, "loss": 0.2315, "step": 1179 }, { "epoch": 0.5587451908848772, "grad_norm": 1.8056316375732422, "learning_rate": 1.874546693678493e-05, "loss": 0.2672, "step": 1180 }, { "epoch": 0.5592187037585085, "grad_norm": 1.3478612899780273, "learning_rate": 1.874298575405135e-05, "loss": 0.217, "step": 1181 }, { "epoch": 0.5596922166321396, "grad_norm": 1.2503442764282227, "learning_rate": 1.8740502284674197e-05, "loss": 0.2321, "step": 1182 }, { "epoch": 0.5601657295057709, "grad_norm": 2.2722413539886475, "learning_rate": 1.873801652930299e-05, "loss": 0.2657, "step": 1183 }, { "epoch": 0.5606392423794022, "grad_norm": 1.1966623067855835, "learning_rate": 1.873552848858786e-05, "loss": 0.2579, "step": 1184 }, { "epoch": 0.5611127552530334, "grad_norm": 1.2810187339782715, "learning_rate": 1.8733038163179524e-05, "loss": 0.2727, "step": 1185 }, { "epoch": 0.5615862681266647, "grad_norm": 1.2064369916915894, "learning_rate": 1.8730545553729306e-05, "loss": 0.2572, "step": 1186 }, { "epoch": 0.562059781000296, "grad_norm": 1.379677653312683, "learning_rate": 1.8728050660889123e-05, "loss": 0.2622, "step": 1187 }, { "epoch": 0.5625332938739271, "grad_norm": 0.9348089098930359, "learning_rate": 1.8725553485311492e-05, "loss": 0.2304, "step": 1188 }, { "epoch": 0.5630068067475584, "grad_norm": 2.2606005668640137, "learning_rate": 1.872305402764952e-05, "loss": 0.2673, "step": 1189 }, { "epoch": 0.5634803196211897, "grad_norm": 1.8432506322860718, "learning_rate": 1.8720552288556926e-05, "loss": 0.2779, "step": 1190 }, { "epoch": 0.563953832494821, "grad_norm": 1.0819976329803467, "learning_rate": 1.8718048268688006e-05, "loss": 0.2349, "step": 1191 }, { "epoch": 0.5644273453684522, "grad_norm": 1.164947748184204, "learning_rate": 1.8715541968697666e-05, "loss": 0.2575, "step": 1192 }, { "epoch": 0.5649008582420835, "grad_norm": 1.1525195837020874, "learning_rate": 1.8713033389241406e-05, "loss": 0.2653, "step": 1193 }, { "epoch": 0.5653743711157148, "grad_norm": 1.0925800800323486, "learning_rate": 1.8710522530975315e-05, "loss": 0.2368, "step": 1194 }, { "epoch": 0.5658478839893459, "grad_norm": 1.066985845565796, "learning_rate": 1.870800939455609e-05, "loss": 0.2427, "step": 1195 }, { "epoch": 0.5663213968629772, "grad_norm": 1.2804023027420044, "learning_rate": 1.8705493980641017e-05, "loss": 0.2649, "step": 1196 }, { "epoch": 0.5667949097366085, "grad_norm": 1.0042173862457275, "learning_rate": 1.8702976289887978e-05, "loss": 0.2408, "step": 1197 }, { "epoch": 0.5672684226102397, "grad_norm": 1.1509785652160645, "learning_rate": 1.8700456322955446e-05, "loss": 0.2527, "step": 1198 }, { "epoch": 0.567741935483871, "grad_norm": 1.66000235080719, "learning_rate": 1.8697934080502498e-05, "loss": 0.2614, "step": 1199 }, { "epoch": 0.5682154483575023, "grad_norm": 1.2652623653411865, "learning_rate": 1.8695409563188803e-05, "loss": 0.2366, "step": 1200 }, { "epoch": 0.5686889612311334, "grad_norm": 1.2608882188796997, "learning_rate": 1.8692882771674624e-05, "loss": 0.2625, "step": 1201 }, { "epoch": 0.5691624741047647, "grad_norm": 1.0795193910598755, "learning_rate": 1.8690353706620815e-05, "loss": 0.2497, "step": 1202 }, { "epoch": 0.569635986978396, "grad_norm": 1.6654939651489258, "learning_rate": 1.868782236868883e-05, "loss": 0.2668, "step": 1203 }, { "epoch": 0.5701094998520272, "grad_norm": 1.3972606658935547, "learning_rate": 1.8685288758540724e-05, "loss": 0.2451, "step": 1204 }, { "epoch": 0.5705830127256585, "grad_norm": 1.5168894529342651, "learning_rate": 1.8682752876839127e-05, "loss": 0.2625, "step": 1205 }, { "epoch": 0.5710565255992898, "grad_norm": 1.420271873474121, "learning_rate": 1.868021472424728e-05, "loss": 0.2467, "step": 1206 }, { "epoch": 0.5715300384729209, "grad_norm": 1.3395509719848633, "learning_rate": 1.8677674301429012e-05, "loss": 0.2397, "step": 1207 }, { "epoch": 0.5720035513465522, "grad_norm": 1.5764024257659912, "learning_rate": 1.8675131609048742e-05, "loss": 0.2556, "step": 1208 }, { "epoch": 0.5724770642201835, "grad_norm": 1.2619208097457886, "learning_rate": 1.8672586647771496e-05, "loss": 0.2772, "step": 1209 }, { "epoch": 0.5729505770938147, "grad_norm": 1.2036354541778564, "learning_rate": 1.8670039418262873e-05, "loss": 0.2276, "step": 1210 }, { "epoch": 0.573424089967446, "grad_norm": 1.003892183303833, "learning_rate": 1.8667489921189083e-05, "loss": 0.2257, "step": 1211 }, { "epoch": 0.5738976028410773, "grad_norm": 1.2825255393981934, "learning_rate": 1.8664938157216923e-05, "loss": 0.26, "step": 1212 }, { "epoch": 0.5743711157147084, "grad_norm": 1.5353515148162842, "learning_rate": 1.866238412701378e-05, "loss": 0.2344, "step": 1213 }, { "epoch": 0.5748446285883397, "grad_norm": 1.1604024171829224, "learning_rate": 1.8659827831247632e-05, "loss": 0.2378, "step": 1214 }, { "epoch": 0.575318141461971, "grad_norm": 1.5623371601104736, "learning_rate": 1.865726927058706e-05, "loss": 0.2597, "step": 1215 }, { "epoch": 0.5757916543356022, "grad_norm": 1.4064077138900757, "learning_rate": 1.8654708445701227e-05, "loss": 0.2308, "step": 1216 }, { "epoch": 0.5762651672092335, "grad_norm": 1.2463563680648804, "learning_rate": 1.8652145357259897e-05, "loss": 0.2289, "step": 1217 }, { "epoch": 0.5767386800828648, "grad_norm": 1.6035746335983276, "learning_rate": 1.8649580005933415e-05, "loss": 0.2724, "step": 1218 }, { "epoch": 0.577212192956496, "grad_norm": 1.4112597703933716, "learning_rate": 1.8647012392392728e-05, "loss": 0.255, "step": 1219 }, { "epoch": 0.5776857058301272, "grad_norm": 1.4282629489898682, "learning_rate": 1.8644442517309366e-05, "loss": 0.2478, "step": 1220 }, { "epoch": 0.5781592187037585, "grad_norm": 1.234606385231018, "learning_rate": 1.8641870381355463e-05, "loss": 0.2597, "step": 1221 }, { "epoch": 0.5786327315773898, "grad_norm": 1.283451795578003, "learning_rate": 1.8639295985203726e-05, "loss": 0.2724, "step": 1222 }, { "epoch": 0.579106244451021, "grad_norm": 1.5409351587295532, "learning_rate": 1.8636719329527474e-05, "loss": 0.267, "step": 1223 }, { "epoch": 0.5795797573246523, "grad_norm": 1.1971688270568848, "learning_rate": 1.8634140415000595e-05, "loss": 0.2635, "step": 1224 }, { "epoch": 0.5800532701982836, "grad_norm": 1.3609153032302856, "learning_rate": 1.863155924229759e-05, "loss": 0.2479, "step": 1225 }, { "epoch": 0.5805267830719147, "grad_norm": 1.1887803077697754, "learning_rate": 1.8628975812093535e-05, "loss": 0.2453, "step": 1226 }, { "epoch": 0.581000295945546, "grad_norm": 2.107870101928711, "learning_rate": 1.86263901250641e-05, "loss": 0.249, "step": 1227 }, { "epoch": 0.5814738088191773, "grad_norm": 1.9113880395889282, "learning_rate": 1.8623802181885548e-05, "loss": 0.2569, "step": 1228 }, { "epoch": 0.5819473216928085, "grad_norm": 1.2835649251937866, "learning_rate": 1.862121198323473e-05, "loss": 0.2462, "step": 1229 }, { "epoch": 0.5824208345664398, "grad_norm": 1.388893723487854, "learning_rate": 1.861861952978909e-05, "loss": 0.2582, "step": 1230 }, { "epoch": 0.5828943474400711, "grad_norm": 1.21442449092865, "learning_rate": 1.861602482222666e-05, "loss": 0.2257, "step": 1231 }, { "epoch": 0.5833678603137022, "grad_norm": 1.585196614265442, "learning_rate": 1.8613427861226056e-05, "loss": 0.2475, "step": 1232 }, { "epoch": 0.5838413731873335, "grad_norm": 2.5939364433288574, "learning_rate": 1.8610828647466487e-05, "loss": 0.2519, "step": 1233 }, { "epoch": 0.5843148860609648, "grad_norm": 1.4720892906188965, "learning_rate": 1.8608227181627757e-05, "loss": 0.2598, "step": 1234 }, { "epoch": 0.584788398934596, "grad_norm": 1.8123221397399902, "learning_rate": 1.860562346439025e-05, "loss": 0.2372, "step": 1235 }, { "epoch": 0.5852619118082273, "grad_norm": 1.4891722202301025, "learning_rate": 1.8603017496434953e-05, "loss": 0.2353, "step": 1236 }, { "epoch": 0.5857354246818586, "grad_norm": 1.0998507738113403, "learning_rate": 1.860040927844342e-05, "loss": 0.2437, "step": 1237 }, { "epoch": 0.5862089375554898, "grad_norm": 1.521974802017212, "learning_rate": 1.859779881109781e-05, "loss": 0.2569, "step": 1238 }, { "epoch": 0.586682450429121, "grad_norm": 1.6860134601593018, "learning_rate": 1.8595186095080864e-05, "loss": 0.2483, "step": 1239 }, { "epoch": 0.5871559633027523, "grad_norm": 1.1612238883972168, "learning_rate": 1.8592571131075915e-05, "loss": 0.2645, "step": 1240 }, { "epoch": 0.5876294761763835, "grad_norm": 1.3629693984985352, "learning_rate": 1.8589953919766882e-05, "loss": 0.2492, "step": 1241 }, { "epoch": 0.5881029890500148, "grad_norm": 1.175735592842102, "learning_rate": 1.8587334461838267e-05, "loss": 0.2256, "step": 1242 }, { "epoch": 0.5885765019236461, "grad_norm": 2.3192355632781982, "learning_rate": 1.8584712757975173e-05, "loss": 0.2593, "step": 1243 }, { "epoch": 0.5890500147972773, "grad_norm": 1.1697986125946045, "learning_rate": 1.858208880886327e-05, "loss": 0.2484, "step": 1244 }, { "epoch": 0.5895235276709085, "grad_norm": 1.8343950510025024, "learning_rate": 1.8579462615188832e-05, "loss": 0.2652, "step": 1245 }, { "epoch": 0.5899970405445398, "grad_norm": 1.3866143226623535, "learning_rate": 1.8576834177638717e-05, "loss": 0.2543, "step": 1246 }, { "epoch": 0.590470553418171, "grad_norm": 1.11716628074646, "learning_rate": 1.8574203496900366e-05, "loss": 0.2338, "step": 1247 }, { "epoch": 0.5909440662918023, "grad_norm": 1.593776822090149, "learning_rate": 1.8571570573661803e-05, "loss": 0.2547, "step": 1248 }, { "epoch": 0.5914175791654336, "grad_norm": 2.5242745876312256, "learning_rate": 1.8568935408611652e-05, "loss": 0.2697, "step": 1249 }, { "epoch": 0.5918910920390648, "grad_norm": 1.9075709581375122, "learning_rate": 1.856629800243911e-05, "loss": 0.2924, "step": 1250 }, { "epoch": 0.592364604912696, "grad_norm": 1.8078994750976562, "learning_rate": 1.8563658355833965e-05, "loss": 0.2521, "step": 1251 }, { "epoch": 0.5928381177863273, "grad_norm": 1.6747920513153076, "learning_rate": 1.856101646948659e-05, "loss": 0.2511, "step": 1252 }, { "epoch": 0.5933116306599586, "grad_norm": 1.425503134727478, "learning_rate": 1.8558372344087953e-05, "loss": 0.2487, "step": 1253 }, { "epoch": 0.5937851435335898, "grad_norm": 2.0698137283325195, "learning_rate": 1.8555725980329588e-05, "loss": 0.2581, "step": 1254 }, { "epoch": 0.5942586564072211, "grad_norm": 2.361027479171753, "learning_rate": 1.8553077378903632e-05, "loss": 0.2371, "step": 1255 }, { "epoch": 0.5947321692808524, "grad_norm": 2.6887784004211426, "learning_rate": 1.8550426540502802e-05, "loss": 0.2425, "step": 1256 }, { "epoch": 0.5952056821544836, "grad_norm": 1.9129482507705688, "learning_rate": 1.8547773465820397e-05, "loss": 0.2334, "step": 1257 }, { "epoch": 0.5956791950281148, "grad_norm": 1.7753819227218628, "learning_rate": 1.8545118155550305e-05, "loss": 0.2376, "step": 1258 }, { "epoch": 0.5961527079017461, "grad_norm": 1.9076635837554932, "learning_rate": 1.8542460610386993e-05, "loss": 0.2295, "step": 1259 }, { "epoch": 0.5966262207753773, "grad_norm": 2.463299512863159, "learning_rate": 1.853980083102552e-05, "loss": 0.2538, "step": 1260 }, { "epoch": 0.5970997336490086, "grad_norm": 1.5017327070236206, "learning_rate": 1.8537138818161527e-05, "loss": 0.2312, "step": 1261 }, { "epoch": 0.5975732465226399, "grad_norm": 1.36251962184906, "learning_rate": 1.8534474572491235e-05, "loss": 0.2536, "step": 1262 }, { "epoch": 0.598046759396271, "grad_norm": 1.2385445833206177, "learning_rate": 1.853180809471145e-05, "loss": 0.2439, "step": 1263 }, { "epoch": 0.5985202722699023, "grad_norm": 1.0537524223327637, "learning_rate": 1.8529139385519567e-05, "loss": 0.2483, "step": 1264 }, { "epoch": 0.5989937851435336, "grad_norm": 3.3753559589385986, "learning_rate": 1.8526468445613556e-05, "loss": 0.2376, "step": 1265 }, { "epoch": 0.5994672980171648, "grad_norm": 2.3799593448638916, "learning_rate": 1.8523795275691986e-05, "loss": 0.2361, "step": 1266 }, { "epoch": 0.5999408108907961, "grad_norm": 1.300310492515564, "learning_rate": 1.852111987645399e-05, "loss": 0.2466, "step": 1267 }, { "epoch": 0.6004143237644274, "grad_norm": 1.4112299680709839, "learning_rate": 1.851844224859929e-05, "loss": 0.2418, "step": 1268 }, { "epoch": 0.6008878366380586, "grad_norm": 1.9448109865188599, "learning_rate": 1.8515762392828205e-05, "loss": 0.2492, "step": 1269 }, { "epoch": 0.6013613495116898, "grad_norm": 1.0596145391464233, "learning_rate": 1.8513080309841616e-05, "loss": 0.2385, "step": 1270 }, { "epoch": 0.6018348623853211, "grad_norm": 1.4580031633377075, "learning_rate": 1.8510396000341e-05, "loss": 0.2615, "step": 1271 }, { "epoch": 0.6023083752589523, "grad_norm": 1.4013538360595703, "learning_rate": 1.850770946502841e-05, "loss": 0.2271, "step": 1272 }, { "epoch": 0.6027818881325836, "grad_norm": 1.3812453746795654, "learning_rate": 1.8505020704606486e-05, "loss": 0.2506, "step": 1273 }, { "epoch": 0.6032554010062149, "grad_norm": 1.411999225616455, "learning_rate": 1.8502329719778448e-05, "loss": 0.2343, "step": 1274 }, { "epoch": 0.6037289138798461, "grad_norm": 1.6212693452835083, "learning_rate": 1.849963651124809e-05, "loss": 0.2382, "step": 1275 }, { "epoch": 0.6042024267534774, "grad_norm": 1.3826075792312622, "learning_rate": 1.8496941079719805e-05, "loss": 0.2541, "step": 1276 }, { "epoch": 0.6046759396271086, "grad_norm": 2.0459675788879395, "learning_rate": 1.8494243425898548e-05, "loss": 0.2367, "step": 1277 }, { "epoch": 0.6051494525007398, "grad_norm": 2.222378969192505, "learning_rate": 1.8491543550489872e-05, "loss": 0.2267, "step": 1278 }, { "epoch": 0.6056229653743711, "grad_norm": 1.439396619796753, "learning_rate": 1.84888414541999e-05, "loss": 0.2454, "step": 1279 }, { "epoch": 0.6060964782480024, "grad_norm": 1.3656351566314697, "learning_rate": 1.8486137137735337e-05, "loss": 0.2417, "step": 1280 }, { "epoch": 0.6065699911216336, "grad_norm": 1.2693840265274048, "learning_rate": 1.848343060180347e-05, "loss": 0.2573, "step": 1281 }, { "epoch": 0.6070435039952649, "grad_norm": 1.9069156646728516, "learning_rate": 1.8480721847112174e-05, "loss": 0.2466, "step": 1282 }, { "epoch": 0.6075170168688961, "grad_norm": 1.1109650135040283, "learning_rate": 1.8478010874369894e-05, "loss": 0.2637, "step": 1283 }, { "epoch": 0.6079905297425274, "grad_norm": 1.4270925521850586, "learning_rate": 1.8475297684285657e-05, "loss": 0.1981, "step": 1284 }, { "epoch": 0.6084640426161586, "grad_norm": 1.5452510118484497, "learning_rate": 1.8472582277569072e-05, "loss": 0.2564, "step": 1285 }, { "epoch": 0.6089375554897899, "grad_norm": 1.9207974672317505, "learning_rate": 1.8469864654930333e-05, "loss": 0.2676, "step": 1286 }, { "epoch": 0.6094110683634212, "grad_norm": 1.18226957321167, "learning_rate": 1.8467144817080204e-05, "loss": 0.2474, "step": 1287 }, { "epoch": 0.6098845812370524, "grad_norm": 1.3588470220565796, "learning_rate": 1.8464422764730028e-05, "loss": 0.2469, "step": 1288 }, { "epoch": 0.6103580941106836, "grad_norm": 1.4473971128463745, "learning_rate": 1.8461698498591736e-05, "loss": 0.2322, "step": 1289 }, { "epoch": 0.6108316069843149, "grad_norm": 2.050037145614624, "learning_rate": 1.8458972019377834e-05, "loss": 0.2611, "step": 1290 }, { "epoch": 0.6113051198579461, "grad_norm": 1.4301786422729492, "learning_rate": 1.8456243327801407e-05, "loss": 0.2265, "step": 1291 }, { "epoch": 0.6117786327315774, "grad_norm": 1.096746802330017, "learning_rate": 1.8453512424576113e-05, "loss": 0.2279, "step": 1292 }, { "epoch": 0.6122521456052087, "grad_norm": 1.5921342372894287, "learning_rate": 1.84507793104162e-05, "loss": 0.2564, "step": 1293 }, { "epoch": 0.6127256584788399, "grad_norm": 1.1023545265197754, "learning_rate": 1.8448043986036483e-05, "loss": 0.2359, "step": 1294 }, { "epoch": 0.6131991713524712, "grad_norm": 2.039523124694824, "learning_rate": 1.844530645215236e-05, "loss": 0.2283, "step": 1295 }, { "epoch": 0.6136726842261024, "grad_norm": 1.5656194686889648, "learning_rate": 1.8442566709479813e-05, "loss": 0.256, "step": 1296 }, { "epoch": 0.6141461970997336, "grad_norm": 1.9712538719177246, "learning_rate": 1.8439824758735385e-05, "loss": 0.2526, "step": 1297 }, { "epoch": 0.6146197099733649, "grad_norm": 1.1353607177734375, "learning_rate": 1.8437080600636215e-05, "loss": 0.2606, "step": 1298 }, { "epoch": 0.6150932228469962, "grad_norm": 2.3655588626861572, "learning_rate": 1.8434334235900008e-05, "loss": 0.2293, "step": 1299 }, { "epoch": 0.6155667357206274, "grad_norm": 1.5827844142913818, "learning_rate": 1.8431585665245047e-05, "loss": 0.2443, "step": 1300 }, { "epoch": 0.6160402485942587, "grad_norm": 1.5247994661331177, "learning_rate": 1.8428834889390194e-05, "loss": 0.2444, "step": 1301 }, { "epoch": 0.6165137614678899, "grad_norm": 1.465590476989746, "learning_rate": 1.8426081909054893e-05, "loss": 0.2279, "step": 1302 }, { "epoch": 0.6169872743415211, "grad_norm": 1.2217838764190674, "learning_rate": 1.8423326724959157e-05, "loss": 0.2393, "step": 1303 }, { "epoch": 0.6174607872151524, "grad_norm": 1.6876059770584106, "learning_rate": 1.8420569337823576e-05, "loss": 0.248, "step": 1304 }, { "epoch": 0.6179343000887837, "grad_norm": 1.808072805404663, "learning_rate": 1.841780974836932e-05, "loss": 0.2678, "step": 1305 }, { "epoch": 0.6184078129624149, "grad_norm": 2.256350517272949, "learning_rate": 1.8415047957318132e-05, "loss": 0.2462, "step": 1306 }, { "epoch": 0.6188813258360462, "grad_norm": 2.597975969314575, "learning_rate": 1.8412283965392334e-05, "loss": 0.2504, "step": 1307 }, { "epoch": 0.6193548387096774, "grad_norm": 1.6564836502075195, "learning_rate": 1.8409517773314816e-05, "loss": 0.2264, "step": 1308 }, { "epoch": 0.6198283515833086, "grad_norm": 1.5437556505203247, "learning_rate": 1.8406749381809054e-05, "loss": 0.2375, "step": 1309 }, { "epoch": 0.6203018644569399, "grad_norm": 2.6221256256103516, "learning_rate": 1.8403978791599095e-05, "loss": 0.2454, "step": 1310 }, { "epoch": 0.6207753773305712, "grad_norm": 1.4474231004714966, "learning_rate": 1.8401206003409554e-05, "loss": 0.2489, "step": 1311 }, { "epoch": 0.6212488902042024, "grad_norm": 2.578899621963501, "learning_rate": 1.839843101796563e-05, "loss": 0.2466, "step": 1312 }, { "epoch": 0.6217224030778337, "grad_norm": 2.1055898666381836, "learning_rate": 1.8395653835993098e-05, "loss": 0.2741, "step": 1313 }, { "epoch": 0.622195915951465, "grad_norm": 1.2362160682678223, "learning_rate": 1.83928744582183e-05, "loss": 0.2472, "step": 1314 }, { "epoch": 0.6226694288250961, "grad_norm": 1.5515974760055542, "learning_rate": 1.839009288536815e-05, "loss": 0.2346, "step": 1315 }, { "epoch": 0.6231429416987274, "grad_norm": 1.4846057891845703, "learning_rate": 1.838730911817015e-05, "loss": 0.2439, "step": 1316 }, { "epoch": 0.6236164545723587, "grad_norm": 2.014627695083618, "learning_rate": 1.8384523157352365e-05, "loss": 0.2398, "step": 1317 }, { "epoch": 0.62408996744599, "grad_norm": 2.682614326477051, "learning_rate": 1.8381735003643434e-05, "loss": 0.2567, "step": 1318 }, { "epoch": 0.6245634803196212, "grad_norm": 1.1889935731887817, "learning_rate": 1.837894465777257e-05, "loss": 0.2425, "step": 1319 }, { "epoch": 0.6250369931932525, "grad_norm": 1.283697247505188, "learning_rate": 1.8376152120469567e-05, "loss": 0.2576, "step": 1320 }, { "epoch": 0.6255105060668837, "grad_norm": 1.7607721090316772, "learning_rate": 1.8373357392464783e-05, "loss": 0.2428, "step": 1321 }, { "epoch": 0.6259840189405149, "grad_norm": 2.0898520946502686, "learning_rate": 1.837056047448915e-05, "loss": 0.246, "step": 1322 }, { "epoch": 0.6264575318141462, "grad_norm": 1.7172597646713257, "learning_rate": 1.836776136727418e-05, "loss": 0.242, "step": 1323 }, { "epoch": 0.6269310446877775, "grad_norm": 1.7374228239059448, "learning_rate": 1.8364960071551948e-05, "loss": 0.252, "step": 1324 }, { "epoch": 0.6274045575614087, "grad_norm": 1.4779413938522339, "learning_rate": 1.836215658805511e-05, "loss": 0.2424, "step": 1325 }, { "epoch": 0.62787807043504, "grad_norm": 1.4057080745697021, "learning_rate": 1.8359350917516882e-05, "loss": 0.2516, "step": 1326 }, { "epoch": 0.6283515833086712, "grad_norm": 1.7283161878585815, "learning_rate": 1.835654306067107e-05, "loss": 0.2559, "step": 1327 }, { "epoch": 0.6288250961823024, "grad_norm": 1.3666443824768066, "learning_rate": 1.835373301825204e-05, "loss": 0.235, "step": 1328 }, { "epoch": 0.6292986090559337, "grad_norm": 1.4366817474365234, "learning_rate": 1.8350920790994723e-05, "loss": 0.2405, "step": 1329 }, { "epoch": 0.629772121929565, "grad_norm": 1.3197062015533447, "learning_rate": 1.834810637963464e-05, "loss": 0.2191, "step": 1330 }, { "epoch": 0.6302456348031962, "grad_norm": 1.0538299083709717, "learning_rate": 1.8345289784907863e-05, "loss": 0.2255, "step": 1331 }, { "epoch": 0.6307191476768275, "grad_norm": 1.732539415359497, "learning_rate": 1.8342471007551058e-05, "loss": 0.2733, "step": 1332 }, { "epoch": 0.6311926605504588, "grad_norm": 1.7113350629806519, "learning_rate": 1.833965004830144e-05, "loss": 0.2285, "step": 1333 }, { "epoch": 0.6316661734240899, "grad_norm": 2.064622640609741, "learning_rate": 1.8336826907896806e-05, "loss": 0.2398, "step": 1334 }, { "epoch": 0.6321396862977212, "grad_norm": 1.9892138242721558, "learning_rate": 1.8334001587075517e-05, "loss": 0.2503, "step": 1335 }, { "epoch": 0.6326131991713525, "grad_norm": 1.079971194267273, "learning_rate": 1.833117408657652e-05, "loss": 0.2354, "step": 1336 }, { "epoch": 0.6330867120449837, "grad_norm": 1.276336669921875, "learning_rate": 1.8328344407139307e-05, "loss": 0.2452, "step": 1337 }, { "epoch": 0.633560224918615, "grad_norm": 1.477914571762085, "learning_rate": 1.832551254950396e-05, "loss": 0.2422, "step": 1338 }, { "epoch": 0.6340337377922463, "grad_norm": 1.510981798171997, "learning_rate": 1.832267851441113e-05, "loss": 0.2523, "step": 1339 }, { "epoch": 0.6345072506658774, "grad_norm": 1.4486583471298218, "learning_rate": 1.8319842302602014e-05, "loss": 0.2492, "step": 1340 }, { "epoch": 0.6349807635395087, "grad_norm": 1.9352304935455322, "learning_rate": 1.8317003914818415e-05, "loss": 0.2373, "step": 1341 }, { "epoch": 0.63545427641314, "grad_norm": 1.3235574960708618, "learning_rate": 1.8314163351802673e-05, "loss": 0.2464, "step": 1342 }, { "epoch": 0.6359277892867712, "grad_norm": 1.9728165864944458, "learning_rate": 1.831132061429772e-05, "loss": 0.2387, "step": 1343 }, { "epoch": 0.6364013021604025, "grad_norm": 1.5262460708618164, "learning_rate": 1.8308475703047034e-05, "loss": 0.2273, "step": 1344 }, { "epoch": 0.6368748150340338, "grad_norm": 1.6200482845306396, "learning_rate": 1.8305628618794685e-05, "loss": 0.2561, "step": 1345 }, { "epoch": 0.6373483279076649, "grad_norm": 1.5177505016326904, "learning_rate": 1.8302779362285298e-05, "loss": 0.2331, "step": 1346 }, { "epoch": 0.6378218407812962, "grad_norm": 1.3834291696548462, "learning_rate": 1.8299927934264064e-05, "loss": 0.2117, "step": 1347 }, { "epoch": 0.6382953536549275, "grad_norm": 1.5583209991455078, "learning_rate": 1.829707433547675e-05, "loss": 0.2055, "step": 1348 }, { "epoch": 0.6387688665285588, "grad_norm": 1.7908825874328613, "learning_rate": 1.8294218566669684e-05, "loss": 0.264, "step": 1349 }, { "epoch": 0.63924237940219, "grad_norm": 2.1676015853881836, "learning_rate": 1.8291360628589774e-05, "loss": 0.2424, "step": 1350 }, { "epoch": 0.6397158922758213, "grad_norm": 1.7668577432632446, "learning_rate": 1.8288500521984477e-05, "loss": 0.2576, "step": 1351 }, { "epoch": 0.6401894051494526, "grad_norm": 1.537380337715149, "learning_rate": 1.828563824760183e-05, "loss": 0.243, "step": 1352 }, { "epoch": 0.6406629180230837, "grad_norm": 1.5818302631378174, "learning_rate": 1.828277380619043e-05, "loss": 0.2637, "step": 1353 }, { "epoch": 0.641136430896715, "grad_norm": 1.2502553462982178, "learning_rate": 1.827990719849945e-05, "loss": 0.2299, "step": 1354 }, { "epoch": 0.6416099437703463, "grad_norm": 1.2032712697982788, "learning_rate": 1.8277038425278616e-05, "loss": 0.2597, "step": 1355 }, { "epoch": 0.6420834566439775, "grad_norm": 1.1171890497207642, "learning_rate": 1.8274167487278232e-05, "loss": 0.24, "step": 1356 }, { "epoch": 0.6425569695176088, "grad_norm": 1.7673466205596924, "learning_rate": 1.8271294385249168e-05, "loss": 0.2274, "step": 1357 }, { "epoch": 0.64303048239124, "grad_norm": 1.4950395822525024, "learning_rate": 1.8268419119942852e-05, "loss": 0.2521, "step": 1358 }, { "epoch": 0.6435039952648712, "grad_norm": 1.344424843788147, "learning_rate": 1.8265541692111277e-05, "loss": 0.2201, "step": 1359 }, { "epoch": 0.6439775081385025, "grad_norm": 1.1256235837936401, "learning_rate": 1.8262662102507012e-05, "loss": 0.2402, "step": 1360 }, { "epoch": 0.6444510210121338, "grad_norm": 1.2942851781845093, "learning_rate": 1.8259780351883188e-05, "loss": 0.2561, "step": 1361 }, { "epoch": 0.644924533885765, "grad_norm": 1.1802473068237305, "learning_rate": 1.8256896440993498e-05, "loss": 0.2516, "step": 1362 }, { "epoch": 0.6453980467593963, "grad_norm": 1.1492762565612793, "learning_rate": 1.8254010370592197e-05, "loss": 0.2492, "step": 1363 }, { "epoch": 0.6458715596330276, "grad_norm": 1.716592788696289, "learning_rate": 1.825112214143411e-05, "loss": 0.2208, "step": 1364 }, { "epoch": 0.6463450725066587, "grad_norm": 1.5306360721588135, "learning_rate": 1.824823175427463e-05, "loss": 0.2273, "step": 1365 }, { "epoch": 0.64681858538029, "grad_norm": 1.6594233512878418, "learning_rate": 1.8245339209869705e-05, "loss": 0.2514, "step": 1366 }, { "epoch": 0.6472920982539213, "grad_norm": 1.2237942218780518, "learning_rate": 1.8242444508975857e-05, "loss": 0.2514, "step": 1367 }, { "epoch": 0.6477656111275525, "grad_norm": 1.5434156656265259, "learning_rate": 1.8239547652350162e-05, "loss": 0.2523, "step": 1368 }, { "epoch": 0.6482391240011838, "grad_norm": 1.3254024982452393, "learning_rate": 1.8236648640750266e-05, "loss": 0.2588, "step": 1369 }, { "epoch": 0.6487126368748151, "grad_norm": 1.8970592021942139, "learning_rate": 1.823374747493438e-05, "loss": 0.2487, "step": 1370 }, { "epoch": 0.6491861497484462, "grad_norm": 1.7532835006713867, "learning_rate": 1.8230844155661273e-05, "loss": 0.2405, "step": 1371 }, { "epoch": 0.6496596626220775, "grad_norm": 1.1474004983901978, "learning_rate": 1.822793868369028e-05, "loss": 0.2221, "step": 1372 }, { "epoch": 0.6501331754957088, "grad_norm": 1.8724579811096191, "learning_rate": 1.8225031059781302e-05, "loss": 0.2345, "step": 1373 }, { "epoch": 0.65060668836934, "grad_norm": 1.337428331375122, "learning_rate": 1.8222121284694798e-05, "loss": 0.2737, "step": 1374 }, { "epoch": 0.6510802012429713, "grad_norm": 1.1940720081329346, "learning_rate": 1.8219209359191793e-05, "loss": 0.2089, "step": 1375 }, { "epoch": 0.6515537141166026, "grad_norm": 1.327831745147705, "learning_rate": 1.821629528403387e-05, "loss": 0.2629, "step": 1376 }, { "epoch": 0.6520272269902337, "grad_norm": 1.3946506977081299, "learning_rate": 1.8213379059983184e-05, "loss": 0.227, "step": 1377 }, { "epoch": 0.652500739863865, "grad_norm": 1.840470790863037, "learning_rate": 1.8210460687802437e-05, "loss": 0.2434, "step": 1378 }, { "epoch": 0.6529742527374963, "grad_norm": 1.3250887393951416, "learning_rate": 1.8207540168254907e-05, "loss": 0.2449, "step": 1379 }, { "epoch": 0.6534477656111276, "grad_norm": 1.3822866678237915, "learning_rate": 1.8204617502104426e-05, "loss": 0.2286, "step": 1380 }, { "epoch": 0.6539212784847588, "grad_norm": 1.2406575679779053, "learning_rate": 1.8201692690115385e-05, "loss": 0.2551, "step": 1381 }, { "epoch": 0.6543947913583901, "grad_norm": 1.167535662651062, "learning_rate": 1.819876573305275e-05, "loss": 0.2365, "step": 1382 }, { "epoch": 0.6548683042320214, "grad_norm": 1.555350661277771, "learning_rate": 1.8195836631682025e-05, "loss": 0.2523, "step": 1383 }, { "epoch": 0.6553418171056525, "grad_norm": 1.228309988975525, "learning_rate": 1.8192905386769298e-05, "loss": 0.2235, "step": 1384 }, { "epoch": 0.6558153299792838, "grad_norm": 1.2090561389923096, "learning_rate": 1.8189971999081207e-05, "loss": 0.24, "step": 1385 }, { "epoch": 0.6562888428529151, "grad_norm": 1.1428760290145874, "learning_rate": 1.818703646938495e-05, "loss": 0.2441, "step": 1386 }, { "epoch": 0.6567623557265463, "grad_norm": 2.0970654487609863, "learning_rate": 1.8184098798448285e-05, "loss": 0.2685, "step": 1387 }, { "epoch": 0.6572358686001776, "grad_norm": 1.638376235961914, "learning_rate": 1.8181158987039534e-05, "loss": 0.2524, "step": 1388 }, { "epoch": 0.6577093814738089, "grad_norm": 1.7937678098678589, "learning_rate": 1.8178217035927578e-05, "loss": 0.2446, "step": 1389 }, { "epoch": 0.65818289434744, "grad_norm": 1.5529017448425293, "learning_rate": 1.8175272945881847e-05, "loss": 0.262, "step": 1390 }, { "epoch": 0.6586564072210713, "grad_norm": 2.30466628074646, "learning_rate": 1.8172326717672348e-05, "loss": 0.239, "step": 1391 }, { "epoch": 0.6591299200947026, "grad_norm": 2.677629232406616, "learning_rate": 1.8169378352069637e-05, "loss": 0.2682, "step": 1392 }, { "epoch": 0.6596034329683338, "grad_norm": 2.3182013034820557, "learning_rate": 1.8166427849844826e-05, "loss": 0.2425, "step": 1393 }, { "epoch": 0.6600769458419651, "grad_norm": 1.5342023372650146, "learning_rate": 1.8163475211769595e-05, "loss": 0.2674, "step": 1394 }, { "epoch": 0.6605504587155964, "grad_norm": 2.3066344261169434, "learning_rate": 1.8160520438616176e-05, "loss": 0.2506, "step": 1395 }, { "epoch": 0.6610239715892275, "grad_norm": 1.210817813873291, "learning_rate": 1.8157563531157366e-05, "loss": 0.2399, "step": 1396 }, { "epoch": 0.6614974844628588, "grad_norm": 1.5053683519363403, "learning_rate": 1.815460449016651e-05, "loss": 0.2343, "step": 1397 }, { "epoch": 0.6619709973364901, "grad_norm": 1.1325657367706299, "learning_rate": 1.8151643316417518e-05, "loss": 0.2338, "step": 1398 }, { "epoch": 0.6624445102101213, "grad_norm": 2.2156691551208496, "learning_rate": 1.8148680010684856e-05, "loss": 0.2328, "step": 1399 }, { "epoch": 0.6629180230837526, "grad_norm": 1.634922742843628, "learning_rate": 1.814571457374355e-05, "loss": 0.2786, "step": 1400 }, { "epoch": 0.6633915359573839, "grad_norm": 1.8771041631698608, "learning_rate": 1.8142747006369176e-05, "loss": 0.2626, "step": 1401 }, { "epoch": 0.663865048831015, "grad_norm": 1.1594529151916504, "learning_rate": 1.813977730933788e-05, "loss": 0.2044, "step": 1402 }, { "epoch": 0.6643385617046463, "grad_norm": 1.380021095275879, "learning_rate": 1.8136805483426358e-05, "loss": 0.2689, "step": 1403 }, { "epoch": 0.6648120745782776, "grad_norm": 1.0207122564315796, "learning_rate": 1.8133831529411856e-05, "loss": 0.249, "step": 1404 }, { "epoch": 0.6652855874519088, "grad_norm": 1.723662257194519, "learning_rate": 1.8130855448072186e-05, "loss": 0.2373, "step": 1405 }, { "epoch": 0.6657591003255401, "grad_norm": 1.606900930404663, "learning_rate": 1.8127877240185716e-05, "loss": 0.2731, "step": 1406 }, { "epoch": 0.6662326131991714, "grad_norm": 1.1863452196121216, "learning_rate": 1.8124896906531363e-05, "loss": 0.2379, "step": 1407 }, { "epoch": 0.6667061260728026, "grad_norm": 1.311086893081665, "learning_rate": 1.8121914447888605e-05, "loss": 0.2295, "step": 1408 }, { "epoch": 0.6671796389464338, "grad_norm": 1.2078311443328857, "learning_rate": 1.811892986503748e-05, "loss": 0.2209, "step": 1409 }, { "epoch": 0.6676531518200651, "grad_norm": 2.1490330696105957, "learning_rate": 1.811594315875857e-05, "loss": 0.2922, "step": 1410 }, { "epoch": 0.6681266646936964, "grad_norm": 1.3307857513427734, "learning_rate": 1.8112954329833022e-05, "loss": 0.229, "step": 1411 }, { "epoch": 0.6686001775673276, "grad_norm": 1.1393177509307861, "learning_rate": 1.8109963379042538e-05, "loss": 0.243, "step": 1412 }, { "epoch": 0.6690736904409589, "grad_norm": 1.3860878944396973, "learning_rate": 1.810697030716937e-05, "loss": 0.246, "step": 1413 }, { "epoch": 0.6695472033145902, "grad_norm": 1.2071013450622559, "learning_rate": 1.8103975114996327e-05, "loss": 0.2124, "step": 1414 }, { "epoch": 0.6700207161882213, "grad_norm": 1.5767385959625244, "learning_rate": 1.8100977803306774e-05, "loss": 0.2346, "step": 1415 }, { "epoch": 0.6704942290618526, "grad_norm": 1.6426430940628052, "learning_rate": 1.8097978372884627e-05, "loss": 0.2258, "step": 1416 }, { "epoch": 0.6709677419354839, "grad_norm": 1.0845011472702026, "learning_rate": 1.8094976824514363e-05, "loss": 0.2492, "step": 1417 }, { "epoch": 0.6714412548091151, "grad_norm": 1.211498498916626, "learning_rate": 1.8091973158981e-05, "loss": 0.2457, "step": 1418 }, { "epoch": 0.6719147676827464, "grad_norm": 1.3050991296768188, "learning_rate": 1.8088967377070122e-05, "loss": 0.1988, "step": 1419 }, { "epoch": 0.6723882805563777, "grad_norm": 2.2744362354278564, "learning_rate": 1.8085959479567866e-05, "loss": 0.2364, "step": 1420 }, { "epoch": 0.6728617934300088, "grad_norm": 1.9004119634628296, "learning_rate": 1.808294946726091e-05, "loss": 0.2387, "step": 1421 }, { "epoch": 0.6733353063036401, "grad_norm": 1.1495134830474854, "learning_rate": 1.80799373409365e-05, "loss": 0.2364, "step": 1422 }, { "epoch": 0.6738088191772714, "grad_norm": 1.161669373512268, "learning_rate": 1.807692310138243e-05, "loss": 0.2158, "step": 1423 }, { "epoch": 0.6742823320509026, "grad_norm": 1.1612114906311035, "learning_rate": 1.807390674938704e-05, "loss": 0.2477, "step": 1424 }, { "epoch": 0.6747558449245339, "grad_norm": 1.5632646083831787, "learning_rate": 1.8070888285739227e-05, "loss": 0.2696, "step": 1425 }, { "epoch": 0.6752293577981652, "grad_norm": 1.4020699262619019, "learning_rate": 1.806786771122845e-05, "loss": 0.2244, "step": 1426 }, { "epoch": 0.6757028706717964, "grad_norm": 1.7532941102981567, "learning_rate": 1.80648450266447e-05, "loss": 0.3041, "step": 1427 }, { "epoch": 0.6761763835454276, "grad_norm": 2.035996675491333, "learning_rate": 1.806182023277854e-05, "loss": 0.2601, "step": 1428 }, { "epoch": 0.6766498964190589, "grad_norm": 1.4259287118911743, "learning_rate": 1.805879333042107e-05, "loss": 0.2476, "step": 1429 }, { "epoch": 0.6771234092926901, "grad_norm": 1.3768444061279297, "learning_rate": 1.805576432036395e-05, "loss": 0.2458, "step": 1430 }, { "epoch": 0.6775969221663214, "grad_norm": 1.153491497039795, "learning_rate": 1.8052733203399385e-05, "loss": 0.2582, "step": 1431 }, { "epoch": 0.6780704350399527, "grad_norm": 1.0750600099563599, "learning_rate": 1.804969998032014e-05, "loss": 0.2708, "step": 1432 }, { "epoch": 0.6785439479135839, "grad_norm": 1.5494842529296875, "learning_rate": 1.8046664651919517e-05, "loss": 0.2314, "step": 1433 }, { "epoch": 0.6790174607872151, "grad_norm": 1.6655199527740479, "learning_rate": 1.8043627218991385e-05, "loss": 0.257, "step": 1434 }, { "epoch": 0.6794909736608464, "grad_norm": 1.0493881702423096, "learning_rate": 1.8040587682330155e-05, "loss": 0.2497, "step": 1435 }, { "epoch": 0.6799644865344776, "grad_norm": 1.266782283782959, "learning_rate": 1.8037546042730783e-05, "loss": 0.2443, "step": 1436 }, { "epoch": 0.6804379994081089, "grad_norm": 1.381783127784729, "learning_rate": 1.8034502300988784e-05, "loss": 0.2467, "step": 1437 }, { "epoch": 0.6809115122817402, "grad_norm": 1.3426703214645386, "learning_rate": 1.803145645790022e-05, "loss": 0.2399, "step": 1438 }, { "epoch": 0.6813850251553714, "grad_norm": 1.1513340473175049, "learning_rate": 1.8028408514261695e-05, "loss": 0.2482, "step": 1439 }, { "epoch": 0.6818585380290026, "grad_norm": 1.5276223421096802, "learning_rate": 1.8025358470870376e-05, "loss": 0.2307, "step": 1440 }, { "epoch": 0.6823320509026339, "grad_norm": 1.416548252105713, "learning_rate": 1.8022306328523973e-05, "loss": 0.2142, "step": 1441 }, { "epoch": 0.6828055637762651, "grad_norm": 1.427147626876831, "learning_rate": 1.801925208802074e-05, "loss": 0.2139, "step": 1442 }, { "epoch": 0.6832790766498964, "grad_norm": 1.3105965852737427, "learning_rate": 1.8016195750159488e-05, "loss": 0.2398, "step": 1443 }, { "epoch": 0.6837525895235277, "grad_norm": 1.5109857320785522, "learning_rate": 1.801313731573957e-05, "loss": 0.2733, "step": 1444 }, { "epoch": 0.684226102397159, "grad_norm": 1.150864601135254, "learning_rate": 1.8010076785560896e-05, "loss": 0.2492, "step": 1445 }, { "epoch": 0.6846996152707902, "grad_norm": 1.219900131225586, "learning_rate": 1.8007014160423907e-05, "loss": 0.2619, "step": 1446 }, { "epoch": 0.6851731281444214, "grad_norm": 1.590561032295227, "learning_rate": 1.8003949441129612e-05, "loss": 0.2396, "step": 1447 }, { "epoch": 0.6856466410180527, "grad_norm": 1.0455493927001953, "learning_rate": 1.8000882628479558e-05, "loss": 0.234, "step": 1448 }, { "epoch": 0.6861201538916839, "grad_norm": 1.0974806547164917, "learning_rate": 1.7997813723275834e-05, "loss": 0.2592, "step": 1449 }, { "epoch": 0.6865936667653152, "grad_norm": 1.3178282976150513, "learning_rate": 1.799474272632109e-05, "loss": 0.2275, "step": 1450 }, { "epoch": 0.6870671796389465, "grad_norm": 1.2756496667861938, "learning_rate": 1.7991669638418515e-05, "loss": 0.2165, "step": 1451 }, { "epoch": 0.6875406925125777, "grad_norm": 1.5571712255477905, "learning_rate": 1.798859446037184e-05, "loss": 0.2507, "step": 1452 }, { "epoch": 0.6880142053862089, "grad_norm": 1.1532251834869385, "learning_rate": 1.798551719298535e-05, "loss": 0.2464, "step": 1453 }, { "epoch": 0.6884877182598402, "grad_norm": 1.2180898189544678, "learning_rate": 1.7982437837063878e-05, "loss": 0.2383, "step": 1454 }, { "epoch": 0.6889612311334714, "grad_norm": 1.2221808433532715, "learning_rate": 1.7979356393412796e-05, "loss": 0.2409, "step": 1455 }, { "epoch": 0.6894347440071027, "grad_norm": 1.3098372220993042, "learning_rate": 1.797627286283803e-05, "loss": 0.2462, "step": 1456 }, { "epoch": 0.689908256880734, "grad_norm": 1.6582274436950684, "learning_rate": 1.7973187246146044e-05, "loss": 0.2566, "step": 1457 }, { "epoch": 0.6903817697543652, "grad_norm": 1.5802083015441895, "learning_rate": 1.7970099544143852e-05, "loss": 0.2495, "step": 1458 }, { "epoch": 0.6908552826279964, "grad_norm": 1.2195215225219727, "learning_rate": 1.7967009757639008e-05, "loss": 0.2477, "step": 1459 }, { "epoch": 0.6913287955016277, "grad_norm": 1.1696422100067139, "learning_rate": 1.796391788743963e-05, "loss": 0.2191, "step": 1460 }, { "epoch": 0.6918023083752589, "grad_norm": 1.1211189031600952, "learning_rate": 1.7960823934354352e-05, "loss": 0.2616, "step": 1461 }, { "epoch": 0.6922758212488902, "grad_norm": 1.4870905876159668, "learning_rate": 1.7957727899192375e-05, "loss": 0.2307, "step": 1462 }, { "epoch": 0.6927493341225215, "grad_norm": 0.9586571455001831, "learning_rate": 1.7954629782763437e-05, "loss": 0.2507, "step": 1463 }, { "epoch": 0.6932228469961527, "grad_norm": 1.396880865097046, "learning_rate": 1.7951529585877818e-05, "loss": 0.2456, "step": 1464 }, { "epoch": 0.693696359869784, "grad_norm": 1.5271177291870117, "learning_rate": 1.7948427309346346e-05, "loss": 0.2374, "step": 1465 }, { "epoch": 0.6941698727434152, "grad_norm": 1.1528421640396118, "learning_rate": 1.7945322953980387e-05, "loss": 0.2209, "step": 1466 }, { "epoch": 0.6946433856170464, "grad_norm": 1.230507254600525, "learning_rate": 1.7942216520591867e-05, "loss": 0.2414, "step": 1467 }, { "epoch": 0.6951168984906777, "grad_norm": 1.2597784996032715, "learning_rate": 1.793910800999323e-05, "loss": 0.2556, "step": 1468 }, { "epoch": 0.695590411364309, "grad_norm": 1.1090929508209229, "learning_rate": 1.7935997422997484e-05, "loss": 0.2575, "step": 1469 }, { "epoch": 0.6960639242379402, "grad_norm": 1.2305208444595337, "learning_rate": 1.7932884760418172e-05, "loss": 0.2437, "step": 1470 }, { "epoch": 0.6965374371115715, "grad_norm": 1.3730130195617676, "learning_rate": 1.7929770023069383e-05, "loss": 0.2321, "step": 1471 }, { "epoch": 0.6970109499852027, "grad_norm": 1.4209880828857422, "learning_rate": 1.7926653211765742e-05, "loss": 0.2597, "step": 1472 }, { "epoch": 0.6974844628588339, "grad_norm": 1.2759629487991333, "learning_rate": 1.7923534327322427e-05, "loss": 0.2424, "step": 1473 }, { "epoch": 0.6979579757324652, "grad_norm": 1.0876637697219849, "learning_rate": 1.7920413370555143e-05, "loss": 0.2589, "step": 1474 }, { "epoch": 0.6984314886060965, "grad_norm": 1.1464848518371582, "learning_rate": 1.7917290342280154e-05, "loss": 0.2399, "step": 1475 }, { "epoch": 0.6989050014797278, "grad_norm": 1.3303989171981812, "learning_rate": 1.7914165243314256e-05, "loss": 0.2653, "step": 1476 }, { "epoch": 0.699378514353359, "grad_norm": 1.5119075775146484, "learning_rate": 1.7911038074474788e-05, "loss": 0.2422, "step": 1477 }, { "epoch": 0.6998520272269902, "grad_norm": 1.3617953062057495, "learning_rate": 1.790790883657963e-05, "loss": 0.253, "step": 1478 }, { "epoch": 0.7003255401006215, "grad_norm": 1.2398014068603516, "learning_rate": 1.7904777530447203e-05, "loss": 0.247, "step": 1479 }, { "epoch": 0.7007990529742527, "grad_norm": 0.9873283505439758, "learning_rate": 1.7901644156896474e-05, "loss": 0.2248, "step": 1480 }, { "epoch": 0.701272565847884, "grad_norm": 1.4423537254333496, "learning_rate": 1.7898508716746944e-05, "loss": 0.2449, "step": 1481 }, { "epoch": 0.7017460787215153, "grad_norm": 1.600345253944397, "learning_rate": 1.7895371210818656e-05, "loss": 0.252, "step": 1482 }, { "epoch": 0.7022195915951465, "grad_norm": 1.43065345287323, "learning_rate": 1.78922316399322e-05, "loss": 0.247, "step": 1483 }, { "epoch": 0.7026931044687778, "grad_norm": 1.2158089876174927, "learning_rate": 1.7889090004908692e-05, "loss": 0.2162, "step": 1484 }, { "epoch": 0.703166617342409, "grad_norm": 1.3399561643600464, "learning_rate": 1.78859463065698e-05, "loss": 0.2449, "step": 1485 }, { "epoch": 0.7036401302160402, "grad_norm": 2.326958179473877, "learning_rate": 1.788280054573773e-05, "loss": 0.2354, "step": 1486 }, { "epoch": 0.7041136430896715, "grad_norm": 1.8712157011032104, "learning_rate": 1.7879652723235223e-05, "loss": 0.2409, "step": 1487 }, { "epoch": 0.7045871559633028, "grad_norm": 1.066486120223999, "learning_rate": 1.7876502839885564e-05, "loss": 0.2323, "step": 1488 }, { "epoch": 0.705060668836934, "grad_norm": 1.8184261322021484, "learning_rate": 1.7873350896512574e-05, "loss": 0.2201, "step": 1489 }, { "epoch": 0.7055341817105653, "grad_norm": 2.173187255859375, "learning_rate": 1.787019689394061e-05, "loss": 0.2419, "step": 1490 }, { "epoch": 0.7060076945841965, "grad_norm": 1.2974936962127686, "learning_rate": 1.786704083299458e-05, "loss": 0.2354, "step": 1491 }, { "epoch": 0.7064812074578277, "grad_norm": 1.272907018661499, "learning_rate": 1.786388271449991e-05, "loss": 0.2416, "step": 1492 }, { "epoch": 0.706954720331459, "grad_norm": 0.8591986894607544, "learning_rate": 1.7860722539282577e-05, "loss": 0.2094, "step": 1493 }, { "epoch": 0.7074282332050903, "grad_norm": 1.1995400190353394, "learning_rate": 1.7857560308169103e-05, "loss": 0.2289, "step": 1494 }, { "epoch": 0.7079017460787215, "grad_norm": 1.7277050018310547, "learning_rate": 1.785439602198653e-05, "loss": 0.2252, "step": 1495 }, { "epoch": 0.7083752589523528, "grad_norm": 2.3876638412475586, "learning_rate": 1.785122968156245e-05, "loss": 0.2494, "step": 1496 }, { "epoch": 0.708848771825984, "grad_norm": 2.412504196166992, "learning_rate": 1.7848061287724993e-05, "loss": 0.2361, "step": 1497 }, { "epoch": 0.7093222846996152, "grad_norm": 1.439038872718811, "learning_rate": 1.7844890841302815e-05, "loss": 0.251, "step": 1498 }, { "epoch": 0.7097957975732465, "grad_norm": 1.5058058500289917, "learning_rate": 1.7841718343125117e-05, "loss": 0.2373, "step": 1499 }, { "epoch": 0.7102693104468778, "grad_norm": 1.5263663530349731, "learning_rate": 1.7838543794021637e-05, "loss": 0.2423, "step": 1500 }, { "epoch": 0.710742823320509, "grad_norm": 1.1687594652175903, "learning_rate": 1.783536719482265e-05, "loss": 0.2125, "step": 1501 }, { "epoch": 0.7112163361941403, "grad_norm": 1.1493502855300903, "learning_rate": 1.783218854635896e-05, "loss": 0.2257, "step": 1502 }, { "epoch": 0.7116898490677716, "grad_norm": 2.6674232482910156, "learning_rate": 1.782900784946192e-05, "loss": 0.2585, "step": 1503 }, { "epoch": 0.7121633619414027, "grad_norm": 1.5568790435791016, "learning_rate": 1.7825825104963398e-05, "loss": 0.2436, "step": 1504 }, { "epoch": 0.712636874815034, "grad_norm": 1.7096036672592163, "learning_rate": 1.782264031369582e-05, "loss": 0.2551, "step": 1505 }, { "epoch": 0.7131103876886653, "grad_norm": 1.8939051628112793, "learning_rate": 1.7819453476492136e-05, "loss": 0.2482, "step": 1506 }, { "epoch": 0.7135839005622966, "grad_norm": 1.7977244853973389, "learning_rate": 1.7816264594185826e-05, "loss": 0.2449, "step": 1507 }, { "epoch": 0.7140574134359278, "grad_norm": 1.4691609144210815, "learning_rate": 1.781307366761092e-05, "loss": 0.2438, "step": 1508 }, { "epoch": 0.7145309263095591, "grad_norm": 1.6823903322219849, "learning_rate": 1.7809880697601965e-05, "loss": 0.2588, "step": 1509 }, { "epoch": 0.7150044391831903, "grad_norm": 1.1935667991638184, "learning_rate": 1.7806685684994063e-05, "loss": 0.2381, "step": 1510 }, { "epoch": 0.7154779520568215, "grad_norm": 1.196661114692688, "learning_rate": 1.7803488630622833e-05, "loss": 0.2411, "step": 1511 }, { "epoch": 0.7159514649304528, "grad_norm": 1.9287184476852417, "learning_rate": 1.7800289535324426e-05, "loss": 0.2545, "step": 1512 }, { "epoch": 0.7164249778040841, "grad_norm": 2.1813395023345947, "learning_rate": 1.7797088399935547e-05, "loss": 0.2496, "step": 1513 }, { "epoch": 0.7168984906777153, "grad_norm": 1.9686847925186157, "learning_rate": 1.7793885225293418e-05, "loss": 0.2352, "step": 1514 }, { "epoch": 0.7173720035513466, "grad_norm": 1.5410264730453491, "learning_rate": 1.779068001223579e-05, "loss": 0.2156, "step": 1515 }, { "epoch": 0.7178455164249778, "grad_norm": 1.2590621709823608, "learning_rate": 1.7787472761600973e-05, "loss": 0.2278, "step": 1516 }, { "epoch": 0.718319029298609, "grad_norm": 1.5641133785247803, "learning_rate": 1.7784263474227774e-05, "loss": 0.2149, "step": 1517 }, { "epoch": 0.7187925421722403, "grad_norm": 1.5322697162628174, "learning_rate": 1.7781052150955566e-05, "loss": 0.2337, "step": 1518 }, { "epoch": 0.7192660550458716, "grad_norm": 1.4889428615570068, "learning_rate": 1.7777838792624228e-05, "loss": 0.2447, "step": 1519 }, { "epoch": 0.7197395679195028, "grad_norm": 2.111682176589966, "learning_rate": 1.777462340007419e-05, "loss": 0.2279, "step": 1520 }, { "epoch": 0.7202130807931341, "grad_norm": 1.946251630783081, "learning_rate": 1.7771405974146403e-05, "loss": 0.2399, "step": 1521 }, { "epoch": 0.7206865936667654, "grad_norm": 1.0815329551696777, "learning_rate": 1.776818651568236e-05, "loss": 0.2292, "step": 1522 }, { "epoch": 0.7211601065403965, "grad_norm": 1.4069883823394775, "learning_rate": 1.7764965025524072e-05, "loss": 0.2677, "step": 1523 }, { "epoch": 0.7216336194140278, "grad_norm": 1.227126121520996, "learning_rate": 1.776174150451409e-05, "loss": 0.2468, "step": 1524 }, { "epoch": 0.7221071322876591, "grad_norm": 1.791418433189392, "learning_rate": 1.7758515953495496e-05, "loss": 0.2335, "step": 1525 }, { "epoch": 0.7225806451612903, "grad_norm": 2.3596413135528564, "learning_rate": 1.7755288373311906e-05, "loss": 0.2589, "step": 1526 }, { "epoch": 0.7230541580349216, "grad_norm": 1.5666567087173462, "learning_rate": 1.7752058764807455e-05, "loss": 0.2381, "step": 1527 }, { "epoch": 0.7235276709085529, "grad_norm": 1.1688320636749268, "learning_rate": 1.7748827128826822e-05, "loss": 0.2469, "step": 1528 }, { "epoch": 0.724001183782184, "grad_norm": 1.2592840194702148, "learning_rate": 1.7745593466215204e-05, "loss": 0.224, "step": 1529 }, { "epoch": 0.7244746966558153, "grad_norm": 1.0283125638961792, "learning_rate": 1.774235777781834e-05, "loss": 0.2054, "step": 1530 }, { "epoch": 0.7249482095294466, "grad_norm": 1.7181527614593506, "learning_rate": 1.7739120064482493e-05, "loss": 0.2394, "step": 1531 }, { "epoch": 0.7254217224030778, "grad_norm": 1.6182515621185303, "learning_rate": 1.7735880327054453e-05, "loss": 0.2225, "step": 1532 }, { "epoch": 0.7258952352767091, "grad_norm": 1.1025639772415161, "learning_rate": 1.7732638566381544e-05, "loss": 0.2413, "step": 1533 }, { "epoch": 0.7263687481503404, "grad_norm": 1.6946557760238647, "learning_rate": 1.7729394783311614e-05, "loss": 0.2424, "step": 1534 }, { "epoch": 0.7268422610239715, "grad_norm": 1.2723066806793213, "learning_rate": 1.7726148978693046e-05, "loss": 0.2264, "step": 1535 }, { "epoch": 0.7273157738976028, "grad_norm": 1.2535463571548462, "learning_rate": 1.7722901153374748e-05, "loss": 0.2516, "step": 1536 }, { "epoch": 0.7277892867712341, "grad_norm": 1.6399126052856445, "learning_rate": 1.7719651308206157e-05, "loss": 0.237, "step": 1537 }, { "epoch": 0.7282627996448654, "grad_norm": 1.1695709228515625, "learning_rate": 1.771639944403724e-05, "loss": 0.2125, "step": 1538 }, { "epoch": 0.7287363125184966, "grad_norm": 1.4084415435791016, "learning_rate": 1.7713145561718486e-05, "loss": 0.2153, "step": 1539 }, { "epoch": 0.7292098253921279, "grad_norm": 1.2717535495758057, "learning_rate": 1.7709889662100926e-05, "loss": 0.2599, "step": 1540 }, { "epoch": 0.7296833382657592, "grad_norm": 1.3030693531036377, "learning_rate": 1.77066317460361e-05, "loss": 0.2262, "step": 1541 }, { "epoch": 0.7301568511393903, "grad_norm": 1.2973533868789673, "learning_rate": 1.7703371814376088e-05, "loss": 0.2526, "step": 1542 }, { "epoch": 0.7306303640130216, "grad_norm": 1.3349380493164062, "learning_rate": 1.7700109867973494e-05, "loss": 0.2436, "step": 1543 }, { "epoch": 0.7311038768866529, "grad_norm": 1.424356460571289, "learning_rate": 1.769684590768145e-05, "loss": 0.2335, "step": 1544 }, { "epoch": 0.7315773897602841, "grad_norm": 1.7350075244903564, "learning_rate": 1.769357993435361e-05, "loss": 0.2501, "step": 1545 }, { "epoch": 0.7320509026339154, "grad_norm": 1.5108102560043335, "learning_rate": 1.7690311948844162e-05, "loss": 0.2606, "step": 1546 }, { "epoch": 0.7325244155075467, "grad_norm": 1.3136991262435913, "learning_rate": 1.768704195200781e-05, "loss": 0.217, "step": 1547 }, { "epoch": 0.7329979283811778, "grad_norm": 1.9467602968215942, "learning_rate": 1.7683769944699793e-05, "loss": 0.245, "step": 1548 }, { "epoch": 0.7334714412548091, "grad_norm": 1.9547033309936523, "learning_rate": 1.768049592777588e-05, "loss": 0.2583, "step": 1549 }, { "epoch": 0.7339449541284404, "grad_norm": 1.5787625312805176, "learning_rate": 1.7677219902092345e-05, "loss": 0.2554, "step": 1550 }, { "epoch": 0.7344184670020716, "grad_norm": 1.423520803451538, "learning_rate": 1.7673941868506014e-05, "loss": 0.2397, "step": 1551 }, { "epoch": 0.7348919798757029, "grad_norm": 2.1277055740356445, "learning_rate": 1.7670661827874217e-05, "loss": 0.2384, "step": 1552 }, { "epoch": 0.7353654927493342, "grad_norm": 1.7148913145065308, "learning_rate": 1.7667379781054816e-05, "loss": 0.2562, "step": 1553 }, { "epoch": 0.7358390056229653, "grad_norm": 1.5137001276016235, "learning_rate": 1.7664095728906202e-05, "loss": 0.2452, "step": 1554 }, { "epoch": 0.7363125184965966, "grad_norm": 1.6710152626037598, "learning_rate": 1.766080967228729e-05, "loss": 0.2281, "step": 1555 }, { "epoch": 0.7367860313702279, "grad_norm": 1.5264017581939697, "learning_rate": 1.7657521612057513e-05, "loss": 0.2265, "step": 1556 }, { "epoch": 0.7372595442438591, "grad_norm": 2.8512990474700928, "learning_rate": 1.765423154907683e-05, "loss": 0.2342, "step": 1557 }, { "epoch": 0.7377330571174904, "grad_norm": 1.2972322702407837, "learning_rate": 1.7650939484205728e-05, "loss": 0.2237, "step": 1558 }, { "epoch": 0.7382065699911217, "grad_norm": 1.2768687009811401, "learning_rate": 1.7647645418305215e-05, "loss": 0.2285, "step": 1559 }, { "epoch": 0.7386800828647528, "grad_norm": 0.9737210273742676, "learning_rate": 1.7644349352236822e-05, "loss": 0.2494, "step": 1560 }, { "epoch": 0.7391535957383841, "grad_norm": 1.2401894330978394, "learning_rate": 1.7641051286862597e-05, "loss": 0.263, "step": 1561 }, { "epoch": 0.7396271086120154, "grad_norm": 1.2521781921386719, "learning_rate": 1.763775122304513e-05, "loss": 0.2624, "step": 1562 }, { "epoch": 0.7401006214856466, "grad_norm": 1.5043491125106812, "learning_rate": 1.7634449161647506e-05, "loss": 0.2325, "step": 1563 }, { "epoch": 0.7405741343592779, "grad_norm": 1.1127467155456543, "learning_rate": 1.7631145103533357e-05, "loss": 0.2532, "step": 1564 }, { "epoch": 0.7410476472329092, "grad_norm": 1.3029921054840088, "learning_rate": 1.7627839049566827e-05, "loss": 0.2445, "step": 1565 }, { "epoch": 0.7415211601065403, "grad_norm": 1.4428067207336426, "learning_rate": 1.762453100061258e-05, "loss": 0.2123, "step": 1566 }, { "epoch": 0.7419946729801716, "grad_norm": 1.1127543449401855, "learning_rate": 1.76212209575358e-05, "loss": 0.2265, "step": 1567 }, { "epoch": 0.7424681858538029, "grad_norm": 0.9954766631126404, "learning_rate": 1.761790892120221e-05, "loss": 0.2089, "step": 1568 }, { "epoch": 0.7429416987274341, "grad_norm": 1.198296070098877, "learning_rate": 1.7614594892478026e-05, "loss": 0.2272, "step": 1569 }, { "epoch": 0.7434152116010654, "grad_norm": 1.9979981184005737, "learning_rate": 1.761127887223001e-05, "loss": 0.2511, "step": 1570 }, { "epoch": 0.7438887244746967, "grad_norm": 1.982858657836914, "learning_rate": 1.7607960861325434e-05, "loss": 0.2372, "step": 1571 }, { "epoch": 0.744362237348328, "grad_norm": 1.2229499816894531, "learning_rate": 1.760464086063209e-05, "loss": 0.2101, "step": 1572 }, { "epoch": 0.7448357502219591, "grad_norm": 1.25929856300354, "learning_rate": 1.760131887101829e-05, "loss": 0.27, "step": 1573 }, { "epoch": 0.7453092630955904, "grad_norm": 1.0544230937957764, "learning_rate": 1.7597994893352873e-05, "loss": 0.2076, "step": 1574 }, { "epoch": 0.7457827759692217, "grad_norm": 1.874146819114685, "learning_rate": 1.7594668928505186e-05, "loss": 0.2267, "step": 1575 }, { "epoch": 0.7462562888428529, "grad_norm": 1.3984766006469727, "learning_rate": 1.7591340977345112e-05, "loss": 0.218, "step": 1576 }, { "epoch": 0.7467298017164842, "grad_norm": 1.5700955390930176, "learning_rate": 1.758801104074304e-05, "loss": 0.2278, "step": 1577 }, { "epoch": 0.7472033145901155, "grad_norm": 1.9291844367980957, "learning_rate": 1.7584679119569882e-05, "loss": 0.2446, "step": 1578 }, { "epoch": 0.7476768274637466, "grad_norm": 1.6862872838974, "learning_rate": 1.758134521469707e-05, "loss": 0.2461, "step": 1579 }, { "epoch": 0.7481503403373779, "grad_norm": 2.24984073638916, "learning_rate": 1.7578009326996556e-05, "loss": 0.2702, "step": 1580 }, { "epoch": 0.7486238532110092, "grad_norm": 1.163398265838623, "learning_rate": 1.757467145734081e-05, "loss": 0.2349, "step": 1581 }, { "epoch": 0.7490973660846404, "grad_norm": 1.2229551076889038, "learning_rate": 1.757133160660282e-05, "loss": 0.2439, "step": 1582 }, { "epoch": 0.7495708789582717, "grad_norm": 1.2336183786392212, "learning_rate": 1.7567989775656088e-05, "loss": 0.2629, "step": 1583 }, { "epoch": 0.750044391831903, "grad_norm": 1.5344003438949585, "learning_rate": 1.756464596537464e-05, "loss": 0.2362, "step": 1584 }, { "epoch": 0.7505179047055341, "grad_norm": 1.0532424449920654, "learning_rate": 1.7561300176633015e-05, "loss": 0.2238, "step": 1585 }, { "epoch": 0.7509914175791654, "grad_norm": 1.153308391571045, "learning_rate": 1.755795241030628e-05, "loss": 0.2429, "step": 1586 }, { "epoch": 0.7514649304527967, "grad_norm": 1.3348098993301392, "learning_rate": 1.755460266727e-05, "loss": 0.2435, "step": 1587 }, { "epoch": 0.7519384433264279, "grad_norm": 1.3655999898910522, "learning_rate": 1.7551250948400273e-05, "loss": 0.2197, "step": 1588 }, { "epoch": 0.7524119562000592, "grad_norm": 2.0334396362304688, "learning_rate": 1.754789725457371e-05, "loss": 0.2142, "step": 1589 }, { "epoch": 0.7528854690736905, "grad_norm": 1.0390914678573608, "learning_rate": 1.754454158666744e-05, "loss": 0.2386, "step": 1590 }, { "epoch": 0.7533589819473216, "grad_norm": 1.4241347312927246, "learning_rate": 1.7541183945559095e-05, "loss": 0.2464, "step": 1591 }, { "epoch": 0.7538324948209529, "grad_norm": 1.2554645538330078, "learning_rate": 1.7537824332126842e-05, "loss": 0.2298, "step": 1592 }, { "epoch": 0.7543060076945842, "grad_norm": 1.4590020179748535, "learning_rate": 1.753446274724935e-05, "loss": 0.2441, "step": 1593 }, { "epoch": 0.7547795205682154, "grad_norm": 1.2696107625961304, "learning_rate": 1.753109919180582e-05, "loss": 0.2478, "step": 1594 }, { "epoch": 0.7552530334418467, "grad_norm": 1.4026949405670166, "learning_rate": 1.7527733666675945e-05, "loss": 0.256, "step": 1595 }, { "epoch": 0.755726546315478, "grad_norm": 1.0931724309921265, "learning_rate": 1.7524366172739954e-05, "loss": 0.2363, "step": 1596 }, { "epoch": 0.7562000591891092, "grad_norm": 1.0928606986999512, "learning_rate": 1.7520996710878577e-05, "loss": 0.2427, "step": 1597 }, { "epoch": 0.7566735720627404, "grad_norm": 1.247383713722229, "learning_rate": 1.751762528197307e-05, "loss": 0.2468, "step": 1598 }, { "epoch": 0.7571470849363717, "grad_norm": 1.1337532997131348, "learning_rate": 1.7514251886905192e-05, "loss": 0.2321, "step": 1599 }, { "epoch": 0.7576205978100029, "grad_norm": 1.2659448385238647, "learning_rate": 1.7510876526557225e-05, "loss": 0.2439, "step": 1600 }, { "epoch": 0.7580941106836342, "grad_norm": 1.3613624572753906, "learning_rate": 1.7507499201811958e-05, "loss": 0.233, "step": 1601 }, { "epoch": 0.7585676235572655, "grad_norm": 1.2028915882110596, "learning_rate": 1.7504119913552707e-05, "loss": 0.2176, "step": 1602 }, { "epoch": 0.7590411364308968, "grad_norm": 1.3185116052627563, "learning_rate": 1.750073866266328e-05, "loss": 0.2633, "step": 1603 }, { "epoch": 0.759514649304528, "grad_norm": 1.093675971031189, "learning_rate": 1.749735545002802e-05, "loss": 0.2641, "step": 1604 }, { "epoch": 0.7599881621781592, "grad_norm": 1.3118257522583008, "learning_rate": 1.7493970276531768e-05, "loss": 0.2769, "step": 1605 }, { "epoch": 0.7604616750517905, "grad_norm": 1.8869200944900513, "learning_rate": 1.7490583143059885e-05, "loss": 0.247, "step": 1606 }, { "epoch": 0.7609351879254217, "grad_norm": 1.1935513019561768, "learning_rate": 1.748719405049825e-05, "loss": 0.2098, "step": 1607 }, { "epoch": 0.761408700799053, "grad_norm": 1.2408536672592163, "learning_rate": 1.7483802999733237e-05, "loss": 0.227, "step": 1608 }, { "epoch": 0.7618822136726843, "grad_norm": 1.9241235256195068, "learning_rate": 1.748040999165175e-05, "loss": 0.2486, "step": 1609 }, { "epoch": 0.7623557265463154, "grad_norm": 2.5321922302246094, "learning_rate": 1.7477015027141192e-05, "loss": 0.2458, "step": 1610 }, { "epoch": 0.7628292394199467, "grad_norm": 1.3591722249984741, "learning_rate": 1.7473618107089482e-05, "loss": 0.2064, "step": 1611 }, { "epoch": 0.763302752293578, "grad_norm": 1.9121155738830566, "learning_rate": 1.747021923238506e-05, "loss": 0.2565, "step": 1612 }, { "epoch": 0.7637762651672092, "grad_norm": 1.7858729362487793, "learning_rate": 1.7466818403916862e-05, "loss": 0.2642, "step": 1613 }, { "epoch": 0.7642497780408405, "grad_norm": 1.5614818334579468, "learning_rate": 1.7463415622574346e-05, "loss": 0.2363, "step": 1614 }, { "epoch": 0.7647232909144718, "grad_norm": 1.199522852897644, "learning_rate": 1.7460010889247473e-05, "loss": 0.2525, "step": 1615 }, { "epoch": 0.765196803788103, "grad_norm": 1.5608481168746948, "learning_rate": 1.7456604204826725e-05, "loss": 0.2218, "step": 1616 }, { "epoch": 0.7656703166617342, "grad_norm": 2.3330767154693604, "learning_rate": 1.7453195570203075e-05, "loss": 0.2299, "step": 1617 }, { "epoch": 0.7661438295353655, "grad_norm": 2.918964385986328, "learning_rate": 1.7449784986268033e-05, "loss": 0.2124, "step": 1618 }, { "epoch": 0.7666173424089967, "grad_norm": 2.289975643157959, "learning_rate": 1.7446372453913592e-05, "loss": 0.2549, "step": 1619 }, { "epoch": 0.767090855282628, "grad_norm": 1.3809151649475098, "learning_rate": 1.7442957974032274e-05, "loss": 0.2461, "step": 1620 }, { "epoch": 0.7675643681562593, "grad_norm": 1.484054446220398, "learning_rate": 1.74395415475171e-05, "loss": 0.2427, "step": 1621 }, { "epoch": 0.7680378810298905, "grad_norm": 1.378715991973877, "learning_rate": 1.7436123175261607e-05, "loss": 0.2572, "step": 1622 }, { "epoch": 0.7685113939035217, "grad_norm": 1.168416976928711, "learning_rate": 1.7432702858159835e-05, "loss": 0.2395, "step": 1623 }, { "epoch": 0.768984906777153, "grad_norm": 1.0436391830444336, "learning_rate": 1.742928059710633e-05, "loss": 0.2144, "step": 1624 }, { "epoch": 0.7694584196507842, "grad_norm": 2.4827699661254883, "learning_rate": 1.742585639299616e-05, "loss": 0.2408, "step": 1625 }, { "epoch": 0.7699319325244155, "grad_norm": 2.694228410720825, "learning_rate": 1.7422430246724892e-05, "loss": 0.2193, "step": 1626 }, { "epoch": 0.7704054453980468, "grad_norm": 1.7454769611358643, "learning_rate": 1.7419002159188593e-05, "loss": 0.2328, "step": 1627 }, { "epoch": 0.770878958271678, "grad_norm": 2.133936643600464, "learning_rate": 1.7415572131283856e-05, "loss": 0.2401, "step": 1628 }, { "epoch": 0.7713524711453092, "grad_norm": 1.3482989072799683, "learning_rate": 1.7412140163907765e-05, "loss": 0.2338, "step": 1629 }, { "epoch": 0.7718259840189405, "grad_norm": 1.8004233837127686, "learning_rate": 1.7408706257957922e-05, "loss": 0.2271, "step": 1630 }, { "epoch": 0.7722994968925717, "grad_norm": 1.835065484046936, "learning_rate": 1.740527041433243e-05, "loss": 0.2661, "step": 1631 }, { "epoch": 0.772773009766203, "grad_norm": 1.9195033311843872, "learning_rate": 1.7401832633929897e-05, "loss": 0.2482, "step": 1632 }, { "epoch": 0.7732465226398343, "grad_norm": 1.346403956413269, "learning_rate": 1.7398392917649448e-05, "loss": 0.2437, "step": 1633 }, { "epoch": 0.7737200355134656, "grad_norm": 1.3023998737335205, "learning_rate": 1.7394951266390708e-05, "loss": 0.2476, "step": 1634 }, { "epoch": 0.7741935483870968, "grad_norm": 1.6358240842819214, "learning_rate": 1.7391507681053802e-05, "loss": 0.2367, "step": 1635 }, { "epoch": 0.774667061260728, "grad_norm": 1.2242166996002197, "learning_rate": 1.7388062162539368e-05, "loss": 0.2527, "step": 1636 }, { "epoch": 0.7751405741343593, "grad_norm": 1.1994009017944336, "learning_rate": 1.738461471174855e-05, "loss": 0.2417, "step": 1637 }, { "epoch": 0.7756140870079905, "grad_norm": 1.6394872665405273, "learning_rate": 1.7381165329582996e-05, "loss": 0.2193, "step": 1638 }, { "epoch": 0.7760875998816218, "grad_norm": 1.1021173000335693, "learning_rate": 1.7377714016944856e-05, "loss": 0.2529, "step": 1639 }, { "epoch": 0.7765611127552531, "grad_norm": 0.9502620697021484, "learning_rate": 1.737426077473679e-05, "loss": 0.2347, "step": 1640 }, { "epoch": 0.7770346256288843, "grad_norm": 1.5584297180175781, "learning_rate": 1.737080560386196e-05, "loss": 0.2619, "step": 1641 }, { "epoch": 0.7775081385025155, "grad_norm": 1.0773392915725708, "learning_rate": 1.736734850522403e-05, "loss": 0.2445, "step": 1642 }, { "epoch": 0.7779816513761468, "grad_norm": 0.9587130546569824, "learning_rate": 1.736388947972717e-05, "loss": 0.2168, "step": 1643 }, { "epoch": 0.778455164249778, "grad_norm": 1.201387882232666, "learning_rate": 1.7360428528276062e-05, "loss": 0.2421, "step": 1644 }, { "epoch": 0.7789286771234093, "grad_norm": 1.5380560159683228, "learning_rate": 1.735696565177588e-05, "loss": 0.2278, "step": 1645 }, { "epoch": 0.7794021899970406, "grad_norm": 1.150850534439087, "learning_rate": 1.7353500851132305e-05, "loss": 0.2434, "step": 1646 }, { "epoch": 0.7798757028706718, "grad_norm": 1.1250213384628296, "learning_rate": 1.735003412725152e-05, "loss": 0.2712, "step": 1647 }, { "epoch": 0.780349215744303, "grad_norm": 1.428877592086792, "learning_rate": 1.7346565481040218e-05, "loss": 0.2397, "step": 1648 }, { "epoch": 0.7808227286179343, "grad_norm": 1.245980978012085, "learning_rate": 1.734309491340559e-05, "loss": 0.2259, "step": 1649 }, { "epoch": 0.7812962414915655, "grad_norm": 1.2803640365600586, "learning_rate": 1.7339622425255323e-05, "loss": 0.224, "step": 1650 }, { "epoch": 0.7817697543651968, "grad_norm": 1.3713748455047607, "learning_rate": 1.7336148017497617e-05, "loss": 0.2288, "step": 1651 }, { "epoch": 0.7822432672388281, "grad_norm": 1.4782074689865112, "learning_rate": 1.7332671691041173e-05, "loss": 0.2334, "step": 1652 }, { "epoch": 0.7827167801124593, "grad_norm": 1.6904542446136475, "learning_rate": 1.7329193446795186e-05, "loss": 0.2539, "step": 1653 }, { "epoch": 0.7831902929860906, "grad_norm": 2.416238784790039, "learning_rate": 1.732571328566936e-05, "loss": 0.2279, "step": 1654 }, { "epoch": 0.7836638058597218, "grad_norm": 2.9311797618865967, "learning_rate": 1.7322231208573897e-05, "loss": 0.2518, "step": 1655 }, { "epoch": 0.784137318733353, "grad_norm": 1.2711611986160278, "learning_rate": 1.7318747216419502e-05, "loss": 0.2443, "step": 1656 }, { "epoch": 0.7846108316069843, "grad_norm": 1.742221713066101, "learning_rate": 1.7315261310117376e-05, "loss": 0.2431, "step": 1657 }, { "epoch": 0.7850843444806156, "grad_norm": 1.3446333408355713, "learning_rate": 1.7311773490579225e-05, "loss": 0.2241, "step": 1658 }, { "epoch": 0.7855578573542468, "grad_norm": 1.3194044828414917, "learning_rate": 1.7308283758717255e-05, "loss": 0.2399, "step": 1659 }, { "epoch": 0.7860313702278781, "grad_norm": 1.2633658647537231, "learning_rate": 1.7304792115444172e-05, "loss": 0.2336, "step": 1660 }, { "epoch": 0.7865048831015093, "grad_norm": 1.167573094367981, "learning_rate": 1.7301298561673186e-05, "loss": 0.2337, "step": 1661 }, { "epoch": 0.7869783959751405, "grad_norm": 1.2154494524002075, "learning_rate": 1.7297803098317995e-05, "loss": 0.2571, "step": 1662 }, { "epoch": 0.7874519088487718, "grad_norm": 1.3965184688568115, "learning_rate": 1.7294305726292804e-05, "loss": 0.2521, "step": 1663 }, { "epoch": 0.7879254217224031, "grad_norm": 1.1656535863876343, "learning_rate": 1.7290806446512324e-05, "loss": 0.2134, "step": 1664 }, { "epoch": 0.7883989345960344, "grad_norm": 1.8412251472473145, "learning_rate": 1.728730525989175e-05, "loss": 0.2219, "step": 1665 }, { "epoch": 0.7888724474696656, "grad_norm": 2.7580809593200684, "learning_rate": 1.7283802167346793e-05, "loss": 0.2484, "step": 1666 }, { "epoch": 0.7893459603432968, "grad_norm": 1.3492392301559448, "learning_rate": 1.7280297169793643e-05, "loss": 0.2292, "step": 1667 }, { "epoch": 0.7898194732169281, "grad_norm": 1.6488065719604492, "learning_rate": 1.7276790268149e-05, "loss": 0.2416, "step": 1668 }, { "epoch": 0.7902929860905593, "grad_norm": 1.1698122024536133, "learning_rate": 1.7273281463330066e-05, "loss": 0.2375, "step": 1669 }, { "epoch": 0.7907664989641906, "grad_norm": 1.7447906732559204, "learning_rate": 1.7269770756254532e-05, "loss": 0.257, "step": 1670 }, { "epoch": 0.7912400118378219, "grad_norm": 1.752770185470581, "learning_rate": 1.726625814784059e-05, "loss": 0.2087, "step": 1671 }, { "epoch": 0.7917135247114531, "grad_norm": 1.0216140747070312, "learning_rate": 1.7262743639006928e-05, "loss": 0.2084, "step": 1672 }, { "epoch": 0.7921870375850844, "grad_norm": 1.923169493675232, "learning_rate": 1.7259227230672733e-05, "loss": 0.231, "step": 1673 }, { "epoch": 0.7926605504587156, "grad_norm": 1.4375102519989014, "learning_rate": 1.7255708923757683e-05, "loss": 0.2381, "step": 1674 }, { "epoch": 0.7931340633323468, "grad_norm": 1.4559400081634521, "learning_rate": 1.7252188719181965e-05, "loss": 0.2328, "step": 1675 }, { "epoch": 0.7936075762059781, "grad_norm": 1.6422553062438965, "learning_rate": 1.724866661786625e-05, "loss": 0.2457, "step": 1676 }, { "epoch": 0.7940810890796094, "grad_norm": 1.408460259437561, "learning_rate": 1.7245142620731707e-05, "loss": 0.2479, "step": 1677 }, { "epoch": 0.7945546019532406, "grad_norm": 1.476257562637329, "learning_rate": 1.724161672870001e-05, "loss": 0.2193, "step": 1678 }, { "epoch": 0.7950281148268719, "grad_norm": 1.108467698097229, "learning_rate": 1.7238088942693315e-05, "loss": 0.2265, "step": 1679 }, { "epoch": 0.7955016277005031, "grad_norm": 1.2234896421432495, "learning_rate": 1.723455926363429e-05, "loss": 0.2383, "step": 1680 }, { "epoch": 0.7959751405741343, "grad_norm": 0.9792563915252686, "learning_rate": 1.723102769244608e-05, "loss": 0.2288, "step": 1681 }, { "epoch": 0.7964486534477656, "grad_norm": 1.273815393447876, "learning_rate": 1.7227494230052337e-05, "loss": 0.2287, "step": 1682 }, { "epoch": 0.7969221663213969, "grad_norm": 1.5027164220809937, "learning_rate": 1.72239588773772e-05, "loss": 0.2635, "step": 1683 }, { "epoch": 0.7973956791950281, "grad_norm": 1.1177582740783691, "learning_rate": 1.7220421635345312e-05, "loss": 0.2351, "step": 1684 }, { "epoch": 0.7978691920686594, "grad_norm": 1.341367483139038, "learning_rate": 1.7216882504881796e-05, "loss": 0.2124, "step": 1685 }, { "epoch": 0.7983427049422906, "grad_norm": 1.2859315872192383, "learning_rate": 1.721334148691229e-05, "loss": 0.2186, "step": 1686 }, { "epoch": 0.7988162178159218, "grad_norm": 1.8942023515701294, "learning_rate": 1.7209798582362904e-05, "loss": 0.2746, "step": 1687 }, { "epoch": 0.7992897306895531, "grad_norm": 1.0868715047836304, "learning_rate": 1.7206253792160255e-05, "loss": 0.238, "step": 1688 }, { "epoch": 0.7997632435631844, "grad_norm": 1.7027019262313843, "learning_rate": 1.7202707117231443e-05, "loss": 0.2381, "step": 1689 }, { "epoch": 0.8002367564368156, "grad_norm": 1.0921725034713745, "learning_rate": 1.7199158558504074e-05, "loss": 0.2556, "step": 1690 }, { "epoch": 0.8007102693104469, "grad_norm": 1.053194522857666, "learning_rate": 1.7195608116906234e-05, "loss": 0.2267, "step": 1691 }, { "epoch": 0.8011837821840782, "grad_norm": 1.3293603658676147, "learning_rate": 1.7192055793366506e-05, "loss": 0.2312, "step": 1692 }, { "epoch": 0.8016572950577093, "grad_norm": 1.9301906824111938, "learning_rate": 1.718850158881397e-05, "loss": 0.2198, "step": 1693 }, { "epoch": 0.8021308079313406, "grad_norm": 1.5638904571533203, "learning_rate": 1.7184945504178193e-05, "loss": 0.2624, "step": 1694 }, { "epoch": 0.8026043208049719, "grad_norm": 1.715846061706543, "learning_rate": 1.7181387540389235e-05, "loss": 0.2195, "step": 1695 }, { "epoch": 0.8030778336786032, "grad_norm": 1.0800182819366455, "learning_rate": 1.7177827698377646e-05, "loss": 0.2419, "step": 1696 }, { "epoch": 0.8035513465522344, "grad_norm": 1.205739974975586, "learning_rate": 1.7174265979074464e-05, "loss": 0.2211, "step": 1697 }, { "epoch": 0.8040248594258657, "grad_norm": 1.1386871337890625, "learning_rate": 1.717070238341123e-05, "loss": 0.2216, "step": 1698 }, { "epoch": 0.8044983722994969, "grad_norm": 1.242944598197937, "learning_rate": 1.7167136912319962e-05, "loss": 0.2197, "step": 1699 }, { "epoch": 0.8049718851731281, "grad_norm": 1.081717610359192, "learning_rate": 1.716356956673318e-05, "loss": 0.2377, "step": 1700 }, { "epoch": 0.8054453980467594, "grad_norm": 1.5992521047592163, "learning_rate": 1.7160000347583885e-05, "loss": 0.2345, "step": 1701 }, { "epoch": 0.8059189109203907, "grad_norm": 1.3137495517730713, "learning_rate": 1.715642925580557e-05, "loss": 0.2322, "step": 1702 }, { "epoch": 0.8063924237940219, "grad_norm": 1.910725474357605, "learning_rate": 1.7152856292332225e-05, "loss": 0.2487, "step": 1703 }, { "epoch": 0.8068659366676532, "grad_norm": 1.408942699432373, "learning_rate": 1.7149281458098325e-05, "loss": 0.2556, "step": 1704 }, { "epoch": 0.8073394495412844, "grad_norm": 1.5198936462402344, "learning_rate": 1.7145704754038825e-05, "loss": 0.2308, "step": 1705 }, { "epoch": 0.8078129624149156, "grad_norm": 1.0505871772766113, "learning_rate": 1.7142126181089184e-05, "loss": 0.2366, "step": 1706 }, { "epoch": 0.8082864752885469, "grad_norm": 1.4777036905288696, "learning_rate": 1.713854574018534e-05, "loss": 0.2297, "step": 1707 }, { "epoch": 0.8087599881621782, "grad_norm": 1.320868730545044, "learning_rate": 1.7134963432263725e-05, "loss": 0.2222, "step": 1708 }, { "epoch": 0.8092335010358094, "grad_norm": 1.3242131471633911, "learning_rate": 1.713137925826126e-05, "loss": 0.2458, "step": 1709 }, { "epoch": 0.8097070139094407, "grad_norm": 1.8690855503082275, "learning_rate": 1.7127793219115345e-05, "loss": 0.2241, "step": 1710 }, { "epoch": 0.810180526783072, "grad_norm": 1.4543685913085938, "learning_rate": 1.7124205315763876e-05, "loss": 0.2369, "step": 1711 }, { "epoch": 0.8106540396567031, "grad_norm": 1.1340458393096924, "learning_rate": 1.7120615549145234e-05, "loss": 0.2446, "step": 1712 }, { "epoch": 0.8111275525303344, "grad_norm": 2.3450067043304443, "learning_rate": 1.711702392019829e-05, "loss": 0.2507, "step": 1713 }, { "epoch": 0.8116010654039657, "grad_norm": 1.5409314632415771, "learning_rate": 1.71134304298624e-05, "loss": 0.213, "step": 1714 }, { "epoch": 0.8120745782775969, "grad_norm": 1.5783090591430664, "learning_rate": 1.7109835079077406e-05, "loss": 0.2278, "step": 1715 }, { "epoch": 0.8125480911512282, "grad_norm": 1.2138385772705078, "learning_rate": 1.7106237868783635e-05, "loss": 0.2228, "step": 1716 }, { "epoch": 0.8130216040248595, "grad_norm": 1.4703527688980103, "learning_rate": 1.710263879992191e-05, "loss": 0.2285, "step": 1717 }, { "epoch": 0.8134951168984906, "grad_norm": 1.418495774269104, "learning_rate": 1.709903787343352e-05, "loss": 0.2528, "step": 1718 }, { "epoch": 0.8139686297721219, "grad_norm": 2.0826809406280518, "learning_rate": 1.7095435090260263e-05, "loss": 0.2252, "step": 1719 }, { "epoch": 0.8144421426457532, "grad_norm": 1.1822896003723145, "learning_rate": 1.7091830451344406e-05, "loss": 0.2336, "step": 1720 }, { "epoch": 0.8149156555193844, "grad_norm": 2.3410282135009766, "learning_rate": 1.7088223957628714e-05, "loss": 0.2513, "step": 1721 }, { "epoch": 0.8153891683930157, "grad_norm": 0.9941398501396179, "learning_rate": 1.708461561005643e-05, "loss": 0.2511, "step": 1722 }, { "epoch": 0.815862681266647, "grad_norm": 1.2086396217346191, "learning_rate": 1.708100540957127e-05, "loss": 0.2535, "step": 1723 }, { "epoch": 0.8163361941402781, "grad_norm": 1.3249119520187378, "learning_rate": 1.7077393357117467e-05, "loss": 0.252, "step": 1724 }, { "epoch": 0.8168097070139094, "grad_norm": 1.0457037687301636, "learning_rate": 1.70737794536397e-05, "loss": 0.2265, "step": 1725 }, { "epoch": 0.8172832198875407, "grad_norm": 1.7129755020141602, "learning_rate": 1.7070163700083163e-05, "loss": 0.2345, "step": 1726 }, { "epoch": 0.8177567327611719, "grad_norm": 1.0255284309387207, "learning_rate": 1.7066546097393518e-05, "loss": 0.2303, "step": 1727 }, { "epoch": 0.8182302456348032, "grad_norm": 1.2299504280090332, "learning_rate": 1.7062926646516915e-05, "loss": 0.2454, "step": 1728 }, { "epoch": 0.8187037585084345, "grad_norm": 1.1177610158920288, "learning_rate": 1.705930534839998e-05, "loss": 0.2478, "step": 1729 }, { "epoch": 0.8191772713820658, "grad_norm": 1.1740856170654297, "learning_rate": 1.7055682203989838e-05, "loss": 0.2253, "step": 1730 }, { "epoch": 0.8196507842556969, "grad_norm": 2.3321869373321533, "learning_rate": 1.705205721423408e-05, "loss": 0.2168, "step": 1731 }, { "epoch": 0.8201242971293282, "grad_norm": 1.5028231143951416, "learning_rate": 1.7048430380080793e-05, "loss": 0.2292, "step": 1732 }, { "epoch": 0.8205978100029595, "grad_norm": 1.116110920906067, "learning_rate": 1.7044801702478534e-05, "loss": 0.2118, "step": 1733 }, { "epoch": 0.8210713228765907, "grad_norm": 2.010495901107788, "learning_rate": 1.7041171182376354e-05, "loss": 0.2414, "step": 1734 }, { "epoch": 0.821544835750222, "grad_norm": 1.1517298221588135, "learning_rate": 1.7037538820723777e-05, "loss": 0.2366, "step": 1735 }, { "epoch": 0.8220183486238533, "grad_norm": 1.393791913986206, "learning_rate": 1.7033904618470814e-05, "loss": 0.2457, "step": 1736 }, { "epoch": 0.8224918614974844, "grad_norm": 1.2726801633834839, "learning_rate": 1.7030268576567956e-05, "loss": 0.2476, "step": 1737 }, { "epoch": 0.8229653743711157, "grad_norm": 1.198588490486145, "learning_rate": 1.7026630695966172e-05, "loss": 0.2283, "step": 1738 }, { "epoch": 0.823438887244747, "grad_norm": 1.3464490175247192, "learning_rate": 1.7022990977616922e-05, "loss": 0.2221, "step": 1739 }, { "epoch": 0.8239124001183782, "grad_norm": 1.504508137702942, "learning_rate": 1.7019349422472128e-05, "loss": 0.2292, "step": 1740 }, { "epoch": 0.8243859129920095, "grad_norm": 1.8641504049301147, "learning_rate": 1.7015706031484215e-05, "loss": 0.2391, "step": 1741 }, { "epoch": 0.8248594258656408, "grad_norm": 1.2077713012695312, "learning_rate": 1.7012060805606067e-05, "loss": 0.2267, "step": 1742 }, { "epoch": 0.8253329387392719, "grad_norm": 1.1325595378875732, "learning_rate": 1.7008413745791063e-05, "loss": 0.2077, "step": 1743 }, { "epoch": 0.8258064516129032, "grad_norm": 1.5017510652542114, "learning_rate": 1.7004764852993056e-05, "loss": 0.2344, "step": 1744 }, { "epoch": 0.8262799644865345, "grad_norm": 1.8392795324325562, "learning_rate": 1.7001114128166375e-05, "loss": 0.2312, "step": 1745 }, { "epoch": 0.8267534773601657, "grad_norm": 1.4507255554199219, "learning_rate": 1.6997461572265838e-05, "loss": 0.2629, "step": 1746 }, { "epoch": 0.827226990233797, "grad_norm": 1.8967761993408203, "learning_rate": 1.6993807186246735e-05, "loss": 0.2135, "step": 1747 }, { "epoch": 0.8277005031074283, "grad_norm": 1.259427785873413, "learning_rate": 1.6990150971064827e-05, "loss": 0.2199, "step": 1748 }, { "epoch": 0.8281740159810594, "grad_norm": 1.5154216289520264, "learning_rate": 1.698649292767637e-05, "loss": 0.2385, "step": 1749 }, { "epoch": 0.8286475288546907, "grad_norm": 1.33448326587677, "learning_rate": 1.6982833057038087e-05, "loss": 0.229, "step": 1750 }, { "epoch": 0.829121041728322, "grad_norm": 1.2957985401153564, "learning_rate": 1.6979171360107183e-05, "loss": 0.2483, "step": 1751 }, { "epoch": 0.8295945546019532, "grad_norm": 1.2026399374008179, "learning_rate": 1.6975507837841338e-05, "loss": 0.2452, "step": 1752 }, { "epoch": 0.8300680674755845, "grad_norm": 1.6381677389144897, "learning_rate": 1.6971842491198716e-05, "loss": 0.2299, "step": 1753 }, { "epoch": 0.8305415803492158, "grad_norm": 1.994767189025879, "learning_rate": 1.6968175321137942e-05, "loss": 0.2343, "step": 1754 }, { "epoch": 0.831015093222847, "grad_norm": 1.1473991870880127, "learning_rate": 1.696450632861814e-05, "loss": 0.2431, "step": 1755 }, { "epoch": 0.8314886060964782, "grad_norm": 1.2881150245666504, "learning_rate": 1.6960835514598897e-05, "loss": 0.245, "step": 1756 }, { "epoch": 0.8319621189701095, "grad_norm": 1.086107850074768, "learning_rate": 1.6957162880040273e-05, "loss": 0.221, "step": 1757 }, { "epoch": 0.8324356318437407, "grad_norm": 1.6053413152694702, "learning_rate": 1.695348842590282e-05, "loss": 0.2587, "step": 1758 }, { "epoch": 0.832909144717372, "grad_norm": 1.3724325895309448, "learning_rate": 1.694981215314755e-05, "loss": 0.2508, "step": 1759 }, { "epoch": 0.8333826575910033, "grad_norm": 1.0389055013656616, "learning_rate": 1.6946134062735953e-05, "loss": 0.214, "step": 1760 }, { "epoch": 0.8338561704646346, "grad_norm": 1.638784646987915, "learning_rate": 1.6942454155630005e-05, "loss": 0.2257, "step": 1761 }, { "epoch": 0.8343296833382657, "grad_norm": 1.2976818084716797, "learning_rate": 1.6938772432792146e-05, "loss": 0.2631, "step": 1762 }, { "epoch": 0.834803196211897, "grad_norm": 1.4617828130722046, "learning_rate": 1.69350888951853e-05, "loss": 0.2217, "step": 1763 }, { "epoch": 0.8352767090855283, "grad_norm": 1.6357537508010864, "learning_rate": 1.693140354377286e-05, "loss": 0.276, "step": 1764 }, { "epoch": 0.8357502219591595, "grad_norm": 2.0060503482818604, "learning_rate": 1.6927716379518683e-05, "loss": 0.2226, "step": 1765 }, { "epoch": 0.8362237348327908, "grad_norm": 1.3638052940368652, "learning_rate": 1.6924027403387125e-05, "loss": 0.2312, "step": 1766 }, { "epoch": 0.8366972477064221, "grad_norm": 1.0776060819625854, "learning_rate": 1.6920336616343e-05, "loss": 0.2261, "step": 1767 }, { "epoch": 0.8371707605800532, "grad_norm": 1.6560837030410767, "learning_rate": 1.691664401935159e-05, "loss": 0.2287, "step": 1768 }, { "epoch": 0.8376442734536845, "grad_norm": 1.7977901697158813, "learning_rate": 1.691294961337866e-05, "loss": 0.2451, "step": 1769 }, { "epoch": 0.8381177863273158, "grad_norm": 1.1947377920150757, "learning_rate": 1.6909253399390453e-05, "loss": 0.2327, "step": 1770 }, { "epoch": 0.838591299200947, "grad_norm": 1.2023718357086182, "learning_rate": 1.6905555378353676e-05, "loss": 0.2413, "step": 1771 }, { "epoch": 0.8390648120745783, "grad_norm": 1.2229983806610107, "learning_rate": 1.6901855551235505e-05, "loss": 0.2319, "step": 1772 }, { "epoch": 0.8395383249482096, "grad_norm": 2.0484726428985596, "learning_rate": 1.68981539190036e-05, "loss": 0.2407, "step": 1773 }, { "epoch": 0.8400118378218407, "grad_norm": 1.0766804218292236, "learning_rate": 1.6894450482626087e-05, "loss": 0.2518, "step": 1774 }, { "epoch": 0.840485350695472, "grad_norm": 1.1905542612075806, "learning_rate": 1.6890745243071558e-05, "loss": 0.2279, "step": 1775 }, { "epoch": 0.8409588635691033, "grad_norm": 1.0002597570419312, "learning_rate": 1.688703820130909e-05, "loss": 0.235, "step": 1776 }, { "epoch": 0.8414323764427345, "grad_norm": 1.6852128505706787, "learning_rate": 1.688332935830822e-05, "loss": 0.2301, "step": 1777 }, { "epoch": 0.8419058893163658, "grad_norm": 0.9994098544120789, "learning_rate": 1.687961871503896e-05, "loss": 0.2174, "step": 1778 }, { "epoch": 0.8423794021899971, "grad_norm": 1.0002872943878174, "learning_rate": 1.6875906272471795e-05, "loss": 0.2541, "step": 1779 }, { "epoch": 0.8428529150636282, "grad_norm": 1.7979168891906738, "learning_rate": 1.6872192031577675e-05, "loss": 0.2421, "step": 1780 }, { "epoch": 0.8433264279372595, "grad_norm": 1.4673607349395752, "learning_rate": 1.6868475993328027e-05, "loss": 0.2231, "step": 1781 }, { "epoch": 0.8437999408108908, "grad_norm": 1.4320827722549438, "learning_rate": 1.6864758158694747e-05, "loss": 0.2273, "step": 1782 }, { "epoch": 0.844273453684522, "grad_norm": 1.837491750717163, "learning_rate": 1.6861038528650197e-05, "loss": 0.2815, "step": 1783 }, { "epoch": 0.8447469665581533, "grad_norm": 0.963911771774292, "learning_rate": 1.685731710416721e-05, "loss": 0.209, "step": 1784 }, { "epoch": 0.8452204794317846, "grad_norm": 1.31343412399292, "learning_rate": 1.685359388621908e-05, "loss": 0.2288, "step": 1785 }, { "epoch": 0.8456939923054158, "grad_norm": 1.2854394912719727, "learning_rate": 1.6849868875779594e-05, "loss": 0.2427, "step": 1786 }, { "epoch": 0.846167505179047, "grad_norm": 1.5275187492370605, "learning_rate": 1.6846142073822983e-05, "loss": 0.259, "step": 1787 }, { "epoch": 0.8466410180526783, "grad_norm": 1.2911092042922974, "learning_rate": 1.684241348132396e-05, "loss": 0.258, "step": 1788 }, { "epoch": 0.8471145309263095, "grad_norm": 1.2889440059661865, "learning_rate": 1.68386830992577e-05, "loss": 0.235, "step": 1789 }, { "epoch": 0.8475880437999408, "grad_norm": 1.3690677881240845, "learning_rate": 1.683495092859985e-05, "loss": 0.2401, "step": 1790 }, { "epoch": 0.8480615566735721, "grad_norm": 1.3110579252243042, "learning_rate": 1.683121697032652e-05, "loss": 0.2563, "step": 1791 }, { "epoch": 0.8485350695472034, "grad_norm": 1.2487221956253052, "learning_rate": 1.6827481225414298e-05, "loss": 0.2234, "step": 1792 }, { "epoch": 0.8490085824208345, "grad_norm": 1.9341635704040527, "learning_rate": 1.6823743694840226e-05, "loss": 0.2653, "step": 1793 }, { "epoch": 0.8494820952944658, "grad_norm": 1.6593029499053955, "learning_rate": 1.6820004379581816e-05, "loss": 0.2799, "step": 1794 }, { "epoch": 0.8499556081680971, "grad_norm": 1.5015946626663208, "learning_rate": 1.6816263280617054e-05, "loss": 0.2821, "step": 1795 }, { "epoch": 0.8504291210417283, "grad_norm": 1.0924832820892334, "learning_rate": 1.6812520398924393e-05, "loss": 0.2256, "step": 1796 }, { "epoch": 0.8509026339153596, "grad_norm": 3.2573161125183105, "learning_rate": 1.6808775735482746e-05, "loss": 0.2438, "step": 1797 }, { "epoch": 0.8513761467889909, "grad_norm": 1.6617119312286377, "learning_rate": 1.6805029291271485e-05, "loss": 0.2604, "step": 1798 }, { "epoch": 0.851849659662622, "grad_norm": 1.2218396663665771, "learning_rate": 1.680128106727046e-05, "loss": 0.2546, "step": 1799 }, { "epoch": 0.8523231725362533, "grad_norm": 1.5456347465515137, "learning_rate": 1.6797531064459995e-05, "loss": 0.232, "step": 1800 }, { "epoch": 0.8527966854098846, "grad_norm": 1.6752814054489136, "learning_rate": 1.679377928382085e-05, "loss": 0.2486, "step": 1801 }, { "epoch": 0.8532701982835158, "grad_norm": 1.3025007247924805, "learning_rate": 1.6790025726334274e-05, "loss": 0.243, "step": 1802 }, { "epoch": 0.8537437111571471, "grad_norm": 1.4086090326309204, "learning_rate": 1.6786270392981976e-05, "loss": 0.2331, "step": 1803 }, { "epoch": 0.8542172240307784, "grad_norm": 1.0914981365203857, "learning_rate": 1.6782513284746124e-05, "loss": 0.2667, "step": 1804 }, { "epoch": 0.8546907369044096, "grad_norm": 1.737202525138855, "learning_rate": 1.6778754402609356e-05, "loss": 0.2584, "step": 1805 }, { "epoch": 0.8551642497780408, "grad_norm": 1.0595831871032715, "learning_rate": 1.6774993747554767e-05, "loss": 0.2386, "step": 1806 }, { "epoch": 0.8556377626516721, "grad_norm": 1.2180547714233398, "learning_rate": 1.6771231320565925e-05, "loss": 0.2429, "step": 1807 }, { "epoch": 0.8561112755253033, "grad_norm": 1.1929233074188232, "learning_rate": 1.6767467122626852e-05, "loss": 0.227, "step": 1808 }, { "epoch": 0.8565847883989346, "grad_norm": 1.554271936416626, "learning_rate": 1.6763701154722043e-05, "loss": 0.2447, "step": 1809 }, { "epoch": 0.8570583012725659, "grad_norm": 1.0554946660995483, "learning_rate": 1.6759933417836446e-05, "loss": 0.2397, "step": 1810 }, { "epoch": 0.8575318141461971, "grad_norm": 1.350473165512085, "learning_rate": 1.6756163912955478e-05, "loss": 0.2526, "step": 1811 }, { "epoch": 0.8580053270198283, "grad_norm": 1.6592644453048706, "learning_rate": 1.6752392641065015e-05, "loss": 0.2294, "step": 1812 }, { "epoch": 0.8584788398934596, "grad_norm": 0.9810163974761963, "learning_rate": 1.67486196031514e-05, "loss": 0.2413, "step": 1813 }, { "epoch": 0.8589523527670908, "grad_norm": 1.351130485534668, "learning_rate": 1.674484480020143e-05, "loss": 0.2768, "step": 1814 }, { "epoch": 0.8594258656407221, "grad_norm": 1.1572606563568115, "learning_rate": 1.6741068233202374e-05, "loss": 0.233, "step": 1815 }, { "epoch": 0.8598993785143534, "grad_norm": 1.5954844951629639, "learning_rate": 1.6737289903141954e-05, "loss": 0.2755, "step": 1816 }, { "epoch": 0.8603728913879846, "grad_norm": 1.2186288833618164, "learning_rate": 1.6733509811008354e-05, "loss": 0.2461, "step": 1817 }, { "epoch": 0.8608464042616158, "grad_norm": 1.1969510316848755, "learning_rate": 1.6729727957790224e-05, "loss": 0.2269, "step": 1818 }, { "epoch": 0.8613199171352471, "grad_norm": 1.5450844764709473, "learning_rate": 1.672594434447667e-05, "loss": 0.2235, "step": 1819 }, { "epoch": 0.8617934300088783, "grad_norm": 1.2357224225997925, "learning_rate": 1.672215897205726e-05, "loss": 0.2287, "step": 1820 }, { "epoch": 0.8622669428825096, "grad_norm": 1.3368169069290161, "learning_rate": 1.6718371841522015e-05, "loss": 0.2543, "step": 1821 }, { "epoch": 0.8627404557561409, "grad_norm": 1.64212167263031, "learning_rate": 1.6714582953861432e-05, "loss": 0.2242, "step": 1822 }, { "epoch": 0.8632139686297722, "grad_norm": 1.6220941543579102, "learning_rate": 1.6710792310066454e-05, "loss": 0.2628, "step": 1823 }, { "epoch": 0.8636874815034034, "grad_norm": 1.4587221145629883, "learning_rate": 1.6706999911128488e-05, "loss": 0.2459, "step": 1824 }, { "epoch": 0.8641609943770346, "grad_norm": 1.312369704246521, "learning_rate": 1.6703205758039397e-05, "loss": 0.249, "step": 1825 }, { "epoch": 0.8646345072506659, "grad_norm": 1.632554054260254, "learning_rate": 1.6699409851791507e-05, "loss": 0.2515, "step": 1826 }, { "epoch": 0.8651080201242971, "grad_norm": 1.6713101863861084, "learning_rate": 1.6695612193377604e-05, "loss": 0.2434, "step": 1827 }, { "epoch": 0.8655815329979284, "grad_norm": 1.163874626159668, "learning_rate": 1.6691812783790924e-05, "loss": 0.2101, "step": 1828 }, { "epoch": 0.8660550458715597, "grad_norm": 1.604141116142273, "learning_rate": 1.6688011624025164e-05, "loss": 0.2043, "step": 1829 }, { "epoch": 0.8665285587451909, "grad_norm": 1.5439268350601196, "learning_rate": 1.6684208715074488e-05, "loss": 0.2381, "step": 1830 }, { "epoch": 0.8670020716188221, "grad_norm": 2.153048038482666, "learning_rate": 1.6680404057933504e-05, "loss": 0.2072, "step": 1831 }, { "epoch": 0.8674755844924534, "grad_norm": 1.1268725395202637, "learning_rate": 1.6676597653597287e-05, "loss": 0.2483, "step": 1832 }, { "epoch": 0.8679490973660846, "grad_norm": 1.470022439956665, "learning_rate": 1.6672789503061362e-05, "loss": 0.24, "step": 1833 }, { "epoch": 0.8684226102397159, "grad_norm": 1.7773938179016113, "learning_rate": 1.6668979607321717e-05, "loss": 0.2214, "step": 1834 }, { "epoch": 0.8688961231133472, "grad_norm": 1.1891618967056274, "learning_rate": 1.6665167967374795e-05, "loss": 0.2411, "step": 1835 }, { "epoch": 0.8693696359869784, "grad_norm": 2.1788344383239746, "learning_rate": 1.666135458421749e-05, "loss": 0.2029, "step": 1836 }, { "epoch": 0.8698431488606096, "grad_norm": 1.9523930549621582, "learning_rate": 1.6657539458847157e-05, "loss": 0.2247, "step": 1837 }, { "epoch": 0.8703166617342409, "grad_norm": 1.6084970235824585, "learning_rate": 1.6653722592261605e-05, "loss": 0.2564, "step": 1838 }, { "epoch": 0.8707901746078721, "grad_norm": 1.0069568157196045, "learning_rate": 1.6649903985459093e-05, "loss": 0.2296, "step": 1839 }, { "epoch": 0.8712636874815034, "grad_norm": 1.9601439237594604, "learning_rate": 1.664608363943835e-05, "loss": 0.2398, "step": 1840 }, { "epoch": 0.8717372003551347, "grad_norm": 1.4246207475662231, "learning_rate": 1.664226155519855e-05, "loss": 0.2336, "step": 1841 }, { "epoch": 0.8722107132287659, "grad_norm": 1.4902780055999756, "learning_rate": 1.6638437733739317e-05, "loss": 0.2309, "step": 1842 }, { "epoch": 0.8726842261023972, "grad_norm": 1.1577699184417725, "learning_rate": 1.6634612176060736e-05, "loss": 0.2355, "step": 1843 }, { "epoch": 0.8731577389760284, "grad_norm": 1.28193998336792, "learning_rate": 1.6630784883163347e-05, "loss": 0.2353, "step": 1844 }, { "epoch": 0.8736312518496596, "grad_norm": 1.1589275598526, "learning_rate": 1.662695585604814e-05, "loss": 0.2501, "step": 1845 }, { "epoch": 0.8741047647232909, "grad_norm": 1.8391677141189575, "learning_rate": 1.662312509571656e-05, "loss": 0.2188, "step": 1846 }, { "epoch": 0.8745782775969222, "grad_norm": 1.7798473834991455, "learning_rate": 1.6619292603170505e-05, "loss": 0.2465, "step": 1847 }, { "epoch": 0.8750517904705534, "grad_norm": 0.9828191995620728, "learning_rate": 1.6615458379412327e-05, "loss": 0.247, "step": 1848 }, { "epoch": 0.8755253033441847, "grad_norm": 1.7325772047042847, "learning_rate": 1.6611622425444834e-05, "loss": 0.2165, "step": 1849 }, { "epoch": 0.875998816217816, "grad_norm": 1.395041584968567, "learning_rate": 1.6607784742271275e-05, "loss": 0.2143, "step": 1850 }, { "epoch": 0.8764723290914471, "grad_norm": 1.2467516660690308, "learning_rate": 1.6603945330895364e-05, "loss": 0.2121, "step": 1851 }, { "epoch": 0.8769458419650784, "grad_norm": 1.279456615447998, "learning_rate": 1.660010419232126e-05, "loss": 0.2339, "step": 1852 }, { "epoch": 0.8774193548387097, "grad_norm": 1.203802227973938, "learning_rate": 1.659626132755358e-05, "loss": 0.2216, "step": 1853 }, { "epoch": 0.8778928677123409, "grad_norm": 1.2188462018966675, "learning_rate": 1.6592416737597382e-05, "loss": 0.2611, "step": 1854 }, { "epoch": 0.8783663805859722, "grad_norm": 1.2634485960006714, "learning_rate": 1.6588570423458185e-05, "loss": 0.2343, "step": 1855 }, { "epoch": 0.8788398934596034, "grad_norm": 1.070681095123291, "learning_rate": 1.6584722386141955e-05, "loss": 0.2278, "step": 1856 }, { "epoch": 0.8793134063332347, "grad_norm": 1.0444326400756836, "learning_rate": 1.6580872626655113e-05, "loss": 0.2236, "step": 1857 }, { "epoch": 0.8797869192068659, "grad_norm": 1.1631354093551636, "learning_rate": 1.6577021146004514e-05, "loss": 0.2565, "step": 1858 }, { "epoch": 0.8802604320804972, "grad_norm": 2.2485361099243164, "learning_rate": 1.6573167945197492e-05, "loss": 0.243, "step": 1859 }, { "epoch": 0.8807339449541285, "grad_norm": 1.143649697303772, "learning_rate": 1.65693130252418e-05, "loss": 0.2247, "step": 1860 }, { "epoch": 0.8812074578277597, "grad_norm": 1.7670962810516357, "learning_rate": 1.6565456387145667e-05, "loss": 0.2466, "step": 1861 }, { "epoch": 0.881680970701391, "grad_norm": 1.5178930759429932, "learning_rate": 1.6561598031917752e-05, "loss": 0.2272, "step": 1862 }, { "epoch": 0.8821544835750222, "grad_norm": 2.0225138664245605, "learning_rate": 1.655773796056717e-05, "loss": 0.2544, "step": 1863 }, { "epoch": 0.8826279964486534, "grad_norm": 1.248473048210144, "learning_rate": 1.655387617410349e-05, "loss": 0.2331, "step": 1864 }, { "epoch": 0.8831015093222847, "grad_norm": 2.6657464504241943, "learning_rate": 1.6550012673536725e-05, "loss": 0.2501, "step": 1865 }, { "epoch": 0.883575022195916, "grad_norm": 1.0329293012619019, "learning_rate": 1.654614745987733e-05, "loss": 0.2139, "step": 1866 }, { "epoch": 0.8840485350695472, "grad_norm": 1.3353016376495361, "learning_rate": 1.6542280534136223e-05, "loss": 0.2816, "step": 1867 }, { "epoch": 0.8845220479431785, "grad_norm": 0.9897204041481018, "learning_rate": 1.6538411897324757e-05, "loss": 0.2147, "step": 1868 }, { "epoch": 0.8849955608168097, "grad_norm": 1.145198941230774, "learning_rate": 1.653454155045473e-05, "loss": 0.228, "step": 1869 }, { "epoch": 0.8854690736904409, "grad_norm": 1.2381608486175537, "learning_rate": 1.6530669494538403e-05, "loss": 0.267, "step": 1870 }, { "epoch": 0.8859425865640722, "grad_norm": 1.4348737001419067, "learning_rate": 1.6526795730588477e-05, "loss": 0.2294, "step": 1871 }, { "epoch": 0.8864160994377035, "grad_norm": 1.178130030632019, "learning_rate": 1.652292025961809e-05, "loss": 0.2079, "step": 1872 }, { "epoch": 0.8868896123113347, "grad_norm": 1.6016205549240112, "learning_rate": 1.6519043082640834e-05, "loss": 0.22, "step": 1873 }, { "epoch": 0.887363125184966, "grad_norm": 1.1184009313583374, "learning_rate": 1.6515164200670754e-05, "loss": 0.2505, "step": 1874 }, { "epoch": 0.8878366380585972, "grad_norm": 1.3344361782073975, "learning_rate": 1.651128361472233e-05, "loss": 0.2063, "step": 1875 }, { "epoch": 0.8883101509322284, "grad_norm": 1.438899278640747, "learning_rate": 1.6507401325810488e-05, "loss": 0.2183, "step": 1876 }, { "epoch": 0.8887836638058597, "grad_norm": 1.2038847208023071, "learning_rate": 1.650351733495061e-05, "loss": 0.2361, "step": 1877 }, { "epoch": 0.889257176679491, "grad_norm": 1.5315783023834229, "learning_rate": 1.6499631643158512e-05, "loss": 0.2188, "step": 1878 }, { "epoch": 0.8897306895531222, "grad_norm": 1.375877857208252, "learning_rate": 1.6495744251450464e-05, "loss": 0.238, "step": 1879 }, { "epoch": 0.8902042024267535, "grad_norm": 2.0350253582000732, "learning_rate": 1.6491855160843172e-05, "loss": 0.2217, "step": 1880 }, { "epoch": 0.8906777153003848, "grad_norm": 1.5811187028884888, "learning_rate": 1.648796437235379e-05, "loss": 0.233, "step": 1881 }, { "epoch": 0.8911512281740159, "grad_norm": 1.5892693996429443, "learning_rate": 1.6484071886999917e-05, "loss": 0.2399, "step": 1882 }, { "epoch": 0.8916247410476472, "grad_norm": 1.705972671508789, "learning_rate": 1.6480177705799594e-05, "loss": 0.2467, "step": 1883 }, { "epoch": 0.8920982539212785, "grad_norm": 1.4446234703063965, "learning_rate": 1.647628182977131e-05, "loss": 0.2375, "step": 1884 }, { "epoch": 0.8925717667949097, "grad_norm": 1.0984984636306763, "learning_rate": 1.6472384259933986e-05, "loss": 0.2171, "step": 1885 }, { "epoch": 0.893045279668541, "grad_norm": 1.1526954174041748, "learning_rate": 1.6468484997307003e-05, "loss": 0.2437, "step": 1886 }, { "epoch": 0.8935187925421723, "grad_norm": 1.1996570825576782, "learning_rate": 1.646458404291017e-05, "loss": 0.2596, "step": 1887 }, { "epoch": 0.8939923054158035, "grad_norm": 2.095825433731079, "learning_rate": 1.6460681397763746e-05, "loss": 0.23, "step": 1888 }, { "epoch": 0.8944658182894347, "grad_norm": 1.0920315980911255, "learning_rate": 1.645677706288843e-05, "loss": 0.2274, "step": 1889 }, { "epoch": 0.894939331163066, "grad_norm": 1.171776294708252, "learning_rate": 1.6452871039305365e-05, "loss": 0.2355, "step": 1890 }, { "epoch": 0.8954128440366973, "grad_norm": 1.1109461784362793, "learning_rate": 1.6448963328036125e-05, "loss": 0.2205, "step": 1891 }, { "epoch": 0.8958863569103285, "grad_norm": 1.180391550064087, "learning_rate": 1.6445053930102747e-05, "loss": 0.2343, "step": 1892 }, { "epoch": 0.8963598697839598, "grad_norm": 1.4910218715667725, "learning_rate": 1.6441142846527688e-05, "loss": 0.2145, "step": 1893 }, { "epoch": 0.896833382657591, "grad_norm": 1.087941288948059, "learning_rate": 1.6437230078333855e-05, "loss": 0.2206, "step": 1894 }, { "epoch": 0.8973068955312222, "grad_norm": 1.3966666460037231, "learning_rate": 1.6433315626544598e-05, "loss": 0.2289, "step": 1895 }, { "epoch": 0.8977804084048535, "grad_norm": 1.0643726587295532, "learning_rate": 1.6429399492183703e-05, "loss": 0.2301, "step": 1896 }, { "epoch": 0.8982539212784848, "grad_norm": 1.5619611740112305, "learning_rate": 1.6425481676275396e-05, "loss": 0.2353, "step": 1897 }, { "epoch": 0.898727434152116, "grad_norm": 1.496078610420227, "learning_rate": 1.642156217984434e-05, "loss": 0.204, "step": 1898 }, { "epoch": 0.8992009470257473, "grad_norm": 1.6565096378326416, "learning_rate": 1.6417641003915653e-05, "loss": 0.2353, "step": 1899 }, { "epoch": 0.8996744598993786, "grad_norm": 1.215949535369873, "learning_rate": 1.641371814951487e-05, "loss": 0.2246, "step": 1900 }, { "epoch": 0.9001479727730097, "grad_norm": 1.6516441106796265, "learning_rate": 1.6409793617667976e-05, "loss": 0.2404, "step": 1901 }, { "epoch": 0.900621485646641, "grad_norm": 1.6095350980758667, "learning_rate": 1.6405867409401403e-05, "loss": 0.2316, "step": 1902 }, { "epoch": 0.9010949985202723, "grad_norm": 1.5806862115859985, "learning_rate": 1.6401939525742007e-05, "loss": 0.2754, "step": 1903 }, { "epoch": 0.9015685113939035, "grad_norm": 1.4216324090957642, "learning_rate": 1.6398009967717086e-05, "loss": 0.2322, "step": 1904 }, { "epoch": 0.9020420242675348, "grad_norm": 1.7498546838760376, "learning_rate": 1.639407873635438e-05, "loss": 0.2352, "step": 1905 }, { "epoch": 0.9025155371411661, "grad_norm": 0.969447135925293, "learning_rate": 1.639014583268207e-05, "loss": 0.2338, "step": 1906 }, { "epoch": 0.9029890500147972, "grad_norm": 2.0034878253936768, "learning_rate": 1.638621125772876e-05, "loss": 0.2381, "step": 1907 }, { "epoch": 0.9034625628884285, "grad_norm": 1.6915373802185059, "learning_rate": 1.6382275012523503e-05, "loss": 0.25, "step": 1908 }, { "epoch": 0.9039360757620598, "grad_norm": 1.4558416604995728, "learning_rate": 1.637833709809579e-05, "loss": 0.2475, "step": 1909 }, { "epoch": 0.904409588635691, "grad_norm": 1.660110354423523, "learning_rate": 1.6374397515475543e-05, "loss": 0.2226, "step": 1910 }, { "epoch": 0.9048831015093223, "grad_norm": 1.2827781438827515, "learning_rate": 1.637045626569312e-05, "loss": 0.2301, "step": 1911 }, { "epoch": 0.9053566143829536, "grad_norm": 2.6431972980499268, "learning_rate": 1.6366513349779313e-05, "loss": 0.2198, "step": 1912 }, { "epoch": 0.9058301272565847, "grad_norm": 2.985435962677002, "learning_rate": 1.6362568768765362e-05, "loss": 0.2571, "step": 1913 }, { "epoch": 0.906303640130216, "grad_norm": 2.624217987060547, "learning_rate": 1.635862252368293e-05, "loss": 0.2407, "step": 1914 }, { "epoch": 0.9067771530038473, "grad_norm": 2.1122894287109375, "learning_rate": 1.635467461556412e-05, "loss": 0.229, "step": 1915 }, { "epoch": 0.9072506658774785, "grad_norm": 1.5032678842544556, "learning_rate": 1.6350725045441472e-05, "loss": 0.2417, "step": 1916 }, { "epoch": 0.9077241787511098, "grad_norm": 1.1959266662597656, "learning_rate": 1.6346773814347952e-05, "loss": 0.2241, "step": 1917 }, { "epoch": 0.9081976916247411, "grad_norm": 2.0419347286224365, "learning_rate": 1.634282092331697e-05, "loss": 0.2398, "step": 1918 }, { "epoch": 0.9086712044983724, "grad_norm": 2.168184280395508, "learning_rate": 1.6338866373382366e-05, "loss": 0.2275, "step": 1919 }, { "epoch": 0.9091447173720035, "grad_norm": 2.695232629776001, "learning_rate": 1.6334910165578413e-05, "loss": 0.2307, "step": 1920 }, { "epoch": 0.9096182302456348, "grad_norm": 2.0876898765563965, "learning_rate": 1.6330952300939817e-05, "loss": 0.2517, "step": 1921 }, { "epoch": 0.9100917431192661, "grad_norm": 1.803197979927063, "learning_rate": 1.6326992780501727e-05, "loss": 0.2283, "step": 1922 }, { "epoch": 0.9105652559928973, "grad_norm": 1.1686948537826538, "learning_rate": 1.632303160529971e-05, "loss": 0.2201, "step": 1923 }, { "epoch": 0.9110387688665286, "grad_norm": 1.152266025543213, "learning_rate": 1.6319068776369783e-05, "loss": 0.2007, "step": 1924 }, { "epoch": 0.9115122817401599, "grad_norm": 2.0857367515563965, "learning_rate": 1.631510429474837e-05, "loss": 0.2101, "step": 1925 }, { "epoch": 0.911985794613791, "grad_norm": 1.8467085361480713, "learning_rate": 1.6311138161472355e-05, "loss": 0.2554, "step": 1926 }, { "epoch": 0.9124593074874223, "grad_norm": 2.0132124423980713, "learning_rate": 1.6307170377579038e-05, "loss": 0.2226, "step": 1927 }, { "epoch": 0.9129328203610536, "grad_norm": 2.2759344577789307, "learning_rate": 1.6303200944106155e-05, "loss": 0.2046, "step": 1928 }, { "epoch": 0.9134063332346848, "grad_norm": 1.024839162826538, "learning_rate": 1.6299229862091876e-05, "loss": 0.2192, "step": 1929 }, { "epoch": 0.9138798461083161, "grad_norm": 1.2846004962921143, "learning_rate": 1.629525713257479e-05, "loss": 0.2037, "step": 1930 }, { "epoch": 0.9143533589819474, "grad_norm": 1.3948590755462646, "learning_rate": 1.6291282756593937e-05, "loss": 0.2416, "step": 1931 }, { "epoch": 0.9148268718555785, "grad_norm": 2.4310286045074463, "learning_rate": 1.628730673518877e-05, "loss": 0.2268, "step": 1932 }, { "epoch": 0.9153003847292098, "grad_norm": 1.4620105028152466, "learning_rate": 1.6283329069399188e-05, "loss": 0.2336, "step": 1933 }, { "epoch": 0.9157738976028411, "grad_norm": 1.0024254322052002, "learning_rate": 1.6279349760265497e-05, "loss": 0.2386, "step": 1934 }, { "epoch": 0.9162474104764723, "grad_norm": 1.0868014097213745, "learning_rate": 1.6275368808828457e-05, "loss": 0.2486, "step": 1935 }, { "epoch": 0.9167209233501036, "grad_norm": 2.2207415103912354, "learning_rate": 1.6271386216129245e-05, "loss": 0.2333, "step": 1936 }, { "epoch": 0.9171944362237349, "grad_norm": 1.3191596269607544, "learning_rate": 1.6267401983209464e-05, "loss": 0.2305, "step": 1937 }, { "epoch": 0.917667949097366, "grad_norm": 1.842523217201233, "learning_rate": 1.626341611111116e-05, "loss": 0.2442, "step": 1938 }, { "epoch": 0.9181414619709973, "grad_norm": 1.3315726518630981, "learning_rate": 1.62594286008768e-05, "loss": 0.24, "step": 1939 }, { "epoch": 0.9186149748446286, "grad_norm": 2.0374701023101807, "learning_rate": 1.6255439453549274e-05, "loss": 0.2319, "step": 1940 }, { "epoch": 0.9190884877182598, "grad_norm": 1.2693907022476196, "learning_rate": 1.62514486701719e-05, "loss": 0.2409, "step": 1941 }, { "epoch": 0.9195620005918911, "grad_norm": 1.0667670965194702, "learning_rate": 1.6247456251788444e-05, "loss": 0.2414, "step": 1942 }, { "epoch": 0.9200355134655224, "grad_norm": 1.2325665950775146, "learning_rate": 1.624346219944307e-05, "loss": 0.2227, "step": 1943 }, { "epoch": 0.9205090263391535, "grad_norm": 2.379380941390991, "learning_rate": 1.6239466514180393e-05, "loss": 0.2169, "step": 1944 }, { "epoch": 0.9209825392127848, "grad_norm": 1.3998842239379883, "learning_rate": 1.623546919704544e-05, "loss": 0.2434, "step": 1945 }, { "epoch": 0.9214560520864161, "grad_norm": 1.3410049676895142, "learning_rate": 1.6231470249083675e-05, "loss": 0.2471, "step": 1946 }, { "epoch": 0.9219295649600473, "grad_norm": 1.5513018369674683, "learning_rate": 1.622746967134098e-05, "loss": 0.2519, "step": 1947 }, { "epoch": 0.9224030778336786, "grad_norm": 1.0884742736816406, "learning_rate": 1.622346746486367e-05, "loss": 0.2354, "step": 1948 }, { "epoch": 0.9228765907073099, "grad_norm": 1.212095856666565, "learning_rate": 1.6219463630698484e-05, "loss": 0.2266, "step": 1949 }, { "epoch": 0.9233501035809412, "grad_norm": 1.2227774858474731, "learning_rate": 1.6215458169892582e-05, "loss": 0.2474, "step": 1950 }, { "epoch": 0.9238236164545723, "grad_norm": 1.3644037246704102, "learning_rate": 1.6211451083493564e-05, "loss": 0.2658, "step": 1951 }, { "epoch": 0.9242971293282036, "grad_norm": 1.7697309255599976, "learning_rate": 1.6207442372549436e-05, "loss": 0.2118, "step": 1952 }, { "epoch": 0.9247706422018349, "grad_norm": 1.1578645706176758, "learning_rate": 1.6203432038108638e-05, "loss": 0.2359, "step": 1953 }, { "epoch": 0.9252441550754661, "grad_norm": 2.0185935497283936, "learning_rate": 1.6199420081220035e-05, "loss": 0.2172, "step": 1954 }, { "epoch": 0.9257176679490974, "grad_norm": 1.3043760061264038, "learning_rate": 1.619540650293292e-05, "loss": 0.2327, "step": 1955 }, { "epoch": 0.9261911808227287, "grad_norm": 0.9914813041687012, "learning_rate": 1.6191391304297e-05, "loss": 0.2352, "step": 1956 }, { "epoch": 0.9266646936963598, "grad_norm": 2.466811180114746, "learning_rate": 1.6187374486362414e-05, "loss": 0.2604, "step": 1957 }, { "epoch": 0.9271382065699911, "grad_norm": 1.7497360706329346, "learning_rate": 1.6183356050179724e-05, "loss": 0.2298, "step": 1958 }, { "epoch": 0.9276117194436224, "grad_norm": 1.3502507209777832, "learning_rate": 1.617933599679991e-05, "loss": 0.2325, "step": 1959 }, { "epoch": 0.9280852323172536, "grad_norm": 1.0206633806228638, "learning_rate": 1.6175314327274377e-05, "loss": 0.2066, "step": 1960 }, { "epoch": 0.9285587451908849, "grad_norm": 1.507358193397522, "learning_rate": 1.6171291042654957e-05, "loss": 0.2335, "step": 1961 }, { "epoch": 0.9290322580645162, "grad_norm": 1.3536046743392944, "learning_rate": 1.6167266143993904e-05, "loss": 0.2261, "step": 1962 }, { "epoch": 0.9295057709381473, "grad_norm": 0.9721176028251648, "learning_rate": 1.6163239632343883e-05, "loss": 0.2134, "step": 1963 }, { "epoch": 0.9299792838117786, "grad_norm": 1.2623369693756104, "learning_rate": 1.6159211508757996e-05, "loss": 0.2231, "step": 1964 }, { "epoch": 0.9304527966854099, "grad_norm": 1.866391658782959, "learning_rate": 1.615518177428976e-05, "loss": 0.2441, "step": 1965 }, { "epoch": 0.9309263095590411, "grad_norm": 1.2802680730819702, "learning_rate": 1.6151150429993106e-05, "loss": 0.2501, "step": 1966 }, { "epoch": 0.9313998224326724, "grad_norm": 1.4352434873580933, "learning_rate": 1.61471174769224e-05, "loss": 0.2254, "step": 1967 }, { "epoch": 0.9318733353063037, "grad_norm": 1.0232847929000854, "learning_rate": 1.614308291613242e-05, "loss": 0.2456, "step": 1968 }, { "epoch": 0.9323468481799349, "grad_norm": 1.138548493385315, "learning_rate": 1.6139046748678366e-05, "loss": 0.2217, "step": 1969 }, { "epoch": 0.9328203610535661, "grad_norm": 1.6218969821929932, "learning_rate": 1.613500897561586e-05, "loss": 0.2379, "step": 1970 }, { "epoch": 0.9332938739271974, "grad_norm": 1.2417789697647095, "learning_rate": 1.6130969598000945e-05, "loss": 0.2304, "step": 1971 }, { "epoch": 0.9337673868008286, "grad_norm": 1.4800796508789062, "learning_rate": 1.6126928616890077e-05, "loss": 0.2745, "step": 1972 }, { "epoch": 0.9342408996744599, "grad_norm": 2.1304409503936768, "learning_rate": 1.6122886033340134e-05, "loss": 0.2395, "step": 1973 }, { "epoch": 0.9347144125480912, "grad_norm": 1.1594027280807495, "learning_rate": 1.6118841848408418e-05, "loss": 0.2544, "step": 1974 }, { "epoch": 0.9351879254217224, "grad_norm": 1.1337738037109375, "learning_rate": 1.6114796063152648e-05, "loss": 0.2067, "step": 1975 }, { "epoch": 0.9356614382953536, "grad_norm": 1.8558512926101685, "learning_rate": 1.611074867863096e-05, "loss": 0.2369, "step": 1976 }, { "epoch": 0.9361349511689849, "grad_norm": 1.4643030166625977, "learning_rate": 1.61066996959019e-05, "loss": 0.207, "step": 1977 }, { "epoch": 0.9366084640426161, "grad_norm": 1.9388593435287476, "learning_rate": 1.610264911602445e-05, "loss": 0.2065, "step": 1978 }, { "epoch": 0.9370819769162474, "grad_norm": 1.1505235433578491, "learning_rate": 1.6098596940058e-05, "loss": 0.2367, "step": 1979 }, { "epoch": 0.9375554897898787, "grad_norm": 1.003483533859253, "learning_rate": 1.6094543169062353e-05, "loss": 0.2095, "step": 1980 }, { "epoch": 0.9380290026635099, "grad_norm": 1.2500495910644531, "learning_rate": 1.6090487804097734e-05, "loss": 0.2377, "step": 1981 }, { "epoch": 0.9385025155371411, "grad_norm": 1.7271959781646729, "learning_rate": 1.6086430846224787e-05, "loss": 0.2288, "step": 1982 }, { "epoch": 0.9389760284107724, "grad_norm": 1.2142095565795898, "learning_rate": 1.6082372296504568e-05, "loss": 0.2158, "step": 1983 }, { "epoch": 0.9394495412844037, "grad_norm": 0.9605250954627991, "learning_rate": 1.6078312155998554e-05, "loss": 0.205, "step": 1984 }, { "epoch": 0.9399230541580349, "grad_norm": 1.0248587131500244, "learning_rate": 1.6074250425768632e-05, "loss": 0.2326, "step": 1985 }, { "epoch": 0.9403965670316662, "grad_norm": 1.8944402933120728, "learning_rate": 1.607018710687711e-05, "loss": 0.227, "step": 1986 }, { "epoch": 0.9408700799052975, "grad_norm": 0.8332754373550415, "learning_rate": 1.6066122200386713e-05, "loss": 0.2075, "step": 1987 }, { "epoch": 0.9413435927789287, "grad_norm": 1.4765225648880005, "learning_rate": 1.6062055707360575e-05, "loss": 0.2085, "step": 1988 }, { "epoch": 0.9418171056525599, "grad_norm": 1.3047866821289062, "learning_rate": 1.6057987628862246e-05, "loss": 0.2134, "step": 1989 }, { "epoch": 0.9422906185261912, "grad_norm": 1.0208147764205933, "learning_rate": 1.6053917965955698e-05, "loss": 0.2091, "step": 1990 }, { "epoch": 0.9427641313998224, "grad_norm": 1.4134602546691895, "learning_rate": 1.6049846719705307e-05, "loss": 0.2519, "step": 1991 }, { "epoch": 0.9432376442734537, "grad_norm": 1.1737221479415894, "learning_rate": 1.604577389117587e-05, "loss": 0.2391, "step": 1992 }, { "epoch": 0.943711157147085, "grad_norm": 1.1827245950698853, "learning_rate": 1.60416994814326e-05, "loss": 0.2178, "step": 1993 }, { "epoch": 0.9441846700207162, "grad_norm": 1.5996443033218384, "learning_rate": 1.6037623491541114e-05, "loss": 0.2106, "step": 1994 }, { "epoch": 0.9446581828943474, "grad_norm": 1.233630895614624, "learning_rate": 1.6033545922567447e-05, "loss": 0.2299, "step": 1995 }, { "epoch": 0.9451316957679787, "grad_norm": 0.9926683306694031, "learning_rate": 1.6029466775578054e-05, "loss": 0.2464, "step": 1996 }, { "epoch": 0.9456052086416099, "grad_norm": 1.6486265659332275, "learning_rate": 1.602538605163979e-05, "loss": 0.23, "step": 1997 }, { "epoch": 0.9460787215152412, "grad_norm": 1.1196324825286865, "learning_rate": 1.602130375181994e-05, "loss": 0.2234, "step": 1998 }, { "epoch": 0.9465522343888725, "grad_norm": 1.0825940370559692, "learning_rate": 1.6017219877186173e-05, "loss": 0.207, "step": 1999 }, { "epoch": 0.9470257472625037, "grad_norm": 1.6445330381393433, "learning_rate": 1.60131344288066e-05, "loss": 0.2202, "step": 2000 }, { "epoch": 0.947499260136135, "grad_norm": 1.1196541786193848, "learning_rate": 1.600904740774973e-05, "loss": 0.2332, "step": 2001 }, { "epoch": 0.9479727730097662, "grad_norm": 1.173668622970581, "learning_rate": 1.6004958815084476e-05, "loss": 0.25, "step": 2002 }, { "epoch": 0.9484462858833974, "grad_norm": 1.4854817390441895, "learning_rate": 1.6000868651880175e-05, "loss": 0.2444, "step": 2003 }, { "epoch": 0.9489197987570287, "grad_norm": 1.8547112941741943, "learning_rate": 1.599677691920657e-05, "loss": 0.2711, "step": 2004 }, { "epoch": 0.94939331163066, "grad_norm": 1.0332943201065063, "learning_rate": 1.5992683618133817e-05, "loss": 0.231, "step": 2005 }, { "epoch": 0.9498668245042912, "grad_norm": 1.8548601865768433, "learning_rate": 1.5988588749732472e-05, "loss": 0.2198, "step": 2006 }, { "epoch": 0.9503403373779225, "grad_norm": 1.8581714630126953, "learning_rate": 1.5984492315073512e-05, "loss": 0.2327, "step": 2007 }, { "epoch": 0.9508138502515537, "grad_norm": 1.062828540802002, "learning_rate": 1.5980394315228323e-05, "loss": 0.224, "step": 2008 }, { "epoch": 0.9512873631251849, "grad_norm": 1.4137210845947266, "learning_rate": 1.5976294751268695e-05, "loss": 0.2407, "step": 2009 }, { "epoch": 0.9517608759988162, "grad_norm": 0.9517819285392761, "learning_rate": 1.597219362426683e-05, "loss": 0.2383, "step": 2010 }, { "epoch": 0.9522343888724475, "grad_norm": 1.2080825567245483, "learning_rate": 1.5968090935295335e-05, "loss": 0.2174, "step": 2011 }, { "epoch": 0.9527079017460787, "grad_norm": 1.1361576318740845, "learning_rate": 1.596398668542723e-05, "loss": 0.2225, "step": 2012 }, { "epoch": 0.95318141461971, "grad_norm": 1.490893840789795, "learning_rate": 1.5959880875735944e-05, "loss": 0.2416, "step": 2013 }, { "epoch": 0.9536549274933412, "grad_norm": 1.1593446731567383, "learning_rate": 1.5955773507295313e-05, "loss": 0.2363, "step": 2014 }, { "epoch": 0.9541284403669725, "grad_norm": 1.0832358598709106, "learning_rate": 1.5951664581179578e-05, "loss": 0.2241, "step": 2015 }, { "epoch": 0.9546019532406037, "grad_norm": 1.426303505897522, "learning_rate": 1.5947554098463386e-05, "loss": 0.2233, "step": 2016 }, { "epoch": 0.955075466114235, "grad_norm": 1.7878800630569458, "learning_rate": 1.5943442060221795e-05, "loss": 0.2577, "step": 2017 }, { "epoch": 0.9555489789878663, "grad_norm": 1.071764349937439, "learning_rate": 1.5939328467530276e-05, "loss": 0.2123, "step": 2018 }, { "epoch": 0.9560224918614975, "grad_norm": 1.2568780183792114, "learning_rate": 1.593521332146469e-05, "loss": 0.2506, "step": 2019 }, { "epoch": 0.9564960047351287, "grad_norm": 1.9318242073059082, "learning_rate": 1.593109662310132e-05, "loss": 0.2185, "step": 2020 }, { "epoch": 0.95696951760876, "grad_norm": 1.3242648839950562, "learning_rate": 1.5926978373516842e-05, "loss": 0.2374, "step": 2021 }, { "epoch": 0.9574430304823912, "grad_norm": 1.1390552520751953, "learning_rate": 1.5922858573788356e-05, "loss": 0.2206, "step": 2022 }, { "epoch": 0.9579165433560225, "grad_norm": 1.5206451416015625, "learning_rate": 1.5918737224993345e-05, "loss": 0.2218, "step": 2023 }, { "epoch": 0.9583900562296538, "grad_norm": 1.1097302436828613, "learning_rate": 1.591461432820971e-05, "loss": 0.1886, "step": 2024 }, { "epoch": 0.958863569103285, "grad_norm": 1.2331650257110596, "learning_rate": 1.591048988451576e-05, "loss": 0.2343, "step": 2025 }, { "epoch": 0.9593370819769163, "grad_norm": 2.753371000289917, "learning_rate": 1.5906363894990197e-05, "loss": 0.2412, "step": 2026 }, { "epoch": 0.9598105948505475, "grad_norm": 0.9969640374183655, "learning_rate": 1.590223636071214e-05, "loss": 0.2389, "step": 2027 }, { "epoch": 0.9602841077241787, "grad_norm": 2.2440433502197266, "learning_rate": 1.58981072827611e-05, "loss": 0.1999, "step": 2028 }, { "epoch": 0.96075762059781, "grad_norm": 1.106385350227356, "learning_rate": 1.5893976662217e-05, "loss": 0.2168, "step": 2029 }, { "epoch": 0.9612311334714413, "grad_norm": 2.1727712154388428, "learning_rate": 1.588984450016017e-05, "loss": 0.2357, "step": 2030 }, { "epoch": 0.9617046463450725, "grad_norm": 1.639740228652954, "learning_rate": 1.5885710797671326e-05, "loss": 0.2533, "step": 2031 }, { "epoch": 0.9621781592187038, "grad_norm": 0.9460485577583313, "learning_rate": 1.5881575555831604e-05, "loss": 0.2454, "step": 2032 }, { "epoch": 0.962651672092335, "grad_norm": 1.232308268547058, "learning_rate": 1.5877438775722536e-05, "loss": 0.2304, "step": 2033 }, { "epoch": 0.9631251849659662, "grad_norm": 1.2091732025146484, "learning_rate": 1.587330045842606e-05, "loss": 0.2175, "step": 2034 }, { "epoch": 0.9635986978395975, "grad_norm": 1.2049965858459473, "learning_rate": 1.586916060502451e-05, "loss": 0.2437, "step": 2035 }, { "epoch": 0.9640722107132288, "grad_norm": 1.1409215927124023, "learning_rate": 1.586501921660062e-05, "loss": 0.2253, "step": 2036 }, { "epoch": 0.96454572358686, "grad_norm": 1.720682978630066, "learning_rate": 1.5860876294237535e-05, "loss": 0.2218, "step": 2037 }, { "epoch": 0.9650192364604913, "grad_norm": 1.6841931343078613, "learning_rate": 1.58567318390188e-05, "loss": 0.2172, "step": 2038 }, { "epoch": 0.9654927493341225, "grad_norm": 0.894889235496521, "learning_rate": 1.5852585852028348e-05, "loss": 0.232, "step": 2039 }, { "epoch": 0.9659662622077537, "grad_norm": 1.6859184503555298, "learning_rate": 1.584843833435053e-05, "loss": 0.2161, "step": 2040 }, { "epoch": 0.966439775081385, "grad_norm": 1.403887391090393, "learning_rate": 1.5844289287070088e-05, "loss": 0.2235, "step": 2041 }, { "epoch": 0.9669132879550163, "grad_norm": 1.843876600265503, "learning_rate": 1.5840138711272165e-05, "loss": 0.201, "step": 2042 }, { "epoch": 0.9673868008286475, "grad_norm": 0.9713191986083984, "learning_rate": 1.58359866080423e-05, "loss": 0.2385, "step": 2043 }, { "epoch": 0.9678603137022788, "grad_norm": 1.014763593673706, "learning_rate": 1.583183297846644e-05, "loss": 0.231, "step": 2044 }, { "epoch": 0.96833382657591, "grad_norm": 1.2346670627593994, "learning_rate": 1.5827677823630922e-05, "loss": 0.2111, "step": 2045 }, { "epoch": 0.9688073394495413, "grad_norm": 1.3078184127807617, "learning_rate": 1.5823521144622493e-05, "loss": 0.2232, "step": 2046 }, { "epoch": 0.9692808523231725, "grad_norm": 1.7039018869400024, "learning_rate": 1.5819362942528288e-05, "loss": 0.2271, "step": 2047 }, { "epoch": 0.9697543651968038, "grad_norm": 2.7722675800323486, "learning_rate": 1.5815203218435847e-05, "loss": 0.1962, "step": 2048 }, { "epoch": 0.9702278780704351, "grad_norm": 1.9730979204177856, "learning_rate": 1.5811041973433103e-05, "loss": 0.2382, "step": 2049 }, { "epoch": 0.9707013909440663, "grad_norm": 0.8822180032730103, "learning_rate": 1.580687920860839e-05, "loss": 0.2026, "step": 2050 }, { "epoch": 0.9711749038176976, "grad_norm": 1.793717861175537, "learning_rate": 1.5802714925050444e-05, "loss": 0.2262, "step": 2051 }, { "epoch": 0.9716484166913288, "grad_norm": 1.0498217344284058, "learning_rate": 1.5798549123848386e-05, "loss": 0.2067, "step": 2052 }, { "epoch": 0.97212192956496, "grad_norm": 1.1422361135482788, "learning_rate": 1.5794381806091742e-05, "loss": 0.2108, "step": 2053 }, { "epoch": 0.9725954424385913, "grad_norm": 1.9571365118026733, "learning_rate": 1.579021297287044e-05, "loss": 0.2208, "step": 2054 }, { "epoch": 0.9730689553122226, "grad_norm": 1.8578828573226929, "learning_rate": 1.5786042625274795e-05, "loss": 0.242, "step": 2055 }, { "epoch": 0.9735424681858538, "grad_norm": 1.3278276920318604, "learning_rate": 1.5781870764395515e-05, "loss": 0.2039, "step": 2056 }, { "epoch": 0.9740159810594851, "grad_norm": 1.0172450542449951, "learning_rate": 1.5777697391323717e-05, "loss": 0.2138, "step": 2057 }, { "epoch": 0.9744894939331163, "grad_norm": 1.599645733833313, "learning_rate": 1.577352250715091e-05, "loss": 0.2476, "step": 2058 }, { "epoch": 0.9749630068067475, "grad_norm": 1.6316616535186768, "learning_rate": 1.5769346112968985e-05, "loss": 0.2329, "step": 2059 }, { "epoch": 0.9754365196803788, "grad_norm": 1.9370191097259521, "learning_rate": 1.5765168209870243e-05, "loss": 0.2382, "step": 2060 }, { "epoch": 0.9759100325540101, "grad_norm": 1.4778871536254883, "learning_rate": 1.5760988798947372e-05, "loss": 0.2407, "step": 2061 }, { "epoch": 0.9763835454276413, "grad_norm": 1.1680418252944946, "learning_rate": 1.575680788129346e-05, "loss": 0.2301, "step": 2062 }, { "epoch": 0.9768570583012726, "grad_norm": 1.1255524158477783, "learning_rate": 1.575262545800198e-05, "loss": 0.2359, "step": 2063 }, { "epoch": 0.9773305711749039, "grad_norm": 1.652750849723816, "learning_rate": 1.5748441530166814e-05, "loss": 0.2416, "step": 2064 }, { "epoch": 0.977804084048535, "grad_norm": 1.3256607055664062, "learning_rate": 1.5744256098882217e-05, "loss": 0.2516, "step": 2065 }, { "epoch": 0.9782775969221663, "grad_norm": 1.3550041913986206, "learning_rate": 1.5740069165242854e-05, "loss": 0.2473, "step": 2066 }, { "epoch": 0.9787511097957976, "grad_norm": 1.5172563791275024, "learning_rate": 1.5735880730343776e-05, "loss": 0.2229, "step": 2067 }, { "epoch": 0.9792246226694288, "grad_norm": 1.0142927169799805, "learning_rate": 1.573169079528043e-05, "loss": 0.2393, "step": 2068 }, { "epoch": 0.9796981355430601, "grad_norm": 0.9681066274642944, "learning_rate": 1.5727499361148647e-05, "loss": 0.2281, "step": 2069 }, { "epoch": 0.9801716484166914, "grad_norm": 1.673020362854004, "learning_rate": 1.5723306429044663e-05, "loss": 0.2345, "step": 2070 }, { "epoch": 0.9806451612903225, "grad_norm": 1.8111623525619507, "learning_rate": 1.57191120000651e-05, "loss": 0.207, "step": 2071 }, { "epoch": 0.9811186741639538, "grad_norm": 2.4038047790527344, "learning_rate": 1.571491607530696e-05, "loss": 0.222, "step": 2072 }, { "epoch": 0.9815921870375851, "grad_norm": 2.7947847843170166, "learning_rate": 1.5710718655867658e-05, "loss": 0.2338, "step": 2073 }, { "epoch": 0.9820656999112163, "grad_norm": 2.207833766937256, "learning_rate": 1.5706519742844982e-05, "loss": 0.2417, "step": 2074 }, { "epoch": 0.9825392127848476, "grad_norm": 1.7318761348724365, "learning_rate": 1.5702319337337118e-05, "loss": 0.2323, "step": 2075 }, { "epoch": 0.9830127256584789, "grad_norm": 1.349669098854065, "learning_rate": 1.5698117440442643e-05, "loss": 0.2377, "step": 2076 }, { "epoch": 0.9834862385321101, "grad_norm": 1.8161946535110474, "learning_rate": 1.5693914053260524e-05, "loss": 0.2341, "step": 2077 }, { "epoch": 0.9839597514057413, "grad_norm": 2.296698570251465, "learning_rate": 1.5689709176890113e-05, "loss": 0.226, "step": 2078 }, { "epoch": 0.9844332642793726, "grad_norm": 1.6288074254989624, "learning_rate": 1.5685502812431156e-05, "loss": 0.2476, "step": 2079 }, { "epoch": 0.9849067771530039, "grad_norm": 1.528737187385559, "learning_rate": 1.5681294960983787e-05, "loss": 0.2455, "step": 2080 }, { "epoch": 0.9853802900266351, "grad_norm": 1.2330217361450195, "learning_rate": 1.567708562364853e-05, "loss": 0.2377, "step": 2081 }, { "epoch": 0.9858538029002664, "grad_norm": 1.067199945449829, "learning_rate": 1.56728748015263e-05, "loss": 0.2244, "step": 2082 }, { "epoch": 0.9863273157738977, "grad_norm": 1.0939373970031738, "learning_rate": 1.566866249571839e-05, "loss": 0.2275, "step": 2083 }, { "epoch": 0.9868008286475288, "grad_norm": 1.1805354356765747, "learning_rate": 1.566444870732649e-05, "loss": 0.2211, "step": 2084 }, { "epoch": 0.9872743415211601, "grad_norm": 2.558940887451172, "learning_rate": 1.5660233437452676e-05, "loss": 0.2368, "step": 2085 }, { "epoch": 0.9877478543947914, "grad_norm": 1.4615000486373901, "learning_rate": 1.565601668719941e-05, "loss": 0.2194, "step": 2086 }, { "epoch": 0.9882213672684226, "grad_norm": 1.341464638710022, "learning_rate": 1.565179845766955e-05, "loss": 0.2447, "step": 2087 }, { "epoch": 0.9886948801420539, "grad_norm": 1.341123104095459, "learning_rate": 1.564757874996632e-05, "loss": 0.2241, "step": 2088 }, { "epoch": 0.9891683930156852, "grad_norm": 0.9674537181854248, "learning_rate": 1.5643357565193355e-05, "loss": 0.2312, "step": 2089 }, { "epoch": 0.9896419058893163, "grad_norm": 1.1203300952911377, "learning_rate": 1.5639134904454663e-05, "loss": 0.2513, "step": 2090 }, { "epoch": 0.9901154187629476, "grad_norm": 1.7176257371902466, "learning_rate": 1.5634910768854634e-05, "loss": 0.2076, "step": 2091 }, { "epoch": 0.9905889316365789, "grad_norm": 2.126793146133423, "learning_rate": 1.5630685159498057e-05, "loss": 0.2445, "step": 2092 }, { "epoch": 0.9910624445102101, "grad_norm": 1.8809027671813965, "learning_rate": 1.56264580774901e-05, "loss": 0.2538, "step": 2093 }, { "epoch": 0.9915359573838414, "grad_norm": 1.7825748920440674, "learning_rate": 1.562222952393631e-05, "loss": 0.2337, "step": 2094 }, { "epoch": 0.9920094702574727, "grad_norm": 1.1806340217590332, "learning_rate": 1.5617999499942623e-05, "loss": 0.2319, "step": 2095 }, { "epoch": 0.9924829831311038, "grad_norm": 1.4441994428634644, "learning_rate": 1.5613768006615367e-05, "loss": 0.2383, "step": 2096 }, { "epoch": 0.9929564960047351, "grad_norm": 0.8531612157821655, "learning_rate": 1.5609535045061247e-05, "loss": 0.2205, "step": 2097 }, { "epoch": 0.9934300088783664, "grad_norm": 1.457108497619629, "learning_rate": 1.5605300616387347e-05, "loss": 0.2266, "step": 2098 }, { "epoch": 0.9939035217519976, "grad_norm": 1.050585389137268, "learning_rate": 1.5601064721701155e-05, "loss": 0.2373, "step": 2099 }, { "epoch": 0.9943770346256289, "grad_norm": 0.9880205392837524, "learning_rate": 1.5596827362110512e-05, "loss": 0.2344, "step": 2100 }, { "epoch": 0.9948505474992602, "grad_norm": 1.1303571462631226, "learning_rate": 1.559258853872367e-05, "loss": 0.2044, "step": 2101 }, { "epoch": 0.9953240603728913, "grad_norm": 1.6071451902389526, "learning_rate": 1.5588348252649246e-05, "loss": 0.2096, "step": 2102 }, { "epoch": 0.9957975732465226, "grad_norm": 1.32492995262146, "learning_rate": 1.5584106504996247e-05, "loss": 0.2312, "step": 2103 }, { "epoch": 0.9962710861201539, "grad_norm": 1.9682942628860474, "learning_rate": 1.5579863296874066e-05, "loss": 0.2378, "step": 2104 }, { "epoch": 0.9967445989937851, "grad_norm": 1.4948111772537231, "learning_rate": 1.5575618629392466e-05, "loss": 0.2195, "step": 2105 }, { "epoch": 0.9972181118674164, "grad_norm": 1.1894011497497559, "learning_rate": 1.5571372503661604e-05, "loss": 0.2456, "step": 2106 }, { "epoch": 0.9976916247410477, "grad_norm": 1.0907820463180542, "learning_rate": 1.556712492079201e-05, "loss": 0.2328, "step": 2107 }, { "epoch": 0.998165137614679, "grad_norm": 1.6044102907180786, "learning_rate": 1.5562875881894605e-05, "loss": 0.2159, "step": 2108 }, { "epoch": 0.9986386504883101, "grad_norm": 2.430710554122925, "learning_rate": 1.5558625388080676e-05, "loss": 0.2135, "step": 2109 }, { "epoch": 0.9991121633619414, "grad_norm": 1.4718191623687744, "learning_rate": 1.5554373440461904e-05, "loss": 0.2448, "step": 2110 }, { "epoch": 0.9995856762355727, "grad_norm": 1.1206008195877075, "learning_rate": 1.5550120040150338e-05, "loss": 0.2055, "step": 2111 }, { "epoch": 1.0000591891092039, "grad_norm": 1.4358619451522827, "learning_rate": 1.5545865188258423e-05, "loss": 0.2423, "step": 2112 }, { "epoch": 1.0005327019828352, "grad_norm": 1.060018539428711, "learning_rate": 1.5541608885898968e-05, "loss": 0.2124, "step": 2113 }, { "epoch": 1.0010062148564665, "grad_norm": 1.6677261590957642, "learning_rate": 1.553735113418517e-05, "loss": 0.2323, "step": 2114 }, { "epoch": 1.0014797277300977, "grad_norm": 1.8145389556884766, "learning_rate": 1.55330919342306e-05, "loss": 0.2383, "step": 2115 }, { "epoch": 1.0019532406037288, "grad_norm": 1.2736999988555908, "learning_rate": 1.552883128714922e-05, "loss": 0.2094, "step": 2116 }, { "epoch": 1.00242675347736, "grad_norm": 1.4934360980987549, "learning_rate": 1.552456919405535e-05, "loss": 0.2103, "step": 2117 }, { "epoch": 1.0029002663509914, "grad_norm": 1.330871343612671, "learning_rate": 1.5520305656063702e-05, "loss": 0.2176, "step": 2118 }, { "epoch": 1.0033737792246227, "grad_norm": 1.226676344871521, "learning_rate": 1.5516040674289364e-05, "loss": 0.2181, "step": 2119 }, { "epoch": 1.003847292098254, "grad_norm": 0.9475164413452148, "learning_rate": 1.5511774249847806e-05, "loss": 0.2092, "step": 2120 }, { "epoch": 1.0043208049718853, "grad_norm": 2.017439126968384, "learning_rate": 1.5507506383854867e-05, "loss": 0.2404, "step": 2121 }, { "epoch": 1.0047943178455163, "grad_norm": 2.2948458194732666, "learning_rate": 1.5503237077426762e-05, "loss": 0.2225, "step": 2122 }, { "epoch": 1.0052678307191476, "grad_norm": 1.144311547279358, "learning_rate": 1.5498966331680093e-05, "loss": 0.2437, "step": 2123 }, { "epoch": 1.005741343592779, "grad_norm": 1.512420892715454, "learning_rate": 1.5494694147731822e-05, "loss": 0.2337, "step": 2124 }, { "epoch": 1.0062148564664102, "grad_norm": 1.3547443151474, "learning_rate": 1.549042052669931e-05, "loss": 0.2101, "step": 2125 }, { "epoch": 1.0066883693400415, "grad_norm": 1.429449200630188, "learning_rate": 1.5486145469700278e-05, "loss": 0.2054, "step": 2126 }, { "epoch": 1.0071618822136728, "grad_norm": 2.169478178024292, "learning_rate": 1.5481868977852823e-05, "loss": 0.2465, "step": 2127 }, { "epoch": 1.007635395087304, "grad_norm": 1.4005268812179565, "learning_rate": 1.547759105227542e-05, "loss": 0.2212, "step": 2128 }, { "epoch": 1.008108907960935, "grad_norm": 1.521437644958496, "learning_rate": 1.547331169408692e-05, "loss": 0.2458, "step": 2129 }, { "epoch": 1.0085824208345664, "grad_norm": 1.0772839784622192, "learning_rate": 1.5469030904406554e-05, "loss": 0.2265, "step": 2130 }, { "epoch": 1.0090559337081977, "grad_norm": 1.4369512796401978, "learning_rate": 1.546474868435391e-05, "loss": 0.2037, "step": 2131 }, { "epoch": 1.009529446581829, "grad_norm": 1.2946951389312744, "learning_rate": 1.546046503504897e-05, "loss": 0.2261, "step": 2132 }, { "epoch": 1.0100029594554603, "grad_norm": 1.3359379768371582, "learning_rate": 1.5456179957612074e-05, "loss": 0.2487, "step": 2133 }, { "epoch": 1.0104764723290915, "grad_norm": 2.1459126472473145, "learning_rate": 1.545189345316395e-05, "loss": 0.2004, "step": 2134 }, { "epoch": 1.0109499852027226, "grad_norm": 2.0224199295043945, "learning_rate": 1.5447605522825687e-05, "loss": 0.2237, "step": 2135 }, { "epoch": 1.011423498076354, "grad_norm": 1.4307156801223755, "learning_rate": 1.5443316167718756e-05, "loss": 0.2076, "step": 2136 }, { "epoch": 1.0118970109499852, "grad_norm": 1.0832078456878662, "learning_rate": 1.543902538896499e-05, "loss": 0.2175, "step": 2137 }, { "epoch": 1.0123705238236165, "grad_norm": 1.5245671272277832, "learning_rate": 1.543473318768661e-05, "loss": 0.245, "step": 2138 }, { "epoch": 1.0128440366972478, "grad_norm": 1.020795226097107, "learning_rate": 1.5430439565006193e-05, "loss": 0.2282, "step": 2139 }, { "epoch": 1.013317549570879, "grad_norm": 2.5934853553771973, "learning_rate": 1.5426144522046692e-05, "loss": 0.2264, "step": 2140 }, { "epoch": 1.0137910624445101, "grad_norm": 1.0803589820861816, "learning_rate": 1.5421848059931443e-05, "loss": 0.2158, "step": 2141 }, { "epoch": 1.0142645753181414, "grad_norm": 1.115559697151184, "learning_rate": 1.541755017978414e-05, "loss": 0.2259, "step": 2142 }, { "epoch": 1.0147380881917727, "grad_norm": 1.1411402225494385, "learning_rate": 1.5413250882728847e-05, "loss": 0.2196, "step": 2143 }, { "epoch": 1.015211601065404, "grad_norm": 1.0891308784484863, "learning_rate": 1.540895016989001e-05, "loss": 0.2257, "step": 2144 }, { "epoch": 1.0156851139390353, "grad_norm": 1.2236292362213135, "learning_rate": 1.5404648042392437e-05, "loss": 0.2483, "step": 2145 }, { "epoch": 1.0161586268126666, "grad_norm": 1.3101778030395508, "learning_rate": 1.5400344501361305e-05, "loss": 0.2389, "step": 2146 }, { "epoch": 1.0166321396862976, "grad_norm": 1.1559494733810425, "learning_rate": 1.5396039547922174e-05, "loss": 0.2229, "step": 2147 }, { "epoch": 1.017105652559929, "grad_norm": 1.1943864822387695, "learning_rate": 1.5391733183200952e-05, "loss": 0.2181, "step": 2148 }, { "epoch": 1.0175791654335602, "grad_norm": 1.2304420471191406, "learning_rate": 1.5387425408323934e-05, "loss": 0.2105, "step": 2149 }, { "epoch": 1.0180526783071915, "grad_norm": 1.0440901517868042, "learning_rate": 1.5383116224417767e-05, "loss": 0.2332, "step": 2150 }, { "epoch": 1.0185261911808228, "grad_norm": 1.2044522762298584, "learning_rate": 1.5378805632609487e-05, "loss": 0.2375, "step": 2151 }, { "epoch": 1.018999704054454, "grad_norm": 1.4749743938446045, "learning_rate": 1.5374493634026486e-05, "loss": 0.1913, "step": 2152 }, { "epoch": 1.0194732169280851, "grad_norm": 1.4272947311401367, "learning_rate": 1.537018022979652e-05, "loss": 0.22, "step": 2153 }, { "epoch": 1.0199467298017164, "grad_norm": 1.7914042472839355, "learning_rate": 1.5365865421047724e-05, "loss": 0.2344, "step": 2154 }, { "epoch": 1.0204202426753477, "grad_norm": 1.1792020797729492, "learning_rate": 1.5361549208908594e-05, "loss": 0.2023, "step": 2155 }, { "epoch": 1.020893755548979, "grad_norm": 1.5299314260482788, "learning_rate": 1.5357231594507988e-05, "loss": 0.2242, "step": 2156 }, { "epoch": 1.0213672684226103, "grad_norm": 1.1294841766357422, "learning_rate": 1.5352912578975144e-05, "loss": 0.2355, "step": 2157 }, { "epoch": 1.0218407812962416, "grad_norm": 1.5787099599838257, "learning_rate": 1.5348592163439655e-05, "loss": 0.2163, "step": 2158 }, { "epoch": 1.0223142941698729, "grad_norm": 1.2531263828277588, "learning_rate": 1.5344270349031486e-05, "loss": 0.2435, "step": 2159 }, { "epoch": 1.022787807043504, "grad_norm": 1.2263544797897339, "learning_rate": 1.5339947136880962e-05, "loss": 0.213, "step": 2160 }, { "epoch": 1.0232613199171352, "grad_norm": 1.8795762062072754, "learning_rate": 1.5335622528118777e-05, "loss": 0.2309, "step": 2161 }, { "epoch": 1.0237348327907665, "grad_norm": 1.558018684387207, "learning_rate": 1.5331296523876e-05, "loss": 0.231, "step": 2162 }, { "epoch": 1.0242083456643978, "grad_norm": 1.8025420904159546, "learning_rate": 1.5326969125284043e-05, "loss": 0.202, "step": 2163 }, { "epoch": 1.024681858538029, "grad_norm": 1.190397024154663, "learning_rate": 1.5322640333474704e-05, "loss": 0.2224, "step": 2164 }, { "epoch": 1.0251553714116604, "grad_norm": 1.9141662120819092, "learning_rate": 1.5318310149580133e-05, "loss": 0.2584, "step": 2165 }, { "epoch": 1.0256288842852914, "grad_norm": 1.1253691911697388, "learning_rate": 1.531397857473285e-05, "loss": 0.2169, "step": 2166 }, { "epoch": 1.0261023971589227, "grad_norm": 1.264083981513977, "learning_rate": 1.530964561006574e-05, "loss": 0.2143, "step": 2167 }, { "epoch": 1.026575910032554, "grad_norm": 1.5587284564971924, "learning_rate": 1.5305311256712038e-05, "loss": 0.2312, "step": 2168 }, { "epoch": 1.0270494229061853, "grad_norm": 1.0813679695129395, "learning_rate": 1.5300975515805358e-05, "loss": 0.2241, "step": 2169 }, { "epoch": 1.0275229357798166, "grad_norm": 1.4243192672729492, "learning_rate": 1.5296638388479673e-05, "loss": 0.2121, "step": 2170 }, { "epoch": 1.0279964486534479, "grad_norm": 1.1746392250061035, "learning_rate": 1.5292299875869313e-05, "loss": 0.238, "step": 2171 }, { "epoch": 1.028469961527079, "grad_norm": 1.360106110572815, "learning_rate": 1.528795997910898e-05, "loss": 0.2336, "step": 2172 }, { "epoch": 1.0289434744007102, "grad_norm": 1.4826574325561523, "learning_rate": 1.5283618699333725e-05, "loss": 0.2278, "step": 2173 }, { "epoch": 1.0294169872743415, "grad_norm": 1.1893174648284912, "learning_rate": 1.5279276037678972e-05, "loss": 0.2175, "step": 2174 }, { "epoch": 1.0298905001479728, "grad_norm": 1.326848030090332, "learning_rate": 1.5274931995280504e-05, "loss": 0.2246, "step": 2175 }, { "epoch": 1.030364013021604, "grad_norm": 1.2376220226287842, "learning_rate": 1.527058657327446e-05, "loss": 0.2405, "step": 2176 }, { "epoch": 1.0308375258952354, "grad_norm": 0.9731859564781189, "learning_rate": 1.5266239772797343e-05, "loss": 0.2302, "step": 2177 }, { "epoch": 1.0313110387688664, "grad_norm": 1.755089521408081, "learning_rate": 1.5261891594986014e-05, "loss": 0.1949, "step": 2178 }, { "epoch": 1.0317845516424977, "grad_norm": 2.296330451965332, "learning_rate": 1.5257542040977706e-05, "loss": 0.2192, "step": 2179 }, { "epoch": 1.032258064516129, "grad_norm": 2.243488311767578, "learning_rate": 1.5253191111909997e-05, "loss": 0.2227, "step": 2180 }, { "epoch": 1.0327315773897603, "grad_norm": 1.5681140422821045, "learning_rate": 1.524883880892083e-05, "loss": 0.2159, "step": 2181 }, { "epoch": 1.0332050902633916, "grad_norm": 1.9723291397094727, "learning_rate": 1.5244485133148507e-05, "loss": 0.232, "step": 2182 }, { "epoch": 1.0336786031370229, "grad_norm": 1.862541913986206, "learning_rate": 1.5240130085731695e-05, "loss": 0.2463, "step": 2183 }, { "epoch": 1.034152116010654, "grad_norm": 1.2561465501785278, "learning_rate": 1.5235773667809409e-05, "loss": 0.2121, "step": 2184 }, { "epoch": 1.0346256288842852, "grad_norm": 1.7062082290649414, "learning_rate": 1.5231415880521032e-05, "loss": 0.2215, "step": 2185 }, { "epoch": 1.0350991417579165, "grad_norm": 1.24093496799469, "learning_rate": 1.5227056725006301e-05, "loss": 0.2052, "step": 2186 }, { "epoch": 1.0355726546315478, "grad_norm": 1.7129935026168823, "learning_rate": 1.5222696202405307e-05, "loss": 0.1988, "step": 2187 }, { "epoch": 1.036046167505179, "grad_norm": 1.3242411613464355, "learning_rate": 1.5218334313858507e-05, "loss": 0.2089, "step": 2188 }, { "epoch": 1.0365196803788104, "grad_norm": 2.027369737625122, "learning_rate": 1.5213971060506709e-05, "loss": 0.2572, "step": 2189 }, { "epoch": 1.0369931932524414, "grad_norm": 1.3638339042663574, "learning_rate": 1.520960644349108e-05, "loss": 0.2261, "step": 2190 }, { "epoch": 1.0374667061260727, "grad_norm": 1.4748543500900269, "learning_rate": 1.5205240463953146e-05, "loss": 0.2149, "step": 2191 }, { "epoch": 1.037940218999704, "grad_norm": 1.1796306371688843, "learning_rate": 1.5200873123034783e-05, "loss": 0.237, "step": 2192 }, { "epoch": 1.0384137318733353, "grad_norm": 1.1525838375091553, "learning_rate": 1.5196504421878229e-05, "loss": 0.2385, "step": 2193 }, { "epoch": 1.0388872447469666, "grad_norm": 2.3929810523986816, "learning_rate": 1.5192134361626074e-05, "loss": 0.2005, "step": 2194 }, { "epoch": 1.0393607576205979, "grad_norm": 1.697789192199707, "learning_rate": 1.5187762943421266e-05, "loss": 0.2422, "step": 2195 }, { "epoch": 1.0398342704942292, "grad_norm": 1.2148844003677368, "learning_rate": 1.5183390168407108e-05, "loss": 0.2188, "step": 2196 }, { "epoch": 1.0403077833678602, "grad_norm": 1.2797152996063232, "learning_rate": 1.5179016037727256e-05, "loss": 0.215, "step": 2197 }, { "epoch": 1.0407812962414915, "grad_norm": 1.7191182374954224, "learning_rate": 1.5174640552525724e-05, "loss": 0.2422, "step": 2198 }, { "epoch": 1.0412548091151228, "grad_norm": 1.3166422843933105, "learning_rate": 1.5170263713946873e-05, "loss": 0.22, "step": 2199 }, { "epoch": 1.041728321988754, "grad_norm": 1.1417112350463867, "learning_rate": 1.5165885523135426e-05, "loss": 0.2137, "step": 2200 }, { "epoch": 1.0422018348623854, "grad_norm": 1.1441370248794556, "learning_rate": 1.516150598123646e-05, "loss": 0.2146, "step": 2201 }, { "epoch": 1.0426753477360167, "grad_norm": 1.5537104606628418, "learning_rate": 1.5157125089395397e-05, "loss": 0.21, "step": 2202 }, { "epoch": 1.0431488606096477, "grad_norm": 1.3530882596969604, "learning_rate": 1.5152742848758018e-05, "loss": 0.2085, "step": 2203 }, { "epoch": 1.043622373483279, "grad_norm": 1.541207194328308, "learning_rate": 1.5148359260470456e-05, "loss": 0.2315, "step": 2204 }, { "epoch": 1.0440958863569103, "grad_norm": 1.0132625102996826, "learning_rate": 1.5143974325679196e-05, "loss": 0.2008, "step": 2205 }, { "epoch": 1.0445693992305416, "grad_norm": 1.1668498516082764, "learning_rate": 1.5139588045531077e-05, "loss": 0.2239, "step": 2206 }, { "epoch": 1.0450429121041729, "grad_norm": 1.6192450523376465, "learning_rate": 1.5135200421173288e-05, "loss": 0.2306, "step": 2207 }, { "epoch": 1.0455164249778042, "grad_norm": 1.6233320236206055, "learning_rate": 1.5130811453753369e-05, "loss": 0.2243, "step": 2208 }, { "epoch": 1.0459899378514352, "grad_norm": 1.1391911506652832, "learning_rate": 1.512642114441921e-05, "loss": 0.1966, "step": 2209 }, { "epoch": 1.0464634507250665, "grad_norm": 1.7349532842636108, "learning_rate": 1.512202949431906e-05, "loss": 0.2256, "step": 2210 }, { "epoch": 1.0469369635986978, "grad_norm": 1.411200761795044, "learning_rate": 1.5117636504601505e-05, "loss": 0.2225, "step": 2211 }, { "epoch": 1.047410476472329, "grad_norm": 1.5413966178894043, "learning_rate": 1.5113242176415495e-05, "loss": 0.2301, "step": 2212 }, { "epoch": 1.0478839893459604, "grad_norm": 1.4936213493347168, "learning_rate": 1.5108846510910322e-05, "loss": 0.2299, "step": 2213 }, { "epoch": 1.0483575022195917, "grad_norm": 2.126307964324951, "learning_rate": 1.5104449509235628e-05, "loss": 0.2195, "step": 2214 }, { "epoch": 1.0488310150932227, "grad_norm": 1.6920348405838013, "learning_rate": 1.5100051172541408e-05, "loss": 0.2081, "step": 2215 }, { "epoch": 1.049304527966854, "grad_norm": 1.1273527145385742, "learning_rate": 1.5095651501978007e-05, "loss": 0.2157, "step": 2216 }, { "epoch": 1.0497780408404853, "grad_norm": 0.9604322910308838, "learning_rate": 1.5091250498696113e-05, "loss": 0.2404, "step": 2217 }, { "epoch": 1.0502515537141166, "grad_norm": 1.639671802520752, "learning_rate": 1.508684816384677e-05, "loss": 0.2384, "step": 2218 }, { "epoch": 1.050725066587748, "grad_norm": 1.559471845626831, "learning_rate": 1.5082444498581362e-05, "loss": 0.2078, "step": 2219 }, { "epoch": 1.0511985794613792, "grad_norm": 1.1831077337265015, "learning_rate": 1.5078039504051626e-05, "loss": 0.2396, "step": 2220 }, { "epoch": 1.0516720923350102, "grad_norm": 1.6754920482635498, "learning_rate": 1.5073633181409645e-05, "loss": 0.2161, "step": 2221 }, { "epoch": 1.0521456052086415, "grad_norm": 1.6285377740859985, "learning_rate": 1.5069225531807852e-05, "loss": 0.2252, "step": 2222 }, { "epoch": 1.0526191180822728, "grad_norm": 0.9481242895126343, "learning_rate": 1.5064816556399027e-05, "loss": 0.2061, "step": 2223 }, { "epoch": 1.053092630955904, "grad_norm": 1.5364612340927124, "learning_rate": 1.506040625633629e-05, "loss": 0.229, "step": 2224 }, { "epoch": 1.0535661438295354, "grad_norm": 1.2100945711135864, "learning_rate": 1.5055994632773119e-05, "loss": 0.2278, "step": 2225 }, { "epoch": 1.0540396567031667, "grad_norm": 1.6874003410339355, "learning_rate": 1.5051581686863323e-05, "loss": 0.2117, "step": 2226 }, { "epoch": 1.054513169576798, "grad_norm": 1.3330941200256348, "learning_rate": 1.5047167419761075e-05, "loss": 0.2267, "step": 2227 }, { "epoch": 1.054986682450429, "grad_norm": 1.1632763147354126, "learning_rate": 1.5042751832620879e-05, "loss": 0.234, "step": 2228 }, { "epoch": 1.0554601953240603, "grad_norm": 1.2275227308273315, "learning_rate": 1.5038334926597587e-05, "loss": 0.2079, "step": 2229 }, { "epoch": 1.0559337081976916, "grad_norm": 1.616982102394104, "learning_rate": 1.50339167028464e-05, "loss": 0.2224, "step": 2230 }, { "epoch": 1.056407221071323, "grad_norm": 1.067596197128296, "learning_rate": 1.5029497162522865e-05, "loss": 0.229, "step": 2231 }, { "epoch": 1.0568807339449542, "grad_norm": 1.3072737455368042, "learning_rate": 1.5025076306782866e-05, "loss": 0.205, "step": 2232 }, { "epoch": 1.0573542468185855, "grad_norm": 1.5054287910461426, "learning_rate": 1.5020654136782637e-05, "loss": 0.2159, "step": 2233 }, { "epoch": 1.0578277596922165, "grad_norm": 1.238438606262207, "learning_rate": 1.5016230653678757e-05, "loss": 0.2145, "step": 2234 }, { "epoch": 1.0583012725658478, "grad_norm": 1.200659155845642, "learning_rate": 1.5011805858628137e-05, "loss": 0.26, "step": 2235 }, { "epoch": 1.0587747854394791, "grad_norm": 1.4251799583435059, "learning_rate": 1.5007379752788045e-05, "loss": 0.2004, "step": 2236 }, { "epoch": 1.0592482983131104, "grad_norm": 1.2545171976089478, "learning_rate": 1.5002952337316088e-05, "loss": 0.2127, "step": 2237 }, { "epoch": 1.0597218111867417, "grad_norm": 1.177280306816101, "learning_rate": 1.499852361337021e-05, "loss": 0.212, "step": 2238 }, { "epoch": 1.060195324060373, "grad_norm": 1.241126537322998, "learning_rate": 1.4994093582108704e-05, "loss": 0.2193, "step": 2239 }, { "epoch": 1.060668836934004, "grad_norm": 1.5119160413742065, "learning_rate": 1.49896622446902e-05, "loss": 0.2198, "step": 2240 }, { "epoch": 1.0611423498076353, "grad_norm": 1.6282936334609985, "learning_rate": 1.498522960227367e-05, "loss": 0.2185, "step": 2241 }, { "epoch": 1.0616158626812666, "grad_norm": 0.9974349737167358, "learning_rate": 1.4980795656018432e-05, "loss": 0.2323, "step": 2242 }, { "epoch": 1.062089375554898, "grad_norm": 1.2673429250717163, "learning_rate": 1.4976360407084141e-05, "loss": 0.2032, "step": 2243 }, { "epoch": 1.0625628884285292, "grad_norm": 1.0515581369400024, "learning_rate": 1.4971923856630792e-05, "loss": 0.201, "step": 2244 }, { "epoch": 1.0630364013021605, "grad_norm": 1.0648077726364136, "learning_rate": 1.4967486005818727e-05, "loss": 0.2237, "step": 2245 }, { "epoch": 1.0635099141757918, "grad_norm": 1.3801108598709106, "learning_rate": 1.4963046855808618e-05, "loss": 0.2375, "step": 2246 }, { "epoch": 1.0639834270494228, "grad_norm": 1.4944959878921509, "learning_rate": 1.4958606407761482e-05, "loss": 0.2441, "step": 2247 }, { "epoch": 1.0644569399230541, "grad_norm": 2.0347981452941895, "learning_rate": 1.4954164662838677e-05, "loss": 0.2568, "step": 2248 }, { "epoch": 1.0649304527966854, "grad_norm": 1.357796311378479, "learning_rate": 1.4949721622201896e-05, "loss": 0.2161, "step": 2249 }, { "epoch": 1.0654039656703167, "grad_norm": 1.0744569301605225, "learning_rate": 1.4945277287013178e-05, "loss": 0.2269, "step": 2250 }, { "epoch": 1.065877478543948, "grad_norm": 1.2349190711975098, "learning_rate": 1.4940831658434893e-05, "loss": 0.2213, "step": 2251 }, { "epoch": 1.066350991417579, "grad_norm": 1.181551456451416, "learning_rate": 1.4936384737629753e-05, "loss": 0.218, "step": 2252 }, { "epoch": 1.0668245042912103, "grad_norm": 1.792569637298584, "learning_rate": 1.4931936525760806e-05, "loss": 0.2103, "step": 2253 }, { "epoch": 1.0672980171648416, "grad_norm": 1.04933500289917, "learning_rate": 1.4927487023991441e-05, "loss": 0.2085, "step": 2254 }, { "epoch": 1.067771530038473, "grad_norm": 1.1748439073562622, "learning_rate": 1.4923036233485383e-05, "loss": 0.2093, "step": 2255 }, { "epoch": 1.0682450429121042, "grad_norm": 1.149575114250183, "learning_rate": 1.4918584155406688e-05, "loss": 0.251, "step": 2256 }, { "epoch": 1.0687185557857355, "grad_norm": 1.4834145307540894, "learning_rate": 1.4914130790919761e-05, "loss": 0.2117, "step": 2257 }, { "epoch": 1.0691920686593668, "grad_norm": 1.1242401599884033, "learning_rate": 1.4909676141189332e-05, "loss": 0.2525, "step": 2258 }, { "epoch": 1.0696655815329978, "grad_norm": 2.0916712284088135, "learning_rate": 1.4905220207380468e-05, "loss": 0.2408, "step": 2259 }, { "epoch": 1.0701390944066291, "grad_norm": 1.4777238368988037, "learning_rate": 1.4900762990658585e-05, "loss": 0.2203, "step": 2260 }, { "epoch": 1.0706126072802604, "grad_norm": 1.251544713973999, "learning_rate": 1.4896304492189417e-05, "loss": 0.2283, "step": 2261 }, { "epoch": 1.0710861201538917, "grad_norm": 1.3505147695541382, "learning_rate": 1.489184471313905e-05, "loss": 0.2255, "step": 2262 }, { "epoch": 1.071559633027523, "grad_norm": 1.4952011108398438, "learning_rate": 1.4887383654673889e-05, "loss": 0.2214, "step": 2263 }, { "epoch": 1.0720331459011543, "grad_norm": 1.3948256969451904, "learning_rate": 1.488292131796068e-05, "loss": 0.2502, "step": 2264 }, { "epoch": 1.0725066587747853, "grad_norm": 1.5061980485916138, "learning_rate": 1.4878457704166506e-05, "loss": 0.1799, "step": 2265 }, { "epoch": 1.0729801716484166, "grad_norm": 1.324911117553711, "learning_rate": 1.4873992814458786e-05, "loss": 0.2278, "step": 2266 }, { "epoch": 1.073453684522048, "grad_norm": 1.1821624040603638, "learning_rate": 1.4869526650005264e-05, "loss": 0.2209, "step": 2267 }, { "epoch": 1.0739271973956792, "grad_norm": 1.988358736038208, "learning_rate": 1.4865059211974024e-05, "loss": 0.2366, "step": 2268 }, { "epoch": 1.0744007102693105, "grad_norm": 1.212149739265442, "learning_rate": 1.4860590501533482e-05, "loss": 0.2386, "step": 2269 }, { "epoch": 1.0748742231429418, "grad_norm": 1.2160612344741821, "learning_rate": 1.4856120519852383e-05, "loss": 0.2227, "step": 2270 }, { "epoch": 1.0753477360165729, "grad_norm": 1.1903105974197388, "learning_rate": 1.4851649268099813e-05, "loss": 0.2442, "step": 2271 }, { "epoch": 1.0758212488902041, "grad_norm": 1.6250642538070679, "learning_rate": 1.484717674744518e-05, "loss": 0.23, "step": 2272 }, { "epoch": 1.0762947617638354, "grad_norm": 1.3602406978607178, "learning_rate": 1.484270295905823e-05, "loss": 0.2328, "step": 2273 }, { "epoch": 1.0767682746374667, "grad_norm": 1.4769501686096191, "learning_rate": 1.4838227904109041e-05, "loss": 0.2319, "step": 2274 }, { "epoch": 1.077241787511098, "grad_norm": 1.1678149700164795, "learning_rate": 1.4833751583768017e-05, "loss": 0.221, "step": 2275 }, { "epoch": 1.0777153003847293, "grad_norm": 2.216442823410034, "learning_rate": 1.48292739992059e-05, "loss": 0.2286, "step": 2276 }, { "epoch": 1.0781888132583606, "grad_norm": 2.1490554809570312, "learning_rate": 1.4824795151593756e-05, "loss": 0.2138, "step": 2277 }, { "epoch": 1.0786623261319916, "grad_norm": 1.926507830619812, "learning_rate": 1.4820315042102986e-05, "loss": 0.2501, "step": 2278 }, { "epoch": 1.079135839005623, "grad_norm": 1.7893540859222412, "learning_rate": 1.481583367190532e-05, "loss": 0.2324, "step": 2279 }, { "epoch": 1.0796093518792542, "grad_norm": 1.2722523212432861, "learning_rate": 1.4811351042172813e-05, "loss": 0.2112, "step": 2280 }, { "epoch": 1.0800828647528855, "grad_norm": 2.427342414855957, "learning_rate": 1.480686715407786e-05, "loss": 0.2359, "step": 2281 }, { "epoch": 1.0805563776265168, "grad_norm": 1.4813814163208008, "learning_rate": 1.4802382008793174e-05, "loss": 0.2157, "step": 2282 }, { "epoch": 1.0810298905001479, "grad_norm": 1.7409425973892212, "learning_rate": 1.4797895607491803e-05, "loss": 0.2105, "step": 2283 }, { "epoch": 1.0815034033737791, "grad_norm": 1.6621959209442139, "learning_rate": 1.4793407951347125e-05, "loss": 0.2328, "step": 2284 }, { "epoch": 1.0819769162474104, "grad_norm": 1.1202408075332642, "learning_rate": 1.4788919041532836e-05, "loss": 0.2422, "step": 2285 }, { "epoch": 1.0824504291210417, "grad_norm": 1.863373041152954, "learning_rate": 1.4784428879222974e-05, "loss": 0.2561, "step": 2286 }, { "epoch": 1.082923941994673, "grad_norm": 1.392762541770935, "learning_rate": 1.4779937465591893e-05, "loss": 0.2376, "step": 2287 }, { "epoch": 1.0833974548683043, "grad_norm": 1.4913394451141357, "learning_rate": 1.4775444801814283e-05, "loss": 0.2238, "step": 2288 }, { "epoch": 1.0838709677419356, "grad_norm": 0.916836142539978, "learning_rate": 1.4770950889065154e-05, "loss": 0.2274, "step": 2289 }, { "epoch": 1.0843444806155667, "grad_norm": 1.3088781833648682, "learning_rate": 1.4766455728519846e-05, "loss": 0.2168, "step": 2290 }, { "epoch": 1.084817993489198, "grad_norm": 1.986661672592163, "learning_rate": 1.4761959321354025e-05, "loss": 0.2137, "step": 2291 }, { "epoch": 1.0852915063628292, "grad_norm": 2.061962604522705, "learning_rate": 1.4757461668743681e-05, "loss": 0.1949, "step": 2292 }, { "epoch": 1.0857650192364605, "grad_norm": 1.4279181957244873, "learning_rate": 1.4752962771865136e-05, "loss": 0.2108, "step": 2293 }, { "epoch": 1.0862385321100918, "grad_norm": 1.3412773609161377, "learning_rate": 1.474846263189503e-05, "loss": 0.2322, "step": 2294 }, { "epoch": 1.086712044983723, "grad_norm": 2.3214855194091797, "learning_rate": 1.4743961250010331e-05, "loss": 0.198, "step": 2295 }, { "epoch": 1.0871855578573542, "grad_norm": 1.6193276643753052, "learning_rate": 1.4739458627388332e-05, "loss": 0.2224, "step": 2296 }, { "epoch": 1.0876590707309854, "grad_norm": 1.793225646018982, "learning_rate": 1.4734954765206652e-05, "loss": 0.2259, "step": 2297 }, { "epoch": 1.0881325836046167, "grad_norm": 1.7120318412780762, "learning_rate": 1.4730449664643234e-05, "loss": 0.2271, "step": 2298 }, { "epoch": 1.088606096478248, "grad_norm": 1.1128010749816895, "learning_rate": 1.4725943326876342e-05, "loss": 0.2003, "step": 2299 }, { "epoch": 1.0890796093518793, "grad_norm": 1.3492709398269653, "learning_rate": 1.472143575308456e-05, "loss": 0.2332, "step": 2300 }, { "epoch": 1.0895531222255106, "grad_norm": 1.4029978513717651, "learning_rate": 1.471692694444681e-05, "loss": 0.2185, "step": 2301 }, { "epoch": 1.0900266350991417, "grad_norm": 1.254938006401062, "learning_rate": 1.471241690214232e-05, "loss": 0.223, "step": 2302 }, { "epoch": 1.090500147972773, "grad_norm": 1.5678329467773438, "learning_rate": 1.4707905627350653e-05, "loss": 0.2186, "step": 2303 }, { "epoch": 1.0909736608464042, "grad_norm": 1.8724418878555298, "learning_rate": 1.4703393121251685e-05, "loss": 0.2135, "step": 2304 }, { "epoch": 1.0914471737200355, "grad_norm": 1.3198801279067993, "learning_rate": 1.4698879385025625e-05, "loss": 0.2076, "step": 2305 }, { "epoch": 1.0919206865936668, "grad_norm": 1.733998417854309, "learning_rate": 1.4694364419852986e-05, "loss": 0.2184, "step": 2306 }, { "epoch": 1.092394199467298, "grad_norm": 0.94035804271698, "learning_rate": 1.4689848226914627e-05, "loss": 0.2124, "step": 2307 }, { "epoch": 1.0928677123409292, "grad_norm": 0.9416120052337646, "learning_rate": 1.4685330807391704e-05, "loss": 0.2126, "step": 2308 }, { "epoch": 1.0933412252145605, "grad_norm": 1.1188304424285889, "learning_rate": 1.468081216246571e-05, "loss": 0.2299, "step": 2309 }, { "epoch": 1.0938147380881917, "grad_norm": 1.0400240421295166, "learning_rate": 1.467629229331845e-05, "loss": 0.2196, "step": 2310 }, { "epoch": 1.094288250961823, "grad_norm": 1.3964438438415527, "learning_rate": 1.4671771201132054e-05, "loss": 0.2477, "step": 2311 }, { "epoch": 1.0947617638354543, "grad_norm": 1.3592251539230347, "learning_rate": 1.4667248887088971e-05, "loss": 0.2073, "step": 2312 }, { "epoch": 1.0952352767090856, "grad_norm": 3.0954136848449707, "learning_rate": 1.4662725352371967e-05, "loss": 0.2125, "step": 2313 }, { "epoch": 1.0957087895827167, "grad_norm": 1.2312672138214111, "learning_rate": 1.4658200598164127e-05, "loss": 0.2282, "step": 2314 }, { "epoch": 1.096182302456348, "grad_norm": 1.0197677612304688, "learning_rate": 1.4653674625648861e-05, "loss": 0.2203, "step": 2315 }, { "epoch": 1.0966558153299792, "grad_norm": 1.2136882543563843, "learning_rate": 1.4649147436009895e-05, "loss": 0.2358, "step": 2316 }, { "epoch": 1.0971293282036105, "grad_norm": 1.1311389207839966, "learning_rate": 1.4644619030431264e-05, "loss": 0.2187, "step": 2317 }, { "epoch": 1.0976028410772418, "grad_norm": 1.0771368741989136, "learning_rate": 1.4640089410097333e-05, "loss": 0.2128, "step": 2318 }, { "epoch": 1.098076353950873, "grad_norm": 1.247301459312439, "learning_rate": 1.463555857619278e-05, "loss": 0.2158, "step": 2319 }, { "epoch": 1.0985498668245044, "grad_norm": 1.5069024562835693, "learning_rate": 1.4631026529902601e-05, "loss": 0.2248, "step": 2320 }, { "epoch": 1.0990233796981355, "grad_norm": 1.451413869857788, "learning_rate": 1.462649327241211e-05, "loss": 0.2121, "step": 2321 }, { "epoch": 1.0994968925717667, "grad_norm": 1.4309951066970825, "learning_rate": 1.4621958804906938e-05, "loss": 0.2378, "step": 2322 }, { "epoch": 1.099970405445398, "grad_norm": 1.5729237794876099, "learning_rate": 1.4617423128573028e-05, "loss": 0.2183, "step": 2323 }, { "epoch": 1.1004439183190293, "grad_norm": 2.354729413986206, "learning_rate": 1.4612886244596647e-05, "loss": 0.223, "step": 2324 }, { "epoch": 1.1009174311926606, "grad_norm": 1.197218894958496, "learning_rate": 1.4608348154164367e-05, "loss": 0.2095, "step": 2325 }, { "epoch": 1.101390944066292, "grad_norm": 2.5518031120300293, "learning_rate": 1.4603808858463085e-05, "loss": 0.2002, "step": 2326 }, { "epoch": 1.101864456939923, "grad_norm": 1.1937416791915894, "learning_rate": 1.459926835868001e-05, "loss": 0.2171, "step": 2327 }, { "epoch": 1.1023379698135543, "grad_norm": 1.7216546535491943, "learning_rate": 1.4594726656002664e-05, "loss": 0.2304, "step": 2328 }, { "epoch": 1.1028114826871855, "grad_norm": 1.408703327178955, "learning_rate": 1.4590183751618889e-05, "loss": 0.2391, "step": 2329 }, { "epoch": 1.1032849955608168, "grad_norm": 1.8831274509429932, "learning_rate": 1.4585639646716836e-05, "loss": 0.2414, "step": 2330 }, { "epoch": 1.1037585084344481, "grad_norm": 1.7065414190292358, "learning_rate": 1.4581094342484972e-05, "loss": 0.2365, "step": 2331 }, { "epoch": 1.1042320213080794, "grad_norm": 1.767787218093872, "learning_rate": 1.4576547840112077e-05, "loss": 0.2229, "step": 2332 }, { "epoch": 1.1047055341817105, "grad_norm": 1.4647932052612305, "learning_rate": 1.4572000140787244e-05, "loss": 0.2404, "step": 2333 }, { "epoch": 1.1051790470553418, "grad_norm": 1.9528988599777222, "learning_rate": 1.4567451245699884e-05, "loss": 0.2326, "step": 2334 }, { "epoch": 1.105652559928973, "grad_norm": 1.284548282623291, "learning_rate": 1.456290115603971e-05, "loss": 0.2263, "step": 2335 }, { "epoch": 1.1061260728026043, "grad_norm": 1.5598032474517822, "learning_rate": 1.4558349872996762e-05, "loss": 0.2211, "step": 2336 }, { "epoch": 1.1065995856762356, "grad_norm": 1.3689323663711548, "learning_rate": 1.4553797397761376e-05, "loss": 0.2168, "step": 2337 }, { "epoch": 1.107073098549867, "grad_norm": 0.9562427401542664, "learning_rate": 1.4549243731524213e-05, "loss": 0.213, "step": 2338 }, { "epoch": 1.107546611423498, "grad_norm": 1.2045793533325195, "learning_rate": 1.4544688875476238e-05, "loss": 0.2287, "step": 2339 }, { "epoch": 1.1080201242971293, "grad_norm": 1.2727464437484741, "learning_rate": 1.4540132830808733e-05, "loss": 0.2112, "step": 2340 }, { "epoch": 1.1084936371707605, "grad_norm": 1.1461354494094849, "learning_rate": 1.4535575598713287e-05, "loss": 0.2019, "step": 2341 }, { "epoch": 1.1089671500443918, "grad_norm": 1.330062985420227, "learning_rate": 1.4531017180381797e-05, "loss": 0.212, "step": 2342 }, { "epoch": 1.1094406629180231, "grad_norm": 1.4322924613952637, "learning_rate": 1.4526457577006473e-05, "loss": 0.2197, "step": 2343 }, { "epoch": 1.1099141757916544, "grad_norm": 1.5362545251846313, "learning_rate": 1.4521896789779842e-05, "loss": 0.2195, "step": 2344 }, { "epoch": 1.1103876886652855, "grad_norm": 1.0807383060455322, "learning_rate": 1.4517334819894724e-05, "loss": 0.2394, "step": 2345 }, { "epoch": 1.1108612015389168, "grad_norm": 1.3042844533920288, "learning_rate": 1.4512771668544266e-05, "loss": 0.2393, "step": 2346 }, { "epoch": 1.111334714412548, "grad_norm": 1.3964802026748657, "learning_rate": 1.4508207336921914e-05, "loss": 0.2293, "step": 2347 }, { "epoch": 1.1118082272861793, "grad_norm": 0.9700664281845093, "learning_rate": 1.4503641826221424e-05, "loss": 0.2346, "step": 2348 }, { "epoch": 1.1122817401598106, "grad_norm": 1.142829418182373, "learning_rate": 1.449907513763686e-05, "loss": 0.2185, "step": 2349 }, { "epoch": 1.112755253033442, "grad_norm": 1.3726601600646973, "learning_rate": 1.44945072723626e-05, "loss": 0.2247, "step": 2350 }, { "epoch": 1.1132287659070732, "grad_norm": 1.0523629188537598, "learning_rate": 1.4489938231593321e-05, "loss": 0.2263, "step": 2351 }, { "epoch": 1.1137022787807043, "grad_norm": 1.1638647317886353, "learning_rate": 1.4485368016524013e-05, "loss": 0.2094, "step": 2352 }, { "epoch": 1.1141757916543356, "grad_norm": 1.121314525604248, "learning_rate": 1.4480796628349972e-05, "loss": 0.2036, "step": 2353 }, { "epoch": 1.1146493045279668, "grad_norm": 1.4763176441192627, "learning_rate": 1.4476224068266798e-05, "loss": 0.2385, "step": 2354 }, { "epoch": 1.1151228174015981, "grad_norm": 1.7545726299285889, "learning_rate": 1.4471650337470402e-05, "loss": 0.2234, "step": 2355 }, { "epoch": 1.1155963302752294, "grad_norm": 1.8628343343734741, "learning_rate": 1.4467075437156998e-05, "loss": 0.2142, "step": 2356 }, { "epoch": 1.1160698431488607, "grad_norm": 1.5996010303497314, "learning_rate": 1.446249936852311e-05, "loss": 0.2413, "step": 2357 }, { "epoch": 1.1165433560224918, "grad_norm": 1.7511857748031616, "learning_rate": 1.4457922132765563e-05, "loss": 0.2044, "step": 2358 }, { "epoch": 1.117016868896123, "grad_norm": 0.8672329783439636, "learning_rate": 1.4453343731081488e-05, "loss": 0.2006, "step": 2359 }, { "epoch": 1.1174903817697543, "grad_norm": 1.460821509361267, "learning_rate": 1.444876416466832e-05, "loss": 0.2255, "step": 2360 }, { "epoch": 1.1179638946433856, "grad_norm": 1.3242175579071045, "learning_rate": 1.4444183434723807e-05, "loss": 0.2342, "step": 2361 }, { "epoch": 1.118437407517017, "grad_norm": 1.3085192441940308, "learning_rate": 1.4439601542445987e-05, "loss": 0.2688, "step": 2362 }, { "epoch": 1.1189109203906482, "grad_norm": 1.3582148551940918, "learning_rate": 1.4435018489033214e-05, "loss": 0.2452, "step": 2363 }, { "epoch": 1.1193844332642793, "grad_norm": 1.1882847547531128, "learning_rate": 1.443043427568414e-05, "loss": 0.2238, "step": 2364 }, { "epoch": 1.1198579461379106, "grad_norm": 1.574207067489624, "learning_rate": 1.4425848903597724e-05, "loss": 0.2382, "step": 2365 }, { "epoch": 1.1203314590115419, "grad_norm": 1.0986261367797852, "learning_rate": 1.4421262373973223e-05, "loss": 0.2102, "step": 2366 }, { "epoch": 1.1208049718851731, "grad_norm": 1.3533656597137451, "learning_rate": 1.4416674688010202e-05, "loss": 0.2158, "step": 2367 }, { "epoch": 1.1212784847588044, "grad_norm": 1.6296641826629639, "learning_rate": 1.4412085846908526e-05, "loss": 0.2101, "step": 2368 }, { "epoch": 1.1217519976324357, "grad_norm": 1.0967878103256226, "learning_rate": 1.4407495851868359e-05, "loss": 0.2318, "step": 2369 }, { "epoch": 1.1222255105060668, "grad_norm": 1.2239967584609985, "learning_rate": 1.440290470409017e-05, "loss": 0.2045, "step": 2370 }, { "epoch": 1.122699023379698, "grad_norm": 1.5295300483703613, "learning_rate": 1.4398312404774735e-05, "loss": 0.2554, "step": 2371 }, { "epoch": 1.1231725362533294, "grad_norm": 1.227840542793274, "learning_rate": 1.439371895512312e-05, "loss": 0.2367, "step": 2372 }, { "epoch": 1.1236460491269606, "grad_norm": 1.9119755029678345, "learning_rate": 1.4389124356336696e-05, "loss": 0.2372, "step": 2373 }, { "epoch": 1.124119562000592, "grad_norm": 1.034201979637146, "learning_rate": 1.4384528609617143e-05, "loss": 0.199, "step": 2374 }, { "epoch": 1.1245930748742232, "grad_norm": 1.1491769552230835, "learning_rate": 1.4379931716166429e-05, "loss": 0.2293, "step": 2375 }, { "epoch": 1.1250665877478543, "grad_norm": 1.0914734601974487, "learning_rate": 1.4375333677186829e-05, "loss": 0.2179, "step": 2376 }, { "epoch": 1.1255401006214856, "grad_norm": 2.126800775527954, "learning_rate": 1.4370734493880916e-05, "loss": 0.2166, "step": 2377 }, { "epoch": 1.1260136134951169, "grad_norm": 2.5790648460388184, "learning_rate": 1.4366134167451562e-05, "loss": 0.256, "step": 2378 }, { "epoch": 1.1264871263687481, "grad_norm": 1.1876254081726074, "learning_rate": 1.4361532699101935e-05, "loss": 0.223, "step": 2379 }, { "epoch": 1.1269606392423794, "grad_norm": 1.856112003326416, "learning_rate": 1.4356930090035508e-05, "loss": 0.2211, "step": 2380 }, { "epoch": 1.1274341521160107, "grad_norm": 1.3353132009506226, "learning_rate": 1.4352326341456048e-05, "loss": 0.2256, "step": 2381 }, { "epoch": 1.127907664989642, "grad_norm": 1.3428007364273071, "learning_rate": 1.4347721454567623e-05, "loss": 0.2296, "step": 2382 }, { "epoch": 1.128381177863273, "grad_norm": 1.5457638502120972, "learning_rate": 1.4343115430574592e-05, "loss": 0.2188, "step": 2383 }, { "epoch": 1.1288546907369044, "grad_norm": 1.9435369968414307, "learning_rate": 1.433850827068162e-05, "loss": 0.2184, "step": 2384 }, { "epoch": 1.1293282036105357, "grad_norm": 1.060576319694519, "learning_rate": 1.4333899976093664e-05, "loss": 0.2332, "step": 2385 }, { "epoch": 1.129801716484167, "grad_norm": 2.13041090965271, "learning_rate": 1.4329290548015981e-05, "loss": 0.231, "step": 2386 }, { "epoch": 1.1302752293577982, "grad_norm": 1.1683634519577026, "learning_rate": 1.4324679987654118e-05, "loss": 0.211, "step": 2387 }, { "epoch": 1.1307487422314293, "grad_norm": 1.2120347023010254, "learning_rate": 1.4320068296213926e-05, "loss": 0.2094, "step": 2388 }, { "epoch": 1.1312222551050606, "grad_norm": 1.454931616783142, "learning_rate": 1.4315455474901547e-05, "loss": 0.2472, "step": 2389 }, { "epoch": 1.1316957679786919, "grad_norm": 1.3851542472839355, "learning_rate": 1.4310841524923421e-05, "loss": 0.2272, "step": 2390 }, { "epoch": 1.1321692808523232, "grad_norm": 1.0476808547973633, "learning_rate": 1.4306226447486283e-05, "loss": 0.2176, "step": 2391 }, { "epoch": 1.1326427937259544, "grad_norm": 1.5204083919525146, "learning_rate": 1.4301610243797158e-05, "loss": 0.2392, "step": 2392 }, { "epoch": 1.1331163065995857, "grad_norm": 1.792072057723999, "learning_rate": 1.4296992915063373e-05, "loss": 0.2097, "step": 2393 }, { "epoch": 1.133589819473217, "grad_norm": 1.989022135734558, "learning_rate": 1.4292374462492547e-05, "loss": 0.2483, "step": 2394 }, { "epoch": 1.134063332346848, "grad_norm": 1.7426658868789673, "learning_rate": 1.4287754887292589e-05, "loss": 0.2133, "step": 2395 }, { "epoch": 1.1345368452204794, "grad_norm": 1.5938878059387207, "learning_rate": 1.42831341906717e-05, "loss": 0.2123, "step": 2396 }, { "epoch": 1.1350103580941107, "grad_norm": 1.6211594343185425, "learning_rate": 1.4278512373838386e-05, "loss": 0.2364, "step": 2397 }, { "epoch": 1.135483870967742, "grad_norm": 0.9879429936408997, "learning_rate": 1.4273889438001439e-05, "loss": 0.2109, "step": 2398 }, { "epoch": 1.1359573838413732, "grad_norm": 1.1694254875183105, "learning_rate": 1.426926538436994e-05, "loss": 0.2156, "step": 2399 }, { "epoch": 1.1364308967150045, "grad_norm": 1.0623806715011597, "learning_rate": 1.4264640214153263e-05, "loss": 0.1991, "step": 2400 }, { "epoch": 1.1369044095886358, "grad_norm": 1.6476950645446777, "learning_rate": 1.4260013928561081e-05, "loss": 0.242, "step": 2401 }, { "epoch": 1.1373779224622669, "grad_norm": 1.9095466136932373, "learning_rate": 1.4255386528803354e-05, "loss": 0.2194, "step": 2402 }, { "epoch": 1.1378514353358982, "grad_norm": 1.2309021949768066, "learning_rate": 1.4250758016090335e-05, "loss": 0.2141, "step": 2403 }, { "epoch": 1.1383249482095295, "grad_norm": 1.0731322765350342, "learning_rate": 1.4246128391632562e-05, "loss": 0.2075, "step": 2404 }, { "epoch": 1.1387984610831607, "grad_norm": 1.4842568635940552, "learning_rate": 1.4241497656640872e-05, "loss": 0.2145, "step": 2405 }, { "epoch": 1.139271973956792, "grad_norm": 1.6689938306808472, "learning_rate": 1.4236865812326386e-05, "loss": 0.2281, "step": 2406 }, { "epoch": 1.139745486830423, "grad_norm": 1.186214566230774, "learning_rate": 1.4232232859900523e-05, "loss": 0.2296, "step": 2407 }, { "epoch": 1.1402189997040544, "grad_norm": 2.036839246749878, "learning_rate": 1.4227598800574984e-05, "loss": 0.1975, "step": 2408 }, { "epoch": 1.1406925125776857, "grad_norm": 1.4921070337295532, "learning_rate": 1.4222963635561761e-05, "loss": 0.2342, "step": 2409 }, { "epoch": 1.141166025451317, "grad_norm": 1.0753675699234009, "learning_rate": 1.4218327366073142e-05, "loss": 0.2251, "step": 2410 }, { "epoch": 1.1416395383249482, "grad_norm": 1.8981343507766724, "learning_rate": 1.4213689993321693e-05, "loss": 0.2247, "step": 2411 }, { "epoch": 1.1421130511985795, "grad_norm": 1.5621330738067627, "learning_rate": 1.4209051518520279e-05, "loss": 0.2106, "step": 2412 }, { "epoch": 1.1425865640722108, "grad_norm": 1.145002841949463, "learning_rate": 1.4204411942882046e-05, "loss": 0.2496, "step": 2413 }, { "epoch": 1.1430600769458419, "grad_norm": 1.893953561782837, "learning_rate": 1.4199771267620429e-05, "loss": 0.223, "step": 2414 }, { "epoch": 1.1435335898194732, "grad_norm": 2.6021175384521484, "learning_rate": 1.4195129493949152e-05, "loss": 0.2389, "step": 2415 }, { "epoch": 1.1440071026931045, "grad_norm": 1.2552067041397095, "learning_rate": 1.4190486623082224e-05, "loss": 0.2072, "step": 2416 }, { "epoch": 1.1444806155667357, "grad_norm": 1.4053682088851929, "learning_rate": 1.418584265623395e-05, "loss": 0.2115, "step": 2417 }, { "epoch": 1.144954128440367, "grad_norm": 1.3213428258895874, "learning_rate": 1.418119759461891e-05, "loss": 0.2304, "step": 2418 }, { "epoch": 1.145427641313998, "grad_norm": 1.09917151927948, "learning_rate": 1.4176551439451976e-05, "loss": 0.2189, "step": 2419 }, { "epoch": 1.1459011541876294, "grad_norm": 3.225097417831421, "learning_rate": 1.4171904191948306e-05, "loss": 0.215, "step": 2420 }, { "epoch": 1.1463746670612607, "grad_norm": 1.6562659740447998, "learning_rate": 1.416725585332334e-05, "loss": 0.2157, "step": 2421 }, { "epoch": 1.146848179934892, "grad_norm": 1.3881036043167114, "learning_rate": 1.4162606424792809e-05, "loss": 0.2283, "step": 2422 }, { "epoch": 1.1473216928085233, "grad_norm": 1.0177631378173828, "learning_rate": 1.4157955907572722e-05, "loss": 0.1981, "step": 2423 }, { "epoch": 1.1477952056821545, "grad_norm": 1.4494411945343018, "learning_rate": 1.4153304302879383e-05, "loss": 0.2367, "step": 2424 }, { "epoch": 1.1482687185557858, "grad_norm": 1.3473752737045288, "learning_rate": 1.4148651611929371e-05, "loss": 0.236, "step": 2425 }, { "epoch": 1.148742231429417, "grad_norm": 1.4328070878982544, "learning_rate": 1.4143997835939552e-05, "loss": 0.2182, "step": 2426 }, { "epoch": 1.1492157443030482, "grad_norm": 1.770027995109558, "learning_rate": 1.4139342976127077e-05, "loss": 0.2246, "step": 2427 }, { "epoch": 1.1496892571766795, "grad_norm": 1.1986833810806274, "learning_rate": 1.413468703370938e-05, "loss": 0.2309, "step": 2428 }, { "epoch": 1.1501627700503108, "grad_norm": 1.74411940574646, "learning_rate": 1.4130030009904174e-05, "loss": 0.2027, "step": 2429 }, { "epoch": 1.150636282923942, "grad_norm": 1.3250808715820312, "learning_rate": 1.4125371905929468e-05, "loss": 0.2219, "step": 2430 }, { "epoch": 1.1511097957975733, "grad_norm": 1.336143136024475, "learning_rate": 1.4120712723003535e-05, "loss": 0.2279, "step": 2431 }, { "epoch": 1.1515833086712046, "grad_norm": 1.1617181301116943, "learning_rate": 1.4116052462344942e-05, "loss": 0.2143, "step": 2432 }, { "epoch": 1.1520568215448357, "grad_norm": 1.7668710947036743, "learning_rate": 1.4111391125172537e-05, "loss": 0.2268, "step": 2433 }, { "epoch": 1.152530334418467, "grad_norm": 1.455324649810791, "learning_rate": 1.4106728712705446e-05, "loss": 0.2289, "step": 2434 }, { "epoch": 1.1530038472920983, "grad_norm": 1.4046434164047241, "learning_rate": 1.4102065226163078e-05, "loss": 0.2297, "step": 2435 }, { "epoch": 1.1534773601657295, "grad_norm": 1.2692493200302124, "learning_rate": 1.4097400666765122e-05, "loss": 0.2339, "step": 2436 }, { "epoch": 1.1539508730393608, "grad_norm": 1.7210558652877808, "learning_rate": 1.4092735035731553e-05, "loss": 0.2129, "step": 2437 }, { "epoch": 1.154424385912992, "grad_norm": 1.2296080589294434, "learning_rate": 1.4088068334282617e-05, "loss": 0.2163, "step": 2438 }, { "epoch": 1.1548978987866232, "grad_norm": 1.0044761896133423, "learning_rate": 1.4083400563638847e-05, "loss": 0.2193, "step": 2439 }, { "epoch": 1.1553714116602545, "grad_norm": 2.0132339000701904, "learning_rate": 1.407873172502105e-05, "loss": 0.2281, "step": 2440 }, { "epoch": 1.1558449245338858, "grad_norm": 2.1369824409484863, "learning_rate": 1.4074061819650322e-05, "loss": 0.2122, "step": 2441 }, { "epoch": 1.156318437407517, "grad_norm": 1.4052802324295044, "learning_rate": 1.4069390848748029e-05, "loss": 0.2225, "step": 2442 }, { "epoch": 1.1567919502811483, "grad_norm": 1.6282340288162231, "learning_rate": 1.4064718813535817e-05, "loss": 0.2247, "step": 2443 }, { "epoch": 1.1572654631547796, "grad_norm": 1.5721560716629028, "learning_rate": 1.4060045715235609e-05, "loss": 0.2201, "step": 2444 }, { "epoch": 1.1577389760284107, "grad_norm": 1.0424257516860962, "learning_rate": 1.4055371555069615e-05, "loss": 0.2363, "step": 2445 }, { "epoch": 1.158212488902042, "grad_norm": 1.8262851238250732, "learning_rate": 1.4050696334260312e-05, "loss": 0.2222, "step": 2446 }, { "epoch": 1.1586860017756733, "grad_norm": 2.008397340774536, "learning_rate": 1.4046020054030465e-05, "loss": 0.242, "step": 2447 }, { "epoch": 1.1591595146493046, "grad_norm": 3.8094611167907715, "learning_rate": 1.40413427156031e-05, "loss": 0.2242, "step": 2448 }, { "epoch": 1.1596330275229358, "grad_norm": 1.1817164421081543, "learning_rate": 1.4036664320201538e-05, "loss": 0.2312, "step": 2449 }, { "epoch": 1.160106540396567, "grad_norm": 1.771918535232544, "learning_rate": 1.4031984869049366e-05, "loss": 0.209, "step": 2450 }, { "epoch": 1.1605800532701982, "grad_norm": 1.6216905117034912, "learning_rate": 1.4027304363370446e-05, "loss": 0.2296, "step": 2451 }, { "epoch": 1.1610535661438295, "grad_norm": 1.4540348052978516, "learning_rate": 1.4022622804388923e-05, "loss": 0.2135, "step": 2452 }, { "epoch": 1.1615270790174608, "grad_norm": 1.7169173955917358, "learning_rate": 1.4017940193329213e-05, "loss": 0.2417, "step": 2453 }, { "epoch": 1.162000591891092, "grad_norm": 1.2987480163574219, "learning_rate": 1.401325653141601e-05, "loss": 0.2275, "step": 2454 }, { "epoch": 1.1624741047647233, "grad_norm": 1.303578495979309, "learning_rate": 1.4008571819874273e-05, "loss": 0.2323, "step": 2455 }, { "epoch": 1.1629476176383546, "grad_norm": 1.2520272731781006, "learning_rate": 1.4003886059929248e-05, "loss": 0.2025, "step": 2456 }, { "epoch": 1.1634211305119857, "grad_norm": 2.3889143466949463, "learning_rate": 1.3999199252806452e-05, "loss": 0.22, "step": 2457 }, { "epoch": 1.163894643385617, "grad_norm": 1.60451340675354, "learning_rate": 1.3994511399731675e-05, "loss": 0.2413, "step": 2458 }, { "epoch": 1.1643681562592483, "grad_norm": 1.227243185043335, "learning_rate": 1.3989822501930972e-05, "loss": 0.1986, "step": 2459 }, { "epoch": 1.1648416691328796, "grad_norm": 1.0574538707733154, "learning_rate": 1.3985132560630688e-05, "loss": 0.2294, "step": 2460 }, { "epoch": 1.1653151820065109, "grad_norm": 1.0879645347595215, "learning_rate": 1.3980441577057426e-05, "loss": 0.2376, "step": 2461 }, { "epoch": 1.1657886948801421, "grad_norm": 2.3020236492156982, "learning_rate": 1.397574955243807e-05, "loss": 0.2257, "step": 2462 }, { "epoch": 1.1662622077537734, "grad_norm": 1.1904938220977783, "learning_rate": 1.3971056487999773e-05, "loss": 0.2329, "step": 2463 }, { "epoch": 1.1667357206274045, "grad_norm": 2.3944849967956543, "learning_rate": 1.3966362384969963e-05, "loss": 0.2004, "step": 2464 }, { "epoch": 1.1672092335010358, "grad_norm": 1.1166476011276245, "learning_rate": 1.3961667244576335e-05, "loss": 0.2009, "step": 2465 }, { "epoch": 1.167682746374667, "grad_norm": 1.1066415309906006, "learning_rate": 1.3956971068046855e-05, "loss": 0.2289, "step": 2466 }, { "epoch": 1.1681562592482984, "grad_norm": 1.3403149843215942, "learning_rate": 1.3952273856609767e-05, "loss": 0.2034, "step": 2467 }, { "epoch": 1.1686297721219296, "grad_norm": 1.3216232061386108, "learning_rate": 1.3947575611493583e-05, "loss": 0.2223, "step": 2468 }, { "epoch": 1.1691032849955607, "grad_norm": 2.135986328125, "learning_rate": 1.3942876333927077e-05, "loss": 0.2079, "step": 2469 }, { "epoch": 1.169576797869192, "grad_norm": 1.8347281217575073, "learning_rate": 1.3938176025139305e-05, "loss": 0.1986, "step": 2470 }, { "epoch": 1.1700503107428233, "grad_norm": 1.1069951057434082, "learning_rate": 1.3933474686359588e-05, "loss": 0.2261, "step": 2471 }, { "epoch": 1.1705238236164546, "grad_norm": 1.534125804901123, "learning_rate": 1.3928772318817509e-05, "loss": 0.23, "step": 2472 }, { "epoch": 1.1709973364900859, "grad_norm": 0.9962480664253235, "learning_rate": 1.3924068923742935e-05, "loss": 0.2145, "step": 2473 }, { "epoch": 1.1714708493637171, "grad_norm": 1.4505356550216675, "learning_rate": 1.391936450236599e-05, "loss": 0.2107, "step": 2474 }, { "epoch": 1.1719443622373484, "grad_norm": 1.0452574491500854, "learning_rate": 1.3914659055917068e-05, "loss": 0.2004, "step": 2475 }, { "epoch": 1.1724178751109795, "grad_norm": 1.055141568183899, "learning_rate": 1.3909952585626836e-05, "loss": 0.2231, "step": 2476 }, { "epoch": 1.1728913879846108, "grad_norm": 1.314717411994934, "learning_rate": 1.3905245092726225e-05, "loss": 0.2346, "step": 2477 }, { "epoch": 1.173364900858242, "grad_norm": 1.5005841255187988, "learning_rate": 1.3900536578446437e-05, "loss": 0.2334, "step": 2478 }, { "epoch": 1.1738384137318734, "grad_norm": 1.6286835670471191, "learning_rate": 1.3895827044018934e-05, "loss": 0.2517, "step": 2479 }, { "epoch": 1.1743119266055047, "grad_norm": 1.1182934045791626, "learning_rate": 1.3891116490675451e-05, "loss": 0.2388, "step": 2480 }, { "epoch": 1.1747854394791357, "grad_norm": 1.1708893775939941, "learning_rate": 1.3886404919647988e-05, "loss": 0.2131, "step": 2481 }, { "epoch": 1.175258952352767, "grad_norm": 1.1157783269882202, "learning_rate": 1.3881692332168815e-05, "loss": 0.2378, "step": 2482 }, { "epoch": 1.1757324652263983, "grad_norm": 1.3915766477584839, "learning_rate": 1.3876978729470459e-05, "loss": 0.2189, "step": 2483 }, { "epoch": 1.1762059781000296, "grad_norm": 1.132906198501587, "learning_rate": 1.3872264112785714e-05, "loss": 0.2082, "step": 2484 }, { "epoch": 1.1766794909736609, "grad_norm": 1.12389075756073, "learning_rate": 1.386754848334765e-05, "loss": 0.1907, "step": 2485 }, { "epoch": 1.1771530038472922, "grad_norm": 1.498472809791565, "learning_rate": 1.3862831842389591e-05, "loss": 0.2291, "step": 2486 }, { "epoch": 1.1776265167209234, "grad_norm": 1.148629903793335, "learning_rate": 1.3858114191145126e-05, "loss": 0.2093, "step": 2487 }, { "epoch": 1.1781000295945545, "grad_norm": 1.5816822052001953, "learning_rate": 1.3853395530848114e-05, "loss": 0.225, "step": 2488 }, { "epoch": 1.1785735424681858, "grad_norm": 1.4523855447769165, "learning_rate": 1.384867586273268e-05, "loss": 0.2078, "step": 2489 }, { "epoch": 1.179047055341817, "grad_norm": 1.0670571327209473, "learning_rate": 1.38439551880332e-05, "loss": 0.2214, "step": 2490 }, { "epoch": 1.1795205682154484, "grad_norm": 1.1799019575119019, "learning_rate": 1.3839233507984324e-05, "loss": 0.2186, "step": 2491 }, { "epoch": 1.1799940810890797, "grad_norm": 1.5703297853469849, "learning_rate": 1.3834510823820963e-05, "loss": 0.2402, "step": 2492 }, { "epoch": 1.180467593962711, "grad_norm": 1.9375343322753906, "learning_rate": 1.3829787136778288e-05, "loss": 0.2241, "step": 2493 }, { "epoch": 1.1809411068363422, "grad_norm": 1.1045565605163574, "learning_rate": 1.382506244809173e-05, "loss": 0.2247, "step": 2494 }, { "epoch": 1.1814146197099733, "grad_norm": 1.2954577207565308, "learning_rate": 1.3820336758996994e-05, "loss": 0.2266, "step": 2495 }, { "epoch": 1.1818881325836046, "grad_norm": 1.3069260120391846, "learning_rate": 1.3815610070730032e-05, "loss": 0.2192, "step": 2496 }, { "epoch": 1.1823616454572359, "grad_norm": 0.9491212368011475, "learning_rate": 1.3810882384527067e-05, "loss": 0.2241, "step": 2497 }, { "epoch": 1.1828351583308672, "grad_norm": 1.5114809274673462, "learning_rate": 1.3806153701624578e-05, "loss": 0.2567, "step": 2498 }, { "epoch": 1.1833086712044985, "grad_norm": 1.5127360820770264, "learning_rate": 1.3801424023259308e-05, "loss": 0.2152, "step": 2499 }, { "epoch": 1.1837821840781295, "grad_norm": 1.0083001852035522, "learning_rate": 1.3796693350668258e-05, "loss": 0.2371, "step": 2500 }, { "epoch": 1.1842556969517608, "grad_norm": 1.140708088874817, "learning_rate": 1.3791961685088689e-05, "loss": 0.231, "step": 2501 }, { "epoch": 1.184729209825392, "grad_norm": 1.298230528831482, "learning_rate": 1.3787229027758122e-05, "loss": 0.2102, "step": 2502 }, { "epoch": 1.1852027226990234, "grad_norm": 1.1305428743362427, "learning_rate": 1.378249537991434e-05, "loss": 0.2321, "step": 2503 }, { "epoch": 1.1856762355726547, "grad_norm": 1.2196705341339111, "learning_rate": 1.3777760742795381e-05, "loss": 0.2243, "step": 2504 }, { "epoch": 1.186149748446286, "grad_norm": 1.1564288139343262, "learning_rate": 1.3773025117639543e-05, "loss": 0.2223, "step": 2505 }, { "epoch": 1.1866232613199172, "grad_norm": 1.8472498655319214, "learning_rate": 1.3768288505685385e-05, "loss": 0.199, "step": 2506 }, { "epoch": 1.1870967741935483, "grad_norm": 1.086516261100769, "learning_rate": 1.3763550908171724e-05, "loss": 0.2352, "step": 2507 }, { "epoch": 1.1875702870671796, "grad_norm": 1.063267707824707, "learning_rate": 1.375881232633763e-05, "loss": 0.2142, "step": 2508 }, { "epoch": 1.1880437999408109, "grad_norm": 1.4095113277435303, "learning_rate": 1.3754072761422434e-05, "loss": 0.192, "step": 2509 }, { "epoch": 1.1885173128144422, "grad_norm": 1.2273898124694824, "learning_rate": 1.3749332214665723e-05, "loss": 0.237, "step": 2510 }, { "epoch": 1.1889908256880735, "grad_norm": 1.414565920829773, "learning_rate": 1.374459068730734e-05, "loss": 0.2165, "step": 2511 }, { "epoch": 1.1894643385617045, "grad_norm": 1.405671238899231, "learning_rate": 1.373984818058739e-05, "loss": 0.2649, "step": 2512 }, { "epoch": 1.1899378514353358, "grad_norm": 1.4026716947555542, "learning_rate": 1.3735104695746225e-05, "loss": 0.2252, "step": 2513 }, { "epoch": 1.190411364308967, "grad_norm": 1.2324182987213135, "learning_rate": 1.373036023402446e-05, "loss": 0.2129, "step": 2514 }, { "epoch": 1.1908848771825984, "grad_norm": 1.2538621425628662, "learning_rate": 1.3725614796662962e-05, "loss": 0.2241, "step": 2515 }, { "epoch": 1.1913583900562297, "grad_norm": 1.2133036851882935, "learning_rate": 1.372086838490286e-05, "loss": 0.2213, "step": 2516 }, { "epoch": 1.191831902929861, "grad_norm": 1.0370423793792725, "learning_rate": 1.3716120999985527e-05, "loss": 0.216, "step": 2517 }, { "epoch": 1.1923054158034923, "grad_norm": 1.403160810470581, "learning_rate": 1.3711372643152597e-05, "loss": 0.2511, "step": 2518 }, { "epoch": 1.1927789286771233, "grad_norm": 1.4306610822677612, "learning_rate": 1.3706623315645953e-05, "loss": 0.2194, "step": 2519 }, { "epoch": 1.1932524415507546, "grad_norm": 0.9785909056663513, "learning_rate": 1.370187301870774e-05, "loss": 0.2138, "step": 2520 }, { "epoch": 1.193725954424386, "grad_norm": 1.315924882888794, "learning_rate": 1.3697121753580353e-05, "loss": 0.2212, "step": 2521 }, { "epoch": 1.1941994672980172, "grad_norm": 1.1268949508666992, "learning_rate": 1.3692369521506437e-05, "loss": 0.221, "step": 2522 }, { "epoch": 1.1946729801716485, "grad_norm": 2.0519661903381348, "learning_rate": 1.3687616323728894e-05, "loss": 0.2417, "step": 2523 }, { "epoch": 1.1951464930452798, "grad_norm": 1.7941192388534546, "learning_rate": 1.3682862161490877e-05, "loss": 0.2387, "step": 2524 }, { "epoch": 1.195620005918911, "grad_norm": 1.0652010440826416, "learning_rate": 1.367810703603579e-05, "loss": 0.2142, "step": 2525 }, { "epoch": 1.196093518792542, "grad_norm": 1.2762157917022705, "learning_rate": 1.3673350948607294e-05, "loss": 0.2268, "step": 2526 }, { "epoch": 1.1965670316661734, "grad_norm": 1.4592194557189941, "learning_rate": 1.3668593900449292e-05, "loss": 0.2157, "step": 2527 }, { "epoch": 1.1970405445398047, "grad_norm": 1.3134995698928833, "learning_rate": 1.3663835892805947e-05, "loss": 0.2369, "step": 2528 }, { "epoch": 1.197514057413436, "grad_norm": 2.157960891723633, "learning_rate": 1.365907692692167e-05, "loss": 0.1831, "step": 2529 }, { "epoch": 1.1979875702870673, "grad_norm": 1.0059032440185547, "learning_rate": 1.3654317004041122e-05, "loss": 0.2266, "step": 2530 }, { "epoch": 1.1984610831606983, "grad_norm": 1.2563735246658325, "learning_rate": 1.3649556125409218e-05, "loss": 0.2189, "step": 2531 }, { "epoch": 1.1989345960343296, "grad_norm": 0.9727022051811218, "learning_rate": 1.3644794292271116e-05, "loss": 0.2255, "step": 2532 }, { "epoch": 1.199408108907961, "grad_norm": 0.8149449825286865, "learning_rate": 1.3640031505872225e-05, "loss": 0.2023, "step": 2533 }, { "epoch": 1.1998816217815922, "grad_norm": 1.0001932382583618, "learning_rate": 1.3635267767458213e-05, "loss": 0.2004, "step": 2534 }, { "epoch": 1.2003551346552235, "grad_norm": 1.593064546585083, "learning_rate": 1.3630503078274988e-05, "loss": 0.2215, "step": 2535 }, { "epoch": 1.2008286475288548, "grad_norm": 1.3267443180084229, "learning_rate": 1.3625737439568705e-05, "loss": 0.2162, "step": 2536 }, { "epoch": 1.201302160402486, "grad_norm": 1.1851780414581299, "learning_rate": 1.3620970852585774e-05, "loss": 0.2121, "step": 2537 }, { "epoch": 1.2017756732761171, "grad_norm": 0.9846872687339783, "learning_rate": 1.3616203318572847e-05, "loss": 0.2268, "step": 2538 }, { "epoch": 1.2022491861497484, "grad_norm": 1.268949031829834, "learning_rate": 1.3611434838776827e-05, "loss": 0.2392, "step": 2539 }, { "epoch": 1.2027226990233797, "grad_norm": 1.274588704109192, "learning_rate": 1.3606665414444868e-05, "loss": 0.2315, "step": 2540 }, { "epoch": 1.203196211897011, "grad_norm": 2.332651376724243, "learning_rate": 1.3601895046824363e-05, "loss": 0.2311, "step": 2541 }, { "epoch": 1.2036697247706423, "grad_norm": 2.8526484966278076, "learning_rate": 1.3597123737162955e-05, "loss": 0.2059, "step": 2542 }, { "epoch": 1.2041432376442733, "grad_norm": 1.6713618040084839, "learning_rate": 1.359235148670854e-05, "loss": 0.2153, "step": 2543 }, { "epoch": 1.2046167505179046, "grad_norm": 1.6479053497314453, "learning_rate": 1.3587578296709248e-05, "loss": 0.2273, "step": 2544 }, { "epoch": 1.205090263391536, "grad_norm": 1.376947283744812, "learning_rate": 1.358280416841346e-05, "loss": 0.2047, "step": 2545 }, { "epoch": 1.2055637762651672, "grad_norm": 1.5050996541976929, "learning_rate": 1.3578029103069805e-05, "loss": 0.211, "step": 2546 }, { "epoch": 1.2060372891387985, "grad_norm": 1.4506083726882935, "learning_rate": 1.357325310192716e-05, "loss": 0.2179, "step": 2547 }, { "epoch": 1.2065108020124298, "grad_norm": 1.4101526737213135, "learning_rate": 1.3568476166234634e-05, "loss": 0.2305, "step": 2548 }, { "epoch": 1.206984314886061, "grad_norm": 2.1561319828033447, "learning_rate": 1.3563698297241596e-05, "loss": 0.2227, "step": 2549 }, { "epoch": 1.2074578277596921, "grad_norm": 1.0047372579574585, "learning_rate": 1.3558919496197645e-05, "loss": 0.2211, "step": 2550 }, { "epoch": 1.2079313406333234, "grad_norm": 1.103215217590332, "learning_rate": 1.3554139764352634e-05, "loss": 0.2091, "step": 2551 }, { "epoch": 1.2084048535069547, "grad_norm": 2.47819185256958, "learning_rate": 1.3549359102956655e-05, "loss": 0.231, "step": 2552 }, { "epoch": 1.208878366380586, "grad_norm": 0.9107365608215332, "learning_rate": 1.3544577513260046e-05, "loss": 0.2048, "step": 2553 }, { "epoch": 1.2093518792542173, "grad_norm": 1.520993947982788, "learning_rate": 1.353979499651338e-05, "loss": 0.2213, "step": 2554 }, { "epoch": 1.2098253921278486, "grad_norm": 0.9904034733772278, "learning_rate": 1.3535011553967486e-05, "loss": 0.2281, "step": 2555 }, { "epoch": 1.2102989050014799, "grad_norm": 1.3577964305877686, "learning_rate": 1.3530227186873419e-05, "loss": 0.2299, "step": 2556 }, { "epoch": 1.210772417875111, "grad_norm": 1.2297004461288452, "learning_rate": 1.352544189648249e-05, "loss": 0.1907, "step": 2557 }, { "epoch": 1.2112459307487422, "grad_norm": 1.2518497705459595, "learning_rate": 1.3520655684046242e-05, "loss": 0.2217, "step": 2558 }, { "epoch": 1.2117194436223735, "grad_norm": 1.2009005546569824, "learning_rate": 1.3515868550816467e-05, "loss": 0.1946, "step": 2559 }, { "epoch": 1.2121929564960048, "grad_norm": 1.5080748796463013, "learning_rate": 1.3511080498045189e-05, "loss": 0.2365, "step": 2560 }, { "epoch": 1.212666469369636, "grad_norm": 1.2145692110061646, "learning_rate": 1.3506291526984679e-05, "loss": 0.2444, "step": 2561 }, { "epoch": 1.2131399822432671, "grad_norm": 1.701706886291504, "learning_rate": 1.3501501638887447e-05, "loss": 0.2327, "step": 2562 }, { "epoch": 1.2136134951168984, "grad_norm": 3.05845308303833, "learning_rate": 1.349671083500624e-05, "loss": 0.2352, "step": 2563 }, { "epoch": 1.2140870079905297, "grad_norm": 1.9371395111083984, "learning_rate": 1.3491919116594045e-05, "loss": 0.2098, "step": 2564 }, { "epoch": 1.214560520864161, "grad_norm": 1.3178801536560059, "learning_rate": 1.3487126484904093e-05, "loss": 0.2176, "step": 2565 }, { "epoch": 1.2150340337377923, "grad_norm": 1.2978839874267578, "learning_rate": 1.3482332941189854e-05, "loss": 0.2286, "step": 2566 }, { "epoch": 1.2155075466114236, "grad_norm": 1.7363958358764648, "learning_rate": 1.3477538486705025e-05, "loss": 0.2034, "step": 2567 }, { "epoch": 1.2159810594850549, "grad_norm": 2.09183669090271, "learning_rate": 1.3472743122703552e-05, "loss": 0.2326, "step": 2568 }, { "epoch": 1.216454572358686, "grad_norm": 0.9742308855056763, "learning_rate": 1.3467946850439622e-05, "loss": 0.1946, "step": 2569 }, { "epoch": 1.2169280852323172, "grad_norm": 1.709175705909729, "learning_rate": 1.3463149671167646e-05, "loss": 0.2307, "step": 2570 }, { "epoch": 1.2174015981059485, "grad_norm": 1.1259140968322754, "learning_rate": 1.3458351586142284e-05, "loss": 0.2385, "step": 2571 }, { "epoch": 1.2178751109795798, "grad_norm": 1.0130417346954346, "learning_rate": 1.3453552596618427e-05, "loss": 0.2264, "step": 2572 }, { "epoch": 1.218348623853211, "grad_norm": 1.774648904800415, "learning_rate": 1.3448752703851207e-05, "loss": 0.2396, "step": 2573 }, { "epoch": 1.2188221367268421, "grad_norm": 2.2298903465270996, "learning_rate": 1.3443951909095984e-05, "loss": 0.2075, "step": 2574 }, { "epoch": 1.2192956496004734, "grad_norm": 1.3856366872787476, "learning_rate": 1.3439150213608367e-05, "loss": 0.2104, "step": 2575 }, { "epoch": 1.2197691624741047, "grad_norm": 1.630552887916565, "learning_rate": 1.343434761864419e-05, "loss": 0.2092, "step": 2576 }, { "epoch": 1.220242675347736, "grad_norm": 1.0852291584014893, "learning_rate": 1.3429544125459524e-05, "loss": 0.2098, "step": 2577 }, { "epoch": 1.2207161882213673, "grad_norm": 1.034583568572998, "learning_rate": 1.342473973531068e-05, "loss": 0.2085, "step": 2578 }, { "epoch": 1.2211897010949986, "grad_norm": 1.044459581375122, "learning_rate": 1.3419934449454194e-05, "loss": 0.2173, "step": 2579 }, { "epoch": 1.2216632139686299, "grad_norm": 1.3877681493759155, "learning_rate": 1.3415128269146846e-05, "loss": 0.2386, "step": 2580 }, { "epoch": 1.222136726842261, "grad_norm": 1.530917763710022, "learning_rate": 1.3410321195645648e-05, "loss": 0.2139, "step": 2581 }, { "epoch": 1.2226102397158922, "grad_norm": 1.024557113647461, "learning_rate": 1.3405513230207839e-05, "loss": 0.236, "step": 2582 }, { "epoch": 1.2230837525895235, "grad_norm": 1.3001816272735596, "learning_rate": 1.3400704374090898e-05, "loss": 0.2451, "step": 2583 }, { "epoch": 1.2235572654631548, "grad_norm": 1.623937964439392, "learning_rate": 1.3395894628552535e-05, "loss": 0.2361, "step": 2584 }, { "epoch": 1.224030778336786, "grad_norm": 1.2357534170150757, "learning_rate": 1.3391083994850696e-05, "loss": 0.2363, "step": 2585 }, { "epoch": 1.2245042912104174, "grad_norm": 1.3607426881790161, "learning_rate": 1.3386272474243546e-05, "loss": 0.2231, "step": 2586 }, { "epoch": 1.2249778040840484, "grad_norm": 1.3958909511566162, "learning_rate": 1.3381460067989505e-05, "loss": 0.2305, "step": 2587 }, { "epoch": 1.2254513169576797, "grad_norm": 1.3651123046875, "learning_rate": 1.3376646777347202e-05, "loss": 0.233, "step": 2588 }, { "epoch": 1.225924829831311, "grad_norm": 1.5809210538864136, "learning_rate": 1.3371832603575509e-05, "loss": 0.2273, "step": 2589 }, { "epoch": 1.2263983427049423, "grad_norm": 1.3318142890930176, "learning_rate": 1.3367017547933529e-05, "loss": 0.2202, "step": 2590 }, { "epoch": 1.2268718555785736, "grad_norm": 2.5720956325531006, "learning_rate": 1.3362201611680587e-05, "loss": 0.2073, "step": 2591 }, { "epoch": 1.2273453684522049, "grad_norm": 1.4822810888290405, "learning_rate": 1.3357384796076253e-05, "loss": 0.2111, "step": 2592 }, { "epoch": 1.227818881325836, "grad_norm": 1.8587872982025146, "learning_rate": 1.3352567102380315e-05, "loss": 0.2369, "step": 2593 }, { "epoch": 1.2282923941994672, "grad_norm": 0.9261167645454407, "learning_rate": 1.3347748531852791e-05, "loss": 0.2318, "step": 2594 }, { "epoch": 1.2287659070730985, "grad_norm": 1.5869637727737427, "learning_rate": 1.3342929085753939e-05, "loss": 0.2472, "step": 2595 }, { "epoch": 1.2292394199467298, "grad_norm": 0.9304991364479065, "learning_rate": 1.3338108765344233e-05, "loss": 0.2046, "step": 2596 }, { "epoch": 1.229712932820361, "grad_norm": 1.8789327144622803, "learning_rate": 1.3333287571884382e-05, "loss": 0.2467, "step": 2597 }, { "epoch": 1.2301864456939924, "grad_norm": 3.1216373443603516, "learning_rate": 1.3328465506635324e-05, "loss": 0.2007, "step": 2598 }, { "epoch": 1.2306599585676237, "grad_norm": 1.2716069221496582, "learning_rate": 1.3323642570858221e-05, "loss": 0.2083, "step": 2599 }, { "epoch": 1.2311334714412547, "grad_norm": 1.1587867736816406, "learning_rate": 1.331881876581447e-05, "loss": 0.1932, "step": 2600 }, { "epoch": 1.231606984314886, "grad_norm": 1.1460086107254028, "learning_rate": 1.3313994092765688e-05, "loss": 0.2137, "step": 2601 }, { "epoch": 1.2320804971885173, "grad_norm": 1.066895604133606, "learning_rate": 1.3309168552973718e-05, "loss": 0.2257, "step": 2602 }, { "epoch": 1.2325540100621486, "grad_norm": 1.1454565525054932, "learning_rate": 1.3304342147700642e-05, "loss": 0.2358, "step": 2603 }, { "epoch": 1.2330275229357799, "grad_norm": 1.030795693397522, "learning_rate": 1.3299514878208752e-05, "loss": 0.2263, "step": 2604 }, { "epoch": 1.233501035809411, "grad_norm": 1.6368428468704224, "learning_rate": 1.3294686745760578e-05, "loss": 0.2432, "step": 2605 }, { "epoch": 1.2339745486830422, "grad_norm": 1.6315193176269531, "learning_rate": 1.328985775161887e-05, "loss": 0.2042, "step": 2606 }, { "epoch": 1.2344480615566735, "grad_norm": 1.179657220840454, "learning_rate": 1.3285027897046603e-05, "loss": 0.2091, "step": 2607 }, { "epoch": 1.2349215744303048, "grad_norm": 1.292107105255127, "learning_rate": 1.3280197183306982e-05, "loss": 0.2323, "step": 2608 }, { "epoch": 1.235395087303936, "grad_norm": 1.0263645648956299, "learning_rate": 1.3275365611663432e-05, "loss": 0.2211, "step": 2609 }, { "epoch": 1.2358686001775674, "grad_norm": 1.5201125144958496, "learning_rate": 1.3270533183379604e-05, "loss": 0.2254, "step": 2610 }, { "epoch": 1.2363421130511987, "grad_norm": 1.2638511657714844, "learning_rate": 1.3265699899719374e-05, "loss": 0.2249, "step": 2611 }, { "epoch": 1.2368156259248297, "grad_norm": 1.429612398147583, "learning_rate": 1.3260865761946837e-05, "loss": 0.2316, "step": 2612 }, { "epoch": 1.237289138798461, "grad_norm": 1.2139184474945068, "learning_rate": 1.3256030771326325e-05, "loss": 0.2282, "step": 2613 }, { "epoch": 1.2377626516720923, "grad_norm": 2.151301383972168, "learning_rate": 1.325119492912237e-05, "loss": 0.2462, "step": 2614 }, { "epoch": 1.2382361645457236, "grad_norm": 1.421278715133667, "learning_rate": 1.3246358236599747e-05, "loss": 0.2238, "step": 2615 }, { "epoch": 1.238709677419355, "grad_norm": 1.212721824645996, "learning_rate": 1.3241520695023449e-05, "loss": 0.2312, "step": 2616 }, { "epoch": 1.2391831902929862, "grad_norm": 1.3026739358901978, "learning_rate": 1.3236682305658682e-05, "loss": 0.2466, "step": 2617 }, { "epoch": 1.2396567031666172, "grad_norm": 1.0858564376831055, "learning_rate": 1.3231843069770882e-05, "loss": 0.2156, "step": 2618 }, { "epoch": 1.2401302160402485, "grad_norm": 1.2696760892868042, "learning_rate": 1.3227002988625705e-05, "loss": 0.2361, "step": 2619 }, { "epoch": 1.2406037289138798, "grad_norm": 1.2995176315307617, "learning_rate": 1.322216206348903e-05, "loss": 0.2306, "step": 2620 }, { "epoch": 1.241077241787511, "grad_norm": 1.480022668838501, "learning_rate": 1.3217320295626953e-05, "loss": 0.2266, "step": 2621 }, { "epoch": 1.2415507546611424, "grad_norm": 1.1644608974456787, "learning_rate": 1.3212477686305789e-05, "loss": 0.2018, "step": 2622 }, { "epoch": 1.2420242675347737, "grad_norm": 0.9980354309082031, "learning_rate": 1.3207634236792077e-05, "loss": 0.2019, "step": 2623 }, { "epoch": 1.2424977804084048, "grad_norm": 1.6177034378051758, "learning_rate": 1.3202789948352577e-05, "loss": 0.2148, "step": 2624 }, { "epoch": 1.242971293282036, "grad_norm": 1.0906509160995483, "learning_rate": 1.319794482225426e-05, "loss": 0.2256, "step": 2625 }, { "epoch": 1.2434448061556673, "grad_norm": 1.3726736307144165, "learning_rate": 1.3193098859764329e-05, "loss": 0.2089, "step": 2626 }, { "epoch": 1.2439183190292986, "grad_norm": 1.480689525604248, "learning_rate": 1.3188252062150195e-05, "loss": 0.2264, "step": 2627 }, { "epoch": 1.24439183190293, "grad_norm": 1.417734980583191, "learning_rate": 1.3183404430679492e-05, "loss": 0.2317, "step": 2628 }, { "epoch": 1.2448653447765612, "grad_norm": 1.1104220151901245, "learning_rate": 1.3178555966620067e-05, "loss": 0.2314, "step": 2629 }, { "epoch": 1.2453388576501925, "grad_norm": 1.307004690170288, "learning_rate": 1.3173706671239999e-05, "loss": 0.23, "step": 2630 }, { "epoch": 1.2458123705238235, "grad_norm": 1.3435912132263184, "learning_rate": 1.3168856545807565e-05, "loss": 0.2255, "step": 2631 }, { "epoch": 1.2462858833974548, "grad_norm": 1.3050034046173096, "learning_rate": 1.3164005591591273e-05, "loss": 0.2399, "step": 2632 }, { "epoch": 1.2467593962710861, "grad_norm": 1.0008726119995117, "learning_rate": 1.3159153809859842e-05, "loss": 0.2318, "step": 2633 }, { "epoch": 1.2472329091447174, "grad_norm": 1.289227843284607, "learning_rate": 1.3154301201882209e-05, "loss": 0.2061, "step": 2634 }, { "epoch": 1.2477064220183487, "grad_norm": 1.4629969596862793, "learning_rate": 1.3149447768927526e-05, "loss": 0.2023, "step": 2635 }, { "epoch": 1.2481799348919798, "grad_norm": 1.1824212074279785, "learning_rate": 1.3144593512265162e-05, "loss": 0.2296, "step": 2636 }, { "epoch": 1.248653447765611, "grad_norm": 1.5664751529693604, "learning_rate": 1.3139738433164704e-05, "loss": 0.2159, "step": 2637 }, { "epoch": 1.2491269606392423, "grad_norm": 1.077606201171875, "learning_rate": 1.3134882532895945e-05, "loss": 0.2186, "step": 2638 }, { "epoch": 1.2496004735128736, "grad_norm": 1.1679469347000122, "learning_rate": 1.3130025812728904e-05, "loss": 0.2338, "step": 2639 }, { "epoch": 1.250073986386505, "grad_norm": 1.1365487575531006, "learning_rate": 1.3125168273933811e-05, "loss": 0.235, "step": 2640 }, { "epoch": 1.2505474992601362, "grad_norm": 0.9639025330543518, "learning_rate": 1.31203099177811e-05, "loss": 0.2164, "step": 2641 }, { "epoch": 1.2510210121337675, "grad_norm": 1.8010168075561523, "learning_rate": 1.3115450745541434e-05, "loss": 0.2062, "step": 2642 }, { "epoch": 1.2514945250073986, "grad_norm": 1.1365344524383545, "learning_rate": 1.311059075848568e-05, "loss": 0.1904, "step": 2643 }, { "epoch": 1.2519680378810298, "grad_norm": 2.163564682006836, "learning_rate": 1.3105729957884923e-05, "loss": 0.2219, "step": 2644 }, { "epoch": 1.2524415507546611, "grad_norm": 1.4166170358657837, "learning_rate": 1.3100868345010454e-05, "loss": 0.2137, "step": 2645 }, { "epoch": 1.2529150636282924, "grad_norm": 1.539779543876648, "learning_rate": 1.3096005921133785e-05, "loss": 0.2185, "step": 2646 }, { "epoch": 1.2533885765019237, "grad_norm": 2.17313289642334, "learning_rate": 1.3091142687526637e-05, "loss": 0.2425, "step": 2647 }, { "epoch": 1.2538620893755548, "grad_norm": 2.488083600997925, "learning_rate": 1.3086278645460939e-05, "loss": 0.2248, "step": 2648 }, { "epoch": 1.2543356022491863, "grad_norm": 1.1161004304885864, "learning_rate": 1.3081413796208835e-05, "loss": 0.2153, "step": 2649 }, { "epoch": 1.2548091151228173, "grad_norm": 1.0800951719284058, "learning_rate": 1.307654814104268e-05, "loss": 0.2141, "step": 2650 }, { "epoch": 1.2552826279964486, "grad_norm": 1.1490174531936646, "learning_rate": 1.3071681681235039e-05, "loss": 0.2563, "step": 2651 }, { "epoch": 1.25575614087008, "grad_norm": 1.0029891729354858, "learning_rate": 1.3066814418058685e-05, "loss": 0.2291, "step": 2652 }, { "epoch": 1.2562296537437112, "grad_norm": 1.9102951288223267, "learning_rate": 1.3061946352786607e-05, "loss": 0.2197, "step": 2653 }, { "epoch": 1.2567031666173425, "grad_norm": 1.408528208732605, "learning_rate": 1.3057077486692e-05, "loss": 0.2255, "step": 2654 }, { "epoch": 1.2571766794909736, "grad_norm": 1.8932093381881714, "learning_rate": 1.3052207821048268e-05, "loss": 0.2058, "step": 2655 }, { "epoch": 1.2576501923646048, "grad_norm": 1.3944228887557983, "learning_rate": 1.3047337357129025e-05, "loss": 0.2077, "step": 2656 }, { "epoch": 1.2581237052382361, "grad_norm": 1.0805628299713135, "learning_rate": 1.3042466096208099e-05, "loss": 0.2436, "step": 2657 }, { "epoch": 1.2585972181118674, "grad_norm": 1.1540179252624512, "learning_rate": 1.3037594039559514e-05, "loss": 0.2202, "step": 2658 }, { "epoch": 1.2590707309854987, "grad_norm": 1.449994683265686, "learning_rate": 1.303272118845751e-05, "loss": 0.227, "step": 2659 }, { "epoch": 1.25954424385913, "grad_norm": 1.6480882167816162, "learning_rate": 1.3027847544176537e-05, "loss": 0.206, "step": 2660 }, { "epoch": 1.2600177567327613, "grad_norm": 1.056485891342163, "learning_rate": 1.3022973107991251e-05, "loss": 0.2193, "step": 2661 }, { "epoch": 1.2604912696063924, "grad_norm": 1.3506393432617188, "learning_rate": 1.301809788117651e-05, "loss": 0.2331, "step": 2662 }, { "epoch": 1.2609647824800236, "grad_norm": 1.8271267414093018, "learning_rate": 1.3013221865007385e-05, "loss": 0.2206, "step": 2663 }, { "epoch": 1.261438295353655, "grad_norm": 1.0800082683563232, "learning_rate": 1.3008345060759149e-05, "loss": 0.2127, "step": 2664 }, { "epoch": 1.2619118082272862, "grad_norm": 1.0898518562316895, "learning_rate": 1.3003467469707287e-05, "loss": 0.2179, "step": 2665 }, { "epoch": 1.2623853211009175, "grad_norm": 1.098704218864441, "learning_rate": 1.299858909312748e-05, "loss": 0.2281, "step": 2666 }, { "epoch": 1.2628588339745486, "grad_norm": 1.6652623414993286, "learning_rate": 1.2993709932295628e-05, "loss": 0.2241, "step": 2667 }, { "epoch": 1.2633323468481799, "grad_norm": 1.2261773347854614, "learning_rate": 1.2988829988487822e-05, "loss": 0.2483, "step": 2668 }, { "epoch": 1.2638058597218111, "grad_norm": 0.9692365527153015, "learning_rate": 1.2983949262980362e-05, "loss": 0.2104, "step": 2669 }, { "epoch": 1.2642793725954424, "grad_norm": 1.218557357788086, "learning_rate": 1.2979067757049763e-05, "loss": 0.241, "step": 2670 }, { "epoch": 1.2647528854690737, "grad_norm": 1.0587778091430664, "learning_rate": 1.297418547197273e-05, "loss": 0.2213, "step": 2671 }, { "epoch": 1.265226398342705, "grad_norm": 2.7871766090393066, "learning_rate": 1.2969302409026181e-05, "loss": 0.22, "step": 2672 }, { "epoch": 1.2656999112163363, "grad_norm": 1.0976502895355225, "learning_rate": 1.2964418569487232e-05, "loss": 0.2031, "step": 2673 }, { "epoch": 1.2661734240899674, "grad_norm": 1.201554298400879, "learning_rate": 1.2959533954633205e-05, "loss": 0.2362, "step": 2674 }, { "epoch": 1.2666469369635986, "grad_norm": 1.15542733669281, "learning_rate": 1.2954648565741623e-05, "loss": 0.2212, "step": 2675 }, { "epoch": 1.26712044983723, "grad_norm": 1.641326665878296, "learning_rate": 1.2949762404090214e-05, "loss": 0.2476, "step": 2676 }, { "epoch": 1.2675939627108612, "grad_norm": 1.6406174898147583, "learning_rate": 1.2944875470956902e-05, "loss": 0.2303, "step": 2677 }, { "epoch": 1.2680674755844925, "grad_norm": 1.359363079071045, "learning_rate": 1.2939987767619821e-05, "loss": 0.2267, "step": 2678 }, { "epoch": 1.2685409884581236, "grad_norm": 1.6398720741271973, "learning_rate": 1.2935099295357304e-05, "loss": 0.2117, "step": 2679 }, { "epoch": 1.269014501331755, "grad_norm": 1.0096100568771362, "learning_rate": 1.2930210055447879e-05, "loss": 0.2299, "step": 2680 }, { "epoch": 1.2694880142053862, "grad_norm": 1.2073965072631836, "learning_rate": 1.2925320049170282e-05, "loss": 0.2236, "step": 2681 }, { "epoch": 1.2699615270790174, "grad_norm": 2.227213144302368, "learning_rate": 1.292042927780345e-05, "loss": 0.2152, "step": 2682 }, { "epoch": 1.2704350399526487, "grad_norm": 2.0689918994903564, "learning_rate": 1.2915537742626512e-05, "loss": 0.2129, "step": 2683 }, { "epoch": 1.27090855282628, "grad_norm": 2.3078198432922363, "learning_rate": 1.2910645444918809e-05, "loss": 0.2196, "step": 2684 }, { "epoch": 1.2713820656999113, "grad_norm": 1.536636233329773, "learning_rate": 1.2905752385959863e-05, "loss": 0.2474, "step": 2685 }, { "epoch": 1.2718555785735424, "grad_norm": 1.6504733562469482, "learning_rate": 1.2900858567029416e-05, "loss": 0.2419, "step": 2686 }, { "epoch": 1.2723290914471737, "grad_norm": 2.3625659942626953, "learning_rate": 1.2895963989407394e-05, "loss": 0.2319, "step": 2687 }, { "epoch": 1.272802604320805, "grad_norm": 1.3434407711029053, "learning_rate": 1.2891068654373928e-05, "loss": 0.2157, "step": 2688 }, { "epoch": 1.2732761171944362, "grad_norm": 0.922859251499176, "learning_rate": 1.2886172563209348e-05, "loss": 0.1955, "step": 2689 }, { "epoch": 1.2737496300680675, "grad_norm": 2.3007314205169678, "learning_rate": 1.2881275717194175e-05, "loss": 0.2455, "step": 2690 }, { "epoch": 1.2742231429416988, "grad_norm": 2.0500521659851074, "learning_rate": 1.2876378117609136e-05, "loss": 0.2188, "step": 2691 }, { "epoch": 1.27469665581533, "grad_norm": 1.2446473836898804, "learning_rate": 1.2871479765735151e-05, "loss": 0.2171, "step": 2692 }, { "epoch": 1.2751701686889612, "grad_norm": 1.333303451538086, "learning_rate": 1.2866580662853334e-05, "loss": 0.2597, "step": 2693 }, { "epoch": 1.2756436815625924, "grad_norm": 1.047584891319275, "learning_rate": 1.2861680810244998e-05, "loss": 0.2296, "step": 2694 }, { "epoch": 1.2761171944362237, "grad_norm": 1.1778916120529175, "learning_rate": 1.2856780209191655e-05, "loss": 0.2263, "step": 2695 }, { "epoch": 1.276590707309855, "grad_norm": 2.103959083557129, "learning_rate": 1.2851878860975007e-05, "loss": 0.223, "step": 2696 }, { "epoch": 1.2770642201834863, "grad_norm": 2.1192874908447266, "learning_rate": 1.2846976766876956e-05, "loss": 0.2031, "step": 2697 }, { "epoch": 1.2775377330571174, "grad_norm": 1.320399522781372, "learning_rate": 1.2842073928179594e-05, "loss": 0.224, "step": 2698 }, { "epoch": 1.2780112459307487, "grad_norm": 2.2415990829467773, "learning_rate": 1.2837170346165216e-05, "loss": 0.2253, "step": 2699 }, { "epoch": 1.27848475880438, "grad_norm": 1.6020069122314453, "learning_rate": 1.2832266022116304e-05, "loss": 0.2348, "step": 2700 }, { "epoch": 1.2789582716780112, "grad_norm": 1.4526188373565674, "learning_rate": 1.2827360957315542e-05, "loss": 0.2167, "step": 2701 }, { "epoch": 1.2794317845516425, "grad_norm": 1.26724374294281, "learning_rate": 1.2822455153045792e-05, "loss": 0.2219, "step": 2702 }, { "epoch": 1.2799052974252738, "grad_norm": 2.3599040508270264, "learning_rate": 1.281754861059013e-05, "loss": 0.2317, "step": 2703 }, { "epoch": 1.280378810298905, "grad_norm": 1.8642935752868652, "learning_rate": 1.2812641331231806e-05, "loss": 0.2083, "step": 2704 }, { "epoch": 1.2808523231725362, "grad_norm": 1.1180553436279297, "learning_rate": 1.2807733316254275e-05, "loss": 0.1894, "step": 2705 }, { "epoch": 1.2813258360461675, "grad_norm": 1.712058663368225, "learning_rate": 1.2802824566941186e-05, "loss": 0.2353, "step": 2706 }, { "epoch": 1.2817993489197987, "grad_norm": 1.1192132234573364, "learning_rate": 1.279791508457637e-05, "loss": 0.2343, "step": 2707 }, { "epoch": 1.28227286179343, "grad_norm": 1.7516214847564697, "learning_rate": 1.279300487044385e-05, "loss": 0.2302, "step": 2708 }, { "epoch": 1.2827463746670613, "grad_norm": 1.5895251035690308, "learning_rate": 1.2788093925827858e-05, "loss": 0.2144, "step": 2709 }, { "epoch": 1.2832198875406924, "grad_norm": 1.3747384548187256, "learning_rate": 1.2783182252012796e-05, "loss": 0.2168, "step": 2710 }, { "epoch": 1.283693400414324, "grad_norm": 1.1060492992401123, "learning_rate": 1.2778269850283263e-05, "loss": 0.2158, "step": 2711 }, { "epoch": 1.284166913287955, "grad_norm": 0.8876561522483826, "learning_rate": 1.2773356721924053e-05, "loss": 0.1834, "step": 2712 }, { "epoch": 1.2846404261615862, "grad_norm": 1.2856721878051758, "learning_rate": 1.276844286822015e-05, "loss": 0.2477, "step": 2713 }, { "epoch": 1.2851139390352175, "grad_norm": 2.298548460006714, "learning_rate": 1.2763528290456719e-05, "loss": 0.2009, "step": 2714 }, { "epoch": 1.2855874519088488, "grad_norm": 1.167225956916809, "learning_rate": 1.2758612989919126e-05, "loss": 0.2077, "step": 2715 }, { "epoch": 1.28606096478248, "grad_norm": 1.04922354221344, "learning_rate": 1.275369696789292e-05, "loss": 0.2088, "step": 2716 }, { "epoch": 1.2865344776561112, "grad_norm": 1.2038118839263916, "learning_rate": 1.2748780225663835e-05, "loss": 0.2267, "step": 2717 }, { "epoch": 1.2870079905297425, "grad_norm": 1.241967797279358, "learning_rate": 1.27438627645178e-05, "loss": 0.2171, "step": 2718 }, { "epoch": 1.2874815034033738, "grad_norm": 1.1782163381576538, "learning_rate": 1.2738944585740933e-05, "loss": 0.2194, "step": 2719 }, { "epoch": 1.287955016277005, "grad_norm": 0.991797685623169, "learning_rate": 1.2734025690619529e-05, "loss": 0.2126, "step": 2720 }, { "epoch": 1.2884285291506363, "grad_norm": 1.2841802835464478, "learning_rate": 1.2729106080440081e-05, "loss": 0.2145, "step": 2721 }, { "epoch": 1.2889020420242676, "grad_norm": 1.4147202968597412, "learning_rate": 1.2724185756489267e-05, "loss": 0.2368, "step": 2722 }, { "epoch": 1.289375554897899, "grad_norm": 1.1640384197235107, "learning_rate": 1.271926472005395e-05, "loss": 0.2179, "step": 2723 }, { "epoch": 1.28984906777153, "grad_norm": 1.5157102346420288, "learning_rate": 1.2714342972421177e-05, "loss": 0.2185, "step": 2724 }, { "epoch": 1.2903225806451613, "grad_norm": 1.4633888006210327, "learning_rate": 1.2709420514878183e-05, "loss": 0.2216, "step": 2725 }, { "epoch": 1.2907960935187925, "grad_norm": 1.7542057037353516, "learning_rate": 1.2704497348712397e-05, "loss": 0.2188, "step": 2726 }, { "epoch": 1.2912696063924238, "grad_norm": 1.3297491073608398, "learning_rate": 1.2699573475211415e-05, "loss": 0.2083, "step": 2727 }, { "epoch": 1.2917431192660551, "grad_norm": 1.0260534286499023, "learning_rate": 1.2694648895663038e-05, "loss": 0.2324, "step": 2728 }, { "epoch": 1.2922166321396862, "grad_norm": 1.0116316080093384, "learning_rate": 1.2689723611355236e-05, "loss": 0.2188, "step": 2729 }, { "epoch": 1.2926901450133175, "grad_norm": 1.2791035175323486, "learning_rate": 1.2684797623576173e-05, "loss": 0.2286, "step": 2730 }, { "epoch": 1.2931636578869488, "grad_norm": 2.320425033569336, "learning_rate": 1.2679870933614189e-05, "loss": 0.2229, "step": 2731 }, { "epoch": 1.29363717076058, "grad_norm": 1.2323137521743774, "learning_rate": 1.2674943542757819e-05, "loss": 0.2404, "step": 2732 }, { "epoch": 1.2941106836342113, "grad_norm": 1.2795385122299194, "learning_rate": 1.267001545229577e-05, "loss": 0.2178, "step": 2733 }, { "epoch": 1.2945841965078426, "grad_norm": 1.4273737668991089, "learning_rate": 1.2665086663516937e-05, "loss": 0.2046, "step": 2734 }, { "epoch": 1.295057709381474, "grad_norm": 1.6153572797775269, "learning_rate": 1.26601571777104e-05, "loss": 0.2214, "step": 2735 }, { "epoch": 1.295531222255105, "grad_norm": 1.3819822072982788, "learning_rate": 1.2655226996165415e-05, "loss": 0.2056, "step": 2736 }, { "epoch": 1.2960047351287363, "grad_norm": 1.2994166612625122, "learning_rate": 1.2650296120171424e-05, "loss": 0.1976, "step": 2737 }, { "epoch": 1.2964782480023676, "grad_norm": 2.0569915771484375, "learning_rate": 1.2645364551018049e-05, "loss": 0.2245, "step": 2738 }, { "epoch": 1.2969517608759988, "grad_norm": 1.7465271949768066, "learning_rate": 1.2640432289995097e-05, "loss": 0.2217, "step": 2739 }, { "epoch": 1.2974252737496301, "grad_norm": 1.355583667755127, "learning_rate": 1.2635499338392554e-05, "loss": 0.206, "step": 2740 }, { "epoch": 1.2978987866232612, "grad_norm": 1.9201940298080444, "learning_rate": 1.2630565697500583e-05, "loss": 0.2159, "step": 2741 }, { "epoch": 1.2983722994968927, "grad_norm": 2.0526649951934814, "learning_rate": 1.262563136860953e-05, "loss": 0.204, "step": 2742 }, { "epoch": 1.2988458123705238, "grad_norm": 1.1159857511520386, "learning_rate": 1.2620696353009925e-05, "loss": 0.2238, "step": 2743 }, { "epoch": 1.299319325244155, "grad_norm": 1.1484869718551636, "learning_rate": 1.2615760651992469e-05, "loss": 0.2084, "step": 2744 }, { "epoch": 1.2997928381177863, "grad_norm": 2.430534839630127, "learning_rate": 1.261082426684805e-05, "loss": 0.2258, "step": 2745 }, { "epoch": 1.3002663509914176, "grad_norm": 1.5022352933883667, "learning_rate": 1.2605887198867732e-05, "loss": 0.2052, "step": 2746 }, { "epoch": 1.300739863865049, "grad_norm": 1.0015721321105957, "learning_rate": 1.2600949449342754e-05, "loss": 0.2288, "step": 2747 }, { "epoch": 1.30121337673868, "grad_norm": 1.3668203353881836, "learning_rate": 1.259601101956454e-05, "loss": 0.2349, "step": 2748 }, { "epoch": 1.3016868896123113, "grad_norm": 2.162443161010742, "learning_rate": 1.259107191082469e-05, "loss": 0.237, "step": 2749 }, { "epoch": 1.3021604024859426, "grad_norm": 1.806618094444275, "learning_rate": 1.2586132124414978e-05, "loss": 0.2307, "step": 2750 }, { "epoch": 1.3026339153595738, "grad_norm": 1.345781922340393, "learning_rate": 1.2581191661627355e-05, "loss": 0.2118, "step": 2751 }, { "epoch": 1.3031074282332051, "grad_norm": 1.1584277153015137, "learning_rate": 1.2576250523753956e-05, "loss": 0.2287, "step": 2752 }, { "epoch": 1.3035809411068364, "grad_norm": 1.5245954990386963, "learning_rate": 1.257130871208709e-05, "loss": 0.2424, "step": 2753 }, { "epoch": 1.3040544539804677, "grad_norm": 1.4683634042739868, "learning_rate": 1.2566366227919232e-05, "loss": 0.2133, "step": 2754 }, { "epoch": 1.3045279668540988, "grad_norm": 2.778150796890259, "learning_rate": 1.2561423072543043e-05, "loss": 0.2346, "step": 2755 }, { "epoch": 1.30500147972773, "grad_norm": 2.2693684101104736, "learning_rate": 1.2556479247251364e-05, "loss": 0.221, "step": 2756 }, { "epoch": 1.3054749926013614, "grad_norm": 2.705554962158203, "learning_rate": 1.2551534753337198e-05, "loss": 0.2178, "step": 2757 }, { "epoch": 1.3059485054749926, "grad_norm": 1.8546373844146729, "learning_rate": 1.2546589592093734e-05, "loss": 0.2251, "step": 2758 }, { "epoch": 1.306422018348624, "grad_norm": 1.1411086320877075, "learning_rate": 1.2541643764814328e-05, "loss": 0.2077, "step": 2759 }, { "epoch": 1.306895531222255, "grad_norm": 1.2965822219848633, "learning_rate": 1.2536697272792517e-05, "loss": 0.2308, "step": 2760 }, { "epoch": 1.3073690440958863, "grad_norm": 1.3362536430358887, "learning_rate": 1.2531750117322004e-05, "loss": 0.2004, "step": 2761 }, { "epoch": 1.3078425569695176, "grad_norm": 1.4326553344726562, "learning_rate": 1.2526802299696674e-05, "loss": 0.223, "step": 2762 }, { "epoch": 1.3083160698431489, "grad_norm": 2.0484044551849365, "learning_rate": 1.252185382121058e-05, "loss": 0.2205, "step": 2763 }, { "epoch": 1.3087895827167801, "grad_norm": 1.2032407522201538, "learning_rate": 1.2516904683157947e-05, "loss": 0.237, "step": 2764 }, { "epoch": 1.3092630955904114, "grad_norm": 1.9354078769683838, "learning_rate": 1.2511954886833173e-05, "loss": 0.2233, "step": 2765 }, { "epoch": 1.3097366084640427, "grad_norm": 1.3299360275268555, "learning_rate": 1.2507004433530832e-05, "loss": 0.2189, "step": 2766 }, { "epoch": 1.3102101213376738, "grad_norm": 1.600842833518982, "learning_rate": 1.2502053324545666e-05, "loss": 0.2371, "step": 2767 }, { "epoch": 1.310683634211305, "grad_norm": 2.048098564147949, "learning_rate": 1.2497101561172593e-05, "loss": 0.224, "step": 2768 }, { "epoch": 1.3111571470849364, "grad_norm": 1.361019492149353, "learning_rate": 1.2492149144706696e-05, "loss": 0.2266, "step": 2769 }, { "epoch": 1.3116306599585676, "grad_norm": 2.096219778060913, "learning_rate": 1.2487196076443233e-05, "loss": 0.2166, "step": 2770 }, { "epoch": 1.312104172832199, "grad_norm": 1.8196600675582886, "learning_rate": 1.2482242357677631e-05, "loss": 0.1956, "step": 2771 }, { "epoch": 1.31257768570583, "grad_norm": 0.9747202396392822, "learning_rate": 1.2477287989705487e-05, "loss": 0.2051, "step": 2772 }, { "epoch": 1.3130511985794615, "grad_norm": 1.0282551050186157, "learning_rate": 1.2472332973822568e-05, "loss": 0.2144, "step": 2773 }, { "epoch": 1.3135247114530926, "grad_norm": 0.8776658773422241, "learning_rate": 1.2467377311324809e-05, "loss": 0.2119, "step": 2774 }, { "epoch": 1.3139982243267239, "grad_norm": 1.1889859437942505, "learning_rate": 1.2462421003508318e-05, "loss": 0.2086, "step": 2775 }, { "epoch": 1.3144717372003552, "grad_norm": 2.8763959407806396, "learning_rate": 1.2457464051669368e-05, "loss": 0.2256, "step": 2776 }, { "epoch": 1.3149452500739864, "grad_norm": 1.6497411727905273, "learning_rate": 1.2452506457104406e-05, "loss": 0.2192, "step": 2777 }, { "epoch": 1.3154187629476177, "grad_norm": 1.5440013408660889, "learning_rate": 1.2447548221110037e-05, "loss": 0.2078, "step": 2778 }, { "epoch": 1.3158922758212488, "grad_norm": 1.3232966661453247, "learning_rate": 1.2442589344983043e-05, "loss": 0.2108, "step": 2779 }, { "epoch": 1.31636578869488, "grad_norm": 0.9742394685745239, "learning_rate": 1.2437629830020372e-05, "loss": 0.2167, "step": 2780 }, { "epoch": 1.3168393015685114, "grad_norm": 1.2739520072937012, "learning_rate": 1.2432669677519134e-05, "loss": 0.2363, "step": 2781 }, { "epoch": 1.3173128144421427, "grad_norm": 1.014618158340454, "learning_rate": 1.2427708888776611e-05, "loss": 0.2226, "step": 2782 }, { "epoch": 1.317786327315774, "grad_norm": 2.0462191104888916, "learning_rate": 1.2422747465090246e-05, "loss": 0.2393, "step": 2783 }, { "epoch": 1.3182598401894052, "grad_norm": 1.8126726150512695, "learning_rate": 1.2417785407757657e-05, "loss": 0.2352, "step": 2784 }, { "epoch": 1.3187333530630365, "grad_norm": 1.3824222087860107, "learning_rate": 1.2412822718076619e-05, "loss": 0.2277, "step": 2785 }, { "epoch": 1.3192068659366676, "grad_norm": 1.3327887058258057, "learning_rate": 1.2407859397345073e-05, "loss": 0.2179, "step": 2786 }, { "epoch": 1.3196803788102989, "grad_norm": 1.1972007751464844, "learning_rate": 1.2402895446861131e-05, "loss": 0.2516, "step": 2787 }, { "epoch": 1.3201538916839302, "grad_norm": 1.0115021467208862, "learning_rate": 1.239793086792307e-05, "loss": 0.2129, "step": 2788 }, { "epoch": 1.3206274045575614, "grad_norm": 2.7716517448425293, "learning_rate": 1.2392965661829321e-05, "loss": 0.2244, "step": 2789 }, { "epoch": 1.3211009174311927, "grad_norm": 2.3599562644958496, "learning_rate": 1.238799982987849e-05, "loss": 0.2132, "step": 2790 }, { "epoch": 1.3215744303048238, "grad_norm": 2.0140914916992188, "learning_rate": 1.238303337336934e-05, "loss": 0.1975, "step": 2791 }, { "epoch": 1.322047943178455, "grad_norm": 1.8659518957138062, "learning_rate": 1.2378066293600801e-05, "loss": 0.2089, "step": 2792 }, { "epoch": 1.3225214560520864, "grad_norm": 2.2264151573181152, "learning_rate": 1.2373098591871964e-05, "loss": 0.1899, "step": 2793 }, { "epoch": 1.3229949689257177, "grad_norm": 2.3644020557403564, "learning_rate": 1.2368130269482084e-05, "loss": 0.2203, "step": 2794 }, { "epoch": 1.323468481799349, "grad_norm": 1.8323869705200195, "learning_rate": 1.2363161327730577e-05, "loss": 0.2109, "step": 2795 }, { "epoch": 1.3239419946729802, "grad_norm": 1.3173034191131592, "learning_rate": 1.2358191767917024e-05, "loss": 0.2372, "step": 2796 }, { "epoch": 1.3244155075466115, "grad_norm": 1.2270554304122925, "learning_rate": 1.2353221591341163e-05, "loss": 0.211, "step": 2797 }, { "epoch": 1.3248890204202426, "grad_norm": 2.137927770614624, "learning_rate": 1.2348250799302898e-05, "loss": 0.2115, "step": 2798 }, { "epoch": 1.3253625332938739, "grad_norm": 2.619537591934204, "learning_rate": 1.2343279393102292e-05, "loss": 0.2316, "step": 2799 }, { "epoch": 1.3258360461675052, "grad_norm": 1.1064987182617188, "learning_rate": 1.2338307374039564e-05, "loss": 0.2106, "step": 2800 }, { "epoch": 1.3263095590411365, "grad_norm": 1.347868800163269, "learning_rate": 1.2333334743415103e-05, "loss": 0.2111, "step": 2801 }, { "epoch": 1.3267830719147677, "grad_norm": 1.0354684591293335, "learning_rate": 1.2328361502529444e-05, "loss": 0.221, "step": 2802 }, { "epoch": 1.3272565847883988, "grad_norm": 1.5998934507369995, "learning_rate": 1.23233876526833e-05, "loss": 0.2271, "step": 2803 }, { "epoch": 1.3277300976620303, "grad_norm": 1.4590028524398804, "learning_rate": 1.231841319517753e-05, "loss": 0.2234, "step": 2804 }, { "epoch": 1.3282036105356614, "grad_norm": 2.0347890853881836, "learning_rate": 1.2313438131313155e-05, "loss": 0.2375, "step": 2805 }, { "epoch": 1.3286771234092927, "grad_norm": 1.5571720600128174, "learning_rate": 1.2308462462391356e-05, "loss": 0.2286, "step": 2806 }, { "epoch": 1.329150636282924, "grad_norm": 1.1657594442367554, "learning_rate": 1.2303486189713466e-05, "loss": 0.2066, "step": 2807 }, { "epoch": 1.3296241491565552, "grad_norm": 1.2159301042556763, "learning_rate": 1.2298509314580986e-05, "loss": 0.2515, "step": 2808 }, { "epoch": 1.3300976620301865, "grad_norm": 2.0197834968566895, "learning_rate": 1.2293531838295572e-05, "loss": 0.2153, "step": 2809 }, { "epoch": 1.3305711749038176, "grad_norm": 1.6019450426101685, "learning_rate": 1.228855376215903e-05, "loss": 0.2245, "step": 2810 }, { "epoch": 1.3310446877774489, "grad_norm": 1.1272090673446655, "learning_rate": 1.228357508747333e-05, "loss": 0.2135, "step": 2811 }, { "epoch": 1.3315182006510802, "grad_norm": 1.550205111503601, "learning_rate": 1.2278595815540595e-05, "loss": 0.2498, "step": 2812 }, { "epoch": 1.3319917135247115, "grad_norm": 1.4885501861572266, "learning_rate": 1.2273615947663107e-05, "loss": 0.2371, "step": 2813 }, { "epoch": 1.3324652263983428, "grad_norm": 1.3265790939331055, "learning_rate": 1.2268635485143303e-05, "loss": 0.2216, "step": 2814 }, { "epoch": 1.332938739271974, "grad_norm": 1.121368646621704, "learning_rate": 1.2263654429283774e-05, "loss": 0.2311, "step": 2815 }, { "epoch": 1.3334122521456053, "grad_norm": 1.9394561052322388, "learning_rate": 1.2258672781387267e-05, "loss": 0.2228, "step": 2816 }, { "epoch": 1.3338857650192364, "grad_norm": 1.1712671518325806, "learning_rate": 1.2253690542756682e-05, "loss": 0.2071, "step": 2817 }, { "epoch": 1.3343592778928677, "grad_norm": 1.167645812034607, "learning_rate": 1.2248707714695077e-05, "loss": 0.2191, "step": 2818 }, { "epoch": 1.334832790766499, "grad_norm": 1.0730623006820679, "learning_rate": 1.2243724298505664e-05, "loss": 0.2258, "step": 2819 }, { "epoch": 1.3353063036401303, "grad_norm": 1.0903040170669556, "learning_rate": 1.2238740295491804e-05, "loss": 0.2109, "step": 2820 }, { "epoch": 1.3357798165137615, "grad_norm": 0.9441567063331604, "learning_rate": 1.223375570695702e-05, "loss": 0.2366, "step": 2821 }, { "epoch": 1.3362533293873926, "grad_norm": 0.9626446962356567, "learning_rate": 1.222877053420498e-05, "loss": 0.1965, "step": 2822 }, { "epoch": 1.336726842261024, "grad_norm": 1.2279541492462158, "learning_rate": 1.2223784778539508e-05, "loss": 0.2155, "step": 2823 }, { "epoch": 1.3372003551346552, "grad_norm": 1.0313448905944824, "learning_rate": 1.2218798441264579e-05, "loss": 0.2165, "step": 2824 }, { "epoch": 1.3376738680082865, "grad_norm": 1.0775771141052246, "learning_rate": 1.2213811523684325e-05, "loss": 0.2141, "step": 2825 }, { "epoch": 1.3381473808819178, "grad_norm": 0.9873069524765015, "learning_rate": 1.2208824027103021e-05, "loss": 0.1984, "step": 2826 }, { "epoch": 1.338620893755549, "grad_norm": 1.219462513923645, "learning_rate": 1.2203835952825105e-05, "loss": 0.2372, "step": 2827 }, { "epoch": 1.3390944066291803, "grad_norm": 1.2963061332702637, "learning_rate": 1.2198847302155154e-05, "loss": 0.2206, "step": 2828 }, { "epoch": 1.3395679195028114, "grad_norm": 1.0308513641357422, "learning_rate": 1.2193858076397905e-05, "loss": 0.2288, "step": 2829 }, { "epoch": 1.3400414323764427, "grad_norm": 1.5890069007873535, "learning_rate": 1.2188868276858238e-05, "loss": 0.2171, "step": 2830 }, { "epoch": 1.340514945250074, "grad_norm": 2.1345863342285156, "learning_rate": 1.2183877904841193e-05, "loss": 0.2138, "step": 2831 }, { "epoch": 1.3409884581237053, "grad_norm": 1.1476088762283325, "learning_rate": 1.217888696165195e-05, "loss": 0.2144, "step": 2832 }, { "epoch": 1.3414619709973366, "grad_norm": 1.109824776649475, "learning_rate": 1.2173895448595842e-05, "loss": 0.2018, "step": 2833 }, { "epoch": 1.3419354838709676, "grad_norm": 1.1557180881500244, "learning_rate": 1.216890336697835e-05, "loss": 0.2178, "step": 2834 }, { "epoch": 1.3424089967445991, "grad_norm": 1.1168832778930664, "learning_rate": 1.2163910718105108e-05, "loss": 0.2174, "step": 2835 }, { "epoch": 1.3428825096182302, "grad_norm": 1.25518000125885, "learning_rate": 1.2158917503281891e-05, "loss": 0.2305, "step": 2836 }, { "epoch": 1.3433560224918615, "grad_norm": 1.8499336242675781, "learning_rate": 1.215392372381463e-05, "loss": 0.202, "step": 2837 }, { "epoch": 1.3438295353654928, "grad_norm": 1.5814791917800903, "learning_rate": 1.2148929381009398e-05, "loss": 0.2027, "step": 2838 }, { "epoch": 1.344303048239124, "grad_norm": 1.3987447023391724, "learning_rate": 1.2143934476172416e-05, "loss": 0.2271, "step": 2839 }, { "epoch": 1.3447765611127553, "grad_norm": 1.075582504272461, "learning_rate": 1.2138939010610055e-05, "loss": 0.1957, "step": 2840 }, { "epoch": 1.3452500739863864, "grad_norm": 1.5337646007537842, "learning_rate": 1.2133942985628833e-05, "loss": 0.2177, "step": 2841 }, { "epoch": 1.3457235868600177, "grad_norm": 2.3917672634124756, "learning_rate": 1.2128946402535409e-05, "loss": 0.2185, "step": 2842 }, { "epoch": 1.346197099733649, "grad_norm": 2.155311346054077, "learning_rate": 1.2123949262636592e-05, "loss": 0.221, "step": 2843 }, { "epoch": 1.3466706126072803, "grad_norm": 0.9766375422477722, "learning_rate": 1.2118951567239331e-05, "loss": 0.2271, "step": 2844 }, { "epoch": 1.3471441254809116, "grad_norm": 1.0675561428070068, "learning_rate": 1.2113953317650733e-05, "loss": 0.2214, "step": 2845 }, { "epoch": 1.3476176383545428, "grad_norm": 1.6465510129928589, "learning_rate": 1.2108954515178037e-05, "loss": 0.2025, "step": 2846 }, { "epoch": 1.3480911512281741, "grad_norm": 1.7982463836669922, "learning_rate": 1.2103955161128635e-05, "loss": 0.203, "step": 2847 }, { "epoch": 1.3485646641018052, "grad_norm": 1.2066468000411987, "learning_rate": 1.2098955256810057e-05, "loss": 0.215, "step": 2848 }, { "epoch": 1.3490381769754365, "grad_norm": 1.1641322374343872, "learning_rate": 1.2093954803529981e-05, "loss": 0.2276, "step": 2849 }, { "epoch": 1.3495116898490678, "grad_norm": 1.1573890447616577, "learning_rate": 1.2088953802596229e-05, "loss": 0.2076, "step": 2850 }, { "epoch": 1.349985202722699, "grad_norm": 0.9064158201217651, "learning_rate": 1.208395225531676e-05, "loss": 0.2124, "step": 2851 }, { "epoch": 1.3504587155963304, "grad_norm": 1.221864104270935, "learning_rate": 1.207895016299968e-05, "loss": 0.203, "step": 2852 }, { "epoch": 1.3509322284699614, "grad_norm": 1.3113806247711182, "learning_rate": 1.2073947526953245e-05, "loss": 0.2201, "step": 2853 }, { "epoch": 1.3514057413435927, "grad_norm": 1.3473477363586426, "learning_rate": 1.206894434848584e-05, "loss": 0.2346, "step": 2854 }, { "epoch": 1.351879254217224, "grad_norm": 1.0617995262145996, "learning_rate": 1.2063940628906001e-05, "loss": 0.2009, "step": 2855 }, { "epoch": 1.3523527670908553, "grad_norm": 1.075543761253357, "learning_rate": 1.20589363695224e-05, "loss": 0.2371, "step": 2856 }, { "epoch": 1.3528262799644866, "grad_norm": 1.068983554840088, "learning_rate": 1.2053931571643857e-05, "loss": 0.2256, "step": 2857 }, { "epoch": 1.3532997928381179, "grad_norm": 0.9813727736473083, "learning_rate": 1.2048926236579326e-05, "loss": 0.2171, "step": 2858 }, { "epoch": 1.3537733057117491, "grad_norm": 1.6709295511245728, "learning_rate": 1.2043920365637904e-05, "loss": 0.2091, "step": 2859 }, { "epoch": 1.3542468185853802, "grad_norm": 0.9895414710044861, "learning_rate": 1.2038913960128828e-05, "loss": 0.2049, "step": 2860 }, { "epoch": 1.3547203314590115, "grad_norm": 1.3172953128814697, "learning_rate": 1.2033907021361476e-05, "loss": 0.2308, "step": 2861 }, { "epoch": 1.3551938443326428, "grad_norm": 1.719438076019287, "learning_rate": 1.2028899550645362e-05, "loss": 0.2239, "step": 2862 }, { "epoch": 1.355667357206274, "grad_norm": 1.0997658967971802, "learning_rate": 1.2023891549290143e-05, "loss": 0.2123, "step": 2863 }, { "epoch": 1.3561408700799054, "grad_norm": 1.2592804431915283, "learning_rate": 1.2018883018605614e-05, "loss": 0.2157, "step": 2864 }, { "epoch": 1.3566143829535364, "grad_norm": 1.2449421882629395, "learning_rate": 1.201387395990171e-05, "loss": 0.2011, "step": 2865 }, { "epoch": 1.357087895827168, "grad_norm": 1.2251133918762207, "learning_rate": 1.20088643744885e-05, "loss": 0.2227, "step": 2866 }, { "epoch": 1.357561408700799, "grad_norm": 1.0823248624801636, "learning_rate": 1.2003854263676196e-05, "loss": 0.2384, "step": 2867 }, { "epoch": 1.3580349215744303, "grad_norm": 1.333550214767456, "learning_rate": 1.1998843628775136e-05, "loss": 0.2299, "step": 2868 }, { "epoch": 1.3585084344480616, "grad_norm": 1.7682206630706787, "learning_rate": 1.199383247109581e-05, "loss": 0.2277, "step": 2869 }, { "epoch": 1.3589819473216929, "grad_norm": 1.3045170307159424, "learning_rate": 1.1988820791948834e-05, "loss": 0.2093, "step": 2870 }, { "epoch": 1.3594554601953242, "grad_norm": 2.6720004081726074, "learning_rate": 1.1983808592644967e-05, "loss": 0.2414, "step": 2871 }, { "epoch": 1.3599289730689552, "grad_norm": 0.9892565608024597, "learning_rate": 1.1978795874495103e-05, "loss": 0.206, "step": 2872 }, { "epoch": 1.3604024859425865, "grad_norm": 1.4673343896865845, "learning_rate": 1.1973782638810264e-05, "loss": 0.2143, "step": 2873 }, { "epoch": 1.3608759988162178, "grad_norm": 1.2298039197921753, "learning_rate": 1.1968768886901621e-05, "loss": 0.227, "step": 2874 }, { "epoch": 1.361349511689849, "grad_norm": 1.48917555809021, "learning_rate": 1.1963754620080467e-05, "loss": 0.2148, "step": 2875 }, { "epoch": 1.3618230245634804, "grad_norm": 1.349953532218933, "learning_rate": 1.1958739839658238e-05, "loss": 0.1988, "step": 2876 }, { "epoch": 1.3622965374371117, "grad_norm": 1.3024402856826782, "learning_rate": 1.1953724546946502e-05, "loss": 0.2332, "step": 2877 }, { "epoch": 1.362770050310743, "grad_norm": 1.4080761671066284, "learning_rate": 1.1948708743256954e-05, "loss": 0.221, "step": 2878 }, { "epoch": 1.363243563184374, "grad_norm": 1.1572239398956299, "learning_rate": 1.1943692429901437e-05, "loss": 0.2094, "step": 2879 }, { "epoch": 1.3637170760580053, "grad_norm": 1.7980104684829712, "learning_rate": 1.1938675608191914e-05, "loss": 0.2282, "step": 2880 }, { "epoch": 1.3641905889316366, "grad_norm": 1.5248477458953857, "learning_rate": 1.193365827944049e-05, "loss": 0.2235, "step": 2881 }, { "epoch": 1.3646641018052679, "grad_norm": 1.2481404542922974, "learning_rate": 1.1928640444959396e-05, "loss": 0.2013, "step": 2882 }, { "epoch": 1.3651376146788992, "grad_norm": 2.0264244079589844, "learning_rate": 1.1923622106060997e-05, "loss": 0.2341, "step": 2883 }, { "epoch": 1.3656111275525302, "grad_norm": 1.1340547800064087, "learning_rate": 1.19186032640578e-05, "loss": 0.2295, "step": 2884 }, { "epoch": 1.3660846404261615, "grad_norm": 1.4470584392547607, "learning_rate": 1.1913583920262424e-05, "loss": 0.2207, "step": 2885 }, { "epoch": 1.3665581532997928, "grad_norm": 1.103227138519287, "learning_rate": 1.1908564075987637e-05, "loss": 0.2065, "step": 2886 }, { "epoch": 1.367031666173424, "grad_norm": 1.558107852935791, "learning_rate": 1.1903543732546326e-05, "loss": 0.2203, "step": 2887 }, { "epoch": 1.3675051790470554, "grad_norm": 1.080611228942871, "learning_rate": 1.1898522891251516e-05, "loss": 0.2328, "step": 2888 }, { "epoch": 1.3679786919206867, "grad_norm": 1.09610915184021, "learning_rate": 1.1893501553416357e-05, "loss": 0.2292, "step": 2889 }, { "epoch": 1.368452204794318, "grad_norm": 1.4019349813461304, "learning_rate": 1.1888479720354138e-05, "loss": 0.2118, "step": 2890 }, { "epoch": 1.368925717667949, "grad_norm": 1.5220277309417725, "learning_rate": 1.1883457393378263e-05, "loss": 0.2051, "step": 2891 }, { "epoch": 1.3693992305415803, "grad_norm": 0.9200198650360107, "learning_rate": 1.187843457380228e-05, "loss": 0.214, "step": 2892 }, { "epoch": 1.3698727434152116, "grad_norm": 1.2134068012237549, "learning_rate": 1.1873411262939854e-05, "loss": 0.2068, "step": 2893 }, { "epoch": 1.3703462562888429, "grad_norm": 1.125510334968567, "learning_rate": 1.1868387462104787e-05, "loss": 0.2247, "step": 2894 }, { "epoch": 1.3708197691624742, "grad_norm": 0.9510089159011841, "learning_rate": 1.1863363172611003e-05, "loss": 0.2077, "step": 2895 }, { "epoch": 1.3712932820361052, "grad_norm": 1.2528704404830933, "learning_rate": 1.1858338395772555e-05, "loss": 0.234, "step": 2896 }, { "epoch": 1.3717667949097367, "grad_norm": 1.718252182006836, "learning_rate": 1.185331313290363e-05, "loss": 0.2241, "step": 2897 }, { "epoch": 1.3722403077833678, "grad_norm": 1.3889955282211304, "learning_rate": 1.1848287385318533e-05, "loss": 0.239, "step": 2898 }, { "epoch": 1.372713820656999, "grad_norm": 1.056844711303711, "learning_rate": 1.1843261154331702e-05, "loss": 0.1979, "step": 2899 }, { "epoch": 1.3731873335306304, "grad_norm": 1.1087300777435303, "learning_rate": 1.1838234441257698e-05, "loss": 0.2094, "step": 2900 }, { "epoch": 1.3736608464042617, "grad_norm": 1.014634609222412, "learning_rate": 1.1833207247411208e-05, "loss": 0.2052, "step": 2901 }, { "epoch": 1.374134359277893, "grad_norm": 1.0202566385269165, "learning_rate": 1.182817957410705e-05, "loss": 0.2064, "step": 2902 }, { "epoch": 1.374607872151524, "grad_norm": 1.1275465488433838, "learning_rate": 1.1823151422660162e-05, "loss": 0.2161, "step": 2903 }, { "epoch": 1.3750813850251553, "grad_norm": 1.6918755769729614, "learning_rate": 1.1818122794385604e-05, "loss": 0.2127, "step": 2904 }, { "epoch": 1.3755548978987866, "grad_norm": 1.5377037525177002, "learning_rate": 1.1813093690598572e-05, "loss": 0.1938, "step": 2905 }, { "epoch": 1.3760284107724179, "grad_norm": 1.1492862701416016, "learning_rate": 1.1808064112614375e-05, "loss": 0.2127, "step": 2906 }, { "epoch": 1.3765019236460492, "grad_norm": 1.8122957944869995, "learning_rate": 1.1803034061748453e-05, "loss": 0.1997, "step": 2907 }, { "epoch": 1.3769754365196802, "grad_norm": 1.2996703386306763, "learning_rate": 1.1798003539316365e-05, "loss": 0.226, "step": 2908 }, { "epoch": 1.3774489493933118, "grad_norm": 1.1555538177490234, "learning_rate": 1.1792972546633799e-05, "loss": 0.2417, "step": 2909 }, { "epoch": 1.3779224622669428, "grad_norm": 1.0537813901901245, "learning_rate": 1.178794108501656e-05, "loss": 0.221, "step": 2910 }, { "epoch": 1.378395975140574, "grad_norm": 1.2504982948303223, "learning_rate": 1.1782909155780577e-05, "loss": 0.207, "step": 2911 }, { "epoch": 1.3788694880142054, "grad_norm": 1.0087714195251465, "learning_rate": 1.1777876760241907e-05, "loss": 0.2108, "step": 2912 }, { "epoch": 1.3793430008878367, "grad_norm": 0.9194630980491638, "learning_rate": 1.1772843899716719e-05, "loss": 0.2024, "step": 2913 }, { "epoch": 1.379816513761468, "grad_norm": 1.1061056852340698, "learning_rate": 1.1767810575521312e-05, "loss": 0.2317, "step": 2914 }, { "epoch": 1.380290026635099, "grad_norm": 1.6991933584213257, "learning_rate": 1.1762776788972106e-05, "loss": 0.2297, "step": 2915 }, { "epoch": 1.3807635395087303, "grad_norm": 1.4516056776046753, "learning_rate": 1.1757742541385636e-05, "loss": 0.2268, "step": 2916 }, { "epoch": 1.3812370523823616, "grad_norm": 1.5717034339904785, "learning_rate": 1.1752707834078558e-05, "loss": 0.2298, "step": 2917 }, { "epoch": 1.381710565255993, "grad_norm": 1.2218979597091675, "learning_rate": 1.1747672668367659e-05, "loss": 0.2222, "step": 2918 }, { "epoch": 1.3821840781296242, "grad_norm": 1.1709834337234497, "learning_rate": 1.1742637045569832e-05, "loss": 0.2199, "step": 2919 }, { "epoch": 1.3826575910032555, "grad_norm": 1.1610695123672485, "learning_rate": 1.1737600967002095e-05, "loss": 0.2203, "step": 2920 }, { "epoch": 1.3831311038768868, "grad_norm": 1.1131560802459717, "learning_rate": 1.1732564433981594e-05, "loss": 0.205, "step": 2921 }, { "epoch": 1.3836046167505178, "grad_norm": 1.2609494924545288, "learning_rate": 1.1727527447825575e-05, "loss": 0.2129, "step": 2922 }, { "epoch": 1.3840781296241491, "grad_norm": 1.1561344861984253, "learning_rate": 1.1722490009851418e-05, "loss": 0.2299, "step": 2923 }, { "epoch": 1.3845516424977804, "grad_norm": 1.0187231302261353, "learning_rate": 1.1717452121376616e-05, "loss": 0.1932, "step": 2924 }, { "epoch": 1.3850251553714117, "grad_norm": 1.7099744081497192, "learning_rate": 1.1712413783718782e-05, "loss": 0.2273, "step": 2925 }, { "epoch": 1.385498668245043, "grad_norm": 1.1517730951309204, "learning_rate": 1.1707374998195643e-05, "loss": 0.2322, "step": 2926 }, { "epoch": 1.385972181118674, "grad_norm": 1.6895732879638672, "learning_rate": 1.1702335766125042e-05, "loss": 0.2198, "step": 2927 }, { "epoch": 1.3864456939923056, "grad_norm": 1.1857640743255615, "learning_rate": 1.1697296088824945e-05, "loss": 0.2639, "step": 2928 }, { "epoch": 1.3869192068659366, "grad_norm": 1.5664935111999512, "learning_rate": 1.1692255967613432e-05, "loss": 0.2146, "step": 2929 }, { "epoch": 1.387392719739568, "grad_norm": 1.2540571689605713, "learning_rate": 1.1687215403808697e-05, "loss": 0.2288, "step": 2930 }, { "epoch": 1.3878662326131992, "grad_norm": 1.223510980606079, "learning_rate": 1.1682174398729044e-05, "loss": 0.2205, "step": 2931 }, { "epoch": 1.3883397454868305, "grad_norm": 0.9939408302307129, "learning_rate": 1.1677132953692911e-05, "loss": 0.2321, "step": 2932 }, { "epoch": 1.3888132583604618, "grad_norm": 1.3630565404891968, "learning_rate": 1.1672091070018832e-05, "loss": 0.2274, "step": 2933 }, { "epoch": 1.3892867712340928, "grad_norm": 1.4759294986724854, "learning_rate": 1.1667048749025462e-05, "loss": 0.2319, "step": 2934 }, { "epoch": 1.3897602841077241, "grad_norm": 0.9733860492706299, "learning_rate": 1.1662005992031577e-05, "loss": 0.2087, "step": 2935 }, { "epoch": 1.3902337969813554, "grad_norm": 1.1344149112701416, "learning_rate": 1.1656962800356058e-05, "loss": 0.2226, "step": 2936 }, { "epoch": 1.3907073098549867, "grad_norm": 1.0230250358581543, "learning_rate": 1.1651919175317903e-05, "loss": 0.2076, "step": 2937 }, { "epoch": 1.391180822728618, "grad_norm": 1.2815759181976318, "learning_rate": 1.1646875118236225e-05, "loss": 0.209, "step": 2938 }, { "epoch": 1.391654335602249, "grad_norm": 1.4215916395187378, "learning_rate": 1.1641830630430246e-05, "loss": 0.2402, "step": 2939 }, { "epoch": 1.3921278484758806, "grad_norm": 1.8291083574295044, "learning_rate": 1.1636785713219305e-05, "loss": 0.2072, "step": 2940 }, { "epoch": 1.3926013613495116, "grad_norm": 1.3542451858520508, "learning_rate": 1.163174036792285e-05, "loss": 0.2279, "step": 2941 }, { "epoch": 1.393074874223143, "grad_norm": 1.9127904176712036, "learning_rate": 1.1626694595860443e-05, "loss": 0.2264, "step": 2942 }, { "epoch": 1.3935483870967742, "grad_norm": 1.4430345296859741, "learning_rate": 1.1621648398351762e-05, "loss": 0.2149, "step": 2943 }, { "epoch": 1.3940218999704055, "grad_norm": 2.1805191040039062, "learning_rate": 1.1616601776716583e-05, "loss": 0.2259, "step": 2944 }, { "epoch": 1.3944954128440368, "grad_norm": 1.4834105968475342, "learning_rate": 1.1611554732274806e-05, "loss": 0.2054, "step": 2945 }, { "epoch": 1.3949689257176678, "grad_norm": 1.532042145729065, "learning_rate": 1.1606507266346436e-05, "loss": 0.2169, "step": 2946 }, { "epoch": 1.3954424385912991, "grad_norm": 1.2459840774536133, "learning_rate": 1.160145938025159e-05, "loss": 0.2091, "step": 2947 }, { "epoch": 1.3959159514649304, "grad_norm": 1.190146565437317, "learning_rate": 1.159641107531049e-05, "loss": 0.1977, "step": 2948 }, { "epoch": 1.3963894643385617, "grad_norm": 1.033531665802002, "learning_rate": 1.1591362352843477e-05, "loss": 0.1989, "step": 2949 }, { "epoch": 1.396862977212193, "grad_norm": 1.877530813217163, "learning_rate": 1.158631321417099e-05, "loss": 0.2075, "step": 2950 }, { "epoch": 1.3973364900858243, "grad_norm": 1.1932871341705322, "learning_rate": 1.1581263660613585e-05, "loss": 0.2094, "step": 2951 }, { "epoch": 1.3978100029594556, "grad_norm": 1.5696194171905518, "learning_rate": 1.1576213693491925e-05, "loss": 0.207, "step": 2952 }, { "epoch": 1.3982835158330866, "grad_norm": 1.6218510866165161, "learning_rate": 1.157116331412678e-05, "loss": 0.1988, "step": 2953 }, { "epoch": 1.398757028706718, "grad_norm": 1.69428551197052, "learning_rate": 1.1566112523839028e-05, "loss": 0.2118, "step": 2954 }, { "epoch": 1.3992305415803492, "grad_norm": 0.9720800518989563, "learning_rate": 1.1561061323949652e-05, "loss": 0.1867, "step": 2955 }, { "epoch": 1.3997040544539805, "grad_norm": 1.5280438661575317, "learning_rate": 1.155600971577975e-05, "loss": 0.2288, "step": 2956 }, { "epoch": 1.4001775673276118, "grad_norm": 1.5449702739715576, "learning_rate": 1.1550957700650517e-05, "loss": 0.2242, "step": 2957 }, { "epoch": 1.4006510802012428, "grad_norm": 1.8118822574615479, "learning_rate": 1.1545905279883258e-05, "loss": 0.2122, "step": 2958 }, { "epoch": 1.4011245930748744, "grad_norm": 1.7148773670196533, "learning_rate": 1.1540852454799388e-05, "loss": 0.2237, "step": 2959 }, { "epoch": 1.4015981059485054, "grad_norm": 1.2663835287094116, "learning_rate": 1.1535799226720421e-05, "loss": 0.2229, "step": 2960 }, { "epoch": 1.4020716188221367, "grad_norm": 1.3801897764205933, "learning_rate": 1.1530745596967982e-05, "loss": 0.2263, "step": 2961 }, { "epoch": 1.402545131695768, "grad_norm": 1.4142658710479736, "learning_rate": 1.1525691566863802e-05, "loss": 0.2169, "step": 2962 }, { "epoch": 1.4030186445693993, "grad_norm": 0.985097348690033, "learning_rate": 1.1520637137729711e-05, "loss": 0.2025, "step": 2963 }, { "epoch": 1.4034921574430306, "grad_norm": 1.7682098150253296, "learning_rate": 1.1515582310887647e-05, "loss": 0.2098, "step": 2964 }, { "epoch": 1.4039656703166616, "grad_norm": 1.2826495170593262, "learning_rate": 1.1510527087659648e-05, "loss": 0.2185, "step": 2965 }, { "epoch": 1.404439183190293, "grad_norm": 0.9184038639068604, "learning_rate": 1.1505471469367864e-05, "loss": 0.2079, "step": 2966 }, { "epoch": 1.4049126960639242, "grad_norm": 1.2887505292892456, "learning_rate": 1.1500415457334539e-05, "loss": 0.1951, "step": 2967 }, { "epoch": 1.4053862089375555, "grad_norm": 1.2137596607208252, "learning_rate": 1.1495359052882028e-05, "loss": 0.2093, "step": 2968 }, { "epoch": 1.4058597218111868, "grad_norm": 1.2313615083694458, "learning_rate": 1.1490302257332781e-05, "loss": 0.2015, "step": 2969 }, { "epoch": 1.4063332346848179, "grad_norm": 1.5044950246810913, "learning_rate": 1.1485245072009357e-05, "loss": 0.2187, "step": 2970 }, { "epoch": 1.4068067475584494, "grad_norm": 1.2070937156677246, "learning_rate": 1.1480187498234412e-05, "loss": 0.2067, "step": 2971 }, { "epoch": 1.4072802604320804, "grad_norm": 1.3706823587417603, "learning_rate": 1.1475129537330707e-05, "loss": 0.2081, "step": 2972 }, { "epoch": 1.4077537733057117, "grad_norm": 1.1687662601470947, "learning_rate": 1.1470071190621103e-05, "loss": 0.2081, "step": 2973 }, { "epoch": 1.408227286179343, "grad_norm": 1.102828025817871, "learning_rate": 1.1465012459428562e-05, "loss": 0.2129, "step": 2974 }, { "epoch": 1.4087007990529743, "grad_norm": 1.8313281536102295, "learning_rate": 1.1459953345076142e-05, "loss": 0.2111, "step": 2975 }, { "epoch": 1.4091743119266056, "grad_norm": 1.3744657039642334, "learning_rate": 1.1454893848887013e-05, "loss": 0.2099, "step": 2976 }, { "epoch": 1.4096478248002366, "grad_norm": 0.9316054582595825, "learning_rate": 1.144983397218443e-05, "loss": 0.2132, "step": 2977 }, { "epoch": 1.410121337673868, "grad_norm": 1.1082512140274048, "learning_rate": 1.1444773716291759e-05, "loss": 0.1941, "step": 2978 }, { "epoch": 1.4105948505474992, "grad_norm": 1.3401765823364258, "learning_rate": 1.143971308253246e-05, "loss": 0.2226, "step": 2979 }, { "epoch": 1.4110683634211305, "grad_norm": 1.1040364503860474, "learning_rate": 1.1434652072230092e-05, "loss": 0.1832, "step": 2980 }, { "epoch": 1.4115418762947618, "grad_norm": 1.007563829421997, "learning_rate": 1.142959068670832e-05, "loss": 0.2122, "step": 2981 }, { "epoch": 1.412015389168393, "grad_norm": 1.306701421737671, "learning_rate": 1.1424528927290892e-05, "loss": 0.2162, "step": 2982 }, { "epoch": 1.4124889020420244, "grad_norm": 0.9834753274917603, "learning_rate": 1.1419466795301665e-05, "loss": 0.201, "step": 2983 }, { "epoch": 1.4129624149156554, "grad_norm": 1.4253736734390259, "learning_rate": 1.1414404292064593e-05, "loss": 0.2153, "step": 2984 }, { "epoch": 1.4134359277892867, "grad_norm": 1.133834719657898, "learning_rate": 1.1409341418903725e-05, "loss": 0.2283, "step": 2985 }, { "epoch": 1.413909440662918, "grad_norm": 1.4796968698501587, "learning_rate": 1.1404278177143202e-05, "loss": 0.2251, "step": 2986 }, { "epoch": 1.4143829535365493, "grad_norm": 1.3336002826690674, "learning_rate": 1.139921456810727e-05, "loss": 0.206, "step": 2987 }, { "epoch": 1.4148564664101806, "grad_norm": 1.420716404914856, "learning_rate": 1.1394150593120268e-05, "loss": 0.2226, "step": 2988 }, { "epoch": 1.4153299792838117, "grad_norm": 1.5567362308502197, "learning_rate": 1.1389086253506626e-05, "loss": 0.2219, "step": 2989 }, { "epoch": 1.4158034921574432, "grad_norm": 0.9303296804428101, "learning_rate": 1.1384021550590878e-05, "loss": 0.2277, "step": 2990 }, { "epoch": 1.4162770050310742, "grad_norm": 1.0575077533721924, "learning_rate": 1.1378956485697644e-05, "loss": 0.2224, "step": 2991 }, { "epoch": 1.4167505179047055, "grad_norm": 1.0305496454238892, "learning_rate": 1.1373891060151643e-05, "loss": 0.2341, "step": 2992 }, { "epoch": 1.4172240307783368, "grad_norm": 1.0244580507278442, "learning_rate": 1.1368825275277689e-05, "loss": 0.2011, "step": 2993 }, { "epoch": 1.417697543651968, "grad_norm": 1.31196928024292, "learning_rate": 1.136375913240069e-05, "loss": 0.2253, "step": 2994 }, { "epoch": 1.4181710565255994, "grad_norm": 1.118857502937317, "learning_rate": 1.1358692632845645e-05, "loss": 0.2278, "step": 2995 }, { "epoch": 1.4186445693992304, "grad_norm": 1.114423155784607, "learning_rate": 1.1353625777937652e-05, "loss": 0.2004, "step": 2996 }, { "epoch": 1.4191180822728617, "grad_norm": 1.2260422706604004, "learning_rate": 1.1348558569001896e-05, "loss": 0.2056, "step": 2997 }, { "epoch": 1.419591595146493, "grad_norm": 1.8472795486450195, "learning_rate": 1.1343491007363652e-05, "loss": 0.2181, "step": 2998 }, { "epoch": 1.4200651080201243, "grad_norm": 1.8274016380310059, "learning_rate": 1.1338423094348299e-05, "loss": 0.2138, "step": 2999 }, { "epoch": 1.4205386208937556, "grad_norm": 1.1858888864517212, "learning_rate": 1.1333354831281295e-05, "loss": 0.2022, "step": 3000 }, { "epoch": 1.4210121337673867, "grad_norm": 1.1788923740386963, "learning_rate": 1.13282862194882e-05, "loss": 0.1814, "step": 3001 }, { "epoch": 1.4214856466410182, "grad_norm": 1.1308039426803589, "learning_rate": 1.132321726029466e-05, "loss": 0.2167, "step": 3002 }, { "epoch": 1.4219591595146492, "grad_norm": 1.0506905317306519, "learning_rate": 1.1318147955026408e-05, "loss": 0.2011, "step": 3003 }, { "epoch": 1.4224326723882805, "grad_norm": 1.2325364351272583, "learning_rate": 1.1313078305009278e-05, "loss": 0.1922, "step": 3004 }, { "epoch": 1.4229061852619118, "grad_norm": 1.2299723625183105, "learning_rate": 1.130800831156918e-05, "loss": 0.2292, "step": 3005 }, { "epoch": 1.423379698135543, "grad_norm": 1.5796177387237549, "learning_rate": 1.1302937976032131e-05, "loss": 0.2311, "step": 3006 }, { "epoch": 1.4238532110091744, "grad_norm": 1.015859603881836, "learning_rate": 1.1297867299724223e-05, "loss": 0.2398, "step": 3007 }, { "epoch": 1.4243267238828055, "grad_norm": 1.0847722291946411, "learning_rate": 1.1292796283971645e-05, "loss": 0.2105, "step": 3008 }, { "epoch": 1.4248002367564367, "grad_norm": 1.5696632862091064, "learning_rate": 1.1287724930100668e-05, "loss": 0.22, "step": 3009 }, { "epoch": 1.425273749630068, "grad_norm": 1.0364569425582886, "learning_rate": 1.128265323943766e-05, "loss": 0.2086, "step": 3010 }, { "epoch": 1.4257472625036993, "grad_norm": 1.0130901336669922, "learning_rate": 1.127758121330907e-05, "loss": 0.1888, "step": 3011 }, { "epoch": 1.4262207753773306, "grad_norm": 1.5478230714797974, "learning_rate": 1.127250885304144e-05, "loss": 0.2325, "step": 3012 }, { "epoch": 1.426694288250962, "grad_norm": 1.2723689079284668, "learning_rate": 1.1267436159961393e-05, "loss": 0.2187, "step": 3013 }, { "epoch": 1.4271678011245932, "grad_norm": 1.111171007156372, "learning_rate": 1.1262363135395648e-05, "loss": 0.215, "step": 3014 }, { "epoch": 1.4276413139982242, "grad_norm": 1.3690720796585083, "learning_rate": 1.1257289780671004e-05, "loss": 0.2206, "step": 3015 }, { "epoch": 1.4281148268718555, "grad_norm": 1.2229105234146118, "learning_rate": 1.1252216097114344e-05, "loss": 0.2297, "step": 3016 }, { "epoch": 1.4285883397454868, "grad_norm": 1.8208054304122925, "learning_rate": 1.124714208605265e-05, "loss": 0.2429, "step": 3017 }, { "epoch": 1.4290618526191181, "grad_norm": 1.015411615371704, "learning_rate": 1.1242067748812968e-05, "loss": 0.2145, "step": 3018 }, { "epoch": 1.4295353654927494, "grad_norm": 1.8651050329208374, "learning_rate": 1.123699308672245e-05, "loss": 0.2222, "step": 3019 }, { "epoch": 1.4300088783663805, "grad_norm": 2.2374584674835205, "learning_rate": 1.1231918101108325e-05, "loss": 0.2141, "step": 3020 }, { "epoch": 1.430482391240012, "grad_norm": 1.1194039583206177, "learning_rate": 1.1226842793297904e-05, "loss": 0.212, "step": 3021 }, { "epoch": 1.430955904113643, "grad_norm": 1.2247196435928345, "learning_rate": 1.122176716461859e-05, "loss": 0.2164, "step": 3022 }, { "epoch": 1.4314294169872743, "grad_norm": 1.0762709379196167, "learning_rate": 1.1216691216397854e-05, "loss": 0.2328, "step": 3023 }, { "epoch": 1.4319029298609056, "grad_norm": 1.0583688020706177, "learning_rate": 1.1211614949963274e-05, "loss": 0.1974, "step": 3024 }, { "epoch": 1.432376442734537, "grad_norm": 1.0842574834823608, "learning_rate": 1.1206538366642491e-05, "loss": 0.1934, "step": 3025 }, { "epoch": 1.4328499556081682, "grad_norm": 1.6864724159240723, "learning_rate": 1.1201461467763238e-05, "loss": 0.227, "step": 3026 }, { "epoch": 1.4333234684817993, "grad_norm": 1.0597320795059204, "learning_rate": 1.1196384254653328e-05, "loss": 0.2115, "step": 3027 }, { "epoch": 1.4337969813554305, "grad_norm": 1.4735287427902222, "learning_rate": 1.1191306728640659e-05, "loss": 0.2357, "step": 3028 }, { "epoch": 1.4342704942290618, "grad_norm": 1.1243826150894165, "learning_rate": 1.1186228891053205e-05, "loss": 0.2247, "step": 3029 }, { "epoch": 1.4347440071026931, "grad_norm": 1.8924311399459839, "learning_rate": 1.1181150743219031e-05, "loss": 0.2168, "step": 3030 }, { "epoch": 1.4352175199763244, "grad_norm": 1.0397160053253174, "learning_rate": 1.1176072286466273e-05, "loss": 0.2292, "step": 3031 }, { "epoch": 1.4356910328499555, "grad_norm": 1.515010118484497, "learning_rate": 1.1170993522123155e-05, "loss": 0.236, "step": 3032 }, { "epoch": 1.436164545723587, "grad_norm": 1.094772458076477, "learning_rate": 1.1165914451517978e-05, "loss": 0.1897, "step": 3033 }, { "epoch": 1.436638058597218, "grad_norm": 1.71537446975708, "learning_rate": 1.1160835075979124e-05, "loss": 0.1892, "step": 3034 }, { "epoch": 1.4371115714708493, "grad_norm": 1.3985260725021362, "learning_rate": 1.1155755396835057e-05, "loss": 0.2327, "step": 3035 }, { "epoch": 1.4375850843444806, "grad_norm": 0.8228690028190613, "learning_rate": 1.1150675415414314e-05, "loss": 0.2021, "step": 3036 }, { "epoch": 1.438058597218112, "grad_norm": 0.931536078453064, "learning_rate": 1.1145595133045517e-05, "loss": 0.2083, "step": 3037 }, { "epoch": 1.4385321100917432, "grad_norm": 1.1511605978012085, "learning_rate": 1.1140514551057361e-05, "loss": 0.201, "step": 3038 }, { "epoch": 1.4390056229653743, "grad_norm": 1.2221344709396362, "learning_rate": 1.113543367077863e-05, "loss": 0.2049, "step": 3039 }, { "epoch": 1.4394791358390056, "grad_norm": 1.1700915098190308, "learning_rate": 1.1130352493538176e-05, "loss": 0.208, "step": 3040 }, { "epoch": 1.4399526487126368, "grad_norm": 0.9786930084228516, "learning_rate": 1.1125271020664931e-05, "loss": 0.2006, "step": 3041 }, { "epoch": 1.4404261615862681, "grad_norm": 1.0784828662872314, "learning_rate": 1.1120189253487912e-05, "loss": 0.2204, "step": 3042 }, { "epoch": 1.4408996744598994, "grad_norm": 1.8664531707763672, "learning_rate": 1.1115107193336194e-05, "loss": 0.1987, "step": 3043 }, { "epoch": 1.4413731873335307, "grad_norm": 1.3436121940612793, "learning_rate": 1.111002484153895e-05, "loss": 0.2301, "step": 3044 }, { "epoch": 1.441846700207162, "grad_norm": 1.8887630701065063, "learning_rate": 1.1104942199425418e-05, "loss": 0.2189, "step": 3045 }, { "epoch": 1.442320213080793, "grad_norm": 1.4706549644470215, "learning_rate": 1.1099859268324911e-05, "loss": 0.2382, "step": 3046 }, { "epoch": 1.4427937259544243, "grad_norm": 1.3858911991119385, "learning_rate": 1.1094776049566822e-05, "loss": 0.2085, "step": 3047 }, { "epoch": 1.4432672388280556, "grad_norm": 1.5077974796295166, "learning_rate": 1.1089692544480622e-05, "loss": 0.2127, "step": 3048 }, { "epoch": 1.443740751701687, "grad_norm": 1.5535578727722168, "learning_rate": 1.1084608754395846e-05, "loss": 0.2164, "step": 3049 }, { "epoch": 1.4442142645753182, "grad_norm": 1.3303495645523071, "learning_rate": 1.1079524680642115e-05, "loss": 0.2167, "step": 3050 }, { "epoch": 1.4446877774489493, "grad_norm": 1.0496814250946045, "learning_rate": 1.1074440324549118e-05, "loss": 0.2073, "step": 3051 }, { "epoch": 1.4451612903225808, "grad_norm": 2.0565896034240723, "learning_rate": 1.1069355687446624e-05, "loss": 0.2019, "step": 3052 }, { "epoch": 1.4456348031962118, "grad_norm": 1.2639986276626587, "learning_rate": 1.1064270770664463e-05, "loss": 0.2108, "step": 3053 }, { "epoch": 1.4461083160698431, "grad_norm": 1.359134554862976, "learning_rate": 1.1059185575532548e-05, "loss": 0.2302, "step": 3054 }, { "epoch": 1.4465818289434744, "grad_norm": 1.306850552558899, "learning_rate": 1.1054100103380862e-05, "loss": 0.228, "step": 3055 }, { "epoch": 1.4470553418171057, "grad_norm": 1.1254212856292725, "learning_rate": 1.1049014355539466e-05, "loss": 0.2271, "step": 3056 }, { "epoch": 1.447528854690737, "grad_norm": 1.1044193506240845, "learning_rate": 1.1043928333338486e-05, "loss": 0.2116, "step": 3057 }, { "epoch": 1.448002367564368, "grad_norm": 1.277251958847046, "learning_rate": 1.1038842038108117e-05, "loss": 0.2246, "step": 3058 }, { "epoch": 1.4484758804379994, "grad_norm": 1.157020092010498, "learning_rate": 1.1033755471178638e-05, "loss": 0.2379, "step": 3059 }, { "epoch": 1.4489493933116306, "grad_norm": 1.0917078256607056, "learning_rate": 1.102866863388039e-05, "loss": 0.2357, "step": 3060 }, { "epoch": 1.449422906185262, "grad_norm": 1.5090773105621338, "learning_rate": 1.1023581527543784e-05, "loss": 0.2085, "step": 3061 }, { "epoch": 1.4498964190588932, "grad_norm": 1.000868320465088, "learning_rate": 1.1018494153499302e-05, "loss": 0.2221, "step": 3062 }, { "epoch": 1.4503699319325243, "grad_norm": 1.2444463968276978, "learning_rate": 1.10134065130775e-05, "loss": 0.2355, "step": 3063 }, { "epoch": 1.4508434448061558, "grad_norm": 1.5682604312896729, "learning_rate": 1.1008318607609e-05, "loss": 0.2147, "step": 3064 }, { "epoch": 1.4513169576797869, "grad_norm": 1.108665108680725, "learning_rate": 1.1003230438424498e-05, "loss": 0.205, "step": 3065 }, { "epoch": 1.4517904705534181, "grad_norm": 1.2267842292785645, "learning_rate": 1.0998142006854754e-05, "loss": 0.2354, "step": 3066 }, { "epoch": 1.4522639834270494, "grad_norm": 1.2378571033477783, "learning_rate": 1.0993053314230593e-05, "loss": 0.2122, "step": 3067 }, { "epoch": 1.4527374963006807, "grad_norm": 1.5879398584365845, "learning_rate": 1.0987964361882921e-05, "loss": 0.2425, "step": 3068 }, { "epoch": 1.453211009174312, "grad_norm": 0.9649191498756409, "learning_rate": 1.0982875151142702e-05, "loss": 0.2217, "step": 3069 }, { "epoch": 1.453684522047943, "grad_norm": 1.1818419694900513, "learning_rate": 1.097778568334097e-05, "loss": 0.1961, "step": 3070 }, { "epoch": 1.4541580349215744, "grad_norm": 1.4016741514205933, "learning_rate": 1.0972695959808822e-05, "loss": 0.1939, "step": 3071 }, { "epoch": 1.4546315477952056, "grad_norm": 1.1574840545654297, "learning_rate": 1.096760598187743e-05, "loss": 0.2055, "step": 3072 }, { "epoch": 1.455105060668837, "grad_norm": 2.2353291511535645, "learning_rate": 1.096251575087803e-05, "loss": 0.2041, "step": 3073 }, { "epoch": 1.4555785735424682, "grad_norm": 1.6579928398132324, "learning_rate": 1.095742526814192e-05, "loss": 0.1982, "step": 3074 }, { "epoch": 1.4560520864160995, "grad_norm": 1.2612777948379517, "learning_rate": 1.0952334535000466e-05, "loss": 0.2086, "step": 3075 }, { "epoch": 1.4565255992897308, "grad_norm": 1.2440173625946045, "learning_rate": 1.0947243552785103e-05, "loss": 0.2239, "step": 3076 }, { "epoch": 1.4569991121633619, "grad_norm": 1.0874110460281372, "learning_rate": 1.0942152322827326e-05, "loss": 0.2099, "step": 3077 }, { "epoch": 1.4574726250369932, "grad_norm": 1.3692322969436646, "learning_rate": 1.0937060846458701e-05, "loss": 0.2069, "step": 3078 }, { "epoch": 1.4579461379106244, "grad_norm": 1.1790695190429688, "learning_rate": 1.0931969125010848e-05, "loss": 0.2467, "step": 3079 }, { "epoch": 1.4584196507842557, "grad_norm": 1.1012235879898071, "learning_rate": 1.0926877159815463e-05, "loss": 0.2214, "step": 3080 }, { "epoch": 1.458893163657887, "grad_norm": 1.178139328956604, "learning_rate": 1.0921784952204299e-05, "loss": 0.2138, "step": 3081 }, { "epoch": 1.459366676531518, "grad_norm": 1.5324974060058594, "learning_rate": 1.0916692503509172e-05, "loss": 0.234, "step": 3082 }, { "epoch": 1.4598401894051496, "grad_norm": 1.391147255897522, "learning_rate": 1.0911599815061966e-05, "loss": 0.2091, "step": 3083 }, { "epoch": 1.4603137022787807, "grad_norm": 1.076026439666748, "learning_rate": 1.0906506888194621e-05, "loss": 0.2073, "step": 3084 }, { "epoch": 1.460787215152412, "grad_norm": 1.4098458290100098, "learning_rate": 1.0901413724239144e-05, "loss": 0.2358, "step": 3085 }, { "epoch": 1.4612607280260432, "grad_norm": 1.8523685932159424, "learning_rate": 1.0896320324527606e-05, "loss": 0.2471, "step": 3086 }, { "epoch": 1.4617342408996745, "grad_norm": 1.161373496055603, "learning_rate": 1.0891226690392136e-05, "loss": 0.2168, "step": 3087 }, { "epoch": 1.4622077537733058, "grad_norm": 1.116976261138916, "learning_rate": 1.0886132823164919e-05, "loss": 0.2141, "step": 3088 }, { "epoch": 1.4626812666469369, "grad_norm": 0.9809169769287109, "learning_rate": 1.0881038724178211e-05, "loss": 0.2297, "step": 3089 }, { "epoch": 1.4631547795205682, "grad_norm": 1.2216544151306152, "learning_rate": 1.0875944394764325e-05, "loss": 0.2317, "step": 3090 }, { "epoch": 1.4636282923941994, "grad_norm": 1.036920189857483, "learning_rate": 1.0870849836255632e-05, "loss": 0.2307, "step": 3091 }, { "epoch": 1.4641018052678307, "grad_norm": 1.1497174501419067, "learning_rate": 1.0865755049984568e-05, "loss": 0.2114, "step": 3092 }, { "epoch": 1.464575318141462, "grad_norm": 1.21004056930542, "learning_rate": 1.0860660037283622e-05, "loss": 0.2054, "step": 3093 }, { "epoch": 1.465048831015093, "grad_norm": 2.018868923187256, "learning_rate": 1.0855564799485345e-05, "loss": 0.2074, "step": 3094 }, { "epoch": 1.4655223438887246, "grad_norm": 1.4326393604278564, "learning_rate": 1.0850469337922348e-05, "loss": 0.2179, "step": 3095 }, { "epoch": 1.4659958567623557, "grad_norm": 1.4479272365570068, "learning_rate": 1.0845373653927303e-05, "loss": 0.2191, "step": 3096 }, { "epoch": 1.466469369635987, "grad_norm": 1.525227665901184, "learning_rate": 1.084027774883293e-05, "loss": 0.2227, "step": 3097 }, { "epoch": 1.4669428825096182, "grad_norm": 1.1272501945495605, "learning_rate": 1.0835181623972022e-05, "loss": 0.2156, "step": 3098 }, { "epoch": 1.4674163953832495, "grad_norm": 1.327453851699829, "learning_rate": 1.0830085280677415e-05, "loss": 0.2185, "step": 3099 }, { "epoch": 1.4678899082568808, "grad_norm": 1.1839863061904907, "learning_rate": 1.082498872028201e-05, "loss": 0.2343, "step": 3100 }, { "epoch": 1.4683634211305119, "grad_norm": 0.9597224593162537, "learning_rate": 1.0819891944118768e-05, "loss": 0.193, "step": 3101 }, { "epoch": 1.4688369340041432, "grad_norm": 1.457085371017456, "learning_rate": 1.0814794953520695e-05, "loss": 0.2102, "step": 3102 }, { "epoch": 1.4693104468777745, "grad_norm": 1.054195761680603, "learning_rate": 1.0809697749820862e-05, "loss": 0.2354, "step": 3103 }, { "epoch": 1.4697839597514057, "grad_norm": 1.249380111694336, "learning_rate": 1.0804600334352398e-05, "loss": 0.2031, "step": 3104 }, { "epoch": 1.470257472625037, "grad_norm": 1.4922364950180054, "learning_rate": 1.0799502708448475e-05, "loss": 0.2107, "step": 3105 }, { "epoch": 1.4707309854986683, "grad_norm": 1.830451488494873, "learning_rate": 1.079440487344233e-05, "loss": 0.2185, "step": 3106 }, { "epoch": 1.4712044983722996, "grad_norm": 1.072477102279663, "learning_rate": 1.0789306830667256e-05, "loss": 0.2233, "step": 3107 }, { "epoch": 1.4716780112459307, "grad_norm": 1.220098853111267, "learning_rate": 1.0784208581456595e-05, "loss": 0.2268, "step": 3108 }, { "epoch": 1.472151524119562, "grad_norm": 1.263378381729126, "learning_rate": 1.0779110127143746e-05, "loss": 0.2277, "step": 3109 }, { "epoch": 1.4726250369931932, "grad_norm": 1.1181433200836182, "learning_rate": 1.0774011469062157e-05, "loss": 0.2124, "step": 3110 }, { "epoch": 1.4730985498668245, "grad_norm": 0.9625453352928162, "learning_rate": 1.0768912608545336e-05, "loss": 0.2112, "step": 3111 }, { "epoch": 1.4735720627404558, "grad_norm": 1.250014066696167, "learning_rate": 1.0763813546926842e-05, "loss": 0.2388, "step": 3112 }, { "epoch": 1.474045575614087, "grad_norm": 1.1710668802261353, "learning_rate": 1.0758714285540281e-05, "loss": 0.2245, "step": 3113 }, { "epoch": 1.4745190884877182, "grad_norm": 1.6824264526367188, "learning_rate": 1.0753614825719321e-05, "loss": 0.1949, "step": 3114 }, { "epoch": 1.4749926013613495, "grad_norm": 1.1846730709075928, "learning_rate": 1.0748515168797673e-05, "loss": 0.2139, "step": 3115 }, { "epoch": 1.4754661142349808, "grad_norm": 1.0218571424484253, "learning_rate": 1.07434153161091e-05, "loss": 0.1977, "step": 3116 }, { "epoch": 1.475939627108612, "grad_norm": 1.459490418434143, "learning_rate": 1.0738315268987424e-05, "loss": 0.2223, "step": 3117 }, { "epoch": 1.4764131399822433, "grad_norm": 1.3154277801513672, "learning_rate": 1.0733215028766515e-05, "loss": 0.1916, "step": 3118 }, { "epoch": 1.4768866528558746, "grad_norm": 1.44888436794281, "learning_rate": 1.0728114596780287e-05, "loss": 0.1832, "step": 3119 }, { "epoch": 1.4773601657295057, "grad_norm": 1.1630711555480957, "learning_rate": 1.072301397436271e-05, "loss": 0.2025, "step": 3120 }, { "epoch": 1.477833678603137, "grad_norm": 1.9314302206039429, "learning_rate": 1.0717913162847803e-05, "loss": 0.1954, "step": 3121 }, { "epoch": 1.4783071914767683, "grad_norm": 2.154477119445801, "learning_rate": 1.0712812163569638e-05, "loss": 0.2119, "step": 3122 }, { "epoch": 1.4787807043503995, "grad_norm": 1.4404516220092773, "learning_rate": 1.0707710977862322e-05, "loss": 0.2379, "step": 3123 }, { "epoch": 1.4792542172240308, "grad_norm": 1.926571249961853, "learning_rate": 1.070260960706003e-05, "loss": 0.2268, "step": 3124 }, { "epoch": 1.479727730097662, "grad_norm": 1.082242488861084, "learning_rate": 1.0697508052496975e-05, "loss": 0.2129, "step": 3125 }, { "epoch": 1.4802012429712934, "grad_norm": 1.2488141059875488, "learning_rate": 1.0692406315507416e-05, "loss": 0.225, "step": 3126 }, { "epoch": 1.4806747558449245, "grad_norm": 1.4362695217132568, "learning_rate": 1.0687304397425666e-05, "loss": 0.2294, "step": 3127 }, { "epoch": 1.4811482687185558, "grad_norm": 1.3312585353851318, "learning_rate": 1.068220229958608e-05, "loss": 0.2222, "step": 3128 }, { "epoch": 1.481621781592187, "grad_norm": 1.7034131288528442, "learning_rate": 1.0677100023323064e-05, "loss": 0.2519, "step": 3129 }, { "epoch": 1.4820952944658183, "grad_norm": 1.3612613677978516, "learning_rate": 1.067199756997107e-05, "loss": 0.2048, "step": 3130 }, { "epoch": 1.4825688073394496, "grad_norm": 1.0770941972732544, "learning_rate": 1.0666894940864595e-05, "loss": 0.2233, "step": 3131 }, { "epoch": 1.4830423202130807, "grad_norm": 1.6613733768463135, "learning_rate": 1.0661792137338183e-05, "loss": 0.2034, "step": 3132 }, { "epoch": 1.483515833086712, "grad_norm": 1.7866359949111938, "learning_rate": 1.0656689160726418e-05, "loss": 0.2307, "step": 3133 }, { "epoch": 1.4839893459603433, "grad_norm": 0.9103512167930603, "learning_rate": 1.065158601236394e-05, "loss": 0.2262, "step": 3134 }, { "epoch": 1.4844628588339746, "grad_norm": 1.5666887760162354, "learning_rate": 1.0646482693585427e-05, "loss": 0.1796, "step": 3135 }, { "epoch": 1.4849363717076058, "grad_norm": 1.1646493673324585, "learning_rate": 1.0641379205725599e-05, "loss": 0.2278, "step": 3136 }, { "epoch": 1.4854098845812371, "grad_norm": 1.1764169931411743, "learning_rate": 1.063627555011923e-05, "loss": 0.1983, "step": 3137 }, { "epoch": 1.4858833974548684, "grad_norm": 1.3643893003463745, "learning_rate": 1.0631171728101129e-05, "loss": 0.1969, "step": 3138 }, { "epoch": 1.4863569103284995, "grad_norm": 1.2476388216018677, "learning_rate": 1.0626067741006155e-05, "loss": 0.1941, "step": 3139 }, { "epoch": 1.4868304232021308, "grad_norm": 1.1567429304122925, "learning_rate": 1.0620963590169197e-05, "loss": 0.2146, "step": 3140 }, { "epoch": 1.487303936075762, "grad_norm": 1.037000060081482, "learning_rate": 1.0615859276925207e-05, "loss": 0.2062, "step": 3141 }, { "epoch": 1.4877774489493933, "grad_norm": 1.5151640176773071, "learning_rate": 1.0610754802609162e-05, "loss": 0.2161, "step": 3142 }, { "epoch": 1.4882509618230246, "grad_norm": 1.4791309833526611, "learning_rate": 1.060565016855609e-05, "loss": 0.2273, "step": 3143 }, { "epoch": 1.4887244746966557, "grad_norm": 1.1170471906661987, "learning_rate": 1.0600545376101061e-05, "loss": 0.2279, "step": 3144 }, { "epoch": 1.489197987570287, "grad_norm": 1.664336085319519, "learning_rate": 1.0595440426579182e-05, "loss": 0.2168, "step": 3145 }, { "epoch": 1.4896715004439183, "grad_norm": 1.1722487211227417, "learning_rate": 1.0590335321325603e-05, "loss": 0.2215, "step": 3146 }, { "epoch": 1.4901450133175496, "grad_norm": 1.0749174356460571, "learning_rate": 1.0585230061675515e-05, "loss": 0.2348, "step": 3147 }, { "epoch": 1.4906185261911808, "grad_norm": 1.5461552143096924, "learning_rate": 1.0580124648964153e-05, "loss": 0.2247, "step": 3148 }, { "epoch": 1.4910920390648121, "grad_norm": 0.9260118007659912, "learning_rate": 1.0575019084526785e-05, "loss": 0.2142, "step": 3149 }, { "epoch": 1.4915655519384434, "grad_norm": 1.4796782732009888, "learning_rate": 1.0569913369698722e-05, "loss": 0.2071, "step": 3150 }, { "epoch": 1.4920390648120745, "grad_norm": 1.0759540796279907, "learning_rate": 1.0564807505815316e-05, "loss": 0.1932, "step": 3151 }, { "epoch": 1.4925125776857058, "grad_norm": 1.082331895828247, "learning_rate": 1.0559701494211953e-05, "loss": 0.212, "step": 3152 }, { "epoch": 1.492986090559337, "grad_norm": 1.0994702577590942, "learning_rate": 1.0554595336224065e-05, "loss": 0.2176, "step": 3153 }, { "epoch": 1.4934596034329684, "grad_norm": 0.989517092704773, "learning_rate": 1.0549489033187116e-05, "loss": 0.2085, "step": 3154 }, { "epoch": 1.4939331163065996, "grad_norm": 1.2326009273529053, "learning_rate": 1.0544382586436613e-05, "loss": 0.1911, "step": 3155 }, { "epoch": 1.4944066291802307, "grad_norm": 1.4476752281188965, "learning_rate": 1.0539275997308099e-05, "loss": 0.2365, "step": 3156 }, { "epoch": 1.4948801420538622, "grad_norm": 1.07553231716156, "learning_rate": 1.0534169267137146e-05, "loss": 0.2231, "step": 3157 }, { "epoch": 1.4953536549274933, "grad_norm": 1.2448638677597046, "learning_rate": 1.0529062397259375e-05, "loss": 0.2211, "step": 3158 }, { "epoch": 1.4958271678011246, "grad_norm": 1.1470022201538086, "learning_rate": 1.0523955389010438e-05, "loss": 0.2268, "step": 3159 }, { "epoch": 1.4963006806747559, "grad_norm": 1.2596864700317383, "learning_rate": 1.0518848243726026e-05, "loss": 0.2212, "step": 3160 }, { "epoch": 1.4967741935483871, "grad_norm": 1.1401036977767944, "learning_rate": 1.0513740962741861e-05, "loss": 0.2301, "step": 3161 }, { "epoch": 1.4972477064220184, "grad_norm": 1.7496001720428467, "learning_rate": 1.05086335473937e-05, "loss": 0.2162, "step": 3162 }, { "epoch": 1.4977212192956495, "grad_norm": 1.3748828172683716, "learning_rate": 1.0503525999017343e-05, "loss": 0.2101, "step": 3163 }, { "epoch": 1.4981947321692808, "grad_norm": 1.002260446548462, "learning_rate": 1.049841831894862e-05, "loss": 0.2274, "step": 3164 }, { "epoch": 1.498668245042912, "grad_norm": 1.6265779733657837, "learning_rate": 1.0493310508523393e-05, "loss": 0.2283, "step": 3165 }, { "epoch": 1.4991417579165434, "grad_norm": 1.0258605480194092, "learning_rate": 1.0488202569077564e-05, "loss": 0.199, "step": 3166 }, { "epoch": 1.4996152707901746, "grad_norm": 1.2962850332260132, "learning_rate": 1.0483094501947062e-05, "loss": 0.2447, "step": 3167 }, { "epoch": 1.5000887836638057, "grad_norm": 0.9795529246330261, "learning_rate": 1.0477986308467851e-05, "loss": 0.2269, "step": 3168 }, { "epoch": 1.5005622965374372, "grad_norm": 1.3065211772918701, "learning_rate": 1.0472877989975933e-05, "loss": 0.2304, "step": 3169 }, { "epoch": 1.5010358094110683, "grad_norm": 2.2391700744628906, "learning_rate": 1.046776954780734e-05, "loss": 0.2273, "step": 3170 }, { "epoch": 1.5015093222846996, "grad_norm": 1.249174952507019, "learning_rate": 1.0462660983298132e-05, "loss": 0.1968, "step": 3171 }, { "epoch": 1.5019828351583309, "grad_norm": 1.0384050607681274, "learning_rate": 1.045755229778441e-05, "loss": 0.2087, "step": 3172 }, { "epoch": 1.5024563480319622, "grad_norm": 1.1867655515670776, "learning_rate": 1.0452443492602296e-05, "loss": 0.2292, "step": 3173 }, { "epoch": 1.5029298609055934, "grad_norm": 1.0175690650939941, "learning_rate": 1.0447334569087953e-05, "loss": 0.2039, "step": 3174 }, { "epoch": 1.5034033737792245, "grad_norm": 1.395185947418213, "learning_rate": 1.0442225528577568e-05, "loss": 0.2102, "step": 3175 }, { "epoch": 1.503876886652856, "grad_norm": 1.4154939651489258, "learning_rate": 1.0437116372407364e-05, "loss": 0.2346, "step": 3176 }, { "epoch": 1.504350399526487, "grad_norm": 1.1730825901031494, "learning_rate": 1.0432007101913588e-05, "loss": 0.1973, "step": 3177 }, { "epoch": 1.5048239124001184, "grad_norm": 1.7297412157058716, "learning_rate": 1.0426897718432523e-05, "loss": 0.216, "step": 3178 }, { "epoch": 1.5052974252737497, "grad_norm": 0.9606209993362427, "learning_rate": 1.0421788223300478e-05, "loss": 0.2313, "step": 3179 }, { "epoch": 1.5057709381473807, "grad_norm": 1.1701611280441284, "learning_rate": 1.041667861785379e-05, "loss": 0.208, "step": 3180 }, { "epoch": 1.5062444510210122, "grad_norm": 1.2683926820755005, "learning_rate": 1.0411568903428832e-05, "loss": 0.229, "step": 3181 }, { "epoch": 1.5067179638946433, "grad_norm": 1.734331727027893, "learning_rate": 1.0406459081361998e-05, "loss": 0.2352, "step": 3182 }, { "epoch": 1.5071914767682746, "grad_norm": 1.5925458669662476, "learning_rate": 1.0401349152989713e-05, "loss": 0.2294, "step": 3183 }, { "epoch": 1.5076649896419059, "grad_norm": 1.7187014818191528, "learning_rate": 1.0396239119648426e-05, "loss": 0.2182, "step": 3184 }, { "epoch": 1.5081385025155372, "grad_norm": 1.6393986940383911, "learning_rate": 1.0391128982674621e-05, "loss": 0.2139, "step": 3185 }, { "epoch": 1.5086120153891684, "grad_norm": 1.1025725603103638, "learning_rate": 1.0386018743404805e-05, "loss": 0.2337, "step": 3186 }, { "epoch": 1.5090855282627995, "grad_norm": 1.29452383518219, "learning_rate": 1.0380908403175509e-05, "loss": 0.1877, "step": 3187 }, { "epoch": 1.509559041136431, "grad_norm": 2.088343858718872, "learning_rate": 1.0375797963323295e-05, "loss": 0.2016, "step": 3188 }, { "epoch": 1.510032554010062, "grad_norm": 1.1434667110443115, "learning_rate": 1.037068742518475e-05, "loss": 0.2022, "step": 3189 }, { "epoch": 1.5105060668836934, "grad_norm": 1.0598808526992798, "learning_rate": 1.0365576790096486e-05, "loss": 0.2173, "step": 3190 }, { "epoch": 1.5109795797573247, "grad_norm": 1.190727710723877, "learning_rate": 1.0360466059395138e-05, "loss": 0.2441, "step": 3191 }, { "epoch": 1.511453092630956, "grad_norm": 1.5847951173782349, "learning_rate": 1.0355355234417369e-05, "loss": 0.2008, "step": 3192 }, { "epoch": 1.5119266055045872, "grad_norm": 1.251030445098877, "learning_rate": 1.035024431649987e-05, "loss": 0.2403, "step": 3193 }, { "epoch": 1.5124001183782183, "grad_norm": 1.6456767320632935, "learning_rate": 1.0345133306979342e-05, "loss": 0.2249, "step": 3194 }, { "epoch": 1.5128736312518498, "grad_norm": 1.1858950853347778, "learning_rate": 1.0340022207192532e-05, "loss": 0.2287, "step": 3195 }, { "epoch": 1.5133471441254809, "grad_norm": 1.4092812538146973, "learning_rate": 1.0334911018476194e-05, "loss": 0.2057, "step": 3196 }, { "epoch": 1.5138206569991122, "grad_norm": 1.0549933910369873, "learning_rate": 1.0329799742167108e-05, "loss": 0.1973, "step": 3197 }, { "epoch": 1.5142941698727435, "grad_norm": 1.5164347887039185, "learning_rate": 1.0324688379602083e-05, "loss": 0.2488, "step": 3198 }, { "epoch": 1.5147676827463745, "grad_norm": 1.8977534770965576, "learning_rate": 1.0319576932117945e-05, "loss": 0.2014, "step": 3199 }, { "epoch": 1.515241195620006, "grad_norm": 1.5914242267608643, "learning_rate": 1.0314465401051544e-05, "loss": 0.2275, "step": 3200 }, { "epoch": 1.515714708493637, "grad_norm": 1.1438359022140503, "learning_rate": 1.030935378773975e-05, "loss": 0.2144, "step": 3201 }, { "epoch": 1.5161882213672684, "grad_norm": 1.1040819883346558, "learning_rate": 1.0304242093519456e-05, "loss": 0.218, "step": 3202 }, { "epoch": 1.5166617342408997, "grad_norm": 1.5802520513534546, "learning_rate": 1.0299130319727576e-05, "loss": 0.2124, "step": 3203 }, { "epoch": 1.517135247114531, "grad_norm": 1.4523437023162842, "learning_rate": 1.029401846770105e-05, "loss": 0.2154, "step": 3204 }, { "epoch": 1.5176087599881622, "grad_norm": 1.1811856031417847, "learning_rate": 1.0288906538776831e-05, "loss": 0.22, "step": 3205 }, { "epoch": 1.5180822728617933, "grad_norm": 1.3491801023483276, "learning_rate": 1.0283794534291891e-05, "loss": 0.2113, "step": 3206 }, { "epoch": 1.5185557857354248, "grad_norm": 1.7651821374893188, "learning_rate": 1.027868245558323e-05, "loss": 0.2177, "step": 3207 }, { "epoch": 1.519029298609056, "grad_norm": 1.385327696800232, "learning_rate": 1.0273570303987859e-05, "loss": 0.2135, "step": 3208 }, { "epoch": 1.5195028114826872, "grad_norm": 1.7392898797988892, "learning_rate": 1.0268458080842815e-05, "loss": 0.2174, "step": 3209 }, { "epoch": 1.5199763243563185, "grad_norm": 1.3350859880447388, "learning_rate": 1.0263345787485156e-05, "loss": 0.2226, "step": 3210 }, { "epoch": 1.5204498372299495, "grad_norm": 1.0405269861221313, "learning_rate": 1.025823342525194e-05, "loss": 0.2163, "step": 3211 }, { "epoch": 1.520923350103581, "grad_norm": 1.7307740449905396, "learning_rate": 1.0253120995480264e-05, "loss": 0.2056, "step": 3212 }, { "epoch": 1.521396862977212, "grad_norm": 1.300768494606018, "learning_rate": 1.024800849950723e-05, "loss": 0.1799, "step": 3213 }, { "epoch": 1.5218703758508434, "grad_norm": 1.4394781589508057, "learning_rate": 1.024289593866997e-05, "loss": 0.2101, "step": 3214 }, { "epoch": 1.5223438887244747, "grad_norm": 1.2956507205963135, "learning_rate": 1.0237783314305621e-05, "loss": 0.2143, "step": 3215 }, { "epoch": 1.522817401598106, "grad_norm": 1.18267822265625, "learning_rate": 1.0232670627751337e-05, "loss": 0.2064, "step": 3216 }, { "epoch": 1.5232909144717373, "grad_norm": 1.1501325368881226, "learning_rate": 1.02275578803443e-05, "loss": 0.2249, "step": 3217 }, { "epoch": 1.5237644273453683, "grad_norm": 1.0747531652450562, "learning_rate": 1.0222445073421692e-05, "loss": 0.2215, "step": 3218 }, { "epoch": 1.5242379402189998, "grad_norm": 1.061420202255249, "learning_rate": 1.021733220832072e-05, "loss": 0.1964, "step": 3219 }, { "epoch": 1.524711453092631, "grad_norm": 1.1536870002746582, "learning_rate": 1.0212219286378606e-05, "loss": 0.2291, "step": 3220 }, { "epoch": 1.5251849659662622, "grad_norm": 2.1566736698150635, "learning_rate": 1.0207106308932585e-05, "loss": 0.2479, "step": 3221 }, { "epoch": 1.5256584788398935, "grad_norm": 1.3876272439956665, "learning_rate": 1.0201993277319906e-05, "loss": 0.2181, "step": 3222 }, { "epoch": 1.5261319917135248, "grad_norm": 1.3905116319656372, "learning_rate": 1.0196880192877836e-05, "loss": 0.2275, "step": 3223 }, { "epoch": 1.526605504587156, "grad_norm": 1.5620601177215576, "learning_rate": 1.019176705694365e-05, "loss": 0.222, "step": 3224 }, { "epoch": 1.5270790174607871, "grad_norm": 1.212353229522705, "learning_rate": 1.018665387085464e-05, "loss": 0.196, "step": 3225 }, { "epoch": 1.5275525303344186, "grad_norm": 1.0069432258605957, "learning_rate": 1.018154063594811e-05, "loss": 0.2234, "step": 3226 }, { "epoch": 1.5280260432080497, "grad_norm": 1.6228710412979126, "learning_rate": 1.017642735356138e-05, "loss": 0.2348, "step": 3227 }, { "epoch": 1.528499556081681, "grad_norm": 1.7563085556030273, "learning_rate": 1.0171314025031777e-05, "loss": 0.2149, "step": 3228 }, { "epoch": 1.5289730689553123, "grad_norm": 1.539036750793457, "learning_rate": 1.0166200651696642e-05, "loss": 0.2116, "step": 3229 }, { "epoch": 1.5294465818289433, "grad_norm": 2.3316597938537598, "learning_rate": 1.0161087234893332e-05, "loss": 0.2123, "step": 3230 }, { "epoch": 1.5299200947025748, "grad_norm": 1.3801292181015015, "learning_rate": 1.0155973775959209e-05, "loss": 0.2363, "step": 3231 }, { "epoch": 1.530393607576206, "grad_norm": 1.6556155681610107, "learning_rate": 1.0150860276231649e-05, "loss": 0.2092, "step": 3232 }, { "epoch": 1.5308671204498372, "grad_norm": 1.1947085857391357, "learning_rate": 1.0145746737048037e-05, "loss": 0.2128, "step": 3233 }, { "epoch": 1.5313406333234685, "grad_norm": 1.491898775100708, "learning_rate": 1.0140633159745775e-05, "loss": 0.2195, "step": 3234 }, { "epoch": 1.5318141461970998, "grad_norm": 1.2863268852233887, "learning_rate": 1.0135519545662267e-05, "loss": 0.2168, "step": 3235 }, { "epoch": 1.532287659070731, "grad_norm": 2.18508243560791, "learning_rate": 1.013040589613493e-05, "loss": 0.2423, "step": 3236 }, { "epoch": 1.5327611719443621, "grad_norm": 0.9997796416282654, "learning_rate": 1.0125292212501186e-05, "loss": 0.2101, "step": 3237 }, { "epoch": 1.5332346848179936, "grad_norm": 3.1009128093719482, "learning_rate": 1.0120178496098474e-05, "loss": 0.2044, "step": 3238 }, { "epoch": 1.5337081976916247, "grad_norm": 2.1549055576324463, "learning_rate": 1.0115064748264236e-05, "loss": 0.2169, "step": 3239 }, { "epoch": 1.534181710565256, "grad_norm": 1.5560920238494873, "learning_rate": 1.0109950970335923e-05, "loss": 0.2285, "step": 3240 }, { "epoch": 1.5346552234388873, "grad_norm": 2.1575727462768555, "learning_rate": 1.0104837163650994e-05, "loss": 0.2077, "step": 3241 }, { "epoch": 1.5351287363125183, "grad_norm": 2.3875033855438232, "learning_rate": 1.0099723329546915e-05, "loss": 0.2247, "step": 3242 }, { "epoch": 1.5356022491861498, "grad_norm": 1.0770213603973389, "learning_rate": 1.0094609469361162e-05, "loss": 0.215, "step": 3243 }, { "epoch": 1.536075762059781, "grad_norm": 1.1401655673980713, "learning_rate": 1.0089495584431217e-05, "loss": 0.2126, "step": 3244 }, { "epoch": 1.5365492749334122, "grad_norm": 1.0763803720474243, "learning_rate": 1.0084381676094566e-05, "loss": 0.2448, "step": 3245 }, { "epoch": 1.5370227878070435, "grad_norm": 1.8991236686706543, "learning_rate": 1.00792677456887e-05, "loss": 0.2243, "step": 3246 }, { "epoch": 1.5374963006806748, "grad_norm": 2.337759256362915, "learning_rate": 1.0074153794551119e-05, "loss": 0.2146, "step": 3247 }, { "epoch": 1.537969813554306, "grad_norm": 2.1561119556427, "learning_rate": 1.0069039824019326e-05, "loss": 0.195, "step": 3248 }, { "epoch": 1.5384433264279371, "grad_norm": 2.016658306121826, "learning_rate": 1.0063925835430838e-05, "loss": 0.2334, "step": 3249 }, { "epoch": 1.5389168393015686, "grad_norm": 2.295039653778076, "learning_rate": 1.005881183012316e-05, "loss": 0.2266, "step": 3250 }, { "epoch": 1.5393903521751997, "grad_norm": 2.0591042041778564, "learning_rate": 1.0053697809433817e-05, "loss": 0.2052, "step": 3251 }, { "epoch": 1.539863865048831, "grad_norm": 1.625144362449646, "learning_rate": 1.004858377470033e-05, "loss": 0.2076, "step": 3252 }, { "epoch": 1.5403373779224623, "grad_norm": 0.9673202633857727, "learning_rate": 1.0043469727260228e-05, "loss": 0.2103, "step": 3253 }, { "epoch": 1.5408108907960936, "grad_norm": 1.1474332809448242, "learning_rate": 1.0038355668451037e-05, "loss": 0.214, "step": 3254 }, { "epoch": 1.5412844036697249, "grad_norm": 1.004038691520691, "learning_rate": 1.0033241599610288e-05, "loss": 0.1771, "step": 3255 }, { "epoch": 1.541757916543356, "grad_norm": 1.4131745100021362, "learning_rate": 1.0028127522075522e-05, "loss": 0.2241, "step": 3256 }, { "epoch": 1.5422314294169874, "grad_norm": 2.0983598232269287, "learning_rate": 1.0023013437184273e-05, "loss": 0.2309, "step": 3257 }, { "epoch": 1.5427049422906185, "grad_norm": 1.784072995185852, "learning_rate": 1.0017899346274082e-05, "loss": 0.2212, "step": 3258 }, { "epoch": 1.5431784551642498, "grad_norm": 2.016923427581787, "learning_rate": 1.0012785250682488e-05, "loss": 0.2223, "step": 3259 }, { "epoch": 1.543651968037881, "grad_norm": 1.3807404041290283, "learning_rate": 1.0007671151747038e-05, "loss": 0.2082, "step": 3260 }, { "epoch": 1.5441254809115121, "grad_norm": 1.9663400650024414, "learning_rate": 1.000255705080527e-05, "loss": 0.2303, "step": 3261 }, { "epoch": 1.5445989937851436, "grad_norm": 1.2780094146728516, "learning_rate": 9.997442949194733e-06, "loss": 0.2188, "step": 3262 }, { "epoch": 1.5450725066587747, "grad_norm": 1.079207181930542, "learning_rate": 9.992328848252965e-06, "loss": 0.2033, "step": 3263 }, { "epoch": 1.545546019532406, "grad_norm": 1.1068617105484009, "learning_rate": 9.987214749317514e-06, "loss": 0.2441, "step": 3264 }, { "epoch": 1.5460195324060373, "grad_norm": 1.965084195137024, "learning_rate": 9.982100653725921e-06, "loss": 0.1883, "step": 3265 }, { "epoch": 1.5464930452796686, "grad_norm": 2.6802639961242676, "learning_rate": 9.97698656281573e-06, "loss": 0.2319, "step": 3266 }, { "epoch": 1.5469665581532999, "grad_norm": 1.9177919626235962, "learning_rate": 9.971872477924482e-06, "loss": 0.2197, "step": 3267 }, { "epoch": 1.547440071026931, "grad_norm": 2.636075735092163, "learning_rate": 9.966758400389714e-06, "loss": 0.2109, "step": 3268 }, { "epoch": 1.5479135839005624, "grad_norm": 2.6643176078796387, "learning_rate": 9.961644331548967e-06, "loss": 0.2137, "step": 3269 }, { "epoch": 1.5483870967741935, "grad_norm": 2.1698949337005615, "learning_rate": 9.956530272739775e-06, "loss": 0.2371, "step": 3270 }, { "epoch": 1.5488606096478248, "grad_norm": 1.0527149438858032, "learning_rate": 9.951416225299671e-06, "loss": 0.2027, "step": 3271 }, { "epoch": 1.549334122521456, "grad_norm": 1.1439855098724365, "learning_rate": 9.946302190566186e-06, "loss": 0.2106, "step": 3272 }, { "epoch": 1.5498076353950871, "grad_norm": 1.49906325340271, "learning_rate": 9.941188169876843e-06, "loss": 0.2192, "step": 3273 }, { "epoch": 1.5502811482687187, "grad_norm": 1.464437484741211, "learning_rate": 9.936074164569168e-06, "loss": 0.2061, "step": 3274 }, { "epoch": 1.5507546611423497, "grad_norm": 2.1014366149902344, "learning_rate": 9.930960175980677e-06, "loss": 0.2131, "step": 3275 }, { "epoch": 1.551228174015981, "grad_norm": 1.1291313171386719, "learning_rate": 9.925846205448886e-06, "loss": 0.2004, "step": 3276 }, { "epoch": 1.5517016868896123, "grad_norm": 2.514810800552368, "learning_rate": 9.920732254311306e-06, "loss": 0.1979, "step": 3277 }, { "epoch": 1.5521751997632436, "grad_norm": 1.7324765920639038, "learning_rate": 9.91561832390544e-06, "loss": 0.1977, "step": 3278 }, { "epoch": 1.5526487126368749, "grad_norm": 1.4034252166748047, "learning_rate": 9.910504415568788e-06, "loss": 0.2243, "step": 3279 }, { "epoch": 1.553122225510506, "grad_norm": 1.4264874458312988, "learning_rate": 9.90539053063884e-06, "loss": 0.2129, "step": 3280 }, { "epoch": 1.5535957383841374, "grad_norm": 1.2019026279449463, "learning_rate": 9.900276670453085e-06, "loss": 0.2052, "step": 3281 }, { "epoch": 1.5540692512577685, "grad_norm": 1.222839593887329, "learning_rate": 9.895162836349006e-06, "loss": 0.2145, "step": 3282 }, { "epoch": 1.5545427641313998, "grad_norm": 1.3310260772705078, "learning_rate": 9.890049029664079e-06, "loss": 0.2224, "step": 3283 }, { "epoch": 1.555016277005031, "grad_norm": 1.5830497741699219, "learning_rate": 9.884935251735766e-06, "loss": 0.2102, "step": 3284 }, { "epoch": 1.5554897898786624, "grad_norm": 1.1304932832717896, "learning_rate": 9.879821503901527e-06, "loss": 0.2128, "step": 3285 }, { "epoch": 1.5559633027522937, "grad_norm": 1.5791419744491577, "learning_rate": 9.874707787498814e-06, "loss": 0.2253, "step": 3286 }, { "epoch": 1.5564368156259247, "grad_norm": 2.089458465576172, "learning_rate": 9.869594103865074e-06, "loss": 0.209, "step": 3287 }, { "epoch": 1.5569103284995562, "grad_norm": 1.063925862312317, "learning_rate": 9.864480454337735e-06, "loss": 0.2186, "step": 3288 }, { "epoch": 1.5573838413731873, "grad_norm": 1.1000468730926514, "learning_rate": 9.859366840254227e-06, "loss": 0.212, "step": 3289 }, { "epoch": 1.5578573542468186, "grad_norm": 1.3365740776062012, "learning_rate": 9.854253262951964e-06, "loss": 0.1934, "step": 3290 }, { "epoch": 1.5583308671204499, "grad_norm": 1.6963863372802734, "learning_rate": 9.849139723768354e-06, "loss": 0.2115, "step": 3291 }, { "epoch": 1.558804379994081, "grad_norm": 1.2676887512207031, "learning_rate": 9.844026224040794e-06, "loss": 0.2232, "step": 3292 }, { "epoch": 1.5592778928677125, "grad_norm": 1.4118210077285767, "learning_rate": 9.838912765106671e-06, "loss": 0.2005, "step": 3293 }, { "epoch": 1.5597514057413435, "grad_norm": 1.1821280717849731, "learning_rate": 9.83379934830336e-06, "loss": 0.2133, "step": 3294 }, { "epoch": 1.5602249186149748, "grad_norm": 1.2033153772354126, "learning_rate": 9.828685974968224e-06, "loss": 0.2165, "step": 3295 }, { "epoch": 1.560698431488606, "grad_norm": 1.9517135620117188, "learning_rate": 9.823572646438622e-06, "loss": 0.2394, "step": 3296 }, { "epoch": 1.5611719443622374, "grad_norm": 1.292248249053955, "learning_rate": 9.818459364051893e-06, "loss": 0.2039, "step": 3297 }, { "epoch": 1.5616454572358687, "grad_norm": 1.03531813621521, "learning_rate": 9.813346129145364e-06, "loss": 0.2124, "step": 3298 }, { "epoch": 1.5621189701094997, "grad_norm": 1.404657006263733, "learning_rate": 9.808232943056354e-06, "loss": 0.2242, "step": 3299 }, { "epoch": 1.5625924829831312, "grad_norm": 1.1680656671524048, "learning_rate": 9.803119807122167e-06, "loss": 0.193, "step": 3300 }, { "epoch": 1.5630659958567623, "grad_norm": 1.476773738861084, "learning_rate": 9.798006722680096e-06, "loss": 0.2155, "step": 3301 }, { "epoch": 1.5635395087303936, "grad_norm": 1.3005181550979614, "learning_rate": 9.792893691067417e-06, "loss": 0.1809, "step": 3302 }, { "epoch": 1.5640130216040249, "grad_norm": 1.2317800521850586, "learning_rate": 9.787780713621397e-06, "loss": 0.2186, "step": 3303 }, { "epoch": 1.564486534477656, "grad_norm": 1.0465822219848633, "learning_rate": 9.782667791679283e-06, "loss": 0.2259, "step": 3304 }, { "epoch": 1.5649600473512875, "grad_norm": 1.5197325944900513, "learning_rate": 9.777554926578311e-06, "loss": 0.237, "step": 3305 }, { "epoch": 1.5654335602249185, "grad_norm": 1.2708994150161743, "learning_rate": 9.772442119655706e-06, "loss": 0.2116, "step": 3306 }, { "epoch": 1.5659070730985498, "grad_norm": 1.392824411392212, "learning_rate": 9.767329372248666e-06, "loss": 0.2197, "step": 3307 }, { "epoch": 1.566380585972181, "grad_norm": 2.88095760345459, "learning_rate": 9.762216685694382e-06, "loss": 0.2157, "step": 3308 }, { "epoch": 1.5668540988458124, "grad_norm": 1.6576155424118042, "learning_rate": 9.757104061330033e-06, "loss": 0.1998, "step": 3309 }, { "epoch": 1.5673276117194437, "grad_norm": 1.031028151512146, "learning_rate": 9.751991500492772e-06, "loss": 0.2168, "step": 3310 }, { "epoch": 1.5678011245930747, "grad_norm": 2.0515482425689697, "learning_rate": 9.746879004519741e-06, "loss": 0.1973, "step": 3311 }, { "epoch": 1.5682746374667063, "grad_norm": 1.582655668258667, "learning_rate": 9.741766574748066e-06, "loss": 0.2349, "step": 3312 }, { "epoch": 1.5687481503403373, "grad_norm": 0.992260754108429, "learning_rate": 9.736654212514851e-06, "loss": 0.1884, "step": 3313 }, { "epoch": 1.5692216632139686, "grad_norm": 0.8423192501068115, "learning_rate": 9.731541919157186e-06, "loss": 0.2027, "step": 3314 }, { "epoch": 1.5696951760876, "grad_norm": 1.082379937171936, "learning_rate": 9.726429696012143e-06, "loss": 0.207, "step": 3315 }, { "epoch": 1.5701686889612312, "grad_norm": 1.25356125831604, "learning_rate": 9.721317544416775e-06, "loss": 0.2285, "step": 3316 }, { "epoch": 1.5706422018348625, "grad_norm": 1.3357824087142944, "learning_rate": 9.716205465708114e-06, "loss": 0.1885, "step": 3317 }, { "epoch": 1.5711157147084935, "grad_norm": 1.4782239198684692, "learning_rate": 9.711093461223175e-06, "loss": 0.1942, "step": 3318 }, { "epoch": 1.571589227582125, "grad_norm": 1.226721167564392, "learning_rate": 9.705981532298955e-06, "loss": 0.2043, "step": 3319 }, { "epoch": 1.5720627404557561, "grad_norm": 1.5207408666610718, "learning_rate": 9.700869680272422e-06, "loss": 0.2314, "step": 3320 }, { "epoch": 1.5725362533293874, "grad_norm": 1.0883424282073975, "learning_rate": 9.695757906480545e-06, "loss": 0.2212, "step": 3321 }, { "epoch": 1.5730097662030187, "grad_norm": 1.4195133447647095, "learning_rate": 9.690646212260254e-06, "loss": 0.2189, "step": 3322 }, { "epoch": 1.5734832790766498, "grad_norm": 2.070651054382324, "learning_rate": 9.68553459894846e-06, "loss": 0.2199, "step": 3323 }, { "epoch": 1.5739567919502813, "grad_norm": 1.0643229484558105, "learning_rate": 9.680423067882057e-06, "loss": 0.1874, "step": 3324 }, { "epoch": 1.5744303048239123, "grad_norm": 2.199005365371704, "learning_rate": 9.675311620397917e-06, "loss": 0.2141, "step": 3325 }, { "epoch": 1.5749038176975436, "grad_norm": 1.1408580541610718, "learning_rate": 9.670200257832891e-06, "loss": 0.1889, "step": 3326 }, { "epoch": 1.575377330571175, "grad_norm": 1.1677594184875488, "learning_rate": 9.665088981523807e-06, "loss": 0.2221, "step": 3327 }, { "epoch": 1.5758508434448062, "grad_norm": 1.3932559490203857, "learning_rate": 9.659977792807468e-06, "loss": 0.2027, "step": 3328 }, { "epoch": 1.5763243563184375, "grad_norm": 1.122646689414978, "learning_rate": 9.654866693020656e-06, "loss": 0.1988, "step": 3329 }, { "epoch": 1.5767978691920685, "grad_norm": 1.0918054580688477, "learning_rate": 9.649755683500134e-06, "loss": 0.2109, "step": 3330 }, { "epoch": 1.5772713820657, "grad_norm": 1.0284793376922607, "learning_rate": 9.644644765582633e-06, "loss": 0.1926, "step": 3331 }, { "epoch": 1.5777448949393311, "grad_norm": 1.006220817565918, "learning_rate": 9.639533940604867e-06, "loss": 0.23, "step": 3332 }, { "epoch": 1.5782184078129624, "grad_norm": 1.0561009645462036, "learning_rate": 9.634423209903518e-06, "loss": 0.2246, "step": 3333 }, { "epoch": 1.5786919206865937, "grad_norm": 1.0437111854553223, "learning_rate": 9.629312574815251e-06, "loss": 0.2267, "step": 3334 }, { "epoch": 1.5791654335602248, "grad_norm": 2.093397617340088, "learning_rate": 9.624202036676707e-06, "loss": 0.2099, "step": 3335 }, { "epoch": 1.5796389464338563, "grad_norm": 2.2291595935821533, "learning_rate": 9.619091596824493e-06, "loss": 0.2153, "step": 3336 }, { "epoch": 1.5801124593074873, "grad_norm": 1.3327558040618896, "learning_rate": 9.613981256595199e-06, "loss": 0.1965, "step": 3337 }, { "epoch": 1.5805859721811186, "grad_norm": 0.9187760353088379, "learning_rate": 9.60887101732538e-06, "loss": 0.2022, "step": 3338 }, { "epoch": 1.58105948505475, "grad_norm": 1.6713463068008423, "learning_rate": 9.603760880351576e-06, "loss": 0.2052, "step": 3339 }, { "epoch": 1.5815329979283812, "grad_norm": 1.2200876474380493, "learning_rate": 9.59865084701029e-06, "loss": 0.2185, "step": 3340 }, { "epoch": 1.5820065108020125, "grad_norm": 1.1830116510391235, "learning_rate": 9.593540918638006e-06, "loss": 0.1816, "step": 3341 }, { "epoch": 1.5824800236756436, "grad_norm": 1.3431400060653687, "learning_rate": 9.588431096571171e-06, "loss": 0.2059, "step": 3342 }, { "epoch": 1.582953536549275, "grad_norm": 1.1804959774017334, "learning_rate": 9.583321382146212e-06, "loss": 0.2229, "step": 3343 }, { "epoch": 1.5834270494229061, "grad_norm": 1.8426966667175293, "learning_rate": 9.578211776699527e-06, "loss": 0.2133, "step": 3344 }, { "epoch": 1.5839005622965374, "grad_norm": 1.2378004789352417, "learning_rate": 9.57310228156748e-06, "loss": 0.236, "step": 3345 }, { "epoch": 1.5843740751701687, "grad_norm": 1.0932939052581787, "learning_rate": 9.567992898086415e-06, "loss": 0.2039, "step": 3346 }, { "epoch": 1.5848475880438, "grad_norm": 1.0567042827606201, "learning_rate": 9.56288362759264e-06, "loss": 0.1963, "step": 3347 }, { "epoch": 1.5853211009174313, "grad_norm": 1.230892300605774, "learning_rate": 9.557774471422434e-06, "loss": 0.2288, "step": 3348 }, { "epoch": 1.5857946137910623, "grad_norm": 1.4192466735839844, "learning_rate": 9.552665430912049e-06, "loss": 0.2143, "step": 3349 }, { "epoch": 1.5862681266646939, "grad_norm": 1.3479020595550537, "learning_rate": 9.547556507397705e-06, "loss": 0.2058, "step": 3350 }, { "epoch": 1.586741639538325, "grad_norm": 1.4939135313034058, "learning_rate": 9.542447702215596e-06, "loss": 0.2394, "step": 3351 }, { "epoch": 1.5872151524119562, "grad_norm": 1.3415539264678955, "learning_rate": 9.537339016701871e-06, "loss": 0.2256, "step": 3352 }, { "epoch": 1.5876886652855875, "grad_norm": 1.2717926502227783, "learning_rate": 9.532230452192666e-06, "loss": 0.2284, "step": 3353 }, { "epoch": 1.5881621781592186, "grad_norm": 1.1318105459213257, "learning_rate": 9.527122010024072e-06, "loss": 0.2142, "step": 3354 }, { "epoch": 1.58863569103285, "grad_norm": 1.1898558139801025, "learning_rate": 9.522013691532154e-06, "loss": 0.2224, "step": 3355 }, { "epoch": 1.5891092039064811, "grad_norm": 1.3306890726089478, "learning_rate": 9.516905498052944e-06, "loss": 0.2193, "step": 3356 }, { "epoch": 1.5895827167801124, "grad_norm": 1.2777800559997559, "learning_rate": 9.51179743092244e-06, "loss": 0.2109, "step": 3357 }, { "epoch": 1.5900562296537437, "grad_norm": 1.1809998750686646, "learning_rate": 9.50668949147661e-06, "loss": 0.2307, "step": 3358 }, { "epoch": 1.590529742527375, "grad_norm": 1.5275858640670776, "learning_rate": 9.50158168105138e-06, "loss": 0.2074, "step": 3359 }, { "epoch": 1.5910032554010063, "grad_norm": 1.153929352760315, "learning_rate": 9.496474000982657e-06, "loss": 0.1873, "step": 3360 }, { "epoch": 1.5914767682746374, "grad_norm": 1.1171983480453491, "learning_rate": 9.4913664526063e-06, "loss": 0.2069, "step": 3361 }, { "epoch": 1.5919502811482689, "grad_norm": 1.2041152715682983, "learning_rate": 9.48625903725814e-06, "loss": 0.2075, "step": 3362 }, { "epoch": 1.5924237940219, "grad_norm": 2.3762893676757812, "learning_rate": 9.481151756273976e-06, "loss": 0.2169, "step": 3363 }, { "epoch": 1.5928973068955312, "grad_norm": 3.4273829460144043, "learning_rate": 9.476044610989562e-06, "loss": 0.2338, "step": 3364 }, { "epoch": 1.5933708197691625, "grad_norm": 0.9142179489135742, "learning_rate": 9.470937602740624e-06, "loss": 0.216, "step": 3365 }, { "epoch": 1.5938443326427936, "grad_norm": 1.2260160446166992, "learning_rate": 9.465830732862857e-06, "loss": 0.2242, "step": 3366 }, { "epoch": 1.594317845516425, "grad_norm": 1.3920302391052246, "learning_rate": 9.460724002691906e-06, "loss": 0.216, "step": 3367 }, { "epoch": 1.5947913583900561, "grad_norm": 0.9747047424316406, "learning_rate": 9.455617413563389e-06, "loss": 0.2445, "step": 3368 }, { "epoch": 1.5952648712636874, "grad_norm": 1.1095210313796997, "learning_rate": 9.450510966812885e-06, "loss": 0.1813, "step": 3369 }, { "epoch": 1.5957383841373187, "grad_norm": 1.0806443691253662, "learning_rate": 9.445404663775938e-06, "loss": 0.2429, "step": 3370 }, { "epoch": 1.59621189701095, "grad_norm": 1.3196625709533691, "learning_rate": 9.44029850578805e-06, "loss": 0.244, "step": 3371 }, { "epoch": 1.5966854098845813, "grad_norm": 2.6262784004211426, "learning_rate": 9.435192494184689e-06, "loss": 0.2116, "step": 3372 }, { "epoch": 1.5971589227582124, "grad_norm": 1.1327464580535889, "learning_rate": 9.43008663030128e-06, "loss": 0.1857, "step": 3373 }, { "epoch": 1.5976324356318439, "grad_norm": 0.9777623414993286, "learning_rate": 9.424980915473217e-06, "loss": 0.2091, "step": 3374 }, { "epoch": 1.598105948505475, "grad_norm": 1.3106697797775269, "learning_rate": 9.419875351035848e-06, "loss": 0.213, "step": 3375 }, { "epoch": 1.5985794613791062, "grad_norm": 1.0573484897613525, "learning_rate": 9.414769938324487e-06, "loss": 0.2028, "step": 3376 }, { "epoch": 1.5990529742527375, "grad_norm": 1.3696736097335815, "learning_rate": 9.4096646786744e-06, "loss": 0.2101, "step": 3377 }, { "epoch": 1.5995264871263688, "grad_norm": 1.2642654180526733, "learning_rate": 9.404559573420822e-06, "loss": 0.2352, "step": 3378 }, { "epoch": 1.6, "grad_norm": 1.4823346138000488, "learning_rate": 9.399454623898942e-06, "loss": 0.2035, "step": 3379 }, { "epoch": 1.6004735128736312, "grad_norm": 1.1810314655303955, "learning_rate": 9.394349831443912e-06, "loss": 0.2161, "step": 3380 }, { "epoch": 1.6009470257472627, "grad_norm": 1.377890706062317, "learning_rate": 9.389245197390842e-06, "loss": 0.2498, "step": 3381 }, { "epoch": 1.6014205386208937, "grad_norm": 1.6414611339569092, "learning_rate": 9.384140723074796e-06, "loss": 0.2296, "step": 3382 }, { "epoch": 1.601894051494525, "grad_norm": 1.2565416097640991, "learning_rate": 9.379036409830804e-06, "loss": 0.2056, "step": 3383 }, { "epoch": 1.6023675643681563, "grad_norm": 1.1565498113632202, "learning_rate": 9.37393225899385e-06, "loss": 0.2115, "step": 3384 }, { "epoch": 1.6028410772417874, "grad_norm": 1.0674059391021729, "learning_rate": 9.368828271898874e-06, "loss": 0.2103, "step": 3385 }, { "epoch": 1.6033145901154189, "grad_norm": 0.9259766340255737, "learning_rate": 9.363724449880773e-06, "loss": 0.1965, "step": 3386 }, { "epoch": 1.60378810298905, "grad_norm": 1.08977472782135, "learning_rate": 9.358620794274404e-06, "loss": 0.1933, "step": 3387 }, { "epoch": 1.6042616158626812, "grad_norm": 1.5023025274276733, "learning_rate": 9.35351730641458e-06, "loss": 0.2293, "step": 3388 }, { "epoch": 1.6047351287363125, "grad_norm": 1.0194486379623413, "learning_rate": 9.348413987636065e-06, "loss": 0.2344, "step": 3389 }, { "epoch": 1.6052086416099438, "grad_norm": 2.0349748134613037, "learning_rate": 9.343310839273587e-06, "loss": 0.2097, "step": 3390 }, { "epoch": 1.605682154483575, "grad_norm": 0.9003639221191406, "learning_rate": 9.338207862661824e-06, "loss": 0.2193, "step": 3391 }, { "epoch": 1.6061556673572062, "grad_norm": 1.3183727264404297, "learning_rate": 9.33310505913541e-06, "loss": 0.2418, "step": 3392 }, { "epoch": 1.6066291802308377, "grad_norm": 1.593930721282959, "learning_rate": 9.328002430028932e-06, "loss": 0.1988, "step": 3393 }, { "epoch": 1.6071026931044687, "grad_norm": 1.2510972023010254, "learning_rate": 9.322899976676938e-06, "loss": 0.201, "step": 3394 }, { "epoch": 1.6075762059781, "grad_norm": 1.210264801979065, "learning_rate": 9.317797700413925e-06, "loss": 0.2076, "step": 3395 }, { "epoch": 1.6080497188517313, "grad_norm": 1.0849723815917969, "learning_rate": 9.31269560257434e-06, "loss": 0.2217, "step": 3396 }, { "epoch": 1.6085232317253624, "grad_norm": 1.2907907962799072, "learning_rate": 9.307593684492588e-06, "loss": 0.2071, "step": 3397 }, { "epoch": 1.6089967445989939, "grad_norm": 1.6149160861968994, "learning_rate": 9.302491947503027e-06, "loss": 0.2145, "step": 3398 }, { "epoch": 1.609470257472625, "grad_norm": 1.4612150192260742, "learning_rate": 9.29739039293997e-06, "loss": 0.2184, "step": 3399 }, { "epoch": 1.6099437703462562, "grad_norm": 1.7195252180099487, "learning_rate": 9.292289022137678e-06, "loss": 0.1968, "step": 3400 }, { "epoch": 1.6104172832198875, "grad_norm": 2.2150564193725586, "learning_rate": 9.287187836430366e-06, "loss": 0.2549, "step": 3401 }, { "epoch": 1.6108907960935188, "grad_norm": 1.1368675231933594, "learning_rate": 9.282086837152198e-06, "loss": 0.2028, "step": 3402 }, { "epoch": 1.61136430896715, "grad_norm": 1.2357794046401978, "learning_rate": 9.276986025637291e-06, "loss": 0.2165, "step": 3403 }, { "epoch": 1.6118378218407812, "grad_norm": 1.7708053588867188, "learning_rate": 9.271885403219715e-06, "loss": 0.1969, "step": 3404 }, { "epoch": 1.6123113347144127, "grad_norm": 1.0533392429351807, "learning_rate": 9.266784971233487e-06, "loss": 0.2193, "step": 3405 }, { "epoch": 1.6127848475880437, "grad_norm": 2.6355881690979004, "learning_rate": 9.261684731012575e-06, "loss": 0.201, "step": 3406 }, { "epoch": 1.613258360461675, "grad_norm": 1.196489930152893, "learning_rate": 9.256584683890902e-06, "loss": 0.2151, "step": 3407 }, { "epoch": 1.6137318733353063, "grad_norm": 1.8537029027938843, "learning_rate": 9.25148483120233e-06, "loss": 0.2343, "step": 3408 }, { "epoch": 1.6142053862089376, "grad_norm": 1.2622405290603638, "learning_rate": 9.24638517428068e-06, "loss": 0.2154, "step": 3409 }, { "epoch": 1.614678899082569, "grad_norm": 1.1913985013961792, "learning_rate": 9.24128571445972e-06, "loss": 0.2275, "step": 3410 }, { "epoch": 1.6151524119562, "grad_norm": 1.129180669784546, "learning_rate": 9.236186453073161e-06, "loss": 0.208, "step": 3411 }, { "epoch": 1.6156259248298315, "grad_norm": 1.8293282985687256, "learning_rate": 9.231087391454665e-06, "loss": 0.2184, "step": 3412 }, { "epoch": 1.6160994377034625, "grad_norm": 1.8663169145584106, "learning_rate": 9.225988530937846e-06, "loss": 0.2305, "step": 3413 }, { "epoch": 1.6165729505770938, "grad_norm": 1.2646862268447876, "learning_rate": 9.220889872856258e-06, "loss": 0.2168, "step": 3414 }, { "epoch": 1.6170464634507251, "grad_norm": 1.4086544513702393, "learning_rate": 9.215791418543407e-06, "loss": 0.2109, "step": 3415 }, { "epoch": 1.6175199763243562, "grad_norm": 1.7981756925582886, "learning_rate": 9.210693169332746e-06, "loss": 0.2098, "step": 3416 }, { "epoch": 1.6179934891979877, "grad_norm": 1.776749849319458, "learning_rate": 9.205595126557673e-06, "loss": 0.2017, "step": 3417 }, { "epoch": 1.6184670020716188, "grad_norm": 1.2278921604156494, "learning_rate": 9.200497291551528e-06, "loss": 0.2068, "step": 3418 }, { "epoch": 1.61894051494525, "grad_norm": 0.9366362690925598, "learning_rate": 9.195399665647607e-06, "loss": 0.1798, "step": 3419 }, { "epoch": 1.6194140278188813, "grad_norm": 1.8072518110275269, "learning_rate": 9.190302250179141e-06, "loss": 0.2245, "step": 3420 }, { "epoch": 1.6198875406925126, "grad_norm": 1.4255504608154297, "learning_rate": 9.185205046479308e-06, "loss": 0.2082, "step": 3421 }, { "epoch": 1.620361053566144, "grad_norm": 1.2994076013565063, "learning_rate": 9.180108055881236e-06, "loss": 0.2107, "step": 3422 }, { "epoch": 1.620834566439775, "grad_norm": 1.6966938972473145, "learning_rate": 9.175011279717992e-06, "loss": 0.2548, "step": 3423 }, { "epoch": 1.6213080793134065, "grad_norm": 1.2594822645187378, "learning_rate": 9.169914719322588e-06, "loss": 0.2031, "step": 3424 }, { "epoch": 1.6217815921870375, "grad_norm": 1.0965392589569092, "learning_rate": 9.164818376027981e-06, "loss": 0.2233, "step": 3425 }, { "epoch": 1.6222551050606688, "grad_norm": 1.0597667694091797, "learning_rate": 9.159722251167073e-06, "loss": 0.2034, "step": 3426 }, { "epoch": 1.6227286179343001, "grad_norm": 1.3505845069885254, "learning_rate": 9.154626346072702e-06, "loss": 0.2138, "step": 3427 }, { "epoch": 1.6232021308079312, "grad_norm": 0.9662488102912903, "learning_rate": 9.149530662077655e-06, "loss": 0.1981, "step": 3428 }, { "epoch": 1.6236756436815627, "grad_norm": 0.9503706693649292, "learning_rate": 9.144435200514658e-06, "loss": 0.2161, "step": 3429 }, { "epoch": 1.6241491565551938, "grad_norm": 1.316527247428894, "learning_rate": 9.139339962716383e-06, "loss": 0.188, "step": 3430 }, { "epoch": 1.624622669428825, "grad_norm": 1.0049852132797241, "learning_rate": 9.134244950015437e-06, "loss": 0.2057, "step": 3431 }, { "epoch": 1.6250961823024563, "grad_norm": 1.619187355041504, "learning_rate": 9.129150163744371e-06, "loss": 0.2018, "step": 3432 }, { "epoch": 1.6255696951760876, "grad_norm": 1.270595908164978, "learning_rate": 9.12405560523568e-06, "loss": 0.1814, "step": 3433 }, { "epoch": 1.626043208049719, "grad_norm": 1.6350579261779785, "learning_rate": 9.118961275821792e-06, "loss": 0.2457, "step": 3434 }, { "epoch": 1.62651672092335, "grad_norm": 2.1566319465637207, "learning_rate": 9.113867176835086e-06, "loss": 0.1939, "step": 3435 }, { "epoch": 1.6269902337969815, "grad_norm": 1.6590288877487183, "learning_rate": 9.10877330960787e-06, "loss": 0.2094, "step": 3436 }, { "epoch": 1.6274637466706126, "grad_norm": 1.642244577407837, "learning_rate": 9.103679675472395e-06, "loss": 0.2082, "step": 3437 }, { "epoch": 1.6279372595442438, "grad_norm": 1.484878659248352, "learning_rate": 9.098586275760854e-06, "loss": 0.2094, "step": 3438 }, { "epoch": 1.6284107724178751, "grad_norm": 1.3303133249282837, "learning_rate": 9.093493111805379e-06, "loss": 0.2161, "step": 3439 }, { "epoch": 1.6288842852915064, "grad_norm": 1.818798542022705, "learning_rate": 9.088400184938036e-06, "loss": 0.2261, "step": 3440 }, { "epoch": 1.6293577981651377, "grad_norm": 1.2942332029342651, "learning_rate": 9.08330749649083e-06, "loss": 0.2272, "step": 3441 }, { "epoch": 1.6298313110387688, "grad_norm": 1.2421215772628784, "learning_rate": 9.078215047795703e-06, "loss": 0.2111, "step": 3442 }, { "epoch": 1.6303048239124003, "grad_norm": 1.215605616569519, "learning_rate": 9.073122840184537e-06, "loss": 0.2487, "step": 3443 }, { "epoch": 1.6307783367860313, "grad_norm": 1.1501661539077759, "learning_rate": 9.068030874989152e-06, "loss": 0.2214, "step": 3444 }, { "epoch": 1.6312518496596626, "grad_norm": 1.0269571542739868, "learning_rate": 9.062939153541302e-06, "loss": 0.2149, "step": 3445 }, { "epoch": 1.631725362533294, "grad_norm": 1.087510108947754, "learning_rate": 9.057847677172675e-06, "loss": 0.2047, "step": 3446 }, { "epoch": 1.632198875406925, "grad_norm": 1.1288443803787231, "learning_rate": 9.052756447214899e-06, "loss": 0.2051, "step": 3447 }, { "epoch": 1.6326723882805565, "grad_norm": 0.9681136608123779, "learning_rate": 9.047665464999537e-06, "loss": 0.2058, "step": 3448 }, { "epoch": 1.6331459011541876, "grad_norm": 1.3318946361541748, "learning_rate": 9.042574731858084e-06, "loss": 0.224, "step": 3449 }, { "epoch": 1.6336194140278189, "grad_norm": 1.4043301343917847, "learning_rate": 9.037484249121974e-06, "loss": 0.2254, "step": 3450 }, { "epoch": 1.6340929269014501, "grad_norm": 1.0534493923187256, "learning_rate": 9.032394018122572e-06, "loss": 0.217, "step": 3451 }, { "epoch": 1.6345664397750814, "grad_norm": 1.055869221687317, "learning_rate": 9.027304040191181e-06, "loss": 0.2174, "step": 3452 }, { "epoch": 1.6350399526487127, "grad_norm": 0.9580625891685486, "learning_rate": 9.022214316659035e-06, "loss": 0.2125, "step": 3453 }, { "epoch": 1.6355134655223438, "grad_norm": 1.0676701068878174, "learning_rate": 9.0171248488573e-06, "loss": 0.2094, "step": 3454 }, { "epoch": 1.6359869783959753, "grad_norm": 1.6258785724639893, "learning_rate": 9.012035638117082e-06, "loss": 0.2406, "step": 3455 }, { "epoch": 1.6364604912696064, "grad_norm": 1.652103304862976, "learning_rate": 9.006946685769408e-06, "loss": 0.1958, "step": 3456 }, { "epoch": 1.6369340041432376, "grad_norm": 1.625702977180481, "learning_rate": 9.001857993145251e-06, "loss": 0.2426, "step": 3457 }, { "epoch": 1.637407517016869, "grad_norm": 1.5374822616577148, "learning_rate": 8.996769561575504e-06, "loss": 0.2269, "step": 3458 }, { "epoch": 1.6378810298905, "grad_norm": 1.0075321197509766, "learning_rate": 8.991681392391001e-06, "loss": 0.1967, "step": 3459 }, { "epoch": 1.6383545427641315, "grad_norm": 1.2052757740020752, "learning_rate": 8.986593486922504e-06, "loss": 0.1997, "step": 3460 }, { "epoch": 1.6388280556377626, "grad_norm": 1.0683456659317017, "learning_rate": 8.981505846500703e-06, "loss": 0.2165, "step": 3461 }, { "epoch": 1.6393015685113939, "grad_norm": 1.3364335298538208, "learning_rate": 8.976418472456222e-06, "loss": 0.2098, "step": 3462 }, { "epoch": 1.6397750813850251, "grad_norm": 1.8314017057418823, "learning_rate": 8.971331366119613e-06, "loss": 0.2215, "step": 3463 }, { "epoch": 1.6402485942586564, "grad_norm": 1.2816479206085205, "learning_rate": 8.966244528821366e-06, "loss": 0.2137, "step": 3464 }, { "epoch": 1.6407221071322877, "grad_norm": 1.2321319580078125, "learning_rate": 8.961157961891886e-06, "loss": 0.2345, "step": 3465 }, { "epoch": 1.6411956200059188, "grad_norm": 1.0520602464675903, "learning_rate": 8.95607166666152e-06, "loss": 0.2141, "step": 3466 }, { "epoch": 1.6416691328795503, "grad_norm": 1.592511773109436, "learning_rate": 8.950985644460539e-06, "loss": 0.2279, "step": 3467 }, { "epoch": 1.6421426457531814, "grad_norm": 1.1186596155166626, "learning_rate": 8.945899896619143e-06, "loss": 0.2285, "step": 3468 }, { "epoch": 1.6426161586268127, "grad_norm": 1.4300451278686523, "learning_rate": 8.940814424467457e-06, "loss": 0.2494, "step": 3469 }, { "epoch": 1.643089671500444, "grad_norm": 1.1781500577926636, "learning_rate": 8.935729229335544e-06, "loss": 0.2253, "step": 3470 }, { "epoch": 1.6435631843740752, "grad_norm": 1.3187671899795532, "learning_rate": 8.930644312553381e-06, "loss": 0.2244, "step": 3471 }, { "epoch": 1.6440366972477065, "grad_norm": 1.1596239805221558, "learning_rate": 8.925559675450883e-06, "loss": 0.2193, "step": 3472 }, { "epoch": 1.6445102101213376, "grad_norm": 2.0229430198669434, "learning_rate": 8.920475319357886e-06, "loss": 0.2182, "step": 3473 }, { "epoch": 1.644983722994969, "grad_norm": 1.2594224214553833, "learning_rate": 8.915391245604159e-06, "loss": 0.2311, "step": 3474 }, { "epoch": 1.6454572358686002, "grad_norm": 1.5540090799331665, "learning_rate": 8.910307455519385e-06, "loss": 0.2034, "step": 3475 }, { "epoch": 1.6459307487422314, "grad_norm": 1.0850417613983154, "learning_rate": 8.905223950433178e-06, "loss": 0.1913, "step": 3476 }, { "epoch": 1.6464042616158627, "grad_norm": 1.3703140020370483, "learning_rate": 8.90014073167509e-06, "loss": 0.2108, "step": 3477 }, { "epoch": 1.6468777744894938, "grad_norm": 1.050058364868164, "learning_rate": 8.895057800574584e-06, "loss": 0.2306, "step": 3478 }, { "epoch": 1.6473512873631253, "grad_norm": 1.3480294942855835, "learning_rate": 8.889975158461051e-06, "loss": 0.202, "step": 3479 }, { "epoch": 1.6478248002367564, "grad_norm": 1.2146999835968018, "learning_rate": 8.884892806663808e-06, "loss": 0.198, "step": 3480 }, { "epoch": 1.6482983131103877, "grad_norm": 1.6478931903839111, "learning_rate": 8.879810746512091e-06, "loss": 0.2188, "step": 3481 }, { "epoch": 1.648771825984019, "grad_norm": 1.0835014581680298, "learning_rate": 8.874728979335069e-06, "loss": 0.2076, "step": 3482 }, { "epoch": 1.6492453388576502, "grad_norm": 1.0451796054840088, "learning_rate": 8.869647506461823e-06, "loss": 0.221, "step": 3483 }, { "epoch": 1.6497188517312815, "grad_norm": 1.0795484781265259, "learning_rate": 8.86456632922137e-06, "loss": 0.2048, "step": 3484 }, { "epoch": 1.6501923646049126, "grad_norm": 1.3780536651611328, "learning_rate": 8.85948544894264e-06, "loss": 0.2048, "step": 3485 }, { "epoch": 1.650665877478544, "grad_norm": 1.2751306295394897, "learning_rate": 8.854404866954485e-06, "loss": 0.2238, "step": 3486 }, { "epoch": 1.6511393903521752, "grad_norm": 1.0825250148773193, "learning_rate": 8.84932458458569e-06, "loss": 0.2076, "step": 3487 }, { "epoch": 1.6516129032258065, "grad_norm": 1.1926277875900269, "learning_rate": 8.844244603164946e-06, "loss": 0.2038, "step": 3488 }, { "epoch": 1.6520864160994377, "grad_norm": 1.0996829271316528, "learning_rate": 8.839164924020878e-06, "loss": 0.1927, "step": 3489 }, { "epoch": 1.6525599289730688, "grad_norm": 0.9266497492790222, "learning_rate": 8.834085548482024e-06, "loss": 0.2048, "step": 3490 }, { "epoch": 1.6530334418467003, "grad_norm": 1.086111068725586, "learning_rate": 8.829006477876847e-06, "loss": 0.2219, "step": 3491 }, { "epoch": 1.6535069547203314, "grad_norm": 1.0157992839813232, "learning_rate": 8.82392771353373e-06, "loss": 0.1998, "step": 3492 }, { "epoch": 1.6539804675939627, "grad_norm": 1.2996865510940552, "learning_rate": 8.818849256780972e-06, "loss": 0.2155, "step": 3493 }, { "epoch": 1.654453980467594, "grad_norm": 1.0794087648391724, "learning_rate": 8.813771108946798e-06, "loss": 0.208, "step": 3494 }, { "epoch": 1.6549274933412252, "grad_norm": 1.0981913805007935, "learning_rate": 8.808693271359346e-06, "loss": 0.2074, "step": 3495 }, { "epoch": 1.6554010062148565, "grad_norm": 1.7508763074874878, "learning_rate": 8.803615745346675e-06, "loss": 0.238, "step": 3496 }, { "epoch": 1.6558745190884876, "grad_norm": 1.0915553569793701, "learning_rate": 8.798538532236764e-06, "loss": 0.2166, "step": 3497 }, { "epoch": 1.656348031962119, "grad_norm": 0.9217873811721802, "learning_rate": 8.79346163335751e-06, "loss": 0.1957, "step": 3498 }, { "epoch": 1.6568215448357502, "grad_norm": 2.03841233253479, "learning_rate": 8.78838505003673e-06, "loss": 0.2089, "step": 3499 }, { "epoch": 1.6572950577093815, "grad_norm": 1.8650535345077515, "learning_rate": 8.783308783602148e-06, "loss": 0.188, "step": 3500 }, { "epoch": 1.6577685705830127, "grad_norm": 0.9951745867729187, "learning_rate": 8.778232835381415e-06, "loss": 0.1941, "step": 3501 }, { "epoch": 1.6582420834566438, "grad_norm": 1.168714165687561, "learning_rate": 8.773157206702097e-06, "loss": 0.2112, "step": 3502 }, { "epoch": 1.6587155963302753, "grad_norm": 1.0129159688949585, "learning_rate": 8.768081898891679e-06, "loss": 0.2112, "step": 3503 }, { "epoch": 1.6591891092039064, "grad_norm": 1.5583324432373047, "learning_rate": 8.763006913277553e-06, "loss": 0.2165, "step": 3504 }, { "epoch": 1.659662622077538, "grad_norm": 1.3734880685806274, "learning_rate": 8.757932251187037e-06, "loss": 0.2241, "step": 3505 }, { "epoch": 1.660136134951169, "grad_norm": 1.3286396265029907, "learning_rate": 8.752857913947357e-06, "loss": 0.2349, "step": 3506 }, { "epoch": 1.6606096478248003, "grad_norm": 1.3131154775619507, "learning_rate": 8.747783902885657e-06, "loss": 0.2454, "step": 3507 }, { "epoch": 1.6610831606984315, "grad_norm": 1.0709985494613647, "learning_rate": 8.742710219329e-06, "loss": 0.225, "step": 3508 }, { "epoch": 1.6615566735720626, "grad_norm": 1.4955830574035645, "learning_rate": 8.737636864604357e-06, "loss": 0.1989, "step": 3509 }, { "epoch": 1.6620301864456941, "grad_norm": 1.0613009929656982, "learning_rate": 8.73256384003861e-06, "loss": 0.2329, "step": 3510 }, { "epoch": 1.6625036993193252, "grad_norm": 1.2073135375976562, "learning_rate": 8.727491146958566e-06, "loss": 0.2259, "step": 3511 }, { "epoch": 1.6629772121929565, "grad_norm": 1.160805583000183, "learning_rate": 8.722418786690936e-06, "loss": 0.2198, "step": 3512 }, { "epoch": 1.6634507250665878, "grad_norm": 1.5360057353973389, "learning_rate": 8.717346760562345e-06, "loss": 0.2242, "step": 3513 }, { "epoch": 1.663924237940219, "grad_norm": 1.0685076713562012, "learning_rate": 8.712275069899337e-06, "loss": 0.2136, "step": 3514 }, { "epoch": 1.6643977508138503, "grad_norm": 1.6072028875350952, "learning_rate": 8.707203716028358e-06, "loss": 0.2215, "step": 3515 }, { "epoch": 1.6648712636874814, "grad_norm": 1.209231972694397, "learning_rate": 8.702132700275778e-06, "loss": 0.2225, "step": 3516 }, { "epoch": 1.665344776561113, "grad_norm": 1.0955963134765625, "learning_rate": 8.697062023967869e-06, "loss": 0.2164, "step": 3517 }, { "epoch": 1.665818289434744, "grad_norm": 0.9355467557907104, "learning_rate": 8.691991688430818e-06, "loss": 0.2106, "step": 3518 }, { "epoch": 1.6662918023083753, "grad_norm": 0.9584832787513733, "learning_rate": 8.686921694990724e-06, "loss": 0.2093, "step": 3519 }, { "epoch": 1.6667653151820065, "grad_norm": 1.7122430801391602, "learning_rate": 8.681852044973591e-06, "loss": 0.2166, "step": 3520 }, { "epoch": 1.6672388280556376, "grad_norm": 1.5137004852294922, "learning_rate": 8.67678273970534e-06, "loss": 0.2199, "step": 3521 }, { "epoch": 1.6677123409292691, "grad_norm": 1.2796556949615479, "learning_rate": 8.671713780511798e-06, "loss": 0.2053, "step": 3522 }, { "epoch": 1.6681858538029002, "grad_norm": 1.1300054788589478, "learning_rate": 8.666645168718705e-06, "loss": 0.2256, "step": 3523 }, { "epoch": 1.6686593666765315, "grad_norm": 1.094119906425476, "learning_rate": 8.661576905651704e-06, "loss": 0.2042, "step": 3524 }, { "epoch": 1.6691328795501628, "grad_norm": 1.1889249086380005, "learning_rate": 8.656508992636352e-06, "loss": 0.2144, "step": 3525 }, { "epoch": 1.669606392423794, "grad_norm": 0.9747273325920105, "learning_rate": 8.65144143099811e-06, "loss": 0.1979, "step": 3526 }, { "epoch": 1.6700799052974253, "grad_norm": 1.1570683717727661, "learning_rate": 8.646374222062352e-06, "loss": 0.2429, "step": 3527 }, { "epoch": 1.6705534181710564, "grad_norm": 1.3277277946472168, "learning_rate": 8.641307367154356e-06, "loss": 0.2064, "step": 3528 }, { "epoch": 1.671026931044688, "grad_norm": 1.3713091611862183, "learning_rate": 8.636240867599314e-06, "loss": 0.2051, "step": 3529 }, { "epoch": 1.671500443918319, "grad_norm": 1.2886407375335693, "learning_rate": 8.631174724722315e-06, "loss": 0.2237, "step": 3530 }, { "epoch": 1.6719739567919503, "grad_norm": 1.255431056022644, "learning_rate": 8.626108939848362e-06, "loss": 0.2136, "step": 3531 }, { "epoch": 1.6724474696655816, "grad_norm": 1.0875186920166016, "learning_rate": 8.621043514302361e-06, "loss": 0.1884, "step": 3532 }, { "epoch": 1.6729209825392126, "grad_norm": 1.1179817914962769, "learning_rate": 8.615978449409124e-06, "loss": 0.2243, "step": 3533 }, { "epoch": 1.6733944954128441, "grad_norm": 0.9758694171905518, "learning_rate": 8.610913746493377e-06, "loss": 0.2246, "step": 3534 }, { "epoch": 1.6738680082864752, "grad_norm": 1.0491384267807007, "learning_rate": 8.605849406879736e-06, "loss": 0.241, "step": 3535 }, { "epoch": 1.6743415211601067, "grad_norm": 1.1519798040390015, "learning_rate": 8.600785431892731e-06, "loss": 0.2076, "step": 3536 }, { "epoch": 1.6748150340337378, "grad_norm": 1.3320951461791992, "learning_rate": 8.5957218228568e-06, "loss": 0.2107, "step": 3537 }, { "epoch": 1.675288546907369, "grad_norm": 1.127048134803772, "learning_rate": 8.590658581096278e-06, "loss": 0.2077, "step": 3538 }, { "epoch": 1.6757620597810003, "grad_norm": 1.4201406240463257, "learning_rate": 8.585595707935408e-06, "loss": 0.2116, "step": 3539 }, { "epoch": 1.6762355726546314, "grad_norm": 1.774510145187378, "learning_rate": 8.580533204698336e-06, "loss": 0.2003, "step": 3540 }, { "epoch": 1.676709085528263, "grad_norm": 1.9603132009506226, "learning_rate": 8.57547107270911e-06, "loss": 0.2353, "step": 3541 }, { "epoch": 1.677182598401894, "grad_norm": 1.5556761026382446, "learning_rate": 8.570409313291683e-06, "loss": 0.2191, "step": 3542 }, { "epoch": 1.6776561112755253, "grad_norm": 1.459722638130188, "learning_rate": 8.56534792776991e-06, "loss": 0.1919, "step": 3543 }, { "epoch": 1.6781296241491566, "grad_norm": 1.822821021080017, "learning_rate": 8.560286917467543e-06, "loss": 0.2449, "step": 3544 }, { "epoch": 1.6786031370227879, "grad_norm": 1.4285695552825928, "learning_rate": 8.555226283708246e-06, "loss": 0.1897, "step": 3545 }, { "epoch": 1.6790766498964191, "grad_norm": 1.1042506694793701, "learning_rate": 8.550166027815576e-06, "loss": 0.2184, "step": 3546 }, { "epoch": 1.6795501627700502, "grad_norm": 0.9724579453468323, "learning_rate": 8.545106151112994e-06, "loss": 0.2061, "step": 3547 }, { "epoch": 1.6800236756436817, "grad_norm": 1.1628752946853638, "learning_rate": 8.540046654923863e-06, "loss": 0.1937, "step": 3548 }, { "epoch": 1.6804971885173128, "grad_norm": 1.408586025238037, "learning_rate": 8.534987540571445e-06, "loss": 0.2233, "step": 3549 }, { "epoch": 1.680970701390944, "grad_norm": 1.0293395519256592, "learning_rate": 8.529928809378902e-06, "loss": 0.2141, "step": 3550 }, { "epoch": 1.6814442142645754, "grad_norm": 2.2602131366729736, "learning_rate": 8.524870462669296e-06, "loss": 0.2243, "step": 3551 }, { "epoch": 1.6819177271382064, "grad_norm": 1.1277772188186646, "learning_rate": 8.519812501765591e-06, "loss": 0.2004, "step": 3552 }, { "epoch": 1.682391240011838, "grad_norm": 1.7212578058242798, "learning_rate": 8.514754927990646e-06, "loss": 0.1934, "step": 3553 }, { "epoch": 1.682864752885469, "grad_norm": 1.1089184284210205, "learning_rate": 8.509697742667219e-06, "loss": 0.2354, "step": 3554 }, { "epoch": 1.6833382657591003, "grad_norm": 1.5018812417984009, "learning_rate": 8.504640947117973e-06, "loss": 0.2144, "step": 3555 }, { "epoch": 1.6838117786327316, "grad_norm": 1.6116001605987549, "learning_rate": 8.49958454266546e-06, "loss": 0.2174, "step": 3556 }, { "epoch": 1.6842852915063629, "grad_norm": 1.9937829971313477, "learning_rate": 8.494528530632136e-06, "loss": 0.2132, "step": 3557 }, { "epoch": 1.6847588043799941, "grad_norm": 1.3421846628189087, "learning_rate": 8.48947291234035e-06, "loss": 0.2168, "step": 3558 }, { "epoch": 1.6852323172536252, "grad_norm": 1.112237572669983, "learning_rate": 8.484417689112356e-06, "loss": 0.2025, "step": 3559 }, { "epoch": 1.6857058301272567, "grad_norm": 1.0491182804107666, "learning_rate": 8.47936286227029e-06, "loss": 0.2236, "step": 3560 }, { "epoch": 1.6861793430008878, "grad_norm": 1.0888773202896118, "learning_rate": 8.4743084331362e-06, "loss": 0.2186, "step": 3561 }, { "epoch": 1.686652855874519, "grad_norm": 0.9247356653213501, "learning_rate": 8.46925440303202e-06, "loss": 0.2121, "step": 3562 }, { "epoch": 1.6871263687481504, "grad_norm": 0.986939013004303, "learning_rate": 8.464200773279582e-06, "loss": 0.2269, "step": 3563 }, { "epoch": 1.6875998816217814, "grad_norm": 1.2144672870635986, "learning_rate": 8.459147545200617e-06, "loss": 0.219, "step": 3564 }, { "epoch": 1.688073394495413, "grad_norm": 1.2081797122955322, "learning_rate": 8.454094720116745e-06, "loss": 0.1891, "step": 3565 }, { "epoch": 1.688546907369044, "grad_norm": 1.819753646850586, "learning_rate": 8.449042299349487e-06, "loss": 0.2095, "step": 3566 }, { "epoch": 1.6890204202426755, "grad_norm": 0.952063262462616, "learning_rate": 8.443990284220252e-06, "loss": 0.2111, "step": 3567 }, { "epoch": 1.6894939331163066, "grad_norm": 1.0713047981262207, "learning_rate": 8.43893867605035e-06, "loss": 0.2046, "step": 3568 }, { "epoch": 1.6899674459899379, "grad_norm": 1.6247305870056152, "learning_rate": 8.433887476160976e-06, "loss": 0.1997, "step": 3569 }, { "epoch": 1.6904409588635692, "grad_norm": 0.9922040104866028, "learning_rate": 8.428836685873223e-06, "loss": 0.1791, "step": 3570 }, { "epoch": 1.6909144717372002, "grad_norm": 1.1413791179656982, "learning_rate": 8.423786306508076e-06, "loss": 0.2028, "step": 3571 }, { "epoch": 1.6913879846108317, "grad_norm": 1.1221214532852173, "learning_rate": 8.418736339386417e-06, "loss": 0.2044, "step": 3572 }, { "epoch": 1.6918614974844628, "grad_norm": 1.0321074724197388, "learning_rate": 8.413686785829013e-06, "loss": 0.2371, "step": 3573 }, { "epoch": 1.692335010358094, "grad_norm": 1.458929181098938, "learning_rate": 8.408637647156528e-06, "loss": 0.228, "step": 3574 }, { "epoch": 1.6928085232317254, "grad_norm": 1.4914942979812622, "learning_rate": 8.403588924689511e-06, "loss": 0.1998, "step": 3575 }, { "epoch": 1.6932820361053567, "grad_norm": 1.992150902748108, "learning_rate": 8.398540619748414e-06, "loss": 0.2095, "step": 3576 }, { "epoch": 1.693755548978988, "grad_norm": 0.9819469451904297, "learning_rate": 8.393492733653566e-06, "loss": 0.2025, "step": 3577 }, { "epoch": 1.694229061852619, "grad_norm": 0.8754822611808777, "learning_rate": 8.388445267725197e-06, "loss": 0.1947, "step": 3578 }, { "epoch": 1.6947025747262505, "grad_norm": 1.3592606782913208, "learning_rate": 8.38339822328342e-06, "loss": 0.2065, "step": 3579 }, { "epoch": 1.6951760875998816, "grad_norm": 1.2748628854751587, "learning_rate": 8.378351601648243e-06, "loss": 0.2091, "step": 3580 }, { "epoch": 1.6956496004735129, "grad_norm": 1.1652089357376099, "learning_rate": 8.373305404139558e-06, "loss": 0.1885, "step": 3581 }, { "epoch": 1.6961231133471442, "grad_norm": 1.0622856616973877, "learning_rate": 8.368259632077153e-06, "loss": 0.2115, "step": 3582 }, { "epoch": 1.6965966262207752, "grad_norm": 1.0490455627441406, "learning_rate": 8.363214286780699e-06, "loss": 0.2059, "step": 3583 }, { "epoch": 1.6970701390944067, "grad_norm": 1.0844651460647583, "learning_rate": 8.35816936956976e-06, "loss": 0.223, "step": 3584 }, { "epoch": 1.6975436519680378, "grad_norm": 1.131882905960083, "learning_rate": 8.35312488176378e-06, "loss": 0.2312, "step": 3585 }, { "epoch": 1.698017164841669, "grad_norm": 1.6975009441375732, "learning_rate": 8.348080824682102e-06, "loss": 0.2397, "step": 3586 }, { "epoch": 1.6984906777153004, "grad_norm": 1.1549729108810425, "learning_rate": 8.343037199643947e-06, "loss": 0.2051, "step": 3587 }, { "epoch": 1.6989641905889317, "grad_norm": 1.2071688175201416, "learning_rate": 8.33799400796843e-06, "loss": 0.2313, "step": 3588 }, { "epoch": 1.699437703462563, "grad_norm": 1.1195539236068726, "learning_rate": 8.332951250974543e-06, "loss": 0.2112, "step": 3589 }, { "epoch": 1.699911216336194, "grad_norm": 1.7075669765472412, "learning_rate": 8.327908929981175e-06, "loss": 0.2271, "step": 3590 }, { "epoch": 1.7003847292098255, "grad_norm": 1.0217007398605347, "learning_rate": 8.322867046307096e-06, "loss": 0.1973, "step": 3591 }, { "epoch": 1.7008582420834566, "grad_norm": 1.1293320655822754, "learning_rate": 8.317825601270954e-06, "loss": 0.2094, "step": 3592 }, { "epoch": 1.7013317549570879, "grad_norm": 1.1042124032974243, "learning_rate": 8.312784596191308e-06, "loss": 0.2178, "step": 3593 }, { "epoch": 1.7018052678307192, "grad_norm": 1.4056512117385864, "learning_rate": 8.307744032386571e-06, "loss": 0.2054, "step": 3594 }, { "epoch": 1.7022787807043502, "grad_norm": 1.0147428512573242, "learning_rate": 8.302703911175057e-06, "loss": 0.1927, "step": 3595 }, { "epoch": 1.7027522935779817, "grad_norm": 1.2608839273452759, "learning_rate": 8.297664233874958e-06, "loss": 0.2028, "step": 3596 }, { "epoch": 1.7032258064516128, "grad_norm": 1.170110821723938, "learning_rate": 8.292625001804359e-06, "loss": 0.21, "step": 3597 }, { "epoch": 1.7036993193252443, "grad_norm": 1.0504069328308105, "learning_rate": 8.28758621628122e-06, "loss": 0.2135, "step": 3598 }, { "epoch": 1.7041728321988754, "grad_norm": 0.971572995185852, "learning_rate": 8.282547878623384e-06, "loss": 0.2039, "step": 3599 }, { "epoch": 1.7046463450725067, "grad_norm": 1.2856515645980835, "learning_rate": 8.277509990148584e-06, "loss": 0.2159, "step": 3600 }, { "epoch": 1.705119857946138, "grad_norm": 0.9921664595603943, "learning_rate": 8.272472552174426e-06, "loss": 0.2166, "step": 3601 }, { "epoch": 1.705593370819769, "grad_norm": 1.2325758934020996, "learning_rate": 8.267435566018409e-06, "loss": 0.1934, "step": 3602 }, { "epoch": 1.7060668836934005, "grad_norm": 1.430541753768921, "learning_rate": 8.262399032997906e-06, "loss": 0.2026, "step": 3603 }, { "epoch": 1.7065403965670316, "grad_norm": 1.1874326467514038, "learning_rate": 8.257362954430172e-06, "loss": 0.1966, "step": 3604 }, { "epoch": 1.707013909440663, "grad_norm": 1.9422804117202759, "learning_rate": 8.252327331632343e-06, "loss": 0.2389, "step": 3605 }, { "epoch": 1.7074874223142942, "grad_norm": 1.1945825815200806, "learning_rate": 8.247292165921443e-06, "loss": 0.2184, "step": 3606 }, { "epoch": 1.7079609351879255, "grad_norm": 1.6779309511184692, "learning_rate": 8.242257458614368e-06, "loss": 0.1963, "step": 3607 }, { "epoch": 1.7084344480615568, "grad_norm": 1.2465808391571045, "learning_rate": 8.237223211027897e-06, "loss": 0.2136, "step": 3608 }, { "epoch": 1.7089079609351878, "grad_norm": 2.2780213356018066, "learning_rate": 8.23218942447869e-06, "loss": 0.22, "step": 3609 }, { "epoch": 1.7093814738088193, "grad_norm": 1.2657932043075562, "learning_rate": 8.227156100283283e-06, "loss": 0.2034, "step": 3610 }, { "epoch": 1.7098549866824504, "grad_norm": 1.1345802545547485, "learning_rate": 8.222123239758097e-06, "loss": 0.1913, "step": 3611 }, { "epoch": 1.7103284995560817, "grad_norm": 1.0455700159072876, "learning_rate": 8.217090844219425e-06, "loss": 0.2135, "step": 3612 }, { "epoch": 1.710802012429713, "grad_norm": 1.8622885942459106, "learning_rate": 8.212058914983445e-06, "loss": 0.2242, "step": 3613 }, { "epoch": 1.711275525303344, "grad_norm": 1.0153396129608154, "learning_rate": 8.207027453366206e-06, "loss": 0.2115, "step": 3614 }, { "epoch": 1.7117490381769755, "grad_norm": 1.053398609161377, "learning_rate": 8.201996460683638e-06, "loss": 0.2055, "step": 3615 }, { "epoch": 1.7122225510506066, "grad_norm": 1.2141062021255493, "learning_rate": 8.19696593825155e-06, "loss": 0.1882, "step": 3616 }, { "epoch": 1.712696063924238, "grad_norm": 1.2081797122955322, "learning_rate": 8.191935887385628e-06, "loss": 0.2217, "step": 3617 }, { "epoch": 1.7131695767978692, "grad_norm": 1.0772361755371094, "learning_rate": 8.186906309401431e-06, "loss": 0.2318, "step": 3618 }, { "epoch": 1.7136430896715005, "grad_norm": 1.2851800918579102, "learning_rate": 8.181877205614398e-06, "loss": 0.2083, "step": 3619 }, { "epoch": 1.7141166025451318, "grad_norm": 1.463058590888977, "learning_rate": 8.176848577339843e-06, "loss": 0.2274, "step": 3620 }, { "epoch": 1.7145901154187628, "grad_norm": 1.5855882167816162, "learning_rate": 8.171820425892952e-06, "loss": 0.2146, "step": 3621 }, { "epoch": 1.7150636282923943, "grad_norm": 1.2082139253616333, "learning_rate": 8.166792752588797e-06, "loss": 0.2106, "step": 3622 }, { "epoch": 1.7155371411660254, "grad_norm": 0.9224141240119934, "learning_rate": 8.161765558742307e-06, "loss": 0.1913, "step": 3623 }, { "epoch": 1.7160106540396567, "grad_norm": 1.1134727001190186, "learning_rate": 8.156738845668303e-06, "loss": 0.2006, "step": 3624 }, { "epoch": 1.716484166913288, "grad_norm": 1.2312144041061401, "learning_rate": 8.15171261468147e-06, "loss": 0.2164, "step": 3625 }, { "epoch": 1.716957679786919, "grad_norm": 1.2112536430358887, "learning_rate": 8.146686867096376e-06, "loss": 0.2118, "step": 3626 }, { "epoch": 1.7174311926605506, "grad_norm": 0.9137608408927917, "learning_rate": 8.141661604227448e-06, "loss": 0.1946, "step": 3627 }, { "epoch": 1.7179047055341816, "grad_norm": 1.5561513900756836, "learning_rate": 8.136636827389002e-06, "loss": 0.2256, "step": 3628 }, { "epoch": 1.718378218407813, "grad_norm": 0.8672730922698975, "learning_rate": 8.131612537895218e-06, "loss": 0.1872, "step": 3629 }, { "epoch": 1.7188517312814442, "grad_norm": 1.000693917274475, "learning_rate": 8.126588737060149e-06, "loss": 0.2139, "step": 3630 }, { "epoch": 1.7193252441550755, "grad_norm": 0.9694979190826416, "learning_rate": 8.121565426197722e-06, "loss": 0.1927, "step": 3631 }, { "epoch": 1.7197987570287068, "grad_norm": 0.90753573179245, "learning_rate": 8.116542606621736e-06, "loss": 0.2048, "step": 3632 }, { "epoch": 1.7202722699023378, "grad_norm": 1.230448842048645, "learning_rate": 8.111520279645864e-06, "loss": 0.2298, "step": 3633 }, { "epoch": 1.7207457827759693, "grad_norm": 1.0563310384750366, "learning_rate": 8.106498446583641e-06, "loss": 0.2158, "step": 3634 }, { "epoch": 1.7212192956496004, "grad_norm": 1.2370612621307373, "learning_rate": 8.101477108748486e-06, "loss": 0.214, "step": 3635 }, { "epoch": 1.7216928085232317, "grad_norm": 0.9708116054534912, "learning_rate": 8.096456267453674e-06, "loss": 0.1969, "step": 3636 }, { "epoch": 1.722166321396863, "grad_norm": 1.1120837926864624, "learning_rate": 8.091435924012365e-06, "loss": 0.2115, "step": 3637 }, { "epoch": 1.7226398342704943, "grad_norm": 0.838356077671051, "learning_rate": 8.086416079737577e-06, "loss": 0.2015, "step": 3638 }, { "epoch": 1.7231133471441256, "grad_norm": 1.4962152242660522, "learning_rate": 8.081396735942204e-06, "loss": 0.2272, "step": 3639 }, { "epoch": 1.7235868600177566, "grad_norm": 1.4194129705429077, "learning_rate": 8.076377893939003e-06, "loss": 0.2158, "step": 3640 }, { "epoch": 1.7240603728913881, "grad_norm": 1.0114027261734009, "learning_rate": 8.071359555040607e-06, "loss": 0.218, "step": 3641 }, { "epoch": 1.7245338857650192, "grad_norm": 1.3933831453323364, "learning_rate": 8.066341720559513e-06, "loss": 0.1817, "step": 3642 }, { "epoch": 1.7250073986386505, "grad_norm": 1.2769237756729126, "learning_rate": 8.06132439180809e-06, "loss": 0.208, "step": 3643 }, { "epoch": 1.7254809115122818, "grad_norm": 1.4070305824279785, "learning_rate": 8.056307570098566e-06, "loss": 0.2166, "step": 3644 }, { "epoch": 1.7259544243859128, "grad_norm": 1.0970323085784912, "learning_rate": 8.051291256743048e-06, "loss": 0.1809, "step": 3645 }, { "epoch": 1.7264279372595444, "grad_norm": 1.1892532110214233, "learning_rate": 8.046275453053501e-06, "loss": 0.2006, "step": 3646 }, { "epoch": 1.7269014501331754, "grad_norm": 1.8722046613693237, "learning_rate": 8.041260160341766e-06, "loss": 0.2364, "step": 3647 }, { "epoch": 1.7273749630068067, "grad_norm": 1.7485390901565552, "learning_rate": 8.036245379919535e-06, "loss": 0.1906, "step": 3648 }, { "epoch": 1.727848475880438, "grad_norm": 2.5090813636779785, "learning_rate": 8.03123111309838e-06, "loss": 0.2261, "step": 3649 }, { "epoch": 1.7283219887540693, "grad_norm": 1.2415854930877686, "learning_rate": 8.026217361189737e-06, "loss": 0.1927, "step": 3650 }, { "epoch": 1.7287955016277006, "grad_norm": 1.012861728668213, "learning_rate": 8.0212041255049e-06, "loss": 0.2023, "step": 3651 }, { "epoch": 1.7292690145013316, "grad_norm": 0.9463083148002625, "learning_rate": 8.016191407355034e-06, "loss": 0.2002, "step": 3652 }, { "epoch": 1.7297425273749631, "grad_norm": 1.2277804613113403, "learning_rate": 8.011179208051168e-06, "loss": 0.1973, "step": 3653 }, { "epoch": 1.7302160402485942, "grad_norm": 0.9926773309707642, "learning_rate": 8.006167528904194e-06, "loss": 0.1912, "step": 3654 }, { "epoch": 1.7306895531222255, "grad_norm": 2.3296539783477783, "learning_rate": 8.001156371224868e-06, "loss": 0.2088, "step": 3655 }, { "epoch": 1.7311630659958568, "grad_norm": 2.002706289291382, "learning_rate": 7.996145736323807e-06, "loss": 0.2359, "step": 3656 }, { "epoch": 1.7316365788694879, "grad_norm": 1.5044262409210205, "learning_rate": 7.991135625511503e-06, "loss": 0.2037, "step": 3657 }, { "epoch": 1.7321100917431194, "grad_norm": 1.2724112272262573, "learning_rate": 7.986126040098291e-06, "loss": 0.2139, "step": 3658 }, { "epoch": 1.7325836046167504, "grad_norm": 1.1246263980865479, "learning_rate": 7.981116981394388e-06, "loss": 0.2028, "step": 3659 }, { "epoch": 1.7330571174903817, "grad_norm": 0.9579681158065796, "learning_rate": 7.97610845070986e-06, "loss": 0.1982, "step": 3660 }, { "epoch": 1.733530630364013, "grad_norm": 1.939510703086853, "learning_rate": 7.971100449354643e-06, "loss": 0.2115, "step": 3661 }, { "epoch": 1.7340041432376443, "grad_norm": 1.2159603834152222, "learning_rate": 7.96609297863853e-06, "loss": 0.2167, "step": 3662 }, { "epoch": 1.7344776561112756, "grad_norm": 1.5756498575210571, "learning_rate": 7.961086039871178e-06, "loss": 0.2003, "step": 3663 }, { "epoch": 1.7349511689849066, "grad_norm": 1.5483087301254272, "learning_rate": 7.956079634362101e-06, "loss": 0.1893, "step": 3664 }, { "epoch": 1.7354246818585382, "grad_norm": 0.9675044417381287, "learning_rate": 7.951073763420679e-06, "loss": 0.2116, "step": 3665 }, { "epoch": 1.7358981947321692, "grad_norm": 1.4665745496749878, "learning_rate": 7.946068428356146e-06, "loss": 0.2102, "step": 3666 }, { "epoch": 1.7363717076058005, "grad_norm": 1.4936654567718506, "learning_rate": 7.941063630477603e-06, "loss": 0.2233, "step": 3667 }, { "epoch": 1.7368452204794318, "grad_norm": 1.196628212928772, "learning_rate": 7.936059371094004e-06, "loss": 0.205, "step": 3668 }, { "epoch": 1.737318733353063, "grad_norm": 1.1913931369781494, "learning_rate": 7.931055651514165e-06, "loss": 0.2089, "step": 3669 }, { "epoch": 1.7377922462266944, "grad_norm": 1.4205213785171509, "learning_rate": 7.926052473046756e-06, "loss": 0.233, "step": 3670 }, { "epoch": 1.7382657591003254, "grad_norm": 1.5477283000946045, "learning_rate": 7.921049837000318e-06, "loss": 0.2258, "step": 3671 }, { "epoch": 1.738739271973957, "grad_norm": 2.6856656074523926, "learning_rate": 7.916047744683244e-06, "loss": 0.216, "step": 3672 }, { "epoch": 1.739212784847588, "grad_norm": 1.0358319282531738, "learning_rate": 7.911046197403775e-06, "loss": 0.2089, "step": 3673 }, { "epoch": 1.7396862977212193, "grad_norm": 1.1069475412368774, "learning_rate": 7.90604519647002e-06, "loss": 0.2109, "step": 3674 }, { "epoch": 1.7401598105948506, "grad_norm": 1.5351327657699585, "learning_rate": 7.901044743189943e-06, "loss": 0.2081, "step": 3675 }, { "epoch": 1.7406333234684817, "grad_norm": 1.5493526458740234, "learning_rate": 7.896044838871365e-06, "loss": 0.2187, "step": 3676 }, { "epoch": 1.7411068363421132, "grad_norm": 1.2532929182052612, "learning_rate": 7.891045484821961e-06, "loss": 0.2365, "step": 3677 }, { "epoch": 1.7415803492157442, "grad_norm": 1.0188684463500977, "learning_rate": 7.886046682349267e-06, "loss": 0.2313, "step": 3678 }, { "epoch": 1.7420538620893755, "grad_norm": 1.1516002416610718, "learning_rate": 7.88104843276067e-06, "loss": 0.2071, "step": 3679 }, { "epoch": 1.7425273749630068, "grad_norm": 1.1517740488052368, "learning_rate": 7.876050737363411e-06, "loss": 0.2039, "step": 3680 }, { "epoch": 1.743000887836638, "grad_norm": 1.396052598953247, "learning_rate": 7.871053597464593e-06, "loss": 0.2138, "step": 3681 }, { "epoch": 1.7434744007102694, "grad_norm": 1.08030104637146, "learning_rate": 7.86605701437117e-06, "loss": 0.2316, "step": 3682 }, { "epoch": 1.7439479135839004, "grad_norm": 2.2446389198303223, "learning_rate": 7.861060989389947e-06, "loss": 0.2267, "step": 3683 }, { "epoch": 1.744421426457532, "grad_norm": 1.3251670598983765, "learning_rate": 7.856065523827586e-06, "loss": 0.1961, "step": 3684 }, { "epoch": 1.744894939331163, "grad_norm": 1.188880205154419, "learning_rate": 7.851070618990607e-06, "loss": 0.2184, "step": 3685 }, { "epoch": 1.7453684522047943, "grad_norm": 1.126604676246643, "learning_rate": 7.846076276185372e-06, "loss": 0.24, "step": 3686 }, { "epoch": 1.7458419650784256, "grad_norm": 1.0912270545959473, "learning_rate": 7.841082496718112e-06, "loss": 0.2049, "step": 3687 }, { "epoch": 1.7463154779520567, "grad_norm": 1.5733311176300049, "learning_rate": 7.836089281894895e-06, "loss": 0.2068, "step": 3688 }, { "epoch": 1.7467889908256882, "grad_norm": 0.9812442064285278, "learning_rate": 7.831096633021651e-06, "loss": 0.2069, "step": 3689 }, { "epoch": 1.7472625036993192, "grad_norm": 1.4512606859207153, "learning_rate": 7.826104551404161e-06, "loss": 0.2207, "step": 3690 }, { "epoch": 1.7477360165729505, "grad_norm": 2.106611967086792, "learning_rate": 7.821113038348052e-06, "loss": 0.2208, "step": 3691 }, { "epoch": 1.7482095294465818, "grad_norm": 1.069126009941101, "learning_rate": 7.81612209515881e-06, "loss": 0.2355, "step": 3692 }, { "epoch": 1.748683042320213, "grad_norm": 1.088439702987671, "learning_rate": 7.811131723141763e-06, "loss": 0.2171, "step": 3693 }, { "epoch": 1.7491565551938444, "grad_norm": 1.1019070148468018, "learning_rate": 7.806141923602098e-06, "loss": 0.2021, "step": 3694 }, { "epoch": 1.7496300680674755, "grad_norm": 1.3674668073654175, "learning_rate": 7.801152697844849e-06, "loss": 0.2084, "step": 3695 }, { "epoch": 1.750103580941107, "grad_norm": 1.3417162895202637, "learning_rate": 7.796164047174898e-06, "loss": 0.2, "step": 3696 }, { "epoch": 1.750577093814738, "grad_norm": 1.5355737209320068, "learning_rate": 7.79117597289698e-06, "loss": 0.2159, "step": 3697 }, { "epoch": 1.7510506066883693, "grad_norm": 1.4953505992889404, "learning_rate": 7.786188476315678e-06, "loss": 0.2188, "step": 3698 }, { "epoch": 1.7515241195620006, "grad_norm": 1.4030543565750122, "learning_rate": 7.781201558735423e-06, "loss": 0.2026, "step": 3699 }, { "epoch": 1.751997632435632, "grad_norm": 1.5488836765289307, "learning_rate": 7.776215221460496e-06, "loss": 0.2155, "step": 3700 }, { "epoch": 1.7524711453092632, "grad_norm": 1.2541826963424683, "learning_rate": 7.771229465795024e-06, "loss": 0.2176, "step": 3701 }, { "epoch": 1.7529446581828942, "grad_norm": 0.9930739998817444, "learning_rate": 7.766244293042983e-06, "loss": 0.2305, "step": 3702 }, { "epoch": 1.7534181710565258, "grad_norm": 1.1047271490097046, "learning_rate": 7.7612597045082e-06, "loss": 0.2213, "step": 3703 }, { "epoch": 1.7538916839301568, "grad_norm": 1.2397290468215942, "learning_rate": 7.75627570149434e-06, "loss": 0.2186, "step": 3704 }, { "epoch": 1.754365196803788, "grad_norm": 1.353489637374878, "learning_rate": 7.751292285304928e-06, "loss": 0.2263, "step": 3705 }, { "epoch": 1.7548387096774194, "grad_norm": 1.1363734006881714, "learning_rate": 7.746309457243324e-06, "loss": 0.2071, "step": 3706 }, { "epoch": 1.7553122225510505, "grad_norm": 1.537649393081665, "learning_rate": 7.74132721861274e-06, "loss": 0.2063, "step": 3707 }, { "epoch": 1.755785735424682, "grad_norm": 1.076953649520874, "learning_rate": 7.73634557071623e-06, "loss": 0.2097, "step": 3708 }, { "epoch": 1.756259248298313, "grad_norm": 1.2859452962875366, "learning_rate": 7.731364514856698e-06, "loss": 0.2212, "step": 3709 }, { "epoch": 1.7567327611719443, "grad_norm": 1.1581248044967651, "learning_rate": 7.726384052336893e-06, "loss": 0.2189, "step": 3710 }, { "epoch": 1.7572062740455756, "grad_norm": 1.3928226232528687, "learning_rate": 7.721404184459405e-06, "loss": 0.2324, "step": 3711 }, { "epoch": 1.757679786919207, "grad_norm": 1.322218894958496, "learning_rate": 7.716424912526672e-06, "loss": 0.201, "step": 3712 }, { "epoch": 1.7581532997928382, "grad_norm": 1.102550745010376, "learning_rate": 7.711446237840971e-06, "loss": 0.2371, "step": 3713 }, { "epoch": 1.7586268126664693, "grad_norm": 1.2192142009735107, "learning_rate": 7.70646816170443e-06, "loss": 0.2255, "step": 3714 }, { "epoch": 1.7591003255401008, "grad_norm": 1.6167590618133545, "learning_rate": 7.701490685419014e-06, "loss": 0.2051, "step": 3715 }, { "epoch": 1.7595738384137318, "grad_norm": 1.1392605304718018, "learning_rate": 7.696513810286534e-06, "loss": 0.2178, "step": 3716 }, { "epoch": 1.7600473512873631, "grad_norm": 0.9116237759590149, "learning_rate": 7.69153753760865e-06, "loss": 0.2093, "step": 3717 }, { "epoch": 1.7605208641609944, "grad_norm": 1.1358370780944824, "learning_rate": 7.686561868686848e-06, "loss": 0.2036, "step": 3718 }, { "epoch": 1.7609943770346255, "grad_norm": 1.4042670726776123, "learning_rate": 7.681586804822471e-06, "loss": 0.2023, "step": 3719 }, { "epoch": 1.761467889908257, "grad_norm": 1.1344070434570312, "learning_rate": 7.676612347316702e-06, "loss": 0.225, "step": 3720 }, { "epoch": 1.761941402781888, "grad_norm": 1.1140799522399902, "learning_rate": 7.671638497470558e-06, "loss": 0.1941, "step": 3721 }, { "epoch": 1.7624149156555193, "grad_norm": 0.9498646259307861, "learning_rate": 7.666665256584902e-06, "loss": 0.2175, "step": 3722 }, { "epoch": 1.7628884285291506, "grad_norm": 1.3666812181472778, "learning_rate": 7.66169262596044e-06, "loss": 0.2125, "step": 3723 }, { "epoch": 1.763361941402782, "grad_norm": 1.0839076042175293, "learning_rate": 7.656720606897711e-06, "loss": 0.2253, "step": 3724 }, { "epoch": 1.7638354542764132, "grad_norm": 0.9082208871841431, "learning_rate": 7.651749200697104e-06, "loss": 0.1788, "step": 3725 }, { "epoch": 1.7643089671500443, "grad_norm": 0.9702308773994446, "learning_rate": 7.646778408658839e-06, "loss": 0.2188, "step": 3726 }, { "epoch": 1.7647824800236758, "grad_norm": 0.9975780844688416, "learning_rate": 7.64180823208298e-06, "loss": 0.2132, "step": 3727 }, { "epoch": 1.7652559928973068, "grad_norm": 2.509084463119507, "learning_rate": 7.636838672269425e-06, "loss": 0.2237, "step": 3728 }, { "epoch": 1.7657295057709381, "grad_norm": 1.6179232597351074, "learning_rate": 7.63186973051792e-06, "loss": 0.2145, "step": 3729 }, { "epoch": 1.7662030186445694, "grad_norm": 1.5962454080581665, "learning_rate": 7.626901408128039e-06, "loss": 0.1912, "step": 3730 }, { "epoch": 1.7666765315182007, "grad_norm": 0.9649477005004883, "learning_rate": 7.621933706399202e-06, "loss": 0.2245, "step": 3731 }, { "epoch": 1.767150044391832, "grad_norm": 1.120977520942688, "learning_rate": 7.616966626630663e-06, "loss": 0.2079, "step": 3732 }, { "epoch": 1.767623557265463, "grad_norm": 1.2438452243804932, "learning_rate": 7.612000170121513e-06, "loss": 0.2257, "step": 3733 }, { "epoch": 1.7680970701390946, "grad_norm": 0.9499486684799194, "learning_rate": 7.607034338170681e-06, "loss": 0.2267, "step": 3734 }, { "epoch": 1.7685705830127256, "grad_norm": 1.2596803903579712, "learning_rate": 7.6020691320769325e-06, "loss": 0.2202, "step": 3735 }, { "epoch": 1.769044095886357, "grad_norm": 2.5322797298431396, "learning_rate": 7.597104553138872e-06, "loss": 0.2052, "step": 3736 }, { "epoch": 1.7695176087599882, "grad_norm": 1.7047964334487915, "learning_rate": 7.592140602654931e-06, "loss": 0.2073, "step": 3737 }, { "epoch": 1.7699911216336193, "grad_norm": 0.9767723083496094, "learning_rate": 7.587177281923388e-06, "loss": 0.2141, "step": 3738 }, { "epoch": 1.7704646345072508, "grad_norm": 1.1681851148605347, "learning_rate": 7.582214592242348e-06, "loss": 0.2123, "step": 3739 }, { "epoch": 1.7709381473808818, "grad_norm": 1.0731382369995117, "learning_rate": 7.577252534909758e-06, "loss": 0.1867, "step": 3740 }, { "epoch": 1.7714116602545131, "grad_norm": 1.2089591026306152, "learning_rate": 7.5722911112233956e-06, "loss": 0.1974, "step": 3741 }, { "epoch": 1.7718851731281444, "grad_norm": 1.1869101524353027, "learning_rate": 7.56733032248087e-06, "loss": 0.2027, "step": 3742 }, { "epoch": 1.7723586860017757, "grad_norm": 1.4566733837127686, "learning_rate": 7.562370169979633e-06, "loss": 0.2055, "step": 3743 }, { "epoch": 1.772832198875407, "grad_norm": 1.1467299461364746, "learning_rate": 7.55741065501696e-06, "loss": 0.2077, "step": 3744 }, { "epoch": 1.773305711749038, "grad_norm": 1.3079264163970947, "learning_rate": 7.552451778889967e-06, "loss": 0.1882, "step": 3745 }, { "epoch": 1.7737792246226696, "grad_norm": 1.1860123872756958, "learning_rate": 7.547493542895601e-06, "loss": 0.1968, "step": 3746 }, { "epoch": 1.7742527374963006, "grad_norm": 1.007750391960144, "learning_rate": 7.542535948330636e-06, "loss": 0.2064, "step": 3747 }, { "epoch": 1.774726250369932, "grad_norm": 1.3178592920303345, "learning_rate": 7.537578996491683e-06, "loss": 0.2012, "step": 3748 }, { "epoch": 1.7751997632435632, "grad_norm": 1.3056480884552002, "learning_rate": 7.532622688675193e-06, "loss": 0.2228, "step": 3749 }, { "epoch": 1.7756732761171943, "grad_norm": 1.8601652383804321, "learning_rate": 7.527667026177434e-06, "loss": 0.186, "step": 3750 }, { "epoch": 1.7761467889908258, "grad_norm": 2.2548739910125732, "learning_rate": 7.522712010294516e-06, "loss": 0.2139, "step": 3751 }, { "epoch": 1.7766203018644569, "grad_norm": 1.1233947277069092, "learning_rate": 7.517757642322372e-06, "loss": 0.2151, "step": 3752 }, { "epoch": 1.7770938147380881, "grad_norm": 1.173248291015625, "learning_rate": 7.5128039235567686e-06, "loss": 0.2143, "step": 3753 }, { "epoch": 1.7775673276117194, "grad_norm": 1.1922003030776978, "learning_rate": 7.507850855293305e-06, "loss": 0.2198, "step": 3754 }, { "epoch": 1.7780408404853507, "grad_norm": 1.0876096487045288, "learning_rate": 7.502898438827408e-06, "loss": 0.2106, "step": 3755 }, { "epoch": 1.778514353358982, "grad_norm": 1.1909540891647339, "learning_rate": 7.497946675454334e-06, "loss": 0.2067, "step": 3756 }, { "epoch": 1.778987866232613, "grad_norm": 1.084885597229004, "learning_rate": 7.49299556646917e-06, "loss": 0.1632, "step": 3757 }, { "epoch": 1.7794613791062446, "grad_norm": 0.9690635800361633, "learning_rate": 7.48804511316683e-06, "loss": 0.2316, "step": 3758 }, { "epoch": 1.7799348919798756, "grad_norm": 1.4600977897644043, "learning_rate": 7.483095316842057e-06, "loss": 0.2089, "step": 3759 }, { "epoch": 1.780408404853507, "grad_norm": 1.4429023265838623, "learning_rate": 7.478146178789423e-06, "loss": 0.2065, "step": 3760 }, { "epoch": 1.7808819177271382, "grad_norm": 1.128029227256775, "learning_rate": 7.47319770030333e-06, "loss": 0.2118, "step": 3761 }, { "epoch": 1.7813554306007695, "grad_norm": 1.9069571495056152, "learning_rate": 7.4682498826779984e-06, "loss": 0.225, "step": 3762 }, { "epoch": 1.7818289434744008, "grad_norm": 1.254188895225525, "learning_rate": 7.463302727207486e-06, "loss": 0.204, "step": 3763 }, { "epoch": 1.7823024563480319, "grad_norm": 1.4239470958709717, "learning_rate": 7.458356235185674e-06, "loss": 0.2052, "step": 3764 }, { "epoch": 1.7827759692216634, "grad_norm": 0.9776591658592224, "learning_rate": 7.45341040790627e-06, "loss": 0.2175, "step": 3765 }, { "epoch": 1.7832494820952944, "grad_norm": 1.543384075164795, "learning_rate": 7.4484652466628036e-06, "loss": 0.2158, "step": 3766 }, { "epoch": 1.7837229949689257, "grad_norm": 1.3612316846847534, "learning_rate": 7.443520752748639e-06, "loss": 0.2223, "step": 3767 }, { "epoch": 1.784196507842557, "grad_norm": 1.331251859664917, "learning_rate": 7.438576927456958e-06, "loss": 0.2304, "step": 3768 }, { "epoch": 1.784670020716188, "grad_norm": 1.209085464477539, "learning_rate": 7.433633772080772e-06, "loss": 0.2219, "step": 3769 }, { "epoch": 1.7851435335898196, "grad_norm": 2.226287364959717, "learning_rate": 7.428691287912915e-06, "loss": 0.2201, "step": 3770 }, { "epoch": 1.7856170464634507, "grad_norm": 1.0809687376022339, "learning_rate": 7.423749476246046e-06, "loss": 0.2028, "step": 3771 }, { "epoch": 1.786090559337082, "grad_norm": 1.1204088926315308, "learning_rate": 7.4188083383726475e-06, "loss": 0.2243, "step": 3772 }, { "epoch": 1.7865640722107132, "grad_norm": 1.2815765142440796, "learning_rate": 7.413867875585026e-06, "loss": 0.2129, "step": 3773 }, { "epoch": 1.7870375850843445, "grad_norm": 1.0807607173919678, "learning_rate": 7.408928089175314e-06, "loss": 0.2257, "step": 3774 }, { "epoch": 1.7875110979579758, "grad_norm": 1.4838930368423462, "learning_rate": 7.403988980435461e-06, "loss": 0.2331, "step": 3775 }, { "epoch": 1.7879846108316069, "grad_norm": 0.974097490310669, "learning_rate": 7.399050550657249e-06, "loss": 0.2171, "step": 3776 }, { "epoch": 1.7884581237052384, "grad_norm": 1.6029471158981323, "learning_rate": 7.394112801132271e-06, "loss": 0.1972, "step": 3777 }, { "epoch": 1.7889316365788694, "grad_norm": 1.0069618225097656, "learning_rate": 7.389175733151953e-06, "loss": 0.202, "step": 3778 }, { "epoch": 1.7894051494525007, "grad_norm": 1.8258870840072632, "learning_rate": 7.384239348007534e-06, "loss": 0.1895, "step": 3779 }, { "epoch": 1.789878662326132, "grad_norm": 1.2527416944503784, "learning_rate": 7.379303646990081e-06, "loss": 0.2081, "step": 3780 }, { "epoch": 1.790352175199763, "grad_norm": 1.7757433652877808, "learning_rate": 7.374368631390474e-06, "loss": 0.2195, "step": 3781 }, { "epoch": 1.7908256880733946, "grad_norm": 0.9863547086715698, "learning_rate": 7.369434302499423e-06, "loss": 0.2148, "step": 3782 }, { "epoch": 1.7912992009470257, "grad_norm": 1.3424357175827026, "learning_rate": 7.364500661607452e-06, "loss": 0.2289, "step": 3783 }, { "epoch": 1.791772713820657, "grad_norm": 1.5163919925689697, "learning_rate": 7.359567710004907e-06, "loss": 0.2079, "step": 3784 }, { "epoch": 1.7922462266942882, "grad_norm": 1.132380723953247, "learning_rate": 7.354635448981955e-06, "loss": 0.2004, "step": 3785 }, { "epoch": 1.7927197395679195, "grad_norm": 1.174932599067688, "learning_rate": 7.349703879828582e-06, "loss": 0.1928, "step": 3786 }, { "epoch": 1.7931932524415508, "grad_norm": 1.1085500717163086, "learning_rate": 7.344773003834589e-06, "loss": 0.1902, "step": 3787 }, { "epoch": 1.7936667653151819, "grad_norm": 1.5855284929275513, "learning_rate": 7.339842822289602e-06, "loss": 0.2048, "step": 3788 }, { "epoch": 1.7941402781888134, "grad_norm": 1.047294020652771, "learning_rate": 7.334913336483063e-06, "loss": 0.1961, "step": 3789 }, { "epoch": 1.7946137910624445, "grad_norm": 1.2375208139419556, "learning_rate": 7.329984547704231e-06, "loss": 0.1988, "step": 3790 }, { "epoch": 1.7950873039360757, "grad_norm": 1.0275269746780396, "learning_rate": 7.3250564572421814e-06, "loss": 0.239, "step": 3791 }, { "epoch": 1.795560816809707, "grad_norm": 1.0698773860931396, "learning_rate": 7.320129066385811e-06, "loss": 0.2176, "step": 3792 }, { "epoch": 1.7960343296833383, "grad_norm": 1.3771909475326538, "learning_rate": 7.315202376423829e-06, "loss": 0.2128, "step": 3793 }, { "epoch": 1.7965078425569696, "grad_norm": 1.1806304454803467, "learning_rate": 7.3102763886447645e-06, "loss": 0.1979, "step": 3794 }, { "epoch": 1.7969813554306007, "grad_norm": 1.011577844619751, "learning_rate": 7.305351104336963e-06, "loss": 0.1838, "step": 3795 }, { "epoch": 1.7974548683042322, "grad_norm": 2.0910136699676514, "learning_rate": 7.3004265247885865e-06, "loss": 0.1845, "step": 3796 }, { "epoch": 1.7979283811778632, "grad_norm": 0.9872161746025085, "learning_rate": 7.295502651287607e-06, "loss": 0.2177, "step": 3797 }, { "epoch": 1.7984018940514945, "grad_norm": 1.0551612377166748, "learning_rate": 7.290579485121818e-06, "loss": 0.2106, "step": 3798 }, { "epoch": 1.7988754069251258, "grad_norm": 0.9977513551712036, "learning_rate": 7.285657027578827e-06, "loss": 0.2187, "step": 3799 }, { "epoch": 1.7993489197987569, "grad_norm": 1.0159823894500732, "learning_rate": 7.280735279946054e-06, "loss": 0.22, "step": 3800 }, { "epoch": 1.7998224326723884, "grad_norm": 1.2964645624160767, "learning_rate": 7.275814243510736e-06, "loss": 0.2177, "step": 3801 }, { "epoch": 1.8002959455460195, "grad_norm": 1.3088321685791016, "learning_rate": 7.270893919559922e-06, "loss": 0.2346, "step": 3802 }, { "epoch": 1.8007694584196507, "grad_norm": 1.1884821653366089, "learning_rate": 7.265974309380475e-06, "loss": 0.1938, "step": 3803 }, { "epoch": 1.801242971293282, "grad_norm": 1.544429063796997, "learning_rate": 7.26105541425907e-06, "loss": 0.2023, "step": 3804 }, { "epoch": 1.8017164841669133, "grad_norm": 0.9471104741096497, "learning_rate": 7.2561372354822035e-06, "loss": 0.2057, "step": 3805 }, { "epoch": 1.8021899970405446, "grad_norm": 1.2937753200531006, "learning_rate": 7.251219774336169e-06, "loss": 0.2245, "step": 3806 }, { "epoch": 1.8026635099141757, "grad_norm": 1.004191517829895, "learning_rate": 7.246303032107084e-06, "loss": 0.1868, "step": 3807 }, { "epoch": 1.8031370227878072, "grad_norm": 1.476674199104309, "learning_rate": 7.2413870100808755e-06, "loss": 0.2203, "step": 3808 }, { "epoch": 1.8036105356614383, "grad_norm": 0.9113790988922119, "learning_rate": 7.2364717095432825e-06, "loss": 0.2114, "step": 3809 }, { "epoch": 1.8040840485350695, "grad_norm": 1.3018831014633179, "learning_rate": 7.231557131779854e-06, "loss": 0.2191, "step": 3810 }, { "epoch": 1.8045575614087008, "grad_norm": 1.301070213317871, "learning_rate": 7.226643278075948e-06, "loss": 0.1999, "step": 3811 }, { "epoch": 1.805031074282332, "grad_norm": 1.776887059211731, "learning_rate": 7.2217301497167405e-06, "loss": 0.2103, "step": 3812 }, { "epoch": 1.8055045871559634, "grad_norm": 2.6810529232025146, "learning_rate": 7.216817747987208e-06, "loss": 0.2098, "step": 3813 }, { "epoch": 1.8059781000295945, "grad_norm": 1.186989426612854, "learning_rate": 7.2119060741721435e-06, "loss": 0.1888, "step": 3814 }, { "epoch": 1.8064516129032258, "grad_norm": 0.9349979162216187, "learning_rate": 7.206995129556151e-06, "loss": 0.2345, "step": 3815 }, { "epoch": 1.806925125776857, "grad_norm": 1.71229887008667, "learning_rate": 7.202084915423636e-06, "loss": 0.2431, "step": 3816 }, { "epoch": 1.8073986386504883, "grad_norm": 1.5204129219055176, "learning_rate": 7.197175433058818e-06, "loss": 0.2119, "step": 3817 }, { "epoch": 1.8078721515241196, "grad_norm": 1.7288326025009155, "learning_rate": 7.192266683745728e-06, "loss": 0.22, "step": 3818 }, { "epoch": 1.8083456643977507, "grad_norm": 1.4279898405075073, "learning_rate": 7.187358668768198e-06, "loss": 0.2029, "step": 3819 }, { "epoch": 1.8088191772713822, "grad_norm": 1.4333093166351318, "learning_rate": 7.182451389409877e-06, "loss": 0.2323, "step": 3820 }, { "epoch": 1.8092926901450133, "grad_norm": 1.506713628768921, "learning_rate": 7.177544846954212e-06, "loss": 0.2408, "step": 3821 }, { "epoch": 1.8097662030186445, "grad_norm": 1.236220121383667, "learning_rate": 7.172639042684464e-06, "loss": 0.2193, "step": 3822 }, { "epoch": 1.8102397158922758, "grad_norm": 0.997626006603241, "learning_rate": 7.1677339778836975e-06, "loss": 0.2065, "step": 3823 }, { "epoch": 1.8107132287659071, "grad_norm": 1.5409119129180908, "learning_rate": 7.162829653834787e-06, "loss": 0.205, "step": 3824 }, { "epoch": 1.8111867416395384, "grad_norm": 1.3550500869750977, "learning_rate": 7.157926071820411e-06, "loss": 0.2062, "step": 3825 }, { "epoch": 1.8116602545131695, "grad_norm": 1.6117483377456665, "learning_rate": 7.153023233123047e-06, "loss": 0.2185, "step": 3826 }, { "epoch": 1.812133767386801, "grad_norm": 1.3827286958694458, "learning_rate": 7.148121139024995e-06, "loss": 0.1976, "step": 3827 }, { "epoch": 1.812607280260432, "grad_norm": 1.7857599258422852, "learning_rate": 7.143219790808347e-06, "loss": 0.2123, "step": 3828 }, { "epoch": 1.8130807931340633, "grad_norm": 1.2344205379486084, "learning_rate": 7.138319189755002e-06, "loss": 0.2103, "step": 3829 }, { "epoch": 1.8135543060076946, "grad_norm": 2.3192245960235596, "learning_rate": 7.1334193371466675e-06, "loss": 0.231, "step": 3830 }, { "epoch": 1.8140278188813257, "grad_norm": 1.136781096458435, "learning_rate": 7.128520234264851e-06, "loss": 0.2119, "step": 3831 }, { "epoch": 1.8145013317549572, "grad_norm": 1.3078882694244385, "learning_rate": 7.1236218823908645e-06, "loss": 0.2428, "step": 3832 }, { "epoch": 1.8149748446285883, "grad_norm": 1.2478915452957153, "learning_rate": 7.118724282805825e-06, "loss": 0.2439, "step": 3833 }, { "epoch": 1.8154483575022196, "grad_norm": 1.50113844871521, "learning_rate": 7.113827436790655e-06, "loss": 0.2277, "step": 3834 }, { "epoch": 1.8159218703758508, "grad_norm": 1.2147942781448364, "learning_rate": 7.108931345626074e-06, "loss": 0.2285, "step": 3835 }, { "epoch": 1.8163953832494821, "grad_norm": 1.1065564155578613, "learning_rate": 7.104036010592609e-06, "loss": 0.1995, "step": 3836 }, { "epoch": 1.8168688961231134, "grad_norm": 1.3819801807403564, "learning_rate": 7.099141432970588e-06, "loss": 0.1979, "step": 3837 }, { "epoch": 1.8173424089967445, "grad_norm": 0.9944810271263123, "learning_rate": 7.094247614040139e-06, "loss": 0.2053, "step": 3838 }, { "epoch": 1.817815921870376, "grad_norm": 1.5402950048446655, "learning_rate": 7.0893545550811956e-06, "loss": 0.2208, "step": 3839 }, { "epoch": 1.818289434744007, "grad_norm": 1.463292121887207, "learning_rate": 7.08446225737349e-06, "loss": 0.2016, "step": 3840 }, { "epoch": 1.8187629476176383, "grad_norm": 1.0113959312438965, "learning_rate": 7.079570722196553e-06, "loss": 0.2195, "step": 3841 }, { "epoch": 1.8192364604912696, "grad_norm": 1.070245623588562, "learning_rate": 7.074679950829719e-06, "loss": 0.2084, "step": 3842 }, { "epoch": 1.8197099733649007, "grad_norm": 1.2320399284362793, "learning_rate": 7.069789944552124e-06, "loss": 0.206, "step": 3843 }, { "epoch": 1.8201834862385322, "grad_norm": 1.2585248947143555, "learning_rate": 7.0649007046427006e-06, "loss": 0.2279, "step": 3844 }, { "epoch": 1.8206569991121633, "grad_norm": 0.9737508296966553, "learning_rate": 7.060012232380182e-06, "loss": 0.1996, "step": 3845 }, { "epoch": 1.8211305119857946, "grad_norm": 1.2638822793960571, "learning_rate": 7.0551245290431e-06, "loss": 0.1924, "step": 3846 }, { "epoch": 1.8216040248594259, "grad_norm": 1.16457200050354, "learning_rate": 7.0502375959097904e-06, "loss": 0.2235, "step": 3847 }, { "epoch": 1.8220775377330571, "grad_norm": 1.078177809715271, "learning_rate": 7.045351434258378e-06, "loss": 0.2087, "step": 3848 }, { "epoch": 1.8225510506066884, "grad_norm": 1.1603608131408691, "learning_rate": 7.040466045366796e-06, "loss": 0.2128, "step": 3849 }, { "epoch": 1.8230245634803195, "grad_norm": 1.2323132753372192, "learning_rate": 7.03558143051277e-06, "loss": 0.2041, "step": 3850 }, { "epoch": 1.823498076353951, "grad_norm": 1.5576890707015991, "learning_rate": 7.0306975909738205e-06, "loss": 0.2194, "step": 3851 }, { "epoch": 1.823971589227582, "grad_norm": 0.993789553642273, "learning_rate": 7.025814528027272e-06, "loss": 0.1883, "step": 3852 }, { "epoch": 1.8244451021012134, "grad_norm": 1.0642006397247314, "learning_rate": 7.02093224295024e-06, "loss": 0.1956, "step": 3853 }, { "epoch": 1.8249186149748446, "grad_norm": 1.1008495092391968, "learning_rate": 7.016050737019641e-06, "loss": 0.2071, "step": 3854 }, { "epoch": 1.825392127848476, "grad_norm": 1.0528384447097778, "learning_rate": 7.0111700115121835e-06, "loss": 0.2297, "step": 3855 }, { "epoch": 1.8258656407221072, "grad_norm": 1.6767280101776123, "learning_rate": 7.006290067704378e-06, "loss": 0.2402, "step": 3856 }, { "epoch": 1.8263391535957383, "grad_norm": 0.9541283249855042, "learning_rate": 7.001410906872522e-06, "loss": 0.1919, "step": 3857 }, { "epoch": 1.8268126664693698, "grad_norm": 0.9566398859024048, "learning_rate": 6.996532530292717e-06, "loss": 0.1902, "step": 3858 }, { "epoch": 1.8272861793430009, "grad_norm": 0.9043372869491577, "learning_rate": 6.991654939240855e-06, "loss": 0.2088, "step": 3859 }, { "epoch": 1.8277596922166321, "grad_norm": 1.314922571182251, "learning_rate": 6.98677813499262e-06, "loss": 0.2167, "step": 3860 }, { "epoch": 1.8282332050902634, "grad_norm": 1.1723600625991821, "learning_rate": 6.981902118823495e-06, "loss": 0.2107, "step": 3861 }, { "epoch": 1.8287067179638945, "grad_norm": 1.5371065139770508, "learning_rate": 6.977026892008753e-06, "loss": 0.2167, "step": 3862 }, { "epoch": 1.829180230837526, "grad_norm": 1.1447199583053589, "learning_rate": 6.972152455823467e-06, "loss": 0.2027, "step": 3863 }, { "epoch": 1.829653743711157, "grad_norm": 1.6060473918914795, "learning_rate": 6.967278811542495e-06, "loss": 0.2164, "step": 3864 }, { "epoch": 1.8301272565847884, "grad_norm": 1.6065421104431152, "learning_rate": 6.96240596044049e-06, "loss": 0.2087, "step": 3865 }, { "epoch": 1.8306007694584197, "grad_norm": 1.0932152271270752, "learning_rate": 6.957533903791904e-06, "loss": 0.2294, "step": 3866 }, { "epoch": 1.831074282332051, "grad_norm": 1.129197359085083, "learning_rate": 6.9526626428709745e-06, "loss": 0.1843, "step": 3867 }, { "epoch": 1.8315477952056822, "grad_norm": 2.516554117202759, "learning_rate": 6.947792178951733e-06, "loss": 0.2076, "step": 3868 }, { "epoch": 1.8320213080793133, "grad_norm": 1.335574984550476, "learning_rate": 6.942922513308001e-06, "loss": 0.2114, "step": 3869 }, { "epoch": 1.8324948209529448, "grad_norm": 0.9613366723060608, "learning_rate": 6.9380536472133945e-06, "loss": 0.2074, "step": 3870 }, { "epoch": 1.8329683338265759, "grad_norm": 1.1925537586212158, "learning_rate": 6.933185581941316e-06, "loss": 0.2034, "step": 3871 }, { "epoch": 1.8334418467002072, "grad_norm": 1.1290981769561768, "learning_rate": 6.928318318764964e-06, "loss": 0.1972, "step": 3872 }, { "epoch": 1.8339153595738384, "grad_norm": 1.4056495428085327, "learning_rate": 6.923451858957322e-06, "loss": 0.2269, "step": 3873 }, { "epoch": 1.8343888724474695, "grad_norm": 1.2841758728027344, "learning_rate": 6.918586203791169e-06, "loss": 0.1974, "step": 3874 }, { "epoch": 1.834862385321101, "grad_norm": 0.927555501461029, "learning_rate": 6.913721354539065e-06, "loss": 0.2272, "step": 3875 }, { "epoch": 1.835335898194732, "grad_norm": 1.3648067712783813, "learning_rate": 6.908857312473366e-06, "loss": 0.2267, "step": 3876 }, { "epoch": 1.8358094110683634, "grad_norm": 1.9164044857025146, "learning_rate": 6.903994078866216e-06, "loss": 0.226, "step": 3877 }, { "epoch": 1.8362829239419947, "grad_norm": 1.0202974081039429, "learning_rate": 6.899131654989548e-06, "loss": 0.205, "step": 3878 }, { "epoch": 1.836756436815626, "grad_norm": 1.1604101657867432, "learning_rate": 6.894270042115081e-06, "loss": 0.1836, "step": 3879 }, { "epoch": 1.8372299496892572, "grad_norm": 1.1902738809585571, "learning_rate": 6.889409241514323e-06, "loss": 0.2165, "step": 3880 }, { "epoch": 1.8377034625628883, "grad_norm": 1.178782343864441, "learning_rate": 6.88454925445857e-06, "loss": 0.1952, "step": 3881 }, { "epoch": 1.8381769754365198, "grad_norm": 1.345608115196228, "learning_rate": 6.879690082218903e-06, "loss": 0.2005, "step": 3882 }, { "epoch": 1.8386504883101509, "grad_norm": 1.6030305624008179, "learning_rate": 6.874831726066194e-06, "loss": 0.2087, "step": 3883 }, { "epoch": 1.8391240011837822, "grad_norm": 1.1542789936065674, "learning_rate": 6.869974187271098e-06, "loss": 0.1984, "step": 3884 }, { "epoch": 1.8395975140574135, "grad_norm": 1.1518371105194092, "learning_rate": 6.865117467104058e-06, "loss": 0.2063, "step": 3885 }, { "epoch": 1.8400710269310447, "grad_norm": 1.9634912014007568, "learning_rate": 6.8602615668353e-06, "loss": 0.2074, "step": 3886 }, { "epoch": 1.840544539804676, "grad_norm": 1.0860075950622559, "learning_rate": 6.85540648773484e-06, "loss": 0.202, "step": 3887 }, { "epoch": 1.841018052678307, "grad_norm": 1.4793963432312012, "learning_rate": 6.850552231072477e-06, "loss": 0.229, "step": 3888 }, { "epoch": 1.8414915655519386, "grad_norm": 1.0886497497558594, "learning_rate": 6.845698798117795e-06, "loss": 0.234, "step": 3889 }, { "epoch": 1.8419650784255697, "grad_norm": 2.2416272163391113, "learning_rate": 6.840846190140161e-06, "loss": 0.2166, "step": 3890 }, { "epoch": 1.842438591299201, "grad_norm": 1.1083406209945679, "learning_rate": 6.83599440840873e-06, "loss": 0.2016, "step": 3891 }, { "epoch": 1.8429121041728322, "grad_norm": 1.5353022813796997, "learning_rate": 6.831143454192437e-06, "loss": 0.1789, "step": 3892 }, { "epoch": 1.8433856170464633, "grad_norm": 1.475759744644165, "learning_rate": 6.826293328760004e-06, "loss": 0.2397, "step": 3893 }, { "epoch": 1.8438591299200948, "grad_norm": 1.3669559955596924, "learning_rate": 6.821444033379936e-06, "loss": 0.2096, "step": 3894 }, { "epoch": 1.8443326427937259, "grad_norm": 0.972876787185669, "learning_rate": 6.816595569320514e-06, "loss": 0.2265, "step": 3895 }, { "epoch": 1.8448061556673572, "grad_norm": 1.7018649578094482, "learning_rate": 6.8117479378498104e-06, "loss": 0.2174, "step": 3896 }, { "epoch": 1.8452796685409885, "grad_norm": 1.9252698421478271, "learning_rate": 6.806901140235675e-06, "loss": 0.2152, "step": 3897 }, { "epoch": 1.8457531814146197, "grad_norm": 1.056630253791809, "learning_rate": 6.802055177745743e-06, "loss": 0.2357, "step": 3898 }, { "epoch": 1.846226694288251, "grad_norm": 1.101554036140442, "learning_rate": 6.79721005164743e-06, "loss": 0.213, "step": 3899 }, { "epoch": 1.846700207161882, "grad_norm": 1.172674536705017, "learning_rate": 6.792365763207926e-06, "loss": 0.1947, "step": 3900 }, { "epoch": 1.8471737200355136, "grad_norm": 2.0155622959136963, "learning_rate": 6.787522313694214e-06, "loss": 0.2403, "step": 3901 }, { "epoch": 1.8476472329091447, "grad_norm": 1.046518325805664, "learning_rate": 6.782679704373051e-06, "loss": 0.2045, "step": 3902 }, { "epoch": 1.848120745782776, "grad_norm": 1.204442024230957, "learning_rate": 6.777837936510971e-06, "loss": 0.2257, "step": 3903 }, { "epoch": 1.8485942586564073, "grad_norm": 1.0126386880874634, "learning_rate": 6.772997011374294e-06, "loss": 0.194, "step": 3904 }, { "epoch": 1.8490677715300383, "grad_norm": 1.27089524269104, "learning_rate": 6.768156930229118e-06, "loss": 0.1956, "step": 3905 }, { "epoch": 1.8495412844036698, "grad_norm": 1.372044563293457, "learning_rate": 6.763317694341319e-06, "loss": 0.1892, "step": 3906 }, { "epoch": 1.850014797277301, "grad_norm": 1.0626308917999268, "learning_rate": 6.758479304976553e-06, "loss": 0.2172, "step": 3907 }, { "epoch": 1.8504883101509322, "grad_norm": 0.8939222693443298, "learning_rate": 6.753641763400252e-06, "loss": 0.2101, "step": 3908 }, { "epoch": 1.8509618230245635, "grad_norm": 1.0738952159881592, "learning_rate": 6.748805070877632e-06, "loss": 0.195, "step": 3909 }, { "epoch": 1.8514353358981948, "grad_norm": 2.0406644344329834, "learning_rate": 6.743969228673679e-06, "loss": 0.2011, "step": 3910 }, { "epoch": 1.851908848771826, "grad_norm": 1.1621339321136475, "learning_rate": 6.739134238053162e-06, "loss": 0.2406, "step": 3911 }, { "epoch": 1.852382361645457, "grad_norm": 1.1607718467712402, "learning_rate": 6.734300100280629e-06, "loss": 0.2154, "step": 3912 }, { "epoch": 1.8528558745190886, "grad_norm": 1.2867686748504639, "learning_rate": 6.729466816620398e-06, "loss": 0.2091, "step": 3913 }, { "epoch": 1.8533293873927197, "grad_norm": 1.1185880899429321, "learning_rate": 6.724634388336571e-06, "loss": 0.2298, "step": 3914 }, { "epoch": 1.853802900266351, "grad_norm": 1.1135728359222412, "learning_rate": 6.71980281669302e-06, "loss": 0.2092, "step": 3915 }, { "epoch": 1.8542764131399823, "grad_norm": 0.8761497735977173, "learning_rate": 6.714972102953399e-06, "loss": 0.1895, "step": 3916 }, { "epoch": 1.8547499260136135, "grad_norm": 1.1779834032058716, "learning_rate": 6.710142248381133e-06, "loss": 0.2315, "step": 3917 }, { "epoch": 1.8552234388872448, "grad_norm": 0.9498329758644104, "learning_rate": 6.705313254239424e-06, "loss": 0.2117, "step": 3918 }, { "epoch": 1.855696951760876, "grad_norm": 1.2298184633255005, "learning_rate": 6.700485121791252e-06, "loss": 0.2018, "step": 3919 }, { "epoch": 1.8561704646345074, "grad_norm": 1.0038814544677734, "learning_rate": 6.695657852299362e-06, "loss": 0.1963, "step": 3920 }, { "epoch": 1.8566439775081385, "grad_norm": 1.4736988544464111, "learning_rate": 6.690831447026283e-06, "loss": 0.2308, "step": 3921 }, { "epoch": 1.8571174903817698, "grad_norm": 1.168468952178955, "learning_rate": 6.686005907234317e-06, "loss": 0.2056, "step": 3922 }, { "epoch": 1.857591003255401, "grad_norm": 1.3906092643737793, "learning_rate": 6.681181234185532e-06, "loss": 0.2446, "step": 3923 }, { "epoch": 1.8580645161290321, "grad_norm": 1.0936076641082764, "learning_rate": 6.6763574291417795e-06, "loss": 0.1935, "step": 3924 }, { "epoch": 1.8585380290026636, "grad_norm": 1.0105923414230347, "learning_rate": 6.67153449336468e-06, "loss": 0.2192, "step": 3925 }, { "epoch": 1.8590115418762947, "grad_norm": 1.5099976062774658, "learning_rate": 6.666712428115621e-06, "loss": 0.199, "step": 3926 }, { "epoch": 1.859485054749926, "grad_norm": 1.0132044553756714, "learning_rate": 6.661891234655769e-06, "loss": 0.2101, "step": 3927 }, { "epoch": 1.8599585676235573, "grad_norm": 0.927442193031311, "learning_rate": 6.657070914246063e-06, "loss": 0.1969, "step": 3928 }, { "epoch": 1.8604320804971886, "grad_norm": 1.5215531587600708, "learning_rate": 6.6522514681472105e-06, "loss": 0.1945, "step": 3929 }, { "epoch": 1.8609055933708198, "grad_norm": 1.445884108543396, "learning_rate": 6.64743289761969e-06, "loss": 0.2198, "step": 3930 }, { "epoch": 1.861379106244451, "grad_norm": 1.817511796951294, "learning_rate": 6.64261520392375e-06, "loss": 0.2005, "step": 3931 }, { "epoch": 1.8618526191180824, "grad_norm": 1.0783874988555908, "learning_rate": 6.637798388319416e-06, "loss": 0.1863, "step": 3932 }, { "epoch": 1.8623261319917135, "grad_norm": 1.2979896068572998, "learning_rate": 6.632982452066476e-06, "loss": 0.2191, "step": 3933 }, { "epoch": 1.8627996448653448, "grad_norm": 0.9577775001525879, "learning_rate": 6.628167396424494e-06, "loss": 0.1862, "step": 3934 }, { "epoch": 1.863273157738976, "grad_norm": 1.0281299352645874, "learning_rate": 6.623353222652802e-06, "loss": 0.2308, "step": 3935 }, { "epoch": 1.8637466706126071, "grad_norm": 1.4038194417953491, "learning_rate": 6.6185399320105e-06, "loss": 0.2115, "step": 3936 }, { "epoch": 1.8642201834862386, "grad_norm": 2.125293254852295, "learning_rate": 6.613727525756455e-06, "loss": 0.2016, "step": 3937 }, { "epoch": 1.8646936963598697, "grad_norm": 1.3290354013442993, "learning_rate": 6.608916005149311e-06, "loss": 0.1913, "step": 3938 }, { "epoch": 1.865167209233501, "grad_norm": 1.1084948778152466, "learning_rate": 6.604105371447469e-06, "loss": 0.1794, "step": 3939 }, { "epoch": 1.8656407221071323, "grad_norm": 1.267200231552124, "learning_rate": 6.599295625909107e-06, "loss": 0.1885, "step": 3940 }, { "epoch": 1.8661142349807636, "grad_norm": 1.9131397008895874, "learning_rate": 6.5944867697921654e-06, "loss": 0.224, "step": 3941 }, { "epoch": 1.8665877478543949, "grad_norm": 1.2804118394851685, "learning_rate": 6.589678804354353e-06, "loss": 0.1945, "step": 3942 }, { "epoch": 1.867061260728026, "grad_norm": 0.9017341732978821, "learning_rate": 6.584871730853153e-06, "loss": 0.2059, "step": 3943 }, { "epoch": 1.8675347736016574, "grad_norm": 1.365110158920288, "learning_rate": 6.5800655505458065e-06, "loss": 0.222, "step": 3944 }, { "epoch": 1.8680082864752885, "grad_norm": 0.8447331190109253, "learning_rate": 6.5752602646893224e-06, "loss": 0.1945, "step": 3945 }, { "epoch": 1.8684817993489198, "grad_norm": 1.1045807600021362, "learning_rate": 6.5704558745404755e-06, "loss": 0.1969, "step": 3946 }, { "epoch": 1.868955312222551, "grad_norm": 1.3846262693405151, "learning_rate": 6.56565238135581e-06, "loss": 0.2017, "step": 3947 }, { "epoch": 1.8694288250961824, "grad_norm": 1.107153058052063, "learning_rate": 6.560849786391632e-06, "loss": 0.2207, "step": 3948 }, { "epoch": 1.8699023379698136, "grad_norm": 1.206408143043518, "learning_rate": 6.556048090904015e-06, "loss": 0.2088, "step": 3949 }, { "epoch": 1.8703758508434447, "grad_norm": 1.063124418258667, "learning_rate": 6.5512472961487946e-06, "loss": 0.2066, "step": 3950 }, { "epoch": 1.8708493637170762, "grad_norm": 1.2000980377197266, "learning_rate": 6.5464474033815754e-06, "loss": 0.1757, "step": 3951 }, { "epoch": 1.8713228765907073, "grad_norm": 1.3470823764801025, "learning_rate": 6.541648413857718e-06, "loss": 0.2252, "step": 3952 }, { "epoch": 1.8717963894643386, "grad_norm": 1.3533769845962524, "learning_rate": 6.536850328832358e-06, "loss": 0.2145, "step": 3953 }, { "epoch": 1.8722699023379699, "grad_norm": 1.6405057907104492, "learning_rate": 6.5320531495603825e-06, "loss": 0.1892, "step": 3954 }, { "epoch": 1.872743415211601, "grad_norm": 1.296522855758667, "learning_rate": 6.527256877296449e-06, "loss": 0.2205, "step": 3955 }, { "epoch": 1.8732169280852324, "grad_norm": 1.0236841440200806, "learning_rate": 6.522461513294979e-06, "loss": 0.1865, "step": 3956 }, { "epoch": 1.8736904409588635, "grad_norm": 1.0961860418319702, "learning_rate": 6.517667058810151e-06, "loss": 0.1896, "step": 3957 }, { "epoch": 1.8741639538324948, "grad_norm": 1.3094490766525269, "learning_rate": 6.5128735150959075e-06, "loss": 0.2059, "step": 3958 }, { "epoch": 1.874637466706126, "grad_norm": 1.5833834409713745, "learning_rate": 6.508080883405957e-06, "loss": 0.223, "step": 3959 }, { "epoch": 1.8751109795797574, "grad_norm": 0.9895943999290466, "learning_rate": 6.503289164993765e-06, "loss": 0.2081, "step": 3960 }, { "epoch": 1.8755844924533887, "grad_norm": 1.386003851890564, "learning_rate": 6.498498361112557e-06, "loss": 0.2138, "step": 3961 }, { "epoch": 1.8760580053270197, "grad_norm": 1.1875851154327393, "learning_rate": 6.4937084730153236e-06, "loss": 0.2184, "step": 3962 }, { "epoch": 1.8765315182006512, "grad_norm": 1.4439812898635864, "learning_rate": 6.4889195019548155e-06, "loss": 0.2191, "step": 3963 }, { "epoch": 1.8770050310742823, "grad_norm": 1.2867056131362915, "learning_rate": 6.484131449183537e-06, "loss": 0.195, "step": 3964 }, { "epoch": 1.8774785439479136, "grad_norm": 1.1023000478744507, "learning_rate": 6.47934431595376e-06, "loss": 0.2226, "step": 3965 }, { "epoch": 1.8779520568215449, "grad_norm": 2.235792636871338, "learning_rate": 6.474558103517513e-06, "loss": 0.2143, "step": 3966 }, { "epoch": 1.878425569695176, "grad_norm": 1.0556797981262207, "learning_rate": 6.469772813126584e-06, "loss": 0.1981, "step": 3967 }, { "epoch": 1.8788990825688074, "grad_norm": 0.9840413928031921, "learning_rate": 6.464988446032518e-06, "loss": 0.2003, "step": 3968 }, { "epoch": 1.8793725954424385, "grad_norm": 1.3949209451675415, "learning_rate": 6.460205003486621e-06, "loss": 0.2189, "step": 3969 }, { "epoch": 1.8798461083160698, "grad_norm": 0.8188128471374512, "learning_rate": 6.4554224867399575e-06, "loss": 0.1956, "step": 3970 }, { "epoch": 1.880319621189701, "grad_norm": 1.0726219415664673, "learning_rate": 6.450640897043346e-06, "loss": 0.2257, "step": 3971 }, { "epoch": 1.8807931340633324, "grad_norm": 1.4881572723388672, "learning_rate": 6.445860235647367e-06, "loss": 0.1915, "step": 3972 }, { "epoch": 1.8812666469369637, "grad_norm": 1.1142457723617554, "learning_rate": 6.44108050380236e-06, "loss": 0.1911, "step": 3973 }, { "epoch": 1.8817401598105947, "grad_norm": 1.912439227104187, "learning_rate": 6.43630170275841e-06, "loss": 0.1943, "step": 3974 }, { "epoch": 1.8822136726842262, "grad_norm": 1.1108665466308594, "learning_rate": 6.431523833765369e-06, "loss": 0.2103, "step": 3975 }, { "epoch": 1.8826871855578573, "grad_norm": 1.0399905443191528, "learning_rate": 6.426746898072845e-06, "loss": 0.2138, "step": 3976 }, { "epoch": 1.8831606984314886, "grad_norm": 0.9671503305435181, "learning_rate": 6.421970896930199e-06, "loss": 0.193, "step": 3977 }, { "epoch": 1.8836342113051199, "grad_norm": 1.4614564180374146, "learning_rate": 6.417195831586545e-06, "loss": 0.2061, "step": 3978 }, { "epoch": 1.8841077241787512, "grad_norm": 1.1578701734542847, "learning_rate": 6.412421703290759e-06, "loss": 0.2123, "step": 3979 }, { "epoch": 1.8845812370523825, "grad_norm": 1.1793917417526245, "learning_rate": 6.4076485132914644e-06, "loss": 0.2134, "step": 3980 }, { "epoch": 1.8850547499260135, "grad_norm": 1.1729707717895508, "learning_rate": 6.402876262837045e-06, "loss": 0.217, "step": 3981 }, { "epoch": 1.885528262799645, "grad_norm": 1.058951735496521, "learning_rate": 6.398104953175639e-06, "loss": 0.2041, "step": 3982 }, { "epoch": 1.886001775673276, "grad_norm": 1.7468572854995728, "learning_rate": 6.393334585555133e-06, "loss": 0.215, "step": 3983 }, { "epoch": 1.8864752885469074, "grad_norm": 0.8419433832168579, "learning_rate": 6.388565161223172e-06, "loss": 0.1718, "step": 3984 }, { "epoch": 1.8869488014205387, "grad_norm": 1.8909525871276855, "learning_rate": 6.383796681427154e-06, "loss": 0.1912, "step": 3985 }, { "epoch": 1.8874223142941697, "grad_norm": 1.1229642629623413, "learning_rate": 6.379029147414228e-06, "loss": 0.2449, "step": 3986 }, { "epoch": 1.8878958271678012, "grad_norm": 1.213046669960022, "learning_rate": 6.374262560431297e-06, "loss": 0.1952, "step": 3987 }, { "epoch": 1.8883693400414323, "grad_norm": 1.6721361875534058, "learning_rate": 6.369496921725016e-06, "loss": 0.2018, "step": 3988 }, { "epoch": 1.8888428529150636, "grad_norm": 1.6675399541854858, "learning_rate": 6.364732232541788e-06, "loss": 0.2019, "step": 3989 }, { "epoch": 1.8893163657886949, "grad_norm": 1.043617606163025, "learning_rate": 6.359968494127777e-06, "loss": 0.1982, "step": 3990 }, { "epoch": 1.8897898786623262, "grad_norm": 0.9541271924972534, "learning_rate": 6.355205707728889e-06, "loss": 0.2128, "step": 3991 }, { "epoch": 1.8902633915359575, "grad_norm": 1.9646912813186646, "learning_rate": 6.350443874590786e-06, "loss": 0.1915, "step": 3992 }, { "epoch": 1.8907369044095885, "grad_norm": 1.3205372095108032, "learning_rate": 6.34568299595888e-06, "loss": 0.1902, "step": 3993 }, { "epoch": 1.89121041728322, "grad_norm": 2.0450875759124756, "learning_rate": 6.340923073078333e-06, "loss": 0.208, "step": 3994 }, { "epoch": 1.891683930156851, "grad_norm": 1.3528573513031006, "learning_rate": 6.336164107194056e-06, "loss": 0.2167, "step": 3995 }, { "epoch": 1.8921574430304824, "grad_norm": 1.0544376373291016, "learning_rate": 6.331406099550711e-06, "loss": 0.213, "step": 3996 }, { "epoch": 1.8926309559041137, "grad_norm": 1.3501182794570923, "learning_rate": 6.326649051392709e-06, "loss": 0.1947, "step": 3997 }, { "epoch": 1.8931044687777447, "grad_norm": 1.4328538179397583, "learning_rate": 6.321892963964214e-06, "loss": 0.1958, "step": 3998 }, { "epoch": 1.8935779816513763, "grad_norm": 1.6328697204589844, "learning_rate": 6.317137838509126e-06, "loss": 0.2082, "step": 3999 }, { "epoch": 1.8940514945250073, "grad_norm": 2.273808240890503, "learning_rate": 6.3123836762711085e-06, "loss": 0.2325, "step": 4000 }, { "epoch": 1.8945250073986386, "grad_norm": 1.4385737180709839, "learning_rate": 6.307630478493565e-06, "loss": 0.2211, "step": 4001 }, { "epoch": 1.89499852027227, "grad_norm": 1.7842854261398315, "learning_rate": 6.30287824641965e-06, "loss": 0.2317, "step": 4002 }, { "epoch": 1.8954720331459012, "grad_norm": 1.4352833032608032, "learning_rate": 6.298126981292263e-06, "loss": 0.2032, "step": 4003 }, { "epoch": 1.8959455460195325, "grad_norm": 1.0405994653701782, "learning_rate": 6.29337668435405e-06, "loss": 0.213, "step": 4004 }, { "epoch": 1.8964190588931635, "grad_norm": 1.2680540084838867, "learning_rate": 6.288627356847407e-06, "loss": 0.2208, "step": 4005 }, { "epoch": 1.896892571766795, "grad_norm": 0.9765031337738037, "learning_rate": 6.283879000014476e-06, "loss": 0.1896, "step": 4006 }, { "epoch": 1.897366084640426, "grad_norm": 1.2945647239685059, "learning_rate": 6.27913161509714e-06, "loss": 0.2107, "step": 4007 }, { "epoch": 1.8978395975140574, "grad_norm": 1.2577142715454102, "learning_rate": 6.274385203337039e-06, "loss": 0.2156, "step": 4008 }, { "epoch": 1.8983131103876887, "grad_norm": 2.5358786582946777, "learning_rate": 6.269639765975543e-06, "loss": 0.2061, "step": 4009 }, { "epoch": 1.89878662326132, "grad_norm": 0.9788986444473267, "learning_rate": 6.264895304253779e-06, "loss": 0.1898, "step": 4010 }, { "epoch": 1.8992601361349513, "grad_norm": 1.0234096050262451, "learning_rate": 6.260151819412616e-06, "loss": 0.1886, "step": 4011 }, { "epoch": 1.8997336490085823, "grad_norm": 1.4769940376281738, "learning_rate": 6.255409312692664e-06, "loss": 0.2173, "step": 4012 }, { "epoch": 1.9002071618822138, "grad_norm": 1.2099794149398804, "learning_rate": 6.250667785334282e-06, "loss": 0.2085, "step": 4013 }, { "epoch": 1.900680674755845, "grad_norm": 1.8601834774017334, "learning_rate": 6.24592723857757e-06, "loss": 0.2349, "step": 4014 }, { "epoch": 1.9011541876294762, "grad_norm": 1.1096925735473633, "learning_rate": 6.241187673662375e-06, "loss": 0.2258, "step": 4015 }, { "epoch": 1.9016277005031075, "grad_norm": 1.2290061712265015, "learning_rate": 6.236449091828278e-06, "loss": 0.2111, "step": 4016 }, { "epoch": 1.9021012133767385, "grad_norm": 1.0335679054260254, "learning_rate": 6.231711494314618e-06, "loss": 0.1963, "step": 4017 }, { "epoch": 1.90257472625037, "grad_norm": 1.2123810052871704, "learning_rate": 6.226974882360462e-06, "loss": 0.2317, "step": 4018 }, { "epoch": 1.9030482391240011, "grad_norm": 1.043067216873169, "learning_rate": 6.222239257204625e-06, "loss": 0.2035, "step": 4019 }, { "epoch": 1.9035217519976324, "grad_norm": 1.1024198532104492, "learning_rate": 6.217504620085662e-06, "loss": 0.2172, "step": 4020 }, { "epoch": 1.9039952648712637, "grad_norm": 1.036494493484497, "learning_rate": 6.21277097224188e-06, "loss": 0.2044, "step": 4021 }, { "epoch": 1.904468777744895, "grad_norm": 1.0444680452346802, "learning_rate": 6.208038314911312e-06, "loss": 0.1994, "step": 4022 }, { "epoch": 1.9049422906185263, "grad_norm": 1.1238281726837158, "learning_rate": 6.203306649331744e-06, "loss": 0.1888, "step": 4023 }, { "epoch": 1.9054158034921573, "grad_norm": 0.9902118444442749, "learning_rate": 6.1985759767406925e-06, "loss": 0.1989, "step": 4024 }, { "epoch": 1.9058893163657888, "grad_norm": 1.583835244178772, "learning_rate": 6.1938462983754235e-06, "loss": 0.2093, "step": 4025 }, { "epoch": 1.90636282923942, "grad_norm": 1.8781702518463135, "learning_rate": 6.189117615472935e-06, "loss": 0.2073, "step": 4026 }, { "epoch": 1.9068363421130512, "grad_norm": 1.4825783967971802, "learning_rate": 6.1843899292699695e-06, "loss": 0.2244, "step": 4027 }, { "epoch": 1.9073098549866825, "grad_norm": 1.3632996082305908, "learning_rate": 6.179663241003008e-06, "loss": 0.2145, "step": 4028 }, { "epoch": 1.9077833678603136, "grad_norm": 1.502853274345398, "learning_rate": 6.174937551908271e-06, "loss": 0.2004, "step": 4029 }, { "epoch": 1.908256880733945, "grad_norm": 1.4652563333511353, "learning_rate": 6.170212863221715e-06, "loss": 0.2041, "step": 4030 }, { "epoch": 1.9087303936075761, "grad_norm": 1.1851582527160645, "learning_rate": 6.165489176179039e-06, "loss": 0.2071, "step": 4031 }, { "epoch": 1.9092039064812074, "grad_norm": 1.044634222984314, "learning_rate": 6.16076649201568e-06, "loss": 0.203, "step": 4032 }, { "epoch": 1.9096774193548387, "grad_norm": 1.1396992206573486, "learning_rate": 6.1560448119668034e-06, "loss": 0.2165, "step": 4033 }, { "epoch": 1.91015093222847, "grad_norm": 1.1796884536743164, "learning_rate": 6.151324137267322e-06, "loss": 0.2182, "step": 4034 }, { "epoch": 1.9106244451021013, "grad_norm": 1.1063097715377808, "learning_rate": 6.146604469151886e-06, "loss": 0.2015, "step": 4035 }, { "epoch": 1.9110979579757323, "grad_norm": 2.4029173851013184, "learning_rate": 6.141885808854877e-06, "loss": 0.231, "step": 4036 }, { "epoch": 1.9115714708493639, "grad_norm": 1.0543550252914429, "learning_rate": 6.137168157610413e-06, "loss": 0.1866, "step": 4037 }, { "epoch": 1.912044983722995, "grad_norm": 1.8748868703842163, "learning_rate": 6.1324515166523535e-06, "loss": 0.1752, "step": 4038 }, { "epoch": 1.9125184965966262, "grad_norm": 1.1984789371490479, "learning_rate": 6.1277358872142875e-06, "loss": 0.213, "step": 4039 }, { "epoch": 1.9129920094702575, "grad_norm": 1.075285792350769, "learning_rate": 6.1230212705295455e-06, "loss": 0.2163, "step": 4040 }, { "epoch": 1.9134655223438886, "grad_norm": 1.1514887809753418, "learning_rate": 6.118307667831187e-06, "loss": 0.1981, "step": 4041 }, { "epoch": 1.91393903521752, "grad_norm": 1.0806608200073242, "learning_rate": 6.113595080352013e-06, "loss": 0.2059, "step": 4042 }, { "epoch": 1.9144125480911511, "grad_norm": 0.9989210963249207, "learning_rate": 6.10888350932455e-06, "loss": 0.1973, "step": 4043 }, { "epoch": 1.9148860609647826, "grad_norm": 1.4374021291732788, "learning_rate": 6.104172955981069e-06, "loss": 0.2153, "step": 4044 }, { "epoch": 1.9153595738384137, "grad_norm": 1.3092314004898071, "learning_rate": 6.0994634215535665e-06, "loss": 0.1993, "step": 4045 }, { "epoch": 1.915833086712045, "grad_norm": 1.3858625888824463, "learning_rate": 6.094754907273777e-06, "loss": 0.1935, "step": 4046 }, { "epoch": 1.9163065995856763, "grad_norm": 1.9879931211471558, "learning_rate": 6.090047414373166e-06, "loss": 0.1879, "step": 4047 }, { "epoch": 1.9167801124593074, "grad_norm": 1.0218373537063599, "learning_rate": 6.085340944082935e-06, "loss": 0.2103, "step": 4048 }, { "epoch": 1.9172536253329389, "grad_norm": 1.021353840827942, "learning_rate": 6.0806354976340145e-06, "loss": 0.2111, "step": 4049 }, { "epoch": 1.91772713820657, "grad_norm": 1.1734100580215454, "learning_rate": 6.075931076257069e-06, "loss": 0.2014, "step": 4050 }, { "epoch": 1.9182006510802012, "grad_norm": 1.5947613716125488, "learning_rate": 6.071227681182494e-06, "loss": 0.1894, "step": 4051 }, { "epoch": 1.9186741639538325, "grad_norm": 1.2673771381378174, "learning_rate": 6.066525313640419e-06, "loss": 0.1882, "step": 4052 }, { "epoch": 1.9191476768274638, "grad_norm": 1.011744499206543, "learning_rate": 6.061823974860699e-06, "loss": 0.2165, "step": 4053 }, { "epoch": 1.919621189701095, "grad_norm": 1.5951597690582275, "learning_rate": 6.057123666072927e-06, "loss": 0.232, "step": 4054 }, { "epoch": 1.9200947025747261, "grad_norm": 1.3709065914154053, "learning_rate": 6.052424388506421e-06, "loss": 0.2051, "step": 4055 }, { "epoch": 1.9205682154483577, "grad_norm": 1.1398015022277832, "learning_rate": 6.047726143390236e-06, "loss": 0.2136, "step": 4056 }, { "epoch": 1.9210417283219887, "grad_norm": 1.450731873512268, "learning_rate": 6.043028931953148e-06, "loss": 0.2186, "step": 4057 }, { "epoch": 1.92151524119562, "grad_norm": 1.0435858964920044, "learning_rate": 6.03833275542367e-06, "loss": 0.21, "step": 4058 }, { "epoch": 1.9219887540692513, "grad_norm": 1.3922533988952637, "learning_rate": 6.033637615030039e-06, "loss": 0.2023, "step": 4059 }, { "epoch": 1.9224622669428824, "grad_norm": 1.1114870309829712, "learning_rate": 6.028943512000227e-06, "loss": 0.2057, "step": 4060 }, { "epoch": 1.9229357798165139, "grad_norm": 1.509615421295166, "learning_rate": 6.02425044756193e-06, "loss": 0.1999, "step": 4061 }, { "epoch": 1.923409292690145, "grad_norm": 1.4083658456802368, "learning_rate": 6.019558422942575e-06, "loss": 0.2151, "step": 4062 }, { "epoch": 1.9238828055637762, "grad_norm": 1.1245790719985962, "learning_rate": 6.014867439369314e-06, "loss": 0.2422, "step": 4063 }, { "epoch": 1.9243563184374075, "grad_norm": 1.3989055156707764, "learning_rate": 6.010177498069027e-06, "loss": 0.1943, "step": 4064 }, { "epoch": 1.9248298313110388, "grad_norm": 1.1599762439727783, "learning_rate": 6.005488600268328e-06, "loss": 0.2005, "step": 4065 }, { "epoch": 1.92530334418467, "grad_norm": 1.421303391456604, "learning_rate": 6.000800747193547e-06, "loss": 0.2228, "step": 4066 }, { "epoch": 1.9257768570583012, "grad_norm": 1.3245368003845215, "learning_rate": 5.996113940070754e-06, "loss": 0.2075, "step": 4067 }, { "epoch": 1.9262503699319327, "grad_norm": 1.5470653772354126, "learning_rate": 5.99142818012573e-06, "loss": 0.2029, "step": 4068 }, { "epoch": 1.9267238828055637, "grad_norm": 1.1906659603118896, "learning_rate": 5.986743468583996e-06, "loss": 0.2261, "step": 4069 }, { "epoch": 1.927197395679195, "grad_norm": 1.2057496309280396, "learning_rate": 5.982059806670788e-06, "loss": 0.2086, "step": 4070 }, { "epoch": 1.9276709085528263, "grad_norm": 1.0178190469741821, "learning_rate": 5.9773771956110785e-06, "loss": 0.1835, "step": 4071 }, { "epoch": 1.9281444214264574, "grad_norm": 1.4230746030807495, "learning_rate": 5.972695636629555e-06, "loss": 0.2209, "step": 4072 }, { "epoch": 1.9286179343000889, "grad_norm": 1.5597596168518066, "learning_rate": 5.968015130950638e-06, "loss": 0.1961, "step": 4073 }, { "epoch": 1.92909144717372, "grad_norm": 0.9596446752548218, "learning_rate": 5.963335679798465e-06, "loss": 0.1941, "step": 4074 }, { "epoch": 1.9295649600473515, "grad_norm": 1.1124833822250366, "learning_rate": 5.958657284396902e-06, "loss": 0.1979, "step": 4075 }, { "epoch": 1.9300384729209825, "grad_norm": 1.0595910549163818, "learning_rate": 5.953979945969539e-06, "loss": 0.2157, "step": 4076 }, { "epoch": 1.9305119857946138, "grad_norm": 0.9917038083076477, "learning_rate": 5.949303665739689e-06, "loss": 0.2023, "step": 4077 }, { "epoch": 1.930985498668245, "grad_norm": 1.0265789031982422, "learning_rate": 5.944628444930388e-06, "loss": 0.2223, "step": 4078 }, { "epoch": 1.9314590115418762, "grad_norm": 1.217456579208374, "learning_rate": 5.9399542847643935e-06, "loss": 0.2226, "step": 4079 }, { "epoch": 1.9319325244155077, "grad_norm": 1.284399151802063, "learning_rate": 5.935281186464188e-06, "loss": 0.2251, "step": 4080 }, { "epoch": 1.9324060372891387, "grad_norm": 1.6773645877838135, "learning_rate": 5.930609151251975e-06, "loss": 0.2266, "step": 4081 }, { "epoch": 1.93287955016277, "grad_norm": 1.1189942359924316, "learning_rate": 5.925938180349679e-06, "loss": 0.1975, "step": 4082 }, { "epoch": 1.9333530630364013, "grad_norm": 1.3536006212234497, "learning_rate": 5.921268274978951e-06, "loss": 0.2106, "step": 4083 }, { "epoch": 1.9338265759100326, "grad_norm": 1.3153413534164429, "learning_rate": 5.9165994363611565e-06, "loss": 0.2272, "step": 4084 }, { "epoch": 1.9343000887836639, "grad_norm": 1.37685227394104, "learning_rate": 5.911931665717386e-06, "loss": 0.2237, "step": 4085 }, { "epoch": 1.934773601657295, "grad_norm": 1.1147427558898926, "learning_rate": 5.907264964268451e-06, "loss": 0.2185, "step": 4086 }, { "epoch": 1.9352471145309265, "grad_norm": 1.5562469959259033, "learning_rate": 5.902599333234882e-06, "loss": 0.205, "step": 4087 }, { "epoch": 1.9357206274045575, "grad_norm": 1.3844199180603027, "learning_rate": 5.8979347738369276e-06, "loss": 0.2183, "step": 4088 }, { "epoch": 1.9361941402781888, "grad_norm": 1.3082056045532227, "learning_rate": 5.89327128729456e-06, "loss": 0.1965, "step": 4089 }, { "epoch": 1.93666765315182, "grad_norm": 0.897994875907898, "learning_rate": 5.888608874827469e-06, "loss": 0.2081, "step": 4090 }, { "epoch": 1.9371411660254512, "grad_norm": 0.9968763589859009, "learning_rate": 5.883947537655061e-06, "loss": 0.2208, "step": 4091 }, { "epoch": 1.9376146788990827, "grad_norm": 1.1803154945373535, "learning_rate": 5.8792872769964705e-06, "loss": 0.1927, "step": 4092 }, { "epoch": 1.9380881917727137, "grad_norm": 1.3356378078460693, "learning_rate": 5.874628094070536e-06, "loss": 0.1976, "step": 4093 }, { "epoch": 1.938561704646345, "grad_norm": 1.4385789632797241, "learning_rate": 5.869969990095828e-06, "loss": 0.22, "step": 4094 }, { "epoch": 1.9390352175199763, "grad_norm": 1.067402958869934, "learning_rate": 5.865312966290624e-06, "loss": 0.2028, "step": 4095 }, { "epoch": 1.9395087303936076, "grad_norm": 1.020371675491333, "learning_rate": 5.8606570238729286e-06, "loss": 0.2087, "step": 4096 }, { "epoch": 1.939982243267239, "grad_norm": 1.094397783279419, "learning_rate": 5.856002164060453e-06, "loss": 0.1967, "step": 4097 }, { "epoch": 1.94045575614087, "grad_norm": 1.3213562965393066, "learning_rate": 5.85134838807063e-06, "loss": 0.2218, "step": 4098 }, { "epoch": 1.9409292690145015, "grad_norm": 1.4701647758483887, "learning_rate": 5.846695697120617e-06, "loss": 0.2085, "step": 4099 }, { "epoch": 1.9414027818881325, "grad_norm": 0.9358981847763062, "learning_rate": 5.842044092427277e-06, "loss": 0.2129, "step": 4100 }, { "epoch": 1.9418762947617638, "grad_norm": 1.27782142162323, "learning_rate": 5.837393575207194e-06, "loss": 0.204, "step": 4101 }, { "epoch": 1.942349807635395, "grad_norm": 1.1165828704833984, "learning_rate": 5.832744146676661e-06, "loss": 0.2169, "step": 4102 }, { "epoch": 1.9428233205090262, "grad_norm": 1.0212310552597046, "learning_rate": 5.828095808051697e-06, "loss": 0.2175, "step": 4103 }, { "epoch": 1.9432968333826577, "grad_norm": 0.9246084094047546, "learning_rate": 5.823448560548024e-06, "loss": 0.1908, "step": 4104 }, { "epoch": 1.9437703462562888, "grad_norm": 1.1818164587020874, "learning_rate": 5.818802405381091e-06, "loss": 0.2047, "step": 4105 }, { "epoch": 1.9442438591299203, "grad_norm": 1.5867642164230347, "learning_rate": 5.814157343766049e-06, "loss": 0.2179, "step": 4106 }, { "epoch": 1.9447173720035513, "grad_norm": 1.1746879816055298, "learning_rate": 5.8095133769177766e-06, "loss": 0.1948, "step": 4107 }, { "epoch": 1.9451908848771826, "grad_norm": 1.1779289245605469, "learning_rate": 5.804870506050853e-06, "loss": 0.2007, "step": 4108 }, { "epoch": 1.945664397750814, "grad_norm": 0.9775161743164062, "learning_rate": 5.800228732379574e-06, "loss": 0.202, "step": 4109 }, { "epoch": 1.946137910624445, "grad_norm": 1.0120432376861572, "learning_rate": 5.795588057117958e-06, "loss": 0.1857, "step": 4110 }, { "epoch": 1.9466114234980765, "grad_norm": 1.3756533861160278, "learning_rate": 5.790948481479721e-06, "loss": 0.181, "step": 4111 }, { "epoch": 1.9470849363717075, "grad_norm": 1.0722591876983643, "learning_rate": 5.786310006678308e-06, "loss": 0.2051, "step": 4112 }, { "epoch": 1.9475584492453388, "grad_norm": 1.1463873386383057, "learning_rate": 5.781672633926858e-06, "loss": 0.1911, "step": 4113 }, { "epoch": 1.9480319621189701, "grad_norm": 1.0058190822601318, "learning_rate": 5.77703636443824e-06, "loss": 0.2162, "step": 4114 }, { "epoch": 1.9485054749926014, "grad_norm": 1.7192530632019043, "learning_rate": 5.772401199425017e-06, "loss": 0.2117, "step": 4115 }, { "epoch": 1.9489789878662327, "grad_norm": 1.1650121212005615, "learning_rate": 5.76776714009948e-06, "loss": 0.2388, "step": 4116 }, { "epoch": 1.9494525007398638, "grad_norm": 1.3511508703231812, "learning_rate": 5.763134187673618e-06, "loss": 0.2045, "step": 4117 }, { "epoch": 1.9499260136134953, "grad_norm": 1.0509499311447144, "learning_rate": 5.7585023433591315e-06, "loss": 0.2094, "step": 4118 }, { "epoch": 1.9503995264871263, "grad_norm": 1.0673551559448242, "learning_rate": 5.7538716083674425e-06, "loss": 0.1909, "step": 4119 }, { "epoch": 1.9508730393607576, "grad_norm": 1.0487030744552612, "learning_rate": 5.749241983909668e-06, "loss": 0.2173, "step": 4120 }, { "epoch": 1.951346552234389, "grad_norm": 1.2835848331451416, "learning_rate": 5.744613471196648e-06, "loss": 0.2258, "step": 4121 }, { "epoch": 1.95182006510802, "grad_norm": 1.9468538761138916, "learning_rate": 5.739986071438919e-06, "loss": 0.2424, "step": 4122 }, { "epoch": 1.9522935779816515, "grad_norm": 1.0411995649337769, "learning_rate": 5.735359785846739e-06, "loss": 0.1879, "step": 4123 }, { "epoch": 1.9527670908552826, "grad_norm": 1.0039364099502563, "learning_rate": 5.730734615630063e-06, "loss": 0.1935, "step": 4124 }, { "epoch": 1.9532406037289138, "grad_norm": 1.342782735824585, "learning_rate": 5.7261105619985635e-06, "loss": 0.2116, "step": 4125 }, { "epoch": 1.9537141166025451, "grad_norm": 0.9727945923805237, "learning_rate": 5.721487626161617e-06, "loss": 0.2056, "step": 4126 }, { "epoch": 1.9541876294761764, "grad_norm": 0.834372341632843, "learning_rate": 5.7168658093283026e-06, "loss": 0.1894, "step": 4127 }, { "epoch": 1.9546611423498077, "grad_norm": 1.5934944152832031, "learning_rate": 5.7122451127074185e-06, "loss": 0.206, "step": 4128 }, { "epoch": 1.9551346552234388, "grad_norm": 1.4059547185897827, "learning_rate": 5.7076255375074574e-06, "loss": 0.2246, "step": 4129 }, { "epoch": 1.9556081680970703, "grad_norm": 1.7891411781311035, "learning_rate": 5.703007084936631e-06, "loss": 0.188, "step": 4130 }, { "epoch": 1.9560816809707013, "grad_norm": 0.9586341381072998, "learning_rate": 5.698389756202844e-06, "loss": 0.2105, "step": 4131 }, { "epoch": 1.9565551938443326, "grad_norm": 1.686652660369873, "learning_rate": 5.693773552513723e-06, "loss": 0.2437, "step": 4132 }, { "epoch": 1.957028706717964, "grad_norm": 1.3186736106872559, "learning_rate": 5.689158475076582e-06, "loss": 0.2004, "step": 4133 }, { "epoch": 1.957502219591595, "grad_norm": 1.3622384071350098, "learning_rate": 5.6845445250984566e-06, "loss": 0.2079, "step": 4134 }, { "epoch": 1.9579757324652265, "grad_norm": 1.2416852712631226, "learning_rate": 5.679931703786077e-06, "loss": 0.2208, "step": 4135 }, { "epoch": 1.9584492453388576, "grad_norm": 0.9193575978279114, "learning_rate": 5.675320012345887e-06, "loss": 0.2292, "step": 4136 }, { "epoch": 1.958922758212489, "grad_norm": 1.2942508459091187, "learning_rate": 5.670709451984022e-06, "loss": 0.2146, "step": 4137 }, { "epoch": 1.9593962710861201, "grad_norm": 1.3082016706466675, "learning_rate": 5.666100023906336e-06, "loss": 0.2173, "step": 4138 }, { "epoch": 1.9598697839597514, "grad_norm": 1.0084069967269897, "learning_rate": 5.66149172931838e-06, "loss": 0.2172, "step": 4139 }, { "epoch": 1.9603432968333827, "grad_norm": 1.1174718141555786, "learning_rate": 5.656884569425407e-06, "loss": 0.2017, "step": 4140 }, { "epoch": 1.9608168097070138, "grad_norm": 1.1940710544586182, "learning_rate": 5.6522785454323795e-06, "loss": 0.2027, "step": 4141 }, { "epoch": 1.9612903225806453, "grad_norm": 1.1007012128829956, "learning_rate": 5.647673658543954e-06, "loss": 0.2218, "step": 4142 }, { "epoch": 1.9617638354542764, "grad_norm": 1.368607997894287, "learning_rate": 5.643069909964491e-06, "loss": 0.1888, "step": 4143 }, { "epoch": 1.9622373483279076, "grad_norm": 1.2197048664093018, "learning_rate": 5.638467300898067e-06, "loss": 0.2261, "step": 4144 }, { "epoch": 1.962710861201539, "grad_norm": 1.1047946214675903, "learning_rate": 5.6338658325484395e-06, "loss": 0.1864, "step": 4145 }, { "epoch": 1.9631843740751702, "grad_norm": 1.3874999284744263, "learning_rate": 5.629265506119086e-06, "loss": 0.2419, "step": 4146 }, { "epoch": 1.9636578869488015, "grad_norm": 1.1594977378845215, "learning_rate": 5.62466632281317e-06, "loss": 0.2087, "step": 4147 }, { "epoch": 1.9641313998224326, "grad_norm": 1.0370484590530396, "learning_rate": 5.620068283833573e-06, "loss": 0.2138, "step": 4148 }, { "epoch": 1.964604912696064, "grad_norm": 1.3975090980529785, "learning_rate": 5.615471390382858e-06, "loss": 0.1873, "step": 4149 }, { "epoch": 1.9650784255696951, "grad_norm": 1.7761406898498535, "learning_rate": 5.610875643663305e-06, "loss": 0.215, "step": 4150 }, { "epoch": 1.9655519384433264, "grad_norm": 1.3416228294372559, "learning_rate": 5.606281044876887e-06, "loss": 0.2152, "step": 4151 }, { "epoch": 1.9660254513169577, "grad_norm": 1.7138333320617676, "learning_rate": 5.601687595225269e-06, "loss": 0.2265, "step": 4152 }, { "epoch": 1.9664989641905888, "grad_norm": 1.7663260698318481, "learning_rate": 5.597095295909833e-06, "loss": 0.2109, "step": 4153 }, { "epoch": 1.9669724770642203, "grad_norm": 1.0027509927749634, "learning_rate": 5.592504148131645e-06, "loss": 0.2036, "step": 4154 }, { "epoch": 1.9674459899378514, "grad_norm": 0.9841294884681702, "learning_rate": 5.587914153091479e-06, "loss": 0.223, "step": 4155 }, { "epoch": 1.9679195028114826, "grad_norm": 1.8754013776779175, "learning_rate": 5.583325311989799e-06, "loss": 0.1892, "step": 4156 }, { "epoch": 1.968393015685114, "grad_norm": 1.138641357421875, "learning_rate": 5.57873762602678e-06, "loss": 0.2099, "step": 4157 }, { "epoch": 1.9688665285587452, "grad_norm": 1.1637465953826904, "learning_rate": 5.574151096402276e-06, "loss": 0.1921, "step": 4158 }, { "epoch": 1.9693400414323765, "grad_norm": 1.3505114316940308, "learning_rate": 5.569565724315862e-06, "loss": 0.2148, "step": 4159 }, { "epoch": 1.9698135543060076, "grad_norm": 2.0613527297973633, "learning_rate": 5.5649815109667874e-06, "loss": 0.1949, "step": 4160 }, { "epoch": 1.970287067179639, "grad_norm": 1.8924801349639893, "learning_rate": 5.560398457554016e-06, "loss": 0.2145, "step": 4161 }, { "epoch": 1.9707605800532702, "grad_norm": 1.1464554071426392, "learning_rate": 5.5558165652762e-06, "loss": 0.2119, "step": 4162 }, { "epoch": 1.9712340929269014, "grad_norm": 1.047352910041809, "learning_rate": 5.551235835331682e-06, "loss": 0.2148, "step": 4163 }, { "epoch": 1.9717076058005327, "grad_norm": 1.259980320930481, "learning_rate": 5.546656268918517e-06, "loss": 0.2182, "step": 4164 }, { "epoch": 1.9721811186741638, "grad_norm": 1.1720131635665894, "learning_rate": 5.542077867234441e-06, "loss": 0.1947, "step": 4165 }, { "epoch": 1.9726546315477953, "grad_norm": 1.064708948135376, "learning_rate": 5.537500631476895e-06, "loss": 0.1935, "step": 4166 }, { "epoch": 1.9731281444214264, "grad_norm": 1.1356375217437744, "learning_rate": 5.5329245628430036e-06, "loss": 0.1802, "step": 4167 }, { "epoch": 1.9736016572950577, "grad_norm": 2.05705189704895, "learning_rate": 5.528349662529604e-06, "loss": 0.2095, "step": 4168 }, { "epoch": 1.974075170168689, "grad_norm": 1.081849455833435, "learning_rate": 5.5237759317332065e-06, "loss": 0.2166, "step": 4169 }, { "epoch": 1.9745486830423202, "grad_norm": 1.1514532566070557, "learning_rate": 5.519203371650035e-06, "loss": 0.2273, "step": 4170 }, { "epoch": 1.9750221959159515, "grad_norm": 1.438201665878296, "learning_rate": 5.514631983475995e-06, "loss": 0.2014, "step": 4171 }, { "epoch": 1.9754957087895826, "grad_norm": 1.2542989253997803, "learning_rate": 5.510061768406683e-06, "loss": 0.2045, "step": 4172 }, { "epoch": 1.975969221663214, "grad_norm": 0.9703104496002197, "learning_rate": 5.505492727637406e-06, "loss": 0.1985, "step": 4173 }, { "epoch": 1.9764427345368452, "grad_norm": 1.1615082025527954, "learning_rate": 5.5009248623631416e-06, "loss": 0.1902, "step": 4174 }, { "epoch": 1.9769162474104764, "grad_norm": 1.1647366285324097, "learning_rate": 5.496358173778582e-06, "loss": 0.2114, "step": 4175 }, { "epoch": 1.9773897602841077, "grad_norm": 1.780960202217102, "learning_rate": 5.4917926630780895e-06, "loss": 0.2196, "step": 4176 }, { "epoch": 1.977863273157739, "grad_norm": 1.0951555967330933, "learning_rate": 5.487228331455734e-06, "loss": 0.1874, "step": 4177 }, { "epoch": 1.9783367860313703, "grad_norm": 1.208298921585083, "learning_rate": 5.482665180105278e-06, "loss": 0.1986, "step": 4178 }, { "epoch": 1.9788102989050014, "grad_norm": 1.1400575637817383, "learning_rate": 5.4781032102201605e-06, "loss": 0.2193, "step": 4179 }, { "epoch": 1.9792838117786329, "grad_norm": 1.7641512155532837, "learning_rate": 5.4735424229935274e-06, "loss": 0.2038, "step": 4180 }, { "epoch": 1.979757324652264, "grad_norm": 1.020308256149292, "learning_rate": 5.468982819618204e-06, "loss": 0.2143, "step": 4181 }, { "epoch": 1.9802308375258952, "grad_norm": 0.9065782427787781, "learning_rate": 5.464424401286715e-06, "loss": 0.2096, "step": 4182 }, { "epoch": 1.9807043503995265, "grad_norm": 1.3247904777526855, "learning_rate": 5.459867169191267e-06, "loss": 0.2037, "step": 4183 }, { "epoch": 1.9811778632731576, "grad_norm": 1.2426649332046509, "learning_rate": 5.455311124523762e-06, "loss": 0.227, "step": 4184 }, { "epoch": 1.981651376146789, "grad_norm": 0.9883340001106262, "learning_rate": 5.4507562684757875e-06, "loss": 0.1973, "step": 4185 }, { "epoch": 1.9821248890204202, "grad_norm": 1.001715898513794, "learning_rate": 5.446202602238626e-06, "loss": 0.2071, "step": 4186 }, { "epoch": 1.9825984018940515, "grad_norm": 0.9738106727600098, "learning_rate": 5.441650127003244e-06, "loss": 0.2014, "step": 4187 }, { "epoch": 1.9830719147676827, "grad_norm": 1.1363893747329712, "learning_rate": 5.4370988439602916e-06, "loss": 0.1934, "step": 4188 }, { "epoch": 1.983545427641314, "grad_norm": 1.2461233139038086, "learning_rate": 5.4325487543001196e-06, "loss": 0.2202, "step": 4189 }, { "epoch": 1.9840189405149453, "grad_norm": 1.1095913648605347, "learning_rate": 5.427999859212757e-06, "loss": 0.2152, "step": 4190 }, { "epoch": 1.9844924533885764, "grad_norm": 1.7337262630462646, "learning_rate": 5.423452159887927e-06, "loss": 0.2166, "step": 4191 }, { "epoch": 1.984965966262208, "grad_norm": 0.9954274296760559, "learning_rate": 5.41890565751503e-06, "loss": 0.2174, "step": 4192 }, { "epoch": 1.985439479135839, "grad_norm": 1.3146260976791382, "learning_rate": 5.414360353283168e-06, "loss": 0.1902, "step": 4193 }, { "epoch": 1.9859129920094702, "grad_norm": 1.0898852348327637, "learning_rate": 5.409816248381112e-06, "loss": 0.2221, "step": 4194 }, { "epoch": 1.9863865048831015, "grad_norm": 1.0851001739501953, "learning_rate": 5.405273343997339e-06, "loss": 0.1969, "step": 4195 }, { "epoch": 1.9868600177567326, "grad_norm": 1.586696743965149, "learning_rate": 5.400731641319996e-06, "loss": 0.198, "step": 4196 }, { "epoch": 1.987333530630364, "grad_norm": 1.0100387334823608, "learning_rate": 5.39619114153692e-06, "loss": 0.1984, "step": 4197 }, { "epoch": 1.9878070435039952, "grad_norm": 1.0449753999710083, "learning_rate": 5.39165184583564e-06, "loss": 0.2017, "step": 4198 }, { "epoch": 1.9882805563776265, "grad_norm": 1.3189187049865723, "learning_rate": 5.387113755403357e-06, "loss": 0.1889, "step": 4199 }, { "epoch": 1.9887540692512578, "grad_norm": 1.8392877578735352, "learning_rate": 5.3825768714269745e-06, "loss": 0.2115, "step": 4200 }, { "epoch": 1.989227582124889, "grad_norm": 0.9645484685897827, "learning_rate": 5.378041195093063e-06, "loss": 0.1919, "step": 4201 }, { "epoch": 1.9897010949985203, "grad_norm": 1.2029494047164917, "learning_rate": 5.3735067275878915e-06, "loss": 0.2015, "step": 4202 }, { "epoch": 1.9901746078721514, "grad_norm": 1.8107424974441528, "learning_rate": 5.368973470097401e-06, "loss": 0.209, "step": 4203 }, { "epoch": 1.990648120745783, "grad_norm": 1.4543911218643188, "learning_rate": 5.364441423807224e-06, "loss": 0.2173, "step": 4204 }, { "epoch": 1.991121633619414, "grad_norm": 1.1092660427093506, "learning_rate": 5.359910589902674e-06, "loss": 0.2003, "step": 4205 }, { "epoch": 1.9915951464930453, "grad_norm": 1.1251716613769531, "learning_rate": 5.355380969568742e-06, "loss": 0.2065, "step": 4206 }, { "epoch": 1.9920686593666765, "grad_norm": 1.116974949836731, "learning_rate": 5.350852563990112e-06, "loss": 0.2168, "step": 4207 }, { "epoch": 1.9925421722403078, "grad_norm": 1.312705397605896, "learning_rate": 5.34632537435114e-06, "loss": 0.2283, "step": 4208 }, { "epoch": 1.9930156851139391, "grad_norm": 1.2690129280090332, "learning_rate": 5.341799401835877e-06, "loss": 0.2151, "step": 4209 }, { "epoch": 1.9934891979875702, "grad_norm": 0.9621937870979309, "learning_rate": 5.3372746476280366e-06, "loss": 0.2054, "step": 4210 }, { "epoch": 1.9939627108612017, "grad_norm": 1.264472246170044, "learning_rate": 5.3327511129110344e-06, "loss": 0.2327, "step": 4211 }, { "epoch": 1.9944362237348328, "grad_norm": 1.66592276096344, "learning_rate": 5.328228798867947e-06, "loss": 0.2133, "step": 4212 }, { "epoch": 1.994909736608464, "grad_norm": 1.2937157154083252, "learning_rate": 5.323707706681553e-06, "loss": 0.2025, "step": 4213 }, { "epoch": 1.9953832494820953, "grad_norm": 1.0050321817398071, "learning_rate": 5.319187837534292e-06, "loss": 0.2058, "step": 4214 }, { "epoch": 1.9958567623557264, "grad_norm": 1.160569190979004, "learning_rate": 5.314669192608296e-06, "loss": 0.205, "step": 4215 }, { "epoch": 1.996330275229358, "grad_norm": 0.9353100657463074, "learning_rate": 5.310151773085376e-06, "loss": 0.1983, "step": 4216 }, { "epoch": 1.996803788102989, "grad_norm": 1.170721173286438, "learning_rate": 5.3056355801470114e-06, "loss": 0.2015, "step": 4217 }, { "epoch": 1.9972773009766203, "grad_norm": 1.126014232635498, "learning_rate": 5.301120614974378e-06, "loss": 0.1832, "step": 4218 }, { "epoch": 1.9977508138502516, "grad_norm": 0.9776692390441895, "learning_rate": 5.296606878748313e-06, "loss": 0.1971, "step": 4219 }, { "epoch": 1.9982243267238828, "grad_norm": 1.0567349195480347, "learning_rate": 5.292094372649348e-06, "loss": 0.2223, "step": 4220 }, { "epoch": 1.9986978395975141, "grad_norm": 1.1440393924713135, "learning_rate": 5.287583097857682e-06, "loss": 0.2532, "step": 4221 }, { "epoch": 1.9991713524711452, "grad_norm": 0.9647483229637146, "learning_rate": 5.283073055553191e-06, "loss": 0.2087, "step": 4222 }, { "epoch": 1.9996448653447767, "grad_norm": 1.0528312921524048, "learning_rate": 5.278564246915441e-06, "loss": 0.2172, "step": 4223 }, { "epoch": 2.0001183782184078, "grad_norm": 1.2826507091522217, "learning_rate": 5.27405667312366e-06, "loss": 0.1835, "step": 4224 }, { "epoch": 2.0005918910920393, "grad_norm": 2.0795340538024902, "learning_rate": 5.269550335356769e-06, "loss": 0.1905, "step": 4225 }, { "epoch": 2.0010654039656703, "grad_norm": 1.348443627357483, "learning_rate": 5.265045234793348e-06, "loss": 0.1877, "step": 4226 }, { "epoch": 2.0015389168393014, "grad_norm": 0.9791116118431091, "learning_rate": 5.260541372611669e-06, "loss": 0.2081, "step": 4227 }, { "epoch": 2.002012429712933, "grad_norm": 0.9848561882972717, "learning_rate": 5.256038749989671e-06, "loss": 0.18, "step": 4228 }, { "epoch": 2.002485942586564, "grad_norm": 1.2935410737991333, "learning_rate": 5.251537368104974e-06, "loss": 0.1972, "step": 4229 }, { "epoch": 2.0029594554601955, "grad_norm": 1.8192050457000732, "learning_rate": 5.2470372281348695e-06, "loss": 0.2058, "step": 4230 }, { "epoch": 2.0034329683338266, "grad_norm": 1.1813068389892578, "learning_rate": 5.242538331256322e-06, "loss": 0.2309, "step": 4231 }, { "epoch": 2.0039064812074576, "grad_norm": 1.1924341917037964, "learning_rate": 5.238040678645981e-06, "loss": 0.1972, "step": 4232 }, { "epoch": 2.004379994081089, "grad_norm": 1.2242668867111206, "learning_rate": 5.233544271480158e-06, "loss": 0.1799, "step": 4233 }, { "epoch": 2.00485350695472, "grad_norm": 1.3374933004379272, "learning_rate": 5.22904911093485e-06, "loss": 0.2176, "step": 4234 }, { "epoch": 2.0053270198283517, "grad_norm": 1.1583667993545532, "learning_rate": 5.224555198185719e-06, "loss": 0.1999, "step": 4235 }, { "epoch": 2.0058005327019828, "grad_norm": 1.0789278745651245, "learning_rate": 5.220062534408109e-06, "loss": 0.2103, "step": 4236 }, { "epoch": 2.0062740455756143, "grad_norm": 1.0617800951004028, "learning_rate": 5.215571120777027e-06, "loss": 0.1883, "step": 4237 }, { "epoch": 2.0067475584492453, "grad_norm": 1.34521484375, "learning_rate": 5.211080958467166e-06, "loss": 0.2037, "step": 4238 }, { "epoch": 2.0072210713228764, "grad_norm": 1.116936445236206, "learning_rate": 5.206592048652876e-06, "loss": 0.1971, "step": 4239 }, { "epoch": 2.007694584196508, "grad_norm": 2.06673264503479, "learning_rate": 5.202104392508198e-06, "loss": 0.2102, "step": 4240 }, { "epoch": 2.008168097070139, "grad_norm": 0.9581062197685242, "learning_rate": 5.19761799120683e-06, "loss": 0.214, "step": 4241 }, { "epoch": 2.0086416099437705, "grad_norm": 1.1789650917053223, "learning_rate": 5.193132845922143e-06, "loss": 0.2078, "step": 4242 }, { "epoch": 2.0091151228174016, "grad_norm": 1.1819120645523071, "learning_rate": 5.188648957827191e-06, "loss": 0.1923, "step": 4243 }, { "epoch": 2.0095886356910326, "grad_norm": 2.0509867668151855, "learning_rate": 5.184166328094684e-06, "loss": 0.2147, "step": 4244 }, { "epoch": 2.010062148564664, "grad_norm": 1.0577517747879028, "learning_rate": 5.17968495789702e-06, "loss": 0.2028, "step": 4245 }, { "epoch": 2.010535661438295, "grad_norm": 1.2950037717819214, "learning_rate": 5.175204848406248e-06, "loss": 0.1884, "step": 4246 }, { "epoch": 2.0110091743119267, "grad_norm": 1.7854464054107666, "learning_rate": 5.170726000794105e-06, "loss": 0.1976, "step": 4247 }, { "epoch": 2.011482687185558, "grad_norm": 1.12384831905365, "learning_rate": 5.166248416231985e-06, "loss": 0.1866, "step": 4248 }, { "epoch": 2.0119562000591893, "grad_norm": 1.3171573877334595, "learning_rate": 5.161772095890963e-06, "loss": 0.1952, "step": 4249 }, { "epoch": 2.0124297129328204, "grad_norm": 1.4037394523620605, "learning_rate": 5.157297040941775e-06, "loss": 0.1977, "step": 4250 }, { "epoch": 2.0129032258064514, "grad_norm": 1.2727725505828857, "learning_rate": 5.152823252554824e-06, "loss": 0.2269, "step": 4251 }, { "epoch": 2.013376738680083, "grad_norm": 1.1373170614242554, "learning_rate": 5.1483507319001925e-06, "loss": 0.2105, "step": 4252 }, { "epoch": 2.013850251553714, "grad_norm": 1.0914908647537231, "learning_rate": 5.143879480147616e-06, "loss": 0.225, "step": 4253 }, { "epoch": 2.0143237644273455, "grad_norm": 1.7061387300491333, "learning_rate": 5.1394094984665185e-06, "loss": 0.1815, "step": 4254 }, { "epoch": 2.0147972773009766, "grad_norm": 1.8330706357955933, "learning_rate": 5.134940788025978e-06, "loss": 0.1946, "step": 4255 }, { "epoch": 2.015270790174608, "grad_norm": 1.9821844100952148, "learning_rate": 5.130473349994737e-06, "loss": 0.2131, "step": 4256 }, { "epoch": 2.015744303048239, "grad_norm": 1.2151135206222534, "learning_rate": 5.1260071855412175e-06, "loss": 0.1673, "step": 4257 }, { "epoch": 2.01621781592187, "grad_norm": 1.4756810665130615, "learning_rate": 5.121542295833493e-06, "loss": 0.2035, "step": 4258 }, { "epoch": 2.0166913287955017, "grad_norm": 1.570881724357605, "learning_rate": 5.117078682039323e-06, "loss": 0.2112, "step": 4259 }, { "epoch": 2.017164841669133, "grad_norm": 1.0294634103775024, "learning_rate": 5.112616345326114e-06, "loss": 0.2062, "step": 4260 }, { "epoch": 2.0176383545427643, "grad_norm": 0.987274169921875, "learning_rate": 5.108155286860953e-06, "loss": 0.2028, "step": 4261 }, { "epoch": 2.0181118674163954, "grad_norm": 1.121518611907959, "learning_rate": 5.10369550781058e-06, "loss": 0.2018, "step": 4262 }, { "epoch": 2.0185853802900264, "grad_norm": 0.953002393245697, "learning_rate": 5.099237009341417e-06, "loss": 0.1732, "step": 4263 }, { "epoch": 2.019058893163658, "grad_norm": 0.955437183380127, "learning_rate": 5.094779792619531e-06, "loss": 0.1911, "step": 4264 }, { "epoch": 2.019532406037289, "grad_norm": 1.8718898296356201, "learning_rate": 5.0903238588106725e-06, "loss": 0.1966, "step": 4265 }, { "epoch": 2.0200059189109205, "grad_norm": 1.4193190336227417, "learning_rate": 5.085869209080246e-06, "loss": 0.2138, "step": 4266 }, { "epoch": 2.0204794317845516, "grad_norm": 1.1836130619049072, "learning_rate": 5.081415844593314e-06, "loss": 0.1817, "step": 4267 }, { "epoch": 2.020952944658183, "grad_norm": 2.136206865310669, "learning_rate": 5.076963766514622e-06, "loss": 0.21, "step": 4268 }, { "epoch": 2.021426457531814, "grad_norm": 1.0202592611312866, "learning_rate": 5.072512976008559e-06, "loss": 0.223, "step": 4269 }, { "epoch": 2.0218999704054452, "grad_norm": 1.1156057119369507, "learning_rate": 5.068063474239195e-06, "loss": 0.2055, "step": 4270 }, { "epoch": 2.0223734832790767, "grad_norm": 1.3342862129211426, "learning_rate": 5.063615262370247e-06, "loss": 0.1872, "step": 4271 }, { "epoch": 2.022846996152708, "grad_norm": 1.031692624092102, "learning_rate": 5.059168341565109e-06, "loss": 0.2132, "step": 4272 }, { "epoch": 2.0233205090263393, "grad_norm": 1.07035493850708, "learning_rate": 5.0547227129868225e-06, "loss": 0.2238, "step": 4273 }, { "epoch": 2.0237940218999704, "grad_norm": 1.6974101066589355, "learning_rate": 5.050278377798105e-06, "loss": 0.194, "step": 4274 }, { "epoch": 2.0242675347736014, "grad_norm": 1.17416512966156, "learning_rate": 5.045835337161329e-06, "loss": 0.2137, "step": 4275 }, { "epoch": 2.024741047647233, "grad_norm": 1.0354957580566406, "learning_rate": 5.041393592238521e-06, "loss": 0.2022, "step": 4276 }, { "epoch": 2.025214560520864, "grad_norm": 1.328599452972412, "learning_rate": 5.036953144191388e-06, "loss": 0.2116, "step": 4277 }, { "epoch": 2.0256880733944955, "grad_norm": 0.9613640308380127, "learning_rate": 5.032513994181276e-06, "loss": 0.1968, "step": 4278 }, { "epoch": 2.0261615862681266, "grad_norm": 1.4717971086502075, "learning_rate": 5.0280761433692095e-06, "loss": 0.2161, "step": 4279 }, { "epoch": 2.026635099141758, "grad_norm": 1.2015409469604492, "learning_rate": 5.023639592915861e-06, "loss": 0.2002, "step": 4280 }, { "epoch": 2.027108612015389, "grad_norm": 0.9941813945770264, "learning_rate": 5.019204343981572e-06, "loss": 0.1973, "step": 4281 }, { "epoch": 2.0275821248890202, "grad_norm": 1.1398041248321533, "learning_rate": 5.0147703977263315e-06, "loss": 0.2201, "step": 4282 }, { "epoch": 2.0280556377626517, "grad_norm": 1.4286599159240723, "learning_rate": 5.010337755309804e-06, "loss": 0.1974, "step": 4283 }, { "epoch": 2.028529150636283, "grad_norm": 1.7072778940200806, "learning_rate": 5.005906417891302e-06, "loss": 0.204, "step": 4284 }, { "epoch": 2.0290026635099143, "grad_norm": 1.2885264158248901, "learning_rate": 5.0014763866297935e-06, "loss": 0.2257, "step": 4285 }, { "epoch": 2.0294761763835454, "grad_norm": 1.2946069240570068, "learning_rate": 4.997047662683917e-06, "loss": 0.2029, "step": 4286 }, { "epoch": 2.029949689257177, "grad_norm": 1.0730814933776855, "learning_rate": 4.992620247211957e-06, "loss": 0.208, "step": 4287 }, { "epoch": 2.030423202130808, "grad_norm": 1.4278727769851685, "learning_rate": 4.988194141371868e-06, "loss": 0.1989, "step": 4288 }, { "epoch": 2.030896715004439, "grad_norm": 0.9635195136070251, "learning_rate": 4.9837693463212474e-06, "loss": 0.2119, "step": 4289 }, { "epoch": 2.0313702278780705, "grad_norm": 1.49040949344635, "learning_rate": 4.979345863217366e-06, "loss": 0.1987, "step": 4290 }, { "epoch": 2.0318437407517016, "grad_norm": 1.3482121229171753, "learning_rate": 4.974923693217135e-06, "loss": 0.2184, "step": 4291 }, { "epoch": 2.032317253625333, "grad_norm": 1.1210942268371582, "learning_rate": 4.970502837477135e-06, "loss": 0.211, "step": 4292 }, { "epoch": 2.032790766498964, "grad_norm": 1.4026209115982056, "learning_rate": 4.9660832971536014e-06, "loss": 0.1975, "step": 4293 }, { "epoch": 2.0332642793725952, "grad_norm": 1.885514259338379, "learning_rate": 4.961665073402413e-06, "loss": 0.2031, "step": 4294 }, { "epoch": 2.0337377922462267, "grad_norm": 1.712158441543579, "learning_rate": 4.957248167379124e-06, "loss": 0.202, "step": 4295 }, { "epoch": 2.034211305119858, "grad_norm": 1.4192404747009277, "learning_rate": 4.9528325802389246e-06, "loss": 0.2093, "step": 4296 }, { "epoch": 2.0346848179934893, "grad_norm": 1.8846938610076904, "learning_rate": 4.948418313136676e-06, "loss": 0.2055, "step": 4297 }, { "epoch": 2.0351583308671204, "grad_norm": 1.1656869649887085, "learning_rate": 4.94400536722688e-06, "loss": 0.2003, "step": 4298 }, { "epoch": 2.035631843740752, "grad_norm": 1.072436809539795, "learning_rate": 4.9395937436637096e-06, "loss": 0.1975, "step": 4299 }, { "epoch": 2.036105356614383, "grad_norm": 1.109155297279358, "learning_rate": 4.935183443600976e-06, "loss": 0.191, "step": 4300 }, { "epoch": 2.036578869488014, "grad_norm": 1.9992730617523193, "learning_rate": 4.930774468192147e-06, "loss": 0.2152, "step": 4301 }, { "epoch": 2.0370523823616455, "grad_norm": 1.2080824375152588, "learning_rate": 4.926366818590358e-06, "loss": 0.2105, "step": 4302 }, { "epoch": 2.0375258952352766, "grad_norm": 1.5259896516799927, "learning_rate": 4.921960495948377e-06, "loss": 0.2042, "step": 4303 }, { "epoch": 2.037999408108908, "grad_norm": 1.8273221254348755, "learning_rate": 4.917555501418643e-06, "loss": 0.2145, "step": 4304 }, { "epoch": 2.038472920982539, "grad_norm": 1.103759765625, "learning_rate": 4.913151836153232e-06, "loss": 0.1831, "step": 4305 }, { "epoch": 2.0389464338561702, "grad_norm": 1.3072751760482788, "learning_rate": 4.908749501303889e-06, "loss": 0.1974, "step": 4306 }, { "epoch": 2.0394199467298018, "grad_norm": 1.1292860507965088, "learning_rate": 4.904348498021993e-06, "loss": 0.1865, "step": 4307 }, { "epoch": 2.039893459603433, "grad_norm": 1.2447689771652222, "learning_rate": 4.8999488274585935e-06, "loss": 0.2132, "step": 4308 }, { "epoch": 2.0403669724770643, "grad_norm": 0.8923916816711426, "learning_rate": 4.895550490764377e-06, "loss": 0.1893, "step": 4309 }, { "epoch": 2.0408404853506954, "grad_norm": 1.4532053470611572, "learning_rate": 4.891153489089681e-06, "loss": 0.2084, "step": 4310 }, { "epoch": 2.041313998224327, "grad_norm": 1.329358696937561, "learning_rate": 4.886757823584511e-06, "loss": 0.1988, "step": 4311 }, { "epoch": 2.041787511097958, "grad_norm": 1.5044658184051514, "learning_rate": 4.882363495398498e-06, "loss": 0.1831, "step": 4312 }, { "epoch": 2.042261023971589, "grad_norm": 1.3755611181259155, "learning_rate": 4.877970505680946e-06, "loss": 0.2147, "step": 4313 }, { "epoch": 2.0427345368452205, "grad_norm": 0.9415785074234009, "learning_rate": 4.8735788555807905e-06, "loss": 0.1909, "step": 4314 }, { "epoch": 2.0432080497188516, "grad_norm": 1.1154485940933228, "learning_rate": 4.8691885462466345e-06, "loss": 0.2137, "step": 4315 }, { "epoch": 2.043681562592483, "grad_norm": 1.1781600713729858, "learning_rate": 4.864799578826713e-06, "loss": 0.2118, "step": 4316 }, { "epoch": 2.044155075466114, "grad_norm": 1.8692867755889893, "learning_rate": 4.860411954468925e-06, "loss": 0.2269, "step": 4317 }, { "epoch": 2.0446285883397457, "grad_norm": 1.1391620635986328, "learning_rate": 4.856025674320803e-06, "loss": 0.199, "step": 4318 }, { "epoch": 2.0451021012133768, "grad_norm": 1.0855448246002197, "learning_rate": 4.851640739529547e-06, "loss": 0.191, "step": 4319 }, { "epoch": 2.045575614087008, "grad_norm": 1.8946772813796997, "learning_rate": 4.847257151241987e-06, "loss": 0.2126, "step": 4320 }, { "epoch": 2.0460491269606393, "grad_norm": 1.7969681024551392, "learning_rate": 4.842874910604606e-06, "loss": 0.2058, "step": 4321 }, { "epoch": 2.0465226398342704, "grad_norm": 1.028253436088562, "learning_rate": 4.838494018763546e-06, "loss": 0.2044, "step": 4322 }, { "epoch": 2.046996152707902, "grad_norm": 1.3800286054611206, "learning_rate": 4.8341144768645754e-06, "loss": 0.2053, "step": 4323 }, { "epoch": 2.047469665581533, "grad_norm": 0.9476808905601501, "learning_rate": 4.829736286053131e-06, "loss": 0.2135, "step": 4324 }, { "epoch": 2.047943178455164, "grad_norm": 1.1911259889602661, "learning_rate": 4.82535944747428e-06, "loss": 0.2038, "step": 4325 }, { "epoch": 2.0484166913287956, "grad_norm": 0.8657131791114807, "learning_rate": 4.820983962272748e-06, "loss": 0.1959, "step": 4326 }, { "epoch": 2.0488902042024266, "grad_norm": 2.2093346118927, "learning_rate": 4.816609831592895e-06, "loss": 0.208, "step": 4327 }, { "epoch": 2.049363717076058, "grad_norm": 1.8499171733856201, "learning_rate": 4.812237056578738e-06, "loss": 0.206, "step": 4328 }, { "epoch": 2.049837229949689, "grad_norm": 1.404910683631897, "learning_rate": 4.807865638373932e-06, "loss": 0.1919, "step": 4329 }, { "epoch": 2.0503107428233207, "grad_norm": 0.8892547488212585, "learning_rate": 4.803495578121775e-06, "loss": 0.1778, "step": 4330 }, { "epoch": 2.0507842556969518, "grad_norm": 1.3549368381500244, "learning_rate": 4.799126876965219e-06, "loss": 0.202, "step": 4331 }, { "epoch": 2.051257768570583, "grad_norm": 1.4522606134414673, "learning_rate": 4.794759536046854e-06, "loss": 0.1813, "step": 4332 }, { "epoch": 2.0517312814442143, "grad_norm": 1.1356416940689087, "learning_rate": 4.790393556508918e-06, "loss": 0.1793, "step": 4333 }, { "epoch": 2.0522047943178454, "grad_norm": 1.0403809547424316, "learning_rate": 4.786028939493292e-06, "loss": 0.2073, "step": 4334 }, { "epoch": 2.052678307191477, "grad_norm": 1.0235661268234253, "learning_rate": 4.781665686141493e-06, "loss": 0.1865, "step": 4335 }, { "epoch": 2.053151820065108, "grad_norm": 1.1404122114181519, "learning_rate": 4.777303797594694e-06, "loss": 0.1997, "step": 4336 }, { "epoch": 2.053625332938739, "grad_norm": 1.2667036056518555, "learning_rate": 4.772943274993701e-06, "loss": 0.2054, "step": 4337 }, { "epoch": 2.0540988458123706, "grad_norm": 1.177870273590088, "learning_rate": 4.768584119478971e-06, "loss": 0.2103, "step": 4338 }, { "epoch": 2.0545723586860016, "grad_norm": 0.9611978530883789, "learning_rate": 4.764226332190591e-06, "loss": 0.203, "step": 4339 }, { "epoch": 2.055045871559633, "grad_norm": 1.3527143001556396, "learning_rate": 4.759869914268308e-06, "loss": 0.2133, "step": 4340 }, { "epoch": 2.055519384433264, "grad_norm": 0.833387017250061, "learning_rate": 4.7555148668514925e-06, "loss": 0.1858, "step": 4341 }, { "epoch": 2.0559928973068957, "grad_norm": 1.0796325206756592, "learning_rate": 4.751161191079173e-06, "loss": 0.21, "step": 4342 }, { "epoch": 2.056466410180527, "grad_norm": 0.8967043161392212, "learning_rate": 4.746808888090004e-06, "loss": 0.1963, "step": 4343 }, { "epoch": 2.056939923054158, "grad_norm": 1.2309699058532715, "learning_rate": 4.742457959022296e-06, "loss": 0.2024, "step": 4344 }, { "epoch": 2.0574134359277894, "grad_norm": 1.511290431022644, "learning_rate": 4.738108405013988e-06, "loss": 0.2285, "step": 4345 }, { "epoch": 2.0578869488014204, "grad_norm": 2.3464651107788086, "learning_rate": 4.73376022720266e-06, "loss": 0.2228, "step": 4346 }, { "epoch": 2.058360461675052, "grad_norm": 1.3974965810775757, "learning_rate": 4.729413426725546e-06, "loss": 0.2022, "step": 4347 }, { "epoch": 2.058833974548683, "grad_norm": 1.8854213953018188, "learning_rate": 4.725068004719499e-06, "loss": 0.1891, "step": 4348 }, { "epoch": 2.0593074874223145, "grad_norm": 1.3588758707046509, "learning_rate": 4.72072396232103e-06, "loss": 0.2084, "step": 4349 }, { "epoch": 2.0597810002959456, "grad_norm": 1.5690404176712036, "learning_rate": 4.716381300666275e-06, "loss": 0.1923, "step": 4350 }, { "epoch": 2.0602545131695766, "grad_norm": 1.721727967262268, "learning_rate": 4.7120400208910235e-06, "loss": 0.1893, "step": 4351 }, { "epoch": 2.060728026043208, "grad_norm": 1.5722174644470215, "learning_rate": 4.707700124130686e-06, "loss": 0.1797, "step": 4352 }, { "epoch": 2.061201538916839, "grad_norm": 1.4766589403152466, "learning_rate": 4.703361611520331e-06, "loss": 0.2187, "step": 4353 }, { "epoch": 2.0616750517904707, "grad_norm": 1.067948341369629, "learning_rate": 4.699024484194648e-06, "loss": 0.2299, "step": 4354 }, { "epoch": 2.062148564664102, "grad_norm": 1.0318337678909302, "learning_rate": 4.694688743287966e-06, "loss": 0.2086, "step": 4355 }, { "epoch": 2.062622077537733, "grad_norm": 1.2142385244369507, "learning_rate": 4.6903543899342685e-06, "loss": 0.2005, "step": 4356 }, { "epoch": 2.0630955904113644, "grad_norm": 1.0302040576934814, "learning_rate": 4.686021425267152e-06, "loss": 0.2019, "step": 4357 }, { "epoch": 2.0635691032849954, "grad_norm": 0.8725736737251282, "learning_rate": 4.681689850419871e-06, "loss": 0.1938, "step": 4358 }, { "epoch": 2.064042616158627, "grad_norm": 0.9284090995788574, "learning_rate": 4.677359666525299e-06, "loss": 0.1988, "step": 4359 }, { "epoch": 2.064516129032258, "grad_norm": 1.0550601482391357, "learning_rate": 4.673030874715961e-06, "loss": 0.2268, "step": 4360 }, { "epoch": 2.0649896419058895, "grad_norm": 2.1118485927581787, "learning_rate": 4.668703476124005e-06, "loss": 0.2087, "step": 4361 }, { "epoch": 2.0654631547795206, "grad_norm": 1.2798197269439697, "learning_rate": 4.664377471881226e-06, "loss": 0.2028, "step": 4362 }, { "epoch": 2.0659366676531516, "grad_norm": 1.4490396976470947, "learning_rate": 4.660052863119046e-06, "loss": 0.2108, "step": 4363 }, { "epoch": 2.066410180526783, "grad_norm": 1.0242908000946045, "learning_rate": 4.65572965096852e-06, "loss": 0.1924, "step": 4364 }, { "epoch": 2.0668836934004142, "grad_norm": 1.2563238143920898, "learning_rate": 4.651407836560351e-06, "loss": 0.1882, "step": 4365 }, { "epoch": 2.0673572062740457, "grad_norm": 1.3273645639419556, "learning_rate": 4.6470874210248595e-06, "loss": 0.2313, "step": 4366 }, { "epoch": 2.067830719147677, "grad_norm": 2.203810453414917, "learning_rate": 4.642768405492016e-06, "loss": 0.206, "step": 4367 }, { "epoch": 2.068304232021308, "grad_norm": 2.2822744846343994, "learning_rate": 4.63845079109141e-06, "loss": 0.2121, "step": 4368 }, { "epoch": 2.0687777448949394, "grad_norm": 1.70283043384552, "learning_rate": 4.63413457895228e-06, "loss": 0.1918, "step": 4369 }, { "epoch": 2.0692512577685704, "grad_norm": 1.6546306610107422, "learning_rate": 4.629819770203482e-06, "loss": 0.2086, "step": 4370 }, { "epoch": 2.069724770642202, "grad_norm": 1.611094355583191, "learning_rate": 4.625506365973515e-06, "loss": 0.2005, "step": 4371 }, { "epoch": 2.070198283515833, "grad_norm": 1.6032445430755615, "learning_rate": 4.621194367390515e-06, "loss": 0.1965, "step": 4372 }, { "epoch": 2.0706717963894645, "grad_norm": 1.065320372581482, "learning_rate": 4.6168837755822326e-06, "loss": 0.213, "step": 4373 }, { "epoch": 2.0711453092630956, "grad_norm": 1.1174391508102417, "learning_rate": 4.612574591676071e-06, "loss": 0.1863, "step": 4374 }, { "epoch": 2.0716188221367267, "grad_norm": 1.169047236442566, "learning_rate": 4.608266816799049e-06, "loss": 0.2003, "step": 4375 }, { "epoch": 2.072092335010358, "grad_norm": 1.628919005393982, "learning_rate": 4.603960452077828e-06, "loss": 0.21, "step": 4376 }, { "epoch": 2.0725658478839892, "grad_norm": 1.1893731355667114, "learning_rate": 4.599655498638691e-06, "loss": 0.2092, "step": 4377 }, { "epoch": 2.0730393607576207, "grad_norm": 1.3188472986221313, "learning_rate": 4.595351957607564e-06, "loss": 0.2084, "step": 4378 }, { "epoch": 2.073512873631252, "grad_norm": 1.0750354528427124, "learning_rate": 4.5910498301099935e-06, "loss": 0.2228, "step": 4379 }, { "epoch": 2.073986386504883, "grad_norm": 1.0731077194213867, "learning_rate": 4.5867491172711546e-06, "loss": 0.2001, "step": 4380 }, { "epoch": 2.0744598993785144, "grad_norm": 0.9855295419692993, "learning_rate": 4.582449820215865e-06, "loss": 0.2041, "step": 4381 }, { "epoch": 2.0749334122521454, "grad_norm": 1.3802636861801147, "learning_rate": 4.578151940068558e-06, "loss": 0.2164, "step": 4382 }, { "epoch": 2.075406925125777, "grad_norm": 1.4265203475952148, "learning_rate": 4.57385547795331e-06, "loss": 0.2146, "step": 4383 }, { "epoch": 2.075880437999408, "grad_norm": 1.3063853979110718, "learning_rate": 4.569560434993809e-06, "loss": 0.1938, "step": 4384 }, { "epoch": 2.0763539508730395, "grad_norm": 1.087546467781067, "learning_rate": 4.5652668123133925e-06, "loss": 0.1883, "step": 4385 }, { "epoch": 2.0768274637466706, "grad_norm": 1.0859131813049316, "learning_rate": 4.560974611035007e-06, "loss": 0.2257, "step": 4386 }, { "epoch": 2.0773009766203017, "grad_norm": 1.2528398036956787, "learning_rate": 4.556683832281246e-06, "loss": 0.1905, "step": 4387 }, { "epoch": 2.077774489493933, "grad_norm": 1.1462734937667847, "learning_rate": 4.552394477174316e-06, "loss": 0.191, "step": 4388 }, { "epoch": 2.0782480023675642, "grad_norm": 1.486925721168518, "learning_rate": 4.5481065468360515e-06, "loss": 0.1935, "step": 4389 }, { "epoch": 2.0787215152411957, "grad_norm": 1.0987601280212402, "learning_rate": 4.5438200423879285e-06, "loss": 0.213, "step": 4390 }, { "epoch": 2.079195028114827, "grad_norm": 0.965355396270752, "learning_rate": 4.539534964951033e-06, "loss": 0.2278, "step": 4391 }, { "epoch": 2.0796685409884583, "grad_norm": 1.4318578243255615, "learning_rate": 4.535251315646093e-06, "loss": 0.2028, "step": 4392 }, { "epoch": 2.0801420538620894, "grad_norm": 1.0449013710021973, "learning_rate": 4.53096909559345e-06, "loss": 0.2047, "step": 4393 }, { "epoch": 2.0806155667357205, "grad_norm": 2.591721296310425, "learning_rate": 4.526688305913081e-06, "loss": 0.2128, "step": 4394 }, { "epoch": 2.081089079609352, "grad_norm": 1.750844955444336, "learning_rate": 4.52240894772458e-06, "loss": 0.2035, "step": 4395 }, { "epoch": 2.081562592482983, "grad_norm": 1.5162577629089355, "learning_rate": 4.51813102214718e-06, "loss": 0.2092, "step": 4396 }, { "epoch": 2.0820361053566145, "grad_norm": 2.782047986984253, "learning_rate": 4.513854530299723e-06, "loss": 0.1874, "step": 4397 }, { "epoch": 2.0825096182302456, "grad_norm": 2.013601779937744, "learning_rate": 4.50957947330069e-06, "loss": 0.2174, "step": 4398 }, { "epoch": 2.0829831311038767, "grad_norm": 1.1781798601150513, "learning_rate": 4.50530585226818e-06, "loss": 0.1953, "step": 4399 }, { "epoch": 2.083456643977508, "grad_norm": 1.7047263383865356, "learning_rate": 4.501033668319913e-06, "loss": 0.202, "step": 4400 }, { "epoch": 2.0839301568511392, "grad_norm": 1.3971747159957886, "learning_rate": 4.496762922573244e-06, "loss": 0.2062, "step": 4401 }, { "epoch": 2.0844036697247708, "grad_norm": 1.4985466003417969, "learning_rate": 4.492493616145137e-06, "loss": 0.1795, "step": 4402 }, { "epoch": 2.084877182598402, "grad_norm": 1.2991846799850464, "learning_rate": 4.4882257501521975e-06, "loss": 0.1858, "step": 4403 }, { "epoch": 2.0853506954720333, "grad_norm": 1.194708228111267, "learning_rate": 4.483959325710636e-06, "loss": 0.1944, "step": 4404 }, { "epoch": 2.0858242083456644, "grad_norm": 1.1843652725219727, "learning_rate": 4.479694343936303e-06, "loss": 0.2054, "step": 4405 }, { "epoch": 2.0862977212192955, "grad_norm": 1.1956802606582642, "learning_rate": 4.4754308059446546e-06, "loss": 0.2074, "step": 4406 }, { "epoch": 2.086771234092927, "grad_norm": 1.1742326021194458, "learning_rate": 4.471168712850787e-06, "loss": 0.2047, "step": 4407 }, { "epoch": 2.087244746966558, "grad_norm": 1.7787115573883057, "learning_rate": 4.466908065769404e-06, "loss": 0.2243, "step": 4408 }, { "epoch": 2.0877182598401895, "grad_norm": 1.4370203018188477, "learning_rate": 4.46264886581483e-06, "loss": 0.25, "step": 4409 }, { "epoch": 2.0881917727138206, "grad_norm": 1.0106841325759888, "learning_rate": 4.458391114101034e-06, "loss": 0.2019, "step": 4410 }, { "epoch": 2.088665285587452, "grad_norm": 0.9871804714202881, "learning_rate": 4.454134811741577e-06, "loss": 0.1807, "step": 4411 }, { "epoch": 2.089138798461083, "grad_norm": 1.2592934370040894, "learning_rate": 4.449879959849662e-06, "loss": 0.215, "step": 4412 }, { "epoch": 2.0896123113347143, "grad_norm": 1.557584285736084, "learning_rate": 4.445626559538101e-06, "loss": 0.1972, "step": 4413 }, { "epoch": 2.0900858242083458, "grad_norm": 1.67927885055542, "learning_rate": 4.4413746119193245e-06, "loss": 0.1969, "step": 4414 }, { "epoch": 2.090559337081977, "grad_norm": 1.2148725986480713, "learning_rate": 4.437124118105397e-06, "loss": 0.2133, "step": 4415 }, { "epoch": 2.0910328499556083, "grad_norm": 1.0764367580413818, "learning_rate": 4.4328750792079875e-06, "loss": 0.2014, "step": 4416 }, { "epoch": 2.0915063628292394, "grad_norm": 0.9037055373191833, "learning_rate": 4.428627496338398e-06, "loss": 0.1938, "step": 4417 }, { "epoch": 2.0919798757028705, "grad_norm": 1.0375968217849731, "learning_rate": 4.424381370607535e-06, "loss": 0.2043, "step": 4418 }, { "epoch": 2.092453388576502, "grad_norm": 1.5051581859588623, "learning_rate": 4.420136703125938e-06, "loss": 0.2058, "step": 4419 }, { "epoch": 2.092926901450133, "grad_norm": 1.5662999153137207, "learning_rate": 4.415893495003753e-06, "loss": 0.2044, "step": 4420 }, { "epoch": 2.0934004143237646, "grad_norm": 1.0691457986831665, "learning_rate": 4.411651747350758e-06, "loss": 0.197, "step": 4421 }, { "epoch": 2.0938739271973956, "grad_norm": 1.773221731185913, "learning_rate": 4.407411461276333e-06, "loss": 0.1969, "step": 4422 }, { "epoch": 2.094347440071027, "grad_norm": 1.03744637966156, "learning_rate": 4.4031726378894915e-06, "loss": 0.2462, "step": 4423 }, { "epoch": 2.094820952944658, "grad_norm": 1.8114067316055298, "learning_rate": 4.3989352782988525e-06, "loss": 0.1918, "step": 4424 }, { "epoch": 2.0952944658182893, "grad_norm": 0.8494598269462585, "learning_rate": 4.394699383612653e-06, "loss": 0.1993, "step": 4425 }, { "epoch": 2.0957679786919208, "grad_norm": 1.0043613910675049, "learning_rate": 4.390464954938759e-06, "loss": 0.202, "step": 4426 }, { "epoch": 2.096241491565552, "grad_norm": 1.178818941116333, "learning_rate": 4.386231993384635e-06, "loss": 0.1917, "step": 4427 }, { "epoch": 2.0967150044391833, "grad_norm": 1.1056585311889648, "learning_rate": 4.382000500057381e-06, "loss": 0.1876, "step": 4428 }, { "epoch": 2.0971885173128144, "grad_norm": 1.0591120719909668, "learning_rate": 4.377770476063694e-06, "loss": 0.2058, "step": 4429 }, { "epoch": 2.0976620301864455, "grad_norm": 1.0878396034240723, "learning_rate": 4.373541922509905e-06, "loss": 0.1971, "step": 4430 }, { "epoch": 2.098135543060077, "grad_norm": 1.087065577507019, "learning_rate": 4.369314840501943e-06, "loss": 0.2182, "step": 4431 }, { "epoch": 2.098609055933708, "grad_norm": 1.2058556079864502, "learning_rate": 4.365089231145367e-06, "loss": 0.203, "step": 4432 }, { "epoch": 2.0990825688073396, "grad_norm": 1.8395373821258545, "learning_rate": 4.360865095545343e-06, "loss": 0.1915, "step": 4433 }, { "epoch": 2.0995560816809706, "grad_norm": 1.0700697898864746, "learning_rate": 4.356642434806646e-06, "loss": 0.2025, "step": 4434 }, { "epoch": 2.100029594554602, "grad_norm": 1.068076491355896, "learning_rate": 4.352421250033683e-06, "loss": 0.1822, "step": 4435 }, { "epoch": 2.100503107428233, "grad_norm": 1.1136810779571533, "learning_rate": 4.348201542330455e-06, "loss": 0.1865, "step": 4436 }, { "epoch": 2.1009766203018643, "grad_norm": 1.1286970376968384, "learning_rate": 4.3439833128005925e-06, "loss": 0.1867, "step": 4437 }, { "epoch": 2.101450133175496, "grad_norm": 1.4740508794784546, "learning_rate": 4.339766562547326e-06, "loss": 0.2043, "step": 4438 }, { "epoch": 2.101923646049127, "grad_norm": 0.8717254996299744, "learning_rate": 4.335551292673515e-06, "loss": 0.1854, "step": 4439 }, { "epoch": 2.1023971589227584, "grad_norm": 1.1837518215179443, "learning_rate": 4.331337504281613e-06, "loss": 0.183, "step": 4440 }, { "epoch": 2.1028706717963894, "grad_norm": 1.649964690208435, "learning_rate": 4.327125198473704e-06, "loss": 0.2153, "step": 4441 }, { "epoch": 2.1033441846700205, "grad_norm": 1.4881905317306519, "learning_rate": 4.322914376351472e-06, "loss": 0.2027, "step": 4442 }, { "epoch": 2.103817697543652, "grad_norm": 1.6832579374313354, "learning_rate": 4.318705039016215e-06, "loss": 0.2197, "step": 4443 }, { "epoch": 2.104291210417283, "grad_norm": 1.136296033859253, "learning_rate": 4.314497187568848e-06, "loss": 0.2201, "step": 4444 }, { "epoch": 2.1047647232909146, "grad_norm": 0.9659789800643921, "learning_rate": 4.310290823109889e-06, "loss": 0.1773, "step": 4445 }, { "epoch": 2.1052382361645456, "grad_norm": 1.2847449779510498, "learning_rate": 4.306085946739481e-06, "loss": 0.1805, "step": 4446 }, { "epoch": 2.105711749038177, "grad_norm": 1.6592435836791992, "learning_rate": 4.301882559557359e-06, "loss": 0.1944, "step": 4447 }, { "epoch": 2.106185261911808, "grad_norm": 1.5656230449676514, "learning_rate": 4.297680662662882e-06, "loss": 0.2252, "step": 4448 }, { "epoch": 2.1066587747854393, "grad_norm": 1.2571629285812378, "learning_rate": 4.293480257155022e-06, "loss": 0.224, "step": 4449 }, { "epoch": 2.107132287659071, "grad_norm": 1.8102307319641113, "learning_rate": 4.289281344132344e-06, "loss": 0.1945, "step": 4450 }, { "epoch": 2.107605800532702, "grad_norm": 1.113694190979004, "learning_rate": 4.285083924693041e-06, "loss": 0.1983, "step": 4451 }, { "epoch": 2.1080793134063334, "grad_norm": 1.0855345726013184, "learning_rate": 4.280887999934902e-06, "loss": 0.2136, "step": 4452 }, { "epoch": 2.1085528262799644, "grad_norm": 1.6805202960968018, "learning_rate": 4.276693570955337e-06, "loss": 0.1893, "step": 4453 }, { "epoch": 2.109026339153596, "grad_norm": 1.4205180406570435, "learning_rate": 4.272500638851351e-06, "loss": 0.2042, "step": 4454 }, { "epoch": 2.109499852027227, "grad_norm": 1.77024245262146, "learning_rate": 4.2683092047195725e-06, "loss": 0.2161, "step": 4455 }, { "epoch": 2.109973364900858, "grad_norm": 1.2872737646102905, "learning_rate": 4.264119269656224e-06, "loss": 0.199, "step": 4456 }, { "epoch": 2.1104468777744896, "grad_norm": 1.008962631225586, "learning_rate": 4.259930834757149e-06, "loss": 0.2022, "step": 4457 }, { "epoch": 2.1109203906481206, "grad_norm": 1.3520052433013916, "learning_rate": 4.255743901117788e-06, "loss": 0.2068, "step": 4458 }, { "epoch": 2.111393903521752, "grad_norm": 1.200093388557434, "learning_rate": 4.25155846983319e-06, "loss": 0.2165, "step": 4459 }, { "epoch": 2.1118674163953832, "grad_norm": 0.9952120780944824, "learning_rate": 4.247374541998022e-06, "loss": 0.2021, "step": 4460 }, { "epoch": 2.1123409292690143, "grad_norm": 2.106011390686035, "learning_rate": 4.243192118706543e-06, "loss": 0.1916, "step": 4461 }, { "epoch": 2.112814442142646, "grad_norm": 1.3868449926376343, "learning_rate": 4.239011201052631e-06, "loss": 0.2154, "step": 4462 }, { "epoch": 2.113287955016277, "grad_norm": 1.1115097999572754, "learning_rate": 4.234831790129759e-06, "loss": 0.2044, "step": 4463 }, { "epoch": 2.1137614678899084, "grad_norm": 1.1005358695983887, "learning_rate": 4.2306538870310185e-06, "loss": 0.2041, "step": 4464 }, { "epoch": 2.1142349807635394, "grad_norm": 1.5084381103515625, "learning_rate": 4.226477492849092e-06, "loss": 0.2113, "step": 4465 }, { "epoch": 2.114708493637171, "grad_norm": 1.4259034395217896, "learning_rate": 4.222302608676283e-06, "loss": 0.2041, "step": 4466 }, { "epoch": 2.115182006510802, "grad_norm": 1.2917234897613525, "learning_rate": 4.218129235604488e-06, "loss": 0.2141, "step": 4467 }, { "epoch": 2.115655519384433, "grad_norm": 0.9340530633926392, "learning_rate": 4.21395737472521e-06, "loss": 0.1878, "step": 4468 }, { "epoch": 2.1161290322580646, "grad_norm": 1.0747179985046387, "learning_rate": 4.209787027129563e-06, "loss": 0.2098, "step": 4469 }, { "epoch": 2.1166025451316957, "grad_norm": 1.3655407428741455, "learning_rate": 4.2056181939082584e-06, "loss": 0.1985, "step": 4470 }, { "epoch": 2.117076058005327, "grad_norm": 1.1653110980987549, "learning_rate": 4.201450876151619e-06, "loss": 0.2072, "step": 4471 }, { "epoch": 2.1175495708789582, "grad_norm": 1.0803667306900024, "learning_rate": 4.19728507494956e-06, "loss": 0.2158, "step": 4472 }, { "epoch": 2.1180230837525897, "grad_norm": 1.5151382684707642, "learning_rate": 4.193120791391612e-06, "loss": 0.1998, "step": 4473 }, { "epoch": 2.118496596626221, "grad_norm": 1.5851473808288574, "learning_rate": 4.1889580265669e-06, "loss": 0.199, "step": 4474 }, { "epoch": 2.118970109499852, "grad_norm": 1.5319643020629883, "learning_rate": 4.184796781564158e-06, "loss": 0.1875, "step": 4475 }, { "epoch": 2.1194436223734834, "grad_norm": 1.3403736352920532, "learning_rate": 4.180637057471714e-06, "loss": 0.1847, "step": 4476 }, { "epoch": 2.1199171352471144, "grad_norm": 1.2243744134902954, "learning_rate": 4.1764788553775105e-06, "loss": 0.1827, "step": 4477 }, { "epoch": 2.120390648120746, "grad_norm": 0.9514744877815247, "learning_rate": 4.1723221763690826e-06, "loss": 0.1932, "step": 4478 }, { "epoch": 2.120864160994377, "grad_norm": 0.9540281295776367, "learning_rate": 4.1681670215335646e-06, "loss": 0.1746, "step": 4479 }, { "epoch": 2.121337673868008, "grad_norm": 1.6678895950317383, "learning_rate": 4.1640133919577065e-06, "loss": 0.2004, "step": 4480 }, { "epoch": 2.1218111867416396, "grad_norm": 1.186750054359436, "learning_rate": 4.15986128872784e-06, "loss": 0.2015, "step": 4481 }, { "epoch": 2.1222846996152707, "grad_norm": 1.694643259048462, "learning_rate": 4.155710712929916e-06, "loss": 0.1983, "step": 4482 }, { "epoch": 2.122758212488902, "grad_norm": 1.0623250007629395, "learning_rate": 4.151561665649471e-06, "loss": 0.2111, "step": 4483 }, { "epoch": 2.1232317253625332, "grad_norm": 1.233841061592102, "learning_rate": 4.147414147971655e-06, "loss": 0.2121, "step": 4484 }, { "epoch": 2.1237052382361647, "grad_norm": 1.1146776676177979, "learning_rate": 4.143268160981204e-06, "loss": 0.1958, "step": 4485 }, { "epoch": 2.124178751109796, "grad_norm": 1.02981436252594, "learning_rate": 4.139123705762469e-06, "loss": 0.1988, "step": 4486 }, { "epoch": 2.124652263983427, "grad_norm": 1.6232107877731323, "learning_rate": 4.134980783399384e-06, "loss": 0.2104, "step": 4487 }, { "epoch": 2.1251257768570584, "grad_norm": 1.1073497533798218, "learning_rate": 4.130839394975493e-06, "loss": 0.1978, "step": 4488 }, { "epoch": 2.1255992897306895, "grad_norm": 1.430740475654602, "learning_rate": 4.126699541573943e-06, "loss": 0.1999, "step": 4489 }, { "epoch": 2.126072802604321, "grad_norm": 1.1626049280166626, "learning_rate": 4.122561224277463e-06, "loss": 0.1954, "step": 4490 }, { "epoch": 2.126546315477952, "grad_norm": 0.9593620300292969, "learning_rate": 4.1184244441683965e-06, "loss": 0.1728, "step": 4491 }, { "epoch": 2.1270198283515835, "grad_norm": 1.0123560428619385, "learning_rate": 4.114289202328678e-06, "loss": 0.2072, "step": 4492 }, { "epoch": 2.1274933412252146, "grad_norm": 1.1748878955841064, "learning_rate": 4.110155499839833e-06, "loss": 0.2059, "step": 4493 }, { "epoch": 2.1279668540988457, "grad_norm": 1.172324538230896, "learning_rate": 4.106023337783e-06, "loss": 0.1993, "step": 4494 }, { "epoch": 2.128440366972477, "grad_norm": 1.9978069067001343, "learning_rate": 4.1018927172389e-06, "loss": 0.2031, "step": 4495 }, { "epoch": 2.1289138798461082, "grad_norm": 1.4836291074752808, "learning_rate": 4.097763639287864e-06, "loss": 0.2432, "step": 4496 }, { "epoch": 2.1293873927197398, "grad_norm": 1.3724143505096436, "learning_rate": 4.093636105009804e-06, "loss": 0.2108, "step": 4497 }, { "epoch": 2.129860905593371, "grad_norm": 1.8259536027908325, "learning_rate": 4.0895101154842444e-06, "loss": 0.1977, "step": 4498 }, { "epoch": 2.130334418467002, "grad_norm": 1.0505609512329102, "learning_rate": 4.08538567179029e-06, "loss": 0.1911, "step": 4499 }, { "epoch": 2.1308079313406334, "grad_norm": 1.6359678506851196, "learning_rate": 4.081262775006659e-06, "loss": 0.2034, "step": 4500 }, { "epoch": 2.1312814442142645, "grad_norm": 1.0886156558990479, "learning_rate": 4.077141426211647e-06, "loss": 0.2053, "step": 4501 }, { "epoch": 2.131754957087896, "grad_norm": 1.12444269657135, "learning_rate": 4.073021626483159e-06, "loss": 0.2192, "step": 4502 }, { "epoch": 2.132228469961527, "grad_norm": 0.8436000347137451, "learning_rate": 4.0689033768986855e-06, "loss": 0.2068, "step": 4503 }, { "epoch": 2.132701982835158, "grad_norm": 1.2649866342544556, "learning_rate": 4.064786678535313e-06, "loss": 0.2072, "step": 4504 }, { "epoch": 2.1331754957087896, "grad_norm": 1.0428705215454102, "learning_rate": 4.0606715324697285e-06, "loss": 0.2224, "step": 4505 }, { "epoch": 2.1336490085824207, "grad_norm": 1.1856803894042969, "learning_rate": 4.056557939778205e-06, "loss": 0.1947, "step": 4506 }, { "epoch": 2.134122521456052, "grad_norm": 2.003635883331299, "learning_rate": 4.052445901536618e-06, "loss": 0.2234, "step": 4507 }, { "epoch": 2.1345960343296833, "grad_norm": 1.0833152532577515, "learning_rate": 4.048335418820425e-06, "loss": 0.1878, "step": 4508 }, { "epoch": 2.1350695472033148, "grad_norm": 1.1753066778182983, "learning_rate": 4.04422649270469e-06, "loss": 0.1935, "step": 4509 }, { "epoch": 2.135543060076946, "grad_norm": 1.2887529134750366, "learning_rate": 4.040119124264056e-06, "loss": 0.21, "step": 4510 }, { "epoch": 2.136016572950577, "grad_norm": 1.0373543500900269, "learning_rate": 4.036013314572772e-06, "loss": 0.2036, "step": 4511 }, { "epoch": 2.1364900858242084, "grad_norm": 1.2322744131088257, "learning_rate": 4.0319090647046714e-06, "loss": 0.2086, "step": 4512 }, { "epoch": 2.1369635986978395, "grad_norm": 1.1571698188781738, "learning_rate": 4.0278063757331745e-06, "loss": 0.1731, "step": 4513 }, { "epoch": 2.137437111571471, "grad_norm": 1.0368672609329224, "learning_rate": 4.0237052487313084e-06, "loss": 0.204, "step": 4514 }, { "epoch": 2.137910624445102, "grad_norm": 1.494619369506836, "learning_rate": 4.0196056847716795e-06, "loss": 0.2129, "step": 4515 }, { "epoch": 2.1383841373187336, "grad_norm": 1.6594982147216797, "learning_rate": 4.015507684926491e-06, "loss": 0.1958, "step": 4516 }, { "epoch": 2.1388576501923646, "grad_norm": 1.1065243482589722, "learning_rate": 4.0114112502675305e-06, "loss": 0.1961, "step": 4517 }, { "epoch": 2.1393311630659957, "grad_norm": 1.0827975273132324, "learning_rate": 4.007316381866188e-06, "loss": 0.2111, "step": 4518 }, { "epoch": 2.139804675939627, "grad_norm": 1.3087358474731445, "learning_rate": 4.003223080793432e-06, "loss": 0.2031, "step": 4519 }, { "epoch": 2.1402781888132583, "grad_norm": 1.1883478164672852, "learning_rate": 3.999131348119829e-06, "loss": 0.198, "step": 4520 }, { "epoch": 2.1407517016868898, "grad_norm": 1.2664716243743896, "learning_rate": 3.995041184915531e-06, "loss": 0.2151, "step": 4521 }, { "epoch": 2.141225214560521, "grad_norm": 1.1769858598709106, "learning_rate": 3.990952592250277e-06, "loss": 0.2112, "step": 4522 }, { "epoch": 2.141698727434152, "grad_norm": 1.0426677465438843, "learning_rate": 3.986865571193404e-06, "loss": 0.1892, "step": 4523 }, { "epoch": 2.1421722403077834, "grad_norm": 1.4847642183303833, "learning_rate": 3.98278012281383e-06, "loss": 0.209, "step": 4524 }, { "epoch": 2.1426457531814145, "grad_norm": 1.9673742055892944, "learning_rate": 3.978696248180069e-06, "loss": 0.2053, "step": 4525 }, { "epoch": 2.143119266055046, "grad_norm": 1.1002095937728882, "learning_rate": 3.9746139483602095e-06, "loss": 0.1944, "step": 4526 }, { "epoch": 2.143592778928677, "grad_norm": 1.0705955028533936, "learning_rate": 3.970533224421947e-06, "loss": 0.1889, "step": 4527 }, { "epoch": 2.1440662918023086, "grad_norm": 1.1432850360870361, "learning_rate": 3.9664540774325545e-06, "loss": 0.1978, "step": 4528 }, { "epoch": 2.1445398046759396, "grad_norm": 1.0200495719909668, "learning_rate": 3.962376508458887e-06, "loss": 0.2121, "step": 4529 }, { "epoch": 2.1450133175495707, "grad_norm": 1.4120169878005981, "learning_rate": 3.958300518567403e-06, "loss": 0.2158, "step": 4530 }, { "epoch": 2.145486830423202, "grad_norm": 1.242547631263733, "learning_rate": 3.954226108824129e-06, "loss": 0.2045, "step": 4531 }, { "epoch": 2.1459603432968333, "grad_norm": 1.3954753875732422, "learning_rate": 3.950153280294695e-06, "loss": 0.1992, "step": 4532 }, { "epoch": 2.146433856170465, "grad_norm": 1.1535682678222656, "learning_rate": 3.946082034044303e-06, "loss": 0.1887, "step": 4533 }, { "epoch": 2.146907369044096, "grad_norm": 1.6597424745559692, "learning_rate": 3.942012371137755e-06, "loss": 0.206, "step": 4534 }, { "epoch": 2.1473808819177274, "grad_norm": 1.2386748790740967, "learning_rate": 3.937944292639426e-06, "loss": 0.2018, "step": 4535 }, { "epoch": 2.1478543947913584, "grad_norm": 1.133917212486267, "learning_rate": 3.9338777996132885e-06, "loss": 0.2068, "step": 4536 }, { "epoch": 2.1483279076649895, "grad_norm": 1.2553430795669556, "learning_rate": 3.929812893122892e-06, "loss": 0.2355, "step": 4537 }, { "epoch": 2.148801420538621, "grad_norm": 1.0734045505523682, "learning_rate": 3.9257495742313704e-06, "loss": 0.2102, "step": 4538 }, { "epoch": 2.149274933412252, "grad_norm": 1.530061960220337, "learning_rate": 3.9216878440014506e-06, "loss": 0.2079, "step": 4539 }, { "epoch": 2.1497484462858836, "grad_norm": 1.4186937808990479, "learning_rate": 3.917627703495434e-06, "loss": 0.1909, "step": 4540 }, { "epoch": 2.1502219591595146, "grad_norm": 0.925179123878479, "learning_rate": 3.913569153775216e-06, "loss": 0.1862, "step": 4541 }, { "epoch": 2.1506954720331457, "grad_norm": 1.8251923322677612, "learning_rate": 3.909512195902266e-06, "loss": 0.1955, "step": 4542 }, { "epoch": 2.151168984906777, "grad_norm": 1.1590983867645264, "learning_rate": 3.905456830937651e-06, "loss": 0.1839, "step": 4543 }, { "epoch": 2.1516424977804083, "grad_norm": 1.3032938241958618, "learning_rate": 3.901403059942e-06, "loss": 0.1992, "step": 4544 }, { "epoch": 2.15211601065404, "grad_norm": 1.323080062866211, "learning_rate": 3.897350883975551e-06, "loss": 0.1924, "step": 4545 }, { "epoch": 2.152589523527671, "grad_norm": 1.6115025281906128, "learning_rate": 3.893300304098102e-06, "loss": 0.1954, "step": 4546 }, { "epoch": 2.1530630364013024, "grad_norm": 1.2296570539474487, "learning_rate": 3.889251321369044e-06, "loss": 0.2111, "step": 4547 }, { "epoch": 2.1535365492749334, "grad_norm": 1.6636604070663452, "learning_rate": 3.885203936847355e-06, "loss": 0.2111, "step": 4548 }, { "epoch": 2.1540100621485645, "grad_norm": 1.3433752059936523, "learning_rate": 3.881158151591583e-06, "loss": 0.227, "step": 4549 }, { "epoch": 2.154483575022196, "grad_norm": 1.0224047899246216, "learning_rate": 3.877113966659869e-06, "loss": 0.1997, "step": 4550 }, { "epoch": 2.154957087895827, "grad_norm": 1.1871498823165894, "learning_rate": 3.8730713831099265e-06, "loss": 0.2111, "step": 4551 }, { "epoch": 2.1554306007694586, "grad_norm": 1.0613186359405518, "learning_rate": 3.869030401999059e-06, "loss": 0.2026, "step": 4552 }, { "epoch": 2.1559041136430896, "grad_norm": 1.2302757501602173, "learning_rate": 3.8649910243841395e-06, "loss": 0.212, "step": 4553 }, { "epoch": 2.156377626516721, "grad_norm": 0.9007354974746704, "learning_rate": 3.860953251321635e-06, "loss": 0.1887, "step": 4554 }, { "epoch": 2.1568511393903522, "grad_norm": 1.7830326557159424, "learning_rate": 3.856917083867581e-06, "loss": 0.1895, "step": 4555 }, { "epoch": 2.1573246522639833, "grad_norm": 1.7577792406082153, "learning_rate": 3.852882523077604e-06, "loss": 0.222, "step": 4556 }, { "epoch": 2.157798165137615, "grad_norm": 1.0963820219039917, "learning_rate": 3.8488495700068994e-06, "loss": 0.2307, "step": 4557 }, { "epoch": 2.158271678011246, "grad_norm": 1.826006531715393, "learning_rate": 3.844818225710246e-06, "loss": 0.1977, "step": 4558 }, { "epoch": 2.1587451908848774, "grad_norm": 1.689234972000122, "learning_rate": 3.840788491242009e-06, "loss": 0.204, "step": 4559 }, { "epoch": 2.1592187037585084, "grad_norm": 1.8260334730148315, "learning_rate": 3.8367603676561195e-06, "loss": 0.2045, "step": 4560 }, { "epoch": 2.1596922166321395, "grad_norm": 1.4265202283859253, "learning_rate": 3.832733856006103e-06, "loss": 0.2122, "step": 4561 }, { "epoch": 2.160165729505771, "grad_norm": 1.1697028875350952, "learning_rate": 3.8287089573450444e-06, "loss": 0.1932, "step": 4562 }, { "epoch": 2.160639242379402, "grad_norm": 1.2971158027648926, "learning_rate": 3.824685672725626e-06, "loss": 0.1934, "step": 4563 }, { "epoch": 2.1611127552530336, "grad_norm": 0.9162940382957458, "learning_rate": 3.820664003200092e-06, "loss": 0.2034, "step": 4564 }, { "epoch": 2.1615862681266647, "grad_norm": 2.122776985168457, "learning_rate": 3.816643949820275e-06, "loss": 0.2026, "step": 4565 }, { "epoch": 2.1620597810002957, "grad_norm": 1.0466620922088623, "learning_rate": 3.8126255136375857e-06, "loss": 0.2109, "step": 4566 }, { "epoch": 2.1625332938739272, "grad_norm": 0.9423272013664246, "learning_rate": 3.808608695702999e-06, "loss": 0.1851, "step": 4567 }, { "epoch": 2.1630068067475583, "grad_norm": 1.181909203529358, "learning_rate": 3.8045934970670808e-06, "loss": 0.2037, "step": 4568 }, { "epoch": 2.16348031962119, "grad_norm": 1.0272080898284912, "learning_rate": 3.800579918779963e-06, "loss": 0.1763, "step": 4569 }, { "epoch": 2.163953832494821, "grad_norm": 1.413796305656433, "learning_rate": 3.796567961891363e-06, "loss": 0.1911, "step": 4570 }, { "epoch": 2.1644273453684524, "grad_norm": 1.3752124309539795, "learning_rate": 3.792557627450568e-06, "loss": 0.193, "step": 4571 }, { "epoch": 2.1649008582420834, "grad_norm": 1.488099217414856, "learning_rate": 3.788548916506437e-06, "loss": 0.1938, "step": 4572 }, { "epoch": 2.1653743711157145, "grad_norm": 1.0083427429199219, "learning_rate": 3.7845418301074176e-06, "loss": 0.2191, "step": 4573 }, { "epoch": 2.165847883989346, "grad_norm": 1.5522511005401611, "learning_rate": 3.7805363693015172e-06, "loss": 0.1947, "step": 4574 }, { "epoch": 2.166321396862977, "grad_norm": 1.5290082693099976, "learning_rate": 3.7765325351363335e-06, "loss": 0.1975, "step": 4575 }, { "epoch": 2.1667949097366086, "grad_norm": 0.8700765371322632, "learning_rate": 3.772530328659023e-06, "loss": 0.2067, "step": 4576 }, { "epoch": 2.1672684226102397, "grad_norm": 0.9798673987388611, "learning_rate": 3.7685297509163297e-06, "loss": 0.2168, "step": 4577 }, { "epoch": 2.167741935483871, "grad_norm": 1.7563843727111816, "learning_rate": 3.7645308029545623e-06, "loss": 0.1976, "step": 4578 }, { "epoch": 2.1682154483575022, "grad_norm": 1.0324368476867676, "learning_rate": 3.7605334858196107e-06, "loss": 0.204, "step": 4579 }, { "epoch": 2.1686889612311333, "grad_norm": 1.488629937171936, "learning_rate": 3.75653780055693e-06, "loss": 0.2303, "step": 4580 }, { "epoch": 2.169162474104765, "grad_norm": 1.2473211288452148, "learning_rate": 3.752543748211559e-06, "loss": 0.2015, "step": 4581 }, { "epoch": 2.169635986978396, "grad_norm": 1.4931015968322754, "learning_rate": 3.748551329828101e-06, "loss": 0.2084, "step": 4582 }, { "epoch": 2.1701094998520274, "grad_norm": 1.2696232795715332, "learning_rate": 3.7445605464507295e-06, "loss": 0.2173, "step": 4583 }, { "epoch": 2.1705830127256585, "grad_norm": 0.9876007437705994, "learning_rate": 3.740571399123204e-06, "loss": 0.2009, "step": 4584 }, { "epoch": 2.1710565255992895, "grad_norm": 1.6325695514678955, "learning_rate": 3.7365838888888395e-06, "loss": 0.2295, "step": 4585 }, { "epoch": 2.171530038472921, "grad_norm": 1.6730865240097046, "learning_rate": 3.732598016790537e-06, "loss": 0.2208, "step": 4586 }, { "epoch": 2.172003551346552, "grad_norm": 1.2315953969955444, "learning_rate": 3.728613783870759e-06, "loss": 0.1985, "step": 4587 }, { "epoch": 2.1724770642201836, "grad_norm": 1.5150845050811768, "learning_rate": 3.724631191171547e-06, "loss": 0.194, "step": 4588 }, { "epoch": 2.1729505770938147, "grad_norm": 1.2721232175827026, "learning_rate": 3.7206502397345044e-06, "loss": 0.1883, "step": 4589 }, { "epoch": 2.173424089967446, "grad_norm": 1.0581114292144775, "learning_rate": 3.7166709306008163e-06, "loss": 0.2121, "step": 4590 }, { "epoch": 2.1738976028410772, "grad_norm": 0.9920486211776733, "learning_rate": 3.712693264811231e-06, "loss": 0.192, "step": 4591 }, { "epoch": 2.1743711157147083, "grad_norm": 1.3474196195602417, "learning_rate": 3.7087172434060635e-06, "loss": 0.2062, "step": 4592 }, { "epoch": 2.17484462858834, "grad_norm": 1.098014235496521, "learning_rate": 3.704742867425212e-06, "loss": 0.2018, "step": 4593 }, { "epoch": 2.175318141461971, "grad_norm": 1.0982664823532104, "learning_rate": 3.7007701379081275e-06, "loss": 0.204, "step": 4594 }, { "epoch": 2.1757916543356024, "grad_norm": 1.2308850288391113, "learning_rate": 3.6967990558938484e-06, "loss": 0.2105, "step": 4595 }, { "epoch": 2.1762651672092335, "grad_norm": 1.5977511405944824, "learning_rate": 3.6928296224209636e-06, "loss": 0.2215, "step": 4596 }, { "epoch": 2.176738680082865, "grad_norm": 1.0719974040985107, "learning_rate": 3.688861838527649e-06, "loss": 0.196, "step": 4597 }, { "epoch": 2.177212192956496, "grad_norm": 1.4752261638641357, "learning_rate": 3.684895705251632e-06, "loss": 0.1891, "step": 4598 }, { "epoch": 2.177685705830127, "grad_norm": 1.9501125812530518, "learning_rate": 3.6809312236302243e-06, "loss": 0.208, "step": 4599 }, { "epoch": 2.1781592187037586, "grad_norm": 1.51497483253479, "learning_rate": 3.6769683947002934e-06, "loss": 0.2276, "step": 4600 }, { "epoch": 2.1786327315773897, "grad_norm": 1.680430293083191, "learning_rate": 3.673007219498276e-06, "loss": 0.1995, "step": 4601 }, { "epoch": 2.179106244451021, "grad_norm": 1.073860764503479, "learning_rate": 3.6690476990601866e-06, "loss": 0.217, "step": 4602 }, { "epoch": 2.1795797573246523, "grad_norm": 1.1040340662002563, "learning_rate": 3.665089834421588e-06, "loss": 0.2057, "step": 4603 }, { "epoch": 2.1800532701982833, "grad_norm": 1.3993034362792969, "learning_rate": 3.661133626617638e-06, "loss": 0.2013, "step": 4604 }, { "epoch": 2.180526783071915, "grad_norm": 1.149754524230957, "learning_rate": 3.657179076683034e-06, "loss": 0.1923, "step": 4605 }, { "epoch": 2.181000295945546, "grad_norm": 1.6532198190689087, "learning_rate": 3.653226185652049e-06, "loss": 0.1997, "step": 4606 }, { "epoch": 2.1814738088191774, "grad_norm": 1.3920501470565796, "learning_rate": 3.6492749545585313e-06, "loss": 0.2051, "step": 4607 }, { "epoch": 2.1819473216928085, "grad_norm": 1.0357420444488525, "learning_rate": 3.6453253844358783e-06, "loss": 0.2199, "step": 4608 }, { "epoch": 2.18242083456644, "grad_norm": 1.4226856231689453, "learning_rate": 3.6413774763170707e-06, "loss": 0.2089, "step": 4609 }, { "epoch": 2.182894347440071, "grad_norm": 1.0830601453781128, "learning_rate": 3.637431231234637e-06, "loss": 0.2125, "step": 4610 }, { "epoch": 2.183367860313702, "grad_norm": 1.3604761362075806, "learning_rate": 3.6334866502206877e-06, "loss": 0.203, "step": 4611 }, { "epoch": 2.1838413731873336, "grad_norm": 1.3618247509002686, "learning_rate": 3.6295437343068828e-06, "loss": 0.1979, "step": 4612 }, { "epoch": 2.1843148860609647, "grad_norm": 1.374878168106079, "learning_rate": 3.625602484524461e-06, "loss": 0.2205, "step": 4613 }, { "epoch": 2.184788398934596, "grad_norm": 0.9489659667015076, "learning_rate": 3.6216629019042106e-06, "loss": 0.1847, "step": 4614 }, { "epoch": 2.1852619118082273, "grad_norm": 1.074998378753662, "learning_rate": 3.6177249874764986e-06, "loss": 0.1974, "step": 4615 }, { "epoch": 2.1857354246818583, "grad_norm": 1.1851049661636353, "learning_rate": 3.613788742271246e-06, "loss": 0.2032, "step": 4616 }, { "epoch": 2.18620893755549, "grad_norm": 0.9041722416877747, "learning_rate": 3.6098541673179353e-06, "loss": 0.2215, "step": 4617 }, { "epoch": 2.186682450429121, "grad_norm": 1.1576639413833618, "learning_rate": 3.605921263645623e-06, "loss": 0.21, "step": 4618 }, { "epoch": 2.1871559633027524, "grad_norm": 1.2321959733963013, "learning_rate": 3.6019900322829173e-06, "loss": 0.2291, "step": 4619 }, { "epoch": 2.1876294761763835, "grad_norm": 1.1647976636886597, "learning_rate": 3.5980604742579985e-06, "loss": 0.1897, "step": 4620 }, { "epoch": 2.188102989050015, "grad_norm": 1.6139419078826904, "learning_rate": 3.594132590598599e-06, "loss": 0.2243, "step": 4621 }, { "epoch": 2.188576501923646, "grad_norm": 1.1993911266326904, "learning_rate": 3.5902063823320255e-06, "loss": 0.2209, "step": 4622 }, { "epoch": 2.189050014797277, "grad_norm": 1.2088009119033813, "learning_rate": 3.5862818504851325e-06, "loss": 0.1982, "step": 4623 }, { "epoch": 2.1895235276709086, "grad_norm": 1.2603254318237305, "learning_rate": 3.5823589960843506e-06, "loss": 0.1903, "step": 4624 }, { "epoch": 2.1899970405445397, "grad_norm": 1.01237952709198, "learning_rate": 3.5784378201556612e-06, "loss": 0.2093, "step": 4625 }, { "epoch": 2.190470553418171, "grad_norm": 1.6355202198028564, "learning_rate": 3.5745183237246074e-06, "loss": 0.1958, "step": 4626 }, { "epoch": 2.1909440662918023, "grad_norm": 1.0540543794631958, "learning_rate": 3.570600507816301e-06, "loss": 0.2108, "step": 4627 }, { "epoch": 2.1914175791654333, "grad_norm": 2.3014087677001953, "learning_rate": 3.5666843734554022e-06, "loss": 0.1943, "step": 4628 }, { "epoch": 2.191891092039065, "grad_norm": 1.5514744520187378, "learning_rate": 3.562769921666147e-06, "loss": 0.2027, "step": 4629 }, { "epoch": 2.192364604912696, "grad_norm": 1.1093729734420776, "learning_rate": 3.558857153472314e-06, "loss": 0.1883, "step": 4630 }, { "epoch": 2.1928381177863274, "grad_norm": 1.1554844379425049, "learning_rate": 3.554946069897256e-06, "loss": 0.1939, "step": 4631 }, { "epoch": 2.1933116306599585, "grad_norm": 1.6279453039169312, "learning_rate": 3.5510366719638745e-06, "loss": 0.2058, "step": 4632 }, { "epoch": 2.19378514353359, "grad_norm": 1.1952141523361206, "learning_rate": 3.54712896069464e-06, "loss": 0.1856, "step": 4633 }, { "epoch": 2.194258656407221, "grad_norm": 1.0594127178192139, "learning_rate": 3.543222937111571e-06, "loss": 0.2095, "step": 4634 }, { "epoch": 2.194732169280852, "grad_norm": 1.2939765453338623, "learning_rate": 3.539318602236257e-06, "loss": 0.1998, "step": 4635 }, { "epoch": 2.1952056821544836, "grad_norm": 1.392008900642395, "learning_rate": 3.535415957089835e-06, "loss": 0.1864, "step": 4636 }, { "epoch": 2.1956791950281147, "grad_norm": 1.3421459197998047, "learning_rate": 3.531515002693e-06, "loss": 0.2139, "step": 4637 }, { "epoch": 2.196152707901746, "grad_norm": 1.5865654945373535, "learning_rate": 3.5276157400660184e-06, "loss": 0.2222, "step": 4638 }, { "epoch": 2.1966262207753773, "grad_norm": 0.9927922487258911, "learning_rate": 3.523718170228696e-06, "loss": 0.1997, "step": 4639 }, { "epoch": 2.197099733649009, "grad_norm": 1.2386114597320557, "learning_rate": 3.5198222942004113e-06, "loss": 0.1857, "step": 4640 }, { "epoch": 2.19757324652264, "grad_norm": 0.8716875910758972, "learning_rate": 3.5159281130000867e-06, "loss": 0.1986, "step": 4641 }, { "epoch": 2.198046759396271, "grad_norm": 1.1668813228607178, "learning_rate": 3.512035627646211e-06, "loss": 0.2301, "step": 4642 }, { "epoch": 2.1985202722699024, "grad_norm": 1.0498262643814087, "learning_rate": 3.5081448391568307e-06, "loss": 0.2091, "step": 4643 }, { "epoch": 2.1989937851435335, "grad_norm": 1.0276434421539307, "learning_rate": 3.5042557485495355e-06, "loss": 0.1967, "step": 4644 }, { "epoch": 2.199467298017165, "grad_norm": 2.1473894119262695, "learning_rate": 3.500368356841487e-06, "loss": 0.2149, "step": 4645 }, { "epoch": 2.199940810890796, "grad_norm": 1.263840913772583, "learning_rate": 3.496482665049389e-06, "loss": 0.2281, "step": 4646 }, { "epoch": 2.200414323764427, "grad_norm": 1.1061254739761353, "learning_rate": 3.4925986741895125e-06, "loss": 0.1913, "step": 4647 }, { "epoch": 2.2008878366380586, "grad_norm": 1.3180168867111206, "learning_rate": 3.4887163852776716e-06, "loss": 0.2162, "step": 4648 }, { "epoch": 2.2013613495116897, "grad_norm": 1.3046200275421143, "learning_rate": 3.484835799329248e-06, "loss": 0.1811, "step": 4649 }, { "epoch": 2.2018348623853212, "grad_norm": 1.533521056175232, "learning_rate": 3.4809569173591683e-06, "loss": 0.2237, "step": 4650 }, { "epoch": 2.2023083752589523, "grad_norm": 1.1395829916000366, "learning_rate": 3.4770797403819122e-06, "loss": 0.1978, "step": 4651 }, { "epoch": 2.202781888132584, "grad_norm": 1.0285067558288574, "learning_rate": 3.4732042694115265e-06, "loss": 0.1905, "step": 4652 }, { "epoch": 2.203255401006215, "grad_norm": 1.173947811126709, "learning_rate": 3.4693305054615957e-06, "loss": 0.1928, "step": 4653 }, { "epoch": 2.203728913879846, "grad_norm": 1.178647518157959, "learning_rate": 3.4654584495452716e-06, "loss": 0.1898, "step": 4654 }, { "epoch": 2.2042024267534774, "grad_norm": 1.0134490728378296, "learning_rate": 3.4615881026752473e-06, "loss": 0.2047, "step": 4655 }, { "epoch": 2.2046759396271085, "grad_norm": 2.0668044090270996, "learning_rate": 3.4577194658637815e-06, "loss": 0.2035, "step": 4656 }, { "epoch": 2.20514945250074, "grad_norm": 1.4281859397888184, "learning_rate": 3.4538525401226697e-06, "loss": 0.1914, "step": 4657 }, { "epoch": 2.205622965374371, "grad_norm": 1.2419672012329102, "learning_rate": 3.4499873264632787e-06, "loss": 0.2086, "step": 4658 }, { "epoch": 2.2060964782480026, "grad_norm": 1.1350500583648682, "learning_rate": 3.44612382589651e-06, "loss": 0.1957, "step": 4659 }, { "epoch": 2.2065699911216337, "grad_norm": 1.8303426504135132, "learning_rate": 3.4422620394328322e-06, "loss": 0.1979, "step": 4660 }, { "epoch": 2.2070435039952647, "grad_norm": 1.1737298965454102, "learning_rate": 3.438401968082253e-06, "loss": 0.1961, "step": 4661 }, { "epoch": 2.2075170168688962, "grad_norm": 1.198386311531067, "learning_rate": 3.434543612854336e-06, "loss": 0.2124, "step": 4662 }, { "epoch": 2.2079905297425273, "grad_norm": 1.2552683353424072, "learning_rate": 3.4306869747582016e-06, "loss": 0.1846, "step": 4663 }, { "epoch": 2.208464042616159, "grad_norm": 0.9571728110313416, "learning_rate": 3.426832054802511e-06, "loss": 0.1961, "step": 4664 }, { "epoch": 2.20893755548979, "grad_norm": 1.5007789134979248, "learning_rate": 3.422978853995487e-06, "loss": 0.2061, "step": 4665 }, { "epoch": 2.209411068363421, "grad_norm": 1.0542176961898804, "learning_rate": 3.4191273733448916e-06, "loss": 0.2199, "step": 4666 }, { "epoch": 2.2098845812370524, "grad_norm": 1.1434391736984253, "learning_rate": 3.4152776138580466e-06, "loss": 0.2185, "step": 4667 }, { "epoch": 2.2103580941106835, "grad_norm": 1.2754579782485962, "learning_rate": 3.411429576541815e-06, "loss": 0.2134, "step": 4668 }, { "epoch": 2.210831606984315, "grad_norm": 0.9875036478042603, "learning_rate": 3.4075832624026204e-06, "loss": 0.1884, "step": 4669 }, { "epoch": 2.211305119857946, "grad_norm": 1.2512601613998413, "learning_rate": 3.403738672446425e-06, "loss": 0.1965, "step": 4670 }, { "epoch": 2.211778632731577, "grad_norm": 1.0288127660751343, "learning_rate": 3.3998958076787415e-06, "loss": 0.1922, "step": 4671 }, { "epoch": 2.2122521456052087, "grad_norm": 1.011583924293518, "learning_rate": 3.3960546691046405e-06, "loss": 0.1961, "step": 4672 }, { "epoch": 2.2127256584788397, "grad_norm": 1.268002986907959, "learning_rate": 3.3922152577287284e-06, "loss": 0.183, "step": 4673 }, { "epoch": 2.2131991713524712, "grad_norm": 1.4242584705352783, "learning_rate": 3.388377574555172e-06, "loss": 0.2272, "step": 4674 }, { "epoch": 2.2136726842261023, "grad_norm": 1.1874171495437622, "learning_rate": 3.3845416205876737e-06, "loss": 0.2049, "step": 4675 }, { "epoch": 2.214146197099734, "grad_norm": 0.9578942060470581, "learning_rate": 3.380707396829498e-06, "loss": 0.2171, "step": 4676 }, { "epoch": 2.214619709973365, "grad_norm": 1.165828824043274, "learning_rate": 3.3768749042834416e-06, "loss": 0.2164, "step": 4677 }, { "epoch": 2.215093222846996, "grad_norm": 1.2874082326889038, "learning_rate": 3.3730441439518637e-06, "loss": 0.1935, "step": 4678 }, { "epoch": 2.2155667357206275, "grad_norm": 1.277848720550537, "learning_rate": 3.3692151168366573e-06, "loss": 0.1781, "step": 4679 }, { "epoch": 2.2160402485942585, "grad_norm": 2.349771499633789, "learning_rate": 3.3653878239392668e-06, "loss": 0.2103, "step": 4680 }, { "epoch": 2.21651376146789, "grad_norm": 1.3279833793640137, "learning_rate": 3.3615622662606852e-06, "loss": 0.2103, "step": 4681 }, { "epoch": 2.216987274341521, "grad_norm": 1.032465934753418, "learning_rate": 3.357738444801449e-06, "loss": 0.1824, "step": 4682 }, { "epoch": 2.2174607872151526, "grad_norm": 1.4926496744155884, "learning_rate": 3.35391636056165e-06, "loss": 0.1904, "step": 4683 }, { "epoch": 2.2179343000887837, "grad_norm": 0.948499858379364, "learning_rate": 3.350096014540909e-06, "loss": 0.1849, "step": 4684 }, { "epoch": 2.2184078129624147, "grad_norm": 1.0163851976394653, "learning_rate": 3.3462774077383996e-06, "loss": 0.209, "step": 4685 }, { "epoch": 2.2188813258360462, "grad_norm": 1.0877399444580078, "learning_rate": 3.3424605411528476e-06, "loss": 0.2033, "step": 4686 }, { "epoch": 2.2193548387096773, "grad_norm": 1.5267970561981201, "learning_rate": 3.338645415782512e-06, "loss": 0.207, "step": 4687 }, { "epoch": 2.219828351583309, "grad_norm": 1.1469260454177856, "learning_rate": 3.334832032625208e-06, "loss": 0.1791, "step": 4688 }, { "epoch": 2.22030186445694, "grad_norm": 1.130159854888916, "learning_rate": 3.3310203926782826e-06, "loss": 0.1979, "step": 4689 }, { "epoch": 2.220775377330571, "grad_norm": 0.9808071851730347, "learning_rate": 3.3272104969386388e-06, "loss": 0.178, "step": 4690 }, { "epoch": 2.2212488902042025, "grad_norm": 1.1407606601715088, "learning_rate": 3.3234023464027143e-06, "loss": 0.1997, "step": 4691 }, { "epoch": 2.2217224030778335, "grad_norm": 1.0201860666275024, "learning_rate": 3.319595942066498e-06, "loss": 0.2017, "step": 4692 }, { "epoch": 2.222195915951465, "grad_norm": 1.1310131549835205, "learning_rate": 3.3157912849255137e-06, "loss": 0.2015, "step": 4693 }, { "epoch": 2.222669428825096, "grad_norm": 1.1622865200042725, "learning_rate": 3.311988375974837e-06, "loss": 0.2173, "step": 4694 }, { "epoch": 2.2231429416987276, "grad_norm": 1.5009393692016602, "learning_rate": 3.308187216209082e-06, "loss": 0.2147, "step": 4695 }, { "epoch": 2.2236164545723587, "grad_norm": 1.1541341543197632, "learning_rate": 3.304387806622399e-06, "loss": 0.1975, "step": 4696 }, { "epoch": 2.2240899674459897, "grad_norm": 1.204338788986206, "learning_rate": 3.3005901482084947e-06, "loss": 0.2123, "step": 4697 }, { "epoch": 2.2245634803196213, "grad_norm": 1.4117945432662964, "learning_rate": 3.2967942419606045e-06, "loss": 0.2167, "step": 4698 }, { "epoch": 2.2250369931932523, "grad_norm": 1.0476959943771362, "learning_rate": 3.293000088871515e-06, "loss": 0.2071, "step": 4699 }, { "epoch": 2.225510506066884, "grad_norm": 1.0269898176193237, "learning_rate": 3.289207689933547e-06, "loss": 0.1831, "step": 4700 }, { "epoch": 2.225984018940515, "grad_norm": 1.0802260637283325, "learning_rate": 3.2854170461385705e-06, "loss": 0.2284, "step": 4701 }, { "epoch": 2.2264575318141464, "grad_norm": 1.3022472858428955, "learning_rate": 3.2816281584779853e-06, "loss": 0.2124, "step": 4702 }, { "epoch": 2.2269310446877775, "grad_norm": 1.3747221231460571, "learning_rate": 3.277841027942745e-06, "loss": 0.203, "step": 4703 }, { "epoch": 2.2274045575614085, "grad_norm": 1.0478261709213257, "learning_rate": 3.274055655523335e-06, "loss": 0.1948, "step": 4704 }, { "epoch": 2.22787807043504, "grad_norm": 1.0842070579528809, "learning_rate": 3.2702720422097777e-06, "loss": 0.1837, "step": 4705 }, { "epoch": 2.228351583308671, "grad_norm": 1.1079217195510864, "learning_rate": 3.2664901889916477e-06, "loss": 0.1974, "step": 4706 }, { "epoch": 2.2288250961823026, "grad_norm": 1.3619805574417114, "learning_rate": 3.2627100968580472e-06, "loss": 0.1895, "step": 4707 }, { "epoch": 2.2292986090559337, "grad_norm": 1.1514424085617065, "learning_rate": 3.2589317667976286e-06, "loss": 0.1885, "step": 4708 }, { "epoch": 2.2297721219295648, "grad_norm": 1.3855830430984497, "learning_rate": 3.25515519979857e-06, "loss": 0.1978, "step": 4709 }, { "epoch": 2.2302456348031963, "grad_norm": 1.361132025718689, "learning_rate": 3.2513803968486037e-06, "loss": 0.201, "step": 4710 }, { "epoch": 2.2307191476768273, "grad_norm": 1.0529563426971436, "learning_rate": 3.2476073589349866e-06, "loss": 0.2019, "step": 4711 }, { "epoch": 2.231192660550459, "grad_norm": 1.162382960319519, "learning_rate": 3.2438360870445263e-06, "loss": 0.2119, "step": 4712 }, { "epoch": 2.23166617342409, "grad_norm": 0.9659311175346375, "learning_rate": 3.2400665821635568e-06, "loss": 0.2018, "step": 4713 }, { "epoch": 2.2321396862977214, "grad_norm": 1.2184617519378662, "learning_rate": 3.236298845277961e-06, "loss": 0.2146, "step": 4714 }, { "epoch": 2.2326131991713525, "grad_norm": 1.6080515384674072, "learning_rate": 3.2325328773731524e-06, "loss": 0.2, "step": 4715 }, { "epoch": 2.2330867120449835, "grad_norm": 1.5760236978530884, "learning_rate": 3.228768679434079e-06, "loss": 0.2092, "step": 4716 }, { "epoch": 2.233560224918615, "grad_norm": 1.4468612670898438, "learning_rate": 3.2250062524452376e-06, "loss": 0.2187, "step": 4717 }, { "epoch": 2.234033737792246, "grad_norm": 1.2942707538604736, "learning_rate": 3.2212455973906477e-06, "loss": 0.1937, "step": 4718 }, { "epoch": 2.2345072506658776, "grad_norm": 1.237175703048706, "learning_rate": 3.2174867152538802e-06, "loss": 0.213, "step": 4719 }, { "epoch": 2.2349807635395087, "grad_norm": 1.146020770072937, "learning_rate": 3.213729607018026e-06, "loss": 0.2081, "step": 4720 }, { "epoch": 2.23545427641314, "grad_norm": 1.1109809875488281, "learning_rate": 3.209974273665726e-06, "loss": 0.2109, "step": 4721 }, { "epoch": 2.2359277892867713, "grad_norm": 1.0821495056152344, "learning_rate": 3.2062207161791526e-06, "loss": 0.2124, "step": 4722 }, { "epoch": 2.2364013021604023, "grad_norm": 1.1807968616485596, "learning_rate": 3.2024689355400063e-06, "loss": 0.2362, "step": 4723 }, { "epoch": 2.236874815034034, "grad_norm": 1.4384002685546875, "learning_rate": 3.1987189327295377e-06, "loss": 0.1825, "step": 4724 }, { "epoch": 2.237348327907665, "grad_norm": 1.1353758573532104, "learning_rate": 3.1949707087285144e-06, "loss": 0.2115, "step": 4725 }, { "epoch": 2.2378218407812964, "grad_norm": 2.6835150718688965, "learning_rate": 3.1912242645172576e-06, "loss": 0.2054, "step": 4726 }, { "epoch": 2.2382953536549275, "grad_norm": 0.9744576811790466, "learning_rate": 3.187479601075605e-06, "loss": 0.1937, "step": 4727 }, { "epoch": 2.2387688665285586, "grad_norm": 1.31288480758667, "learning_rate": 3.183736719382944e-06, "loss": 0.2199, "step": 4728 }, { "epoch": 2.23924237940219, "grad_norm": 0.9159910678863525, "learning_rate": 3.179995620418187e-06, "loss": 0.2009, "step": 4729 }, { "epoch": 2.239715892275821, "grad_norm": 1.6688588857650757, "learning_rate": 3.176256305159778e-06, "loss": 0.1832, "step": 4730 }, { "epoch": 2.2401894051494526, "grad_norm": 0.9921554923057556, "learning_rate": 3.1725187745857066e-06, "loss": 0.2143, "step": 4731 }, { "epoch": 2.2406629180230837, "grad_norm": 0.9964454770088196, "learning_rate": 3.16878302967348e-06, "loss": 0.1916, "step": 4732 }, { "epoch": 2.2411364308967148, "grad_norm": 1.3738844394683838, "learning_rate": 3.1650490714001536e-06, "loss": 0.181, "step": 4733 }, { "epoch": 2.2416099437703463, "grad_norm": 1.0650712251663208, "learning_rate": 3.1613169007423016e-06, "loss": 0.1985, "step": 4734 }, { "epoch": 2.2420834566439773, "grad_norm": 0.9678623080253601, "learning_rate": 3.1575865186760425e-06, "loss": 0.1875, "step": 4735 }, { "epoch": 2.242556969517609, "grad_norm": 1.2123451232910156, "learning_rate": 3.1538579261770177e-06, "loss": 0.2132, "step": 4736 }, { "epoch": 2.24303048239124, "grad_norm": 1.060880422592163, "learning_rate": 3.150131124220408e-06, "loss": 0.1999, "step": 4737 }, { "epoch": 2.2435039952648714, "grad_norm": 1.4341163635253906, "learning_rate": 3.1464061137809187e-06, "loss": 0.209, "step": 4738 }, { "epoch": 2.2439775081385025, "grad_norm": 1.1810160875320435, "learning_rate": 3.142682895832796e-06, "loss": 0.2031, "step": 4739 }, { "epoch": 2.2444510210121336, "grad_norm": 1.1386125087738037, "learning_rate": 3.1389614713498073e-06, "loss": 0.1958, "step": 4740 }, { "epoch": 2.244924533885765, "grad_norm": 1.4858670234680176, "learning_rate": 3.1352418413052543e-06, "loss": 0.1942, "step": 4741 }, { "epoch": 2.245398046759396, "grad_norm": 1.1537786722183228, "learning_rate": 3.131524006671974e-06, "loss": 0.1929, "step": 4742 }, { "epoch": 2.2458715596330276, "grad_norm": 1.0491985082626343, "learning_rate": 3.127807968422326e-06, "loss": 0.2021, "step": 4743 }, { "epoch": 2.2463450725066587, "grad_norm": 1.1256279945373535, "learning_rate": 3.1240937275282103e-06, "loss": 0.2091, "step": 4744 }, { "epoch": 2.2468185853802902, "grad_norm": 1.0205121040344238, "learning_rate": 3.120381284961043e-06, "loss": 0.2042, "step": 4745 }, { "epoch": 2.2472920982539213, "grad_norm": 1.3747820854187012, "learning_rate": 3.116670641691785e-06, "loss": 0.2037, "step": 4746 }, { "epoch": 2.2477656111275524, "grad_norm": 1.127557396888733, "learning_rate": 3.112961798690913e-06, "loss": 0.2081, "step": 4747 }, { "epoch": 2.248239124001184, "grad_norm": 1.0771230459213257, "learning_rate": 3.109254756928445e-06, "loss": 0.1914, "step": 4748 }, { "epoch": 2.248712636874815, "grad_norm": 1.6226811408996582, "learning_rate": 3.105549517373919e-06, "loss": 0.2036, "step": 4749 }, { "epoch": 2.2491861497484464, "grad_norm": 1.0237318277359009, "learning_rate": 3.1018460809964025e-06, "loss": 0.2105, "step": 4750 }, { "epoch": 2.2496596626220775, "grad_norm": 1.0179623365402222, "learning_rate": 3.0981444487644984e-06, "loss": 0.2184, "step": 4751 }, { "epoch": 2.2501331754957086, "grad_norm": 1.6776036024093628, "learning_rate": 3.0944446216463276e-06, "loss": 0.2038, "step": 4752 }, { "epoch": 2.25060668836934, "grad_norm": 1.2936748266220093, "learning_rate": 3.09074660060955e-06, "loss": 0.2189, "step": 4753 }, { "epoch": 2.251080201242971, "grad_norm": 0.9462767839431763, "learning_rate": 3.087050386621341e-06, "loss": 0.2057, "step": 4754 }, { "epoch": 2.2515537141166027, "grad_norm": 1.1547857522964478, "learning_rate": 3.083355980648416e-06, "loss": 0.2095, "step": 4755 }, { "epoch": 2.2520272269902337, "grad_norm": 2.062722682952881, "learning_rate": 3.0796633836570055e-06, "loss": 0.1905, "step": 4756 }, { "epoch": 2.2525007398638652, "grad_norm": 1.355178952217102, "learning_rate": 3.0759725966128774e-06, "loss": 0.2056, "step": 4757 }, { "epoch": 2.2529742527374963, "grad_norm": 1.0428045988082886, "learning_rate": 3.072283620481321e-06, "loss": 0.1885, "step": 4758 }, { "epoch": 2.2534477656111274, "grad_norm": 1.1466599702835083, "learning_rate": 3.068596456227143e-06, "loss": 0.2027, "step": 4759 }, { "epoch": 2.253921278484759, "grad_norm": 1.3236037492752075, "learning_rate": 3.0649111048147006e-06, "loss": 0.2144, "step": 4760 }, { "epoch": 2.25439479135839, "grad_norm": 1.5951242446899414, "learning_rate": 3.061227567207852e-06, "loss": 0.1973, "step": 4761 }, { "epoch": 2.2548683042320214, "grad_norm": 1.3796095848083496, "learning_rate": 3.0575458443699957e-06, "loss": 0.1869, "step": 4762 }, { "epoch": 2.2553418171056525, "grad_norm": 1.0382217168807983, "learning_rate": 3.053865937264049e-06, "loss": 0.1942, "step": 4763 }, { "epoch": 2.255815329979284, "grad_norm": 0.9059621095657349, "learning_rate": 3.0501878468524525e-06, "loss": 0.1961, "step": 4764 }, { "epoch": 2.256288842852915, "grad_norm": 1.007012963294983, "learning_rate": 3.046511574097183e-06, "loss": 0.2033, "step": 4765 }, { "epoch": 2.256762355726546, "grad_norm": 0.9761565327644348, "learning_rate": 3.042837119959726e-06, "loss": 0.2006, "step": 4766 }, { "epoch": 2.2572358686001777, "grad_norm": 1.1975845098495483, "learning_rate": 3.039164485401106e-06, "loss": 0.2131, "step": 4767 }, { "epoch": 2.2577093814738087, "grad_norm": 1.2189717292785645, "learning_rate": 3.0354936713818594e-06, "loss": 0.1828, "step": 4768 }, { "epoch": 2.2581828943474402, "grad_norm": 1.35395085811615, "learning_rate": 3.0318246788620588e-06, "loss": 0.1928, "step": 4769 }, { "epoch": 2.2586564072210713, "grad_norm": 0.9459072947502136, "learning_rate": 3.028157508801287e-06, "loss": 0.2134, "step": 4770 }, { "epoch": 2.2591299200947024, "grad_norm": 1.2507688999176025, "learning_rate": 3.0244921621586643e-06, "loss": 0.207, "step": 4771 }, { "epoch": 2.259603432968334, "grad_norm": 1.5289151668548584, "learning_rate": 3.020828639892818e-06, "loss": 0.2006, "step": 4772 }, { "epoch": 2.260076945841965, "grad_norm": 1.1962858438491821, "learning_rate": 3.0171669429619154e-06, "loss": 0.2226, "step": 4773 }, { "epoch": 2.2605504587155965, "grad_norm": 1.2389626502990723, "learning_rate": 3.0135070723236346e-06, "loss": 0.2018, "step": 4774 }, { "epoch": 2.2610239715892275, "grad_norm": 1.1596938371658325, "learning_rate": 3.0098490289351756e-06, "loss": 0.1978, "step": 4775 }, { "epoch": 2.2614974844628586, "grad_norm": 1.206876516342163, "learning_rate": 3.0061928137532713e-06, "loss": 0.1913, "step": 4776 }, { "epoch": 2.26197099733649, "grad_norm": 1.2051137685775757, "learning_rate": 3.002538427734163e-06, "loss": 0.2041, "step": 4777 }, { "epoch": 2.262444510210121, "grad_norm": 0.8974770903587341, "learning_rate": 2.9988858718336256e-06, "loss": 0.2054, "step": 4778 }, { "epoch": 2.2629180230837527, "grad_norm": 1.372613787651062, "learning_rate": 2.995235147006945e-06, "loss": 0.1953, "step": 4779 }, { "epoch": 2.2633915359573837, "grad_norm": 1.115798830986023, "learning_rate": 2.991586254208939e-06, "loss": 0.213, "step": 4780 }, { "epoch": 2.2638650488310152, "grad_norm": 1.4372373819351196, "learning_rate": 2.987939194393933e-06, "loss": 0.2302, "step": 4781 }, { "epoch": 2.2643385617046463, "grad_norm": 0.9313597679138184, "learning_rate": 2.984293968515788e-06, "loss": 0.2021, "step": 4782 }, { "epoch": 2.264812074578278, "grad_norm": 1.0332454442977905, "learning_rate": 2.9806505775278738e-06, "loss": 0.1874, "step": 4783 }, { "epoch": 2.265285587451909, "grad_norm": 1.0937074422836304, "learning_rate": 2.9770090223830803e-06, "loss": 0.199, "step": 4784 }, { "epoch": 2.26575910032554, "grad_norm": 1.022531509399414, "learning_rate": 2.9733693040338286e-06, "loss": 0.1963, "step": 4785 }, { "epoch": 2.2662326131991715, "grad_norm": 1.1225355863571167, "learning_rate": 2.969731423432045e-06, "loss": 0.2178, "step": 4786 }, { "epoch": 2.2667061260728025, "grad_norm": 1.8823810815811157, "learning_rate": 2.9660953815291893e-06, "loss": 0.2103, "step": 4787 }, { "epoch": 2.267179638946434, "grad_norm": 1.7231241464614868, "learning_rate": 2.962461179276225e-06, "loss": 0.1777, "step": 4788 }, { "epoch": 2.267653151820065, "grad_norm": 1.254069447517395, "learning_rate": 2.9588288176236502e-06, "loss": 0.2163, "step": 4789 }, { "epoch": 2.268126664693696, "grad_norm": 1.0961583852767944, "learning_rate": 2.955198297521469e-06, "loss": 0.2097, "step": 4790 }, { "epoch": 2.2686001775673277, "grad_norm": 1.0620346069335938, "learning_rate": 2.9515696199192123e-06, "loss": 0.1878, "step": 4791 }, { "epoch": 2.2690736904409587, "grad_norm": 1.0803302526474, "learning_rate": 2.9479427857659213e-06, "loss": 0.2128, "step": 4792 }, { "epoch": 2.2695472033145903, "grad_norm": 1.3696365356445312, "learning_rate": 2.9443177960101653e-06, "loss": 0.2047, "step": 4793 }, { "epoch": 2.2700207161882213, "grad_norm": 1.0663868188858032, "learning_rate": 2.9406946516000236e-06, "loss": 0.1987, "step": 4794 }, { "epoch": 2.2704942290618524, "grad_norm": 1.149569034576416, "learning_rate": 2.9370733534830887e-06, "loss": 0.1871, "step": 4795 }, { "epoch": 2.270967741935484, "grad_norm": 1.892318606376648, "learning_rate": 2.9334539026064847e-06, "loss": 0.1883, "step": 4796 }, { "epoch": 2.271441254809115, "grad_norm": 1.3495607376098633, "learning_rate": 2.9298362999168373e-06, "loss": 0.2193, "step": 4797 }, { "epoch": 2.2719147676827465, "grad_norm": 1.4502182006835938, "learning_rate": 2.926220546360299e-06, "loss": 0.1985, "step": 4798 }, { "epoch": 2.2723882805563775, "grad_norm": 1.812647819519043, "learning_rate": 2.922606642882537e-06, "loss": 0.2017, "step": 4799 }, { "epoch": 2.272861793430009, "grad_norm": 1.912377953529358, "learning_rate": 2.9189945904287287e-06, "loss": 0.2218, "step": 4800 }, { "epoch": 2.27333530630364, "grad_norm": 1.0828890800476074, "learning_rate": 2.915384389943576e-06, "loss": 0.1887, "step": 4801 }, { "epoch": 2.2738088191772716, "grad_norm": 1.1113115549087524, "learning_rate": 2.911776042371286e-06, "loss": 0.1963, "step": 4802 }, { "epoch": 2.2742823320509027, "grad_norm": 1.5018105506896973, "learning_rate": 2.908169548655595e-06, "loss": 0.1994, "step": 4803 }, { "epoch": 2.2747558449245338, "grad_norm": 1.3521610498428345, "learning_rate": 2.9045649097397386e-06, "loss": 0.2026, "step": 4804 }, { "epoch": 2.2752293577981653, "grad_norm": 1.7627965211868286, "learning_rate": 2.9009621265664832e-06, "loss": 0.1797, "step": 4805 }, { "epoch": 2.2757028706717963, "grad_norm": 1.138641357421875, "learning_rate": 2.8973612000780937e-06, "loss": 0.2053, "step": 4806 }, { "epoch": 2.276176383545428, "grad_norm": 1.8081318140029907, "learning_rate": 2.8937621312163653e-06, "loss": 0.1976, "step": 4807 }, { "epoch": 2.276649896419059, "grad_norm": 1.7190033197402954, "learning_rate": 2.890164920922597e-06, "loss": 0.2069, "step": 4808 }, { "epoch": 2.27712340929269, "grad_norm": 1.0892248153686523, "learning_rate": 2.8865695701376005e-06, "loss": 0.2048, "step": 4809 }, { "epoch": 2.2775969221663215, "grad_norm": 1.098090648651123, "learning_rate": 2.8829760798017115e-06, "loss": 0.193, "step": 4810 }, { "epoch": 2.2780704350399525, "grad_norm": 1.2340128421783447, "learning_rate": 2.8793844508547664e-06, "loss": 0.1822, "step": 4811 }, { "epoch": 2.278543947913584, "grad_norm": 0.9539898037910461, "learning_rate": 2.875794684236127e-06, "loss": 0.1829, "step": 4812 }, { "epoch": 2.279017460787215, "grad_norm": 1.3292118310928345, "learning_rate": 2.8722067808846575e-06, "loss": 0.2284, "step": 4813 }, { "epoch": 2.279490973660846, "grad_norm": 1.208977460861206, "learning_rate": 2.8686207417387446e-06, "loss": 0.1792, "step": 4814 }, { "epoch": 2.2799644865344777, "grad_norm": 1.084511637687683, "learning_rate": 2.8650365677362756e-06, "loss": 0.1716, "step": 4815 }, { "epoch": 2.2804379994081088, "grad_norm": 1.649825930595398, "learning_rate": 2.861454259814662e-06, "loss": 0.2078, "step": 4816 }, { "epoch": 2.2809115122817403, "grad_norm": 1.3843883275985718, "learning_rate": 2.857873818910821e-06, "loss": 0.219, "step": 4817 }, { "epoch": 2.2813850251553713, "grad_norm": 1.0740282535552979, "learning_rate": 2.854295245961178e-06, "loss": 0.1984, "step": 4818 }, { "epoch": 2.281858538029003, "grad_norm": 1.2983367443084717, "learning_rate": 2.8507185419016813e-06, "loss": 0.213, "step": 4819 }, { "epoch": 2.282332050902634, "grad_norm": 1.1610127687454224, "learning_rate": 2.8471437076677767e-06, "loss": 0.2434, "step": 4820 }, { "epoch": 2.282805563776265, "grad_norm": 1.033072829246521, "learning_rate": 2.8435707441944325e-06, "loss": 0.195, "step": 4821 }, { "epoch": 2.2832790766498965, "grad_norm": 1.0944322347640991, "learning_rate": 2.839999652416119e-06, "loss": 0.2027, "step": 4822 }, { "epoch": 2.2837525895235276, "grad_norm": 1.1411755084991455, "learning_rate": 2.8364304332668245e-06, "loss": 0.2242, "step": 4823 }, { "epoch": 2.284226102397159, "grad_norm": 1.6203806400299072, "learning_rate": 2.83286308768004e-06, "loss": 0.2057, "step": 4824 }, { "epoch": 2.28469961527079, "grad_norm": 1.048741102218628, "learning_rate": 2.829297616588775e-06, "loss": 0.2284, "step": 4825 }, { "epoch": 2.2851731281444216, "grad_norm": 1.2415732145309448, "learning_rate": 2.825734020925538e-06, "loss": 0.1923, "step": 4826 }, { "epoch": 2.2856466410180527, "grad_norm": 1.5212392807006836, "learning_rate": 2.82217230162236e-06, "loss": 0.2163, "step": 4827 }, { "epoch": 2.2861201538916838, "grad_norm": 1.1868995428085327, "learning_rate": 2.8186124596107713e-06, "loss": 0.201, "step": 4828 }, { "epoch": 2.2865936667653153, "grad_norm": 1.1392196416854858, "learning_rate": 2.8150544958218097e-06, "loss": 0.2098, "step": 4829 }, { "epoch": 2.2870671796389463, "grad_norm": 1.0334511995315552, "learning_rate": 2.8114984111860334e-06, "loss": 0.2125, "step": 4830 }, { "epoch": 2.287540692512578, "grad_norm": 1.3680204153060913, "learning_rate": 2.8079442066334963e-06, "loss": 0.1946, "step": 4831 }, { "epoch": 2.288014205386209, "grad_norm": 1.040935754776001, "learning_rate": 2.8043918830937713e-06, "loss": 0.1835, "step": 4832 }, { "epoch": 2.28848771825984, "grad_norm": 0.9355210661888123, "learning_rate": 2.8008414414959295e-06, "loss": 0.1898, "step": 4833 }, { "epoch": 2.2889612311334715, "grad_norm": 1.3512656688690186, "learning_rate": 2.7972928827685597e-06, "loss": 0.1793, "step": 4834 }, { "epoch": 2.2894347440071026, "grad_norm": 1.103294849395752, "learning_rate": 2.793746207839748e-06, "loss": 0.2012, "step": 4835 }, { "epoch": 2.289908256880734, "grad_norm": 1.275721549987793, "learning_rate": 2.7902014176370996e-06, "loss": 0.2176, "step": 4836 }, { "epoch": 2.290381769754365, "grad_norm": 1.1361020803451538, "learning_rate": 2.786658513087712e-06, "loss": 0.2, "step": 4837 }, { "epoch": 2.290855282627996, "grad_norm": 1.2212600708007812, "learning_rate": 2.7831174951182027e-06, "loss": 0.2056, "step": 4838 }, { "epoch": 2.2913287955016277, "grad_norm": 1.6602025032043457, "learning_rate": 2.7795783646546924e-06, "loss": 0.2165, "step": 4839 }, { "epoch": 2.291802308375259, "grad_norm": 1.3278297185897827, "learning_rate": 2.7760411226228022e-06, "loss": 0.193, "step": 4840 }, { "epoch": 2.2922758212488903, "grad_norm": 1.0122475624084473, "learning_rate": 2.7725057699476674e-06, "loss": 0.1895, "step": 4841 }, { "epoch": 2.2927493341225214, "grad_norm": 1.3454642295837402, "learning_rate": 2.7689723075539245e-06, "loss": 0.1943, "step": 4842 }, { "epoch": 2.293222846996153, "grad_norm": 1.102555274963379, "learning_rate": 2.765440736365712e-06, "loss": 0.2111, "step": 4843 }, { "epoch": 2.293696359869784, "grad_norm": 1.9383556842803955, "learning_rate": 2.7619110573066856e-06, "loss": 0.1992, "step": 4844 }, { "epoch": 2.2941698727434154, "grad_norm": 1.0931776762008667, "learning_rate": 2.7583832712999912e-06, "loss": 0.2254, "step": 4845 }, { "epoch": 2.2946433856170465, "grad_norm": 1.0424646139144897, "learning_rate": 2.754857379268294e-06, "loss": 0.2012, "step": 4846 }, { "epoch": 2.2951168984906776, "grad_norm": 1.0645688772201538, "learning_rate": 2.7513333821337516e-06, "loss": 0.1864, "step": 4847 }, { "epoch": 2.295590411364309, "grad_norm": 1.6237589120864868, "learning_rate": 2.7478112808180378e-06, "loss": 0.175, "step": 4848 }, { "epoch": 2.29606392423794, "grad_norm": 1.3338507413864136, "learning_rate": 2.744291076242317e-06, "loss": 0.1883, "step": 4849 }, { "epoch": 2.2965374371115717, "grad_norm": 2.062040328979492, "learning_rate": 2.74077276932727e-06, "loss": 0.1894, "step": 4850 }, { "epoch": 2.2970109499852027, "grad_norm": 1.056857705116272, "learning_rate": 2.7372563609930726e-06, "loss": 0.1875, "step": 4851 }, { "epoch": 2.297484462858834, "grad_norm": 1.0261149406433105, "learning_rate": 2.7337418521594107e-06, "loss": 0.19, "step": 4852 }, { "epoch": 2.2979579757324653, "grad_norm": 1.3251570463180542, "learning_rate": 2.730229243745469e-06, "loss": 0.215, "step": 4853 }, { "epoch": 2.2984314886060964, "grad_norm": 0.9587031602859497, "learning_rate": 2.726718536669933e-06, "loss": 0.2052, "step": 4854 }, { "epoch": 2.298905001479728, "grad_norm": 0.987009584903717, "learning_rate": 2.7232097318510007e-06, "loss": 0.1793, "step": 4855 }, { "epoch": 2.299378514353359, "grad_norm": 1.4788868427276611, "learning_rate": 2.7197028302063587e-06, "loss": 0.2226, "step": 4856 }, { "epoch": 2.29985202722699, "grad_norm": 1.2045164108276367, "learning_rate": 2.716197832653211e-06, "loss": 0.1932, "step": 4857 }, { "epoch": 2.3003255401006215, "grad_norm": 1.0435082912445068, "learning_rate": 2.7126947401082494e-06, "loss": 0.2083, "step": 4858 }, { "epoch": 2.3007990529742526, "grad_norm": 1.5519726276397705, "learning_rate": 2.709193553487679e-06, "loss": 0.2008, "step": 4859 }, { "epoch": 2.301272565847884, "grad_norm": 1.805208444595337, "learning_rate": 2.7056942737071955e-06, "loss": 0.2021, "step": 4860 }, { "epoch": 2.301746078721515, "grad_norm": 1.2867292165756226, "learning_rate": 2.702196901682009e-06, "loss": 0.1852, "step": 4861 }, { "epoch": 2.3022195915951467, "grad_norm": 1.190337061882019, "learning_rate": 2.6987014383268196e-06, "loss": 0.1956, "step": 4862 }, { "epoch": 2.3026931044687777, "grad_norm": 1.159690499305725, "learning_rate": 2.6952078845558292e-06, "loss": 0.1994, "step": 4863 }, { "epoch": 2.3031666173424092, "grad_norm": 1.3060331344604492, "learning_rate": 2.691716241282748e-06, "loss": 0.2027, "step": 4864 }, { "epoch": 2.3036401302160403, "grad_norm": 1.173386812210083, "learning_rate": 2.6882265094207783e-06, "loss": 0.1967, "step": 4865 }, { "epoch": 2.3041136430896714, "grad_norm": 1.255037784576416, "learning_rate": 2.684738689882629e-06, "loss": 0.1947, "step": 4866 }, { "epoch": 2.304587155963303, "grad_norm": 1.0719294548034668, "learning_rate": 2.6812527835805013e-06, "loss": 0.2067, "step": 4867 }, { "epoch": 2.305060668836934, "grad_norm": 1.0298558473587036, "learning_rate": 2.6777687914261054e-06, "loss": 0.1918, "step": 4868 }, { "epoch": 2.3055341817105655, "grad_norm": 1.2399965524673462, "learning_rate": 2.6742867143306404e-06, "loss": 0.2119, "step": 4869 }, { "epoch": 2.3060076945841965, "grad_norm": 1.3060729503631592, "learning_rate": 2.6708065532048167e-06, "loss": 0.2244, "step": 4870 }, { "epoch": 2.3064812074578276, "grad_norm": 1.0601017475128174, "learning_rate": 2.6673283089588286e-06, "loss": 0.1678, "step": 4871 }, { "epoch": 2.306954720331459, "grad_norm": 1.461519718170166, "learning_rate": 2.6638519825023855e-06, "loss": 0.2043, "step": 4872 }, { "epoch": 2.30742823320509, "grad_norm": 1.202025055885315, "learning_rate": 2.6603775747446836e-06, "loss": 0.219, "step": 4873 }, { "epoch": 2.3079017460787217, "grad_norm": 1.5408061742782593, "learning_rate": 2.6569050865944168e-06, "loss": 0.2058, "step": 4874 }, { "epoch": 2.3083752589523527, "grad_norm": 1.1081178188323975, "learning_rate": 2.653434518959788e-06, "loss": 0.2056, "step": 4875 }, { "epoch": 2.308848771825984, "grad_norm": 1.2208150625228882, "learning_rate": 2.649965872748481e-06, "loss": 0.2119, "step": 4876 }, { "epoch": 2.3093222846996153, "grad_norm": 0.9531561732292175, "learning_rate": 2.6464991488676996e-06, "loss": 0.175, "step": 4877 }, { "epoch": 2.3097957975732464, "grad_norm": 1.0431057214736938, "learning_rate": 2.6430343482241237e-06, "loss": 0.2357, "step": 4878 }, { "epoch": 2.310269310446878, "grad_norm": 1.079884648323059, "learning_rate": 2.6395714717239384e-06, "loss": 0.2059, "step": 4879 }, { "epoch": 2.310742823320509, "grad_norm": 1.2502458095550537, "learning_rate": 2.63611052027283e-06, "loss": 0.2051, "step": 4880 }, { "epoch": 2.3112163361941405, "grad_norm": 1.1873927116394043, "learning_rate": 2.6326514947759718e-06, "loss": 0.2142, "step": 4881 }, { "epoch": 2.3116898490677715, "grad_norm": 1.1415022611618042, "learning_rate": 2.629194396138044e-06, "loss": 0.1781, "step": 4882 }, { "epoch": 2.3121633619414026, "grad_norm": 1.0206491947174072, "learning_rate": 2.625739225263211e-06, "loss": 0.1994, "step": 4883 }, { "epoch": 2.312636874815034, "grad_norm": 1.136160969734192, "learning_rate": 2.622285983055146e-06, "loss": 0.1869, "step": 4884 }, { "epoch": 2.313110387688665, "grad_norm": 1.2701917886734009, "learning_rate": 2.6188346704170053e-06, "loss": 0.2147, "step": 4885 }, { "epoch": 2.3135839005622967, "grad_norm": 1.050111174583435, "learning_rate": 2.615385288251452e-06, "loss": 0.2092, "step": 4886 }, { "epoch": 2.3140574134359277, "grad_norm": 1.1938689947128296, "learning_rate": 2.6119378374606354e-06, "loss": 0.2094, "step": 4887 }, { "epoch": 2.3145309263095593, "grad_norm": 1.2285950183868408, "learning_rate": 2.608492318946201e-06, "loss": 0.1953, "step": 4888 }, { "epoch": 2.3150044391831903, "grad_norm": 2.1110057830810547, "learning_rate": 2.6050487336092967e-06, "loss": 0.2099, "step": 4889 }, { "epoch": 2.3154779520568214, "grad_norm": 1.1653103828430176, "learning_rate": 2.6016070823505525e-06, "loss": 0.1689, "step": 4890 }, { "epoch": 2.315951464930453, "grad_norm": 1.4252010583877563, "learning_rate": 2.5981673660701055e-06, "loss": 0.1917, "step": 4891 }, { "epoch": 2.316424977804084, "grad_norm": 1.0870301723480225, "learning_rate": 2.5947295856675737e-06, "loss": 0.1839, "step": 4892 }, { "epoch": 2.3168984906777155, "grad_norm": 1.1563901901245117, "learning_rate": 2.5912937420420823e-06, "loss": 0.1985, "step": 4893 }, { "epoch": 2.3173720035513465, "grad_norm": 0.9440892338752747, "learning_rate": 2.587859836092237e-06, "loss": 0.1894, "step": 4894 }, { "epoch": 2.3178455164249776, "grad_norm": 1.289351463317871, "learning_rate": 2.5844278687161474e-06, "loss": 0.2051, "step": 4895 }, { "epoch": 2.318319029298609, "grad_norm": 1.7012381553649902, "learning_rate": 2.58099784081141e-06, "loss": 0.1951, "step": 4896 }, { "epoch": 2.31879254217224, "grad_norm": 0.9597797393798828, "learning_rate": 2.577569753275112e-06, "loss": 0.2101, "step": 4897 }, { "epoch": 2.3192660550458717, "grad_norm": 1.141579270362854, "learning_rate": 2.574143607003843e-06, "loss": 0.2081, "step": 4898 }, { "epoch": 2.3197395679195028, "grad_norm": 1.3628860712051392, "learning_rate": 2.570719402893671e-06, "loss": 0.2154, "step": 4899 }, { "epoch": 2.320213080793134, "grad_norm": 1.7435754537582397, "learning_rate": 2.5672971418401716e-06, "loss": 0.1822, "step": 4900 }, { "epoch": 2.3206865936667653, "grad_norm": 1.5572841167449951, "learning_rate": 2.5638768247383962e-06, "loss": 0.1956, "step": 4901 }, { "epoch": 2.3211601065403964, "grad_norm": 1.8090298175811768, "learning_rate": 2.5604584524829036e-06, "loss": 0.1927, "step": 4902 }, { "epoch": 2.321633619414028, "grad_norm": 1.089689016342163, "learning_rate": 2.5570420259677285e-06, "loss": 0.2198, "step": 4903 }, { "epoch": 2.322107132287659, "grad_norm": 0.9589508175849915, "learning_rate": 2.553627546086411e-06, "loss": 0.2007, "step": 4904 }, { "epoch": 2.3225806451612905, "grad_norm": 0.8565977811813354, "learning_rate": 2.55021501373197e-06, "loss": 0.1745, "step": 4905 }, { "epoch": 2.3230541580349215, "grad_norm": 1.0628879070281982, "learning_rate": 2.5468044297969265e-06, "loss": 0.1977, "step": 4906 }, { "epoch": 2.323527670908553, "grad_norm": 1.0389453172683716, "learning_rate": 2.543395795173281e-06, "loss": 0.1992, "step": 4907 }, { "epoch": 2.324001183782184, "grad_norm": 1.0597020387649536, "learning_rate": 2.5399891107525277e-06, "loss": 0.2128, "step": 4908 }, { "epoch": 2.324474696655815, "grad_norm": 1.1213783025741577, "learning_rate": 2.5365843774256573e-06, "loss": 0.1916, "step": 4909 }, { "epoch": 2.3249482095294467, "grad_norm": 1.2535953521728516, "learning_rate": 2.5331815960831387e-06, "loss": 0.1868, "step": 4910 }, { "epoch": 2.3254217224030778, "grad_norm": 1.0827935934066772, "learning_rate": 2.5297807676149435e-06, "loss": 0.2103, "step": 4911 }, { "epoch": 2.3258952352767093, "grad_norm": 1.4444578886032104, "learning_rate": 2.526381892910519e-06, "loss": 0.2075, "step": 4912 }, { "epoch": 2.3263687481503403, "grad_norm": 1.0366746187210083, "learning_rate": 2.5229849728588143e-06, "loss": 0.2126, "step": 4913 }, { "epoch": 2.3268422610239714, "grad_norm": 1.0630645751953125, "learning_rate": 2.519590008348255e-06, "loss": 0.2182, "step": 4914 }, { "epoch": 2.327315773897603, "grad_norm": 1.4311585426330566, "learning_rate": 2.5161970002667636e-06, "loss": 0.2224, "step": 4915 }, { "epoch": 2.327789286771234, "grad_norm": 1.9269888401031494, "learning_rate": 2.512805949501752e-06, "loss": 0.1924, "step": 4916 }, { "epoch": 2.3282627996448655, "grad_norm": 1.697356939315796, "learning_rate": 2.5094168569401123e-06, "loss": 0.2131, "step": 4917 }, { "epoch": 2.3287363125184966, "grad_norm": 1.6500767469406128, "learning_rate": 2.5060297234682328e-06, "loss": 0.2155, "step": 4918 }, { "epoch": 2.3292098253921276, "grad_norm": 1.0592091083526611, "learning_rate": 2.502644549971981e-06, "loss": 0.209, "step": 4919 }, { "epoch": 2.329683338265759, "grad_norm": 1.3273735046386719, "learning_rate": 2.499261337336721e-06, "loss": 0.2025, "step": 4920 }, { "epoch": 2.33015685113939, "grad_norm": 2.2133803367614746, "learning_rate": 2.4958800864472974e-06, "loss": 0.1929, "step": 4921 }, { "epoch": 2.3306303640130217, "grad_norm": 1.3577215671539307, "learning_rate": 2.492500798188042e-06, "loss": 0.2091, "step": 4922 }, { "epoch": 2.3311038768866528, "grad_norm": 1.0294976234436035, "learning_rate": 2.4891234734427784e-06, "loss": 0.226, "step": 4923 }, { "epoch": 2.3315773897602843, "grad_norm": 1.083409070968628, "learning_rate": 2.485748113094809e-06, "loss": 0.208, "step": 4924 }, { "epoch": 2.3320509026339153, "grad_norm": 1.1658552885055542, "learning_rate": 2.4823747180269332e-06, "loss": 0.2045, "step": 4925 }, { "epoch": 2.332524415507547, "grad_norm": 1.3609403371810913, "learning_rate": 2.479003289121422e-06, "loss": 0.2003, "step": 4926 }, { "epoch": 2.332997928381178, "grad_norm": 0.9218862652778625, "learning_rate": 2.4756338272600476e-06, "loss": 0.1993, "step": 4927 }, { "epoch": 2.333471441254809, "grad_norm": 1.0133236646652222, "learning_rate": 2.472266333324054e-06, "loss": 0.2027, "step": 4928 }, { "epoch": 2.3339449541284405, "grad_norm": 0.9589722752571106, "learning_rate": 2.4689008081941825e-06, "loss": 0.1872, "step": 4929 }, { "epoch": 2.3344184670020716, "grad_norm": 1.2544256448745728, "learning_rate": 2.4655372527506473e-06, "loss": 0.2071, "step": 4930 }, { "epoch": 2.334891979875703, "grad_norm": 1.001051902770996, "learning_rate": 2.462175667873161e-06, "loss": 0.2062, "step": 4931 }, { "epoch": 2.335365492749334, "grad_norm": 1.142121434211731, "learning_rate": 2.45881605444091e-06, "loss": 0.1953, "step": 4932 }, { "epoch": 2.335839005622965, "grad_norm": 1.1744766235351562, "learning_rate": 2.4554584133325653e-06, "loss": 0.1917, "step": 4933 }, { "epoch": 2.3363125184965967, "grad_norm": 1.3197706937789917, "learning_rate": 2.4521027454262925e-06, "loss": 0.1952, "step": 4934 }, { "epoch": 2.336786031370228, "grad_norm": 1.6744234561920166, "learning_rate": 2.4487490515997282e-06, "loss": 0.2136, "step": 4935 }, { "epoch": 2.3372595442438593, "grad_norm": 1.0961428880691528, "learning_rate": 2.445397332730004e-06, "loss": 0.2046, "step": 4936 }, { "epoch": 2.3377330571174904, "grad_norm": 0.8409469127655029, "learning_rate": 2.4420475896937236e-06, "loss": 0.1953, "step": 4937 }, { "epoch": 2.3382065699911214, "grad_norm": 1.0160478353500366, "learning_rate": 2.4386998233669867e-06, "loss": 0.1908, "step": 4938 }, { "epoch": 2.338680082864753, "grad_norm": 1.2315906286239624, "learning_rate": 2.4353540346253622e-06, "loss": 0.212, "step": 4939 }, { "epoch": 2.339153595738384, "grad_norm": 1.112743616104126, "learning_rate": 2.4320102243439157e-06, "loss": 0.2025, "step": 4940 }, { "epoch": 2.3396271086120155, "grad_norm": 1.1238675117492676, "learning_rate": 2.4286683933971853e-06, "loss": 0.2032, "step": 4941 }, { "epoch": 2.3401006214856466, "grad_norm": 1.2250267267227173, "learning_rate": 2.4253285426591923e-06, "loss": 0.2298, "step": 4942 }, { "epoch": 2.340574134359278, "grad_norm": 1.391561508178711, "learning_rate": 2.4219906730034472e-06, "loss": 0.2103, "step": 4943 }, { "epoch": 2.341047647232909, "grad_norm": 1.2246688604354858, "learning_rate": 2.418654785302933e-06, "loss": 0.202, "step": 4944 }, { "epoch": 2.34152116010654, "grad_norm": 1.2264790534973145, "learning_rate": 2.4153208804301232e-06, "loss": 0.2202, "step": 4945 }, { "epoch": 2.3419946729801717, "grad_norm": 0.9517511129379272, "learning_rate": 2.4119889592569633e-06, "loss": 0.1852, "step": 4946 }, { "epoch": 2.342468185853803, "grad_norm": 0.9769893288612366, "learning_rate": 2.408659022654892e-06, "loss": 0.1959, "step": 4947 }, { "epoch": 2.3429416987274343, "grad_norm": 1.0385048389434814, "learning_rate": 2.4053310714948153e-06, "loss": 0.1984, "step": 4948 }, { "epoch": 2.3434152116010654, "grad_norm": 0.9590575098991394, "learning_rate": 2.402005106647133e-06, "loss": 0.1941, "step": 4949 }, { "epoch": 2.343888724474697, "grad_norm": 1.5553905963897705, "learning_rate": 2.3986811289817126e-06, "loss": 0.2009, "step": 4950 }, { "epoch": 2.344362237348328, "grad_norm": 1.0020135641098022, "learning_rate": 2.3953591393679143e-06, "loss": 0.1865, "step": 4951 }, { "epoch": 2.344835750221959, "grad_norm": 1.1729081869125366, "learning_rate": 2.39203913867457e-06, "loss": 0.2014, "step": 4952 }, { "epoch": 2.3453092630955905, "grad_norm": 1.1632317304611206, "learning_rate": 2.3887211277699883e-06, "loss": 0.2208, "step": 4953 }, { "epoch": 2.3457827759692216, "grad_norm": 1.0477259159088135, "learning_rate": 2.3854051075219744e-06, "loss": 0.2035, "step": 4954 }, { "epoch": 2.346256288842853, "grad_norm": 1.2778170108795166, "learning_rate": 2.382091078797791e-06, "loss": 0.2128, "step": 4955 }, { "epoch": 2.346729801716484, "grad_norm": 1.322662115097046, "learning_rate": 2.3787790424641986e-06, "loss": 0.2053, "step": 4956 }, { "epoch": 2.347203314590115, "grad_norm": 1.149414300918579, "learning_rate": 2.3754689993874247e-06, "loss": 0.2045, "step": 4957 }, { "epoch": 2.3476768274637467, "grad_norm": 1.00741446018219, "learning_rate": 2.3721609504331755e-06, "loss": 0.1899, "step": 4958 }, { "epoch": 2.348150340337378, "grad_norm": 1.745548963546753, "learning_rate": 2.3688548964666446e-06, "loss": 0.1886, "step": 4959 }, { "epoch": 2.3486238532110093, "grad_norm": 1.0258804559707642, "learning_rate": 2.365550838352495e-06, "loss": 0.1878, "step": 4960 }, { "epoch": 2.3490973660846404, "grad_norm": 1.286632776260376, "learning_rate": 2.3622487769548754e-06, "loss": 0.2032, "step": 4961 }, { "epoch": 2.3495708789582714, "grad_norm": 1.0202562808990479, "learning_rate": 2.3589487131374023e-06, "loss": 0.2097, "step": 4962 }, { "epoch": 2.350044391831903, "grad_norm": 1.0657576322555542, "learning_rate": 2.3556506477631826e-06, "loss": 0.1977, "step": 4963 }, { "epoch": 2.350517904705534, "grad_norm": 1.0342962741851807, "learning_rate": 2.3523545816947856e-06, "loss": 0.1915, "step": 4964 }, { "epoch": 2.3509914175791655, "grad_norm": 1.115942120552063, "learning_rate": 2.3490605157942726e-06, "loss": 0.2007, "step": 4965 }, { "epoch": 2.3514649304527966, "grad_norm": 2.0979726314544678, "learning_rate": 2.3457684509231725e-06, "loss": 0.1993, "step": 4966 }, { "epoch": 2.351938443326428, "grad_norm": 1.075711965560913, "learning_rate": 2.3424783879424894e-06, "loss": 0.1756, "step": 4967 }, { "epoch": 2.352411956200059, "grad_norm": 1.1330883502960205, "learning_rate": 2.3391903277127127e-06, "loss": 0.2115, "step": 4968 }, { "epoch": 2.3528854690736907, "grad_norm": 1.1201201677322388, "learning_rate": 2.3359042710937986e-06, "loss": 0.1995, "step": 4969 }, { "epoch": 2.3533589819473217, "grad_norm": 1.2727965116500854, "learning_rate": 2.3326202189451873e-06, "loss": 0.2007, "step": 4970 }, { "epoch": 2.353832494820953, "grad_norm": 1.3274377584457397, "learning_rate": 2.3293381721257868e-06, "loss": 0.1829, "step": 4971 }, { "epoch": 2.3543060076945843, "grad_norm": 0.9202346205711365, "learning_rate": 2.326058131493991e-06, "loss": 0.1861, "step": 4972 }, { "epoch": 2.3547795205682154, "grad_norm": 1.0923426151275635, "learning_rate": 2.3227800979076552e-06, "loss": 0.2093, "step": 4973 }, { "epoch": 2.355253033441847, "grad_norm": 1.1253713369369507, "learning_rate": 2.319504072224125e-06, "loss": 0.2014, "step": 4974 }, { "epoch": 2.355726546315478, "grad_norm": 1.5804685354232788, "learning_rate": 2.316230055300208e-06, "loss": 0.1789, "step": 4975 }, { "epoch": 2.356200059189109, "grad_norm": 2.09173321723938, "learning_rate": 2.312958047992192e-06, "loss": 0.1968, "step": 4976 }, { "epoch": 2.3566735720627405, "grad_norm": 1.405809760093689, "learning_rate": 2.3096880511558427e-06, "loss": 0.2299, "step": 4977 }, { "epoch": 2.3571470849363716, "grad_norm": 1.0536633729934692, "learning_rate": 2.306420065646392e-06, "loss": 0.2048, "step": 4978 }, { "epoch": 2.357620597810003, "grad_norm": 1.2741084098815918, "learning_rate": 2.3031540923185536e-06, "loss": 0.2172, "step": 4979 }, { "epoch": 2.358094110683634, "grad_norm": 0.9234102964401245, "learning_rate": 2.299890132026508e-06, "loss": 0.2006, "step": 4980 }, { "epoch": 2.3585676235572652, "grad_norm": 1.0131012201309204, "learning_rate": 2.296628185623915e-06, "loss": 0.1732, "step": 4981 }, { "epoch": 2.3590411364308967, "grad_norm": 1.1941182613372803, "learning_rate": 2.2933682539639026e-06, "loss": 0.2364, "step": 4982 }, { "epoch": 2.359514649304528, "grad_norm": 1.0108896493911743, "learning_rate": 2.2901103378990785e-06, "loss": 0.1966, "step": 4983 }, { "epoch": 2.3599881621781593, "grad_norm": 0.8393535017967224, "learning_rate": 2.286854438281515e-06, "loss": 0.194, "step": 4984 }, { "epoch": 2.3604616750517904, "grad_norm": 1.4110928773880005, "learning_rate": 2.283600555962765e-06, "loss": 0.1968, "step": 4985 }, { "epoch": 2.360935187925422, "grad_norm": 1.11850905418396, "learning_rate": 2.2803486917938487e-06, "loss": 0.2094, "step": 4986 }, { "epoch": 2.361408700799053, "grad_norm": 1.1148712635040283, "learning_rate": 2.2770988466252565e-06, "loss": 0.2258, "step": 4987 }, { "epoch": 2.3618822136726845, "grad_norm": 1.2502351999282837, "learning_rate": 2.273851021306959e-06, "loss": 0.1848, "step": 4988 }, { "epoch": 2.3623557265463155, "grad_norm": 1.252803087234497, "learning_rate": 2.27060521668839e-06, "loss": 0.2035, "step": 4989 }, { "epoch": 2.3628292394199466, "grad_norm": 1.0564323663711548, "learning_rate": 2.26736143361846e-06, "loss": 0.1949, "step": 4990 }, { "epoch": 2.363302752293578, "grad_norm": 1.3723267316818237, "learning_rate": 2.2641196729455482e-06, "loss": 0.211, "step": 4991 }, { "epoch": 2.363776265167209, "grad_norm": 1.104577660560608, "learning_rate": 2.2608799355175058e-06, "loss": 0.1932, "step": 4992 }, { "epoch": 2.3642497780408407, "grad_norm": 1.5187643766403198, "learning_rate": 2.2576422221816596e-06, "loss": 0.1772, "step": 4993 }, { "epoch": 2.3647232909144718, "grad_norm": 1.233651041984558, "learning_rate": 2.254406533784794e-06, "loss": 0.2034, "step": 4994 }, { "epoch": 2.365196803788103, "grad_norm": 1.5177334547042847, "learning_rate": 2.2511728711731806e-06, "loss": 0.2064, "step": 4995 }, { "epoch": 2.3656703166617343, "grad_norm": 1.8442257642745972, "learning_rate": 2.2479412351925444e-06, "loss": 0.2119, "step": 4996 }, { "epoch": 2.3661438295353654, "grad_norm": 1.1477844715118408, "learning_rate": 2.2447116266880964e-06, "loss": 0.1973, "step": 4997 }, { "epoch": 2.366617342408997, "grad_norm": 1.1657443046569824, "learning_rate": 2.241484046504503e-06, "loss": 0.2081, "step": 4998 }, { "epoch": 2.367090855282628, "grad_norm": 1.501852035522461, "learning_rate": 2.238258495485912e-06, "loss": 0.2391, "step": 4999 }, { "epoch": 2.367564368156259, "grad_norm": 1.438940167427063, "learning_rate": 2.2350349744759324e-06, "loss": 0.2117, "step": 5000 }, { "epoch": 2.3680378810298905, "grad_norm": 2.076133966445923, "learning_rate": 2.231813484317643e-06, "loss": 0.1915, "step": 5001 }, { "epoch": 2.3685113939035216, "grad_norm": 0.9920544624328613, "learning_rate": 2.2285940258535987e-06, "loss": 0.2254, "step": 5002 }, { "epoch": 2.368984906777153, "grad_norm": 0.9885861873626709, "learning_rate": 2.2253765999258115e-06, "loss": 0.2179, "step": 5003 }, { "epoch": 2.369458419650784, "grad_norm": 1.199271559715271, "learning_rate": 2.222161207375775e-06, "loss": 0.1974, "step": 5004 }, { "epoch": 2.3699319325244157, "grad_norm": 0.865847110748291, "learning_rate": 2.2189478490444373e-06, "loss": 0.1935, "step": 5005 }, { "epoch": 2.3704054453980468, "grad_norm": 1.360724925994873, "learning_rate": 2.2157365257722273e-06, "loss": 0.1836, "step": 5006 }, { "epoch": 2.370878958271678, "grad_norm": 1.0131431818008423, "learning_rate": 2.2125272383990304e-06, "loss": 0.1864, "step": 5007 }, { "epoch": 2.3713524711453093, "grad_norm": 0.978959858417511, "learning_rate": 2.209319987764209e-06, "loss": 0.2075, "step": 5008 }, { "epoch": 2.3718259840189404, "grad_norm": 1.2233644723892212, "learning_rate": 2.2061147747065847e-06, "loss": 0.2237, "step": 5009 }, { "epoch": 2.372299496892572, "grad_norm": 1.8459240198135376, "learning_rate": 2.2029116000644544e-06, "loss": 0.1897, "step": 5010 }, { "epoch": 2.372773009766203, "grad_norm": 1.2090134620666504, "learning_rate": 2.1997104646755763e-06, "loss": 0.2038, "step": 5011 }, { "epoch": 2.3732465226398345, "grad_norm": 0.9758379459381104, "learning_rate": 2.1965113693771725e-06, "loss": 0.1924, "step": 5012 }, { "epoch": 2.3737200355134656, "grad_norm": 1.6864513158798218, "learning_rate": 2.193314315005941e-06, "loss": 0.2004, "step": 5013 }, { "epoch": 2.3741935483870966, "grad_norm": 1.3247301578521729, "learning_rate": 2.190119302398035e-06, "loss": 0.1929, "step": 5014 }, { "epoch": 2.374667061260728, "grad_norm": 1.4092978239059448, "learning_rate": 2.186926332389084e-06, "loss": 0.2078, "step": 5015 }, { "epoch": 2.375140574134359, "grad_norm": 0.9686800241470337, "learning_rate": 2.1837354058141756e-06, "loss": 0.1993, "step": 5016 }, { "epoch": 2.3756140870079907, "grad_norm": 1.527159571647644, "learning_rate": 2.1805465235078695e-06, "loss": 0.191, "step": 5017 }, { "epoch": 2.3760875998816218, "grad_norm": 1.4575700759887695, "learning_rate": 2.177359686304181e-06, "loss": 0.203, "step": 5018 }, { "epoch": 2.376561112755253, "grad_norm": 1.0823571681976318, "learning_rate": 2.1741748950366036e-06, "loss": 0.2032, "step": 5019 }, { "epoch": 2.3770346256288843, "grad_norm": 1.1299647092819214, "learning_rate": 2.170992150538085e-06, "loss": 0.1946, "step": 5020 }, { "epoch": 2.3775081385025154, "grad_norm": 1.379338264465332, "learning_rate": 2.167811453641039e-06, "loss": 0.2056, "step": 5021 }, { "epoch": 2.377981651376147, "grad_norm": 1.223885416984558, "learning_rate": 2.1646328051773512e-06, "loss": 0.2183, "step": 5022 }, { "epoch": 2.378455164249778, "grad_norm": 1.0248559713363647, "learning_rate": 2.1614562059783627e-06, "loss": 0.1936, "step": 5023 }, { "epoch": 2.378928677123409, "grad_norm": 1.0929266214370728, "learning_rate": 2.1582816568748856e-06, "loss": 0.1954, "step": 5024 }, { "epoch": 2.3794021899970406, "grad_norm": 1.3425700664520264, "learning_rate": 2.155109158697187e-06, "loss": 0.1968, "step": 5025 }, { "epoch": 2.3798757028706716, "grad_norm": 1.4846118688583374, "learning_rate": 2.151938712275011e-06, "loss": 0.1819, "step": 5026 }, { "epoch": 2.380349215744303, "grad_norm": 1.0532817840576172, "learning_rate": 2.14877031843755e-06, "loss": 0.1961, "step": 5027 }, { "epoch": 2.380822728617934, "grad_norm": 1.0059374570846558, "learning_rate": 2.145603978013473e-06, "loss": 0.2051, "step": 5028 }, { "epoch": 2.3812962414915657, "grad_norm": 1.0236760377883911, "learning_rate": 2.1424396918309e-06, "loss": 0.2084, "step": 5029 }, { "epoch": 2.381769754365197, "grad_norm": 1.142202377319336, "learning_rate": 2.139277460717425e-06, "loss": 0.1947, "step": 5030 }, { "epoch": 2.3822432672388283, "grad_norm": 1.1363552808761597, "learning_rate": 2.1361172855000943e-06, "loss": 0.2102, "step": 5031 }, { "epoch": 2.3827167801124594, "grad_norm": 1.0805346965789795, "learning_rate": 2.1329591670054227e-06, "loss": 0.1846, "step": 5032 }, { "epoch": 2.3831902929860904, "grad_norm": 1.1634663343429565, "learning_rate": 2.1298031060593893e-06, "loss": 0.2285, "step": 5033 }, { "epoch": 2.383663805859722, "grad_norm": 0.9185308814048767, "learning_rate": 2.1266491034874247e-06, "loss": 0.1965, "step": 5034 }, { "epoch": 2.384137318733353, "grad_norm": 0.9834035038948059, "learning_rate": 2.1234971601144362e-06, "loss": 0.2108, "step": 5035 }, { "epoch": 2.3846108316069845, "grad_norm": 1.1302801370620728, "learning_rate": 2.1203472767647782e-06, "loss": 0.1893, "step": 5036 }, { "epoch": 2.3850843444806156, "grad_norm": 1.2327231168746948, "learning_rate": 2.117199454262271e-06, "loss": 0.1993, "step": 5037 }, { "epoch": 2.3855578573542466, "grad_norm": 0.890938401222229, "learning_rate": 2.1140536934302024e-06, "loss": 0.1921, "step": 5038 }, { "epoch": 2.386031370227878, "grad_norm": 0.9080513715744019, "learning_rate": 2.1109099950913105e-06, "loss": 0.1937, "step": 5039 }, { "epoch": 2.386504883101509, "grad_norm": 0.9706762433052063, "learning_rate": 2.107768360067806e-06, "loss": 0.2035, "step": 5040 }, { "epoch": 2.3869783959751407, "grad_norm": 1.112209439277649, "learning_rate": 2.1046287891813445e-06, "loss": 0.2072, "step": 5041 }, { "epoch": 2.387451908848772, "grad_norm": 1.2673587799072266, "learning_rate": 2.1014912832530587e-06, "loss": 0.1955, "step": 5042 }, { "epoch": 2.387925421722403, "grad_norm": 1.109052300453186, "learning_rate": 2.0983558431035266e-06, "loss": 0.1937, "step": 5043 }, { "epoch": 2.3883989345960344, "grad_norm": 0.9777413606643677, "learning_rate": 2.095222469552799e-06, "loss": 0.2019, "step": 5044 }, { "epoch": 2.3888724474696654, "grad_norm": 1.3833292722702026, "learning_rate": 2.0920911634203745e-06, "loss": 0.1948, "step": 5045 }, { "epoch": 2.389345960343297, "grad_norm": 1.2488656044006348, "learning_rate": 2.0889619255252147e-06, "loss": 0.1998, "step": 5046 }, { "epoch": 2.389819473216928, "grad_norm": 1.1769556999206543, "learning_rate": 2.085834756685747e-06, "loss": 0.2172, "step": 5047 }, { "epoch": 2.3902929860905595, "grad_norm": 1.6330255270004272, "learning_rate": 2.082709657719848e-06, "loss": 0.1885, "step": 5048 }, { "epoch": 2.3907664989641906, "grad_norm": 1.0924394130706787, "learning_rate": 2.0795866294448596e-06, "loss": 0.2175, "step": 5049 }, { "epoch": 2.391240011837822, "grad_norm": 1.4790942668914795, "learning_rate": 2.0764656726775767e-06, "loss": 0.1987, "step": 5050 }, { "epoch": 2.391713524711453, "grad_norm": 0.9769918918609619, "learning_rate": 2.0733467882342597e-06, "loss": 0.2065, "step": 5051 }, { "epoch": 2.392187037585084, "grad_norm": 1.1905534267425537, "learning_rate": 2.0702299769306177e-06, "loss": 0.2126, "step": 5052 }, { "epoch": 2.3926605504587157, "grad_norm": 1.0014641284942627, "learning_rate": 2.067115239581828e-06, "loss": 0.1905, "step": 5053 }, { "epoch": 2.393134063332347, "grad_norm": 1.234556794166565, "learning_rate": 2.0640025770025186e-06, "loss": 0.2117, "step": 5054 }, { "epoch": 2.3936075762059783, "grad_norm": 1.1088292598724365, "learning_rate": 2.0608919900067716e-06, "loss": 0.2065, "step": 5055 }, { "epoch": 2.3940810890796094, "grad_norm": 0.9730679988861084, "learning_rate": 2.0577834794081377e-06, "loss": 0.1976, "step": 5056 }, { "epoch": 2.3945546019532404, "grad_norm": 1.591340184211731, "learning_rate": 2.0546770460196117e-06, "loss": 0.1804, "step": 5057 }, { "epoch": 2.395028114826872, "grad_norm": 1.2157362699508667, "learning_rate": 2.051572690653658e-06, "loss": 0.1855, "step": 5058 }, { "epoch": 2.395501627700503, "grad_norm": 1.2474086284637451, "learning_rate": 2.0484704141221845e-06, "loss": 0.1692, "step": 5059 }, { "epoch": 2.3959751405741345, "grad_norm": 0.999910831451416, "learning_rate": 2.0453702172365663e-06, "loss": 0.1992, "step": 5060 }, { "epoch": 2.3964486534477656, "grad_norm": 1.1098980903625488, "learning_rate": 2.0422721008076264e-06, "loss": 0.2495, "step": 5061 }, { "epoch": 2.3969221663213967, "grad_norm": 1.1341627836227417, "learning_rate": 2.0391760656456506e-06, "loss": 0.1914, "step": 5062 }, { "epoch": 2.397395679195028, "grad_norm": 0.9773929119110107, "learning_rate": 2.0360821125603726e-06, "loss": 0.204, "step": 5063 }, { "epoch": 2.3978691920686592, "grad_norm": 1.09663724899292, "learning_rate": 2.0329902423609926e-06, "loss": 0.2104, "step": 5064 }, { "epoch": 2.3983427049422907, "grad_norm": 1.1711968183517456, "learning_rate": 2.0299004558561554e-06, "loss": 0.2043, "step": 5065 }, { "epoch": 2.398816217815922, "grad_norm": 1.7636314630508423, "learning_rate": 2.026812753853962e-06, "loss": 0.1984, "step": 5066 }, { "epoch": 2.3992897306895533, "grad_norm": 0.9198000431060791, "learning_rate": 2.023727137161976e-06, "loss": 0.1887, "step": 5067 }, { "epoch": 2.3997632435631844, "grad_norm": 1.0493696928024292, "learning_rate": 2.020643606587207e-06, "loss": 0.1972, "step": 5068 }, { "epoch": 2.4002367564368154, "grad_norm": 1.075859785079956, "learning_rate": 2.0175621629361274e-06, "loss": 0.1963, "step": 5069 }, { "epoch": 2.400710269310447, "grad_norm": 1.5328881740570068, "learning_rate": 2.0144828070146528e-06, "loss": 0.1893, "step": 5070 }, { "epoch": 2.401183782184078, "grad_norm": 1.1692156791687012, "learning_rate": 2.011405539628163e-06, "loss": 0.181, "step": 5071 }, { "epoch": 2.4016572950577095, "grad_norm": 2.3541417121887207, "learning_rate": 2.0083303615814896e-06, "loss": 0.2215, "step": 5072 }, { "epoch": 2.4021308079313406, "grad_norm": 1.5428338050842285, "learning_rate": 2.00525727367891e-06, "loss": 0.1991, "step": 5073 }, { "epoch": 2.402604320804972, "grad_norm": 0.920405924320221, "learning_rate": 2.002186276724166e-06, "loss": 0.1992, "step": 5074 }, { "epoch": 2.403077833678603, "grad_norm": 1.1372102499008179, "learning_rate": 1.999117371520444e-06, "loss": 0.2079, "step": 5075 }, { "epoch": 2.4035513465522342, "grad_norm": 1.1284953355789185, "learning_rate": 1.9960505588703893e-06, "loss": 0.2023, "step": 5076 }, { "epoch": 2.4040248594258657, "grad_norm": 1.1499290466308594, "learning_rate": 1.9929858395760927e-06, "loss": 0.1872, "step": 5077 }, { "epoch": 2.404498372299497, "grad_norm": 1.525108814239502, "learning_rate": 1.9899232144391077e-06, "loss": 0.2028, "step": 5078 }, { "epoch": 2.4049718851731283, "grad_norm": 1.109409213066101, "learning_rate": 1.986862684260431e-06, "loss": 0.2014, "step": 5079 }, { "epoch": 2.4054453980467594, "grad_norm": 1.8481568098068237, "learning_rate": 1.983804249840513e-06, "loss": 0.1663, "step": 5080 }, { "epoch": 2.4059189109203905, "grad_norm": 0.8443739414215088, "learning_rate": 1.9807479119792618e-06, "loss": 0.1905, "step": 5081 }, { "epoch": 2.406392423794022, "grad_norm": 1.2149391174316406, "learning_rate": 1.9776936714760297e-06, "loss": 0.1994, "step": 5082 }, { "epoch": 2.406865936667653, "grad_norm": 1.261169672012329, "learning_rate": 1.974641529129626e-06, "loss": 0.2054, "step": 5083 }, { "epoch": 2.4073394495412845, "grad_norm": 1.2343029975891113, "learning_rate": 1.971591485738308e-06, "loss": 0.1861, "step": 5084 }, { "epoch": 2.4078129624149156, "grad_norm": 1.0303642749786377, "learning_rate": 1.968543542099787e-06, "loss": 0.2034, "step": 5085 }, { "epoch": 2.4082864752885467, "grad_norm": 1.204028606414795, "learning_rate": 1.9654976990112184e-06, "loss": 0.1955, "step": 5086 }, { "epoch": 2.408759988162178, "grad_norm": 1.6389491558074951, "learning_rate": 1.9624539572692193e-06, "loss": 0.2175, "step": 5087 }, { "epoch": 2.4092335010358092, "grad_norm": 1.201258897781372, "learning_rate": 1.9594123176698467e-06, "loss": 0.1972, "step": 5088 }, { "epoch": 2.4097070139094408, "grad_norm": 1.1844000816345215, "learning_rate": 1.9563727810086155e-06, "loss": 0.193, "step": 5089 }, { "epoch": 2.410180526783072, "grad_norm": 1.0625587701797485, "learning_rate": 1.953335348080484e-06, "loss": 0.2036, "step": 5090 }, { "epoch": 2.4106540396567033, "grad_norm": 1.2302863597869873, "learning_rate": 1.950300019679863e-06, "loss": 0.2189, "step": 5091 }, { "epoch": 2.4111275525303344, "grad_norm": 0.8708613514900208, "learning_rate": 1.9472667966006177e-06, "loss": 0.1779, "step": 5092 }, { "epoch": 2.411601065403966, "grad_norm": 1.382256031036377, "learning_rate": 1.944235679636053e-06, "loss": 0.1904, "step": 5093 }, { "epoch": 2.412074578277597, "grad_norm": 1.4713544845581055, "learning_rate": 1.941206669578933e-06, "loss": 0.218, "step": 5094 }, { "epoch": 2.412548091151228, "grad_norm": 1.513837456703186, "learning_rate": 1.9381797672214618e-06, "loss": 0.201, "step": 5095 }, { "epoch": 2.4130216040248595, "grad_norm": 1.2420634031295776, "learning_rate": 1.935154973355302e-06, "loss": 0.225, "step": 5096 }, { "epoch": 2.4134951168984906, "grad_norm": 1.2236741781234741, "learning_rate": 1.9321322887715533e-06, "loss": 0.1928, "step": 5097 }, { "epoch": 2.413968629772122, "grad_norm": 1.1855653524398804, "learning_rate": 1.929111714260774e-06, "loss": 0.2099, "step": 5098 }, { "epoch": 2.414442142645753, "grad_norm": 1.738868236541748, "learning_rate": 1.9260932506129647e-06, "loss": 0.1922, "step": 5099 }, { "epoch": 2.4149156555193843, "grad_norm": 1.1198941469192505, "learning_rate": 1.923076898617574e-06, "loss": 0.2124, "step": 5100 }, { "epoch": 2.4153891683930158, "grad_norm": 0.9823029041290283, "learning_rate": 1.920062659063503e-06, "loss": 0.2041, "step": 5101 }, { "epoch": 2.415862681266647, "grad_norm": 1.4813225269317627, "learning_rate": 1.917050532739092e-06, "loss": 0.2001, "step": 5102 }, { "epoch": 2.4163361941402783, "grad_norm": 1.1506394147872925, "learning_rate": 1.9140405204321387e-06, "loss": 0.1975, "step": 5103 }, { "epoch": 2.4168097070139094, "grad_norm": 1.1538875102996826, "learning_rate": 1.911032622929879e-06, "loss": 0.1927, "step": 5104 }, { "epoch": 2.4172832198875405, "grad_norm": 1.162630558013916, "learning_rate": 1.908026841019003e-06, "loss": 0.2028, "step": 5105 }, { "epoch": 2.417756732761172, "grad_norm": 1.90775465965271, "learning_rate": 1.9050231754856407e-06, "loss": 0.2086, "step": 5106 }, { "epoch": 2.418230245634803, "grad_norm": 1.8542652130126953, "learning_rate": 1.9020216271153747e-06, "loss": 0.2132, "step": 5107 }, { "epoch": 2.4187037585084346, "grad_norm": 1.7880303859710693, "learning_rate": 1.8990221966932266e-06, "loss": 0.207, "step": 5108 }, { "epoch": 2.4191772713820656, "grad_norm": 1.4888118505477905, "learning_rate": 1.8960248850036722e-06, "loss": 0.234, "step": 5109 }, { "epoch": 2.419650784255697, "grad_norm": 1.0973647832870483, "learning_rate": 1.8930296928306313e-06, "loss": 0.1932, "step": 5110 }, { "epoch": 2.420124297129328, "grad_norm": 1.4137020111083984, "learning_rate": 1.8900366209574627e-06, "loss": 0.199, "step": 5111 }, { "epoch": 2.4205978100029597, "grad_norm": 1.0636194944381714, "learning_rate": 1.8870456701669792e-06, "loss": 0.1843, "step": 5112 }, { "epoch": 2.4210713228765908, "grad_norm": 0.9531354308128357, "learning_rate": 1.8840568412414318e-06, "loss": 0.2034, "step": 5113 }, { "epoch": 2.421544835750222, "grad_norm": 1.3613364696502686, "learning_rate": 1.8810701349625237e-06, "loss": 0.1957, "step": 5114 }, { "epoch": 2.4220183486238533, "grad_norm": 1.619372844696045, "learning_rate": 1.8780855521113983e-06, "loss": 0.1774, "step": 5115 }, { "epoch": 2.4224918614974844, "grad_norm": 1.429699182510376, "learning_rate": 1.8751030934686398e-06, "loss": 0.2106, "step": 5116 }, { "epoch": 2.422965374371116, "grad_norm": 1.0113288164138794, "learning_rate": 1.8721227598142876e-06, "loss": 0.2062, "step": 5117 }, { "epoch": 2.423438887244747, "grad_norm": 1.9973174333572388, "learning_rate": 1.869144551927814e-06, "loss": 0.2193, "step": 5118 }, { "epoch": 2.423912400118378, "grad_norm": 1.051254153251648, "learning_rate": 1.8661684705881456e-06, "loss": 0.198, "step": 5119 }, { "epoch": 2.4243859129920096, "grad_norm": 1.627144455909729, "learning_rate": 1.863194516573642e-06, "loss": 0.1883, "step": 5120 }, { "epoch": 2.4248594258656406, "grad_norm": 1.1323554515838623, "learning_rate": 1.860222690662119e-06, "loss": 0.2051, "step": 5121 }, { "epoch": 2.425332938739272, "grad_norm": 1.0321543216705322, "learning_rate": 1.8572529936308225e-06, "loss": 0.2016, "step": 5122 }, { "epoch": 2.425806451612903, "grad_norm": 1.293243646621704, "learning_rate": 1.8542854262564537e-06, "loss": 0.2199, "step": 5123 }, { "epoch": 2.4262799644865343, "grad_norm": 1.043289065361023, "learning_rate": 1.851319989315149e-06, "loss": 0.1967, "step": 5124 }, { "epoch": 2.426753477360166, "grad_norm": 1.376531720161438, "learning_rate": 1.8483566835824862e-06, "loss": 0.2124, "step": 5125 }, { "epoch": 2.427226990233797, "grad_norm": 1.3981205224990845, "learning_rate": 1.8453955098334953e-06, "loss": 0.2134, "step": 5126 }, { "epoch": 2.4277005031074284, "grad_norm": 1.1886367797851562, "learning_rate": 1.8424364688426365e-06, "loss": 0.1731, "step": 5127 }, { "epoch": 2.4281740159810594, "grad_norm": 0.9211165308952332, "learning_rate": 1.8394795613838256e-06, "loss": 0.1854, "step": 5128 }, { "epoch": 2.428647528854691, "grad_norm": 1.1072946786880493, "learning_rate": 1.8365247882304061e-06, "loss": 0.1827, "step": 5129 }, { "epoch": 2.429121041728322, "grad_norm": 0.8516340851783752, "learning_rate": 1.8335721501551774e-06, "loss": 0.2105, "step": 5130 }, { "epoch": 2.429594554601953, "grad_norm": 1.1309757232666016, "learning_rate": 1.8306216479303663e-06, "loss": 0.195, "step": 5131 }, { "epoch": 2.4300680674755846, "grad_norm": 1.1593215465545654, "learning_rate": 1.8276732823276556e-06, "loss": 0.2171, "step": 5132 }, { "epoch": 2.4305415803492156, "grad_norm": 0.949464738368988, "learning_rate": 1.8247270541181572e-06, "loss": 0.2146, "step": 5133 }, { "epoch": 2.431015093222847, "grad_norm": 1.4666881561279297, "learning_rate": 1.8217829640724271e-06, "loss": 0.2085, "step": 5134 }, { "epoch": 2.431488606096478, "grad_norm": 1.130859136581421, "learning_rate": 1.8188410129604684e-06, "loss": 0.2297, "step": 5135 }, { "epoch": 2.4319621189701097, "grad_norm": 1.016351342201233, "learning_rate": 1.8159012015517152e-06, "loss": 0.1865, "step": 5136 }, { "epoch": 2.432435631843741, "grad_norm": 1.0443476438522339, "learning_rate": 1.8129635306150517e-06, "loss": 0.2097, "step": 5137 }, { "epoch": 2.432909144717372, "grad_norm": 1.0505341291427612, "learning_rate": 1.8100280009187931e-06, "loss": 0.1951, "step": 5138 }, { "epoch": 2.4333826575910034, "grad_norm": 1.0258119106292725, "learning_rate": 1.8070946132307033e-06, "loss": 0.2119, "step": 5139 }, { "epoch": 2.4338561704646344, "grad_norm": 1.3897591829299927, "learning_rate": 1.804163368317976e-06, "loss": 0.2081, "step": 5140 }, { "epoch": 2.434329683338266, "grad_norm": 1.1299906969070435, "learning_rate": 1.801234266947256e-06, "loss": 0.187, "step": 5141 }, { "epoch": 2.434803196211897, "grad_norm": 1.4262118339538574, "learning_rate": 1.798307309884616e-06, "loss": 0.2022, "step": 5142 }, { "epoch": 2.435276709085528, "grad_norm": 1.6670175790786743, "learning_rate": 1.795382497895578e-06, "loss": 0.2016, "step": 5143 }, { "epoch": 2.4357502219591596, "grad_norm": 1.0870014429092407, "learning_rate": 1.792459831745097e-06, "loss": 0.205, "step": 5144 }, { "epoch": 2.4362237348327906, "grad_norm": 1.5366194248199463, "learning_rate": 1.7895393121975646e-06, "loss": 0.1978, "step": 5145 }, { "epoch": 2.436697247706422, "grad_norm": 1.1896754503250122, "learning_rate": 1.7866209400168211e-06, "loss": 0.2016, "step": 5146 }, { "epoch": 2.437170760580053, "grad_norm": 1.7006498575210571, "learning_rate": 1.7837047159661302e-06, "loss": 0.1941, "step": 5147 }, { "epoch": 2.4376442734536843, "grad_norm": 1.0235669612884521, "learning_rate": 1.7807906408082087e-06, "loss": 0.2014, "step": 5148 }, { "epoch": 2.438117786327316, "grad_norm": 1.008035659790039, "learning_rate": 1.7778787153052045e-06, "loss": 0.2089, "step": 5149 }, { "epoch": 2.438591299200947, "grad_norm": 1.0255095958709717, "learning_rate": 1.7749689402186998e-06, "loss": 0.1944, "step": 5150 }, { "epoch": 2.4390648120745784, "grad_norm": 1.1309661865234375, "learning_rate": 1.7720613163097233e-06, "loss": 0.1999, "step": 5151 }, { "epoch": 2.4395383249482094, "grad_norm": 1.1396714448928833, "learning_rate": 1.7691558443387302e-06, "loss": 0.1979, "step": 5152 }, { "epoch": 2.440011837821841, "grad_norm": 1.2275151014328003, "learning_rate": 1.766252525065625e-06, "loss": 0.1792, "step": 5153 }, { "epoch": 2.440485350695472, "grad_norm": 1.1925066709518433, "learning_rate": 1.7633513592497354e-06, "loss": 0.2024, "step": 5154 }, { "epoch": 2.4409588635691035, "grad_norm": 0.9196386933326721, "learning_rate": 1.7604523476498413e-06, "loss": 0.1739, "step": 5155 }, { "epoch": 2.4414323764427346, "grad_norm": 1.4410473108291626, "learning_rate": 1.7575554910241444e-06, "loss": 0.1839, "step": 5156 }, { "epoch": 2.4419058893163657, "grad_norm": 1.2430617809295654, "learning_rate": 1.7546607901302948e-06, "loss": 0.2111, "step": 5157 }, { "epoch": 2.442379402189997, "grad_norm": 1.0924980640411377, "learning_rate": 1.7517682457253715e-06, "loss": 0.218, "step": 5158 }, { "epoch": 2.4428529150636282, "grad_norm": 1.1421464681625366, "learning_rate": 1.7488778585658894e-06, "loss": 0.2173, "step": 5159 }, { "epoch": 2.4433264279372597, "grad_norm": 1.0318294763565063, "learning_rate": 1.745989629407806e-06, "loss": 0.2207, "step": 5160 }, { "epoch": 2.443799940810891, "grad_norm": 1.0395057201385498, "learning_rate": 1.7431035590065037e-06, "loss": 0.174, "step": 5161 }, { "epoch": 2.444273453684522, "grad_norm": 1.2233282327651978, "learning_rate": 1.7402196481168132e-06, "loss": 0.2084, "step": 5162 }, { "epoch": 2.4447469665581534, "grad_norm": 1.3540483713150024, "learning_rate": 1.7373378974929878e-06, "loss": 0.1778, "step": 5163 }, { "epoch": 2.4452204794317844, "grad_norm": 1.648417353630066, "learning_rate": 1.7344583078887255e-06, "loss": 0.2016, "step": 5164 }, { "epoch": 2.445693992305416, "grad_norm": 1.0285402536392212, "learning_rate": 1.731580880057152e-06, "loss": 0.2081, "step": 5165 }, { "epoch": 2.446167505179047, "grad_norm": 1.1315886974334717, "learning_rate": 1.7287056147508353e-06, "loss": 0.2134, "step": 5166 }, { "epoch": 2.446641018052678, "grad_norm": 1.0791397094726562, "learning_rate": 1.7258325127217668e-06, "loss": 0.1921, "step": 5167 }, { "epoch": 2.4471145309263096, "grad_norm": 1.0540249347686768, "learning_rate": 1.7229615747213858e-06, "loss": 0.2018, "step": 5168 }, { "epoch": 2.4475880437999407, "grad_norm": 0.8960132002830505, "learning_rate": 1.7200928015005546e-06, "loss": 0.1999, "step": 5169 }, { "epoch": 2.448061556673572, "grad_norm": 1.0253756046295166, "learning_rate": 1.7172261938095713e-06, "loss": 0.202, "step": 5170 }, { "epoch": 2.4485350695472032, "grad_norm": 1.6087161302566528, "learning_rate": 1.7143617523981737e-06, "loss": 0.208, "step": 5171 }, { "epoch": 2.4490085824208347, "grad_norm": 1.3046700954437256, "learning_rate": 1.7114994780155236e-06, "loss": 0.2251, "step": 5172 }, { "epoch": 2.449482095294466, "grad_norm": 0.9480929374694824, "learning_rate": 1.7086393714102278e-06, "loss": 0.1973, "step": 5173 }, { "epoch": 2.449955608168097, "grad_norm": 1.570953369140625, "learning_rate": 1.7057814333303146e-06, "loss": 0.1994, "step": 5174 }, { "epoch": 2.4504291210417284, "grad_norm": 0.91864013671875, "learning_rate": 1.7029256645232529e-06, "loss": 0.1762, "step": 5175 }, { "epoch": 2.4509026339153595, "grad_norm": 1.4908578395843506, "learning_rate": 1.7000720657359383e-06, "loss": 0.2058, "step": 5176 }, { "epoch": 2.451376146788991, "grad_norm": 1.3648159503936768, "learning_rate": 1.6972206377147072e-06, "loss": 0.2089, "step": 5177 }, { "epoch": 2.451849659662622, "grad_norm": 1.1475952863693237, "learning_rate": 1.6943713812053185e-06, "loss": 0.2, "step": 5178 }, { "epoch": 2.4523231725362535, "grad_norm": 1.2770181894302368, "learning_rate": 1.6915242969529676e-06, "loss": 0.2011, "step": 5179 }, { "epoch": 2.4527966854098846, "grad_norm": 0.8871325850486755, "learning_rate": 1.6886793857022866e-06, "loss": 0.1983, "step": 5180 }, { "epoch": 2.4532701982835157, "grad_norm": 1.2607122659683228, "learning_rate": 1.6858366481973288e-06, "loss": 0.1899, "step": 5181 }, { "epoch": 2.453743711157147, "grad_norm": 0.8917462825775146, "learning_rate": 1.6829960851815896e-06, "loss": 0.1821, "step": 5182 }, { "epoch": 2.4542172240307782, "grad_norm": 0.9355318546295166, "learning_rate": 1.680157697397986e-06, "loss": 0.1975, "step": 5183 }, { "epoch": 2.4546907369044098, "grad_norm": 2.099505662918091, "learning_rate": 1.6773214855888765e-06, "loss": 0.1737, "step": 5184 }, { "epoch": 2.455164249778041, "grad_norm": 1.3527806997299194, "learning_rate": 1.6744874504960395e-06, "loss": 0.1821, "step": 5185 }, { "epoch": 2.455637762651672, "grad_norm": 1.080901026725769, "learning_rate": 1.6716555928606959e-06, "loss": 0.1998, "step": 5186 }, { "epoch": 2.4561112755253034, "grad_norm": 1.0348796844482422, "learning_rate": 1.668825913423483e-06, "loss": 0.204, "step": 5187 }, { "epoch": 2.4565847883989345, "grad_norm": 1.0990900993347168, "learning_rate": 1.665998412924481e-06, "loss": 0.1958, "step": 5188 }, { "epoch": 2.457058301272566, "grad_norm": 1.2333770990371704, "learning_rate": 1.6631730921031964e-06, "loss": 0.2101, "step": 5189 }, { "epoch": 2.457531814146197, "grad_norm": 1.7082300186157227, "learning_rate": 1.660349951698561e-06, "loss": 0.1913, "step": 5190 }, { "epoch": 2.4580053270198285, "grad_norm": 1.0094937086105347, "learning_rate": 1.6575289924489435e-06, "loss": 0.1842, "step": 5191 }, { "epoch": 2.4584788398934596, "grad_norm": 1.1216864585876465, "learning_rate": 1.6547102150921346e-06, "loss": 0.2108, "step": 5192 }, { "epoch": 2.4589523527670907, "grad_norm": 2.20843505859375, "learning_rate": 1.6518936203653636e-06, "loss": 0.184, "step": 5193 }, { "epoch": 2.459425865640722, "grad_norm": 0.8962819576263428, "learning_rate": 1.6490792090052799e-06, "loss": 0.2049, "step": 5194 }, { "epoch": 2.4598993785143533, "grad_norm": 1.1314330101013184, "learning_rate": 1.6462669817479638e-06, "loss": 0.21, "step": 5195 }, { "epoch": 2.4603728913879848, "grad_norm": 1.110121726989746, "learning_rate": 1.6434569393289313e-06, "loss": 0.1899, "step": 5196 }, { "epoch": 2.460846404261616, "grad_norm": 1.1242990493774414, "learning_rate": 1.6406490824831166e-06, "loss": 0.2234, "step": 5197 }, { "epoch": 2.4613199171352473, "grad_norm": 0.946533739566803, "learning_rate": 1.6378434119448939e-06, "loss": 0.1964, "step": 5198 }, { "epoch": 2.4617934300088784, "grad_norm": 1.0624065399169922, "learning_rate": 1.6350399284480523e-06, "loss": 0.1787, "step": 5199 }, { "epoch": 2.4622669428825095, "grad_norm": 0.9491204619407654, "learning_rate": 1.632238632725821e-06, "loss": 0.1927, "step": 5200 }, { "epoch": 2.462740455756141, "grad_norm": 1.2071702480316162, "learning_rate": 1.6294395255108487e-06, "loss": 0.1941, "step": 5201 }, { "epoch": 2.463213968629772, "grad_norm": 1.1479326486587524, "learning_rate": 1.6266426075352182e-06, "loss": 0.2133, "step": 5202 }, { "epoch": 2.4636874815034036, "grad_norm": 0.9430690407752991, "learning_rate": 1.6238478795304346e-06, "loss": 0.2007, "step": 5203 }, { "epoch": 2.4641609943770346, "grad_norm": 1.0291746854782104, "learning_rate": 1.62105534222743e-06, "loss": 0.1892, "step": 5204 }, { "epoch": 2.4646345072506657, "grad_norm": 1.4124772548675537, "learning_rate": 1.61826499635657e-06, "loss": 0.1997, "step": 5205 }, { "epoch": 2.465108020124297, "grad_norm": 1.0831562280654907, "learning_rate": 1.6154768426476375e-06, "loss": 0.2015, "step": 5206 }, { "epoch": 2.4655815329979283, "grad_norm": 1.5308729410171509, "learning_rate": 1.6126908818298514e-06, "loss": 0.1886, "step": 5207 }, { "epoch": 2.4660550458715598, "grad_norm": 1.0939233303070068, "learning_rate": 1.6099071146318502e-06, "loss": 0.2037, "step": 5208 }, { "epoch": 2.466528558745191, "grad_norm": 1.4337128400802612, "learning_rate": 1.6071255417817045e-06, "loss": 0.1942, "step": 5209 }, { "epoch": 2.467002071618822, "grad_norm": 1.2987715005874634, "learning_rate": 1.6043461640069025e-06, "loss": 0.1953, "step": 5210 }, { "epoch": 2.4674755844924534, "grad_norm": 1.0721392631530762, "learning_rate": 1.6015689820343705e-06, "loss": 0.2063, "step": 5211 }, { "epoch": 2.4679490973660845, "grad_norm": 1.0339511632919312, "learning_rate": 1.5987939965904498e-06, "loss": 0.1766, "step": 5212 }, { "epoch": 2.468422610239716, "grad_norm": 1.3787801265716553, "learning_rate": 1.5960212084009097e-06, "loss": 0.2037, "step": 5213 }, { "epoch": 2.468896123113347, "grad_norm": 1.1688666343688965, "learning_rate": 1.593250618190949e-06, "loss": 0.1851, "step": 5214 }, { "epoch": 2.4693696359869786, "grad_norm": 1.2239266633987427, "learning_rate": 1.590482226685186e-06, "loss": 0.2122, "step": 5215 }, { "epoch": 2.4698431488606096, "grad_norm": 1.1540725231170654, "learning_rate": 1.5877160346076714e-06, "loss": 0.213, "step": 5216 }, { "epoch": 2.470316661734241, "grad_norm": 0.8362421989440918, "learning_rate": 1.584952042681871e-06, "loss": 0.1734, "step": 5217 }, { "epoch": 2.470790174607872, "grad_norm": 1.1025753021240234, "learning_rate": 1.5821902516306842e-06, "loss": 0.1877, "step": 5218 }, { "epoch": 2.4712636874815033, "grad_norm": 1.23174250125885, "learning_rate": 1.5794306621764265e-06, "loss": 0.1998, "step": 5219 }, { "epoch": 2.471737200355135, "grad_norm": 1.2430462837219238, "learning_rate": 1.5766732750408465e-06, "loss": 0.2024, "step": 5220 }, { "epoch": 2.472210713228766, "grad_norm": 1.0162687301635742, "learning_rate": 1.573918090945109e-06, "loss": 0.2102, "step": 5221 }, { "epoch": 2.4726842261023974, "grad_norm": 1.1504173278808594, "learning_rate": 1.571165110609808e-06, "loss": 0.2077, "step": 5222 }, { "epoch": 2.4731577389760284, "grad_norm": 0.9471516609191895, "learning_rate": 1.5684143347549586e-06, "loss": 0.1864, "step": 5223 }, { "epoch": 2.4736312518496595, "grad_norm": 1.431763768196106, "learning_rate": 1.5656657640999973e-06, "loss": 0.1995, "step": 5224 }, { "epoch": 2.474104764723291, "grad_norm": 1.2276103496551514, "learning_rate": 1.562919399363787e-06, "loss": 0.2024, "step": 5225 }, { "epoch": 2.474578277596922, "grad_norm": 1.1695070266723633, "learning_rate": 1.5601752412646143e-06, "loss": 0.1863, "step": 5226 }, { "epoch": 2.4750517904705536, "grad_norm": 1.6259397268295288, "learning_rate": 1.5574332905201883e-06, "loss": 0.1961, "step": 5227 }, { "epoch": 2.4755253033441846, "grad_norm": 1.0487055778503418, "learning_rate": 1.554693547847639e-06, "loss": 0.1864, "step": 5228 }, { "epoch": 2.4759988162178157, "grad_norm": 1.0676391124725342, "learning_rate": 1.551956013963517e-06, "loss": 0.2179, "step": 5229 }, { "epoch": 2.476472329091447, "grad_norm": 1.1580907106399536, "learning_rate": 1.5492206895838013e-06, "loss": 0.198, "step": 5230 }, { "epoch": 2.4769458419650783, "grad_norm": 1.0551162958145142, "learning_rate": 1.546487575423886e-06, "loss": 0.1941, "step": 5231 }, { "epoch": 2.47741935483871, "grad_norm": 1.4372506141662598, "learning_rate": 1.5437566721985952e-06, "loss": 0.1876, "step": 5232 }, { "epoch": 2.477892867712341, "grad_norm": 1.0641188621520996, "learning_rate": 1.5410279806221662e-06, "loss": 0.2156, "step": 5233 }, { "epoch": 2.4783663805859724, "grad_norm": 1.8951334953308105, "learning_rate": 1.5383015014082659e-06, "loss": 0.1862, "step": 5234 }, { "epoch": 2.4788398934596034, "grad_norm": 1.3256562948226929, "learning_rate": 1.5355772352699738e-06, "loss": 0.1894, "step": 5235 }, { "epoch": 2.4793134063332345, "grad_norm": 0.990827739238739, "learning_rate": 1.5328551829198e-06, "loss": 0.1896, "step": 5236 }, { "epoch": 2.479786919206866, "grad_norm": 1.4543442726135254, "learning_rate": 1.53013534506967e-06, "loss": 0.2072, "step": 5237 }, { "epoch": 2.480260432080497, "grad_norm": 1.17643141746521, "learning_rate": 1.5274177224309273e-06, "loss": 0.2118, "step": 5238 }, { "epoch": 2.4807339449541286, "grad_norm": 1.1279579401016235, "learning_rate": 1.5247023157143459e-06, "loss": 0.2043, "step": 5239 }, { "epoch": 2.4812074578277596, "grad_norm": 1.2052768468856812, "learning_rate": 1.5219891256301079e-06, "loss": 0.2013, "step": 5240 }, { "epoch": 2.481680970701391, "grad_norm": 1.2631618976593018, "learning_rate": 1.5192781528878285e-06, "loss": 0.2011, "step": 5241 }, { "epoch": 2.482154483575022, "grad_norm": 1.0374186038970947, "learning_rate": 1.5165693981965302e-06, "loss": 0.1748, "step": 5242 }, { "epoch": 2.4826279964486533, "grad_norm": 1.1341975927352905, "learning_rate": 1.513862862264668e-06, "loss": 0.214, "step": 5243 }, { "epoch": 2.483101509322285, "grad_norm": 0.9370356202125549, "learning_rate": 1.5111585458001032e-06, "loss": 0.1778, "step": 5244 }, { "epoch": 2.483575022195916, "grad_norm": 1.78024423122406, "learning_rate": 1.5084564495101306e-06, "loss": 0.1897, "step": 5245 }, { "epoch": 2.4840485350695474, "grad_norm": 1.1708461046218872, "learning_rate": 1.5057565741014513e-06, "loss": 0.2309, "step": 5246 }, { "epoch": 2.4845220479431784, "grad_norm": 1.8832025527954102, "learning_rate": 1.5030589202801982e-06, "loss": 0.2073, "step": 5247 }, { "epoch": 2.4849955608168095, "grad_norm": 1.0782551765441895, "learning_rate": 1.5003634887519126e-06, "loss": 0.2115, "step": 5248 }, { "epoch": 2.485469073690441, "grad_norm": 2.393939733505249, "learning_rate": 1.497670280221556e-06, "loss": 0.202, "step": 5249 }, { "epoch": 2.485942586564072, "grad_norm": 1.293050765991211, "learning_rate": 1.4949792953935172e-06, "loss": 0.1859, "step": 5250 }, { "epoch": 2.4864160994377036, "grad_norm": 1.0516685247421265, "learning_rate": 1.4922905349715922e-06, "loss": 0.2269, "step": 5251 }, { "epoch": 2.4868896123113347, "grad_norm": 0.9482864141464233, "learning_rate": 1.489603999659004e-06, "loss": 0.1947, "step": 5252 }, { "epoch": 2.4873631251849657, "grad_norm": 1.2541604042053223, "learning_rate": 1.486919690158386e-06, "loss": 0.2006, "step": 5253 }, { "epoch": 2.4878366380585972, "grad_norm": 1.3161418437957764, "learning_rate": 1.4842376071717989e-06, "loss": 0.2067, "step": 5254 }, { "epoch": 2.4883101509322283, "grad_norm": 1.2702239751815796, "learning_rate": 1.4815577514007106e-06, "loss": 0.1866, "step": 5255 }, { "epoch": 2.48878366380586, "grad_norm": 1.2881168127059937, "learning_rate": 1.478880123546015e-06, "loss": 0.1928, "step": 5256 }, { "epoch": 2.489257176679491, "grad_norm": 1.0193349123001099, "learning_rate": 1.476204724308019e-06, "loss": 0.2026, "step": 5257 }, { "epoch": 2.4897306895531224, "grad_norm": 1.3195552825927734, "learning_rate": 1.4735315543864436e-06, "loss": 0.1803, "step": 5258 }, { "epoch": 2.4902042024267534, "grad_norm": 1.2184456586837769, "learning_rate": 1.4708606144804371e-06, "loss": 0.2057, "step": 5259 }, { "epoch": 2.490677715300385, "grad_norm": 1.221314549446106, "learning_rate": 1.468191905288553e-06, "loss": 0.1912, "step": 5260 }, { "epoch": 2.491151228174016, "grad_norm": 1.2005162239074707, "learning_rate": 1.4655254275087693e-06, "loss": 0.2101, "step": 5261 }, { "epoch": 2.491624741047647, "grad_norm": 2.1400723457336426, "learning_rate": 1.4628611818384753e-06, "loss": 0.1901, "step": 5262 }, { "epoch": 2.4920982539212786, "grad_norm": 1.6290489435195923, "learning_rate": 1.460199168974481e-06, "loss": 0.1929, "step": 5263 }, { "epoch": 2.4925717667949097, "grad_norm": 1.020664095878601, "learning_rate": 1.4575393896130073e-06, "loss": 0.1879, "step": 5264 }, { "epoch": 2.493045279668541, "grad_norm": 0.9219028353691101, "learning_rate": 1.454881844449697e-06, "loss": 0.2014, "step": 5265 }, { "epoch": 2.4935187925421722, "grad_norm": 1.7500580549240112, "learning_rate": 1.4522265341796048e-06, "loss": 0.2044, "step": 5266 }, { "epoch": 2.4939923054158033, "grad_norm": 1.1425151824951172, "learning_rate": 1.4495734594971988e-06, "loss": 0.2053, "step": 5267 }, { "epoch": 2.494465818289435, "grad_norm": 1.0946452617645264, "learning_rate": 1.4469226210963693e-06, "loss": 0.2082, "step": 5268 }, { "epoch": 2.494939331163066, "grad_norm": 1.198346495628357, "learning_rate": 1.444274019670413e-06, "loss": 0.1991, "step": 5269 }, { "epoch": 2.4954128440366974, "grad_norm": 1.0651274919509888, "learning_rate": 1.4416276559120511e-06, "loss": 0.2051, "step": 5270 }, { "epoch": 2.4958863569103285, "grad_norm": 1.0308165550231934, "learning_rate": 1.4389835305134092e-06, "loss": 0.1973, "step": 5271 }, { "epoch": 2.4963598697839595, "grad_norm": 1.7552647590637207, "learning_rate": 1.436341644166037e-06, "loss": 0.2061, "step": 5272 }, { "epoch": 2.496833382657591, "grad_norm": 1.7813093662261963, "learning_rate": 1.4337019975608934e-06, "loss": 0.1912, "step": 5273 }, { "epoch": 2.497306895531222, "grad_norm": 1.5826293230056763, "learning_rate": 1.4310645913883493e-06, "loss": 0.2167, "step": 5274 }, { "epoch": 2.4977804084048536, "grad_norm": 0.9368717670440674, "learning_rate": 1.4284294263381982e-06, "loss": 0.2067, "step": 5275 }, { "epoch": 2.4982539212784847, "grad_norm": 1.000964879989624, "learning_rate": 1.4257965030996357e-06, "loss": 0.1868, "step": 5276 }, { "epoch": 2.498727434152116, "grad_norm": 1.1660606861114502, "learning_rate": 1.4231658223612842e-06, "loss": 0.2016, "step": 5277 }, { "epoch": 2.4992009470257472, "grad_norm": 1.216518521308899, "learning_rate": 1.420537384811167e-06, "loss": 0.2274, "step": 5278 }, { "epoch": 2.4996744598993788, "grad_norm": 1.3850187063217163, "learning_rate": 1.4179111911367315e-06, "loss": 0.2104, "step": 5279 }, { "epoch": 2.50014797277301, "grad_norm": 1.306488275527954, "learning_rate": 1.4152872420248288e-06, "loss": 0.2175, "step": 5280 }, { "epoch": 2.500621485646641, "grad_norm": 1.0250792503356934, "learning_rate": 1.4126655381617327e-06, "loss": 0.181, "step": 5281 }, { "epoch": 2.5010949985202724, "grad_norm": 1.5189791917800903, "learning_rate": 1.4100460802331205e-06, "loss": 0.2075, "step": 5282 }, { "epoch": 2.5015685113939035, "grad_norm": 1.1778138875961304, "learning_rate": 1.4074288689240856e-06, "loss": 0.1985, "step": 5283 }, { "epoch": 2.502042024267535, "grad_norm": 1.0283559560775757, "learning_rate": 1.4048139049191389e-06, "loss": 0.2098, "step": 5284 }, { "epoch": 2.502515537141166, "grad_norm": 0.9769028425216675, "learning_rate": 1.4022011889021936e-06, "loss": 0.2068, "step": 5285 }, { "epoch": 2.502989050014797, "grad_norm": 1.1810842752456665, "learning_rate": 1.399590721556584e-06, "loss": 0.2083, "step": 5286 }, { "epoch": 2.5034625628884286, "grad_norm": 1.018278956413269, "learning_rate": 1.396982503565051e-06, "loss": 0.2003, "step": 5287 }, { "epoch": 2.5039360757620597, "grad_norm": 1.1597832441329956, "learning_rate": 1.3943765356097505e-06, "loss": 0.1902, "step": 5288 }, { "epoch": 2.504409588635691, "grad_norm": 1.001120686531067, "learning_rate": 1.3917728183722456e-06, "loss": 0.1733, "step": 5289 }, { "epoch": 2.5048831015093223, "grad_norm": 1.1091008186340332, "learning_rate": 1.389171352533517e-06, "loss": 0.2251, "step": 5290 }, { "epoch": 2.5053566143829533, "grad_norm": 0.9414849877357483, "learning_rate": 1.3865721387739507e-06, "loss": 0.1797, "step": 5291 }, { "epoch": 2.505830127256585, "grad_norm": 1.9510407447814941, "learning_rate": 1.3839751777733445e-06, "loss": 0.1881, "step": 5292 }, { "epoch": 2.506303640130216, "grad_norm": 1.3246495723724365, "learning_rate": 1.3813804702109124e-06, "loss": 0.2084, "step": 5293 }, { "epoch": 2.5067771530038474, "grad_norm": 1.0592890977859497, "learning_rate": 1.37878801676527e-06, "loss": 0.2072, "step": 5294 }, { "epoch": 2.5072506658774785, "grad_norm": 1.3438637256622314, "learning_rate": 1.3761978181144542e-06, "loss": 0.2262, "step": 5295 }, { "epoch": 2.5077241787511095, "grad_norm": 1.0453280210494995, "learning_rate": 1.373609874935903e-06, "loss": 0.1857, "step": 5296 }, { "epoch": 2.508197691624741, "grad_norm": 1.3020319938659668, "learning_rate": 1.3710241879064689e-06, "loss": 0.1875, "step": 5297 }, { "epoch": 2.5086712044983726, "grad_norm": 1.245324969291687, "learning_rate": 1.3684407577024116e-06, "loss": 0.1978, "step": 5298 }, { "epoch": 2.5091447173720036, "grad_norm": 1.0261917114257812, "learning_rate": 1.3658595849994072e-06, "loss": 0.199, "step": 5299 }, { "epoch": 2.5096182302456347, "grad_norm": 1.699816107749939, "learning_rate": 1.36328067047253e-06, "loss": 0.2069, "step": 5300 }, { "epoch": 2.510091743119266, "grad_norm": 2.2417402267456055, "learning_rate": 1.360704014796277e-06, "loss": 0.1876, "step": 5301 }, { "epoch": 2.5105652559928973, "grad_norm": 0.9877926707267761, "learning_rate": 1.3581296186445426e-06, "loss": 0.1888, "step": 5302 }, { "epoch": 2.5110387688665288, "grad_norm": 1.183518648147583, "learning_rate": 1.3555574826906337e-06, "loss": 0.1995, "step": 5303 }, { "epoch": 2.51151228174016, "grad_norm": 1.0759626626968384, "learning_rate": 1.3529876076072746e-06, "loss": 0.1998, "step": 5304 }, { "epoch": 2.511985794613791, "grad_norm": 1.130675196647644, "learning_rate": 1.3504199940665852e-06, "loss": 0.1963, "step": 5305 }, { "epoch": 2.5124593074874224, "grad_norm": 1.4707733392715454, "learning_rate": 1.347854642740104e-06, "loss": 0.216, "step": 5306 }, { "epoch": 2.5129328203610535, "grad_norm": 1.2437856197357178, "learning_rate": 1.3452915542987732e-06, "loss": 0.2187, "step": 5307 }, { "epoch": 2.513406333234685, "grad_norm": 1.0514782667160034, "learning_rate": 1.3427307294129411e-06, "loss": 0.2178, "step": 5308 }, { "epoch": 2.513879846108316, "grad_norm": 1.6245321035385132, "learning_rate": 1.3401721687523706e-06, "loss": 0.2095, "step": 5309 }, { "epoch": 2.514353358981947, "grad_norm": 1.0891262292861938, "learning_rate": 1.3376158729862232e-06, "loss": 0.1807, "step": 5310 }, { "epoch": 2.5148268718555786, "grad_norm": 1.054901361465454, "learning_rate": 1.3350618427830796e-06, "loss": 0.2074, "step": 5311 }, { "epoch": 2.5153003847292097, "grad_norm": 1.5967611074447632, "learning_rate": 1.3325100788109168e-06, "loss": 0.1997, "step": 5312 }, { "epoch": 2.515773897602841, "grad_norm": 1.1578254699707031, "learning_rate": 1.3299605817371285e-06, "loss": 0.2056, "step": 5313 }, { "epoch": 2.5162474104764723, "grad_norm": 0.9452877044677734, "learning_rate": 1.327413352228506e-06, "loss": 0.2101, "step": 5314 }, { "epoch": 2.5167209233501033, "grad_norm": 1.0103105306625366, "learning_rate": 1.3248683909512584e-06, "loss": 0.1875, "step": 5315 }, { "epoch": 2.517194436223735, "grad_norm": 1.4317337274551392, "learning_rate": 1.322325698570992e-06, "loss": 0.1962, "step": 5316 }, { "epoch": 2.5176679490973664, "grad_norm": 1.23985755443573, "learning_rate": 1.3197852757527219e-06, "loss": 0.2054, "step": 5317 }, { "epoch": 2.5181414619709974, "grad_norm": 1.0402408838272095, "learning_rate": 1.3172471231608753e-06, "loss": 0.1856, "step": 5318 }, { "epoch": 2.5186149748446285, "grad_norm": 1.3654496669769287, "learning_rate": 1.3147112414592777e-06, "loss": 0.2111, "step": 5319 }, { "epoch": 2.51908848771826, "grad_norm": 1.1006624698638916, "learning_rate": 1.312177631311169e-06, "loss": 0.2167, "step": 5320 }, { "epoch": 2.519562000591891, "grad_norm": 0.9989845752716064, "learning_rate": 1.3096462933791853e-06, "loss": 0.1995, "step": 5321 }, { "epoch": 2.5200355134655226, "grad_norm": 1.1869349479675293, "learning_rate": 1.3071172283253786e-06, "loss": 0.2215, "step": 5322 }, { "epoch": 2.5205090263391536, "grad_norm": 0.9278461933135986, "learning_rate": 1.3045904368111973e-06, "loss": 0.194, "step": 5323 }, { "epoch": 2.5209825392127847, "grad_norm": 1.345420241355896, "learning_rate": 1.3020659194975028e-06, "loss": 0.2158, "step": 5324 }, { "epoch": 2.521456052086416, "grad_norm": 1.0108526945114136, "learning_rate": 1.2995436770445547e-06, "loss": 0.1979, "step": 5325 }, { "epoch": 2.5219295649600473, "grad_norm": 1.0445326566696167, "learning_rate": 1.2970237101120253e-06, "loss": 0.1861, "step": 5326 }, { "epoch": 2.522403077833679, "grad_norm": 1.2074165344238281, "learning_rate": 1.2945060193589852e-06, "loss": 0.2149, "step": 5327 }, { "epoch": 2.52287659070731, "grad_norm": 0.880549430847168, "learning_rate": 1.2919906054439103e-06, "loss": 0.1945, "step": 5328 }, { "epoch": 2.523350103580941, "grad_norm": 1.272971749305725, "learning_rate": 1.289477469024687e-06, "loss": 0.1983, "step": 5329 }, { "epoch": 2.5238236164545724, "grad_norm": 1.3522940874099731, "learning_rate": 1.2869666107585975e-06, "loss": 0.2175, "step": 5330 }, { "epoch": 2.5242971293282035, "grad_norm": 1.0059926509857178, "learning_rate": 1.2844580313023368e-06, "loss": 0.1978, "step": 5331 }, { "epoch": 2.524770642201835, "grad_norm": 0.781419038772583, "learning_rate": 1.2819517313119956e-06, "loss": 0.1766, "step": 5332 }, { "epoch": 2.525244155075466, "grad_norm": 1.2181997299194336, "learning_rate": 1.279447711443077e-06, "loss": 0.1921, "step": 5333 }, { "epoch": 2.525717667949097, "grad_norm": 1.028064489364624, "learning_rate": 1.2769459723504795e-06, "loss": 0.1777, "step": 5334 }, { "epoch": 2.5261911808227286, "grad_norm": 1.0090110301971436, "learning_rate": 1.274446514688511e-06, "loss": 0.2018, "step": 5335 }, { "epoch": 2.5266646936963597, "grad_norm": 1.4929478168487549, "learning_rate": 1.2719493391108806e-06, "loss": 0.1938, "step": 5336 }, { "epoch": 2.527138206569991, "grad_norm": 1.0513087511062622, "learning_rate": 1.2694544462706959e-06, "loss": 0.1879, "step": 5337 }, { "epoch": 2.5276117194436223, "grad_norm": 1.1161446571350098, "learning_rate": 1.2669618368204795e-06, "loss": 0.1997, "step": 5338 }, { "epoch": 2.5280852323172534, "grad_norm": 1.209606647491455, "learning_rate": 1.2644715114121432e-06, "loss": 0.1928, "step": 5339 }, { "epoch": 2.528558745190885, "grad_norm": 1.125483512878418, "learning_rate": 1.2619834706970113e-06, "loss": 0.1892, "step": 5340 }, { "epoch": 2.5290322580645164, "grad_norm": 1.3125486373901367, "learning_rate": 1.2594977153258036e-06, "loss": 0.2068, "step": 5341 }, { "epoch": 2.5295057709381474, "grad_norm": 1.0961823463439941, "learning_rate": 1.2570142459486478e-06, "loss": 0.1934, "step": 5342 }, { "epoch": 2.5299792838117785, "grad_norm": 1.161319375038147, "learning_rate": 1.254533063215072e-06, "loss": 0.1815, "step": 5343 }, { "epoch": 2.53045279668541, "grad_norm": 1.1228846311569214, "learning_rate": 1.2520541677740038e-06, "loss": 0.1877, "step": 5344 }, { "epoch": 2.530926309559041, "grad_norm": 1.2616682052612305, "learning_rate": 1.2495775602737759e-06, "loss": 0.2004, "step": 5345 }, { "epoch": 2.5313998224326726, "grad_norm": 1.3314753770828247, "learning_rate": 1.2471032413621188e-06, "loss": 0.223, "step": 5346 }, { "epoch": 2.5318733353063037, "grad_norm": 1.303731918334961, "learning_rate": 1.2446312116861703e-06, "loss": 0.1988, "step": 5347 }, { "epoch": 2.5323468481799347, "grad_norm": 1.0389883518218994, "learning_rate": 1.2421614718924623e-06, "loss": 0.194, "step": 5348 }, { "epoch": 2.5328203610535662, "grad_norm": 1.4113802909851074, "learning_rate": 1.239694022626935e-06, "loss": 0.1868, "step": 5349 }, { "epoch": 2.5332938739271973, "grad_norm": 1.1160602569580078, "learning_rate": 1.2372288645349207e-06, "loss": 0.1914, "step": 5350 }, { "epoch": 2.533767386800829, "grad_norm": 1.1180630922317505, "learning_rate": 1.2347659982611637e-06, "loss": 0.187, "step": 5351 }, { "epoch": 2.53424089967446, "grad_norm": 2.2258567810058594, "learning_rate": 1.2323054244498001e-06, "loss": 0.1949, "step": 5352 }, { "epoch": 2.534714412548091, "grad_norm": 1.0925301313400269, "learning_rate": 1.2298471437443671e-06, "loss": 0.1731, "step": 5353 }, { "epoch": 2.5351879254217224, "grad_norm": 1.3025660514831543, "learning_rate": 1.2273911567878095e-06, "loss": 0.2104, "step": 5354 }, { "epoch": 2.5356614382953535, "grad_norm": 1.7932027578353882, "learning_rate": 1.224937464222461e-06, "loss": 0.202, "step": 5355 }, { "epoch": 2.536134951168985, "grad_norm": 1.089361310005188, "learning_rate": 1.222486066690066e-06, "loss": 0.2003, "step": 5356 }, { "epoch": 2.536608464042616, "grad_norm": 0.9862686395645142, "learning_rate": 1.22003696483176e-06, "loss": 0.218, "step": 5357 }, { "epoch": 2.537081976916247, "grad_norm": 1.0599945783615112, "learning_rate": 1.2175901592880867e-06, "loss": 0.2115, "step": 5358 }, { "epoch": 2.5375554897898787, "grad_norm": 1.217585563659668, "learning_rate": 1.21514565069898e-06, "loss": 0.1972, "step": 5359 }, { "epoch": 2.53802900266351, "grad_norm": 1.0531421899795532, "learning_rate": 1.2127034397037808e-06, "loss": 0.1863, "step": 5360 }, { "epoch": 2.5385025155371412, "grad_norm": 1.6633493900299072, "learning_rate": 1.2102635269412244e-06, "loss": 0.184, "step": 5361 }, { "epoch": 2.5389760284107723, "grad_norm": 1.1086888313293457, "learning_rate": 1.207825913049445e-06, "loss": 0.2196, "step": 5362 }, { "epoch": 2.539449541284404, "grad_norm": 1.2374646663665771, "learning_rate": 1.2053905986659798e-06, "loss": 0.2041, "step": 5363 }, { "epoch": 2.539923054158035, "grad_norm": 0.9073995351791382, "learning_rate": 1.2029575844277585e-06, "loss": 0.1934, "step": 5364 }, { "epoch": 2.5403965670316664, "grad_norm": 1.31955885887146, "learning_rate": 1.2005268709711172e-06, "loss": 0.1862, "step": 5365 }, { "epoch": 2.5408700799052975, "grad_norm": 0.9501339793205261, "learning_rate": 1.1980984589317802e-06, "loss": 0.1849, "step": 5366 }, { "epoch": 2.5413435927789285, "grad_norm": 1.0616928339004517, "learning_rate": 1.1956723489448796e-06, "loss": 0.2111, "step": 5367 }, { "epoch": 2.54181710565256, "grad_norm": 1.4748432636260986, "learning_rate": 1.1932485416449369e-06, "loss": 0.1916, "step": 5368 }, { "epoch": 2.542290618526191, "grad_norm": 1.1778125762939453, "learning_rate": 1.1908270376658804e-06, "loss": 0.2055, "step": 5369 }, { "epoch": 2.5427641313998226, "grad_norm": 1.297905445098877, "learning_rate": 1.1884078376410291e-06, "loss": 0.2062, "step": 5370 }, { "epoch": 2.5432376442734537, "grad_norm": 0.9151560664176941, "learning_rate": 1.1859909422030991e-06, "loss": 0.1974, "step": 5371 }, { "epoch": 2.5437111571470847, "grad_norm": 1.6805721521377563, "learning_rate": 1.1835763519842092e-06, "loss": 0.2103, "step": 5372 }, { "epoch": 2.5441846700207162, "grad_norm": 1.0770697593688965, "learning_rate": 1.1811640676158686e-06, "loss": 0.1965, "step": 5373 }, { "epoch": 2.5446581828943473, "grad_norm": 1.3352454900741577, "learning_rate": 1.1787540897289918e-06, "loss": 0.2031, "step": 5374 }, { "epoch": 2.545131695767979, "grad_norm": 1.8438594341278076, "learning_rate": 1.176346418953881e-06, "loss": 0.1951, "step": 5375 }, { "epoch": 2.54560520864161, "grad_norm": 1.525089144706726, "learning_rate": 1.1739410559202425e-06, "loss": 0.2543, "step": 5376 }, { "epoch": 2.546078721515241, "grad_norm": 1.3220195770263672, "learning_rate": 1.171538001257172e-06, "loss": 0.2211, "step": 5377 }, { "epoch": 2.5465522343888725, "grad_norm": 1.0386881828308105, "learning_rate": 1.16913725559317e-06, "loss": 0.2002, "step": 5378 }, { "epoch": 2.547025747262504, "grad_norm": 1.009458303451538, "learning_rate": 1.1667388195561247e-06, "loss": 0.1873, "step": 5379 }, { "epoch": 2.547499260136135, "grad_norm": 1.3089097738265991, "learning_rate": 1.164342693773326e-06, "loss": 0.1998, "step": 5380 }, { "epoch": 2.547972773009766, "grad_norm": 1.061797022819519, "learning_rate": 1.161948878871455e-06, "loss": 0.2146, "step": 5381 }, { "epoch": 2.5484462858833976, "grad_norm": 0.9451477527618408, "learning_rate": 1.1595573754765932e-06, "loss": 0.2215, "step": 5382 }, { "epoch": 2.5489197987570287, "grad_norm": 1.7291032075881958, "learning_rate": 1.1571681842142158e-06, "loss": 0.1936, "step": 5383 }, { "epoch": 2.54939331163066, "grad_norm": 1.9363996982574463, "learning_rate": 1.1547813057091906e-06, "loss": 0.2136, "step": 5384 }, { "epoch": 2.5498668245042913, "grad_norm": 0.9909132122993469, "learning_rate": 1.1523967405857838e-06, "loss": 0.1925, "step": 5385 }, { "epoch": 2.5503403373779223, "grad_norm": 1.205962896347046, "learning_rate": 1.1500144894676568e-06, "loss": 0.2009, "step": 5386 }, { "epoch": 2.550813850251554, "grad_norm": 0.9973340034484863, "learning_rate": 1.14763455297786e-06, "loss": 0.1876, "step": 5387 }, { "epoch": 2.551287363125185, "grad_norm": 1.2037270069122314, "learning_rate": 1.1452569317388474e-06, "loss": 0.2058, "step": 5388 }, { "epoch": 2.5517608759988164, "grad_norm": 0.9326767325401306, "learning_rate": 1.1428816263724596e-06, "loss": 0.1998, "step": 5389 }, { "epoch": 2.5522343888724475, "grad_norm": 1.0340579748153687, "learning_rate": 1.1405086374999386e-06, "loss": 0.1951, "step": 5390 }, { "epoch": 2.5527079017460785, "grad_norm": 1.1469632387161255, "learning_rate": 1.1381379657419112e-06, "loss": 0.2013, "step": 5391 }, { "epoch": 2.55318141461971, "grad_norm": 1.0741297006607056, "learning_rate": 1.1357696117184103e-06, "loss": 0.183, "step": 5392 }, { "epoch": 2.553654927493341, "grad_norm": 1.3129887580871582, "learning_rate": 1.1334035760488493e-06, "loss": 0.2213, "step": 5393 }, { "epoch": 2.5541284403669726, "grad_norm": 1.0129112005233765, "learning_rate": 1.1310398593520488e-06, "loss": 0.1879, "step": 5394 }, { "epoch": 2.5546019532406037, "grad_norm": 0.8904241323471069, "learning_rate": 1.128678462246212e-06, "loss": 0.2156, "step": 5395 }, { "epoch": 2.5550754661142348, "grad_norm": 1.214227557182312, "learning_rate": 1.1263193853489384e-06, "loss": 0.2161, "step": 5396 }, { "epoch": 2.5555489789878663, "grad_norm": 1.472237229347229, "learning_rate": 1.1239626292772254e-06, "loss": 0.2028, "step": 5397 }, { "epoch": 2.5560224918614973, "grad_norm": 1.0698444843292236, "learning_rate": 1.1216081946474566e-06, "loss": 0.2018, "step": 5398 }, { "epoch": 2.556496004735129, "grad_norm": 1.198815107345581, "learning_rate": 1.1192560820754151e-06, "loss": 0.2144, "step": 5399 }, { "epoch": 2.55696951760876, "grad_norm": 2.1452925205230713, "learning_rate": 1.1169062921762686e-06, "loss": 0.2089, "step": 5400 }, { "epoch": 2.557443030482391, "grad_norm": 1.0623699426651, "learning_rate": 1.1145588255645868e-06, "loss": 0.2058, "step": 5401 }, { "epoch": 2.5579165433560225, "grad_norm": 1.0359307527542114, "learning_rate": 1.112213682854323e-06, "loss": 0.2041, "step": 5402 }, { "epoch": 2.558390056229654, "grad_norm": 1.2944672107696533, "learning_rate": 1.1098708646588308e-06, "loss": 0.211, "step": 5403 }, { "epoch": 2.558863569103285, "grad_norm": 1.033818006515503, "learning_rate": 1.107530371590847e-06, "loss": 0.1941, "step": 5404 }, { "epoch": 2.559337081976916, "grad_norm": 1.210862398147583, "learning_rate": 1.1051922042625096e-06, "loss": 0.2018, "step": 5405 }, { "epoch": 2.5598105948505476, "grad_norm": 0.9782442450523376, "learning_rate": 1.1028563632853407e-06, "loss": 0.197, "step": 5406 }, { "epoch": 2.5602841077241787, "grad_norm": 1.218672752380371, "learning_rate": 1.1005228492702557e-06, "loss": 0.2224, "step": 5407 }, { "epoch": 2.56075762059781, "grad_norm": 1.1376310586929321, "learning_rate": 1.0981916628275679e-06, "loss": 0.2009, "step": 5408 }, { "epoch": 2.5612311334714413, "grad_norm": 1.071939468383789, "learning_rate": 1.0958628045669705e-06, "loss": 0.194, "step": 5409 }, { "epoch": 2.5617046463450723, "grad_norm": 1.157542109489441, "learning_rate": 1.0935362750975597e-06, "loss": 0.2161, "step": 5410 }, { "epoch": 2.562178159218704, "grad_norm": 1.7687501907348633, "learning_rate": 1.091212075027811e-06, "loss": 0.1928, "step": 5411 }, { "epoch": 2.562651672092335, "grad_norm": 1.131037950515747, "learning_rate": 1.0888902049656014e-06, "loss": 0.2192, "step": 5412 }, { "epoch": 2.5631251849659664, "grad_norm": 1.0506967306137085, "learning_rate": 1.0865706655181907e-06, "loss": 0.1816, "step": 5413 }, { "epoch": 2.5635986978395975, "grad_norm": 1.2665033340454102, "learning_rate": 1.0842534572922348e-06, "loss": 0.1969, "step": 5414 }, { "epoch": 2.5640722107132286, "grad_norm": 1.2768964767456055, "learning_rate": 1.0819385808937743e-06, "loss": 0.2108, "step": 5415 }, { "epoch": 2.56454572358686, "grad_norm": 1.3844908475875854, "learning_rate": 1.0796260369282429e-06, "loss": 0.2119, "step": 5416 }, { "epoch": 2.565019236460491, "grad_norm": 1.3508644104003906, "learning_rate": 1.0773158260004668e-06, "loss": 0.1835, "step": 5417 }, { "epoch": 2.5654927493341226, "grad_norm": 1.0464918613433838, "learning_rate": 1.0750079487146558e-06, "loss": 0.2183, "step": 5418 }, { "epoch": 2.5659662622077537, "grad_norm": 1.4549388885498047, "learning_rate": 1.0727024056744172e-06, "loss": 0.2108, "step": 5419 }, { "epoch": 2.5664397750813848, "grad_norm": 1.0538500547409058, "learning_rate": 1.0703991974827399e-06, "loss": 0.2007, "step": 5420 }, { "epoch": 2.5669132879550163, "grad_norm": 1.0848989486694336, "learning_rate": 1.0680983247420062e-06, "loss": 0.2185, "step": 5421 }, { "epoch": 2.567386800828648, "grad_norm": 0.9970195889472961, "learning_rate": 1.0657997880539894e-06, "loss": 0.2014, "step": 5422 }, { "epoch": 2.567860313702279, "grad_norm": 1.1892081499099731, "learning_rate": 1.0635035880198474e-06, "loss": 0.1993, "step": 5423 }, { "epoch": 2.56833382657591, "grad_norm": 1.3775814771652222, "learning_rate": 1.061209725240132e-06, "loss": 0.1976, "step": 5424 }, { "epoch": 2.5688073394495414, "grad_norm": 1.3159923553466797, "learning_rate": 1.0589182003147758e-06, "loss": 0.2057, "step": 5425 }, { "epoch": 2.5692808523231725, "grad_norm": 1.09856116771698, "learning_rate": 1.0566290138431113e-06, "loss": 0.1962, "step": 5426 }, { "epoch": 2.569754365196804, "grad_norm": 1.0620033740997314, "learning_rate": 1.0543421664238473e-06, "loss": 0.2097, "step": 5427 }, { "epoch": 2.570227878070435, "grad_norm": 1.6413558721542358, "learning_rate": 1.0520576586550923e-06, "loss": 0.2114, "step": 5428 }, { "epoch": 2.570701390944066, "grad_norm": 1.1933810710906982, "learning_rate": 1.0497754911343316e-06, "loss": 0.2145, "step": 5429 }, { "epoch": 2.5711749038176976, "grad_norm": 1.7124356031417847, "learning_rate": 1.0474956644584488e-06, "loss": 0.1995, "step": 5430 }, { "epoch": 2.5716484166913287, "grad_norm": 0.9946605563163757, "learning_rate": 1.0452181792237092e-06, "loss": 0.2177, "step": 5431 }, { "epoch": 2.57212192956496, "grad_norm": 1.332301139831543, "learning_rate": 1.0429430360257642e-06, "loss": 0.1792, "step": 5432 }, { "epoch": 2.5725954424385913, "grad_norm": 1.497750163078308, "learning_rate": 1.0406702354596598e-06, "loss": 0.196, "step": 5433 }, { "epoch": 2.5730689553122224, "grad_norm": 1.034303069114685, "learning_rate": 1.0383997781198218e-06, "loss": 0.1817, "step": 5434 }, { "epoch": 2.573542468185854, "grad_norm": 0.7837559580802917, "learning_rate": 1.0361316646000686e-06, "loss": 0.1871, "step": 5435 }, { "epoch": 2.574015981059485, "grad_norm": 1.18108069896698, "learning_rate": 1.0338658954936008e-06, "loss": 0.1877, "step": 5436 }, { "epoch": 2.5744894939331164, "grad_norm": 0.9967142939567566, "learning_rate": 1.0316024713930129e-06, "loss": 0.1927, "step": 5437 }, { "epoch": 2.5749630068067475, "grad_norm": 1.073191523551941, "learning_rate": 1.0293413928902761e-06, "loss": 0.2007, "step": 5438 }, { "epoch": 2.5754365196803786, "grad_norm": 1.0731115341186523, "learning_rate": 1.0270826605767592e-06, "loss": 0.2134, "step": 5439 }, { "epoch": 2.57591003255401, "grad_norm": 1.0234590768814087, "learning_rate": 1.024826275043209e-06, "loss": 0.1902, "step": 5440 }, { "epoch": 2.5763835454276416, "grad_norm": 1.0381513833999634, "learning_rate": 1.0225722368797598e-06, "loss": 0.1957, "step": 5441 }, { "epoch": 2.5768570583012727, "grad_norm": 1.2362014055252075, "learning_rate": 1.020320546675937e-06, "loss": 0.186, "step": 5442 }, { "epoch": 2.5773305711749037, "grad_norm": 0.9183345437049866, "learning_rate": 1.0180712050206442e-06, "loss": 0.1847, "step": 5443 }, { "epoch": 2.5778040840485352, "grad_norm": 1.0637094974517822, "learning_rate": 1.01582421250218e-06, "loss": 0.1914, "step": 5444 }, { "epoch": 2.5782775969221663, "grad_norm": 1.023342490196228, "learning_rate": 1.0135795697082195e-06, "loss": 0.2049, "step": 5445 }, { "epoch": 2.578751109795798, "grad_norm": 0.9113678336143494, "learning_rate": 1.0113372772258302e-06, "loss": 0.185, "step": 5446 }, { "epoch": 2.579224622669429, "grad_norm": 1.1002506017684937, "learning_rate": 1.009097335641459e-06, "loss": 0.2028, "step": 5447 }, { "epoch": 2.57969813554306, "grad_norm": 1.069926142692566, "learning_rate": 1.0068597455409458e-06, "loss": 0.1777, "step": 5448 }, { "epoch": 2.5801716484166914, "grad_norm": 1.127890944480896, "learning_rate": 1.0046245075095074e-06, "loss": 0.1816, "step": 5449 }, { "epoch": 2.5806451612903225, "grad_norm": 0.9867877960205078, "learning_rate": 1.0023916221317465e-06, "loss": 0.199, "step": 5450 }, { "epoch": 2.581118674163954, "grad_norm": 1.3227022886276245, "learning_rate": 1.000161089991658e-06, "loss": 0.196, "step": 5451 }, { "epoch": 2.581592187037585, "grad_norm": 1.1155279874801636, "learning_rate": 9.979329116726111e-07, "loss": 0.1934, "step": 5452 }, { "epoch": 2.582065699911216, "grad_norm": 1.0326663255691528, "learning_rate": 9.957070877573682e-07, "loss": 0.1942, "step": 5453 }, { "epoch": 2.5825392127848477, "grad_norm": 1.2042585611343384, "learning_rate": 9.934836188280693e-07, "loss": 0.189, "step": 5454 }, { "epoch": 2.5830127256584787, "grad_norm": 1.3072971105575562, "learning_rate": 9.91262505466245e-07, "loss": 0.1837, "step": 5455 }, { "epoch": 2.5834862385321102, "grad_norm": 1.351815938949585, "learning_rate": 9.890437482528004e-07, "loss": 0.1906, "step": 5456 }, { "epoch": 2.5839597514057413, "grad_norm": 1.0295909643173218, "learning_rate": 9.868273477680357e-07, "loss": 0.2168, "step": 5457 }, { "epoch": 2.5844332642793724, "grad_norm": 0.970486044883728, "learning_rate": 9.84613304591625e-07, "loss": 0.1817, "step": 5458 }, { "epoch": 2.584906777153004, "grad_norm": 1.0597081184387207, "learning_rate": 9.824016193026308e-07, "loss": 0.1955, "step": 5459 }, { "epoch": 2.585380290026635, "grad_norm": 1.491175889968872, "learning_rate": 9.801922924795004e-07, "loss": 0.2162, "step": 5460 }, { "epoch": 2.5858538029002665, "grad_norm": 1.3303380012512207, "learning_rate": 9.779853247000593e-07, "loss": 0.1997, "step": 5461 }, { "epoch": 2.5863273157738975, "grad_norm": 1.9732062816619873, "learning_rate": 9.757807165415213e-07, "loss": 0.2225, "step": 5462 }, { "epoch": 2.5868008286475286, "grad_norm": 1.0555258989334106, "learning_rate": 9.735784685804773e-07, "loss": 0.1806, "step": 5463 }, { "epoch": 2.58727434152116, "grad_norm": 1.026889681816101, "learning_rate": 9.713785813929056e-07, "loss": 0.2107, "step": 5464 }, { "epoch": 2.5877478543947916, "grad_norm": 1.0289431810379028, "learning_rate": 9.69181055554167e-07, "loss": 0.1958, "step": 5465 }, { "epoch": 2.5882213672684227, "grad_norm": 0.9716818332672119, "learning_rate": 9.669858916389985e-07, "loss": 0.1865, "step": 5466 }, { "epoch": 2.5886948801420537, "grad_norm": 1.6020885705947876, "learning_rate": 9.647930902215296e-07, "loss": 0.1944, "step": 5467 }, { "epoch": 2.5891683930156852, "grad_norm": 1.1589189767837524, "learning_rate": 9.626026518752619e-07, "loss": 0.1926, "step": 5468 }, { "epoch": 2.5896419058893163, "grad_norm": 1.059776782989502, "learning_rate": 9.604145771730865e-07, "loss": 0.1716, "step": 5469 }, { "epoch": 2.590115418762948, "grad_norm": 1.3090096712112427, "learning_rate": 9.582288666872708e-07, "loss": 0.2115, "step": 5470 }, { "epoch": 2.590588931636579, "grad_norm": 1.174440860748291, "learning_rate": 9.560455209894691e-07, "loss": 0.1893, "step": 5471 }, { "epoch": 2.59106244451021, "grad_norm": 1.4484925270080566, "learning_rate": 9.538645406507108e-07, "loss": 0.2198, "step": 5472 }, { "epoch": 2.5915359573838415, "grad_norm": 1.0521667003631592, "learning_rate": 9.516859262414147e-07, "loss": 0.1962, "step": 5473 }, { "epoch": 2.5920094702574725, "grad_norm": 1.0996438264846802, "learning_rate": 9.495096783313729e-07, "loss": 0.2061, "step": 5474 }, { "epoch": 2.592482983131104, "grad_norm": 1.0292750597000122, "learning_rate": 9.473357974897623e-07, "loss": 0.1941, "step": 5475 }, { "epoch": 2.592956496004735, "grad_norm": 1.131027340888977, "learning_rate": 9.451642842851427e-07, "loss": 0.1894, "step": 5476 }, { "epoch": 2.593430008878366, "grad_norm": 1.0126436948776245, "learning_rate": 9.429951392854486e-07, "loss": 0.1936, "step": 5477 }, { "epoch": 2.5939035217519977, "grad_norm": 1.0096503496170044, "learning_rate": 9.40828363058004e-07, "loss": 0.1973, "step": 5478 }, { "epoch": 2.5943770346256287, "grad_norm": 1.16969895362854, "learning_rate": 9.386639561695043e-07, "loss": 0.2078, "step": 5479 }, { "epoch": 2.5948505474992603, "grad_norm": 1.0769869089126587, "learning_rate": 9.365019191860314e-07, "loss": 0.2057, "step": 5480 }, { "epoch": 2.5953240603728913, "grad_norm": 1.0034693479537964, "learning_rate": 9.343422526730428e-07, "loss": 0.199, "step": 5481 }, { "epoch": 2.5957975732465224, "grad_norm": 1.0947496891021729, "learning_rate": 9.321849571953822e-07, "loss": 0.1851, "step": 5482 }, { "epoch": 2.596271086120154, "grad_norm": 1.386936068534851, "learning_rate": 9.300300333172652e-07, "loss": 0.2034, "step": 5483 }, { "epoch": 2.5967445989937854, "grad_norm": 1.0438120365142822, "learning_rate": 9.27877481602295e-07, "loss": 0.2129, "step": 5484 }, { "epoch": 2.5972181118674165, "grad_norm": 1.0546690225601196, "learning_rate": 9.257273026134494e-07, "loss": 0.2108, "step": 5485 }, { "epoch": 2.5976916247410475, "grad_norm": 1.3479506969451904, "learning_rate": 9.235794969130851e-07, "loss": 0.184, "step": 5486 }, { "epoch": 2.598165137614679, "grad_norm": 1.1528756618499756, "learning_rate": 9.214340650629439e-07, "loss": 0.1933, "step": 5487 }, { "epoch": 2.59863865048831, "grad_norm": 1.3162215948104858, "learning_rate": 9.192910076241379e-07, "loss": 0.1848, "step": 5488 }, { "epoch": 2.5991121633619416, "grad_norm": 1.3679094314575195, "learning_rate": 9.171503251571678e-07, "loss": 0.2207, "step": 5489 }, { "epoch": 2.5995856762355727, "grad_norm": 1.234590768814087, "learning_rate": 9.150120182219046e-07, "loss": 0.1839, "step": 5490 }, { "epoch": 2.6000591891092037, "grad_norm": 1.105852484703064, "learning_rate": 9.128760873776054e-07, "loss": 0.2014, "step": 5491 }, { "epoch": 2.6005327019828353, "grad_norm": 1.1004148721694946, "learning_rate": 9.107425331828989e-07, "loss": 0.1932, "step": 5492 }, { "epoch": 2.6010062148564663, "grad_norm": 0.9159629940986633, "learning_rate": 9.086113561957987e-07, "loss": 0.2114, "step": 5493 }, { "epoch": 2.601479727730098, "grad_norm": 1.061231255531311, "learning_rate": 9.064825569736924e-07, "loss": 0.1762, "step": 5494 }, { "epoch": 2.601953240603729, "grad_norm": 1.741815209388733, "learning_rate": 9.043561360733444e-07, "loss": 0.187, "step": 5495 }, { "epoch": 2.60242675347736, "grad_norm": 1.0610990524291992, "learning_rate": 9.022320940509033e-07, "loss": 0.1988, "step": 5496 }, { "epoch": 2.6029002663509915, "grad_norm": 1.0070830583572388, "learning_rate": 9.001104314618892e-07, "loss": 0.2234, "step": 5497 }, { "epoch": 2.6033737792246225, "grad_norm": 0.9660130143165588, "learning_rate": 8.979911488612037e-07, "loss": 0.2007, "step": 5498 }, { "epoch": 2.603847292098254, "grad_norm": 1.0970488786697388, "learning_rate": 8.958742468031257e-07, "loss": 0.2136, "step": 5499 }, { "epoch": 2.604320804971885, "grad_norm": 0.9795224666595459, "learning_rate": 8.937597258413078e-07, "loss": 0.2162, "step": 5500 }, { "epoch": 2.604794317845516, "grad_norm": 1.2171189785003662, "learning_rate": 8.916475865287855e-07, "loss": 0.2051, "step": 5501 }, { "epoch": 2.6052678307191477, "grad_norm": 1.032201886177063, "learning_rate": 8.895378294179658e-07, "loss": 0.1856, "step": 5502 }, { "epoch": 2.605741343592779, "grad_norm": 1.1588233709335327, "learning_rate": 8.874304550606383e-07, "loss": 0.2028, "step": 5503 }, { "epoch": 2.6062148564664103, "grad_norm": 0.9060316681861877, "learning_rate": 8.853254640079633e-07, "loss": 0.1907, "step": 5504 }, { "epoch": 2.6066883693400413, "grad_norm": 1.5966558456420898, "learning_rate": 8.832228568104839e-07, "loss": 0.2036, "step": 5505 }, { "epoch": 2.607161882213673, "grad_norm": 1.2059063911437988, "learning_rate": 8.811226340181133e-07, "loss": 0.2239, "step": 5506 }, { "epoch": 2.607635395087304, "grad_norm": 1.165710210800171, "learning_rate": 8.79024796180149e-07, "loss": 0.2087, "step": 5507 }, { "epoch": 2.6081089079609354, "grad_norm": 1.2334532737731934, "learning_rate": 8.769293438452553e-07, "loss": 0.203, "step": 5508 }, { "epoch": 2.6085824208345665, "grad_norm": 1.1886234283447266, "learning_rate": 8.748362775614816e-07, "loss": 0.2152, "step": 5509 }, { "epoch": 2.6090559337081975, "grad_norm": 1.4193661212921143, "learning_rate": 8.727455978762478e-07, "loss": 0.1982, "step": 5510 }, { "epoch": 2.609529446581829, "grad_norm": 1.0052508115768433, "learning_rate": 8.706573053363487e-07, "loss": 0.2157, "step": 5511 }, { "epoch": 2.61000295945546, "grad_norm": 1.1085777282714844, "learning_rate": 8.685714004879608e-07, "loss": 0.1973, "step": 5512 }, { "epoch": 2.6104764723290916, "grad_norm": 1.2764101028442383, "learning_rate": 8.664878838766289e-07, "loss": 0.1935, "step": 5513 }, { "epoch": 2.6109499852027227, "grad_norm": 0.94977867603302, "learning_rate": 8.644067560472802e-07, "loss": 0.1907, "step": 5514 }, { "epoch": 2.6114234980763538, "grad_norm": 0.928570032119751, "learning_rate": 8.623280175442094e-07, "loss": 0.1929, "step": 5515 }, { "epoch": 2.6118970109499853, "grad_norm": 1.3341639041900635, "learning_rate": 8.602516689110952e-07, "loss": 0.1987, "step": 5516 }, { "epoch": 2.6123705238236163, "grad_norm": 1.2463881969451904, "learning_rate": 8.581777106909827e-07, "loss": 0.1788, "step": 5517 }, { "epoch": 2.612844036697248, "grad_norm": 1.0751234292984009, "learning_rate": 8.561061434262996e-07, "loss": 0.2133, "step": 5518 }, { "epoch": 2.613317549570879, "grad_norm": 0.9469509124755859, "learning_rate": 8.540369676588411e-07, "loss": 0.1925, "step": 5519 }, { "epoch": 2.61379106244451, "grad_norm": 1.0061715841293335, "learning_rate": 8.51970183929779e-07, "loss": 0.2128, "step": 5520 }, { "epoch": 2.6142645753181415, "grad_norm": 1.019024133682251, "learning_rate": 8.49905792779665e-07, "loss": 0.2089, "step": 5521 }, { "epoch": 2.6147380881917726, "grad_norm": 2.2128031253814697, "learning_rate": 8.478437947484164e-07, "loss": 0.2122, "step": 5522 }, { "epoch": 2.615211601065404, "grad_norm": 1.6629959344863892, "learning_rate": 8.457841903753327e-07, "loss": 0.2341, "step": 5523 }, { "epoch": 2.615685113939035, "grad_norm": 1.15640127658844, "learning_rate": 8.4372698019908e-07, "loss": 0.2024, "step": 5524 }, { "epoch": 2.616158626812666, "grad_norm": 0.9604775905609131, "learning_rate": 8.416721647577053e-07, "loss": 0.1944, "step": 5525 }, { "epoch": 2.6166321396862977, "grad_norm": 0.9418638348579407, "learning_rate": 8.396197445886223e-07, "loss": 0.1688, "step": 5526 }, { "epoch": 2.617105652559929, "grad_norm": 1.1123701333999634, "learning_rate": 8.375697202286248e-07, "loss": 0.2019, "step": 5527 }, { "epoch": 2.6175791654335603, "grad_norm": 1.4473880529403687, "learning_rate": 8.355220922138762e-07, "loss": 0.2034, "step": 5528 }, { "epoch": 2.6180526783071913, "grad_norm": 1.3960907459259033, "learning_rate": 8.334768610799104e-07, "loss": 0.1939, "step": 5529 }, { "epoch": 2.618526191180823, "grad_norm": 0.9807230830192566, "learning_rate": 8.314340273616428e-07, "loss": 0.2027, "step": 5530 }, { "epoch": 2.618999704054454, "grad_norm": 1.4325486421585083, "learning_rate": 8.293935915933526e-07, "loss": 0.2103, "step": 5531 }, { "epoch": 2.6194732169280854, "grad_norm": 1.228115200996399, "learning_rate": 8.273555543087009e-07, "loss": 0.1993, "step": 5532 }, { "epoch": 2.6199467298017165, "grad_norm": 1.0208629369735718, "learning_rate": 8.25319916040711e-07, "loss": 0.1961, "step": 5533 }, { "epoch": 2.6204202426753476, "grad_norm": 1.7419202327728271, "learning_rate": 8.232866773217896e-07, "loss": 0.2002, "step": 5534 }, { "epoch": 2.620893755548979, "grad_norm": 1.0411862134933472, "learning_rate": 8.212558386837067e-07, "loss": 0.1958, "step": 5535 }, { "epoch": 2.62136726842261, "grad_norm": 1.4005458354949951, "learning_rate": 8.192274006576095e-07, "loss": 0.2136, "step": 5536 }, { "epoch": 2.6218407812962417, "grad_norm": 1.0770565271377563, "learning_rate": 8.172013637740195e-07, "loss": 0.2002, "step": 5537 }, { "epoch": 2.6223142941698727, "grad_norm": 1.2719935178756714, "learning_rate": 8.151777285628226e-07, "loss": 0.2034, "step": 5538 }, { "epoch": 2.622787807043504, "grad_norm": 1.5015056133270264, "learning_rate": 8.131564955532856e-07, "loss": 0.1889, "step": 5539 }, { "epoch": 2.6232613199171353, "grad_norm": 1.192428469657898, "learning_rate": 8.111376652740388e-07, "loss": 0.1872, "step": 5540 }, { "epoch": 2.6237348327907664, "grad_norm": 1.306041955947876, "learning_rate": 8.091212382530899e-07, "loss": 0.2135, "step": 5541 }, { "epoch": 2.624208345664398, "grad_norm": 1.4788362979888916, "learning_rate": 8.071072150178138e-07, "loss": 0.2017, "step": 5542 }, { "epoch": 2.624681858538029, "grad_norm": 1.2858426570892334, "learning_rate": 8.050955960949625e-07, "loss": 0.2031, "step": 5543 }, { "epoch": 2.62515537141166, "grad_norm": 1.2146573066711426, "learning_rate": 8.030863820106527e-07, "loss": 0.1757, "step": 5544 }, { "epoch": 2.6256288842852915, "grad_norm": 1.650189995765686, "learning_rate": 8.010795732903731e-07, "loss": 0.2143, "step": 5545 }, { "epoch": 2.626102397158923, "grad_norm": 1.076585292816162, "learning_rate": 7.990751704589906e-07, "loss": 0.1987, "step": 5546 }, { "epoch": 2.626575910032554, "grad_norm": 1.0550071001052856, "learning_rate": 7.970731740407311e-07, "loss": 0.1886, "step": 5547 }, { "epoch": 2.627049422906185, "grad_norm": 1.0130494832992554, "learning_rate": 7.950735845592039e-07, "loss": 0.2068, "step": 5548 }, { "epoch": 2.6275229357798167, "grad_norm": 1.2154542207717896, "learning_rate": 7.93076402537376e-07, "loss": 0.2018, "step": 5549 }, { "epoch": 2.6279964486534477, "grad_norm": 0.8742989897727966, "learning_rate": 7.910816284975975e-07, "loss": 0.1784, "step": 5550 }, { "epoch": 2.6284699615270792, "grad_norm": 1.0601682662963867, "learning_rate": 7.890892629615765e-07, "loss": 0.1972, "step": 5551 }, { "epoch": 2.6289434744007103, "grad_norm": 1.270751953125, "learning_rate": 7.870993064504018e-07, "loss": 0.2142, "step": 5552 }, { "epoch": 2.6294169872743414, "grad_norm": 1.1302870512008667, "learning_rate": 7.851117594845237e-07, "loss": 0.2138, "step": 5553 }, { "epoch": 2.629890500147973, "grad_norm": 1.3007394075393677, "learning_rate": 7.831266225837675e-07, "loss": 0.1978, "step": 5554 }, { "epoch": 2.630364013021604, "grad_norm": 0.9886184930801392, "learning_rate": 7.811438962673268e-07, "loss": 0.1897, "step": 5555 }, { "epoch": 2.6308375258952355, "grad_norm": 1.188515305519104, "learning_rate": 7.791635810537624e-07, "loss": 0.2089, "step": 5556 }, { "epoch": 2.6313110387688665, "grad_norm": 1.18173086643219, "learning_rate": 7.771856774610109e-07, "loss": 0.2251, "step": 5557 }, { "epoch": 2.6317845516424976, "grad_norm": 1.0726786851882935, "learning_rate": 7.752101860063687e-07, "loss": 0.2004, "step": 5558 }, { "epoch": 2.632258064516129, "grad_norm": 1.0728563070297241, "learning_rate": 7.732371072065126e-07, "loss": 0.1939, "step": 5559 }, { "epoch": 2.63273157738976, "grad_norm": 1.0991253852844238, "learning_rate": 7.712664415774762e-07, "loss": 0.1699, "step": 5560 }, { "epoch": 2.6332050902633917, "grad_norm": 0.9829975962638855, "learning_rate": 7.692981896346718e-07, "loss": 0.199, "step": 5561 }, { "epoch": 2.6336786031370227, "grad_norm": 0.8773795366287231, "learning_rate": 7.673323518928755e-07, "loss": 0.1894, "step": 5562 }, { "epoch": 2.634152116010654, "grad_norm": 1.0769257545471191, "learning_rate": 7.653689288662335e-07, "loss": 0.2144, "step": 5563 }, { "epoch": 2.6346256288842853, "grad_norm": 1.1146119832992554, "learning_rate": 7.63407921068261e-07, "loss": 0.2237, "step": 5564 }, { "epoch": 2.635099141757917, "grad_norm": 0.9974523782730103, "learning_rate": 7.614493290118386e-07, "loss": 0.2062, "step": 5565 }, { "epoch": 2.635572654631548, "grad_norm": 1.053133249282837, "learning_rate": 7.594931532092198e-07, "loss": 0.2005, "step": 5566 }, { "epoch": 2.636046167505179, "grad_norm": 1.3148210048675537, "learning_rate": 7.575393941720199e-07, "loss": 0.2097, "step": 5567 }, { "epoch": 2.6365196803788105, "grad_norm": 1.292978286743164, "learning_rate": 7.555880524112291e-07, "loss": 0.1951, "step": 5568 }, { "epoch": 2.6369931932524415, "grad_norm": 1.3344711065292358, "learning_rate": 7.536391284372002e-07, "loss": 0.1978, "step": 5569 }, { "epoch": 2.637466706126073, "grad_norm": 1.8182041645050049, "learning_rate": 7.516926227596566e-07, "loss": 0.2056, "step": 5570 }, { "epoch": 2.637940218999704, "grad_norm": 1.400888204574585, "learning_rate": 7.497485358876866e-07, "loss": 0.2021, "step": 5571 }, { "epoch": 2.638413731873335, "grad_norm": 0.8689830303192139, "learning_rate": 7.478068683297501e-07, "loss": 0.1753, "step": 5572 }, { "epoch": 2.6388872447469667, "grad_norm": 1.108304738998413, "learning_rate": 7.458676205936688e-07, "loss": 0.1851, "step": 5573 }, { "epoch": 2.6393607576205977, "grad_norm": 1.0863293409347534, "learning_rate": 7.439307931866346e-07, "loss": 0.2266, "step": 5574 }, { "epoch": 2.6398342704942293, "grad_norm": 1.085361361503601, "learning_rate": 7.419963866152058e-07, "loss": 0.2166, "step": 5575 }, { "epoch": 2.6403077833678603, "grad_norm": 1.4498753547668457, "learning_rate": 7.400644013853087e-07, "loss": 0.1799, "step": 5576 }, { "epoch": 2.6407812962414914, "grad_norm": 1.1030598878860474, "learning_rate": 7.381348380022368e-07, "loss": 0.1678, "step": 5577 }, { "epoch": 2.641254809115123, "grad_norm": 1.5014503002166748, "learning_rate": 7.362076969706478e-07, "loss": 0.2074, "step": 5578 }, { "epoch": 2.641728321988754, "grad_norm": 1.086707353591919, "learning_rate": 7.342829787945638e-07, "loss": 0.2029, "step": 5579 }, { "epoch": 2.6422018348623855, "grad_norm": 1.4167083501815796, "learning_rate": 7.323606839773811e-07, "loss": 0.1902, "step": 5580 }, { "epoch": 2.6426753477360165, "grad_norm": 0.9145298600196838, "learning_rate": 7.304408130218532e-07, "loss": 0.2039, "step": 5581 }, { "epoch": 2.6431488606096476, "grad_norm": 2.068638563156128, "learning_rate": 7.285233664301073e-07, "loss": 0.2075, "step": 5582 }, { "epoch": 2.643622373483279, "grad_norm": 1.0407607555389404, "learning_rate": 7.266083447036287e-07, "loss": 0.1838, "step": 5583 }, { "epoch": 2.64409588635691, "grad_norm": 1.2109540700912476, "learning_rate": 7.246957483432782e-07, "loss": 0.2107, "step": 5584 }, { "epoch": 2.6445693992305417, "grad_norm": 1.51192045211792, "learning_rate": 7.227855778492732e-07, "loss": 0.1798, "step": 5585 }, { "epoch": 2.6450429121041727, "grad_norm": 1.015613317489624, "learning_rate": 7.208778337212019e-07, "loss": 0.2107, "step": 5586 }, { "epoch": 2.645516424977804, "grad_norm": 1.1152918338775635, "learning_rate": 7.189725164580152e-07, "loss": 0.2313, "step": 5587 }, { "epoch": 2.6459899378514353, "grad_norm": 1.4424123764038086, "learning_rate": 7.170696265580323e-07, "loss": 0.1907, "step": 5588 }, { "epoch": 2.646463450725067, "grad_norm": 1.2000633478164673, "learning_rate": 7.15169164518934e-07, "loss": 0.1903, "step": 5589 }, { "epoch": 2.646936963598698, "grad_norm": 0.9677756428718567, "learning_rate": 7.132711308377682e-07, "loss": 0.2042, "step": 5590 }, { "epoch": 2.647410476472329, "grad_norm": 1.519376516342163, "learning_rate": 7.113755260109478e-07, "loss": 0.1914, "step": 5591 }, { "epoch": 2.6478839893459605, "grad_norm": 0.9879403114318848, "learning_rate": 7.094823505342485e-07, "loss": 0.2147, "step": 5592 }, { "epoch": 2.6483575022195915, "grad_norm": 1.1486883163452148, "learning_rate": 7.075916049028142e-07, "loss": 0.2034, "step": 5593 }, { "epoch": 2.648831015093223, "grad_norm": 0.992047905921936, "learning_rate": 7.057032896111494e-07, "loss": 0.1918, "step": 5594 }, { "epoch": 2.649304527966854, "grad_norm": 1.395175576210022, "learning_rate": 7.038174051531266e-07, "loss": 0.1973, "step": 5595 }, { "epoch": 2.649778040840485, "grad_norm": 0.9354866743087769, "learning_rate": 7.019339520219793e-07, "loss": 0.1899, "step": 5596 }, { "epoch": 2.6502515537141167, "grad_norm": 1.0331776142120361, "learning_rate": 7.000529307103066e-07, "loss": 0.1847, "step": 5597 }, { "epoch": 2.6507250665877478, "grad_norm": 1.2936712503433228, "learning_rate": 6.981743417100728e-07, "loss": 0.1875, "step": 5598 }, { "epoch": 2.6511985794613793, "grad_norm": 1.1697713136672974, "learning_rate": 6.962981855126017e-07, "loss": 0.1885, "step": 5599 }, { "epoch": 2.6516720923350103, "grad_norm": 1.2552714347839355, "learning_rate": 6.944244626085872e-07, "loss": 0.2045, "step": 5600 }, { "epoch": 2.6521456052086414, "grad_norm": 1.1409389972686768, "learning_rate": 6.925531734880808e-07, "loss": 0.2088, "step": 5601 }, { "epoch": 2.652619118082273, "grad_norm": 1.1405903100967407, "learning_rate": 6.906843186405032e-07, "loss": 0.2132, "step": 5602 }, { "epoch": 2.653092630955904, "grad_norm": 1.2994462251663208, "learning_rate": 6.888178985546312e-07, "loss": 0.201, "step": 5603 }, { "epoch": 2.6535661438295355, "grad_norm": 1.3358666896820068, "learning_rate": 6.869539137186132e-07, "loss": 0.208, "step": 5604 }, { "epoch": 2.6540396567031665, "grad_norm": 1.0983041524887085, "learning_rate": 6.850923646199526e-07, "loss": 0.2056, "step": 5605 }, { "epoch": 2.6545131695767976, "grad_norm": 1.1728966236114502, "learning_rate": 6.832332517455242e-07, "loss": 0.2267, "step": 5606 }, { "epoch": 2.654986682450429, "grad_norm": 1.0750831365585327, "learning_rate": 6.813765755815571e-07, "loss": 0.204, "step": 5607 }, { "epoch": 2.6554601953240606, "grad_norm": 1.2680811882019043, "learning_rate": 6.795223366136471e-07, "loss": 0.1889, "step": 5608 }, { "epoch": 2.6559337081976917, "grad_norm": 1.0505774021148682, "learning_rate": 6.776705353267554e-07, "loss": 0.1937, "step": 5609 }, { "epoch": 2.6564072210713228, "grad_norm": 1.042385220527649, "learning_rate": 6.758211722052e-07, "loss": 0.2031, "step": 5610 }, { "epoch": 2.6568807339449543, "grad_norm": 1.3553048372268677, "learning_rate": 6.73974247732666e-07, "loss": 0.2098, "step": 5611 }, { "epoch": 2.6573542468185853, "grad_norm": 1.1963160037994385, "learning_rate": 6.721297623921963e-07, "loss": 0.1899, "step": 5612 }, { "epoch": 2.657827759692217, "grad_norm": 1.010690450668335, "learning_rate": 6.702877166662014e-07, "loss": 0.2022, "step": 5613 }, { "epoch": 2.658301272565848, "grad_norm": 1.4362280368804932, "learning_rate": 6.684481110364471e-07, "loss": 0.1826, "step": 5614 }, { "epoch": 2.658774785439479, "grad_norm": 1.1894739866256714, "learning_rate": 6.666109459840664e-07, "loss": 0.1979, "step": 5615 }, { "epoch": 2.6592482983131105, "grad_norm": 1.0000640153884888, "learning_rate": 6.647762219895526e-07, "loss": 0.1917, "step": 5616 }, { "epoch": 2.6597218111867416, "grad_norm": 1.6517083644866943, "learning_rate": 6.629439395327597e-07, "loss": 0.1963, "step": 5617 }, { "epoch": 2.660195324060373, "grad_norm": 1.1130201816558838, "learning_rate": 6.611140990929032e-07, "loss": 0.1901, "step": 5618 }, { "epoch": 2.660668836934004, "grad_norm": 1.1720373630523682, "learning_rate": 6.592867011485593e-07, "loss": 0.1891, "step": 5619 }, { "epoch": 2.661142349807635, "grad_norm": 1.1503329277038574, "learning_rate": 6.574617461776689e-07, "loss": 0.211, "step": 5620 }, { "epoch": 2.6616158626812667, "grad_norm": 1.2841466665267944, "learning_rate": 6.55639234657528e-07, "loss": 0.192, "step": 5621 }, { "epoch": 2.6620893755548978, "grad_norm": 1.003335952758789, "learning_rate": 6.538191670648008e-07, "loss": 0.1809, "step": 5622 }, { "epoch": 2.6625628884285293, "grad_norm": 1.1069647073745728, "learning_rate": 6.520015438755056e-07, "loss": 0.2017, "step": 5623 }, { "epoch": 2.6630364013021603, "grad_norm": 1.0702106952667236, "learning_rate": 6.501863655650243e-07, "loss": 0.2038, "step": 5624 }, { "epoch": 2.6635099141757914, "grad_norm": 0.9489043951034546, "learning_rate": 6.483736326081003e-07, "loss": 0.1976, "step": 5625 }, { "epoch": 2.663983427049423, "grad_norm": 1.254457712173462, "learning_rate": 6.465633454788345e-07, "loss": 0.2198, "step": 5626 }, { "epoch": 2.6644569399230544, "grad_norm": 1.0318970680236816, "learning_rate": 6.447555046506937e-07, "loss": 0.1741, "step": 5627 }, { "epoch": 2.6649304527966855, "grad_norm": 1.3176510334014893, "learning_rate": 6.429501105964964e-07, "loss": 0.2041, "step": 5628 }, { "epoch": 2.6654039656703166, "grad_norm": 1.1123440265655518, "learning_rate": 6.411471637884315e-07, "loss": 0.2024, "step": 5629 }, { "epoch": 2.665877478543948, "grad_norm": 0.9631196856498718, "learning_rate": 6.393466646980362e-07, "loss": 0.1872, "step": 5630 }, { "epoch": 2.666350991417579, "grad_norm": 1.3509668111801147, "learning_rate": 6.375486137962194e-07, "loss": 0.1934, "step": 5631 }, { "epoch": 2.6668245042912107, "grad_norm": 1.3398867845535278, "learning_rate": 6.357530115532417e-07, "loss": 0.1883, "step": 5632 }, { "epoch": 2.6672980171648417, "grad_norm": 1.4973032474517822, "learning_rate": 6.339598584387241e-07, "loss": 0.2354, "step": 5633 }, { "epoch": 2.667771530038473, "grad_norm": 1.0023930072784424, "learning_rate": 6.321691549216502e-07, "loss": 0.2074, "step": 5634 }, { "epoch": 2.6682450429121043, "grad_norm": 1.0620249509811401, "learning_rate": 6.303809014703599e-07, "loss": 0.1999, "step": 5635 }, { "epoch": 2.6687185557857354, "grad_norm": 1.0061266422271729, "learning_rate": 6.285950985525569e-07, "loss": 0.2038, "step": 5636 }, { "epoch": 2.669192068659367, "grad_norm": 0.9382733106613159, "learning_rate": 6.268117466352952e-07, "loss": 0.1932, "step": 5637 }, { "epoch": 2.669665581532998, "grad_norm": 1.1017144918441772, "learning_rate": 6.250308461849986e-07, "loss": 0.2087, "step": 5638 }, { "epoch": 2.670139094406629, "grad_norm": 1.4511491060256958, "learning_rate": 6.232523976674409e-07, "loss": 0.2105, "step": 5639 }, { "epoch": 2.6706126072802605, "grad_norm": 1.016521692276001, "learning_rate": 6.214764015477614e-07, "loss": 0.1869, "step": 5640 }, { "epoch": 2.6710861201538916, "grad_norm": 1.1885292530059814, "learning_rate": 6.197028582904507e-07, "loss": 0.2107, "step": 5641 }, { "epoch": 2.671559633027523, "grad_norm": 1.1228227615356445, "learning_rate": 6.179317683593656e-07, "loss": 0.2178, "step": 5642 }, { "epoch": 2.672033145901154, "grad_norm": 1.4119621515274048, "learning_rate": 6.161631322177164e-07, "loss": 0.2027, "step": 5643 }, { "epoch": 2.672506658774785, "grad_norm": 1.3242872953414917, "learning_rate": 6.1439695032807e-07, "loss": 0.2273, "step": 5644 }, { "epoch": 2.6729801716484167, "grad_norm": 1.2599126100540161, "learning_rate": 6.126332231523591e-07, "loss": 0.2258, "step": 5645 }, { "epoch": 2.673453684522048, "grad_norm": 1.113526701927185, "learning_rate": 6.108719511518658e-07, "loss": 0.1929, "step": 5646 }, { "epoch": 2.6739271973956793, "grad_norm": 1.0311851501464844, "learning_rate": 6.09113134787237e-07, "loss": 0.2111, "step": 5647 }, { "epoch": 2.6744007102693104, "grad_norm": 1.3474483489990234, "learning_rate": 6.073567745184694e-07, "loss": 0.1973, "step": 5648 }, { "epoch": 2.6748742231429414, "grad_norm": 1.5055440664291382, "learning_rate": 6.056028708049278e-07, "loss": 0.2036, "step": 5649 }, { "epoch": 2.675347736016573, "grad_norm": 1.0800116062164307, "learning_rate": 6.038514241053239e-07, "loss": 0.1864, "step": 5650 }, { "epoch": 2.6758212488902045, "grad_norm": 1.0215858221054077, "learning_rate": 6.021024348777349e-07, "loss": 0.1721, "step": 5651 }, { "epoch": 2.6762947617638355, "grad_norm": 1.2525755167007446, "learning_rate": 6.003559035795914e-07, "loss": 0.1724, "step": 5652 }, { "epoch": 2.6767682746374666, "grad_norm": 1.2781281471252441, "learning_rate": 5.986118306676791e-07, "loss": 0.2081, "step": 5653 }, { "epoch": 2.677241787511098, "grad_norm": 1.0916087627410889, "learning_rate": 5.968702165981477e-07, "loss": 0.1801, "step": 5654 }, { "epoch": 2.677715300384729, "grad_norm": 0.9678347706794739, "learning_rate": 5.95131061826496e-07, "loss": 0.1771, "step": 5655 }, { "epoch": 2.6781888132583607, "grad_norm": 1.1609114408493042, "learning_rate": 5.933943668075869e-07, "loss": 0.1954, "step": 5656 }, { "epoch": 2.6786623261319917, "grad_norm": 1.1677464246749878, "learning_rate": 5.916601319956339e-07, "loss": 0.1781, "step": 5657 }, { "epoch": 2.679135839005623, "grad_norm": 1.1138585805892944, "learning_rate": 5.899283578442073e-07, "loss": 0.1877, "step": 5658 }, { "epoch": 2.6796093518792543, "grad_norm": 1.0214651823043823, "learning_rate": 5.881990448062402e-07, "loss": 0.2018, "step": 5659 }, { "epoch": 2.6800828647528854, "grad_norm": 1.117795467376709, "learning_rate": 5.864721933340145e-07, "loss": 0.2012, "step": 5660 }, { "epoch": 2.680556377626517, "grad_norm": 1.546120524406433, "learning_rate": 5.847478038791732e-07, "loss": 0.2145, "step": 5661 }, { "epoch": 2.681029890500148, "grad_norm": 1.232239842414856, "learning_rate": 5.830258768927122e-07, "loss": 0.181, "step": 5662 }, { "epoch": 2.681503403373779, "grad_norm": 1.0880526304244995, "learning_rate": 5.813064128249879e-07, "loss": 0.1968, "step": 5663 }, { "epoch": 2.6819769162474105, "grad_norm": 1.078931450843811, "learning_rate": 5.795894121257062e-07, "loss": 0.2053, "step": 5664 }, { "epoch": 2.6824504291210416, "grad_norm": 2.163848400115967, "learning_rate": 5.778748752439345e-07, "loss": 0.1973, "step": 5665 }, { "epoch": 2.682923941994673, "grad_norm": 1.1884369850158691, "learning_rate": 5.761628026280908e-07, "loss": 0.2006, "step": 5666 }, { "epoch": 2.683397454868304, "grad_norm": 1.1050399541854858, "learning_rate": 5.744531947259535e-07, "loss": 0.1786, "step": 5667 }, { "epoch": 2.6838709677419352, "grad_norm": 1.53378427028656, "learning_rate": 5.727460519846539e-07, "loss": 0.2083, "step": 5668 }, { "epoch": 2.6843444806155667, "grad_norm": 1.0245704650878906, "learning_rate": 5.710413748506772e-07, "loss": 0.2315, "step": 5669 }, { "epoch": 2.6848179934891983, "grad_norm": 1.2128366231918335, "learning_rate": 5.693391637698664e-07, "loss": 0.2033, "step": 5670 }, { "epoch": 2.6852915063628293, "grad_norm": 1.1133742332458496, "learning_rate": 5.676394191874179e-07, "loss": 0.1781, "step": 5671 }, { "epoch": 2.6857650192364604, "grad_norm": 1.095633864402771, "learning_rate": 5.65942141547885e-07, "loss": 0.2086, "step": 5672 }, { "epoch": 2.686238532110092, "grad_norm": 1.5314477682113647, "learning_rate": 5.642473312951713e-07, "loss": 0.2064, "step": 5673 }, { "epoch": 2.686712044983723, "grad_norm": 1.4954949617385864, "learning_rate": 5.625549888725401e-07, "loss": 0.1821, "step": 5674 }, { "epoch": 2.6871855578573545, "grad_norm": 1.1045236587524414, "learning_rate": 5.608651147226074e-07, "loss": 0.1925, "step": 5675 }, { "epoch": 2.6876590707309855, "grad_norm": 1.7516658306121826, "learning_rate": 5.591777092873429e-07, "loss": 0.2061, "step": 5676 }, { "epoch": 2.6881325836046166, "grad_norm": 1.2861231565475464, "learning_rate": 5.574927730080725e-07, "loss": 0.2016, "step": 5677 }, { "epoch": 2.688606096478248, "grad_norm": 1.7489197254180908, "learning_rate": 5.558103063254716e-07, "loss": 0.2002, "step": 5678 }, { "epoch": 2.689079609351879, "grad_norm": 1.1831798553466797, "learning_rate": 5.541303096795769e-07, "loss": 0.2127, "step": 5679 }, { "epoch": 2.6895531222255107, "grad_norm": 1.17523992061615, "learning_rate": 5.524527835097726e-07, "loss": 0.1985, "step": 5680 }, { "epoch": 2.6900266350991417, "grad_norm": 1.547934651374817, "learning_rate": 5.507777282548021e-07, "loss": 0.2032, "step": 5681 }, { "epoch": 2.690500147972773, "grad_norm": 1.073725700378418, "learning_rate": 5.491051443527573e-07, "loss": 0.2342, "step": 5682 }, { "epoch": 2.6909736608464043, "grad_norm": 0.9526923298835754, "learning_rate": 5.474350322410882e-07, "loss": 0.1952, "step": 5683 }, { "epoch": 2.6914471737200354, "grad_norm": 1.3431147336959839, "learning_rate": 5.457673923565954e-07, "loss": 0.1967, "step": 5684 }, { "epoch": 2.691920686593667, "grad_norm": 1.127583622932434, "learning_rate": 5.441022251354355e-07, "loss": 0.2074, "step": 5685 }, { "epoch": 2.692394199467298, "grad_norm": 0.9880527257919312, "learning_rate": 5.424395310131159e-07, "loss": 0.2143, "step": 5686 }, { "epoch": 2.692867712340929, "grad_norm": 1.1635633707046509, "learning_rate": 5.407793104244963e-07, "loss": 0.2018, "step": 5687 }, { "epoch": 2.6933412252145605, "grad_norm": 1.3094615936279297, "learning_rate": 5.391215638037961e-07, "loss": 0.229, "step": 5688 }, { "epoch": 2.6938147380881916, "grad_norm": 1.3654303550720215, "learning_rate": 5.374662915845774e-07, "loss": 0.2072, "step": 5689 }, { "epoch": 2.694288250961823, "grad_norm": 1.0598773956298828, "learning_rate": 5.358134941997661e-07, "loss": 0.219, "step": 5690 }, { "epoch": 2.694761763835454, "grad_norm": 1.110864281654358, "learning_rate": 5.341631720816309e-07, "loss": 0.192, "step": 5691 }, { "epoch": 2.6952352767090857, "grad_norm": 1.1458567380905151, "learning_rate": 5.325153256617988e-07, "loss": 0.1928, "step": 5692 }, { "epoch": 2.6957087895827168, "grad_norm": 1.2874677181243896, "learning_rate": 5.308699553712515e-07, "loss": 0.2226, "step": 5693 }, { "epoch": 2.6961823024563483, "grad_norm": 1.3333238363265991, "learning_rate": 5.29227061640315e-07, "loss": 0.1834, "step": 5694 }, { "epoch": 2.6966558153299793, "grad_norm": 1.119017481803894, "learning_rate": 5.275866448986755e-07, "loss": 0.1743, "step": 5695 }, { "epoch": 2.6971293282036104, "grad_norm": 1.0952125787734985, "learning_rate": 5.259487055753653e-07, "loss": 0.213, "step": 5696 }, { "epoch": 2.697602841077242, "grad_norm": 1.2849918603897095, "learning_rate": 5.243132440987752e-07, "loss": 0.1902, "step": 5697 }, { "epoch": 2.698076353950873, "grad_norm": 1.2609485387802124, "learning_rate": 5.226802608966419e-07, "loss": 0.2232, "step": 5698 }, { "epoch": 2.6985498668245045, "grad_norm": 1.236099123954773, "learning_rate": 5.210497563960581e-07, "loss": 0.2236, "step": 5699 }, { "epoch": 2.6990233796981355, "grad_norm": 1.6993776559829712, "learning_rate": 5.19421731023464e-07, "loss": 0.1883, "step": 5700 }, { "epoch": 2.6994968925717666, "grad_norm": 2.0070505142211914, "learning_rate": 5.177961852046565e-07, "loss": 0.202, "step": 5701 }, { "epoch": 2.699970405445398, "grad_norm": 1.0293316841125488, "learning_rate": 5.161731193647801e-07, "loss": 0.1863, "step": 5702 }, { "epoch": 2.700443918319029, "grad_norm": 0.9846745133399963, "learning_rate": 5.145525339283308e-07, "loss": 0.1895, "step": 5703 }, { "epoch": 2.7009174311926607, "grad_norm": 1.1934609413146973, "learning_rate": 5.129344293191607e-07, "loss": 0.1918, "step": 5704 }, { "epoch": 2.7013909440662918, "grad_norm": 1.1649199724197388, "learning_rate": 5.113188059604657e-07, "loss": 0.2288, "step": 5705 }, { "epoch": 2.701864456939923, "grad_norm": 1.2630609273910522, "learning_rate": 5.097056642747988e-07, "loss": 0.1989, "step": 5706 }, { "epoch": 2.7023379698135543, "grad_norm": 1.5064605474472046, "learning_rate": 5.080950046840594e-07, "loss": 0.2033, "step": 5707 }, { "epoch": 2.7028114826871854, "grad_norm": 1.2008765935897827, "learning_rate": 5.064868276095036e-07, "loss": 0.2044, "step": 5708 }, { "epoch": 2.703284995560817, "grad_norm": 1.0486986637115479, "learning_rate": 5.048811334717307e-07, "loss": 0.1992, "step": 5709 }, { "epoch": 2.703758508434448, "grad_norm": 1.0270819664001465, "learning_rate": 5.032779226906981e-07, "loss": 0.2147, "step": 5710 }, { "epoch": 2.704232021308079, "grad_norm": 0.9577698707580566, "learning_rate": 5.016771956857081e-07, "loss": 0.1697, "step": 5711 }, { "epoch": 2.7047055341817106, "grad_norm": 1.1315481662750244, "learning_rate": 5.000789528754147e-07, "loss": 0.201, "step": 5712 }, { "epoch": 2.705179047055342, "grad_norm": 1.1655607223510742, "learning_rate": 4.984831946778246e-07, "loss": 0.2057, "step": 5713 }, { "epoch": 2.705652559928973, "grad_norm": 1.2996375560760498, "learning_rate": 4.968899215102907e-07, "loss": 0.2162, "step": 5714 }, { "epoch": 2.706126072802604, "grad_norm": 1.351178765296936, "learning_rate": 4.952991337895219e-07, "loss": 0.1751, "step": 5715 }, { "epoch": 2.7065995856762357, "grad_norm": 0.9919388890266418, "learning_rate": 4.937108319315687e-07, "loss": 0.2041, "step": 5716 }, { "epoch": 2.7070730985498668, "grad_norm": 1.0616121292114258, "learning_rate": 4.92125016351841e-07, "loss": 0.2077, "step": 5717 }, { "epoch": 2.7075466114234983, "grad_norm": 1.1046313047409058, "learning_rate": 4.905416874650892e-07, "loss": 0.2069, "step": 5718 }, { "epoch": 2.7080201242971293, "grad_norm": 1.44788658618927, "learning_rate": 4.889608456854211e-07, "loss": 0.1919, "step": 5719 }, { "epoch": 2.7084936371707604, "grad_norm": 0.9813434481620789, "learning_rate": 4.873824914262882e-07, "loss": 0.1997, "step": 5720 }, { "epoch": 2.708967150044392, "grad_norm": 1.2912452220916748, "learning_rate": 4.858066251004956e-07, "loss": 0.2067, "step": 5721 }, { "epoch": 2.709440662918023, "grad_norm": 0.9664787650108337, "learning_rate": 4.842332471201961e-07, "loss": 0.1958, "step": 5722 }, { "epoch": 2.7099141757916545, "grad_norm": 1.488752841949463, "learning_rate": 4.826623578968881e-07, "loss": 0.2134, "step": 5723 }, { "epoch": 2.7103876886652856, "grad_norm": 1.1605441570281982, "learning_rate": 4.810939578414265e-07, "loss": 0.2028, "step": 5724 }, { "epoch": 2.7108612015389166, "grad_norm": 1.2621954679489136, "learning_rate": 4.795280473640085e-07, "loss": 0.2198, "step": 5725 }, { "epoch": 2.711334714412548, "grad_norm": 1.275181531906128, "learning_rate": 4.779646268741866e-07, "loss": 0.1786, "step": 5726 }, { "epoch": 2.711808227286179, "grad_norm": 0.8626628518104553, "learning_rate": 4.7640369678085275e-07, "loss": 0.1957, "step": 5727 }, { "epoch": 2.7122817401598107, "grad_norm": 1.1933006048202515, "learning_rate": 4.7484525749225907e-07, "loss": 0.1729, "step": 5728 }, { "epoch": 2.712755253033442, "grad_norm": 0.9497954249382019, "learning_rate": 4.7328930941599514e-07, "loss": 0.1848, "step": 5729 }, { "epoch": 2.713228765907073, "grad_norm": 1.1757985353469849, "learning_rate": 4.717358529590077e-07, "loss": 0.2026, "step": 5730 }, { "epoch": 2.7137022787807044, "grad_norm": 1.2278982400894165, "learning_rate": 4.7018488852758727e-07, "loss": 0.2011, "step": 5731 }, { "epoch": 2.714175791654336, "grad_norm": 1.4735335111618042, "learning_rate": 4.6863641652737157e-07, "loss": 0.2165, "step": 5732 }, { "epoch": 2.714649304527967, "grad_norm": 1.0341819524765015, "learning_rate": 4.6709043736335334e-07, "loss": 0.2221, "step": 5733 }, { "epoch": 2.715122817401598, "grad_norm": 1.042711853981018, "learning_rate": 4.655469514398636e-07, "loss": 0.208, "step": 5734 }, { "epoch": 2.7155963302752295, "grad_norm": 1.383939504623413, "learning_rate": 4.6400595916058944e-07, "loss": 0.2037, "step": 5735 }, { "epoch": 2.7160698431488606, "grad_norm": 0.9150691032409668, "learning_rate": 4.6246746092856176e-07, "loss": 0.1982, "step": 5736 }, { "epoch": 2.716543356022492, "grad_norm": 1.098061203956604, "learning_rate": 4.6093145714615763e-07, "loss": 0.2108, "step": 5737 }, { "epoch": 2.717016868896123, "grad_norm": 0.9107375144958496, "learning_rate": 4.5939794821510785e-07, "loss": 0.1822, "step": 5738 }, { "epoch": 2.717490381769754, "grad_norm": 1.2272454500198364, "learning_rate": 4.578669345364828e-07, "loss": 0.1883, "step": 5739 }, { "epoch": 2.7179638946433857, "grad_norm": 1.202950119972229, "learning_rate": 4.5633841651070766e-07, "loss": 0.1952, "step": 5740 }, { "epoch": 2.718437407517017, "grad_norm": 1.1974061727523804, "learning_rate": 4.548123945375493e-07, "loss": 0.2013, "step": 5741 }, { "epoch": 2.7189109203906483, "grad_norm": 1.0299830436706543, "learning_rate": 4.5328886901612743e-07, "loss": 0.1803, "step": 5742 }, { "epoch": 2.7193844332642794, "grad_norm": 0.9163728356361389, "learning_rate": 4.5176784034489993e-07, "loss": 0.1797, "step": 5743 }, { "epoch": 2.7198579461379104, "grad_norm": 1.6618443727493286, "learning_rate": 4.5024930892168305e-07, "loss": 0.2081, "step": 5744 }, { "epoch": 2.720331459011542, "grad_norm": 1.2780410051345825, "learning_rate": 4.487332751436302e-07, "loss": 0.1836, "step": 5745 }, { "epoch": 2.720804971885173, "grad_norm": 0.9704563617706299, "learning_rate": 4.472197394072464e-07, "loss": 0.1998, "step": 5746 }, { "epoch": 2.7212784847588045, "grad_norm": 1.1087617874145508, "learning_rate": 4.457087021083839e-07, "loss": 0.2173, "step": 5747 }, { "epoch": 2.7217519976324356, "grad_norm": 1.3192325830459595, "learning_rate": 4.442001636422366e-07, "loss": 0.2108, "step": 5748 }, { "epoch": 2.7222255105060666, "grad_norm": 0.8854762315750122, "learning_rate": 4.4269412440335114e-07, "loss": 0.2035, "step": 5749 }, { "epoch": 2.722699023379698, "grad_norm": 1.296974778175354, "learning_rate": 4.411905847856157e-07, "loss": 0.1926, "step": 5750 }, { "epoch": 2.7231725362533292, "grad_norm": 1.4468486309051514, "learning_rate": 4.39689545182268e-07, "loss": 0.198, "step": 5751 }, { "epoch": 2.7236460491269607, "grad_norm": 1.2393492460250854, "learning_rate": 4.381910059858896e-07, "loss": 0.1975, "step": 5752 }, { "epoch": 2.724119562000592, "grad_norm": 1.3274880647659302, "learning_rate": 4.366949675884091e-07, "loss": 0.201, "step": 5753 }, { "epoch": 2.7245930748742233, "grad_norm": 1.3509947061538696, "learning_rate": 4.352014303811003e-07, "loss": 0.1904, "step": 5754 }, { "epoch": 2.7250665877478544, "grad_norm": 0.9841063022613525, "learning_rate": 4.337103947545862e-07, "loss": 0.1994, "step": 5755 }, { "epoch": 2.725540100621486, "grad_norm": 1.1662771701812744, "learning_rate": 4.3222186109882933e-07, "loss": 0.2036, "step": 5756 }, { "epoch": 2.726013613495117, "grad_norm": 0.9167471528053284, "learning_rate": 4.307358298031428e-07, "loss": 0.1971, "step": 5757 }, { "epoch": 2.726487126368748, "grad_norm": 1.2513035535812378, "learning_rate": 4.2925230125618336e-07, "loss": 0.2269, "step": 5758 }, { "epoch": 2.7269606392423795, "grad_norm": 1.0162153244018555, "learning_rate": 4.2777127584595403e-07, "loss": 0.1965, "step": 5759 }, { "epoch": 2.7274341521160106, "grad_norm": 1.4992437362670898, "learning_rate": 4.2629275395980275e-07, "loss": 0.1795, "step": 5760 }, { "epoch": 2.727907664989642, "grad_norm": 1.2534233331680298, "learning_rate": 4.248167359844224e-07, "loss": 0.2045, "step": 5761 }, { "epoch": 2.728381177863273, "grad_norm": 1.2920854091644287, "learning_rate": 4.23343222305852e-07, "loss": 0.1998, "step": 5762 }, { "epoch": 2.7288546907369042, "grad_norm": 0.9584261775016785, "learning_rate": 4.2187221330947216e-07, "loss": 0.1905, "step": 5763 }, { "epoch": 2.7293282036105357, "grad_norm": 1.5315953493118286, "learning_rate": 4.2040370938001507e-07, "loss": 0.21, "step": 5764 }, { "epoch": 2.729801716484167, "grad_norm": 1.48392915725708, "learning_rate": 4.1893771090155246e-07, "loss": 0.1877, "step": 5765 }, { "epoch": 2.7302752293577983, "grad_norm": 0.9685987830162048, "learning_rate": 4.174742182574998e-07, "loss": 0.2019, "step": 5766 }, { "epoch": 2.7307487422314294, "grad_norm": 1.2201647758483887, "learning_rate": 4.1601323183062205e-07, "loss": 0.1967, "step": 5767 }, { "epoch": 2.7312222551050604, "grad_norm": 0.9604336023330688, "learning_rate": 4.1455475200302353e-07, "loss": 0.1898, "step": 5768 }, { "epoch": 2.731695767978692, "grad_norm": 0.9989910125732422, "learning_rate": 4.1309877915615913e-07, "loss": 0.2053, "step": 5769 }, { "epoch": 2.732169280852323, "grad_norm": 1.192683219909668, "learning_rate": 4.116453136708187e-07, "loss": 0.2246, "step": 5770 }, { "epoch": 2.7326427937259545, "grad_norm": 1.0321522951126099, "learning_rate": 4.101943559271504e-07, "loss": 0.2037, "step": 5771 }, { "epoch": 2.7331163065995856, "grad_norm": 1.1789295673370361, "learning_rate": 4.0874590630463283e-07, "loss": 0.2316, "step": 5772 }, { "epoch": 2.7335898194732167, "grad_norm": 1.1018298864364624, "learning_rate": 4.072999651820941e-07, "loss": 0.2042, "step": 5773 }, { "epoch": 2.734063332346848, "grad_norm": 0.9121476411819458, "learning_rate": 4.058565329377073e-07, "loss": 0.1769, "step": 5774 }, { "epoch": 2.7345368452204797, "grad_norm": 1.112981915473938, "learning_rate": 4.044156099489882e-07, "loss": 0.189, "step": 5775 }, { "epoch": 2.7350103580941107, "grad_norm": 1.084700345993042, "learning_rate": 4.0297719659279645e-07, "loss": 0.2051, "step": 5776 }, { "epoch": 2.735483870967742, "grad_norm": 1.0018030405044556, "learning_rate": 4.015412932453333e-07, "loss": 0.2046, "step": 5777 }, { "epoch": 2.7359573838413733, "grad_norm": 1.5083683729171753, "learning_rate": 4.0010790028214843e-07, "loss": 0.1895, "step": 5778 }, { "epoch": 2.7364308967150044, "grad_norm": 1.0385921001434326, "learning_rate": 3.9867701807812963e-07, "loss": 0.2093, "step": 5779 }, { "epoch": 2.736904409588636, "grad_norm": 1.0134193897247314, "learning_rate": 3.9724864700751207e-07, "loss": 0.2117, "step": 5780 }, { "epoch": 2.737377922462267, "grad_norm": 1.1653934717178345, "learning_rate": 3.9582278744387137e-07, "loss": 0.2079, "step": 5781 }, { "epoch": 2.737851435335898, "grad_norm": 1.1373720169067383, "learning_rate": 3.9439943976012696e-07, "loss": 0.2118, "step": 5782 }, { "epoch": 2.7383249482095295, "grad_norm": 1.2817039489746094, "learning_rate": 3.929786043285433e-07, "loss": 0.2094, "step": 5783 }, { "epoch": 2.7387984610831606, "grad_norm": 0.9910814166069031, "learning_rate": 3.915602815207231e-07, "loss": 0.2151, "step": 5784 }, { "epoch": 2.739271973956792, "grad_norm": 1.1655786037445068, "learning_rate": 3.901444717076186e-07, "loss": 0.2014, "step": 5785 }, { "epoch": 2.739745486830423, "grad_norm": 1.3065234422683716, "learning_rate": 3.8873117525951797e-07, "loss": 0.1806, "step": 5786 }, { "epoch": 2.7402189997040542, "grad_norm": 1.1026694774627686, "learning_rate": 3.873203925460589e-07, "loss": 0.183, "step": 5787 }, { "epoch": 2.7406925125776858, "grad_norm": 1.210478663444519, "learning_rate": 3.8591212393621405e-07, "loss": 0.216, "step": 5788 }, { "epoch": 2.741166025451317, "grad_norm": 1.6913577318191528, "learning_rate": 3.845063697983065e-07, "loss": 0.2123, "step": 5789 }, { "epoch": 2.7416395383249483, "grad_norm": 1.2592370510101318, "learning_rate": 3.8310313049999546e-07, "loss": 0.1888, "step": 5790 }, { "epoch": 2.7421130511985794, "grad_norm": 0.9264201521873474, "learning_rate": 3.8170240640828304e-07, "loss": 0.1949, "step": 5791 }, { "epoch": 2.7425865640722105, "grad_norm": 1.0950411558151245, "learning_rate": 3.8030419788951834e-07, "loss": 0.215, "step": 5792 }, { "epoch": 2.743060076945842, "grad_norm": 1.2804242372512817, "learning_rate": 3.789085053093866e-07, "loss": 0.2031, "step": 5793 }, { "epoch": 2.7435335898194735, "grad_norm": 1.092449426651001, "learning_rate": 3.775153290329203e-07, "loss": 0.1747, "step": 5794 }, { "epoch": 2.7440071026931045, "grad_norm": 0.9500091075897217, "learning_rate": 3.7612466942448797e-07, "loss": 0.2102, "step": 5795 }, { "epoch": 2.7444806155667356, "grad_norm": 1.2651680707931519, "learning_rate": 3.747365268478076e-07, "loss": 0.1895, "step": 5796 }, { "epoch": 2.744954128440367, "grad_norm": 1.0188058614730835, "learning_rate": 3.733509016659298e-07, "loss": 0.2146, "step": 5797 }, { "epoch": 2.745427641313998, "grad_norm": 1.008729338645935, "learning_rate": 3.7196779424125585e-07, "loss": 0.2023, "step": 5798 }, { "epoch": 2.7459011541876297, "grad_norm": 1.0331710577011108, "learning_rate": 3.705872049355208e-07, "loss": 0.1947, "step": 5799 }, { "epoch": 2.7463746670612608, "grad_norm": 0.8746491074562073, "learning_rate": 3.6920913410980585e-07, "loss": 0.1955, "step": 5800 }, { "epoch": 2.746848179934892, "grad_norm": 0.9344222545623779, "learning_rate": 3.678335821245327e-07, "loss": 0.1811, "step": 5801 }, { "epoch": 2.7473216928085233, "grad_norm": 0.9031004309654236, "learning_rate": 3.664605493394624e-07, "loss": 0.1813, "step": 5802 }, { "epoch": 2.7477952056821544, "grad_norm": 1.2377285957336426, "learning_rate": 3.6509003611369884e-07, "loss": 0.2265, "step": 5803 }, { "epoch": 2.748268718555786, "grad_norm": 1.0772106647491455, "learning_rate": 3.6372204280568644e-07, "loss": 0.2122, "step": 5804 }, { "epoch": 2.748742231429417, "grad_norm": 0.8862806558609009, "learning_rate": 3.623565697732123e-07, "loss": 0.1984, "step": 5805 }, { "epoch": 2.749215744303048, "grad_norm": 1.0365098714828491, "learning_rate": 3.6099361737339965e-07, "loss": 0.1737, "step": 5806 }, { "epoch": 2.7496892571766796, "grad_norm": 1.0085512399673462, "learning_rate": 3.596331859627189e-07, "loss": 0.1949, "step": 5807 }, { "epoch": 2.7501627700503106, "grad_norm": 1.1064403057098389, "learning_rate": 3.582752758969743e-07, "loss": 0.1957, "step": 5808 }, { "epoch": 2.750636282923942, "grad_norm": 0.9437001347541809, "learning_rate": 3.5691988753131625e-07, "loss": 0.1861, "step": 5809 }, { "epoch": 2.751109795797573, "grad_norm": 0.9680258631706238, "learning_rate": 3.5556702122023444e-07, "loss": 0.2232, "step": 5810 }, { "epoch": 2.7515833086712043, "grad_norm": 0.9630969762802124, "learning_rate": 3.5421667731755484e-07, "loss": 0.196, "step": 5811 }, { "epoch": 2.7520568215448358, "grad_norm": 1.0958917140960693, "learning_rate": 3.528688561764515e-07, "loss": 0.2042, "step": 5812 }, { "epoch": 2.752530334418467, "grad_norm": 1.1072863340377808, "learning_rate": 3.5152355814942916e-07, "loss": 0.2031, "step": 5813 }, { "epoch": 2.7530038472920983, "grad_norm": 1.231067180633545, "learning_rate": 3.5018078358834084e-07, "loss": 0.1989, "step": 5814 }, { "epoch": 2.7534773601657294, "grad_norm": 1.1398643255233765, "learning_rate": 3.4884053284437444e-07, "loss": 0.2065, "step": 5815 }, { "epoch": 2.7539508730393605, "grad_norm": 1.0876073837280273, "learning_rate": 3.4750280626805964e-07, "loss": 0.1859, "step": 5816 }, { "epoch": 2.754424385912992, "grad_norm": 1.428344964981079, "learning_rate": 3.461676042092688e-07, "loss": 0.2038, "step": 5817 }, { "epoch": 2.7548978987866235, "grad_norm": 0.8772413730621338, "learning_rate": 3.4483492701720687e-07, "loss": 0.1925, "step": 5818 }, { "epoch": 2.7553714116602546, "grad_norm": 1.0749726295471191, "learning_rate": 3.435047750404252e-07, "loss": 0.2179, "step": 5819 }, { "epoch": 2.7558449245338856, "grad_norm": 1.8996601104736328, "learning_rate": 3.4217714862681215e-07, "loss": 0.1664, "step": 5820 }, { "epoch": 2.756318437407517, "grad_norm": 1.382574200630188, "learning_rate": 3.408520481235955e-07, "loss": 0.2156, "step": 5821 }, { "epoch": 2.756791950281148, "grad_norm": 1.1029624938964844, "learning_rate": 3.395294738773403e-07, "loss": 0.2294, "step": 5822 }, { "epoch": 2.7572654631547797, "grad_norm": 1.0061315298080444, "learning_rate": 3.382094262339575e-07, "loss": 0.1968, "step": 5823 }, { "epoch": 2.757738976028411, "grad_norm": 1.1826341152191162, "learning_rate": 3.368919055386888e-07, "loss": 0.1868, "step": 5824 }, { "epoch": 2.758212488902042, "grad_norm": 1.690619707107544, "learning_rate": 3.3557691213612075e-07, "loss": 0.2114, "step": 5825 }, { "epoch": 2.7586860017756734, "grad_norm": 1.034387469291687, "learning_rate": 3.342644463701783e-07, "loss": 0.2105, "step": 5826 }, { "epoch": 2.7591595146493044, "grad_norm": 0.9920729398727417, "learning_rate": 3.3295450858412125e-07, "loss": 0.1833, "step": 5827 }, { "epoch": 2.759633027522936, "grad_norm": 1.3329393863677979, "learning_rate": 3.316470991205534e-07, "loss": 0.2245, "step": 5828 }, { "epoch": 2.760106540396567, "grad_norm": 1.0018433332443237, "learning_rate": 3.3034221832141446e-07, "loss": 0.1915, "step": 5829 }, { "epoch": 2.760580053270198, "grad_norm": 1.5985726118087769, "learning_rate": 3.2903986652798367e-07, "loss": 0.1633, "step": 5830 }, { "epoch": 2.7610535661438296, "grad_norm": 2.1785120964050293, "learning_rate": 3.2774004408087756e-07, "loss": 0.1907, "step": 5831 }, { "epoch": 2.7615270790174606, "grad_norm": 0.9680808782577515, "learning_rate": 3.264427513200552e-07, "loss": 0.1905, "step": 5832 }, { "epoch": 2.762000591891092, "grad_norm": 1.224849820137024, "learning_rate": 3.2514798858480635e-07, "loss": 0.192, "step": 5833 }, { "epoch": 2.762474104764723, "grad_norm": 1.091199517250061, "learning_rate": 3.238557562137679e-07, "loss": 0.212, "step": 5834 }, { "epoch": 2.7629476176383543, "grad_norm": 1.2063572406768799, "learning_rate": 3.225660545449083e-07, "loss": 0.1971, "step": 5835 }, { "epoch": 2.763421130511986, "grad_norm": 1.4341157674789429, "learning_rate": 3.2127888391553674e-07, "loss": 0.1935, "step": 5836 }, { "epoch": 2.7638946433856173, "grad_norm": 1.0425024032592773, "learning_rate": 3.1999424466230166e-07, "loss": 0.2098, "step": 5837 }, { "epoch": 2.7643681562592484, "grad_norm": 1.3151661157608032, "learning_rate": 3.187121371211854e-07, "loss": 0.1904, "step": 5838 }, { "epoch": 2.7648416691328794, "grad_norm": 1.4269354343414307, "learning_rate": 3.1743256162751534e-07, "loss": 0.2162, "step": 5839 }, { "epoch": 2.765315182006511, "grad_norm": 0.9791456460952759, "learning_rate": 3.161555185159471e-07, "loss": 0.177, "step": 5840 }, { "epoch": 2.765788694880142, "grad_norm": 1.1203821897506714, "learning_rate": 3.1488100812048337e-07, "loss": 0.2033, "step": 5841 }, { "epoch": 2.7662622077537735, "grad_norm": 1.1284160614013672, "learning_rate": 3.136090307744555e-07, "loss": 0.1727, "step": 5842 }, { "epoch": 2.7667357206274046, "grad_norm": 1.2004402875900269, "learning_rate": 3.1233958681054164e-07, "loss": 0.183, "step": 5843 }, { "epoch": 2.7672092335010356, "grad_norm": 2.777414083480835, "learning_rate": 3.110726765607497e-07, "loss": 0.2068, "step": 5844 }, { "epoch": 2.767682746374667, "grad_norm": 1.0925432443618774, "learning_rate": 3.0980830035642783e-07, "loss": 0.2027, "step": 5845 }, { "epoch": 2.7681562592482982, "grad_norm": 0.955273449420929, "learning_rate": 3.085464585282627e-07, "loss": 0.2024, "step": 5846 }, { "epoch": 2.7686297721219297, "grad_norm": 1.7570418119430542, "learning_rate": 3.0728715140627584e-07, "loss": 0.199, "step": 5847 }, { "epoch": 2.769103284995561, "grad_norm": 1.181026577949524, "learning_rate": 3.0603037931982603e-07, "loss": 0.2072, "step": 5848 }, { "epoch": 2.769576797869192, "grad_norm": 1.4937407970428467, "learning_rate": 3.0477614259761254e-07, "loss": 0.2191, "step": 5849 }, { "epoch": 2.7700503107428234, "grad_norm": 0.9318932890892029, "learning_rate": 3.035244415676675e-07, "loss": 0.1817, "step": 5850 }, { "epoch": 2.7705238236164544, "grad_norm": 1.0921509265899658, "learning_rate": 3.0227527655736223e-07, "loss": 0.2072, "step": 5851 }, { "epoch": 2.770997336490086, "grad_norm": 1.1498230695724487, "learning_rate": 3.01028647893401e-07, "loss": 0.2132, "step": 5852 }, { "epoch": 2.771470849363717, "grad_norm": 1.4725837707519531, "learning_rate": 2.9978455590182974e-07, "loss": 0.2007, "step": 5853 }, { "epoch": 2.771944362237348, "grad_norm": 1.1440794467926025, "learning_rate": 2.985430009080281e-07, "loss": 0.2044, "step": 5854 }, { "epoch": 2.7724178751109796, "grad_norm": 1.0810843706130981, "learning_rate": 2.9730398323671415e-07, "loss": 0.175, "step": 5855 }, { "epoch": 2.772891387984611, "grad_norm": 1.1769734621047974, "learning_rate": 2.960675032119387e-07, "loss": 0.2085, "step": 5856 }, { "epoch": 2.773364900858242, "grad_norm": 0.9242314100265503, "learning_rate": 2.9483356115709295e-07, "loss": 0.2023, "step": 5857 }, { "epoch": 2.7738384137318732, "grad_norm": 0.8873565793037415, "learning_rate": 2.936021573949011e-07, "loss": 0.2078, "step": 5858 }, { "epoch": 2.7743119266055047, "grad_norm": 1.0390340089797974, "learning_rate": 2.923732922474265e-07, "loss": 0.2173, "step": 5859 }, { "epoch": 2.774785439479136, "grad_norm": 1.056854248046875, "learning_rate": 2.911469660360655e-07, "loss": 0.2255, "step": 5860 }, { "epoch": 2.7752589523527673, "grad_norm": 0.9877737164497375, "learning_rate": 2.8992317908155263e-07, "loss": 0.1992, "step": 5861 }, { "epoch": 2.7757324652263984, "grad_norm": 1.1483983993530273, "learning_rate": 2.8870193170395855e-07, "loss": 0.2159, "step": 5862 }, { "epoch": 2.7762059781000294, "grad_norm": 1.4070454835891724, "learning_rate": 2.874832242226866e-07, "loss": 0.2035, "step": 5863 }, { "epoch": 2.776679490973661, "grad_norm": 0.9695731401443481, "learning_rate": 2.862670569564796e-07, "loss": 0.2004, "step": 5864 }, { "epoch": 2.777153003847292, "grad_norm": 1.5455920696258545, "learning_rate": 2.850534302234131e-07, "loss": 0.1964, "step": 5865 }, { "epoch": 2.7776265167209235, "grad_norm": 1.2403937578201294, "learning_rate": 2.838423443409011e-07, "loss": 0.22, "step": 5866 }, { "epoch": 2.7781000295945546, "grad_norm": 1.3792948722839355, "learning_rate": 2.8263379962568894e-07, "loss": 0.2188, "step": 5867 }, { "epoch": 2.7785735424681857, "grad_norm": 0.9905843138694763, "learning_rate": 2.8142779639386275e-07, "loss": 0.1821, "step": 5868 }, { "epoch": 2.779047055341817, "grad_norm": 0.9501548409461975, "learning_rate": 2.8022433496084024e-07, "loss": 0.2016, "step": 5869 }, { "epoch": 2.7795205682154482, "grad_norm": 0.990218460559845, "learning_rate": 2.7902341564137294e-07, "loss": 0.1896, "step": 5870 }, { "epoch": 2.7799940810890797, "grad_norm": 0.9677179455757141, "learning_rate": 2.778250387495529e-07, "loss": 0.21, "step": 5871 }, { "epoch": 2.780467593962711, "grad_norm": 1.4591187238693237, "learning_rate": 2.766292045988006e-07, "loss": 0.2014, "step": 5872 }, { "epoch": 2.780941106836342, "grad_norm": 1.055142879486084, "learning_rate": 2.754359135018791e-07, "loss": 0.1694, "step": 5873 }, { "epoch": 2.7814146197099734, "grad_norm": 1.0192638635635376, "learning_rate": 2.742451657708778e-07, "loss": 0.1877, "step": 5874 }, { "epoch": 2.7818881325836045, "grad_norm": 1.215868353843689, "learning_rate": 2.730569617172296e-07, "loss": 0.1723, "step": 5875 }, { "epoch": 2.782361645457236, "grad_norm": 1.0110760927200317, "learning_rate": 2.7187130165169383e-07, "loss": 0.2039, "step": 5876 }, { "epoch": 2.782835158330867, "grad_norm": 1.146170735359192, "learning_rate": 2.706881858843702e-07, "loss": 0.1955, "step": 5877 }, { "epoch": 2.783308671204498, "grad_norm": 0.9560944437980652, "learning_rate": 2.695076147246911e-07, "loss": 0.1919, "step": 5878 }, { "epoch": 2.7837821840781296, "grad_norm": 1.2928173542022705, "learning_rate": 2.683295884814252e-07, "loss": 0.1987, "step": 5879 }, { "epoch": 2.784255696951761, "grad_norm": 1.1882565021514893, "learning_rate": 2.6715410746267155e-07, "loss": 0.2041, "step": 5880 }, { "epoch": 2.784729209825392, "grad_norm": 1.3022156953811646, "learning_rate": 2.6598117197586646e-07, "loss": 0.1777, "step": 5881 }, { "epoch": 2.7852027226990232, "grad_norm": 1.2867157459259033, "learning_rate": 2.6481078232778013e-07, "loss": 0.1863, "step": 5882 }, { "epoch": 2.7856762355726548, "grad_norm": 1.3041218519210815, "learning_rate": 2.6364293882451655e-07, "loss": 0.1834, "step": 5883 }, { "epoch": 2.786149748446286, "grad_norm": 1.1009517908096313, "learning_rate": 2.624776417715147e-07, "loss": 0.1964, "step": 5884 }, { "epoch": 2.7866232613199173, "grad_norm": 1.0136332511901855, "learning_rate": 2.6131489147354527e-07, "loss": 0.1895, "step": 5885 }, { "epoch": 2.7870967741935484, "grad_norm": 1.4136160612106323, "learning_rate": 2.60154688234715e-07, "loss": 0.2031, "step": 5886 }, { "epoch": 2.7875702870671795, "grad_norm": 1.1578394174575806, "learning_rate": 2.589970323584645e-07, "loss": 0.2127, "step": 5887 }, { "epoch": 2.788043799940811, "grad_norm": 1.2432137727737427, "learning_rate": 2.5784192414756714e-07, "loss": 0.1898, "step": 5888 }, { "epoch": 2.788517312814442, "grad_norm": 1.029590368270874, "learning_rate": 2.5668936390413014e-07, "loss": 0.1919, "step": 5889 }, { "epoch": 2.7889908256880735, "grad_norm": 1.444937825202942, "learning_rate": 2.5553935192959457e-07, "loss": 0.2137, "step": 5890 }, { "epoch": 2.7894643385617046, "grad_norm": 1.5030622482299805, "learning_rate": 2.543918885247354e-07, "loss": 0.1897, "step": 5891 }, { "epoch": 2.7899378514353357, "grad_norm": 0.9852031469345093, "learning_rate": 2.5324697398965927e-07, "loss": 0.2015, "step": 5892 }, { "epoch": 2.790411364308967, "grad_norm": 1.4873343706130981, "learning_rate": 2.5210460862380993e-07, "loss": 0.1801, "step": 5893 }, { "epoch": 2.7908848771825983, "grad_norm": 1.5172432661056519, "learning_rate": 2.5096479272596064e-07, "loss": 0.2038, "step": 5894 }, { "epoch": 2.7913583900562298, "grad_norm": 1.1348522901535034, "learning_rate": 2.4982752659421736e-07, "loss": 0.1754, "step": 5895 }, { "epoch": 2.791831902929861, "grad_norm": 0.959349513053894, "learning_rate": 2.4869281052602447e-07, "loss": 0.1908, "step": 5896 }, { "epoch": 2.792305415803492, "grad_norm": 1.4271739721298218, "learning_rate": 2.4756064481815445e-07, "loss": 0.1959, "step": 5897 }, { "epoch": 2.7927789286771234, "grad_norm": 1.5378594398498535, "learning_rate": 2.4643102976671383e-07, "loss": 0.2231, "step": 5898 }, { "epoch": 2.793252441550755, "grad_norm": 1.1620539426803589, "learning_rate": 2.45303965667143e-07, "loss": 0.2145, "step": 5899 }, { "epoch": 2.793725954424386, "grad_norm": 1.0995843410491943, "learning_rate": 2.4417945281421495e-07, "loss": 0.204, "step": 5900 }, { "epoch": 2.794199467298017, "grad_norm": 0.9468101859092712, "learning_rate": 2.430574915020345e-07, "loss": 0.1756, "step": 5901 }, { "epoch": 2.7946729801716486, "grad_norm": 1.0497629642486572, "learning_rate": 2.419380820240413e-07, "loss": 0.2112, "step": 5902 }, { "epoch": 2.7951464930452796, "grad_norm": 1.391728401184082, "learning_rate": 2.408212246730035e-07, "loss": 0.2024, "step": 5903 }, { "epoch": 2.795620005918911, "grad_norm": 0.9138317108154297, "learning_rate": 2.397069197410273e-07, "loss": 0.2109, "step": 5904 }, { "epoch": 2.796093518792542, "grad_norm": 0.9192736148834229, "learning_rate": 2.3859516751954637e-07, "loss": 0.2016, "step": 5905 }, { "epoch": 2.7965670316661733, "grad_norm": 1.567175030708313, "learning_rate": 2.3748596829932914e-07, "loss": 0.2126, "step": 5906 }, { "epoch": 2.7970405445398048, "grad_norm": 1.9273465871810913, "learning_rate": 2.36379322370478e-07, "loss": 0.2062, "step": 5907 }, { "epoch": 2.797514057413436, "grad_norm": 1.1850330829620361, "learning_rate": 2.3527523002242147e-07, "loss": 0.2089, "step": 5908 }, { "epoch": 2.7979875702870673, "grad_norm": 1.0225036144256592, "learning_rate": 2.3417369154392854e-07, "loss": 0.2057, "step": 5909 }, { "epoch": 2.7984610831606984, "grad_norm": 0.9364516735076904, "learning_rate": 2.330747072230921e-07, "loss": 0.1966, "step": 5910 }, { "epoch": 2.7989345960343295, "grad_norm": 1.2841675281524658, "learning_rate": 2.3197827734734446e-07, "loss": 0.1784, "step": 5911 }, { "epoch": 2.799408108907961, "grad_norm": 1.0790499448776245, "learning_rate": 2.3088440220344288e-07, "loss": 0.1821, "step": 5912 }, { "epoch": 2.799881621781592, "grad_norm": 1.2959651947021484, "learning_rate": 2.2979308207748295e-07, "loss": 0.1877, "step": 5913 }, { "epoch": 2.8003551346552236, "grad_norm": 1.3200701475143433, "learning_rate": 2.287043172548875e-07, "loss": 0.2125, "step": 5914 }, { "epoch": 2.8008286475288546, "grad_norm": 1.6399303674697876, "learning_rate": 2.2761810802041205e-07, "loss": 0.2024, "step": 5915 }, { "epoch": 2.8013021604024857, "grad_norm": 1.029131293296814, "learning_rate": 2.2653445465814493e-07, "loss": 0.2218, "step": 5916 }, { "epoch": 2.801775673276117, "grad_norm": 1.1296693086624146, "learning_rate": 2.2545335745150387e-07, "loss": 0.1993, "step": 5917 }, { "epoch": 2.8022491861497487, "grad_norm": 1.231637954711914, "learning_rate": 2.2437481668324268e-07, "loss": 0.1831, "step": 5918 }, { "epoch": 2.80272269902338, "grad_norm": 1.1062796115875244, "learning_rate": 2.2329883263543905e-07, "loss": 0.19, "step": 5919 }, { "epoch": 2.803196211897011, "grad_norm": 0.9966011643409729, "learning_rate": 2.22225405589509e-07, "loss": 0.1985, "step": 5920 }, { "epoch": 2.8036697247706424, "grad_norm": 0.9549378752708435, "learning_rate": 2.2115453582619573e-07, "loss": 0.1719, "step": 5921 }, { "epoch": 2.8041432376442734, "grad_norm": 1.1001967191696167, "learning_rate": 2.200862236255763e-07, "loss": 0.2117, "step": 5922 }, { "epoch": 2.804616750517905, "grad_norm": 1.617583990097046, "learning_rate": 2.1902046926705611e-07, "loss": 0.1837, "step": 5923 }, { "epoch": 2.805090263391536, "grad_norm": 1.1403623819351196, "learning_rate": 2.179572730293733e-07, "loss": 0.2129, "step": 5924 }, { "epoch": 2.805563776265167, "grad_norm": 1.1201438903808594, "learning_rate": 2.1689663519059545e-07, "loss": 0.1948, "step": 5925 }, { "epoch": 2.8060372891387986, "grad_norm": 1.2484549283981323, "learning_rate": 2.158385560281251e-07, "loss": 0.197, "step": 5926 }, { "epoch": 2.8065108020124296, "grad_norm": 1.149869441986084, "learning_rate": 2.1478303581869087e-07, "loss": 0.1816, "step": 5927 }, { "epoch": 2.806984314886061, "grad_norm": 1.0050691366195679, "learning_rate": 2.1373007483835306e-07, "loss": 0.1902, "step": 5928 }, { "epoch": 2.807457827759692, "grad_norm": 1.3803904056549072, "learning_rate": 2.1267967336250472e-07, "loss": 0.2059, "step": 5929 }, { "epoch": 2.8079313406333233, "grad_norm": 1.0127631425857544, "learning_rate": 2.1163183166586943e-07, "loss": 0.1983, "step": 5930 }, { "epoch": 2.808404853506955, "grad_norm": 1.1045966148376465, "learning_rate": 2.1058655002249573e-07, "loss": 0.2302, "step": 5931 }, { "epoch": 2.808878366380586, "grad_norm": 1.2346396446228027, "learning_rate": 2.0954382870577162e-07, "loss": 0.193, "step": 5932 }, { "epoch": 2.8093518792542174, "grad_norm": 1.0287126302719116, "learning_rate": 2.0850366798840892e-07, "loss": 0.2041, "step": 5933 }, { "epoch": 2.8098253921278484, "grad_norm": 1.2126495838165283, "learning_rate": 2.074660681424512e-07, "loss": 0.198, "step": 5934 }, { "epoch": 2.8102989050014795, "grad_norm": 0.9380123615264893, "learning_rate": 2.064310294392724e-07, "loss": 0.1903, "step": 5935 }, { "epoch": 2.810772417875111, "grad_norm": 1.0035574436187744, "learning_rate": 2.0539855214957828e-07, "loss": 0.2078, "step": 5936 }, { "epoch": 2.811245930748742, "grad_norm": 1.22806715965271, "learning_rate": 2.043686365434028e-07, "loss": 0.2211, "step": 5937 }, { "epoch": 2.8117194436223736, "grad_norm": 1.3754026889801025, "learning_rate": 2.0334128289011046e-07, "loss": 0.1877, "step": 5938 }, { "epoch": 2.8121929564960046, "grad_norm": 1.7814515829086304, "learning_rate": 2.0231649145839528e-07, "loss": 0.2241, "step": 5939 }, { "epoch": 2.8126664693696357, "grad_norm": 1.2736109495162964, "learning_rate": 2.012942625162817e-07, "loss": 0.186, "step": 5940 }, { "epoch": 2.8131399822432672, "grad_norm": 1.2346388101577759, "learning_rate": 2.002745963311248e-07, "loss": 0.1886, "step": 5941 }, { "epoch": 2.8136134951168987, "grad_norm": 0.9936687350273132, "learning_rate": 1.9925749316960563e-07, "loss": 0.2186, "step": 5942 }, { "epoch": 2.81408700799053, "grad_norm": 0.9427998065948486, "learning_rate": 1.9824295329774145e-07, "loss": 0.1811, "step": 5943 }, { "epoch": 2.814560520864161, "grad_norm": 1.290932059288025, "learning_rate": 1.9723097698087336e-07, "loss": 0.1865, "step": 5944 }, { "epoch": 2.8150340337377924, "grad_norm": 1.3386746644973755, "learning_rate": 1.9622156448367403e-07, "loss": 0.1974, "step": 5945 }, { "epoch": 2.8155075466114234, "grad_norm": 0.9645776748657227, "learning_rate": 1.9521471607014565e-07, "loss": 0.1782, "step": 5946 }, { "epoch": 2.815981059485055, "grad_norm": 1.2050148248672485, "learning_rate": 1.9421043200361976e-07, "loss": 0.1887, "step": 5947 }, { "epoch": 2.816454572358686, "grad_norm": 1.0068618059158325, "learning_rate": 1.9320871254675745e-07, "loss": 0.2107, "step": 5948 }, { "epoch": 2.816928085232317, "grad_norm": 2.5484302043914795, "learning_rate": 1.9220955796154794e-07, "loss": 0.1889, "step": 5949 }, { "epoch": 2.8174015981059486, "grad_norm": 1.0873286724090576, "learning_rate": 1.9121296850931225e-07, "loss": 0.2036, "step": 5950 }, { "epoch": 2.8178751109795797, "grad_norm": 1.162097454071045, "learning_rate": 1.9021894445069744e-07, "loss": 0.2023, "step": 5951 }, { "epoch": 2.818348623853211, "grad_norm": 1.1545616388320923, "learning_rate": 1.892274860456811e-07, "loss": 0.2182, "step": 5952 }, { "epoch": 2.8188221367268422, "grad_norm": 1.2992243766784668, "learning_rate": 1.8823859355356798e-07, "loss": 0.1967, "step": 5953 }, { "epoch": 2.8192956496004733, "grad_norm": 0.9885454177856445, "learning_rate": 1.8725226723299682e-07, "loss": 0.1857, "step": 5954 }, { "epoch": 2.819769162474105, "grad_norm": 1.3030709028244019, "learning_rate": 1.8626850734192904e-07, "loss": 0.1846, "step": 5955 }, { "epoch": 2.820242675347736, "grad_norm": 1.0009846687316895, "learning_rate": 1.8528731413765877e-07, "loss": 0.2014, "step": 5956 }, { "epoch": 2.8207161882213674, "grad_norm": 1.367098331451416, "learning_rate": 1.8430868787680633e-07, "loss": 0.2086, "step": 5957 }, { "epoch": 2.8211897010949984, "grad_norm": 1.151480793952942, "learning_rate": 1.8333262881532476e-07, "loss": 0.2091, "step": 5958 }, { "epoch": 2.8216632139686295, "grad_norm": 1.2363460063934326, "learning_rate": 1.8235913720848985e-07, "loss": 0.2001, "step": 5959 }, { "epoch": 2.822136726842261, "grad_norm": 1.2438163757324219, "learning_rate": 1.8138821331091015e-07, "loss": 0.2125, "step": 5960 }, { "epoch": 2.8226102397158925, "grad_norm": 1.2525054216384888, "learning_rate": 1.8041985737652256e-07, "loss": 0.2103, "step": 5961 }, { "epoch": 2.8230837525895236, "grad_norm": 1.3402167558670044, "learning_rate": 1.7945406965858892e-07, "loss": 0.1944, "step": 5962 }, { "epoch": 2.8235572654631547, "grad_norm": 1.0227385759353638, "learning_rate": 1.784908504097027e-07, "loss": 0.2063, "step": 5963 }, { "epoch": 2.824030778336786, "grad_norm": 1.5239243507385254, "learning_rate": 1.7753019988178577e-07, "loss": 0.1862, "step": 5964 }, { "epoch": 2.8245042912104172, "grad_norm": 1.001605749130249, "learning_rate": 1.765721183260849e-07, "loss": 0.1981, "step": 5965 }, { "epoch": 2.8249778040840487, "grad_norm": 1.0575411319732666, "learning_rate": 1.7561660599317853e-07, "loss": 0.2199, "step": 5966 }, { "epoch": 2.82545131695768, "grad_norm": 1.230980396270752, "learning_rate": 1.7466366313297123e-07, "loss": 0.2134, "step": 5967 }, { "epoch": 2.825924829831311, "grad_norm": 1.0976228713989258, "learning_rate": 1.7371328999469695e-07, "loss": 0.2227, "step": 5968 }, { "epoch": 2.8263983427049424, "grad_norm": 1.2216429710388184, "learning_rate": 1.7276548682691463e-07, "loss": 0.1866, "step": 5969 }, { "epoch": 2.8268718555785735, "grad_norm": 1.946909785270691, "learning_rate": 1.71820253877516e-07, "loss": 0.2028, "step": 5970 }, { "epoch": 2.827345368452205, "grad_norm": 1.2762550115585327, "learning_rate": 1.7087759139371442e-07, "loss": 0.1854, "step": 5971 }, { "epoch": 2.827818881325836, "grad_norm": 1.4644784927368164, "learning_rate": 1.6993749962205597e-07, "loss": 0.1905, "step": 5972 }, { "epoch": 2.828292394199467, "grad_norm": 1.5430243015289307, "learning_rate": 1.689999788084129e-07, "loss": 0.2004, "step": 5973 }, { "epoch": 2.8287659070730986, "grad_norm": 1.2102375030517578, "learning_rate": 1.680650291979824e-07, "loss": 0.1786, "step": 5974 }, { "epoch": 2.8292394199467297, "grad_norm": 1.029317855834961, "learning_rate": 1.671326510352944e-07, "loss": 0.1895, "step": 5975 }, { "epoch": 2.829712932820361, "grad_norm": 1.2381408214569092, "learning_rate": 1.6620284456420167e-07, "loss": 0.2229, "step": 5976 }, { "epoch": 2.8301864456939922, "grad_norm": 0.9947028160095215, "learning_rate": 1.652756100278874e-07, "loss": 0.204, "step": 5977 }, { "epoch": 2.8306599585676233, "grad_norm": 1.2137975692749023, "learning_rate": 1.6435094766885873e-07, "loss": 0.2174, "step": 5978 }, { "epoch": 2.831133471441255, "grad_norm": 1.7809674739837646, "learning_rate": 1.6342885772895445e-07, "loss": 0.1848, "step": 5979 }, { "epoch": 2.8316069843148863, "grad_norm": 1.0932118892669678, "learning_rate": 1.625093404493372e-07, "loss": 0.182, "step": 5980 }, { "epoch": 2.8320804971885174, "grad_norm": 1.009130835533142, "learning_rate": 1.6159239607049793e-07, "loss": 0.187, "step": 5981 }, { "epoch": 2.8325540100621485, "grad_norm": 1.0555768013000488, "learning_rate": 1.606780248322548e-07, "loss": 0.218, "step": 5982 }, { "epoch": 2.83302752293578, "grad_norm": 1.0417250394821167, "learning_rate": 1.597662269737521e-07, "loss": 0.2098, "step": 5983 }, { "epoch": 2.833501035809411, "grad_norm": 1.1635463237762451, "learning_rate": 1.588570027334635e-07, "loss": 0.2166, "step": 5984 }, { "epoch": 2.8339745486830425, "grad_norm": 0.8822041749954224, "learning_rate": 1.5795035234918543e-07, "loss": 0.1992, "step": 5985 }, { "epoch": 2.8344480615566736, "grad_norm": 1.387373685836792, "learning_rate": 1.5704627605804601e-07, "loss": 0.2067, "step": 5986 }, { "epoch": 2.8349215744303047, "grad_norm": 1.6232830286026, "learning_rate": 1.5614477409649497e-07, "loss": 0.1954, "step": 5987 }, { "epoch": 2.835395087303936, "grad_norm": 1.3384531736373901, "learning_rate": 1.5524584670031372e-07, "loss": 0.2028, "step": 5988 }, { "epoch": 2.8358686001775673, "grad_norm": 1.060403823852539, "learning_rate": 1.5434949410460753e-07, "loss": 0.1862, "step": 5989 }, { "epoch": 2.8363421130511988, "grad_norm": 1.383388876914978, "learning_rate": 1.5345571654380775e-07, "loss": 0.2104, "step": 5990 }, { "epoch": 2.83681562592483, "grad_norm": 1.2764064073562622, "learning_rate": 1.5256451425167406e-07, "loss": 0.2042, "step": 5991 }, { "epoch": 2.837289138798461, "grad_norm": 1.2864024639129639, "learning_rate": 1.5167588746129224e-07, "loss": 0.209, "step": 5992 }, { "epoch": 2.8377626516720924, "grad_norm": 0.9310190081596375, "learning_rate": 1.5078983640507416e-07, "loss": 0.1899, "step": 5993 }, { "epoch": 2.8382361645457235, "grad_norm": 1.218672752380371, "learning_rate": 1.4990636131475554e-07, "loss": 0.1884, "step": 5994 }, { "epoch": 2.838709677419355, "grad_norm": 1.2877182960510254, "learning_rate": 1.490254624214049e-07, "loss": 0.1916, "step": 5995 }, { "epoch": 2.839183190292986, "grad_norm": 1.661100149154663, "learning_rate": 1.4814713995540797e-07, "loss": 0.1996, "step": 5996 }, { "epoch": 2.839656703166617, "grad_norm": 1.07488214969635, "learning_rate": 1.472713941464865e-07, "loss": 0.2045, "step": 5997 }, { "epoch": 2.8401302160402486, "grad_norm": 1.1992322206497192, "learning_rate": 1.4639822522367952e-07, "loss": 0.1849, "step": 5998 }, { "epoch": 2.8406037289138797, "grad_norm": 1.5754666328430176, "learning_rate": 1.455276334153577e-07, "loss": 0.21, "step": 5999 }, { "epoch": 2.841077241787511, "grad_norm": 1.114166021347046, "learning_rate": 1.4465961894921555e-07, "loss": 0.2, "step": 6000 }, { "epoch": 2.8415507546611423, "grad_norm": 1.2283138036727905, "learning_rate": 1.4379418205227368e-07, "loss": 0.1895, "step": 6001 }, { "epoch": 2.8420242675347733, "grad_norm": 1.11212158203125, "learning_rate": 1.4293132295087998e-07, "loss": 0.205, "step": 6002 }, { "epoch": 2.842497780408405, "grad_norm": 1.7816588878631592, "learning_rate": 1.4207104187070386e-07, "loss": 0.235, "step": 6003 }, { "epoch": 2.8429712932820363, "grad_norm": 1.1824668645858765, "learning_rate": 1.4121333903674762e-07, "loss": 0.2125, "step": 6004 }, { "epoch": 2.8434448061556674, "grad_norm": 1.4136723279953003, "learning_rate": 1.4035821467333177e-07, "loss": 0.2042, "step": 6005 }, { "epoch": 2.8439183190292985, "grad_norm": 1.4217491149902344, "learning_rate": 1.3950566900410856e-07, "loss": 0.1737, "step": 6006 }, { "epoch": 2.84439183190293, "grad_norm": 1.3726348876953125, "learning_rate": 1.3865570225205072e-07, "loss": 0.2252, "step": 6007 }, { "epoch": 2.844865344776561, "grad_norm": 0.8112472891807556, "learning_rate": 1.3780831463946042e-07, "loss": 0.1636, "step": 6008 }, { "epoch": 2.8453388576501926, "grad_norm": 1.479446530342102, "learning_rate": 1.3696350638796263e-07, "loss": 0.2228, "step": 6009 }, { "epoch": 2.8458123705238236, "grad_norm": 1.054337501525879, "learning_rate": 1.3612127771850947e-07, "loss": 0.2159, "step": 6010 }, { "epoch": 2.8462858833974547, "grad_norm": 1.3831230401992798, "learning_rate": 1.3528162885137919e-07, "loss": 0.1956, "step": 6011 }, { "epoch": 2.846759396271086, "grad_norm": 0.9735446572303772, "learning_rate": 1.3444456000617056e-07, "loss": 0.1992, "step": 6012 }, { "epoch": 2.8472329091447173, "grad_norm": 1.2813206911087036, "learning_rate": 1.3361007140181293e-07, "loss": 0.1998, "step": 6013 }, { "epoch": 2.847706422018349, "grad_norm": 1.0976970195770264, "learning_rate": 1.3277816325655835e-07, "loss": 0.1954, "step": 6014 }, { "epoch": 2.84817993489198, "grad_norm": 1.1628068685531616, "learning_rate": 1.319488357879839e-07, "loss": 0.2002, "step": 6015 }, { "epoch": 2.848653447765611, "grad_norm": 1.114010214805603, "learning_rate": 1.3112208921299274e-07, "loss": 0.1749, "step": 6016 }, { "epoch": 2.8491269606392424, "grad_norm": 1.5268833637237549, "learning_rate": 1.3029792374781413e-07, "loss": 0.2106, "step": 6017 }, { "epoch": 2.8496004735128735, "grad_norm": 0.991341769695282, "learning_rate": 1.294763396079979e-07, "loss": 0.1836, "step": 6018 }, { "epoch": 2.850073986386505, "grad_norm": 1.0029536485671997, "learning_rate": 1.2865733700842098e-07, "loss": 0.2071, "step": 6019 }, { "epoch": 2.850547499260136, "grad_norm": 2.5262866020202637, "learning_rate": 1.2784091616328876e-07, "loss": 0.1866, "step": 6020 }, { "epoch": 2.851021012133767, "grad_norm": 1.2454925775527954, "learning_rate": 1.2702707728612596e-07, "loss": 0.175, "step": 6021 }, { "epoch": 2.8514945250073986, "grad_norm": 1.1130245923995972, "learning_rate": 1.2621582058978455e-07, "loss": 0.196, "step": 6022 }, { "epoch": 2.85196803788103, "grad_norm": 1.1060099601745605, "learning_rate": 1.2540714628644146e-07, "loss": 0.191, "step": 6023 }, { "epoch": 2.852441550754661, "grad_norm": 0.9595826268196106, "learning_rate": 1.2460105458759753e-07, "loss": 0.207, "step": 6024 }, { "epoch": 2.8529150636282923, "grad_norm": 1.54879891872406, "learning_rate": 1.2379754570407742e-07, "loss": 0.2039, "step": 6025 }, { "epoch": 2.853388576501924, "grad_norm": 1.1590107679367065, "learning_rate": 1.2299661984603307e-07, "loss": 0.2101, "step": 6026 }, { "epoch": 2.853862089375555, "grad_norm": 1.0808899402618408, "learning_rate": 1.2219827722293687e-07, "loss": 0.1841, "step": 6027 }, { "epoch": 2.8543356022491864, "grad_norm": 1.6629842519760132, "learning_rate": 1.214025180435885e-07, "loss": 0.2059, "step": 6028 }, { "epoch": 2.8548091151228174, "grad_norm": 0.9478086829185486, "learning_rate": 1.2060934251611146e-07, "loss": 0.1861, "step": 6029 }, { "epoch": 2.8552826279964485, "grad_norm": 1.5418405532836914, "learning_rate": 1.1981875084795202e-07, "loss": 0.1955, "step": 6030 }, { "epoch": 2.85575614087008, "grad_norm": 1.2840735912322998, "learning_rate": 1.190307432458826e-07, "loss": 0.2217, "step": 6031 }, { "epoch": 2.856229653743711, "grad_norm": 0.9639061689376831, "learning_rate": 1.1824531991599831e-07, "loss": 0.1806, "step": 6032 }, { "epoch": 2.8567031666173426, "grad_norm": 1.2272212505340576, "learning_rate": 1.1746248106372149e-07, "loss": 0.1941, "step": 6033 }, { "epoch": 2.8571766794909736, "grad_norm": 1.424233078956604, "learning_rate": 1.1668222689379172e-07, "loss": 0.1933, "step": 6034 }, { "epoch": 2.8576501923646047, "grad_norm": 1.2118334770202637, "learning_rate": 1.159045576102813e-07, "loss": 0.2095, "step": 6035 }, { "epoch": 2.8581237052382362, "grad_norm": 1.1151028871536255, "learning_rate": 1.1512947341657976e-07, "loss": 0.1909, "step": 6036 }, { "epoch": 2.8585972181118673, "grad_norm": 1.1803522109985352, "learning_rate": 1.1435697451540385e-07, "loss": 0.1931, "step": 6037 }, { "epoch": 2.859070730985499, "grad_norm": 1.8160289525985718, "learning_rate": 1.1358706110879302e-07, "loss": 0.2031, "step": 6038 }, { "epoch": 2.85954424385913, "grad_norm": 1.0799731016159058, "learning_rate": 1.1281973339810847e-07, "loss": 0.1992, "step": 6039 }, { "epoch": 2.860017756732761, "grad_norm": 1.1362674236297607, "learning_rate": 1.1205499158404187e-07, "loss": 0.1917, "step": 6040 }, { "epoch": 2.8604912696063924, "grad_norm": 2.0242366790771484, "learning_rate": 1.1129283586659988e-07, "loss": 0.2059, "step": 6041 }, { "epoch": 2.860964782480024, "grad_norm": 0.9954387545585632, "learning_rate": 1.1053326644511863e-07, "loss": 0.1961, "step": 6042 }, { "epoch": 2.861438295353655, "grad_norm": 1.4265193939208984, "learning_rate": 1.0977628351825697e-07, "loss": 0.2002, "step": 6043 }, { "epoch": 2.861911808227286, "grad_norm": 1.1525962352752686, "learning_rate": 1.0902188728399433e-07, "loss": 0.2124, "step": 6044 }, { "epoch": 2.8623853211009176, "grad_norm": 0.9790692925453186, "learning_rate": 1.082700779396384e-07, "loss": 0.1971, "step": 6045 }, { "epoch": 2.8628588339745487, "grad_norm": 1.0439430475234985, "learning_rate": 1.0752085568181524e-07, "loss": 0.191, "step": 6046 }, { "epoch": 2.86333234684818, "grad_norm": 0.9470370411872864, "learning_rate": 1.0677422070647924e-07, "loss": 0.1869, "step": 6047 }, { "epoch": 2.8638058597218112, "grad_norm": 0.9897739291191101, "learning_rate": 1.0603017320890307e-07, "loss": 0.1836, "step": 6048 }, { "epoch": 2.8642793725954423, "grad_norm": 0.8592694997787476, "learning_rate": 1.0528871338368773e-07, "loss": 0.1808, "step": 6049 }, { "epoch": 2.864752885469074, "grad_norm": 1.6525866985321045, "learning_rate": 1.0454984142475145e-07, "loss": 0.2002, "step": 6050 }, { "epoch": 2.865226398342705, "grad_norm": 1.4348740577697754, "learning_rate": 1.0381355752534295e-07, "loss": 0.179, "step": 6051 }, { "epoch": 2.8656999112163364, "grad_norm": 0.9945125579833984, "learning_rate": 1.0307986187802709e-07, "loss": 0.1853, "step": 6052 }, { "epoch": 2.8661734240899674, "grad_norm": 1.0377004146575928, "learning_rate": 1.0234875467469707e-07, "loss": 0.1957, "step": 6053 }, { "epoch": 2.8666469369635985, "grad_norm": 1.3555017709732056, "learning_rate": 1.0162023610656547e-07, "loss": 0.177, "step": 6054 }, { "epoch": 2.86712044983723, "grad_norm": 1.3089735507965088, "learning_rate": 1.0089430636416875e-07, "loss": 0.1857, "step": 6055 }, { "epoch": 2.867593962710861, "grad_norm": 1.1990454196929932, "learning_rate": 1.001709656373695e-07, "loss": 0.1987, "step": 6056 }, { "epoch": 2.8680674755844926, "grad_norm": 0.9925453066825867, "learning_rate": 9.94502141153475e-08, "loss": 0.1858, "step": 6057 }, { "epoch": 2.8685409884581237, "grad_norm": 1.205861210823059, "learning_rate": 9.873205198660974e-08, "loss": 0.182, "step": 6058 }, { "epoch": 2.8690145013317547, "grad_norm": 1.1431076526641846, "learning_rate": 9.801647943898484e-08, "loss": 0.1988, "step": 6059 }, { "epoch": 2.8694880142053862, "grad_norm": 1.1238492727279663, "learning_rate": 9.730349665962424e-08, "loss": 0.2117, "step": 6060 }, { "epoch": 2.8699615270790173, "grad_norm": 1.089830994606018, "learning_rate": 9.659310383499986e-08, "loss": 0.2003, "step": 6061 }, { "epoch": 2.870435039952649, "grad_norm": 1.0016067028045654, "learning_rate": 9.588530115091088e-08, "loss": 0.1881, "step": 6062 }, { "epoch": 2.87090855282628, "grad_norm": 1.2619209289550781, "learning_rate": 9.518008879247365e-08, "loss": 0.2057, "step": 6063 }, { "epoch": 2.871382065699911, "grad_norm": 1.600156307220459, "learning_rate": 9.447746694413063e-08, "loss": 0.1807, "step": 6064 }, { "epoch": 2.8718555785735425, "grad_norm": 1.0998058319091797, "learning_rate": 9.377743578964704e-08, "loss": 0.2085, "step": 6065 }, { "epoch": 2.872329091447174, "grad_norm": 1.203202486038208, "learning_rate": 9.307999551210645e-08, "loss": 0.1866, "step": 6066 }, { "epoch": 2.872802604320805, "grad_norm": 1.0956839323043823, "learning_rate": 9.23851462939207e-08, "loss": 0.2027, "step": 6067 }, { "epoch": 2.873276117194436, "grad_norm": 1.0432785749435425, "learning_rate": 9.169288831681889e-08, "loss": 0.1942, "step": 6068 }, { "epoch": 2.8737496300680676, "grad_norm": 1.4329001903533936, "learning_rate": 9.100322176185505e-08, "loss": 0.1827, "step": 6069 }, { "epoch": 2.8742231429416987, "grad_norm": 0.974470317363739, "learning_rate": 9.031614680940381e-08, "loss": 0.2002, "step": 6070 }, { "epoch": 2.87469665581533, "grad_norm": 1.5563040971755981, "learning_rate": 8.963166363916586e-08, "loss": 0.188, "step": 6071 }, { "epoch": 2.8751701686889612, "grad_norm": 1.2367172241210938, "learning_rate": 8.894977243015801e-08, "loss": 0.2312, "step": 6072 }, { "epoch": 2.8756436815625923, "grad_norm": 1.5719220638275146, "learning_rate": 8.827047336072426e-08, "loss": 0.1985, "step": 6073 }, { "epoch": 2.876117194436224, "grad_norm": 1.091109037399292, "learning_rate": 8.759376660852803e-08, "loss": 0.1942, "step": 6074 }, { "epoch": 2.876590707309855, "grad_norm": 0.9197636842727661, "learning_rate": 8.691965235055444e-08, "loss": 0.1919, "step": 6075 }, { "epoch": 2.8770642201834864, "grad_norm": 1.4586553573608398, "learning_rate": 8.624813076311356e-08, "loss": 0.2025, "step": 6076 }, { "epoch": 2.8775377330571175, "grad_norm": 1.1022119522094727, "learning_rate": 8.557920202183379e-08, "loss": 0.2004, "step": 6077 }, { "epoch": 2.8780112459307485, "grad_norm": 1.0540403127670288, "learning_rate": 8.491286630166851e-08, "loss": 0.2178, "step": 6078 }, { "epoch": 2.87848475880438, "grad_norm": 1.0437299013137817, "learning_rate": 8.424912377688943e-08, "loss": 0.1937, "step": 6079 }, { "epoch": 2.878958271678011, "grad_norm": 1.272997498512268, "learning_rate": 8.358797462109325e-08, "loss": 0.186, "step": 6080 }, { "epoch": 2.8794317845516426, "grad_norm": 1.2749963998794556, "learning_rate": 8.29294190071972e-08, "loss": 0.2155, "step": 6081 }, { "epoch": 2.8799052974252737, "grad_norm": 0.9653557538986206, "learning_rate": 8.227345710744018e-08, "loss": 0.1891, "step": 6082 }, { "epoch": 2.8803788102989047, "grad_norm": 1.1792298555374146, "learning_rate": 8.16200890933827e-08, "loss": 0.1945, "step": 6083 }, { "epoch": 2.8808523231725363, "grad_norm": 1.3401811122894287, "learning_rate": 8.096931513590589e-08, "loss": 0.1901, "step": 6084 }, { "epoch": 2.8813258360461678, "grad_norm": 0.9627504944801331, "learning_rate": 8.03211354052147e-08, "loss": 0.2, "step": 6085 }, { "epoch": 2.881799348919799, "grad_norm": 1.163562536239624, "learning_rate": 7.967555007083239e-08, "loss": 0.1979, "step": 6086 }, { "epoch": 2.88227286179343, "grad_norm": 1.035914421081543, "learning_rate": 7.903255930160836e-08, "loss": 0.1956, "step": 6087 }, { "epoch": 2.8827463746670614, "grad_norm": 1.300705075263977, "learning_rate": 7.83921632657092e-08, "loss": 0.2012, "step": 6088 }, { "epoch": 2.8832198875406925, "grad_norm": 1.1752310991287231, "learning_rate": 7.775436213062426e-08, "loss": 0.1685, "step": 6089 }, { "epoch": 2.883693400414324, "grad_norm": 0.986862301826477, "learning_rate": 7.711915606316345e-08, "loss": 0.1726, "step": 6090 }, { "epoch": 2.884166913287955, "grad_norm": 1.0843511819839478, "learning_rate": 7.648654522946053e-08, "loss": 0.1928, "step": 6091 }, { "epoch": 2.884640426161586, "grad_norm": 1.3774464130401611, "learning_rate": 7.58565297949676e-08, "loss": 0.1992, "step": 6092 }, { "epoch": 2.8851139390352176, "grad_norm": 0.9635332822799683, "learning_rate": 7.522910992445842e-08, "loss": 0.2011, "step": 6093 }, { "epoch": 2.8855874519088487, "grad_norm": 0.9744061827659607, "learning_rate": 7.46042857820306e-08, "loss": 0.1843, "step": 6094 }, { "epoch": 2.88606096478248, "grad_norm": 1.0627609491348267, "learning_rate": 7.39820575311001e-08, "loss": 0.2044, "step": 6095 }, { "epoch": 2.8865344776561113, "grad_norm": 1.7792444229125977, "learning_rate": 7.33624253344034e-08, "loss": 0.1901, "step": 6096 }, { "epoch": 2.8870079905297423, "grad_norm": 1.1715210676193237, "learning_rate": 7.274538935400199e-08, "loss": 0.2155, "step": 6097 }, { "epoch": 2.887481503403374, "grad_norm": 1.223122000694275, "learning_rate": 7.213094975127233e-08, "loss": 0.1888, "step": 6098 }, { "epoch": 2.887955016277005, "grad_norm": 1.16334068775177, "learning_rate": 7.151910668691808e-08, "loss": 0.209, "step": 6099 }, { "epoch": 2.8884285291506364, "grad_norm": 1.090516209602356, "learning_rate": 7.090986032095903e-08, "loss": 0.1873, "step": 6100 }, { "epoch": 2.8889020420242675, "grad_norm": 0.9286671876907349, "learning_rate": 7.030321081273883e-08, "loss": 0.1888, "step": 6101 }, { "epoch": 2.8893755548978985, "grad_norm": 0.9373717904090881, "learning_rate": 6.969915832092056e-08, "loss": 0.1875, "step": 6102 }, { "epoch": 2.88984906777153, "grad_norm": 1.0975648164749146, "learning_rate": 6.909770300348784e-08, "loss": 0.1918, "step": 6103 }, { "epoch": 2.8903225806451616, "grad_norm": 1.237648844718933, "learning_rate": 6.849884501774484e-08, "loss": 0.2088, "step": 6104 }, { "epoch": 2.8907960935187926, "grad_norm": 1.114311933517456, "learning_rate": 6.790258452031962e-08, "loss": 0.1929, "step": 6105 }, { "epoch": 2.8912696063924237, "grad_norm": 1.3131335973739624, "learning_rate": 6.73089216671563e-08, "loss": 0.1966, "step": 6106 }, { "epoch": 2.891743119266055, "grad_norm": 1.0691511631011963, "learning_rate": 6.671785661352182e-08, "loss": 0.195, "step": 6107 }, { "epoch": 2.8922166321396863, "grad_norm": 1.2618228197097778, "learning_rate": 6.612938951400472e-08, "loss": 0.1809, "step": 6108 }, { "epoch": 2.892690145013318, "grad_norm": 1.2614898681640625, "learning_rate": 6.554352052251079e-08, "loss": 0.2123, "step": 6109 }, { "epoch": 2.893163657886949, "grad_norm": 1.0897921323776245, "learning_rate": 6.496024979226967e-08, "loss": 0.2032, "step": 6110 }, { "epoch": 2.89363717076058, "grad_norm": 1.0847673416137695, "learning_rate": 6.437957747583046e-08, "loss": 0.1998, "step": 6111 }, { "epoch": 2.8941106836342114, "grad_norm": 1.2519115209579468, "learning_rate": 6.380150372506277e-08, "loss": 0.1941, "step": 6112 }, { "epoch": 2.8945841965078425, "grad_norm": 1.4796113967895508, "learning_rate": 6.322602869115568e-08, "loss": 0.2203, "step": 6113 }, { "epoch": 2.895057709381474, "grad_norm": 1.2136837244033813, "learning_rate": 6.265315252461878e-08, "loss": 0.1934, "step": 6114 }, { "epoch": 2.895531222255105, "grad_norm": 1.1030209064483643, "learning_rate": 6.208287537528223e-08, "loss": 0.2029, "step": 6115 }, { "epoch": 2.896004735128736, "grad_norm": 0.975267231464386, "learning_rate": 6.151519739229672e-08, "loss": 0.1909, "step": 6116 }, { "epoch": 2.8964782480023676, "grad_norm": 1.1102718114852905, "learning_rate": 6.095011872413347e-08, "loss": 0.1771, "step": 6117 }, { "epoch": 2.8969517608759987, "grad_norm": 1.001387596130371, "learning_rate": 6.038763951858206e-08, "loss": 0.1903, "step": 6118 }, { "epoch": 2.89742527374963, "grad_norm": 0.9320979118347168, "learning_rate": 5.982775992275592e-08, "loss": 0.2005, "step": 6119 }, { "epoch": 2.8978987866232613, "grad_norm": 1.0721495151519775, "learning_rate": 5.9270480083083445e-08, "loss": 0.2199, "step": 6120 }, { "epoch": 2.8983722994968923, "grad_norm": 1.3128505945205688, "learning_rate": 5.871580014531697e-08, "loss": 0.2027, "step": 6121 }, { "epoch": 2.898845812370524, "grad_norm": 0.9453513622283936, "learning_rate": 5.816372025452821e-08, "loss": 0.1975, "step": 6122 }, { "epoch": 2.899319325244155, "grad_norm": 1.4196256399154663, "learning_rate": 5.7614240555107224e-08, "loss": 0.2019, "step": 6123 }, { "epoch": 2.8997928381177864, "grad_norm": 1.2253470420837402, "learning_rate": 5.706736119076683e-08, "loss": 0.2197, "step": 6124 }, { "epoch": 2.9002663509914175, "grad_norm": 1.5370304584503174, "learning_rate": 5.652308230453596e-08, "loss": 0.198, "step": 6125 }, { "epoch": 2.9007398638650486, "grad_norm": 1.08665931224823, "learning_rate": 5.5981404038767394e-08, "loss": 0.1892, "step": 6126 }, { "epoch": 2.90121337673868, "grad_norm": 1.2362626791000366, "learning_rate": 5.5442326535130044e-08, "loss": 0.19, "step": 6127 }, { "epoch": 2.9016868896123116, "grad_norm": 1.4645034074783325, "learning_rate": 5.490584993461556e-08, "loss": 0.1764, "step": 6128 }, { "epoch": 2.9021604024859426, "grad_norm": 1.3126651048660278, "learning_rate": 5.4371974377533944e-08, "loss": 0.18, "step": 6129 }, { "epoch": 2.9026339153595737, "grad_norm": 0.9817326068878174, "learning_rate": 5.384070000351571e-08, "loss": 0.1861, "step": 6130 }, { "epoch": 2.9031074282332052, "grad_norm": 1.1825339794158936, "learning_rate": 5.331202695151083e-08, "loss": 0.1834, "step": 6131 }, { "epoch": 2.9035809411068363, "grad_norm": 1.0101470947265625, "learning_rate": 5.278595535978648e-08, "loss": 0.2065, "step": 6132 }, { "epoch": 2.904054453980468, "grad_norm": 0.9827751517295837, "learning_rate": 5.22624853659337e-08, "loss": 0.2021, "step": 6133 }, { "epoch": 2.904527966854099, "grad_norm": 1.8256947994232178, "learning_rate": 5.174161710685965e-08, "loss": 0.2104, "step": 6134 }, { "epoch": 2.90500147972773, "grad_norm": 1.0786854028701782, "learning_rate": 5.122335071879425e-08, "loss": 0.2024, "step": 6135 }, { "epoch": 2.9054749926013614, "grad_norm": 1.2459017038345337, "learning_rate": 5.0707686337282404e-08, "loss": 0.1871, "step": 6136 }, { "epoch": 2.9059485054749925, "grad_norm": 1.1489919424057007, "learning_rate": 5.0194624097194e-08, "loss": 0.1885, "step": 6137 }, { "epoch": 2.906422018348624, "grad_norm": 1.2657074928283691, "learning_rate": 4.968416413271393e-08, "loss": 0.1907, "step": 6138 }, { "epoch": 2.906895531222255, "grad_norm": 1.3893004655838013, "learning_rate": 4.9176306577347624e-08, "loss": 0.2081, "step": 6139 }, { "epoch": 2.907369044095886, "grad_norm": 1.1752687692642212, "learning_rate": 4.8671051563922156e-08, "loss": 0.2268, "step": 6140 }, { "epoch": 2.9078425569695177, "grad_norm": 0.9716362953186035, "learning_rate": 4.816839922457961e-08, "loss": 0.188, "step": 6141 }, { "epoch": 2.9083160698431487, "grad_norm": 1.7129836082458496, "learning_rate": 4.766834969078704e-08, "loss": 0.2392, "step": 6142 }, { "epoch": 2.9087895827167802, "grad_norm": 1.019442081451416, "learning_rate": 4.717090309332428e-08, "loss": 0.1874, "step": 6143 }, { "epoch": 2.9092630955904113, "grad_norm": 1.5065118074417114, "learning_rate": 4.667605956229615e-08, "loss": 0.1921, "step": 6144 }, { "epoch": 2.9097366084640424, "grad_norm": 0.9485598206520081, "learning_rate": 4.618381922712245e-08, "loss": 0.1998, "step": 6145 }, { "epoch": 2.910210121337674, "grad_norm": 0.9879134297370911, "learning_rate": 4.5694182216544645e-08, "loss": 0.2106, "step": 6146 }, { "epoch": 2.9106836342113054, "grad_norm": 1.3310444355010986, "learning_rate": 4.520714865862252e-08, "loss": 0.2027, "step": 6147 }, { "epoch": 2.9111571470849364, "grad_norm": 1.6184104681015015, "learning_rate": 4.47227186807353e-08, "loss": 0.1938, "step": 6148 }, { "epoch": 2.9116306599585675, "grad_norm": 0.9067788124084473, "learning_rate": 4.4240892409580516e-08, "loss": 0.1856, "step": 6149 }, { "epoch": 2.912104172832199, "grad_norm": 1.6381840705871582, "learning_rate": 4.3761669971176255e-08, "loss": 0.2039, "step": 6150 }, { "epoch": 2.91257768570583, "grad_norm": 1.0911681652069092, "learning_rate": 4.328505149085782e-08, "loss": 0.1867, "step": 6151 }, { "epoch": 2.9130511985794616, "grad_norm": 0.9727244973182678, "learning_rate": 4.281103709327883e-08, "loss": 0.2022, "step": 6152 }, { "epoch": 2.9135247114530927, "grad_norm": 0.9916009306907654, "learning_rate": 4.233962690241567e-08, "loss": 0.1753, "step": 6153 }, { "epoch": 2.9139982243267237, "grad_norm": 1.301115870475769, "learning_rate": 4.18708210415586e-08, "loss": 0.1829, "step": 6154 }, { "epoch": 2.9144717372003552, "grad_norm": 1.1094136238098145, "learning_rate": 4.140461963332065e-08, "loss": 0.2055, "step": 6155 }, { "epoch": 2.9149452500739863, "grad_norm": 0.9840229749679565, "learning_rate": 4.094102279963319e-08, "loss": 0.2242, "step": 6156 }, { "epoch": 2.915418762947618, "grad_norm": 1.3108941316604614, "learning_rate": 4.048003066174366e-08, "loss": 0.1853, "step": 6157 }, { "epoch": 2.915892275821249, "grad_norm": 1.0700182914733887, "learning_rate": 4.002164334022118e-08, "loss": 0.2149, "step": 6158 }, { "epoch": 2.91636578869488, "grad_norm": 1.2526887655258179, "learning_rate": 3.956586095495207e-08, "loss": 0.1841, "step": 6159 }, { "epoch": 2.9168393015685115, "grad_norm": 1.097798466682434, "learning_rate": 3.911268362514209e-08, "loss": 0.1977, "step": 6160 }, { "epoch": 2.9173128144421425, "grad_norm": 1.293502688407898, "learning_rate": 3.866211146931531e-08, "loss": 0.2091, "step": 6161 }, { "epoch": 2.917786327315774, "grad_norm": 1.0824062824249268, "learning_rate": 3.821414460531414e-08, "loss": 0.2008, "step": 6162 }, { "epoch": 2.918259840189405, "grad_norm": 1.2927944660186768, "learning_rate": 3.776878315030042e-08, "loss": 0.2031, "step": 6163 }, { "epoch": 2.918733353063036, "grad_norm": 1.0434043407440186, "learning_rate": 3.73260272207554e-08, "loss": 0.204, "step": 6164 }, { "epoch": 2.9192068659366677, "grad_norm": 0.9378446936607361, "learning_rate": 3.688587693247536e-08, "loss": 0.1847, "step": 6165 }, { "epoch": 2.919680378810299, "grad_norm": 1.1560747623443604, "learning_rate": 3.644833240057821e-08, "loss": 0.2163, "step": 6166 }, { "epoch": 2.9201538916839302, "grad_norm": 0.9858490228652954, "learning_rate": 3.601339373950019e-08, "loss": 0.1798, "step": 6167 }, { "epoch": 2.9206274045575613, "grad_norm": 1.1007888317108154, "learning_rate": 3.558106106299475e-08, "loss": 0.1843, "step": 6168 }, { "epoch": 2.921100917431193, "grad_norm": 1.1641054153442383, "learning_rate": 3.515133448413366e-08, "loss": 0.187, "step": 6169 }, { "epoch": 2.921574430304824, "grad_norm": 1.4323322772979736, "learning_rate": 3.472421411530924e-08, "loss": 0.1945, "step": 6170 }, { "epoch": 2.9220479431784554, "grad_norm": 1.1009232997894287, "learning_rate": 3.429970006822991e-08, "loss": 0.1778, "step": 6171 }, { "epoch": 2.9225214560520865, "grad_norm": 1.0793761014938354, "learning_rate": 3.387779245392242e-08, "loss": 0.1852, "step": 6172 }, { "epoch": 2.9229949689257175, "grad_norm": 1.315026879310608, "learning_rate": 3.345849138273405e-08, "loss": 0.2095, "step": 6173 }, { "epoch": 2.923468481799349, "grad_norm": 1.0471131801605225, "learning_rate": 3.304179696432708e-08, "loss": 0.2058, "step": 6174 }, { "epoch": 2.92394199467298, "grad_norm": 1.2734088897705078, "learning_rate": 3.262770930768655e-08, "loss": 0.1976, "step": 6175 }, { "epoch": 2.9244155075466116, "grad_norm": 1.2894588708877563, "learning_rate": 3.2216228521111393e-08, "loss": 0.2161, "step": 6176 }, { "epoch": 2.9248890204202427, "grad_norm": 0.9765024781227112, "learning_rate": 3.180735471222107e-08, "loss": 0.2171, "step": 6177 }, { "epoch": 2.9253625332938737, "grad_norm": 0.9731594324111938, "learning_rate": 3.140108798795227e-08, "loss": 0.1879, "step": 6178 }, { "epoch": 2.9258360461675053, "grad_norm": 1.8532750606536865, "learning_rate": 3.099742845455889e-08, "loss": 0.1931, "step": 6179 }, { "epoch": 2.9263095590411363, "grad_norm": 1.2193810939788818, "learning_rate": 3.059637621761646e-08, "loss": 0.1973, "step": 6180 }, { "epoch": 2.926783071914768, "grad_norm": 1.1351593732833862, "learning_rate": 3.019793138201554e-08, "loss": 0.2097, "step": 6181 }, { "epoch": 2.927256584788399, "grad_norm": 1.9396110773086548, "learning_rate": 2.9802094051964993e-08, "loss": 0.2438, "step": 6182 }, { "epoch": 2.92773009766203, "grad_norm": 0.9938279986381531, "learning_rate": 2.9408864330991993e-08, "loss": 0.198, "step": 6183 }, { "epoch": 2.9282036105356615, "grad_norm": 1.4137035608291626, "learning_rate": 2.9018242321943168e-08, "loss": 0.2003, "step": 6184 }, { "epoch": 2.9286771234092925, "grad_norm": 1.076438546180725, "learning_rate": 2.8630228126981242e-08, "loss": 0.2066, "step": 6185 }, { "epoch": 2.929150636282924, "grad_norm": 1.398909330368042, "learning_rate": 2.8244821847587256e-08, "loss": 0.1752, "step": 6186 }, { "epoch": 2.929624149156555, "grad_norm": 0.9109712243080139, "learning_rate": 2.7862023584561692e-08, "loss": 0.196, "step": 6187 }, { "epoch": 2.930097662030186, "grad_norm": 1.3671661615371704, "learning_rate": 2.748183343802002e-08, "loss": 0.2127, "step": 6188 }, { "epoch": 2.9305711749038177, "grad_norm": 1.2796677350997925, "learning_rate": 2.7104251507398262e-08, "loss": 0.2206, "step": 6189 }, { "epoch": 2.931044687777449, "grad_norm": 0.9807784557342529, "learning_rate": 2.6729277891449634e-08, "loss": 0.1877, "step": 6190 }, { "epoch": 2.9315182006510803, "grad_norm": 0.9959766864776611, "learning_rate": 2.6356912688244585e-08, "loss": 0.187, "step": 6191 }, { "epoch": 2.9319917135247113, "grad_norm": 1.6035785675048828, "learning_rate": 2.5987155995171876e-08, "loss": 0.1968, "step": 6192 }, { "epoch": 2.932465226398343, "grad_norm": 1.377586841583252, "learning_rate": 2.5620007908937483e-08, "loss": 0.2178, "step": 6193 }, { "epoch": 2.932938739271974, "grad_norm": 1.7171190977096558, "learning_rate": 2.5255468525564598e-08, "loss": 0.2138, "step": 6194 }, { "epoch": 2.9334122521456054, "grad_norm": 1.3508435487747192, "learning_rate": 2.489353794039695e-08, "loss": 0.2017, "step": 6195 }, { "epoch": 2.9338857650192365, "grad_norm": 1.0717988014221191, "learning_rate": 2.4534216248092158e-08, "loss": 0.1961, "step": 6196 }, { "epoch": 2.9343592778928675, "grad_norm": 1.060861349105835, "learning_rate": 2.4177503542627266e-08, "loss": 0.2179, "step": 6197 }, { "epoch": 2.934832790766499, "grad_norm": 1.0644173622131348, "learning_rate": 2.382339991729987e-08, "loss": 0.2089, "step": 6198 }, { "epoch": 2.93530630364013, "grad_norm": 1.3779876232147217, "learning_rate": 2.3471905464719226e-08, "loss": 0.2026, "step": 6199 }, { "epoch": 2.9357798165137616, "grad_norm": 1.585129737854004, "learning_rate": 2.312302027681623e-08, "loss": 0.2053, "step": 6200 }, { "epoch": 2.9362533293873927, "grad_norm": 1.6859186887741089, "learning_rate": 2.2776744444839017e-08, "loss": 0.2124, "step": 6201 }, { "epoch": 2.9367268422610238, "grad_norm": 1.0570391416549683, "learning_rate": 2.24330780593518e-08, "loss": 0.1986, "step": 6202 }, { "epoch": 2.9372003551346553, "grad_norm": 1.2987060546875, "learning_rate": 2.2092021210238233e-08, "loss": 0.2184, "step": 6203 }, { "epoch": 2.9376738680082863, "grad_norm": 1.2632862329483032, "learning_rate": 2.1753573986698086e-08, "loss": 0.2145, "step": 6204 }, { "epoch": 2.938147380881918, "grad_norm": 0.9195615649223328, "learning_rate": 2.141773647724832e-08, "loss": 0.1825, "step": 6205 }, { "epoch": 2.938620893755549, "grad_norm": 1.0756436586380005, "learning_rate": 2.1084508769725344e-08, "loss": 0.2099, "step": 6206 }, { "epoch": 2.93909440662918, "grad_norm": 1.0231488943099976, "learning_rate": 2.0753890951280554e-08, "loss": 0.2167, "step": 6207 }, { "epoch": 2.9395679195028115, "grad_norm": 1.5014095306396484, "learning_rate": 2.0425883108383672e-08, "loss": 0.1996, "step": 6208 }, { "epoch": 2.940041432376443, "grad_norm": 1.0937631130218506, "learning_rate": 2.010048532682274e-08, "loss": 0.223, "step": 6209 }, { "epoch": 2.940514945250074, "grad_norm": 1.103412389755249, "learning_rate": 1.9777697691701904e-08, "loss": 0.1888, "step": 6210 }, { "epoch": 2.940988458123705, "grad_norm": 1.1209019422531128, "learning_rate": 1.945752028744252e-08, "loss": 0.1824, "step": 6211 }, { "epoch": 2.9414619709973366, "grad_norm": 0.9927660226821899, "learning_rate": 1.913995319778539e-08, "loss": 0.186, "step": 6212 }, { "epoch": 2.9419354838709677, "grad_norm": 1.021451473236084, "learning_rate": 1.8824996505787398e-08, "loss": 0.1983, "step": 6213 }, { "epoch": 2.942408996744599, "grad_norm": 1.1407397985458374, "learning_rate": 1.8512650293820433e-08, "loss": 0.1997, "step": 6214 }, { "epoch": 2.9428825096182303, "grad_norm": 0.8883314728736877, "learning_rate": 1.820291464357693e-08, "loss": 0.1719, "step": 6215 }, { "epoch": 2.9433560224918613, "grad_norm": 1.1217479705810547, "learning_rate": 1.789578963606431e-08, "loss": 0.1944, "step": 6216 }, { "epoch": 2.943829535365493, "grad_norm": 1.5166805982589722, "learning_rate": 1.7591275351609428e-08, "loss": 0.215, "step": 6217 }, { "epoch": 2.944303048239124, "grad_norm": 1.0720276832580566, "learning_rate": 1.7289371869854132e-08, "loss": 0.1917, "step": 6218 }, { "epoch": 2.9447765611127554, "grad_norm": 1.4796210527420044, "learning_rate": 1.699007926975971e-08, "loss": 0.2059, "step": 6219 }, { "epoch": 2.9452500739863865, "grad_norm": 1.3808337450027466, "learning_rate": 1.6693397629601317e-08, "loss": 0.1986, "step": 6220 }, { "epoch": 2.9457235868600176, "grad_norm": 1.275921106338501, "learning_rate": 1.6399327026974666e-08, "loss": 0.1888, "step": 6221 }, { "epoch": 2.946197099733649, "grad_norm": 1.051120638847351, "learning_rate": 1.6107867538790456e-08, "loss": 0.2187, "step": 6222 }, { "epoch": 2.94667061260728, "grad_norm": 1.0114779472351074, "learning_rate": 1.58190192412766e-08, "loss": 0.2084, "step": 6223 }, { "epoch": 2.9471441254809116, "grad_norm": 1.3596932888031006, "learning_rate": 1.5532782209979336e-08, "loss": 0.1849, "step": 6224 }, { "epoch": 2.9476176383545427, "grad_norm": 2.326784372329712, "learning_rate": 1.524915651976211e-08, "loss": 0.184, "step": 6225 }, { "epoch": 2.948091151228174, "grad_norm": 1.3456201553344727, "learning_rate": 1.4968142244802254e-08, "loss": 0.2014, "step": 6226 }, { "epoch": 2.9485646641018053, "grad_norm": 1.592707872390747, "learning_rate": 1.4689739458597641e-08, "loss": 0.193, "step": 6227 }, { "epoch": 2.9490381769754364, "grad_norm": 1.3673793077468872, "learning_rate": 1.4413948233961138e-08, "loss": 0.2131, "step": 6228 }, { "epoch": 2.949511689849068, "grad_norm": 1.1322108507156372, "learning_rate": 1.414076864302505e-08, "loss": 0.2056, "step": 6229 }, { "epoch": 2.949985202722699, "grad_norm": 0.8551841974258423, "learning_rate": 1.3870200757235552e-08, "loss": 0.1825, "step": 6230 }, { "epoch": 2.9504587155963304, "grad_norm": 0.9732584357261658, "learning_rate": 1.3602244647356044e-08, "loss": 0.208, "step": 6231 }, { "epoch": 2.9509322284699615, "grad_norm": 1.1727527379989624, "learning_rate": 1.3336900383469353e-08, "loss": 0.1963, "step": 6232 }, { "epoch": 2.951405741343593, "grad_norm": 0.9997616410255432, "learning_rate": 1.30741680349733e-08, "loss": 0.1923, "step": 6233 }, { "epoch": 2.951879254217224, "grad_norm": 1.2932562828063965, "learning_rate": 1.2814047670584028e-08, "loss": 0.1968, "step": 6234 }, { "epoch": 2.952352767090855, "grad_norm": 1.0885539054870605, "learning_rate": 1.2556539358331566e-08, "loss": 0.1931, "step": 6235 }, { "epoch": 2.9528262799644867, "grad_norm": 1.0896440744400024, "learning_rate": 1.230164316556537e-08, "loss": 0.1941, "step": 6236 }, { "epoch": 2.9532997928381177, "grad_norm": 1.0477862358093262, "learning_rate": 1.2049359158952111e-08, "loss": 0.192, "step": 6237 }, { "epoch": 2.9537733057117492, "grad_norm": 0.844715416431427, "learning_rate": 1.1799687404473458e-08, "loss": 0.1807, "step": 6238 }, { "epoch": 2.9542468185853803, "grad_norm": 1.3414781093597412, "learning_rate": 1.1552627967428288e-08, "loss": 0.2202, "step": 6239 }, { "epoch": 2.9547203314590114, "grad_norm": 0.9424867630004883, "learning_rate": 1.1308180912432688e-08, "loss": 0.1943, "step": 6240 }, { "epoch": 2.955193844332643, "grad_norm": 1.0376644134521484, "learning_rate": 1.1066346303421071e-08, "loss": 0.1946, "step": 6241 }, { "epoch": 2.955667357206274, "grad_norm": 1.2721221446990967, "learning_rate": 1.0827124203640627e-08, "loss": 0.1955, "step": 6242 }, { "epoch": 2.9561408700799054, "grad_norm": 1.0730029344558716, "learning_rate": 1.059051467565797e-08, "loss": 0.1921, "step": 6243 }, { "epoch": 2.9566143829535365, "grad_norm": 1.2354124784469604, "learning_rate": 1.0356517781358044e-08, "loss": 0.2074, "step": 6244 }, { "epoch": 2.9570878958271676, "grad_norm": 0.9237876534461975, "learning_rate": 1.0125133581938562e-08, "loss": 0.2075, "step": 6245 }, { "epoch": 2.957561408700799, "grad_norm": 1.4493162631988525, "learning_rate": 9.896362137916672e-09, "loss": 0.2207, "step": 6246 }, { "epoch": 2.95803492157443, "grad_norm": 1.3988970518112183, "learning_rate": 9.670203509124509e-09, "loss": 0.197, "step": 6247 }, { "epoch": 2.9585084344480617, "grad_norm": 1.1738380193710327, "learning_rate": 9.44665775471254e-09, "loss": 0.2102, "step": 6248 }, { "epoch": 2.9589819473216927, "grad_norm": 1.2801897525787354, "learning_rate": 9.225724933146218e-09, "loss": 0.2061, "step": 6249 }, { "epoch": 2.959455460195324, "grad_norm": 1.5334160327911377, "learning_rate": 9.007405102209321e-09, "loss": 0.1896, "step": 6250 }, { "epoch": 2.9599289730689553, "grad_norm": 1.0199025869369507, "learning_rate": 8.791698318999508e-09, "loss": 0.2034, "step": 6251 }, { "epoch": 2.960402485942587, "grad_norm": 0.8134233951568604, "learning_rate": 8.578604639936095e-09, "loss": 0.1722, "step": 6252 }, { "epoch": 2.960875998816218, "grad_norm": 1.0595088005065918, "learning_rate": 8.368124120747833e-09, "loss": 0.1856, "step": 6253 }, { "epoch": 2.961349511689849, "grad_norm": 0.9498534202575684, "learning_rate": 8.160256816487355e-09, "loss": 0.1602, "step": 6254 }, { "epoch": 2.9618230245634805, "grad_norm": 0.8854544162750244, "learning_rate": 7.95500278151784e-09, "loss": 0.1959, "step": 6255 }, { "epoch": 2.9622965374371115, "grad_norm": 1.1654853820800781, "learning_rate": 7.752362069523012e-09, "loss": 0.2078, "step": 6256 }, { "epoch": 2.962770050310743, "grad_norm": 0.9548454880714417, "learning_rate": 7.552334733500477e-09, "loss": 0.1876, "step": 6257 }, { "epoch": 2.963243563184374, "grad_norm": 1.966020107269287, "learning_rate": 7.354920825766166e-09, "loss": 0.1918, "step": 6258 }, { "epoch": 2.963717076058005, "grad_norm": 1.6142258644104004, "learning_rate": 7.160120397950998e-09, "loss": 0.2173, "step": 6259 }, { "epoch": 2.9641905889316367, "grad_norm": 0.9371702075004578, "learning_rate": 6.967933501004221e-09, "loss": 0.1948, "step": 6260 }, { "epoch": 2.9646641018052677, "grad_norm": 1.0012017488479614, "learning_rate": 6.778360185190069e-09, "loss": 0.196, "step": 6261 }, { "epoch": 2.9651376146788992, "grad_norm": 1.5582655668258667, "learning_rate": 6.591400500088885e-09, "loss": 0.197, "step": 6262 }, { "epoch": 2.9656111275525303, "grad_norm": 1.4498084783554077, "learning_rate": 6.407054494599329e-09, "loss": 0.2011, "step": 6263 }, { "epoch": 2.9660846404261614, "grad_norm": 1.6158671379089355, "learning_rate": 6.2253222169339485e-09, "loss": 0.1903, "step": 6264 }, { "epoch": 2.966558153299793, "grad_norm": 1.1202692985534668, "learning_rate": 6.046203714624721e-09, "loss": 0.2186, "step": 6265 }, { "epoch": 2.967031666173424, "grad_norm": 1.2885193824768066, "learning_rate": 5.8696990345175064e-09, "loss": 0.2111, "step": 6266 }, { "epoch": 2.9675051790470555, "grad_norm": 0.9069783687591553, "learning_rate": 5.695808222775379e-09, "loss": 0.2006, "step": 6267 }, { "epoch": 2.9679786919206865, "grad_norm": 1.5070737600326538, "learning_rate": 5.524531324877513e-09, "loss": 0.1914, "step": 6268 }, { "epoch": 2.9684522047943176, "grad_norm": 1.019160509109497, "learning_rate": 5.3558683856203e-09, "loss": 0.21, "step": 6269 }, { "epoch": 2.968925717667949, "grad_norm": 1.3049955368041992, "learning_rate": 5.189819449116229e-09, "loss": 0.1916, "step": 6270 }, { "epoch": 2.9693992305415806, "grad_norm": 1.0356775522232056, "learning_rate": 5.026384558792785e-09, "loss": 0.2048, "step": 6271 }, { "epoch": 2.9698727434152117, "grad_norm": 1.2920750379562378, "learning_rate": 4.865563757394665e-09, "loss": 0.2184, "step": 6272 }, { "epoch": 2.9703462562888427, "grad_norm": 1.0191415548324585, "learning_rate": 4.707357086983777e-09, "loss": 0.1999, "step": 6273 }, { "epoch": 2.9708197691624743, "grad_norm": 1.0479013919830322, "learning_rate": 4.5517645889381346e-09, "loss": 0.2006, "step": 6274 }, { "epoch": 2.9712932820361053, "grad_norm": 1.0281206369400024, "learning_rate": 4.398786303949632e-09, "loss": 0.1796, "step": 6275 }, { "epoch": 2.971766794909737, "grad_norm": 1.0202572345733643, "learning_rate": 4.248422272029596e-09, "loss": 0.2112, "step": 6276 }, { "epoch": 2.972240307783368, "grad_norm": 1.0091017484664917, "learning_rate": 4.100672532504346e-09, "loss": 0.1999, "step": 6277 }, { "epoch": 2.972713820656999, "grad_norm": 0.9518880844116211, "learning_rate": 3.955537124016306e-09, "loss": 0.1934, "step": 6278 }, { "epoch": 2.9731873335306305, "grad_norm": 1.1970912218093872, "learning_rate": 3.813016084522892e-09, "loss": 0.1956, "step": 6279 }, { "epoch": 2.9736608464042615, "grad_norm": 1.150801181793213, "learning_rate": 3.673109451300949e-09, "loss": 0.1675, "step": 6280 }, { "epoch": 2.974134359277893, "grad_norm": 1.025273323059082, "learning_rate": 3.53581726094121e-09, "loss": 0.1821, "step": 6281 }, { "epoch": 2.974607872151524, "grad_norm": 1.1569122076034546, "learning_rate": 3.4011395493505073e-09, "loss": 0.1819, "step": 6282 }, { "epoch": 2.975081385025155, "grad_norm": 2.0140018463134766, "learning_rate": 3.269076351752887e-09, "loss": 0.1852, "step": 6283 }, { "epoch": 2.9755548978987867, "grad_norm": 1.07260000705719, "learning_rate": 3.139627702688497e-09, "loss": 0.2121, "step": 6284 }, { "epoch": 2.9760284107724178, "grad_norm": 1.5674619674682617, "learning_rate": 3.0127936360124786e-09, "loss": 0.1916, "step": 6285 }, { "epoch": 2.9765019236460493, "grad_norm": 1.289867639541626, "learning_rate": 2.888574184898296e-09, "loss": 0.1823, "step": 6286 }, { "epoch": 2.9769754365196803, "grad_norm": 1.5313868522644043, "learning_rate": 2.7669693818332954e-09, "loss": 0.1996, "step": 6287 }, { "epoch": 2.9774489493933114, "grad_norm": 1.160865306854248, "learning_rate": 2.6479792586220356e-09, "loss": 0.211, "step": 6288 }, { "epoch": 2.977922462266943, "grad_norm": 1.0990283489227295, "learning_rate": 2.531603846386288e-09, "loss": 0.1845, "step": 6289 }, { "epoch": 2.978395975140574, "grad_norm": 1.126089096069336, "learning_rate": 2.417843175561707e-09, "loss": 0.189, "step": 6290 }, { "epoch": 2.9788694880142055, "grad_norm": 1.5801364183425903, "learning_rate": 2.30669727590227e-09, "loss": 0.214, "step": 6291 }, { "epoch": 2.9793430008878365, "grad_norm": 0.9013964533805847, "learning_rate": 2.1981661764769456e-09, "loss": 0.2052, "step": 6292 }, { "epoch": 2.979816513761468, "grad_norm": 1.815775752067566, "learning_rate": 2.0922499056708066e-09, "loss": 0.1886, "step": 6293 }, { "epoch": 2.980290026635099, "grad_norm": 1.0103025436401367, "learning_rate": 1.9889484911850276e-09, "loss": 0.1864, "step": 6294 }, { "epoch": 2.9807635395087306, "grad_norm": 1.127058744430542, "learning_rate": 1.8882619600368855e-09, "loss": 0.2151, "step": 6295 }, { "epoch": 2.9812370523823617, "grad_norm": 1.0886855125427246, "learning_rate": 1.7901903385597607e-09, "loss": 0.1955, "step": 6296 }, { "epoch": 2.9817105652559928, "grad_norm": 1.1773751974105835, "learning_rate": 1.694733652405356e-09, "loss": 0.172, "step": 6297 }, { "epoch": 2.9821840781296243, "grad_norm": 0.9003830552101135, "learning_rate": 1.601891926537036e-09, "loss": 0.2013, "step": 6298 }, { "epoch": 2.9826575910032553, "grad_norm": 1.0472919940948486, "learning_rate": 1.5116651852375985e-09, "loss": 0.1943, "step": 6299 }, { "epoch": 2.983131103876887, "grad_norm": 1.1670929193496704, "learning_rate": 1.4240534521059447e-09, "loss": 0.1955, "step": 6300 }, { "epoch": 2.983604616750518, "grad_norm": 1.7558685541152954, "learning_rate": 1.3390567500537466e-09, "loss": 0.2085, "step": 6301 }, { "epoch": 2.984078129624149, "grad_norm": 1.4146206378936768, "learning_rate": 1.2566751013132205e-09, "loss": 0.2048, "step": 6302 }, { "epoch": 2.9845516424977805, "grad_norm": 1.2582781314849854, "learning_rate": 1.1769085274304648e-09, "loss": 0.2005, "step": 6303 }, { "epoch": 2.9850251553714116, "grad_norm": 1.0818257331848145, "learning_rate": 1.0997570492654597e-09, "loss": 0.2016, "step": 6304 }, { "epoch": 2.985498668245043, "grad_norm": 1.0899266004562378, "learning_rate": 1.025220686998729e-09, "loss": 0.2178, "step": 6305 }, { "epoch": 2.985972181118674, "grad_norm": 0.9373279809951782, "learning_rate": 9.53299460123569e-10, "loss": 0.1966, "step": 6306 }, { "epoch": 2.986445693992305, "grad_norm": 1.2788419723510742, "learning_rate": 8.83993387450488e-10, "loss": 0.2131, "step": 6307 }, { "epoch": 2.9869192068659367, "grad_norm": 1.247537612915039, "learning_rate": 8.173024871060974e-10, "loss": 0.1796, "step": 6308 }, { "epoch": 2.9873927197395678, "grad_norm": 1.1506285667419434, "learning_rate": 7.532267765320012e-10, "loss": 0.1883, "step": 6309 }, { "epoch": 2.9878662326131993, "grad_norm": 1.517842173576355, "learning_rate": 6.917662724870155e-10, "loss": 0.2005, "step": 6310 }, { "epoch": 2.9883397454868303, "grad_norm": 1.6192688941955566, "learning_rate": 6.329209910460598e-10, "loss": 0.1934, "step": 6311 }, { "epoch": 2.9888132583604614, "grad_norm": 1.194815754890442, "learning_rate": 5.766909475979354e-10, "loss": 0.2053, "step": 6312 }, { "epoch": 2.989286771234093, "grad_norm": 0.9704394340515137, "learning_rate": 5.230761568508769e-10, "loss": 0.1886, "step": 6313 }, { "epoch": 2.9897602841077244, "grad_norm": 1.4745657444000244, "learning_rate": 4.72076632827001e-10, "loss": 0.2025, "step": 6314 }, { "epoch": 2.9902337969813555, "grad_norm": 1.8984787464141846, "learning_rate": 4.2369238886341704e-10, "loss": 0.2104, "step": 6315 }, { "epoch": 2.9907073098549866, "grad_norm": 1.1849009990692139, "learning_rate": 3.7792343761555717e-10, "loss": 0.1879, "step": 6316 }, { "epoch": 2.991180822728618, "grad_norm": 1.43865168094635, "learning_rate": 3.347697910538461e-10, "loss": 0.2133, "step": 6317 }, { "epoch": 2.991654335602249, "grad_norm": 1.0743141174316406, "learning_rate": 2.942314604648111e-10, "loss": 0.1879, "step": 6318 }, { "epoch": 2.9921278484758806, "grad_norm": 0.9571679830551147, "learning_rate": 2.5630845645108207e-10, "loss": 0.1998, "step": 6319 }, { "epoch": 2.9926013613495117, "grad_norm": 0.9296308755874634, "learning_rate": 2.210007889302812e-10, "loss": 0.1982, "step": 6320 }, { "epoch": 2.993074874223143, "grad_norm": 1.0719934701919556, "learning_rate": 1.883084671372437e-10, "loss": 0.1833, "step": 6321 }, { "epoch": 2.9935483870967743, "grad_norm": 1.165964961051941, "learning_rate": 1.5823149962179707e-10, "loss": 0.198, "step": 6322 }, { "epoch": 2.9940218999704054, "grad_norm": 1.052899956703186, "learning_rate": 1.3076989425098162e-10, "loss": 0.1998, "step": 6323 }, { "epoch": 2.994495412844037, "grad_norm": 1.0535532236099243, "learning_rate": 1.0592365820683015e-10, "loss": 0.2141, "step": 6324 }, { "epoch": 2.994968925717668, "grad_norm": 0.9778185486793518, "learning_rate": 8.369279798747798e-11, "loss": 0.1918, "step": 6325 }, { "epoch": 2.995442438591299, "grad_norm": 1.1342120170593262, "learning_rate": 6.407731940827333e-11, "loss": 0.1983, "step": 6326 }, { "epoch": 2.9959159514649305, "grad_norm": 1.020653247833252, "learning_rate": 4.707722759733635e-11, "loss": 0.1911, "step": 6327 }, { "epoch": 2.9963894643385616, "grad_norm": 0.9396451115608215, "learning_rate": 3.26925270033307e-11, "loss": 0.1879, "step": 6328 }, { "epoch": 2.996862977212193, "grad_norm": 0.9087719321250916, "learning_rate": 2.0923221385471538e-11, "loss": 0.1948, "step": 6329 }, { "epoch": 2.997336490085824, "grad_norm": 1.0705500841140747, "learning_rate": 1.1769313825737983e-11, "loss": 0.1777, "step": 6330 }, { "epoch": 2.997810002959455, "grad_norm": 1.0305906534194946, "learning_rate": 5.230806714440206e-12, "loss": 0.1944, "step": 6331 }, { "epoch": 2.9982835158330867, "grad_norm": 1.1944894790649414, "learning_rate": 1.307701764652336e-12, "loss": 0.1901, "step": 6332 }, { "epoch": 2.9987570287067182, "grad_norm": 1.1254169940948486, "learning_rate": 0.0, "loss": 0.1845, "step": 6333 }, { "epoch": 2.9987570287067182, "step": 6333, "total_flos": 6.209261276653158e+16, "train_loss": 0.25655047147971993, "train_runtime": 12932.3665, "train_samples_per_second": 62.704, "train_steps_per_second": 0.49 } ], "logging_steps": 1.0, "max_steps": 6333, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.209261276653158e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }