diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,44373 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9987570287067182, + "eval_steps": 500, + "global_step": 6333, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00047351287363125186, + "grad_norm": 32.22987365722656, + "learning_rate": 1.0526315789473685e-07, + "loss": 4.8137, + "step": 1 + }, + { + "epoch": 0.0009470257472625037, + "grad_norm": 31.049623489379883, + "learning_rate": 2.105263157894737e-07, + "loss": 4.6949, + "step": 2 + }, + { + "epoch": 0.0014205386208937555, + "grad_norm": 30.38652801513672, + "learning_rate": 3.1578947368421055e-07, + "loss": 4.7183, + "step": 3 + }, + { + "epoch": 0.0018940514945250074, + "grad_norm": 30.45220184326172, + "learning_rate": 4.210526315789474e-07, + "loss": 4.6404, + "step": 4 + }, + { + "epoch": 0.0023675643681562593, + "grad_norm": 29.60318946838379, + "learning_rate": 5.263157894736843e-07, + "loss": 4.6377, + "step": 5 + }, + { + "epoch": 0.002841077241787511, + "grad_norm": 30.485383987426758, + "learning_rate": 6.315789473684211e-07, + "loss": 4.7493, + "step": 6 + }, + { + "epoch": 0.003314590115418763, + "grad_norm": 29.985885620117188, + "learning_rate": 7.368421052631579e-07, + "loss": 4.6343, + "step": 7 + }, + { + "epoch": 0.003788102989050015, + "grad_norm": 29.9451904296875, + "learning_rate": 8.421052631578948e-07, + "loss": 4.6325, + "step": 8 + }, + { + "epoch": 0.0042616158626812665, + "grad_norm": 28.334569931030273, + "learning_rate": 9.473684210526317e-07, + "loss": 4.4123, + "step": 9 + }, + { + "epoch": 0.004735128736312519, + "grad_norm": 30.083621978759766, + "learning_rate": 1.0526315789473685e-06, + "loss": 4.4909, + "step": 10 + }, + { + "epoch": 0.00520864160994377, + "grad_norm": 29.243478775024414, + "learning_rate": 1.1578947368421053e-06, + "loss": 4.4436, + "step": 11 + }, + { + "epoch": 0.005682154483575022, + "grad_norm": 27.903270721435547, + "learning_rate": 1.2631578947368422e-06, + "loss": 4.4001, + "step": 12 + }, + { + "epoch": 0.006155667357206274, + "grad_norm": 28.32964515686035, + "learning_rate": 1.3684210526315791e-06, + "loss": 4.2836, + "step": 13 + }, + { + "epoch": 0.006629180230837526, + "grad_norm": 28.63888931274414, + "learning_rate": 1.4736842105263159e-06, + "loss": 4.2648, + "step": 14 + }, + { + "epoch": 0.007102693104468778, + "grad_norm": 27.617847442626953, + "learning_rate": 1.5789473684210526e-06, + "loss": 4.1683, + "step": 15 + }, + { + "epoch": 0.00757620597810003, + "grad_norm": 27.17360496520996, + "learning_rate": 1.6842105263157895e-06, + "loss": 4.1395, + "step": 16 + }, + { + "epoch": 0.008049718851731282, + "grad_norm": 27.059186935424805, + "learning_rate": 1.7894736842105265e-06, + "loss": 3.9835, + "step": 17 + }, + { + "epoch": 0.008523231725362533, + "grad_norm": 27.937824249267578, + "learning_rate": 1.8947368421052634e-06, + "loss": 3.9632, + "step": 18 + }, + { + "epoch": 0.008996744598993784, + "grad_norm": 27.05862045288086, + "learning_rate": 2.0000000000000003e-06, + "loss": 3.8811, + "step": 19 + }, + { + "epoch": 0.009470257472625037, + "grad_norm": 26.196979522705078, + "learning_rate": 2.105263157894737e-06, + "loss": 3.8312, + "step": 20 + }, + { + "epoch": 0.009943770346256289, + "grad_norm": 26.135305404663086, + "learning_rate": 2.2105263157894738e-06, + "loss": 3.7232, + "step": 21 + }, + { + "epoch": 0.01041728321988754, + "grad_norm": 25.997011184692383, + "learning_rate": 2.3157894736842105e-06, + "loss": 3.6288, + "step": 22 + }, + { + "epoch": 0.010890796093518793, + "grad_norm": 26.397438049316406, + "learning_rate": 2.4210526315789477e-06, + "loss": 3.5437, + "step": 23 + }, + { + "epoch": 0.011364308967150044, + "grad_norm": 24.695541381835938, + "learning_rate": 2.5263157894736844e-06, + "loss": 3.4521, + "step": 24 + }, + { + "epoch": 0.011837821840781295, + "grad_norm": 24.346065521240234, + "learning_rate": 2.631578947368421e-06, + "loss": 3.269, + "step": 25 + }, + { + "epoch": 0.012311334714412548, + "grad_norm": 24.483036041259766, + "learning_rate": 2.7368421052631583e-06, + "loss": 3.2028, + "step": 26 + }, + { + "epoch": 0.0127848475880438, + "grad_norm": 24.411731719970703, + "learning_rate": 2.842105263157895e-06, + "loss": 3.1983, + "step": 27 + }, + { + "epoch": 0.013258360461675053, + "grad_norm": 24.398752212524414, + "learning_rate": 2.9473684210526317e-06, + "loss": 3.0967, + "step": 28 + }, + { + "epoch": 0.013731873335306304, + "grad_norm": 24.130815505981445, + "learning_rate": 3.052631578947369e-06, + "loss": 2.9756, + "step": 29 + }, + { + "epoch": 0.014205386208937555, + "grad_norm": 22.618980407714844, + "learning_rate": 3.157894736842105e-06, + "loss": 2.7865, + "step": 30 + }, + { + "epoch": 0.014678899082568808, + "grad_norm": 22.387496948242188, + "learning_rate": 3.2631578947368423e-06, + "loss": 2.7595, + "step": 31 + }, + { + "epoch": 0.01515241195620006, + "grad_norm": 21.85611915588379, + "learning_rate": 3.368421052631579e-06, + "loss": 2.6221, + "step": 32 + }, + { + "epoch": 0.01562592482983131, + "grad_norm": 21.454317092895508, + "learning_rate": 3.473684210526316e-06, + "loss": 2.4785, + "step": 33 + }, + { + "epoch": 0.016099437703462564, + "grad_norm": 20.223222732543945, + "learning_rate": 3.578947368421053e-06, + "loss": 2.4204, + "step": 34 + }, + { + "epoch": 0.016572950577093813, + "grad_norm": 20.16985511779785, + "learning_rate": 3.6842105263157896e-06, + "loss": 2.4438, + "step": 35 + }, + { + "epoch": 0.017046463450725066, + "grad_norm": 18.412492752075195, + "learning_rate": 3.789473684210527e-06, + "loss": 2.2141, + "step": 36 + }, + { + "epoch": 0.01751997632435632, + "grad_norm": 17.698265075683594, + "learning_rate": 3.894736842105263e-06, + "loss": 2.1291, + "step": 37 + }, + { + "epoch": 0.01799348919798757, + "grad_norm": 15.409119606018066, + "learning_rate": 4.000000000000001e-06, + "loss": 2.0709, + "step": 38 + }, + { + "epoch": 0.01846700207161882, + "grad_norm": 14.957233428955078, + "learning_rate": 4.105263157894737e-06, + "loss": 1.9438, + "step": 39 + }, + { + "epoch": 0.018940514945250075, + "grad_norm": 14.200984001159668, + "learning_rate": 4.210526315789474e-06, + "loss": 1.8377, + "step": 40 + }, + { + "epoch": 0.019414027818881324, + "grad_norm": 11.768697738647461, + "learning_rate": 4.315789473684211e-06, + "loss": 1.8675, + "step": 41 + }, + { + "epoch": 0.019887540692512577, + "grad_norm": 11.194828987121582, + "learning_rate": 4.4210526315789476e-06, + "loss": 1.7639, + "step": 42 + }, + { + "epoch": 0.02036105356614383, + "grad_norm": 10.777433395385742, + "learning_rate": 4.526315789473685e-06, + "loss": 1.6869, + "step": 43 + }, + { + "epoch": 0.02083456643977508, + "grad_norm": 9.706337928771973, + "learning_rate": 4.631578947368421e-06, + "loss": 1.7424, + "step": 44 + }, + { + "epoch": 0.021308079313406333, + "grad_norm": 9.26910400390625, + "learning_rate": 4.736842105263158e-06, + "loss": 1.5467, + "step": 45 + }, + { + "epoch": 0.021781592187037586, + "grad_norm": 8.306986808776855, + "learning_rate": 4.842105263157895e-06, + "loss": 1.5561, + "step": 46 + }, + { + "epoch": 0.022255105060668835, + "grad_norm": 7.666769027709961, + "learning_rate": 4.947368421052632e-06, + "loss": 1.4493, + "step": 47 + }, + { + "epoch": 0.022728617934300088, + "grad_norm": 7.483455181121826, + "learning_rate": 5.052631578947369e-06, + "loss": 1.383, + "step": 48 + }, + { + "epoch": 0.02320213080793134, + "grad_norm": 7.80124044418335, + "learning_rate": 5.157894736842106e-06, + "loss": 1.4092, + "step": 49 + }, + { + "epoch": 0.02367564368156259, + "grad_norm": 7.231191158294678, + "learning_rate": 5.263157894736842e-06, + "loss": 1.3909, + "step": 50 + }, + { + "epoch": 0.024149156555193844, + "grad_norm": 6.258241176605225, + "learning_rate": 5.36842105263158e-06, + "loss": 1.3075, + "step": 51 + }, + { + "epoch": 0.024622669428825097, + "grad_norm": 6.165030479431152, + "learning_rate": 5.4736842105263165e-06, + "loss": 1.2565, + "step": 52 + }, + { + "epoch": 0.02509618230245635, + "grad_norm": 6.1708455085754395, + "learning_rate": 5.578947368421052e-06, + "loss": 1.3199, + "step": 53 + }, + { + "epoch": 0.0255696951760876, + "grad_norm": 5.459502696990967, + "learning_rate": 5.68421052631579e-06, + "loss": 1.1997, + "step": 54 + }, + { + "epoch": 0.026043208049718852, + "grad_norm": 5.5988640785217285, + "learning_rate": 5.789473684210527e-06, + "loss": 1.0775, + "step": 55 + }, + { + "epoch": 0.026516720923350105, + "grad_norm": 4.691328048706055, + "learning_rate": 5.8947368421052634e-06, + "loss": 1.185, + "step": 56 + }, + { + "epoch": 0.026990233796981355, + "grad_norm": 4.2112507820129395, + "learning_rate": 6e-06, + "loss": 1.099, + "step": 57 + }, + { + "epoch": 0.027463746670612608, + "grad_norm": 4.1589674949646, + "learning_rate": 6.105263157894738e-06, + "loss": 1.1201, + "step": 58 + }, + { + "epoch": 0.02793725954424386, + "grad_norm": 4.95751428604126, + "learning_rate": 6.2105263157894745e-06, + "loss": 1.1349, + "step": 59 + }, + { + "epoch": 0.02841077241787511, + "grad_norm": 3.543083906173706, + "learning_rate": 6.31578947368421e-06, + "loss": 1.0243, + "step": 60 + }, + { + "epoch": 0.028884285291506363, + "grad_norm": 2.737982749938965, + "learning_rate": 6.421052631578948e-06, + "loss": 0.9596, + "step": 61 + }, + { + "epoch": 0.029357798165137616, + "grad_norm": 3.6427838802337646, + "learning_rate": 6.526315789473685e-06, + "loss": 0.9429, + "step": 62 + }, + { + "epoch": 0.029831311038768866, + "grad_norm": 3.1644389629364014, + "learning_rate": 6.631578947368421e-06, + "loss": 1.0354, + "step": 63 + }, + { + "epoch": 0.03030482391240012, + "grad_norm": 2.886333465576172, + "learning_rate": 6.736842105263158e-06, + "loss": 0.9896, + "step": 64 + }, + { + "epoch": 0.030778336786031372, + "grad_norm": 2.797492265701294, + "learning_rate": 6.842105263157896e-06, + "loss": 0.9985, + "step": 65 + }, + { + "epoch": 0.03125184965966262, + "grad_norm": 2.594764471054077, + "learning_rate": 6.947368421052632e-06, + "loss": 0.9611, + "step": 66 + }, + { + "epoch": 0.031725362533293874, + "grad_norm": 3.788191795349121, + "learning_rate": 7.052631578947369e-06, + "loss": 0.9094, + "step": 67 + }, + { + "epoch": 0.03219887540692513, + "grad_norm": 2.9197521209716797, + "learning_rate": 7.157894736842106e-06, + "loss": 0.8584, + "step": 68 + }, + { + "epoch": 0.03267238828055638, + "grad_norm": 3.339698314666748, + "learning_rate": 7.263157894736843e-06, + "loss": 0.8872, + "step": 69 + }, + { + "epoch": 0.033145901154187626, + "grad_norm": 3.415419578552246, + "learning_rate": 7.368421052631579e-06, + "loss": 0.8177, + "step": 70 + }, + { + "epoch": 0.03361941402781888, + "grad_norm": 2.3660242557525635, + "learning_rate": 7.473684210526316e-06, + "loss": 0.9576, + "step": 71 + }, + { + "epoch": 0.03409292690145013, + "grad_norm": 2.605135202407837, + "learning_rate": 7.578947368421054e-06, + "loss": 0.8901, + "step": 72 + }, + { + "epoch": 0.034566439775081385, + "grad_norm": 3.861926317214966, + "learning_rate": 7.68421052631579e-06, + "loss": 0.8733, + "step": 73 + }, + { + "epoch": 0.03503995264871264, + "grad_norm": 3.650552272796631, + "learning_rate": 7.789473684210526e-06, + "loss": 0.9273, + "step": 74 + }, + { + "epoch": 0.03551346552234389, + "grad_norm": 3.098768949508667, + "learning_rate": 7.894736842105265e-06, + "loss": 0.9519, + "step": 75 + }, + { + "epoch": 0.03598697839597514, + "grad_norm": 3.1536178588867188, + "learning_rate": 8.000000000000001e-06, + "loss": 0.8491, + "step": 76 + }, + { + "epoch": 0.03646049126960639, + "grad_norm": 1.9899249076843262, + "learning_rate": 8.105263157894736e-06, + "loss": 0.8351, + "step": 77 + }, + { + "epoch": 0.03693400414323764, + "grad_norm": 2.1144354343414307, + "learning_rate": 8.210526315789475e-06, + "loss": 0.8503, + "step": 78 + }, + { + "epoch": 0.037407517016868896, + "grad_norm": 2.7250900268554688, + "learning_rate": 8.315789473684212e-06, + "loss": 0.8294, + "step": 79 + }, + { + "epoch": 0.03788102989050015, + "grad_norm": 2.3137624263763428, + "learning_rate": 8.421052631578948e-06, + "loss": 0.78, + "step": 80 + }, + { + "epoch": 0.0383545427641314, + "grad_norm": 2.3112668991088867, + "learning_rate": 8.526315789473685e-06, + "loss": 0.7696, + "step": 81 + }, + { + "epoch": 0.03882805563776265, + "grad_norm": 2.882868766784668, + "learning_rate": 8.631578947368422e-06, + "loss": 0.829, + "step": 82 + }, + { + "epoch": 0.0393015685113939, + "grad_norm": 2.7895877361297607, + "learning_rate": 8.736842105263158e-06, + "loss": 0.8318, + "step": 83 + }, + { + "epoch": 0.039775081385025154, + "grad_norm": 2.179732322692871, + "learning_rate": 8.842105263157895e-06, + "loss": 0.7615, + "step": 84 + }, + { + "epoch": 0.04024859425865641, + "grad_norm": 2.3356051445007324, + "learning_rate": 8.947368421052632e-06, + "loss": 0.6886, + "step": 85 + }, + { + "epoch": 0.04072210713228766, + "grad_norm": 2.664323568344116, + "learning_rate": 9.05263157894737e-06, + "loss": 0.7625, + "step": 86 + }, + { + "epoch": 0.04119562000591891, + "grad_norm": 2.517026424407959, + "learning_rate": 9.157894736842105e-06, + "loss": 0.6589, + "step": 87 + }, + { + "epoch": 0.04166913287955016, + "grad_norm": 1.9179812669754028, + "learning_rate": 9.263157894736842e-06, + "loss": 0.695, + "step": 88 + }, + { + "epoch": 0.04214264575318141, + "grad_norm": 2.2245266437530518, + "learning_rate": 9.36842105263158e-06, + "loss": 0.6799, + "step": 89 + }, + { + "epoch": 0.042616158626812665, + "grad_norm": 2.003324270248413, + "learning_rate": 9.473684210526315e-06, + "loss": 0.6828, + "step": 90 + }, + { + "epoch": 0.04308967150044392, + "grad_norm": 2.43106746673584, + "learning_rate": 9.578947368421054e-06, + "loss": 0.6584, + "step": 91 + }, + { + "epoch": 0.04356318437407517, + "grad_norm": 2.4158437252044678, + "learning_rate": 9.68421052631579e-06, + "loss": 0.6645, + "step": 92 + }, + { + "epoch": 0.044036697247706424, + "grad_norm": 2.5698916912078857, + "learning_rate": 9.789473684210527e-06, + "loss": 0.6451, + "step": 93 + }, + { + "epoch": 0.04451021012133767, + "grad_norm": 1.9317587614059448, + "learning_rate": 9.894736842105264e-06, + "loss": 0.6401, + "step": 94 + }, + { + "epoch": 0.044983722994968924, + "grad_norm": 1.8760066032409668, + "learning_rate": 1e-05, + "loss": 0.6576, + "step": 95 + }, + { + "epoch": 0.045457235868600177, + "grad_norm": 2.3672170639038086, + "learning_rate": 1.0105263157894738e-05, + "loss": 0.7306, + "step": 96 + }, + { + "epoch": 0.04593074874223143, + "grad_norm": 1.790613055229187, + "learning_rate": 1.0210526315789476e-05, + "loss": 0.6165, + "step": 97 + }, + { + "epoch": 0.04640426161586268, + "grad_norm": 1.7723586559295654, + "learning_rate": 1.0315789473684213e-05, + "loss": 0.5753, + "step": 98 + }, + { + "epoch": 0.046877774489493935, + "grad_norm": 2.3334202766418457, + "learning_rate": 1.0421052631578948e-05, + "loss": 0.6944, + "step": 99 + }, + { + "epoch": 0.04735128736312518, + "grad_norm": 1.6184226274490356, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.5287, + "step": 100 + }, + { + "epoch": 0.047824800236756435, + "grad_norm": 1.6825703382492065, + "learning_rate": 1.0631578947368421e-05, + "loss": 0.5426, + "step": 101 + }, + { + "epoch": 0.04829831311038769, + "grad_norm": 1.4492383003234863, + "learning_rate": 1.073684210526316e-05, + "loss": 0.53, + "step": 102 + }, + { + "epoch": 0.04877182598401894, + "grad_norm": 2.225109577178955, + "learning_rate": 1.0842105263157896e-05, + "loss": 0.5375, + "step": 103 + }, + { + "epoch": 0.049245338857650194, + "grad_norm": 1.645599365234375, + "learning_rate": 1.0947368421052633e-05, + "loss": 0.5288, + "step": 104 + }, + { + "epoch": 0.049718851731281447, + "grad_norm": 1.912824034690857, + "learning_rate": 1.105263157894737e-05, + "loss": 0.5346, + "step": 105 + }, + { + "epoch": 0.0501923646049127, + "grad_norm": 2.340646505355835, + "learning_rate": 1.1157894736842105e-05, + "loss": 0.5581, + "step": 106 + }, + { + "epoch": 0.050665877478543946, + "grad_norm": 1.8115869760513306, + "learning_rate": 1.1263157894736843e-05, + "loss": 0.5986, + "step": 107 + }, + { + "epoch": 0.0511393903521752, + "grad_norm": 2.0887653827667236, + "learning_rate": 1.136842105263158e-05, + "loss": 0.5353, + "step": 108 + }, + { + "epoch": 0.05161290322580645, + "grad_norm": 1.7228904962539673, + "learning_rate": 1.1473684210526317e-05, + "loss": 0.5273, + "step": 109 + }, + { + "epoch": 0.052086416099437705, + "grad_norm": 2.6061673164367676, + "learning_rate": 1.1578947368421053e-05, + "loss": 0.5476, + "step": 110 + }, + { + "epoch": 0.05255992897306896, + "grad_norm": 1.8213406801223755, + "learning_rate": 1.1684210526315792e-05, + "loss": 0.611, + "step": 111 + }, + { + "epoch": 0.05303344184670021, + "grad_norm": 1.6525737047195435, + "learning_rate": 1.1789473684210527e-05, + "loss": 0.5687, + "step": 112 + }, + { + "epoch": 0.05350695472033146, + "grad_norm": 1.7281478643417358, + "learning_rate": 1.1894736842105264e-05, + "loss": 0.5377, + "step": 113 + }, + { + "epoch": 0.05398046759396271, + "grad_norm": 1.835943341255188, + "learning_rate": 1.2e-05, + "loss": 0.4596, + "step": 114 + }, + { + "epoch": 0.05445398046759396, + "grad_norm": 1.6497899293899536, + "learning_rate": 1.2105263157894737e-05, + "loss": 0.544, + "step": 115 + }, + { + "epoch": 0.054927493341225216, + "grad_norm": 1.4863883256912231, + "learning_rate": 1.2210526315789475e-05, + "loss": 0.4536, + "step": 116 + }, + { + "epoch": 0.05540100621485647, + "grad_norm": 1.4068655967712402, + "learning_rate": 1.2315789473684212e-05, + "loss": 0.4522, + "step": 117 + }, + { + "epoch": 0.05587451908848772, + "grad_norm": 1.8424599170684814, + "learning_rate": 1.2421052631578949e-05, + "loss": 0.5377, + "step": 118 + }, + { + "epoch": 0.05634803196211897, + "grad_norm": 1.6451683044433594, + "learning_rate": 1.2526315789473684e-05, + "loss": 0.5254, + "step": 119 + }, + { + "epoch": 0.05682154483575022, + "grad_norm": 1.8392103910446167, + "learning_rate": 1.263157894736842e-05, + "loss": 0.4758, + "step": 120 + }, + { + "epoch": 0.057295057709381474, + "grad_norm": 1.9140554666519165, + "learning_rate": 1.2736842105263159e-05, + "loss": 0.5243, + "step": 121 + }, + { + "epoch": 0.05776857058301273, + "grad_norm": 2.6478939056396484, + "learning_rate": 1.2842105263157896e-05, + "loss": 0.4688, + "step": 122 + }, + { + "epoch": 0.05824208345664398, + "grad_norm": 1.8704174757003784, + "learning_rate": 1.2947368421052633e-05, + "loss": 0.5192, + "step": 123 + }, + { + "epoch": 0.05871559633027523, + "grad_norm": 1.791478157043457, + "learning_rate": 1.305263157894737e-05, + "loss": 0.5216, + "step": 124 + }, + { + "epoch": 0.05918910920390648, + "grad_norm": 2.0871686935424805, + "learning_rate": 1.3157894736842108e-05, + "loss": 0.4937, + "step": 125 + }, + { + "epoch": 0.05966262207753773, + "grad_norm": 1.6168326139450073, + "learning_rate": 1.3263157894736843e-05, + "loss": 0.5167, + "step": 126 + }, + { + "epoch": 0.060136134951168985, + "grad_norm": 1.9244468212127686, + "learning_rate": 1.336842105263158e-05, + "loss": 0.482, + "step": 127 + }, + { + "epoch": 0.06060964782480024, + "grad_norm": 1.7854818105697632, + "learning_rate": 1.3473684210526316e-05, + "loss": 0.5376, + "step": 128 + }, + { + "epoch": 0.06108316069843149, + "grad_norm": 1.5491472482681274, + "learning_rate": 1.3578947368421055e-05, + "loss": 0.4578, + "step": 129 + }, + { + "epoch": 0.061556673572062744, + "grad_norm": 2.987952470779419, + "learning_rate": 1.3684210526315791e-05, + "loss": 0.4415, + "step": 130 + }, + { + "epoch": 0.06203018644569399, + "grad_norm": 2.423494338989258, + "learning_rate": 1.3789473684210528e-05, + "loss": 0.4908, + "step": 131 + }, + { + "epoch": 0.06250369931932524, + "grad_norm": 1.8803611993789673, + "learning_rate": 1.3894736842105265e-05, + "loss": 0.4588, + "step": 132 + }, + { + "epoch": 0.0629772121929565, + "grad_norm": 2.069321393966675, + "learning_rate": 1.4e-05, + "loss": 0.5236, + "step": 133 + }, + { + "epoch": 0.06345072506658775, + "grad_norm": 2.4028356075286865, + "learning_rate": 1.4105263157894738e-05, + "loss": 0.5207, + "step": 134 + }, + { + "epoch": 0.063924237940219, + "grad_norm": 1.9155303239822388, + "learning_rate": 1.4210526315789475e-05, + "loss": 0.4975, + "step": 135 + }, + { + "epoch": 0.06439775081385025, + "grad_norm": 3.001650810241699, + "learning_rate": 1.4315789473684212e-05, + "loss": 0.4302, + "step": 136 + }, + { + "epoch": 0.0648712636874815, + "grad_norm": 2.335148334503174, + "learning_rate": 1.4421052631578948e-05, + "loss": 0.5123, + "step": 137 + }, + { + "epoch": 0.06534477656111276, + "grad_norm": 1.7075388431549072, + "learning_rate": 1.4526315789473687e-05, + "loss": 0.456, + "step": 138 + }, + { + "epoch": 0.065818289434744, + "grad_norm": 2.3079159259796143, + "learning_rate": 1.4631578947368424e-05, + "loss": 0.486, + "step": 139 + }, + { + "epoch": 0.06629180230837525, + "grad_norm": 2.429774522781372, + "learning_rate": 1.4736842105263159e-05, + "loss": 0.4889, + "step": 140 + }, + { + "epoch": 0.06676531518200651, + "grad_norm": 2.658094644546509, + "learning_rate": 1.4842105263157895e-05, + "loss": 0.4289, + "step": 141 + }, + { + "epoch": 0.06723882805563776, + "grad_norm": 1.507424235343933, + "learning_rate": 1.4947368421052632e-05, + "loss": 0.4725, + "step": 142 + }, + { + "epoch": 0.06771234092926902, + "grad_norm": 1.9254406690597534, + "learning_rate": 1.505263157894737e-05, + "loss": 0.456, + "step": 143 + }, + { + "epoch": 0.06818585380290026, + "grad_norm": 2.3014895915985107, + "learning_rate": 1.5157894736842107e-05, + "loss": 0.4627, + "step": 144 + }, + { + "epoch": 0.06865936667653152, + "grad_norm": 2.4393584728240967, + "learning_rate": 1.5263157894736846e-05, + "loss": 0.5304, + "step": 145 + }, + { + "epoch": 0.06913287955016277, + "grad_norm": 1.627350926399231, + "learning_rate": 1.536842105263158e-05, + "loss": 0.4897, + "step": 146 + }, + { + "epoch": 0.06960639242379402, + "grad_norm": 1.4329842329025269, + "learning_rate": 1.5473684210526316e-05, + "loss": 0.4581, + "step": 147 + }, + { + "epoch": 0.07007990529742528, + "grad_norm": 1.653219223022461, + "learning_rate": 1.5578947368421052e-05, + "loss": 0.4961, + "step": 148 + }, + { + "epoch": 0.07055341817105652, + "grad_norm": 1.8321658372879028, + "learning_rate": 1.568421052631579e-05, + "loss": 0.4684, + "step": 149 + }, + { + "epoch": 0.07102693104468778, + "grad_norm": 1.8446546792984009, + "learning_rate": 1.578947368421053e-05, + "loss": 0.4637, + "step": 150 + }, + { + "epoch": 0.07150044391831903, + "grad_norm": 1.486007809638977, + "learning_rate": 1.5894736842105266e-05, + "loss": 0.3637, + "step": 151 + }, + { + "epoch": 0.07197395679195027, + "grad_norm": 1.6993294954299927, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4323, + "step": 152 + }, + { + "epoch": 0.07244746966558153, + "grad_norm": 1.391952633857727, + "learning_rate": 1.6105263157894736e-05, + "loss": 0.4072, + "step": 153 + }, + { + "epoch": 0.07292098253921278, + "grad_norm": 2.0186688899993896, + "learning_rate": 1.6210526315789473e-05, + "loss": 0.4412, + "step": 154 + }, + { + "epoch": 0.07339449541284404, + "grad_norm": 2.2258450984954834, + "learning_rate": 1.6315789473684213e-05, + "loss": 0.4302, + "step": 155 + }, + { + "epoch": 0.07386800828647529, + "grad_norm": 1.21897554397583, + "learning_rate": 1.642105263157895e-05, + "loss": 0.4197, + "step": 156 + }, + { + "epoch": 0.07434152116010655, + "grad_norm": 2.815114974975586, + "learning_rate": 1.6526315789473686e-05, + "loss": 0.4093, + "step": 157 + }, + { + "epoch": 0.07481503403373779, + "grad_norm": 1.9448319673538208, + "learning_rate": 1.6631578947368423e-05, + "loss": 0.3686, + "step": 158 + }, + { + "epoch": 0.07528854690736904, + "grad_norm": 1.9755760431289673, + "learning_rate": 1.673684210526316e-05, + "loss": 0.4371, + "step": 159 + }, + { + "epoch": 0.0757620597810003, + "grad_norm": 2.202780246734619, + "learning_rate": 1.6842105263157896e-05, + "loss": 0.41, + "step": 160 + }, + { + "epoch": 0.07623557265463154, + "grad_norm": 1.373448133468628, + "learning_rate": 1.6947368421052633e-05, + "loss": 0.3839, + "step": 161 + }, + { + "epoch": 0.0767090855282628, + "grad_norm": 2.2185139656066895, + "learning_rate": 1.705263157894737e-05, + "loss": 0.4207, + "step": 162 + }, + { + "epoch": 0.07718259840189405, + "grad_norm": 1.905508279800415, + "learning_rate": 1.7157894736842107e-05, + "loss": 0.3555, + "step": 163 + }, + { + "epoch": 0.0776561112755253, + "grad_norm": 1.355228304862976, + "learning_rate": 1.7263157894736843e-05, + "loss": 0.4047, + "step": 164 + }, + { + "epoch": 0.07812962414915656, + "grad_norm": 1.822799563407898, + "learning_rate": 1.736842105263158e-05, + "loss": 0.4355, + "step": 165 + }, + { + "epoch": 0.0786031370227878, + "grad_norm": 1.541835069656372, + "learning_rate": 1.7473684210526317e-05, + "loss": 0.3955, + "step": 166 + }, + { + "epoch": 0.07907664989641906, + "grad_norm": 1.8151495456695557, + "learning_rate": 1.7578947368421054e-05, + "loss": 0.4162, + "step": 167 + }, + { + "epoch": 0.07955016277005031, + "grad_norm": 1.320173978805542, + "learning_rate": 1.768421052631579e-05, + "loss": 0.4257, + "step": 168 + }, + { + "epoch": 0.08002367564368157, + "grad_norm": 2.0332558155059814, + "learning_rate": 1.7789473684210527e-05, + "loss": 0.3654, + "step": 169 + }, + { + "epoch": 0.08049718851731281, + "grad_norm": 1.4273725748062134, + "learning_rate": 1.7894736842105264e-05, + "loss": 0.4193, + "step": 170 + }, + { + "epoch": 0.08097070139094406, + "grad_norm": 1.5815318822860718, + "learning_rate": 1.8e-05, + "loss": 0.4495, + "step": 171 + }, + { + "epoch": 0.08144421426457532, + "grad_norm": 1.5733940601348877, + "learning_rate": 1.810526315789474e-05, + "loss": 0.4453, + "step": 172 + }, + { + "epoch": 0.08191772713820657, + "grad_norm": 1.554313063621521, + "learning_rate": 1.8210526315789477e-05, + "loss": 0.4122, + "step": 173 + }, + { + "epoch": 0.08239124001183783, + "grad_norm": 1.6655805110931396, + "learning_rate": 1.831578947368421e-05, + "loss": 0.4201, + "step": 174 + }, + { + "epoch": 0.08286475288546907, + "grad_norm": 2.0391829013824463, + "learning_rate": 1.8421052631578947e-05, + "loss": 0.3712, + "step": 175 + }, + { + "epoch": 0.08333826575910032, + "grad_norm": 1.7739325761795044, + "learning_rate": 1.8526315789473684e-05, + "loss": 0.3718, + "step": 176 + }, + { + "epoch": 0.08381177863273158, + "grad_norm": 1.7949507236480713, + "learning_rate": 1.8631578947368424e-05, + "loss": 0.4053, + "step": 177 + }, + { + "epoch": 0.08428529150636282, + "grad_norm": 1.8814252614974976, + "learning_rate": 1.873684210526316e-05, + "loss": 0.3998, + "step": 178 + }, + { + "epoch": 0.08475880437999408, + "grad_norm": 1.8132871389389038, + "learning_rate": 1.8842105263157898e-05, + "loss": 0.4553, + "step": 179 + }, + { + "epoch": 0.08523231725362533, + "grad_norm": 1.2668806314468384, + "learning_rate": 1.894736842105263e-05, + "loss": 0.3793, + "step": 180 + }, + { + "epoch": 0.08570583012725659, + "grad_norm": 1.6622042655944824, + "learning_rate": 1.9052631578947368e-05, + "loss": 0.398, + "step": 181 + }, + { + "epoch": 0.08617934300088784, + "grad_norm": 2.258330821990967, + "learning_rate": 1.9157894736842108e-05, + "loss": 0.3751, + "step": 182 + }, + { + "epoch": 0.08665285587451908, + "grad_norm": 2.4360086917877197, + "learning_rate": 1.9263157894736845e-05, + "loss": 0.3793, + "step": 183 + }, + { + "epoch": 0.08712636874815034, + "grad_norm": 1.8272238969802856, + "learning_rate": 1.936842105263158e-05, + "loss": 0.4108, + "step": 184 + }, + { + "epoch": 0.08759988162178159, + "grad_norm": 1.8764162063598633, + "learning_rate": 1.9473684210526318e-05, + "loss": 0.3656, + "step": 185 + }, + { + "epoch": 0.08807339449541285, + "grad_norm": 1.8359413146972656, + "learning_rate": 1.9578947368421055e-05, + "loss": 0.4059, + "step": 186 + }, + { + "epoch": 0.0885469073690441, + "grad_norm": 1.6942843198776245, + "learning_rate": 1.968421052631579e-05, + "loss": 0.4086, + "step": 187 + }, + { + "epoch": 0.08902042024267534, + "grad_norm": 1.4842171669006348, + "learning_rate": 1.9789473684210528e-05, + "loss": 0.3665, + "step": 188 + }, + { + "epoch": 0.0894939331163066, + "grad_norm": 2.0384953022003174, + "learning_rate": 1.9894736842105265e-05, + "loss": 0.4014, + "step": 189 + }, + { + "epoch": 0.08996744598993785, + "grad_norm": 2.0458626747131348, + "learning_rate": 2e-05, + "loss": 0.3963, + "step": 190 + }, + { + "epoch": 0.09044095886356911, + "grad_norm": 1.6842604875564575, + "learning_rate": 1.999999869229824e-05, + "loss": 0.3383, + "step": 191 + }, + { + "epoch": 0.09091447173720035, + "grad_norm": 1.3942151069641113, + "learning_rate": 1.9999994769193288e-05, + "loss": 0.3629, + "step": 192 + }, + { + "epoch": 0.09138798461083161, + "grad_norm": 1.8772989511489868, + "learning_rate": 1.9999988230686176e-05, + "loss": 0.3963, + "step": 193 + }, + { + "epoch": 0.09186149748446286, + "grad_norm": 1.479197382926941, + "learning_rate": 1.9999979076778615e-05, + "loss": 0.409, + "step": 194 + }, + { + "epoch": 0.0923350103580941, + "grad_norm": 1.848061442375183, + "learning_rate": 1.9999967307473e-05, + "loss": 0.3716, + "step": 195 + }, + { + "epoch": 0.09280852323172536, + "grad_norm": 1.4783906936645508, + "learning_rate": 1.9999952922772404e-05, + "loss": 0.3995, + "step": 196 + }, + { + "epoch": 0.09328203610535661, + "grad_norm": 1.4269295930862427, + "learning_rate": 1.9999935922680593e-05, + "loss": 0.428, + "step": 197 + }, + { + "epoch": 0.09375554897898787, + "grad_norm": 2.0845489501953125, + "learning_rate": 1.9999916307202013e-05, + "loss": 0.3909, + "step": 198 + }, + { + "epoch": 0.09422906185261912, + "grad_norm": 1.550615668296814, + "learning_rate": 1.9999894076341794e-05, + "loss": 0.3585, + "step": 199 + }, + { + "epoch": 0.09470257472625036, + "grad_norm": 1.9279413223266602, + "learning_rate": 1.999986923010575e-05, + "loss": 0.3696, + "step": 200 + }, + { + "epoch": 0.09517608759988162, + "grad_norm": 1.6810389757156372, + "learning_rate": 1.999984176850038e-05, + "loss": 0.4183, + "step": 201 + }, + { + "epoch": 0.09564960047351287, + "grad_norm": 1.6918199062347412, + "learning_rate": 1.9999811691532865e-05, + "loss": 0.4065, + "step": 202 + }, + { + "epoch": 0.09612311334714413, + "grad_norm": 1.8900259733200073, + "learning_rate": 1.999977899921107e-05, + "loss": 0.3825, + "step": 203 + }, + { + "epoch": 0.09659662622077538, + "grad_norm": 1.9546706676483154, + "learning_rate": 1.999974369154355e-05, + "loss": 0.3864, + "step": 204 + }, + { + "epoch": 0.09707013909440664, + "grad_norm": 1.354948878288269, + "learning_rate": 1.9999705768539537e-05, + "loss": 0.3947, + "step": 205 + }, + { + "epoch": 0.09754365196803788, + "grad_norm": 1.7710624933242798, + "learning_rate": 1.9999665230208947e-05, + "loss": 0.4115, + "step": 206 + }, + { + "epoch": 0.09801716484166913, + "grad_norm": 1.6053873300552368, + "learning_rate": 1.9999622076562387e-05, + "loss": 0.3892, + "step": 207 + }, + { + "epoch": 0.09849067771530039, + "grad_norm": 1.1621034145355225, + "learning_rate": 1.999957630761114e-05, + "loss": 0.3298, + "step": 208 + }, + { + "epoch": 0.09896419058893163, + "grad_norm": 1.2299578189849854, + "learning_rate": 1.9999527923367175e-05, + "loss": 0.3176, + "step": 209 + }, + { + "epoch": 0.09943770346256289, + "grad_norm": 1.7731809616088867, + "learning_rate": 1.999947692384315e-05, + "loss": 0.3745, + "step": 210 + }, + { + "epoch": 0.09991121633619414, + "grad_norm": 1.3102498054504395, + "learning_rate": 1.9999423309052405e-05, + "loss": 0.3688, + "step": 211 + }, + { + "epoch": 0.1003847292098254, + "grad_norm": 1.3057738542556763, + "learning_rate": 1.9999367079008957e-05, + "loss": 0.3865, + "step": 212 + }, + { + "epoch": 0.10085824208345665, + "grad_norm": 1.6710134744644165, + "learning_rate": 1.9999308233727516e-05, + "loss": 0.3523, + "step": 213 + }, + { + "epoch": 0.10133175495708789, + "grad_norm": 1.6011284589767456, + "learning_rate": 1.9999246773223468e-05, + "loss": 0.3286, + "step": 214 + }, + { + "epoch": 0.10180526783071915, + "grad_norm": 1.5414258241653442, + "learning_rate": 1.9999182697512896e-05, + "loss": 0.3608, + "step": 215 + }, + { + "epoch": 0.1022787807043504, + "grad_norm": 2.0255424976348877, + "learning_rate": 1.9999116006612553e-05, + "loss": 0.3466, + "step": 216 + }, + { + "epoch": 0.10275229357798166, + "grad_norm": 1.9602148532867432, + "learning_rate": 1.999904670053988e-05, + "loss": 0.3827, + "step": 217 + }, + { + "epoch": 0.1032258064516129, + "grad_norm": 2.306156635284424, + "learning_rate": 1.9998974779313004e-05, + "loss": 0.371, + "step": 218 + }, + { + "epoch": 0.10369931932524415, + "grad_norm": 1.572983980178833, + "learning_rate": 1.9998900242950736e-05, + "loss": 0.3279, + "step": 219 + }, + { + "epoch": 0.10417283219887541, + "grad_norm": 1.3669328689575195, + "learning_rate": 1.9998823091472574e-05, + "loss": 0.3717, + "step": 220 + }, + { + "epoch": 0.10464634507250666, + "grad_norm": 1.9657361507415771, + "learning_rate": 1.9998743324898687e-05, + "loss": 0.3583, + "step": 221 + }, + { + "epoch": 0.10511985794613792, + "grad_norm": 1.8538391590118408, + "learning_rate": 1.9998660943249947e-05, + "loss": 0.3816, + "step": 222 + }, + { + "epoch": 0.10559337081976916, + "grad_norm": 1.4254624843597412, + "learning_rate": 1.9998575946547897e-05, + "loss": 0.3319, + "step": 223 + }, + { + "epoch": 0.10606688369340042, + "grad_norm": 1.865258812904358, + "learning_rate": 1.9998488334814766e-05, + "loss": 0.3527, + "step": 224 + }, + { + "epoch": 0.10654039656703167, + "grad_norm": 1.446964979171753, + "learning_rate": 1.9998398108073465e-05, + "loss": 0.4055, + "step": 225 + }, + { + "epoch": 0.10701390944066291, + "grad_norm": 1.7752127647399902, + "learning_rate": 1.9998305266347598e-05, + "loss": 0.3875, + "step": 226 + }, + { + "epoch": 0.10748742231429417, + "grad_norm": 1.8876196146011353, + "learning_rate": 1.9998209809661443e-05, + "loss": 0.3439, + "step": 227 + }, + { + "epoch": 0.10796093518792542, + "grad_norm": 1.2993323802947998, + "learning_rate": 1.9998111738039964e-05, + "loss": 0.3372, + "step": 228 + }, + { + "epoch": 0.10843444806155668, + "grad_norm": 1.4371949434280396, + "learning_rate": 1.9998011051508816e-05, + "loss": 0.3672, + "step": 229 + }, + { + "epoch": 0.10890796093518793, + "grad_norm": 1.5482254028320312, + "learning_rate": 1.9997907750094332e-05, + "loss": 0.3554, + "step": 230 + }, + { + "epoch": 0.10938147380881917, + "grad_norm": 1.4659943580627441, + "learning_rate": 1.9997801833823526e-05, + "loss": 0.3691, + "step": 231 + }, + { + "epoch": 0.10985498668245043, + "grad_norm": 1.5168931484222412, + "learning_rate": 1.99976933027241e-05, + "loss": 0.3559, + "step": 232 + }, + { + "epoch": 0.11032849955608168, + "grad_norm": 1.4778302907943726, + "learning_rate": 1.999758215682444e-05, + "loss": 0.3201, + "step": 233 + }, + { + "epoch": 0.11080201242971294, + "grad_norm": 2.5164334774017334, + "learning_rate": 1.9997468396153615e-05, + "loss": 0.3711, + "step": 234 + }, + { + "epoch": 0.11127552530334418, + "grad_norm": 1.5941665172576904, + "learning_rate": 1.999735202074138e-05, + "loss": 0.3489, + "step": 235 + }, + { + "epoch": 0.11174903817697544, + "grad_norm": 1.5780380964279175, + "learning_rate": 1.9997233030618167e-05, + "loss": 0.3282, + "step": 236 + }, + { + "epoch": 0.11222255105060669, + "grad_norm": 1.8493186235427856, + "learning_rate": 1.9997111425815103e-05, + "loss": 0.3516, + "step": 237 + }, + { + "epoch": 0.11269606392423794, + "grad_norm": 1.4640581607818604, + "learning_rate": 1.999698720636399e-05, + "loss": 0.3542, + "step": 238 + }, + { + "epoch": 0.1131695767978692, + "grad_norm": 2.8939545154571533, + "learning_rate": 1.9996860372297312e-05, + "loss": 0.3596, + "step": 239 + }, + { + "epoch": 0.11364308967150044, + "grad_norm": 1.7447127103805542, + "learning_rate": 1.999673092364825e-05, + "loss": 0.3593, + "step": 240 + }, + { + "epoch": 0.1141166025451317, + "grad_norm": 1.753167986869812, + "learning_rate": 1.9996598860450653e-05, + "loss": 0.3278, + "step": 241 + }, + { + "epoch": 0.11459011541876295, + "grad_norm": 1.8249387741088867, + "learning_rate": 1.9996464182739063e-05, + "loss": 0.3373, + "step": 242 + }, + { + "epoch": 0.1150636282923942, + "grad_norm": 1.9170994758605957, + "learning_rate": 1.99963268905487e-05, + "loss": 0.3284, + "step": 243 + }, + { + "epoch": 0.11553714116602545, + "grad_norm": 2.2071142196655273, + "learning_rate": 1.999618698391548e-05, + "loss": 0.3633, + "step": 244 + }, + { + "epoch": 0.1160106540396567, + "grad_norm": 1.6293482780456543, + "learning_rate": 1.9996044462875984e-05, + "loss": 0.3624, + "step": 245 + }, + { + "epoch": 0.11648416691328796, + "grad_norm": 2.3833389282226562, + "learning_rate": 1.9995899327467498e-05, + "loss": 0.3329, + "step": 246 + }, + { + "epoch": 0.1169576797869192, + "grad_norm": 1.9268333911895752, + "learning_rate": 1.999575157772797e-05, + "loss": 0.3451, + "step": 247 + }, + { + "epoch": 0.11743119266055047, + "grad_norm": 1.3293579816818237, + "learning_rate": 1.9995601213696053e-05, + "loss": 0.3676, + "step": 248 + }, + { + "epoch": 0.11790470553418171, + "grad_norm": 1.651096224784851, + "learning_rate": 1.9995448235411062e-05, + "loss": 0.3527, + "step": 249 + }, + { + "epoch": 0.11837821840781296, + "grad_norm": 1.949661135673523, + "learning_rate": 1.999529264291302e-05, + "loss": 0.4063, + "step": 250 + }, + { + "epoch": 0.11885173128144422, + "grad_norm": 1.3306670188903809, + "learning_rate": 1.9995134436242606e-05, + "loss": 0.3607, + "step": 251 + }, + { + "epoch": 0.11932524415507546, + "grad_norm": 1.5161584615707397, + "learning_rate": 1.9994973615441207e-05, + "loss": 0.3825, + "step": 252 + }, + { + "epoch": 0.11979875702870672, + "grad_norm": 2.071152687072754, + "learning_rate": 1.9994810180550884e-05, + "loss": 0.3306, + "step": 253 + }, + { + "epoch": 0.12027226990233797, + "grad_norm": 1.514334797859192, + "learning_rate": 1.9994644131614382e-05, + "loss": 0.3254, + "step": 254 + }, + { + "epoch": 0.12074578277596922, + "grad_norm": 1.9304691553115845, + "learning_rate": 1.9994475468675122e-05, + "loss": 0.3863, + "step": 255 + }, + { + "epoch": 0.12121929564960048, + "grad_norm": 1.6955091953277588, + "learning_rate": 1.9994304191777228e-05, + "loss": 0.3559, + "step": 256 + }, + { + "epoch": 0.12169280852323172, + "grad_norm": 1.3693172931671143, + "learning_rate": 1.9994130300965485e-05, + "loss": 0.3393, + "step": 257 + }, + { + "epoch": 0.12216632139686298, + "grad_norm": 1.4945056438446045, + "learning_rate": 1.9993953796285377e-05, + "loss": 0.3051, + "step": 258 + }, + { + "epoch": 0.12263983427049423, + "grad_norm": 1.3563364744186401, + "learning_rate": 1.999377467778307e-05, + "loss": 0.3369, + "step": 259 + }, + { + "epoch": 0.12311334714412549, + "grad_norm": 1.2931690216064453, + "learning_rate": 1.9993592945505402e-05, + "loss": 0.3297, + "step": 260 + }, + { + "epoch": 0.12358686001775673, + "grad_norm": 1.5800117254257202, + "learning_rate": 1.9993408599499914e-05, + "loss": 0.3287, + "step": 261 + }, + { + "epoch": 0.12406037289138798, + "grad_norm": 1.3716331720352173, + "learning_rate": 1.999322163981481e-05, + "loss": 0.3092, + "step": 262 + }, + { + "epoch": 0.12453388576501924, + "grad_norm": 1.6586685180664062, + "learning_rate": 1.9993032066499e-05, + "loss": 0.4093, + "step": 263 + }, + { + "epoch": 0.12500739863865049, + "grad_norm": 1.4457738399505615, + "learning_rate": 1.999283987960205e-05, + "loss": 0.3679, + "step": 264 + }, + { + "epoch": 0.12548091151228175, + "grad_norm": 1.7291022539138794, + "learning_rate": 1.9992645079174235e-05, + "loss": 0.3708, + "step": 265 + }, + { + "epoch": 0.125954424385913, + "grad_norm": 1.6154969930648804, + "learning_rate": 1.99924476652665e-05, + "loss": 0.3684, + "step": 266 + }, + { + "epoch": 0.12642793725954424, + "grad_norm": 1.2971596717834473, + "learning_rate": 1.999224763793048e-05, + "loss": 0.3378, + "step": 267 + }, + { + "epoch": 0.1269014501331755, + "grad_norm": 2.0271036624908447, + "learning_rate": 1.9992044997218484e-05, + "loss": 0.3062, + "step": 268 + }, + { + "epoch": 0.12737496300680676, + "grad_norm": 1.4624063968658447, + "learning_rate": 1.9991839743183514e-05, + "loss": 0.3105, + "step": 269 + }, + { + "epoch": 0.127848475880438, + "grad_norm": 1.4940061569213867, + "learning_rate": 1.9991631875879252e-05, + "loss": 0.3523, + "step": 270 + }, + { + "epoch": 0.12832198875406925, + "grad_norm": 1.7870043516159058, + "learning_rate": 1.9991421395360066e-05, + "loss": 0.3551, + "step": 271 + }, + { + "epoch": 0.1287955016277005, + "grad_norm": 1.2716615200042725, + "learning_rate": 1.9991208301681e-05, + "loss": 0.3073, + "step": 272 + }, + { + "epoch": 0.12926901450133174, + "grad_norm": 1.2899311780929565, + "learning_rate": 1.9990992594897792e-05, + "loss": 0.3495, + "step": 273 + }, + { + "epoch": 0.129742527374963, + "grad_norm": 1.1452953815460205, + "learning_rate": 1.9990774275066856e-05, + "loss": 0.3054, + "step": 274 + }, + { + "epoch": 0.13021604024859426, + "grad_norm": 1.4126183986663818, + "learning_rate": 1.9990553342245288e-05, + "loss": 0.3209, + "step": 275 + }, + { + "epoch": 0.13068955312222552, + "grad_norm": 1.9172788858413696, + "learning_rate": 1.9990329796490878e-05, + "loss": 0.326, + "step": 276 + }, + { + "epoch": 0.13116306599585675, + "grad_norm": 1.8034718036651611, + "learning_rate": 1.9990103637862086e-05, + "loss": 0.3302, + "step": 277 + }, + { + "epoch": 0.131636578869488, + "grad_norm": 2.3256547451019287, + "learning_rate": 1.998987486641806e-05, + "loss": 0.36, + "step": 278 + }, + { + "epoch": 0.13211009174311927, + "grad_norm": 2.554792642593384, + "learning_rate": 1.9989643482218642e-05, + "loss": 0.3144, + "step": 279 + }, + { + "epoch": 0.1325836046167505, + "grad_norm": 1.872496485710144, + "learning_rate": 1.9989409485324342e-05, + "loss": 0.3312, + "step": 280 + }, + { + "epoch": 0.13305711749038177, + "grad_norm": 1.47379469871521, + "learning_rate": 1.998917287579636e-05, + "loss": 0.359, + "step": 281 + }, + { + "epoch": 0.13353063036401303, + "grad_norm": 1.6940183639526367, + "learning_rate": 1.998893365369658e-05, + "loss": 0.3364, + "step": 282 + }, + { + "epoch": 0.13400414323764429, + "grad_norm": 2.1470634937286377, + "learning_rate": 1.9988691819087568e-05, + "loss": 0.3076, + "step": 283 + }, + { + "epoch": 0.13447765611127552, + "grad_norm": 1.484626054763794, + "learning_rate": 1.9988447372032573e-05, + "loss": 0.3061, + "step": 284 + }, + { + "epoch": 0.13495116898490678, + "grad_norm": 2.2221710681915283, + "learning_rate": 1.9988200312595527e-05, + "loss": 0.3271, + "step": 285 + }, + { + "epoch": 0.13542468185853804, + "grad_norm": 1.8547298908233643, + "learning_rate": 1.998795064084105e-05, + "loss": 0.3321, + "step": 286 + }, + { + "epoch": 0.13589819473216927, + "grad_norm": 1.4910904169082642, + "learning_rate": 1.9987698356834438e-05, + "loss": 0.3319, + "step": 287 + }, + { + "epoch": 0.13637170760580053, + "grad_norm": 1.8652129173278809, + "learning_rate": 1.9987443460641672e-05, + "loss": 0.3279, + "step": 288 + }, + { + "epoch": 0.1368452204794318, + "grad_norm": 1.7757213115692139, + "learning_rate": 1.998718595232942e-05, + "loss": 0.3662, + "step": 289 + }, + { + "epoch": 0.13731873335306305, + "grad_norm": 1.7252709865570068, + "learning_rate": 1.998692583196503e-05, + "loss": 0.3394, + "step": 290 + }, + { + "epoch": 0.13779224622669428, + "grad_norm": 1.5977404117584229, + "learning_rate": 1.998666309961653e-05, + "loss": 0.3187, + "step": 291 + }, + { + "epoch": 0.13826575910032554, + "grad_norm": 2.18762469291687, + "learning_rate": 1.9986397755352643e-05, + "loss": 0.3153, + "step": 292 + }, + { + "epoch": 0.1387392719739568, + "grad_norm": 1.6634726524353027, + "learning_rate": 1.9986129799242767e-05, + "loss": 0.3312, + "step": 293 + }, + { + "epoch": 0.13921278484758803, + "grad_norm": 1.709532618522644, + "learning_rate": 1.9985859231356976e-05, + "loss": 0.3461, + "step": 294 + }, + { + "epoch": 0.1396862977212193, + "grad_norm": 1.9468045234680176, + "learning_rate": 1.998558605176604e-05, + "loss": 0.3441, + "step": 295 + }, + { + "epoch": 0.14015981059485055, + "grad_norm": 1.3585155010223389, + "learning_rate": 1.9985310260541403e-05, + "loss": 0.332, + "step": 296 + }, + { + "epoch": 0.14063332346848179, + "grad_norm": 2.272162437438965, + "learning_rate": 1.99850318577552e-05, + "loss": 0.3187, + "step": 297 + }, + { + "epoch": 0.14110683634211305, + "grad_norm": 1.5794426202774048, + "learning_rate": 1.998475084348024e-05, + "loss": 0.2903, + "step": 298 + }, + { + "epoch": 0.1415803492157443, + "grad_norm": 1.7926369905471802, + "learning_rate": 1.998446721779002e-05, + "loss": 0.3271, + "step": 299 + }, + { + "epoch": 0.14205386208937557, + "grad_norm": 2.8953635692596436, + "learning_rate": 1.9984180980758724e-05, + "loss": 0.3276, + "step": 300 + }, + { + "epoch": 0.1425273749630068, + "grad_norm": 2.217665910720825, + "learning_rate": 1.998389213246121e-05, + "loss": 0.3039, + "step": 301 + }, + { + "epoch": 0.14300088783663806, + "grad_norm": 1.6621983051300049, + "learning_rate": 1.9983600672973026e-05, + "loss": 0.3323, + "step": 302 + }, + { + "epoch": 0.14347440071026932, + "grad_norm": 2.2490603923797607, + "learning_rate": 1.99833066023704e-05, + "loss": 0.3002, + "step": 303 + }, + { + "epoch": 0.14394791358390055, + "grad_norm": 1.3009684085845947, + "learning_rate": 1.9983009920730244e-05, + "loss": 0.3074, + "step": 304 + }, + { + "epoch": 0.1444214264575318, + "grad_norm": 1.508434534072876, + "learning_rate": 1.998271062813015e-05, + "loss": 0.3917, + "step": 305 + }, + { + "epoch": 0.14489493933116307, + "grad_norm": 1.421549677848816, + "learning_rate": 1.998240872464839e-05, + "loss": 0.3269, + "step": 306 + }, + { + "epoch": 0.14536845220479433, + "grad_norm": 1.5330595970153809, + "learning_rate": 1.9982104210363936e-05, + "loss": 0.3122, + "step": 307 + }, + { + "epoch": 0.14584196507842556, + "grad_norm": 1.7626796960830688, + "learning_rate": 1.9981797085356426e-05, + "loss": 0.3176, + "step": 308 + }, + { + "epoch": 0.14631547795205682, + "grad_norm": 1.6426533460617065, + "learning_rate": 1.998148734970618e-05, + "loss": 0.3441, + "step": 309 + }, + { + "epoch": 0.14678899082568808, + "grad_norm": 1.3211804628372192, + "learning_rate": 1.9981175003494217e-05, + "loss": 0.2941, + "step": 310 + }, + { + "epoch": 0.1472625036993193, + "grad_norm": 1.625162124633789, + "learning_rate": 1.9980860046802214e-05, + "loss": 0.3284, + "step": 311 + }, + { + "epoch": 0.14773601657295057, + "grad_norm": 1.1030138731002808, + "learning_rate": 1.998054247971256e-05, + "loss": 0.3086, + "step": 312 + }, + { + "epoch": 0.14820952944658183, + "grad_norm": 1.5524837970733643, + "learning_rate": 1.9980222302308297e-05, + "loss": 0.3069, + "step": 313 + }, + { + "epoch": 0.1486830423202131, + "grad_norm": 1.973054051399231, + "learning_rate": 1.997989951467318e-05, + "loss": 0.3185, + "step": 314 + }, + { + "epoch": 0.14915655519384433, + "grad_norm": 1.6762253046035767, + "learning_rate": 1.9979574116891617e-05, + "loss": 0.3271, + "step": 315 + }, + { + "epoch": 0.14963006806747559, + "grad_norm": 1.4578509330749512, + "learning_rate": 1.997924610904872e-05, + "loss": 0.3082, + "step": 316 + }, + { + "epoch": 0.15010358094110685, + "grad_norm": 1.9425774812698364, + "learning_rate": 1.9978915491230277e-05, + "loss": 0.3229, + "step": 317 + }, + { + "epoch": 0.15057709381473808, + "grad_norm": 1.977746605873108, + "learning_rate": 1.997858226352275e-05, + "loss": 0.3359, + "step": 318 + }, + { + "epoch": 0.15105060668836934, + "grad_norm": 1.842903971672058, + "learning_rate": 1.9978246426013304e-05, + "loss": 0.3891, + "step": 319 + }, + { + "epoch": 0.1515241195620006, + "grad_norm": 1.3752530813217163, + "learning_rate": 1.9977907978789762e-05, + "loss": 0.3273, + "step": 320 + }, + { + "epoch": 0.15199763243563186, + "grad_norm": 1.6148236989974976, + "learning_rate": 1.997756692194065e-05, + "loss": 0.3787, + "step": 321 + }, + { + "epoch": 0.1524711453092631, + "grad_norm": 1.5016567707061768, + "learning_rate": 1.997722325555516e-05, + "loss": 0.2908, + "step": 322 + }, + { + "epoch": 0.15294465818289435, + "grad_norm": 1.3146488666534424, + "learning_rate": 1.9976876979723185e-05, + "loss": 0.2753, + "step": 323 + }, + { + "epoch": 0.1534181710565256, + "grad_norm": 1.3233283758163452, + "learning_rate": 1.9976528094535285e-05, + "loss": 0.3403, + "step": 324 + }, + { + "epoch": 0.15389168393015684, + "grad_norm": 1.2820795774459839, + "learning_rate": 1.9976176600082702e-05, + "loss": 0.3536, + "step": 325 + }, + { + "epoch": 0.1543651968037881, + "grad_norm": 1.5473982095718384, + "learning_rate": 1.9975822496457377e-05, + "loss": 0.3327, + "step": 326 + }, + { + "epoch": 0.15483870967741936, + "grad_norm": 2.2213194370269775, + "learning_rate": 1.9975465783751908e-05, + "loss": 0.3157, + "step": 327 + }, + { + "epoch": 0.1553122225510506, + "grad_norm": 2.196427345275879, + "learning_rate": 1.9975106462059604e-05, + "loss": 0.2918, + "step": 328 + }, + { + "epoch": 0.15578573542468185, + "grad_norm": 1.3942145109176636, + "learning_rate": 1.997474453147444e-05, + "loss": 0.3043, + "step": 329 + }, + { + "epoch": 0.1562592482983131, + "grad_norm": 1.2132917642593384, + "learning_rate": 1.9974379992091065e-05, + "loss": 0.2957, + "step": 330 + }, + { + "epoch": 0.15673276117194437, + "grad_norm": 1.470230221748352, + "learning_rate": 1.997401284400483e-05, + "loss": 0.3122, + "step": 331 + }, + { + "epoch": 0.1572062740455756, + "grad_norm": 2.0138354301452637, + "learning_rate": 1.9973643087311755e-05, + "loss": 0.3466, + "step": 332 + }, + { + "epoch": 0.15767978691920687, + "grad_norm": 1.662995457649231, + "learning_rate": 1.9973270722108553e-05, + "loss": 0.3182, + "step": 333 + }, + { + "epoch": 0.15815329979283813, + "grad_norm": 1.7974313497543335, + "learning_rate": 1.9972895748492603e-05, + "loss": 0.3393, + "step": 334 + }, + { + "epoch": 0.15862681266646936, + "grad_norm": 1.5902550220489502, + "learning_rate": 1.9972518166561983e-05, + "loss": 0.304, + "step": 335 + }, + { + "epoch": 0.15910032554010062, + "grad_norm": 1.5947198867797852, + "learning_rate": 1.997213797641544e-05, + "loss": 0.3209, + "step": 336 + }, + { + "epoch": 0.15957383841373188, + "grad_norm": 1.3484359979629517, + "learning_rate": 1.9971755178152416e-05, + "loss": 0.3136, + "step": 337 + }, + { + "epoch": 0.16004735128736314, + "grad_norm": 1.3645367622375488, + "learning_rate": 1.9971369771873022e-05, + "loss": 0.3269, + "step": 338 + }, + { + "epoch": 0.16052086416099437, + "grad_norm": 1.8705230951309204, + "learning_rate": 1.997098175767806e-05, + "loss": 0.2756, + "step": 339 + }, + { + "epoch": 0.16099437703462563, + "grad_norm": 1.3941823244094849, + "learning_rate": 1.997059113566901e-05, + "loss": 0.291, + "step": 340 + }, + { + "epoch": 0.1614678899082569, + "grad_norm": 1.3297516107559204, + "learning_rate": 1.9970197905948035e-05, + "loss": 0.3159, + "step": 341 + }, + { + "epoch": 0.16194140278188812, + "grad_norm": 1.6793006658554077, + "learning_rate": 1.996980206861799e-05, + "loss": 0.3268, + "step": 342 + }, + { + "epoch": 0.16241491565551938, + "grad_norm": 1.1595239639282227, + "learning_rate": 1.9969403623782385e-05, + "loss": 0.2932, + "step": 343 + }, + { + "epoch": 0.16288842852915064, + "grad_norm": 2.3051133155822754, + "learning_rate": 1.9969002571545442e-05, + "loss": 0.3267, + "step": 344 + }, + { + "epoch": 0.1633619414027819, + "grad_norm": 2.491314172744751, + "learning_rate": 1.996859891201205e-05, + "loss": 0.3367, + "step": 345 + }, + { + "epoch": 0.16383545427641313, + "grad_norm": 1.5665141344070435, + "learning_rate": 1.996819264528778e-05, + "loss": 0.3012, + "step": 346 + }, + { + "epoch": 0.1643089671500444, + "grad_norm": 2.1546201705932617, + "learning_rate": 1.996778377147889e-05, + "loss": 0.3249, + "step": 347 + }, + { + "epoch": 0.16478248002367565, + "grad_norm": 2.4622466564178467, + "learning_rate": 1.9967372290692314e-05, + "loss": 0.3302, + "step": 348 + }, + { + "epoch": 0.16525599289730689, + "grad_norm": 1.5641582012176514, + "learning_rate": 1.9966958203035673e-05, + "loss": 0.3561, + "step": 349 + }, + { + "epoch": 0.16572950577093815, + "grad_norm": 1.4802383184432983, + "learning_rate": 1.996654150861727e-05, + "loss": 0.2927, + "step": 350 + }, + { + "epoch": 0.1662030186445694, + "grad_norm": 2.025378942489624, + "learning_rate": 1.996612220754608e-05, + "loss": 0.3213, + "step": 351 + }, + { + "epoch": 0.16667653151820064, + "grad_norm": 2.12949275970459, + "learning_rate": 1.9965700299931772e-05, + "loss": 0.3446, + "step": 352 + }, + { + "epoch": 0.1671500443918319, + "grad_norm": 1.340847373008728, + "learning_rate": 1.9965275785884692e-05, + "loss": 0.308, + "step": 353 + }, + { + "epoch": 0.16762355726546316, + "grad_norm": 1.11890709400177, + "learning_rate": 1.9964848665515867e-05, + "loss": 0.2796, + "step": 354 + }, + { + "epoch": 0.16809707013909442, + "grad_norm": 1.3249599933624268, + "learning_rate": 1.9964418938937005e-05, + "loss": 0.3066, + "step": 355 + }, + { + "epoch": 0.16857058301272565, + "grad_norm": 2.209138870239258, + "learning_rate": 1.99639866062605e-05, + "loss": 0.3221, + "step": 356 + }, + { + "epoch": 0.1690440958863569, + "grad_norm": 1.4605731964111328, + "learning_rate": 1.9963551667599425e-05, + "loss": 0.3104, + "step": 357 + }, + { + "epoch": 0.16951760875998817, + "grad_norm": 1.4312578439712524, + "learning_rate": 1.9963114123067525e-05, + "loss": 0.3303, + "step": 358 + }, + { + "epoch": 0.1699911216336194, + "grad_norm": 1.3438875675201416, + "learning_rate": 1.9962673972779244e-05, + "loss": 0.327, + "step": 359 + }, + { + "epoch": 0.17046463450725066, + "grad_norm": 1.717059850692749, + "learning_rate": 1.99622312168497e-05, + "loss": 0.3151, + "step": 360 + }, + { + "epoch": 0.17093814738088192, + "grad_norm": 1.7237285375595093, + "learning_rate": 1.9961785855394685e-05, + "loss": 0.288, + "step": 361 + }, + { + "epoch": 0.17141166025451318, + "grad_norm": 1.342538833618164, + "learning_rate": 1.9961337888530686e-05, + "loss": 0.2842, + "step": 362 + }, + { + "epoch": 0.17188517312814441, + "grad_norm": 1.9628273248672485, + "learning_rate": 1.996088731637486e-05, + "loss": 0.3327, + "step": 363 + }, + { + "epoch": 0.17235868600177567, + "grad_norm": 1.262096643447876, + "learning_rate": 1.996043413904505e-05, + "loss": 0.2896, + "step": 364 + }, + { + "epoch": 0.17283219887540693, + "grad_norm": 1.4459971189498901, + "learning_rate": 1.995997835665978e-05, + "loss": 0.3512, + "step": 365 + }, + { + "epoch": 0.17330571174903817, + "grad_norm": 1.829930067062378, + "learning_rate": 1.9959519969338257e-05, + "loss": 0.3222, + "step": 366 + }, + { + "epoch": 0.17377922462266943, + "grad_norm": 1.5174282789230347, + "learning_rate": 1.9959058977200368e-05, + "loss": 0.3152, + "step": 367 + }, + { + "epoch": 0.17425273749630069, + "grad_norm": 2.4811291694641113, + "learning_rate": 1.9958595380366683e-05, + "loss": 0.2855, + "step": 368 + }, + { + "epoch": 0.17472625036993195, + "grad_norm": 1.502210259437561, + "learning_rate": 1.995812917895844e-05, + "loss": 0.3019, + "step": 369 + }, + { + "epoch": 0.17519976324356318, + "grad_norm": 2.2557857036590576, + "learning_rate": 1.9957660373097587e-05, + "loss": 0.3081, + "step": 370 + }, + { + "epoch": 0.17567327611719444, + "grad_norm": 1.5600831508636475, + "learning_rate": 1.9957188962906722e-05, + "loss": 0.2764, + "step": 371 + }, + { + "epoch": 0.1761467889908257, + "grad_norm": 1.6627660989761353, + "learning_rate": 1.9956714948509144e-05, + "loss": 0.2831, + "step": 372 + }, + { + "epoch": 0.17662030186445693, + "grad_norm": 1.7966787815093994, + "learning_rate": 1.9956238330028825e-05, + "loss": 0.3213, + "step": 373 + }, + { + "epoch": 0.1770938147380882, + "grad_norm": 1.3147883415222168, + "learning_rate": 1.9955759107590424e-05, + "loss": 0.2842, + "step": 374 + }, + { + "epoch": 0.17756732761171945, + "grad_norm": 1.6301828622817993, + "learning_rate": 1.9955277281319265e-05, + "loss": 0.288, + "step": 375 + }, + { + "epoch": 0.17804084048535068, + "grad_norm": 1.4622524976730347, + "learning_rate": 1.995479285134138e-05, + "loss": 0.3361, + "step": 376 + }, + { + "epoch": 0.17851435335898194, + "grad_norm": 1.8074785470962524, + "learning_rate": 1.9954305817783456e-05, + "loss": 0.3038, + "step": 377 + }, + { + "epoch": 0.1789878662326132, + "grad_norm": 1.4103055000305176, + "learning_rate": 1.995381618077288e-05, + "loss": 0.2992, + "step": 378 + }, + { + "epoch": 0.17946137910624446, + "grad_norm": 1.4178228378295898, + "learning_rate": 1.9953323940437707e-05, + "loss": 0.3074, + "step": 379 + }, + { + "epoch": 0.1799348919798757, + "grad_norm": 2.322828769683838, + "learning_rate": 1.9952829096906677e-05, + "loss": 0.3414, + "step": 380 + }, + { + "epoch": 0.18040840485350695, + "grad_norm": 1.5784039497375488, + "learning_rate": 1.9952331650309217e-05, + "loss": 0.3, + "step": 381 + }, + { + "epoch": 0.18088191772713821, + "grad_norm": 1.6129640340805054, + "learning_rate": 1.9951831600775423e-05, + "loss": 0.3698, + "step": 382 + }, + { + "epoch": 0.18135543060076945, + "grad_norm": 1.7583578824996948, + "learning_rate": 1.995132894843608e-05, + "loss": 0.312, + "step": 383 + }, + { + "epoch": 0.1818289434744007, + "grad_norm": 1.4428211450576782, + "learning_rate": 1.9950823693422653e-05, + "loss": 0.3401, + "step": 384 + }, + { + "epoch": 0.18230245634803197, + "grad_norm": 1.8162686824798584, + "learning_rate": 1.995031583586729e-05, + "loss": 0.2884, + "step": 385 + }, + { + "epoch": 0.18277596922166323, + "grad_norm": 1.871068000793457, + "learning_rate": 1.9949805375902807e-05, + "loss": 0.2974, + "step": 386 + }, + { + "epoch": 0.18324948209529446, + "grad_norm": 1.9816604852676392, + "learning_rate": 1.994929231366272e-05, + "loss": 0.269, + "step": 387 + }, + { + "epoch": 0.18372299496892572, + "grad_norm": 1.8843307495117188, + "learning_rate": 1.994877664928121e-05, + "loss": 0.3071, + "step": 388 + }, + { + "epoch": 0.18419650784255698, + "grad_norm": 1.7556949853897095, + "learning_rate": 1.994825838289314e-05, + "loss": 0.3656, + "step": 389 + }, + { + "epoch": 0.1846700207161882, + "grad_norm": 2.0573790073394775, + "learning_rate": 1.9947737514634068e-05, + "loss": 0.3047, + "step": 390 + }, + { + "epoch": 0.18514353358981947, + "grad_norm": 1.355928897857666, + "learning_rate": 1.9947214044640215e-05, + "loss": 0.3033, + "step": 391 + }, + { + "epoch": 0.18561704646345073, + "grad_norm": 1.6918185949325562, + "learning_rate": 1.9946687973048493e-05, + "loss": 0.2985, + "step": 392 + }, + { + "epoch": 0.186090559337082, + "grad_norm": 1.477439045906067, + "learning_rate": 1.9946159299996485e-05, + "loss": 0.3319, + "step": 393 + }, + { + "epoch": 0.18656407221071322, + "grad_norm": 1.5000855922698975, + "learning_rate": 1.9945628025622466e-05, + "loss": 0.2956, + "step": 394 + }, + { + "epoch": 0.18703758508434448, + "grad_norm": 1.4173663854599, + "learning_rate": 1.9945094150065385e-05, + "loss": 0.361, + "step": 395 + }, + { + "epoch": 0.18751109795797574, + "grad_norm": 1.6704801321029663, + "learning_rate": 1.9944557673464873e-05, + "loss": 0.3124, + "step": 396 + }, + { + "epoch": 0.18798461083160697, + "grad_norm": 1.5634368658065796, + "learning_rate": 1.9944018595961235e-05, + "loss": 0.294, + "step": 397 + }, + { + "epoch": 0.18845812370523823, + "grad_norm": 1.3400732278823853, + "learning_rate": 1.9943476917695465e-05, + "loss": 0.3401, + "step": 398 + }, + { + "epoch": 0.1889316365788695, + "grad_norm": 1.7150495052337646, + "learning_rate": 1.9942932638809233e-05, + "loss": 0.319, + "step": 399 + }, + { + "epoch": 0.18940514945250073, + "grad_norm": 1.299709439277649, + "learning_rate": 1.9942385759444892e-05, + "loss": 0.3081, + "step": 400 + }, + { + "epoch": 0.189878662326132, + "grad_norm": 1.4215167760849, + "learning_rate": 1.9941836279745473e-05, + "loss": 0.3435, + "step": 401 + }, + { + "epoch": 0.19035217519976325, + "grad_norm": 1.525974154472351, + "learning_rate": 1.9941284199854684e-05, + "loss": 0.2943, + "step": 402 + }, + { + "epoch": 0.1908256880733945, + "grad_norm": 1.4975392818450928, + "learning_rate": 1.994072951991692e-05, + "loss": 0.2931, + "step": 403 + }, + { + "epoch": 0.19129920094702574, + "grad_norm": 2.3207857608795166, + "learning_rate": 1.9940172240077248e-05, + "loss": 0.2704, + "step": 404 + }, + { + "epoch": 0.191772713820657, + "grad_norm": 1.4448798894882202, + "learning_rate": 1.993961236048142e-05, + "loss": 0.3027, + "step": 405 + }, + { + "epoch": 0.19224622669428826, + "grad_norm": 1.5526810884475708, + "learning_rate": 1.9939049881275868e-05, + "loss": 0.3239, + "step": 406 + }, + { + "epoch": 0.1927197395679195, + "grad_norm": 1.3078432083129883, + "learning_rate": 1.9938484802607704e-05, + "loss": 0.2749, + "step": 407 + }, + { + "epoch": 0.19319325244155075, + "grad_norm": 1.5397154092788696, + "learning_rate": 1.993791712462472e-05, + "loss": 0.3503, + "step": 408 + }, + { + "epoch": 0.193666765315182, + "grad_norm": 1.3340824842453003, + "learning_rate": 1.9937346847475382e-05, + "loss": 0.3272, + "step": 409 + }, + { + "epoch": 0.19414027818881327, + "grad_norm": 1.463257074356079, + "learning_rate": 1.9936773971308847e-05, + "loss": 0.3026, + "step": 410 + }, + { + "epoch": 0.1946137910624445, + "grad_norm": 1.3551300764083862, + "learning_rate": 1.993619849627494e-05, + "loss": 0.2841, + "step": 411 + }, + { + "epoch": 0.19508730393607576, + "grad_norm": 1.297788143157959, + "learning_rate": 1.9935620422524172e-05, + "loss": 0.2927, + "step": 412 + }, + { + "epoch": 0.19556081680970702, + "grad_norm": 1.5475691556930542, + "learning_rate": 1.993503975020773e-05, + "loss": 0.2562, + "step": 413 + }, + { + "epoch": 0.19603432968333825, + "grad_norm": 1.5862221717834473, + "learning_rate": 1.993445647947749e-05, + "loss": 0.2706, + "step": 414 + }, + { + "epoch": 0.19650784255696951, + "grad_norm": 1.496739387512207, + "learning_rate": 1.9933870610486e-05, + "loss": 0.2786, + "step": 415 + }, + { + "epoch": 0.19698135543060077, + "grad_norm": 1.2570983171463013, + "learning_rate": 1.9933282143386478e-05, + "loss": 0.2779, + "step": 416 + }, + { + "epoch": 0.19745486830423203, + "grad_norm": 1.8604596853256226, + "learning_rate": 1.9932691078332843e-05, + "loss": 0.307, + "step": 417 + }, + { + "epoch": 0.19792838117786327, + "grad_norm": 1.6916229724884033, + "learning_rate": 1.9932097415479683e-05, + "loss": 0.2797, + "step": 418 + }, + { + "epoch": 0.19840189405149453, + "grad_norm": 1.1430197954177856, + "learning_rate": 1.993150115498226e-05, + "loss": 0.2891, + "step": 419 + }, + { + "epoch": 0.19887540692512579, + "grad_norm": 1.5780760049819946, + "learning_rate": 1.9930902296996516e-05, + "loss": 0.2891, + "step": 420 + }, + { + "epoch": 0.19934891979875702, + "grad_norm": 1.7553094625473022, + "learning_rate": 1.993030084167908e-05, + "loss": 0.3078, + "step": 421 + }, + { + "epoch": 0.19982243267238828, + "grad_norm": 1.4407962560653687, + "learning_rate": 1.9929696789187264e-05, + "loss": 0.3151, + "step": 422 + }, + { + "epoch": 0.20029594554601954, + "grad_norm": 1.4529813528060913, + "learning_rate": 1.9929090139679045e-05, + "loss": 0.3176, + "step": 423 + }, + { + "epoch": 0.2007694584196508, + "grad_norm": 1.5530153512954712, + "learning_rate": 1.9928480893313082e-05, + "loss": 0.2996, + "step": 424 + }, + { + "epoch": 0.20124297129328203, + "grad_norm": 1.2986822128295898, + "learning_rate": 1.992786905024873e-05, + "loss": 0.3124, + "step": 425 + }, + { + "epoch": 0.2017164841669133, + "grad_norm": 1.4415348768234253, + "learning_rate": 1.9927254610646e-05, + "loss": 0.2665, + "step": 426 + }, + { + "epoch": 0.20218999704054455, + "grad_norm": 1.6581053733825684, + "learning_rate": 1.9926637574665598e-05, + "loss": 0.2962, + "step": 427 + }, + { + "epoch": 0.20266350991417578, + "grad_norm": 1.351175308227539, + "learning_rate": 1.9926017942468903e-05, + "loss": 0.299, + "step": 428 + }, + { + "epoch": 0.20313702278780704, + "grad_norm": 1.107627511024475, + "learning_rate": 1.992539571421797e-05, + "loss": 0.2441, + "step": 429 + }, + { + "epoch": 0.2036105356614383, + "grad_norm": 1.7610833644866943, + "learning_rate": 1.9924770890075544e-05, + "loss": 0.2864, + "step": 430 + }, + { + "epoch": 0.20408404853506953, + "grad_norm": 1.2733440399169922, + "learning_rate": 1.9924143470205034e-05, + "loss": 0.2957, + "step": 431 + }, + { + "epoch": 0.2045575614087008, + "grad_norm": 1.452659010887146, + "learning_rate": 1.992351345477054e-05, + "loss": 0.2657, + "step": 432 + }, + { + "epoch": 0.20503107428233205, + "grad_norm": 1.7691302299499512, + "learning_rate": 1.992288084393684e-05, + "loss": 0.3139, + "step": 433 + }, + { + "epoch": 0.20550458715596331, + "grad_norm": 1.0048437118530273, + "learning_rate": 1.9922245637869376e-05, + "loss": 0.2768, + "step": 434 + }, + { + "epoch": 0.20597810002959455, + "grad_norm": 1.4264775514602661, + "learning_rate": 1.9921607836734292e-05, + "loss": 0.3009, + "step": 435 + }, + { + "epoch": 0.2064516129032258, + "grad_norm": 1.2522767782211304, + "learning_rate": 1.9920967440698392e-05, + "loss": 0.3091, + "step": 436 + }, + { + "epoch": 0.20692512577685707, + "grad_norm": 1.4616775512695312, + "learning_rate": 1.992032444992917e-05, + "loss": 0.2709, + "step": 437 + }, + { + "epoch": 0.2073986386504883, + "grad_norm": 2.048379421234131, + "learning_rate": 1.9919678864594788e-05, + "loss": 0.2922, + "step": 438 + }, + { + "epoch": 0.20787215152411956, + "grad_norm": 2.135870933532715, + "learning_rate": 1.9919030684864097e-05, + "loss": 0.3139, + "step": 439 + }, + { + "epoch": 0.20834566439775082, + "grad_norm": 1.5550521612167358, + "learning_rate": 1.991837991090662e-05, + "loss": 0.2967, + "step": 440 + }, + { + "epoch": 0.20881917727138208, + "grad_norm": 1.4404891729354858, + "learning_rate": 1.9917726542892562e-05, + "loss": 0.2625, + "step": 441 + }, + { + "epoch": 0.2092926901450133, + "grad_norm": 1.4406466484069824, + "learning_rate": 1.9917070580992805e-05, + "loss": 0.2846, + "step": 442 + }, + { + "epoch": 0.20976620301864457, + "grad_norm": 2.054635524749756, + "learning_rate": 1.9916412025378907e-05, + "loss": 0.3299, + "step": 443 + }, + { + "epoch": 0.21023971589227583, + "grad_norm": 1.5401420593261719, + "learning_rate": 1.9915750876223112e-05, + "loss": 0.2995, + "step": 444 + }, + { + "epoch": 0.21071322876590706, + "grad_norm": 1.5220952033996582, + "learning_rate": 1.9915087133698333e-05, + "loss": 0.2638, + "step": 445 + }, + { + "epoch": 0.21118674163953832, + "grad_norm": 1.1745985746383667, + "learning_rate": 1.9914420797978167e-05, + "loss": 0.2603, + "step": 446 + }, + { + "epoch": 0.21166025451316958, + "grad_norm": 1.6358566284179688, + "learning_rate": 1.9913751869236888e-05, + "loss": 0.2847, + "step": 447 + }, + { + "epoch": 0.21213376738680084, + "grad_norm": 1.0750114917755127, + "learning_rate": 1.9913080347649446e-05, + "loss": 0.2783, + "step": 448 + }, + { + "epoch": 0.21260728026043207, + "grad_norm": 2.263500928878784, + "learning_rate": 1.9912406233391474e-05, + "loss": 0.2818, + "step": 449 + }, + { + "epoch": 0.21308079313406333, + "grad_norm": 1.2337026596069336, + "learning_rate": 1.991172952663928e-05, + "loss": 0.2995, + "step": 450 + }, + { + "epoch": 0.2135543060076946, + "grad_norm": 1.5734847784042358, + "learning_rate": 1.9911050227569845e-05, + "loss": 0.2681, + "step": 451 + }, + { + "epoch": 0.21402781888132583, + "grad_norm": 1.5128898620605469, + "learning_rate": 1.9910368336360836e-05, + "loss": 0.3128, + "step": 452 + }, + { + "epoch": 0.2145013317549571, + "grad_norm": 2.963315486907959, + "learning_rate": 1.99096838531906e-05, + "loss": 0.3157, + "step": 453 + }, + { + "epoch": 0.21497484462858835, + "grad_norm": 1.357264757156372, + "learning_rate": 1.990899677823815e-05, + "loss": 0.3173, + "step": 454 + }, + { + "epoch": 0.21544835750221958, + "grad_norm": 1.2650480270385742, + "learning_rate": 1.9908307111683184e-05, + "loss": 0.2686, + "step": 455 + }, + { + "epoch": 0.21592187037585084, + "grad_norm": 1.538608193397522, + "learning_rate": 1.990761485370608e-05, + "loss": 0.3223, + "step": 456 + }, + { + "epoch": 0.2163953832494821, + "grad_norm": 1.675292730331421, + "learning_rate": 1.9906920004487894e-05, + "loss": 0.292, + "step": 457 + }, + { + "epoch": 0.21686889612311336, + "grad_norm": 1.2939106225967407, + "learning_rate": 1.9906222564210353e-05, + "loss": 0.2695, + "step": 458 + }, + { + "epoch": 0.2173424089967446, + "grad_norm": 1.1748974323272705, + "learning_rate": 1.990552253305587e-05, + "loss": 0.3129, + "step": 459 + }, + { + "epoch": 0.21781592187037585, + "grad_norm": 1.7201486825942993, + "learning_rate": 1.9904819911207526e-05, + "loss": 0.3042, + "step": 460 + }, + { + "epoch": 0.2182894347440071, + "grad_norm": 1.790076732635498, + "learning_rate": 1.990411469884909e-05, + "loss": 0.3384, + "step": 461 + }, + { + "epoch": 0.21876294761763834, + "grad_norm": 1.5797431468963623, + "learning_rate": 1.9903406896165e-05, + "loss": 0.3244, + "step": 462 + }, + { + "epoch": 0.2192364604912696, + "grad_norm": 1.390303373336792, + "learning_rate": 1.9902696503340378e-05, + "loss": 0.2862, + "step": 463 + }, + { + "epoch": 0.21970997336490086, + "grad_norm": 1.6508777141571045, + "learning_rate": 1.990198352056102e-05, + "loss": 0.313, + "step": 464 + }, + { + "epoch": 0.22018348623853212, + "grad_norm": 1.5535106658935547, + "learning_rate": 1.990126794801339e-05, + "loss": 0.313, + "step": 465 + }, + { + "epoch": 0.22065699911216335, + "grad_norm": 1.5495903491973877, + "learning_rate": 1.9900549785884654e-05, + "loss": 0.2866, + "step": 466 + }, + { + "epoch": 0.22113051198579461, + "grad_norm": 1.5182076692581177, + "learning_rate": 1.989982903436263e-05, + "loss": 0.2786, + "step": 467 + }, + { + "epoch": 0.22160402485942587, + "grad_norm": 1.5054259300231934, + "learning_rate": 1.989910569363583e-05, + "loss": 0.2599, + "step": 468 + }, + { + "epoch": 0.2220775377330571, + "grad_norm": 1.793799877166748, + "learning_rate": 1.989837976389344e-05, + "loss": 0.3061, + "step": 469 + }, + { + "epoch": 0.22255105060668837, + "grad_norm": 1.3098176717758179, + "learning_rate": 1.9897651245325306e-05, + "loss": 0.268, + "step": 470 + }, + { + "epoch": 0.22302456348031963, + "grad_norm": 1.5548096895217896, + "learning_rate": 1.9896920138121977e-05, + "loss": 0.2858, + "step": 471 + }, + { + "epoch": 0.2234980763539509, + "grad_norm": 1.5963671207427979, + "learning_rate": 1.989618644247466e-05, + "loss": 0.3073, + "step": 472 + }, + { + "epoch": 0.22397158922758212, + "grad_norm": 1.266221046447754, + "learning_rate": 1.989545015857525e-05, + "loss": 0.2709, + "step": 473 + }, + { + "epoch": 0.22444510210121338, + "grad_norm": 1.2214033603668213, + "learning_rate": 1.9894711286616313e-05, + "loss": 0.2725, + "step": 474 + }, + { + "epoch": 0.22491861497484464, + "grad_norm": 1.5835678577423096, + "learning_rate": 1.98939698267911e-05, + "loss": 0.2411, + "step": 475 + }, + { + "epoch": 0.22539212784847587, + "grad_norm": 1.4112712144851685, + "learning_rate": 1.989322577929352e-05, + "loss": 0.2913, + "step": 476 + }, + { + "epoch": 0.22586564072210713, + "grad_norm": 1.335115909576416, + "learning_rate": 1.9892479144318187e-05, + "loss": 0.2936, + "step": 477 + }, + { + "epoch": 0.2263391535957384, + "grad_norm": 1.4088594913482666, + "learning_rate": 1.989172992206036e-05, + "loss": 0.3129, + "step": 478 + }, + { + "epoch": 0.22681266646936962, + "grad_norm": 1.2447527647018433, + "learning_rate": 1.989097811271601e-05, + "loss": 0.3029, + "step": 479 + }, + { + "epoch": 0.22728617934300088, + "grad_norm": 1.2852405309677124, + "learning_rate": 1.9890223716481746e-05, + "loss": 0.2788, + "step": 480 + }, + { + "epoch": 0.22775969221663214, + "grad_norm": 2.38966703414917, + "learning_rate": 1.9889466733554883e-05, + "loss": 0.3573, + "step": 481 + }, + { + "epoch": 0.2282332050902634, + "grad_norm": 1.3226877450942993, + "learning_rate": 1.9888707164133403e-05, + "loss": 0.2997, + "step": 482 + }, + { + "epoch": 0.22870671796389463, + "grad_norm": 1.2726070880889893, + "learning_rate": 1.988794500841596e-05, + "loss": 0.27, + "step": 483 + }, + { + "epoch": 0.2291802308375259, + "grad_norm": 1.3730125427246094, + "learning_rate": 1.9887180266601892e-05, + "loss": 0.2917, + "step": 484 + }, + { + "epoch": 0.22965374371115715, + "grad_norm": 1.4895507097244263, + "learning_rate": 1.988641293889121e-05, + "loss": 0.3041, + "step": 485 + }, + { + "epoch": 0.2301272565847884, + "grad_norm": 1.5664935111999512, + "learning_rate": 1.9885643025484598e-05, + "loss": 0.2987, + "step": 486 + }, + { + "epoch": 0.23060076945841965, + "grad_norm": 1.38105046749115, + "learning_rate": 1.988487052658342e-05, + "loss": 0.2739, + "step": 487 + }, + { + "epoch": 0.2310742823320509, + "grad_norm": 1.3974148035049438, + "learning_rate": 1.988409544238972e-05, + "loss": 0.294, + "step": 488 + }, + { + "epoch": 0.23154779520568217, + "grad_norm": 1.1836129426956177, + "learning_rate": 1.988331777310621e-05, + "loss": 0.2847, + "step": 489 + }, + { + "epoch": 0.2320213080793134, + "grad_norm": 1.9731239080429077, + "learning_rate": 1.9882537518936283e-05, + "loss": 0.2773, + "step": 490 + }, + { + "epoch": 0.23249482095294466, + "grad_norm": 1.8344943523406982, + "learning_rate": 1.9881754680084e-05, + "loss": 0.3044, + "step": 491 + }, + { + "epoch": 0.23296833382657592, + "grad_norm": 1.9124189615249634, + "learning_rate": 1.988096925675412e-05, + "loss": 0.2658, + "step": 492 + }, + { + "epoch": 0.23344184670020715, + "grad_norm": 1.1877110004425049, + "learning_rate": 1.988018124915205e-05, + "loss": 0.2978, + "step": 493 + }, + { + "epoch": 0.2339153595738384, + "grad_norm": 3.0704288482666016, + "learning_rate": 1.987939065748389e-05, + "loss": 0.2754, + "step": 494 + }, + { + "epoch": 0.23438887244746967, + "grad_norm": 2.0718958377838135, + "learning_rate": 1.9878597481956416e-05, + "loss": 0.2838, + "step": 495 + }, + { + "epoch": 0.23486238532110093, + "grad_norm": 1.83311927318573, + "learning_rate": 1.9877801722777064e-05, + "loss": 0.2823, + "step": 496 + }, + { + "epoch": 0.23533589819473216, + "grad_norm": 1.3887392282485962, + "learning_rate": 1.9877003380153968e-05, + "loss": 0.2805, + "step": 497 + }, + { + "epoch": 0.23580941106836342, + "grad_norm": 1.7678110599517822, + "learning_rate": 1.9876202454295926e-05, + "loss": 0.2892, + "step": 498 + }, + { + "epoch": 0.23628292394199468, + "grad_norm": 1.2601211071014404, + "learning_rate": 1.9875398945412403e-05, + "loss": 0.3009, + "step": 499 + }, + { + "epoch": 0.23675643681562591, + "grad_norm": 1.4841456413269043, + "learning_rate": 1.987459285371356e-05, + "loss": 0.2835, + "step": 500 + }, + { + "epoch": 0.23722994968925717, + "grad_norm": 1.53849196434021, + "learning_rate": 1.9873784179410217e-05, + "loss": 0.2825, + "step": 501 + }, + { + "epoch": 0.23770346256288843, + "grad_norm": 1.1709768772125244, + "learning_rate": 1.9872972922713875e-05, + "loss": 0.2877, + "step": 502 + }, + { + "epoch": 0.23817697543651967, + "grad_norm": 1.4577317237854004, + "learning_rate": 1.9872159083836713e-05, + "loss": 0.2704, + "step": 503 + }, + { + "epoch": 0.23865048831015093, + "grad_norm": 1.207236886024475, + "learning_rate": 1.9871342662991582e-05, + "loss": 0.3042, + "step": 504 + }, + { + "epoch": 0.2391240011837822, + "grad_norm": 1.409219741821289, + "learning_rate": 1.9870523660392004e-05, + "loss": 0.2925, + "step": 505 + }, + { + "epoch": 0.23959751405741345, + "grad_norm": 2.00093412399292, + "learning_rate": 1.986970207625219e-05, + "loss": 0.3005, + "step": 506 + }, + { + "epoch": 0.24007102693104468, + "grad_norm": 1.3477380275726318, + "learning_rate": 1.986887791078701e-05, + "loss": 0.2754, + "step": 507 + }, + { + "epoch": 0.24054453980467594, + "grad_norm": 1.0551843643188477, + "learning_rate": 1.9868051164212017e-05, + "loss": 0.3066, + "step": 508 + }, + { + "epoch": 0.2410180526783072, + "grad_norm": 1.1477186679840088, + "learning_rate": 1.986722183674344e-05, + "loss": 0.2924, + "step": 509 + }, + { + "epoch": 0.24149156555193843, + "grad_norm": 1.6527049541473389, + "learning_rate": 1.9866389928598188e-05, + "loss": 0.3204, + "step": 510 + }, + { + "epoch": 0.2419650784255697, + "grad_norm": 1.1355421543121338, + "learning_rate": 1.986555543999383e-05, + "loss": 0.2904, + "step": 511 + }, + { + "epoch": 0.24243859129920095, + "grad_norm": 1.9797145128250122, + "learning_rate": 1.9864718371148623e-05, + "loss": 0.2834, + "step": 512 + }, + { + "epoch": 0.2429121041728322, + "grad_norm": 2.0345993041992188, + "learning_rate": 1.9863878722281492e-05, + "loss": 0.2688, + "step": 513 + }, + { + "epoch": 0.24338561704646344, + "grad_norm": 2.046318769454956, + "learning_rate": 1.986303649361204e-05, + "loss": 0.2846, + "step": 514 + }, + { + "epoch": 0.2438591299200947, + "grad_norm": 1.3903858661651611, + "learning_rate": 1.986219168536054e-05, + "loss": 0.2915, + "step": 515 + }, + { + "epoch": 0.24433264279372596, + "grad_norm": 1.3947668075561523, + "learning_rate": 1.986134429774795e-05, + "loss": 0.3142, + "step": 516 + }, + { + "epoch": 0.2448061556673572, + "grad_norm": 1.6972023248672485, + "learning_rate": 1.9860494330995892e-05, + "loss": 0.2862, + "step": 517 + }, + { + "epoch": 0.24527966854098845, + "grad_norm": 1.431135892868042, + "learning_rate": 1.9859641785326672e-05, + "loss": 0.2691, + "step": 518 + }, + { + "epoch": 0.24575318141461971, + "grad_norm": 1.653490424156189, + "learning_rate": 1.9858786660963253e-05, + "loss": 0.2368, + "step": 519 + }, + { + "epoch": 0.24622669428825097, + "grad_norm": 2.2051680088043213, + "learning_rate": 1.98579289581293e-05, + "loss": 0.294, + "step": 520 + }, + { + "epoch": 0.2467002071618822, + "grad_norm": 2.165895938873291, + "learning_rate": 1.9857068677049124e-05, + "loss": 0.2745, + "step": 521 + }, + { + "epoch": 0.24717372003551347, + "grad_norm": 2.177927255630493, + "learning_rate": 1.9856205817947728e-05, + "loss": 0.3092, + "step": 522 + }, + { + "epoch": 0.24764723290914473, + "grad_norm": 1.3167831897735596, + "learning_rate": 1.9855340381050787e-05, + "loss": 0.2428, + "step": 523 + }, + { + "epoch": 0.24812074578277596, + "grad_norm": 1.1702346801757812, + "learning_rate": 1.9854472366584646e-05, + "loss": 0.2834, + "step": 524 + }, + { + "epoch": 0.24859425865640722, + "grad_norm": 1.4927124977111816, + "learning_rate": 1.9853601774776322e-05, + "loss": 0.2695, + "step": 525 + }, + { + "epoch": 0.24906777153003848, + "grad_norm": 2.9464986324310303, + "learning_rate": 1.9852728605853516e-05, + "loss": 0.278, + "step": 526 + }, + { + "epoch": 0.24954128440366974, + "grad_norm": 1.3696672916412354, + "learning_rate": 1.9851852860044594e-05, + "loss": 0.2743, + "step": 527 + }, + { + "epoch": 0.25001479727730097, + "grad_norm": 1.6753240823745728, + "learning_rate": 1.9850974537578597e-05, + "loss": 0.2999, + "step": 528 + }, + { + "epoch": 0.2504883101509322, + "grad_norm": 1.2681008577346802, + "learning_rate": 1.9850093638685247e-05, + "loss": 0.2763, + "step": 529 + }, + { + "epoch": 0.2509618230245635, + "grad_norm": 1.4322853088378906, + "learning_rate": 1.984921016359493e-05, + "loss": 0.301, + "step": 530 + }, + { + "epoch": 0.2514353358981947, + "grad_norm": 1.3877912759780884, + "learning_rate": 1.984832411253871e-05, + "loss": 0.2866, + "step": 531 + }, + { + "epoch": 0.251908848771826, + "grad_norm": 1.4901669025421143, + "learning_rate": 1.9847435485748328e-05, + "loss": 0.3069, + "step": 532 + }, + { + "epoch": 0.25238236164545724, + "grad_norm": 1.5434002876281738, + "learning_rate": 1.9846544283456195e-05, + "loss": 0.2884, + "step": 533 + }, + { + "epoch": 0.2528558745190885, + "grad_norm": 1.3284276723861694, + "learning_rate": 1.9845650505895397e-05, + "loss": 0.2709, + "step": 534 + }, + { + "epoch": 0.25332938739271976, + "grad_norm": 1.1716914176940918, + "learning_rate": 1.9844754153299686e-05, + "loss": 0.273, + "step": 535 + }, + { + "epoch": 0.253802900266351, + "grad_norm": 1.208626389503479, + "learning_rate": 1.984385522590351e-05, + "loss": 0.2616, + "step": 536 + }, + { + "epoch": 0.2542764131399822, + "grad_norm": 1.6577744483947754, + "learning_rate": 1.9842953723941954e-05, + "loss": 0.2702, + "step": 537 + }, + { + "epoch": 0.2547499260136135, + "grad_norm": 1.4658805131912231, + "learning_rate": 1.9842049647650815e-05, + "loss": 0.2848, + "step": 538 + }, + { + "epoch": 0.25522343888724475, + "grad_norm": 1.4830657243728638, + "learning_rate": 1.984114299726654e-05, + "loss": 0.2536, + "step": 539 + }, + { + "epoch": 0.255696951760876, + "grad_norm": 1.5815433263778687, + "learning_rate": 1.984023377302625e-05, + "loss": 0.2853, + "step": 540 + }, + { + "epoch": 0.25617046463450727, + "grad_norm": 1.2159545421600342, + "learning_rate": 1.9839321975167747e-05, + "loss": 0.3015, + "step": 541 + }, + { + "epoch": 0.2566439775081385, + "grad_norm": 1.711661696434021, + "learning_rate": 1.9838407603929503e-05, + "loss": 0.2924, + "step": 542 + }, + { + "epoch": 0.25711749038176973, + "grad_norm": 1.3681540489196777, + "learning_rate": 1.9837490659550665e-05, + "loss": 0.2999, + "step": 543 + }, + { + "epoch": 0.257591003255401, + "grad_norm": 1.9190740585327148, + "learning_rate": 1.983657114227105e-05, + "loss": 0.3154, + "step": 544 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 1.5599863529205322, + "learning_rate": 1.9835649052331143e-05, + "loss": 0.2775, + "step": 545 + }, + { + "epoch": 0.2585380290026635, + "grad_norm": 1.3822370767593384, + "learning_rate": 1.9834724389972116e-05, + "loss": 0.2659, + "step": 546 + }, + { + "epoch": 0.25901154187629477, + "grad_norm": 1.2202012538909912, + "learning_rate": 1.98337971554358e-05, + "loss": 0.2835, + "step": 547 + }, + { + "epoch": 0.259485054749926, + "grad_norm": 1.225102186203003, + "learning_rate": 1.9832867348964707e-05, + "loss": 0.3081, + "step": 548 + }, + { + "epoch": 0.2599585676235573, + "grad_norm": 1.9328206777572632, + "learning_rate": 1.983193497080202e-05, + "loss": 0.2807, + "step": 549 + }, + { + "epoch": 0.2604320804971885, + "grad_norm": 3.2432518005371094, + "learning_rate": 1.9831000021191592e-05, + "loss": 0.3038, + "step": 550 + }, + { + "epoch": 0.26090559337081975, + "grad_norm": 1.950760841369629, + "learning_rate": 1.9830062500377945e-05, + "loss": 0.3174, + "step": 551 + }, + { + "epoch": 0.26137910624445104, + "grad_norm": 1.1726529598236084, + "learning_rate": 1.9829122408606288e-05, + "loss": 0.2757, + "step": 552 + }, + { + "epoch": 0.2618526191180823, + "grad_norm": 1.5435218811035156, + "learning_rate": 1.9828179746122487e-05, + "loss": 0.301, + "step": 553 + }, + { + "epoch": 0.2623261319917135, + "grad_norm": 2.2376794815063477, + "learning_rate": 1.9827234513173087e-05, + "loss": 0.2955, + "step": 554 + }, + { + "epoch": 0.2627996448653448, + "grad_norm": 2.062591791152954, + "learning_rate": 1.9826286710005305e-05, + "loss": 0.255, + "step": 555 + }, + { + "epoch": 0.263273157738976, + "grad_norm": 1.1453508138656616, + "learning_rate": 1.982533633686703e-05, + "loss": 0.2597, + "step": 556 + }, + { + "epoch": 0.26374667061260726, + "grad_norm": 1.465489387512207, + "learning_rate": 1.9824383394006825e-05, + "loss": 0.2905, + "step": 557 + }, + { + "epoch": 0.26422018348623855, + "grad_norm": 1.4714313745498657, + "learning_rate": 1.9823427881673916e-05, + "loss": 0.2584, + "step": 558 + }, + { + "epoch": 0.2646936963598698, + "grad_norm": 1.2892956733703613, + "learning_rate": 1.9822469800118215e-05, + "loss": 0.2792, + "step": 559 + }, + { + "epoch": 0.265167209233501, + "grad_norm": 1.3988784551620483, + "learning_rate": 1.98215091495903e-05, + "loss": 0.2749, + "step": 560 + }, + { + "epoch": 0.2656407221071323, + "grad_norm": 1.2862416505813599, + "learning_rate": 1.9820545930341413e-05, + "loss": 0.2765, + "step": 561 + }, + { + "epoch": 0.26611423498076353, + "grad_norm": 1.5326262712478638, + "learning_rate": 1.981958014262348e-05, + "loss": 0.3028, + "step": 562 + }, + { + "epoch": 0.2665877478543948, + "grad_norm": 0.968665361404419, + "learning_rate": 1.981861178668909e-05, + "loss": 0.255, + "step": 563 + }, + { + "epoch": 0.26706126072802605, + "grad_norm": 1.944764494895935, + "learning_rate": 1.981764086279151e-05, + "loss": 0.292, + "step": 564 + }, + { + "epoch": 0.2675347736016573, + "grad_norm": 1.2996660470962524, + "learning_rate": 1.9816667371184677e-05, + "loss": 0.2615, + "step": 565 + }, + { + "epoch": 0.26800828647528857, + "grad_norm": 1.362474799156189, + "learning_rate": 1.9815691312123194e-05, + "loss": 0.2889, + "step": 566 + }, + { + "epoch": 0.2684817993489198, + "grad_norm": 1.6865662336349487, + "learning_rate": 1.9814712685862342e-05, + "loss": 0.2745, + "step": 567 + }, + { + "epoch": 0.26895531222255104, + "grad_norm": 1.5344867706298828, + "learning_rate": 1.9813731492658073e-05, + "loss": 0.2901, + "step": 568 + }, + { + "epoch": 0.2694288250961823, + "grad_norm": 1.4177885055541992, + "learning_rate": 1.9812747732767006e-05, + "loss": 0.2864, + "step": 569 + }, + { + "epoch": 0.26990233796981355, + "grad_norm": 1.1646602153778076, + "learning_rate": 1.981176140644643e-05, + "loss": 0.2668, + "step": 570 + }, + { + "epoch": 0.2703758508434448, + "grad_norm": 1.417022943496704, + "learning_rate": 1.981077251395432e-05, + "loss": 0.2761, + "step": 571 + }, + { + "epoch": 0.2708493637170761, + "grad_norm": 1.7936999797821045, + "learning_rate": 1.9809781055549308e-05, + "loss": 0.2899, + "step": 572 + }, + { + "epoch": 0.2713228765907073, + "grad_norm": 2.242428779602051, + "learning_rate": 1.980878703149069e-05, + "loss": 0.2971, + "step": 573 + }, + { + "epoch": 0.27179638946433854, + "grad_norm": 1.8133538961410522, + "learning_rate": 1.980779044203845e-05, + "loss": 0.2789, + "step": 574 + }, + { + "epoch": 0.2722699023379698, + "grad_norm": 1.2497389316558838, + "learning_rate": 1.9806791287453245e-05, + "loss": 0.2885, + "step": 575 + }, + { + "epoch": 0.27274341521160106, + "grad_norm": 1.5333000421524048, + "learning_rate": 1.9805789567996384e-05, + "loss": 0.2917, + "step": 576 + }, + { + "epoch": 0.2732169280852323, + "grad_norm": 2.611602306365967, + "learning_rate": 1.9804785283929858e-05, + "loss": 0.2752, + "step": 577 + }, + { + "epoch": 0.2736904409588636, + "grad_norm": 1.3517531156539917, + "learning_rate": 1.9803778435516327e-05, + "loss": 0.2619, + "step": 578 + }, + { + "epoch": 0.2741639538324948, + "grad_norm": 1.452409267425537, + "learning_rate": 1.9802769023019128e-05, + "loss": 0.2735, + "step": 579 + }, + { + "epoch": 0.2746374667061261, + "grad_norm": 1.5018271207809448, + "learning_rate": 1.980175704670226e-05, + "loss": 0.3296, + "step": 580 + }, + { + "epoch": 0.27511097957975733, + "grad_norm": 2.2614331245422363, + "learning_rate": 1.9800742506830394e-05, + "loss": 0.2969, + "step": 581 + }, + { + "epoch": 0.27558449245338856, + "grad_norm": 1.6071243286132812, + "learning_rate": 1.979972540366888e-05, + "loss": 0.3061, + "step": 582 + }, + { + "epoch": 0.27605800532701985, + "grad_norm": 1.25821852684021, + "learning_rate": 1.979870573748372e-05, + "loss": 0.2715, + "step": 583 + }, + { + "epoch": 0.2765315182006511, + "grad_norm": 1.5205860137939453, + "learning_rate": 1.9797683508541606e-05, + "loss": 0.3016, + "step": 584 + }, + { + "epoch": 0.2770050310742823, + "grad_norm": 1.3796861171722412, + "learning_rate": 1.9796658717109892e-05, + "loss": 0.2868, + "step": 585 + }, + { + "epoch": 0.2774785439479136, + "grad_norm": 1.5328634977340698, + "learning_rate": 1.97956313634566e-05, + "loss": 0.2787, + "step": 586 + }, + { + "epoch": 0.27795205682154484, + "grad_norm": 1.6029144525527954, + "learning_rate": 1.9794601447850424e-05, + "loss": 0.2508, + "step": 587 + }, + { + "epoch": 0.27842556969517607, + "grad_norm": 1.3855677843093872, + "learning_rate": 1.9793568970560728e-05, + "loss": 0.2723, + "step": 588 + }, + { + "epoch": 0.27889908256880735, + "grad_norm": 1.2493582963943481, + "learning_rate": 1.979253393185755e-05, + "loss": 0.2589, + "step": 589 + }, + { + "epoch": 0.2793725954424386, + "grad_norm": 1.4009218215942383, + "learning_rate": 1.9791496332011593e-05, + "loss": 0.2998, + "step": 590 + }, + { + "epoch": 0.2798461083160698, + "grad_norm": 2.099388837814331, + "learning_rate": 1.979045617129423e-05, + "loss": 0.292, + "step": 591 + }, + { + "epoch": 0.2803196211897011, + "grad_norm": 1.3639984130859375, + "learning_rate": 1.9789413449977505e-05, + "loss": 0.2717, + "step": 592 + }, + { + "epoch": 0.28079313406333234, + "grad_norm": 1.2905452251434326, + "learning_rate": 1.9788368168334135e-05, + "loss": 0.245, + "step": 593 + }, + { + "epoch": 0.28126664693696357, + "grad_norm": 1.319740653038025, + "learning_rate": 1.9787320326637494e-05, + "loss": 0.259, + "step": 594 + }, + { + "epoch": 0.28174015981059486, + "grad_norm": 1.6142657995224, + "learning_rate": 1.9786269925161646e-05, + "loss": 0.2819, + "step": 595 + }, + { + "epoch": 0.2822136726842261, + "grad_norm": 1.273977279663086, + "learning_rate": 1.978521696418131e-05, + "loss": 0.2574, + "step": 596 + }, + { + "epoch": 0.2826871855578574, + "grad_norm": 1.7523685693740845, + "learning_rate": 1.9784161443971878e-05, + "loss": 0.2703, + "step": 597 + }, + { + "epoch": 0.2831606984314886, + "grad_norm": 1.5006474256515503, + "learning_rate": 1.9783103364809405e-05, + "loss": 0.3043, + "step": 598 + }, + { + "epoch": 0.28363421130511984, + "grad_norm": 1.1372324228286743, + "learning_rate": 1.978204272697063e-05, + "loss": 0.2793, + "step": 599 + }, + { + "epoch": 0.28410772417875113, + "grad_norm": 1.5428109169006348, + "learning_rate": 1.9780979530732947e-05, + "loss": 0.2455, + "step": 600 + }, + { + "epoch": 0.28458123705238236, + "grad_norm": 1.6741421222686768, + "learning_rate": 1.9779913776374427e-05, + "loss": 0.2767, + "step": 601 + }, + { + "epoch": 0.2850547499260136, + "grad_norm": 1.1833195686340332, + "learning_rate": 1.9778845464173805e-05, + "loss": 0.2639, + "step": 602 + }, + { + "epoch": 0.2855282627996449, + "grad_norm": 1.1689391136169434, + "learning_rate": 1.9777774594410495e-05, + "loss": 0.2601, + "step": 603 + }, + { + "epoch": 0.2860017756732761, + "grad_norm": 1.502760887145996, + "learning_rate": 1.9776701167364565e-05, + "loss": 0.3045, + "step": 604 + }, + { + "epoch": 0.28647528854690735, + "grad_norm": 1.974056601524353, + "learning_rate": 1.977562518331676e-05, + "loss": 0.2785, + "step": 605 + }, + { + "epoch": 0.28694880142053864, + "grad_norm": 1.409839391708374, + "learning_rate": 1.97745466425485e-05, + "loss": 0.2732, + "step": 606 + }, + { + "epoch": 0.28742231429416987, + "grad_norm": 1.0581002235412598, + "learning_rate": 1.9773465545341855e-05, + "loss": 0.2569, + "step": 607 + }, + { + "epoch": 0.2878958271678011, + "grad_norm": 1.3603756427764893, + "learning_rate": 1.977238189197959e-05, + "loss": 0.2536, + "step": 608 + }, + { + "epoch": 0.2883693400414324, + "grad_norm": 1.2487221956253052, + "learning_rate": 1.9771295682745115e-05, + "loss": 0.2998, + "step": 609 + }, + { + "epoch": 0.2888428529150636, + "grad_norm": 1.1602859497070312, + "learning_rate": 1.977020691792252e-05, + "loss": 0.2476, + "step": 610 + }, + { + "epoch": 0.2893163657886949, + "grad_norm": 1.3260509967803955, + "learning_rate": 1.976911559779656e-05, + "loss": 0.2972, + "step": 611 + }, + { + "epoch": 0.28978987866232614, + "grad_norm": 1.3510822057724, + "learning_rate": 1.976802172265266e-05, + "loss": 0.2658, + "step": 612 + }, + { + "epoch": 0.29026339153595737, + "grad_norm": 2.0700032711029053, + "learning_rate": 1.976692529277691e-05, + "loss": 0.2471, + "step": 613 + }, + { + "epoch": 0.29073690440958866, + "grad_norm": 1.5847715139389038, + "learning_rate": 1.9765826308456075e-05, + "loss": 0.2757, + "step": 614 + }, + { + "epoch": 0.2912104172832199, + "grad_norm": 1.1467682123184204, + "learning_rate": 1.976472476997758e-05, + "loss": 0.2733, + "step": 615 + }, + { + "epoch": 0.2916839301568511, + "grad_norm": 1.1180078983306885, + "learning_rate": 1.9763620677629525e-05, + "loss": 0.2494, + "step": 616 + }, + { + "epoch": 0.2921574430304824, + "grad_norm": 2.1236302852630615, + "learning_rate": 1.9762514031700673e-05, + "loss": 0.2934, + "step": 617 + }, + { + "epoch": 0.29263095590411364, + "grad_norm": 1.3528096675872803, + "learning_rate": 1.9761404832480455e-05, + "loss": 0.2811, + "step": 618 + }, + { + "epoch": 0.2931044687777449, + "grad_norm": 1.4439085721969604, + "learning_rate": 1.9760293080258976e-05, + "loss": 0.2829, + "step": 619 + }, + { + "epoch": 0.29357798165137616, + "grad_norm": 1.2082427740097046, + "learning_rate": 1.9759178775327e-05, + "loss": 0.2781, + "step": 620 + }, + { + "epoch": 0.2940514945250074, + "grad_norm": 2.3843281269073486, + "learning_rate": 1.975806191797596e-05, + "loss": 0.2968, + "step": 621 + }, + { + "epoch": 0.2945250073986386, + "grad_norm": 1.1950457096099854, + "learning_rate": 1.9756942508497967e-05, + "loss": 0.2816, + "step": 622 + }, + { + "epoch": 0.2949985202722699, + "grad_norm": 1.4412981271743774, + "learning_rate": 1.9755820547185787e-05, + "loss": 0.2838, + "step": 623 + }, + { + "epoch": 0.29547203314590115, + "grad_norm": 1.1725237369537354, + "learning_rate": 1.975469603433286e-05, + "loss": 0.2596, + "step": 624 + }, + { + "epoch": 0.2959455460195324, + "grad_norm": 1.1337695121765137, + "learning_rate": 1.975356897023329e-05, + "loss": 0.2647, + "step": 625 + }, + { + "epoch": 0.29641905889316367, + "grad_norm": 1.444916009902954, + "learning_rate": 1.9752439355181848e-05, + "loss": 0.2679, + "step": 626 + }, + { + "epoch": 0.2968925717667949, + "grad_norm": 1.7320390939712524, + "learning_rate": 1.975130718947398e-05, + "loss": 0.2317, + "step": 627 + }, + { + "epoch": 0.2973660846404262, + "grad_norm": 2.060805320739746, + "learning_rate": 1.9750172473405785e-05, + "loss": 0.269, + "step": 628 + }, + { + "epoch": 0.2978395975140574, + "grad_norm": 1.348092794418335, + "learning_rate": 1.9749035207274044e-05, + "loss": 0.2658, + "step": 629 + }, + { + "epoch": 0.29831311038768865, + "grad_norm": 1.3302925825119019, + "learning_rate": 1.9747895391376192e-05, + "loss": 0.2504, + "step": 630 + }, + { + "epoch": 0.29878662326131994, + "grad_norm": 1.1984549760818481, + "learning_rate": 1.9746753026010342e-05, + "loss": 0.2713, + "step": 631 + }, + { + "epoch": 0.29926013613495117, + "grad_norm": 1.1172891855239868, + "learning_rate": 1.9745608111475267e-05, + "loss": 0.2636, + "step": 632 + }, + { + "epoch": 0.2997336490085824, + "grad_norm": 1.5490065813064575, + "learning_rate": 1.9744460648070408e-05, + "loss": 0.2618, + "step": 633 + }, + { + "epoch": 0.3002071618822137, + "grad_norm": 1.167188048362732, + "learning_rate": 1.974331063609587e-05, + "loss": 0.2601, + "step": 634 + }, + { + "epoch": 0.3006806747558449, + "grad_norm": 1.5427215099334717, + "learning_rate": 1.9742158075852435e-05, + "loss": 0.281, + "step": 635 + }, + { + "epoch": 0.30115418762947616, + "grad_norm": 1.17001211643219, + "learning_rate": 1.9741002967641537e-05, + "loss": 0.2592, + "step": 636 + }, + { + "epoch": 0.30162770050310744, + "grad_norm": 1.1877490282058716, + "learning_rate": 1.9739845311765286e-05, + "loss": 0.3065, + "step": 637 + }, + { + "epoch": 0.3021012133767387, + "grad_norm": 1.3651347160339355, + "learning_rate": 1.9738685108526456e-05, + "loss": 0.2801, + "step": 638 + }, + { + "epoch": 0.3025747262503699, + "grad_norm": 1.283866047859192, + "learning_rate": 1.9737522358228487e-05, + "loss": 0.2939, + "step": 639 + }, + { + "epoch": 0.3030482391240012, + "grad_norm": 1.2590893507003784, + "learning_rate": 1.9736357061175483e-05, + "loss": 0.2657, + "step": 640 + }, + { + "epoch": 0.3035217519976324, + "grad_norm": 1.178700566291809, + "learning_rate": 1.973518921767222e-05, + "loss": 0.2618, + "step": 641 + }, + { + "epoch": 0.3039952648712637, + "grad_norm": 1.7493693828582764, + "learning_rate": 1.9734018828024136e-05, + "loss": 0.2763, + "step": 642 + }, + { + "epoch": 0.30446877774489495, + "grad_norm": 1.9666167497634888, + "learning_rate": 1.973284589253733e-05, + "loss": 0.2826, + "step": 643 + }, + { + "epoch": 0.3049422906185262, + "grad_norm": 1.3451874256134033, + "learning_rate": 1.9731670411518578e-05, + "loss": 0.2642, + "step": 644 + }, + { + "epoch": 0.30541580349215747, + "grad_norm": 1.3448307514190674, + "learning_rate": 1.973049238527531e-05, + "loss": 0.2672, + "step": 645 + }, + { + "epoch": 0.3058893163657887, + "grad_norm": 1.249316692352295, + "learning_rate": 1.9729311814115632e-05, + "loss": 0.2564, + "step": 646 + }, + { + "epoch": 0.30636282923941993, + "grad_norm": 1.4523890018463135, + "learning_rate": 1.972812869834831e-05, + "loss": 0.2831, + "step": 647 + }, + { + "epoch": 0.3068363421130512, + "grad_norm": 1.7998663187026978, + "learning_rate": 1.9726943038282772e-05, + "loss": 0.2962, + "step": 648 + }, + { + "epoch": 0.30730985498668245, + "grad_norm": 1.393811583518982, + "learning_rate": 1.9725754834229124e-05, + "loss": 0.2553, + "step": 649 + }, + { + "epoch": 0.3077833678603137, + "grad_norm": 1.6593152284622192, + "learning_rate": 1.9724564086498123e-05, + "loss": 0.3102, + "step": 650 + }, + { + "epoch": 0.30825688073394497, + "grad_norm": 1.6688569784164429, + "learning_rate": 1.97233707954012e-05, + "loss": 0.2706, + "step": 651 + }, + { + "epoch": 0.3087303936075762, + "grad_norm": 2.265216588973999, + "learning_rate": 1.972217496125045e-05, + "loss": 0.2884, + "step": 652 + }, + { + "epoch": 0.30920390648120744, + "grad_norm": 1.522386908531189, + "learning_rate": 1.972097658435863e-05, + "loss": 0.2754, + "step": 653 + }, + { + "epoch": 0.3096774193548387, + "grad_norm": 1.4247666597366333, + "learning_rate": 1.9719775665039162e-05, + "loss": 0.2557, + "step": 654 + }, + { + "epoch": 0.31015093222846996, + "grad_norm": 1.1649177074432373, + "learning_rate": 1.971857220360614e-05, + "loss": 0.2608, + "step": 655 + }, + { + "epoch": 0.3106244451021012, + "grad_norm": 2.187192678451538, + "learning_rate": 1.9717366200374313e-05, + "loss": 0.2989, + "step": 656 + }, + { + "epoch": 0.3110979579757325, + "grad_norm": 1.4517701864242554, + "learning_rate": 1.9716157655659102e-05, + "loss": 0.2681, + "step": 657 + }, + { + "epoch": 0.3115714708493637, + "grad_norm": 1.8214770555496216, + "learning_rate": 1.971494656977659e-05, + "loss": 0.241, + "step": 658 + }, + { + "epoch": 0.312044983722995, + "grad_norm": 1.289170742034912, + "learning_rate": 1.9713732943043524e-05, + "loss": 0.2659, + "step": 659 + }, + { + "epoch": 0.3125184965966262, + "grad_norm": 1.3986639976501465, + "learning_rate": 1.9712516775777315e-05, + "loss": 0.2649, + "step": 660 + }, + { + "epoch": 0.31299200947025746, + "grad_norm": 1.7374229431152344, + "learning_rate": 1.9711298068296046e-05, + "loss": 0.2902, + "step": 661 + }, + { + "epoch": 0.31346552234388875, + "grad_norm": 1.802095890045166, + "learning_rate": 1.9710076820918448e-05, + "loss": 0.2515, + "step": 662 + }, + { + "epoch": 0.31393903521752, + "grad_norm": 1.2375832796096802, + "learning_rate": 1.9708853033963936e-05, + "loss": 0.267, + "step": 663 + }, + { + "epoch": 0.3144125480911512, + "grad_norm": 1.2813341617584229, + "learning_rate": 1.9707626707752574e-05, + "loss": 0.2743, + "step": 664 + }, + { + "epoch": 0.3148860609647825, + "grad_norm": 1.1700478792190552, + "learning_rate": 1.97063978426051e-05, + "loss": 0.2724, + "step": 665 + }, + { + "epoch": 0.31535957383841373, + "grad_norm": 2.465041160583496, + "learning_rate": 1.9705166438842907e-05, + "loss": 0.2598, + "step": 666 + }, + { + "epoch": 0.31583308671204496, + "grad_norm": 1.6531705856323242, + "learning_rate": 1.9703932496788063e-05, + "loss": 0.2889, + "step": 667 + }, + { + "epoch": 0.31630659958567625, + "grad_norm": 1.5581059455871582, + "learning_rate": 1.9702696016763286e-05, + "loss": 0.2985, + "step": 668 + }, + { + "epoch": 0.3167801124593075, + "grad_norm": 1.4005812406539917, + "learning_rate": 1.9701456999091974e-05, + "loss": 0.2475, + "step": 669 + }, + { + "epoch": 0.3172536253329387, + "grad_norm": 1.6382842063903809, + "learning_rate": 1.970021544409817e-05, + "loss": 0.2683, + "step": 670 + }, + { + "epoch": 0.31772713820657, + "grad_norm": 1.9983477592468262, + "learning_rate": 1.96989713521066e-05, + "loss": 0.2836, + "step": 671 + }, + { + "epoch": 0.31820065108020124, + "grad_norm": 1.6255732774734497, + "learning_rate": 1.9697724723442643e-05, + "loss": 0.2627, + "step": 672 + }, + { + "epoch": 0.31867416395383247, + "grad_norm": 1.2218822240829468, + "learning_rate": 1.9696475558432334e-05, + "loss": 0.2735, + "step": 673 + }, + { + "epoch": 0.31914767682746376, + "grad_norm": 1.356476902961731, + "learning_rate": 1.969522385740239e-05, + "loss": 0.2905, + "step": 674 + }, + { + "epoch": 0.319621189701095, + "grad_norm": 1.8103471994400024, + "learning_rate": 1.9693969620680177e-05, + "loss": 0.2909, + "step": 675 + }, + { + "epoch": 0.3200947025747263, + "grad_norm": 1.908302903175354, + "learning_rate": 1.9692712848593726e-05, + "loss": 0.2738, + "step": 676 + }, + { + "epoch": 0.3205682154483575, + "grad_norm": 1.2977486848831177, + "learning_rate": 1.969145354147174e-05, + "loss": 0.2518, + "step": 677 + }, + { + "epoch": 0.32104172832198874, + "grad_norm": 1.6458895206451416, + "learning_rate": 1.9690191699643575e-05, + "loss": 0.2707, + "step": 678 + }, + { + "epoch": 0.32151524119562, + "grad_norm": 2.108949661254883, + "learning_rate": 1.9688927323439254e-05, + "loss": 0.2894, + "step": 679 + }, + { + "epoch": 0.32198875406925126, + "grad_norm": 1.1589833498001099, + "learning_rate": 1.9687660413189463e-05, + "loss": 0.255, + "step": 680 + }, + { + "epoch": 0.3224622669428825, + "grad_norm": 1.281614899635315, + "learning_rate": 1.9686390969225545e-05, + "loss": 0.2743, + "step": 681 + }, + { + "epoch": 0.3229357798165138, + "grad_norm": 1.3841043710708618, + "learning_rate": 1.968511899187952e-05, + "loss": 0.2668, + "step": 682 + }, + { + "epoch": 0.323409292690145, + "grad_norm": 1.8176034688949585, + "learning_rate": 1.9683844481484054e-05, + "loss": 0.2286, + "step": 683 + }, + { + "epoch": 0.32388280556377624, + "grad_norm": 2.2643625736236572, + "learning_rate": 1.9682567438372486e-05, + "loss": 0.2619, + "step": 684 + }, + { + "epoch": 0.32435631843740753, + "grad_norm": 2.1710567474365234, + "learning_rate": 1.9681287862878818e-05, + "loss": 0.2898, + "step": 685 + }, + { + "epoch": 0.32482983131103876, + "grad_norm": 1.4091668128967285, + "learning_rate": 1.9680005755337702e-05, + "loss": 0.2515, + "step": 686 + }, + { + "epoch": 0.32530334418467, + "grad_norm": 1.261533260345459, + "learning_rate": 1.9678721116084465e-05, + "loss": 0.2553, + "step": 687 + }, + { + "epoch": 0.3257768570583013, + "grad_norm": 1.412940263748169, + "learning_rate": 1.9677433945455092e-05, + "loss": 0.3054, + "step": 688 + }, + { + "epoch": 0.3262503699319325, + "grad_norm": 1.3808501958847046, + "learning_rate": 1.9676144243786236e-05, + "loss": 0.2611, + "step": 689 + }, + { + "epoch": 0.3267238828055638, + "grad_norm": 1.1639057397842407, + "learning_rate": 1.9674852011415194e-05, + "loss": 0.2882, + "step": 690 + }, + { + "epoch": 0.32719739567919504, + "grad_norm": 1.1630046367645264, + "learning_rate": 1.9673557248679945e-05, + "loss": 0.2231, + "step": 691 + }, + { + "epoch": 0.32767090855282627, + "grad_norm": 1.795854091644287, + "learning_rate": 1.9672259955919123e-05, + "loss": 0.3062, + "step": 692 + }, + { + "epoch": 0.32814442142645756, + "grad_norm": 1.4758808612823486, + "learning_rate": 1.967096013347202e-05, + "loss": 0.277, + "step": 693 + }, + { + "epoch": 0.3286179343000888, + "grad_norm": 1.5420145988464355, + "learning_rate": 1.9669657781678587e-05, + "loss": 0.2952, + "step": 694 + }, + { + "epoch": 0.32909144717372, + "grad_norm": 1.7470226287841797, + "learning_rate": 1.9668352900879447e-05, + "loss": 0.2645, + "step": 695 + }, + { + "epoch": 0.3295649600473513, + "grad_norm": 1.6357449293136597, + "learning_rate": 1.9667045491415878e-05, + "loss": 0.2714, + "step": 696 + }, + { + "epoch": 0.33003847292098254, + "grad_norm": 2.116548776626587, + "learning_rate": 1.9665735553629824e-05, + "loss": 0.2769, + "step": 697 + }, + { + "epoch": 0.33051198579461377, + "grad_norm": 2.229929208755493, + "learning_rate": 1.966442308786388e-05, + "loss": 0.2505, + "step": 698 + }, + { + "epoch": 0.33098549866824506, + "grad_norm": 1.3335548639297485, + "learning_rate": 1.966310809446131e-05, + "loss": 0.2516, + "step": 699 + }, + { + "epoch": 0.3314590115418763, + "grad_norm": 1.3416866064071655, + "learning_rate": 1.9661790573766046e-05, + "loss": 0.2563, + "step": 700 + }, + { + "epoch": 0.3319325244155075, + "grad_norm": 1.5719635486602783, + "learning_rate": 1.966047052612266e-05, + "loss": 0.2644, + "step": 701 + }, + { + "epoch": 0.3324060372891388, + "grad_norm": 1.4301633834838867, + "learning_rate": 1.9659147951876407e-05, + "loss": 0.2599, + "step": 702 + }, + { + "epoch": 0.33287955016277004, + "grad_norm": 1.6425291299819946, + "learning_rate": 1.965782285137319e-05, + "loss": 0.2422, + "step": 703 + }, + { + "epoch": 0.3333530630364013, + "grad_norm": 1.3709927797317505, + "learning_rate": 1.9656495224959578e-05, + "loss": 0.2653, + "step": 704 + }, + { + "epoch": 0.33382657591003256, + "grad_norm": 2.5719616413116455, + "learning_rate": 1.9655165072982797e-05, + "loss": 0.2581, + "step": 705 + }, + { + "epoch": 0.3343000887836638, + "grad_norm": 2.397643804550171, + "learning_rate": 1.9653832395790733e-05, + "loss": 0.2737, + "step": 706 + }, + { + "epoch": 0.3347736016572951, + "grad_norm": 1.7374242544174194, + "learning_rate": 1.965249719373194e-05, + "loss": 0.2835, + "step": 707 + }, + { + "epoch": 0.3352471145309263, + "grad_norm": 1.2840518951416016, + "learning_rate": 1.965115946715563e-05, + "loss": 0.2673, + "step": 708 + }, + { + "epoch": 0.33572062740455755, + "grad_norm": 1.8667082786560059, + "learning_rate": 1.964981921641166e-05, + "loss": 0.2596, + "step": 709 + }, + { + "epoch": 0.33619414027818884, + "grad_norm": 1.0804299116134644, + "learning_rate": 1.9648476441850574e-05, + "loss": 0.2432, + "step": 710 + }, + { + "epoch": 0.33666765315182007, + "grad_norm": 1.5073543787002563, + "learning_rate": 1.964713114382355e-05, + "loss": 0.2594, + "step": 711 + }, + { + "epoch": 0.3371411660254513, + "grad_norm": 1.724390983581543, + "learning_rate": 1.9645783322682447e-05, + "loss": 0.251, + "step": 712 + }, + { + "epoch": 0.3376146788990826, + "grad_norm": 1.4915387630462646, + "learning_rate": 1.964443297877977e-05, + "loss": 0.2637, + "step": 713 + }, + { + "epoch": 0.3380881917727138, + "grad_norm": 1.8173447847366333, + "learning_rate": 1.9643080112468683e-05, + "loss": 0.2878, + "step": 714 + }, + { + "epoch": 0.33856170464634505, + "grad_norm": 1.6663117408752441, + "learning_rate": 1.9641724724103026e-05, + "loss": 0.2929, + "step": 715 + }, + { + "epoch": 0.33903521751997634, + "grad_norm": 1.1947954893112183, + "learning_rate": 1.9640366814037283e-05, + "loss": 0.2699, + "step": 716 + }, + { + "epoch": 0.33950873039360757, + "grad_norm": 1.2085504531860352, + "learning_rate": 1.96390063826266e-05, + "loss": 0.258, + "step": 717 + }, + { + "epoch": 0.3399822432672388, + "grad_norm": 1.3036900758743286, + "learning_rate": 1.963764343022679e-05, + "loss": 0.283, + "step": 718 + }, + { + "epoch": 0.3404557561408701, + "grad_norm": 2.1460154056549072, + "learning_rate": 1.9636277957194316e-05, + "loss": 0.2811, + "step": 719 + }, + { + "epoch": 0.3409292690145013, + "grad_norm": 2.7816526889801025, + "learning_rate": 1.9634909963886304e-05, + "loss": 0.2436, + "step": 720 + }, + { + "epoch": 0.34140278188813256, + "grad_norm": 1.8817825317382812, + "learning_rate": 1.963353945066054e-05, + "loss": 0.2644, + "step": 721 + }, + { + "epoch": 0.34187629476176384, + "grad_norm": 1.5300520658493042, + "learning_rate": 1.963216641787547e-05, + "loss": 0.2729, + "step": 722 + }, + { + "epoch": 0.3423498076353951, + "grad_norm": 1.8480312824249268, + "learning_rate": 1.9630790865890196e-05, + "loss": 0.3109, + "step": 723 + }, + { + "epoch": 0.34282332050902636, + "grad_norm": 2.683523178100586, + "learning_rate": 1.9629412795064482e-05, + "loss": 0.2516, + "step": 724 + }, + { + "epoch": 0.3432968333826576, + "grad_norm": 1.2851934432983398, + "learning_rate": 1.9628032205758746e-05, + "loss": 0.271, + "step": 725 + }, + { + "epoch": 0.34377034625628883, + "grad_norm": 1.8185439109802246, + "learning_rate": 1.962664909833407e-05, + "loss": 0.2953, + "step": 726 + }, + { + "epoch": 0.3442438591299201, + "grad_norm": 1.4006390571594238, + "learning_rate": 1.9625263473152193e-05, + "loss": 0.2656, + "step": 727 + }, + { + "epoch": 0.34471737200355135, + "grad_norm": 1.353481650352478, + "learning_rate": 1.962387533057551e-05, + "loss": 0.2615, + "step": 728 + }, + { + "epoch": 0.3451908848771826, + "grad_norm": 1.9150645732879639, + "learning_rate": 1.9622484670967083e-05, + "loss": 0.2629, + "step": 729 + }, + { + "epoch": 0.34566439775081387, + "grad_norm": 1.6772758960723877, + "learning_rate": 1.9621091494690616e-05, + "loss": 0.2406, + "step": 730 + }, + { + "epoch": 0.3461379106244451, + "grad_norm": 1.7029880285263062, + "learning_rate": 1.9619695802110485e-05, + "loss": 0.2548, + "step": 731 + }, + { + "epoch": 0.34661142349807633, + "grad_norm": 1.2326855659484863, + "learning_rate": 1.961829759359172e-05, + "loss": 0.2976, + "step": 732 + }, + { + "epoch": 0.3470849363717076, + "grad_norm": 1.2379788160324097, + "learning_rate": 1.961689686950001e-05, + "loss": 0.2472, + "step": 733 + }, + { + "epoch": 0.34755844924533885, + "grad_norm": 2.5410702228546143, + "learning_rate": 1.9615493630201694e-05, + "loss": 0.2827, + "step": 734 + }, + { + "epoch": 0.3480319621189701, + "grad_norm": 2.657710075378418, + "learning_rate": 1.961408787606379e-05, + "loss": 0.2616, + "step": 735 + }, + { + "epoch": 0.34850547499260137, + "grad_norm": 1.4291493892669678, + "learning_rate": 1.9612679607453942e-05, + "loss": 0.2978, + "step": 736 + }, + { + "epoch": 0.3489789878662326, + "grad_norm": 1.5034074783325195, + "learning_rate": 1.9611268824740482e-05, + "loss": 0.2616, + "step": 737 + }, + { + "epoch": 0.3494525007398639, + "grad_norm": 1.3025211095809937, + "learning_rate": 1.9609855528292386e-05, + "loss": 0.2704, + "step": 738 + }, + { + "epoch": 0.3499260136134951, + "grad_norm": 2.3469293117523193, + "learning_rate": 1.960843971847928e-05, + "loss": 0.3042, + "step": 739 + }, + { + "epoch": 0.35039952648712636, + "grad_norm": 2.3904573917388916, + "learning_rate": 1.960702139567146e-05, + "loss": 0.2687, + "step": 740 + }, + { + "epoch": 0.35087303936075764, + "grad_norm": 1.4793574810028076, + "learning_rate": 1.9605600560239874e-05, + "loss": 0.2495, + "step": 741 + }, + { + "epoch": 0.3513465522343889, + "grad_norm": 1.2374844551086426, + "learning_rate": 1.960417721255613e-05, + "loss": 0.2791, + "step": 742 + }, + { + "epoch": 0.3518200651080201, + "grad_norm": 2.1964030265808105, + "learning_rate": 1.960275135299249e-05, + "loss": 0.2738, + "step": 743 + }, + { + "epoch": 0.3522935779816514, + "grad_norm": 1.7759153842926025, + "learning_rate": 1.9601322981921872e-05, + "loss": 0.2436, + "step": 744 + }, + { + "epoch": 0.35276709085528263, + "grad_norm": 1.5018988847732544, + "learning_rate": 1.959989209971785e-05, + "loss": 0.2935, + "step": 745 + }, + { + "epoch": 0.35324060372891386, + "grad_norm": 1.1463077068328857, + "learning_rate": 1.959845870675467e-05, + "loss": 0.2613, + "step": 746 + }, + { + "epoch": 0.35371411660254515, + "grad_norm": 1.4144256114959717, + "learning_rate": 1.9597022803407206e-05, + "loss": 0.2337, + "step": 747 + }, + { + "epoch": 0.3541876294761764, + "grad_norm": 1.2923469543457031, + "learning_rate": 1.9595584390051014e-05, + "loss": 0.2449, + "step": 748 + }, + { + "epoch": 0.3546611423498076, + "grad_norm": 2.338681936264038, + "learning_rate": 1.9594143467062295e-05, + "loss": 0.2694, + "step": 749 + }, + { + "epoch": 0.3551346552234389, + "grad_norm": 2.0817441940307617, + "learning_rate": 1.9592700034817906e-05, + "loss": 0.3, + "step": 750 + }, + { + "epoch": 0.35560816809707013, + "grad_norm": 1.2521568536758423, + "learning_rate": 1.959125409369537e-05, + "loss": 0.2664, + "step": 751 + }, + { + "epoch": 0.35608168097070136, + "grad_norm": 1.5073604583740234, + "learning_rate": 1.958980564407285e-05, + "loss": 0.2696, + "step": 752 + }, + { + "epoch": 0.35655519384433265, + "grad_norm": 1.10438871383667, + "learning_rate": 1.9588354686329182e-05, + "loss": 0.2712, + "step": 753 + }, + { + "epoch": 0.3570287067179639, + "grad_norm": 1.8521367311477661, + "learning_rate": 1.9586901220843844e-05, + "loss": 0.2802, + "step": 754 + }, + { + "epoch": 0.35750221959159517, + "grad_norm": 2.0266237258911133, + "learning_rate": 1.958544524799698e-05, + "loss": 0.2608, + "step": 755 + }, + { + "epoch": 0.3579757324652264, + "grad_norm": 1.1580361127853394, + "learning_rate": 1.958398676816938e-05, + "loss": 0.2582, + "step": 756 + }, + { + "epoch": 0.35844924533885764, + "grad_norm": 1.5954866409301758, + "learning_rate": 1.9582525781742502e-05, + "loss": 0.2436, + "step": 757 + }, + { + "epoch": 0.3589227582124889, + "grad_norm": 1.2600346803665161, + "learning_rate": 1.9581062289098448e-05, + "loss": 0.2658, + "step": 758 + }, + { + "epoch": 0.35939627108612016, + "grad_norm": 1.5517022609710693, + "learning_rate": 1.9579596290619986e-05, + "loss": 0.2886, + "step": 759 + }, + { + "epoch": 0.3598697839597514, + "grad_norm": 1.9510260820388794, + "learning_rate": 1.9578127786690532e-05, + "loss": 0.2839, + "step": 760 + }, + { + "epoch": 0.3603432968333827, + "grad_norm": 1.2476470470428467, + "learning_rate": 1.957665677769415e-05, + "loss": 0.2668, + "step": 761 + }, + { + "epoch": 0.3608168097070139, + "grad_norm": 1.4977624416351318, + "learning_rate": 1.9575183264015577e-05, + "loss": 0.2613, + "step": 762 + }, + { + "epoch": 0.36129032258064514, + "grad_norm": 1.4652796983718872, + "learning_rate": 1.95737072460402e-05, + "loss": 0.2766, + "step": 763 + }, + { + "epoch": 0.36176383545427643, + "grad_norm": 1.8096346855163574, + "learning_rate": 1.957222872415405e-05, + "loss": 0.2817, + "step": 764 + }, + { + "epoch": 0.36223734832790766, + "grad_norm": 1.9074580669403076, + "learning_rate": 1.9570747698743818e-05, + "loss": 0.2689, + "step": 765 + }, + { + "epoch": 0.3627108612015389, + "grad_norm": 1.3230053186416626, + "learning_rate": 1.956926417019686e-05, + "loss": 0.2504, + "step": 766 + }, + { + "epoch": 0.3631843740751702, + "grad_norm": 1.6510530710220337, + "learning_rate": 1.9567778138901175e-05, + "loss": 0.2582, + "step": 767 + }, + { + "epoch": 0.3636578869488014, + "grad_norm": 1.1170092821121216, + "learning_rate": 1.9566289605245416e-05, + "loss": 0.2573, + "step": 768 + }, + { + "epoch": 0.3641313998224327, + "grad_norm": 1.1853854656219482, + "learning_rate": 1.95647985696189e-05, + "loss": 0.2889, + "step": 769 + }, + { + "epoch": 0.36460491269606393, + "grad_norm": 1.359598159790039, + "learning_rate": 1.9563305032411594e-05, + "loss": 0.259, + "step": 770 + }, + { + "epoch": 0.36507842556969516, + "grad_norm": 1.3403595685958862, + "learning_rate": 1.956180899401411e-05, + "loss": 0.2468, + "step": 771 + }, + { + "epoch": 0.36555193844332645, + "grad_norm": 1.6090962886810303, + "learning_rate": 1.9560310454817736e-05, + "loss": 0.2536, + "step": 772 + }, + { + "epoch": 0.3660254513169577, + "grad_norm": 1.543067455291748, + "learning_rate": 1.9558809415214386e-05, + "loss": 0.2331, + "step": 773 + }, + { + "epoch": 0.3664989641905889, + "grad_norm": 1.650692343711853, + "learning_rate": 1.955730587559665e-05, + "loss": 0.2705, + "step": 774 + }, + { + "epoch": 0.3669724770642202, + "grad_norm": 1.7598607540130615, + "learning_rate": 1.9555799836357765e-05, + "loss": 0.2612, + "step": 775 + }, + { + "epoch": 0.36744598993785144, + "grad_norm": 1.1476891040802002, + "learning_rate": 1.955429129789162e-05, + "loss": 0.2401, + "step": 776 + }, + { + "epoch": 0.36791950281148267, + "grad_norm": 2.1535274982452393, + "learning_rate": 1.9552780260592755e-05, + "loss": 0.2797, + "step": 777 + }, + { + "epoch": 0.36839301568511396, + "grad_norm": 1.3766757249832153, + "learning_rate": 1.955126672485637e-05, + "loss": 0.2579, + "step": 778 + }, + { + "epoch": 0.3688665285587452, + "grad_norm": 1.8468455076217651, + "learning_rate": 1.954975069107832e-05, + "loss": 0.2515, + "step": 779 + }, + { + "epoch": 0.3693400414323764, + "grad_norm": 1.970403790473938, + "learning_rate": 1.95482321596551e-05, + "loss": 0.2513, + "step": 780 + }, + { + "epoch": 0.3698135543060077, + "grad_norm": 1.3336784839630127, + "learning_rate": 1.9546711130983874e-05, + "loss": 0.2741, + "step": 781 + }, + { + "epoch": 0.37028706717963894, + "grad_norm": 2.5626380443573, + "learning_rate": 1.954518760546245e-05, + "loss": 0.2537, + "step": 782 + }, + { + "epoch": 0.3707605800532702, + "grad_norm": 3.228623628616333, + "learning_rate": 1.9543661583489295e-05, + "loss": 0.2756, + "step": 783 + }, + { + "epoch": 0.37123409292690146, + "grad_norm": 2.226210832595825, + "learning_rate": 1.9542133065463518e-05, + "loss": 0.278, + "step": 784 + }, + { + "epoch": 0.3717076058005327, + "grad_norm": 1.0971438884735107, + "learning_rate": 1.9540602051784897e-05, + "loss": 0.2334, + "step": 785 + }, + { + "epoch": 0.372181118674164, + "grad_norm": 1.3254427909851074, + "learning_rate": 1.9539068542853844e-05, + "loss": 0.2461, + "step": 786 + }, + { + "epoch": 0.3726546315477952, + "grad_norm": 1.6858649253845215, + "learning_rate": 1.953753253907144e-05, + "loss": 0.272, + "step": 787 + }, + { + "epoch": 0.37312814442142644, + "grad_norm": 1.6952732801437378, + "learning_rate": 1.9535994040839413e-05, + "loss": 0.2775, + "step": 788 + }, + { + "epoch": 0.37360165729505773, + "grad_norm": 1.41287100315094, + "learning_rate": 1.9534453048560137e-05, + "loss": 0.2415, + "step": 789 + }, + { + "epoch": 0.37407517016868896, + "grad_norm": 1.457585096359253, + "learning_rate": 1.953290956263665e-05, + "loss": 0.2778, + "step": 790 + }, + { + "epoch": 0.3745486830423202, + "grad_norm": 1.6097171306610107, + "learning_rate": 1.9531363583472628e-05, + "loss": 0.2522, + "step": 791 + }, + { + "epoch": 0.3750221959159515, + "grad_norm": 1.8246599435806274, + "learning_rate": 1.9529815111472414e-05, + "loss": 0.27, + "step": 792 + }, + { + "epoch": 0.3754957087895827, + "grad_norm": 2.544400691986084, + "learning_rate": 1.9528264147040995e-05, + "loss": 0.2704, + "step": 793 + }, + { + "epoch": 0.37596922166321395, + "grad_norm": 1.3660106658935547, + "learning_rate": 1.9526710690584005e-05, + "loss": 0.2608, + "step": 794 + }, + { + "epoch": 0.37644273453684524, + "grad_norm": 1.1534936428070068, + "learning_rate": 1.9525154742507745e-05, + "loss": 0.2565, + "step": 795 + }, + { + "epoch": 0.37691624741047647, + "grad_norm": 1.5819586515426636, + "learning_rate": 1.9523596303219146e-05, + "loss": 0.2806, + "step": 796 + }, + { + "epoch": 0.3773897602841077, + "grad_norm": 1.9623225927352905, + "learning_rate": 1.9522035373125816e-05, + "loss": 0.2916, + "step": 797 + }, + { + "epoch": 0.377863273157739, + "grad_norm": 1.619255542755127, + "learning_rate": 1.9520471952635992e-05, + "loss": 0.2641, + "step": 798 + }, + { + "epoch": 0.3783367860313702, + "grad_norm": 1.587944746017456, + "learning_rate": 1.9518906042158575e-05, + "loss": 0.2707, + "step": 799 + }, + { + "epoch": 0.37881029890500145, + "grad_norm": 1.1555267572402954, + "learning_rate": 1.9517337642103116e-05, + "loss": 0.2526, + "step": 800 + }, + { + "epoch": 0.37928381177863274, + "grad_norm": 1.6961590051651, + "learning_rate": 1.9515766752879808e-05, + "loss": 0.2276, + "step": 801 + }, + { + "epoch": 0.379757324652264, + "grad_norm": 1.6464869976043701, + "learning_rate": 1.9514193374899508e-05, + "loss": 0.2662, + "step": 802 + }, + { + "epoch": 0.38023083752589526, + "grad_norm": 2.7278549671173096, + "learning_rate": 1.9512617508573713e-05, + "loss": 0.2696, + "step": 803 + }, + { + "epoch": 0.3807043503995265, + "grad_norm": 1.3315246105194092, + "learning_rate": 1.951103915431458e-05, + "loss": 0.2777, + "step": 804 + }, + { + "epoch": 0.3811778632731577, + "grad_norm": 1.1722429990768433, + "learning_rate": 1.9509458312534912e-05, + "loss": 0.2657, + "step": 805 + }, + { + "epoch": 0.381651376146789, + "grad_norm": 2.3142430782318115, + "learning_rate": 1.9507874983648163e-05, + "loss": 0.2829, + "step": 806 + }, + { + "epoch": 0.38212488902042024, + "grad_norm": 2.0772149562835693, + "learning_rate": 1.9506289168068433e-05, + "loss": 0.2589, + "step": 807 + }, + { + "epoch": 0.3825984018940515, + "grad_norm": 1.3622729778289795, + "learning_rate": 1.950470086621048e-05, + "loss": 0.2629, + "step": 808 + }, + { + "epoch": 0.38307191476768276, + "grad_norm": 1.4933375120162964, + "learning_rate": 1.9503110078489712e-05, + "loss": 0.2594, + "step": 809 + }, + { + "epoch": 0.383545427641314, + "grad_norm": 1.4791429042816162, + "learning_rate": 1.950151680532218e-05, + "loss": 0.2544, + "step": 810 + }, + { + "epoch": 0.38401894051494523, + "grad_norm": 1.3858115673065186, + "learning_rate": 1.9499921047124587e-05, + "loss": 0.2611, + "step": 811 + }, + { + "epoch": 0.3844924533885765, + "grad_norm": 1.273097038269043, + "learning_rate": 1.9498322804314297e-05, + "loss": 0.2752, + "step": 812 + }, + { + "epoch": 0.38496596626220775, + "grad_norm": 1.3330796957015991, + "learning_rate": 1.9496722077309306e-05, + "loss": 0.2363, + "step": 813 + }, + { + "epoch": 0.385439479135839, + "grad_norm": 1.091861605644226, + "learning_rate": 1.949511886652827e-05, + "loss": 0.2528, + "step": 814 + }, + { + "epoch": 0.38591299200947027, + "grad_norm": 1.2032009363174438, + "learning_rate": 1.9493513172390498e-05, + "loss": 0.2528, + "step": 815 + }, + { + "epoch": 0.3863865048831015, + "grad_norm": 2.2817001342773438, + "learning_rate": 1.949190499531594e-05, + "loss": 0.2587, + "step": 816 + }, + { + "epoch": 0.3868600177567328, + "grad_norm": 2.36639666557312, + "learning_rate": 1.9490294335725204e-05, + "loss": 0.2474, + "step": 817 + }, + { + "epoch": 0.387333530630364, + "grad_norm": 1.1809403896331787, + "learning_rate": 1.9488681194039537e-05, + "loss": 0.2738, + "step": 818 + }, + { + "epoch": 0.38780704350399525, + "grad_norm": 1.3689755201339722, + "learning_rate": 1.9487065570680845e-05, + "loss": 0.2547, + "step": 819 + }, + { + "epoch": 0.38828055637762654, + "grad_norm": 1.8941787481307983, + "learning_rate": 1.948544746607167e-05, + "loss": 0.2823, + "step": 820 + }, + { + "epoch": 0.38875406925125777, + "grad_norm": 1.0949586629867554, + "learning_rate": 1.9483826880635225e-05, + "loss": 0.2658, + "step": 821 + }, + { + "epoch": 0.389227582124889, + "grad_norm": 2.2636001110076904, + "learning_rate": 1.9482203814795344e-05, + "loss": 0.2608, + "step": 822 + }, + { + "epoch": 0.3897010949985203, + "grad_norm": 1.2517188787460327, + "learning_rate": 1.9480578268976536e-05, + "loss": 0.2892, + "step": 823 + }, + { + "epoch": 0.3901746078721515, + "grad_norm": 1.6371468305587769, + "learning_rate": 1.9478950243603946e-05, + "loss": 0.2999, + "step": 824 + }, + { + "epoch": 0.39064812074578276, + "grad_norm": 1.488568663597107, + "learning_rate": 1.947731973910336e-05, + "loss": 0.2885, + "step": 825 + }, + { + "epoch": 0.39112163361941404, + "grad_norm": 1.5922267436981201, + "learning_rate": 1.9475686755901227e-05, + "loss": 0.2553, + "step": 826 + }, + { + "epoch": 0.3915951464930453, + "grad_norm": 2.3070576190948486, + "learning_rate": 1.9474051294424634e-05, + "loss": 0.2591, + "step": 827 + }, + { + "epoch": 0.3920686593666765, + "grad_norm": 2.676198959350586, + "learning_rate": 1.9472413355101327e-05, + "loss": 0.2867, + "step": 828 + }, + { + "epoch": 0.3925421722403078, + "grad_norm": 1.9517337083816528, + "learning_rate": 1.9470772938359687e-05, + "loss": 0.2684, + "step": 829 + }, + { + "epoch": 0.39301568511393903, + "grad_norm": 1.6799813508987427, + "learning_rate": 1.946913004462875e-05, + "loss": 0.2631, + "step": 830 + }, + { + "epoch": 0.39348919798757026, + "grad_norm": 1.206333041191101, + "learning_rate": 1.9467484674338202e-05, + "loss": 0.2628, + "step": 831 + }, + { + "epoch": 0.39396271086120155, + "grad_norm": 1.4563113451004028, + "learning_rate": 1.9465836827918373e-05, + "loss": 0.2684, + "step": 832 + }, + { + "epoch": 0.3944362237348328, + "grad_norm": 2.2227954864501953, + "learning_rate": 1.9464186505800236e-05, + "loss": 0.2905, + "step": 833 + }, + { + "epoch": 0.39490973660846407, + "grad_norm": 1.46279776096344, + "learning_rate": 1.9462533708415425e-05, + "loss": 0.2615, + "step": 834 + }, + { + "epoch": 0.3953832494820953, + "grad_norm": 2.0220069885253906, + "learning_rate": 1.9460878436196206e-05, + "loss": 0.2427, + "step": 835 + }, + { + "epoch": 0.39585676235572653, + "grad_norm": 1.3017007112503052, + "learning_rate": 1.9459220689575505e-05, + "loss": 0.2563, + "step": 836 + }, + { + "epoch": 0.3963302752293578, + "grad_norm": 1.700585961341858, + "learning_rate": 1.9457560468986888e-05, + "loss": 0.2656, + "step": 837 + }, + { + "epoch": 0.39680378810298905, + "grad_norm": 1.8619630336761475, + "learning_rate": 1.9455897774864567e-05, + "loss": 0.2653, + "step": 838 + }, + { + "epoch": 0.3972773009766203, + "grad_norm": 1.1475433111190796, + "learning_rate": 1.9454232607643406e-05, + "loss": 0.2612, + "step": 839 + }, + { + "epoch": 0.39775081385025157, + "grad_norm": 1.736665964126587, + "learning_rate": 1.9452564967758912e-05, + "loss": 0.2655, + "step": 840 + }, + { + "epoch": 0.3982243267238828, + "grad_norm": 1.1102855205535889, + "learning_rate": 1.9450894855647246e-05, + "loss": 0.2437, + "step": 841 + }, + { + "epoch": 0.39869783959751404, + "grad_norm": 1.2545884847640991, + "learning_rate": 1.9449222271745202e-05, + "loss": 0.2401, + "step": 842 + }, + { + "epoch": 0.3991713524711453, + "grad_norm": 1.2088849544525146, + "learning_rate": 1.944754721649023e-05, + "loss": 0.2497, + "step": 843 + }, + { + "epoch": 0.39964486534477656, + "grad_norm": 1.708852767944336, + "learning_rate": 1.9445869690320425e-05, + "loss": 0.2513, + "step": 844 + }, + { + "epoch": 0.4001183782184078, + "grad_norm": 1.3181136846542358, + "learning_rate": 1.9444189693674528e-05, + "loss": 0.2814, + "step": 845 + }, + { + "epoch": 0.4005918910920391, + "grad_norm": 1.6149542331695557, + "learning_rate": 1.944250722699193e-05, + "loss": 0.2475, + "step": 846 + }, + { + "epoch": 0.4010654039656703, + "grad_norm": 1.4610974788665771, + "learning_rate": 1.944082229071266e-05, + "loss": 0.2571, + "step": 847 + }, + { + "epoch": 0.4015389168393016, + "grad_norm": 1.5411815643310547, + "learning_rate": 1.9439134885277394e-05, + "loss": 0.2255, + "step": 848 + }, + { + "epoch": 0.40201242971293283, + "grad_norm": 1.6833422183990479, + "learning_rate": 1.9437445011127463e-05, + "loss": 0.2551, + "step": 849 + }, + { + "epoch": 0.40248594258656406, + "grad_norm": 1.65785813331604, + "learning_rate": 1.943575266870483e-05, + "loss": 0.246, + "step": 850 + }, + { + "epoch": 0.40295945546019535, + "grad_norm": 1.3264704942703247, + "learning_rate": 1.9434057858452117e-05, + "loss": 0.2935, + "step": 851 + }, + { + "epoch": 0.4034329683338266, + "grad_norm": 1.166812539100647, + "learning_rate": 1.9432360580812583e-05, + "loss": 0.2528, + "step": 852 + }, + { + "epoch": 0.4039064812074578, + "grad_norm": 2.272095203399658, + "learning_rate": 1.9430660836230134e-05, + "loss": 0.2852, + "step": 853 + }, + { + "epoch": 0.4043799940810891, + "grad_norm": 1.7517335414886475, + "learning_rate": 1.9428958625149324e-05, + "loss": 0.2755, + "step": 854 + }, + { + "epoch": 0.40485350695472033, + "grad_norm": 1.2015048265457153, + "learning_rate": 1.942725394801535e-05, + "loss": 0.2628, + "step": 855 + }, + { + "epoch": 0.40532701982835156, + "grad_norm": 1.3004090785980225, + "learning_rate": 1.9425546805274048e-05, + "loss": 0.2415, + "step": 856 + }, + { + "epoch": 0.40580053270198285, + "grad_norm": 1.5480315685272217, + "learning_rate": 1.942383719737191e-05, + "loss": 0.3167, + "step": 857 + }, + { + "epoch": 0.4062740455756141, + "grad_norm": 2.4538869857788086, + "learning_rate": 1.9422125124756068e-05, + "loss": 0.2415, + "step": 858 + }, + { + "epoch": 0.4067475584492453, + "grad_norm": 1.163549542427063, + "learning_rate": 1.9420410587874295e-05, + "loss": 0.2615, + "step": 859 + }, + { + "epoch": 0.4072210713228766, + "grad_norm": 1.151456594467163, + "learning_rate": 1.941869358717501e-05, + "loss": 0.2391, + "step": 860 + }, + { + "epoch": 0.40769458419650784, + "grad_norm": 1.4595973491668701, + "learning_rate": 1.9416974123107287e-05, + "loss": 0.2871, + "step": 861 + }, + { + "epoch": 0.40816809707013907, + "grad_norm": 1.4766554832458496, + "learning_rate": 1.941525219612083e-05, + "loss": 0.2879, + "step": 862 + }, + { + "epoch": 0.40864160994377036, + "grad_norm": 1.770892858505249, + "learning_rate": 1.941352780666599e-05, + "loss": 0.2809, + "step": 863 + }, + { + "epoch": 0.4091151228174016, + "grad_norm": 1.4286603927612305, + "learning_rate": 1.9411800955193762e-05, + "loss": 0.2793, + "step": 864 + }, + { + "epoch": 0.4095886356910329, + "grad_norm": 1.2635271549224854, + "learning_rate": 1.9410071642155796e-05, + "loss": 0.267, + "step": 865 + }, + { + "epoch": 0.4100621485646641, + "grad_norm": 2.1183278560638428, + "learning_rate": 1.940833986800437e-05, + "loss": 0.2578, + "step": 866 + }, + { + "epoch": 0.41053566143829534, + "grad_norm": 1.1549620628356934, + "learning_rate": 1.9406605633192414e-05, + "loss": 0.2852, + "step": 867 + }, + { + "epoch": 0.41100917431192663, + "grad_norm": 1.3597477674484253, + "learning_rate": 1.9404868938173503e-05, + "loss": 0.2711, + "step": 868 + }, + { + "epoch": 0.41148268718555786, + "grad_norm": 1.2947059869766235, + "learning_rate": 1.9403129783401854e-05, + "loss": 0.2838, + "step": 869 + }, + { + "epoch": 0.4119562000591891, + "grad_norm": 1.0901488065719604, + "learning_rate": 1.9401388169332322e-05, + "loss": 0.2467, + "step": 870 + }, + { + "epoch": 0.4124297129328204, + "grad_norm": 1.7122608423233032, + "learning_rate": 1.939964409642041e-05, + "loss": 0.265, + "step": 871 + }, + { + "epoch": 0.4129032258064516, + "grad_norm": 1.7495481967926025, + "learning_rate": 1.9397897565122267e-05, + "loss": 0.2824, + "step": 872 + }, + { + "epoch": 0.41337673868008284, + "grad_norm": 1.2452020645141602, + "learning_rate": 1.939614857589468e-05, + "loss": 0.2695, + "step": 873 + }, + { + "epoch": 0.41385025155371413, + "grad_norm": 1.2135512828826904, + "learning_rate": 1.9394397129195076e-05, + "loss": 0.2606, + "step": 874 + }, + { + "epoch": 0.41432376442734536, + "grad_norm": 1.4891951084136963, + "learning_rate": 1.9392643225481535e-05, + "loss": 0.2587, + "step": 875 + }, + { + "epoch": 0.4147972773009766, + "grad_norm": 1.4017555713653564, + "learning_rate": 1.9390886865212767e-05, + "loss": 0.2776, + "step": 876 + }, + { + "epoch": 0.4152707901746079, + "grad_norm": 1.8024797439575195, + "learning_rate": 1.9389128048848136e-05, + "loss": 0.265, + "step": 877 + }, + { + "epoch": 0.4157443030482391, + "grad_norm": 1.2021361589431763, + "learning_rate": 1.9387366776847645e-05, + "loss": 0.2712, + "step": 878 + }, + { + "epoch": 0.41621781592187035, + "grad_norm": 1.449995756149292, + "learning_rate": 1.9385603049671934e-05, + "loss": 0.2887, + "step": 879 + }, + { + "epoch": 0.41669132879550164, + "grad_norm": 1.4844348430633545, + "learning_rate": 1.9383836867782287e-05, + "loss": 0.2384, + "step": 880 + }, + { + "epoch": 0.41716484166913287, + "grad_norm": 1.3519595861434937, + "learning_rate": 1.938206823164064e-05, + "loss": 0.2376, + "step": 881 + }, + { + "epoch": 0.41763835454276416, + "grad_norm": 1.2284427881240845, + "learning_rate": 1.938029714170955e-05, + "loss": 0.2525, + "step": 882 + }, + { + "epoch": 0.4181118674163954, + "grad_norm": 1.362317442893982, + "learning_rate": 1.937852359845224e-05, + "loss": 0.2695, + "step": 883 + }, + { + "epoch": 0.4185853802900266, + "grad_norm": 1.533306360244751, + "learning_rate": 1.937674760233256e-05, + "loss": 0.2334, + "step": 884 + }, + { + "epoch": 0.4190588931636579, + "grad_norm": 2.291046380996704, + "learning_rate": 1.9374969153815005e-05, + "loss": 0.2578, + "step": 885 + }, + { + "epoch": 0.41953240603728914, + "grad_norm": 1.2593145370483398, + "learning_rate": 1.937318825336471e-05, + "loss": 0.2854, + "step": 886 + }, + { + "epoch": 0.4200059189109204, + "grad_norm": 1.5685346126556396, + "learning_rate": 1.9371404901447445e-05, + "loss": 0.2777, + "step": 887 + }, + { + "epoch": 0.42047943178455166, + "grad_norm": 1.7853624820709229, + "learning_rate": 1.936961909852964e-05, + "loss": 0.2817, + "step": 888 + }, + { + "epoch": 0.4209529446581829, + "grad_norm": 1.8879746198654175, + "learning_rate": 1.9367830845078354e-05, + "loss": 0.2515, + "step": 889 + }, + { + "epoch": 0.4214264575318141, + "grad_norm": 1.719527244567871, + "learning_rate": 1.936604014156128e-05, + "loss": 0.2583, + "step": 890 + }, + { + "epoch": 0.4218999704054454, + "grad_norm": 1.735648274421692, + "learning_rate": 1.936424698844676e-05, + "loss": 0.2788, + "step": 891 + }, + { + "epoch": 0.42237348327907664, + "grad_norm": 1.6810283660888672, + "learning_rate": 1.9362451386203784e-05, + "loss": 0.2548, + "step": 892 + }, + { + "epoch": 0.4228469961527079, + "grad_norm": 1.0706638097763062, + "learning_rate": 1.9360653335301964e-05, + "loss": 0.2553, + "step": 893 + }, + { + "epoch": 0.42332050902633916, + "grad_norm": 1.1451069116592407, + "learning_rate": 1.9358852836211573e-05, + "loss": 0.2566, + "step": 894 + }, + { + "epoch": 0.4237940218999704, + "grad_norm": 1.3580163717269897, + "learning_rate": 1.9357049889403506e-05, + "loss": 0.2676, + "step": 895 + }, + { + "epoch": 0.4242675347736017, + "grad_norm": 1.5814894437789917, + "learning_rate": 1.9355244495349307e-05, + "loss": 0.2997, + "step": 896 + }, + { + "epoch": 0.4247410476472329, + "grad_norm": 2.3685078620910645, + "learning_rate": 1.9353436654521168e-05, + "loss": 0.261, + "step": 897 + }, + { + "epoch": 0.42521456052086415, + "grad_norm": 1.4430562257766724, + "learning_rate": 1.9351626367391902e-05, + "loss": 0.2477, + "step": 898 + }, + { + "epoch": 0.42568807339449544, + "grad_norm": 1.7693711519241333, + "learning_rate": 1.9349813634434977e-05, + "loss": 0.2577, + "step": 899 + }, + { + "epoch": 0.42616158626812667, + "grad_norm": 1.3744468688964844, + "learning_rate": 1.9347998456124497e-05, + "loss": 0.2736, + "step": 900 + }, + { + "epoch": 0.4266350991417579, + "grad_norm": 1.5893174409866333, + "learning_rate": 1.9346180832935202e-05, + "loss": 0.2518, + "step": 901 + }, + { + "epoch": 0.4271086120153892, + "grad_norm": 1.6329647302627563, + "learning_rate": 1.9344360765342472e-05, + "loss": 0.2574, + "step": 902 + }, + { + "epoch": 0.4275821248890204, + "grad_norm": 1.2065149545669556, + "learning_rate": 1.9342538253822334e-05, + "loss": 0.2522, + "step": 903 + }, + { + "epoch": 0.42805563776265165, + "grad_norm": 1.2911807298660278, + "learning_rate": 1.934071329885144e-05, + "loss": 0.2441, + "step": 904 + }, + { + "epoch": 0.42852915063628294, + "grad_norm": 1.260495901107788, + "learning_rate": 1.93388859009071e-05, + "loss": 0.2505, + "step": 905 + }, + { + "epoch": 0.4290026635099142, + "grad_norm": 1.6986874341964722, + "learning_rate": 1.9337056060467244e-05, + "loss": 0.2942, + "step": 906 + }, + { + "epoch": 0.4294761763835454, + "grad_norm": 1.6441107988357544, + "learning_rate": 1.933522377801045e-05, + "loss": 0.2577, + "step": 907 + }, + { + "epoch": 0.4299496892571767, + "grad_norm": 1.4719241857528687, + "learning_rate": 1.9333389054015935e-05, + "loss": 0.2602, + "step": 908 + }, + { + "epoch": 0.4304232021308079, + "grad_norm": 1.37874436378479, + "learning_rate": 1.9331551888963557e-05, + "loss": 0.2792, + "step": 909 + }, + { + "epoch": 0.43089671500443916, + "grad_norm": 1.7454997301101685, + "learning_rate": 1.93297122833338e-05, + "loss": 0.2456, + "step": 910 + }, + { + "epoch": 0.43137022787807044, + "grad_norm": 1.5132381916046143, + "learning_rate": 1.9327870237607805e-05, + "loss": 0.2826, + "step": 911 + }, + { + "epoch": 0.4318437407517017, + "grad_norm": 1.309891939163208, + "learning_rate": 1.9326025752267338e-05, + "loss": 0.2695, + "step": 912 + }, + { + "epoch": 0.43231725362533296, + "grad_norm": 1.375044822692871, + "learning_rate": 1.9324178827794803e-05, + "loss": 0.252, + "step": 913 + }, + { + "epoch": 0.4327907664989642, + "grad_norm": 1.6480258703231812, + "learning_rate": 1.9322329464673248e-05, + "loss": 0.2797, + "step": 914 + }, + { + "epoch": 0.43326427937259543, + "grad_norm": 1.1525005102157593, + "learning_rate": 1.9320477663386358e-05, + "loss": 0.2449, + "step": 915 + }, + { + "epoch": 0.4337377922462267, + "grad_norm": 1.945312261581421, + "learning_rate": 1.9318623424418446e-05, + "loss": 0.2355, + "step": 916 + }, + { + "epoch": 0.43421130511985795, + "grad_norm": 1.427280068397522, + "learning_rate": 1.9316766748254477e-05, + "loss": 0.2888, + "step": 917 + }, + { + "epoch": 0.4346848179934892, + "grad_norm": 1.4059946537017822, + "learning_rate": 1.931490763538005e-05, + "loss": 0.2321, + "step": 918 + }, + { + "epoch": 0.43515833086712047, + "grad_norm": 1.3967366218566895, + "learning_rate": 1.931304608628139e-05, + "loss": 0.2585, + "step": 919 + }, + { + "epoch": 0.4356318437407517, + "grad_norm": 1.3935580253601074, + "learning_rate": 1.931118210144537e-05, + "loss": 0.2663, + "step": 920 + }, + { + "epoch": 0.43610535661438293, + "grad_norm": 1.2772636413574219, + "learning_rate": 1.93093156813595e-05, + "loss": 0.2607, + "step": 921 + }, + { + "epoch": 0.4365788694880142, + "grad_norm": 1.2750356197357178, + "learning_rate": 1.930744682651192e-05, + "loss": 0.2518, + "step": 922 + }, + { + "epoch": 0.43705238236164545, + "grad_norm": 1.2921040058135986, + "learning_rate": 1.9305575537391416e-05, + "loss": 0.2623, + "step": 923 + }, + { + "epoch": 0.4375258952352767, + "grad_norm": 1.6561219692230225, + "learning_rate": 1.9303701814487403e-05, + "loss": 0.3298, + "step": 924 + }, + { + "epoch": 0.437999408108908, + "grad_norm": 1.4498628377914429, + "learning_rate": 1.930182565828993e-05, + "loss": 0.2543, + "step": 925 + }, + { + "epoch": 0.4384729209825392, + "grad_norm": 1.2270756959915161, + "learning_rate": 1.9299947069289694e-05, + "loss": 0.2553, + "step": 926 + }, + { + "epoch": 0.43894643385617044, + "grad_norm": 1.31882905960083, + "learning_rate": 1.9298066047978024e-05, + "loss": 0.2574, + "step": 927 + }, + { + "epoch": 0.4394199467298017, + "grad_norm": 1.993991494178772, + "learning_rate": 1.9296182594846876e-05, + "loss": 0.2692, + "step": 928 + }, + { + "epoch": 0.43989345960343296, + "grad_norm": 1.425070881843567, + "learning_rate": 1.9294296710388852e-05, + "loss": 0.2553, + "step": 929 + }, + { + "epoch": 0.44036697247706424, + "grad_norm": 1.2672945261001587, + "learning_rate": 1.9292408395097187e-05, + "loss": 0.2814, + "step": 930 + }, + { + "epoch": 0.4408404853506955, + "grad_norm": 1.2957855463027954, + "learning_rate": 1.9290517649465756e-05, + "loss": 0.2714, + "step": 931 + }, + { + "epoch": 0.4413139982243267, + "grad_norm": 1.4948066473007202, + "learning_rate": 1.9288624473989055e-05, + "loss": 0.2353, + "step": 932 + }, + { + "epoch": 0.441787511097958, + "grad_norm": 1.6086140871047974, + "learning_rate": 1.9286728869162235e-05, + "loss": 0.2507, + "step": 933 + }, + { + "epoch": 0.44226102397158923, + "grad_norm": 1.5332534313201904, + "learning_rate": 1.928483083548107e-05, + "loss": 0.2744, + "step": 934 + }, + { + "epoch": 0.44273453684522046, + "grad_norm": 1.8860116004943848, + "learning_rate": 1.928293037344197e-05, + "loss": 0.2706, + "step": 935 + }, + { + "epoch": 0.44320804971885175, + "grad_norm": 1.3716895580291748, + "learning_rate": 1.9281027483541986e-05, + "loss": 0.2565, + "step": 936 + }, + { + "epoch": 0.443681562592483, + "grad_norm": 2.0549087524414062, + "learning_rate": 1.9279122166278798e-05, + "loss": 0.2524, + "step": 937 + }, + { + "epoch": 0.4441550754661142, + "grad_norm": 1.379072666168213, + "learning_rate": 1.927721442215073e-05, + "loss": 0.2694, + "step": 938 + }, + { + "epoch": 0.4446285883397455, + "grad_norm": 1.1916866302490234, + "learning_rate": 1.9275304251656723e-05, + "loss": 0.2386, + "step": 939 + }, + { + "epoch": 0.44510210121337673, + "grad_norm": 2.893798589706421, + "learning_rate": 1.9273391655296373e-05, + "loss": 0.2682, + "step": 940 + }, + { + "epoch": 0.44557561408700797, + "grad_norm": 2.7330830097198486, + "learning_rate": 1.9271476633569895e-05, + "loss": 0.2632, + "step": 941 + }, + { + "epoch": 0.44604912696063925, + "grad_norm": 1.7794554233551025, + "learning_rate": 1.926955918697815e-05, + "loss": 0.2447, + "step": 942 + }, + { + "epoch": 0.4465226398342705, + "grad_norm": 1.3762879371643066, + "learning_rate": 1.926763931602262e-05, + "loss": 0.2525, + "step": 943 + }, + { + "epoch": 0.4469961527079018, + "grad_norm": 1.5030863285064697, + "learning_rate": 1.9265717021205437e-05, + "loss": 0.248, + "step": 944 + }, + { + "epoch": 0.447469665581533, + "grad_norm": 1.604885220527649, + "learning_rate": 1.9263792303029355e-05, + "loss": 0.2527, + "step": 945 + }, + { + "epoch": 0.44794317845516424, + "grad_norm": 1.0252493619918823, + "learning_rate": 1.9261865161997765e-05, + "loss": 0.2084, + "step": 946 + }, + { + "epoch": 0.4484166913287955, + "grad_norm": 1.32304048538208, + "learning_rate": 1.925993559861469e-05, + "loss": 0.2385, + "step": 947 + }, + { + "epoch": 0.44889020420242676, + "grad_norm": 1.2492711544036865, + "learning_rate": 1.9258003613384793e-05, + "loss": 0.2862, + "step": 948 + }, + { + "epoch": 0.449363717076058, + "grad_norm": 1.9659324884414673, + "learning_rate": 1.925606920681337e-05, + "loss": 0.2729, + "step": 949 + }, + { + "epoch": 0.4498372299496893, + "grad_norm": 1.930130958557129, + "learning_rate": 1.9254132379406335e-05, + "loss": 0.2429, + "step": 950 + }, + { + "epoch": 0.4503107428233205, + "grad_norm": 1.9109481573104858, + "learning_rate": 1.925219313167025e-05, + "loss": 0.2579, + "step": 951 + }, + { + "epoch": 0.45078425569695174, + "grad_norm": 1.461925983428955, + "learning_rate": 1.9250251464112313e-05, + "loss": 0.3025, + "step": 952 + }, + { + "epoch": 0.45125776857058303, + "grad_norm": 1.1421688795089722, + "learning_rate": 1.9248307377240346e-05, + "loss": 0.2611, + "step": 953 + }, + { + "epoch": 0.45173128144421426, + "grad_norm": 1.3895161151885986, + "learning_rate": 1.92463608715628e-05, + "loss": 0.2854, + "step": 954 + }, + { + "epoch": 0.4522047943178455, + "grad_norm": 1.081526517868042, + "learning_rate": 1.9244411947588774e-05, + "loss": 0.2495, + "step": 955 + }, + { + "epoch": 0.4526783071914768, + "grad_norm": 1.5755951404571533, + "learning_rate": 1.924246060582798e-05, + "loss": 0.2645, + "step": 956 + }, + { + "epoch": 0.453151820065108, + "grad_norm": 1.318973183631897, + "learning_rate": 1.9240506846790784e-05, + "loss": 0.2215, + "step": 957 + }, + { + "epoch": 0.45362533293873925, + "grad_norm": 1.0379008054733276, + "learning_rate": 1.9238550670988166e-05, + "loss": 0.2594, + "step": 958 + }, + { + "epoch": 0.45409884581237053, + "grad_norm": 1.1944754123687744, + "learning_rate": 1.923659207893174e-05, + "loss": 0.2458, + "step": 959 + }, + { + "epoch": 0.45457235868600177, + "grad_norm": 1.7871370315551758, + "learning_rate": 1.9234631071133768e-05, + "loss": 0.2622, + "step": 960 + }, + { + "epoch": 0.45504587155963305, + "grad_norm": 1.2604117393493652, + "learning_rate": 1.9232667648107127e-05, + "loss": 0.2823, + "step": 961 + }, + { + "epoch": 0.4555193844332643, + "grad_norm": 1.3337626457214355, + "learning_rate": 1.923070181036533e-05, + "loss": 0.2737, + "step": 962 + }, + { + "epoch": 0.4559928973068955, + "grad_norm": 1.0898325443267822, + "learning_rate": 1.9228733558422525e-05, + "loss": 0.2567, + "step": 963 + }, + { + "epoch": 0.4564664101805268, + "grad_norm": 1.1792700290679932, + "learning_rate": 1.9226762892793492e-05, + "loss": 0.2511, + "step": 964 + }, + { + "epoch": 0.45693992305415804, + "grad_norm": 1.4441752433776855, + "learning_rate": 1.922478981399363e-05, + "loss": 0.2526, + "step": 965 + }, + { + "epoch": 0.45741343592778927, + "grad_norm": 1.9730075597763062, + "learning_rate": 1.9222814322538993e-05, + "loss": 0.3023, + "step": 966 + }, + { + "epoch": 0.45788694880142056, + "grad_norm": 1.2090874910354614, + "learning_rate": 1.922083641894624e-05, + "loss": 0.2561, + "step": 967 + }, + { + "epoch": 0.4583604616750518, + "grad_norm": 1.5436848402023315, + "learning_rate": 1.9218856103732675e-05, + "loss": 0.2539, + "step": 968 + }, + { + "epoch": 0.458833974548683, + "grad_norm": 1.0943453311920166, + "learning_rate": 1.9216873377416236e-05, + "loss": 0.2568, + "step": 969 + }, + { + "epoch": 0.4593074874223143, + "grad_norm": 1.5270817279815674, + "learning_rate": 1.9214888240515478e-05, + "loss": 0.2381, + "step": 970 + }, + { + "epoch": 0.45978100029594554, + "grad_norm": 1.254157543182373, + "learning_rate": 1.9212900693549602e-05, + "loss": 0.2476, + "step": 971 + }, + { + "epoch": 0.4602545131695768, + "grad_norm": 1.5397264957427979, + "learning_rate": 1.9210910737038424e-05, + "loss": 0.2651, + "step": 972 + }, + { + "epoch": 0.46072802604320806, + "grad_norm": 1.4279857873916626, + "learning_rate": 1.9208918371502404e-05, + "loss": 0.249, + "step": 973 + }, + { + "epoch": 0.4612015389168393, + "grad_norm": 1.1333208084106445, + "learning_rate": 1.9206923597462625e-05, + "loss": 0.2391, + "step": 974 + }, + { + "epoch": 0.4616750517904706, + "grad_norm": 1.3642656803131104, + "learning_rate": 1.9204926415440798e-05, + "loss": 0.2462, + "step": 975 + }, + { + "epoch": 0.4621485646641018, + "grad_norm": 1.0775152444839478, + "learning_rate": 1.920292682595927e-05, + "loss": 0.2345, + "step": 976 + }, + { + "epoch": 0.46262207753773305, + "grad_norm": 1.583609938621521, + "learning_rate": 1.9200924829541012e-05, + "loss": 0.2341, + "step": 977 + }, + { + "epoch": 0.46309559041136433, + "grad_norm": 1.214303731918335, + "learning_rate": 1.919892042670963e-05, + "loss": 0.2515, + "step": 978 + }, + { + "epoch": 0.46356910328499557, + "grad_norm": 1.1225674152374268, + "learning_rate": 1.919691361798935e-05, + "loss": 0.2615, + "step": 979 + }, + { + "epoch": 0.4640426161586268, + "grad_norm": 1.8481762409210205, + "learning_rate": 1.9194904403905038e-05, + "loss": 0.2602, + "step": 980 + }, + { + "epoch": 0.4645161290322581, + "grad_norm": 1.6956995725631714, + "learning_rate": 1.9192892784982185e-05, + "loss": 0.2413, + "step": 981 + }, + { + "epoch": 0.4649896419058893, + "grad_norm": 1.4857017993927002, + "learning_rate": 1.919087876174691e-05, + "loss": 0.271, + "step": 982 + }, + { + "epoch": 0.46546315477952055, + "grad_norm": 1.868812918663025, + "learning_rate": 1.918886233472596e-05, + "loss": 0.2553, + "step": 983 + }, + { + "epoch": 0.46593666765315184, + "grad_norm": 2.422353982925415, + "learning_rate": 1.9186843504446716e-05, + "loss": 0.2641, + "step": 984 + }, + { + "epoch": 0.46641018052678307, + "grad_norm": 1.8645198345184326, + "learning_rate": 1.9184822271437176e-05, + "loss": 0.2752, + "step": 985 + }, + { + "epoch": 0.4668836934004143, + "grad_norm": 1.2485929727554321, + "learning_rate": 1.9182798636225983e-05, + "loss": 0.2439, + "step": 986 + }, + { + "epoch": 0.4673572062740456, + "grad_norm": 1.1110594272613525, + "learning_rate": 1.918077259934239e-05, + "loss": 0.2504, + "step": 987 + }, + { + "epoch": 0.4678307191476768, + "grad_norm": 1.529255747795105, + "learning_rate": 1.9178744161316297e-05, + "loss": 0.2643, + "step": 988 + }, + { + "epoch": 0.46830423202130805, + "grad_norm": 1.2772247791290283, + "learning_rate": 1.9176713322678212e-05, + "loss": 0.2207, + "step": 989 + }, + { + "epoch": 0.46877774489493934, + "grad_norm": 1.129361867904663, + "learning_rate": 1.917468008395929e-05, + "loss": 0.2468, + "step": 990 + }, + { + "epoch": 0.4692512577685706, + "grad_norm": 1.451148271560669, + "learning_rate": 1.9172644445691305e-05, + "loss": 0.2493, + "step": 991 + }, + { + "epoch": 0.46972477064220186, + "grad_norm": 1.5785322189331055, + "learning_rate": 1.9170606408406648e-05, + "loss": 0.2314, + "step": 992 + }, + { + "epoch": 0.4701982835158331, + "grad_norm": 1.7173629999160767, + "learning_rate": 1.916856597263836e-05, + "loss": 0.2629, + "step": 993 + }, + { + "epoch": 0.4706717963894643, + "grad_norm": 1.471721887588501, + "learning_rate": 1.916652313892009e-05, + "loss": 0.2752, + "step": 994 + }, + { + "epoch": 0.4711453092630956, + "grad_norm": 1.9189519882202148, + "learning_rate": 1.9164477907786128e-05, + "loss": 0.2714, + "step": 995 + }, + { + "epoch": 0.47161882213672685, + "grad_norm": 1.542122721672058, + "learning_rate": 1.9162430279771378e-05, + "loss": 0.2648, + "step": 996 + }, + { + "epoch": 0.4720923350103581, + "grad_norm": 1.2591853141784668, + "learning_rate": 1.916038025541138e-05, + "loss": 0.2792, + "step": 997 + }, + { + "epoch": 0.47256584788398937, + "grad_norm": 1.6615357398986816, + "learning_rate": 1.9158327835242296e-05, + "loss": 0.2639, + "step": 998 + }, + { + "epoch": 0.4730393607576206, + "grad_norm": 1.494132161140442, + "learning_rate": 1.915627301980092e-05, + "loss": 0.2994, + "step": 999 + }, + { + "epoch": 0.47351287363125183, + "grad_norm": 1.3454561233520508, + "learning_rate": 1.915421580962467e-05, + "loss": 0.2394, + "step": 1000 + }, + { + "epoch": 0.4739863865048831, + "grad_norm": 1.6147300004959106, + "learning_rate": 1.9152156205251583e-05, + "loss": 0.2415, + "step": 1001 + }, + { + "epoch": 0.47445989937851435, + "grad_norm": 1.5121729373931885, + "learning_rate": 1.9150094207220338e-05, + "loss": 0.2285, + "step": 1002 + }, + { + "epoch": 0.4749334122521456, + "grad_norm": 2.2810189723968506, + "learning_rate": 1.9148029816070223e-05, + "loss": 0.2352, + "step": 1003 + }, + { + "epoch": 0.47540692512577687, + "grad_norm": 1.55019211769104, + "learning_rate": 1.9145963032341163e-05, + "loss": 0.273, + "step": 1004 + }, + { + "epoch": 0.4758804379994081, + "grad_norm": 1.1222288608551025, + "learning_rate": 1.9143893856573702e-05, + "loss": 0.2655, + "step": 1005 + }, + { + "epoch": 0.47635395087303933, + "grad_norm": 2.17794132232666, + "learning_rate": 1.9141822289309016e-05, + "loss": 0.2886, + "step": 1006 + }, + { + "epoch": 0.4768274637466706, + "grad_norm": 1.7431869506835938, + "learning_rate": 1.9139748331088906e-05, + "loss": 0.2651, + "step": 1007 + }, + { + "epoch": 0.47730097662030185, + "grad_norm": 1.3544774055480957, + "learning_rate": 1.913767198245579e-05, + "loss": 0.2441, + "step": 1008 + }, + { + "epoch": 0.47777448949393314, + "grad_norm": 1.2691947221755981, + "learning_rate": 1.9135593243952724e-05, + "loss": 0.222, + "step": 1009 + }, + { + "epoch": 0.4782480023675644, + "grad_norm": 1.9088751077651978, + "learning_rate": 1.913351211612337e-05, + "loss": 0.2371, + "step": 1010 + }, + { + "epoch": 0.4787215152411956, + "grad_norm": 1.6651183366775513, + "learning_rate": 1.9131428599512042e-05, + "loss": 0.2516, + "step": 1011 + }, + { + "epoch": 0.4791950281148269, + "grad_norm": 1.7003988027572632, + "learning_rate": 1.9129342694663655e-05, + "loss": 0.2557, + "step": 1012 + }, + { + "epoch": 0.4796685409884581, + "grad_norm": 1.6492764949798584, + "learning_rate": 1.9127254402123755e-05, + "loss": 0.277, + "step": 1013 + }, + { + "epoch": 0.48014205386208936, + "grad_norm": 1.4109059572219849, + "learning_rate": 1.912516372243852e-05, + "loss": 0.2222, + "step": 1014 + }, + { + "epoch": 0.48061556673572065, + "grad_norm": 1.9010688066482544, + "learning_rate": 1.9123070656154748e-05, + "loss": 0.2614, + "step": 1015 + }, + { + "epoch": 0.4810890796093519, + "grad_norm": 1.4977048635482788, + "learning_rate": 1.9120975203819855e-05, + "loss": 0.2262, + "step": 1016 + }, + { + "epoch": 0.4815625924829831, + "grad_norm": 1.206935167312622, + "learning_rate": 1.9118877365981887e-05, + "loss": 0.2591, + "step": 1017 + }, + { + "epoch": 0.4820361053566144, + "grad_norm": 1.7453712224960327, + "learning_rate": 1.9116777143189517e-05, + "loss": 0.2233, + "step": 1018 + }, + { + "epoch": 0.48250961823024563, + "grad_norm": 1.1337833404541016, + "learning_rate": 1.9114674535992038e-05, + "loss": 0.2608, + "step": 1019 + }, + { + "epoch": 0.48298313110387686, + "grad_norm": 2.339742422103882, + "learning_rate": 1.9112569544939364e-05, + "loss": 0.2628, + "step": 1020 + }, + { + "epoch": 0.48345664397750815, + "grad_norm": 1.3370864391326904, + "learning_rate": 1.9110462170582036e-05, + "loss": 0.2664, + "step": 1021 + }, + { + "epoch": 0.4839301568511394, + "grad_norm": 1.41420578956604, + "learning_rate": 1.9108352413471215e-05, + "loss": 0.2721, + "step": 1022 + }, + { + "epoch": 0.48440366972477067, + "grad_norm": 1.787891149520874, + "learning_rate": 1.9106240274158693e-05, + "loss": 0.2511, + "step": 1023 + }, + { + "epoch": 0.4848771825984019, + "grad_norm": 1.4634735584259033, + "learning_rate": 1.9104125753196876e-05, + "loss": 0.2523, + "step": 1024 + }, + { + "epoch": 0.48535069547203313, + "grad_norm": 1.4561212062835693, + "learning_rate": 1.9102008851138797e-05, + "loss": 0.2913, + "step": 1025 + }, + { + "epoch": 0.4858242083456644, + "grad_norm": 1.1067405939102173, + "learning_rate": 1.9099889568538113e-05, + "loss": 0.2476, + "step": 1026 + }, + { + "epoch": 0.48629772121929565, + "grad_norm": 1.2803436517715454, + "learning_rate": 1.90977679059491e-05, + "loss": 0.2446, + "step": 1027 + }, + { + "epoch": 0.4867712340929269, + "grad_norm": 1.3610336780548096, + "learning_rate": 1.909564386392666e-05, + "loss": 0.2365, + "step": 1028 + }, + { + "epoch": 0.4872447469665582, + "grad_norm": 1.3339933156967163, + "learning_rate": 1.909351744302631e-05, + "loss": 0.2395, + "step": 1029 + }, + { + "epoch": 0.4877182598401894, + "grad_norm": 2.5949342250823975, + "learning_rate": 1.9091388643804202e-05, + "loss": 0.2646, + "step": 1030 + }, + { + "epoch": 0.48819177271382064, + "grad_norm": 1.3655803203582764, + "learning_rate": 1.9089257466817102e-05, + "loss": 0.2368, + "step": 1031 + }, + { + "epoch": 0.4886652855874519, + "grad_norm": 1.8703043460845947, + "learning_rate": 1.9087123912622397e-05, + "loss": 0.301, + "step": 1032 + }, + { + "epoch": 0.48913879846108316, + "grad_norm": 1.272451400756836, + "learning_rate": 1.9084987981778097e-05, + "loss": 0.2422, + "step": 1033 + }, + { + "epoch": 0.4896123113347144, + "grad_norm": 1.2157567739486694, + "learning_rate": 1.9082849674842835e-05, + "loss": 0.2515, + "step": 1034 + }, + { + "epoch": 0.4900858242083457, + "grad_norm": 1.857765555381775, + "learning_rate": 1.9080708992375863e-05, + "loss": 0.2427, + "step": 1035 + }, + { + "epoch": 0.4905593370819769, + "grad_norm": 1.2299634218215942, + "learning_rate": 1.907856593493706e-05, + "loss": 0.2659, + "step": 1036 + }, + { + "epoch": 0.49103284995560814, + "grad_norm": 1.3312522172927856, + "learning_rate": 1.9076420503086915e-05, + "loss": 0.2546, + "step": 1037 + }, + { + "epoch": 0.49150636282923943, + "grad_norm": 2.030163049697876, + "learning_rate": 1.9074272697386554e-05, + "loss": 0.2592, + "step": 1038 + }, + { + "epoch": 0.49197987570287066, + "grad_norm": 1.262022852897644, + "learning_rate": 1.9072122518397706e-05, + "loss": 0.2468, + "step": 1039 + }, + { + "epoch": 0.49245338857650195, + "grad_norm": 1.5466980934143066, + "learning_rate": 1.9069969966682738e-05, + "loss": 0.2692, + "step": 1040 + }, + { + "epoch": 0.4929269014501332, + "grad_norm": 1.136094093322754, + "learning_rate": 1.9067815042804622e-05, + "loss": 0.2811, + "step": 1041 + }, + { + "epoch": 0.4934004143237644, + "grad_norm": 3.164823532104492, + "learning_rate": 1.906565774732696e-05, + "loss": 0.252, + "step": 1042 + }, + { + "epoch": 0.4938739271973957, + "grad_norm": 1.530435562133789, + "learning_rate": 1.9063498080813973e-05, + "loss": 0.2797, + "step": 1043 + }, + { + "epoch": 0.49434744007102693, + "grad_norm": 1.6616190671920776, + "learning_rate": 1.9061336043830498e-05, + "loss": 0.2717, + "step": 1044 + }, + { + "epoch": 0.49482095294465817, + "grad_norm": 1.4890795946121216, + "learning_rate": 1.9059171636942e-05, + "loss": 0.2734, + "step": 1045 + }, + { + "epoch": 0.49529446581828945, + "grad_norm": 2.622288703918457, + "learning_rate": 1.905700486071455e-05, + "loss": 0.2473, + "step": 1046 + }, + { + "epoch": 0.4957679786919207, + "grad_norm": 1.4621379375457764, + "learning_rate": 1.905483571571486e-05, + "loss": 0.2511, + "step": 1047 + }, + { + "epoch": 0.4962414915655519, + "grad_norm": 1.3425689935684204, + "learning_rate": 1.905266420251024e-05, + "loss": 0.2709, + "step": 1048 + }, + { + "epoch": 0.4967150044391832, + "grad_norm": 1.391316294670105, + "learning_rate": 1.905049032166863e-05, + "loss": 0.2401, + "step": 1049 + }, + { + "epoch": 0.49718851731281444, + "grad_norm": 1.435872197151184, + "learning_rate": 1.9048314073758586e-05, + "loss": 0.2355, + "step": 1050 + }, + { + "epoch": 0.49766203018644567, + "grad_norm": 1.4960476160049438, + "learning_rate": 1.9046135459349287e-05, + "loss": 0.2619, + "step": 1051 + }, + { + "epoch": 0.49813554306007696, + "grad_norm": 2.056912899017334, + "learning_rate": 1.9043954479010532e-05, + "loss": 0.258, + "step": 1052 + }, + { + "epoch": 0.4986090559337082, + "grad_norm": 1.3434464931488037, + "learning_rate": 1.9041771133312732e-05, + "loss": 0.2469, + "step": 1053 + }, + { + "epoch": 0.4990825688073395, + "grad_norm": 1.5715394020080566, + "learning_rate": 1.9039585422826916e-05, + "loss": 0.2293, + "step": 1054 + }, + { + "epoch": 0.4995560816809707, + "grad_norm": 1.3329315185546875, + "learning_rate": 1.903739734812474e-05, + "loss": 0.2552, + "step": 1055 + }, + { + "epoch": 0.5000295945546019, + "grad_norm": 1.1879801750183105, + "learning_rate": 1.9035206909778475e-05, + "loss": 0.2491, + "step": 1056 + }, + { + "epoch": 0.5005031074282332, + "grad_norm": 1.3756626844406128, + "learning_rate": 1.9033014108361003e-05, + "loss": 0.2371, + "step": 1057 + }, + { + "epoch": 0.5009766203018644, + "grad_norm": 1.3430827856063843, + "learning_rate": 1.9030818944445836e-05, + "loss": 0.2535, + "step": 1058 + }, + { + "epoch": 0.5014501331754957, + "grad_norm": 1.2272851467132568, + "learning_rate": 1.9028621418607095e-05, + "loss": 0.2473, + "step": 1059 + }, + { + "epoch": 0.501923646049127, + "grad_norm": 1.979054570198059, + "learning_rate": 1.9026421531419522e-05, + "loss": 0.2675, + "step": 1060 + }, + { + "epoch": 0.5023971589227582, + "grad_norm": 1.8961822986602783, + "learning_rate": 1.902421928345848e-05, + "loss": 0.2752, + "step": 1061 + }, + { + "epoch": 0.5028706717963894, + "grad_norm": 1.5014052391052246, + "learning_rate": 1.902201467529994e-05, + "loss": 0.2348, + "step": 1062 + }, + { + "epoch": 0.5033441846700207, + "grad_norm": 1.413966178894043, + "learning_rate": 1.90198077075205e-05, + "loss": 0.2568, + "step": 1063 + }, + { + "epoch": 0.503817697543652, + "grad_norm": 1.097167730331421, + "learning_rate": 1.901759838069737e-05, + "loss": 0.2602, + "step": 1064 + }, + { + "epoch": 0.5042912104172832, + "grad_norm": 2.0718507766723633, + "learning_rate": 1.9015386695408377e-05, + "loss": 0.2524, + "step": 1065 + }, + { + "epoch": 0.5047647232909145, + "grad_norm": 1.8669925928115845, + "learning_rate": 1.9013172652231967e-05, + "loss": 0.2333, + "step": 1066 + }, + { + "epoch": 0.5052382361645458, + "grad_norm": 1.3957126140594482, + "learning_rate": 1.9010956251747202e-05, + "loss": 0.266, + "step": 1067 + }, + { + "epoch": 0.505711749038177, + "grad_norm": 2.0462698936462402, + "learning_rate": 1.9008737494533757e-05, + "loss": 0.2846, + "step": 1068 + }, + { + "epoch": 0.5061852619118082, + "grad_norm": 1.195210576057434, + "learning_rate": 1.9006516381171933e-05, + "loss": 0.2851, + "step": 1069 + }, + { + "epoch": 0.5066587747854395, + "grad_norm": 1.3141087293624878, + "learning_rate": 1.9004292912242634e-05, + "loss": 0.2393, + "step": 1070 + }, + { + "epoch": 0.5071322876590707, + "grad_norm": 1.6447049379348755, + "learning_rate": 1.900206708832739e-05, + "loss": 0.2711, + "step": 1071 + }, + { + "epoch": 0.507605800532702, + "grad_norm": 2.366652727127075, + "learning_rate": 1.8999838910008347e-05, + "loss": 0.2233, + "step": 1072 + }, + { + "epoch": 0.5080793134063333, + "grad_norm": 1.6027371883392334, + "learning_rate": 1.8997608377868256e-05, + "loss": 0.2689, + "step": 1073 + }, + { + "epoch": 0.5085528262799645, + "grad_norm": 1.4096899032592773, + "learning_rate": 1.8995375492490495e-05, + "loss": 0.2539, + "step": 1074 + }, + { + "epoch": 0.5090263391535957, + "grad_norm": 1.179916501045227, + "learning_rate": 1.8993140254459057e-05, + "loss": 0.2667, + "step": 1075 + }, + { + "epoch": 0.509499852027227, + "grad_norm": 1.141992449760437, + "learning_rate": 1.8990902664358542e-05, + "loss": 0.2461, + "step": 1076 + }, + { + "epoch": 0.5099733649008582, + "grad_norm": 1.3046503067016602, + "learning_rate": 1.8988662722774172e-05, + "loss": 0.2607, + "step": 1077 + }, + { + "epoch": 0.5104468777744895, + "grad_norm": 1.1678529977798462, + "learning_rate": 1.898642043029178e-05, + "loss": 0.2352, + "step": 1078 + }, + { + "epoch": 0.5109203906481208, + "grad_norm": 1.4803732633590698, + "learning_rate": 1.8984175787497822e-05, + "loss": 0.2612, + "step": 1079 + }, + { + "epoch": 0.511393903521752, + "grad_norm": 1.7487013339996338, + "learning_rate": 1.898192879497936e-05, + "loss": 0.2348, + "step": 1080 + }, + { + "epoch": 0.5118674163953832, + "grad_norm": 1.448327898979187, + "learning_rate": 1.8979679453324068e-05, + "loss": 0.261, + "step": 1081 + }, + { + "epoch": 0.5123409292690145, + "grad_norm": 1.1041336059570312, + "learning_rate": 1.8977427763120242e-05, + "loss": 0.2754, + "step": 1082 + }, + { + "epoch": 0.5128144421426457, + "grad_norm": 1.948614239692688, + "learning_rate": 1.8975173724956794e-05, + "loss": 0.2594, + "step": 1083 + }, + { + "epoch": 0.513287955016277, + "grad_norm": 1.1922783851623535, + "learning_rate": 1.897291733942324e-05, + "loss": 0.2457, + "step": 1084 + }, + { + "epoch": 0.5137614678899083, + "grad_norm": 1.4861289262771606, + "learning_rate": 1.8970658607109723e-05, + "loss": 0.2459, + "step": 1085 + }, + { + "epoch": 0.5142349807635395, + "grad_norm": 1.497668981552124, + "learning_rate": 1.896839752860699e-05, + "loss": 0.2828, + "step": 1086 + }, + { + "epoch": 0.5147084936371707, + "grad_norm": 1.194231390953064, + "learning_rate": 1.89661341045064e-05, + "loss": 0.2524, + "step": 1087 + }, + { + "epoch": 0.515182006510802, + "grad_norm": 1.5421984195709229, + "learning_rate": 1.8963868335399933e-05, + "loss": 0.2355, + "step": 1088 + }, + { + "epoch": 0.5156555193844332, + "grad_norm": 1.1698225736618042, + "learning_rate": 1.8961600221880177e-05, + "loss": 0.253, + "step": 1089 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 1.2847546339035034, + "learning_rate": 1.8959329764540343e-05, + "loss": 0.2463, + "step": 1090 + }, + { + "epoch": 0.5166025451316958, + "grad_norm": 1.8415346145629883, + "learning_rate": 1.895705696397424e-05, + "loss": 0.2602, + "step": 1091 + }, + { + "epoch": 0.517076058005327, + "grad_norm": 1.1953470706939697, + "learning_rate": 1.8954781820776293e-05, + "loss": 0.2544, + "step": 1092 + }, + { + "epoch": 0.5175495708789583, + "grad_norm": 1.2167423963546753, + "learning_rate": 1.8952504335541554e-05, + "loss": 0.2353, + "step": 1093 + }, + { + "epoch": 0.5180230837525895, + "grad_norm": 1.259513258934021, + "learning_rate": 1.8950224508865667e-05, + "loss": 0.2395, + "step": 1094 + }, + { + "epoch": 0.5184965966262208, + "grad_norm": 1.6243139505386353, + "learning_rate": 1.894794234134491e-05, + "loss": 0.2362, + "step": 1095 + }, + { + "epoch": 0.518970109499852, + "grad_norm": 1.0280331373214722, + "learning_rate": 1.8945657833576155e-05, + "loss": 0.2476, + "step": 1096 + }, + { + "epoch": 0.5194436223734833, + "grad_norm": 1.332411289215088, + "learning_rate": 1.894337098615689e-05, + "loss": 0.2551, + "step": 1097 + }, + { + "epoch": 0.5199171352471146, + "grad_norm": 1.2964811325073242, + "learning_rate": 1.8941081799685227e-05, + "loss": 0.2598, + "step": 1098 + }, + { + "epoch": 0.5203906481207458, + "grad_norm": 1.6705334186553955, + "learning_rate": 1.893879027475987e-05, + "loss": 0.2639, + "step": 1099 + }, + { + "epoch": 0.520864160994377, + "grad_norm": 1.3009783029556274, + "learning_rate": 1.8936496411980156e-05, + "loss": 0.2803, + "step": 1100 + }, + { + "epoch": 0.5213376738680083, + "grad_norm": 1.179648995399475, + "learning_rate": 1.8934200211946013e-05, + "loss": 0.2561, + "step": 1101 + }, + { + "epoch": 0.5218111867416395, + "grad_norm": 1.4020172357559204, + "learning_rate": 1.8931901675257996e-05, + "loss": 0.2533, + "step": 1102 + }, + { + "epoch": 0.5222846996152708, + "grad_norm": 1.20439875125885, + "learning_rate": 1.8929600802517262e-05, + "loss": 0.2684, + "step": 1103 + }, + { + "epoch": 0.5227582124889021, + "grad_norm": 1.7954668998718262, + "learning_rate": 1.8927297594325583e-05, + "loss": 0.2242, + "step": 1104 + }, + { + "epoch": 0.5232317253625333, + "grad_norm": 1.4614356756210327, + "learning_rate": 1.8924992051285345e-05, + "loss": 0.2541, + "step": 1105 + }, + { + "epoch": 0.5237052382361645, + "grad_norm": 1.306846022605896, + "learning_rate": 1.8922684173999538e-05, + "loss": 0.2382, + "step": 1106 + }, + { + "epoch": 0.5241787511097958, + "grad_norm": 1.7092262506484985, + "learning_rate": 1.8920373963071757e-05, + "loss": 0.2512, + "step": 1107 + }, + { + "epoch": 0.524652263983427, + "grad_norm": 1.5031977891921997, + "learning_rate": 1.8918061419106228e-05, + "loss": 0.2581, + "step": 1108 + }, + { + "epoch": 0.5251257768570583, + "grad_norm": 1.5629005432128906, + "learning_rate": 1.8915746542707767e-05, + "loss": 0.2439, + "step": 1109 + }, + { + "epoch": 0.5255992897306896, + "grad_norm": 1.4823302030563354, + "learning_rate": 1.891342933448181e-05, + "loss": 0.2734, + "step": 1110 + }, + { + "epoch": 0.5260728026043208, + "grad_norm": 1.880656123161316, + "learning_rate": 1.89111097950344e-05, + "loss": 0.2425, + "step": 1111 + }, + { + "epoch": 0.526546315477952, + "grad_norm": 1.1813774108886719, + "learning_rate": 1.8908787924972188e-05, + "loss": 0.2375, + "step": 1112 + }, + { + "epoch": 0.5270198283515833, + "grad_norm": 1.9654037952423096, + "learning_rate": 1.8906463724902443e-05, + "loss": 0.2817, + "step": 1113 + }, + { + "epoch": 0.5274933412252145, + "grad_norm": 1.7562179565429688, + "learning_rate": 1.890413719543303e-05, + "loss": 0.256, + "step": 1114 + }, + { + "epoch": 0.5279668540988458, + "grad_norm": 1.140051245689392, + "learning_rate": 1.8901808337172433e-05, + "loss": 0.2455, + "step": 1115 + }, + { + "epoch": 0.5284403669724771, + "grad_norm": 1.8479341268539429, + "learning_rate": 1.8899477150729745e-05, + "loss": 0.2571, + "step": 1116 + }, + { + "epoch": 0.5289138798461083, + "grad_norm": 1.7184622287750244, + "learning_rate": 1.889714363671466e-05, + "loss": 0.2463, + "step": 1117 + }, + { + "epoch": 0.5293873927197396, + "grad_norm": 1.3186061382293701, + "learning_rate": 1.8894807795737492e-05, + "loss": 0.2657, + "step": 1118 + }, + { + "epoch": 0.5298609055933708, + "grad_norm": 1.8647300004959106, + "learning_rate": 1.8892469628409157e-05, + "loss": 0.2505, + "step": 1119 + }, + { + "epoch": 0.530334418467002, + "grad_norm": 2.4725730419158936, + "learning_rate": 1.889012913534117e-05, + "loss": 0.2475, + "step": 1120 + }, + { + "epoch": 0.5308079313406333, + "grad_norm": 1.2804913520812988, + "learning_rate": 1.8887786317145677e-05, + "loss": 0.2607, + "step": 1121 + }, + { + "epoch": 0.5312814442142646, + "grad_norm": 1.6455892324447632, + "learning_rate": 1.8885441174435417e-05, + "loss": 0.2273, + "step": 1122 + }, + { + "epoch": 0.5317549570878958, + "grad_norm": 1.4230307340621948, + "learning_rate": 1.8883093707823733e-05, + "loss": 0.2571, + "step": 1123 + }, + { + "epoch": 0.5322284699615271, + "grad_norm": 2.230576992034912, + "learning_rate": 1.8880743917924585e-05, + "loss": 0.2418, + "step": 1124 + }, + { + "epoch": 0.5327019828351583, + "grad_norm": 1.6521943807601929, + "learning_rate": 1.8878391805352544e-05, + "loss": 0.2537, + "step": 1125 + }, + { + "epoch": 0.5331754957087896, + "grad_norm": 1.336004376411438, + "learning_rate": 1.887603737072278e-05, + "loss": 0.243, + "step": 1126 + }, + { + "epoch": 0.5336490085824208, + "grad_norm": 1.650602102279663, + "learning_rate": 1.8873680614651065e-05, + "loss": 0.2544, + "step": 1127 + }, + { + "epoch": 0.5341225214560521, + "grad_norm": 1.5167171955108643, + "learning_rate": 1.8871321537753792e-05, + "loss": 0.2596, + "step": 1128 + }, + { + "epoch": 0.5345960343296834, + "grad_norm": 1.3937066793441772, + "learning_rate": 1.8868960140647953e-05, + "loss": 0.2398, + "step": 1129 + }, + { + "epoch": 0.5350695472033146, + "grad_norm": 1.1855547428131104, + "learning_rate": 1.886659642395115e-05, + "loss": 0.2312, + "step": 1130 + }, + { + "epoch": 0.5355430600769459, + "grad_norm": 1.1568256616592407, + "learning_rate": 1.8864230388281594e-05, + "loss": 0.2481, + "step": 1131 + }, + { + "epoch": 0.5360165729505771, + "grad_norm": 1.2066378593444824, + "learning_rate": 1.886186203425809e-05, + "loss": 0.2306, + "step": 1132 + }, + { + "epoch": 0.5364900858242083, + "grad_norm": 1.1401264667510986, + "learning_rate": 1.8859491362500066e-05, + "loss": 0.2538, + "step": 1133 + }, + { + "epoch": 0.5369635986978396, + "grad_norm": 1.3635470867156982, + "learning_rate": 1.885711837362754e-05, + "loss": 0.247, + "step": 1134 + }, + { + "epoch": 0.5374371115714709, + "grad_norm": 1.619167685508728, + "learning_rate": 1.8854743068261154e-05, + "loss": 0.2123, + "step": 1135 + }, + { + "epoch": 0.5379106244451021, + "grad_norm": 1.432413101196289, + "learning_rate": 1.885236544702214e-05, + "loss": 0.2377, + "step": 1136 + }, + { + "epoch": 0.5383841373187334, + "grad_norm": 2.2878897190093994, + "learning_rate": 1.8849985510532348e-05, + "loss": 0.2466, + "step": 1137 + }, + { + "epoch": 0.5388576501923646, + "grad_norm": 1.848306655883789, + "learning_rate": 1.8847603259414215e-05, + "loss": 0.2458, + "step": 1138 + }, + { + "epoch": 0.5393311630659958, + "grad_norm": 1.1554219722747803, + "learning_rate": 1.884521869429081e-05, + "loss": 0.2542, + "step": 1139 + }, + { + "epoch": 0.5398046759396271, + "grad_norm": 1.239094614982605, + "learning_rate": 1.8842831815785783e-05, + "loss": 0.2569, + "step": 1140 + }, + { + "epoch": 0.5402781888132584, + "grad_norm": 1.240552544593811, + "learning_rate": 1.8840442624523408e-05, + "loss": 0.2391, + "step": 1141 + }, + { + "epoch": 0.5407517016868896, + "grad_norm": 1.8577840328216553, + "learning_rate": 1.8838051121128545e-05, + "loss": 0.224, + "step": 1142 + }, + { + "epoch": 0.5412252145605209, + "grad_norm": 1.2552670240402222, + "learning_rate": 1.883565730622668e-05, + "loss": 0.2271, + "step": 1143 + }, + { + "epoch": 0.5416987274341521, + "grad_norm": 1.31777822971344, + "learning_rate": 1.8833261180443877e-05, + "loss": 0.2579, + "step": 1144 + }, + { + "epoch": 0.5421722403077833, + "grad_norm": 1.180493950843811, + "learning_rate": 1.8830862744406834e-05, + "loss": 0.2437, + "step": 1145 + }, + { + "epoch": 0.5426457531814146, + "grad_norm": 1.399220585823059, + "learning_rate": 1.8828461998742827e-05, + "loss": 0.2548, + "step": 1146 + }, + { + "epoch": 0.5431192660550459, + "grad_norm": 1.4404245615005493, + "learning_rate": 1.8826058944079763e-05, + "loss": 0.2314, + "step": 1147 + }, + { + "epoch": 0.5435927789286771, + "grad_norm": 1.5792372226715088, + "learning_rate": 1.8823653581046122e-05, + "loss": 0.2487, + "step": 1148 + }, + { + "epoch": 0.5440662918023084, + "grad_norm": 1.5916982889175415, + "learning_rate": 1.8821245910271013e-05, + "loss": 0.2647, + "step": 1149 + }, + { + "epoch": 0.5445398046759397, + "grad_norm": 1.326357364654541, + "learning_rate": 1.8818835932384133e-05, + "loss": 0.2418, + "step": 1150 + }, + { + "epoch": 0.5450133175495708, + "grad_norm": 1.949175238609314, + "learning_rate": 1.8816423648015795e-05, + "loss": 0.252, + "step": 1151 + }, + { + "epoch": 0.5454868304232021, + "grad_norm": 1.1373693943023682, + "learning_rate": 1.88140090577969e-05, + "loss": 0.2407, + "step": 1152 + }, + { + "epoch": 0.5459603432968334, + "grad_norm": 1.1092511415481567, + "learning_rate": 1.8811592162358977e-05, + "loss": 0.2749, + "step": 1153 + }, + { + "epoch": 0.5464338561704646, + "grad_norm": 2.095144510269165, + "learning_rate": 1.8809172962334124e-05, + "loss": 0.2357, + "step": 1154 + }, + { + "epoch": 0.5469073690440959, + "grad_norm": 2.180050849914551, + "learning_rate": 1.8806751458355064e-05, + "loss": 0.2326, + "step": 1155 + }, + { + "epoch": 0.5473808819177272, + "grad_norm": 1.321502923965454, + "learning_rate": 1.8804327651055123e-05, + "loss": 0.2304, + "step": 1156 + }, + { + "epoch": 0.5478543947913584, + "grad_norm": 1.3724721670150757, + "learning_rate": 1.8801901541068224e-05, + "loss": 0.2372, + "step": 1157 + }, + { + "epoch": 0.5483279076649896, + "grad_norm": 1.1256623268127441, + "learning_rate": 1.8799473129028886e-05, + "loss": 0.2449, + "step": 1158 + }, + { + "epoch": 0.5488014205386209, + "grad_norm": 1.4590364694595337, + "learning_rate": 1.8797042415572244e-05, + "loss": 0.2559, + "step": 1159 + }, + { + "epoch": 0.5492749334122522, + "grad_norm": 1.8025602102279663, + "learning_rate": 1.879460940133402e-05, + "loss": 0.2778, + "step": 1160 + }, + { + "epoch": 0.5497484462858834, + "grad_norm": 1.1516093015670776, + "learning_rate": 1.879217408695056e-05, + "loss": 0.2249, + "step": 1161 + }, + { + "epoch": 0.5502219591595147, + "grad_norm": 1.4225125312805176, + "learning_rate": 1.8789736473058776e-05, + "loss": 0.256, + "step": 1162 + }, + { + "epoch": 0.550695472033146, + "grad_norm": 1.2126541137695312, + "learning_rate": 1.8787296560296223e-05, + "loss": 0.2269, + "step": 1163 + }, + { + "epoch": 0.5511689849067771, + "grad_norm": 1.0485234260559082, + "learning_rate": 1.8784854349301023e-05, + "loss": 0.231, + "step": 1164 + }, + { + "epoch": 0.5516424977804084, + "grad_norm": 2.125662326812744, + "learning_rate": 1.8782409840711916e-05, + "loss": 0.2419, + "step": 1165 + }, + { + "epoch": 0.5521160106540397, + "grad_norm": 2.186833620071411, + "learning_rate": 1.877996303516824e-05, + "loss": 0.2685, + "step": 1166 + }, + { + "epoch": 0.5525895235276709, + "grad_norm": 1.3703869581222534, + "learning_rate": 1.8777513933309938e-05, + "loss": 0.2663, + "step": 1167 + }, + { + "epoch": 0.5530630364013022, + "grad_norm": 1.2371742725372314, + "learning_rate": 1.877506253577754e-05, + "loss": 0.2626, + "step": 1168 + }, + { + "epoch": 0.5535365492749335, + "grad_norm": 1.1112666130065918, + "learning_rate": 1.877260884321219e-05, + "loss": 0.2318, + "step": 1169 + }, + { + "epoch": 0.5540100621485646, + "grad_norm": 1.446602463722229, + "learning_rate": 1.8770152856255636e-05, + "loss": 0.2662, + "step": 1170 + }, + { + "epoch": 0.5544835750221959, + "grad_norm": 1.32243013381958, + "learning_rate": 1.87676945755502e-05, + "loss": 0.2765, + "step": 1171 + }, + { + "epoch": 0.5549570878958272, + "grad_norm": 1.6575604677200317, + "learning_rate": 1.8765234001738838e-05, + "loss": 0.2783, + "step": 1172 + }, + { + "epoch": 0.5554306007694584, + "grad_norm": 1.1545252799987793, + "learning_rate": 1.8762771135465078e-05, + "loss": 0.2414, + "step": 1173 + }, + { + "epoch": 0.5559041136430897, + "grad_norm": 1.1764464378356934, + "learning_rate": 1.8760305977373067e-05, + "loss": 0.2403, + "step": 1174 + }, + { + "epoch": 0.556377626516721, + "grad_norm": 1.4649169445037842, + "learning_rate": 1.875783852810754e-05, + "loss": 0.2466, + "step": 1175 + }, + { + "epoch": 0.5568511393903521, + "grad_norm": 1.1785379648208618, + "learning_rate": 1.8755368788313834e-05, + "loss": 0.2363, + "step": 1176 + }, + { + "epoch": 0.5573246522639834, + "grad_norm": 1.8420625925064087, + "learning_rate": 1.8752896758637884e-05, + "loss": 0.2261, + "step": 1177 + }, + { + "epoch": 0.5577981651376147, + "grad_norm": 1.241428256034851, + "learning_rate": 1.8750422439726227e-05, + "loss": 0.2642, + "step": 1178 + }, + { + "epoch": 0.5582716780112459, + "grad_norm": 2.547609567642212, + "learning_rate": 1.8747945832226e-05, + "loss": 0.2315, + "step": 1179 + }, + { + "epoch": 0.5587451908848772, + "grad_norm": 1.8056316375732422, + "learning_rate": 1.874546693678493e-05, + "loss": 0.2672, + "step": 1180 + }, + { + "epoch": 0.5592187037585085, + "grad_norm": 1.3478612899780273, + "learning_rate": 1.874298575405135e-05, + "loss": 0.217, + "step": 1181 + }, + { + "epoch": 0.5596922166321396, + "grad_norm": 1.2503442764282227, + "learning_rate": 1.8740502284674197e-05, + "loss": 0.2321, + "step": 1182 + }, + { + "epoch": 0.5601657295057709, + "grad_norm": 2.2722413539886475, + "learning_rate": 1.873801652930299e-05, + "loss": 0.2657, + "step": 1183 + }, + { + "epoch": 0.5606392423794022, + "grad_norm": 1.1966623067855835, + "learning_rate": 1.873552848858786e-05, + "loss": 0.2579, + "step": 1184 + }, + { + "epoch": 0.5611127552530334, + "grad_norm": 1.2810187339782715, + "learning_rate": 1.8733038163179524e-05, + "loss": 0.2727, + "step": 1185 + }, + { + "epoch": 0.5615862681266647, + "grad_norm": 1.2064369916915894, + "learning_rate": 1.8730545553729306e-05, + "loss": 0.2572, + "step": 1186 + }, + { + "epoch": 0.562059781000296, + "grad_norm": 1.379677653312683, + "learning_rate": 1.8728050660889123e-05, + "loss": 0.2622, + "step": 1187 + }, + { + "epoch": 0.5625332938739271, + "grad_norm": 0.9348089098930359, + "learning_rate": 1.8725553485311492e-05, + "loss": 0.2304, + "step": 1188 + }, + { + "epoch": 0.5630068067475584, + "grad_norm": 2.2606005668640137, + "learning_rate": 1.872305402764952e-05, + "loss": 0.2673, + "step": 1189 + }, + { + "epoch": 0.5634803196211897, + "grad_norm": 1.8432506322860718, + "learning_rate": 1.8720552288556926e-05, + "loss": 0.2779, + "step": 1190 + }, + { + "epoch": 0.563953832494821, + "grad_norm": 1.0819976329803467, + "learning_rate": 1.8718048268688006e-05, + "loss": 0.2349, + "step": 1191 + }, + { + "epoch": 0.5644273453684522, + "grad_norm": 1.164947748184204, + "learning_rate": 1.8715541968697666e-05, + "loss": 0.2575, + "step": 1192 + }, + { + "epoch": 0.5649008582420835, + "grad_norm": 1.1525195837020874, + "learning_rate": 1.8713033389241406e-05, + "loss": 0.2653, + "step": 1193 + }, + { + "epoch": 0.5653743711157148, + "grad_norm": 1.0925800800323486, + "learning_rate": 1.8710522530975315e-05, + "loss": 0.2368, + "step": 1194 + }, + { + "epoch": 0.5658478839893459, + "grad_norm": 1.066985845565796, + "learning_rate": 1.870800939455609e-05, + "loss": 0.2427, + "step": 1195 + }, + { + "epoch": 0.5663213968629772, + "grad_norm": 1.2804023027420044, + "learning_rate": 1.8705493980641017e-05, + "loss": 0.2649, + "step": 1196 + }, + { + "epoch": 0.5667949097366085, + "grad_norm": 1.0042173862457275, + "learning_rate": 1.8702976289887978e-05, + "loss": 0.2408, + "step": 1197 + }, + { + "epoch": 0.5672684226102397, + "grad_norm": 1.1509785652160645, + "learning_rate": 1.8700456322955446e-05, + "loss": 0.2527, + "step": 1198 + }, + { + "epoch": 0.567741935483871, + "grad_norm": 1.66000235080719, + "learning_rate": 1.8697934080502498e-05, + "loss": 0.2614, + "step": 1199 + }, + { + "epoch": 0.5682154483575023, + "grad_norm": 1.2652623653411865, + "learning_rate": 1.8695409563188803e-05, + "loss": 0.2366, + "step": 1200 + }, + { + "epoch": 0.5686889612311334, + "grad_norm": 1.2608882188796997, + "learning_rate": 1.8692882771674624e-05, + "loss": 0.2625, + "step": 1201 + }, + { + "epoch": 0.5691624741047647, + "grad_norm": 1.0795193910598755, + "learning_rate": 1.8690353706620815e-05, + "loss": 0.2497, + "step": 1202 + }, + { + "epoch": 0.569635986978396, + "grad_norm": 1.6654939651489258, + "learning_rate": 1.868782236868883e-05, + "loss": 0.2668, + "step": 1203 + }, + { + "epoch": 0.5701094998520272, + "grad_norm": 1.3972606658935547, + "learning_rate": 1.8685288758540724e-05, + "loss": 0.2451, + "step": 1204 + }, + { + "epoch": 0.5705830127256585, + "grad_norm": 1.5168894529342651, + "learning_rate": 1.8682752876839127e-05, + "loss": 0.2625, + "step": 1205 + }, + { + "epoch": 0.5710565255992898, + "grad_norm": 1.420271873474121, + "learning_rate": 1.868021472424728e-05, + "loss": 0.2467, + "step": 1206 + }, + { + "epoch": 0.5715300384729209, + "grad_norm": 1.3395509719848633, + "learning_rate": 1.8677674301429012e-05, + "loss": 0.2397, + "step": 1207 + }, + { + "epoch": 0.5720035513465522, + "grad_norm": 1.5764024257659912, + "learning_rate": 1.8675131609048742e-05, + "loss": 0.2556, + "step": 1208 + }, + { + "epoch": 0.5724770642201835, + "grad_norm": 1.2619208097457886, + "learning_rate": 1.8672586647771496e-05, + "loss": 0.2772, + "step": 1209 + }, + { + "epoch": 0.5729505770938147, + "grad_norm": 1.2036354541778564, + "learning_rate": 1.8670039418262873e-05, + "loss": 0.2276, + "step": 1210 + }, + { + "epoch": 0.573424089967446, + "grad_norm": 1.003892183303833, + "learning_rate": 1.8667489921189083e-05, + "loss": 0.2257, + "step": 1211 + }, + { + "epoch": 0.5738976028410773, + "grad_norm": 1.2825255393981934, + "learning_rate": 1.8664938157216923e-05, + "loss": 0.26, + "step": 1212 + }, + { + "epoch": 0.5743711157147084, + "grad_norm": 1.5353515148162842, + "learning_rate": 1.866238412701378e-05, + "loss": 0.2344, + "step": 1213 + }, + { + "epoch": 0.5748446285883397, + "grad_norm": 1.1604024171829224, + "learning_rate": 1.8659827831247632e-05, + "loss": 0.2378, + "step": 1214 + }, + { + "epoch": 0.575318141461971, + "grad_norm": 1.5623371601104736, + "learning_rate": 1.865726927058706e-05, + "loss": 0.2597, + "step": 1215 + }, + { + "epoch": 0.5757916543356022, + "grad_norm": 1.4064077138900757, + "learning_rate": 1.8654708445701227e-05, + "loss": 0.2308, + "step": 1216 + }, + { + "epoch": 0.5762651672092335, + "grad_norm": 1.2463563680648804, + "learning_rate": 1.8652145357259897e-05, + "loss": 0.2289, + "step": 1217 + }, + { + "epoch": 0.5767386800828648, + "grad_norm": 1.6035746335983276, + "learning_rate": 1.8649580005933415e-05, + "loss": 0.2724, + "step": 1218 + }, + { + "epoch": 0.577212192956496, + "grad_norm": 1.4112597703933716, + "learning_rate": 1.8647012392392728e-05, + "loss": 0.255, + "step": 1219 + }, + { + "epoch": 0.5776857058301272, + "grad_norm": 1.4282629489898682, + "learning_rate": 1.8644442517309366e-05, + "loss": 0.2478, + "step": 1220 + }, + { + "epoch": 0.5781592187037585, + "grad_norm": 1.234606385231018, + "learning_rate": 1.8641870381355463e-05, + "loss": 0.2597, + "step": 1221 + }, + { + "epoch": 0.5786327315773898, + "grad_norm": 1.283451795578003, + "learning_rate": 1.8639295985203726e-05, + "loss": 0.2724, + "step": 1222 + }, + { + "epoch": 0.579106244451021, + "grad_norm": 1.5409351587295532, + "learning_rate": 1.8636719329527474e-05, + "loss": 0.267, + "step": 1223 + }, + { + "epoch": 0.5795797573246523, + "grad_norm": 1.1971688270568848, + "learning_rate": 1.8634140415000595e-05, + "loss": 0.2635, + "step": 1224 + }, + { + "epoch": 0.5800532701982836, + "grad_norm": 1.3609153032302856, + "learning_rate": 1.863155924229759e-05, + "loss": 0.2479, + "step": 1225 + }, + { + "epoch": 0.5805267830719147, + "grad_norm": 1.1887803077697754, + "learning_rate": 1.8628975812093535e-05, + "loss": 0.2453, + "step": 1226 + }, + { + "epoch": 0.581000295945546, + "grad_norm": 2.107870101928711, + "learning_rate": 1.86263901250641e-05, + "loss": 0.249, + "step": 1227 + }, + { + "epoch": 0.5814738088191773, + "grad_norm": 1.9113880395889282, + "learning_rate": 1.8623802181885548e-05, + "loss": 0.2569, + "step": 1228 + }, + { + "epoch": 0.5819473216928085, + "grad_norm": 1.2835649251937866, + "learning_rate": 1.862121198323473e-05, + "loss": 0.2462, + "step": 1229 + }, + { + "epoch": 0.5824208345664398, + "grad_norm": 1.388893723487854, + "learning_rate": 1.861861952978909e-05, + "loss": 0.2582, + "step": 1230 + }, + { + "epoch": 0.5828943474400711, + "grad_norm": 1.21442449092865, + "learning_rate": 1.861602482222666e-05, + "loss": 0.2257, + "step": 1231 + }, + { + "epoch": 0.5833678603137022, + "grad_norm": 1.585196614265442, + "learning_rate": 1.8613427861226056e-05, + "loss": 0.2475, + "step": 1232 + }, + { + "epoch": 0.5838413731873335, + "grad_norm": 2.5939364433288574, + "learning_rate": 1.8610828647466487e-05, + "loss": 0.2519, + "step": 1233 + }, + { + "epoch": 0.5843148860609648, + "grad_norm": 1.4720892906188965, + "learning_rate": 1.8608227181627757e-05, + "loss": 0.2598, + "step": 1234 + }, + { + "epoch": 0.584788398934596, + "grad_norm": 1.8123221397399902, + "learning_rate": 1.860562346439025e-05, + "loss": 0.2372, + "step": 1235 + }, + { + "epoch": 0.5852619118082273, + "grad_norm": 1.4891722202301025, + "learning_rate": 1.8603017496434953e-05, + "loss": 0.2353, + "step": 1236 + }, + { + "epoch": 0.5857354246818586, + "grad_norm": 1.0998507738113403, + "learning_rate": 1.860040927844342e-05, + "loss": 0.2437, + "step": 1237 + }, + { + "epoch": 0.5862089375554898, + "grad_norm": 1.521974802017212, + "learning_rate": 1.859779881109781e-05, + "loss": 0.2569, + "step": 1238 + }, + { + "epoch": 0.586682450429121, + "grad_norm": 1.6860134601593018, + "learning_rate": 1.8595186095080864e-05, + "loss": 0.2483, + "step": 1239 + }, + { + "epoch": 0.5871559633027523, + "grad_norm": 1.1612238883972168, + "learning_rate": 1.8592571131075915e-05, + "loss": 0.2645, + "step": 1240 + }, + { + "epoch": 0.5876294761763835, + "grad_norm": 1.3629693984985352, + "learning_rate": 1.8589953919766882e-05, + "loss": 0.2492, + "step": 1241 + }, + { + "epoch": 0.5881029890500148, + "grad_norm": 1.175735592842102, + "learning_rate": 1.8587334461838267e-05, + "loss": 0.2256, + "step": 1242 + }, + { + "epoch": 0.5885765019236461, + "grad_norm": 2.3192355632781982, + "learning_rate": 1.8584712757975173e-05, + "loss": 0.2593, + "step": 1243 + }, + { + "epoch": 0.5890500147972773, + "grad_norm": 1.1697986125946045, + "learning_rate": 1.858208880886327e-05, + "loss": 0.2484, + "step": 1244 + }, + { + "epoch": 0.5895235276709085, + "grad_norm": 1.8343950510025024, + "learning_rate": 1.8579462615188832e-05, + "loss": 0.2652, + "step": 1245 + }, + { + "epoch": 0.5899970405445398, + "grad_norm": 1.3866143226623535, + "learning_rate": 1.8576834177638717e-05, + "loss": 0.2543, + "step": 1246 + }, + { + "epoch": 0.590470553418171, + "grad_norm": 1.11716628074646, + "learning_rate": 1.8574203496900366e-05, + "loss": 0.2338, + "step": 1247 + }, + { + "epoch": 0.5909440662918023, + "grad_norm": 1.593776822090149, + "learning_rate": 1.8571570573661803e-05, + "loss": 0.2547, + "step": 1248 + }, + { + "epoch": 0.5914175791654336, + "grad_norm": 2.5242745876312256, + "learning_rate": 1.8568935408611652e-05, + "loss": 0.2697, + "step": 1249 + }, + { + "epoch": 0.5918910920390648, + "grad_norm": 1.9075709581375122, + "learning_rate": 1.856629800243911e-05, + "loss": 0.2924, + "step": 1250 + }, + { + "epoch": 0.592364604912696, + "grad_norm": 1.8078994750976562, + "learning_rate": 1.8563658355833965e-05, + "loss": 0.2521, + "step": 1251 + }, + { + "epoch": 0.5928381177863273, + "grad_norm": 1.6747920513153076, + "learning_rate": 1.856101646948659e-05, + "loss": 0.2511, + "step": 1252 + }, + { + "epoch": 0.5933116306599586, + "grad_norm": 1.425503134727478, + "learning_rate": 1.8558372344087953e-05, + "loss": 0.2487, + "step": 1253 + }, + { + "epoch": 0.5937851435335898, + "grad_norm": 2.0698137283325195, + "learning_rate": 1.8555725980329588e-05, + "loss": 0.2581, + "step": 1254 + }, + { + "epoch": 0.5942586564072211, + "grad_norm": 2.361027479171753, + "learning_rate": 1.8553077378903632e-05, + "loss": 0.2371, + "step": 1255 + }, + { + "epoch": 0.5947321692808524, + "grad_norm": 2.6887784004211426, + "learning_rate": 1.8550426540502802e-05, + "loss": 0.2425, + "step": 1256 + }, + { + "epoch": 0.5952056821544836, + "grad_norm": 1.9129482507705688, + "learning_rate": 1.8547773465820397e-05, + "loss": 0.2334, + "step": 1257 + }, + { + "epoch": 0.5956791950281148, + "grad_norm": 1.7753819227218628, + "learning_rate": 1.8545118155550305e-05, + "loss": 0.2376, + "step": 1258 + }, + { + "epoch": 0.5961527079017461, + "grad_norm": 1.9076635837554932, + "learning_rate": 1.8542460610386993e-05, + "loss": 0.2295, + "step": 1259 + }, + { + "epoch": 0.5966262207753773, + "grad_norm": 2.463299512863159, + "learning_rate": 1.853980083102552e-05, + "loss": 0.2538, + "step": 1260 + }, + { + "epoch": 0.5970997336490086, + "grad_norm": 1.5017327070236206, + "learning_rate": 1.8537138818161527e-05, + "loss": 0.2312, + "step": 1261 + }, + { + "epoch": 0.5975732465226399, + "grad_norm": 1.36251962184906, + "learning_rate": 1.8534474572491235e-05, + "loss": 0.2536, + "step": 1262 + }, + { + "epoch": 0.598046759396271, + "grad_norm": 1.2385445833206177, + "learning_rate": 1.853180809471145e-05, + "loss": 0.2439, + "step": 1263 + }, + { + "epoch": 0.5985202722699023, + "grad_norm": 1.0537524223327637, + "learning_rate": 1.8529139385519567e-05, + "loss": 0.2483, + "step": 1264 + }, + { + "epoch": 0.5989937851435336, + "grad_norm": 3.3753559589385986, + "learning_rate": 1.8526468445613556e-05, + "loss": 0.2376, + "step": 1265 + }, + { + "epoch": 0.5994672980171648, + "grad_norm": 2.3799593448638916, + "learning_rate": 1.8523795275691986e-05, + "loss": 0.2361, + "step": 1266 + }, + { + "epoch": 0.5999408108907961, + "grad_norm": 1.300310492515564, + "learning_rate": 1.852111987645399e-05, + "loss": 0.2466, + "step": 1267 + }, + { + "epoch": 0.6004143237644274, + "grad_norm": 1.4112299680709839, + "learning_rate": 1.851844224859929e-05, + "loss": 0.2418, + "step": 1268 + }, + { + "epoch": 0.6008878366380586, + "grad_norm": 1.9448109865188599, + "learning_rate": 1.8515762392828205e-05, + "loss": 0.2492, + "step": 1269 + }, + { + "epoch": 0.6013613495116898, + "grad_norm": 1.0596145391464233, + "learning_rate": 1.8513080309841616e-05, + "loss": 0.2385, + "step": 1270 + }, + { + "epoch": 0.6018348623853211, + "grad_norm": 1.4580031633377075, + "learning_rate": 1.8510396000341e-05, + "loss": 0.2615, + "step": 1271 + }, + { + "epoch": 0.6023083752589523, + "grad_norm": 1.4013538360595703, + "learning_rate": 1.850770946502841e-05, + "loss": 0.2271, + "step": 1272 + }, + { + "epoch": 0.6027818881325836, + "grad_norm": 1.3812453746795654, + "learning_rate": 1.8505020704606486e-05, + "loss": 0.2506, + "step": 1273 + }, + { + "epoch": 0.6032554010062149, + "grad_norm": 1.411999225616455, + "learning_rate": 1.8502329719778448e-05, + "loss": 0.2343, + "step": 1274 + }, + { + "epoch": 0.6037289138798461, + "grad_norm": 1.6212693452835083, + "learning_rate": 1.849963651124809e-05, + "loss": 0.2382, + "step": 1275 + }, + { + "epoch": 0.6042024267534774, + "grad_norm": 1.3826075792312622, + "learning_rate": 1.8496941079719805e-05, + "loss": 0.2541, + "step": 1276 + }, + { + "epoch": 0.6046759396271086, + "grad_norm": 2.0459675788879395, + "learning_rate": 1.8494243425898548e-05, + "loss": 0.2367, + "step": 1277 + }, + { + "epoch": 0.6051494525007398, + "grad_norm": 2.222378969192505, + "learning_rate": 1.8491543550489872e-05, + "loss": 0.2267, + "step": 1278 + }, + { + "epoch": 0.6056229653743711, + "grad_norm": 1.439396619796753, + "learning_rate": 1.84888414541999e-05, + "loss": 0.2454, + "step": 1279 + }, + { + "epoch": 0.6060964782480024, + "grad_norm": 1.3656351566314697, + "learning_rate": 1.8486137137735337e-05, + "loss": 0.2417, + "step": 1280 + }, + { + "epoch": 0.6065699911216336, + "grad_norm": 1.2693840265274048, + "learning_rate": 1.848343060180347e-05, + "loss": 0.2573, + "step": 1281 + }, + { + "epoch": 0.6070435039952649, + "grad_norm": 1.9069156646728516, + "learning_rate": 1.8480721847112174e-05, + "loss": 0.2466, + "step": 1282 + }, + { + "epoch": 0.6075170168688961, + "grad_norm": 1.1109650135040283, + "learning_rate": 1.8478010874369894e-05, + "loss": 0.2637, + "step": 1283 + }, + { + "epoch": 0.6079905297425274, + "grad_norm": 1.4270925521850586, + "learning_rate": 1.8475297684285657e-05, + "loss": 0.1981, + "step": 1284 + }, + { + "epoch": 0.6084640426161586, + "grad_norm": 1.5452510118484497, + "learning_rate": 1.8472582277569072e-05, + "loss": 0.2564, + "step": 1285 + }, + { + "epoch": 0.6089375554897899, + "grad_norm": 1.9207974672317505, + "learning_rate": 1.8469864654930333e-05, + "loss": 0.2676, + "step": 1286 + }, + { + "epoch": 0.6094110683634212, + "grad_norm": 1.18226957321167, + "learning_rate": 1.8467144817080204e-05, + "loss": 0.2474, + "step": 1287 + }, + { + "epoch": 0.6098845812370524, + "grad_norm": 1.3588470220565796, + "learning_rate": 1.8464422764730028e-05, + "loss": 0.2469, + "step": 1288 + }, + { + "epoch": 0.6103580941106836, + "grad_norm": 1.4473971128463745, + "learning_rate": 1.8461698498591736e-05, + "loss": 0.2322, + "step": 1289 + }, + { + "epoch": 0.6108316069843149, + "grad_norm": 2.050037145614624, + "learning_rate": 1.8458972019377834e-05, + "loss": 0.2611, + "step": 1290 + }, + { + "epoch": 0.6113051198579461, + "grad_norm": 1.4301786422729492, + "learning_rate": 1.8456243327801407e-05, + "loss": 0.2265, + "step": 1291 + }, + { + "epoch": 0.6117786327315774, + "grad_norm": 1.096746802330017, + "learning_rate": 1.8453512424576113e-05, + "loss": 0.2279, + "step": 1292 + }, + { + "epoch": 0.6122521456052087, + "grad_norm": 1.5921342372894287, + "learning_rate": 1.84507793104162e-05, + "loss": 0.2564, + "step": 1293 + }, + { + "epoch": 0.6127256584788399, + "grad_norm": 1.1023545265197754, + "learning_rate": 1.8448043986036483e-05, + "loss": 0.2359, + "step": 1294 + }, + { + "epoch": 0.6131991713524712, + "grad_norm": 2.039523124694824, + "learning_rate": 1.844530645215236e-05, + "loss": 0.2283, + "step": 1295 + }, + { + "epoch": 0.6136726842261024, + "grad_norm": 1.5656194686889648, + "learning_rate": 1.8442566709479813e-05, + "loss": 0.256, + "step": 1296 + }, + { + "epoch": 0.6141461970997336, + "grad_norm": 1.9712538719177246, + "learning_rate": 1.8439824758735385e-05, + "loss": 0.2526, + "step": 1297 + }, + { + "epoch": 0.6146197099733649, + "grad_norm": 1.1353607177734375, + "learning_rate": 1.8437080600636215e-05, + "loss": 0.2606, + "step": 1298 + }, + { + "epoch": 0.6150932228469962, + "grad_norm": 2.3655588626861572, + "learning_rate": 1.8434334235900008e-05, + "loss": 0.2293, + "step": 1299 + }, + { + "epoch": 0.6155667357206274, + "grad_norm": 1.5827844142913818, + "learning_rate": 1.8431585665245047e-05, + "loss": 0.2443, + "step": 1300 + }, + { + "epoch": 0.6160402485942587, + "grad_norm": 1.5247994661331177, + "learning_rate": 1.8428834889390194e-05, + "loss": 0.2444, + "step": 1301 + }, + { + "epoch": 0.6165137614678899, + "grad_norm": 1.465590476989746, + "learning_rate": 1.8426081909054893e-05, + "loss": 0.2279, + "step": 1302 + }, + { + "epoch": 0.6169872743415211, + "grad_norm": 1.2217838764190674, + "learning_rate": 1.8423326724959157e-05, + "loss": 0.2393, + "step": 1303 + }, + { + "epoch": 0.6174607872151524, + "grad_norm": 1.6876059770584106, + "learning_rate": 1.8420569337823576e-05, + "loss": 0.248, + "step": 1304 + }, + { + "epoch": 0.6179343000887837, + "grad_norm": 1.808072805404663, + "learning_rate": 1.841780974836932e-05, + "loss": 0.2678, + "step": 1305 + }, + { + "epoch": 0.6184078129624149, + "grad_norm": 2.256350517272949, + "learning_rate": 1.8415047957318132e-05, + "loss": 0.2462, + "step": 1306 + }, + { + "epoch": 0.6188813258360462, + "grad_norm": 2.597975969314575, + "learning_rate": 1.8412283965392334e-05, + "loss": 0.2504, + "step": 1307 + }, + { + "epoch": 0.6193548387096774, + "grad_norm": 1.6564836502075195, + "learning_rate": 1.8409517773314816e-05, + "loss": 0.2264, + "step": 1308 + }, + { + "epoch": 0.6198283515833086, + "grad_norm": 1.5437556505203247, + "learning_rate": 1.8406749381809054e-05, + "loss": 0.2375, + "step": 1309 + }, + { + "epoch": 0.6203018644569399, + "grad_norm": 2.6221256256103516, + "learning_rate": 1.8403978791599095e-05, + "loss": 0.2454, + "step": 1310 + }, + { + "epoch": 0.6207753773305712, + "grad_norm": 1.4474231004714966, + "learning_rate": 1.8401206003409554e-05, + "loss": 0.2489, + "step": 1311 + }, + { + "epoch": 0.6212488902042024, + "grad_norm": 2.578899621963501, + "learning_rate": 1.839843101796563e-05, + "loss": 0.2466, + "step": 1312 + }, + { + "epoch": 0.6217224030778337, + "grad_norm": 2.1055898666381836, + "learning_rate": 1.8395653835993098e-05, + "loss": 0.2741, + "step": 1313 + }, + { + "epoch": 0.622195915951465, + "grad_norm": 1.2362160682678223, + "learning_rate": 1.83928744582183e-05, + "loss": 0.2472, + "step": 1314 + }, + { + "epoch": 0.6226694288250961, + "grad_norm": 1.5515974760055542, + "learning_rate": 1.839009288536815e-05, + "loss": 0.2346, + "step": 1315 + }, + { + "epoch": 0.6231429416987274, + "grad_norm": 1.4846057891845703, + "learning_rate": 1.838730911817015e-05, + "loss": 0.2439, + "step": 1316 + }, + { + "epoch": 0.6236164545723587, + "grad_norm": 2.014627695083618, + "learning_rate": 1.8384523157352365e-05, + "loss": 0.2398, + "step": 1317 + }, + { + "epoch": 0.62408996744599, + "grad_norm": 2.682614326477051, + "learning_rate": 1.8381735003643434e-05, + "loss": 0.2567, + "step": 1318 + }, + { + "epoch": 0.6245634803196212, + "grad_norm": 1.1889935731887817, + "learning_rate": 1.837894465777257e-05, + "loss": 0.2425, + "step": 1319 + }, + { + "epoch": 0.6250369931932525, + "grad_norm": 1.283697247505188, + "learning_rate": 1.8376152120469567e-05, + "loss": 0.2576, + "step": 1320 + }, + { + "epoch": 0.6255105060668837, + "grad_norm": 1.7607721090316772, + "learning_rate": 1.8373357392464783e-05, + "loss": 0.2428, + "step": 1321 + }, + { + "epoch": 0.6259840189405149, + "grad_norm": 2.0898520946502686, + "learning_rate": 1.837056047448915e-05, + "loss": 0.246, + "step": 1322 + }, + { + "epoch": 0.6264575318141462, + "grad_norm": 1.7172597646713257, + "learning_rate": 1.836776136727418e-05, + "loss": 0.242, + "step": 1323 + }, + { + "epoch": 0.6269310446877775, + "grad_norm": 1.7374228239059448, + "learning_rate": 1.8364960071551948e-05, + "loss": 0.252, + "step": 1324 + }, + { + "epoch": 0.6274045575614087, + "grad_norm": 1.4779413938522339, + "learning_rate": 1.836215658805511e-05, + "loss": 0.2424, + "step": 1325 + }, + { + "epoch": 0.62787807043504, + "grad_norm": 1.4057080745697021, + "learning_rate": 1.8359350917516882e-05, + "loss": 0.2516, + "step": 1326 + }, + { + "epoch": 0.6283515833086712, + "grad_norm": 1.7283161878585815, + "learning_rate": 1.835654306067107e-05, + "loss": 0.2559, + "step": 1327 + }, + { + "epoch": 0.6288250961823024, + "grad_norm": 1.3666443824768066, + "learning_rate": 1.835373301825204e-05, + "loss": 0.235, + "step": 1328 + }, + { + "epoch": 0.6292986090559337, + "grad_norm": 1.4366817474365234, + "learning_rate": 1.8350920790994723e-05, + "loss": 0.2405, + "step": 1329 + }, + { + "epoch": 0.629772121929565, + "grad_norm": 1.3197062015533447, + "learning_rate": 1.834810637963464e-05, + "loss": 0.2191, + "step": 1330 + }, + { + "epoch": 0.6302456348031962, + "grad_norm": 1.0538299083709717, + "learning_rate": 1.8345289784907863e-05, + "loss": 0.2255, + "step": 1331 + }, + { + "epoch": 0.6307191476768275, + "grad_norm": 1.732539415359497, + "learning_rate": 1.8342471007551058e-05, + "loss": 0.2733, + "step": 1332 + }, + { + "epoch": 0.6311926605504588, + "grad_norm": 1.7113350629806519, + "learning_rate": 1.833965004830144e-05, + "loss": 0.2285, + "step": 1333 + }, + { + "epoch": 0.6316661734240899, + "grad_norm": 2.064622640609741, + "learning_rate": 1.8336826907896806e-05, + "loss": 0.2398, + "step": 1334 + }, + { + "epoch": 0.6321396862977212, + "grad_norm": 1.9892138242721558, + "learning_rate": 1.8334001587075517e-05, + "loss": 0.2503, + "step": 1335 + }, + { + "epoch": 0.6326131991713525, + "grad_norm": 1.079971194267273, + "learning_rate": 1.833117408657652e-05, + "loss": 0.2354, + "step": 1336 + }, + { + "epoch": 0.6330867120449837, + "grad_norm": 1.276336669921875, + "learning_rate": 1.8328344407139307e-05, + "loss": 0.2452, + "step": 1337 + }, + { + "epoch": 0.633560224918615, + "grad_norm": 1.477914571762085, + "learning_rate": 1.832551254950396e-05, + "loss": 0.2422, + "step": 1338 + }, + { + "epoch": 0.6340337377922463, + "grad_norm": 1.510981798171997, + "learning_rate": 1.832267851441113e-05, + "loss": 0.2523, + "step": 1339 + }, + { + "epoch": 0.6345072506658774, + "grad_norm": 1.4486583471298218, + "learning_rate": 1.8319842302602014e-05, + "loss": 0.2492, + "step": 1340 + }, + { + "epoch": 0.6349807635395087, + "grad_norm": 1.9352304935455322, + "learning_rate": 1.8317003914818415e-05, + "loss": 0.2373, + "step": 1341 + }, + { + "epoch": 0.63545427641314, + "grad_norm": 1.3235574960708618, + "learning_rate": 1.8314163351802673e-05, + "loss": 0.2464, + "step": 1342 + }, + { + "epoch": 0.6359277892867712, + "grad_norm": 1.9728165864944458, + "learning_rate": 1.831132061429772e-05, + "loss": 0.2387, + "step": 1343 + }, + { + "epoch": 0.6364013021604025, + "grad_norm": 1.5262460708618164, + "learning_rate": 1.8308475703047034e-05, + "loss": 0.2273, + "step": 1344 + }, + { + "epoch": 0.6368748150340338, + "grad_norm": 1.6200482845306396, + "learning_rate": 1.8305628618794685e-05, + "loss": 0.2561, + "step": 1345 + }, + { + "epoch": 0.6373483279076649, + "grad_norm": 1.5177505016326904, + "learning_rate": 1.8302779362285298e-05, + "loss": 0.2331, + "step": 1346 + }, + { + "epoch": 0.6378218407812962, + "grad_norm": 1.3834291696548462, + "learning_rate": 1.8299927934264064e-05, + "loss": 0.2117, + "step": 1347 + }, + { + "epoch": 0.6382953536549275, + "grad_norm": 1.5583209991455078, + "learning_rate": 1.829707433547675e-05, + "loss": 0.2055, + "step": 1348 + }, + { + "epoch": 0.6387688665285588, + "grad_norm": 1.7908825874328613, + "learning_rate": 1.8294218566669684e-05, + "loss": 0.264, + "step": 1349 + }, + { + "epoch": 0.63924237940219, + "grad_norm": 2.1676015853881836, + "learning_rate": 1.8291360628589774e-05, + "loss": 0.2424, + "step": 1350 + }, + { + "epoch": 0.6397158922758213, + "grad_norm": 1.7668577432632446, + "learning_rate": 1.8288500521984477e-05, + "loss": 0.2576, + "step": 1351 + }, + { + "epoch": 0.6401894051494526, + "grad_norm": 1.537380337715149, + "learning_rate": 1.828563824760183e-05, + "loss": 0.243, + "step": 1352 + }, + { + "epoch": 0.6406629180230837, + "grad_norm": 1.5818302631378174, + "learning_rate": 1.828277380619043e-05, + "loss": 0.2637, + "step": 1353 + }, + { + "epoch": 0.641136430896715, + "grad_norm": 1.2502553462982178, + "learning_rate": 1.827990719849945e-05, + "loss": 0.2299, + "step": 1354 + }, + { + "epoch": 0.6416099437703463, + "grad_norm": 1.2032712697982788, + "learning_rate": 1.8277038425278616e-05, + "loss": 0.2597, + "step": 1355 + }, + { + "epoch": 0.6420834566439775, + "grad_norm": 1.1171890497207642, + "learning_rate": 1.8274167487278232e-05, + "loss": 0.24, + "step": 1356 + }, + { + "epoch": 0.6425569695176088, + "grad_norm": 1.7673466205596924, + "learning_rate": 1.8271294385249168e-05, + "loss": 0.2274, + "step": 1357 + }, + { + "epoch": 0.64303048239124, + "grad_norm": 1.4950395822525024, + "learning_rate": 1.8268419119942852e-05, + "loss": 0.2521, + "step": 1358 + }, + { + "epoch": 0.6435039952648712, + "grad_norm": 1.344424843788147, + "learning_rate": 1.8265541692111277e-05, + "loss": 0.2201, + "step": 1359 + }, + { + "epoch": 0.6439775081385025, + "grad_norm": 1.1256235837936401, + "learning_rate": 1.8262662102507012e-05, + "loss": 0.2402, + "step": 1360 + }, + { + "epoch": 0.6444510210121338, + "grad_norm": 1.2942851781845093, + "learning_rate": 1.8259780351883188e-05, + "loss": 0.2561, + "step": 1361 + }, + { + "epoch": 0.644924533885765, + "grad_norm": 1.1802473068237305, + "learning_rate": 1.8256896440993498e-05, + "loss": 0.2516, + "step": 1362 + }, + { + "epoch": 0.6453980467593963, + "grad_norm": 1.1492762565612793, + "learning_rate": 1.8254010370592197e-05, + "loss": 0.2492, + "step": 1363 + }, + { + "epoch": 0.6458715596330276, + "grad_norm": 1.716592788696289, + "learning_rate": 1.825112214143411e-05, + "loss": 0.2208, + "step": 1364 + }, + { + "epoch": 0.6463450725066587, + "grad_norm": 1.5306360721588135, + "learning_rate": 1.824823175427463e-05, + "loss": 0.2273, + "step": 1365 + }, + { + "epoch": 0.64681858538029, + "grad_norm": 1.6594233512878418, + "learning_rate": 1.8245339209869705e-05, + "loss": 0.2514, + "step": 1366 + }, + { + "epoch": 0.6472920982539213, + "grad_norm": 1.2237942218780518, + "learning_rate": 1.8242444508975857e-05, + "loss": 0.2514, + "step": 1367 + }, + { + "epoch": 0.6477656111275525, + "grad_norm": 1.5434156656265259, + "learning_rate": 1.8239547652350162e-05, + "loss": 0.2523, + "step": 1368 + }, + { + "epoch": 0.6482391240011838, + "grad_norm": 1.3254024982452393, + "learning_rate": 1.8236648640750266e-05, + "loss": 0.2588, + "step": 1369 + }, + { + "epoch": 0.6487126368748151, + "grad_norm": 1.8970592021942139, + "learning_rate": 1.823374747493438e-05, + "loss": 0.2487, + "step": 1370 + }, + { + "epoch": 0.6491861497484462, + "grad_norm": 1.7532835006713867, + "learning_rate": 1.8230844155661273e-05, + "loss": 0.2405, + "step": 1371 + }, + { + "epoch": 0.6496596626220775, + "grad_norm": 1.1474004983901978, + "learning_rate": 1.822793868369028e-05, + "loss": 0.2221, + "step": 1372 + }, + { + "epoch": 0.6501331754957088, + "grad_norm": 1.8724579811096191, + "learning_rate": 1.8225031059781302e-05, + "loss": 0.2345, + "step": 1373 + }, + { + "epoch": 0.65060668836934, + "grad_norm": 1.337428331375122, + "learning_rate": 1.8222121284694798e-05, + "loss": 0.2737, + "step": 1374 + }, + { + "epoch": 0.6510802012429713, + "grad_norm": 1.1940720081329346, + "learning_rate": 1.8219209359191793e-05, + "loss": 0.2089, + "step": 1375 + }, + { + "epoch": 0.6515537141166026, + "grad_norm": 1.327831745147705, + "learning_rate": 1.821629528403387e-05, + "loss": 0.2629, + "step": 1376 + }, + { + "epoch": 0.6520272269902337, + "grad_norm": 1.3946506977081299, + "learning_rate": 1.8213379059983184e-05, + "loss": 0.227, + "step": 1377 + }, + { + "epoch": 0.652500739863865, + "grad_norm": 1.840470790863037, + "learning_rate": 1.8210460687802437e-05, + "loss": 0.2434, + "step": 1378 + }, + { + "epoch": 0.6529742527374963, + "grad_norm": 1.3250887393951416, + "learning_rate": 1.8207540168254907e-05, + "loss": 0.2449, + "step": 1379 + }, + { + "epoch": 0.6534477656111276, + "grad_norm": 1.3822866678237915, + "learning_rate": 1.8204617502104426e-05, + "loss": 0.2286, + "step": 1380 + }, + { + "epoch": 0.6539212784847588, + "grad_norm": 1.2406575679779053, + "learning_rate": 1.8201692690115385e-05, + "loss": 0.2551, + "step": 1381 + }, + { + "epoch": 0.6543947913583901, + "grad_norm": 1.167535662651062, + "learning_rate": 1.819876573305275e-05, + "loss": 0.2365, + "step": 1382 + }, + { + "epoch": 0.6548683042320214, + "grad_norm": 1.555350661277771, + "learning_rate": 1.8195836631682025e-05, + "loss": 0.2523, + "step": 1383 + }, + { + "epoch": 0.6553418171056525, + "grad_norm": 1.228309988975525, + "learning_rate": 1.8192905386769298e-05, + "loss": 0.2235, + "step": 1384 + }, + { + "epoch": 0.6558153299792838, + "grad_norm": 1.2090561389923096, + "learning_rate": 1.8189971999081207e-05, + "loss": 0.24, + "step": 1385 + }, + { + "epoch": 0.6562888428529151, + "grad_norm": 1.1428760290145874, + "learning_rate": 1.818703646938495e-05, + "loss": 0.2441, + "step": 1386 + }, + { + "epoch": 0.6567623557265463, + "grad_norm": 2.0970654487609863, + "learning_rate": 1.8184098798448285e-05, + "loss": 0.2685, + "step": 1387 + }, + { + "epoch": 0.6572358686001776, + "grad_norm": 1.638376235961914, + "learning_rate": 1.8181158987039534e-05, + "loss": 0.2524, + "step": 1388 + }, + { + "epoch": 0.6577093814738089, + "grad_norm": 1.7937678098678589, + "learning_rate": 1.8178217035927578e-05, + "loss": 0.2446, + "step": 1389 + }, + { + "epoch": 0.65818289434744, + "grad_norm": 1.5529017448425293, + "learning_rate": 1.8175272945881847e-05, + "loss": 0.262, + "step": 1390 + }, + { + "epoch": 0.6586564072210713, + "grad_norm": 2.30466628074646, + "learning_rate": 1.8172326717672348e-05, + "loss": 0.239, + "step": 1391 + }, + { + "epoch": 0.6591299200947026, + "grad_norm": 2.677629232406616, + "learning_rate": 1.8169378352069637e-05, + "loss": 0.2682, + "step": 1392 + }, + { + "epoch": 0.6596034329683338, + "grad_norm": 2.3182013034820557, + "learning_rate": 1.8166427849844826e-05, + "loss": 0.2425, + "step": 1393 + }, + { + "epoch": 0.6600769458419651, + "grad_norm": 1.5342023372650146, + "learning_rate": 1.8163475211769595e-05, + "loss": 0.2674, + "step": 1394 + }, + { + "epoch": 0.6605504587155964, + "grad_norm": 2.3066344261169434, + "learning_rate": 1.8160520438616176e-05, + "loss": 0.2506, + "step": 1395 + }, + { + "epoch": 0.6610239715892275, + "grad_norm": 1.210817813873291, + "learning_rate": 1.8157563531157366e-05, + "loss": 0.2399, + "step": 1396 + }, + { + "epoch": 0.6614974844628588, + "grad_norm": 1.5053683519363403, + "learning_rate": 1.815460449016651e-05, + "loss": 0.2343, + "step": 1397 + }, + { + "epoch": 0.6619709973364901, + "grad_norm": 1.1325657367706299, + "learning_rate": 1.8151643316417518e-05, + "loss": 0.2338, + "step": 1398 + }, + { + "epoch": 0.6624445102101213, + "grad_norm": 2.2156691551208496, + "learning_rate": 1.8148680010684856e-05, + "loss": 0.2328, + "step": 1399 + }, + { + "epoch": 0.6629180230837526, + "grad_norm": 1.634922742843628, + "learning_rate": 1.814571457374355e-05, + "loss": 0.2786, + "step": 1400 + }, + { + "epoch": 0.6633915359573839, + "grad_norm": 1.8771041631698608, + "learning_rate": 1.8142747006369176e-05, + "loss": 0.2626, + "step": 1401 + }, + { + "epoch": 0.663865048831015, + "grad_norm": 1.1594529151916504, + "learning_rate": 1.813977730933788e-05, + "loss": 0.2044, + "step": 1402 + }, + { + "epoch": 0.6643385617046463, + "grad_norm": 1.380021095275879, + "learning_rate": 1.8136805483426358e-05, + "loss": 0.2689, + "step": 1403 + }, + { + "epoch": 0.6648120745782776, + "grad_norm": 1.0207122564315796, + "learning_rate": 1.8133831529411856e-05, + "loss": 0.249, + "step": 1404 + }, + { + "epoch": 0.6652855874519088, + "grad_norm": 1.723662257194519, + "learning_rate": 1.8130855448072186e-05, + "loss": 0.2373, + "step": 1405 + }, + { + "epoch": 0.6657591003255401, + "grad_norm": 1.606900930404663, + "learning_rate": 1.8127877240185716e-05, + "loss": 0.2731, + "step": 1406 + }, + { + "epoch": 0.6662326131991714, + "grad_norm": 1.1863452196121216, + "learning_rate": 1.8124896906531363e-05, + "loss": 0.2379, + "step": 1407 + }, + { + "epoch": 0.6667061260728026, + "grad_norm": 1.311086893081665, + "learning_rate": 1.8121914447888605e-05, + "loss": 0.2295, + "step": 1408 + }, + { + "epoch": 0.6671796389464338, + "grad_norm": 1.2078311443328857, + "learning_rate": 1.811892986503748e-05, + "loss": 0.2209, + "step": 1409 + }, + { + "epoch": 0.6676531518200651, + "grad_norm": 2.1490330696105957, + "learning_rate": 1.811594315875857e-05, + "loss": 0.2922, + "step": 1410 + }, + { + "epoch": 0.6681266646936964, + "grad_norm": 1.3307857513427734, + "learning_rate": 1.8112954329833022e-05, + "loss": 0.229, + "step": 1411 + }, + { + "epoch": 0.6686001775673276, + "grad_norm": 1.1393177509307861, + "learning_rate": 1.8109963379042538e-05, + "loss": 0.243, + "step": 1412 + }, + { + "epoch": 0.6690736904409589, + "grad_norm": 1.3860878944396973, + "learning_rate": 1.810697030716937e-05, + "loss": 0.246, + "step": 1413 + }, + { + "epoch": 0.6695472033145902, + "grad_norm": 1.2071013450622559, + "learning_rate": 1.8103975114996327e-05, + "loss": 0.2124, + "step": 1414 + }, + { + "epoch": 0.6700207161882213, + "grad_norm": 1.5767385959625244, + "learning_rate": 1.8100977803306774e-05, + "loss": 0.2346, + "step": 1415 + }, + { + "epoch": 0.6704942290618526, + "grad_norm": 1.6426430940628052, + "learning_rate": 1.8097978372884627e-05, + "loss": 0.2258, + "step": 1416 + }, + { + "epoch": 0.6709677419354839, + "grad_norm": 1.0845011472702026, + "learning_rate": 1.8094976824514363e-05, + "loss": 0.2492, + "step": 1417 + }, + { + "epoch": 0.6714412548091151, + "grad_norm": 1.211498498916626, + "learning_rate": 1.8091973158981e-05, + "loss": 0.2457, + "step": 1418 + }, + { + "epoch": 0.6719147676827464, + "grad_norm": 1.3050991296768188, + "learning_rate": 1.8088967377070122e-05, + "loss": 0.1988, + "step": 1419 + }, + { + "epoch": 0.6723882805563777, + "grad_norm": 2.2744362354278564, + "learning_rate": 1.8085959479567866e-05, + "loss": 0.2364, + "step": 1420 + }, + { + "epoch": 0.6728617934300088, + "grad_norm": 1.9004119634628296, + "learning_rate": 1.808294946726091e-05, + "loss": 0.2387, + "step": 1421 + }, + { + "epoch": 0.6733353063036401, + "grad_norm": 1.1495134830474854, + "learning_rate": 1.80799373409365e-05, + "loss": 0.2364, + "step": 1422 + }, + { + "epoch": 0.6738088191772714, + "grad_norm": 1.161669373512268, + "learning_rate": 1.807692310138243e-05, + "loss": 0.2158, + "step": 1423 + }, + { + "epoch": 0.6742823320509026, + "grad_norm": 1.1612114906311035, + "learning_rate": 1.807390674938704e-05, + "loss": 0.2477, + "step": 1424 + }, + { + "epoch": 0.6747558449245339, + "grad_norm": 1.5632646083831787, + "learning_rate": 1.8070888285739227e-05, + "loss": 0.2696, + "step": 1425 + }, + { + "epoch": 0.6752293577981652, + "grad_norm": 1.4020699262619019, + "learning_rate": 1.806786771122845e-05, + "loss": 0.2244, + "step": 1426 + }, + { + "epoch": 0.6757028706717964, + "grad_norm": 1.7532941102981567, + "learning_rate": 1.80648450266447e-05, + "loss": 0.3041, + "step": 1427 + }, + { + "epoch": 0.6761763835454276, + "grad_norm": 2.035996675491333, + "learning_rate": 1.806182023277854e-05, + "loss": 0.2601, + "step": 1428 + }, + { + "epoch": 0.6766498964190589, + "grad_norm": 1.4259287118911743, + "learning_rate": 1.805879333042107e-05, + "loss": 0.2476, + "step": 1429 + }, + { + "epoch": 0.6771234092926901, + "grad_norm": 1.3768444061279297, + "learning_rate": 1.805576432036395e-05, + "loss": 0.2458, + "step": 1430 + }, + { + "epoch": 0.6775969221663214, + "grad_norm": 1.153491497039795, + "learning_rate": 1.8052733203399385e-05, + "loss": 0.2582, + "step": 1431 + }, + { + "epoch": 0.6780704350399527, + "grad_norm": 1.0750600099563599, + "learning_rate": 1.804969998032014e-05, + "loss": 0.2708, + "step": 1432 + }, + { + "epoch": 0.6785439479135839, + "grad_norm": 1.5494842529296875, + "learning_rate": 1.8046664651919517e-05, + "loss": 0.2314, + "step": 1433 + }, + { + "epoch": 0.6790174607872151, + "grad_norm": 1.6655199527740479, + "learning_rate": 1.8043627218991385e-05, + "loss": 0.257, + "step": 1434 + }, + { + "epoch": 0.6794909736608464, + "grad_norm": 1.0493881702423096, + "learning_rate": 1.8040587682330155e-05, + "loss": 0.2497, + "step": 1435 + }, + { + "epoch": 0.6799644865344776, + "grad_norm": 1.266782283782959, + "learning_rate": 1.8037546042730783e-05, + "loss": 0.2443, + "step": 1436 + }, + { + "epoch": 0.6804379994081089, + "grad_norm": 1.381783127784729, + "learning_rate": 1.8034502300988784e-05, + "loss": 0.2467, + "step": 1437 + }, + { + "epoch": 0.6809115122817402, + "grad_norm": 1.3426703214645386, + "learning_rate": 1.803145645790022e-05, + "loss": 0.2399, + "step": 1438 + }, + { + "epoch": 0.6813850251553714, + "grad_norm": 1.1513340473175049, + "learning_rate": 1.8028408514261695e-05, + "loss": 0.2482, + "step": 1439 + }, + { + "epoch": 0.6818585380290026, + "grad_norm": 1.5276223421096802, + "learning_rate": 1.8025358470870376e-05, + "loss": 0.2307, + "step": 1440 + }, + { + "epoch": 0.6823320509026339, + "grad_norm": 1.416548252105713, + "learning_rate": 1.8022306328523973e-05, + "loss": 0.2142, + "step": 1441 + }, + { + "epoch": 0.6828055637762651, + "grad_norm": 1.427147626876831, + "learning_rate": 1.801925208802074e-05, + "loss": 0.2139, + "step": 1442 + }, + { + "epoch": 0.6832790766498964, + "grad_norm": 1.3105965852737427, + "learning_rate": 1.8016195750159488e-05, + "loss": 0.2398, + "step": 1443 + }, + { + "epoch": 0.6837525895235277, + "grad_norm": 1.5109857320785522, + "learning_rate": 1.801313731573957e-05, + "loss": 0.2733, + "step": 1444 + }, + { + "epoch": 0.684226102397159, + "grad_norm": 1.150864601135254, + "learning_rate": 1.8010076785560896e-05, + "loss": 0.2492, + "step": 1445 + }, + { + "epoch": 0.6846996152707902, + "grad_norm": 1.219900131225586, + "learning_rate": 1.8007014160423907e-05, + "loss": 0.2619, + "step": 1446 + }, + { + "epoch": 0.6851731281444214, + "grad_norm": 1.590561032295227, + "learning_rate": 1.8003949441129612e-05, + "loss": 0.2396, + "step": 1447 + }, + { + "epoch": 0.6856466410180527, + "grad_norm": 1.0455493927001953, + "learning_rate": 1.8000882628479558e-05, + "loss": 0.234, + "step": 1448 + }, + { + "epoch": 0.6861201538916839, + "grad_norm": 1.0974806547164917, + "learning_rate": 1.7997813723275834e-05, + "loss": 0.2592, + "step": 1449 + }, + { + "epoch": 0.6865936667653152, + "grad_norm": 1.3178282976150513, + "learning_rate": 1.799474272632109e-05, + "loss": 0.2275, + "step": 1450 + }, + { + "epoch": 0.6870671796389465, + "grad_norm": 1.2756496667861938, + "learning_rate": 1.7991669638418515e-05, + "loss": 0.2165, + "step": 1451 + }, + { + "epoch": 0.6875406925125777, + "grad_norm": 1.5571712255477905, + "learning_rate": 1.798859446037184e-05, + "loss": 0.2507, + "step": 1452 + }, + { + "epoch": 0.6880142053862089, + "grad_norm": 1.1532251834869385, + "learning_rate": 1.798551719298535e-05, + "loss": 0.2464, + "step": 1453 + }, + { + "epoch": 0.6884877182598402, + "grad_norm": 1.2180898189544678, + "learning_rate": 1.7982437837063878e-05, + "loss": 0.2383, + "step": 1454 + }, + { + "epoch": 0.6889612311334714, + "grad_norm": 1.2221808433532715, + "learning_rate": 1.7979356393412796e-05, + "loss": 0.2409, + "step": 1455 + }, + { + "epoch": 0.6894347440071027, + "grad_norm": 1.3098372220993042, + "learning_rate": 1.797627286283803e-05, + "loss": 0.2462, + "step": 1456 + }, + { + "epoch": 0.689908256880734, + "grad_norm": 1.6582274436950684, + "learning_rate": 1.7973187246146044e-05, + "loss": 0.2566, + "step": 1457 + }, + { + "epoch": 0.6903817697543652, + "grad_norm": 1.5802083015441895, + "learning_rate": 1.7970099544143852e-05, + "loss": 0.2495, + "step": 1458 + }, + { + "epoch": 0.6908552826279964, + "grad_norm": 1.2195215225219727, + "learning_rate": 1.7967009757639008e-05, + "loss": 0.2477, + "step": 1459 + }, + { + "epoch": 0.6913287955016277, + "grad_norm": 1.1696422100067139, + "learning_rate": 1.796391788743963e-05, + "loss": 0.2191, + "step": 1460 + }, + { + "epoch": 0.6918023083752589, + "grad_norm": 1.1211189031600952, + "learning_rate": 1.7960823934354352e-05, + "loss": 0.2616, + "step": 1461 + }, + { + "epoch": 0.6922758212488902, + "grad_norm": 1.4870905876159668, + "learning_rate": 1.7957727899192375e-05, + "loss": 0.2307, + "step": 1462 + }, + { + "epoch": 0.6927493341225215, + "grad_norm": 0.9586571455001831, + "learning_rate": 1.7954629782763437e-05, + "loss": 0.2507, + "step": 1463 + }, + { + "epoch": 0.6932228469961527, + "grad_norm": 1.396880865097046, + "learning_rate": 1.7951529585877818e-05, + "loss": 0.2456, + "step": 1464 + }, + { + "epoch": 0.693696359869784, + "grad_norm": 1.5271177291870117, + "learning_rate": 1.7948427309346346e-05, + "loss": 0.2374, + "step": 1465 + }, + { + "epoch": 0.6941698727434152, + "grad_norm": 1.1528421640396118, + "learning_rate": 1.7945322953980387e-05, + "loss": 0.2209, + "step": 1466 + }, + { + "epoch": 0.6946433856170464, + "grad_norm": 1.230507254600525, + "learning_rate": 1.7942216520591867e-05, + "loss": 0.2414, + "step": 1467 + }, + { + "epoch": 0.6951168984906777, + "grad_norm": 1.2597784996032715, + "learning_rate": 1.793910800999323e-05, + "loss": 0.2556, + "step": 1468 + }, + { + "epoch": 0.695590411364309, + "grad_norm": 1.1090929508209229, + "learning_rate": 1.7935997422997484e-05, + "loss": 0.2575, + "step": 1469 + }, + { + "epoch": 0.6960639242379402, + "grad_norm": 1.2305208444595337, + "learning_rate": 1.7932884760418172e-05, + "loss": 0.2437, + "step": 1470 + }, + { + "epoch": 0.6965374371115715, + "grad_norm": 1.3730130195617676, + "learning_rate": 1.7929770023069383e-05, + "loss": 0.2321, + "step": 1471 + }, + { + "epoch": 0.6970109499852027, + "grad_norm": 1.4209880828857422, + "learning_rate": 1.7926653211765742e-05, + "loss": 0.2597, + "step": 1472 + }, + { + "epoch": 0.6974844628588339, + "grad_norm": 1.2759629487991333, + "learning_rate": 1.7923534327322427e-05, + "loss": 0.2424, + "step": 1473 + }, + { + "epoch": 0.6979579757324652, + "grad_norm": 1.0876637697219849, + "learning_rate": 1.7920413370555143e-05, + "loss": 0.2589, + "step": 1474 + }, + { + "epoch": 0.6984314886060965, + "grad_norm": 1.1464848518371582, + "learning_rate": 1.7917290342280154e-05, + "loss": 0.2399, + "step": 1475 + }, + { + "epoch": 0.6989050014797278, + "grad_norm": 1.3303989171981812, + "learning_rate": 1.7914165243314256e-05, + "loss": 0.2653, + "step": 1476 + }, + { + "epoch": 0.699378514353359, + "grad_norm": 1.5119075775146484, + "learning_rate": 1.7911038074474788e-05, + "loss": 0.2422, + "step": 1477 + }, + { + "epoch": 0.6998520272269902, + "grad_norm": 1.3617953062057495, + "learning_rate": 1.790790883657963e-05, + "loss": 0.253, + "step": 1478 + }, + { + "epoch": 0.7003255401006215, + "grad_norm": 1.2398014068603516, + "learning_rate": 1.7904777530447203e-05, + "loss": 0.247, + "step": 1479 + }, + { + "epoch": 0.7007990529742527, + "grad_norm": 0.9873283505439758, + "learning_rate": 1.7901644156896474e-05, + "loss": 0.2248, + "step": 1480 + }, + { + "epoch": 0.701272565847884, + "grad_norm": 1.4423537254333496, + "learning_rate": 1.7898508716746944e-05, + "loss": 0.2449, + "step": 1481 + }, + { + "epoch": 0.7017460787215153, + "grad_norm": 1.600345253944397, + "learning_rate": 1.7895371210818656e-05, + "loss": 0.252, + "step": 1482 + }, + { + "epoch": 0.7022195915951465, + "grad_norm": 1.43065345287323, + "learning_rate": 1.78922316399322e-05, + "loss": 0.247, + "step": 1483 + }, + { + "epoch": 0.7026931044687778, + "grad_norm": 1.2158089876174927, + "learning_rate": 1.7889090004908692e-05, + "loss": 0.2162, + "step": 1484 + }, + { + "epoch": 0.703166617342409, + "grad_norm": 1.3399561643600464, + "learning_rate": 1.78859463065698e-05, + "loss": 0.2449, + "step": 1485 + }, + { + "epoch": 0.7036401302160402, + "grad_norm": 2.326958179473877, + "learning_rate": 1.788280054573773e-05, + "loss": 0.2354, + "step": 1486 + }, + { + "epoch": 0.7041136430896715, + "grad_norm": 1.8712157011032104, + "learning_rate": 1.7879652723235223e-05, + "loss": 0.2409, + "step": 1487 + }, + { + "epoch": 0.7045871559633028, + "grad_norm": 1.066486120223999, + "learning_rate": 1.7876502839885564e-05, + "loss": 0.2323, + "step": 1488 + }, + { + "epoch": 0.705060668836934, + "grad_norm": 1.8184261322021484, + "learning_rate": 1.7873350896512574e-05, + "loss": 0.2201, + "step": 1489 + }, + { + "epoch": 0.7055341817105653, + "grad_norm": 2.173187255859375, + "learning_rate": 1.787019689394061e-05, + "loss": 0.2419, + "step": 1490 + }, + { + "epoch": 0.7060076945841965, + "grad_norm": 1.2974936962127686, + "learning_rate": 1.786704083299458e-05, + "loss": 0.2354, + "step": 1491 + }, + { + "epoch": 0.7064812074578277, + "grad_norm": 1.272907018661499, + "learning_rate": 1.786388271449991e-05, + "loss": 0.2416, + "step": 1492 + }, + { + "epoch": 0.706954720331459, + "grad_norm": 0.8591986894607544, + "learning_rate": 1.7860722539282577e-05, + "loss": 0.2094, + "step": 1493 + }, + { + "epoch": 0.7074282332050903, + "grad_norm": 1.1995400190353394, + "learning_rate": 1.7857560308169103e-05, + "loss": 0.2289, + "step": 1494 + }, + { + "epoch": 0.7079017460787215, + "grad_norm": 1.7277050018310547, + "learning_rate": 1.785439602198653e-05, + "loss": 0.2252, + "step": 1495 + }, + { + "epoch": 0.7083752589523528, + "grad_norm": 2.3876638412475586, + "learning_rate": 1.785122968156245e-05, + "loss": 0.2494, + "step": 1496 + }, + { + "epoch": 0.708848771825984, + "grad_norm": 2.412504196166992, + "learning_rate": 1.7848061287724993e-05, + "loss": 0.2361, + "step": 1497 + }, + { + "epoch": 0.7093222846996152, + "grad_norm": 1.439038872718811, + "learning_rate": 1.7844890841302815e-05, + "loss": 0.251, + "step": 1498 + }, + { + "epoch": 0.7097957975732465, + "grad_norm": 1.5058058500289917, + "learning_rate": 1.7841718343125117e-05, + "loss": 0.2373, + "step": 1499 + }, + { + "epoch": 0.7102693104468778, + "grad_norm": 1.5263663530349731, + "learning_rate": 1.7838543794021637e-05, + "loss": 0.2423, + "step": 1500 + }, + { + "epoch": 0.710742823320509, + "grad_norm": 1.1687594652175903, + "learning_rate": 1.783536719482265e-05, + "loss": 0.2125, + "step": 1501 + }, + { + "epoch": 0.7112163361941403, + "grad_norm": 1.1493502855300903, + "learning_rate": 1.783218854635896e-05, + "loss": 0.2257, + "step": 1502 + }, + { + "epoch": 0.7116898490677716, + "grad_norm": 2.6674232482910156, + "learning_rate": 1.782900784946192e-05, + "loss": 0.2585, + "step": 1503 + }, + { + "epoch": 0.7121633619414027, + "grad_norm": 1.5568790435791016, + "learning_rate": 1.7825825104963398e-05, + "loss": 0.2436, + "step": 1504 + }, + { + "epoch": 0.712636874815034, + "grad_norm": 1.7096036672592163, + "learning_rate": 1.782264031369582e-05, + "loss": 0.2551, + "step": 1505 + }, + { + "epoch": 0.7131103876886653, + "grad_norm": 1.8939051628112793, + "learning_rate": 1.7819453476492136e-05, + "loss": 0.2482, + "step": 1506 + }, + { + "epoch": 0.7135839005622966, + "grad_norm": 1.7977244853973389, + "learning_rate": 1.7816264594185826e-05, + "loss": 0.2449, + "step": 1507 + }, + { + "epoch": 0.7140574134359278, + "grad_norm": 1.4691609144210815, + "learning_rate": 1.781307366761092e-05, + "loss": 0.2438, + "step": 1508 + }, + { + "epoch": 0.7145309263095591, + "grad_norm": 1.6823903322219849, + "learning_rate": 1.7809880697601965e-05, + "loss": 0.2588, + "step": 1509 + }, + { + "epoch": 0.7150044391831903, + "grad_norm": 1.1935667991638184, + "learning_rate": 1.7806685684994063e-05, + "loss": 0.2381, + "step": 1510 + }, + { + "epoch": 0.7154779520568215, + "grad_norm": 1.196661114692688, + "learning_rate": 1.7803488630622833e-05, + "loss": 0.2411, + "step": 1511 + }, + { + "epoch": 0.7159514649304528, + "grad_norm": 1.9287184476852417, + "learning_rate": 1.7800289535324426e-05, + "loss": 0.2545, + "step": 1512 + }, + { + "epoch": 0.7164249778040841, + "grad_norm": 2.1813395023345947, + "learning_rate": 1.7797088399935547e-05, + "loss": 0.2496, + "step": 1513 + }, + { + "epoch": 0.7168984906777153, + "grad_norm": 1.9686847925186157, + "learning_rate": 1.7793885225293418e-05, + "loss": 0.2352, + "step": 1514 + }, + { + "epoch": 0.7173720035513466, + "grad_norm": 1.5410264730453491, + "learning_rate": 1.779068001223579e-05, + "loss": 0.2156, + "step": 1515 + }, + { + "epoch": 0.7178455164249778, + "grad_norm": 1.2590621709823608, + "learning_rate": 1.7787472761600973e-05, + "loss": 0.2278, + "step": 1516 + }, + { + "epoch": 0.718319029298609, + "grad_norm": 1.5641133785247803, + "learning_rate": 1.7784263474227774e-05, + "loss": 0.2149, + "step": 1517 + }, + { + "epoch": 0.7187925421722403, + "grad_norm": 1.5322697162628174, + "learning_rate": 1.7781052150955566e-05, + "loss": 0.2337, + "step": 1518 + }, + { + "epoch": 0.7192660550458716, + "grad_norm": 1.4889428615570068, + "learning_rate": 1.7777838792624228e-05, + "loss": 0.2447, + "step": 1519 + }, + { + "epoch": 0.7197395679195028, + "grad_norm": 2.111682176589966, + "learning_rate": 1.777462340007419e-05, + "loss": 0.2279, + "step": 1520 + }, + { + "epoch": 0.7202130807931341, + "grad_norm": 1.946251630783081, + "learning_rate": 1.7771405974146403e-05, + "loss": 0.2399, + "step": 1521 + }, + { + "epoch": 0.7206865936667654, + "grad_norm": 1.0815329551696777, + "learning_rate": 1.776818651568236e-05, + "loss": 0.2292, + "step": 1522 + }, + { + "epoch": 0.7211601065403965, + "grad_norm": 1.4069883823394775, + "learning_rate": 1.7764965025524072e-05, + "loss": 0.2677, + "step": 1523 + }, + { + "epoch": 0.7216336194140278, + "grad_norm": 1.227126121520996, + "learning_rate": 1.776174150451409e-05, + "loss": 0.2468, + "step": 1524 + }, + { + "epoch": 0.7221071322876591, + "grad_norm": 1.791418433189392, + "learning_rate": 1.7758515953495496e-05, + "loss": 0.2335, + "step": 1525 + }, + { + "epoch": 0.7225806451612903, + "grad_norm": 2.3596413135528564, + "learning_rate": 1.7755288373311906e-05, + "loss": 0.2589, + "step": 1526 + }, + { + "epoch": 0.7230541580349216, + "grad_norm": 1.5666567087173462, + "learning_rate": 1.7752058764807455e-05, + "loss": 0.2381, + "step": 1527 + }, + { + "epoch": 0.7235276709085529, + "grad_norm": 1.1688320636749268, + "learning_rate": 1.7748827128826822e-05, + "loss": 0.2469, + "step": 1528 + }, + { + "epoch": 0.724001183782184, + "grad_norm": 1.2592840194702148, + "learning_rate": 1.7745593466215204e-05, + "loss": 0.224, + "step": 1529 + }, + { + "epoch": 0.7244746966558153, + "grad_norm": 1.0283125638961792, + "learning_rate": 1.774235777781834e-05, + "loss": 0.2054, + "step": 1530 + }, + { + "epoch": 0.7249482095294466, + "grad_norm": 1.7181527614593506, + "learning_rate": 1.7739120064482493e-05, + "loss": 0.2394, + "step": 1531 + }, + { + "epoch": 0.7254217224030778, + "grad_norm": 1.6182515621185303, + "learning_rate": 1.7735880327054453e-05, + "loss": 0.2225, + "step": 1532 + }, + { + "epoch": 0.7258952352767091, + "grad_norm": 1.1025639772415161, + "learning_rate": 1.7732638566381544e-05, + "loss": 0.2413, + "step": 1533 + }, + { + "epoch": 0.7263687481503404, + "grad_norm": 1.6946557760238647, + "learning_rate": 1.7729394783311614e-05, + "loss": 0.2424, + "step": 1534 + }, + { + "epoch": 0.7268422610239715, + "grad_norm": 1.2723066806793213, + "learning_rate": 1.7726148978693046e-05, + "loss": 0.2264, + "step": 1535 + }, + { + "epoch": 0.7273157738976028, + "grad_norm": 1.2535463571548462, + "learning_rate": 1.7722901153374748e-05, + "loss": 0.2516, + "step": 1536 + }, + { + "epoch": 0.7277892867712341, + "grad_norm": 1.6399126052856445, + "learning_rate": 1.7719651308206157e-05, + "loss": 0.237, + "step": 1537 + }, + { + "epoch": 0.7282627996448654, + "grad_norm": 1.1695709228515625, + "learning_rate": 1.771639944403724e-05, + "loss": 0.2125, + "step": 1538 + }, + { + "epoch": 0.7287363125184966, + "grad_norm": 1.4084415435791016, + "learning_rate": 1.7713145561718486e-05, + "loss": 0.2153, + "step": 1539 + }, + { + "epoch": 0.7292098253921279, + "grad_norm": 1.2717535495758057, + "learning_rate": 1.7709889662100926e-05, + "loss": 0.2599, + "step": 1540 + }, + { + "epoch": 0.7296833382657592, + "grad_norm": 1.3030693531036377, + "learning_rate": 1.77066317460361e-05, + "loss": 0.2262, + "step": 1541 + }, + { + "epoch": 0.7301568511393903, + "grad_norm": 1.2973533868789673, + "learning_rate": 1.7703371814376088e-05, + "loss": 0.2526, + "step": 1542 + }, + { + "epoch": 0.7306303640130216, + "grad_norm": 1.3349380493164062, + "learning_rate": 1.7700109867973494e-05, + "loss": 0.2436, + "step": 1543 + }, + { + "epoch": 0.7311038768866529, + "grad_norm": 1.424356460571289, + "learning_rate": 1.769684590768145e-05, + "loss": 0.2335, + "step": 1544 + }, + { + "epoch": 0.7315773897602841, + "grad_norm": 1.7350075244903564, + "learning_rate": 1.769357993435361e-05, + "loss": 0.2501, + "step": 1545 + }, + { + "epoch": 0.7320509026339154, + "grad_norm": 1.5108102560043335, + "learning_rate": 1.7690311948844162e-05, + "loss": 0.2606, + "step": 1546 + }, + { + "epoch": 0.7325244155075467, + "grad_norm": 1.3136991262435913, + "learning_rate": 1.768704195200781e-05, + "loss": 0.217, + "step": 1547 + }, + { + "epoch": 0.7329979283811778, + "grad_norm": 1.9467602968215942, + "learning_rate": 1.7683769944699793e-05, + "loss": 0.245, + "step": 1548 + }, + { + "epoch": 0.7334714412548091, + "grad_norm": 1.9547033309936523, + "learning_rate": 1.768049592777588e-05, + "loss": 0.2583, + "step": 1549 + }, + { + "epoch": 0.7339449541284404, + "grad_norm": 1.5787625312805176, + "learning_rate": 1.7677219902092345e-05, + "loss": 0.2554, + "step": 1550 + }, + { + "epoch": 0.7344184670020716, + "grad_norm": 1.423520803451538, + "learning_rate": 1.7673941868506014e-05, + "loss": 0.2397, + "step": 1551 + }, + { + "epoch": 0.7348919798757029, + "grad_norm": 2.1277055740356445, + "learning_rate": 1.7670661827874217e-05, + "loss": 0.2384, + "step": 1552 + }, + { + "epoch": 0.7353654927493342, + "grad_norm": 1.7148913145065308, + "learning_rate": 1.7667379781054816e-05, + "loss": 0.2562, + "step": 1553 + }, + { + "epoch": 0.7358390056229653, + "grad_norm": 1.5137001276016235, + "learning_rate": 1.7664095728906202e-05, + "loss": 0.2452, + "step": 1554 + }, + { + "epoch": 0.7363125184965966, + "grad_norm": 1.6710152626037598, + "learning_rate": 1.766080967228729e-05, + "loss": 0.2281, + "step": 1555 + }, + { + "epoch": 0.7367860313702279, + "grad_norm": 1.5264017581939697, + "learning_rate": 1.7657521612057513e-05, + "loss": 0.2265, + "step": 1556 + }, + { + "epoch": 0.7372595442438591, + "grad_norm": 2.8512990474700928, + "learning_rate": 1.765423154907683e-05, + "loss": 0.2342, + "step": 1557 + }, + { + "epoch": 0.7377330571174904, + "grad_norm": 1.2972322702407837, + "learning_rate": 1.7650939484205728e-05, + "loss": 0.2237, + "step": 1558 + }, + { + "epoch": 0.7382065699911217, + "grad_norm": 1.2768687009811401, + "learning_rate": 1.7647645418305215e-05, + "loss": 0.2285, + "step": 1559 + }, + { + "epoch": 0.7386800828647528, + "grad_norm": 0.9737210273742676, + "learning_rate": 1.7644349352236822e-05, + "loss": 0.2494, + "step": 1560 + }, + { + "epoch": 0.7391535957383841, + "grad_norm": 1.2401894330978394, + "learning_rate": 1.7641051286862597e-05, + "loss": 0.263, + "step": 1561 + }, + { + "epoch": 0.7396271086120154, + "grad_norm": 1.2521781921386719, + "learning_rate": 1.763775122304513e-05, + "loss": 0.2624, + "step": 1562 + }, + { + "epoch": 0.7401006214856466, + "grad_norm": 1.5043491125106812, + "learning_rate": 1.7634449161647506e-05, + "loss": 0.2325, + "step": 1563 + }, + { + "epoch": 0.7405741343592779, + "grad_norm": 1.1127467155456543, + "learning_rate": 1.7631145103533357e-05, + "loss": 0.2532, + "step": 1564 + }, + { + "epoch": 0.7410476472329092, + "grad_norm": 1.3029921054840088, + "learning_rate": 1.7627839049566827e-05, + "loss": 0.2445, + "step": 1565 + }, + { + "epoch": 0.7415211601065403, + "grad_norm": 1.4428067207336426, + "learning_rate": 1.762453100061258e-05, + "loss": 0.2123, + "step": 1566 + }, + { + "epoch": 0.7419946729801716, + "grad_norm": 1.1127543449401855, + "learning_rate": 1.76212209575358e-05, + "loss": 0.2265, + "step": 1567 + }, + { + "epoch": 0.7424681858538029, + "grad_norm": 0.9954766631126404, + "learning_rate": 1.761790892120221e-05, + "loss": 0.2089, + "step": 1568 + }, + { + "epoch": 0.7429416987274341, + "grad_norm": 1.198296070098877, + "learning_rate": 1.7614594892478026e-05, + "loss": 0.2272, + "step": 1569 + }, + { + "epoch": 0.7434152116010654, + "grad_norm": 1.9979981184005737, + "learning_rate": 1.761127887223001e-05, + "loss": 0.2511, + "step": 1570 + }, + { + "epoch": 0.7438887244746967, + "grad_norm": 1.982858657836914, + "learning_rate": 1.7607960861325434e-05, + "loss": 0.2372, + "step": 1571 + }, + { + "epoch": 0.744362237348328, + "grad_norm": 1.2229499816894531, + "learning_rate": 1.760464086063209e-05, + "loss": 0.2101, + "step": 1572 + }, + { + "epoch": 0.7448357502219591, + "grad_norm": 1.25929856300354, + "learning_rate": 1.760131887101829e-05, + "loss": 0.27, + "step": 1573 + }, + { + "epoch": 0.7453092630955904, + "grad_norm": 1.0544230937957764, + "learning_rate": 1.7597994893352873e-05, + "loss": 0.2076, + "step": 1574 + }, + { + "epoch": 0.7457827759692217, + "grad_norm": 1.874146819114685, + "learning_rate": 1.7594668928505186e-05, + "loss": 0.2267, + "step": 1575 + }, + { + "epoch": 0.7462562888428529, + "grad_norm": 1.3984766006469727, + "learning_rate": 1.7591340977345112e-05, + "loss": 0.218, + "step": 1576 + }, + { + "epoch": 0.7467298017164842, + "grad_norm": 1.5700955390930176, + "learning_rate": 1.758801104074304e-05, + "loss": 0.2278, + "step": 1577 + }, + { + "epoch": 0.7472033145901155, + "grad_norm": 1.9291844367980957, + "learning_rate": 1.7584679119569882e-05, + "loss": 0.2446, + "step": 1578 + }, + { + "epoch": 0.7476768274637466, + "grad_norm": 1.6862872838974, + "learning_rate": 1.758134521469707e-05, + "loss": 0.2461, + "step": 1579 + }, + { + "epoch": 0.7481503403373779, + "grad_norm": 2.24984073638916, + "learning_rate": 1.7578009326996556e-05, + "loss": 0.2702, + "step": 1580 + }, + { + "epoch": 0.7486238532110092, + "grad_norm": 1.163398265838623, + "learning_rate": 1.757467145734081e-05, + "loss": 0.2349, + "step": 1581 + }, + { + "epoch": 0.7490973660846404, + "grad_norm": 1.2229551076889038, + "learning_rate": 1.757133160660282e-05, + "loss": 0.2439, + "step": 1582 + }, + { + "epoch": 0.7495708789582717, + "grad_norm": 1.2336183786392212, + "learning_rate": 1.7567989775656088e-05, + "loss": 0.2629, + "step": 1583 + }, + { + "epoch": 0.750044391831903, + "grad_norm": 1.5344003438949585, + "learning_rate": 1.756464596537464e-05, + "loss": 0.2362, + "step": 1584 + }, + { + "epoch": 0.7505179047055341, + "grad_norm": 1.0532424449920654, + "learning_rate": 1.7561300176633015e-05, + "loss": 0.2238, + "step": 1585 + }, + { + "epoch": 0.7509914175791654, + "grad_norm": 1.153308391571045, + "learning_rate": 1.755795241030628e-05, + "loss": 0.2429, + "step": 1586 + }, + { + "epoch": 0.7514649304527967, + "grad_norm": 1.3348098993301392, + "learning_rate": 1.755460266727e-05, + "loss": 0.2435, + "step": 1587 + }, + { + "epoch": 0.7519384433264279, + "grad_norm": 1.3655999898910522, + "learning_rate": 1.7551250948400273e-05, + "loss": 0.2197, + "step": 1588 + }, + { + "epoch": 0.7524119562000592, + "grad_norm": 2.0334396362304688, + "learning_rate": 1.754789725457371e-05, + "loss": 0.2142, + "step": 1589 + }, + { + "epoch": 0.7528854690736905, + "grad_norm": 1.0390914678573608, + "learning_rate": 1.754454158666744e-05, + "loss": 0.2386, + "step": 1590 + }, + { + "epoch": 0.7533589819473216, + "grad_norm": 1.4241347312927246, + "learning_rate": 1.7541183945559095e-05, + "loss": 0.2464, + "step": 1591 + }, + { + "epoch": 0.7538324948209529, + "grad_norm": 1.2554645538330078, + "learning_rate": 1.7537824332126842e-05, + "loss": 0.2298, + "step": 1592 + }, + { + "epoch": 0.7543060076945842, + "grad_norm": 1.4590020179748535, + "learning_rate": 1.753446274724935e-05, + "loss": 0.2441, + "step": 1593 + }, + { + "epoch": 0.7547795205682154, + "grad_norm": 1.2696107625961304, + "learning_rate": 1.753109919180582e-05, + "loss": 0.2478, + "step": 1594 + }, + { + "epoch": 0.7552530334418467, + "grad_norm": 1.4026949405670166, + "learning_rate": 1.7527733666675945e-05, + "loss": 0.256, + "step": 1595 + }, + { + "epoch": 0.755726546315478, + "grad_norm": 1.0931724309921265, + "learning_rate": 1.7524366172739954e-05, + "loss": 0.2363, + "step": 1596 + }, + { + "epoch": 0.7562000591891092, + "grad_norm": 1.0928606986999512, + "learning_rate": 1.7520996710878577e-05, + "loss": 0.2427, + "step": 1597 + }, + { + "epoch": 0.7566735720627404, + "grad_norm": 1.247383713722229, + "learning_rate": 1.751762528197307e-05, + "loss": 0.2468, + "step": 1598 + }, + { + "epoch": 0.7571470849363717, + "grad_norm": 1.1337532997131348, + "learning_rate": 1.7514251886905192e-05, + "loss": 0.2321, + "step": 1599 + }, + { + "epoch": 0.7576205978100029, + "grad_norm": 1.2659448385238647, + "learning_rate": 1.7510876526557225e-05, + "loss": 0.2439, + "step": 1600 + }, + { + "epoch": 0.7580941106836342, + "grad_norm": 1.3613624572753906, + "learning_rate": 1.7507499201811958e-05, + "loss": 0.233, + "step": 1601 + }, + { + "epoch": 0.7585676235572655, + "grad_norm": 1.2028915882110596, + "learning_rate": 1.7504119913552707e-05, + "loss": 0.2176, + "step": 1602 + }, + { + "epoch": 0.7590411364308968, + "grad_norm": 1.3185116052627563, + "learning_rate": 1.750073866266328e-05, + "loss": 0.2633, + "step": 1603 + }, + { + "epoch": 0.759514649304528, + "grad_norm": 1.093675971031189, + "learning_rate": 1.749735545002802e-05, + "loss": 0.2641, + "step": 1604 + }, + { + "epoch": 0.7599881621781592, + "grad_norm": 1.3118257522583008, + "learning_rate": 1.7493970276531768e-05, + "loss": 0.2769, + "step": 1605 + }, + { + "epoch": 0.7604616750517905, + "grad_norm": 1.8869200944900513, + "learning_rate": 1.7490583143059885e-05, + "loss": 0.247, + "step": 1606 + }, + { + "epoch": 0.7609351879254217, + "grad_norm": 1.1935513019561768, + "learning_rate": 1.748719405049825e-05, + "loss": 0.2098, + "step": 1607 + }, + { + "epoch": 0.761408700799053, + "grad_norm": 1.2408536672592163, + "learning_rate": 1.7483802999733237e-05, + "loss": 0.227, + "step": 1608 + }, + { + "epoch": 0.7618822136726843, + "grad_norm": 1.9241235256195068, + "learning_rate": 1.748040999165175e-05, + "loss": 0.2486, + "step": 1609 + }, + { + "epoch": 0.7623557265463154, + "grad_norm": 2.5321922302246094, + "learning_rate": 1.7477015027141192e-05, + "loss": 0.2458, + "step": 1610 + }, + { + "epoch": 0.7628292394199467, + "grad_norm": 1.3591722249984741, + "learning_rate": 1.7473618107089482e-05, + "loss": 0.2064, + "step": 1611 + }, + { + "epoch": 0.763302752293578, + "grad_norm": 1.9121155738830566, + "learning_rate": 1.747021923238506e-05, + "loss": 0.2565, + "step": 1612 + }, + { + "epoch": 0.7637762651672092, + "grad_norm": 1.7858729362487793, + "learning_rate": 1.7466818403916862e-05, + "loss": 0.2642, + "step": 1613 + }, + { + "epoch": 0.7642497780408405, + "grad_norm": 1.5614818334579468, + "learning_rate": 1.7463415622574346e-05, + "loss": 0.2363, + "step": 1614 + }, + { + "epoch": 0.7647232909144718, + "grad_norm": 1.199522852897644, + "learning_rate": 1.7460010889247473e-05, + "loss": 0.2525, + "step": 1615 + }, + { + "epoch": 0.765196803788103, + "grad_norm": 1.5608481168746948, + "learning_rate": 1.7456604204826725e-05, + "loss": 0.2218, + "step": 1616 + }, + { + "epoch": 0.7656703166617342, + "grad_norm": 2.3330767154693604, + "learning_rate": 1.7453195570203075e-05, + "loss": 0.2299, + "step": 1617 + }, + { + "epoch": 0.7661438295353655, + "grad_norm": 2.918964385986328, + "learning_rate": 1.7449784986268033e-05, + "loss": 0.2124, + "step": 1618 + }, + { + "epoch": 0.7666173424089967, + "grad_norm": 2.289975643157959, + "learning_rate": 1.7446372453913592e-05, + "loss": 0.2549, + "step": 1619 + }, + { + "epoch": 0.767090855282628, + "grad_norm": 1.3809151649475098, + "learning_rate": 1.7442957974032274e-05, + "loss": 0.2461, + "step": 1620 + }, + { + "epoch": 0.7675643681562593, + "grad_norm": 1.484054446220398, + "learning_rate": 1.74395415475171e-05, + "loss": 0.2427, + "step": 1621 + }, + { + "epoch": 0.7680378810298905, + "grad_norm": 1.378715991973877, + "learning_rate": 1.7436123175261607e-05, + "loss": 0.2572, + "step": 1622 + }, + { + "epoch": 0.7685113939035217, + "grad_norm": 1.168416976928711, + "learning_rate": 1.7432702858159835e-05, + "loss": 0.2395, + "step": 1623 + }, + { + "epoch": 0.768984906777153, + "grad_norm": 1.0436391830444336, + "learning_rate": 1.742928059710633e-05, + "loss": 0.2144, + "step": 1624 + }, + { + "epoch": 0.7694584196507842, + "grad_norm": 2.4827699661254883, + "learning_rate": 1.742585639299616e-05, + "loss": 0.2408, + "step": 1625 + }, + { + "epoch": 0.7699319325244155, + "grad_norm": 2.694228410720825, + "learning_rate": 1.7422430246724892e-05, + "loss": 0.2193, + "step": 1626 + }, + { + "epoch": 0.7704054453980468, + "grad_norm": 1.7454769611358643, + "learning_rate": 1.7419002159188593e-05, + "loss": 0.2328, + "step": 1627 + }, + { + "epoch": 0.770878958271678, + "grad_norm": 2.133936643600464, + "learning_rate": 1.7415572131283856e-05, + "loss": 0.2401, + "step": 1628 + }, + { + "epoch": 0.7713524711453092, + "grad_norm": 1.3482989072799683, + "learning_rate": 1.7412140163907765e-05, + "loss": 0.2338, + "step": 1629 + }, + { + "epoch": 0.7718259840189405, + "grad_norm": 1.8004233837127686, + "learning_rate": 1.7408706257957922e-05, + "loss": 0.2271, + "step": 1630 + }, + { + "epoch": 0.7722994968925717, + "grad_norm": 1.835065484046936, + "learning_rate": 1.740527041433243e-05, + "loss": 0.2661, + "step": 1631 + }, + { + "epoch": 0.772773009766203, + "grad_norm": 1.9195033311843872, + "learning_rate": 1.7401832633929897e-05, + "loss": 0.2482, + "step": 1632 + }, + { + "epoch": 0.7732465226398343, + "grad_norm": 1.346403956413269, + "learning_rate": 1.7398392917649448e-05, + "loss": 0.2437, + "step": 1633 + }, + { + "epoch": 0.7737200355134656, + "grad_norm": 1.3023998737335205, + "learning_rate": 1.7394951266390708e-05, + "loss": 0.2476, + "step": 1634 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 1.6358240842819214, + "learning_rate": 1.7391507681053802e-05, + "loss": 0.2367, + "step": 1635 + }, + { + "epoch": 0.774667061260728, + "grad_norm": 1.2242166996002197, + "learning_rate": 1.7388062162539368e-05, + "loss": 0.2527, + "step": 1636 + }, + { + "epoch": 0.7751405741343593, + "grad_norm": 1.1994009017944336, + "learning_rate": 1.738461471174855e-05, + "loss": 0.2417, + "step": 1637 + }, + { + "epoch": 0.7756140870079905, + "grad_norm": 1.6394872665405273, + "learning_rate": 1.7381165329582996e-05, + "loss": 0.2193, + "step": 1638 + }, + { + "epoch": 0.7760875998816218, + "grad_norm": 1.1021173000335693, + "learning_rate": 1.7377714016944856e-05, + "loss": 0.2529, + "step": 1639 + }, + { + "epoch": 0.7765611127552531, + "grad_norm": 0.9502620697021484, + "learning_rate": 1.737426077473679e-05, + "loss": 0.2347, + "step": 1640 + }, + { + "epoch": 0.7770346256288843, + "grad_norm": 1.5584297180175781, + "learning_rate": 1.737080560386196e-05, + "loss": 0.2619, + "step": 1641 + }, + { + "epoch": 0.7775081385025155, + "grad_norm": 1.0773392915725708, + "learning_rate": 1.736734850522403e-05, + "loss": 0.2445, + "step": 1642 + }, + { + "epoch": 0.7779816513761468, + "grad_norm": 0.9587130546569824, + "learning_rate": 1.736388947972717e-05, + "loss": 0.2168, + "step": 1643 + }, + { + "epoch": 0.778455164249778, + "grad_norm": 1.201387882232666, + "learning_rate": 1.7360428528276062e-05, + "loss": 0.2421, + "step": 1644 + }, + { + "epoch": 0.7789286771234093, + "grad_norm": 1.5380560159683228, + "learning_rate": 1.735696565177588e-05, + "loss": 0.2278, + "step": 1645 + }, + { + "epoch": 0.7794021899970406, + "grad_norm": 1.150850534439087, + "learning_rate": 1.7353500851132305e-05, + "loss": 0.2434, + "step": 1646 + }, + { + "epoch": 0.7798757028706718, + "grad_norm": 1.1250213384628296, + "learning_rate": 1.735003412725152e-05, + "loss": 0.2712, + "step": 1647 + }, + { + "epoch": 0.780349215744303, + "grad_norm": 1.428877592086792, + "learning_rate": 1.7346565481040218e-05, + "loss": 0.2397, + "step": 1648 + }, + { + "epoch": 0.7808227286179343, + "grad_norm": 1.245980978012085, + "learning_rate": 1.734309491340559e-05, + "loss": 0.2259, + "step": 1649 + }, + { + "epoch": 0.7812962414915655, + "grad_norm": 1.2803640365600586, + "learning_rate": 1.7339622425255323e-05, + "loss": 0.224, + "step": 1650 + }, + { + "epoch": 0.7817697543651968, + "grad_norm": 1.3713748455047607, + "learning_rate": 1.7336148017497617e-05, + "loss": 0.2288, + "step": 1651 + }, + { + "epoch": 0.7822432672388281, + "grad_norm": 1.4782074689865112, + "learning_rate": 1.7332671691041173e-05, + "loss": 0.2334, + "step": 1652 + }, + { + "epoch": 0.7827167801124593, + "grad_norm": 1.6904542446136475, + "learning_rate": 1.7329193446795186e-05, + "loss": 0.2539, + "step": 1653 + }, + { + "epoch": 0.7831902929860906, + "grad_norm": 2.416238784790039, + "learning_rate": 1.732571328566936e-05, + "loss": 0.2279, + "step": 1654 + }, + { + "epoch": 0.7836638058597218, + "grad_norm": 2.9311797618865967, + "learning_rate": 1.7322231208573897e-05, + "loss": 0.2518, + "step": 1655 + }, + { + "epoch": 0.784137318733353, + "grad_norm": 1.2711611986160278, + "learning_rate": 1.7318747216419502e-05, + "loss": 0.2443, + "step": 1656 + }, + { + "epoch": 0.7846108316069843, + "grad_norm": 1.742221713066101, + "learning_rate": 1.7315261310117376e-05, + "loss": 0.2431, + "step": 1657 + }, + { + "epoch": 0.7850843444806156, + "grad_norm": 1.3446333408355713, + "learning_rate": 1.7311773490579225e-05, + "loss": 0.2241, + "step": 1658 + }, + { + "epoch": 0.7855578573542468, + "grad_norm": 1.3194044828414917, + "learning_rate": 1.7308283758717255e-05, + "loss": 0.2399, + "step": 1659 + }, + { + "epoch": 0.7860313702278781, + "grad_norm": 1.2633658647537231, + "learning_rate": 1.7304792115444172e-05, + "loss": 0.2336, + "step": 1660 + }, + { + "epoch": 0.7865048831015093, + "grad_norm": 1.167573094367981, + "learning_rate": 1.7301298561673186e-05, + "loss": 0.2337, + "step": 1661 + }, + { + "epoch": 0.7869783959751405, + "grad_norm": 1.2154494524002075, + "learning_rate": 1.7297803098317995e-05, + "loss": 0.2571, + "step": 1662 + }, + { + "epoch": 0.7874519088487718, + "grad_norm": 1.3965184688568115, + "learning_rate": 1.7294305726292804e-05, + "loss": 0.2521, + "step": 1663 + }, + { + "epoch": 0.7879254217224031, + "grad_norm": 1.1656535863876343, + "learning_rate": 1.7290806446512324e-05, + "loss": 0.2134, + "step": 1664 + }, + { + "epoch": 0.7883989345960344, + "grad_norm": 1.8412251472473145, + "learning_rate": 1.728730525989175e-05, + "loss": 0.2219, + "step": 1665 + }, + { + "epoch": 0.7888724474696656, + "grad_norm": 2.7580809593200684, + "learning_rate": 1.7283802167346793e-05, + "loss": 0.2484, + "step": 1666 + }, + { + "epoch": 0.7893459603432968, + "grad_norm": 1.3492392301559448, + "learning_rate": 1.7280297169793643e-05, + "loss": 0.2292, + "step": 1667 + }, + { + "epoch": 0.7898194732169281, + "grad_norm": 1.6488065719604492, + "learning_rate": 1.7276790268149e-05, + "loss": 0.2416, + "step": 1668 + }, + { + "epoch": 0.7902929860905593, + "grad_norm": 1.1698122024536133, + "learning_rate": 1.7273281463330066e-05, + "loss": 0.2375, + "step": 1669 + }, + { + "epoch": 0.7907664989641906, + "grad_norm": 1.7447906732559204, + "learning_rate": 1.7269770756254532e-05, + "loss": 0.257, + "step": 1670 + }, + { + "epoch": 0.7912400118378219, + "grad_norm": 1.752770185470581, + "learning_rate": 1.726625814784059e-05, + "loss": 0.2087, + "step": 1671 + }, + { + "epoch": 0.7917135247114531, + "grad_norm": 1.0216140747070312, + "learning_rate": 1.7262743639006928e-05, + "loss": 0.2084, + "step": 1672 + }, + { + "epoch": 0.7921870375850844, + "grad_norm": 1.923169493675232, + "learning_rate": 1.7259227230672733e-05, + "loss": 0.231, + "step": 1673 + }, + { + "epoch": 0.7926605504587156, + "grad_norm": 1.4375102519989014, + "learning_rate": 1.7255708923757683e-05, + "loss": 0.2381, + "step": 1674 + }, + { + "epoch": 0.7931340633323468, + "grad_norm": 1.4559400081634521, + "learning_rate": 1.7252188719181965e-05, + "loss": 0.2328, + "step": 1675 + }, + { + "epoch": 0.7936075762059781, + "grad_norm": 1.6422553062438965, + "learning_rate": 1.724866661786625e-05, + "loss": 0.2457, + "step": 1676 + }, + { + "epoch": 0.7940810890796094, + "grad_norm": 1.408460259437561, + "learning_rate": 1.7245142620731707e-05, + "loss": 0.2479, + "step": 1677 + }, + { + "epoch": 0.7945546019532406, + "grad_norm": 1.476257562637329, + "learning_rate": 1.724161672870001e-05, + "loss": 0.2193, + "step": 1678 + }, + { + "epoch": 0.7950281148268719, + "grad_norm": 1.108467698097229, + "learning_rate": 1.7238088942693315e-05, + "loss": 0.2265, + "step": 1679 + }, + { + "epoch": 0.7955016277005031, + "grad_norm": 1.2234896421432495, + "learning_rate": 1.723455926363429e-05, + "loss": 0.2383, + "step": 1680 + }, + { + "epoch": 0.7959751405741343, + "grad_norm": 0.9792563915252686, + "learning_rate": 1.723102769244608e-05, + "loss": 0.2288, + "step": 1681 + }, + { + "epoch": 0.7964486534477656, + "grad_norm": 1.273815393447876, + "learning_rate": 1.7227494230052337e-05, + "loss": 0.2287, + "step": 1682 + }, + { + "epoch": 0.7969221663213969, + "grad_norm": 1.5027164220809937, + "learning_rate": 1.72239588773772e-05, + "loss": 0.2635, + "step": 1683 + }, + { + "epoch": 0.7973956791950281, + "grad_norm": 1.1177582740783691, + "learning_rate": 1.7220421635345312e-05, + "loss": 0.2351, + "step": 1684 + }, + { + "epoch": 0.7978691920686594, + "grad_norm": 1.341367483139038, + "learning_rate": 1.7216882504881796e-05, + "loss": 0.2124, + "step": 1685 + }, + { + "epoch": 0.7983427049422906, + "grad_norm": 1.2859315872192383, + "learning_rate": 1.721334148691229e-05, + "loss": 0.2186, + "step": 1686 + }, + { + "epoch": 0.7988162178159218, + "grad_norm": 1.8942023515701294, + "learning_rate": 1.7209798582362904e-05, + "loss": 0.2746, + "step": 1687 + }, + { + "epoch": 0.7992897306895531, + "grad_norm": 1.0868715047836304, + "learning_rate": 1.7206253792160255e-05, + "loss": 0.238, + "step": 1688 + }, + { + "epoch": 0.7997632435631844, + "grad_norm": 1.7027019262313843, + "learning_rate": 1.7202707117231443e-05, + "loss": 0.2381, + "step": 1689 + }, + { + "epoch": 0.8002367564368156, + "grad_norm": 1.0921725034713745, + "learning_rate": 1.7199158558504074e-05, + "loss": 0.2556, + "step": 1690 + }, + { + "epoch": 0.8007102693104469, + "grad_norm": 1.053194522857666, + "learning_rate": 1.7195608116906234e-05, + "loss": 0.2267, + "step": 1691 + }, + { + "epoch": 0.8011837821840782, + "grad_norm": 1.3293603658676147, + "learning_rate": 1.7192055793366506e-05, + "loss": 0.2312, + "step": 1692 + }, + { + "epoch": 0.8016572950577093, + "grad_norm": 1.9301906824111938, + "learning_rate": 1.718850158881397e-05, + "loss": 0.2198, + "step": 1693 + }, + { + "epoch": 0.8021308079313406, + "grad_norm": 1.5638904571533203, + "learning_rate": 1.7184945504178193e-05, + "loss": 0.2624, + "step": 1694 + }, + { + "epoch": 0.8026043208049719, + "grad_norm": 1.715846061706543, + "learning_rate": 1.7181387540389235e-05, + "loss": 0.2195, + "step": 1695 + }, + { + "epoch": 0.8030778336786032, + "grad_norm": 1.0800182819366455, + "learning_rate": 1.7177827698377646e-05, + "loss": 0.2419, + "step": 1696 + }, + { + "epoch": 0.8035513465522344, + "grad_norm": 1.205739974975586, + "learning_rate": 1.7174265979074464e-05, + "loss": 0.2211, + "step": 1697 + }, + { + "epoch": 0.8040248594258657, + "grad_norm": 1.1386871337890625, + "learning_rate": 1.717070238341123e-05, + "loss": 0.2216, + "step": 1698 + }, + { + "epoch": 0.8044983722994969, + "grad_norm": 1.242944598197937, + "learning_rate": 1.7167136912319962e-05, + "loss": 0.2197, + "step": 1699 + }, + { + "epoch": 0.8049718851731281, + "grad_norm": 1.081717610359192, + "learning_rate": 1.716356956673318e-05, + "loss": 0.2377, + "step": 1700 + }, + { + "epoch": 0.8054453980467594, + "grad_norm": 1.5992521047592163, + "learning_rate": 1.7160000347583885e-05, + "loss": 0.2345, + "step": 1701 + }, + { + "epoch": 0.8059189109203907, + "grad_norm": 1.3137495517730713, + "learning_rate": 1.715642925580557e-05, + "loss": 0.2322, + "step": 1702 + }, + { + "epoch": 0.8063924237940219, + "grad_norm": 1.910725474357605, + "learning_rate": 1.7152856292332225e-05, + "loss": 0.2487, + "step": 1703 + }, + { + "epoch": 0.8068659366676532, + "grad_norm": 1.408942699432373, + "learning_rate": 1.7149281458098325e-05, + "loss": 0.2556, + "step": 1704 + }, + { + "epoch": 0.8073394495412844, + "grad_norm": 1.5198936462402344, + "learning_rate": 1.7145704754038825e-05, + "loss": 0.2308, + "step": 1705 + }, + { + "epoch": 0.8078129624149156, + "grad_norm": 1.0505871772766113, + "learning_rate": 1.7142126181089184e-05, + "loss": 0.2366, + "step": 1706 + }, + { + "epoch": 0.8082864752885469, + "grad_norm": 1.4777036905288696, + "learning_rate": 1.713854574018534e-05, + "loss": 0.2297, + "step": 1707 + }, + { + "epoch": 0.8087599881621782, + "grad_norm": 1.320868730545044, + "learning_rate": 1.7134963432263725e-05, + "loss": 0.2222, + "step": 1708 + }, + { + "epoch": 0.8092335010358094, + "grad_norm": 1.3242131471633911, + "learning_rate": 1.713137925826126e-05, + "loss": 0.2458, + "step": 1709 + }, + { + "epoch": 0.8097070139094407, + "grad_norm": 1.8690855503082275, + "learning_rate": 1.7127793219115345e-05, + "loss": 0.2241, + "step": 1710 + }, + { + "epoch": 0.810180526783072, + "grad_norm": 1.4543685913085938, + "learning_rate": 1.7124205315763876e-05, + "loss": 0.2369, + "step": 1711 + }, + { + "epoch": 0.8106540396567031, + "grad_norm": 1.1340458393096924, + "learning_rate": 1.7120615549145234e-05, + "loss": 0.2446, + "step": 1712 + }, + { + "epoch": 0.8111275525303344, + "grad_norm": 2.3450067043304443, + "learning_rate": 1.711702392019829e-05, + "loss": 0.2507, + "step": 1713 + }, + { + "epoch": 0.8116010654039657, + "grad_norm": 1.5409314632415771, + "learning_rate": 1.71134304298624e-05, + "loss": 0.213, + "step": 1714 + }, + { + "epoch": 0.8120745782775969, + "grad_norm": 1.5783090591430664, + "learning_rate": 1.7109835079077406e-05, + "loss": 0.2278, + "step": 1715 + }, + { + "epoch": 0.8125480911512282, + "grad_norm": 1.2138385772705078, + "learning_rate": 1.7106237868783635e-05, + "loss": 0.2228, + "step": 1716 + }, + { + "epoch": 0.8130216040248595, + "grad_norm": 1.4703527688980103, + "learning_rate": 1.710263879992191e-05, + "loss": 0.2285, + "step": 1717 + }, + { + "epoch": 0.8134951168984906, + "grad_norm": 1.418495774269104, + "learning_rate": 1.709903787343352e-05, + "loss": 0.2528, + "step": 1718 + }, + { + "epoch": 0.8139686297721219, + "grad_norm": 2.0826809406280518, + "learning_rate": 1.7095435090260263e-05, + "loss": 0.2252, + "step": 1719 + }, + { + "epoch": 0.8144421426457532, + "grad_norm": 1.1822896003723145, + "learning_rate": 1.7091830451344406e-05, + "loss": 0.2336, + "step": 1720 + }, + { + "epoch": 0.8149156555193844, + "grad_norm": 2.3410282135009766, + "learning_rate": 1.7088223957628714e-05, + "loss": 0.2513, + "step": 1721 + }, + { + "epoch": 0.8153891683930157, + "grad_norm": 0.9941398501396179, + "learning_rate": 1.708461561005643e-05, + "loss": 0.2511, + "step": 1722 + }, + { + "epoch": 0.815862681266647, + "grad_norm": 1.2086396217346191, + "learning_rate": 1.708100540957127e-05, + "loss": 0.2535, + "step": 1723 + }, + { + "epoch": 0.8163361941402781, + "grad_norm": 1.3249119520187378, + "learning_rate": 1.7077393357117467e-05, + "loss": 0.252, + "step": 1724 + }, + { + "epoch": 0.8168097070139094, + "grad_norm": 1.0457037687301636, + "learning_rate": 1.70737794536397e-05, + "loss": 0.2265, + "step": 1725 + }, + { + "epoch": 0.8172832198875407, + "grad_norm": 1.7129755020141602, + "learning_rate": 1.7070163700083163e-05, + "loss": 0.2345, + "step": 1726 + }, + { + "epoch": 0.8177567327611719, + "grad_norm": 1.0255284309387207, + "learning_rate": 1.7066546097393518e-05, + "loss": 0.2303, + "step": 1727 + }, + { + "epoch": 0.8182302456348032, + "grad_norm": 1.2299504280090332, + "learning_rate": 1.7062926646516915e-05, + "loss": 0.2454, + "step": 1728 + }, + { + "epoch": 0.8187037585084345, + "grad_norm": 1.1177610158920288, + "learning_rate": 1.705930534839998e-05, + "loss": 0.2478, + "step": 1729 + }, + { + "epoch": 0.8191772713820658, + "grad_norm": 1.1740856170654297, + "learning_rate": 1.7055682203989838e-05, + "loss": 0.2253, + "step": 1730 + }, + { + "epoch": 0.8196507842556969, + "grad_norm": 2.3321869373321533, + "learning_rate": 1.705205721423408e-05, + "loss": 0.2168, + "step": 1731 + }, + { + "epoch": 0.8201242971293282, + "grad_norm": 1.5028231143951416, + "learning_rate": 1.7048430380080793e-05, + "loss": 0.2292, + "step": 1732 + }, + { + "epoch": 0.8205978100029595, + "grad_norm": 1.116110920906067, + "learning_rate": 1.7044801702478534e-05, + "loss": 0.2118, + "step": 1733 + }, + { + "epoch": 0.8210713228765907, + "grad_norm": 2.010495901107788, + "learning_rate": 1.7041171182376354e-05, + "loss": 0.2414, + "step": 1734 + }, + { + "epoch": 0.821544835750222, + "grad_norm": 1.1517298221588135, + "learning_rate": 1.7037538820723777e-05, + "loss": 0.2366, + "step": 1735 + }, + { + "epoch": 0.8220183486238533, + "grad_norm": 1.393791913986206, + "learning_rate": 1.7033904618470814e-05, + "loss": 0.2457, + "step": 1736 + }, + { + "epoch": 0.8224918614974844, + "grad_norm": 1.2726801633834839, + "learning_rate": 1.7030268576567956e-05, + "loss": 0.2476, + "step": 1737 + }, + { + "epoch": 0.8229653743711157, + "grad_norm": 1.198588490486145, + "learning_rate": 1.7026630695966172e-05, + "loss": 0.2283, + "step": 1738 + }, + { + "epoch": 0.823438887244747, + "grad_norm": 1.3464490175247192, + "learning_rate": 1.7022990977616922e-05, + "loss": 0.2221, + "step": 1739 + }, + { + "epoch": 0.8239124001183782, + "grad_norm": 1.504508137702942, + "learning_rate": 1.7019349422472128e-05, + "loss": 0.2292, + "step": 1740 + }, + { + "epoch": 0.8243859129920095, + "grad_norm": 1.8641504049301147, + "learning_rate": 1.7015706031484215e-05, + "loss": 0.2391, + "step": 1741 + }, + { + "epoch": 0.8248594258656408, + "grad_norm": 1.2077713012695312, + "learning_rate": 1.7012060805606067e-05, + "loss": 0.2267, + "step": 1742 + }, + { + "epoch": 0.8253329387392719, + "grad_norm": 1.1325595378875732, + "learning_rate": 1.7008413745791063e-05, + "loss": 0.2077, + "step": 1743 + }, + { + "epoch": 0.8258064516129032, + "grad_norm": 1.5017510652542114, + "learning_rate": 1.7004764852993056e-05, + "loss": 0.2344, + "step": 1744 + }, + { + "epoch": 0.8262799644865345, + "grad_norm": 1.8392795324325562, + "learning_rate": 1.7001114128166375e-05, + "loss": 0.2312, + "step": 1745 + }, + { + "epoch": 0.8267534773601657, + "grad_norm": 1.4507255554199219, + "learning_rate": 1.6997461572265838e-05, + "loss": 0.2629, + "step": 1746 + }, + { + "epoch": 0.827226990233797, + "grad_norm": 1.8967761993408203, + "learning_rate": 1.6993807186246735e-05, + "loss": 0.2135, + "step": 1747 + }, + { + "epoch": 0.8277005031074283, + "grad_norm": 1.259427785873413, + "learning_rate": 1.6990150971064827e-05, + "loss": 0.2199, + "step": 1748 + }, + { + "epoch": 0.8281740159810594, + "grad_norm": 1.5154216289520264, + "learning_rate": 1.698649292767637e-05, + "loss": 0.2385, + "step": 1749 + }, + { + "epoch": 0.8286475288546907, + "grad_norm": 1.33448326587677, + "learning_rate": 1.6982833057038087e-05, + "loss": 0.229, + "step": 1750 + }, + { + "epoch": 0.829121041728322, + "grad_norm": 1.2957985401153564, + "learning_rate": 1.6979171360107183e-05, + "loss": 0.2483, + "step": 1751 + }, + { + "epoch": 0.8295945546019532, + "grad_norm": 1.2026399374008179, + "learning_rate": 1.6975507837841338e-05, + "loss": 0.2452, + "step": 1752 + }, + { + "epoch": 0.8300680674755845, + "grad_norm": 1.6381677389144897, + "learning_rate": 1.6971842491198716e-05, + "loss": 0.2299, + "step": 1753 + }, + { + "epoch": 0.8305415803492158, + "grad_norm": 1.994767189025879, + "learning_rate": 1.6968175321137942e-05, + "loss": 0.2343, + "step": 1754 + }, + { + "epoch": 0.831015093222847, + "grad_norm": 1.1473991870880127, + "learning_rate": 1.696450632861814e-05, + "loss": 0.2431, + "step": 1755 + }, + { + "epoch": 0.8314886060964782, + "grad_norm": 1.2881150245666504, + "learning_rate": 1.6960835514598897e-05, + "loss": 0.245, + "step": 1756 + }, + { + "epoch": 0.8319621189701095, + "grad_norm": 1.086107850074768, + "learning_rate": 1.6957162880040273e-05, + "loss": 0.221, + "step": 1757 + }, + { + "epoch": 0.8324356318437407, + "grad_norm": 1.6053413152694702, + "learning_rate": 1.695348842590282e-05, + "loss": 0.2587, + "step": 1758 + }, + { + "epoch": 0.832909144717372, + "grad_norm": 1.3724325895309448, + "learning_rate": 1.694981215314755e-05, + "loss": 0.2508, + "step": 1759 + }, + { + "epoch": 0.8333826575910033, + "grad_norm": 1.0389055013656616, + "learning_rate": 1.6946134062735953e-05, + "loss": 0.214, + "step": 1760 + }, + { + "epoch": 0.8338561704646346, + "grad_norm": 1.638784646987915, + "learning_rate": 1.6942454155630005e-05, + "loss": 0.2257, + "step": 1761 + }, + { + "epoch": 0.8343296833382657, + "grad_norm": 1.2976818084716797, + "learning_rate": 1.6938772432792146e-05, + "loss": 0.2631, + "step": 1762 + }, + { + "epoch": 0.834803196211897, + "grad_norm": 1.4617828130722046, + "learning_rate": 1.69350888951853e-05, + "loss": 0.2217, + "step": 1763 + }, + { + "epoch": 0.8352767090855283, + "grad_norm": 1.6357537508010864, + "learning_rate": 1.693140354377286e-05, + "loss": 0.276, + "step": 1764 + }, + { + "epoch": 0.8357502219591595, + "grad_norm": 2.0060503482818604, + "learning_rate": 1.6927716379518683e-05, + "loss": 0.2226, + "step": 1765 + }, + { + "epoch": 0.8362237348327908, + "grad_norm": 1.3638052940368652, + "learning_rate": 1.6924027403387125e-05, + "loss": 0.2312, + "step": 1766 + }, + { + "epoch": 0.8366972477064221, + "grad_norm": 1.0776060819625854, + "learning_rate": 1.6920336616343e-05, + "loss": 0.2261, + "step": 1767 + }, + { + "epoch": 0.8371707605800532, + "grad_norm": 1.6560837030410767, + "learning_rate": 1.691664401935159e-05, + "loss": 0.2287, + "step": 1768 + }, + { + "epoch": 0.8376442734536845, + "grad_norm": 1.7977901697158813, + "learning_rate": 1.691294961337866e-05, + "loss": 0.2451, + "step": 1769 + }, + { + "epoch": 0.8381177863273158, + "grad_norm": 1.1947377920150757, + "learning_rate": 1.6909253399390453e-05, + "loss": 0.2327, + "step": 1770 + }, + { + "epoch": 0.838591299200947, + "grad_norm": 1.2023718357086182, + "learning_rate": 1.6905555378353676e-05, + "loss": 0.2413, + "step": 1771 + }, + { + "epoch": 0.8390648120745783, + "grad_norm": 1.2229983806610107, + "learning_rate": 1.6901855551235505e-05, + "loss": 0.2319, + "step": 1772 + }, + { + "epoch": 0.8395383249482096, + "grad_norm": 2.0484726428985596, + "learning_rate": 1.68981539190036e-05, + "loss": 0.2407, + "step": 1773 + }, + { + "epoch": 0.8400118378218407, + "grad_norm": 1.0766804218292236, + "learning_rate": 1.6894450482626087e-05, + "loss": 0.2518, + "step": 1774 + }, + { + "epoch": 0.840485350695472, + "grad_norm": 1.1905542612075806, + "learning_rate": 1.6890745243071558e-05, + "loss": 0.2279, + "step": 1775 + }, + { + "epoch": 0.8409588635691033, + "grad_norm": 1.0002597570419312, + "learning_rate": 1.688703820130909e-05, + "loss": 0.235, + "step": 1776 + }, + { + "epoch": 0.8414323764427345, + "grad_norm": 1.6852128505706787, + "learning_rate": 1.688332935830822e-05, + "loss": 0.2301, + "step": 1777 + }, + { + "epoch": 0.8419058893163658, + "grad_norm": 0.9994098544120789, + "learning_rate": 1.687961871503896e-05, + "loss": 0.2174, + "step": 1778 + }, + { + "epoch": 0.8423794021899971, + "grad_norm": 1.0002872943878174, + "learning_rate": 1.6875906272471795e-05, + "loss": 0.2541, + "step": 1779 + }, + { + "epoch": 0.8428529150636282, + "grad_norm": 1.7979168891906738, + "learning_rate": 1.6872192031577675e-05, + "loss": 0.2421, + "step": 1780 + }, + { + "epoch": 0.8433264279372595, + "grad_norm": 1.4673607349395752, + "learning_rate": 1.6868475993328027e-05, + "loss": 0.2231, + "step": 1781 + }, + { + "epoch": 0.8437999408108908, + "grad_norm": 1.4320827722549438, + "learning_rate": 1.6864758158694747e-05, + "loss": 0.2273, + "step": 1782 + }, + { + "epoch": 0.844273453684522, + "grad_norm": 1.837491750717163, + "learning_rate": 1.6861038528650197e-05, + "loss": 0.2815, + "step": 1783 + }, + { + "epoch": 0.8447469665581533, + "grad_norm": 0.963911771774292, + "learning_rate": 1.685731710416721e-05, + "loss": 0.209, + "step": 1784 + }, + { + "epoch": 0.8452204794317846, + "grad_norm": 1.31343412399292, + "learning_rate": 1.685359388621908e-05, + "loss": 0.2288, + "step": 1785 + }, + { + "epoch": 0.8456939923054158, + "grad_norm": 1.2854394912719727, + "learning_rate": 1.6849868875779594e-05, + "loss": 0.2427, + "step": 1786 + }, + { + "epoch": 0.846167505179047, + "grad_norm": 1.5275187492370605, + "learning_rate": 1.6846142073822983e-05, + "loss": 0.259, + "step": 1787 + }, + { + "epoch": 0.8466410180526783, + "grad_norm": 1.2911092042922974, + "learning_rate": 1.684241348132396e-05, + "loss": 0.258, + "step": 1788 + }, + { + "epoch": 0.8471145309263095, + "grad_norm": 1.2889440059661865, + "learning_rate": 1.68386830992577e-05, + "loss": 0.235, + "step": 1789 + }, + { + "epoch": 0.8475880437999408, + "grad_norm": 1.3690677881240845, + "learning_rate": 1.683495092859985e-05, + "loss": 0.2401, + "step": 1790 + }, + { + "epoch": 0.8480615566735721, + "grad_norm": 1.3110579252243042, + "learning_rate": 1.683121697032652e-05, + "loss": 0.2563, + "step": 1791 + }, + { + "epoch": 0.8485350695472034, + "grad_norm": 1.2487221956253052, + "learning_rate": 1.6827481225414298e-05, + "loss": 0.2234, + "step": 1792 + }, + { + "epoch": 0.8490085824208345, + "grad_norm": 1.9341635704040527, + "learning_rate": 1.6823743694840226e-05, + "loss": 0.2653, + "step": 1793 + }, + { + "epoch": 0.8494820952944658, + "grad_norm": 1.6593029499053955, + "learning_rate": 1.6820004379581816e-05, + "loss": 0.2799, + "step": 1794 + }, + { + "epoch": 0.8499556081680971, + "grad_norm": 1.5015946626663208, + "learning_rate": 1.6816263280617054e-05, + "loss": 0.2821, + "step": 1795 + }, + { + "epoch": 0.8504291210417283, + "grad_norm": 1.0924832820892334, + "learning_rate": 1.6812520398924393e-05, + "loss": 0.2256, + "step": 1796 + }, + { + "epoch": 0.8509026339153596, + "grad_norm": 3.2573161125183105, + "learning_rate": 1.6808775735482746e-05, + "loss": 0.2438, + "step": 1797 + }, + { + "epoch": 0.8513761467889909, + "grad_norm": 1.6617119312286377, + "learning_rate": 1.6805029291271485e-05, + "loss": 0.2604, + "step": 1798 + }, + { + "epoch": 0.851849659662622, + "grad_norm": 1.2218396663665771, + "learning_rate": 1.680128106727046e-05, + "loss": 0.2546, + "step": 1799 + }, + { + "epoch": 0.8523231725362533, + "grad_norm": 1.5456347465515137, + "learning_rate": 1.6797531064459995e-05, + "loss": 0.232, + "step": 1800 + }, + { + "epoch": 0.8527966854098846, + "grad_norm": 1.6752814054489136, + "learning_rate": 1.679377928382085e-05, + "loss": 0.2486, + "step": 1801 + }, + { + "epoch": 0.8532701982835158, + "grad_norm": 1.3025007247924805, + "learning_rate": 1.6790025726334274e-05, + "loss": 0.243, + "step": 1802 + }, + { + "epoch": 0.8537437111571471, + "grad_norm": 1.4086090326309204, + "learning_rate": 1.6786270392981976e-05, + "loss": 0.2331, + "step": 1803 + }, + { + "epoch": 0.8542172240307784, + "grad_norm": 1.0914981365203857, + "learning_rate": 1.6782513284746124e-05, + "loss": 0.2667, + "step": 1804 + }, + { + "epoch": 0.8546907369044096, + "grad_norm": 1.737202525138855, + "learning_rate": 1.6778754402609356e-05, + "loss": 0.2584, + "step": 1805 + }, + { + "epoch": 0.8551642497780408, + "grad_norm": 1.0595831871032715, + "learning_rate": 1.6774993747554767e-05, + "loss": 0.2386, + "step": 1806 + }, + { + "epoch": 0.8556377626516721, + "grad_norm": 1.2180547714233398, + "learning_rate": 1.6771231320565925e-05, + "loss": 0.2429, + "step": 1807 + }, + { + "epoch": 0.8561112755253033, + "grad_norm": 1.1929233074188232, + "learning_rate": 1.6767467122626852e-05, + "loss": 0.227, + "step": 1808 + }, + { + "epoch": 0.8565847883989346, + "grad_norm": 1.554271936416626, + "learning_rate": 1.6763701154722043e-05, + "loss": 0.2447, + "step": 1809 + }, + { + "epoch": 0.8570583012725659, + "grad_norm": 1.0554946660995483, + "learning_rate": 1.6759933417836446e-05, + "loss": 0.2397, + "step": 1810 + }, + { + "epoch": 0.8575318141461971, + "grad_norm": 1.350473165512085, + "learning_rate": 1.6756163912955478e-05, + "loss": 0.2526, + "step": 1811 + }, + { + "epoch": 0.8580053270198283, + "grad_norm": 1.6592644453048706, + "learning_rate": 1.6752392641065015e-05, + "loss": 0.2294, + "step": 1812 + }, + { + "epoch": 0.8584788398934596, + "grad_norm": 0.9810163974761963, + "learning_rate": 1.67486196031514e-05, + "loss": 0.2413, + "step": 1813 + }, + { + "epoch": 0.8589523527670908, + "grad_norm": 1.351130485534668, + "learning_rate": 1.674484480020143e-05, + "loss": 0.2768, + "step": 1814 + }, + { + "epoch": 0.8594258656407221, + "grad_norm": 1.1572606563568115, + "learning_rate": 1.6741068233202374e-05, + "loss": 0.233, + "step": 1815 + }, + { + "epoch": 0.8598993785143534, + "grad_norm": 1.5954844951629639, + "learning_rate": 1.6737289903141954e-05, + "loss": 0.2755, + "step": 1816 + }, + { + "epoch": 0.8603728913879846, + "grad_norm": 1.2186288833618164, + "learning_rate": 1.6733509811008354e-05, + "loss": 0.2461, + "step": 1817 + }, + { + "epoch": 0.8608464042616158, + "grad_norm": 1.1969510316848755, + "learning_rate": 1.6729727957790224e-05, + "loss": 0.2269, + "step": 1818 + }, + { + "epoch": 0.8613199171352471, + "grad_norm": 1.5450844764709473, + "learning_rate": 1.672594434447667e-05, + "loss": 0.2235, + "step": 1819 + }, + { + "epoch": 0.8617934300088783, + "grad_norm": 1.2357224225997925, + "learning_rate": 1.672215897205726e-05, + "loss": 0.2287, + "step": 1820 + }, + { + "epoch": 0.8622669428825096, + "grad_norm": 1.3368169069290161, + "learning_rate": 1.6718371841522015e-05, + "loss": 0.2543, + "step": 1821 + }, + { + "epoch": 0.8627404557561409, + "grad_norm": 1.64212167263031, + "learning_rate": 1.6714582953861432e-05, + "loss": 0.2242, + "step": 1822 + }, + { + "epoch": 0.8632139686297722, + "grad_norm": 1.6220941543579102, + "learning_rate": 1.6710792310066454e-05, + "loss": 0.2628, + "step": 1823 + }, + { + "epoch": 0.8636874815034034, + "grad_norm": 1.4587221145629883, + "learning_rate": 1.6706999911128488e-05, + "loss": 0.2459, + "step": 1824 + }, + { + "epoch": 0.8641609943770346, + "grad_norm": 1.312369704246521, + "learning_rate": 1.6703205758039397e-05, + "loss": 0.249, + "step": 1825 + }, + { + "epoch": 0.8646345072506659, + "grad_norm": 1.632554054260254, + "learning_rate": 1.6699409851791507e-05, + "loss": 0.2515, + "step": 1826 + }, + { + "epoch": 0.8651080201242971, + "grad_norm": 1.6713101863861084, + "learning_rate": 1.6695612193377604e-05, + "loss": 0.2434, + "step": 1827 + }, + { + "epoch": 0.8655815329979284, + "grad_norm": 1.163874626159668, + "learning_rate": 1.6691812783790924e-05, + "loss": 0.2101, + "step": 1828 + }, + { + "epoch": 0.8660550458715597, + "grad_norm": 1.604141116142273, + "learning_rate": 1.6688011624025164e-05, + "loss": 0.2043, + "step": 1829 + }, + { + "epoch": 0.8665285587451909, + "grad_norm": 1.5439268350601196, + "learning_rate": 1.6684208715074488e-05, + "loss": 0.2381, + "step": 1830 + }, + { + "epoch": 0.8670020716188221, + "grad_norm": 2.153048038482666, + "learning_rate": 1.6680404057933504e-05, + "loss": 0.2072, + "step": 1831 + }, + { + "epoch": 0.8674755844924534, + "grad_norm": 1.1268725395202637, + "learning_rate": 1.6676597653597287e-05, + "loss": 0.2483, + "step": 1832 + }, + { + "epoch": 0.8679490973660846, + "grad_norm": 1.470022439956665, + "learning_rate": 1.6672789503061362e-05, + "loss": 0.24, + "step": 1833 + }, + { + "epoch": 0.8684226102397159, + "grad_norm": 1.7773938179016113, + "learning_rate": 1.6668979607321717e-05, + "loss": 0.2214, + "step": 1834 + }, + { + "epoch": 0.8688961231133472, + "grad_norm": 1.1891618967056274, + "learning_rate": 1.6665167967374795e-05, + "loss": 0.2411, + "step": 1835 + }, + { + "epoch": 0.8693696359869784, + "grad_norm": 2.1788344383239746, + "learning_rate": 1.666135458421749e-05, + "loss": 0.2029, + "step": 1836 + }, + { + "epoch": 0.8698431488606096, + "grad_norm": 1.9523930549621582, + "learning_rate": 1.6657539458847157e-05, + "loss": 0.2247, + "step": 1837 + }, + { + "epoch": 0.8703166617342409, + "grad_norm": 1.6084970235824585, + "learning_rate": 1.6653722592261605e-05, + "loss": 0.2564, + "step": 1838 + }, + { + "epoch": 0.8707901746078721, + "grad_norm": 1.0069568157196045, + "learning_rate": 1.6649903985459093e-05, + "loss": 0.2296, + "step": 1839 + }, + { + "epoch": 0.8712636874815034, + "grad_norm": 1.9601439237594604, + "learning_rate": 1.664608363943835e-05, + "loss": 0.2398, + "step": 1840 + }, + { + "epoch": 0.8717372003551347, + "grad_norm": 1.4246207475662231, + "learning_rate": 1.664226155519855e-05, + "loss": 0.2336, + "step": 1841 + }, + { + "epoch": 0.8722107132287659, + "grad_norm": 1.4902780055999756, + "learning_rate": 1.6638437733739317e-05, + "loss": 0.2309, + "step": 1842 + }, + { + "epoch": 0.8726842261023972, + "grad_norm": 1.1577699184417725, + "learning_rate": 1.6634612176060736e-05, + "loss": 0.2355, + "step": 1843 + }, + { + "epoch": 0.8731577389760284, + "grad_norm": 1.28193998336792, + "learning_rate": 1.6630784883163347e-05, + "loss": 0.2353, + "step": 1844 + }, + { + "epoch": 0.8736312518496596, + "grad_norm": 1.1589275598526, + "learning_rate": 1.662695585604814e-05, + "loss": 0.2501, + "step": 1845 + }, + { + "epoch": 0.8741047647232909, + "grad_norm": 1.8391677141189575, + "learning_rate": 1.662312509571656e-05, + "loss": 0.2188, + "step": 1846 + }, + { + "epoch": 0.8745782775969222, + "grad_norm": 1.7798473834991455, + "learning_rate": 1.6619292603170505e-05, + "loss": 0.2465, + "step": 1847 + }, + { + "epoch": 0.8750517904705534, + "grad_norm": 0.9828191995620728, + "learning_rate": 1.6615458379412327e-05, + "loss": 0.247, + "step": 1848 + }, + { + "epoch": 0.8755253033441847, + "grad_norm": 1.7325772047042847, + "learning_rate": 1.6611622425444834e-05, + "loss": 0.2165, + "step": 1849 + }, + { + "epoch": 0.875998816217816, + "grad_norm": 1.395041584968567, + "learning_rate": 1.6607784742271275e-05, + "loss": 0.2143, + "step": 1850 + }, + { + "epoch": 0.8764723290914471, + "grad_norm": 1.2467516660690308, + "learning_rate": 1.6603945330895364e-05, + "loss": 0.2121, + "step": 1851 + }, + { + "epoch": 0.8769458419650784, + "grad_norm": 1.279456615447998, + "learning_rate": 1.660010419232126e-05, + "loss": 0.2339, + "step": 1852 + }, + { + "epoch": 0.8774193548387097, + "grad_norm": 1.203802227973938, + "learning_rate": 1.659626132755358e-05, + "loss": 0.2216, + "step": 1853 + }, + { + "epoch": 0.8778928677123409, + "grad_norm": 1.2188462018966675, + "learning_rate": 1.6592416737597382e-05, + "loss": 0.2611, + "step": 1854 + }, + { + "epoch": 0.8783663805859722, + "grad_norm": 1.2634485960006714, + "learning_rate": 1.6588570423458185e-05, + "loss": 0.2343, + "step": 1855 + }, + { + "epoch": 0.8788398934596034, + "grad_norm": 1.070681095123291, + "learning_rate": 1.6584722386141955e-05, + "loss": 0.2278, + "step": 1856 + }, + { + "epoch": 0.8793134063332347, + "grad_norm": 1.0444326400756836, + "learning_rate": 1.6580872626655113e-05, + "loss": 0.2236, + "step": 1857 + }, + { + "epoch": 0.8797869192068659, + "grad_norm": 1.1631354093551636, + "learning_rate": 1.6577021146004514e-05, + "loss": 0.2565, + "step": 1858 + }, + { + "epoch": 0.8802604320804972, + "grad_norm": 2.2485361099243164, + "learning_rate": 1.6573167945197492e-05, + "loss": 0.243, + "step": 1859 + }, + { + "epoch": 0.8807339449541285, + "grad_norm": 1.143649697303772, + "learning_rate": 1.65693130252418e-05, + "loss": 0.2247, + "step": 1860 + }, + { + "epoch": 0.8812074578277597, + "grad_norm": 1.7670962810516357, + "learning_rate": 1.6565456387145667e-05, + "loss": 0.2466, + "step": 1861 + }, + { + "epoch": 0.881680970701391, + "grad_norm": 1.5178930759429932, + "learning_rate": 1.6561598031917752e-05, + "loss": 0.2272, + "step": 1862 + }, + { + "epoch": 0.8821544835750222, + "grad_norm": 2.0225138664245605, + "learning_rate": 1.655773796056717e-05, + "loss": 0.2544, + "step": 1863 + }, + { + "epoch": 0.8826279964486534, + "grad_norm": 1.248473048210144, + "learning_rate": 1.655387617410349e-05, + "loss": 0.2331, + "step": 1864 + }, + { + "epoch": 0.8831015093222847, + "grad_norm": 2.6657464504241943, + "learning_rate": 1.6550012673536725e-05, + "loss": 0.2501, + "step": 1865 + }, + { + "epoch": 0.883575022195916, + "grad_norm": 1.0329293012619019, + "learning_rate": 1.654614745987733e-05, + "loss": 0.2139, + "step": 1866 + }, + { + "epoch": 0.8840485350695472, + "grad_norm": 1.3353016376495361, + "learning_rate": 1.6542280534136223e-05, + "loss": 0.2816, + "step": 1867 + }, + { + "epoch": 0.8845220479431785, + "grad_norm": 0.9897204041481018, + "learning_rate": 1.6538411897324757e-05, + "loss": 0.2147, + "step": 1868 + }, + { + "epoch": 0.8849955608168097, + "grad_norm": 1.145198941230774, + "learning_rate": 1.653454155045473e-05, + "loss": 0.228, + "step": 1869 + }, + { + "epoch": 0.8854690736904409, + "grad_norm": 1.2381608486175537, + "learning_rate": 1.6530669494538403e-05, + "loss": 0.267, + "step": 1870 + }, + { + "epoch": 0.8859425865640722, + "grad_norm": 1.4348737001419067, + "learning_rate": 1.6526795730588477e-05, + "loss": 0.2294, + "step": 1871 + }, + { + "epoch": 0.8864160994377035, + "grad_norm": 1.178130030632019, + "learning_rate": 1.652292025961809e-05, + "loss": 0.2079, + "step": 1872 + }, + { + "epoch": 0.8868896123113347, + "grad_norm": 1.6016205549240112, + "learning_rate": 1.6519043082640834e-05, + "loss": 0.22, + "step": 1873 + }, + { + "epoch": 0.887363125184966, + "grad_norm": 1.1184009313583374, + "learning_rate": 1.6515164200670754e-05, + "loss": 0.2505, + "step": 1874 + }, + { + "epoch": 0.8878366380585972, + "grad_norm": 1.3344361782073975, + "learning_rate": 1.651128361472233e-05, + "loss": 0.2063, + "step": 1875 + }, + { + "epoch": 0.8883101509322284, + "grad_norm": 1.438899278640747, + "learning_rate": 1.6507401325810488e-05, + "loss": 0.2183, + "step": 1876 + }, + { + "epoch": 0.8887836638058597, + "grad_norm": 1.2038847208023071, + "learning_rate": 1.650351733495061e-05, + "loss": 0.2361, + "step": 1877 + }, + { + "epoch": 0.889257176679491, + "grad_norm": 1.5315783023834229, + "learning_rate": 1.6499631643158512e-05, + "loss": 0.2188, + "step": 1878 + }, + { + "epoch": 0.8897306895531222, + "grad_norm": 1.375877857208252, + "learning_rate": 1.6495744251450464e-05, + "loss": 0.238, + "step": 1879 + }, + { + "epoch": 0.8902042024267535, + "grad_norm": 2.0350253582000732, + "learning_rate": 1.6491855160843172e-05, + "loss": 0.2217, + "step": 1880 + }, + { + "epoch": 0.8906777153003848, + "grad_norm": 1.5811187028884888, + "learning_rate": 1.648796437235379e-05, + "loss": 0.233, + "step": 1881 + }, + { + "epoch": 0.8911512281740159, + "grad_norm": 1.5892693996429443, + "learning_rate": 1.6484071886999917e-05, + "loss": 0.2399, + "step": 1882 + }, + { + "epoch": 0.8916247410476472, + "grad_norm": 1.705972671508789, + "learning_rate": 1.6480177705799594e-05, + "loss": 0.2467, + "step": 1883 + }, + { + "epoch": 0.8920982539212785, + "grad_norm": 1.4446234703063965, + "learning_rate": 1.647628182977131e-05, + "loss": 0.2375, + "step": 1884 + }, + { + "epoch": 0.8925717667949097, + "grad_norm": 1.0984984636306763, + "learning_rate": 1.6472384259933986e-05, + "loss": 0.2171, + "step": 1885 + }, + { + "epoch": 0.893045279668541, + "grad_norm": 1.1526954174041748, + "learning_rate": 1.6468484997307003e-05, + "loss": 0.2437, + "step": 1886 + }, + { + "epoch": 0.8935187925421723, + "grad_norm": 1.1996570825576782, + "learning_rate": 1.646458404291017e-05, + "loss": 0.2596, + "step": 1887 + }, + { + "epoch": 0.8939923054158035, + "grad_norm": 2.095825433731079, + "learning_rate": 1.6460681397763746e-05, + "loss": 0.23, + "step": 1888 + }, + { + "epoch": 0.8944658182894347, + "grad_norm": 1.0920315980911255, + "learning_rate": 1.645677706288843e-05, + "loss": 0.2274, + "step": 1889 + }, + { + "epoch": 0.894939331163066, + "grad_norm": 1.171776294708252, + "learning_rate": 1.6452871039305365e-05, + "loss": 0.2355, + "step": 1890 + }, + { + "epoch": 0.8954128440366973, + "grad_norm": 1.1109461784362793, + "learning_rate": 1.6448963328036125e-05, + "loss": 0.2205, + "step": 1891 + }, + { + "epoch": 0.8958863569103285, + "grad_norm": 1.180391550064087, + "learning_rate": 1.6445053930102747e-05, + "loss": 0.2343, + "step": 1892 + }, + { + "epoch": 0.8963598697839598, + "grad_norm": 1.4910218715667725, + "learning_rate": 1.6441142846527688e-05, + "loss": 0.2145, + "step": 1893 + }, + { + "epoch": 0.896833382657591, + "grad_norm": 1.087941288948059, + "learning_rate": 1.6437230078333855e-05, + "loss": 0.2206, + "step": 1894 + }, + { + "epoch": 0.8973068955312222, + "grad_norm": 1.3966666460037231, + "learning_rate": 1.6433315626544598e-05, + "loss": 0.2289, + "step": 1895 + }, + { + "epoch": 0.8977804084048535, + "grad_norm": 1.0643726587295532, + "learning_rate": 1.6429399492183703e-05, + "loss": 0.2301, + "step": 1896 + }, + { + "epoch": 0.8982539212784848, + "grad_norm": 1.5619611740112305, + "learning_rate": 1.6425481676275396e-05, + "loss": 0.2353, + "step": 1897 + }, + { + "epoch": 0.898727434152116, + "grad_norm": 1.496078610420227, + "learning_rate": 1.642156217984434e-05, + "loss": 0.204, + "step": 1898 + }, + { + "epoch": 0.8992009470257473, + "grad_norm": 1.6565096378326416, + "learning_rate": 1.6417641003915653e-05, + "loss": 0.2353, + "step": 1899 + }, + { + "epoch": 0.8996744598993786, + "grad_norm": 1.215949535369873, + "learning_rate": 1.641371814951487e-05, + "loss": 0.2246, + "step": 1900 + }, + { + "epoch": 0.9001479727730097, + "grad_norm": 1.6516441106796265, + "learning_rate": 1.6409793617667976e-05, + "loss": 0.2404, + "step": 1901 + }, + { + "epoch": 0.900621485646641, + "grad_norm": 1.6095350980758667, + "learning_rate": 1.6405867409401403e-05, + "loss": 0.2316, + "step": 1902 + }, + { + "epoch": 0.9010949985202723, + "grad_norm": 1.5806862115859985, + "learning_rate": 1.6401939525742007e-05, + "loss": 0.2754, + "step": 1903 + }, + { + "epoch": 0.9015685113939035, + "grad_norm": 1.4216324090957642, + "learning_rate": 1.6398009967717086e-05, + "loss": 0.2322, + "step": 1904 + }, + { + "epoch": 0.9020420242675348, + "grad_norm": 1.7498546838760376, + "learning_rate": 1.639407873635438e-05, + "loss": 0.2352, + "step": 1905 + }, + { + "epoch": 0.9025155371411661, + "grad_norm": 0.969447135925293, + "learning_rate": 1.639014583268207e-05, + "loss": 0.2338, + "step": 1906 + }, + { + "epoch": 0.9029890500147972, + "grad_norm": 2.0034878253936768, + "learning_rate": 1.638621125772876e-05, + "loss": 0.2381, + "step": 1907 + }, + { + "epoch": 0.9034625628884285, + "grad_norm": 1.6915373802185059, + "learning_rate": 1.6382275012523503e-05, + "loss": 0.25, + "step": 1908 + }, + { + "epoch": 0.9039360757620598, + "grad_norm": 1.4558416604995728, + "learning_rate": 1.637833709809579e-05, + "loss": 0.2475, + "step": 1909 + }, + { + "epoch": 0.904409588635691, + "grad_norm": 1.660110354423523, + "learning_rate": 1.6374397515475543e-05, + "loss": 0.2226, + "step": 1910 + }, + { + "epoch": 0.9048831015093223, + "grad_norm": 1.2827781438827515, + "learning_rate": 1.637045626569312e-05, + "loss": 0.2301, + "step": 1911 + }, + { + "epoch": 0.9053566143829536, + "grad_norm": 2.6431972980499268, + "learning_rate": 1.6366513349779313e-05, + "loss": 0.2198, + "step": 1912 + }, + { + "epoch": 0.9058301272565847, + "grad_norm": 2.985435962677002, + "learning_rate": 1.6362568768765362e-05, + "loss": 0.2571, + "step": 1913 + }, + { + "epoch": 0.906303640130216, + "grad_norm": 2.624217987060547, + "learning_rate": 1.635862252368293e-05, + "loss": 0.2407, + "step": 1914 + }, + { + "epoch": 0.9067771530038473, + "grad_norm": 2.1122894287109375, + "learning_rate": 1.635467461556412e-05, + "loss": 0.229, + "step": 1915 + }, + { + "epoch": 0.9072506658774785, + "grad_norm": 1.5032678842544556, + "learning_rate": 1.6350725045441472e-05, + "loss": 0.2417, + "step": 1916 + }, + { + "epoch": 0.9077241787511098, + "grad_norm": 1.1959266662597656, + "learning_rate": 1.6346773814347952e-05, + "loss": 0.2241, + "step": 1917 + }, + { + "epoch": 0.9081976916247411, + "grad_norm": 2.0419347286224365, + "learning_rate": 1.634282092331697e-05, + "loss": 0.2398, + "step": 1918 + }, + { + "epoch": 0.9086712044983724, + "grad_norm": 2.168184280395508, + "learning_rate": 1.6338866373382366e-05, + "loss": 0.2275, + "step": 1919 + }, + { + "epoch": 0.9091447173720035, + "grad_norm": 2.695232629776001, + "learning_rate": 1.6334910165578413e-05, + "loss": 0.2307, + "step": 1920 + }, + { + "epoch": 0.9096182302456348, + "grad_norm": 2.0876898765563965, + "learning_rate": 1.6330952300939817e-05, + "loss": 0.2517, + "step": 1921 + }, + { + "epoch": 0.9100917431192661, + "grad_norm": 1.803197979927063, + "learning_rate": 1.6326992780501727e-05, + "loss": 0.2283, + "step": 1922 + }, + { + "epoch": 0.9105652559928973, + "grad_norm": 1.1686948537826538, + "learning_rate": 1.632303160529971e-05, + "loss": 0.2201, + "step": 1923 + }, + { + "epoch": 0.9110387688665286, + "grad_norm": 1.152266025543213, + "learning_rate": 1.6319068776369783e-05, + "loss": 0.2007, + "step": 1924 + }, + { + "epoch": 0.9115122817401599, + "grad_norm": 2.0857367515563965, + "learning_rate": 1.631510429474837e-05, + "loss": 0.2101, + "step": 1925 + }, + { + "epoch": 0.911985794613791, + "grad_norm": 1.8467085361480713, + "learning_rate": 1.6311138161472355e-05, + "loss": 0.2554, + "step": 1926 + }, + { + "epoch": 0.9124593074874223, + "grad_norm": 2.0132124423980713, + "learning_rate": 1.6307170377579038e-05, + "loss": 0.2226, + "step": 1927 + }, + { + "epoch": 0.9129328203610536, + "grad_norm": 2.2759344577789307, + "learning_rate": 1.6303200944106155e-05, + "loss": 0.2046, + "step": 1928 + }, + { + "epoch": 0.9134063332346848, + "grad_norm": 1.024839162826538, + "learning_rate": 1.6299229862091876e-05, + "loss": 0.2192, + "step": 1929 + }, + { + "epoch": 0.9138798461083161, + "grad_norm": 1.2846004962921143, + "learning_rate": 1.629525713257479e-05, + "loss": 0.2037, + "step": 1930 + }, + { + "epoch": 0.9143533589819474, + "grad_norm": 1.3948590755462646, + "learning_rate": 1.6291282756593937e-05, + "loss": 0.2416, + "step": 1931 + }, + { + "epoch": 0.9148268718555785, + "grad_norm": 2.4310286045074463, + "learning_rate": 1.628730673518877e-05, + "loss": 0.2268, + "step": 1932 + }, + { + "epoch": 0.9153003847292098, + "grad_norm": 1.4620105028152466, + "learning_rate": 1.6283329069399188e-05, + "loss": 0.2336, + "step": 1933 + }, + { + "epoch": 0.9157738976028411, + "grad_norm": 1.0024254322052002, + "learning_rate": 1.6279349760265497e-05, + "loss": 0.2386, + "step": 1934 + }, + { + "epoch": 0.9162474104764723, + "grad_norm": 1.0868014097213745, + "learning_rate": 1.6275368808828457e-05, + "loss": 0.2486, + "step": 1935 + }, + { + "epoch": 0.9167209233501036, + "grad_norm": 2.2207415103912354, + "learning_rate": 1.6271386216129245e-05, + "loss": 0.2333, + "step": 1936 + }, + { + "epoch": 0.9171944362237349, + "grad_norm": 1.3191596269607544, + "learning_rate": 1.6267401983209464e-05, + "loss": 0.2305, + "step": 1937 + }, + { + "epoch": 0.917667949097366, + "grad_norm": 1.842523217201233, + "learning_rate": 1.626341611111116e-05, + "loss": 0.2442, + "step": 1938 + }, + { + "epoch": 0.9181414619709973, + "grad_norm": 1.3315726518630981, + "learning_rate": 1.62594286008768e-05, + "loss": 0.24, + "step": 1939 + }, + { + "epoch": 0.9186149748446286, + "grad_norm": 2.0374701023101807, + "learning_rate": 1.6255439453549274e-05, + "loss": 0.2319, + "step": 1940 + }, + { + "epoch": 0.9190884877182598, + "grad_norm": 1.2693907022476196, + "learning_rate": 1.62514486701719e-05, + "loss": 0.2409, + "step": 1941 + }, + { + "epoch": 0.9195620005918911, + "grad_norm": 1.0667670965194702, + "learning_rate": 1.6247456251788444e-05, + "loss": 0.2414, + "step": 1942 + }, + { + "epoch": 0.9200355134655224, + "grad_norm": 1.2325665950775146, + "learning_rate": 1.624346219944307e-05, + "loss": 0.2227, + "step": 1943 + }, + { + "epoch": 0.9205090263391535, + "grad_norm": 2.379380941390991, + "learning_rate": 1.6239466514180393e-05, + "loss": 0.2169, + "step": 1944 + }, + { + "epoch": 0.9209825392127848, + "grad_norm": 1.3998842239379883, + "learning_rate": 1.623546919704544e-05, + "loss": 0.2434, + "step": 1945 + }, + { + "epoch": 0.9214560520864161, + "grad_norm": 1.3410049676895142, + "learning_rate": 1.6231470249083675e-05, + "loss": 0.2471, + "step": 1946 + }, + { + "epoch": 0.9219295649600473, + "grad_norm": 1.5513018369674683, + "learning_rate": 1.622746967134098e-05, + "loss": 0.2519, + "step": 1947 + }, + { + "epoch": 0.9224030778336786, + "grad_norm": 1.0884742736816406, + "learning_rate": 1.622346746486367e-05, + "loss": 0.2354, + "step": 1948 + }, + { + "epoch": 0.9228765907073099, + "grad_norm": 1.212095856666565, + "learning_rate": 1.6219463630698484e-05, + "loss": 0.2266, + "step": 1949 + }, + { + "epoch": 0.9233501035809412, + "grad_norm": 1.2227774858474731, + "learning_rate": 1.6215458169892582e-05, + "loss": 0.2474, + "step": 1950 + }, + { + "epoch": 0.9238236164545723, + "grad_norm": 1.3644037246704102, + "learning_rate": 1.6211451083493564e-05, + "loss": 0.2658, + "step": 1951 + }, + { + "epoch": 0.9242971293282036, + "grad_norm": 1.7697309255599976, + "learning_rate": 1.6207442372549436e-05, + "loss": 0.2118, + "step": 1952 + }, + { + "epoch": 0.9247706422018349, + "grad_norm": 1.1578645706176758, + "learning_rate": 1.6203432038108638e-05, + "loss": 0.2359, + "step": 1953 + }, + { + "epoch": 0.9252441550754661, + "grad_norm": 2.0185935497283936, + "learning_rate": 1.6199420081220035e-05, + "loss": 0.2172, + "step": 1954 + }, + { + "epoch": 0.9257176679490974, + "grad_norm": 1.3043760061264038, + "learning_rate": 1.619540650293292e-05, + "loss": 0.2327, + "step": 1955 + }, + { + "epoch": 0.9261911808227287, + "grad_norm": 0.9914813041687012, + "learning_rate": 1.6191391304297e-05, + "loss": 0.2352, + "step": 1956 + }, + { + "epoch": 0.9266646936963598, + "grad_norm": 2.466811180114746, + "learning_rate": 1.6187374486362414e-05, + "loss": 0.2604, + "step": 1957 + }, + { + "epoch": 0.9271382065699911, + "grad_norm": 1.7497360706329346, + "learning_rate": 1.6183356050179724e-05, + "loss": 0.2298, + "step": 1958 + }, + { + "epoch": 0.9276117194436224, + "grad_norm": 1.3502507209777832, + "learning_rate": 1.617933599679991e-05, + "loss": 0.2325, + "step": 1959 + }, + { + "epoch": 0.9280852323172536, + "grad_norm": 1.0206633806228638, + "learning_rate": 1.6175314327274377e-05, + "loss": 0.2066, + "step": 1960 + }, + { + "epoch": 0.9285587451908849, + "grad_norm": 1.507358193397522, + "learning_rate": 1.6171291042654957e-05, + "loss": 0.2335, + "step": 1961 + }, + { + "epoch": 0.9290322580645162, + "grad_norm": 1.3536046743392944, + "learning_rate": 1.6167266143993904e-05, + "loss": 0.2261, + "step": 1962 + }, + { + "epoch": 0.9295057709381473, + "grad_norm": 0.9721176028251648, + "learning_rate": 1.6163239632343883e-05, + "loss": 0.2134, + "step": 1963 + }, + { + "epoch": 0.9299792838117786, + "grad_norm": 1.2623369693756104, + "learning_rate": 1.6159211508757996e-05, + "loss": 0.2231, + "step": 1964 + }, + { + "epoch": 0.9304527966854099, + "grad_norm": 1.866391658782959, + "learning_rate": 1.615518177428976e-05, + "loss": 0.2441, + "step": 1965 + }, + { + "epoch": 0.9309263095590411, + "grad_norm": 1.2802680730819702, + "learning_rate": 1.6151150429993106e-05, + "loss": 0.2501, + "step": 1966 + }, + { + "epoch": 0.9313998224326724, + "grad_norm": 1.4352434873580933, + "learning_rate": 1.61471174769224e-05, + "loss": 0.2254, + "step": 1967 + }, + { + "epoch": 0.9318733353063037, + "grad_norm": 1.0232847929000854, + "learning_rate": 1.614308291613242e-05, + "loss": 0.2456, + "step": 1968 + }, + { + "epoch": 0.9323468481799349, + "grad_norm": 1.138548493385315, + "learning_rate": 1.6139046748678366e-05, + "loss": 0.2217, + "step": 1969 + }, + { + "epoch": 0.9328203610535661, + "grad_norm": 1.6218969821929932, + "learning_rate": 1.613500897561586e-05, + "loss": 0.2379, + "step": 1970 + }, + { + "epoch": 0.9332938739271974, + "grad_norm": 1.2417789697647095, + "learning_rate": 1.6130969598000945e-05, + "loss": 0.2304, + "step": 1971 + }, + { + "epoch": 0.9337673868008286, + "grad_norm": 1.4800796508789062, + "learning_rate": 1.6126928616890077e-05, + "loss": 0.2745, + "step": 1972 + }, + { + "epoch": 0.9342408996744599, + "grad_norm": 2.1304409503936768, + "learning_rate": 1.6122886033340134e-05, + "loss": 0.2395, + "step": 1973 + }, + { + "epoch": 0.9347144125480912, + "grad_norm": 1.1594027280807495, + "learning_rate": 1.6118841848408418e-05, + "loss": 0.2544, + "step": 1974 + }, + { + "epoch": 0.9351879254217224, + "grad_norm": 1.1337738037109375, + "learning_rate": 1.6114796063152648e-05, + "loss": 0.2067, + "step": 1975 + }, + { + "epoch": 0.9356614382953536, + "grad_norm": 1.8558512926101685, + "learning_rate": 1.611074867863096e-05, + "loss": 0.2369, + "step": 1976 + }, + { + "epoch": 0.9361349511689849, + "grad_norm": 1.4643030166625977, + "learning_rate": 1.61066996959019e-05, + "loss": 0.207, + "step": 1977 + }, + { + "epoch": 0.9366084640426161, + "grad_norm": 1.9388593435287476, + "learning_rate": 1.610264911602445e-05, + "loss": 0.2065, + "step": 1978 + }, + { + "epoch": 0.9370819769162474, + "grad_norm": 1.1505235433578491, + "learning_rate": 1.6098596940058e-05, + "loss": 0.2367, + "step": 1979 + }, + { + "epoch": 0.9375554897898787, + "grad_norm": 1.003483533859253, + "learning_rate": 1.6094543169062353e-05, + "loss": 0.2095, + "step": 1980 + }, + { + "epoch": 0.9380290026635099, + "grad_norm": 1.2500495910644531, + "learning_rate": 1.6090487804097734e-05, + "loss": 0.2377, + "step": 1981 + }, + { + "epoch": 0.9385025155371411, + "grad_norm": 1.7271959781646729, + "learning_rate": 1.6086430846224787e-05, + "loss": 0.2288, + "step": 1982 + }, + { + "epoch": 0.9389760284107724, + "grad_norm": 1.2142095565795898, + "learning_rate": 1.6082372296504568e-05, + "loss": 0.2158, + "step": 1983 + }, + { + "epoch": 0.9394495412844037, + "grad_norm": 0.9605250954627991, + "learning_rate": 1.6078312155998554e-05, + "loss": 0.205, + "step": 1984 + }, + { + "epoch": 0.9399230541580349, + "grad_norm": 1.0248587131500244, + "learning_rate": 1.6074250425768632e-05, + "loss": 0.2326, + "step": 1985 + }, + { + "epoch": 0.9403965670316662, + "grad_norm": 1.8944402933120728, + "learning_rate": 1.607018710687711e-05, + "loss": 0.227, + "step": 1986 + }, + { + "epoch": 0.9408700799052975, + "grad_norm": 0.8332754373550415, + "learning_rate": 1.6066122200386713e-05, + "loss": 0.2075, + "step": 1987 + }, + { + "epoch": 0.9413435927789287, + "grad_norm": 1.4765225648880005, + "learning_rate": 1.6062055707360575e-05, + "loss": 0.2085, + "step": 1988 + }, + { + "epoch": 0.9418171056525599, + "grad_norm": 1.3047866821289062, + "learning_rate": 1.6057987628862246e-05, + "loss": 0.2134, + "step": 1989 + }, + { + "epoch": 0.9422906185261912, + "grad_norm": 1.0208147764205933, + "learning_rate": 1.6053917965955698e-05, + "loss": 0.2091, + "step": 1990 + }, + { + "epoch": 0.9427641313998224, + "grad_norm": 1.4134602546691895, + "learning_rate": 1.6049846719705307e-05, + "loss": 0.2519, + "step": 1991 + }, + { + "epoch": 0.9432376442734537, + "grad_norm": 1.1737221479415894, + "learning_rate": 1.604577389117587e-05, + "loss": 0.2391, + "step": 1992 + }, + { + "epoch": 0.943711157147085, + "grad_norm": 1.1827245950698853, + "learning_rate": 1.60416994814326e-05, + "loss": 0.2178, + "step": 1993 + }, + { + "epoch": 0.9441846700207162, + "grad_norm": 1.5996443033218384, + "learning_rate": 1.6037623491541114e-05, + "loss": 0.2106, + "step": 1994 + }, + { + "epoch": 0.9446581828943474, + "grad_norm": 1.233630895614624, + "learning_rate": 1.6033545922567447e-05, + "loss": 0.2299, + "step": 1995 + }, + { + "epoch": 0.9451316957679787, + "grad_norm": 0.9926683306694031, + "learning_rate": 1.6029466775578054e-05, + "loss": 0.2464, + "step": 1996 + }, + { + "epoch": 0.9456052086416099, + "grad_norm": 1.6486265659332275, + "learning_rate": 1.602538605163979e-05, + "loss": 0.23, + "step": 1997 + }, + { + "epoch": 0.9460787215152412, + "grad_norm": 1.1196324825286865, + "learning_rate": 1.602130375181994e-05, + "loss": 0.2234, + "step": 1998 + }, + { + "epoch": 0.9465522343888725, + "grad_norm": 1.0825940370559692, + "learning_rate": 1.6017219877186173e-05, + "loss": 0.207, + "step": 1999 + }, + { + "epoch": 0.9470257472625037, + "grad_norm": 1.6445330381393433, + "learning_rate": 1.60131344288066e-05, + "loss": 0.2202, + "step": 2000 + }, + { + "epoch": 0.947499260136135, + "grad_norm": 1.1196541786193848, + "learning_rate": 1.600904740774973e-05, + "loss": 0.2332, + "step": 2001 + }, + { + "epoch": 0.9479727730097662, + "grad_norm": 1.173668622970581, + "learning_rate": 1.6004958815084476e-05, + "loss": 0.25, + "step": 2002 + }, + { + "epoch": 0.9484462858833974, + "grad_norm": 1.4854817390441895, + "learning_rate": 1.6000868651880175e-05, + "loss": 0.2444, + "step": 2003 + }, + { + "epoch": 0.9489197987570287, + "grad_norm": 1.8547112941741943, + "learning_rate": 1.599677691920657e-05, + "loss": 0.2711, + "step": 2004 + }, + { + "epoch": 0.94939331163066, + "grad_norm": 1.0332943201065063, + "learning_rate": 1.5992683618133817e-05, + "loss": 0.231, + "step": 2005 + }, + { + "epoch": 0.9498668245042912, + "grad_norm": 1.8548601865768433, + "learning_rate": 1.5988588749732472e-05, + "loss": 0.2198, + "step": 2006 + }, + { + "epoch": 0.9503403373779225, + "grad_norm": 1.8581714630126953, + "learning_rate": 1.5984492315073512e-05, + "loss": 0.2327, + "step": 2007 + }, + { + "epoch": 0.9508138502515537, + "grad_norm": 1.062828540802002, + "learning_rate": 1.5980394315228323e-05, + "loss": 0.224, + "step": 2008 + }, + { + "epoch": 0.9512873631251849, + "grad_norm": 1.4137210845947266, + "learning_rate": 1.5976294751268695e-05, + "loss": 0.2407, + "step": 2009 + }, + { + "epoch": 0.9517608759988162, + "grad_norm": 0.9517819285392761, + "learning_rate": 1.597219362426683e-05, + "loss": 0.2383, + "step": 2010 + }, + { + "epoch": 0.9522343888724475, + "grad_norm": 1.2080825567245483, + "learning_rate": 1.5968090935295335e-05, + "loss": 0.2174, + "step": 2011 + }, + { + "epoch": 0.9527079017460787, + "grad_norm": 1.1361576318740845, + "learning_rate": 1.596398668542723e-05, + "loss": 0.2225, + "step": 2012 + }, + { + "epoch": 0.95318141461971, + "grad_norm": 1.490893840789795, + "learning_rate": 1.5959880875735944e-05, + "loss": 0.2416, + "step": 2013 + }, + { + "epoch": 0.9536549274933412, + "grad_norm": 1.1593446731567383, + "learning_rate": 1.5955773507295313e-05, + "loss": 0.2363, + "step": 2014 + }, + { + "epoch": 0.9541284403669725, + "grad_norm": 1.0832358598709106, + "learning_rate": 1.5951664581179578e-05, + "loss": 0.2241, + "step": 2015 + }, + { + "epoch": 0.9546019532406037, + "grad_norm": 1.426303505897522, + "learning_rate": 1.5947554098463386e-05, + "loss": 0.2233, + "step": 2016 + }, + { + "epoch": 0.955075466114235, + "grad_norm": 1.7878800630569458, + "learning_rate": 1.5943442060221795e-05, + "loss": 0.2577, + "step": 2017 + }, + { + "epoch": 0.9555489789878663, + "grad_norm": 1.071764349937439, + "learning_rate": 1.5939328467530276e-05, + "loss": 0.2123, + "step": 2018 + }, + { + "epoch": 0.9560224918614975, + "grad_norm": 1.2568780183792114, + "learning_rate": 1.593521332146469e-05, + "loss": 0.2506, + "step": 2019 + }, + { + "epoch": 0.9564960047351287, + "grad_norm": 1.9318242073059082, + "learning_rate": 1.593109662310132e-05, + "loss": 0.2185, + "step": 2020 + }, + { + "epoch": 0.95696951760876, + "grad_norm": 1.3242648839950562, + "learning_rate": 1.5926978373516842e-05, + "loss": 0.2374, + "step": 2021 + }, + { + "epoch": 0.9574430304823912, + "grad_norm": 1.1390552520751953, + "learning_rate": 1.5922858573788356e-05, + "loss": 0.2206, + "step": 2022 + }, + { + "epoch": 0.9579165433560225, + "grad_norm": 1.5206451416015625, + "learning_rate": 1.5918737224993345e-05, + "loss": 0.2218, + "step": 2023 + }, + { + "epoch": 0.9583900562296538, + "grad_norm": 1.1097302436828613, + "learning_rate": 1.591461432820971e-05, + "loss": 0.1886, + "step": 2024 + }, + { + "epoch": 0.958863569103285, + "grad_norm": 1.2331650257110596, + "learning_rate": 1.591048988451576e-05, + "loss": 0.2343, + "step": 2025 + }, + { + "epoch": 0.9593370819769163, + "grad_norm": 2.753371000289917, + "learning_rate": 1.5906363894990197e-05, + "loss": 0.2412, + "step": 2026 + }, + { + "epoch": 0.9598105948505475, + "grad_norm": 0.9969640374183655, + "learning_rate": 1.590223636071214e-05, + "loss": 0.2389, + "step": 2027 + }, + { + "epoch": 0.9602841077241787, + "grad_norm": 2.2440433502197266, + "learning_rate": 1.58981072827611e-05, + "loss": 0.1999, + "step": 2028 + }, + { + "epoch": 0.96075762059781, + "grad_norm": 1.106385350227356, + "learning_rate": 1.5893976662217e-05, + "loss": 0.2168, + "step": 2029 + }, + { + "epoch": 0.9612311334714413, + "grad_norm": 2.1727712154388428, + "learning_rate": 1.588984450016017e-05, + "loss": 0.2357, + "step": 2030 + }, + { + "epoch": 0.9617046463450725, + "grad_norm": 1.639740228652954, + "learning_rate": 1.5885710797671326e-05, + "loss": 0.2533, + "step": 2031 + }, + { + "epoch": 0.9621781592187038, + "grad_norm": 0.9460485577583313, + "learning_rate": 1.5881575555831604e-05, + "loss": 0.2454, + "step": 2032 + }, + { + "epoch": 0.962651672092335, + "grad_norm": 1.232308268547058, + "learning_rate": 1.5877438775722536e-05, + "loss": 0.2304, + "step": 2033 + }, + { + "epoch": 0.9631251849659662, + "grad_norm": 1.2091732025146484, + "learning_rate": 1.587330045842606e-05, + "loss": 0.2175, + "step": 2034 + }, + { + "epoch": 0.9635986978395975, + "grad_norm": 1.2049965858459473, + "learning_rate": 1.586916060502451e-05, + "loss": 0.2437, + "step": 2035 + }, + { + "epoch": 0.9640722107132288, + "grad_norm": 1.1409215927124023, + "learning_rate": 1.586501921660062e-05, + "loss": 0.2253, + "step": 2036 + }, + { + "epoch": 0.96454572358686, + "grad_norm": 1.720682978630066, + "learning_rate": 1.5860876294237535e-05, + "loss": 0.2218, + "step": 2037 + }, + { + "epoch": 0.9650192364604913, + "grad_norm": 1.6841931343078613, + "learning_rate": 1.58567318390188e-05, + "loss": 0.2172, + "step": 2038 + }, + { + "epoch": 0.9654927493341225, + "grad_norm": 0.894889235496521, + "learning_rate": 1.5852585852028348e-05, + "loss": 0.232, + "step": 2039 + }, + { + "epoch": 0.9659662622077537, + "grad_norm": 1.6859184503555298, + "learning_rate": 1.584843833435053e-05, + "loss": 0.2161, + "step": 2040 + }, + { + "epoch": 0.966439775081385, + "grad_norm": 1.403887391090393, + "learning_rate": 1.5844289287070088e-05, + "loss": 0.2235, + "step": 2041 + }, + { + "epoch": 0.9669132879550163, + "grad_norm": 1.843876600265503, + "learning_rate": 1.5840138711272165e-05, + "loss": 0.201, + "step": 2042 + }, + { + "epoch": 0.9673868008286475, + "grad_norm": 0.9713191986083984, + "learning_rate": 1.58359866080423e-05, + "loss": 0.2385, + "step": 2043 + }, + { + "epoch": 0.9678603137022788, + "grad_norm": 1.014763593673706, + "learning_rate": 1.583183297846644e-05, + "loss": 0.231, + "step": 2044 + }, + { + "epoch": 0.96833382657591, + "grad_norm": 1.2346670627593994, + "learning_rate": 1.5827677823630922e-05, + "loss": 0.2111, + "step": 2045 + }, + { + "epoch": 0.9688073394495413, + "grad_norm": 1.3078184127807617, + "learning_rate": 1.5823521144622493e-05, + "loss": 0.2232, + "step": 2046 + }, + { + "epoch": 0.9692808523231725, + "grad_norm": 1.7039018869400024, + "learning_rate": 1.5819362942528288e-05, + "loss": 0.2271, + "step": 2047 + }, + { + "epoch": 0.9697543651968038, + "grad_norm": 2.7722675800323486, + "learning_rate": 1.5815203218435847e-05, + "loss": 0.1962, + "step": 2048 + }, + { + "epoch": 0.9702278780704351, + "grad_norm": 1.9730979204177856, + "learning_rate": 1.5811041973433103e-05, + "loss": 0.2382, + "step": 2049 + }, + { + "epoch": 0.9707013909440663, + "grad_norm": 0.8822180032730103, + "learning_rate": 1.580687920860839e-05, + "loss": 0.2026, + "step": 2050 + }, + { + "epoch": 0.9711749038176976, + "grad_norm": 1.793717861175537, + "learning_rate": 1.5802714925050444e-05, + "loss": 0.2262, + "step": 2051 + }, + { + "epoch": 0.9716484166913288, + "grad_norm": 1.0498217344284058, + "learning_rate": 1.5798549123848386e-05, + "loss": 0.2067, + "step": 2052 + }, + { + "epoch": 0.97212192956496, + "grad_norm": 1.1422361135482788, + "learning_rate": 1.5794381806091742e-05, + "loss": 0.2108, + "step": 2053 + }, + { + "epoch": 0.9725954424385913, + "grad_norm": 1.9571365118026733, + "learning_rate": 1.579021297287044e-05, + "loss": 0.2208, + "step": 2054 + }, + { + "epoch": 0.9730689553122226, + "grad_norm": 1.8578828573226929, + "learning_rate": 1.5786042625274795e-05, + "loss": 0.242, + "step": 2055 + }, + { + "epoch": 0.9735424681858538, + "grad_norm": 1.3278276920318604, + "learning_rate": 1.5781870764395515e-05, + "loss": 0.2039, + "step": 2056 + }, + { + "epoch": 0.9740159810594851, + "grad_norm": 1.0172450542449951, + "learning_rate": 1.5777697391323717e-05, + "loss": 0.2138, + "step": 2057 + }, + { + "epoch": 0.9744894939331163, + "grad_norm": 1.599645733833313, + "learning_rate": 1.577352250715091e-05, + "loss": 0.2476, + "step": 2058 + }, + { + "epoch": 0.9749630068067475, + "grad_norm": 1.6316616535186768, + "learning_rate": 1.5769346112968985e-05, + "loss": 0.2329, + "step": 2059 + }, + { + "epoch": 0.9754365196803788, + "grad_norm": 1.9370191097259521, + "learning_rate": 1.5765168209870243e-05, + "loss": 0.2382, + "step": 2060 + }, + { + "epoch": 0.9759100325540101, + "grad_norm": 1.4778871536254883, + "learning_rate": 1.5760988798947372e-05, + "loss": 0.2407, + "step": 2061 + }, + { + "epoch": 0.9763835454276413, + "grad_norm": 1.1680418252944946, + "learning_rate": 1.575680788129346e-05, + "loss": 0.2301, + "step": 2062 + }, + { + "epoch": 0.9768570583012726, + "grad_norm": 1.1255524158477783, + "learning_rate": 1.575262545800198e-05, + "loss": 0.2359, + "step": 2063 + }, + { + "epoch": 0.9773305711749039, + "grad_norm": 1.652750849723816, + "learning_rate": 1.5748441530166814e-05, + "loss": 0.2416, + "step": 2064 + }, + { + "epoch": 0.977804084048535, + "grad_norm": 1.3256607055664062, + "learning_rate": 1.5744256098882217e-05, + "loss": 0.2516, + "step": 2065 + }, + { + "epoch": 0.9782775969221663, + "grad_norm": 1.3550041913986206, + "learning_rate": 1.5740069165242854e-05, + "loss": 0.2473, + "step": 2066 + }, + { + "epoch": 0.9787511097957976, + "grad_norm": 1.5172563791275024, + "learning_rate": 1.5735880730343776e-05, + "loss": 0.2229, + "step": 2067 + }, + { + "epoch": 0.9792246226694288, + "grad_norm": 1.0142927169799805, + "learning_rate": 1.573169079528043e-05, + "loss": 0.2393, + "step": 2068 + }, + { + "epoch": 0.9796981355430601, + "grad_norm": 0.9681066274642944, + "learning_rate": 1.5727499361148647e-05, + "loss": 0.2281, + "step": 2069 + }, + { + "epoch": 0.9801716484166914, + "grad_norm": 1.673020362854004, + "learning_rate": 1.5723306429044663e-05, + "loss": 0.2345, + "step": 2070 + }, + { + "epoch": 0.9806451612903225, + "grad_norm": 1.8111623525619507, + "learning_rate": 1.57191120000651e-05, + "loss": 0.207, + "step": 2071 + }, + { + "epoch": 0.9811186741639538, + "grad_norm": 2.4038047790527344, + "learning_rate": 1.571491607530696e-05, + "loss": 0.222, + "step": 2072 + }, + { + "epoch": 0.9815921870375851, + "grad_norm": 2.7947847843170166, + "learning_rate": 1.5710718655867658e-05, + "loss": 0.2338, + "step": 2073 + }, + { + "epoch": 0.9820656999112163, + "grad_norm": 2.207833766937256, + "learning_rate": 1.5706519742844982e-05, + "loss": 0.2417, + "step": 2074 + }, + { + "epoch": 0.9825392127848476, + "grad_norm": 1.7318761348724365, + "learning_rate": 1.5702319337337118e-05, + "loss": 0.2323, + "step": 2075 + }, + { + "epoch": 0.9830127256584789, + "grad_norm": 1.349669098854065, + "learning_rate": 1.5698117440442643e-05, + "loss": 0.2377, + "step": 2076 + }, + { + "epoch": 0.9834862385321101, + "grad_norm": 1.8161946535110474, + "learning_rate": 1.5693914053260524e-05, + "loss": 0.2341, + "step": 2077 + }, + { + "epoch": 0.9839597514057413, + "grad_norm": 2.296698570251465, + "learning_rate": 1.5689709176890113e-05, + "loss": 0.226, + "step": 2078 + }, + { + "epoch": 0.9844332642793726, + "grad_norm": 1.6288074254989624, + "learning_rate": 1.5685502812431156e-05, + "loss": 0.2476, + "step": 2079 + }, + { + "epoch": 0.9849067771530039, + "grad_norm": 1.528737187385559, + "learning_rate": 1.5681294960983787e-05, + "loss": 0.2455, + "step": 2080 + }, + { + "epoch": 0.9853802900266351, + "grad_norm": 1.2330217361450195, + "learning_rate": 1.567708562364853e-05, + "loss": 0.2377, + "step": 2081 + }, + { + "epoch": 0.9858538029002664, + "grad_norm": 1.067199945449829, + "learning_rate": 1.56728748015263e-05, + "loss": 0.2244, + "step": 2082 + }, + { + "epoch": 0.9863273157738977, + "grad_norm": 1.0939373970031738, + "learning_rate": 1.566866249571839e-05, + "loss": 0.2275, + "step": 2083 + }, + { + "epoch": 0.9868008286475288, + "grad_norm": 1.1805354356765747, + "learning_rate": 1.566444870732649e-05, + "loss": 0.2211, + "step": 2084 + }, + { + "epoch": 0.9872743415211601, + "grad_norm": 2.558940887451172, + "learning_rate": 1.5660233437452676e-05, + "loss": 0.2368, + "step": 2085 + }, + { + "epoch": 0.9877478543947914, + "grad_norm": 1.4615000486373901, + "learning_rate": 1.565601668719941e-05, + "loss": 0.2194, + "step": 2086 + }, + { + "epoch": 0.9882213672684226, + "grad_norm": 1.341464638710022, + "learning_rate": 1.565179845766955e-05, + "loss": 0.2447, + "step": 2087 + }, + { + "epoch": 0.9886948801420539, + "grad_norm": 1.341123104095459, + "learning_rate": 1.564757874996632e-05, + "loss": 0.2241, + "step": 2088 + }, + { + "epoch": 0.9891683930156852, + "grad_norm": 0.9674537181854248, + "learning_rate": 1.5643357565193355e-05, + "loss": 0.2312, + "step": 2089 + }, + { + "epoch": 0.9896419058893163, + "grad_norm": 1.1203300952911377, + "learning_rate": 1.5639134904454663e-05, + "loss": 0.2513, + "step": 2090 + }, + { + "epoch": 0.9901154187629476, + "grad_norm": 1.7176257371902466, + "learning_rate": 1.5634910768854634e-05, + "loss": 0.2076, + "step": 2091 + }, + { + "epoch": 0.9905889316365789, + "grad_norm": 2.126793146133423, + "learning_rate": 1.5630685159498057e-05, + "loss": 0.2445, + "step": 2092 + }, + { + "epoch": 0.9910624445102101, + "grad_norm": 1.8809027671813965, + "learning_rate": 1.56264580774901e-05, + "loss": 0.2538, + "step": 2093 + }, + { + "epoch": 0.9915359573838414, + "grad_norm": 1.7825748920440674, + "learning_rate": 1.562222952393631e-05, + "loss": 0.2337, + "step": 2094 + }, + { + "epoch": 0.9920094702574727, + "grad_norm": 1.1806340217590332, + "learning_rate": 1.5617999499942623e-05, + "loss": 0.2319, + "step": 2095 + }, + { + "epoch": 0.9924829831311038, + "grad_norm": 1.4441994428634644, + "learning_rate": 1.5613768006615367e-05, + "loss": 0.2383, + "step": 2096 + }, + { + "epoch": 0.9929564960047351, + "grad_norm": 0.8531612157821655, + "learning_rate": 1.5609535045061247e-05, + "loss": 0.2205, + "step": 2097 + }, + { + "epoch": 0.9934300088783664, + "grad_norm": 1.457108497619629, + "learning_rate": 1.5605300616387347e-05, + "loss": 0.2266, + "step": 2098 + }, + { + "epoch": 0.9939035217519976, + "grad_norm": 1.050585389137268, + "learning_rate": 1.5601064721701155e-05, + "loss": 0.2373, + "step": 2099 + }, + { + "epoch": 0.9943770346256289, + "grad_norm": 0.9880205392837524, + "learning_rate": 1.5596827362110512e-05, + "loss": 0.2344, + "step": 2100 + }, + { + "epoch": 0.9948505474992602, + "grad_norm": 1.1303571462631226, + "learning_rate": 1.559258853872367e-05, + "loss": 0.2044, + "step": 2101 + }, + { + "epoch": 0.9953240603728913, + "grad_norm": 1.6071451902389526, + "learning_rate": 1.5588348252649246e-05, + "loss": 0.2096, + "step": 2102 + }, + { + "epoch": 0.9957975732465226, + "grad_norm": 1.32492995262146, + "learning_rate": 1.5584106504996247e-05, + "loss": 0.2312, + "step": 2103 + }, + { + "epoch": 0.9962710861201539, + "grad_norm": 1.9682942628860474, + "learning_rate": 1.5579863296874066e-05, + "loss": 0.2378, + "step": 2104 + }, + { + "epoch": 0.9967445989937851, + "grad_norm": 1.4948111772537231, + "learning_rate": 1.5575618629392466e-05, + "loss": 0.2195, + "step": 2105 + }, + { + "epoch": 0.9972181118674164, + "grad_norm": 1.1894011497497559, + "learning_rate": 1.5571372503661604e-05, + "loss": 0.2456, + "step": 2106 + }, + { + "epoch": 0.9976916247410477, + "grad_norm": 1.0907820463180542, + "learning_rate": 1.556712492079201e-05, + "loss": 0.2328, + "step": 2107 + }, + { + "epoch": 0.998165137614679, + "grad_norm": 1.6044102907180786, + "learning_rate": 1.5562875881894605e-05, + "loss": 0.2159, + "step": 2108 + }, + { + "epoch": 0.9986386504883101, + "grad_norm": 2.430710554122925, + "learning_rate": 1.5558625388080676e-05, + "loss": 0.2135, + "step": 2109 + }, + { + "epoch": 0.9991121633619414, + "grad_norm": 1.4718191623687744, + "learning_rate": 1.5554373440461904e-05, + "loss": 0.2448, + "step": 2110 + }, + { + "epoch": 0.9995856762355727, + "grad_norm": 1.1206008195877075, + "learning_rate": 1.5550120040150338e-05, + "loss": 0.2055, + "step": 2111 + }, + { + "epoch": 1.0000591891092039, + "grad_norm": 1.4358619451522827, + "learning_rate": 1.5545865188258423e-05, + "loss": 0.2423, + "step": 2112 + }, + { + "epoch": 1.0005327019828352, + "grad_norm": 1.060018539428711, + "learning_rate": 1.5541608885898968e-05, + "loss": 0.2124, + "step": 2113 + }, + { + "epoch": 1.0010062148564665, + "grad_norm": 1.6677261590957642, + "learning_rate": 1.553735113418517e-05, + "loss": 0.2323, + "step": 2114 + }, + { + "epoch": 1.0014797277300977, + "grad_norm": 1.8145389556884766, + "learning_rate": 1.55330919342306e-05, + "loss": 0.2383, + "step": 2115 + }, + { + "epoch": 1.0019532406037288, + "grad_norm": 1.2736999988555908, + "learning_rate": 1.552883128714922e-05, + "loss": 0.2094, + "step": 2116 + }, + { + "epoch": 1.00242675347736, + "grad_norm": 1.4934360980987549, + "learning_rate": 1.552456919405535e-05, + "loss": 0.2103, + "step": 2117 + }, + { + "epoch": 1.0029002663509914, + "grad_norm": 1.330871343612671, + "learning_rate": 1.5520305656063702e-05, + "loss": 0.2176, + "step": 2118 + }, + { + "epoch": 1.0033737792246227, + "grad_norm": 1.226676344871521, + "learning_rate": 1.5516040674289364e-05, + "loss": 0.2181, + "step": 2119 + }, + { + "epoch": 1.003847292098254, + "grad_norm": 0.9475164413452148, + "learning_rate": 1.5511774249847806e-05, + "loss": 0.2092, + "step": 2120 + }, + { + "epoch": 1.0043208049718853, + "grad_norm": 2.017439126968384, + "learning_rate": 1.5507506383854867e-05, + "loss": 0.2404, + "step": 2121 + }, + { + "epoch": 1.0047943178455163, + "grad_norm": 2.2948458194732666, + "learning_rate": 1.5503237077426762e-05, + "loss": 0.2225, + "step": 2122 + }, + { + "epoch": 1.0052678307191476, + "grad_norm": 1.144311547279358, + "learning_rate": 1.5498966331680093e-05, + "loss": 0.2437, + "step": 2123 + }, + { + "epoch": 1.005741343592779, + "grad_norm": 1.512420892715454, + "learning_rate": 1.5494694147731822e-05, + "loss": 0.2337, + "step": 2124 + }, + { + "epoch": 1.0062148564664102, + "grad_norm": 1.3547443151474, + "learning_rate": 1.549042052669931e-05, + "loss": 0.2101, + "step": 2125 + }, + { + "epoch": 1.0066883693400415, + "grad_norm": 1.429449200630188, + "learning_rate": 1.5486145469700278e-05, + "loss": 0.2054, + "step": 2126 + }, + { + "epoch": 1.0071618822136728, + "grad_norm": 2.169478178024292, + "learning_rate": 1.5481868977852823e-05, + "loss": 0.2465, + "step": 2127 + }, + { + "epoch": 1.007635395087304, + "grad_norm": 1.4005268812179565, + "learning_rate": 1.547759105227542e-05, + "loss": 0.2212, + "step": 2128 + }, + { + "epoch": 1.008108907960935, + "grad_norm": 1.521437644958496, + "learning_rate": 1.547331169408692e-05, + "loss": 0.2458, + "step": 2129 + }, + { + "epoch": 1.0085824208345664, + "grad_norm": 1.0772839784622192, + "learning_rate": 1.5469030904406554e-05, + "loss": 0.2265, + "step": 2130 + }, + { + "epoch": 1.0090559337081977, + "grad_norm": 1.4369512796401978, + "learning_rate": 1.546474868435391e-05, + "loss": 0.2037, + "step": 2131 + }, + { + "epoch": 1.009529446581829, + "grad_norm": 1.2946951389312744, + "learning_rate": 1.546046503504897e-05, + "loss": 0.2261, + "step": 2132 + }, + { + "epoch": 1.0100029594554603, + "grad_norm": 1.3359379768371582, + "learning_rate": 1.5456179957612074e-05, + "loss": 0.2487, + "step": 2133 + }, + { + "epoch": 1.0104764723290915, + "grad_norm": 2.1459126472473145, + "learning_rate": 1.545189345316395e-05, + "loss": 0.2004, + "step": 2134 + }, + { + "epoch": 1.0109499852027226, + "grad_norm": 2.0224199295043945, + "learning_rate": 1.5447605522825687e-05, + "loss": 0.2237, + "step": 2135 + }, + { + "epoch": 1.011423498076354, + "grad_norm": 1.4307156801223755, + "learning_rate": 1.5443316167718756e-05, + "loss": 0.2076, + "step": 2136 + }, + { + "epoch": 1.0118970109499852, + "grad_norm": 1.0832078456878662, + "learning_rate": 1.543902538896499e-05, + "loss": 0.2175, + "step": 2137 + }, + { + "epoch": 1.0123705238236165, + "grad_norm": 1.5245671272277832, + "learning_rate": 1.543473318768661e-05, + "loss": 0.245, + "step": 2138 + }, + { + "epoch": 1.0128440366972478, + "grad_norm": 1.020795226097107, + "learning_rate": 1.5430439565006193e-05, + "loss": 0.2282, + "step": 2139 + }, + { + "epoch": 1.013317549570879, + "grad_norm": 2.5934853553771973, + "learning_rate": 1.5426144522046692e-05, + "loss": 0.2264, + "step": 2140 + }, + { + "epoch": 1.0137910624445101, + "grad_norm": 1.0803589820861816, + "learning_rate": 1.5421848059931443e-05, + "loss": 0.2158, + "step": 2141 + }, + { + "epoch": 1.0142645753181414, + "grad_norm": 1.115559697151184, + "learning_rate": 1.541755017978414e-05, + "loss": 0.2259, + "step": 2142 + }, + { + "epoch": 1.0147380881917727, + "grad_norm": 1.1411402225494385, + "learning_rate": 1.5413250882728847e-05, + "loss": 0.2196, + "step": 2143 + }, + { + "epoch": 1.015211601065404, + "grad_norm": 1.0891308784484863, + "learning_rate": 1.540895016989001e-05, + "loss": 0.2257, + "step": 2144 + }, + { + "epoch": 1.0156851139390353, + "grad_norm": 1.2236292362213135, + "learning_rate": 1.5404648042392437e-05, + "loss": 0.2483, + "step": 2145 + }, + { + "epoch": 1.0161586268126666, + "grad_norm": 1.3101778030395508, + "learning_rate": 1.5400344501361305e-05, + "loss": 0.2389, + "step": 2146 + }, + { + "epoch": 1.0166321396862976, + "grad_norm": 1.1559494733810425, + "learning_rate": 1.5396039547922174e-05, + "loss": 0.2229, + "step": 2147 + }, + { + "epoch": 1.017105652559929, + "grad_norm": 1.1943864822387695, + "learning_rate": 1.5391733183200952e-05, + "loss": 0.2181, + "step": 2148 + }, + { + "epoch": 1.0175791654335602, + "grad_norm": 1.2304420471191406, + "learning_rate": 1.5387425408323934e-05, + "loss": 0.2105, + "step": 2149 + }, + { + "epoch": 1.0180526783071915, + "grad_norm": 1.0440901517868042, + "learning_rate": 1.5383116224417767e-05, + "loss": 0.2332, + "step": 2150 + }, + { + "epoch": 1.0185261911808228, + "grad_norm": 1.2044522762298584, + "learning_rate": 1.5378805632609487e-05, + "loss": 0.2375, + "step": 2151 + }, + { + "epoch": 1.018999704054454, + "grad_norm": 1.4749743938446045, + "learning_rate": 1.5374493634026486e-05, + "loss": 0.1913, + "step": 2152 + }, + { + "epoch": 1.0194732169280851, + "grad_norm": 1.4272947311401367, + "learning_rate": 1.537018022979652e-05, + "loss": 0.22, + "step": 2153 + }, + { + "epoch": 1.0199467298017164, + "grad_norm": 1.7914042472839355, + "learning_rate": 1.5365865421047724e-05, + "loss": 0.2344, + "step": 2154 + }, + { + "epoch": 1.0204202426753477, + "grad_norm": 1.1792020797729492, + "learning_rate": 1.5361549208908594e-05, + "loss": 0.2023, + "step": 2155 + }, + { + "epoch": 1.020893755548979, + "grad_norm": 1.5299314260482788, + "learning_rate": 1.5357231594507988e-05, + "loss": 0.2242, + "step": 2156 + }, + { + "epoch": 1.0213672684226103, + "grad_norm": 1.1294841766357422, + "learning_rate": 1.5352912578975144e-05, + "loss": 0.2355, + "step": 2157 + }, + { + "epoch": 1.0218407812962416, + "grad_norm": 1.5787099599838257, + "learning_rate": 1.5348592163439655e-05, + "loss": 0.2163, + "step": 2158 + }, + { + "epoch": 1.0223142941698729, + "grad_norm": 1.2531263828277588, + "learning_rate": 1.5344270349031486e-05, + "loss": 0.2435, + "step": 2159 + }, + { + "epoch": 1.022787807043504, + "grad_norm": 1.2263544797897339, + "learning_rate": 1.5339947136880962e-05, + "loss": 0.213, + "step": 2160 + }, + { + "epoch": 1.0232613199171352, + "grad_norm": 1.8795762062072754, + "learning_rate": 1.5335622528118777e-05, + "loss": 0.2309, + "step": 2161 + }, + { + "epoch": 1.0237348327907665, + "grad_norm": 1.558018684387207, + "learning_rate": 1.5331296523876e-05, + "loss": 0.231, + "step": 2162 + }, + { + "epoch": 1.0242083456643978, + "grad_norm": 1.8025420904159546, + "learning_rate": 1.5326969125284043e-05, + "loss": 0.202, + "step": 2163 + }, + { + "epoch": 1.024681858538029, + "grad_norm": 1.190397024154663, + "learning_rate": 1.5322640333474704e-05, + "loss": 0.2224, + "step": 2164 + }, + { + "epoch": 1.0251553714116604, + "grad_norm": 1.9141662120819092, + "learning_rate": 1.5318310149580133e-05, + "loss": 0.2584, + "step": 2165 + }, + { + "epoch": 1.0256288842852914, + "grad_norm": 1.1253691911697388, + "learning_rate": 1.531397857473285e-05, + "loss": 0.2169, + "step": 2166 + }, + { + "epoch": 1.0261023971589227, + "grad_norm": 1.264083981513977, + "learning_rate": 1.530964561006574e-05, + "loss": 0.2143, + "step": 2167 + }, + { + "epoch": 1.026575910032554, + "grad_norm": 1.5587284564971924, + "learning_rate": 1.5305311256712038e-05, + "loss": 0.2312, + "step": 2168 + }, + { + "epoch": 1.0270494229061853, + "grad_norm": 1.0813679695129395, + "learning_rate": 1.5300975515805358e-05, + "loss": 0.2241, + "step": 2169 + }, + { + "epoch": 1.0275229357798166, + "grad_norm": 1.4243192672729492, + "learning_rate": 1.5296638388479673e-05, + "loss": 0.2121, + "step": 2170 + }, + { + "epoch": 1.0279964486534479, + "grad_norm": 1.1746392250061035, + "learning_rate": 1.5292299875869313e-05, + "loss": 0.238, + "step": 2171 + }, + { + "epoch": 1.028469961527079, + "grad_norm": 1.360106110572815, + "learning_rate": 1.528795997910898e-05, + "loss": 0.2336, + "step": 2172 + }, + { + "epoch": 1.0289434744007102, + "grad_norm": 1.4826574325561523, + "learning_rate": 1.5283618699333725e-05, + "loss": 0.2278, + "step": 2173 + }, + { + "epoch": 1.0294169872743415, + "grad_norm": 1.1893174648284912, + "learning_rate": 1.5279276037678972e-05, + "loss": 0.2175, + "step": 2174 + }, + { + "epoch": 1.0298905001479728, + "grad_norm": 1.326848030090332, + "learning_rate": 1.5274931995280504e-05, + "loss": 0.2246, + "step": 2175 + }, + { + "epoch": 1.030364013021604, + "grad_norm": 1.2376220226287842, + "learning_rate": 1.527058657327446e-05, + "loss": 0.2405, + "step": 2176 + }, + { + "epoch": 1.0308375258952354, + "grad_norm": 0.9731859564781189, + "learning_rate": 1.5266239772797343e-05, + "loss": 0.2302, + "step": 2177 + }, + { + "epoch": 1.0313110387688664, + "grad_norm": 1.755089521408081, + "learning_rate": 1.5261891594986014e-05, + "loss": 0.1949, + "step": 2178 + }, + { + "epoch": 1.0317845516424977, + "grad_norm": 2.296330451965332, + "learning_rate": 1.5257542040977706e-05, + "loss": 0.2192, + "step": 2179 + }, + { + "epoch": 1.032258064516129, + "grad_norm": 2.243488311767578, + "learning_rate": 1.5253191111909997e-05, + "loss": 0.2227, + "step": 2180 + }, + { + "epoch": 1.0327315773897603, + "grad_norm": 1.5681140422821045, + "learning_rate": 1.524883880892083e-05, + "loss": 0.2159, + "step": 2181 + }, + { + "epoch": 1.0332050902633916, + "grad_norm": 1.9723291397094727, + "learning_rate": 1.5244485133148507e-05, + "loss": 0.232, + "step": 2182 + }, + { + "epoch": 1.0336786031370229, + "grad_norm": 1.862541913986206, + "learning_rate": 1.5240130085731695e-05, + "loss": 0.2463, + "step": 2183 + }, + { + "epoch": 1.034152116010654, + "grad_norm": 1.2561465501785278, + "learning_rate": 1.5235773667809409e-05, + "loss": 0.2121, + "step": 2184 + }, + { + "epoch": 1.0346256288842852, + "grad_norm": 1.7062082290649414, + "learning_rate": 1.5231415880521032e-05, + "loss": 0.2215, + "step": 2185 + }, + { + "epoch": 1.0350991417579165, + "grad_norm": 1.24093496799469, + "learning_rate": 1.5227056725006301e-05, + "loss": 0.2052, + "step": 2186 + }, + { + "epoch": 1.0355726546315478, + "grad_norm": 1.7129935026168823, + "learning_rate": 1.5222696202405307e-05, + "loss": 0.1988, + "step": 2187 + }, + { + "epoch": 1.036046167505179, + "grad_norm": 1.3242411613464355, + "learning_rate": 1.5218334313858507e-05, + "loss": 0.2089, + "step": 2188 + }, + { + "epoch": 1.0365196803788104, + "grad_norm": 2.027369737625122, + "learning_rate": 1.5213971060506709e-05, + "loss": 0.2572, + "step": 2189 + }, + { + "epoch": 1.0369931932524414, + "grad_norm": 1.3638339042663574, + "learning_rate": 1.520960644349108e-05, + "loss": 0.2261, + "step": 2190 + }, + { + "epoch": 1.0374667061260727, + "grad_norm": 1.4748543500900269, + "learning_rate": 1.5205240463953146e-05, + "loss": 0.2149, + "step": 2191 + }, + { + "epoch": 1.037940218999704, + "grad_norm": 1.1796306371688843, + "learning_rate": 1.5200873123034783e-05, + "loss": 0.237, + "step": 2192 + }, + { + "epoch": 1.0384137318733353, + "grad_norm": 1.1525838375091553, + "learning_rate": 1.5196504421878229e-05, + "loss": 0.2385, + "step": 2193 + }, + { + "epoch": 1.0388872447469666, + "grad_norm": 2.3929810523986816, + "learning_rate": 1.5192134361626074e-05, + "loss": 0.2005, + "step": 2194 + }, + { + "epoch": 1.0393607576205979, + "grad_norm": 1.697789192199707, + "learning_rate": 1.5187762943421266e-05, + "loss": 0.2422, + "step": 2195 + }, + { + "epoch": 1.0398342704942292, + "grad_norm": 1.2148844003677368, + "learning_rate": 1.5183390168407108e-05, + "loss": 0.2188, + "step": 2196 + }, + { + "epoch": 1.0403077833678602, + "grad_norm": 1.2797152996063232, + "learning_rate": 1.5179016037727256e-05, + "loss": 0.215, + "step": 2197 + }, + { + "epoch": 1.0407812962414915, + "grad_norm": 1.7191182374954224, + "learning_rate": 1.5174640552525724e-05, + "loss": 0.2422, + "step": 2198 + }, + { + "epoch": 1.0412548091151228, + "grad_norm": 1.3166422843933105, + "learning_rate": 1.5170263713946873e-05, + "loss": 0.22, + "step": 2199 + }, + { + "epoch": 1.041728321988754, + "grad_norm": 1.1417112350463867, + "learning_rate": 1.5165885523135426e-05, + "loss": 0.2137, + "step": 2200 + }, + { + "epoch": 1.0422018348623854, + "grad_norm": 1.1441370248794556, + "learning_rate": 1.516150598123646e-05, + "loss": 0.2146, + "step": 2201 + }, + { + "epoch": 1.0426753477360167, + "grad_norm": 1.5537104606628418, + "learning_rate": 1.5157125089395397e-05, + "loss": 0.21, + "step": 2202 + }, + { + "epoch": 1.0431488606096477, + "grad_norm": 1.3530882596969604, + "learning_rate": 1.5152742848758018e-05, + "loss": 0.2085, + "step": 2203 + }, + { + "epoch": 1.043622373483279, + "grad_norm": 1.541207194328308, + "learning_rate": 1.5148359260470456e-05, + "loss": 0.2315, + "step": 2204 + }, + { + "epoch": 1.0440958863569103, + "grad_norm": 1.0132625102996826, + "learning_rate": 1.5143974325679196e-05, + "loss": 0.2008, + "step": 2205 + }, + { + "epoch": 1.0445693992305416, + "grad_norm": 1.1668498516082764, + "learning_rate": 1.5139588045531077e-05, + "loss": 0.2239, + "step": 2206 + }, + { + "epoch": 1.0450429121041729, + "grad_norm": 1.6192450523376465, + "learning_rate": 1.5135200421173288e-05, + "loss": 0.2306, + "step": 2207 + }, + { + "epoch": 1.0455164249778042, + "grad_norm": 1.6233320236206055, + "learning_rate": 1.5130811453753369e-05, + "loss": 0.2243, + "step": 2208 + }, + { + "epoch": 1.0459899378514352, + "grad_norm": 1.1391911506652832, + "learning_rate": 1.512642114441921e-05, + "loss": 0.1966, + "step": 2209 + }, + { + "epoch": 1.0464634507250665, + "grad_norm": 1.7349532842636108, + "learning_rate": 1.512202949431906e-05, + "loss": 0.2256, + "step": 2210 + }, + { + "epoch": 1.0469369635986978, + "grad_norm": 1.411200761795044, + "learning_rate": 1.5117636504601505e-05, + "loss": 0.2225, + "step": 2211 + }, + { + "epoch": 1.047410476472329, + "grad_norm": 1.5413966178894043, + "learning_rate": 1.5113242176415495e-05, + "loss": 0.2301, + "step": 2212 + }, + { + "epoch": 1.0478839893459604, + "grad_norm": 1.4936213493347168, + "learning_rate": 1.5108846510910322e-05, + "loss": 0.2299, + "step": 2213 + }, + { + "epoch": 1.0483575022195917, + "grad_norm": 2.126307964324951, + "learning_rate": 1.5104449509235628e-05, + "loss": 0.2195, + "step": 2214 + }, + { + "epoch": 1.0488310150932227, + "grad_norm": 1.6920348405838013, + "learning_rate": 1.5100051172541408e-05, + "loss": 0.2081, + "step": 2215 + }, + { + "epoch": 1.049304527966854, + "grad_norm": 1.1273527145385742, + "learning_rate": 1.5095651501978007e-05, + "loss": 0.2157, + "step": 2216 + }, + { + "epoch": 1.0497780408404853, + "grad_norm": 0.9604322910308838, + "learning_rate": 1.5091250498696113e-05, + "loss": 0.2404, + "step": 2217 + }, + { + "epoch": 1.0502515537141166, + "grad_norm": 1.639671802520752, + "learning_rate": 1.508684816384677e-05, + "loss": 0.2384, + "step": 2218 + }, + { + "epoch": 1.050725066587748, + "grad_norm": 1.559471845626831, + "learning_rate": 1.5082444498581362e-05, + "loss": 0.2078, + "step": 2219 + }, + { + "epoch": 1.0511985794613792, + "grad_norm": 1.1831077337265015, + "learning_rate": 1.5078039504051626e-05, + "loss": 0.2396, + "step": 2220 + }, + { + "epoch": 1.0516720923350102, + "grad_norm": 1.6754920482635498, + "learning_rate": 1.5073633181409645e-05, + "loss": 0.2161, + "step": 2221 + }, + { + "epoch": 1.0521456052086415, + "grad_norm": 1.6285377740859985, + "learning_rate": 1.5069225531807852e-05, + "loss": 0.2252, + "step": 2222 + }, + { + "epoch": 1.0526191180822728, + "grad_norm": 0.9481242895126343, + "learning_rate": 1.5064816556399027e-05, + "loss": 0.2061, + "step": 2223 + }, + { + "epoch": 1.053092630955904, + "grad_norm": 1.5364612340927124, + "learning_rate": 1.506040625633629e-05, + "loss": 0.229, + "step": 2224 + }, + { + "epoch": 1.0535661438295354, + "grad_norm": 1.2100945711135864, + "learning_rate": 1.5055994632773119e-05, + "loss": 0.2278, + "step": 2225 + }, + { + "epoch": 1.0540396567031667, + "grad_norm": 1.6874003410339355, + "learning_rate": 1.5051581686863323e-05, + "loss": 0.2117, + "step": 2226 + }, + { + "epoch": 1.054513169576798, + "grad_norm": 1.3330941200256348, + "learning_rate": 1.5047167419761075e-05, + "loss": 0.2267, + "step": 2227 + }, + { + "epoch": 1.054986682450429, + "grad_norm": 1.1632763147354126, + "learning_rate": 1.5042751832620879e-05, + "loss": 0.234, + "step": 2228 + }, + { + "epoch": 1.0554601953240603, + "grad_norm": 1.2275227308273315, + "learning_rate": 1.5038334926597587e-05, + "loss": 0.2079, + "step": 2229 + }, + { + "epoch": 1.0559337081976916, + "grad_norm": 1.616982102394104, + "learning_rate": 1.50339167028464e-05, + "loss": 0.2224, + "step": 2230 + }, + { + "epoch": 1.056407221071323, + "grad_norm": 1.067596197128296, + "learning_rate": 1.5029497162522865e-05, + "loss": 0.229, + "step": 2231 + }, + { + "epoch": 1.0568807339449542, + "grad_norm": 1.3072737455368042, + "learning_rate": 1.5025076306782866e-05, + "loss": 0.205, + "step": 2232 + }, + { + "epoch": 1.0573542468185855, + "grad_norm": 1.5054287910461426, + "learning_rate": 1.5020654136782637e-05, + "loss": 0.2159, + "step": 2233 + }, + { + "epoch": 1.0578277596922165, + "grad_norm": 1.238438606262207, + "learning_rate": 1.5016230653678757e-05, + "loss": 0.2145, + "step": 2234 + }, + { + "epoch": 1.0583012725658478, + "grad_norm": 1.200659155845642, + "learning_rate": 1.5011805858628137e-05, + "loss": 0.26, + "step": 2235 + }, + { + "epoch": 1.0587747854394791, + "grad_norm": 1.4251799583435059, + "learning_rate": 1.5007379752788045e-05, + "loss": 0.2004, + "step": 2236 + }, + { + "epoch": 1.0592482983131104, + "grad_norm": 1.2545171976089478, + "learning_rate": 1.5002952337316088e-05, + "loss": 0.2127, + "step": 2237 + }, + { + "epoch": 1.0597218111867417, + "grad_norm": 1.177280306816101, + "learning_rate": 1.499852361337021e-05, + "loss": 0.212, + "step": 2238 + }, + { + "epoch": 1.060195324060373, + "grad_norm": 1.241126537322998, + "learning_rate": 1.4994093582108704e-05, + "loss": 0.2193, + "step": 2239 + }, + { + "epoch": 1.060668836934004, + "grad_norm": 1.5119160413742065, + "learning_rate": 1.49896622446902e-05, + "loss": 0.2198, + "step": 2240 + }, + { + "epoch": 1.0611423498076353, + "grad_norm": 1.6282936334609985, + "learning_rate": 1.498522960227367e-05, + "loss": 0.2185, + "step": 2241 + }, + { + "epoch": 1.0616158626812666, + "grad_norm": 0.9974349737167358, + "learning_rate": 1.4980795656018432e-05, + "loss": 0.2323, + "step": 2242 + }, + { + "epoch": 1.062089375554898, + "grad_norm": 1.2673429250717163, + "learning_rate": 1.4976360407084141e-05, + "loss": 0.2032, + "step": 2243 + }, + { + "epoch": 1.0625628884285292, + "grad_norm": 1.0515581369400024, + "learning_rate": 1.4971923856630792e-05, + "loss": 0.201, + "step": 2244 + }, + { + "epoch": 1.0630364013021605, + "grad_norm": 1.0648077726364136, + "learning_rate": 1.4967486005818727e-05, + "loss": 0.2237, + "step": 2245 + }, + { + "epoch": 1.0635099141757918, + "grad_norm": 1.3801108598709106, + "learning_rate": 1.4963046855808618e-05, + "loss": 0.2375, + "step": 2246 + }, + { + "epoch": 1.0639834270494228, + "grad_norm": 1.4944959878921509, + "learning_rate": 1.4958606407761482e-05, + "loss": 0.2441, + "step": 2247 + }, + { + "epoch": 1.0644569399230541, + "grad_norm": 2.0347981452941895, + "learning_rate": 1.4954164662838677e-05, + "loss": 0.2568, + "step": 2248 + }, + { + "epoch": 1.0649304527966854, + "grad_norm": 1.357796311378479, + "learning_rate": 1.4949721622201896e-05, + "loss": 0.2161, + "step": 2249 + }, + { + "epoch": 1.0654039656703167, + "grad_norm": 1.0744569301605225, + "learning_rate": 1.4945277287013178e-05, + "loss": 0.2269, + "step": 2250 + }, + { + "epoch": 1.065877478543948, + "grad_norm": 1.2349190711975098, + "learning_rate": 1.4940831658434893e-05, + "loss": 0.2213, + "step": 2251 + }, + { + "epoch": 1.066350991417579, + "grad_norm": 1.181551456451416, + "learning_rate": 1.4936384737629753e-05, + "loss": 0.218, + "step": 2252 + }, + { + "epoch": 1.0668245042912103, + "grad_norm": 1.792569637298584, + "learning_rate": 1.4931936525760806e-05, + "loss": 0.2103, + "step": 2253 + }, + { + "epoch": 1.0672980171648416, + "grad_norm": 1.04933500289917, + "learning_rate": 1.4927487023991441e-05, + "loss": 0.2085, + "step": 2254 + }, + { + "epoch": 1.067771530038473, + "grad_norm": 1.1748439073562622, + "learning_rate": 1.4923036233485383e-05, + "loss": 0.2093, + "step": 2255 + }, + { + "epoch": 1.0682450429121042, + "grad_norm": 1.149575114250183, + "learning_rate": 1.4918584155406688e-05, + "loss": 0.251, + "step": 2256 + }, + { + "epoch": 1.0687185557857355, + "grad_norm": 1.4834145307540894, + "learning_rate": 1.4914130790919761e-05, + "loss": 0.2117, + "step": 2257 + }, + { + "epoch": 1.0691920686593668, + "grad_norm": 1.1242401599884033, + "learning_rate": 1.4909676141189332e-05, + "loss": 0.2525, + "step": 2258 + }, + { + "epoch": 1.0696655815329978, + "grad_norm": 2.0916712284088135, + "learning_rate": 1.4905220207380468e-05, + "loss": 0.2408, + "step": 2259 + }, + { + "epoch": 1.0701390944066291, + "grad_norm": 1.4777238368988037, + "learning_rate": 1.4900762990658585e-05, + "loss": 0.2203, + "step": 2260 + }, + { + "epoch": 1.0706126072802604, + "grad_norm": 1.251544713973999, + "learning_rate": 1.4896304492189417e-05, + "loss": 0.2283, + "step": 2261 + }, + { + "epoch": 1.0710861201538917, + "grad_norm": 1.3505147695541382, + "learning_rate": 1.489184471313905e-05, + "loss": 0.2255, + "step": 2262 + }, + { + "epoch": 1.071559633027523, + "grad_norm": 1.4952011108398438, + "learning_rate": 1.4887383654673889e-05, + "loss": 0.2214, + "step": 2263 + }, + { + "epoch": 1.0720331459011543, + "grad_norm": 1.3948256969451904, + "learning_rate": 1.488292131796068e-05, + "loss": 0.2502, + "step": 2264 + }, + { + "epoch": 1.0725066587747853, + "grad_norm": 1.5061980485916138, + "learning_rate": 1.4878457704166506e-05, + "loss": 0.1799, + "step": 2265 + }, + { + "epoch": 1.0729801716484166, + "grad_norm": 1.324911117553711, + "learning_rate": 1.4873992814458786e-05, + "loss": 0.2278, + "step": 2266 + }, + { + "epoch": 1.073453684522048, + "grad_norm": 1.1821624040603638, + "learning_rate": 1.4869526650005264e-05, + "loss": 0.2209, + "step": 2267 + }, + { + "epoch": 1.0739271973956792, + "grad_norm": 1.988358736038208, + "learning_rate": 1.4865059211974024e-05, + "loss": 0.2366, + "step": 2268 + }, + { + "epoch": 1.0744007102693105, + "grad_norm": 1.212149739265442, + "learning_rate": 1.4860590501533482e-05, + "loss": 0.2386, + "step": 2269 + }, + { + "epoch": 1.0748742231429418, + "grad_norm": 1.2160612344741821, + "learning_rate": 1.4856120519852383e-05, + "loss": 0.2227, + "step": 2270 + }, + { + "epoch": 1.0753477360165729, + "grad_norm": 1.1903105974197388, + "learning_rate": 1.4851649268099813e-05, + "loss": 0.2442, + "step": 2271 + }, + { + "epoch": 1.0758212488902041, + "grad_norm": 1.6250642538070679, + "learning_rate": 1.484717674744518e-05, + "loss": 0.23, + "step": 2272 + }, + { + "epoch": 1.0762947617638354, + "grad_norm": 1.3602406978607178, + "learning_rate": 1.484270295905823e-05, + "loss": 0.2328, + "step": 2273 + }, + { + "epoch": 1.0767682746374667, + "grad_norm": 1.4769501686096191, + "learning_rate": 1.4838227904109041e-05, + "loss": 0.2319, + "step": 2274 + }, + { + "epoch": 1.077241787511098, + "grad_norm": 1.1678149700164795, + "learning_rate": 1.4833751583768017e-05, + "loss": 0.221, + "step": 2275 + }, + { + "epoch": 1.0777153003847293, + "grad_norm": 2.216442823410034, + "learning_rate": 1.48292739992059e-05, + "loss": 0.2286, + "step": 2276 + }, + { + "epoch": 1.0781888132583606, + "grad_norm": 2.1490554809570312, + "learning_rate": 1.4824795151593756e-05, + "loss": 0.2138, + "step": 2277 + }, + { + "epoch": 1.0786623261319916, + "grad_norm": 1.926507830619812, + "learning_rate": 1.4820315042102986e-05, + "loss": 0.2501, + "step": 2278 + }, + { + "epoch": 1.079135839005623, + "grad_norm": 1.7893540859222412, + "learning_rate": 1.481583367190532e-05, + "loss": 0.2324, + "step": 2279 + }, + { + "epoch": 1.0796093518792542, + "grad_norm": 1.2722523212432861, + "learning_rate": 1.4811351042172813e-05, + "loss": 0.2112, + "step": 2280 + }, + { + "epoch": 1.0800828647528855, + "grad_norm": 2.427342414855957, + "learning_rate": 1.480686715407786e-05, + "loss": 0.2359, + "step": 2281 + }, + { + "epoch": 1.0805563776265168, + "grad_norm": 1.4813814163208008, + "learning_rate": 1.4802382008793174e-05, + "loss": 0.2157, + "step": 2282 + }, + { + "epoch": 1.0810298905001479, + "grad_norm": 1.7409425973892212, + "learning_rate": 1.4797895607491803e-05, + "loss": 0.2105, + "step": 2283 + }, + { + "epoch": 1.0815034033737791, + "grad_norm": 1.6621959209442139, + "learning_rate": 1.4793407951347125e-05, + "loss": 0.2328, + "step": 2284 + }, + { + "epoch": 1.0819769162474104, + "grad_norm": 1.1202408075332642, + "learning_rate": 1.4788919041532836e-05, + "loss": 0.2422, + "step": 2285 + }, + { + "epoch": 1.0824504291210417, + "grad_norm": 1.863373041152954, + "learning_rate": 1.4784428879222974e-05, + "loss": 0.2561, + "step": 2286 + }, + { + "epoch": 1.082923941994673, + "grad_norm": 1.392762541770935, + "learning_rate": 1.4779937465591893e-05, + "loss": 0.2376, + "step": 2287 + }, + { + "epoch": 1.0833974548683043, + "grad_norm": 1.4913394451141357, + "learning_rate": 1.4775444801814283e-05, + "loss": 0.2238, + "step": 2288 + }, + { + "epoch": 1.0838709677419356, + "grad_norm": 0.916836142539978, + "learning_rate": 1.4770950889065154e-05, + "loss": 0.2274, + "step": 2289 + }, + { + "epoch": 1.0843444806155667, + "grad_norm": 1.3088781833648682, + "learning_rate": 1.4766455728519846e-05, + "loss": 0.2168, + "step": 2290 + }, + { + "epoch": 1.084817993489198, + "grad_norm": 1.986661672592163, + "learning_rate": 1.4761959321354025e-05, + "loss": 0.2137, + "step": 2291 + }, + { + "epoch": 1.0852915063628292, + "grad_norm": 2.061962604522705, + "learning_rate": 1.4757461668743681e-05, + "loss": 0.1949, + "step": 2292 + }, + { + "epoch": 1.0857650192364605, + "grad_norm": 1.4279181957244873, + "learning_rate": 1.4752962771865136e-05, + "loss": 0.2108, + "step": 2293 + }, + { + "epoch": 1.0862385321100918, + "grad_norm": 1.3412773609161377, + "learning_rate": 1.474846263189503e-05, + "loss": 0.2322, + "step": 2294 + }, + { + "epoch": 1.086712044983723, + "grad_norm": 2.3214855194091797, + "learning_rate": 1.4743961250010331e-05, + "loss": 0.198, + "step": 2295 + }, + { + "epoch": 1.0871855578573542, + "grad_norm": 1.6193276643753052, + "learning_rate": 1.4739458627388332e-05, + "loss": 0.2224, + "step": 2296 + }, + { + "epoch": 1.0876590707309854, + "grad_norm": 1.793225646018982, + "learning_rate": 1.4734954765206652e-05, + "loss": 0.2259, + "step": 2297 + }, + { + "epoch": 1.0881325836046167, + "grad_norm": 1.7120318412780762, + "learning_rate": 1.4730449664643234e-05, + "loss": 0.2271, + "step": 2298 + }, + { + "epoch": 1.088606096478248, + "grad_norm": 1.1128010749816895, + "learning_rate": 1.4725943326876342e-05, + "loss": 0.2003, + "step": 2299 + }, + { + "epoch": 1.0890796093518793, + "grad_norm": 1.3492709398269653, + "learning_rate": 1.472143575308456e-05, + "loss": 0.2332, + "step": 2300 + }, + { + "epoch": 1.0895531222255106, + "grad_norm": 1.4029978513717651, + "learning_rate": 1.471692694444681e-05, + "loss": 0.2185, + "step": 2301 + }, + { + "epoch": 1.0900266350991417, + "grad_norm": 1.254938006401062, + "learning_rate": 1.471241690214232e-05, + "loss": 0.223, + "step": 2302 + }, + { + "epoch": 1.090500147972773, + "grad_norm": 1.5678329467773438, + "learning_rate": 1.4707905627350653e-05, + "loss": 0.2186, + "step": 2303 + }, + { + "epoch": 1.0909736608464042, + "grad_norm": 1.8724418878555298, + "learning_rate": 1.4703393121251685e-05, + "loss": 0.2135, + "step": 2304 + }, + { + "epoch": 1.0914471737200355, + "grad_norm": 1.3198801279067993, + "learning_rate": 1.4698879385025625e-05, + "loss": 0.2076, + "step": 2305 + }, + { + "epoch": 1.0919206865936668, + "grad_norm": 1.733998417854309, + "learning_rate": 1.4694364419852986e-05, + "loss": 0.2184, + "step": 2306 + }, + { + "epoch": 1.092394199467298, + "grad_norm": 0.94035804271698, + "learning_rate": 1.4689848226914627e-05, + "loss": 0.2124, + "step": 2307 + }, + { + "epoch": 1.0928677123409292, + "grad_norm": 0.9416120052337646, + "learning_rate": 1.4685330807391704e-05, + "loss": 0.2126, + "step": 2308 + }, + { + "epoch": 1.0933412252145605, + "grad_norm": 1.1188304424285889, + "learning_rate": 1.468081216246571e-05, + "loss": 0.2299, + "step": 2309 + }, + { + "epoch": 1.0938147380881917, + "grad_norm": 1.0400240421295166, + "learning_rate": 1.467629229331845e-05, + "loss": 0.2196, + "step": 2310 + }, + { + "epoch": 1.094288250961823, + "grad_norm": 1.3964438438415527, + "learning_rate": 1.4671771201132054e-05, + "loss": 0.2477, + "step": 2311 + }, + { + "epoch": 1.0947617638354543, + "grad_norm": 1.3592251539230347, + "learning_rate": 1.4667248887088971e-05, + "loss": 0.2073, + "step": 2312 + }, + { + "epoch": 1.0952352767090856, + "grad_norm": 3.0954136848449707, + "learning_rate": 1.4662725352371967e-05, + "loss": 0.2125, + "step": 2313 + }, + { + "epoch": 1.0957087895827167, + "grad_norm": 1.2312672138214111, + "learning_rate": 1.4658200598164127e-05, + "loss": 0.2282, + "step": 2314 + }, + { + "epoch": 1.096182302456348, + "grad_norm": 1.0197677612304688, + "learning_rate": 1.4653674625648861e-05, + "loss": 0.2203, + "step": 2315 + }, + { + "epoch": 1.0966558153299792, + "grad_norm": 1.2136882543563843, + "learning_rate": 1.4649147436009895e-05, + "loss": 0.2358, + "step": 2316 + }, + { + "epoch": 1.0971293282036105, + "grad_norm": 1.1311389207839966, + "learning_rate": 1.4644619030431264e-05, + "loss": 0.2187, + "step": 2317 + }, + { + "epoch": 1.0976028410772418, + "grad_norm": 1.0771368741989136, + "learning_rate": 1.4640089410097333e-05, + "loss": 0.2128, + "step": 2318 + }, + { + "epoch": 1.098076353950873, + "grad_norm": 1.247301459312439, + "learning_rate": 1.463555857619278e-05, + "loss": 0.2158, + "step": 2319 + }, + { + "epoch": 1.0985498668245044, + "grad_norm": 1.5069024562835693, + "learning_rate": 1.4631026529902601e-05, + "loss": 0.2248, + "step": 2320 + }, + { + "epoch": 1.0990233796981355, + "grad_norm": 1.451413869857788, + "learning_rate": 1.462649327241211e-05, + "loss": 0.2121, + "step": 2321 + }, + { + "epoch": 1.0994968925717667, + "grad_norm": 1.4309951066970825, + "learning_rate": 1.4621958804906938e-05, + "loss": 0.2378, + "step": 2322 + }, + { + "epoch": 1.099970405445398, + "grad_norm": 1.5729237794876099, + "learning_rate": 1.4617423128573028e-05, + "loss": 0.2183, + "step": 2323 + }, + { + "epoch": 1.1004439183190293, + "grad_norm": 2.354729413986206, + "learning_rate": 1.4612886244596647e-05, + "loss": 0.223, + "step": 2324 + }, + { + "epoch": 1.1009174311926606, + "grad_norm": 1.197218894958496, + "learning_rate": 1.4608348154164367e-05, + "loss": 0.2095, + "step": 2325 + }, + { + "epoch": 1.101390944066292, + "grad_norm": 2.5518031120300293, + "learning_rate": 1.4603808858463085e-05, + "loss": 0.2002, + "step": 2326 + }, + { + "epoch": 1.101864456939923, + "grad_norm": 1.1937416791915894, + "learning_rate": 1.459926835868001e-05, + "loss": 0.2171, + "step": 2327 + }, + { + "epoch": 1.1023379698135543, + "grad_norm": 1.7216546535491943, + "learning_rate": 1.4594726656002664e-05, + "loss": 0.2304, + "step": 2328 + }, + { + "epoch": 1.1028114826871855, + "grad_norm": 1.408703327178955, + "learning_rate": 1.4590183751618889e-05, + "loss": 0.2391, + "step": 2329 + }, + { + "epoch": 1.1032849955608168, + "grad_norm": 1.8831274509429932, + "learning_rate": 1.4585639646716836e-05, + "loss": 0.2414, + "step": 2330 + }, + { + "epoch": 1.1037585084344481, + "grad_norm": 1.7065414190292358, + "learning_rate": 1.4581094342484972e-05, + "loss": 0.2365, + "step": 2331 + }, + { + "epoch": 1.1042320213080794, + "grad_norm": 1.767787218093872, + "learning_rate": 1.4576547840112077e-05, + "loss": 0.2229, + "step": 2332 + }, + { + "epoch": 1.1047055341817105, + "grad_norm": 1.4647932052612305, + "learning_rate": 1.4572000140787244e-05, + "loss": 0.2404, + "step": 2333 + }, + { + "epoch": 1.1051790470553418, + "grad_norm": 1.9528988599777222, + "learning_rate": 1.4567451245699884e-05, + "loss": 0.2326, + "step": 2334 + }, + { + "epoch": 1.105652559928973, + "grad_norm": 1.284548282623291, + "learning_rate": 1.456290115603971e-05, + "loss": 0.2263, + "step": 2335 + }, + { + "epoch": 1.1061260728026043, + "grad_norm": 1.5598032474517822, + "learning_rate": 1.4558349872996762e-05, + "loss": 0.2211, + "step": 2336 + }, + { + "epoch": 1.1065995856762356, + "grad_norm": 1.3689323663711548, + "learning_rate": 1.4553797397761376e-05, + "loss": 0.2168, + "step": 2337 + }, + { + "epoch": 1.107073098549867, + "grad_norm": 0.9562427401542664, + "learning_rate": 1.4549243731524213e-05, + "loss": 0.213, + "step": 2338 + }, + { + "epoch": 1.107546611423498, + "grad_norm": 1.2045793533325195, + "learning_rate": 1.4544688875476238e-05, + "loss": 0.2287, + "step": 2339 + }, + { + "epoch": 1.1080201242971293, + "grad_norm": 1.2727464437484741, + "learning_rate": 1.4540132830808733e-05, + "loss": 0.2112, + "step": 2340 + }, + { + "epoch": 1.1084936371707605, + "grad_norm": 1.1461354494094849, + "learning_rate": 1.4535575598713287e-05, + "loss": 0.2019, + "step": 2341 + }, + { + "epoch": 1.1089671500443918, + "grad_norm": 1.330062985420227, + "learning_rate": 1.4531017180381797e-05, + "loss": 0.212, + "step": 2342 + }, + { + "epoch": 1.1094406629180231, + "grad_norm": 1.4322924613952637, + "learning_rate": 1.4526457577006473e-05, + "loss": 0.2197, + "step": 2343 + }, + { + "epoch": 1.1099141757916544, + "grad_norm": 1.5362545251846313, + "learning_rate": 1.4521896789779842e-05, + "loss": 0.2195, + "step": 2344 + }, + { + "epoch": 1.1103876886652855, + "grad_norm": 1.0807383060455322, + "learning_rate": 1.4517334819894724e-05, + "loss": 0.2394, + "step": 2345 + }, + { + "epoch": 1.1108612015389168, + "grad_norm": 1.3042844533920288, + "learning_rate": 1.4512771668544266e-05, + "loss": 0.2393, + "step": 2346 + }, + { + "epoch": 1.111334714412548, + "grad_norm": 1.3964802026748657, + "learning_rate": 1.4508207336921914e-05, + "loss": 0.2293, + "step": 2347 + }, + { + "epoch": 1.1118082272861793, + "grad_norm": 0.9700664281845093, + "learning_rate": 1.4503641826221424e-05, + "loss": 0.2346, + "step": 2348 + }, + { + "epoch": 1.1122817401598106, + "grad_norm": 1.142829418182373, + "learning_rate": 1.449907513763686e-05, + "loss": 0.2185, + "step": 2349 + }, + { + "epoch": 1.112755253033442, + "grad_norm": 1.3726601600646973, + "learning_rate": 1.44945072723626e-05, + "loss": 0.2247, + "step": 2350 + }, + { + "epoch": 1.1132287659070732, + "grad_norm": 1.0523629188537598, + "learning_rate": 1.4489938231593321e-05, + "loss": 0.2263, + "step": 2351 + }, + { + "epoch": 1.1137022787807043, + "grad_norm": 1.1638647317886353, + "learning_rate": 1.4485368016524013e-05, + "loss": 0.2094, + "step": 2352 + }, + { + "epoch": 1.1141757916543356, + "grad_norm": 1.121314525604248, + "learning_rate": 1.4480796628349972e-05, + "loss": 0.2036, + "step": 2353 + }, + { + "epoch": 1.1146493045279668, + "grad_norm": 1.4763176441192627, + "learning_rate": 1.4476224068266798e-05, + "loss": 0.2385, + "step": 2354 + }, + { + "epoch": 1.1151228174015981, + "grad_norm": 1.7545726299285889, + "learning_rate": 1.4471650337470402e-05, + "loss": 0.2234, + "step": 2355 + }, + { + "epoch": 1.1155963302752294, + "grad_norm": 1.8628343343734741, + "learning_rate": 1.4467075437156998e-05, + "loss": 0.2142, + "step": 2356 + }, + { + "epoch": 1.1160698431488607, + "grad_norm": 1.5996010303497314, + "learning_rate": 1.446249936852311e-05, + "loss": 0.2413, + "step": 2357 + }, + { + "epoch": 1.1165433560224918, + "grad_norm": 1.7511857748031616, + "learning_rate": 1.4457922132765563e-05, + "loss": 0.2044, + "step": 2358 + }, + { + "epoch": 1.117016868896123, + "grad_norm": 0.8672329783439636, + "learning_rate": 1.4453343731081488e-05, + "loss": 0.2006, + "step": 2359 + }, + { + "epoch": 1.1174903817697543, + "grad_norm": 1.460821509361267, + "learning_rate": 1.444876416466832e-05, + "loss": 0.2255, + "step": 2360 + }, + { + "epoch": 1.1179638946433856, + "grad_norm": 1.3242175579071045, + "learning_rate": 1.4444183434723807e-05, + "loss": 0.2342, + "step": 2361 + }, + { + "epoch": 1.118437407517017, + "grad_norm": 1.3085192441940308, + "learning_rate": 1.4439601542445987e-05, + "loss": 0.2688, + "step": 2362 + }, + { + "epoch": 1.1189109203906482, + "grad_norm": 1.3582148551940918, + "learning_rate": 1.4435018489033214e-05, + "loss": 0.2452, + "step": 2363 + }, + { + "epoch": 1.1193844332642793, + "grad_norm": 1.1882847547531128, + "learning_rate": 1.443043427568414e-05, + "loss": 0.2238, + "step": 2364 + }, + { + "epoch": 1.1198579461379106, + "grad_norm": 1.574207067489624, + "learning_rate": 1.4425848903597724e-05, + "loss": 0.2382, + "step": 2365 + }, + { + "epoch": 1.1203314590115419, + "grad_norm": 1.0986261367797852, + "learning_rate": 1.4421262373973223e-05, + "loss": 0.2102, + "step": 2366 + }, + { + "epoch": 1.1208049718851731, + "grad_norm": 1.3533656597137451, + "learning_rate": 1.4416674688010202e-05, + "loss": 0.2158, + "step": 2367 + }, + { + "epoch": 1.1212784847588044, + "grad_norm": 1.6296641826629639, + "learning_rate": 1.4412085846908526e-05, + "loss": 0.2101, + "step": 2368 + }, + { + "epoch": 1.1217519976324357, + "grad_norm": 1.0967878103256226, + "learning_rate": 1.4407495851868359e-05, + "loss": 0.2318, + "step": 2369 + }, + { + "epoch": 1.1222255105060668, + "grad_norm": 1.2239967584609985, + "learning_rate": 1.440290470409017e-05, + "loss": 0.2045, + "step": 2370 + }, + { + "epoch": 1.122699023379698, + "grad_norm": 1.5295300483703613, + "learning_rate": 1.4398312404774735e-05, + "loss": 0.2554, + "step": 2371 + }, + { + "epoch": 1.1231725362533294, + "grad_norm": 1.227840542793274, + "learning_rate": 1.439371895512312e-05, + "loss": 0.2367, + "step": 2372 + }, + { + "epoch": 1.1236460491269606, + "grad_norm": 1.9119755029678345, + "learning_rate": 1.4389124356336696e-05, + "loss": 0.2372, + "step": 2373 + }, + { + "epoch": 1.124119562000592, + "grad_norm": 1.034201979637146, + "learning_rate": 1.4384528609617143e-05, + "loss": 0.199, + "step": 2374 + }, + { + "epoch": 1.1245930748742232, + "grad_norm": 1.1491769552230835, + "learning_rate": 1.4379931716166429e-05, + "loss": 0.2293, + "step": 2375 + }, + { + "epoch": 1.1250665877478543, + "grad_norm": 1.0914734601974487, + "learning_rate": 1.4375333677186829e-05, + "loss": 0.2179, + "step": 2376 + }, + { + "epoch": 1.1255401006214856, + "grad_norm": 2.126800775527954, + "learning_rate": 1.4370734493880916e-05, + "loss": 0.2166, + "step": 2377 + }, + { + "epoch": 1.1260136134951169, + "grad_norm": 2.5790648460388184, + "learning_rate": 1.4366134167451562e-05, + "loss": 0.256, + "step": 2378 + }, + { + "epoch": 1.1264871263687481, + "grad_norm": 1.1876254081726074, + "learning_rate": 1.4361532699101935e-05, + "loss": 0.223, + "step": 2379 + }, + { + "epoch": 1.1269606392423794, + "grad_norm": 1.856112003326416, + "learning_rate": 1.4356930090035508e-05, + "loss": 0.2211, + "step": 2380 + }, + { + "epoch": 1.1274341521160107, + "grad_norm": 1.3353132009506226, + "learning_rate": 1.4352326341456048e-05, + "loss": 0.2256, + "step": 2381 + }, + { + "epoch": 1.127907664989642, + "grad_norm": 1.3428007364273071, + "learning_rate": 1.4347721454567623e-05, + "loss": 0.2296, + "step": 2382 + }, + { + "epoch": 1.128381177863273, + "grad_norm": 1.5457638502120972, + "learning_rate": 1.4343115430574592e-05, + "loss": 0.2188, + "step": 2383 + }, + { + "epoch": 1.1288546907369044, + "grad_norm": 1.9435369968414307, + "learning_rate": 1.433850827068162e-05, + "loss": 0.2184, + "step": 2384 + }, + { + "epoch": 1.1293282036105357, + "grad_norm": 1.060576319694519, + "learning_rate": 1.4333899976093664e-05, + "loss": 0.2332, + "step": 2385 + }, + { + "epoch": 1.129801716484167, + "grad_norm": 2.13041090965271, + "learning_rate": 1.4329290548015981e-05, + "loss": 0.231, + "step": 2386 + }, + { + "epoch": 1.1302752293577982, + "grad_norm": 1.1683634519577026, + "learning_rate": 1.4324679987654118e-05, + "loss": 0.211, + "step": 2387 + }, + { + "epoch": 1.1307487422314293, + "grad_norm": 1.2120347023010254, + "learning_rate": 1.4320068296213926e-05, + "loss": 0.2094, + "step": 2388 + }, + { + "epoch": 1.1312222551050606, + "grad_norm": 1.454931616783142, + "learning_rate": 1.4315455474901547e-05, + "loss": 0.2472, + "step": 2389 + }, + { + "epoch": 1.1316957679786919, + "grad_norm": 1.3851542472839355, + "learning_rate": 1.4310841524923421e-05, + "loss": 0.2272, + "step": 2390 + }, + { + "epoch": 1.1321692808523232, + "grad_norm": 1.0476808547973633, + "learning_rate": 1.4306226447486283e-05, + "loss": 0.2176, + "step": 2391 + }, + { + "epoch": 1.1326427937259544, + "grad_norm": 1.5204083919525146, + "learning_rate": 1.4301610243797158e-05, + "loss": 0.2392, + "step": 2392 + }, + { + "epoch": 1.1331163065995857, + "grad_norm": 1.792072057723999, + "learning_rate": 1.4296992915063373e-05, + "loss": 0.2097, + "step": 2393 + }, + { + "epoch": 1.133589819473217, + "grad_norm": 1.989022135734558, + "learning_rate": 1.4292374462492547e-05, + "loss": 0.2483, + "step": 2394 + }, + { + "epoch": 1.134063332346848, + "grad_norm": 1.7426658868789673, + "learning_rate": 1.4287754887292589e-05, + "loss": 0.2133, + "step": 2395 + }, + { + "epoch": 1.1345368452204794, + "grad_norm": 1.5938878059387207, + "learning_rate": 1.42831341906717e-05, + "loss": 0.2123, + "step": 2396 + }, + { + "epoch": 1.1350103580941107, + "grad_norm": 1.6211594343185425, + "learning_rate": 1.4278512373838386e-05, + "loss": 0.2364, + "step": 2397 + }, + { + "epoch": 1.135483870967742, + "grad_norm": 0.9879429936408997, + "learning_rate": 1.4273889438001439e-05, + "loss": 0.2109, + "step": 2398 + }, + { + "epoch": 1.1359573838413732, + "grad_norm": 1.1694254875183105, + "learning_rate": 1.426926538436994e-05, + "loss": 0.2156, + "step": 2399 + }, + { + "epoch": 1.1364308967150045, + "grad_norm": 1.0623806715011597, + "learning_rate": 1.4264640214153263e-05, + "loss": 0.1991, + "step": 2400 + }, + { + "epoch": 1.1369044095886358, + "grad_norm": 1.6476950645446777, + "learning_rate": 1.4260013928561081e-05, + "loss": 0.242, + "step": 2401 + }, + { + "epoch": 1.1373779224622669, + "grad_norm": 1.9095466136932373, + "learning_rate": 1.4255386528803354e-05, + "loss": 0.2194, + "step": 2402 + }, + { + "epoch": 1.1378514353358982, + "grad_norm": 1.2309021949768066, + "learning_rate": 1.4250758016090335e-05, + "loss": 0.2141, + "step": 2403 + }, + { + "epoch": 1.1383249482095295, + "grad_norm": 1.0731322765350342, + "learning_rate": 1.4246128391632562e-05, + "loss": 0.2075, + "step": 2404 + }, + { + "epoch": 1.1387984610831607, + "grad_norm": 1.4842568635940552, + "learning_rate": 1.4241497656640872e-05, + "loss": 0.2145, + "step": 2405 + }, + { + "epoch": 1.139271973956792, + "grad_norm": 1.6689938306808472, + "learning_rate": 1.4236865812326386e-05, + "loss": 0.2281, + "step": 2406 + }, + { + "epoch": 1.139745486830423, + "grad_norm": 1.186214566230774, + "learning_rate": 1.4232232859900523e-05, + "loss": 0.2296, + "step": 2407 + }, + { + "epoch": 1.1402189997040544, + "grad_norm": 2.036839246749878, + "learning_rate": 1.4227598800574984e-05, + "loss": 0.1975, + "step": 2408 + }, + { + "epoch": 1.1406925125776857, + "grad_norm": 1.4921070337295532, + "learning_rate": 1.4222963635561761e-05, + "loss": 0.2342, + "step": 2409 + }, + { + "epoch": 1.141166025451317, + "grad_norm": 1.0753675699234009, + "learning_rate": 1.4218327366073142e-05, + "loss": 0.2251, + "step": 2410 + }, + { + "epoch": 1.1416395383249482, + "grad_norm": 1.8981343507766724, + "learning_rate": 1.4213689993321693e-05, + "loss": 0.2247, + "step": 2411 + }, + { + "epoch": 1.1421130511985795, + "grad_norm": 1.5621330738067627, + "learning_rate": 1.4209051518520279e-05, + "loss": 0.2106, + "step": 2412 + }, + { + "epoch": 1.1425865640722108, + "grad_norm": 1.145002841949463, + "learning_rate": 1.4204411942882046e-05, + "loss": 0.2496, + "step": 2413 + }, + { + "epoch": 1.1430600769458419, + "grad_norm": 1.893953561782837, + "learning_rate": 1.4199771267620429e-05, + "loss": 0.223, + "step": 2414 + }, + { + "epoch": 1.1435335898194732, + "grad_norm": 2.6021175384521484, + "learning_rate": 1.4195129493949152e-05, + "loss": 0.2389, + "step": 2415 + }, + { + "epoch": 1.1440071026931045, + "grad_norm": 1.2552067041397095, + "learning_rate": 1.4190486623082224e-05, + "loss": 0.2072, + "step": 2416 + }, + { + "epoch": 1.1444806155667357, + "grad_norm": 1.4053682088851929, + "learning_rate": 1.418584265623395e-05, + "loss": 0.2115, + "step": 2417 + }, + { + "epoch": 1.144954128440367, + "grad_norm": 1.3213428258895874, + "learning_rate": 1.418119759461891e-05, + "loss": 0.2304, + "step": 2418 + }, + { + "epoch": 1.145427641313998, + "grad_norm": 1.09917151927948, + "learning_rate": 1.4176551439451976e-05, + "loss": 0.2189, + "step": 2419 + }, + { + "epoch": 1.1459011541876294, + "grad_norm": 3.225097417831421, + "learning_rate": 1.4171904191948306e-05, + "loss": 0.215, + "step": 2420 + }, + { + "epoch": 1.1463746670612607, + "grad_norm": 1.6562659740447998, + "learning_rate": 1.416725585332334e-05, + "loss": 0.2157, + "step": 2421 + }, + { + "epoch": 1.146848179934892, + "grad_norm": 1.3881036043167114, + "learning_rate": 1.4162606424792809e-05, + "loss": 0.2283, + "step": 2422 + }, + { + "epoch": 1.1473216928085233, + "grad_norm": 1.0177631378173828, + "learning_rate": 1.4157955907572722e-05, + "loss": 0.1981, + "step": 2423 + }, + { + "epoch": 1.1477952056821545, + "grad_norm": 1.4494411945343018, + "learning_rate": 1.4153304302879383e-05, + "loss": 0.2367, + "step": 2424 + }, + { + "epoch": 1.1482687185557858, + "grad_norm": 1.3473752737045288, + "learning_rate": 1.4148651611929371e-05, + "loss": 0.236, + "step": 2425 + }, + { + "epoch": 1.148742231429417, + "grad_norm": 1.4328070878982544, + "learning_rate": 1.4143997835939552e-05, + "loss": 0.2182, + "step": 2426 + }, + { + "epoch": 1.1492157443030482, + "grad_norm": 1.770027995109558, + "learning_rate": 1.4139342976127077e-05, + "loss": 0.2246, + "step": 2427 + }, + { + "epoch": 1.1496892571766795, + "grad_norm": 1.1986833810806274, + "learning_rate": 1.413468703370938e-05, + "loss": 0.2309, + "step": 2428 + }, + { + "epoch": 1.1501627700503108, + "grad_norm": 1.74411940574646, + "learning_rate": 1.4130030009904174e-05, + "loss": 0.2027, + "step": 2429 + }, + { + "epoch": 1.150636282923942, + "grad_norm": 1.3250808715820312, + "learning_rate": 1.4125371905929468e-05, + "loss": 0.2219, + "step": 2430 + }, + { + "epoch": 1.1511097957975733, + "grad_norm": 1.336143136024475, + "learning_rate": 1.4120712723003535e-05, + "loss": 0.2279, + "step": 2431 + }, + { + "epoch": 1.1515833086712046, + "grad_norm": 1.1617181301116943, + "learning_rate": 1.4116052462344942e-05, + "loss": 0.2143, + "step": 2432 + }, + { + "epoch": 1.1520568215448357, + "grad_norm": 1.7668710947036743, + "learning_rate": 1.4111391125172537e-05, + "loss": 0.2268, + "step": 2433 + }, + { + "epoch": 1.152530334418467, + "grad_norm": 1.455324649810791, + "learning_rate": 1.4106728712705446e-05, + "loss": 0.2289, + "step": 2434 + }, + { + "epoch": 1.1530038472920983, + "grad_norm": 1.4046434164047241, + "learning_rate": 1.4102065226163078e-05, + "loss": 0.2297, + "step": 2435 + }, + { + "epoch": 1.1534773601657295, + "grad_norm": 1.2692493200302124, + "learning_rate": 1.4097400666765122e-05, + "loss": 0.2339, + "step": 2436 + }, + { + "epoch": 1.1539508730393608, + "grad_norm": 1.7210558652877808, + "learning_rate": 1.4092735035731553e-05, + "loss": 0.2129, + "step": 2437 + }, + { + "epoch": 1.154424385912992, + "grad_norm": 1.2296080589294434, + "learning_rate": 1.4088068334282617e-05, + "loss": 0.2163, + "step": 2438 + }, + { + "epoch": 1.1548978987866232, + "grad_norm": 1.0044761896133423, + "learning_rate": 1.4083400563638847e-05, + "loss": 0.2193, + "step": 2439 + }, + { + "epoch": 1.1553714116602545, + "grad_norm": 2.0132339000701904, + "learning_rate": 1.407873172502105e-05, + "loss": 0.2281, + "step": 2440 + }, + { + "epoch": 1.1558449245338858, + "grad_norm": 2.1369824409484863, + "learning_rate": 1.4074061819650322e-05, + "loss": 0.2122, + "step": 2441 + }, + { + "epoch": 1.156318437407517, + "grad_norm": 1.4052802324295044, + "learning_rate": 1.4069390848748029e-05, + "loss": 0.2225, + "step": 2442 + }, + { + "epoch": 1.1567919502811483, + "grad_norm": 1.6282340288162231, + "learning_rate": 1.4064718813535817e-05, + "loss": 0.2247, + "step": 2443 + }, + { + "epoch": 1.1572654631547796, + "grad_norm": 1.5721560716629028, + "learning_rate": 1.4060045715235609e-05, + "loss": 0.2201, + "step": 2444 + }, + { + "epoch": 1.1577389760284107, + "grad_norm": 1.0424257516860962, + "learning_rate": 1.4055371555069615e-05, + "loss": 0.2363, + "step": 2445 + }, + { + "epoch": 1.158212488902042, + "grad_norm": 1.8262851238250732, + "learning_rate": 1.4050696334260312e-05, + "loss": 0.2222, + "step": 2446 + }, + { + "epoch": 1.1586860017756733, + "grad_norm": 2.008397340774536, + "learning_rate": 1.4046020054030465e-05, + "loss": 0.242, + "step": 2447 + }, + { + "epoch": 1.1591595146493046, + "grad_norm": 3.8094611167907715, + "learning_rate": 1.40413427156031e-05, + "loss": 0.2242, + "step": 2448 + }, + { + "epoch": 1.1596330275229358, + "grad_norm": 1.1817164421081543, + "learning_rate": 1.4036664320201538e-05, + "loss": 0.2312, + "step": 2449 + }, + { + "epoch": 1.160106540396567, + "grad_norm": 1.771918535232544, + "learning_rate": 1.4031984869049366e-05, + "loss": 0.209, + "step": 2450 + }, + { + "epoch": 1.1605800532701982, + "grad_norm": 1.6216905117034912, + "learning_rate": 1.4027304363370446e-05, + "loss": 0.2296, + "step": 2451 + }, + { + "epoch": 1.1610535661438295, + "grad_norm": 1.4540348052978516, + "learning_rate": 1.4022622804388923e-05, + "loss": 0.2135, + "step": 2452 + }, + { + "epoch": 1.1615270790174608, + "grad_norm": 1.7169173955917358, + "learning_rate": 1.4017940193329213e-05, + "loss": 0.2417, + "step": 2453 + }, + { + "epoch": 1.162000591891092, + "grad_norm": 1.2987480163574219, + "learning_rate": 1.401325653141601e-05, + "loss": 0.2275, + "step": 2454 + }, + { + "epoch": 1.1624741047647233, + "grad_norm": 1.303578495979309, + "learning_rate": 1.4008571819874273e-05, + "loss": 0.2323, + "step": 2455 + }, + { + "epoch": 1.1629476176383546, + "grad_norm": 1.2520272731781006, + "learning_rate": 1.4003886059929248e-05, + "loss": 0.2025, + "step": 2456 + }, + { + "epoch": 1.1634211305119857, + "grad_norm": 2.3889143466949463, + "learning_rate": 1.3999199252806452e-05, + "loss": 0.22, + "step": 2457 + }, + { + "epoch": 1.163894643385617, + "grad_norm": 1.60451340675354, + "learning_rate": 1.3994511399731675e-05, + "loss": 0.2413, + "step": 2458 + }, + { + "epoch": 1.1643681562592483, + "grad_norm": 1.227243185043335, + "learning_rate": 1.3989822501930972e-05, + "loss": 0.1986, + "step": 2459 + }, + { + "epoch": 1.1648416691328796, + "grad_norm": 1.0574538707733154, + "learning_rate": 1.3985132560630688e-05, + "loss": 0.2294, + "step": 2460 + }, + { + "epoch": 1.1653151820065109, + "grad_norm": 1.0879645347595215, + "learning_rate": 1.3980441577057426e-05, + "loss": 0.2376, + "step": 2461 + }, + { + "epoch": 1.1657886948801421, + "grad_norm": 2.3020236492156982, + "learning_rate": 1.397574955243807e-05, + "loss": 0.2257, + "step": 2462 + }, + { + "epoch": 1.1662622077537734, + "grad_norm": 1.1904938220977783, + "learning_rate": 1.3971056487999773e-05, + "loss": 0.2329, + "step": 2463 + }, + { + "epoch": 1.1667357206274045, + "grad_norm": 2.3944849967956543, + "learning_rate": 1.3966362384969963e-05, + "loss": 0.2004, + "step": 2464 + }, + { + "epoch": 1.1672092335010358, + "grad_norm": 1.1166476011276245, + "learning_rate": 1.3961667244576335e-05, + "loss": 0.2009, + "step": 2465 + }, + { + "epoch": 1.167682746374667, + "grad_norm": 1.1066415309906006, + "learning_rate": 1.3956971068046855e-05, + "loss": 0.2289, + "step": 2466 + }, + { + "epoch": 1.1681562592482984, + "grad_norm": 1.3403149843215942, + "learning_rate": 1.3952273856609767e-05, + "loss": 0.2034, + "step": 2467 + }, + { + "epoch": 1.1686297721219296, + "grad_norm": 1.3216232061386108, + "learning_rate": 1.3947575611493583e-05, + "loss": 0.2223, + "step": 2468 + }, + { + "epoch": 1.1691032849955607, + "grad_norm": 2.135986328125, + "learning_rate": 1.3942876333927077e-05, + "loss": 0.2079, + "step": 2469 + }, + { + "epoch": 1.169576797869192, + "grad_norm": 1.8347281217575073, + "learning_rate": 1.3938176025139305e-05, + "loss": 0.1986, + "step": 2470 + }, + { + "epoch": 1.1700503107428233, + "grad_norm": 1.1069951057434082, + "learning_rate": 1.3933474686359588e-05, + "loss": 0.2261, + "step": 2471 + }, + { + "epoch": 1.1705238236164546, + "grad_norm": 1.534125804901123, + "learning_rate": 1.3928772318817509e-05, + "loss": 0.23, + "step": 2472 + }, + { + "epoch": 1.1709973364900859, + "grad_norm": 0.9962480664253235, + "learning_rate": 1.3924068923742935e-05, + "loss": 0.2145, + "step": 2473 + }, + { + "epoch": 1.1714708493637171, + "grad_norm": 1.4505356550216675, + "learning_rate": 1.391936450236599e-05, + "loss": 0.2107, + "step": 2474 + }, + { + "epoch": 1.1719443622373484, + "grad_norm": 1.0452574491500854, + "learning_rate": 1.3914659055917068e-05, + "loss": 0.2004, + "step": 2475 + }, + { + "epoch": 1.1724178751109795, + "grad_norm": 1.055141568183899, + "learning_rate": 1.3909952585626836e-05, + "loss": 0.2231, + "step": 2476 + }, + { + "epoch": 1.1728913879846108, + "grad_norm": 1.314717411994934, + "learning_rate": 1.3905245092726225e-05, + "loss": 0.2346, + "step": 2477 + }, + { + "epoch": 1.173364900858242, + "grad_norm": 1.5005841255187988, + "learning_rate": 1.3900536578446437e-05, + "loss": 0.2334, + "step": 2478 + }, + { + "epoch": 1.1738384137318734, + "grad_norm": 1.6286835670471191, + "learning_rate": 1.3895827044018934e-05, + "loss": 0.2517, + "step": 2479 + }, + { + "epoch": 1.1743119266055047, + "grad_norm": 1.1182934045791626, + "learning_rate": 1.3891116490675451e-05, + "loss": 0.2388, + "step": 2480 + }, + { + "epoch": 1.1747854394791357, + "grad_norm": 1.1708893775939941, + "learning_rate": 1.3886404919647988e-05, + "loss": 0.2131, + "step": 2481 + }, + { + "epoch": 1.175258952352767, + "grad_norm": 1.1157783269882202, + "learning_rate": 1.3881692332168815e-05, + "loss": 0.2378, + "step": 2482 + }, + { + "epoch": 1.1757324652263983, + "grad_norm": 1.3915766477584839, + "learning_rate": 1.3876978729470459e-05, + "loss": 0.2189, + "step": 2483 + }, + { + "epoch": 1.1762059781000296, + "grad_norm": 1.132906198501587, + "learning_rate": 1.3872264112785714e-05, + "loss": 0.2082, + "step": 2484 + }, + { + "epoch": 1.1766794909736609, + "grad_norm": 1.12389075756073, + "learning_rate": 1.386754848334765e-05, + "loss": 0.1907, + "step": 2485 + }, + { + "epoch": 1.1771530038472922, + "grad_norm": 1.498472809791565, + "learning_rate": 1.3862831842389591e-05, + "loss": 0.2291, + "step": 2486 + }, + { + "epoch": 1.1776265167209234, + "grad_norm": 1.148629903793335, + "learning_rate": 1.3858114191145126e-05, + "loss": 0.2093, + "step": 2487 + }, + { + "epoch": 1.1781000295945545, + "grad_norm": 1.5816822052001953, + "learning_rate": 1.3853395530848114e-05, + "loss": 0.225, + "step": 2488 + }, + { + "epoch": 1.1785735424681858, + "grad_norm": 1.4523855447769165, + "learning_rate": 1.384867586273268e-05, + "loss": 0.2078, + "step": 2489 + }, + { + "epoch": 1.179047055341817, + "grad_norm": 1.0670571327209473, + "learning_rate": 1.38439551880332e-05, + "loss": 0.2214, + "step": 2490 + }, + { + "epoch": 1.1795205682154484, + "grad_norm": 1.1799019575119019, + "learning_rate": 1.3839233507984324e-05, + "loss": 0.2186, + "step": 2491 + }, + { + "epoch": 1.1799940810890797, + "grad_norm": 1.5703297853469849, + "learning_rate": 1.3834510823820963e-05, + "loss": 0.2402, + "step": 2492 + }, + { + "epoch": 1.180467593962711, + "grad_norm": 1.9375343322753906, + "learning_rate": 1.3829787136778288e-05, + "loss": 0.2241, + "step": 2493 + }, + { + "epoch": 1.1809411068363422, + "grad_norm": 1.1045565605163574, + "learning_rate": 1.382506244809173e-05, + "loss": 0.2247, + "step": 2494 + }, + { + "epoch": 1.1814146197099733, + "grad_norm": 1.2954577207565308, + "learning_rate": 1.3820336758996994e-05, + "loss": 0.2266, + "step": 2495 + }, + { + "epoch": 1.1818881325836046, + "grad_norm": 1.3069260120391846, + "learning_rate": 1.3815610070730032e-05, + "loss": 0.2192, + "step": 2496 + }, + { + "epoch": 1.1823616454572359, + "grad_norm": 0.9491212368011475, + "learning_rate": 1.3810882384527067e-05, + "loss": 0.2241, + "step": 2497 + }, + { + "epoch": 1.1828351583308672, + "grad_norm": 1.5114809274673462, + "learning_rate": 1.3806153701624578e-05, + "loss": 0.2567, + "step": 2498 + }, + { + "epoch": 1.1833086712044985, + "grad_norm": 1.5127360820770264, + "learning_rate": 1.3801424023259308e-05, + "loss": 0.2152, + "step": 2499 + }, + { + "epoch": 1.1837821840781295, + "grad_norm": 1.0083001852035522, + "learning_rate": 1.3796693350668258e-05, + "loss": 0.2371, + "step": 2500 + }, + { + "epoch": 1.1842556969517608, + "grad_norm": 1.140708088874817, + "learning_rate": 1.3791961685088689e-05, + "loss": 0.231, + "step": 2501 + }, + { + "epoch": 1.184729209825392, + "grad_norm": 1.298230528831482, + "learning_rate": 1.3787229027758122e-05, + "loss": 0.2102, + "step": 2502 + }, + { + "epoch": 1.1852027226990234, + "grad_norm": 1.1305428743362427, + "learning_rate": 1.378249537991434e-05, + "loss": 0.2321, + "step": 2503 + }, + { + "epoch": 1.1856762355726547, + "grad_norm": 1.2196705341339111, + "learning_rate": 1.3777760742795381e-05, + "loss": 0.2243, + "step": 2504 + }, + { + "epoch": 1.186149748446286, + "grad_norm": 1.1564288139343262, + "learning_rate": 1.3773025117639543e-05, + "loss": 0.2223, + "step": 2505 + }, + { + "epoch": 1.1866232613199172, + "grad_norm": 1.8472498655319214, + "learning_rate": 1.3768288505685385e-05, + "loss": 0.199, + "step": 2506 + }, + { + "epoch": 1.1870967741935483, + "grad_norm": 1.086516261100769, + "learning_rate": 1.3763550908171724e-05, + "loss": 0.2352, + "step": 2507 + }, + { + "epoch": 1.1875702870671796, + "grad_norm": 1.063267707824707, + "learning_rate": 1.375881232633763e-05, + "loss": 0.2142, + "step": 2508 + }, + { + "epoch": 1.1880437999408109, + "grad_norm": 1.4095113277435303, + "learning_rate": 1.3754072761422434e-05, + "loss": 0.192, + "step": 2509 + }, + { + "epoch": 1.1885173128144422, + "grad_norm": 1.2273898124694824, + "learning_rate": 1.3749332214665723e-05, + "loss": 0.237, + "step": 2510 + }, + { + "epoch": 1.1889908256880735, + "grad_norm": 1.414565920829773, + "learning_rate": 1.374459068730734e-05, + "loss": 0.2165, + "step": 2511 + }, + { + "epoch": 1.1894643385617045, + "grad_norm": 1.405671238899231, + "learning_rate": 1.373984818058739e-05, + "loss": 0.2649, + "step": 2512 + }, + { + "epoch": 1.1899378514353358, + "grad_norm": 1.4026716947555542, + "learning_rate": 1.3735104695746225e-05, + "loss": 0.2252, + "step": 2513 + }, + { + "epoch": 1.190411364308967, + "grad_norm": 1.2324182987213135, + "learning_rate": 1.373036023402446e-05, + "loss": 0.2129, + "step": 2514 + }, + { + "epoch": 1.1908848771825984, + "grad_norm": 1.2538621425628662, + "learning_rate": 1.3725614796662962e-05, + "loss": 0.2241, + "step": 2515 + }, + { + "epoch": 1.1913583900562297, + "grad_norm": 1.2133036851882935, + "learning_rate": 1.372086838490286e-05, + "loss": 0.2213, + "step": 2516 + }, + { + "epoch": 1.191831902929861, + "grad_norm": 1.0370423793792725, + "learning_rate": 1.3716120999985527e-05, + "loss": 0.216, + "step": 2517 + }, + { + "epoch": 1.1923054158034923, + "grad_norm": 1.403160810470581, + "learning_rate": 1.3711372643152597e-05, + "loss": 0.2511, + "step": 2518 + }, + { + "epoch": 1.1927789286771233, + "grad_norm": 1.4306610822677612, + "learning_rate": 1.3706623315645953e-05, + "loss": 0.2194, + "step": 2519 + }, + { + "epoch": 1.1932524415507546, + "grad_norm": 0.9785909056663513, + "learning_rate": 1.370187301870774e-05, + "loss": 0.2138, + "step": 2520 + }, + { + "epoch": 1.193725954424386, + "grad_norm": 1.315924882888794, + "learning_rate": 1.3697121753580353e-05, + "loss": 0.2212, + "step": 2521 + }, + { + "epoch": 1.1941994672980172, + "grad_norm": 1.1268949508666992, + "learning_rate": 1.3692369521506437e-05, + "loss": 0.221, + "step": 2522 + }, + { + "epoch": 1.1946729801716485, + "grad_norm": 2.0519661903381348, + "learning_rate": 1.3687616323728894e-05, + "loss": 0.2417, + "step": 2523 + }, + { + "epoch": 1.1951464930452798, + "grad_norm": 1.7941192388534546, + "learning_rate": 1.3682862161490877e-05, + "loss": 0.2387, + "step": 2524 + }, + { + "epoch": 1.195620005918911, + "grad_norm": 1.0652010440826416, + "learning_rate": 1.367810703603579e-05, + "loss": 0.2142, + "step": 2525 + }, + { + "epoch": 1.196093518792542, + "grad_norm": 1.2762157917022705, + "learning_rate": 1.3673350948607294e-05, + "loss": 0.2268, + "step": 2526 + }, + { + "epoch": 1.1965670316661734, + "grad_norm": 1.4592194557189941, + "learning_rate": 1.3668593900449292e-05, + "loss": 0.2157, + "step": 2527 + }, + { + "epoch": 1.1970405445398047, + "grad_norm": 1.3134995698928833, + "learning_rate": 1.3663835892805947e-05, + "loss": 0.2369, + "step": 2528 + }, + { + "epoch": 1.197514057413436, + "grad_norm": 2.157960891723633, + "learning_rate": 1.365907692692167e-05, + "loss": 0.1831, + "step": 2529 + }, + { + "epoch": 1.1979875702870673, + "grad_norm": 1.0059032440185547, + "learning_rate": 1.3654317004041122e-05, + "loss": 0.2266, + "step": 2530 + }, + { + "epoch": 1.1984610831606983, + "grad_norm": 1.2563735246658325, + "learning_rate": 1.3649556125409218e-05, + "loss": 0.2189, + "step": 2531 + }, + { + "epoch": 1.1989345960343296, + "grad_norm": 0.9727022051811218, + "learning_rate": 1.3644794292271116e-05, + "loss": 0.2255, + "step": 2532 + }, + { + "epoch": 1.199408108907961, + "grad_norm": 0.8149449825286865, + "learning_rate": 1.3640031505872225e-05, + "loss": 0.2023, + "step": 2533 + }, + { + "epoch": 1.1998816217815922, + "grad_norm": 1.0001932382583618, + "learning_rate": 1.3635267767458213e-05, + "loss": 0.2004, + "step": 2534 + }, + { + "epoch": 1.2003551346552235, + "grad_norm": 1.593064546585083, + "learning_rate": 1.3630503078274988e-05, + "loss": 0.2215, + "step": 2535 + }, + { + "epoch": 1.2008286475288548, + "grad_norm": 1.3267443180084229, + "learning_rate": 1.3625737439568705e-05, + "loss": 0.2162, + "step": 2536 + }, + { + "epoch": 1.201302160402486, + "grad_norm": 1.1851780414581299, + "learning_rate": 1.3620970852585774e-05, + "loss": 0.2121, + "step": 2537 + }, + { + "epoch": 1.2017756732761171, + "grad_norm": 0.9846872687339783, + "learning_rate": 1.3616203318572847e-05, + "loss": 0.2268, + "step": 2538 + }, + { + "epoch": 1.2022491861497484, + "grad_norm": 1.268949031829834, + "learning_rate": 1.3611434838776827e-05, + "loss": 0.2392, + "step": 2539 + }, + { + "epoch": 1.2027226990233797, + "grad_norm": 1.274588704109192, + "learning_rate": 1.3606665414444868e-05, + "loss": 0.2315, + "step": 2540 + }, + { + "epoch": 1.203196211897011, + "grad_norm": 2.332651376724243, + "learning_rate": 1.3601895046824363e-05, + "loss": 0.2311, + "step": 2541 + }, + { + "epoch": 1.2036697247706423, + "grad_norm": 2.8526484966278076, + "learning_rate": 1.3597123737162955e-05, + "loss": 0.2059, + "step": 2542 + }, + { + "epoch": 1.2041432376442733, + "grad_norm": 1.6713618040084839, + "learning_rate": 1.359235148670854e-05, + "loss": 0.2153, + "step": 2543 + }, + { + "epoch": 1.2046167505179046, + "grad_norm": 1.6479053497314453, + "learning_rate": 1.3587578296709248e-05, + "loss": 0.2273, + "step": 2544 + }, + { + "epoch": 1.205090263391536, + "grad_norm": 1.376947283744812, + "learning_rate": 1.358280416841346e-05, + "loss": 0.2047, + "step": 2545 + }, + { + "epoch": 1.2055637762651672, + "grad_norm": 1.5050996541976929, + "learning_rate": 1.3578029103069805e-05, + "loss": 0.211, + "step": 2546 + }, + { + "epoch": 1.2060372891387985, + "grad_norm": 1.4506083726882935, + "learning_rate": 1.357325310192716e-05, + "loss": 0.2179, + "step": 2547 + }, + { + "epoch": 1.2065108020124298, + "grad_norm": 1.4101526737213135, + "learning_rate": 1.3568476166234634e-05, + "loss": 0.2305, + "step": 2548 + }, + { + "epoch": 1.206984314886061, + "grad_norm": 2.1561319828033447, + "learning_rate": 1.3563698297241596e-05, + "loss": 0.2227, + "step": 2549 + }, + { + "epoch": 1.2074578277596921, + "grad_norm": 1.0047372579574585, + "learning_rate": 1.3558919496197645e-05, + "loss": 0.2211, + "step": 2550 + }, + { + "epoch": 1.2079313406333234, + "grad_norm": 1.103215217590332, + "learning_rate": 1.3554139764352634e-05, + "loss": 0.2091, + "step": 2551 + }, + { + "epoch": 1.2084048535069547, + "grad_norm": 2.47819185256958, + "learning_rate": 1.3549359102956655e-05, + "loss": 0.231, + "step": 2552 + }, + { + "epoch": 1.208878366380586, + "grad_norm": 0.9107365608215332, + "learning_rate": 1.3544577513260046e-05, + "loss": 0.2048, + "step": 2553 + }, + { + "epoch": 1.2093518792542173, + "grad_norm": 1.520993947982788, + "learning_rate": 1.353979499651338e-05, + "loss": 0.2213, + "step": 2554 + }, + { + "epoch": 1.2098253921278486, + "grad_norm": 0.9904034733772278, + "learning_rate": 1.3535011553967486e-05, + "loss": 0.2281, + "step": 2555 + }, + { + "epoch": 1.2102989050014799, + "grad_norm": 1.3577964305877686, + "learning_rate": 1.3530227186873419e-05, + "loss": 0.2299, + "step": 2556 + }, + { + "epoch": 1.210772417875111, + "grad_norm": 1.2297004461288452, + "learning_rate": 1.352544189648249e-05, + "loss": 0.1907, + "step": 2557 + }, + { + "epoch": 1.2112459307487422, + "grad_norm": 1.2518497705459595, + "learning_rate": 1.3520655684046242e-05, + "loss": 0.2217, + "step": 2558 + }, + { + "epoch": 1.2117194436223735, + "grad_norm": 1.2009005546569824, + "learning_rate": 1.3515868550816467e-05, + "loss": 0.1946, + "step": 2559 + }, + { + "epoch": 1.2121929564960048, + "grad_norm": 1.5080748796463013, + "learning_rate": 1.3511080498045189e-05, + "loss": 0.2365, + "step": 2560 + }, + { + "epoch": 1.212666469369636, + "grad_norm": 1.2145692110061646, + "learning_rate": 1.3506291526984679e-05, + "loss": 0.2444, + "step": 2561 + }, + { + "epoch": 1.2131399822432671, + "grad_norm": 1.701706886291504, + "learning_rate": 1.3501501638887447e-05, + "loss": 0.2327, + "step": 2562 + }, + { + "epoch": 1.2136134951168984, + "grad_norm": 3.05845308303833, + "learning_rate": 1.349671083500624e-05, + "loss": 0.2352, + "step": 2563 + }, + { + "epoch": 1.2140870079905297, + "grad_norm": 1.9371395111083984, + "learning_rate": 1.3491919116594045e-05, + "loss": 0.2098, + "step": 2564 + }, + { + "epoch": 1.214560520864161, + "grad_norm": 1.3178801536560059, + "learning_rate": 1.3487126484904093e-05, + "loss": 0.2176, + "step": 2565 + }, + { + "epoch": 1.2150340337377923, + "grad_norm": 1.2978839874267578, + "learning_rate": 1.3482332941189854e-05, + "loss": 0.2286, + "step": 2566 + }, + { + "epoch": 1.2155075466114236, + "grad_norm": 1.7363958358764648, + "learning_rate": 1.3477538486705025e-05, + "loss": 0.2034, + "step": 2567 + }, + { + "epoch": 1.2159810594850549, + "grad_norm": 2.09183669090271, + "learning_rate": 1.3472743122703552e-05, + "loss": 0.2326, + "step": 2568 + }, + { + "epoch": 1.216454572358686, + "grad_norm": 0.9742308855056763, + "learning_rate": 1.3467946850439622e-05, + "loss": 0.1946, + "step": 2569 + }, + { + "epoch": 1.2169280852323172, + "grad_norm": 1.709175705909729, + "learning_rate": 1.3463149671167646e-05, + "loss": 0.2307, + "step": 2570 + }, + { + "epoch": 1.2174015981059485, + "grad_norm": 1.1259140968322754, + "learning_rate": 1.3458351586142284e-05, + "loss": 0.2385, + "step": 2571 + }, + { + "epoch": 1.2178751109795798, + "grad_norm": 1.0130417346954346, + "learning_rate": 1.3453552596618427e-05, + "loss": 0.2264, + "step": 2572 + }, + { + "epoch": 1.218348623853211, + "grad_norm": 1.774648904800415, + "learning_rate": 1.3448752703851207e-05, + "loss": 0.2396, + "step": 2573 + }, + { + "epoch": 1.2188221367268421, + "grad_norm": 2.2298903465270996, + "learning_rate": 1.3443951909095984e-05, + "loss": 0.2075, + "step": 2574 + }, + { + "epoch": 1.2192956496004734, + "grad_norm": 1.3856366872787476, + "learning_rate": 1.3439150213608367e-05, + "loss": 0.2104, + "step": 2575 + }, + { + "epoch": 1.2197691624741047, + "grad_norm": 1.630552887916565, + "learning_rate": 1.343434761864419e-05, + "loss": 0.2092, + "step": 2576 + }, + { + "epoch": 1.220242675347736, + "grad_norm": 1.0852291584014893, + "learning_rate": 1.3429544125459524e-05, + "loss": 0.2098, + "step": 2577 + }, + { + "epoch": 1.2207161882213673, + "grad_norm": 1.034583568572998, + "learning_rate": 1.342473973531068e-05, + "loss": 0.2085, + "step": 2578 + }, + { + "epoch": 1.2211897010949986, + "grad_norm": 1.044459581375122, + "learning_rate": 1.3419934449454194e-05, + "loss": 0.2173, + "step": 2579 + }, + { + "epoch": 1.2216632139686299, + "grad_norm": 1.3877681493759155, + "learning_rate": 1.3415128269146846e-05, + "loss": 0.2386, + "step": 2580 + }, + { + "epoch": 1.222136726842261, + "grad_norm": 1.530917763710022, + "learning_rate": 1.3410321195645648e-05, + "loss": 0.2139, + "step": 2581 + }, + { + "epoch": 1.2226102397158922, + "grad_norm": 1.024557113647461, + "learning_rate": 1.3405513230207839e-05, + "loss": 0.236, + "step": 2582 + }, + { + "epoch": 1.2230837525895235, + "grad_norm": 1.3001816272735596, + "learning_rate": 1.3400704374090898e-05, + "loss": 0.2451, + "step": 2583 + }, + { + "epoch": 1.2235572654631548, + "grad_norm": 1.623937964439392, + "learning_rate": 1.3395894628552535e-05, + "loss": 0.2361, + "step": 2584 + }, + { + "epoch": 1.224030778336786, + "grad_norm": 1.2357534170150757, + "learning_rate": 1.3391083994850696e-05, + "loss": 0.2363, + "step": 2585 + }, + { + "epoch": 1.2245042912104174, + "grad_norm": 1.3607426881790161, + "learning_rate": 1.3386272474243546e-05, + "loss": 0.2231, + "step": 2586 + }, + { + "epoch": 1.2249778040840484, + "grad_norm": 1.3958909511566162, + "learning_rate": 1.3381460067989505e-05, + "loss": 0.2305, + "step": 2587 + }, + { + "epoch": 1.2254513169576797, + "grad_norm": 1.3651123046875, + "learning_rate": 1.3376646777347202e-05, + "loss": 0.233, + "step": 2588 + }, + { + "epoch": 1.225924829831311, + "grad_norm": 1.5809210538864136, + "learning_rate": 1.3371832603575509e-05, + "loss": 0.2273, + "step": 2589 + }, + { + "epoch": 1.2263983427049423, + "grad_norm": 1.3318142890930176, + "learning_rate": 1.3367017547933529e-05, + "loss": 0.2202, + "step": 2590 + }, + { + "epoch": 1.2268718555785736, + "grad_norm": 2.5720956325531006, + "learning_rate": 1.3362201611680587e-05, + "loss": 0.2073, + "step": 2591 + }, + { + "epoch": 1.2273453684522049, + "grad_norm": 1.4822810888290405, + "learning_rate": 1.3357384796076253e-05, + "loss": 0.2111, + "step": 2592 + }, + { + "epoch": 1.227818881325836, + "grad_norm": 1.8587872982025146, + "learning_rate": 1.3352567102380315e-05, + "loss": 0.2369, + "step": 2593 + }, + { + "epoch": 1.2282923941994672, + "grad_norm": 0.9261167645454407, + "learning_rate": 1.3347748531852791e-05, + "loss": 0.2318, + "step": 2594 + }, + { + "epoch": 1.2287659070730985, + "grad_norm": 1.5869637727737427, + "learning_rate": 1.3342929085753939e-05, + "loss": 0.2472, + "step": 2595 + }, + { + "epoch": 1.2292394199467298, + "grad_norm": 0.9304991364479065, + "learning_rate": 1.3338108765344233e-05, + "loss": 0.2046, + "step": 2596 + }, + { + "epoch": 1.229712932820361, + "grad_norm": 1.8789327144622803, + "learning_rate": 1.3333287571884382e-05, + "loss": 0.2467, + "step": 2597 + }, + { + "epoch": 1.2301864456939924, + "grad_norm": 3.1216373443603516, + "learning_rate": 1.3328465506635324e-05, + "loss": 0.2007, + "step": 2598 + }, + { + "epoch": 1.2306599585676237, + "grad_norm": 1.2716069221496582, + "learning_rate": 1.3323642570858221e-05, + "loss": 0.2083, + "step": 2599 + }, + { + "epoch": 1.2311334714412547, + "grad_norm": 1.1587867736816406, + "learning_rate": 1.331881876581447e-05, + "loss": 0.1932, + "step": 2600 + }, + { + "epoch": 1.231606984314886, + "grad_norm": 1.1460086107254028, + "learning_rate": 1.3313994092765688e-05, + "loss": 0.2137, + "step": 2601 + }, + { + "epoch": 1.2320804971885173, + "grad_norm": 1.066895604133606, + "learning_rate": 1.3309168552973718e-05, + "loss": 0.2257, + "step": 2602 + }, + { + "epoch": 1.2325540100621486, + "grad_norm": 1.1454565525054932, + "learning_rate": 1.3304342147700642e-05, + "loss": 0.2358, + "step": 2603 + }, + { + "epoch": 1.2330275229357799, + "grad_norm": 1.030795693397522, + "learning_rate": 1.3299514878208752e-05, + "loss": 0.2263, + "step": 2604 + }, + { + "epoch": 1.233501035809411, + "grad_norm": 1.6368428468704224, + "learning_rate": 1.3294686745760578e-05, + "loss": 0.2432, + "step": 2605 + }, + { + "epoch": 1.2339745486830422, + "grad_norm": 1.6315193176269531, + "learning_rate": 1.328985775161887e-05, + "loss": 0.2042, + "step": 2606 + }, + { + "epoch": 1.2344480615566735, + "grad_norm": 1.179657220840454, + "learning_rate": 1.3285027897046603e-05, + "loss": 0.2091, + "step": 2607 + }, + { + "epoch": 1.2349215744303048, + "grad_norm": 1.292107105255127, + "learning_rate": 1.3280197183306982e-05, + "loss": 0.2323, + "step": 2608 + }, + { + "epoch": 1.235395087303936, + "grad_norm": 1.0263645648956299, + "learning_rate": 1.3275365611663432e-05, + "loss": 0.2211, + "step": 2609 + }, + { + "epoch": 1.2358686001775674, + "grad_norm": 1.5201125144958496, + "learning_rate": 1.3270533183379604e-05, + "loss": 0.2254, + "step": 2610 + }, + { + "epoch": 1.2363421130511987, + "grad_norm": 1.2638511657714844, + "learning_rate": 1.3265699899719374e-05, + "loss": 0.2249, + "step": 2611 + }, + { + "epoch": 1.2368156259248297, + "grad_norm": 1.429612398147583, + "learning_rate": 1.3260865761946837e-05, + "loss": 0.2316, + "step": 2612 + }, + { + "epoch": 1.237289138798461, + "grad_norm": 1.2139184474945068, + "learning_rate": 1.3256030771326325e-05, + "loss": 0.2282, + "step": 2613 + }, + { + "epoch": 1.2377626516720923, + "grad_norm": 2.151301383972168, + "learning_rate": 1.325119492912237e-05, + "loss": 0.2462, + "step": 2614 + }, + { + "epoch": 1.2382361645457236, + "grad_norm": 1.421278715133667, + "learning_rate": 1.3246358236599747e-05, + "loss": 0.2238, + "step": 2615 + }, + { + "epoch": 1.238709677419355, + "grad_norm": 1.212721824645996, + "learning_rate": 1.3241520695023449e-05, + "loss": 0.2312, + "step": 2616 + }, + { + "epoch": 1.2391831902929862, + "grad_norm": 1.3026739358901978, + "learning_rate": 1.3236682305658682e-05, + "loss": 0.2466, + "step": 2617 + }, + { + "epoch": 1.2396567031666172, + "grad_norm": 1.0858564376831055, + "learning_rate": 1.3231843069770882e-05, + "loss": 0.2156, + "step": 2618 + }, + { + "epoch": 1.2401302160402485, + "grad_norm": 1.2696760892868042, + "learning_rate": 1.3227002988625705e-05, + "loss": 0.2361, + "step": 2619 + }, + { + "epoch": 1.2406037289138798, + "grad_norm": 1.2995176315307617, + "learning_rate": 1.322216206348903e-05, + "loss": 0.2306, + "step": 2620 + }, + { + "epoch": 1.241077241787511, + "grad_norm": 1.480022668838501, + "learning_rate": 1.3217320295626953e-05, + "loss": 0.2266, + "step": 2621 + }, + { + "epoch": 1.2415507546611424, + "grad_norm": 1.1644608974456787, + "learning_rate": 1.3212477686305789e-05, + "loss": 0.2018, + "step": 2622 + }, + { + "epoch": 1.2420242675347737, + "grad_norm": 0.9980354309082031, + "learning_rate": 1.3207634236792077e-05, + "loss": 0.2019, + "step": 2623 + }, + { + "epoch": 1.2424977804084048, + "grad_norm": 1.6177034378051758, + "learning_rate": 1.3202789948352577e-05, + "loss": 0.2148, + "step": 2624 + }, + { + "epoch": 1.242971293282036, + "grad_norm": 1.0906509160995483, + "learning_rate": 1.319794482225426e-05, + "loss": 0.2256, + "step": 2625 + }, + { + "epoch": 1.2434448061556673, + "grad_norm": 1.3726736307144165, + "learning_rate": 1.3193098859764329e-05, + "loss": 0.2089, + "step": 2626 + }, + { + "epoch": 1.2439183190292986, + "grad_norm": 1.480689525604248, + "learning_rate": 1.3188252062150195e-05, + "loss": 0.2264, + "step": 2627 + }, + { + "epoch": 1.24439183190293, + "grad_norm": 1.417734980583191, + "learning_rate": 1.3183404430679492e-05, + "loss": 0.2317, + "step": 2628 + }, + { + "epoch": 1.2448653447765612, + "grad_norm": 1.1104220151901245, + "learning_rate": 1.3178555966620067e-05, + "loss": 0.2314, + "step": 2629 + }, + { + "epoch": 1.2453388576501925, + "grad_norm": 1.307004690170288, + "learning_rate": 1.3173706671239999e-05, + "loss": 0.23, + "step": 2630 + }, + { + "epoch": 1.2458123705238235, + "grad_norm": 1.3435912132263184, + "learning_rate": 1.3168856545807565e-05, + "loss": 0.2255, + "step": 2631 + }, + { + "epoch": 1.2462858833974548, + "grad_norm": 1.3050034046173096, + "learning_rate": 1.3164005591591273e-05, + "loss": 0.2399, + "step": 2632 + }, + { + "epoch": 1.2467593962710861, + "grad_norm": 1.0008726119995117, + "learning_rate": 1.3159153809859842e-05, + "loss": 0.2318, + "step": 2633 + }, + { + "epoch": 1.2472329091447174, + "grad_norm": 1.289227843284607, + "learning_rate": 1.3154301201882209e-05, + "loss": 0.2061, + "step": 2634 + }, + { + "epoch": 1.2477064220183487, + "grad_norm": 1.4629969596862793, + "learning_rate": 1.3149447768927526e-05, + "loss": 0.2023, + "step": 2635 + }, + { + "epoch": 1.2481799348919798, + "grad_norm": 1.1824212074279785, + "learning_rate": 1.3144593512265162e-05, + "loss": 0.2296, + "step": 2636 + }, + { + "epoch": 1.248653447765611, + "grad_norm": 1.5664751529693604, + "learning_rate": 1.3139738433164704e-05, + "loss": 0.2159, + "step": 2637 + }, + { + "epoch": 1.2491269606392423, + "grad_norm": 1.077606201171875, + "learning_rate": 1.3134882532895945e-05, + "loss": 0.2186, + "step": 2638 + }, + { + "epoch": 1.2496004735128736, + "grad_norm": 1.1679469347000122, + "learning_rate": 1.3130025812728904e-05, + "loss": 0.2338, + "step": 2639 + }, + { + "epoch": 1.250073986386505, + "grad_norm": 1.1365487575531006, + "learning_rate": 1.3125168273933811e-05, + "loss": 0.235, + "step": 2640 + }, + { + "epoch": 1.2505474992601362, + "grad_norm": 0.9639025330543518, + "learning_rate": 1.31203099177811e-05, + "loss": 0.2164, + "step": 2641 + }, + { + "epoch": 1.2510210121337675, + "grad_norm": 1.8010168075561523, + "learning_rate": 1.3115450745541434e-05, + "loss": 0.2062, + "step": 2642 + }, + { + "epoch": 1.2514945250073986, + "grad_norm": 1.1365344524383545, + "learning_rate": 1.311059075848568e-05, + "loss": 0.1904, + "step": 2643 + }, + { + "epoch": 1.2519680378810298, + "grad_norm": 2.163564682006836, + "learning_rate": 1.3105729957884923e-05, + "loss": 0.2219, + "step": 2644 + }, + { + "epoch": 1.2524415507546611, + "grad_norm": 1.4166170358657837, + "learning_rate": 1.3100868345010454e-05, + "loss": 0.2137, + "step": 2645 + }, + { + "epoch": 1.2529150636282924, + "grad_norm": 1.539779543876648, + "learning_rate": 1.3096005921133785e-05, + "loss": 0.2185, + "step": 2646 + }, + { + "epoch": 1.2533885765019237, + "grad_norm": 2.17313289642334, + "learning_rate": 1.3091142687526637e-05, + "loss": 0.2425, + "step": 2647 + }, + { + "epoch": 1.2538620893755548, + "grad_norm": 2.488083600997925, + "learning_rate": 1.3086278645460939e-05, + "loss": 0.2248, + "step": 2648 + }, + { + "epoch": 1.2543356022491863, + "grad_norm": 1.1161004304885864, + "learning_rate": 1.3081413796208835e-05, + "loss": 0.2153, + "step": 2649 + }, + { + "epoch": 1.2548091151228173, + "grad_norm": 1.0800951719284058, + "learning_rate": 1.307654814104268e-05, + "loss": 0.2141, + "step": 2650 + }, + { + "epoch": 1.2552826279964486, + "grad_norm": 1.1490174531936646, + "learning_rate": 1.3071681681235039e-05, + "loss": 0.2563, + "step": 2651 + }, + { + "epoch": 1.25575614087008, + "grad_norm": 1.0029891729354858, + "learning_rate": 1.3066814418058685e-05, + "loss": 0.2291, + "step": 2652 + }, + { + "epoch": 1.2562296537437112, + "grad_norm": 1.9102951288223267, + "learning_rate": 1.3061946352786607e-05, + "loss": 0.2197, + "step": 2653 + }, + { + "epoch": 1.2567031666173425, + "grad_norm": 1.408528208732605, + "learning_rate": 1.3057077486692e-05, + "loss": 0.2255, + "step": 2654 + }, + { + "epoch": 1.2571766794909736, + "grad_norm": 1.8932093381881714, + "learning_rate": 1.3052207821048268e-05, + "loss": 0.2058, + "step": 2655 + }, + { + "epoch": 1.2576501923646048, + "grad_norm": 1.3944228887557983, + "learning_rate": 1.3047337357129025e-05, + "loss": 0.2077, + "step": 2656 + }, + { + "epoch": 1.2581237052382361, + "grad_norm": 1.0805628299713135, + "learning_rate": 1.3042466096208099e-05, + "loss": 0.2436, + "step": 2657 + }, + { + "epoch": 1.2585972181118674, + "grad_norm": 1.1540179252624512, + "learning_rate": 1.3037594039559514e-05, + "loss": 0.2202, + "step": 2658 + }, + { + "epoch": 1.2590707309854987, + "grad_norm": 1.449994683265686, + "learning_rate": 1.303272118845751e-05, + "loss": 0.227, + "step": 2659 + }, + { + "epoch": 1.25954424385913, + "grad_norm": 1.6480882167816162, + "learning_rate": 1.3027847544176537e-05, + "loss": 0.206, + "step": 2660 + }, + { + "epoch": 1.2600177567327613, + "grad_norm": 1.056485891342163, + "learning_rate": 1.3022973107991251e-05, + "loss": 0.2193, + "step": 2661 + }, + { + "epoch": 1.2604912696063924, + "grad_norm": 1.3506393432617188, + "learning_rate": 1.301809788117651e-05, + "loss": 0.2331, + "step": 2662 + }, + { + "epoch": 1.2609647824800236, + "grad_norm": 1.8271267414093018, + "learning_rate": 1.3013221865007385e-05, + "loss": 0.2206, + "step": 2663 + }, + { + "epoch": 1.261438295353655, + "grad_norm": 1.0800082683563232, + "learning_rate": 1.3008345060759149e-05, + "loss": 0.2127, + "step": 2664 + }, + { + "epoch": 1.2619118082272862, + "grad_norm": 1.0898518562316895, + "learning_rate": 1.3003467469707287e-05, + "loss": 0.2179, + "step": 2665 + }, + { + "epoch": 1.2623853211009175, + "grad_norm": 1.098704218864441, + "learning_rate": 1.299858909312748e-05, + "loss": 0.2281, + "step": 2666 + }, + { + "epoch": 1.2628588339745486, + "grad_norm": 1.6652623414993286, + "learning_rate": 1.2993709932295628e-05, + "loss": 0.2241, + "step": 2667 + }, + { + "epoch": 1.2633323468481799, + "grad_norm": 1.2261773347854614, + "learning_rate": 1.2988829988487822e-05, + "loss": 0.2483, + "step": 2668 + }, + { + "epoch": 1.2638058597218111, + "grad_norm": 0.9692365527153015, + "learning_rate": 1.2983949262980362e-05, + "loss": 0.2104, + "step": 2669 + }, + { + "epoch": 1.2642793725954424, + "grad_norm": 1.218557357788086, + "learning_rate": 1.2979067757049763e-05, + "loss": 0.241, + "step": 2670 + }, + { + "epoch": 1.2647528854690737, + "grad_norm": 1.0587778091430664, + "learning_rate": 1.297418547197273e-05, + "loss": 0.2213, + "step": 2671 + }, + { + "epoch": 1.265226398342705, + "grad_norm": 2.7871766090393066, + "learning_rate": 1.2969302409026181e-05, + "loss": 0.22, + "step": 2672 + }, + { + "epoch": 1.2656999112163363, + "grad_norm": 1.0976502895355225, + "learning_rate": 1.2964418569487232e-05, + "loss": 0.2031, + "step": 2673 + }, + { + "epoch": 1.2661734240899674, + "grad_norm": 1.201554298400879, + "learning_rate": 1.2959533954633205e-05, + "loss": 0.2362, + "step": 2674 + }, + { + "epoch": 1.2666469369635986, + "grad_norm": 1.15542733669281, + "learning_rate": 1.2954648565741623e-05, + "loss": 0.2212, + "step": 2675 + }, + { + "epoch": 1.26712044983723, + "grad_norm": 1.641326665878296, + "learning_rate": 1.2949762404090214e-05, + "loss": 0.2476, + "step": 2676 + }, + { + "epoch": 1.2675939627108612, + "grad_norm": 1.6406174898147583, + "learning_rate": 1.2944875470956902e-05, + "loss": 0.2303, + "step": 2677 + }, + { + "epoch": 1.2680674755844925, + "grad_norm": 1.359363079071045, + "learning_rate": 1.2939987767619821e-05, + "loss": 0.2267, + "step": 2678 + }, + { + "epoch": 1.2685409884581236, + "grad_norm": 1.6398720741271973, + "learning_rate": 1.2935099295357304e-05, + "loss": 0.2117, + "step": 2679 + }, + { + "epoch": 1.269014501331755, + "grad_norm": 1.0096100568771362, + "learning_rate": 1.2930210055447879e-05, + "loss": 0.2299, + "step": 2680 + }, + { + "epoch": 1.2694880142053862, + "grad_norm": 1.2073965072631836, + "learning_rate": 1.2925320049170282e-05, + "loss": 0.2236, + "step": 2681 + }, + { + "epoch": 1.2699615270790174, + "grad_norm": 2.227213144302368, + "learning_rate": 1.292042927780345e-05, + "loss": 0.2152, + "step": 2682 + }, + { + "epoch": 1.2704350399526487, + "grad_norm": 2.0689918994903564, + "learning_rate": 1.2915537742626512e-05, + "loss": 0.2129, + "step": 2683 + }, + { + "epoch": 1.27090855282628, + "grad_norm": 2.3078198432922363, + "learning_rate": 1.2910645444918809e-05, + "loss": 0.2196, + "step": 2684 + }, + { + "epoch": 1.2713820656999113, + "grad_norm": 1.536636233329773, + "learning_rate": 1.2905752385959863e-05, + "loss": 0.2474, + "step": 2685 + }, + { + "epoch": 1.2718555785735424, + "grad_norm": 1.6504733562469482, + "learning_rate": 1.2900858567029416e-05, + "loss": 0.2419, + "step": 2686 + }, + { + "epoch": 1.2723290914471737, + "grad_norm": 2.3625659942626953, + "learning_rate": 1.2895963989407394e-05, + "loss": 0.2319, + "step": 2687 + }, + { + "epoch": 1.272802604320805, + "grad_norm": 1.3434407711029053, + "learning_rate": 1.2891068654373928e-05, + "loss": 0.2157, + "step": 2688 + }, + { + "epoch": 1.2732761171944362, + "grad_norm": 0.922859251499176, + "learning_rate": 1.2886172563209348e-05, + "loss": 0.1955, + "step": 2689 + }, + { + "epoch": 1.2737496300680675, + "grad_norm": 2.3007314205169678, + "learning_rate": 1.2881275717194175e-05, + "loss": 0.2455, + "step": 2690 + }, + { + "epoch": 1.2742231429416988, + "grad_norm": 2.0500521659851074, + "learning_rate": 1.2876378117609136e-05, + "loss": 0.2188, + "step": 2691 + }, + { + "epoch": 1.27469665581533, + "grad_norm": 1.2446473836898804, + "learning_rate": 1.2871479765735151e-05, + "loss": 0.2171, + "step": 2692 + }, + { + "epoch": 1.2751701686889612, + "grad_norm": 1.333303451538086, + "learning_rate": 1.2866580662853334e-05, + "loss": 0.2597, + "step": 2693 + }, + { + "epoch": 1.2756436815625924, + "grad_norm": 1.047584891319275, + "learning_rate": 1.2861680810244998e-05, + "loss": 0.2296, + "step": 2694 + }, + { + "epoch": 1.2761171944362237, + "grad_norm": 1.1778916120529175, + "learning_rate": 1.2856780209191655e-05, + "loss": 0.2263, + "step": 2695 + }, + { + "epoch": 1.276590707309855, + "grad_norm": 2.103959083557129, + "learning_rate": 1.2851878860975007e-05, + "loss": 0.223, + "step": 2696 + }, + { + "epoch": 1.2770642201834863, + "grad_norm": 2.1192874908447266, + "learning_rate": 1.2846976766876956e-05, + "loss": 0.2031, + "step": 2697 + }, + { + "epoch": 1.2775377330571174, + "grad_norm": 1.320399522781372, + "learning_rate": 1.2842073928179594e-05, + "loss": 0.224, + "step": 2698 + }, + { + "epoch": 1.2780112459307487, + "grad_norm": 2.2415990829467773, + "learning_rate": 1.2837170346165216e-05, + "loss": 0.2253, + "step": 2699 + }, + { + "epoch": 1.27848475880438, + "grad_norm": 1.6020069122314453, + "learning_rate": 1.2832266022116304e-05, + "loss": 0.2348, + "step": 2700 + }, + { + "epoch": 1.2789582716780112, + "grad_norm": 1.4526188373565674, + "learning_rate": 1.2827360957315542e-05, + "loss": 0.2167, + "step": 2701 + }, + { + "epoch": 1.2794317845516425, + "grad_norm": 1.26724374294281, + "learning_rate": 1.2822455153045792e-05, + "loss": 0.2219, + "step": 2702 + }, + { + "epoch": 1.2799052974252738, + "grad_norm": 2.3599040508270264, + "learning_rate": 1.281754861059013e-05, + "loss": 0.2317, + "step": 2703 + }, + { + "epoch": 1.280378810298905, + "grad_norm": 1.8642935752868652, + "learning_rate": 1.2812641331231806e-05, + "loss": 0.2083, + "step": 2704 + }, + { + "epoch": 1.2808523231725362, + "grad_norm": 1.1180553436279297, + "learning_rate": 1.2807733316254275e-05, + "loss": 0.1894, + "step": 2705 + }, + { + "epoch": 1.2813258360461675, + "grad_norm": 1.712058663368225, + "learning_rate": 1.2802824566941186e-05, + "loss": 0.2353, + "step": 2706 + }, + { + "epoch": 1.2817993489197987, + "grad_norm": 1.1192132234573364, + "learning_rate": 1.279791508457637e-05, + "loss": 0.2343, + "step": 2707 + }, + { + "epoch": 1.28227286179343, + "grad_norm": 1.7516214847564697, + "learning_rate": 1.279300487044385e-05, + "loss": 0.2302, + "step": 2708 + }, + { + "epoch": 1.2827463746670613, + "grad_norm": 1.5895251035690308, + "learning_rate": 1.2788093925827858e-05, + "loss": 0.2144, + "step": 2709 + }, + { + "epoch": 1.2832198875406924, + "grad_norm": 1.3747384548187256, + "learning_rate": 1.2783182252012796e-05, + "loss": 0.2168, + "step": 2710 + }, + { + "epoch": 1.283693400414324, + "grad_norm": 1.1060492992401123, + "learning_rate": 1.2778269850283263e-05, + "loss": 0.2158, + "step": 2711 + }, + { + "epoch": 1.284166913287955, + "grad_norm": 0.8876561522483826, + "learning_rate": 1.2773356721924053e-05, + "loss": 0.1834, + "step": 2712 + }, + { + "epoch": 1.2846404261615862, + "grad_norm": 1.2856721878051758, + "learning_rate": 1.276844286822015e-05, + "loss": 0.2477, + "step": 2713 + }, + { + "epoch": 1.2851139390352175, + "grad_norm": 2.298548460006714, + "learning_rate": 1.2763528290456719e-05, + "loss": 0.2009, + "step": 2714 + }, + { + "epoch": 1.2855874519088488, + "grad_norm": 1.167225956916809, + "learning_rate": 1.2758612989919126e-05, + "loss": 0.2077, + "step": 2715 + }, + { + "epoch": 1.28606096478248, + "grad_norm": 1.04922354221344, + "learning_rate": 1.275369696789292e-05, + "loss": 0.2088, + "step": 2716 + }, + { + "epoch": 1.2865344776561112, + "grad_norm": 1.2038118839263916, + "learning_rate": 1.2748780225663835e-05, + "loss": 0.2267, + "step": 2717 + }, + { + "epoch": 1.2870079905297425, + "grad_norm": 1.241967797279358, + "learning_rate": 1.27438627645178e-05, + "loss": 0.2171, + "step": 2718 + }, + { + "epoch": 1.2874815034033738, + "grad_norm": 1.1782163381576538, + "learning_rate": 1.2738944585740933e-05, + "loss": 0.2194, + "step": 2719 + }, + { + "epoch": 1.287955016277005, + "grad_norm": 0.991797685623169, + "learning_rate": 1.2734025690619529e-05, + "loss": 0.2126, + "step": 2720 + }, + { + "epoch": 1.2884285291506363, + "grad_norm": 1.2841802835464478, + "learning_rate": 1.2729106080440081e-05, + "loss": 0.2145, + "step": 2721 + }, + { + "epoch": 1.2889020420242676, + "grad_norm": 1.4147202968597412, + "learning_rate": 1.2724185756489267e-05, + "loss": 0.2368, + "step": 2722 + }, + { + "epoch": 1.289375554897899, + "grad_norm": 1.1640384197235107, + "learning_rate": 1.271926472005395e-05, + "loss": 0.2179, + "step": 2723 + }, + { + "epoch": 1.28984906777153, + "grad_norm": 1.5157102346420288, + "learning_rate": 1.2714342972421177e-05, + "loss": 0.2185, + "step": 2724 + }, + { + "epoch": 1.2903225806451613, + "grad_norm": 1.4633888006210327, + "learning_rate": 1.2709420514878183e-05, + "loss": 0.2216, + "step": 2725 + }, + { + "epoch": 1.2907960935187925, + "grad_norm": 1.7542057037353516, + "learning_rate": 1.2704497348712397e-05, + "loss": 0.2188, + "step": 2726 + }, + { + "epoch": 1.2912696063924238, + "grad_norm": 1.3297491073608398, + "learning_rate": 1.2699573475211415e-05, + "loss": 0.2083, + "step": 2727 + }, + { + "epoch": 1.2917431192660551, + "grad_norm": 1.0260534286499023, + "learning_rate": 1.2694648895663038e-05, + "loss": 0.2324, + "step": 2728 + }, + { + "epoch": 1.2922166321396862, + "grad_norm": 1.0116316080093384, + "learning_rate": 1.2689723611355236e-05, + "loss": 0.2188, + "step": 2729 + }, + { + "epoch": 1.2926901450133175, + "grad_norm": 1.2791035175323486, + "learning_rate": 1.2684797623576173e-05, + "loss": 0.2286, + "step": 2730 + }, + { + "epoch": 1.2931636578869488, + "grad_norm": 2.320425033569336, + "learning_rate": 1.2679870933614189e-05, + "loss": 0.2229, + "step": 2731 + }, + { + "epoch": 1.29363717076058, + "grad_norm": 1.2323137521743774, + "learning_rate": 1.2674943542757819e-05, + "loss": 0.2404, + "step": 2732 + }, + { + "epoch": 1.2941106836342113, + "grad_norm": 1.2795385122299194, + "learning_rate": 1.267001545229577e-05, + "loss": 0.2178, + "step": 2733 + }, + { + "epoch": 1.2945841965078426, + "grad_norm": 1.4273737668991089, + "learning_rate": 1.2665086663516937e-05, + "loss": 0.2046, + "step": 2734 + }, + { + "epoch": 1.295057709381474, + "grad_norm": 1.6153572797775269, + "learning_rate": 1.26601571777104e-05, + "loss": 0.2214, + "step": 2735 + }, + { + "epoch": 1.295531222255105, + "grad_norm": 1.3819822072982788, + "learning_rate": 1.2655226996165415e-05, + "loss": 0.2056, + "step": 2736 + }, + { + "epoch": 1.2960047351287363, + "grad_norm": 1.2994166612625122, + "learning_rate": 1.2650296120171424e-05, + "loss": 0.1976, + "step": 2737 + }, + { + "epoch": 1.2964782480023676, + "grad_norm": 2.0569915771484375, + "learning_rate": 1.2645364551018049e-05, + "loss": 0.2245, + "step": 2738 + }, + { + "epoch": 1.2969517608759988, + "grad_norm": 1.7465271949768066, + "learning_rate": 1.2640432289995097e-05, + "loss": 0.2217, + "step": 2739 + }, + { + "epoch": 1.2974252737496301, + "grad_norm": 1.355583667755127, + "learning_rate": 1.2635499338392554e-05, + "loss": 0.206, + "step": 2740 + }, + { + "epoch": 1.2978987866232612, + "grad_norm": 1.9201940298080444, + "learning_rate": 1.2630565697500583e-05, + "loss": 0.2159, + "step": 2741 + }, + { + "epoch": 1.2983722994968927, + "grad_norm": 2.0526649951934814, + "learning_rate": 1.262563136860953e-05, + "loss": 0.204, + "step": 2742 + }, + { + "epoch": 1.2988458123705238, + "grad_norm": 1.1159857511520386, + "learning_rate": 1.2620696353009925e-05, + "loss": 0.2238, + "step": 2743 + }, + { + "epoch": 1.299319325244155, + "grad_norm": 1.1484869718551636, + "learning_rate": 1.2615760651992469e-05, + "loss": 0.2084, + "step": 2744 + }, + { + "epoch": 1.2997928381177863, + "grad_norm": 2.430534839630127, + "learning_rate": 1.261082426684805e-05, + "loss": 0.2258, + "step": 2745 + }, + { + "epoch": 1.3002663509914176, + "grad_norm": 1.5022352933883667, + "learning_rate": 1.2605887198867732e-05, + "loss": 0.2052, + "step": 2746 + }, + { + "epoch": 1.300739863865049, + "grad_norm": 1.0015721321105957, + "learning_rate": 1.2600949449342754e-05, + "loss": 0.2288, + "step": 2747 + }, + { + "epoch": 1.30121337673868, + "grad_norm": 1.3668203353881836, + "learning_rate": 1.259601101956454e-05, + "loss": 0.2349, + "step": 2748 + }, + { + "epoch": 1.3016868896123113, + "grad_norm": 2.162443161010742, + "learning_rate": 1.259107191082469e-05, + "loss": 0.237, + "step": 2749 + }, + { + "epoch": 1.3021604024859426, + "grad_norm": 1.806618094444275, + "learning_rate": 1.2586132124414978e-05, + "loss": 0.2307, + "step": 2750 + }, + { + "epoch": 1.3026339153595738, + "grad_norm": 1.345781922340393, + "learning_rate": 1.2581191661627355e-05, + "loss": 0.2118, + "step": 2751 + }, + { + "epoch": 1.3031074282332051, + "grad_norm": 1.1584277153015137, + "learning_rate": 1.2576250523753956e-05, + "loss": 0.2287, + "step": 2752 + }, + { + "epoch": 1.3035809411068364, + "grad_norm": 1.5245954990386963, + "learning_rate": 1.257130871208709e-05, + "loss": 0.2424, + "step": 2753 + }, + { + "epoch": 1.3040544539804677, + "grad_norm": 1.4683634042739868, + "learning_rate": 1.2566366227919232e-05, + "loss": 0.2133, + "step": 2754 + }, + { + "epoch": 1.3045279668540988, + "grad_norm": 2.778150796890259, + "learning_rate": 1.2561423072543043e-05, + "loss": 0.2346, + "step": 2755 + }, + { + "epoch": 1.30500147972773, + "grad_norm": 2.2693684101104736, + "learning_rate": 1.2556479247251364e-05, + "loss": 0.221, + "step": 2756 + }, + { + "epoch": 1.3054749926013614, + "grad_norm": 2.705554962158203, + "learning_rate": 1.2551534753337198e-05, + "loss": 0.2178, + "step": 2757 + }, + { + "epoch": 1.3059485054749926, + "grad_norm": 1.8546373844146729, + "learning_rate": 1.2546589592093734e-05, + "loss": 0.2251, + "step": 2758 + }, + { + "epoch": 1.306422018348624, + "grad_norm": 1.1411086320877075, + "learning_rate": 1.2541643764814328e-05, + "loss": 0.2077, + "step": 2759 + }, + { + "epoch": 1.306895531222255, + "grad_norm": 1.2965822219848633, + "learning_rate": 1.2536697272792517e-05, + "loss": 0.2308, + "step": 2760 + }, + { + "epoch": 1.3073690440958863, + "grad_norm": 1.3362536430358887, + "learning_rate": 1.2531750117322004e-05, + "loss": 0.2004, + "step": 2761 + }, + { + "epoch": 1.3078425569695176, + "grad_norm": 1.4326553344726562, + "learning_rate": 1.2526802299696674e-05, + "loss": 0.223, + "step": 2762 + }, + { + "epoch": 1.3083160698431489, + "grad_norm": 2.0484044551849365, + "learning_rate": 1.252185382121058e-05, + "loss": 0.2205, + "step": 2763 + }, + { + "epoch": 1.3087895827167801, + "grad_norm": 1.2032407522201538, + "learning_rate": 1.2516904683157947e-05, + "loss": 0.237, + "step": 2764 + }, + { + "epoch": 1.3092630955904114, + "grad_norm": 1.9354078769683838, + "learning_rate": 1.2511954886833173e-05, + "loss": 0.2233, + "step": 2765 + }, + { + "epoch": 1.3097366084640427, + "grad_norm": 1.3299360275268555, + "learning_rate": 1.2507004433530832e-05, + "loss": 0.2189, + "step": 2766 + }, + { + "epoch": 1.3102101213376738, + "grad_norm": 1.600842833518982, + "learning_rate": 1.2502053324545666e-05, + "loss": 0.2371, + "step": 2767 + }, + { + "epoch": 1.310683634211305, + "grad_norm": 2.048098564147949, + "learning_rate": 1.2497101561172593e-05, + "loss": 0.224, + "step": 2768 + }, + { + "epoch": 1.3111571470849364, + "grad_norm": 1.361019492149353, + "learning_rate": 1.2492149144706696e-05, + "loss": 0.2266, + "step": 2769 + }, + { + "epoch": 1.3116306599585676, + "grad_norm": 2.096219778060913, + "learning_rate": 1.2487196076443233e-05, + "loss": 0.2166, + "step": 2770 + }, + { + "epoch": 1.312104172832199, + "grad_norm": 1.8196600675582886, + "learning_rate": 1.2482242357677631e-05, + "loss": 0.1956, + "step": 2771 + }, + { + "epoch": 1.31257768570583, + "grad_norm": 0.9747202396392822, + "learning_rate": 1.2477287989705487e-05, + "loss": 0.2051, + "step": 2772 + }, + { + "epoch": 1.3130511985794615, + "grad_norm": 1.0282551050186157, + "learning_rate": 1.2472332973822568e-05, + "loss": 0.2144, + "step": 2773 + }, + { + "epoch": 1.3135247114530926, + "grad_norm": 0.8776658773422241, + "learning_rate": 1.2467377311324809e-05, + "loss": 0.2119, + "step": 2774 + }, + { + "epoch": 1.3139982243267239, + "grad_norm": 1.1889859437942505, + "learning_rate": 1.2462421003508318e-05, + "loss": 0.2086, + "step": 2775 + }, + { + "epoch": 1.3144717372003552, + "grad_norm": 2.8763959407806396, + "learning_rate": 1.2457464051669368e-05, + "loss": 0.2256, + "step": 2776 + }, + { + "epoch": 1.3149452500739864, + "grad_norm": 1.6497411727905273, + "learning_rate": 1.2452506457104406e-05, + "loss": 0.2192, + "step": 2777 + }, + { + "epoch": 1.3154187629476177, + "grad_norm": 1.5440013408660889, + "learning_rate": 1.2447548221110037e-05, + "loss": 0.2078, + "step": 2778 + }, + { + "epoch": 1.3158922758212488, + "grad_norm": 1.3232966661453247, + "learning_rate": 1.2442589344983043e-05, + "loss": 0.2108, + "step": 2779 + }, + { + "epoch": 1.31636578869488, + "grad_norm": 0.9742394685745239, + "learning_rate": 1.2437629830020372e-05, + "loss": 0.2167, + "step": 2780 + }, + { + "epoch": 1.3168393015685114, + "grad_norm": 1.2739520072937012, + "learning_rate": 1.2432669677519134e-05, + "loss": 0.2363, + "step": 2781 + }, + { + "epoch": 1.3173128144421427, + "grad_norm": 1.014618158340454, + "learning_rate": 1.2427708888776611e-05, + "loss": 0.2226, + "step": 2782 + }, + { + "epoch": 1.317786327315774, + "grad_norm": 2.0462191104888916, + "learning_rate": 1.2422747465090246e-05, + "loss": 0.2393, + "step": 2783 + }, + { + "epoch": 1.3182598401894052, + "grad_norm": 1.8126726150512695, + "learning_rate": 1.2417785407757657e-05, + "loss": 0.2352, + "step": 2784 + }, + { + "epoch": 1.3187333530630365, + "grad_norm": 1.3824222087860107, + "learning_rate": 1.2412822718076619e-05, + "loss": 0.2277, + "step": 2785 + }, + { + "epoch": 1.3192068659366676, + "grad_norm": 1.3327887058258057, + "learning_rate": 1.2407859397345073e-05, + "loss": 0.2179, + "step": 2786 + }, + { + "epoch": 1.3196803788102989, + "grad_norm": 1.1972007751464844, + "learning_rate": 1.2402895446861131e-05, + "loss": 0.2516, + "step": 2787 + }, + { + "epoch": 1.3201538916839302, + "grad_norm": 1.0115021467208862, + "learning_rate": 1.239793086792307e-05, + "loss": 0.2129, + "step": 2788 + }, + { + "epoch": 1.3206274045575614, + "grad_norm": 2.7716517448425293, + "learning_rate": 1.2392965661829321e-05, + "loss": 0.2244, + "step": 2789 + }, + { + "epoch": 1.3211009174311927, + "grad_norm": 2.3599562644958496, + "learning_rate": 1.238799982987849e-05, + "loss": 0.2132, + "step": 2790 + }, + { + "epoch": 1.3215744303048238, + "grad_norm": 2.0140914916992188, + "learning_rate": 1.238303337336934e-05, + "loss": 0.1975, + "step": 2791 + }, + { + "epoch": 1.322047943178455, + "grad_norm": 1.8659518957138062, + "learning_rate": 1.2378066293600801e-05, + "loss": 0.2089, + "step": 2792 + }, + { + "epoch": 1.3225214560520864, + "grad_norm": 2.2264151573181152, + "learning_rate": 1.2373098591871964e-05, + "loss": 0.1899, + "step": 2793 + }, + { + "epoch": 1.3229949689257177, + "grad_norm": 2.3644020557403564, + "learning_rate": 1.2368130269482084e-05, + "loss": 0.2203, + "step": 2794 + }, + { + "epoch": 1.323468481799349, + "grad_norm": 1.8323869705200195, + "learning_rate": 1.2363161327730577e-05, + "loss": 0.2109, + "step": 2795 + }, + { + "epoch": 1.3239419946729802, + "grad_norm": 1.3173034191131592, + "learning_rate": 1.2358191767917024e-05, + "loss": 0.2372, + "step": 2796 + }, + { + "epoch": 1.3244155075466115, + "grad_norm": 1.2270554304122925, + "learning_rate": 1.2353221591341163e-05, + "loss": 0.211, + "step": 2797 + }, + { + "epoch": 1.3248890204202426, + "grad_norm": 2.137927770614624, + "learning_rate": 1.2348250799302898e-05, + "loss": 0.2115, + "step": 2798 + }, + { + "epoch": 1.3253625332938739, + "grad_norm": 2.619537591934204, + "learning_rate": 1.2343279393102292e-05, + "loss": 0.2316, + "step": 2799 + }, + { + "epoch": 1.3258360461675052, + "grad_norm": 1.1064987182617188, + "learning_rate": 1.2338307374039564e-05, + "loss": 0.2106, + "step": 2800 + }, + { + "epoch": 1.3263095590411365, + "grad_norm": 1.347868800163269, + "learning_rate": 1.2333334743415103e-05, + "loss": 0.2111, + "step": 2801 + }, + { + "epoch": 1.3267830719147677, + "grad_norm": 1.0354684591293335, + "learning_rate": 1.2328361502529444e-05, + "loss": 0.221, + "step": 2802 + }, + { + "epoch": 1.3272565847883988, + "grad_norm": 1.5998934507369995, + "learning_rate": 1.23233876526833e-05, + "loss": 0.2271, + "step": 2803 + }, + { + "epoch": 1.3277300976620303, + "grad_norm": 1.4590028524398804, + "learning_rate": 1.231841319517753e-05, + "loss": 0.2234, + "step": 2804 + }, + { + "epoch": 1.3282036105356614, + "grad_norm": 2.0347890853881836, + "learning_rate": 1.2313438131313155e-05, + "loss": 0.2375, + "step": 2805 + }, + { + "epoch": 1.3286771234092927, + "grad_norm": 1.5571720600128174, + "learning_rate": 1.2308462462391356e-05, + "loss": 0.2286, + "step": 2806 + }, + { + "epoch": 1.329150636282924, + "grad_norm": 1.1657594442367554, + "learning_rate": 1.2303486189713466e-05, + "loss": 0.2066, + "step": 2807 + }, + { + "epoch": 1.3296241491565552, + "grad_norm": 1.2159301042556763, + "learning_rate": 1.2298509314580986e-05, + "loss": 0.2515, + "step": 2808 + }, + { + "epoch": 1.3300976620301865, + "grad_norm": 2.0197834968566895, + "learning_rate": 1.2293531838295572e-05, + "loss": 0.2153, + "step": 2809 + }, + { + "epoch": 1.3305711749038176, + "grad_norm": 1.6019450426101685, + "learning_rate": 1.228855376215903e-05, + "loss": 0.2245, + "step": 2810 + }, + { + "epoch": 1.3310446877774489, + "grad_norm": 1.1272090673446655, + "learning_rate": 1.228357508747333e-05, + "loss": 0.2135, + "step": 2811 + }, + { + "epoch": 1.3315182006510802, + "grad_norm": 1.550205111503601, + "learning_rate": 1.2278595815540595e-05, + "loss": 0.2498, + "step": 2812 + }, + { + "epoch": 1.3319917135247115, + "grad_norm": 1.4885501861572266, + "learning_rate": 1.2273615947663107e-05, + "loss": 0.2371, + "step": 2813 + }, + { + "epoch": 1.3324652263983428, + "grad_norm": 1.3265790939331055, + "learning_rate": 1.2268635485143303e-05, + "loss": 0.2216, + "step": 2814 + }, + { + "epoch": 1.332938739271974, + "grad_norm": 1.121368646621704, + "learning_rate": 1.2263654429283774e-05, + "loss": 0.2311, + "step": 2815 + }, + { + "epoch": 1.3334122521456053, + "grad_norm": 1.9394561052322388, + "learning_rate": 1.2258672781387267e-05, + "loss": 0.2228, + "step": 2816 + }, + { + "epoch": 1.3338857650192364, + "grad_norm": 1.1712671518325806, + "learning_rate": 1.2253690542756682e-05, + "loss": 0.2071, + "step": 2817 + }, + { + "epoch": 1.3343592778928677, + "grad_norm": 1.167645812034607, + "learning_rate": 1.2248707714695077e-05, + "loss": 0.2191, + "step": 2818 + }, + { + "epoch": 1.334832790766499, + "grad_norm": 1.0730623006820679, + "learning_rate": 1.2243724298505664e-05, + "loss": 0.2258, + "step": 2819 + }, + { + "epoch": 1.3353063036401303, + "grad_norm": 1.0903040170669556, + "learning_rate": 1.2238740295491804e-05, + "loss": 0.2109, + "step": 2820 + }, + { + "epoch": 1.3357798165137615, + "grad_norm": 0.9441567063331604, + "learning_rate": 1.223375570695702e-05, + "loss": 0.2366, + "step": 2821 + }, + { + "epoch": 1.3362533293873926, + "grad_norm": 0.9626446962356567, + "learning_rate": 1.222877053420498e-05, + "loss": 0.1965, + "step": 2822 + }, + { + "epoch": 1.336726842261024, + "grad_norm": 1.2279541492462158, + "learning_rate": 1.2223784778539508e-05, + "loss": 0.2155, + "step": 2823 + }, + { + "epoch": 1.3372003551346552, + "grad_norm": 1.0313448905944824, + "learning_rate": 1.2218798441264579e-05, + "loss": 0.2165, + "step": 2824 + }, + { + "epoch": 1.3376738680082865, + "grad_norm": 1.0775771141052246, + "learning_rate": 1.2213811523684325e-05, + "loss": 0.2141, + "step": 2825 + }, + { + "epoch": 1.3381473808819178, + "grad_norm": 0.9873069524765015, + "learning_rate": 1.2208824027103021e-05, + "loss": 0.1984, + "step": 2826 + }, + { + "epoch": 1.338620893755549, + "grad_norm": 1.219462513923645, + "learning_rate": 1.2203835952825105e-05, + "loss": 0.2372, + "step": 2827 + }, + { + "epoch": 1.3390944066291803, + "grad_norm": 1.2963061332702637, + "learning_rate": 1.2198847302155154e-05, + "loss": 0.2206, + "step": 2828 + }, + { + "epoch": 1.3395679195028114, + "grad_norm": 1.0308513641357422, + "learning_rate": 1.2193858076397905e-05, + "loss": 0.2288, + "step": 2829 + }, + { + "epoch": 1.3400414323764427, + "grad_norm": 1.5890069007873535, + "learning_rate": 1.2188868276858238e-05, + "loss": 0.2171, + "step": 2830 + }, + { + "epoch": 1.340514945250074, + "grad_norm": 2.1345863342285156, + "learning_rate": 1.2183877904841193e-05, + "loss": 0.2138, + "step": 2831 + }, + { + "epoch": 1.3409884581237053, + "grad_norm": 1.1476088762283325, + "learning_rate": 1.217888696165195e-05, + "loss": 0.2144, + "step": 2832 + }, + { + "epoch": 1.3414619709973366, + "grad_norm": 1.109824776649475, + "learning_rate": 1.2173895448595842e-05, + "loss": 0.2018, + "step": 2833 + }, + { + "epoch": 1.3419354838709676, + "grad_norm": 1.1557180881500244, + "learning_rate": 1.216890336697835e-05, + "loss": 0.2178, + "step": 2834 + }, + { + "epoch": 1.3424089967445991, + "grad_norm": 1.1168832778930664, + "learning_rate": 1.2163910718105108e-05, + "loss": 0.2174, + "step": 2835 + }, + { + "epoch": 1.3428825096182302, + "grad_norm": 1.25518000125885, + "learning_rate": 1.2158917503281891e-05, + "loss": 0.2305, + "step": 2836 + }, + { + "epoch": 1.3433560224918615, + "grad_norm": 1.8499336242675781, + "learning_rate": 1.215392372381463e-05, + "loss": 0.202, + "step": 2837 + }, + { + "epoch": 1.3438295353654928, + "grad_norm": 1.5814791917800903, + "learning_rate": 1.2148929381009398e-05, + "loss": 0.2027, + "step": 2838 + }, + { + "epoch": 1.344303048239124, + "grad_norm": 1.3987447023391724, + "learning_rate": 1.2143934476172416e-05, + "loss": 0.2271, + "step": 2839 + }, + { + "epoch": 1.3447765611127553, + "grad_norm": 1.075582504272461, + "learning_rate": 1.2138939010610055e-05, + "loss": 0.1957, + "step": 2840 + }, + { + "epoch": 1.3452500739863864, + "grad_norm": 1.5337646007537842, + "learning_rate": 1.2133942985628833e-05, + "loss": 0.2177, + "step": 2841 + }, + { + "epoch": 1.3457235868600177, + "grad_norm": 2.3917672634124756, + "learning_rate": 1.2128946402535409e-05, + "loss": 0.2185, + "step": 2842 + }, + { + "epoch": 1.346197099733649, + "grad_norm": 2.155311346054077, + "learning_rate": 1.2123949262636592e-05, + "loss": 0.221, + "step": 2843 + }, + { + "epoch": 1.3466706126072803, + "grad_norm": 0.9766375422477722, + "learning_rate": 1.2118951567239331e-05, + "loss": 0.2271, + "step": 2844 + }, + { + "epoch": 1.3471441254809116, + "grad_norm": 1.0675561428070068, + "learning_rate": 1.2113953317650733e-05, + "loss": 0.2214, + "step": 2845 + }, + { + "epoch": 1.3476176383545428, + "grad_norm": 1.6465510129928589, + "learning_rate": 1.2108954515178037e-05, + "loss": 0.2025, + "step": 2846 + }, + { + "epoch": 1.3480911512281741, + "grad_norm": 1.7982463836669922, + "learning_rate": 1.2103955161128635e-05, + "loss": 0.203, + "step": 2847 + }, + { + "epoch": 1.3485646641018052, + "grad_norm": 1.2066468000411987, + "learning_rate": 1.2098955256810057e-05, + "loss": 0.215, + "step": 2848 + }, + { + "epoch": 1.3490381769754365, + "grad_norm": 1.1641322374343872, + "learning_rate": 1.2093954803529981e-05, + "loss": 0.2276, + "step": 2849 + }, + { + "epoch": 1.3495116898490678, + "grad_norm": 1.1573890447616577, + "learning_rate": 1.2088953802596229e-05, + "loss": 0.2076, + "step": 2850 + }, + { + "epoch": 1.349985202722699, + "grad_norm": 0.9064158201217651, + "learning_rate": 1.208395225531676e-05, + "loss": 0.2124, + "step": 2851 + }, + { + "epoch": 1.3504587155963304, + "grad_norm": 1.221864104270935, + "learning_rate": 1.207895016299968e-05, + "loss": 0.203, + "step": 2852 + }, + { + "epoch": 1.3509322284699614, + "grad_norm": 1.3113806247711182, + "learning_rate": 1.2073947526953245e-05, + "loss": 0.2201, + "step": 2853 + }, + { + "epoch": 1.3514057413435927, + "grad_norm": 1.3473477363586426, + "learning_rate": 1.206894434848584e-05, + "loss": 0.2346, + "step": 2854 + }, + { + "epoch": 1.351879254217224, + "grad_norm": 1.0617995262145996, + "learning_rate": 1.2063940628906001e-05, + "loss": 0.2009, + "step": 2855 + }, + { + "epoch": 1.3523527670908553, + "grad_norm": 1.075543761253357, + "learning_rate": 1.20589363695224e-05, + "loss": 0.2371, + "step": 2856 + }, + { + "epoch": 1.3528262799644866, + "grad_norm": 1.068983554840088, + "learning_rate": 1.2053931571643857e-05, + "loss": 0.2256, + "step": 2857 + }, + { + "epoch": 1.3532997928381179, + "grad_norm": 0.9813727736473083, + "learning_rate": 1.2048926236579326e-05, + "loss": 0.2171, + "step": 2858 + }, + { + "epoch": 1.3537733057117491, + "grad_norm": 1.6709295511245728, + "learning_rate": 1.2043920365637904e-05, + "loss": 0.2091, + "step": 2859 + }, + { + "epoch": 1.3542468185853802, + "grad_norm": 0.9895414710044861, + "learning_rate": 1.2038913960128828e-05, + "loss": 0.2049, + "step": 2860 + }, + { + "epoch": 1.3547203314590115, + "grad_norm": 1.3172953128814697, + "learning_rate": 1.2033907021361476e-05, + "loss": 0.2308, + "step": 2861 + }, + { + "epoch": 1.3551938443326428, + "grad_norm": 1.719438076019287, + "learning_rate": 1.2028899550645362e-05, + "loss": 0.2239, + "step": 2862 + }, + { + "epoch": 1.355667357206274, + "grad_norm": 1.0997658967971802, + "learning_rate": 1.2023891549290143e-05, + "loss": 0.2123, + "step": 2863 + }, + { + "epoch": 1.3561408700799054, + "grad_norm": 1.2592804431915283, + "learning_rate": 1.2018883018605614e-05, + "loss": 0.2157, + "step": 2864 + }, + { + "epoch": 1.3566143829535364, + "grad_norm": 1.2449421882629395, + "learning_rate": 1.201387395990171e-05, + "loss": 0.2011, + "step": 2865 + }, + { + "epoch": 1.357087895827168, + "grad_norm": 1.2251133918762207, + "learning_rate": 1.20088643744885e-05, + "loss": 0.2227, + "step": 2866 + }, + { + "epoch": 1.357561408700799, + "grad_norm": 1.0823248624801636, + "learning_rate": 1.2003854263676196e-05, + "loss": 0.2384, + "step": 2867 + }, + { + "epoch": 1.3580349215744303, + "grad_norm": 1.333550214767456, + "learning_rate": 1.1998843628775136e-05, + "loss": 0.2299, + "step": 2868 + }, + { + "epoch": 1.3585084344480616, + "grad_norm": 1.7682206630706787, + "learning_rate": 1.199383247109581e-05, + "loss": 0.2277, + "step": 2869 + }, + { + "epoch": 1.3589819473216929, + "grad_norm": 1.3045170307159424, + "learning_rate": 1.1988820791948834e-05, + "loss": 0.2093, + "step": 2870 + }, + { + "epoch": 1.3594554601953242, + "grad_norm": 2.6720004081726074, + "learning_rate": 1.1983808592644967e-05, + "loss": 0.2414, + "step": 2871 + }, + { + "epoch": 1.3599289730689552, + "grad_norm": 0.9892565608024597, + "learning_rate": 1.1978795874495103e-05, + "loss": 0.206, + "step": 2872 + }, + { + "epoch": 1.3604024859425865, + "grad_norm": 1.4673343896865845, + "learning_rate": 1.1973782638810264e-05, + "loss": 0.2143, + "step": 2873 + }, + { + "epoch": 1.3608759988162178, + "grad_norm": 1.2298039197921753, + "learning_rate": 1.1968768886901621e-05, + "loss": 0.227, + "step": 2874 + }, + { + "epoch": 1.361349511689849, + "grad_norm": 1.48917555809021, + "learning_rate": 1.1963754620080467e-05, + "loss": 0.2148, + "step": 2875 + }, + { + "epoch": 1.3618230245634804, + "grad_norm": 1.349953532218933, + "learning_rate": 1.1958739839658238e-05, + "loss": 0.1988, + "step": 2876 + }, + { + "epoch": 1.3622965374371117, + "grad_norm": 1.3024402856826782, + "learning_rate": 1.1953724546946502e-05, + "loss": 0.2332, + "step": 2877 + }, + { + "epoch": 1.362770050310743, + "grad_norm": 1.4080761671066284, + "learning_rate": 1.1948708743256954e-05, + "loss": 0.221, + "step": 2878 + }, + { + "epoch": 1.363243563184374, + "grad_norm": 1.1572239398956299, + "learning_rate": 1.1943692429901437e-05, + "loss": 0.2094, + "step": 2879 + }, + { + "epoch": 1.3637170760580053, + "grad_norm": 1.7980104684829712, + "learning_rate": 1.1938675608191914e-05, + "loss": 0.2282, + "step": 2880 + }, + { + "epoch": 1.3641905889316366, + "grad_norm": 1.5248477458953857, + "learning_rate": 1.193365827944049e-05, + "loss": 0.2235, + "step": 2881 + }, + { + "epoch": 1.3646641018052679, + "grad_norm": 1.2481404542922974, + "learning_rate": 1.1928640444959396e-05, + "loss": 0.2013, + "step": 2882 + }, + { + "epoch": 1.3651376146788992, + "grad_norm": 2.0264244079589844, + "learning_rate": 1.1923622106060997e-05, + "loss": 0.2341, + "step": 2883 + }, + { + "epoch": 1.3656111275525302, + "grad_norm": 1.1340547800064087, + "learning_rate": 1.19186032640578e-05, + "loss": 0.2295, + "step": 2884 + }, + { + "epoch": 1.3660846404261615, + "grad_norm": 1.4470584392547607, + "learning_rate": 1.1913583920262424e-05, + "loss": 0.2207, + "step": 2885 + }, + { + "epoch": 1.3665581532997928, + "grad_norm": 1.103227138519287, + "learning_rate": 1.1908564075987637e-05, + "loss": 0.2065, + "step": 2886 + }, + { + "epoch": 1.367031666173424, + "grad_norm": 1.558107852935791, + "learning_rate": 1.1903543732546326e-05, + "loss": 0.2203, + "step": 2887 + }, + { + "epoch": 1.3675051790470554, + "grad_norm": 1.080611228942871, + "learning_rate": 1.1898522891251516e-05, + "loss": 0.2328, + "step": 2888 + }, + { + "epoch": 1.3679786919206867, + "grad_norm": 1.09610915184021, + "learning_rate": 1.1893501553416357e-05, + "loss": 0.2292, + "step": 2889 + }, + { + "epoch": 1.368452204794318, + "grad_norm": 1.4019349813461304, + "learning_rate": 1.1888479720354138e-05, + "loss": 0.2118, + "step": 2890 + }, + { + "epoch": 1.368925717667949, + "grad_norm": 1.5220277309417725, + "learning_rate": 1.1883457393378263e-05, + "loss": 0.2051, + "step": 2891 + }, + { + "epoch": 1.3693992305415803, + "grad_norm": 0.9200198650360107, + "learning_rate": 1.187843457380228e-05, + "loss": 0.214, + "step": 2892 + }, + { + "epoch": 1.3698727434152116, + "grad_norm": 1.2134068012237549, + "learning_rate": 1.1873411262939854e-05, + "loss": 0.2068, + "step": 2893 + }, + { + "epoch": 1.3703462562888429, + "grad_norm": 1.125510334968567, + "learning_rate": 1.1868387462104787e-05, + "loss": 0.2247, + "step": 2894 + }, + { + "epoch": 1.3708197691624742, + "grad_norm": 0.9510089159011841, + "learning_rate": 1.1863363172611003e-05, + "loss": 0.2077, + "step": 2895 + }, + { + "epoch": 1.3712932820361052, + "grad_norm": 1.2528704404830933, + "learning_rate": 1.1858338395772555e-05, + "loss": 0.234, + "step": 2896 + }, + { + "epoch": 1.3717667949097367, + "grad_norm": 1.718252182006836, + "learning_rate": 1.185331313290363e-05, + "loss": 0.2241, + "step": 2897 + }, + { + "epoch": 1.3722403077833678, + "grad_norm": 1.3889955282211304, + "learning_rate": 1.1848287385318533e-05, + "loss": 0.239, + "step": 2898 + }, + { + "epoch": 1.372713820656999, + "grad_norm": 1.056844711303711, + "learning_rate": 1.1843261154331702e-05, + "loss": 0.1979, + "step": 2899 + }, + { + "epoch": 1.3731873335306304, + "grad_norm": 1.1087300777435303, + "learning_rate": 1.1838234441257698e-05, + "loss": 0.2094, + "step": 2900 + }, + { + "epoch": 1.3736608464042617, + "grad_norm": 1.014634609222412, + "learning_rate": 1.1833207247411208e-05, + "loss": 0.2052, + "step": 2901 + }, + { + "epoch": 1.374134359277893, + "grad_norm": 1.0202566385269165, + "learning_rate": 1.182817957410705e-05, + "loss": 0.2064, + "step": 2902 + }, + { + "epoch": 1.374607872151524, + "grad_norm": 1.1275465488433838, + "learning_rate": 1.1823151422660162e-05, + "loss": 0.2161, + "step": 2903 + }, + { + "epoch": 1.3750813850251553, + "grad_norm": 1.6918755769729614, + "learning_rate": 1.1818122794385604e-05, + "loss": 0.2127, + "step": 2904 + }, + { + "epoch": 1.3755548978987866, + "grad_norm": 1.5377037525177002, + "learning_rate": 1.1813093690598572e-05, + "loss": 0.1938, + "step": 2905 + }, + { + "epoch": 1.3760284107724179, + "grad_norm": 1.1492862701416016, + "learning_rate": 1.1808064112614375e-05, + "loss": 0.2127, + "step": 2906 + }, + { + "epoch": 1.3765019236460492, + "grad_norm": 1.8122957944869995, + "learning_rate": 1.1803034061748453e-05, + "loss": 0.1997, + "step": 2907 + }, + { + "epoch": 1.3769754365196802, + "grad_norm": 1.2996703386306763, + "learning_rate": 1.1798003539316365e-05, + "loss": 0.226, + "step": 2908 + }, + { + "epoch": 1.3774489493933118, + "grad_norm": 1.1555538177490234, + "learning_rate": 1.1792972546633799e-05, + "loss": 0.2417, + "step": 2909 + }, + { + "epoch": 1.3779224622669428, + "grad_norm": 1.0537813901901245, + "learning_rate": 1.178794108501656e-05, + "loss": 0.221, + "step": 2910 + }, + { + "epoch": 1.378395975140574, + "grad_norm": 1.2504982948303223, + "learning_rate": 1.1782909155780577e-05, + "loss": 0.207, + "step": 2911 + }, + { + "epoch": 1.3788694880142054, + "grad_norm": 1.0087714195251465, + "learning_rate": 1.1777876760241907e-05, + "loss": 0.2108, + "step": 2912 + }, + { + "epoch": 1.3793430008878367, + "grad_norm": 0.9194630980491638, + "learning_rate": 1.1772843899716719e-05, + "loss": 0.2024, + "step": 2913 + }, + { + "epoch": 1.379816513761468, + "grad_norm": 1.1061056852340698, + "learning_rate": 1.1767810575521312e-05, + "loss": 0.2317, + "step": 2914 + }, + { + "epoch": 1.380290026635099, + "grad_norm": 1.6991933584213257, + "learning_rate": 1.1762776788972106e-05, + "loss": 0.2297, + "step": 2915 + }, + { + "epoch": 1.3807635395087303, + "grad_norm": 1.4516056776046753, + "learning_rate": 1.1757742541385636e-05, + "loss": 0.2268, + "step": 2916 + }, + { + "epoch": 1.3812370523823616, + "grad_norm": 1.5717034339904785, + "learning_rate": 1.1752707834078558e-05, + "loss": 0.2298, + "step": 2917 + }, + { + "epoch": 1.381710565255993, + "grad_norm": 1.2218979597091675, + "learning_rate": 1.1747672668367659e-05, + "loss": 0.2222, + "step": 2918 + }, + { + "epoch": 1.3821840781296242, + "grad_norm": 1.1709834337234497, + "learning_rate": 1.1742637045569832e-05, + "loss": 0.2199, + "step": 2919 + }, + { + "epoch": 1.3826575910032555, + "grad_norm": 1.1610695123672485, + "learning_rate": 1.1737600967002095e-05, + "loss": 0.2203, + "step": 2920 + }, + { + "epoch": 1.3831311038768868, + "grad_norm": 1.1131560802459717, + "learning_rate": 1.1732564433981594e-05, + "loss": 0.205, + "step": 2921 + }, + { + "epoch": 1.3836046167505178, + "grad_norm": 1.2609494924545288, + "learning_rate": 1.1727527447825575e-05, + "loss": 0.2129, + "step": 2922 + }, + { + "epoch": 1.3840781296241491, + "grad_norm": 1.1561344861984253, + "learning_rate": 1.1722490009851418e-05, + "loss": 0.2299, + "step": 2923 + }, + { + "epoch": 1.3845516424977804, + "grad_norm": 1.0187231302261353, + "learning_rate": 1.1717452121376616e-05, + "loss": 0.1932, + "step": 2924 + }, + { + "epoch": 1.3850251553714117, + "grad_norm": 1.7099744081497192, + "learning_rate": 1.1712413783718782e-05, + "loss": 0.2273, + "step": 2925 + }, + { + "epoch": 1.385498668245043, + "grad_norm": 1.1517730951309204, + "learning_rate": 1.1707374998195643e-05, + "loss": 0.2322, + "step": 2926 + }, + { + "epoch": 1.385972181118674, + "grad_norm": 1.6895732879638672, + "learning_rate": 1.1702335766125042e-05, + "loss": 0.2198, + "step": 2927 + }, + { + "epoch": 1.3864456939923056, + "grad_norm": 1.1857640743255615, + "learning_rate": 1.1697296088824945e-05, + "loss": 0.2639, + "step": 2928 + }, + { + "epoch": 1.3869192068659366, + "grad_norm": 1.5664935111999512, + "learning_rate": 1.1692255967613432e-05, + "loss": 0.2146, + "step": 2929 + }, + { + "epoch": 1.387392719739568, + "grad_norm": 1.2540571689605713, + "learning_rate": 1.1687215403808697e-05, + "loss": 0.2288, + "step": 2930 + }, + { + "epoch": 1.3878662326131992, + "grad_norm": 1.223510980606079, + "learning_rate": 1.1682174398729044e-05, + "loss": 0.2205, + "step": 2931 + }, + { + "epoch": 1.3883397454868305, + "grad_norm": 0.9939408302307129, + "learning_rate": 1.1677132953692911e-05, + "loss": 0.2321, + "step": 2932 + }, + { + "epoch": 1.3888132583604618, + "grad_norm": 1.3630565404891968, + "learning_rate": 1.1672091070018832e-05, + "loss": 0.2274, + "step": 2933 + }, + { + "epoch": 1.3892867712340928, + "grad_norm": 1.4759294986724854, + "learning_rate": 1.1667048749025462e-05, + "loss": 0.2319, + "step": 2934 + }, + { + "epoch": 1.3897602841077241, + "grad_norm": 0.9733860492706299, + "learning_rate": 1.1662005992031577e-05, + "loss": 0.2087, + "step": 2935 + }, + { + "epoch": 1.3902337969813554, + "grad_norm": 1.1344149112701416, + "learning_rate": 1.1656962800356058e-05, + "loss": 0.2226, + "step": 2936 + }, + { + "epoch": 1.3907073098549867, + "grad_norm": 1.0230250358581543, + "learning_rate": 1.1651919175317903e-05, + "loss": 0.2076, + "step": 2937 + }, + { + "epoch": 1.391180822728618, + "grad_norm": 1.2815759181976318, + "learning_rate": 1.1646875118236225e-05, + "loss": 0.209, + "step": 2938 + }, + { + "epoch": 1.391654335602249, + "grad_norm": 1.4215916395187378, + "learning_rate": 1.1641830630430246e-05, + "loss": 0.2402, + "step": 2939 + }, + { + "epoch": 1.3921278484758806, + "grad_norm": 1.8291083574295044, + "learning_rate": 1.1636785713219305e-05, + "loss": 0.2072, + "step": 2940 + }, + { + "epoch": 1.3926013613495116, + "grad_norm": 1.3542451858520508, + "learning_rate": 1.163174036792285e-05, + "loss": 0.2279, + "step": 2941 + }, + { + "epoch": 1.393074874223143, + "grad_norm": 1.9127904176712036, + "learning_rate": 1.1626694595860443e-05, + "loss": 0.2264, + "step": 2942 + }, + { + "epoch": 1.3935483870967742, + "grad_norm": 1.4430345296859741, + "learning_rate": 1.1621648398351762e-05, + "loss": 0.2149, + "step": 2943 + }, + { + "epoch": 1.3940218999704055, + "grad_norm": 2.1805191040039062, + "learning_rate": 1.1616601776716583e-05, + "loss": 0.2259, + "step": 2944 + }, + { + "epoch": 1.3944954128440368, + "grad_norm": 1.4834105968475342, + "learning_rate": 1.1611554732274806e-05, + "loss": 0.2054, + "step": 2945 + }, + { + "epoch": 1.3949689257176678, + "grad_norm": 1.532042145729065, + "learning_rate": 1.1606507266346436e-05, + "loss": 0.2169, + "step": 2946 + }, + { + "epoch": 1.3954424385912991, + "grad_norm": 1.2459840774536133, + "learning_rate": 1.160145938025159e-05, + "loss": 0.2091, + "step": 2947 + }, + { + "epoch": 1.3959159514649304, + "grad_norm": 1.190146565437317, + "learning_rate": 1.159641107531049e-05, + "loss": 0.1977, + "step": 2948 + }, + { + "epoch": 1.3963894643385617, + "grad_norm": 1.033531665802002, + "learning_rate": 1.1591362352843477e-05, + "loss": 0.1989, + "step": 2949 + }, + { + "epoch": 1.396862977212193, + "grad_norm": 1.877530813217163, + "learning_rate": 1.158631321417099e-05, + "loss": 0.2075, + "step": 2950 + }, + { + "epoch": 1.3973364900858243, + "grad_norm": 1.1932871341705322, + "learning_rate": 1.1581263660613585e-05, + "loss": 0.2094, + "step": 2951 + }, + { + "epoch": 1.3978100029594556, + "grad_norm": 1.5696194171905518, + "learning_rate": 1.1576213693491925e-05, + "loss": 0.207, + "step": 2952 + }, + { + "epoch": 1.3982835158330866, + "grad_norm": 1.6218510866165161, + "learning_rate": 1.157116331412678e-05, + "loss": 0.1988, + "step": 2953 + }, + { + "epoch": 1.398757028706718, + "grad_norm": 1.69428551197052, + "learning_rate": 1.1566112523839028e-05, + "loss": 0.2118, + "step": 2954 + }, + { + "epoch": 1.3992305415803492, + "grad_norm": 0.9720800518989563, + "learning_rate": 1.1561061323949652e-05, + "loss": 0.1867, + "step": 2955 + }, + { + "epoch": 1.3997040544539805, + "grad_norm": 1.5280438661575317, + "learning_rate": 1.155600971577975e-05, + "loss": 0.2288, + "step": 2956 + }, + { + "epoch": 1.4001775673276118, + "grad_norm": 1.5449702739715576, + "learning_rate": 1.1550957700650517e-05, + "loss": 0.2242, + "step": 2957 + }, + { + "epoch": 1.4006510802012428, + "grad_norm": 1.8118822574615479, + "learning_rate": 1.1545905279883258e-05, + "loss": 0.2122, + "step": 2958 + }, + { + "epoch": 1.4011245930748744, + "grad_norm": 1.7148773670196533, + "learning_rate": 1.1540852454799388e-05, + "loss": 0.2237, + "step": 2959 + }, + { + "epoch": 1.4015981059485054, + "grad_norm": 1.2663835287094116, + "learning_rate": 1.1535799226720421e-05, + "loss": 0.2229, + "step": 2960 + }, + { + "epoch": 1.4020716188221367, + "grad_norm": 1.3801897764205933, + "learning_rate": 1.1530745596967982e-05, + "loss": 0.2263, + "step": 2961 + }, + { + "epoch": 1.402545131695768, + "grad_norm": 1.4142658710479736, + "learning_rate": 1.1525691566863802e-05, + "loss": 0.2169, + "step": 2962 + }, + { + "epoch": 1.4030186445693993, + "grad_norm": 0.985097348690033, + "learning_rate": 1.1520637137729711e-05, + "loss": 0.2025, + "step": 2963 + }, + { + "epoch": 1.4034921574430306, + "grad_norm": 1.7682098150253296, + "learning_rate": 1.1515582310887647e-05, + "loss": 0.2098, + "step": 2964 + }, + { + "epoch": 1.4039656703166616, + "grad_norm": 1.2826495170593262, + "learning_rate": 1.1510527087659648e-05, + "loss": 0.2185, + "step": 2965 + }, + { + "epoch": 1.404439183190293, + "grad_norm": 0.9184038639068604, + "learning_rate": 1.1505471469367864e-05, + "loss": 0.2079, + "step": 2966 + }, + { + "epoch": 1.4049126960639242, + "grad_norm": 1.2887505292892456, + "learning_rate": 1.1500415457334539e-05, + "loss": 0.1951, + "step": 2967 + }, + { + "epoch": 1.4053862089375555, + "grad_norm": 1.2137596607208252, + "learning_rate": 1.1495359052882028e-05, + "loss": 0.2093, + "step": 2968 + }, + { + "epoch": 1.4058597218111868, + "grad_norm": 1.2313615083694458, + "learning_rate": 1.1490302257332781e-05, + "loss": 0.2015, + "step": 2969 + }, + { + "epoch": 1.4063332346848179, + "grad_norm": 1.5044950246810913, + "learning_rate": 1.1485245072009357e-05, + "loss": 0.2187, + "step": 2970 + }, + { + "epoch": 1.4068067475584494, + "grad_norm": 1.2070937156677246, + "learning_rate": 1.1480187498234412e-05, + "loss": 0.2067, + "step": 2971 + }, + { + "epoch": 1.4072802604320804, + "grad_norm": 1.3706823587417603, + "learning_rate": 1.1475129537330707e-05, + "loss": 0.2081, + "step": 2972 + }, + { + "epoch": 1.4077537733057117, + "grad_norm": 1.1687662601470947, + "learning_rate": 1.1470071190621103e-05, + "loss": 0.2081, + "step": 2973 + }, + { + "epoch": 1.408227286179343, + "grad_norm": 1.102828025817871, + "learning_rate": 1.1465012459428562e-05, + "loss": 0.2129, + "step": 2974 + }, + { + "epoch": 1.4087007990529743, + "grad_norm": 1.8313281536102295, + "learning_rate": 1.1459953345076142e-05, + "loss": 0.2111, + "step": 2975 + }, + { + "epoch": 1.4091743119266056, + "grad_norm": 1.3744657039642334, + "learning_rate": 1.1454893848887013e-05, + "loss": 0.2099, + "step": 2976 + }, + { + "epoch": 1.4096478248002366, + "grad_norm": 0.9316054582595825, + "learning_rate": 1.144983397218443e-05, + "loss": 0.2132, + "step": 2977 + }, + { + "epoch": 1.410121337673868, + "grad_norm": 1.1082512140274048, + "learning_rate": 1.1444773716291759e-05, + "loss": 0.1941, + "step": 2978 + }, + { + "epoch": 1.4105948505474992, + "grad_norm": 1.3401765823364258, + "learning_rate": 1.143971308253246e-05, + "loss": 0.2226, + "step": 2979 + }, + { + "epoch": 1.4110683634211305, + "grad_norm": 1.1040364503860474, + "learning_rate": 1.1434652072230092e-05, + "loss": 0.1832, + "step": 2980 + }, + { + "epoch": 1.4115418762947618, + "grad_norm": 1.007563829421997, + "learning_rate": 1.142959068670832e-05, + "loss": 0.2122, + "step": 2981 + }, + { + "epoch": 1.412015389168393, + "grad_norm": 1.306701421737671, + "learning_rate": 1.1424528927290892e-05, + "loss": 0.2162, + "step": 2982 + }, + { + "epoch": 1.4124889020420244, + "grad_norm": 0.9834753274917603, + "learning_rate": 1.1419466795301665e-05, + "loss": 0.201, + "step": 2983 + }, + { + "epoch": 1.4129624149156554, + "grad_norm": 1.4253736734390259, + "learning_rate": 1.1414404292064593e-05, + "loss": 0.2153, + "step": 2984 + }, + { + "epoch": 1.4134359277892867, + "grad_norm": 1.133834719657898, + "learning_rate": 1.1409341418903725e-05, + "loss": 0.2283, + "step": 2985 + }, + { + "epoch": 1.413909440662918, + "grad_norm": 1.4796968698501587, + "learning_rate": 1.1404278177143202e-05, + "loss": 0.2251, + "step": 2986 + }, + { + "epoch": 1.4143829535365493, + "grad_norm": 1.3336002826690674, + "learning_rate": 1.139921456810727e-05, + "loss": 0.206, + "step": 2987 + }, + { + "epoch": 1.4148564664101806, + "grad_norm": 1.420716404914856, + "learning_rate": 1.1394150593120268e-05, + "loss": 0.2226, + "step": 2988 + }, + { + "epoch": 1.4153299792838117, + "grad_norm": 1.5567362308502197, + "learning_rate": 1.1389086253506626e-05, + "loss": 0.2219, + "step": 2989 + }, + { + "epoch": 1.4158034921574432, + "grad_norm": 0.9303296804428101, + "learning_rate": 1.1384021550590878e-05, + "loss": 0.2277, + "step": 2990 + }, + { + "epoch": 1.4162770050310742, + "grad_norm": 1.0575077533721924, + "learning_rate": 1.1378956485697644e-05, + "loss": 0.2224, + "step": 2991 + }, + { + "epoch": 1.4167505179047055, + "grad_norm": 1.0305496454238892, + "learning_rate": 1.1373891060151643e-05, + "loss": 0.2341, + "step": 2992 + }, + { + "epoch": 1.4172240307783368, + "grad_norm": 1.0244580507278442, + "learning_rate": 1.1368825275277689e-05, + "loss": 0.2011, + "step": 2993 + }, + { + "epoch": 1.417697543651968, + "grad_norm": 1.31196928024292, + "learning_rate": 1.136375913240069e-05, + "loss": 0.2253, + "step": 2994 + }, + { + "epoch": 1.4181710565255994, + "grad_norm": 1.118857502937317, + "learning_rate": 1.1358692632845645e-05, + "loss": 0.2278, + "step": 2995 + }, + { + "epoch": 1.4186445693992304, + "grad_norm": 1.114423155784607, + "learning_rate": 1.1353625777937652e-05, + "loss": 0.2004, + "step": 2996 + }, + { + "epoch": 1.4191180822728617, + "grad_norm": 1.2260422706604004, + "learning_rate": 1.1348558569001896e-05, + "loss": 0.2056, + "step": 2997 + }, + { + "epoch": 1.419591595146493, + "grad_norm": 1.8472795486450195, + "learning_rate": 1.1343491007363652e-05, + "loss": 0.2181, + "step": 2998 + }, + { + "epoch": 1.4200651080201243, + "grad_norm": 1.8274016380310059, + "learning_rate": 1.1338423094348299e-05, + "loss": 0.2138, + "step": 2999 + }, + { + "epoch": 1.4205386208937556, + "grad_norm": 1.1858888864517212, + "learning_rate": 1.1333354831281295e-05, + "loss": 0.2022, + "step": 3000 + }, + { + "epoch": 1.4210121337673867, + "grad_norm": 1.1788923740386963, + "learning_rate": 1.13282862194882e-05, + "loss": 0.1814, + "step": 3001 + }, + { + "epoch": 1.4214856466410182, + "grad_norm": 1.1308039426803589, + "learning_rate": 1.132321726029466e-05, + "loss": 0.2167, + "step": 3002 + }, + { + "epoch": 1.4219591595146492, + "grad_norm": 1.0506905317306519, + "learning_rate": 1.1318147955026408e-05, + "loss": 0.2011, + "step": 3003 + }, + { + "epoch": 1.4224326723882805, + "grad_norm": 1.2325364351272583, + "learning_rate": 1.1313078305009278e-05, + "loss": 0.1922, + "step": 3004 + }, + { + "epoch": 1.4229061852619118, + "grad_norm": 1.2299723625183105, + "learning_rate": 1.130800831156918e-05, + "loss": 0.2292, + "step": 3005 + }, + { + "epoch": 1.423379698135543, + "grad_norm": 1.5796177387237549, + "learning_rate": 1.1302937976032131e-05, + "loss": 0.2311, + "step": 3006 + }, + { + "epoch": 1.4238532110091744, + "grad_norm": 1.015859603881836, + "learning_rate": 1.1297867299724223e-05, + "loss": 0.2398, + "step": 3007 + }, + { + "epoch": 1.4243267238828055, + "grad_norm": 1.0847722291946411, + "learning_rate": 1.1292796283971645e-05, + "loss": 0.2105, + "step": 3008 + }, + { + "epoch": 1.4248002367564367, + "grad_norm": 1.5696632862091064, + "learning_rate": 1.1287724930100668e-05, + "loss": 0.22, + "step": 3009 + }, + { + "epoch": 1.425273749630068, + "grad_norm": 1.0364569425582886, + "learning_rate": 1.128265323943766e-05, + "loss": 0.2086, + "step": 3010 + }, + { + "epoch": 1.4257472625036993, + "grad_norm": 1.0130901336669922, + "learning_rate": 1.127758121330907e-05, + "loss": 0.1888, + "step": 3011 + }, + { + "epoch": 1.4262207753773306, + "grad_norm": 1.5478230714797974, + "learning_rate": 1.127250885304144e-05, + "loss": 0.2325, + "step": 3012 + }, + { + "epoch": 1.426694288250962, + "grad_norm": 1.2723689079284668, + "learning_rate": 1.1267436159961393e-05, + "loss": 0.2187, + "step": 3013 + }, + { + "epoch": 1.4271678011245932, + "grad_norm": 1.111171007156372, + "learning_rate": 1.1262363135395648e-05, + "loss": 0.215, + "step": 3014 + }, + { + "epoch": 1.4276413139982242, + "grad_norm": 1.3690720796585083, + "learning_rate": 1.1257289780671004e-05, + "loss": 0.2206, + "step": 3015 + }, + { + "epoch": 1.4281148268718555, + "grad_norm": 1.2229105234146118, + "learning_rate": 1.1252216097114344e-05, + "loss": 0.2297, + "step": 3016 + }, + { + "epoch": 1.4285883397454868, + "grad_norm": 1.8208054304122925, + "learning_rate": 1.124714208605265e-05, + "loss": 0.2429, + "step": 3017 + }, + { + "epoch": 1.4290618526191181, + "grad_norm": 1.015411615371704, + "learning_rate": 1.1242067748812968e-05, + "loss": 0.2145, + "step": 3018 + }, + { + "epoch": 1.4295353654927494, + "grad_norm": 1.8651050329208374, + "learning_rate": 1.123699308672245e-05, + "loss": 0.2222, + "step": 3019 + }, + { + "epoch": 1.4300088783663805, + "grad_norm": 2.2374584674835205, + "learning_rate": 1.1231918101108325e-05, + "loss": 0.2141, + "step": 3020 + }, + { + "epoch": 1.430482391240012, + "grad_norm": 1.1194039583206177, + "learning_rate": 1.1226842793297904e-05, + "loss": 0.212, + "step": 3021 + }, + { + "epoch": 1.430955904113643, + "grad_norm": 1.2247196435928345, + "learning_rate": 1.122176716461859e-05, + "loss": 0.2164, + "step": 3022 + }, + { + "epoch": 1.4314294169872743, + "grad_norm": 1.0762709379196167, + "learning_rate": 1.1216691216397854e-05, + "loss": 0.2328, + "step": 3023 + }, + { + "epoch": 1.4319029298609056, + "grad_norm": 1.0583688020706177, + "learning_rate": 1.1211614949963274e-05, + "loss": 0.1974, + "step": 3024 + }, + { + "epoch": 1.432376442734537, + "grad_norm": 1.0842574834823608, + "learning_rate": 1.1206538366642491e-05, + "loss": 0.1934, + "step": 3025 + }, + { + "epoch": 1.4328499556081682, + "grad_norm": 1.6864724159240723, + "learning_rate": 1.1201461467763238e-05, + "loss": 0.227, + "step": 3026 + }, + { + "epoch": 1.4333234684817993, + "grad_norm": 1.0597320795059204, + "learning_rate": 1.1196384254653328e-05, + "loss": 0.2115, + "step": 3027 + }, + { + "epoch": 1.4337969813554305, + "grad_norm": 1.4735287427902222, + "learning_rate": 1.1191306728640659e-05, + "loss": 0.2357, + "step": 3028 + }, + { + "epoch": 1.4342704942290618, + "grad_norm": 1.1243826150894165, + "learning_rate": 1.1186228891053205e-05, + "loss": 0.2247, + "step": 3029 + }, + { + "epoch": 1.4347440071026931, + "grad_norm": 1.8924311399459839, + "learning_rate": 1.1181150743219031e-05, + "loss": 0.2168, + "step": 3030 + }, + { + "epoch": 1.4352175199763244, + "grad_norm": 1.0397160053253174, + "learning_rate": 1.1176072286466273e-05, + "loss": 0.2292, + "step": 3031 + }, + { + "epoch": 1.4356910328499555, + "grad_norm": 1.515010118484497, + "learning_rate": 1.1170993522123155e-05, + "loss": 0.236, + "step": 3032 + }, + { + "epoch": 1.436164545723587, + "grad_norm": 1.094772458076477, + "learning_rate": 1.1165914451517978e-05, + "loss": 0.1897, + "step": 3033 + }, + { + "epoch": 1.436638058597218, + "grad_norm": 1.71537446975708, + "learning_rate": 1.1160835075979124e-05, + "loss": 0.1892, + "step": 3034 + }, + { + "epoch": 1.4371115714708493, + "grad_norm": 1.3985260725021362, + "learning_rate": 1.1155755396835057e-05, + "loss": 0.2327, + "step": 3035 + }, + { + "epoch": 1.4375850843444806, + "grad_norm": 0.8228690028190613, + "learning_rate": 1.1150675415414314e-05, + "loss": 0.2021, + "step": 3036 + }, + { + "epoch": 1.438058597218112, + "grad_norm": 0.931536078453064, + "learning_rate": 1.1145595133045517e-05, + "loss": 0.2083, + "step": 3037 + }, + { + "epoch": 1.4385321100917432, + "grad_norm": 1.1511605978012085, + "learning_rate": 1.1140514551057361e-05, + "loss": 0.201, + "step": 3038 + }, + { + "epoch": 1.4390056229653743, + "grad_norm": 1.2221344709396362, + "learning_rate": 1.113543367077863e-05, + "loss": 0.2049, + "step": 3039 + }, + { + "epoch": 1.4394791358390056, + "grad_norm": 1.1700915098190308, + "learning_rate": 1.1130352493538176e-05, + "loss": 0.208, + "step": 3040 + }, + { + "epoch": 1.4399526487126368, + "grad_norm": 0.9786930084228516, + "learning_rate": 1.1125271020664931e-05, + "loss": 0.2006, + "step": 3041 + }, + { + "epoch": 1.4404261615862681, + "grad_norm": 1.0784828662872314, + "learning_rate": 1.1120189253487912e-05, + "loss": 0.2204, + "step": 3042 + }, + { + "epoch": 1.4408996744598994, + "grad_norm": 1.8664531707763672, + "learning_rate": 1.1115107193336194e-05, + "loss": 0.1987, + "step": 3043 + }, + { + "epoch": 1.4413731873335307, + "grad_norm": 1.3436121940612793, + "learning_rate": 1.111002484153895e-05, + "loss": 0.2301, + "step": 3044 + }, + { + "epoch": 1.441846700207162, + "grad_norm": 1.8887630701065063, + "learning_rate": 1.1104942199425418e-05, + "loss": 0.2189, + "step": 3045 + }, + { + "epoch": 1.442320213080793, + "grad_norm": 1.4706549644470215, + "learning_rate": 1.1099859268324911e-05, + "loss": 0.2382, + "step": 3046 + }, + { + "epoch": 1.4427937259544243, + "grad_norm": 1.3858911991119385, + "learning_rate": 1.1094776049566822e-05, + "loss": 0.2085, + "step": 3047 + }, + { + "epoch": 1.4432672388280556, + "grad_norm": 1.5077974796295166, + "learning_rate": 1.1089692544480622e-05, + "loss": 0.2127, + "step": 3048 + }, + { + "epoch": 1.443740751701687, + "grad_norm": 1.5535578727722168, + "learning_rate": 1.1084608754395846e-05, + "loss": 0.2164, + "step": 3049 + }, + { + "epoch": 1.4442142645753182, + "grad_norm": 1.3303495645523071, + "learning_rate": 1.1079524680642115e-05, + "loss": 0.2167, + "step": 3050 + }, + { + "epoch": 1.4446877774489493, + "grad_norm": 1.0496814250946045, + "learning_rate": 1.1074440324549118e-05, + "loss": 0.2073, + "step": 3051 + }, + { + "epoch": 1.4451612903225808, + "grad_norm": 2.0565896034240723, + "learning_rate": 1.1069355687446624e-05, + "loss": 0.2019, + "step": 3052 + }, + { + "epoch": 1.4456348031962118, + "grad_norm": 1.2639986276626587, + "learning_rate": 1.1064270770664463e-05, + "loss": 0.2108, + "step": 3053 + }, + { + "epoch": 1.4461083160698431, + "grad_norm": 1.359134554862976, + "learning_rate": 1.1059185575532548e-05, + "loss": 0.2302, + "step": 3054 + }, + { + "epoch": 1.4465818289434744, + "grad_norm": 1.306850552558899, + "learning_rate": 1.1054100103380862e-05, + "loss": 0.228, + "step": 3055 + }, + { + "epoch": 1.4470553418171057, + "grad_norm": 1.1254212856292725, + "learning_rate": 1.1049014355539466e-05, + "loss": 0.2271, + "step": 3056 + }, + { + "epoch": 1.447528854690737, + "grad_norm": 1.1044193506240845, + "learning_rate": 1.1043928333338486e-05, + "loss": 0.2116, + "step": 3057 + }, + { + "epoch": 1.448002367564368, + "grad_norm": 1.277251958847046, + "learning_rate": 1.1038842038108117e-05, + "loss": 0.2246, + "step": 3058 + }, + { + "epoch": 1.4484758804379994, + "grad_norm": 1.157020092010498, + "learning_rate": 1.1033755471178638e-05, + "loss": 0.2379, + "step": 3059 + }, + { + "epoch": 1.4489493933116306, + "grad_norm": 1.0917078256607056, + "learning_rate": 1.102866863388039e-05, + "loss": 0.2357, + "step": 3060 + }, + { + "epoch": 1.449422906185262, + "grad_norm": 1.5090773105621338, + "learning_rate": 1.1023581527543784e-05, + "loss": 0.2085, + "step": 3061 + }, + { + "epoch": 1.4498964190588932, + "grad_norm": 1.000868320465088, + "learning_rate": 1.1018494153499302e-05, + "loss": 0.2221, + "step": 3062 + }, + { + "epoch": 1.4503699319325243, + "grad_norm": 1.2444463968276978, + "learning_rate": 1.10134065130775e-05, + "loss": 0.2355, + "step": 3063 + }, + { + "epoch": 1.4508434448061558, + "grad_norm": 1.5682604312896729, + "learning_rate": 1.1008318607609e-05, + "loss": 0.2147, + "step": 3064 + }, + { + "epoch": 1.4513169576797869, + "grad_norm": 1.108665108680725, + "learning_rate": 1.1003230438424498e-05, + "loss": 0.205, + "step": 3065 + }, + { + "epoch": 1.4517904705534181, + "grad_norm": 1.2267842292785645, + "learning_rate": 1.0998142006854754e-05, + "loss": 0.2354, + "step": 3066 + }, + { + "epoch": 1.4522639834270494, + "grad_norm": 1.2378571033477783, + "learning_rate": 1.0993053314230593e-05, + "loss": 0.2122, + "step": 3067 + }, + { + "epoch": 1.4527374963006807, + "grad_norm": 1.5879398584365845, + "learning_rate": 1.0987964361882921e-05, + "loss": 0.2425, + "step": 3068 + }, + { + "epoch": 1.453211009174312, + "grad_norm": 0.9649191498756409, + "learning_rate": 1.0982875151142702e-05, + "loss": 0.2217, + "step": 3069 + }, + { + "epoch": 1.453684522047943, + "grad_norm": 1.1818419694900513, + "learning_rate": 1.097778568334097e-05, + "loss": 0.1961, + "step": 3070 + }, + { + "epoch": 1.4541580349215744, + "grad_norm": 1.4016741514205933, + "learning_rate": 1.0972695959808822e-05, + "loss": 0.1939, + "step": 3071 + }, + { + "epoch": 1.4546315477952056, + "grad_norm": 1.1574840545654297, + "learning_rate": 1.096760598187743e-05, + "loss": 0.2055, + "step": 3072 + }, + { + "epoch": 1.455105060668837, + "grad_norm": 2.2353291511535645, + "learning_rate": 1.096251575087803e-05, + "loss": 0.2041, + "step": 3073 + }, + { + "epoch": 1.4555785735424682, + "grad_norm": 1.6579928398132324, + "learning_rate": 1.095742526814192e-05, + "loss": 0.1982, + "step": 3074 + }, + { + "epoch": 1.4560520864160995, + "grad_norm": 1.2612777948379517, + "learning_rate": 1.0952334535000466e-05, + "loss": 0.2086, + "step": 3075 + }, + { + "epoch": 1.4565255992897308, + "grad_norm": 1.2440173625946045, + "learning_rate": 1.0947243552785103e-05, + "loss": 0.2239, + "step": 3076 + }, + { + "epoch": 1.4569991121633619, + "grad_norm": 1.0874110460281372, + "learning_rate": 1.0942152322827326e-05, + "loss": 0.2099, + "step": 3077 + }, + { + "epoch": 1.4574726250369932, + "grad_norm": 1.3692322969436646, + "learning_rate": 1.0937060846458701e-05, + "loss": 0.2069, + "step": 3078 + }, + { + "epoch": 1.4579461379106244, + "grad_norm": 1.1790695190429688, + "learning_rate": 1.0931969125010848e-05, + "loss": 0.2467, + "step": 3079 + }, + { + "epoch": 1.4584196507842557, + "grad_norm": 1.1012235879898071, + "learning_rate": 1.0926877159815463e-05, + "loss": 0.2214, + "step": 3080 + }, + { + "epoch": 1.458893163657887, + "grad_norm": 1.178139328956604, + "learning_rate": 1.0921784952204299e-05, + "loss": 0.2138, + "step": 3081 + }, + { + "epoch": 1.459366676531518, + "grad_norm": 1.5324974060058594, + "learning_rate": 1.0916692503509172e-05, + "loss": 0.234, + "step": 3082 + }, + { + "epoch": 1.4598401894051496, + "grad_norm": 1.391147255897522, + "learning_rate": 1.0911599815061966e-05, + "loss": 0.2091, + "step": 3083 + }, + { + "epoch": 1.4603137022787807, + "grad_norm": 1.076026439666748, + "learning_rate": 1.0906506888194621e-05, + "loss": 0.2073, + "step": 3084 + }, + { + "epoch": 1.460787215152412, + "grad_norm": 1.4098458290100098, + "learning_rate": 1.0901413724239144e-05, + "loss": 0.2358, + "step": 3085 + }, + { + "epoch": 1.4612607280260432, + "grad_norm": 1.8523685932159424, + "learning_rate": 1.0896320324527606e-05, + "loss": 0.2471, + "step": 3086 + }, + { + "epoch": 1.4617342408996745, + "grad_norm": 1.161373496055603, + "learning_rate": 1.0891226690392136e-05, + "loss": 0.2168, + "step": 3087 + }, + { + "epoch": 1.4622077537733058, + "grad_norm": 1.116976261138916, + "learning_rate": 1.0886132823164919e-05, + "loss": 0.2141, + "step": 3088 + }, + { + "epoch": 1.4626812666469369, + "grad_norm": 0.9809169769287109, + "learning_rate": 1.0881038724178211e-05, + "loss": 0.2297, + "step": 3089 + }, + { + "epoch": 1.4631547795205682, + "grad_norm": 1.2216544151306152, + "learning_rate": 1.0875944394764325e-05, + "loss": 0.2317, + "step": 3090 + }, + { + "epoch": 1.4636282923941994, + "grad_norm": 1.036920189857483, + "learning_rate": 1.0870849836255632e-05, + "loss": 0.2307, + "step": 3091 + }, + { + "epoch": 1.4641018052678307, + "grad_norm": 1.1497174501419067, + "learning_rate": 1.0865755049984568e-05, + "loss": 0.2114, + "step": 3092 + }, + { + "epoch": 1.464575318141462, + "grad_norm": 1.21004056930542, + "learning_rate": 1.0860660037283622e-05, + "loss": 0.2054, + "step": 3093 + }, + { + "epoch": 1.465048831015093, + "grad_norm": 2.018868923187256, + "learning_rate": 1.0855564799485345e-05, + "loss": 0.2074, + "step": 3094 + }, + { + "epoch": 1.4655223438887246, + "grad_norm": 1.4326393604278564, + "learning_rate": 1.0850469337922348e-05, + "loss": 0.2179, + "step": 3095 + }, + { + "epoch": 1.4659958567623557, + "grad_norm": 1.4479272365570068, + "learning_rate": 1.0845373653927303e-05, + "loss": 0.2191, + "step": 3096 + }, + { + "epoch": 1.466469369635987, + "grad_norm": 1.525227665901184, + "learning_rate": 1.084027774883293e-05, + "loss": 0.2227, + "step": 3097 + }, + { + "epoch": 1.4669428825096182, + "grad_norm": 1.1272501945495605, + "learning_rate": 1.0835181623972022e-05, + "loss": 0.2156, + "step": 3098 + }, + { + "epoch": 1.4674163953832495, + "grad_norm": 1.327453851699829, + "learning_rate": 1.0830085280677415e-05, + "loss": 0.2185, + "step": 3099 + }, + { + "epoch": 1.4678899082568808, + "grad_norm": 1.1839863061904907, + "learning_rate": 1.082498872028201e-05, + "loss": 0.2343, + "step": 3100 + }, + { + "epoch": 1.4683634211305119, + "grad_norm": 0.9597224593162537, + "learning_rate": 1.0819891944118768e-05, + "loss": 0.193, + "step": 3101 + }, + { + "epoch": 1.4688369340041432, + "grad_norm": 1.457085371017456, + "learning_rate": 1.0814794953520695e-05, + "loss": 0.2102, + "step": 3102 + }, + { + "epoch": 1.4693104468777745, + "grad_norm": 1.054195761680603, + "learning_rate": 1.0809697749820862e-05, + "loss": 0.2354, + "step": 3103 + }, + { + "epoch": 1.4697839597514057, + "grad_norm": 1.249380111694336, + "learning_rate": 1.0804600334352398e-05, + "loss": 0.2031, + "step": 3104 + }, + { + "epoch": 1.470257472625037, + "grad_norm": 1.4922364950180054, + "learning_rate": 1.0799502708448475e-05, + "loss": 0.2107, + "step": 3105 + }, + { + "epoch": 1.4707309854986683, + "grad_norm": 1.830451488494873, + "learning_rate": 1.079440487344233e-05, + "loss": 0.2185, + "step": 3106 + }, + { + "epoch": 1.4712044983722996, + "grad_norm": 1.072477102279663, + "learning_rate": 1.0789306830667256e-05, + "loss": 0.2233, + "step": 3107 + }, + { + "epoch": 1.4716780112459307, + "grad_norm": 1.220098853111267, + "learning_rate": 1.0784208581456595e-05, + "loss": 0.2268, + "step": 3108 + }, + { + "epoch": 1.472151524119562, + "grad_norm": 1.263378381729126, + "learning_rate": 1.0779110127143746e-05, + "loss": 0.2277, + "step": 3109 + }, + { + "epoch": 1.4726250369931932, + "grad_norm": 1.1181433200836182, + "learning_rate": 1.0774011469062157e-05, + "loss": 0.2124, + "step": 3110 + }, + { + "epoch": 1.4730985498668245, + "grad_norm": 0.9625453352928162, + "learning_rate": 1.0768912608545336e-05, + "loss": 0.2112, + "step": 3111 + }, + { + "epoch": 1.4735720627404558, + "grad_norm": 1.250014066696167, + "learning_rate": 1.0763813546926842e-05, + "loss": 0.2388, + "step": 3112 + }, + { + "epoch": 1.474045575614087, + "grad_norm": 1.1710668802261353, + "learning_rate": 1.0758714285540281e-05, + "loss": 0.2245, + "step": 3113 + }, + { + "epoch": 1.4745190884877182, + "grad_norm": 1.6824264526367188, + "learning_rate": 1.0753614825719321e-05, + "loss": 0.1949, + "step": 3114 + }, + { + "epoch": 1.4749926013613495, + "grad_norm": 1.1846730709075928, + "learning_rate": 1.0748515168797673e-05, + "loss": 0.2139, + "step": 3115 + }, + { + "epoch": 1.4754661142349808, + "grad_norm": 1.0218571424484253, + "learning_rate": 1.07434153161091e-05, + "loss": 0.1977, + "step": 3116 + }, + { + "epoch": 1.475939627108612, + "grad_norm": 1.459490418434143, + "learning_rate": 1.0738315268987424e-05, + "loss": 0.2223, + "step": 3117 + }, + { + "epoch": 1.4764131399822433, + "grad_norm": 1.3154277801513672, + "learning_rate": 1.0733215028766515e-05, + "loss": 0.1916, + "step": 3118 + }, + { + "epoch": 1.4768866528558746, + "grad_norm": 1.44888436794281, + "learning_rate": 1.0728114596780287e-05, + "loss": 0.1832, + "step": 3119 + }, + { + "epoch": 1.4773601657295057, + "grad_norm": 1.1630711555480957, + "learning_rate": 1.072301397436271e-05, + "loss": 0.2025, + "step": 3120 + }, + { + "epoch": 1.477833678603137, + "grad_norm": 1.9314302206039429, + "learning_rate": 1.0717913162847803e-05, + "loss": 0.1954, + "step": 3121 + }, + { + "epoch": 1.4783071914767683, + "grad_norm": 2.154477119445801, + "learning_rate": 1.0712812163569638e-05, + "loss": 0.2119, + "step": 3122 + }, + { + "epoch": 1.4787807043503995, + "grad_norm": 1.4404516220092773, + "learning_rate": 1.0707710977862322e-05, + "loss": 0.2379, + "step": 3123 + }, + { + "epoch": 1.4792542172240308, + "grad_norm": 1.926571249961853, + "learning_rate": 1.070260960706003e-05, + "loss": 0.2268, + "step": 3124 + }, + { + "epoch": 1.479727730097662, + "grad_norm": 1.082242488861084, + "learning_rate": 1.0697508052496975e-05, + "loss": 0.2129, + "step": 3125 + }, + { + "epoch": 1.4802012429712934, + "grad_norm": 1.2488141059875488, + "learning_rate": 1.0692406315507416e-05, + "loss": 0.225, + "step": 3126 + }, + { + "epoch": 1.4806747558449245, + "grad_norm": 1.4362695217132568, + "learning_rate": 1.0687304397425666e-05, + "loss": 0.2294, + "step": 3127 + }, + { + "epoch": 1.4811482687185558, + "grad_norm": 1.3312585353851318, + "learning_rate": 1.068220229958608e-05, + "loss": 0.2222, + "step": 3128 + }, + { + "epoch": 1.481621781592187, + "grad_norm": 1.7034131288528442, + "learning_rate": 1.0677100023323064e-05, + "loss": 0.2519, + "step": 3129 + }, + { + "epoch": 1.4820952944658183, + "grad_norm": 1.3612613677978516, + "learning_rate": 1.067199756997107e-05, + "loss": 0.2048, + "step": 3130 + }, + { + "epoch": 1.4825688073394496, + "grad_norm": 1.0770941972732544, + "learning_rate": 1.0666894940864595e-05, + "loss": 0.2233, + "step": 3131 + }, + { + "epoch": 1.4830423202130807, + "grad_norm": 1.6613733768463135, + "learning_rate": 1.0661792137338183e-05, + "loss": 0.2034, + "step": 3132 + }, + { + "epoch": 1.483515833086712, + "grad_norm": 1.7866359949111938, + "learning_rate": 1.0656689160726418e-05, + "loss": 0.2307, + "step": 3133 + }, + { + "epoch": 1.4839893459603433, + "grad_norm": 0.9103512167930603, + "learning_rate": 1.065158601236394e-05, + "loss": 0.2262, + "step": 3134 + }, + { + "epoch": 1.4844628588339746, + "grad_norm": 1.5666887760162354, + "learning_rate": 1.0646482693585427e-05, + "loss": 0.1796, + "step": 3135 + }, + { + "epoch": 1.4849363717076058, + "grad_norm": 1.1646493673324585, + "learning_rate": 1.0641379205725599e-05, + "loss": 0.2278, + "step": 3136 + }, + { + "epoch": 1.4854098845812371, + "grad_norm": 1.1764169931411743, + "learning_rate": 1.063627555011923e-05, + "loss": 0.1983, + "step": 3137 + }, + { + "epoch": 1.4858833974548684, + "grad_norm": 1.3643893003463745, + "learning_rate": 1.0631171728101129e-05, + "loss": 0.1969, + "step": 3138 + }, + { + "epoch": 1.4863569103284995, + "grad_norm": 1.2476388216018677, + "learning_rate": 1.0626067741006155e-05, + "loss": 0.1941, + "step": 3139 + }, + { + "epoch": 1.4868304232021308, + "grad_norm": 1.1567429304122925, + "learning_rate": 1.0620963590169197e-05, + "loss": 0.2146, + "step": 3140 + }, + { + "epoch": 1.487303936075762, + "grad_norm": 1.037000060081482, + "learning_rate": 1.0615859276925207e-05, + "loss": 0.2062, + "step": 3141 + }, + { + "epoch": 1.4877774489493933, + "grad_norm": 1.5151640176773071, + "learning_rate": 1.0610754802609162e-05, + "loss": 0.2161, + "step": 3142 + }, + { + "epoch": 1.4882509618230246, + "grad_norm": 1.4791309833526611, + "learning_rate": 1.060565016855609e-05, + "loss": 0.2273, + "step": 3143 + }, + { + "epoch": 1.4887244746966557, + "grad_norm": 1.1170471906661987, + "learning_rate": 1.0600545376101061e-05, + "loss": 0.2279, + "step": 3144 + }, + { + "epoch": 1.489197987570287, + "grad_norm": 1.664336085319519, + "learning_rate": 1.0595440426579182e-05, + "loss": 0.2168, + "step": 3145 + }, + { + "epoch": 1.4896715004439183, + "grad_norm": 1.1722487211227417, + "learning_rate": 1.0590335321325603e-05, + "loss": 0.2215, + "step": 3146 + }, + { + "epoch": 1.4901450133175496, + "grad_norm": 1.0749174356460571, + "learning_rate": 1.0585230061675515e-05, + "loss": 0.2348, + "step": 3147 + }, + { + "epoch": 1.4906185261911808, + "grad_norm": 1.5461552143096924, + "learning_rate": 1.0580124648964153e-05, + "loss": 0.2247, + "step": 3148 + }, + { + "epoch": 1.4910920390648121, + "grad_norm": 0.9260118007659912, + "learning_rate": 1.0575019084526785e-05, + "loss": 0.2142, + "step": 3149 + }, + { + "epoch": 1.4915655519384434, + "grad_norm": 1.4796782732009888, + "learning_rate": 1.0569913369698722e-05, + "loss": 0.2071, + "step": 3150 + }, + { + "epoch": 1.4920390648120745, + "grad_norm": 1.0759540796279907, + "learning_rate": 1.0564807505815316e-05, + "loss": 0.1932, + "step": 3151 + }, + { + "epoch": 1.4925125776857058, + "grad_norm": 1.082331895828247, + "learning_rate": 1.0559701494211953e-05, + "loss": 0.212, + "step": 3152 + }, + { + "epoch": 1.492986090559337, + "grad_norm": 1.0994702577590942, + "learning_rate": 1.0554595336224065e-05, + "loss": 0.2176, + "step": 3153 + }, + { + "epoch": 1.4934596034329684, + "grad_norm": 0.989517092704773, + "learning_rate": 1.0549489033187116e-05, + "loss": 0.2085, + "step": 3154 + }, + { + "epoch": 1.4939331163065996, + "grad_norm": 1.2326009273529053, + "learning_rate": 1.0544382586436613e-05, + "loss": 0.1911, + "step": 3155 + }, + { + "epoch": 1.4944066291802307, + "grad_norm": 1.4476752281188965, + "learning_rate": 1.0539275997308099e-05, + "loss": 0.2365, + "step": 3156 + }, + { + "epoch": 1.4948801420538622, + "grad_norm": 1.07553231716156, + "learning_rate": 1.0534169267137146e-05, + "loss": 0.2231, + "step": 3157 + }, + { + "epoch": 1.4953536549274933, + "grad_norm": 1.2448638677597046, + "learning_rate": 1.0529062397259375e-05, + "loss": 0.2211, + "step": 3158 + }, + { + "epoch": 1.4958271678011246, + "grad_norm": 1.1470022201538086, + "learning_rate": 1.0523955389010438e-05, + "loss": 0.2268, + "step": 3159 + }, + { + "epoch": 1.4963006806747559, + "grad_norm": 1.2596864700317383, + "learning_rate": 1.0518848243726026e-05, + "loss": 0.2212, + "step": 3160 + }, + { + "epoch": 1.4967741935483871, + "grad_norm": 1.1401036977767944, + "learning_rate": 1.0513740962741861e-05, + "loss": 0.2301, + "step": 3161 + }, + { + "epoch": 1.4972477064220184, + "grad_norm": 1.7496001720428467, + "learning_rate": 1.05086335473937e-05, + "loss": 0.2162, + "step": 3162 + }, + { + "epoch": 1.4977212192956495, + "grad_norm": 1.3748828172683716, + "learning_rate": 1.0503525999017343e-05, + "loss": 0.2101, + "step": 3163 + }, + { + "epoch": 1.4981947321692808, + "grad_norm": 1.002260446548462, + "learning_rate": 1.049841831894862e-05, + "loss": 0.2274, + "step": 3164 + }, + { + "epoch": 1.498668245042912, + "grad_norm": 1.6265779733657837, + "learning_rate": 1.0493310508523393e-05, + "loss": 0.2283, + "step": 3165 + }, + { + "epoch": 1.4991417579165434, + "grad_norm": 1.0258605480194092, + "learning_rate": 1.0488202569077564e-05, + "loss": 0.199, + "step": 3166 + }, + { + "epoch": 1.4996152707901746, + "grad_norm": 1.2962850332260132, + "learning_rate": 1.0483094501947062e-05, + "loss": 0.2447, + "step": 3167 + }, + { + "epoch": 1.5000887836638057, + "grad_norm": 0.9795529246330261, + "learning_rate": 1.0477986308467851e-05, + "loss": 0.2269, + "step": 3168 + }, + { + "epoch": 1.5005622965374372, + "grad_norm": 1.3065211772918701, + "learning_rate": 1.0472877989975933e-05, + "loss": 0.2304, + "step": 3169 + }, + { + "epoch": 1.5010358094110683, + "grad_norm": 2.2391700744628906, + "learning_rate": 1.046776954780734e-05, + "loss": 0.2273, + "step": 3170 + }, + { + "epoch": 1.5015093222846996, + "grad_norm": 1.249174952507019, + "learning_rate": 1.0462660983298132e-05, + "loss": 0.1968, + "step": 3171 + }, + { + "epoch": 1.5019828351583309, + "grad_norm": 1.0384050607681274, + "learning_rate": 1.045755229778441e-05, + "loss": 0.2087, + "step": 3172 + }, + { + "epoch": 1.5024563480319622, + "grad_norm": 1.1867655515670776, + "learning_rate": 1.0452443492602296e-05, + "loss": 0.2292, + "step": 3173 + }, + { + "epoch": 1.5029298609055934, + "grad_norm": 1.0175690650939941, + "learning_rate": 1.0447334569087953e-05, + "loss": 0.2039, + "step": 3174 + }, + { + "epoch": 1.5034033737792245, + "grad_norm": 1.395185947418213, + "learning_rate": 1.0442225528577568e-05, + "loss": 0.2102, + "step": 3175 + }, + { + "epoch": 1.503876886652856, + "grad_norm": 1.4154939651489258, + "learning_rate": 1.0437116372407364e-05, + "loss": 0.2346, + "step": 3176 + }, + { + "epoch": 1.504350399526487, + "grad_norm": 1.1730825901031494, + "learning_rate": 1.0432007101913588e-05, + "loss": 0.1973, + "step": 3177 + }, + { + "epoch": 1.5048239124001184, + "grad_norm": 1.7297412157058716, + "learning_rate": 1.0426897718432523e-05, + "loss": 0.216, + "step": 3178 + }, + { + "epoch": 1.5052974252737497, + "grad_norm": 0.9606209993362427, + "learning_rate": 1.0421788223300478e-05, + "loss": 0.2313, + "step": 3179 + }, + { + "epoch": 1.5057709381473807, + "grad_norm": 1.1701611280441284, + "learning_rate": 1.041667861785379e-05, + "loss": 0.208, + "step": 3180 + }, + { + "epoch": 1.5062444510210122, + "grad_norm": 1.2683926820755005, + "learning_rate": 1.0411568903428832e-05, + "loss": 0.229, + "step": 3181 + }, + { + "epoch": 1.5067179638946433, + "grad_norm": 1.734331727027893, + "learning_rate": 1.0406459081361998e-05, + "loss": 0.2352, + "step": 3182 + }, + { + "epoch": 1.5071914767682746, + "grad_norm": 1.5925458669662476, + "learning_rate": 1.0401349152989713e-05, + "loss": 0.2294, + "step": 3183 + }, + { + "epoch": 1.5076649896419059, + "grad_norm": 1.7187014818191528, + "learning_rate": 1.0396239119648426e-05, + "loss": 0.2182, + "step": 3184 + }, + { + "epoch": 1.5081385025155372, + "grad_norm": 1.6393986940383911, + "learning_rate": 1.0391128982674621e-05, + "loss": 0.2139, + "step": 3185 + }, + { + "epoch": 1.5086120153891684, + "grad_norm": 1.1025725603103638, + "learning_rate": 1.0386018743404805e-05, + "loss": 0.2337, + "step": 3186 + }, + { + "epoch": 1.5090855282627995, + "grad_norm": 1.29452383518219, + "learning_rate": 1.0380908403175509e-05, + "loss": 0.1877, + "step": 3187 + }, + { + "epoch": 1.509559041136431, + "grad_norm": 2.088343858718872, + "learning_rate": 1.0375797963323295e-05, + "loss": 0.2016, + "step": 3188 + }, + { + "epoch": 1.510032554010062, + "grad_norm": 1.1434667110443115, + "learning_rate": 1.037068742518475e-05, + "loss": 0.2022, + "step": 3189 + }, + { + "epoch": 1.5105060668836934, + "grad_norm": 1.0598808526992798, + "learning_rate": 1.0365576790096486e-05, + "loss": 0.2173, + "step": 3190 + }, + { + "epoch": 1.5109795797573247, + "grad_norm": 1.190727710723877, + "learning_rate": 1.0360466059395138e-05, + "loss": 0.2441, + "step": 3191 + }, + { + "epoch": 1.511453092630956, + "grad_norm": 1.5847951173782349, + "learning_rate": 1.0355355234417369e-05, + "loss": 0.2008, + "step": 3192 + }, + { + "epoch": 1.5119266055045872, + "grad_norm": 1.251030445098877, + "learning_rate": 1.035024431649987e-05, + "loss": 0.2403, + "step": 3193 + }, + { + "epoch": 1.5124001183782183, + "grad_norm": 1.6456767320632935, + "learning_rate": 1.0345133306979342e-05, + "loss": 0.2249, + "step": 3194 + }, + { + "epoch": 1.5128736312518498, + "grad_norm": 1.1858950853347778, + "learning_rate": 1.0340022207192532e-05, + "loss": 0.2287, + "step": 3195 + }, + { + "epoch": 1.5133471441254809, + "grad_norm": 1.4092812538146973, + "learning_rate": 1.0334911018476194e-05, + "loss": 0.2057, + "step": 3196 + }, + { + "epoch": 1.5138206569991122, + "grad_norm": 1.0549933910369873, + "learning_rate": 1.0329799742167108e-05, + "loss": 0.1973, + "step": 3197 + }, + { + "epoch": 1.5142941698727435, + "grad_norm": 1.5164347887039185, + "learning_rate": 1.0324688379602083e-05, + "loss": 0.2488, + "step": 3198 + }, + { + "epoch": 1.5147676827463745, + "grad_norm": 1.8977534770965576, + "learning_rate": 1.0319576932117945e-05, + "loss": 0.2014, + "step": 3199 + }, + { + "epoch": 1.515241195620006, + "grad_norm": 1.5914242267608643, + "learning_rate": 1.0314465401051544e-05, + "loss": 0.2275, + "step": 3200 + }, + { + "epoch": 1.515714708493637, + "grad_norm": 1.1438359022140503, + "learning_rate": 1.030935378773975e-05, + "loss": 0.2144, + "step": 3201 + }, + { + "epoch": 1.5161882213672684, + "grad_norm": 1.1040819883346558, + "learning_rate": 1.0304242093519456e-05, + "loss": 0.218, + "step": 3202 + }, + { + "epoch": 1.5166617342408997, + "grad_norm": 1.5802520513534546, + "learning_rate": 1.0299130319727576e-05, + "loss": 0.2124, + "step": 3203 + }, + { + "epoch": 1.517135247114531, + "grad_norm": 1.4523437023162842, + "learning_rate": 1.029401846770105e-05, + "loss": 0.2154, + "step": 3204 + }, + { + "epoch": 1.5176087599881622, + "grad_norm": 1.1811856031417847, + "learning_rate": 1.0288906538776831e-05, + "loss": 0.22, + "step": 3205 + }, + { + "epoch": 1.5180822728617933, + "grad_norm": 1.3491801023483276, + "learning_rate": 1.0283794534291891e-05, + "loss": 0.2113, + "step": 3206 + }, + { + "epoch": 1.5185557857354248, + "grad_norm": 1.7651821374893188, + "learning_rate": 1.027868245558323e-05, + "loss": 0.2177, + "step": 3207 + }, + { + "epoch": 1.519029298609056, + "grad_norm": 1.385327696800232, + "learning_rate": 1.0273570303987859e-05, + "loss": 0.2135, + "step": 3208 + }, + { + "epoch": 1.5195028114826872, + "grad_norm": 1.7392898797988892, + "learning_rate": 1.0268458080842815e-05, + "loss": 0.2174, + "step": 3209 + }, + { + "epoch": 1.5199763243563185, + "grad_norm": 1.3350859880447388, + "learning_rate": 1.0263345787485156e-05, + "loss": 0.2226, + "step": 3210 + }, + { + "epoch": 1.5204498372299495, + "grad_norm": 1.0405269861221313, + "learning_rate": 1.025823342525194e-05, + "loss": 0.2163, + "step": 3211 + }, + { + "epoch": 1.520923350103581, + "grad_norm": 1.7307740449905396, + "learning_rate": 1.0253120995480264e-05, + "loss": 0.2056, + "step": 3212 + }, + { + "epoch": 1.521396862977212, + "grad_norm": 1.300768494606018, + "learning_rate": 1.024800849950723e-05, + "loss": 0.1799, + "step": 3213 + }, + { + "epoch": 1.5218703758508434, + "grad_norm": 1.4394781589508057, + "learning_rate": 1.024289593866997e-05, + "loss": 0.2101, + "step": 3214 + }, + { + "epoch": 1.5223438887244747, + "grad_norm": 1.2956507205963135, + "learning_rate": 1.0237783314305621e-05, + "loss": 0.2143, + "step": 3215 + }, + { + "epoch": 1.522817401598106, + "grad_norm": 1.18267822265625, + "learning_rate": 1.0232670627751337e-05, + "loss": 0.2064, + "step": 3216 + }, + { + "epoch": 1.5232909144717373, + "grad_norm": 1.1501325368881226, + "learning_rate": 1.02275578803443e-05, + "loss": 0.2249, + "step": 3217 + }, + { + "epoch": 1.5237644273453683, + "grad_norm": 1.0747531652450562, + "learning_rate": 1.0222445073421692e-05, + "loss": 0.2215, + "step": 3218 + }, + { + "epoch": 1.5242379402189998, + "grad_norm": 1.061420202255249, + "learning_rate": 1.021733220832072e-05, + "loss": 0.1964, + "step": 3219 + }, + { + "epoch": 1.524711453092631, + "grad_norm": 1.1536870002746582, + "learning_rate": 1.0212219286378606e-05, + "loss": 0.2291, + "step": 3220 + }, + { + "epoch": 1.5251849659662622, + "grad_norm": 2.1566736698150635, + "learning_rate": 1.0207106308932585e-05, + "loss": 0.2479, + "step": 3221 + }, + { + "epoch": 1.5256584788398935, + "grad_norm": 1.3876272439956665, + "learning_rate": 1.0201993277319906e-05, + "loss": 0.2181, + "step": 3222 + }, + { + "epoch": 1.5261319917135248, + "grad_norm": 1.3905116319656372, + "learning_rate": 1.0196880192877836e-05, + "loss": 0.2275, + "step": 3223 + }, + { + "epoch": 1.526605504587156, + "grad_norm": 1.5620601177215576, + "learning_rate": 1.019176705694365e-05, + "loss": 0.222, + "step": 3224 + }, + { + "epoch": 1.5270790174607871, + "grad_norm": 1.212353229522705, + "learning_rate": 1.018665387085464e-05, + "loss": 0.196, + "step": 3225 + }, + { + "epoch": 1.5275525303344186, + "grad_norm": 1.0069432258605957, + "learning_rate": 1.018154063594811e-05, + "loss": 0.2234, + "step": 3226 + }, + { + "epoch": 1.5280260432080497, + "grad_norm": 1.6228710412979126, + "learning_rate": 1.017642735356138e-05, + "loss": 0.2348, + "step": 3227 + }, + { + "epoch": 1.528499556081681, + "grad_norm": 1.7563085556030273, + "learning_rate": 1.0171314025031777e-05, + "loss": 0.2149, + "step": 3228 + }, + { + "epoch": 1.5289730689553123, + "grad_norm": 1.539036750793457, + "learning_rate": 1.0166200651696642e-05, + "loss": 0.2116, + "step": 3229 + }, + { + "epoch": 1.5294465818289433, + "grad_norm": 2.3316597938537598, + "learning_rate": 1.0161087234893332e-05, + "loss": 0.2123, + "step": 3230 + }, + { + "epoch": 1.5299200947025748, + "grad_norm": 1.3801292181015015, + "learning_rate": 1.0155973775959209e-05, + "loss": 0.2363, + "step": 3231 + }, + { + "epoch": 1.530393607576206, + "grad_norm": 1.6556155681610107, + "learning_rate": 1.0150860276231649e-05, + "loss": 0.2092, + "step": 3232 + }, + { + "epoch": 1.5308671204498372, + "grad_norm": 1.1947085857391357, + "learning_rate": 1.0145746737048037e-05, + "loss": 0.2128, + "step": 3233 + }, + { + "epoch": 1.5313406333234685, + "grad_norm": 1.491898775100708, + "learning_rate": 1.0140633159745775e-05, + "loss": 0.2195, + "step": 3234 + }, + { + "epoch": 1.5318141461970998, + "grad_norm": 1.2863268852233887, + "learning_rate": 1.0135519545662267e-05, + "loss": 0.2168, + "step": 3235 + }, + { + "epoch": 1.532287659070731, + "grad_norm": 2.18508243560791, + "learning_rate": 1.013040589613493e-05, + "loss": 0.2423, + "step": 3236 + }, + { + "epoch": 1.5327611719443621, + "grad_norm": 0.9997796416282654, + "learning_rate": 1.0125292212501186e-05, + "loss": 0.2101, + "step": 3237 + }, + { + "epoch": 1.5332346848179936, + "grad_norm": 3.1009128093719482, + "learning_rate": 1.0120178496098474e-05, + "loss": 0.2044, + "step": 3238 + }, + { + "epoch": 1.5337081976916247, + "grad_norm": 2.1549055576324463, + "learning_rate": 1.0115064748264236e-05, + "loss": 0.2169, + "step": 3239 + }, + { + "epoch": 1.534181710565256, + "grad_norm": 1.5560920238494873, + "learning_rate": 1.0109950970335923e-05, + "loss": 0.2285, + "step": 3240 + }, + { + "epoch": 1.5346552234388873, + "grad_norm": 2.1575727462768555, + "learning_rate": 1.0104837163650994e-05, + "loss": 0.2077, + "step": 3241 + }, + { + "epoch": 1.5351287363125183, + "grad_norm": 2.3875033855438232, + "learning_rate": 1.0099723329546915e-05, + "loss": 0.2247, + "step": 3242 + }, + { + "epoch": 1.5356022491861498, + "grad_norm": 1.0770213603973389, + "learning_rate": 1.0094609469361162e-05, + "loss": 0.215, + "step": 3243 + }, + { + "epoch": 1.536075762059781, + "grad_norm": 1.1401655673980713, + "learning_rate": 1.0089495584431217e-05, + "loss": 0.2126, + "step": 3244 + }, + { + "epoch": 1.5365492749334122, + "grad_norm": 1.0763803720474243, + "learning_rate": 1.0084381676094566e-05, + "loss": 0.2448, + "step": 3245 + }, + { + "epoch": 1.5370227878070435, + "grad_norm": 1.8991236686706543, + "learning_rate": 1.00792677456887e-05, + "loss": 0.2243, + "step": 3246 + }, + { + "epoch": 1.5374963006806748, + "grad_norm": 2.337759256362915, + "learning_rate": 1.0074153794551119e-05, + "loss": 0.2146, + "step": 3247 + }, + { + "epoch": 1.537969813554306, + "grad_norm": 2.1561119556427, + "learning_rate": 1.0069039824019326e-05, + "loss": 0.195, + "step": 3248 + }, + { + "epoch": 1.5384433264279371, + "grad_norm": 2.016658306121826, + "learning_rate": 1.0063925835430838e-05, + "loss": 0.2334, + "step": 3249 + }, + { + "epoch": 1.5389168393015686, + "grad_norm": 2.295039653778076, + "learning_rate": 1.005881183012316e-05, + "loss": 0.2266, + "step": 3250 + }, + { + "epoch": 1.5393903521751997, + "grad_norm": 2.0591042041778564, + "learning_rate": 1.0053697809433817e-05, + "loss": 0.2052, + "step": 3251 + }, + { + "epoch": 1.539863865048831, + "grad_norm": 1.625144362449646, + "learning_rate": 1.004858377470033e-05, + "loss": 0.2076, + "step": 3252 + }, + { + "epoch": 1.5403373779224623, + "grad_norm": 0.9673202633857727, + "learning_rate": 1.0043469727260228e-05, + "loss": 0.2103, + "step": 3253 + }, + { + "epoch": 1.5408108907960936, + "grad_norm": 1.1474332809448242, + "learning_rate": 1.0038355668451037e-05, + "loss": 0.214, + "step": 3254 + }, + { + "epoch": 1.5412844036697249, + "grad_norm": 1.004038691520691, + "learning_rate": 1.0033241599610288e-05, + "loss": 0.1771, + "step": 3255 + }, + { + "epoch": 1.541757916543356, + "grad_norm": 1.4131745100021362, + "learning_rate": 1.0028127522075522e-05, + "loss": 0.2241, + "step": 3256 + }, + { + "epoch": 1.5422314294169874, + "grad_norm": 2.0983598232269287, + "learning_rate": 1.0023013437184273e-05, + "loss": 0.2309, + "step": 3257 + }, + { + "epoch": 1.5427049422906185, + "grad_norm": 1.784072995185852, + "learning_rate": 1.0017899346274082e-05, + "loss": 0.2212, + "step": 3258 + }, + { + "epoch": 1.5431784551642498, + "grad_norm": 2.016923427581787, + "learning_rate": 1.0012785250682488e-05, + "loss": 0.2223, + "step": 3259 + }, + { + "epoch": 1.543651968037881, + "grad_norm": 1.3807404041290283, + "learning_rate": 1.0007671151747038e-05, + "loss": 0.2082, + "step": 3260 + }, + { + "epoch": 1.5441254809115121, + "grad_norm": 1.9663400650024414, + "learning_rate": 1.000255705080527e-05, + "loss": 0.2303, + "step": 3261 + }, + { + "epoch": 1.5445989937851436, + "grad_norm": 1.2780094146728516, + "learning_rate": 9.997442949194733e-06, + "loss": 0.2188, + "step": 3262 + }, + { + "epoch": 1.5450725066587747, + "grad_norm": 1.079207181930542, + "learning_rate": 9.992328848252965e-06, + "loss": 0.2033, + "step": 3263 + }, + { + "epoch": 1.545546019532406, + "grad_norm": 1.1068617105484009, + "learning_rate": 9.987214749317514e-06, + "loss": 0.2441, + "step": 3264 + }, + { + "epoch": 1.5460195324060373, + "grad_norm": 1.965084195137024, + "learning_rate": 9.982100653725921e-06, + "loss": 0.1883, + "step": 3265 + }, + { + "epoch": 1.5464930452796686, + "grad_norm": 2.6802639961242676, + "learning_rate": 9.97698656281573e-06, + "loss": 0.2319, + "step": 3266 + }, + { + "epoch": 1.5469665581532999, + "grad_norm": 1.9177919626235962, + "learning_rate": 9.971872477924482e-06, + "loss": 0.2197, + "step": 3267 + }, + { + "epoch": 1.547440071026931, + "grad_norm": 2.636075735092163, + "learning_rate": 9.966758400389714e-06, + "loss": 0.2109, + "step": 3268 + }, + { + "epoch": 1.5479135839005624, + "grad_norm": 2.6643176078796387, + "learning_rate": 9.961644331548967e-06, + "loss": 0.2137, + "step": 3269 + }, + { + "epoch": 1.5483870967741935, + "grad_norm": 2.1698949337005615, + "learning_rate": 9.956530272739775e-06, + "loss": 0.2371, + "step": 3270 + }, + { + "epoch": 1.5488606096478248, + "grad_norm": 1.0527149438858032, + "learning_rate": 9.951416225299671e-06, + "loss": 0.2027, + "step": 3271 + }, + { + "epoch": 1.549334122521456, + "grad_norm": 1.1439855098724365, + "learning_rate": 9.946302190566186e-06, + "loss": 0.2106, + "step": 3272 + }, + { + "epoch": 1.5498076353950871, + "grad_norm": 1.49906325340271, + "learning_rate": 9.941188169876843e-06, + "loss": 0.2192, + "step": 3273 + }, + { + "epoch": 1.5502811482687187, + "grad_norm": 1.464437484741211, + "learning_rate": 9.936074164569168e-06, + "loss": 0.2061, + "step": 3274 + }, + { + "epoch": 1.5507546611423497, + "grad_norm": 2.1014366149902344, + "learning_rate": 9.930960175980677e-06, + "loss": 0.2131, + "step": 3275 + }, + { + "epoch": 1.551228174015981, + "grad_norm": 1.1291313171386719, + "learning_rate": 9.925846205448886e-06, + "loss": 0.2004, + "step": 3276 + }, + { + "epoch": 1.5517016868896123, + "grad_norm": 2.514810800552368, + "learning_rate": 9.920732254311306e-06, + "loss": 0.1979, + "step": 3277 + }, + { + "epoch": 1.5521751997632436, + "grad_norm": 1.7324765920639038, + "learning_rate": 9.91561832390544e-06, + "loss": 0.1977, + "step": 3278 + }, + { + "epoch": 1.5526487126368749, + "grad_norm": 1.4034252166748047, + "learning_rate": 9.910504415568788e-06, + "loss": 0.2243, + "step": 3279 + }, + { + "epoch": 1.553122225510506, + "grad_norm": 1.4264874458312988, + "learning_rate": 9.90539053063884e-06, + "loss": 0.2129, + "step": 3280 + }, + { + "epoch": 1.5535957383841374, + "grad_norm": 1.2019026279449463, + "learning_rate": 9.900276670453085e-06, + "loss": 0.2052, + "step": 3281 + }, + { + "epoch": 1.5540692512577685, + "grad_norm": 1.222839593887329, + "learning_rate": 9.895162836349006e-06, + "loss": 0.2145, + "step": 3282 + }, + { + "epoch": 1.5545427641313998, + "grad_norm": 1.3310260772705078, + "learning_rate": 9.890049029664079e-06, + "loss": 0.2224, + "step": 3283 + }, + { + "epoch": 1.555016277005031, + "grad_norm": 1.5830497741699219, + "learning_rate": 9.884935251735766e-06, + "loss": 0.2102, + "step": 3284 + }, + { + "epoch": 1.5554897898786624, + "grad_norm": 1.1304932832717896, + "learning_rate": 9.879821503901527e-06, + "loss": 0.2128, + "step": 3285 + }, + { + "epoch": 1.5559633027522937, + "grad_norm": 1.5791419744491577, + "learning_rate": 9.874707787498814e-06, + "loss": 0.2253, + "step": 3286 + }, + { + "epoch": 1.5564368156259247, + "grad_norm": 2.089458465576172, + "learning_rate": 9.869594103865074e-06, + "loss": 0.209, + "step": 3287 + }, + { + "epoch": 1.5569103284995562, + "grad_norm": 1.063925862312317, + "learning_rate": 9.864480454337735e-06, + "loss": 0.2186, + "step": 3288 + }, + { + "epoch": 1.5573838413731873, + "grad_norm": 1.1000468730926514, + "learning_rate": 9.859366840254227e-06, + "loss": 0.212, + "step": 3289 + }, + { + "epoch": 1.5578573542468186, + "grad_norm": 1.3365740776062012, + "learning_rate": 9.854253262951964e-06, + "loss": 0.1934, + "step": 3290 + }, + { + "epoch": 1.5583308671204499, + "grad_norm": 1.6963863372802734, + "learning_rate": 9.849139723768354e-06, + "loss": 0.2115, + "step": 3291 + }, + { + "epoch": 1.558804379994081, + "grad_norm": 1.2676887512207031, + "learning_rate": 9.844026224040794e-06, + "loss": 0.2232, + "step": 3292 + }, + { + "epoch": 1.5592778928677125, + "grad_norm": 1.4118210077285767, + "learning_rate": 9.838912765106671e-06, + "loss": 0.2005, + "step": 3293 + }, + { + "epoch": 1.5597514057413435, + "grad_norm": 1.1821280717849731, + "learning_rate": 9.83379934830336e-06, + "loss": 0.2133, + "step": 3294 + }, + { + "epoch": 1.5602249186149748, + "grad_norm": 1.2033153772354126, + "learning_rate": 9.828685974968224e-06, + "loss": 0.2165, + "step": 3295 + }, + { + "epoch": 1.560698431488606, + "grad_norm": 1.9517135620117188, + "learning_rate": 9.823572646438622e-06, + "loss": 0.2394, + "step": 3296 + }, + { + "epoch": 1.5611719443622374, + "grad_norm": 1.292248249053955, + "learning_rate": 9.818459364051893e-06, + "loss": 0.2039, + "step": 3297 + }, + { + "epoch": 1.5616454572358687, + "grad_norm": 1.03531813621521, + "learning_rate": 9.813346129145364e-06, + "loss": 0.2124, + "step": 3298 + }, + { + "epoch": 1.5621189701094997, + "grad_norm": 1.404657006263733, + "learning_rate": 9.808232943056354e-06, + "loss": 0.2242, + "step": 3299 + }, + { + "epoch": 1.5625924829831312, + "grad_norm": 1.1680656671524048, + "learning_rate": 9.803119807122167e-06, + "loss": 0.193, + "step": 3300 + }, + { + "epoch": 1.5630659958567623, + "grad_norm": 1.476773738861084, + "learning_rate": 9.798006722680096e-06, + "loss": 0.2155, + "step": 3301 + }, + { + "epoch": 1.5635395087303936, + "grad_norm": 1.3005181550979614, + "learning_rate": 9.792893691067417e-06, + "loss": 0.1809, + "step": 3302 + }, + { + "epoch": 1.5640130216040249, + "grad_norm": 1.2317800521850586, + "learning_rate": 9.787780713621397e-06, + "loss": 0.2186, + "step": 3303 + }, + { + "epoch": 1.564486534477656, + "grad_norm": 1.0465822219848633, + "learning_rate": 9.782667791679283e-06, + "loss": 0.2259, + "step": 3304 + }, + { + "epoch": 1.5649600473512875, + "grad_norm": 1.5197325944900513, + "learning_rate": 9.777554926578311e-06, + "loss": 0.237, + "step": 3305 + }, + { + "epoch": 1.5654335602249185, + "grad_norm": 1.2708994150161743, + "learning_rate": 9.772442119655706e-06, + "loss": 0.2116, + "step": 3306 + }, + { + "epoch": 1.5659070730985498, + "grad_norm": 1.392824411392212, + "learning_rate": 9.767329372248666e-06, + "loss": 0.2197, + "step": 3307 + }, + { + "epoch": 1.566380585972181, + "grad_norm": 2.88095760345459, + "learning_rate": 9.762216685694382e-06, + "loss": 0.2157, + "step": 3308 + }, + { + "epoch": 1.5668540988458124, + "grad_norm": 1.6576155424118042, + "learning_rate": 9.757104061330033e-06, + "loss": 0.1998, + "step": 3309 + }, + { + "epoch": 1.5673276117194437, + "grad_norm": 1.031028151512146, + "learning_rate": 9.751991500492772e-06, + "loss": 0.2168, + "step": 3310 + }, + { + "epoch": 1.5678011245930747, + "grad_norm": 2.0515482425689697, + "learning_rate": 9.746879004519741e-06, + "loss": 0.1973, + "step": 3311 + }, + { + "epoch": 1.5682746374667063, + "grad_norm": 1.582655668258667, + "learning_rate": 9.741766574748066e-06, + "loss": 0.2349, + "step": 3312 + }, + { + "epoch": 1.5687481503403373, + "grad_norm": 0.992260754108429, + "learning_rate": 9.736654212514851e-06, + "loss": 0.1884, + "step": 3313 + }, + { + "epoch": 1.5692216632139686, + "grad_norm": 0.8423192501068115, + "learning_rate": 9.731541919157186e-06, + "loss": 0.2027, + "step": 3314 + }, + { + "epoch": 1.5696951760876, + "grad_norm": 1.082379937171936, + "learning_rate": 9.726429696012143e-06, + "loss": 0.207, + "step": 3315 + }, + { + "epoch": 1.5701686889612312, + "grad_norm": 1.25356125831604, + "learning_rate": 9.721317544416775e-06, + "loss": 0.2285, + "step": 3316 + }, + { + "epoch": 1.5706422018348625, + "grad_norm": 1.3357824087142944, + "learning_rate": 9.716205465708114e-06, + "loss": 0.1885, + "step": 3317 + }, + { + "epoch": 1.5711157147084935, + "grad_norm": 1.4782239198684692, + "learning_rate": 9.711093461223175e-06, + "loss": 0.1942, + "step": 3318 + }, + { + "epoch": 1.571589227582125, + "grad_norm": 1.226721167564392, + "learning_rate": 9.705981532298955e-06, + "loss": 0.2043, + "step": 3319 + }, + { + "epoch": 1.5720627404557561, + "grad_norm": 1.5207408666610718, + "learning_rate": 9.700869680272422e-06, + "loss": 0.2314, + "step": 3320 + }, + { + "epoch": 1.5725362533293874, + "grad_norm": 1.0883424282073975, + "learning_rate": 9.695757906480545e-06, + "loss": 0.2212, + "step": 3321 + }, + { + "epoch": 1.5730097662030187, + "grad_norm": 1.4195133447647095, + "learning_rate": 9.690646212260254e-06, + "loss": 0.2189, + "step": 3322 + }, + { + "epoch": 1.5734832790766498, + "grad_norm": 2.070651054382324, + "learning_rate": 9.68553459894846e-06, + "loss": 0.2199, + "step": 3323 + }, + { + "epoch": 1.5739567919502813, + "grad_norm": 1.0643229484558105, + "learning_rate": 9.680423067882057e-06, + "loss": 0.1874, + "step": 3324 + }, + { + "epoch": 1.5744303048239123, + "grad_norm": 2.199005365371704, + "learning_rate": 9.675311620397917e-06, + "loss": 0.2141, + "step": 3325 + }, + { + "epoch": 1.5749038176975436, + "grad_norm": 1.1408580541610718, + "learning_rate": 9.670200257832891e-06, + "loss": 0.1889, + "step": 3326 + }, + { + "epoch": 1.575377330571175, + "grad_norm": 1.1677594184875488, + "learning_rate": 9.665088981523807e-06, + "loss": 0.2221, + "step": 3327 + }, + { + "epoch": 1.5758508434448062, + "grad_norm": 1.3932559490203857, + "learning_rate": 9.659977792807468e-06, + "loss": 0.2027, + "step": 3328 + }, + { + "epoch": 1.5763243563184375, + "grad_norm": 1.122646689414978, + "learning_rate": 9.654866693020656e-06, + "loss": 0.1988, + "step": 3329 + }, + { + "epoch": 1.5767978691920685, + "grad_norm": 1.0918054580688477, + "learning_rate": 9.649755683500134e-06, + "loss": 0.2109, + "step": 3330 + }, + { + "epoch": 1.5772713820657, + "grad_norm": 1.0284793376922607, + "learning_rate": 9.644644765582633e-06, + "loss": 0.1926, + "step": 3331 + }, + { + "epoch": 1.5777448949393311, + "grad_norm": 1.006220817565918, + "learning_rate": 9.639533940604867e-06, + "loss": 0.23, + "step": 3332 + }, + { + "epoch": 1.5782184078129624, + "grad_norm": 1.0561009645462036, + "learning_rate": 9.634423209903518e-06, + "loss": 0.2246, + "step": 3333 + }, + { + "epoch": 1.5786919206865937, + "grad_norm": 1.0437111854553223, + "learning_rate": 9.629312574815251e-06, + "loss": 0.2267, + "step": 3334 + }, + { + "epoch": 1.5791654335602248, + "grad_norm": 2.093397617340088, + "learning_rate": 9.624202036676707e-06, + "loss": 0.2099, + "step": 3335 + }, + { + "epoch": 1.5796389464338563, + "grad_norm": 2.2291595935821533, + "learning_rate": 9.619091596824493e-06, + "loss": 0.2153, + "step": 3336 + }, + { + "epoch": 1.5801124593074873, + "grad_norm": 1.3327558040618896, + "learning_rate": 9.613981256595199e-06, + "loss": 0.1965, + "step": 3337 + }, + { + "epoch": 1.5805859721811186, + "grad_norm": 0.9187760353088379, + "learning_rate": 9.60887101732538e-06, + "loss": 0.2022, + "step": 3338 + }, + { + "epoch": 1.58105948505475, + "grad_norm": 1.6713463068008423, + "learning_rate": 9.603760880351576e-06, + "loss": 0.2052, + "step": 3339 + }, + { + "epoch": 1.5815329979283812, + "grad_norm": 1.2200876474380493, + "learning_rate": 9.59865084701029e-06, + "loss": 0.2185, + "step": 3340 + }, + { + "epoch": 1.5820065108020125, + "grad_norm": 1.1830116510391235, + "learning_rate": 9.593540918638006e-06, + "loss": 0.1816, + "step": 3341 + }, + { + "epoch": 1.5824800236756436, + "grad_norm": 1.3431400060653687, + "learning_rate": 9.588431096571171e-06, + "loss": 0.2059, + "step": 3342 + }, + { + "epoch": 1.582953536549275, + "grad_norm": 1.1804959774017334, + "learning_rate": 9.583321382146212e-06, + "loss": 0.2229, + "step": 3343 + }, + { + "epoch": 1.5834270494229061, + "grad_norm": 1.8426966667175293, + "learning_rate": 9.578211776699527e-06, + "loss": 0.2133, + "step": 3344 + }, + { + "epoch": 1.5839005622965374, + "grad_norm": 1.2378004789352417, + "learning_rate": 9.57310228156748e-06, + "loss": 0.236, + "step": 3345 + }, + { + "epoch": 1.5843740751701687, + "grad_norm": 1.0932939052581787, + "learning_rate": 9.567992898086415e-06, + "loss": 0.2039, + "step": 3346 + }, + { + "epoch": 1.5848475880438, + "grad_norm": 1.0567042827606201, + "learning_rate": 9.56288362759264e-06, + "loss": 0.1963, + "step": 3347 + }, + { + "epoch": 1.5853211009174313, + "grad_norm": 1.230892300605774, + "learning_rate": 9.557774471422434e-06, + "loss": 0.2288, + "step": 3348 + }, + { + "epoch": 1.5857946137910623, + "grad_norm": 1.4192466735839844, + "learning_rate": 9.552665430912049e-06, + "loss": 0.2143, + "step": 3349 + }, + { + "epoch": 1.5862681266646939, + "grad_norm": 1.3479020595550537, + "learning_rate": 9.547556507397705e-06, + "loss": 0.2058, + "step": 3350 + }, + { + "epoch": 1.586741639538325, + "grad_norm": 1.4939135313034058, + "learning_rate": 9.542447702215596e-06, + "loss": 0.2394, + "step": 3351 + }, + { + "epoch": 1.5872151524119562, + "grad_norm": 1.3415539264678955, + "learning_rate": 9.537339016701871e-06, + "loss": 0.2256, + "step": 3352 + }, + { + "epoch": 1.5876886652855875, + "grad_norm": 1.2717926502227783, + "learning_rate": 9.532230452192666e-06, + "loss": 0.2284, + "step": 3353 + }, + { + "epoch": 1.5881621781592186, + "grad_norm": 1.1318105459213257, + "learning_rate": 9.527122010024072e-06, + "loss": 0.2142, + "step": 3354 + }, + { + "epoch": 1.58863569103285, + "grad_norm": 1.1898558139801025, + "learning_rate": 9.522013691532154e-06, + "loss": 0.2224, + "step": 3355 + }, + { + "epoch": 1.5891092039064811, + "grad_norm": 1.3306890726089478, + "learning_rate": 9.516905498052944e-06, + "loss": 0.2193, + "step": 3356 + }, + { + "epoch": 1.5895827167801124, + "grad_norm": 1.2777800559997559, + "learning_rate": 9.51179743092244e-06, + "loss": 0.2109, + "step": 3357 + }, + { + "epoch": 1.5900562296537437, + "grad_norm": 1.1809998750686646, + "learning_rate": 9.50668949147661e-06, + "loss": 0.2307, + "step": 3358 + }, + { + "epoch": 1.590529742527375, + "grad_norm": 1.5275858640670776, + "learning_rate": 9.50158168105138e-06, + "loss": 0.2074, + "step": 3359 + }, + { + "epoch": 1.5910032554010063, + "grad_norm": 1.153929352760315, + "learning_rate": 9.496474000982657e-06, + "loss": 0.1873, + "step": 3360 + }, + { + "epoch": 1.5914767682746374, + "grad_norm": 1.1171983480453491, + "learning_rate": 9.4913664526063e-06, + "loss": 0.2069, + "step": 3361 + }, + { + "epoch": 1.5919502811482689, + "grad_norm": 1.2041152715682983, + "learning_rate": 9.48625903725814e-06, + "loss": 0.2075, + "step": 3362 + }, + { + "epoch": 1.5924237940219, + "grad_norm": 2.3762893676757812, + "learning_rate": 9.481151756273976e-06, + "loss": 0.2169, + "step": 3363 + }, + { + "epoch": 1.5928973068955312, + "grad_norm": 3.4273829460144043, + "learning_rate": 9.476044610989562e-06, + "loss": 0.2338, + "step": 3364 + }, + { + "epoch": 1.5933708197691625, + "grad_norm": 0.9142179489135742, + "learning_rate": 9.470937602740624e-06, + "loss": 0.216, + "step": 3365 + }, + { + "epoch": 1.5938443326427936, + "grad_norm": 1.2260160446166992, + "learning_rate": 9.465830732862857e-06, + "loss": 0.2242, + "step": 3366 + }, + { + "epoch": 1.594317845516425, + "grad_norm": 1.3920302391052246, + "learning_rate": 9.460724002691906e-06, + "loss": 0.216, + "step": 3367 + }, + { + "epoch": 1.5947913583900561, + "grad_norm": 0.9747047424316406, + "learning_rate": 9.455617413563389e-06, + "loss": 0.2445, + "step": 3368 + }, + { + "epoch": 1.5952648712636874, + "grad_norm": 1.1095210313796997, + "learning_rate": 9.450510966812885e-06, + "loss": 0.1813, + "step": 3369 + }, + { + "epoch": 1.5957383841373187, + "grad_norm": 1.0806443691253662, + "learning_rate": 9.445404663775938e-06, + "loss": 0.2429, + "step": 3370 + }, + { + "epoch": 1.59621189701095, + "grad_norm": 1.3196625709533691, + "learning_rate": 9.44029850578805e-06, + "loss": 0.244, + "step": 3371 + }, + { + "epoch": 1.5966854098845813, + "grad_norm": 2.6262784004211426, + "learning_rate": 9.435192494184689e-06, + "loss": 0.2116, + "step": 3372 + }, + { + "epoch": 1.5971589227582124, + "grad_norm": 1.1327464580535889, + "learning_rate": 9.43008663030128e-06, + "loss": 0.1857, + "step": 3373 + }, + { + "epoch": 1.5976324356318439, + "grad_norm": 0.9777623414993286, + "learning_rate": 9.424980915473217e-06, + "loss": 0.2091, + "step": 3374 + }, + { + "epoch": 1.598105948505475, + "grad_norm": 1.3106697797775269, + "learning_rate": 9.419875351035848e-06, + "loss": 0.213, + "step": 3375 + }, + { + "epoch": 1.5985794613791062, + "grad_norm": 1.0573484897613525, + "learning_rate": 9.414769938324487e-06, + "loss": 0.2028, + "step": 3376 + }, + { + "epoch": 1.5990529742527375, + "grad_norm": 1.3696736097335815, + "learning_rate": 9.4096646786744e-06, + "loss": 0.2101, + "step": 3377 + }, + { + "epoch": 1.5995264871263688, + "grad_norm": 1.2642654180526733, + "learning_rate": 9.404559573420822e-06, + "loss": 0.2352, + "step": 3378 + }, + { + "epoch": 1.6, + "grad_norm": 1.4823346138000488, + "learning_rate": 9.399454623898942e-06, + "loss": 0.2035, + "step": 3379 + }, + { + "epoch": 1.6004735128736312, + "grad_norm": 1.1810314655303955, + "learning_rate": 9.394349831443912e-06, + "loss": 0.2161, + "step": 3380 + }, + { + "epoch": 1.6009470257472627, + "grad_norm": 1.377890706062317, + "learning_rate": 9.389245197390842e-06, + "loss": 0.2498, + "step": 3381 + }, + { + "epoch": 1.6014205386208937, + "grad_norm": 1.6414611339569092, + "learning_rate": 9.384140723074796e-06, + "loss": 0.2296, + "step": 3382 + }, + { + "epoch": 1.601894051494525, + "grad_norm": 1.2565416097640991, + "learning_rate": 9.379036409830804e-06, + "loss": 0.2056, + "step": 3383 + }, + { + "epoch": 1.6023675643681563, + "grad_norm": 1.1565498113632202, + "learning_rate": 9.37393225899385e-06, + "loss": 0.2115, + "step": 3384 + }, + { + "epoch": 1.6028410772417874, + "grad_norm": 1.0674059391021729, + "learning_rate": 9.368828271898874e-06, + "loss": 0.2103, + "step": 3385 + }, + { + "epoch": 1.6033145901154189, + "grad_norm": 0.9259766340255737, + "learning_rate": 9.363724449880773e-06, + "loss": 0.1965, + "step": 3386 + }, + { + "epoch": 1.60378810298905, + "grad_norm": 1.08977472782135, + "learning_rate": 9.358620794274404e-06, + "loss": 0.1933, + "step": 3387 + }, + { + "epoch": 1.6042616158626812, + "grad_norm": 1.5023025274276733, + "learning_rate": 9.35351730641458e-06, + "loss": 0.2293, + "step": 3388 + }, + { + "epoch": 1.6047351287363125, + "grad_norm": 1.0194486379623413, + "learning_rate": 9.348413987636065e-06, + "loss": 0.2344, + "step": 3389 + }, + { + "epoch": 1.6052086416099438, + "grad_norm": 2.0349748134613037, + "learning_rate": 9.343310839273587e-06, + "loss": 0.2097, + "step": 3390 + }, + { + "epoch": 1.605682154483575, + "grad_norm": 0.9003639221191406, + "learning_rate": 9.338207862661824e-06, + "loss": 0.2193, + "step": 3391 + }, + { + "epoch": 1.6061556673572062, + "grad_norm": 1.3183727264404297, + "learning_rate": 9.33310505913541e-06, + "loss": 0.2418, + "step": 3392 + }, + { + "epoch": 1.6066291802308377, + "grad_norm": 1.593930721282959, + "learning_rate": 9.328002430028932e-06, + "loss": 0.1988, + "step": 3393 + }, + { + "epoch": 1.6071026931044687, + "grad_norm": 1.2510972023010254, + "learning_rate": 9.322899976676938e-06, + "loss": 0.201, + "step": 3394 + }, + { + "epoch": 1.6075762059781, + "grad_norm": 1.210264801979065, + "learning_rate": 9.317797700413925e-06, + "loss": 0.2076, + "step": 3395 + }, + { + "epoch": 1.6080497188517313, + "grad_norm": 1.0849723815917969, + "learning_rate": 9.31269560257434e-06, + "loss": 0.2217, + "step": 3396 + }, + { + "epoch": 1.6085232317253624, + "grad_norm": 1.2907907962799072, + "learning_rate": 9.307593684492588e-06, + "loss": 0.2071, + "step": 3397 + }, + { + "epoch": 1.6089967445989939, + "grad_norm": 1.6149160861968994, + "learning_rate": 9.302491947503027e-06, + "loss": 0.2145, + "step": 3398 + }, + { + "epoch": 1.609470257472625, + "grad_norm": 1.4612150192260742, + "learning_rate": 9.29739039293997e-06, + "loss": 0.2184, + "step": 3399 + }, + { + "epoch": 1.6099437703462562, + "grad_norm": 1.7195252180099487, + "learning_rate": 9.292289022137678e-06, + "loss": 0.1968, + "step": 3400 + }, + { + "epoch": 1.6104172832198875, + "grad_norm": 2.2150564193725586, + "learning_rate": 9.287187836430366e-06, + "loss": 0.2549, + "step": 3401 + }, + { + "epoch": 1.6108907960935188, + "grad_norm": 1.1368675231933594, + "learning_rate": 9.282086837152198e-06, + "loss": 0.2028, + "step": 3402 + }, + { + "epoch": 1.61136430896715, + "grad_norm": 1.2357794046401978, + "learning_rate": 9.276986025637291e-06, + "loss": 0.2165, + "step": 3403 + }, + { + "epoch": 1.6118378218407812, + "grad_norm": 1.7708053588867188, + "learning_rate": 9.271885403219715e-06, + "loss": 0.1969, + "step": 3404 + }, + { + "epoch": 1.6123113347144127, + "grad_norm": 1.0533392429351807, + "learning_rate": 9.266784971233487e-06, + "loss": 0.2193, + "step": 3405 + }, + { + "epoch": 1.6127848475880437, + "grad_norm": 2.6355881690979004, + "learning_rate": 9.261684731012575e-06, + "loss": 0.201, + "step": 3406 + }, + { + "epoch": 1.613258360461675, + "grad_norm": 1.196489930152893, + "learning_rate": 9.256584683890902e-06, + "loss": 0.2151, + "step": 3407 + }, + { + "epoch": 1.6137318733353063, + "grad_norm": 1.8537029027938843, + "learning_rate": 9.25148483120233e-06, + "loss": 0.2343, + "step": 3408 + }, + { + "epoch": 1.6142053862089376, + "grad_norm": 1.2622405290603638, + "learning_rate": 9.24638517428068e-06, + "loss": 0.2154, + "step": 3409 + }, + { + "epoch": 1.614678899082569, + "grad_norm": 1.1913985013961792, + "learning_rate": 9.24128571445972e-06, + "loss": 0.2275, + "step": 3410 + }, + { + "epoch": 1.6151524119562, + "grad_norm": 1.129180669784546, + "learning_rate": 9.236186453073161e-06, + "loss": 0.208, + "step": 3411 + }, + { + "epoch": 1.6156259248298315, + "grad_norm": 1.8293282985687256, + "learning_rate": 9.231087391454665e-06, + "loss": 0.2184, + "step": 3412 + }, + { + "epoch": 1.6160994377034625, + "grad_norm": 1.8663169145584106, + "learning_rate": 9.225988530937846e-06, + "loss": 0.2305, + "step": 3413 + }, + { + "epoch": 1.6165729505770938, + "grad_norm": 1.2646862268447876, + "learning_rate": 9.220889872856258e-06, + "loss": 0.2168, + "step": 3414 + }, + { + "epoch": 1.6170464634507251, + "grad_norm": 1.4086544513702393, + "learning_rate": 9.215791418543407e-06, + "loss": 0.2109, + "step": 3415 + }, + { + "epoch": 1.6175199763243562, + "grad_norm": 1.7981756925582886, + "learning_rate": 9.210693169332746e-06, + "loss": 0.2098, + "step": 3416 + }, + { + "epoch": 1.6179934891979877, + "grad_norm": 1.776749849319458, + "learning_rate": 9.205595126557673e-06, + "loss": 0.2017, + "step": 3417 + }, + { + "epoch": 1.6184670020716188, + "grad_norm": 1.2278921604156494, + "learning_rate": 9.200497291551528e-06, + "loss": 0.2068, + "step": 3418 + }, + { + "epoch": 1.61894051494525, + "grad_norm": 0.9366362690925598, + "learning_rate": 9.195399665647607e-06, + "loss": 0.1798, + "step": 3419 + }, + { + "epoch": 1.6194140278188813, + "grad_norm": 1.8072518110275269, + "learning_rate": 9.190302250179141e-06, + "loss": 0.2245, + "step": 3420 + }, + { + "epoch": 1.6198875406925126, + "grad_norm": 1.4255504608154297, + "learning_rate": 9.185205046479308e-06, + "loss": 0.2082, + "step": 3421 + }, + { + "epoch": 1.620361053566144, + "grad_norm": 1.2994076013565063, + "learning_rate": 9.180108055881236e-06, + "loss": 0.2107, + "step": 3422 + }, + { + "epoch": 1.620834566439775, + "grad_norm": 1.6966938972473145, + "learning_rate": 9.175011279717992e-06, + "loss": 0.2548, + "step": 3423 + }, + { + "epoch": 1.6213080793134065, + "grad_norm": 1.2594822645187378, + "learning_rate": 9.169914719322588e-06, + "loss": 0.2031, + "step": 3424 + }, + { + "epoch": 1.6217815921870375, + "grad_norm": 1.0965392589569092, + "learning_rate": 9.164818376027981e-06, + "loss": 0.2233, + "step": 3425 + }, + { + "epoch": 1.6222551050606688, + "grad_norm": 1.0597667694091797, + "learning_rate": 9.159722251167073e-06, + "loss": 0.2034, + "step": 3426 + }, + { + "epoch": 1.6227286179343001, + "grad_norm": 1.3505845069885254, + "learning_rate": 9.154626346072702e-06, + "loss": 0.2138, + "step": 3427 + }, + { + "epoch": 1.6232021308079312, + "grad_norm": 0.9662488102912903, + "learning_rate": 9.149530662077655e-06, + "loss": 0.1981, + "step": 3428 + }, + { + "epoch": 1.6236756436815627, + "grad_norm": 0.9503706693649292, + "learning_rate": 9.144435200514658e-06, + "loss": 0.2161, + "step": 3429 + }, + { + "epoch": 1.6241491565551938, + "grad_norm": 1.316527247428894, + "learning_rate": 9.139339962716383e-06, + "loss": 0.188, + "step": 3430 + }, + { + "epoch": 1.624622669428825, + "grad_norm": 1.0049852132797241, + "learning_rate": 9.134244950015437e-06, + "loss": 0.2057, + "step": 3431 + }, + { + "epoch": 1.6250961823024563, + "grad_norm": 1.619187355041504, + "learning_rate": 9.129150163744371e-06, + "loss": 0.2018, + "step": 3432 + }, + { + "epoch": 1.6255696951760876, + "grad_norm": 1.270595908164978, + "learning_rate": 9.12405560523568e-06, + "loss": 0.1814, + "step": 3433 + }, + { + "epoch": 1.626043208049719, + "grad_norm": 1.6350579261779785, + "learning_rate": 9.118961275821792e-06, + "loss": 0.2457, + "step": 3434 + }, + { + "epoch": 1.62651672092335, + "grad_norm": 2.1566319465637207, + "learning_rate": 9.113867176835086e-06, + "loss": 0.1939, + "step": 3435 + }, + { + "epoch": 1.6269902337969815, + "grad_norm": 1.6590288877487183, + "learning_rate": 9.10877330960787e-06, + "loss": 0.2094, + "step": 3436 + }, + { + "epoch": 1.6274637466706126, + "grad_norm": 1.642244577407837, + "learning_rate": 9.103679675472395e-06, + "loss": 0.2082, + "step": 3437 + }, + { + "epoch": 1.6279372595442438, + "grad_norm": 1.484878659248352, + "learning_rate": 9.098586275760854e-06, + "loss": 0.2094, + "step": 3438 + }, + { + "epoch": 1.6284107724178751, + "grad_norm": 1.3303133249282837, + "learning_rate": 9.093493111805379e-06, + "loss": 0.2161, + "step": 3439 + }, + { + "epoch": 1.6288842852915064, + "grad_norm": 1.818798542022705, + "learning_rate": 9.088400184938036e-06, + "loss": 0.2261, + "step": 3440 + }, + { + "epoch": 1.6293577981651377, + "grad_norm": 1.2942332029342651, + "learning_rate": 9.08330749649083e-06, + "loss": 0.2272, + "step": 3441 + }, + { + "epoch": 1.6298313110387688, + "grad_norm": 1.2421215772628784, + "learning_rate": 9.078215047795703e-06, + "loss": 0.2111, + "step": 3442 + }, + { + "epoch": 1.6303048239124003, + "grad_norm": 1.215605616569519, + "learning_rate": 9.073122840184537e-06, + "loss": 0.2487, + "step": 3443 + }, + { + "epoch": 1.6307783367860313, + "grad_norm": 1.1501661539077759, + "learning_rate": 9.068030874989152e-06, + "loss": 0.2214, + "step": 3444 + }, + { + "epoch": 1.6312518496596626, + "grad_norm": 1.0269571542739868, + "learning_rate": 9.062939153541302e-06, + "loss": 0.2149, + "step": 3445 + }, + { + "epoch": 1.631725362533294, + "grad_norm": 1.087510108947754, + "learning_rate": 9.057847677172675e-06, + "loss": 0.2047, + "step": 3446 + }, + { + "epoch": 1.632198875406925, + "grad_norm": 1.1288443803787231, + "learning_rate": 9.052756447214899e-06, + "loss": 0.2051, + "step": 3447 + }, + { + "epoch": 1.6326723882805565, + "grad_norm": 0.9681136608123779, + "learning_rate": 9.047665464999537e-06, + "loss": 0.2058, + "step": 3448 + }, + { + "epoch": 1.6331459011541876, + "grad_norm": 1.3318946361541748, + "learning_rate": 9.042574731858084e-06, + "loss": 0.224, + "step": 3449 + }, + { + "epoch": 1.6336194140278189, + "grad_norm": 1.4043301343917847, + "learning_rate": 9.037484249121974e-06, + "loss": 0.2254, + "step": 3450 + }, + { + "epoch": 1.6340929269014501, + "grad_norm": 1.0534493923187256, + "learning_rate": 9.032394018122572e-06, + "loss": 0.217, + "step": 3451 + }, + { + "epoch": 1.6345664397750814, + "grad_norm": 1.055869221687317, + "learning_rate": 9.027304040191181e-06, + "loss": 0.2174, + "step": 3452 + }, + { + "epoch": 1.6350399526487127, + "grad_norm": 0.9580625891685486, + "learning_rate": 9.022214316659035e-06, + "loss": 0.2125, + "step": 3453 + }, + { + "epoch": 1.6355134655223438, + "grad_norm": 1.0676701068878174, + "learning_rate": 9.0171248488573e-06, + "loss": 0.2094, + "step": 3454 + }, + { + "epoch": 1.6359869783959753, + "grad_norm": 1.6258785724639893, + "learning_rate": 9.012035638117082e-06, + "loss": 0.2406, + "step": 3455 + }, + { + "epoch": 1.6364604912696064, + "grad_norm": 1.652103304862976, + "learning_rate": 9.006946685769408e-06, + "loss": 0.1958, + "step": 3456 + }, + { + "epoch": 1.6369340041432376, + "grad_norm": 1.625702977180481, + "learning_rate": 9.001857993145251e-06, + "loss": 0.2426, + "step": 3457 + }, + { + "epoch": 1.637407517016869, + "grad_norm": 1.5374822616577148, + "learning_rate": 8.996769561575504e-06, + "loss": 0.2269, + "step": 3458 + }, + { + "epoch": 1.6378810298905, + "grad_norm": 1.0075321197509766, + "learning_rate": 8.991681392391001e-06, + "loss": 0.1967, + "step": 3459 + }, + { + "epoch": 1.6383545427641315, + "grad_norm": 1.2052757740020752, + "learning_rate": 8.986593486922504e-06, + "loss": 0.1997, + "step": 3460 + }, + { + "epoch": 1.6388280556377626, + "grad_norm": 1.0683456659317017, + "learning_rate": 8.981505846500703e-06, + "loss": 0.2165, + "step": 3461 + }, + { + "epoch": 1.6393015685113939, + "grad_norm": 1.3364335298538208, + "learning_rate": 8.976418472456222e-06, + "loss": 0.2098, + "step": 3462 + }, + { + "epoch": 1.6397750813850251, + "grad_norm": 1.8314017057418823, + "learning_rate": 8.971331366119613e-06, + "loss": 0.2215, + "step": 3463 + }, + { + "epoch": 1.6402485942586564, + "grad_norm": 1.2816479206085205, + "learning_rate": 8.966244528821366e-06, + "loss": 0.2137, + "step": 3464 + }, + { + "epoch": 1.6407221071322877, + "grad_norm": 1.2321319580078125, + "learning_rate": 8.961157961891886e-06, + "loss": 0.2345, + "step": 3465 + }, + { + "epoch": 1.6411956200059188, + "grad_norm": 1.0520602464675903, + "learning_rate": 8.95607166666152e-06, + "loss": 0.2141, + "step": 3466 + }, + { + "epoch": 1.6416691328795503, + "grad_norm": 1.592511773109436, + "learning_rate": 8.950985644460539e-06, + "loss": 0.2279, + "step": 3467 + }, + { + "epoch": 1.6421426457531814, + "grad_norm": 1.1186596155166626, + "learning_rate": 8.945899896619143e-06, + "loss": 0.2285, + "step": 3468 + }, + { + "epoch": 1.6426161586268127, + "grad_norm": 1.4300451278686523, + "learning_rate": 8.940814424467457e-06, + "loss": 0.2494, + "step": 3469 + }, + { + "epoch": 1.643089671500444, + "grad_norm": 1.1781500577926636, + "learning_rate": 8.935729229335544e-06, + "loss": 0.2253, + "step": 3470 + }, + { + "epoch": 1.6435631843740752, + "grad_norm": 1.3187671899795532, + "learning_rate": 8.930644312553381e-06, + "loss": 0.2244, + "step": 3471 + }, + { + "epoch": 1.6440366972477065, + "grad_norm": 1.1596239805221558, + "learning_rate": 8.925559675450883e-06, + "loss": 0.2193, + "step": 3472 + }, + { + "epoch": 1.6445102101213376, + "grad_norm": 2.0229430198669434, + "learning_rate": 8.920475319357886e-06, + "loss": 0.2182, + "step": 3473 + }, + { + "epoch": 1.644983722994969, + "grad_norm": 1.2594224214553833, + "learning_rate": 8.915391245604159e-06, + "loss": 0.2311, + "step": 3474 + }, + { + "epoch": 1.6454572358686002, + "grad_norm": 1.5540090799331665, + "learning_rate": 8.910307455519385e-06, + "loss": 0.2034, + "step": 3475 + }, + { + "epoch": 1.6459307487422314, + "grad_norm": 1.0850417613983154, + "learning_rate": 8.905223950433178e-06, + "loss": 0.1913, + "step": 3476 + }, + { + "epoch": 1.6464042616158627, + "grad_norm": 1.3703140020370483, + "learning_rate": 8.90014073167509e-06, + "loss": 0.2108, + "step": 3477 + }, + { + "epoch": 1.6468777744894938, + "grad_norm": 1.050058364868164, + "learning_rate": 8.895057800574584e-06, + "loss": 0.2306, + "step": 3478 + }, + { + "epoch": 1.6473512873631253, + "grad_norm": 1.3480294942855835, + "learning_rate": 8.889975158461051e-06, + "loss": 0.202, + "step": 3479 + }, + { + "epoch": 1.6478248002367564, + "grad_norm": 1.2146999835968018, + "learning_rate": 8.884892806663808e-06, + "loss": 0.198, + "step": 3480 + }, + { + "epoch": 1.6482983131103877, + "grad_norm": 1.6478931903839111, + "learning_rate": 8.879810746512091e-06, + "loss": 0.2188, + "step": 3481 + }, + { + "epoch": 1.648771825984019, + "grad_norm": 1.0835014581680298, + "learning_rate": 8.874728979335069e-06, + "loss": 0.2076, + "step": 3482 + }, + { + "epoch": 1.6492453388576502, + "grad_norm": 1.0451796054840088, + "learning_rate": 8.869647506461823e-06, + "loss": 0.221, + "step": 3483 + }, + { + "epoch": 1.6497188517312815, + "grad_norm": 1.0795484781265259, + "learning_rate": 8.86456632922137e-06, + "loss": 0.2048, + "step": 3484 + }, + { + "epoch": 1.6501923646049126, + "grad_norm": 1.3780536651611328, + "learning_rate": 8.85948544894264e-06, + "loss": 0.2048, + "step": 3485 + }, + { + "epoch": 1.650665877478544, + "grad_norm": 1.2751306295394897, + "learning_rate": 8.854404866954485e-06, + "loss": 0.2238, + "step": 3486 + }, + { + "epoch": 1.6511393903521752, + "grad_norm": 1.0825250148773193, + "learning_rate": 8.84932458458569e-06, + "loss": 0.2076, + "step": 3487 + }, + { + "epoch": 1.6516129032258065, + "grad_norm": 1.1926277875900269, + "learning_rate": 8.844244603164946e-06, + "loss": 0.2038, + "step": 3488 + }, + { + "epoch": 1.6520864160994377, + "grad_norm": 1.0996829271316528, + "learning_rate": 8.839164924020878e-06, + "loss": 0.1927, + "step": 3489 + }, + { + "epoch": 1.6525599289730688, + "grad_norm": 0.9266497492790222, + "learning_rate": 8.834085548482024e-06, + "loss": 0.2048, + "step": 3490 + }, + { + "epoch": 1.6530334418467003, + "grad_norm": 1.086111068725586, + "learning_rate": 8.829006477876847e-06, + "loss": 0.2219, + "step": 3491 + }, + { + "epoch": 1.6535069547203314, + "grad_norm": 1.0157992839813232, + "learning_rate": 8.82392771353373e-06, + "loss": 0.1998, + "step": 3492 + }, + { + "epoch": 1.6539804675939627, + "grad_norm": 1.2996865510940552, + "learning_rate": 8.818849256780972e-06, + "loss": 0.2155, + "step": 3493 + }, + { + "epoch": 1.654453980467594, + "grad_norm": 1.0794087648391724, + "learning_rate": 8.813771108946798e-06, + "loss": 0.208, + "step": 3494 + }, + { + "epoch": 1.6549274933412252, + "grad_norm": 1.0981913805007935, + "learning_rate": 8.808693271359346e-06, + "loss": 0.2074, + "step": 3495 + }, + { + "epoch": 1.6554010062148565, + "grad_norm": 1.7508763074874878, + "learning_rate": 8.803615745346675e-06, + "loss": 0.238, + "step": 3496 + }, + { + "epoch": 1.6558745190884876, + "grad_norm": 1.0915553569793701, + "learning_rate": 8.798538532236764e-06, + "loss": 0.2166, + "step": 3497 + }, + { + "epoch": 1.656348031962119, + "grad_norm": 0.9217873811721802, + "learning_rate": 8.79346163335751e-06, + "loss": 0.1957, + "step": 3498 + }, + { + "epoch": 1.6568215448357502, + "grad_norm": 2.03841233253479, + "learning_rate": 8.78838505003673e-06, + "loss": 0.2089, + "step": 3499 + }, + { + "epoch": 1.6572950577093815, + "grad_norm": 1.8650535345077515, + "learning_rate": 8.783308783602148e-06, + "loss": 0.188, + "step": 3500 + }, + { + "epoch": 1.6577685705830127, + "grad_norm": 0.9951745867729187, + "learning_rate": 8.778232835381415e-06, + "loss": 0.1941, + "step": 3501 + }, + { + "epoch": 1.6582420834566438, + "grad_norm": 1.168714165687561, + "learning_rate": 8.773157206702097e-06, + "loss": 0.2112, + "step": 3502 + }, + { + "epoch": 1.6587155963302753, + "grad_norm": 1.0129159688949585, + "learning_rate": 8.768081898891679e-06, + "loss": 0.2112, + "step": 3503 + }, + { + "epoch": 1.6591891092039064, + "grad_norm": 1.5583324432373047, + "learning_rate": 8.763006913277553e-06, + "loss": 0.2165, + "step": 3504 + }, + { + "epoch": 1.659662622077538, + "grad_norm": 1.3734880685806274, + "learning_rate": 8.757932251187037e-06, + "loss": 0.2241, + "step": 3505 + }, + { + "epoch": 1.660136134951169, + "grad_norm": 1.3286396265029907, + "learning_rate": 8.752857913947357e-06, + "loss": 0.2349, + "step": 3506 + }, + { + "epoch": 1.6606096478248003, + "grad_norm": 1.3131154775619507, + "learning_rate": 8.747783902885657e-06, + "loss": 0.2454, + "step": 3507 + }, + { + "epoch": 1.6610831606984315, + "grad_norm": 1.0709985494613647, + "learning_rate": 8.742710219329e-06, + "loss": 0.225, + "step": 3508 + }, + { + "epoch": 1.6615566735720626, + "grad_norm": 1.4955830574035645, + "learning_rate": 8.737636864604357e-06, + "loss": 0.1989, + "step": 3509 + }, + { + "epoch": 1.6620301864456941, + "grad_norm": 1.0613009929656982, + "learning_rate": 8.73256384003861e-06, + "loss": 0.2329, + "step": 3510 + }, + { + "epoch": 1.6625036993193252, + "grad_norm": 1.2073135375976562, + "learning_rate": 8.727491146958566e-06, + "loss": 0.2259, + "step": 3511 + }, + { + "epoch": 1.6629772121929565, + "grad_norm": 1.160805583000183, + "learning_rate": 8.722418786690936e-06, + "loss": 0.2198, + "step": 3512 + }, + { + "epoch": 1.6634507250665878, + "grad_norm": 1.5360057353973389, + "learning_rate": 8.717346760562345e-06, + "loss": 0.2242, + "step": 3513 + }, + { + "epoch": 1.663924237940219, + "grad_norm": 1.0685076713562012, + "learning_rate": 8.712275069899337e-06, + "loss": 0.2136, + "step": 3514 + }, + { + "epoch": 1.6643977508138503, + "grad_norm": 1.6072028875350952, + "learning_rate": 8.707203716028358e-06, + "loss": 0.2215, + "step": 3515 + }, + { + "epoch": 1.6648712636874814, + "grad_norm": 1.209231972694397, + "learning_rate": 8.702132700275778e-06, + "loss": 0.2225, + "step": 3516 + }, + { + "epoch": 1.665344776561113, + "grad_norm": 1.0955963134765625, + "learning_rate": 8.697062023967869e-06, + "loss": 0.2164, + "step": 3517 + }, + { + "epoch": 1.665818289434744, + "grad_norm": 0.9355467557907104, + "learning_rate": 8.691991688430818e-06, + "loss": 0.2106, + "step": 3518 + }, + { + "epoch": 1.6662918023083753, + "grad_norm": 0.9584832787513733, + "learning_rate": 8.686921694990724e-06, + "loss": 0.2093, + "step": 3519 + }, + { + "epoch": 1.6667653151820065, + "grad_norm": 1.7122430801391602, + "learning_rate": 8.681852044973591e-06, + "loss": 0.2166, + "step": 3520 + }, + { + "epoch": 1.6672388280556376, + "grad_norm": 1.5137004852294922, + "learning_rate": 8.67678273970534e-06, + "loss": 0.2199, + "step": 3521 + }, + { + "epoch": 1.6677123409292691, + "grad_norm": 1.2796556949615479, + "learning_rate": 8.671713780511798e-06, + "loss": 0.2053, + "step": 3522 + }, + { + "epoch": 1.6681858538029002, + "grad_norm": 1.1300054788589478, + "learning_rate": 8.666645168718705e-06, + "loss": 0.2256, + "step": 3523 + }, + { + "epoch": 1.6686593666765315, + "grad_norm": 1.094119906425476, + "learning_rate": 8.661576905651704e-06, + "loss": 0.2042, + "step": 3524 + }, + { + "epoch": 1.6691328795501628, + "grad_norm": 1.1889249086380005, + "learning_rate": 8.656508992636352e-06, + "loss": 0.2144, + "step": 3525 + }, + { + "epoch": 1.669606392423794, + "grad_norm": 0.9747273325920105, + "learning_rate": 8.65144143099811e-06, + "loss": 0.1979, + "step": 3526 + }, + { + "epoch": 1.6700799052974253, + "grad_norm": 1.1570683717727661, + "learning_rate": 8.646374222062352e-06, + "loss": 0.2429, + "step": 3527 + }, + { + "epoch": 1.6705534181710564, + "grad_norm": 1.3277277946472168, + "learning_rate": 8.641307367154356e-06, + "loss": 0.2064, + "step": 3528 + }, + { + "epoch": 1.671026931044688, + "grad_norm": 1.3713091611862183, + "learning_rate": 8.636240867599314e-06, + "loss": 0.2051, + "step": 3529 + }, + { + "epoch": 1.671500443918319, + "grad_norm": 1.2886407375335693, + "learning_rate": 8.631174724722315e-06, + "loss": 0.2237, + "step": 3530 + }, + { + "epoch": 1.6719739567919503, + "grad_norm": 1.255431056022644, + "learning_rate": 8.626108939848362e-06, + "loss": 0.2136, + "step": 3531 + }, + { + "epoch": 1.6724474696655816, + "grad_norm": 1.0875186920166016, + "learning_rate": 8.621043514302361e-06, + "loss": 0.1884, + "step": 3532 + }, + { + "epoch": 1.6729209825392126, + "grad_norm": 1.1179817914962769, + "learning_rate": 8.615978449409124e-06, + "loss": 0.2243, + "step": 3533 + }, + { + "epoch": 1.6733944954128441, + "grad_norm": 0.9758694171905518, + "learning_rate": 8.610913746493377e-06, + "loss": 0.2246, + "step": 3534 + }, + { + "epoch": 1.6738680082864752, + "grad_norm": 1.0491384267807007, + "learning_rate": 8.605849406879736e-06, + "loss": 0.241, + "step": 3535 + }, + { + "epoch": 1.6743415211601067, + "grad_norm": 1.1519798040390015, + "learning_rate": 8.600785431892731e-06, + "loss": 0.2076, + "step": 3536 + }, + { + "epoch": 1.6748150340337378, + "grad_norm": 1.3320951461791992, + "learning_rate": 8.5957218228568e-06, + "loss": 0.2107, + "step": 3537 + }, + { + "epoch": 1.675288546907369, + "grad_norm": 1.127048134803772, + "learning_rate": 8.590658581096278e-06, + "loss": 0.2077, + "step": 3538 + }, + { + "epoch": 1.6757620597810003, + "grad_norm": 1.4201406240463257, + "learning_rate": 8.585595707935408e-06, + "loss": 0.2116, + "step": 3539 + }, + { + "epoch": 1.6762355726546314, + "grad_norm": 1.774510145187378, + "learning_rate": 8.580533204698336e-06, + "loss": 0.2003, + "step": 3540 + }, + { + "epoch": 1.676709085528263, + "grad_norm": 1.9603132009506226, + "learning_rate": 8.57547107270911e-06, + "loss": 0.2353, + "step": 3541 + }, + { + "epoch": 1.677182598401894, + "grad_norm": 1.5556761026382446, + "learning_rate": 8.570409313291683e-06, + "loss": 0.2191, + "step": 3542 + }, + { + "epoch": 1.6776561112755253, + "grad_norm": 1.459722638130188, + "learning_rate": 8.56534792776991e-06, + "loss": 0.1919, + "step": 3543 + }, + { + "epoch": 1.6781296241491566, + "grad_norm": 1.822821021080017, + "learning_rate": 8.560286917467543e-06, + "loss": 0.2449, + "step": 3544 + }, + { + "epoch": 1.6786031370227879, + "grad_norm": 1.4285695552825928, + "learning_rate": 8.555226283708246e-06, + "loss": 0.1897, + "step": 3545 + }, + { + "epoch": 1.6790766498964191, + "grad_norm": 1.1042506694793701, + "learning_rate": 8.550166027815576e-06, + "loss": 0.2184, + "step": 3546 + }, + { + "epoch": 1.6795501627700502, + "grad_norm": 0.9724579453468323, + "learning_rate": 8.545106151112994e-06, + "loss": 0.2061, + "step": 3547 + }, + { + "epoch": 1.6800236756436817, + "grad_norm": 1.1628752946853638, + "learning_rate": 8.540046654923863e-06, + "loss": 0.1937, + "step": 3548 + }, + { + "epoch": 1.6804971885173128, + "grad_norm": 1.408586025238037, + "learning_rate": 8.534987540571445e-06, + "loss": 0.2233, + "step": 3549 + }, + { + "epoch": 1.680970701390944, + "grad_norm": 1.0293395519256592, + "learning_rate": 8.529928809378902e-06, + "loss": 0.2141, + "step": 3550 + }, + { + "epoch": 1.6814442142645754, + "grad_norm": 2.2602131366729736, + "learning_rate": 8.524870462669296e-06, + "loss": 0.2243, + "step": 3551 + }, + { + "epoch": 1.6819177271382064, + "grad_norm": 1.1277772188186646, + "learning_rate": 8.519812501765591e-06, + "loss": 0.2004, + "step": 3552 + }, + { + "epoch": 1.682391240011838, + "grad_norm": 1.7212578058242798, + "learning_rate": 8.514754927990646e-06, + "loss": 0.1934, + "step": 3553 + }, + { + "epoch": 1.682864752885469, + "grad_norm": 1.1089184284210205, + "learning_rate": 8.509697742667219e-06, + "loss": 0.2354, + "step": 3554 + }, + { + "epoch": 1.6833382657591003, + "grad_norm": 1.5018812417984009, + "learning_rate": 8.504640947117973e-06, + "loss": 0.2144, + "step": 3555 + }, + { + "epoch": 1.6838117786327316, + "grad_norm": 1.6116001605987549, + "learning_rate": 8.49958454266546e-06, + "loss": 0.2174, + "step": 3556 + }, + { + "epoch": 1.6842852915063629, + "grad_norm": 1.9937829971313477, + "learning_rate": 8.494528530632136e-06, + "loss": 0.2132, + "step": 3557 + }, + { + "epoch": 1.6847588043799941, + "grad_norm": 1.3421846628189087, + "learning_rate": 8.48947291234035e-06, + "loss": 0.2168, + "step": 3558 + }, + { + "epoch": 1.6852323172536252, + "grad_norm": 1.112237572669983, + "learning_rate": 8.484417689112356e-06, + "loss": 0.2025, + "step": 3559 + }, + { + "epoch": 1.6857058301272567, + "grad_norm": 1.0491182804107666, + "learning_rate": 8.47936286227029e-06, + "loss": 0.2236, + "step": 3560 + }, + { + "epoch": 1.6861793430008878, + "grad_norm": 1.0888773202896118, + "learning_rate": 8.4743084331362e-06, + "loss": 0.2186, + "step": 3561 + }, + { + "epoch": 1.686652855874519, + "grad_norm": 0.9247356653213501, + "learning_rate": 8.46925440303202e-06, + "loss": 0.2121, + "step": 3562 + }, + { + "epoch": 1.6871263687481504, + "grad_norm": 0.986939013004303, + "learning_rate": 8.464200773279582e-06, + "loss": 0.2269, + "step": 3563 + }, + { + "epoch": 1.6875998816217814, + "grad_norm": 1.2144672870635986, + "learning_rate": 8.459147545200617e-06, + "loss": 0.219, + "step": 3564 + }, + { + "epoch": 1.688073394495413, + "grad_norm": 1.2081797122955322, + "learning_rate": 8.454094720116745e-06, + "loss": 0.1891, + "step": 3565 + }, + { + "epoch": 1.688546907369044, + "grad_norm": 1.819753646850586, + "learning_rate": 8.449042299349487e-06, + "loss": 0.2095, + "step": 3566 + }, + { + "epoch": 1.6890204202426755, + "grad_norm": 0.952063262462616, + "learning_rate": 8.443990284220252e-06, + "loss": 0.2111, + "step": 3567 + }, + { + "epoch": 1.6894939331163066, + "grad_norm": 1.0713047981262207, + "learning_rate": 8.43893867605035e-06, + "loss": 0.2046, + "step": 3568 + }, + { + "epoch": 1.6899674459899379, + "grad_norm": 1.6247305870056152, + "learning_rate": 8.433887476160976e-06, + "loss": 0.1997, + "step": 3569 + }, + { + "epoch": 1.6904409588635692, + "grad_norm": 0.9922040104866028, + "learning_rate": 8.428836685873223e-06, + "loss": 0.1791, + "step": 3570 + }, + { + "epoch": 1.6909144717372002, + "grad_norm": 1.1413791179656982, + "learning_rate": 8.423786306508076e-06, + "loss": 0.2028, + "step": 3571 + }, + { + "epoch": 1.6913879846108317, + "grad_norm": 1.1221214532852173, + "learning_rate": 8.418736339386417e-06, + "loss": 0.2044, + "step": 3572 + }, + { + "epoch": 1.6918614974844628, + "grad_norm": 1.0321074724197388, + "learning_rate": 8.413686785829013e-06, + "loss": 0.2371, + "step": 3573 + }, + { + "epoch": 1.692335010358094, + "grad_norm": 1.458929181098938, + "learning_rate": 8.408637647156528e-06, + "loss": 0.228, + "step": 3574 + }, + { + "epoch": 1.6928085232317254, + "grad_norm": 1.4914942979812622, + "learning_rate": 8.403588924689511e-06, + "loss": 0.1998, + "step": 3575 + }, + { + "epoch": 1.6932820361053567, + "grad_norm": 1.992150902748108, + "learning_rate": 8.398540619748414e-06, + "loss": 0.2095, + "step": 3576 + }, + { + "epoch": 1.693755548978988, + "grad_norm": 0.9819469451904297, + "learning_rate": 8.393492733653566e-06, + "loss": 0.2025, + "step": 3577 + }, + { + "epoch": 1.694229061852619, + "grad_norm": 0.8754822611808777, + "learning_rate": 8.388445267725197e-06, + "loss": 0.1947, + "step": 3578 + }, + { + "epoch": 1.6947025747262505, + "grad_norm": 1.3592606782913208, + "learning_rate": 8.38339822328342e-06, + "loss": 0.2065, + "step": 3579 + }, + { + "epoch": 1.6951760875998816, + "grad_norm": 1.2748628854751587, + "learning_rate": 8.378351601648243e-06, + "loss": 0.2091, + "step": 3580 + }, + { + "epoch": 1.6956496004735129, + "grad_norm": 1.1652089357376099, + "learning_rate": 8.373305404139558e-06, + "loss": 0.1885, + "step": 3581 + }, + { + "epoch": 1.6961231133471442, + "grad_norm": 1.0622856616973877, + "learning_rate": 8.368259632077153e-06, + "loss": 0.2115, + "step": 3582 + }, + { + "epoch": 1.6965966262207752, + "grad_norm": 1.0490455627441406, + "learning_rate": 8.363214286780699e-06, + "loss": 0.2059, + "step": 3583 + }, + { + "epoch": 1.6970701390944067, + "grad_norm": 1.0844651460647583, + "learning_rate": 8.35816936956976e-06, + "loss": 0.223, + "step": 3584 + }, + { + "epoch": 1.6975436519680378, + "grad_norm": 1.131882905960083, + "learning_rate": 8.35312488176378e-06, + "loss": 0.2312, + "step": 3585 + }, + { + "epoch": 1.698017164841669, + "grad_norm": 1.6975009441375732, + "learning_rate": 8.348080824682102e-06, + "loss": 0.2397, + "step": 3586 + }, + { + "epoch": 1.6984906777153004, + "grad_norm": 1.1549729108810425, + "learning_rate": 8.343037199643947e-06, + "loss": 0.2051, + "step": 3587 + }, + { + "epoch": 1.6989641905889317, + "grad_norm": 1.2071688175201416, + "learning_rate": 8.33799400796843e-06, + "loss": 0.2313, + "step": 3588 + }, + { + "epoch": 1.699437703462563, + "grad_norm": 1.1195539236068726, + "learning_rate": 8.332951250974543e-06, + "loss": 0.2112, + "step": 3589 + }, + { + "epoch": 1.699911216336194, + "grad_norm": 1.7075669765472412, + "learning_rate": 8.327908929981175e-06, + "loss": 0.2271, + "step": 3590 + }, + { + "epoch": 1.7003847292098255, + "grad_norm": 1.0217007398605347, + "learning_rate": 8.322867046307096e-06, + "loss": 0.1973, + "step": 3591 + }, + { + "epoch": 1.7008582420834566, + "grad_norm": 1.1293320655822754, + "learning_rate": 8.317825601270954e-06, + "loss": 0.2094, + "step": 3592 + }, + { + "epoch": 1.7013317549570879, + "grad_norm": 1.1042124032974243, + "learning_rate": 8.312784596191308e-06, + "loss": 0.2178, + "step": 3593 + }, + { + "epoch": 1.7018052678307192, + "grad_norm": 1.4056512117385864, + "learning_rate": 8.307744032386571e-06, + "loss": 0.2054, + "step": 3594 + }, + { + "epoch": 1.7022787807043502, + "grad_norm": 1.0147428512573242, + "learning_rate": 8.302703911175057e-06, + "loss": 0.1927, + "step": 3595 + }, + { + "epoch": 1.7027522935779817, + "grad_norm": 1.2608839273452759, + "learning_rate": 8.297664233874958e-06, + "loss": 0.2028, + "step": 3596 + }, + { + "epoch": 1.7032258064516128, + "grad_norm": 1.170110821723938, + "learning_rate": 8.292625001804359e-06, + "loss": 0.21, + "step": 3597 + }, + { + "epoch": 1.7036993193252443, + "grad_norm": 1.0504069328308105, + "learning_rate": 8.28758621628122e-06, + "loss": 0.2135, + "step": 3598 + }, + { + "epoch": 1.7041728321988754, + "grad_norm": 0.971572995185852, + "learning_rate": 8.282547878623384e-06, + "loss": 0.2039, + "step": 3599 + }, + { + "epoch": 1.7046463450725067, + "grad_norm": 1.2856515645980835, + "learning_rate": 8.277509990148584e-06, + "loss": 0.2159, + "step": 3600 + }, + { + "epoch": 1.705119857946138, + "grad_norm": 0.9921664595603943, + "learning_rate": 8.272472552174426e-06, + "loss": 0.2166, + "step": 3601 + }, + { + "epoch": 1.705593370819769, + "grad_norm": 1.2325758934020996, + "learning_rate": 8.267435566018409e-06, + "loss": 0.1934, + "step": 3602 + }, + { + "epoch": 1.7060668836934005, + "grad_norm": 1.430541753768921, + "learning_rate": 8.262399032997906e-06, + "loss": 0.2026, + "step": 3603 + }, + { + "epoch": 1.7065403965670316, + "grad_norm": 1.1874326467514038, + "learning_rate": 8.257362954430172e-06, + "loss": 0.1966, + "step": 3604 + }, + { + "epoch": 1.707013909440663, + "grad_norm": 1.9422804117202759, + "learning_rate": 8.252327331632343e-06, + "loss": 0.2389, + "step": 3605 + }, + { + "epoch": 1.7074874223142942, + "grad_norm": 1.1945825815200806, + "learning_rate": 8.247292165921443e-06, + "loss": 0.2184, + "step": 3606 + }, + { + "epoch": 1.7079609351879255, + "grad_norm": 1.6779309511184692, + "learning_rate": 8.242257458614368e-06, + "loss": 0.1963, + "step": 3607 + }, + { + "epoch": 1.7084344480615568, + "grad_norm": 1.2465808391571045, + "learning_rate": 8.237223211027897e-06, + "loss": 0.2136, + "step": 3608 + }, + { + "epoch": 1.7089079609351878, + "grad_norm": 2.2780213356018066, + "learning_rate": 8.23218942447869e-06, + "loss": 0.22, + "step": 3609 + }, + { + "epoch": 1.7093814738088193, + "grad_norm": 1.2657932043075562, + "learning_rate": 8.227156100283283e-06, + "loss": 0.2034, + "step": 3610 + }, + { + "epoch": 1.7098549866824504, + "grad_norm": 1.1345802545547485, + "learning_rate": 8.222123239758097e-06, + "loss": 0.1913, + "step": 3611 + }, + { + "epoch": 1.7103284995560817, + "grad_norm": 1.0455700159072876, + "learning_rate": 8.217090844219425e-06, + "loss": 0.2135, + "step": 3612 + }, + { + "epoch": 1.710802012429713, + "grad_norm": 1.8622885942459106, + "learning_rate": 8.212058914983445e-06, + "loss": 0.2242, + "step": 3613 + }, + { + "epoch": 1.711275525303344, + "grad_norm": 1.0153396129608154, + "learning_rate": 8.207027453366206e-06, + "loss": 0.2115, + "step": 3614 + }, + { + "epoch": 1.7117490381769755, + "grad_norm": 1.053398609161377, + "learning_rate": 8.201996460683638e-06, + "loss": 0.2055, + "step": 3615 + }, + { + "epoch": 1.7122225510506066, + "grad_norm": 1.2141062021255493, + "learning_rate": 8.19696593825155e-06, + "loss": 0.1882, + "step": 3616 + }, + { + "epoch": 1.712696063924238, + "grad_norm": 1.2081797122955322, + "learning_rate": 8.191935887385628e-06, + "loss": 0.2217, + "step": 3617 + }, + { + "epoch": 1.7131695767978692, + "grad_norm": 1.0772361755371094, + "learning_rate": 8.186906309401431e-06, + "loss": 0.2318, + "step": 3618 + }, + { + "epoch": 1.7136430896715005, + "grad_norm": 1.2851800918579102, + "learning_rate": 8.181877205614398e-06, + "loss": 0.2083, + "step": 3619 + }, + { + "epoch": 1.7141166025451318, + "grad_norm": 1.463058590888977, + "learning_rate": 8.176848577339843e-06, + "loss": 0.2274, + "step": 3620 + }, + { + "epoch": 1.7145901154187628, + "grad_norm": 1.5855882167816162, + "learning_rate": 8.171820425892952e-06, + "loss": 0.2146, + "step": 3621 + }, + { + "epoch": 1.7150636282923943, + "grad_norm": 1.2082139253616333, + "learning_rate": 8.166792752588797e-06, + "loss": 0.2106, + "step": 3622 + }, + { + "epoch": 1.7155371411660254, + "grad_norm": 0.9224141240119934, + "learning_rate": 8.161765558742307e-06, + "loss": 0.1913, + "step": 3623 + }, + { + "epoch": 1.7160106540396567, + "grad_norm": 1.1134727001190186, + "learning_rate": 8.156738845668303e-06, + "loss": 0.2006, + "step": 3624 + }, + { + "epoch": 1.716484166913288, + "grad_norm": 1.2312144041061401, + "learning_rate": 8.15171261468147e-06, + "loss": 0.2164, + "step": 3625 + }, + { + "epoch": 1.716957679786919, + "grad_norm": 1.2112536430358887, + "learning_rate": 8.146686867096376e-06, + "loss": 0.2118, + "step": 3626 + }, + { + "epoch": 1.7174311926605506, + "grad_norm": 0.9137608408927917, + "learning_rate": 8.141661604227448e-06, + "loss": 0.1946, + "step": 3627 + }, + { + "epoch": 1.7179047055341816, + "grad_norm": 1.5561513900756836, + "learning_rate": 8.136636827389002e-06, + "loss": 0.2256, + "step": 3628 + }, + { + "epoch": 1.718378218407813, + "grad_norm": 0.8672730922698975, + "learning_rate": 8.131612537895218e-06, + "loss": 0.1872, + "step": 3629 + }, + { + "epoch": 1.7188517312814442, + "grad_norm": 1.000693917274475, + "learning_rate": 8.126588737060149e-06, + "loss": 0.2139, + "step": 3630 + }, + { + "epoch": 1.7193252441550755, + "grad_norm": 0.9694979190826416, + "learning_rate": 8.121565426197722e-06, + "loss": 0.1927, + "step": 3631 + }, + { + "epoch": 1.7197987570287068, + "grad_norm": 0.90753573179245, + "learning_rate": 8.116542606621736e-06, + "loss": 0.2048, + "step": 3632 + }, + { + "epoch": 1.7202722699023378, + "grad_norm": 1.230448842048645, + "learning_rate": 8.111520279645864e-06, + "loss": 0.2298, + "step": 3633 + }, + { + "epoch": 1.7207457827759693, + "grad_norm": 1.0563310384750366, + "learning_rate": 8.106498446583641e-06, + "loss": 0.2158, + "step": 3634 + }, + { + "epoch": 1.7212192956496004, + "grad_norm": 1.2370612621307373, + "learning_rate": 8.101477108748486e-06, + "loss": 0.214, + "step": 3635 + }, + { + "epoch": 1.7216928085232317, + "grad_norm": 0.9708116054534912, + "learning_rate": 8.096456267453674e-06, + "loss": 0.1969, + "step": 3636 + }, + { + "epoch": 1.722166321396863, + "grad_norm": 1.1120837926864624, + "learning_rate": 8.091435924012365e-06, + "loss": 0.2115, + "step": 3637 + }, + { + "epoch": 1.7226398342704943, + "grad_norm": 0.838356077671051, + "learning_rate": 8.086416079737577e-06, + "loss": 0.2015, + "step": 3638 + }, + { + "epoch": 1.7231133471441256, + "grad_norm": 1.4962152242660522, + "learning_rate": 8.081396735942204e-06, + "loss": 0.2272, + "step": 3639 + }, + { + "epoch": 1.7235868600177566, + "grad_norm": 1.4194129705429077, + "learning_rate": 8.076377893939003e-06, + "loss": 0.2158, + "step": 3640 + }, + { + "epoch": 1.7240603728913881, + "grad_norm": 1.0114027261734009, + "learning_rate": 8.071359555040607e-06, + "loss": 0.218, + "step": 3641 + }, + { + "epoch": 1.7245338857650192, + "grad_norm": 1.3933831453323364, + "learning_rate": 8.066341720559513e-06, + "loss": 0.1817, + "step": 3642 + }, + { + "epoch": 1.7250073986386505, + "grad_norm": 1.2769237756729126, + "learning_rate": 8.06132439180809e-06, + "loss": 0.208, + "step": 3643 + }, + { + "epoch": 1.7254809115122818, + "grad_norm": 1.4070305824279785, + "learning_rate": 8.056307570098566e-06, + "loss": 0.2166, + "step": 3644 + }, + { + "epoch": 1.7259544243859128, + "grad_norm": 1.0970323085784912, + "learning_rate": 8.051291256743048e-06, + "loss": 0.1809, + "step": 3645 + }, + { + "epoch": 1.7264279372595444, + "grad_norm": 1.1892532110214233, + "learning_rate": 8.046275453053501e-06, + "loss": 0.2006, + "step": 3646 + }, + { + "epoch": 1.7269014501331754, + "grad_norm": 1.8722046613693237, + "learning_rate": 8.041260160341766e-06, + "loss": 0.2364, + "step": 3647 + }, + { + "epoch": 1.7273749630068067, + "grad_norm": 1.7485390901565552, + "learning_rate": 8.036245379919535e-06, + "loss": 0.1906, + "step": 3648 + }, + { + "epoch": 1.727848475880438, + "grad_norm": 2.5090813636779785, + "learning_rate": 8.03123111309838e-06, + "loss": 0.2261, + "step": 3649 + }, + { + "epoch": 1.7283219887540693, + "grad_norm": 1.2415854930877686, + "learning_rate": 8.026217361189737e-06, + "loss": 0.1927, + "step": 3650 + }, + { + "epoch": 1.7287955016277006, + "grad_norm": 1.012861728668213, + "learning_rate": 8.0212041255049e-06, + "loss": 0.2023, + "step": 3651 + }, + { + "epoch": 1.7292690145013316, + "grad_norm": 0.9463083148002625, + "learning_rate": 8.016191407355034e-06, + "loss": 0.2002, + "step": 3652 + }, + { + "epoch": 1.7297425273749631, + "grad_norm": 1.2277804613113403, + "learning_rate": 8.011179208051168e-06, + "loss": 0.1973, + "step": 3653 + }, + { + "epoch": 1.7302160402485942, + "grad_norm": 0.9926773309707642, + "learning_rate": 8.006167528904194e-06, + "loss": 0.1912, + "step": 3654 + }, + { + "epoch": 1.7306895531222255, + "grad_norm": 2.3296539783477783, + "learning_rate": 8.001156371224868e-06, + "loss": 0.2088, + "step": 3655 + }, + { + "epoch": 1.7311630659958568, + "grad_norm": 2.002706289291382, + "learning_rate": 7.996145736323807e-06, + "loss": 0.2359, + "step": 3656 + }, + { + "epoch": 1.7316365788694879, + "grad_norm": 1.5044262409210205, + "learning_rate": 7.991135625511503e-06, + "loss": 0.2037, + "step": 3657 + }, + { + "epoch": 1.7321100917431194, + "grad_norm": 1.2724112272262573, + "learning_rate": 7.986126040098291e-06, + "loss": 0.2139, + "step": 3658 + }, + { + "epoch": 1.7325836046167504, + "grad_norm": 1.1246263980865479, + "learning_rate": 7.981116981394388e-06, + "loss": 0.2028, + "step": 3659 + }, + { + "epoch": 1.7330571174903817, + "grad_norm": 0.9579681158065796, + "learning_rate": 7.97610845070986e-06, + "loss": 0.1982, + "step": 3660 + }, + { + "epoch": 1.733530630364013, + "grad_norm": 1.939510703086853, + "learning_rate": 7.971100449354643e-06, + "loss": 0.2115, + "step": 3661 + }, + { + "epoch": 1.7340041432376443, + "grad_norm": 1.2159603834152222, + "learning_rate": 7.96609297863853e-06, + "loss": 0.2167, + "step": 3662 + }, + { + "epoch": 1.7344776561112756, + "grad_norm": 1.5756498575210571, + "learning_rate": 7.961086039871178e-06, + "loss": 0.2003, + "step": 3663 + }, + { + "epoch": 1.7349511689849066, + "grad_norm": 1.5483087301254272, + "learning_rate": 7.956079634362101e-06, + "loss": 0.1893, + "step": 3664 + }, + { + "epoch": 1.7354246818585382, + "grad_norm": 0.9675044417381287, + "learning_rate": 7.951073763420679e-06, + "loss": 0.2116, + "step": 3665 + }, + { + "epoch": 1.7358981947321692, + "grad_norm": 1.4665745496749878, + "learning_rate": 7.946068428356146e-06, + "loss": 0.2102, + "step": 3666 + }, + { + "epoch": 1.7363717076058005, + "grad_norm": 1.4936654567718506, + "learning_rate": 7.941063630477603e-06, + "loss": 0.2233, + "step": 3667 + }, + { + "epoch": 1.7368452204794318, + "grad_norm": 1.196628212928772, + "learning_rate": 7.936059371094004e-06, + "loss": 0.205, + "step": 3668 + }, + { + "epoch": 1.737318733353063, + "grad_norm": 1.1913931369781494, + "learning_rate": 7.931055651514165e-06, + "loss": 0.2089, + "step": 3669 + }, + { + "epoch": 1.7377922462266944, + "grad_norm": 1.4205213785171509, + "learning_rate": 7.926052473046756e-06, + "loss": 0.233, + "step": 3670 + }, + { + "epoch": 1.7382657591003254, + "grad_norm": 1.5477283000946045, + "learning_rate": 7.921049837000318e-06, + "loss": 0.2258, + "step": 3671 + }, + { + "epoch": 1.738739271973957, + "grad_norm": 2.6856656074523926, + "learning_rate": 7.916047744683244e-06, + "loss": 0.216, + "step": 3672 + }, + { + "epoch": 1.739212784847588, + "grad_norm": 1.0358319282531738, + "learning_rate": 7.911046197403775e-06, + "loss": 0.2089, + "step": 3673 + }, + { + "epoch": 1.7396862977212193, + "grad_norm": 1.1069475412368774, + "learning_rate": 7.90604519647002e-06, + "loss": 0.2109, + "step": 3674 + }, + { + "epoch": 1.7401598105948506, + "grad_norm": 1.5351327657699585, + "learning_rate": 7.901044743189943e-06, + "loss": 0.2081, + "step": 3675 + }, + { + "epoch": 1.7406333234684817, + "grad_norm": 1.5493526458740234, + "learning_rate": 7.896044838871365e-06, + "loss": 0.2187, + "step": 3676 + }, + { + "epoch": 1.7411068363421132, + "grad_norm": 1.2532929182052612, + "learning_rate": 7.891045484821961e-06, + "loss": 0.2365, + "step": 3677 + }, + { + "epoch": 1.7415803492157442, + "grad_norm": 1.0188684463500977, + "learning_rate": 7.886046682349267e-06, + "loss": 0.2313, + "step": 3678 + }, + { + "epoch": 1.7420538620893755, + "grad_norm": 1.1516002416610718, + "learning_rate": 7.88104843276067e-06, + "loss": 0.2071, + "step": 3679 + }, + { + "epoch": 1.7425273749630068, + "grad_norm": 1.1517740488052368, + "learning_rate": 7.876050737363411e-06, + "loss": 0.2039, + "step": 3680 + }, + { + "epoch": 1.743000887836638, + "grad_norm": 1.396052598953247, + "learning_rate": 7.871053597464593e-06, + "loss": 0.2138, + "step": 3681 + }, + { + "epoch": 1.7434744007102694, + "grad_norm": 1.08030104637146, + "learning_rate": 7.86605701437117e-06, + "loss": 0.2316, + "step": 3682 + }, + { + "epoch": 1.7439479135839004, + "grad_norm": 2.2446389198303223, + "learning_rate": 7.861060989389947e-06, + "loss": 0.2267, + "step": 3683 + }, + { + "epoch": 1.744421426457532, + "grad_norm": 1.3251670598983765, + "learning_rate": 7.856065523827586e-06, + "loss": 0.1961, + "step": 3684 + }, + { + "epoch": 1.744894939331163, + "grad_norm": 1.188880205154419, + "learning_rate": 7.851070618990607e-06, + "loss": 0.2184, + "step": 3685 + }, + { + "epoch": 1.7453684522047943, + "grad_norm": 1.126604676246643, + "learning_rate": 7.846076276185372e-06, + "loss": 0.24, + "step": 3686 + }, + { + "epoch": 1.7458419650784256, + "grad_norm": 1.0912270545959473, + "learning_rate": 7.841082496718112e-06, + "loss": 0.2049, + "step": 3687 + }, + { + "epoch": 1.7463154779520567, + "grad_norm": 1.5733311176300049, + "learning_rate": 7.836089281894895e-06, + "loss": 0.2068, + "step": 3688 + }, + { + "epoch": 1.7467889908256882, + "grad_norm": 0.9812442064285278, + "learning_rate": 7.831096633021651e-06, + "loss": 0.2069, + "step": 3689 + }, + { + "epoch": 1.7472625036993192, + "grad_norm": 1.4512606859207153, + "learning_rate": 7.826104551404161e-06, + "loss": 0.2207, + "step": 3690 + }, + { + "epoch": 1.7477360165729505, + "grad_norm": 2.106611967086792, + "learning_rate": 7.821113038348052e-06, + "loss": 0.2208, + "step": 3691 + }, + { + "epoch": 1.7482095294465818, + "grad_norm": 1.069126009941101, + "learning_rate": 7.81612209515881e-06, + "loss": 0.2355, + "step": 3692 + }, + { + "epoch": 1.748683042320213, + "grad_norm": 1.088439702987671, + "learning_rate": 7.811131723141763e-06, + "loss": 0.2171, + "step": 3693 + }, + { + "epoch": 1.7491565551938444, + "grad_norm": 1.1019070148468018, + "learning_rate": 7.806141923602098e-06, + "loss": 0.2021, + "step": 3694 + }, + { + "epoch": 1.7496300680674755, + "grad_norm": 1.3674668073654175, + "learning_rate": 7.801152697844849e-06, + "loss": 0.2084, + "step": 3695 + }, + { + "epoch": 1.750103580941107, + "grad_norm": 1.3417162895202637, + "learning_rate": 7.796164047174898e-06, + "loss": 0.2, + "step": 3696 + }, + { + "epoch": 1.750577093814738, + "grad_norm": 1.5355737209320068, + "learning_rate": 7.79117597289698e-06, + "loss": 0.2159, + "step": 3697 + }, + { + "epoch": 1.7510506066883693, + "grad_norm": 1.4953505992889404, + "learning_rate": 7.786188476315678e-06, + "loss": 0.2188, + "step": 3698 + }, + { + "epoch": 1.7515241195620006, + "grad_norm": 1.4030543565750122, + "learning_rate": 7.781201558735423e-06, + "loss": 0.2026, + "step": 3699 + }, + { + "epoch": 1.751997632435632, + "grad_norm": 1.5488836765289307, + "learning_rate": 7.776215221460496e-06, + "loss": 0.2155, + "step": 3700 + }, + { + "epoch": 1.7524711453092632, + "grad_norm": 1.2541826963424683, + "learning_rate": 7.771229465795024e-06, + "loss": 0.2176, + "step": 3701 + }, + { + "epoch": 1.7529446581828942, + "grad_norm": 0.9930739998817444, + "learning_rate": 7.766244293042983e-06, + "loss": 0.2305, + "step": 3702 + }, + { + "epoch": 1.7534181710565258, + "grad_norm": 1.1047271490097046, + "learning_rate": 7.7612597045082e-06, + "loss": 0.2213, + "step": 3703 + }, + { + "epoch": 1.7538916839301568, + "grad_norm": 1.2397290468215942, + "learning_rate": 7.75627570149434e-06, + "loss": 0.2186, + "step": 3704 + }, + { + "epoch": 1.754365196803788, + "grad_norm": 1.353489637374878, + "learning_rate": 7.751292285304928e-06, + "loss": 0.2263, + "step": 3705 + }, + { + "epoch": 1.7548387096774194, + "grad_norm": 1.1363734006881714, + "learning_rate": 7.746309457243324e-06, + "loss": 0.2071, + "step": 3706 + }, + { + "epoch": 1.7553122225510505, + "grad_norm": 1.537649393081665, + "learning_rate": 7.74132721861274e-06, + "loss": 0.2063, + "step": 3707 + }, + { + "epoch": 1.755785735424682, + "grad_norm": 1.076953649520874, + "learning_rate": 7.73634557071623e-06, + "loss": 0.2097, + "step": 3708 + }, + { + "epoch": 1.756259248298313, + "grad_norm": 1.2859452962875366, + "learning_rate": 7.731364514856698e-06, + "loss": 0.2212, + "step": 3709 + }, + { + "epoch": 1.7567327611719443, + "grad_norm": 1.1581248044967651, + "learning_rate": 7.726384052336893e-06, + "loss": 0.2189, + "step": 3710 + }, + { + "epoch": 1.7572062740455756, + "grad_norm": 1.3928226232528687, + "learning_rate": 7.721404184459405e-06, + "loss": 0.2324, + "step": 3711 + }, + { + "epoch": 1.757679786919207, + "grad_norm": 1.322218894958496, + "learning_rate": 7.716424912526672e-06, + "loss": 0.201, + "step": 3712 + }, + { + "epoch": 1.7581532997928382, + "grad_norm": 1.102550745010376, + "learning_rate": 7.711446237840971e-06, + "loss": 0.2371, + "step": 3713 + }, + { + "epoch": 1.7586268126664693, + "grad_norm": 1.2192142009735107, + "learning_rate": 7.70646816170443e-06, + "loss": 0.2255, + "step": 3714 + }, + { + "epoch": 1.7591003255401008, + "grad_norm": 1.6167590618133545, + "learning_rate": 7.701490685419014e-06, + "loss": 0.2051, + "step": 3715 + }, + { + "epoch": 1.7595738384137318, + "grad_norm": 1.1392605304718018, + "learning_rate": 7.696513810286534e-06, + "loss": 0.2178, + "step": 3716 + }, + { + "epoch": 1.7600473512873631, + "grad_norm": 0.9116237759590149, + "learning_rate": 7.69153753760865e-06, + "loss": 0.2093, + "step": 3717 + }, + { + "epoch": 1.7605208641609944, + "grad_norm": 1.1358370780944824, + "learning_rate": 7.686561868686848e-06, + "loss": 0.2036, + "step": 3718 + }, + { + "epoch": 1.7609943770346255, + "grad_norm": 1.4042670726776123, + "learning_rate": 7.681586804822471e-06, + "loss": 0.2023, + "step": 3719 + }, + { + "epoch": 1.761467889908257, + "grad_norm": 1.1344070434570312, + "learning_rate": 7.676612347316702e-06, + "loss": 0.225, + "step": 3720 + }, + { + "epoch": 1.761941402781888, + "grad_norm": 1.1140799522399902, + "learning_rate": 7.671638497470558e-06, + "loss": 0.1941, + "step": 3721 + }, + { + "epoch": 1.7624149156555193, + "grad_norm": 0.9498646259307861, + "learning_rate": 7.666665256584902e-06, + "loss": 0.2175, + "step": 3722 + }, + { + "epoch": 1.7628884285291506, + "grad_norm": 1.3666812181472778, + "learning_rate": 7.66169262596044e-06, + "loss": 0.2125, + "step": 3723 + }, + { + "epoch": 1.763361941402782, + "grad_norm": 1.0839076042175293, + "learning_rate": 7.656720606897711e-06, + "loss": 0.2253, + "step": 3724 + }, + { + "epoch": 1.7638354542764132, + "grad_norm": 0.9082208871841431, + "learning_rate": 7.651749200697104e-06, + "loss": 0.1788, + "step": 3725 + }, + { + "epoch": 1.7643089671500443, + "grad_norm": 0.9702308773994446, + "learning_rate": 7.646778408658839e-06, + "loss": 0.2188, + "step": 3726 + }, + { + "epoch": 1.7647824800236758, + "grad_norm": 0.9975780844688416, + "learning_rate": 7.64180823208298e-06, + "loss": 0.2132, + "step": 3727 + }, + { + "epoch": 1.7652559928973068, + "grad_norm": 2.509084463119507, + "learning_rate": 7.636838672269425e-06, + "loss": 0.2237, + "step": 3728 + }, + { + "epoch": 1.7657295057709381, + "grad_norm": 1.6179232597351074, + "learning_rate": 7.63186973051792e-06, + "loss": 0.2145, + "step": 3729 + }, + { + "epoch": 1.7662030186445694, + "grad_norm": 1.5962454080581665, + "learning_rate": 7.626901408128039e-06, + "loss": 0.1912, + "step": 3730 + }, + { + "epoch": 1.7666765315182007, + "grad_norm": 0.9649477005004883, + "learning_rate": 7.621933706399202e-06, + "loss": 0.2245, + "step": 3731 + }, + { + "epoch": 1.767150044391832, + "grad_norm": 1.120977520942688, + "learning_rate": 7.616966626630663e-06, + "loss": 0.2079, + "step": 3732 + }, + { + "epoch": 1.767623557265463, + "grad_norm": 1.2438452243804932, + "learning_rate": 7.612000170121513e-06, + "loss": 0.2257, + "step": 3733 + }, + { + "epoch": 1.7680970701390946, + "grad_norm": 0.9499486684799194, + "learning_rate": 7.607034338170681e-06, + "loss": 0.2267, + "step": 3734 + }, + { + "epoch": 1.7685705830127256, + "grad_norm": 1.2596803903579712, + "learning_rate": 7.6020691320769325e-06, + "loss": 0.2202, + "step": 3735 + }, + { + "epoch": 1.769044095886357, + "grad_norm": 2.5322797298431396, + "learning_rate": 7.597104553138872e-06, + "loss": 0.2052, + "step": 3736 + }, + { + "epoch": 1.7695176087599882, + "grad_norm": 1.7047964334487915, + "learning_rate": 7.592140602654931e-06, + "loss": 0.2073, + "step": 3737 + }, + { + "epoch": 1.7699911216336193, + "grad_norm": 0.9767723083496094, + "learning_rate": 7.587177281923388e-06, + "loss": 0.2141, + "step": 3738 + }, + { + "epoch": 1.7704646345072508, + "grad_norm": 1.1681851148605347, + "learning_rate": 7.582214592242348e-06, + "loss": 0.2123, + "step": 3739 + }, + { + "epoch": 1.7709381473808818, + "grad_norm": 1.0731382369995117, + "learning_rate": 7.577252534909758e-06, + "loss": 0.1867, + "step": 3740 + }, + { + "epoch": 1.7714116602545131, + "grad_norm": 1.2089591026306152, + "learning_rate": 7.5722911112233956e-06, + "loss": 0.1974, + "step": 3741 + }, + { + "epoch": 1.7718851731281444, + "grad_norm": 1.1869101524353027, + "learning_rate": 7.56733032248087e-06, + "loss": 0.2027, + "step": 3742 + }, + { + "epoch": 1.7723586860017757, + "grad_norm": 1.4566733837127686, + "learning_rate": 7.562370169979633e-06, + "loss": 0.2055, + "step": 3743 + }, + { + "epoch": 1.772832198875407, + "grad_norm": 1.1467299461364746, + "learning_rate": 7.55741065501696e-06, + "loss": 0.2077, + "step": 3744 + }, + { + "epoch": 1.773305711749038, + "grad_norm": 1.3079264163970947, + "learning_rate": 7.552451778889967e-06, + "loss": 0.1882, + "step": 3745 + }, + { + "epoch": 1.7737792246226696, + "grad_norm": 1.1860123872756958, + "learning_rate": 7.547493542895601e-06, + "loss": 0.1968, + "step": 3746 + }, + { + "epoch": 1.7742527374963006, + "grad_norm": 1.007750391960144, + "learning_rate": 7.542535948330636e-06, + "loss": 0.2064, + "step": 3747 + }, + { + "epoch": 1.774726250369932, + "grad_norm": 1.3178592920303345, + "learning_rate": 7.537578996491683e-06, + "loss": 0.2012, + "step": 3748 + }, + { + "epoch": 1.7751997632435632, + "grad_norm": 1.3056480884552002, + "learning_rate": 7.532622688675193e-06, + "loss": 0.2228, + "step": 3749 + }, + { + "epoch": 1.7756732761171943, + "grad_norm": 1.8601652383804321, + "learning_rate": 7.527667026177434e-06, + "loss": 0.186, + "step": 3750 + }, + { + "epoch": 1.7761467889908258, + "grad_norm": 2.2548739910125732, + "learning_rate": 7.522712010294516e-06, + "loss": 0.2139, + "step": 3751 + }, + { + "epoch": 1.7766203018644569, + "grad_norm": 1.1233947277069092, + "learning_rate": 7.517757642322372e-06, + "loss": 0.2151, + "step": 3752 + }, + { + "epoch": 1.7770938147380881, + "grad_norm": 1.173248291015625, + "learning_rate": 7.5128039235567686e-06, + "loss": 0.2143, + "step": 3753 + }, + { + "epoch": 1.7775673276117194, + "grad_norm": 1.1922003030776978, + "learning_rate": 7.507850855293305e-06, + "loss": 0.2198, + "step": 3754 + }, + { + "epoch": 1.7780408404853507, + "grad_norm": 1.0876096487045288, + "learning_rate": 7.502898438827408e-06, + "loss": 0.2106, + "step": 3755 + }, + { + "epoch": 1.778514353358982, + "grad_norm": 1.1909540891647339, + "learning_rate": 7.497946675454334e-06, + "loss": 0.2067, + "step": 3756 + }, + { + "epoch": 1.778987866232613, + "grad_norm": 1.084885597229004, + "learning_rate": 7.49299556646917e-06, + "loss": 0.1632, + "step": 3757 + }, + { + "epoch": 1.7794613791062446, + "grad_norm": 0.9690635800361633, + "learning_rate": 7.48804511316683e-06, + "loss": 0.2316, + "step": 3758 + }, + { + "epoch": 1.7799348919798756, + "grad_norm": 1.4600977897644043, + "learning_rate": 7.483095316842057e-06, + "loss": 0.2089, + "step": 3759 + }, + { + "epoch": 1.780408404853507, + "grad_norm": 1.4429023265838623, + "learning_rate": 7.478146178789423e-06, + "loss": 0.2065, + "step": 3760 + }, + { + "epoch": 1.7808819177271382, + "grad_norm": 1.128029227256775, + "learning_rate": 7.47319770030333e-06, + "loss": 0.2118, + "step": 3761 + }, + { + "epoch": 1.7813554306007695, + "grad_norm": 1.9069571495056152, + "learning_rate": 7.4682498826779984e-06, + "loss": 0.225, + "step": 3762 + }, + { + "epoch": 1.7818289434744008, + "grad_norm": 1.254188895225525, + "learning_rate": 7.463302727207486e-06, + "loss": 0.204, + "step": 3763 + }, + { + "epoch": 1.7823024563480319, + "grad_norm": 1.4239470958709717, + "learning_rate": 7.458356235185674e-06, + "loss": 0.2052, + "step": 3764 + }, + { + "epoch": 1.7827759692216634, + "grad_norm": 0.9776591658592224, + "learning_rate": 7.45341040790627e-06, + "loss": 0.2175, + "step": 3765 + }, + { + "epoch": 1.7832494820952944, + "grad_norm": 1.543384075164795, + "learning_rate": 7.4484652466628036e-06, + "loss": 0.2158, + "step": 3766 + }, + { + "epoch": 1.7837229949689257, + "grad_norm": 1.3612316846847534, + "learning_rate": 7.443520752748639e-06, + "loss": 0.2223, + "step": 3767 + }, + { + "epoch": 1.784196507842557, + "grad_norm": 1.331251859664917, + "learning_rate": 7.438576927456958e-06, + "loss": 0.2304, + "step": 3768 + }, + { + "epoch": 1.784670020716188, + "grad_norm": 1.209085464477539, + "learning_rate": 7.433633772080772e-06, + "loss": 0.2219, + "step": 3769 + }, + { + "epoch": 1.7851435335898196, + "grad_norm": 2.226287364959717, + "learning_rate": 7.428691287912915e-06, + "loss": 0.2201, + "step": 3770 + }, + { + "epoch": 1.7856170464634507, + "grad_norm": 1.0809687376022339, + "learning_rate": 7.423749476246046e-06, + "loss": 0.2028, + "step": 3771 + }, + { + "epoch": 1.786090559337082, + "grad_norm": 1.1204088926315308, + "learning_rate": 7.4188083383726475e-06, + "loss": 0.2243, + "step": 3772 + }, + { + "epoch": 1.7865640722107132, + "grad_norm": 1.2815765142440796, + "learning_rate": 7.413867875585026e-06, + "loss": 0.2129, + "step": 3773 + }, + { + "epoch": 1.7870375850843445, + "grad_norm": 1.0807607173919678, + "learning_rate": 7.408928089175314e-06, + "loss": 0.2257, + "step": 3774 + }, + { + "epoch": 1.7875110979579758, + "grad_norm": 1.4838930368423462, + "learning_rate": 7.403988980435461e-06, + "loss": 0.2331, + "step": 3775 + }, + { + "epoch": 1.7879846108316069, + "grad_norm": 0.974097490310669, + "learning_rate": 7.399050550657249e-06, + "loss": 0.2171, + "step": 3776 + }, + { + "epoch": 1.7884581237052384, + "grad_norm": 1.6029471158981323, + "learning_rate": 7.394112801132271e-06, + "loss": 0.1972, + "step": 3777 + }, + { + "epoch": 1.7889316365788694, + "grad_norm": 1.0069618225097656, + "learning_rate": 7.389175733151953e-06, + "loss": 0.202, + "step": 3778 + }, + { + "epoch": 1.7894051494525007, + "grad_norm": 1.8258870840072632, + "learning_rate": 7.384239348007534e-06, + "loss": 0.1895, + "step": 3779 + }, + { + "epoch": 1.789878662326132, + "grad_norm": 1.2527416944503784, + "learning_rate": 7.379303646990081e-06, + "loss": 0.2081, + "step": 3780 + }, + { + "epoch": 1.790352175199763, + "grad_norm": 1.7757433652877808, + "learning_rate": 7.374368631390474e-06, + "loss": 0.2195, + "step": 3781 + }, + { + "epoch": 1.7908256880733946, + "grad_norm": 0.9863547086715698, + "learning_rate": 7.369434302499423e-06, + "loss": 0.2148, + "step": 3782 + }, + { + "epoch": 1.7912992009470257, + "grad_norm": 1.3424357175827026, + "learning_rate": 7.364500661607452e-06, + "loss": 0.2289, + "step": 3783 + }, + { + "epoch": 1.791772713820657, + "grad_norm": 1.5163919925689697, + "learning_rate": 7.359567710004907e-06, + "loss": 0.2079, + "step": 3784 + }, + { + "epoch": 1.7922462266942882, + "grad_norm": 1.132380723953247, + "learning_rate": 7.354635448981955e-06, + "loss": 0.2004, + "step": 3785 + }, + { + "epoch": 1.7927197395679195, + "grad_norm": 1.174932599067688, + "learning_rate": 7.349703879828582e-06, + "loss": 0.1928, + "step": 3786 + }, + { + "epoch": 1.7931932524415508, + "grad_norm": 1.1085500717163086, + "learning_rate": 7.344773003834589e-06, + "loss": 0.1902, + "step": 3787 + }, + { + "epoch": 1.7936667653151819, + "grad_norm": 1.5855284929275513, + "learning_rate": 7.339842822289602e-06, + "loss": 0.2048, + "step": 3788 + }, + { + "epoch": 1.7941402781888134, + "grad_norm": 1.047294020652771, + "learning_rate": 7.334913336483063e-06, + "loss": 0.1961, + "step": 3789 + }, + { + "epoch": 1.7946137910624445, + "grad_norm": 1.2375208139419556, + "learning_rate": 7.329984547704231e-06, + "loss": 0.1988, + "step": 3790 + }, + { + "epoch": 1.7950873039360757, + "grad_norm": 1.0275269746780396, + "learning_rate": 7.3250564572421814e-06, + "loss": 0.239, + "step": 3791 + }, + { + "epoch": 1.795560816809707, + "grad_norm": 1.0698773860931396, + "learning_rate": 7.320129066385811e-06, + "loss": 0.2176, + "step": 3792 + }, + { + "epoch": 1.7960343296833383, + "grad_norm": 1.3771909475326538, + "learning_rate": 7.315202376423829e-06, + "loss": 0.2128, + "step": 3793 + }, + { + "epoch": 1.7965078425569696, + "grad_norm": 1.1806304454803467, + "learning_rate": 7.3102763886447645e-06, + "loss": 0.1979, + "step": 3794 + }, + { + "epoch": 1.7969813554306007, + "grad_norm": 1.011577844619751, + "learning_rate": 7.305351104336963e-06, + "loss": 0.1838, + "step": 3795 + }, + { + "epoch": 1.7974548683042322, + "grad_norm": 2.0910136699676514, + "learning_rate": 7.3004265247885865e-06, + "loss": 0.1845, + "step": 3796 + }, + { + "epoch": 1.7979283811778632, + "grad_norm": 0.9872161746025085, + "learning_rate": 7.295502651287607e-06, + "loss": 0.2177, + "step": 3797 + }, + { + "epoch": 1.7984018940514945, + "grad_norm": 1.0551612377166748, + "learning_rate": 7.290579485121818e-06, + "loss": 0.2106, + "step": 3798 + }, + { + "epoch": 1.7988754069251258, + "grad_norm": 0.9977513551712036, + "learning_rate": 7.285657027578827e-06, + "loss": 0.2187, + "step": 3799 + }, + { + "epoch": 1.7993489197987569, + "grad_norm": 1.0159823894500732, + "learning_rate": 7.280735279946054e-06, + "loss": 0.22, + "step": 3800 + }, + { + "epoch": 1.7998224326723884, + "grad_norm": 1.2964645624160767, + "learning_rate": 7.275814243510736e-06, + "loss": 0.2177, + "step": 3801 + }, + { + "epoch": 1.8002959455460195, + "grad_norm": 1.3088321685791016, + "learning_rate": 7.270893919559922e-06, + "loss": 0.2346, + "step": 3802 + }, + { + "epoch": 1.8007694584196507, + "grad_norm": 1.1884821653366089, + "learning_rate": 7.265974309380475e-06, + "loss": 0.1938, + "step": 3803 + }, + { + "epoch": 1.801242971293282, + "grad_norm": 1.544429063796997, + "learning_rate": 7.26105541425907e-06, + "loss": 0.2023, + "step": 3804 + }, + { + "epoch": 1.8017164841669133, + "grad_norm": 0.9471104741096497, + "learning_rate": 7.2561372354822035e-06, + "loss": 0.2057, + "step": 3805 + }, + { + "epoch": 1.8021899970405446, + "grad_norm": 1.2937753200531006, + "learning_rate": 7.251219774336169e-06, + "loss": 0.2245, + "step": 3806 + }, + { + "epoch": 1.8026635099141757, + "grad_norm": 1.004191517829895, + "learning_rate": 7.246303032107084e-06, + "loss": 0.1868, + "step": 3807 + }, + { + "epoch": 1.8031370227878072, + "grad_norm": 1.476674199104309, + "learning_rate": 7.2413870100808755e-06, + "loss": 0.2203, + "step": 3808 + }, + { + "epoch": 1.8036105356614383, + "grad_norm": 0.9113790988922119, + "learning_rate": 7.2364717095432825e-06, + "loss": 0.2114, + "step": 3809 + }, + { + "epoch": 1.8040840485350695, + "grad_norm": 1.3018831014633179, + "learning_rate": 7.231557131779854e-06, + "loss": 0.2191, + "step": 3810 + }, + { + "epoch": 1.8045575614087008, + "grad_norm": 1.301070213317871, + "learning_rate": 7.226643278075948e-06, + "loss": 0.1999, + "step": 3811 + }, + { + "epoch": 1.805031074282332, + "grad_norm": 1.776887059211731, + "learning_rate": 7.2217301497167405e-06, + "loss": 0.2103, + "step": 3812 + }, + { + "epoch": 1.8055045871559634, + "grad_norm": 2.6810529232025146, + "learning_rate": 7.216817747987208e-06, + "loss": 0.2098, + "step": 3813 + }, + { + "epoch": 1.8059781000295945, + "grad_norm": 1.186989426612854, + "learning_rate": 7.2119060741721435e-06, + "loss": 0.1888, + "step": 3814 + }, + { + "epoch": 1.8064516129032258, + "grad_norm": 0.9349979162216187, + "learning_rate": 7.206995129556151e-06, + "loss": 0.2345, + "step": 3815 + }, + { + "epoch": 1.806925125776857, + "grad_norm": 1.71229887008667, + "learning_rate": 7.202084915423636e-06, + "loss": 0.2431, + "step": 3816 + }, + { + "epoch": 1.8073986386504883, + "grad_norm": 1.5204129219055176, + "learning_rate": 7.197175433058818e-06, + "loss": 0.2119, + "step": 3817 + }, + { + "epoch": 1.8078721515241196, + "grad_norm": 1.7288326025009155, + "learning_rate": 7.192266683745728e-06, + "loss": 0.22, + "step": 3818 + }, + { + "epoch": 1.8083456643977507, + "grad_norm": 1.4279898405075073, + "learning_rate": 7.187358668768198e-06, + "loss": 0.2029, + "step": 3819 + }, + { + "epoch": 1.8088191772713822, + "grad_norm": 1.4333093166351318, + "learning_rate": 7.182451389409877e-06, + "loss": 0.2323, + "step": 3820 + }, + { + "epoch": 1.8092926901450133, + "grad_norm": 1.506713628768921, + "learning_rate": 7.177544846954212e-06, + "loss": 0.2408, + "step": 3821 + }, + { + "epoch": 1.8097662030186445, + "grad_norm": 1.236220121383667, + "learning_rate": 7.172639042684464e-06, + "loss": 0.2193, + "step": 3822 + }, + { + "epoch": 1.8102397158922758, + "grad_norm": 0.997626006603241, + "learning_rate": 7.1677339778836975e-06, + "loss": 0.2065, + "step": 3823 + }, + { + "epoch": 1.8107132287659071, + "grad_norm": 1.5409119129180908, + "learning_rate": 7.162829653834787e-06, + "loss": 0.205, + "step": 3824 + }, + { + "epoch": 1.8111867416395384, + "grad_norm": 1.3550500869750977, + "learning_rate": 7.157926071820411e-06, + "loss": 0.2062, + "step": 3825 + }, + { + "epoch": 1.8116602545131695, + "grad_norm": 1.6117483377456665, + "learning_rate": 7.153023233123047e-06, + "loss": 0.2185, + "step": 3826 + }, + { + "epoch": 1.812133767386801, + "grad_norm": 1.3827286958694458, + "learning_rate": 7.148121139024995e-06, + "loss": 0.1976, + "step": 3827 + }, + { + "epoch": 1.812607280260432, + "grad_norm": 1.7857599258422852, + "learning_rate": 7.143219790808347e-06, + "loss": 0.2123, + "step": 3828 + }, + { + "epoch": 1.8130807931340633, + "grad_norm": 1.2344205379486084, + "learning_rate": 7.138319189755002e-06, + "loss": 0.2103, + "step": 3829 + }, + { + "epoch": 1.8135543060076946, + "grad_norm": 2.3192245960235596, + "learning_rate": 7.1334193371466675e-06, + "loss": 0.231, + "step": 3830 + }, + { + "epoch": 1.8140278188813257, + "grad_norm": 1.136781096458435, + "learning_rate": 7.128520234264851e-06, + "loss": 0.2119, + "step": 3831 + }, + { + "epoch": 1.8145013317549572, + "grad_norm": 1.3078882694244385, + "learning_rate": 7.1236218823908645e-06, + "loss": 0.2428, + "step": 3832 + }, + { + "epoch": 1.8149748446285883, + "grad_norm": 1.2478915452957153, + "learning_rate": 7.118724282805825e-06, + "loss": 0.2439, + "step": 3833 + }, + { + "epoch": 1.8154483575022196, + "grad_norm": 1.50113844871521, + "learning_rate": 7.113827436790655e-06, + "loss": 0.2277, + "step": 3834 + }, + { + "epoch": 1.8159218703758508, + "grad_norm": 1.2147942781448364, + "learning_rate": 7.108931345626074e-06, + "loss": 0.2285, + "step": 3835 + }, + { + "epoch": 1.8163953832494821, + "grad_norm": 1.1065564155578613, + "learning_rate": 7.104036010592609e-06, + "loss": 0.1995, + "step": 3836 + }, + { + "epoch": 1.8168688961231134, + "grad_norm": 1.3819801807403564, + "learning_rate": 7.099141432970588e-06, + "loss": 0.1979, + "step": 3837 + }, + { + "epoch": 1.8173424089967445, + "grad_norm": 0.9944810271263123, + "learning_rate": 7.094247614040139e-06, + "loss": 0.2053, + "step": 3838 + }, + { + "epoch": 1.817815921870376, + "grad_norm": 1.5402950048446655, + "learning_rate": 7.0893545550811956e-06, + "loss": 0.2208, + "step": 3839 + }, + { + "epoch": 1.818289434744007, + "grad_norm": 1.463292121887207, + "learning_rate": 7.08446225737349e-06, + "loss": 0.2016, + "step": 3840 + }, + { + "epoch": 1.8187629476176383, + "grad_norm": 1.0113959312438965, + "learning_rate": 7.079570722196553e-06, + "loss": 0.2195, + "step": 3841 + }, + { + "epoch": 1.8192364604912696, + "grad_norm": 1.070245623588562, + "learning_rate": 7.074679950829719e-06, + "loss": 0.2084, + "step": 3842 + }, + { + "epoch": 1.8197099733649007, + "grad_norm": 1.2320399284362793, + "learning_rate": 7.069789944552124e-06, + "loss": 0.206, + "step": 3843 + }, + { + "epoch": 1.8201834862385322, + "grad_norm": 1.2585248947143555, + "learning_rate": 7.0649007046427006e-06, + "loss": 0.2279, + "step": 3844 + }, + { + "epoch": 1.8206569991121633, + "grad_norm": 0.9737508296966553, + "learning_rate": 7.060012232380182e-06, + "loss": 0.1996, + "step": 3845 + }, + { + "epoch": 1.8211305119857946, + "grad_norm": 1.2638822793960571, + "learning_rate": 7.0551245290431e-06, + "loss": 0.1924, + "step": 3846 + }, + { + "epoch": 1.8216040248594259, + "grad_norm": 1.16457200050354, + "learning_rate": 7.0502375959097904e-06, + "loss": 0.2235, + "step": 3847 + }, + { + "epoch": 1.8220775377330571, + "grad_norm": 1.078177809715271, + "learning_rate": 7.045351434258378e-06, + "loss": 0.2087, + "step": 3848 + }, + { + "epoch": 1.8225510506066884, + "grad_norm": 1.1603608131408691, + "learning_rate": 7.040466045366796e-06, + "loss": 0.2128, + "step": 3849 + }, + { + "epoch": 1.8230245634803195, + "grad_norm": 1.2323132753372192, + "learning_rate": 7.03558143051277e-06, + "loss": 0.2041, + "step": 3850 + }, + { + "epoch": 1.823498076353951, + "grad_norm": 1.5576890707015991, + "learning_rate": 7.0306975909738205e-06, + "loss": 0.2194, + "step": 3851 + }, + { + "epoch": 1.823971589227582, + "grad_norm": 0.993789553642273, + "learning_rate": 7.025814528027272e-06, + "loss": 0.1883, + "step": 3852 + }, + { + "epoch": 1.8244451021012134, + "grad_norm": 1.0642006397247314, + "learning_rate": 7.02093224295024e-06, + "loss": 0.1956, + "step": 3853 + }, + { + "epoch": 1.8249186149748446, + "grad_norm": 1.1008495092391968, + "learning_rate": 7.016050737019641e-06, + "loss": 0.2071, + "step": 3854 + }, + { + "epoch": 1.825392127848476, + "grad_norm": 1.0528384447097778, + "learning_rate": 7.0111700115121835e-06, + "loss": 0.2297, + "step": 3855 + }, + { + "epoch": 1.8258656407221072, + "grad_norm": 1.6767280101776123, + "learning_rate": 7.006290067704378e-06, + "loss": 0.2402, + "step": 3856 + }, + { + "epoch": 1.8263391535957383, + "grad_norm": 0.9541283249855042, + "learning_rate": 7.001410906872522e-06, + "loss": 0.1919, + "step": 3857 + }, + { + "epoch": 1.8268126664693698, + "grad_norm": 0.9566398859024048, + "learning_rate": 6.996532530292717e-06, + "loss": 0.1902, + "step": 3858 + }, + { + "epoch": 1.8272861793430009, + "grad_norm": 0.9043372869491577, + "learning_rate": 6.991654939240855e-06, + "loss": 0.2088, + "step": 3859 + }, + { + "epoch": 1.8277596922166321, + "grad_norm": 1.314922571182251, + "learning_rate": 6.98677813499262e-06, + "loss": 0.2167, + "step": 3860 + }, + { + "epoch": 1.8282332050902634, + "grad_norm": 1.1723600625991821, + "learning_rate": 6.981902118823495e-06, + "loss": 0.2107, + "step": 3861 + }, + { + "epoch": 1.8287067179638945, + "grad_norm": 1.5371065139770508, + "learning_rate": 6.977026892008753e-06, + "loss": 0.2167, + "step": 3862 + }, + { + "epoch": 1.829180230837526, + "grad_norm": 1.1447199583053589, + "learning_rate": 6.972152455823467e-06, + "loss": 0.2027, + "step": 3863 + }, + { + "epoch": 1.829653743711157, + "grad_norm": 1.6060473918914795, + "learning_rate": 6.967278811542495e-06, + "loss": 0.2164, + "step": 3864 + }, + { + "epoch": 1.8301272565847884, + "grad_norm": 1.6065421104431152, + "learning_rate": 6.96240596044049e-06, + "loss": 0.2087, + "step": 3865 + }, + { + "epoch": 1.8306007694584197, + "grad_norm": 1.0932152271270752, + "learning_rate": 6.957533903791904e-06, + "loss": 0.2294, + "step": 3866 + }, + { + "epoch": 1.831074282332051, + "grad_norm": 1.129197359085083, + "learning_rate": 6.9526626428709745e-06, + "loss": 0.1843, + "step": 3867 + }, + { + "epoch": 1.8315477952056822, + "grad_norm": 2.516554117202759, + "learning_rate": 6.947792178951733e-06, + "loss": 0.2076, + "step": 3868 + }, + { + "epoch": 1.8320213080793133, + "grad_norm": 1.335574984550476, + "learning_rate": 6.942922513308001e-06, + "loss": 0.2114, + "step": 3869 + }, + { + "epoch": 1.8324948209529448, + "grad_norm": 0.9613366723060608, + "learning_rate": 6.9380536472133945e-06, + "loss": 0.2074, + "step": 3870 + }, + { + "epoch": 1.8329683338265759, + "grad_norm": 1.1925537586212158, + "learning_rate": 6.933185581941316e-06, + "loss": 0.2034, + "step": 3871 + }, + { + "epoch": 1.8334418467002072, + "grad_norm": 1.1290981769561768, + "learning_rate": 6.928318318764964e-06, + "loss": 0.1972, + "step": 3872 + }, + { + "epoch": 1.8339153595738384, + "grad_norm": 1.4056495428085327, + "learning_rate": 6.923451858957322e-06, + "loss": 0.2269, + "step": 3873 + }, + { + "epoch": 1.8343888724474695, + "grad_norm": 1.2841758728027344, + "learning_rate": 6.918586203791169e-06, + "loss": 0.1974, + "step": 3874 + }, + { + "epoch": 1.834862385321101, + "grad_norm": 0.927555501461029, + "learning_rate": 6.913721354539065e-06, + "loss": 0.2272, + "step": 3875 + }, + { + "epoch": 1.835335898194732, + "grad_norm": 1.3648067712783813, + "learning_rate": 6.908857312473366e-06, + "loss": 0.2267, + "step": 3876 + }, + { + "epoch": 1.8358094110683634, + "grad_norm": 1.9164044857025146, + "learning_rate": 6.903994078866216e-06, + "loss": 0.226, + "step": 3877 + }, + { + "epoch": 1.8362829239419947, + "grad_norm": 1.0202974081039429, + "learning_rate": 6.899131654989548e-06, + "loss": 0.205, + "step": 3878 + }, + { + "epoch": 1.836756436815626, + "grad_norm": 1.1604101657867432, + "learning_rate": 6.894270042115081e-06, + "loss": 0.1836, + "step": 3879 + }, + { + "epoch": 1.8372299496892572, + "grad_norm": 1.1902738809585571, + "learning_rate": 6.889409241514323e-06, + "loss": 0.2165, + "step": 3880 + }, + { + "epoch": 1.8377034625628883, + "grad_norm": 1.178782343864441, + "learning_rate": 6.88454925445857e-06, + "loss": 0.1952, + "step": 3881 + }, + { + "epoch": 1.8381769754365198, + "grad_norm": 1.345608115196228, + "learning_rate": 6.879690082218903e-06, + "loss": 0.2005, + "step": 3882 + }, + { + "epoch": 1.8386504883101509, + "grad_norm": 1.6030305624008179, + "learning_rate": 6.874831726066194e-06, + "loss": 0.2087, + "step": 3883 + }, + { + "epoch": 1.8391240011837822, + "grad_norm": 1.1542789936065674, + "learning_rate": 6.869974187271098e-06, + "loss": 0.1984, + "step": 3884 + }, + { + "epoch": 1.8395975140574135, + "grad_norm": 1.1518371105194092, + "learning_rate": 6.865117467104058e-06, + "loss": 0.2063, + "step": 3885 + }, + { + "epoch": 1.8400710269310447, + "grad_norm": 1.9634912014007568, + "learning_rate": 6.8602615668353e-06, + "loss": 0.2074, + "step": 3886 + }, + { + "epoch": 1.840544539804676, + "grad_norm": 1.0860075950622559, + "learning_rate": 6.85540648773484e-06, + "loss": 0.202, + "step": 3887 + }, + { + "epoch": 1.841018052678307, + "grad_norm": 1.4793963432312012, + "learning_rate": 6.850552231072477e-06, + "loss": 0.229, + "step": 3888 + }, + { + "epoch": 1.8414915655519386, + "grad_norm": 1.0886497497558594, + "learning_rate": 6.845698798117795e-06, + "loss": 0.234, + "step": 3889 + }, + { + "epoch": 1.8419650784255697, + "grad_norm": 2.2416272163391113, + "learning_rate": 6.840846190140161e-06, + "loss": 0.2166, + "step": 3890 + }, + { + "epoch": 1.842438591299201, + "grad_norm": 1.1083406209945679, + "learning_rate": 6.83599440840873e-06, + "loss": 0.2016, + "step": 3891 + }, + { + "epoch": 1.8429121041728322, + "grad_norm": 1.5353022813796997, + "learning_rate": 6.831143454192437e-06, + "loss": 0.1789, + "step": 3892 + }, + { + "epoch": 1.8433856170464633, + "grad_norm": 1.475759744644165, + "learning_rate": 6.826293328760004e-06, + "loss": 0.2397, + "step": 3893 + }, + { + "epoch": 1.8438591299200948, + "grad_norm": 1.3669559955596924, + "learning_rate": 6.821444033379936e-06, + "loss": 0.2096, + "step": 3894 + }, + { + "epoch": 1.8443326427937259, + "grad_norm": 0.972876787185669, + "learning_rate": 6.816595569320514e-06, + "loss": 0.2265, + "step": 3895 + }, + { + "epoch": 1.8448061556673572, + "grad_norm": 1.7018649578094482, + "learning_rate": 6.8117479378498104e-06, + "loss": 0.2174, + "step": 3896 + }, + { + "epoch": 1.8452796685409885, + "grad_norm": 1.9252698421478271, + "learning_rate": 6.806901140235675e-06, + "loss": 0.2152, + "step": 3897 + }, + { + "epoch": 1.8457531814146197, + "grad_norm": 1.056630253791809, + "learning_rate": 6.802055177745743e-06, + "loss": 0.2357, + "step": 3898 + }, + { + "epoch": 1.846226694288251, + "grad_norm": 1.101554036140442, + "learning_rate": 6.79721005164743e-06, + "loss": 0.213, + "step": 3899 + }, + { + "epoch": 1.846700207161882, + "grad_norm": 1.172674536705017, + "learning_rate": 6.792365763207926e-06, + "loss": 0.1947, + "step": 3900 + }, + { + "epoch": 1.8471737200355136, + "grad_norm": 2.0155622959136963, + "learning_rate": 6.787522313694214e-06, + "loss": 0.2403, + "step": 3901 + }, + { + "epoch": 1.8476472329091447, + "grad_norm": 1.046518325805664, + "learning_rate": 6.782679704373051e-06, + "loss": 0.2045, + "step": 3902 + }, + { + "epoch": 1.848120745782776, + "grad_norm": 1.204442024230957, + "learning_rate": 6.777837936510971e-06, + "loss": 0.2257, + "step": 3903 + }, + { + "epoch": 1.8485942586564073, + "grad_norm": 1.0126386880874634, + "learning_rate": 6.772997011374294e-06, + "loss": 0.194, + "step": 3904 + }, + { + "epoch": 1.8490677715300383, + "grad_norm": 1.27089524269104, + "learning_rate": 6.768156930229118e-06, + "loss": 0.1956, + "step": 3905 + }, + { + "epoch": 1.8495412844036698, + "grad_norm": 1.372044563293457, + "learning_rate": 6.763317694341319e-06, + "loss": 0.1892, + "step": 3906 + }, + { + "epoch": 1.850014797277301, + "grad_norm": 1.0626308917999268, + "learning_rate": 6.758479304976553e-06, + "loss": 0.2172, + "step": 3907 + }, + { + "epoch": 1.8504883101509322, + "grad_norm": 0.8939222693443298, + "learning_rate": 6.753641763400252e-06, + "loss": 0.2101, + "step": 3908 + }, + { + "epoch": 1.8509618230245635, + "grad_norm": 1.0738952159881592, + "learning_rate": 6.748805070877632e-06, + "loss": 0.195, + "step": 3909 + }, + { + "epoch": 1.8514353358981948, + "grad_norm": 2.0406644344329834, + "learning_rate": 6.743969228673679e-06, + "loss": 0.2011, + "step": 3910 + }, + { + "epoch": 1.851908848771826, + "grad_norm": 1.1621339321136475, + "learning_rate": 6.739134238053162e-06, + "loss": 0.2406, + "step": 3911 + }, + { + "epoch": 1.852382361645457, + "grad_norm": 1.1607718467712402, + "learning_rate": 6.734300100280629e-06, + "loss": 0.2154, + "step": 3912 + }, + { + "epoch": 1.8528558745190886, + "grad_norm": 1.2867686748504639, + "learning_rate": 6.729466816620398e-06, + "loss": 0.2091, + "step": 3913 + }, + { + "epoch": 1.8533293873927197, + "grad_norm": 1.1185880899429321, + "learning_rate": 6.724634388336571e-06, + "loss": 0.2298, + "step": 3914 + }, + { + "epoch": 1.853802900266351, + "grad_norm": 1.1135728359222412, + "learning_rate": 6.71980281669302e-06, + "loss": 0.2092, + "step": 3915 + }, + { + "epoch": 1.8542764131399823, + "grad_norm": 0.8761497735977173, + "learning_rate": 6.714972102953399e-06, + "loss": 0.1895, + "step": 3916 + }, + { + "epoch": 1.8547499260136135, + "grad_norm": 1.1779834032058716, + "learning_rate": 6.710142248381133e-06, + "loss": 0.2315, + "step": 3917 + }, + { + "epoch": 1.8552234388872448, + "grad_norm": 0.9498329758644104, + "learning_rate": 6.705313254239424e-06, + "loss": 0.2117, + "step": 3918 + }, + { + "epoch": 1.855696951760876, + "grad_norm": 1.2298184633255005, + "learning_rate": 6.700485121791252e-06, + "loss": 0.2018, + "step": 3919 + }, + { + "epoch": 1.8561704646345074, + "grad_norm": 1.0038814544677734, + "learning_rate": 6.695657852299362e-06, + "loss": 0.1963, + "step": 3920 + }, + { + "epoch": 1.8566439775081385, + "grad_norm": 1.4736988544464111, + "learning_rate": 6.690831447026283e-06, + "loss": 0.2308, + "step": 3921 + }, + { + "epoch": 1.8571174903817698, + "grad_norm": 1.168468952178955, + "learning_rate": 6.686005907234317e-06, + "loss": 0.2056, + "step": 3922 + }, + { + "epoch": 1.857591003255401, + "grad_norm": 1.3906092643737793, + "learning_rate": 6.681181234185532e-06, + "loss": 0.2446, + "step": 3923 + }, + { + "epoch": 1.8580645161290321, + "grad_norm": 1.0936076641082764, + "learning_rate": 6.6763574291417795e-06, + "loss": 0.1935, + "step": 3924 + }, + { + "epoch": 1.8585380290026636, + "grad_norm": 1.0105923414230347, + "learning_rate": 6.67153449336468e-06, + "loss": 0.2192, + "step": 3925 + }, + { + "epoch": 1.8590115418762947, + "grad_norm": 1.5099976062774658, + "learning_rate": 6.666712428115621e-06, + "loss": 0.199, + "step": 3926 + }, + { + "epoch": 1.859485054749926, + "grad_norm": 1.0132044553756714, + "learning_rate": 6.661891234655769e-06, + "loss": 0.2101, + "step": 3927 + }, + { + "epoch": 1.8599585676235573, + "grad_norm": 0.927442193031311, + "learning_rate": 6.657070914246063e-06, + "loss": 0.1969, + "step": 3928 + }, + { + "epoch": 1.8604320804971886, + "grad_norm": 1.5215531587600708, + "learning_rate": 6.6522514681472105e-06, + "loss": 0.1945, + "step": 3929 + }, + { + "epoch": 1.8609055933708198, + "grad_norm": 1.445884108543396, + "learning_rate": 6.64743289761969e-06, + "loss": 0.2198, + "step": 3930 + }, + { + "epoch": 1.861379106244451, + "grad_norm": 1.817511796951294, + "learning_rate": 6.64261520392375e-06, + "loss": 0.2005, + "step": 3931 + }, + { + "epoch": 1.8618526191180824, + "grad_norm": 1.0783874988555908, + "learning_rate": 6.637798388319416e-06, + "loss": 0.1863, + "step": 3932 + }, + { + "epoch": 1.8623261319917135, + "grad_norm": 1.2979896068572998, + "learning_rate": 6.632982452066476e-06, + "loss": 0.2191, + "step": 3933 + }, + { + "epoch": 1.8627996448653448, + "grad_norm": 0.9577775001525879, + "learning_rate": 6.628167396424494e-06, + "loss": 0.1862, + "step": 3934 + }, + { + "epoch": 1.863273157738976, + "grad_norm": 1.0281299352645874, + "learning_rate": 6.623353222652802e-06, + "loss": 0.2308, + "step": 3935 + }, + { + "epoch": 1.8637466706126071, + "grad_norm": 1.4038194417953491, + "learning_rate": 6.6185399320105e-06, + "loss": 0.2115, + "step": 3936 + }, + { + "epoch": 1.8642201834862386, + "grad_norm": 2.125293254852295, + "learning_rate": 6.613727525756455e-06, + "loss": 0.2016, + "step": 3937 + }, + { + "epoch": 1.8646936963598697, + "grad_norm": 1.3290354013442993, + "learning_rate": 6.608916005149311e-06, + "loss": 0.1913, + "step": 3938 + }, + { + "epoch": 1.865167209233501, + "grad_norm": 1.1084948778152466, + "learning_rate": 6.604105371447469e-06, + "loss": 0.1794, + "step": 3939 + }, + { + "epoch": 1.8656407221071323, + "grad_norm": 1.267200231552124, + "learning_rate": 6.599295625909107e-06, + "loss": 0.1885, + "step": 3940 + }, + { + "epoch": 1.8661142349807636, + "grad_norm": 1.9131397008895874, + "learning_rate": 6.5944867697921654e-06, + "loss": 0.224, + "step": 3941 + }, + { + "epoch": 1.8665877478543949, + "grad_norm": 1.2804118394851685, + "learning_rate": 6.589678804354353e-06, + "loss": 0.1945, + "step": 3942 + }, + { + "epoch": 1.867061260728026, + "grad_norm": 0.9017341732978821, + "learning_rate": 6.584871730853153e-06, + "loss": 0.2059, + "step": 3943 + }, + { + "epoch": 1.8675347736016574, + "grad_norm": 1.365110158920288, + "learning_rate": 6.5800655505458065e-06, + "loss": 0.222, + "step": 3944 + }, + { + "epoch": 1.8680082864752885, + "grad_norm": 0.8447331190109253, + "learning_rate": 6.5752602646893224e-06, + "loss": 0.1945, + "step": 3945 + }, + { + "epoch": 1.8684817993489198, + "grad_norm": 1.1045807600021362, + "learning_rate": 6.5704558745404755e-06, + "loss": 0.1969, + "step": 3946 + }, + { + "epoch": 1.868955312222551, + "grad_norm": 1.3846262693405151, + "learning_rate": 6.56565238135581e-06, + "loss": 0.2017, + "step": 3947 + }, + { + "epoch": 1.8694288250961824, + "grad_norm": 1.107153058052063, + "learning_rate": 6.560849786391632e-06, + "loss": 0.2207, + "step": 3948 + }, + { + "epoch": 1.8699023379698136, + "grad_norm": 1.206408143043518, + "learning_rate": 6.556048090904015e-06, + "loss": 0.2088, + "step": 3949 + }, + { + "epoch": 1.8703758508434447, + "grad_norm": 1.063124418258667, + "learning_rate": 6.5512472961487946e-06, + "loss": 0.2066, + "step": 3950 + }, + { + "epoch": 1.8708493637170762, + "grad_norm": 1.2000980377197266, + "learning_rate": 6.5464474033815754e-06, + "loss": 0.1757, + "step": 3951 + }, + { + "epoch": 1.8713228765907073, + "grad_norm": 1.3470823764801025, + "learning_rate": 6.541648413857718e-06, + "loss": 0.2252, + "step": 3952 + }, + { + "epoch": 1.8717963894643386, + "grad_norm": 1.3533769845962524, + "learning_rate": 6.536850328832358e-06, + "loss": 0.2145, + "step": 3953 + }, + { + "epoch": 1.8722699023379699, + "grad_norm": 1.6405057907104492, + "learning_rate": 6.5320531495603825e-06, + "loss": 0.1892, + "step": 3954 + }, + { + "epoch": 1.872743415211601, + "grad_norm": 1.296522855758667, + "learning_rate": 6.527256877296449e-06, + "loss": 0.2205, + "step": 3955 + }, + { + "epoch": 1.8732169280852324, + "grad_norm": 1.0236841440200806, + "learning_rate": 6.522461513294979e-06, + "loss": 0.1865, + "step": 3956 + }, + { + "epoch": 1.8736904409588635, + "grad_norm": 1.0961860418319702, + "learning_rate": 6.517667058810151e-06, + "loss": 0.1896, + "step": 3957 + }, + { + "epoch": 1.8741639538324948, + "grad_norm": 1.3094490766525269, + "learning_rate": 6.5128735150959075e-06, + "loss": 0.2059, + "step": 3958 + }, + { + "epoch": 1.874637466706126, + "grad_norm": 1.5833834409713745, + "learning_rate": 6.508080883405957e-06, + "loss": 0.223, + "step": 3959 + }, + { + "epoch": 1.8751109795797574, + "grad_norm": 0.9895943999290466, + "learning_rate": 6.503289164993765e-06, + "loss": 0.2081, + "step": 3960 + }, + { + "epoch": 1.8755844924533887, + "grad_norm": 1.386003851890564, + "learning_rate": 6.498498361112557e-06, + "loss": 0.2138, + "step": 3961 + }, + { + "epoch": 1.8760580053270197, + "grad_norm": 1.1875851154327393, + "learning_rate": 6.4937084730153236e-06, + "loss": 0.2184, + "step": 3962 + }, + { + "epoch": 1.8765315182006512, + "grad_norm": 1.4439812898635864, + "learning_rate": 6.4889195019548155e-06, + "loss": 0.2191, + "step": 3963 + }, + { + "epoch": 1.8770050310742823, + "grad_norm": 1.2867056131362915, + "learning_rate": 6.484131449183537e-06, + "loss": 0.195, + "step": 3964 + }, + { + "epoch": 1.8774785439479136, + "grad_norm": 1.1023000478744507, + "learning_rate": 6.47934431595376e-06, + "loss": 0.2226, + "step": 3965 + }, + { + "epoch": 1.8779520568215449, + "grad_norm": 2.235792636871338, + "learning_rate": 6.474558103517513e-06, + "loss": 0.2143, + "step": 3966 + }, + { + "epoch": 1.878425569695176, + "grad_norm": 1.0556797981262207, + "learning_rate": 6.469772813126584e-06, + "loss": 0.1981, + "step": 3967 + }, + { + "epoch": 1.8788990825688074, + "grad_norm": 0.9840413928031921, + "learning_rate": 6.464988446032518e-06, + "loss": 0.2003, + "step": 3968 + }, + { + "epoch": 1.8793725954424385, + "grad_norm": 1.3949209451675415, + "learning_rate": 6.460205003486621e-06, + "loss": 0.2189, + "step": 3969 + }, + { + "epoch": 1.8798461083160698, + "grad_norm": 0.8188128471374512, + "learning_rate": 6.4554224867399575e-06, + "loss": 0.1956, + "step": 3970 + }, + { + "epoch": 1.880319621189701, + "grad_norm": 1.0726219415664673, + "learning_rate": 6.450640897043346e-06, + "loss": 0.2257, + "step": 3971 + }, + { + "epoch": 1.8807931340633324, + "grad_norm": 1.4881572723388672, + "learning_rate": 6.445860235647367e-06, + "loss": 0.1915, + "step": 3972 + }, + { + "epoch": 1.8812666469369637, + "grad_norm": 1.1142457723617554, + "learning_rate": 6.44108050380236e-06, + "loss": 0.1911, + "step": 3973 + }, + { + "epoch": 1.8817401598105947, + "grad_norm": 1.912439227104187, + "learning_rate": 6.43630170275841e-06, + "loss": 0.1943, + "step": 3974 + }, + { + "epoch": 1.8822136726842262, + "grad_norm": 1.1108665466308594, + "learning_rate": 6.431523833765369e-06, + "loss": 0.2103, + "step": 3975 + }, + { + "epoch": 1.8826871855578573, + "grad_norm": 1.0399905443191528, + "learning_rate": 6.426746898072845e-06, + "loss": 0.2138, + "step": 3976 + }, + { + "epoch": 1.8831606984314886, + "grad_norm": 0.9671503305435181, + "learning_rate": 6.421970896930199e-06, + "loss": 0.193, + "step": 3977 + }, + { + "epoch": 1.8836342113051199, + "grad_norm": 1.4614564180374146, + "learning_rate": 6.417195831586545e-06, + "loss": 0.2061, + "step": 3978 + }, + { + "epoch": 1.8841077241787512, + "grad_norm": 1.1578701734542847, + "learning_rate": 6.412421703290759e-06, + "loss": 0.2123, + "step": 3979 + }, + { + "epoch": 1.8845812370523825, + "grad_norm": 1.1793917417526245, + "learning_rate": 6.4076485132914644e-06, + "loss": 0.2134, + "step": 3980 + }, + { + "epoch": 1.8850547499260135, + "grad_norm": 1.1729707717895508, + "learning_rate": 6.402876262837045e-06, + "loss": 0.217, + "step": 3981 + }, + { + "epoch": 1.885528262799645, + "grad_norm": 1.058951735496521, + "learning_rate": 6.398104953175639e-06, + "loss": 0.2041, + "step": 3982 + }, + { + "epoch": 1.886001775673276, + "grad_norm": 1.7468572854995728, + "learning_rate": 6.393334585555133e-06, + "loss": 0.215, + "step": 3983 + }, + { + "epoch": 1.8864752885469074, + "grad_norm": 0.8419433832168579, + "learning_rate": 6.388565161223172e-06, + "loss": 0.1718, + "step": 3984 + }, + { + "epoch": 1.8869488014205387, + "grad_norm": 1.8909525871276855, + "learning_rate": 6.383796681427154e-06, + "loss": 0.1912, + "step": 3985 + }, + { + "epoch": 1.8874223142941697, + "grad_norm": 1.1229642629623413, + "learning_rate": 6.379029147414228e-06, + "loss": 0.2449, + "step": 3986 + }, + { + "epoch": 1.8878958271678012, + "grad_norm": 1.213046669960022, + "learning_rate": 6.374262560431297e-06, + "loss": 0.1952, + "step": 3987 + }, + { + "epoch": 1.8883693400414323, + "grad_norm": 1.6721361875534058, + "learning_rate": 6.369496921725016e-06, + "loss": 0.2018, + "step": 3988 + }, + { + "epoch": 1.8888428529150636, + "grad_norm": 1.6675399541854858, + "learning_rate": 6.364732232541788e-06, + "loss": 0.2019, + "step": 3989 + }, + { + "epoch": 1.8893163657886949, + "grad_norm": 1.043617606163025, + "learning_rate": 6.359968494127777e-06, + "loss": 0.1982, + "step": 3990 + }, + { + "epoch": 1.8897898786623262, + "grad_norm": 0.9541271924972534, + "learning_rate": 6.355205707728889e-06, + "loss": 0.2128, + "step": 3991 + }, + { + "epoch": 1.8902633915359575, + "grad_norm": 1.9646912813186646, + "learning_rate": 6.350443874590786e-06, + "loss": 0.1915, + "step": 3992 + }, + { + "epoch": 1.8907369044095885, + "grad_norm": 1.3205372095108032, + "learning_rate": 6.34568299595888e-06, + "loss": 0.1902, + "step": 3993 + }, + { + "epoch": 1.89121041728322, + "grad_norm": 2.0450875759124756, + "learning_rate": 6.340923073078333e-06, + "loss": 0.208, + "step": 3994 + }, + { + "epoch": 1.891683930156851, + "grad_norm": 1.3528573513031006, + "learning_rate": 6.336164107194056e-06, + "loss": 0.2167, + "step": 3995 + }, + { + "epoch": 1.8921574430304824, + "grad_norm": 1.0544376373291016, + "learning_rate": 6.331406099550711e-06, + "loss": 0.213, + "step": 3996 + }, + { + "epoch": 1.8926309559041137, + "grad_norm": 1.3501182794570923, + "learning_rate": 6.326649051392709e-06, + "loss": 0.1947, + "step": 3997 + }, + { + "epoch": 1.8931044687777447, + "grad_norm": 1.4328538179397583, + "learning_rate": 6.321892963964214e-06, + "loss": 0.1958, + "step": 3998 + }, + { + "epoch": 1.8935779816513763, + "grad_norm": 1.6328697204589844, + "learning_rate": 6.317137838509126e-06, + "loss": 0.2082, + "step": 3999 + }, + { + "epoch": 1.8940514945250073, + "grad_norm": 2.273808240890503, + "learning_rate": 6.3123836762711085e-06, + "loss": 0.2325, + "step": 4000 + }, + { + "epoch": 1.8945250073986386, + "grad_norm": 1.4385737180709839, + "learning_rate": 6.307630478493565e-06, + "loss": 0.2211, + "step": 4001 + }, + { + "epoch": 1.89499852027227, + "grad_norm": 1.7842854261398315, + "learning_rate": 6.30287824641965e-06, + "loss": 0.2317, + "step": 4002 + }, + { + "epoch": 1.8954720331459012, + "grad_norm": 1.4352833032608032, + "learning_rate": 6.298126981292263e-06, + "loss": 0.2032, + "step": 4003 + }, + { + "epoch": 1.8959455460195325, + "grad_norm": 1.0405994653701782, + "learning_rate": 6.29337668435405e-06, + "loss": 0.213, + "step": 4004 + }, + { + "epoch": 1.8964190588931635, + "grad_norm": 1.2680540084838867, + "learning_rate": 6.288627356847407e-06, + "loss": 0.2208, + "step": 4005 + }, + { + "epoch": 1.896892571766795, + "grad_norm": 0.9765031337738037, + "learning_rate": 6.283879000014476e-06, + "loss": 0.1896, + "step": 4006 + }, + { + "epoch": 1.897366084640426, + "grad_norm": 1.2945647239685059, + "learning_rate": 6.27913161509714e-06, + "loss": 0.2107, + "step": 4007 + }, + { + "epoch": 1.8978395975140574, + "grad_norm": 1.2577142715454102, + "learning_rate": 6.274385203337039e-06, + "loss": 0.2156, + "step": 4008 + }, + { + "epoch": 1.8983131103876887, + "grad_norm": 2.5358786582946777, + "learning_rate": 6.269639765975543e-06, + "loss": 0.2061, + "step": 4009 + }, + { + "epoch": 1.89878662326132, + "grad_norm": 0.9788986444473267, + "learning_rate": 6.264895304253779e-06, + "loss": 0.1898, + "step": 4010 + }, + { + "epoch": 1.8992601361349513, + "grad_norm": 1.0234096050262451, + "learning_rate": 6.260151819412616e-06, + "loss": 0.1886, + "step": 4011 + }, + { + "epoch": 1.8997336490085823, + "grad_norm": 1.4769940376281738, + "learning_rate": 6.255409312692664e-06, + "loss": 0.2173, + "step": 4012 + }, + { + "epoch": 1.9002071618822138, + "grad_norm": 1.2099794149398804, + "learning_rate": 6.250667785334282e-06, + "loss": 0.2085, + "step": 4013 + }, + { + "epoch": 1.900680674755845, + "grad_norm": 1.8601834774017334, + "learning_rate": 6.24592723857757e-06, + "loss": 0.2349, + "step": 4014 + }, + { + "epoch": 1.9011541876294762, + "grad_norm": 1.1096925735473633, + "learning_rate": 6.241187673662375e-06, + "loss": 0.2258, + "step": 4015 + }, + { + "epoch": 1.9016277005031075, + "grad_norm": 1.2290061712265015, + "learning_rate": 6.236449091828278e-06, + "loss": 0.2111, + "step": 4016 + }, + { + "epoch": 1.9021012133767385, + "grad_norm": 1.0335679054260254, + "learning_rate": 6.231711494314618e-06, + "loss": 0.1963, + "step": 4017 + }, + { + "epoch": 1.90257472625037, + "grad_norm": 1.2123810052871704, + "learning_rate": 6.226974882360462e-06, + "loss": 0.2317, + "step": 4018 + }, + { + "epoch": 1.9030482391240011, + "grad_norm": 1.043067216873169, + "learning_rate": 6.222239257204625e-06, + "loss": 0.2035, + "step": 4019 + }, + { + "epoch": 1.9035217519976324, + "grad_norm": 1.1024198532104492, + "learning_rate": 6.217504620085662e-06, + "loss": 0.2172, + "step": 4020 + }, + { + "epoch": 1.9039952648712637, + "grad_norm": 1.036494493484497, + "learning_rate": 6.21277097224188e-06, + "loss": 0.2044, + "step": 4021 + }, + { + "epoch": 1.904468777744895, + "grad_norm": 1.0444680452346802, + "learning_rate": 6.208038314911312e-06, + "loss": 0.1994, + "step": 4022 + }, + { + "epoch": 1.9049422906185263, + "grad_norm": 1.1238281726837158, + "learning_rate": 6.203306649331744e-06, + "loss": 0.1888, + "step": 4023 + }, + { + "epoch": 1.9054158034921573, + "grad_norm": 0.9902118444442749, + "learning_rate": 6.1985759767406925e-06, + "loss": 0.1989, + "step": 4024 + }, + { + "epoch": 1.9058893163657888, + "grad_norm": 1.583835244178772, + "learning_rate": 6.1938462983754235e-06, + "loss": 0.2093, + "step": 4025 + }, + { + "epoch": 1.90636282923942, + "grad_norm": 1.8781702518463135, + "learning_rate": 6.189117615472935e-06, + "loss": 0.2073, + "step": 4026 + }, + { + "epoch": 1.9068363421130512, + "grad_norm": 1.4825783967971802, + "learning_rate": 6.1843899292699695e-06, + "loss": 0.2244, + "step": 4027 + }, + { + "epoch": 1.9073098549866825, + "grad_norm": 1.3632996082305908, + "learning_rate": 6.179663241003008e-06, + "loss": 0.2145, + "step": 4028 + }, + { + "epoch": 1.9077833678603136, + "grad_norm": 1.502853274345398, + "learning_rate": 6.174937551908271e-06, + "loss": 0.2004, + "step": 4029 + }, + { + "epoch": 1.908256880733945, + "grad_norm": 1.4652563333511353, + "learning_rate": 6.170212863221715e-06, + "loss": 0.2041, + "step": 4030 + }, + { + "epoch": 1.9087303936075761, + "grad_norm": 1.1851582527160645, + "learning_rate": 6.165489176179039e-06, + "loss": 0.2071, + "step": 4031 + }, + { + "epoch": 1.9092039064812074, + "grad_norm": 1.044634222984314, + "learning_rate": 6.16076649201568e-06, + "loss": 0.203, + "step": 4032 + }, + { + "epoch": 1.9096774193548387, + "grad_norm": 1.1396992206573486, + "learning_rate": 6.1560448119668034e-06, + "loss": 0.2165, + "step": 4033 + }, + { + "epoch": 1.91015093222847, + "grad_norm": 1.1796884536743164, + "learning_rate": 6.151324137267322e-06, + "loss": 0.2182, + "step": 4034 + }, + { + "epoch": 1.9106244451021013, + "grad_norm": 1.1063097715377808, + "learning_rate": 6.146604469151886e-06, + "loss": 0.2015, + "step": 4035 + }, + { + "epoch": 1.9110979579757323, + "grad_norm": 2.4029173851013184, + "learning_rate": 6.141885808854877e-06, + "loss": 0.231, + "step": 4036 + }, + { + "epoch": 1.9115714708493639, + "grad_norm": 1.0543550252914429, + "learning_rate": 6.137168157610413e-06, + "loss": 0.1866, + "step": 4037 + }, + { + "epoch": 1.912044983722995, + "grad_norm": 1.8748868703842163, + "learning_rate": 6.1324515166523535e-06, + "loss": 0.1752, + "step": 4038 + }, + { + "epoch": 1.9125184965966262, + "grad_norm": 1.1984789371490479, + "learning_rate": 6.1277358872142875e-06, + "loss": 0.213, + "step": 4039 + }, + { + "epoch": 1.9129920094702575, + "grad_norm": 1.075285792350769, + "learning_rate": 6.1230212705295455e-06, + "loss": 0.2163, + "step": 4040 + }, + { + "epoch": 1.9134655223438886, + "grad_norm": 1.1514887809753418, + "learning_rate": 6.118307667831187e-06, + "loss": 0.1981, + "step": 4041 + }, + { + "epoch": 1.91393903521752, + "grad_norm": 1.0806608200073242, + "learning_rate": 6.113595080352013e-06, + "loss": 0.2059, + "step": 4042 + }, + { + "epoch": 1.9144125480911511, + "grad_norm": 0.9989210963249207, + "learning_rate": 6.10888350932455e-06, + "loss": 0.1973, + "step": 4043 + }, + { + "epoch": 1.9148860609647826, + "grad_norm": 1.4374021291732788, + "learning_rate": 6.104172955981069e-06, + "loss": 0.2153, + "step": 4044 + }, + { + "epoch": 1.9153595738384137, + "grad_norm": 1.3092314004898071, + "learning_rate": 6.0994634215535665e-06, + "loss": 0.1993, + "step": 4045 + }, + { + "epoch": 1.915833086712045, + "grad_norm": 1.3858625888824463, + "learning_rate": 6.094754907273777e-06, + "loss": 0.1935, + "step": 4046 + }, + { + "epoch": 1.9163065995856763, + "grad_norm": 1.9879931211471558, + "learning_rate": 6.090047414373166e-06, + "loss": 0.1879, + "step": 4047 + }, + { + "epoch": 1.9167801124593074, + "grad_norm": 1.0218373537063599, + "learning_rate": 6.085340944082935e-06, + "loss": 0.2103, + "step": 4048 + }, + { + "epoch": 1.9172536253329389, + "grad_norm": 1.021353840827942, + "learning_rate": 6.0806354976340145e-06, + "loss": 0.2111, + "step": 4049 + }, + { + "epoch": 1.91772713820657, + "grad_norm": 1.1734100580215454, + "learning_rate": 6.075931076257069e-06, + "loss": 0.2014, + "step": 4050 + }, + { + "epoch": 1.9182006510802012, + "grad_norm": 1.5947613716125488, + "learning_rate": 6.071227681182494e-06, + "loss": 0.1894, + "step": 4051 + }, + { + "epoch": 1.9186741639538325, + "grad_norm": 1.2673771381378174, + "learning_rate": 6.066525313640419e-06, + "loss": 0.1882, + "step": 4052 + }, + { + "epoch": 1.9191476768274638, + "grad_norm": 1.011744499206543, + "learning_rate": 6.061823974860699e-06, + "loss": 0.2165, + "step": 4053 + }, + { + "epoch": 1.919621189701095, + "grad_norm": 1.5951597690582275, + "learning_rate": 6.057123666072927e-06, + "loss": 0.232, + "step": 4054 + }, + { + "epoch": 1.9200947025747261, + "grad_norm": 1.3709065914154053, + "learning_rate": 6.052424388506421e-06, + "loss": 0.2051, + "step": 4055 + }, + { + "epoch": 1.9205682154483577, + "grad_norm": 1.1398015022277832, + "learning_rate": 6.047726143390236e-06, + "loss": 0.2136, + "step": 4056 + }, + { + "epoch": 1.9210417283219887, + "grad_norm": 1.450731873512268, + "learning_rate": 6.043028931953148e-06, + "loss": 0.2186, + "step": 4057 + }, + { + "epoch": 1.92151524119562, + "grad_norm": 1.0435858964920044, + "learning_rate": 6.03833275542367e-06, + "loss": 0.21, + "step": 4058 + }, + { + "epoch": 1.9219887540692513, + "grad_norm": 1.3922533988952637, + "learning_rate": 6.033637615030039e-06, + "loss": 0.2023, + "step": 4059 + }, + { + "epoch": 1.9224622669428824, + "grad_norm": 1.1114870309829712, + "learning_rate": 6.028943512000227e-06, + "loss": 0.2057, + "step": 4060 + }, + { + "epoch": 1.9229357798165139, + "grad_norm": 1.509615421295166, + "learning_rate": 6.02425044756193e-06, + "loss": 0.1999, + "step": 4061 + }, + { + "epoch": 1.923409292690145, + "grad_norm": 1.4083658456802368, + "learning_rate": 6.019558422942575e-06, + "loss": 0.2151, + "step": 4062 + }, + { + "epoch": 1.9238828055637762, + "grad_norm": 1.1245790719985962, + "learning_rate": 6.014867439369314e-06, + "loss": 0.2422, + "step": 4063 + }, + { + "epoch": 1.9243563184374075, + "grad_norm": 1.3989055156707764, + "learning_rate": 6.010177498069027e-06, + "loss": 0.1943, + "step": 4064 + }, + { + "epoch": 1.9248298313110388, + "grad_norm": 1.1599762439727783, + "learning_rate": 6.005488600268328e-06, + "loss": 0.2005, + "step": 4065 + }, + { + "epoch": 1.92530334418467, + "grad_norm": 1.421303391456604, + "learning_rate": 6.000800747193547e-06, + "loss": 0.2228, + "step": 4066 + }, + { + "epoch": 1.9257768570583012, + "grad_norm": 1.3245368003845215, + "learning_rate": 5.996113940070754e-06, + "loss": 0.2075, + "step": 4067 + }, + { + "epoch": 1.9262503699319327, + "grad_norm": 1.5470653772354126, + "learning_rate": 5.99142818012573e-06, + "loss": 0.2029, + "step": 4068 + }, + { + "epoch": 1.9267238828055637, + "grad_norm": 1.1906659603118896, + "learning_rate": 5.986743468583996e-06, + "loss": 0.2261, + "step": 4069 + }, + { + "epoch": 1.927197395679195, + "grad_norm": 1.2057496309280396, + "learning_rate": 5.982059806670788e-06, + "loss": 0.2086, + "step": 4070 + }, + { + "epoch": 1.9276709085528263, + "grad_norm": 1.0178190469741821, + "learning_rate": 5.9773771956110785e-06, + "loss": 0.1835, + "step": 4071 + }, + { + "epoch": 1.9281444214264574, + "grad_norm": 1.4230746030807495, + "learning_rate": 5.972695636629555e-06, + "loss": 0.2209, + "step": 4072 + }, + { + "epoch": 1.9286179343000889, + "grad_norm": 1.5597596168518066, + "learning_rate": 5.968015130950638e-06, + "loss": 0.1961, + "step": 4073 + }, + { + "epoch": 1.92909144717372, + "grad_norm": 0.9596446752548218, + "learning_rate": 5.963335679798465e-06, + "loss": 0.1941, + "step": 4074 + }, + { + "epoch": 1.9295649600473515, + "grad_norm": 1.1124833822250366, + "learning_rate": 5.958657284396902e-06, + "loss": 0.1979, + "step": 4075 + }, + { + "epoch": 1.9300384729209825, + "grad_norm": 1.0595910549163818, + "learning_rate": 5.953979945969539e-06, + "loss": 0.2157, + "step": 4076 + }, + { + "epoch": 1.9305119857946138, + "grad_norm": 0.9917038083076477, + "learning_rate": 5.949303665739689e-06, + "loss": 0.2023, + "step": 4077 + }, + { + "epoch": 1.930985498668245, + "grad_norm": 1.0265789031982422, + "learning_rate": 5.944628444930388e-06, + "loss": 0.2223, + "step": 4078 + }, + { + "epoch": 1.9314590115418762, + "grad_norm": 1.217456579208374, + "learning_rate": 5.9399542847643935e-06, + "loss": 0.2226, + "step": 4079 + }, + { + "epoch": 1.9319325244155077, + "grad_norm": 1.284399151802063, + "learning_rate": 5.935281186464188e-06, + "loss": 0.2251, + "step": 4080 + }, + { + "epoch": 1.9324060372891387, + "grad_norm": 1.6773645877838135, + "learning_rate": 5.930609151251975e-06, + "loss": 0.2266, + "step": 4081 + }, + { + "epoch": 1.93287955016277, + "grad_norm": 1.1189942359924316, + "learning_rate": 5.925938180349679e-06, + "loss": 0.1975, + "step": 4082 + }, + { + "epoch": 1.9333530630364013, + "grad_norm": 1.3536006212234497, + "learning_rate": 5.921268274978951e-06, + "loss": 0.2106, + "step": 4083 + }, + { + "epoch": 1.9338265759100326, + "grad_norm": 1.3153413534164429, + "learning_rate": 5.9165994363611565e-06, + "loss": 0.2272, + "step": 4084 + }, + { + "epoch": 1.9343000887836639, + "grad_norm": 1.37685227394104, + "learning_rate": 5.911931665717386e-06, + "loss": 0.2237, + "step": 4085 + }, + { + "epoch": 1.934773601657295, + "grad_norm": 1.1147427558898926, + "learning_rate": 5.907264964268451e-06, + "loss": 0.2185, + "step": 4086 + }, + { + "epoch": 1.9352471145309265, + "grad_norm": 1.5562469959259033, + "learning_rate": 5.902599333234882e-06, + "loss": 0.205, + "step": 4087 + }, + { + "epoch": 1.9357206274045575, + "grad_norm": 1.3844199180603027, + "learning_rate": 5.8979347738369276e-06, + "loss": 0.2183, + "step": 4088 + }, + { + "epoch": 1.9361941402781888, + "grad_norm": 1.3082056045532227, + "learning_rate": 5.89327128729456e-06, + "loss": 0.1965, + "step": 4089 + }, + { + "epoch": 1.93666765315182, + "grad_norm": 0.897994875907898, + "learning_rate": 5.888608874827469e-06, + "loss": 0.2081, + "step": 4090 + }, + { + "epoch": 1.9371411660254512, + "grad_norm": 0.9968763589859009, + "learning_rate": 5.883947537655061e-06, + "loss": 0.2208, + "step": 4091 + }, + { + "epoch": 1.9376146788990827, + "grad_norm": 1.1803154945373535, + "learning_rate": 5.8792872769964705e-06, + "loss": 0.1927, + "step": 4092 + }, + { + "epoch": 1.9380881917727137, + "grad_norm": 1.3356378078460693, + "learning_rate": 5.874628094070536e-06, + "loss": 0.1976, + "step": 4093 + }, + { + "epoch": 1.938561704646345, + "grad_norm": 1.4385789632797241, + "learning_rate": 5.869969990095828e-06, + "loss": 0.22, + "step": 4094 + }, + { + "epoch": 1.9390352175199763, + "grad_norm": 1.067402958869934, + "learning_rate": 5.865312966290624e-06, + "loss": 0.2028, + "step": 4095 + }, + { + "epoch": 1.9395087303936076, + "grad_norm": 1.020371675491333, + "learning_rate": 5.8606570238729286e-06, + "loss": 0.2087, + "step": 4096 + }, + { + "epoch": 1.939982243267239, + "grad_norm": 1.094397783279419, + "learning_rate": 5.856002164060453e-06, + "loss": 0.1967, + "step": 4097 + }, + { + "epoch": 1.94045575614087, + "grad_norm": 1.3213562965393066, + "learning_rate": 5.85134838807063e-06, + "loss": 0.2218, + "step": 4098 + }, + { + "epoch": 1.9409292690145015, + "grad_norm": 1.4701647758483887, + "learning_rate": 5.846695697120617e-06, + "loss": 0.2085, + "step": 4099 + }, + { + "epoch": 1.9414027818881325, + "grad_norm": 0.9358981847763062, + "learning_rate": 5.842044092427277e-06, + "loss": 0.2129, + "step": 4100 + }, + { + "epoch": 1.9418762947617638, + "grad_norm": 1.27782142162323, + "learning_rate": 5.837393575207194e-06, + "loss": 0.204, + "step": 4101 + }, + { + "epoch": 1.942349807635395, + "grad_norm": 1.1165828704833984, + "learning_rate": 5.832744146676661e-06, + "loss": 0.2169, + "step": 4102 + }, + { + "epoch": 1.9428233205090262, + "grad_norm": 1.0212310552597046, + "learning_rate": 5.828095808051697e-06, + "loss": 0.2175, + "step": 4103 + }, + { + "epoch": 1.9432968333826577, + "grad_norm": 0.9246084094047546, + "learning_rate": 5.823448560548024e-06, + "loss": 0.1908, + "step": 4104 + }, + { + "epoch": 1.9437703462562888, + "grad_norm": 1.1818164587020874, + "learning_rate": 5.818802405381091e-06, + "loss": 0.2047, + "step": 4105 + }, + { + "epoch": 1.9442438591299203, + "grad_norm": 1.5867642164230347, + "learning_rate": 5.814157343766049e-06, + "loss": 0.2179, + "step": 4106 + }, + { + "epoch": 1.9447173720035513, + "grad_norm": 1.1746879816055298, + "learning_rate": 5.8095133769177766e-06, + "loss": 0.1948, + "step": 4107 + }, + { + "epoch": 1.9451908848771826, + "grad_norm": 1.1779289245605469, + "learning_rate": 5.804870506050853e-06, + "loss": 0.2007, + "step": 4108 + }, + { + "epoch": 1.945664397750814, + "grad_norm": 0.9775161743164062, + "learning_rate": 5.800228732379574e-06, + "loss": 0.202, + "step": 4109 + }, + { + "epoch": 1.946137910624445, + "grad_norm": 1.0120432376861572, + "learning_rate": 5.795588057117958e-06, + "loss": 0.1857, + "step": 4110 + }, + { + "epoch": 1.9466114234980765, + "grad_norm": 1.3756533861160278, + "learning_rate": 5.790948481479721e-06, + "loss": 0.181, + "step": 4111 + }, + { + "epoch": 1.9470849363717075, + "grad_norm": 1.0722591876983643, + "learning_rate": 5.786310006678308e-06, + "loss": 0.2051, + "step": 4112 + }, + { + "epoch": 1.9475584492453388, + "grad_norm": 1.1463873386383057, + "learning_rate": 5.781672633926858e-06, + "loss": 0.1911, + "step": 4113 + }, + { + "epoch": 1.9480319621189701, + "grad_norm": 1.0058190822601318, + "learning_rate": 5.77703636443824e-06, + "loss": 0.2162, + "step": 4114 + }, + { + "epoch": 1.9485054749926014, + "grad_norm": 1.7192530632019043, + "learning_rate": 5.772401199425017e-06, + "loss": 0.2117, + "step": 4115 + }, + { + "epoch": 1.9489789878662327, + "grad_norm": 1.1650121212005615, + "learning_rate": 5.76776714009948e-06, + "loss": 0.2388, + "step": 4116 + }, + { + "epoch": 1.9494525007398638, + "grad_norm": 1.3511508703231812, + "learning_rate": 5.763134187673618e-06, + "loss": 0.2045, + "step": 4117 + }, + { + "epoch": 1.9499260136134953, + "grad_norm": 1.0509499311447144, + "learning_rate": 5.7585023433591315e-06, + "loss": 0.2094, + "step": 4118 + }, + { + "epoch": 1.9503995264871263, + "grad_norm": 1.0673551559448242, + "learning_rate": 5.7538716083674425e-06, + "loss": 0.1909, + "step": 4119 + }, + { + "epoch": 1.9508730393607576, + "grad_norm": 1.0487030744552612, + "learning_rate": 5.749241983909668e-06, + "loss": 0.2173, + "step": 4120 + }, + { + "epoch": 1.951346552234389, + "grad_norm": 1.2835848331451416, + "learning_rate": 5.744613471196648e-06, + "loss": 0.2258, + "step": 4121 + }, + { + "epoch": 1.95182006510802, + "grad_norm": 1.9468538761138916, + "learning_rate": 5.739986071438919e-06, + "loss": 0.2424, + "step": 4122 + }, + { + "epoch": 1.9522935779816515, + "grad_norm": 1.0411995649337769, + "learning_rate": 5.735359785846739e-06, + "loss": 0.1879, + "step": 4123 + }, + { + "epoch": 1.9527670908552826, + "grad_norm": 1.0039364099502563, + "learning_rate": 5.730734615630063e-06, + "loss": 0.1935, + "step": 4124 + }, + { + "epoch": 1.9532406037289138, + "grad_norm": 1.342782735824585, + "learning_rate": 5.7261105619985635e-06, + "loss": 0.2116, + "step": 4125 + }, + { + "epoch": 1.9537141166025451, + "grad_norm": 0.9727945923805237, + "learning_rate": 5.721487626161617e-06, + "loss": 0.2056, + "step": 4126 + }, + { + "epoch": 1.9541876294761764, + "grad_norm": 0.834372341632843, + "learning_rate": 5.7168658093283026e-06, + "loss": 0.1894, + "step": 4127 + }, + { + "epoch": 1.9546611423498077, + "grad_norm": 1.5934944152832031, + "learning_rate": 5.7122451127074185e-06, + "loss": 0.206, + "step": 4128 + }, + { + "epoch": 1.9551346552234388, + "grad_norm": 1.4059547185897827, + "learning_rate": 5.7076255375074574e-06, + "loss": 0.2246, + "step": 4129 + }, + { + "epoch": 1.9556081680970703, + "grad_norm": 1.7891411781311035, + "learning_rate": 5.703007084936631e-06, + "loss": 0.188, + "step": 4130 + }, + { + "epoch": 1.9560816809707013, + "grad_norm": 0.9586341381072998, + "learning_rate": 5.698389756202844e-06, + "loss": 0.2105, + "step": 4131 + }, + { + "epoch": 1.9565551938443326, + "grad_norm": 1.686652660369873, + "learning_rate": 5.693773552513723e-06, + "loss": 0.2437, + "step": 4132 + }, + { + "epoch": 1.957028706717964, + "grad_norm": 1.3186736106872559, + "learning_rate": 5.689158475076582e-06, + "loss": 0.2004, + "step": 4133 + }, + { + "epoch": 1.957502219591595, + "grad_norm": 1.3622384071350098, + "learning_rate": 5.6845445250984566e-06, + "loss": 0.2079, + "step": 4134 + }, + { + "epoch": 1.9579757324652265, + "grad_norm": 1.2416852712631226, + "learning_rate": 5.679931703786077e-06, + "loss": 0.2208, + "step": 4135 + }, + { + "epoch": 1.9584492453388576, + "grad_norm": 0.9193575978279114, + "learning_rate": 5.675320012345887e-06, + "loss": 0.2292, + "step": 4136 + }, + { + "epoch": 1.958922758212489, + "grad_norm": 1.2942508459091187, + "learning_rate": 5.670709451984022e-06, + "loss": 0.2146, + "step": 4137 + }, + { + "epoch": 1.9593962710861201, + "grad_norm": 1.3082016706466675, + "learning_rate": 5.666100023906336e-06, + "loss": 0.2173, + "step": 4138 + }, + { + "epoch": 1.9598697839597514, + "grad_norm": 1.0084069967269897, + "learning_rate": 5.66149172931838e-06, + "loss": 0.2172, + "step": 4139 + }, + { + "epoch": 1.9603432968333827, + "grad_norm": 1.1174718141555786, + "learning_rate": 5.656884569425407e-06, + "loss": 0.2017, + "step": 4140 + }, + { + "epoch": 1.9608168097070138, + "grad_norm": 1.1940710544586182, + "learning_rate": 5.6522785454323795e-06, + "loss": 0.2027, + "step": 4141 + }, + { + "epoch": 1.9612903225806453, + "grad_norm": 1.1007012128829956, + "learning_rate": 5.647673658543954e-06, + "loss": 0.2218, + "step": 4142 + }, + { + "epoch": 1.9617638354542764, + "grad_norm": 1.368607997894287, + "learning_rate": 5.643069909964491e-06, + "loss": 0.1888, + "step": 4143 + }, + { + "epoch": 1.9622373483279076, + "grad_norm": 1.2197048664093018, + "learning_rate": 5.638467300898067e-06, + "loss": 0.2261, + "step": 4144 + }, + { + "epoch": 1.962710861201539, + "grad_norm": 1.1047946214675903, + "learning_rate": 5.6338658325484395e-06, + "loss": 0.1864, + "step": 4145 + }, + { + "epoch": 1.9631843740751702, + "grad_norm": 1.3874999284744263, + "learning_rate": 5.629265506119086e-06, + "loss": 0.2419, + "step": 4146 + }, + { + "epoch": 1.9636578869488015, + "grad_norm": 1.1594977378845215, + "learning_rate": 5.62466632281317e-06, + "loss": 0.2087, + "step": 4147 + }, + { + "epoch": 1.9641313998224326, + "grad_norm": 1.0370484590530396, + "learning_rate": 5.620068283833573e-06, + "loss": 0.2138, + "step": 4148 + }, + { + "epoch": 1.964604912696064, + "grad_norm": 1.3975090980529785, + "learning_rate": 5.615471390382858e-06, + "loss": 0.1873, + "step": 4149 + }, + { + "epoch": 1.9650784255696951, + "grad_norm": 1.7761406898498535, + "learning_rate": 5.610875643663305e-06, + "loss": 0.215, + "step": 4150 + }, + { + "epoch": 1.9655519384433264, + "grad_norm": 1.3416228294372559, + "learning_rate": 5.606281044876887e-06, + "loss": 0.2152, + "step": 4151 + }, + { + "epoch": 1.9660254513169577, + "grad_norm": 1.7138333320617676, + "learning_rate": 5.601687595225269e-06, + "loss": 0.2265, + "step": 4152 + }, + { + "epoch": 1.9664989641905888, + "grad_norm": 1.7663260698318481, + "learning_rate": 5.597095295909833e-06, + "loss": 0.2109, + "step": 4153 + }, + { + "epoch": 1.9669724770642203, + "grad_norm": 1.0027509927749634, + "learning_rate": 5.592504148131645e-06, + "loss": 0.2036, + "step": 4154 + }, + { + "epoch": 1.9674459899378514, + "grad_norm": 0.9841294884681702, + "learning_rate": 5.587914153091479e-06, + "loss": 0.223, + "step": 4155 + }, + { + "epoch": 1.9679195028114826, + "grad_norm": 1.8754013776779175, + "learning_rate": 5.583325311989799e-06, + "loss": 0.1892, + "step": 4156 + }, + { + "epoch": 1.968393015685114, + "grad_norm": 1.138641357421875, + "learning_rate": 5.57873762602678e-06, + "loss": 0.2099, + "step": 4157 + }, + { + "epoch": 1.9688665285587452, + "grad_norm": 1.1637465953826904, + "learning_rate": 5.574151096402276e-06, + "loss": 0.1921, + "step": 4158 + }, + { + "epoch": 1.9693400414323765, + "grad_norm": 1.3505114316940308, + "learning_rate": 5.569565724315862e-06, + "loss": 0.2148, + "step": 4159 + }, + { + "epoch": 1.9698135543060076, + "grad_norm": 2.0613527297973633, + "learning_rate": 5.5649815109667874e-06, + "loss": 0.1949, + "step": 4160 + }, + { + "epoch": 1.970287067179639, + "grad_norm": 1.8924801349639893, + "learning_rate": 5.560398457554016e-06, + "loss": 0.2145, + "step": 4161 + }, + { + "epoch": 1.9707605800532702, + "grad_norm": 1.1464554071426392, + "learning_rate": 5.5558165652762e-06, + "loss": 0.2119, + "step": 4162 + }, + { + "epoch": 1.9712340929269014, + "grad_norm": 1.047352910041809, + "learning_rate": 5.551235835331682e-06, + "loss": 0.2148, + "step": 4163 + }, + { + "epoch": 1.9717076058005327, + "grad_norm": 1.259980320930481, + "learning_rate": 5.546656268918517e-06, + "loss": 0.2182, + "step": 4164 + }, + { + "epoch": 1.9721811186741638, + "grad_norm": 1.1720131635665894, + "learning_rate": 5.542077867234441e-06, + "loss": 0.1947, + "step": 4165 + }, + { + "epoch": 1.9726546315477953, + "grad_norm": 1.064708948135376, + "learning_rate": 5.537500631476895e-06, + "loss": 0.1935, + "step": 4166 + }, + { + "epoch": 1.9731281444214264, + "grad_norm": 1.1356375217437744, + "learning_rate": 5.5329245628430036e-06, + "loss": 0.1802, + "step": 4167 + }, + { + "epoch": 1.9736016572950577, + "grad_norm": 2.05705189704895, + "learning_rate": 5.528349662529604e-06, + "loss": 0.2095, + "step": 4168 + }, + { + "epoch": 1.974075170168689, + "grad_norm": 1.081849455833435, + "learning_rate": 5.5237759317332065e-06, + "loss": 0.2166, + "step": 4169 + }, + { + "epoch": 1.9745486830423202, + "grad_norm": 1.1514532566070557, + "learning_rate": 5.519203371650035e-06, + "loss": 0.2273, + "step": 4170 + }, + { + "epoch": 1.9750221959159515, + "grad_norm": 1.438201665878296, + "learning_rate": 5.514631983475995e-06, + "loss": 0.2014, + "step": 4171 + }, + { + "epoch": 1.9754957087895826, + "grad_norm": 1.2542989253997803, + "learning_rate": 5.510061768406683e-06, + "loss": 0.2045, + "step": 4172 + }, + { + "epoch": 1.975969221663214, + "grad_norm": 0.9703104496002197, + "learning_rate": 5.505492727637406e-06, + "loss": 0.1985, + "step": 4173 + }, + { + "epoch": 1.9764427345368452, + "grad_norm": 1.1615082025527954, + "learning_rate": 5.5009248623631416e-06, + "loss": 0.1902, + "step": 4174 + }, + { + "epoch": 1.9769162474104764, + "grad_norm": 1.1647366285324097, + "learning_rate": 5.496358173778582e-06, + "loss": 0.2114, + "step": 4175 + }, + { + "epoch": 1.9773897602841077, + "grad_norm": 1.780960202217102, + "learning_rate": 5.4917926630780895e-06, + "loss": 0.2196, + "step": 4176 + }, + { + "epoch": 1.977863273157739, + "grad_norm": 1.0951555967330933, + "learning_rate": 5.487228331455734e-06, + "loss": 0.1874, + "step": 4177 + }, + { + "epoch": 1.9783367860313703, + "grad_norm": 1.208298921585083, + "learning_rate": 5.482665180105278e-06, + "loss": 0.1986, + "step": 4178 + }, + { + "epoch": 1.9788102989050014, + "grad_norm": 1.1400575637817383, + "learning_rate": 5.4781032102201605e-06, + "loss": 0.2193, + "step": 4179 + }, + { + "epoch": 1.9792838117786329, + "grad_norm": 1.7641512155532837, + "learning_rate": 5.4735424229935274e-06, + "loss": 0.2038, + "step": 4180 + }, + { + "epoch": 1.979757324652264, + "grad_norm": 1.020308256149292, + "learning_rate": 5.468982819618204e-06, + "loss": 0.2143, + "step": 4181 + }, + { + "epoch": 1.9802308375258952, + "grad_norm": 0.9065782427787781, + "learning_rate": 5.464424401286715e-06, + "loss": 0.2096, + "step": 4182 + }, + { + "epoch": 1.9807043503995265, + "grad_norm": 1.3247904777526855, + "learning_rate": 5.459867169191267e-06, + "loss": 0.2037, + "step": 4183 + }, + { + "epoch": 1.9811778632731576, + "grad_norm": 1.2426649332046509, + "learning_rate": 5.455311124523762e-06, + "loss": 0.227, + "step": 4184 + }, + { + "epoch": 1.981651376146789, + "grad_norm": 0.9883340001106262, + "learning_rate": 5.4507562684757875e-06, + "loss": 0.1973, + "step": 4185 + }, + { + "epoch": 1.9821248890204202, + "grad_norm": 1.001715898513794, + "learning_rate": 5.446202602238626e-06, + "loss": 0.2071, + "step": 4186 + }, + { + "epoch": 1.9825984018940515, + "grad_norm": 0.9738106727600098, + "learning_rate": 5.441650127003244e-06, + "loss": 0.2014, + "step": 4187 + }, + { + "epoch": 1.9830719147676827, + "grad_norm": 1.1363893747329712, + "learning_rate": 5.4370988439602916e-06, + "loss": 0.1934, + "step": 4188 + }, + { + "epoch": 1.983545427641314, + "grad_norm": 1.2461233139038086, + "learning_rate": 5.4325487543001196e-06, + "loss": 0.2202, + "step": 4189 + }, + { + "epoch": 1.9840189405149453, + "grad_norm": 1.1095913648605347, + "learning_rate": 5.427999859212757e-06, + "loss": 0.2152, + "step": 4190 + }, + { + "epoch": 1.9844924533885764, + "grad_norm": 1.7337262630462646, + "learning_rate": 5.423452159887927e-06, + "loss": 0.2166, + "step": 4191 + }, + { + "epoch": 1.984965966262208, + "grad_norm": 0.9954274296760559, + "learning_rate": 5.41890565751503e-06, + "loss": 0.2174, + "step": 4192 + }, + { + "epoch": 1.985439479135839, + "grad_norm": 1.3146260976791382, + "learning_rate": 5.414360353283168e-06, + "loss": 0.1902, + "step": 4193 + }, + { + "epoch": 1.9859129920094702, + "grad_norm": 1.0898852348327637, + "learning_rate": 5.409816248381112e-06, + "loss": 0.2221, + "step": 4194 + }, + { + "epoch": 1.9863865048831015, + "grad_norm": 1.0851001739501953, + "learning_rate": 5.405273343997339e-06, + "loss": 0.1969, + "step": 4195 + }, + { + "epoch": 1.9868600177567326, + "grad_norm": 1.586696743965149, + "learning_rate": 5.400731641319996e-06, + "loss": 0.198, + "step": 4196 + }, + { + "epoch": 1.987333530630364, + "grad_norm": 1.0100387334823608, + "learning_rate": 5.39619114153692e-06, + "loss": 0.1984, + "step": 4197 + }, + { + "epoch": 1.9878070435039952, + "grad_norm": 1.0449753999710083, + "learning_rate": 5.39165184583564e-06, + "loss": 0.2017, + "step": 4198 + }, + { + "epoch": 1.9882805563776265, + "grad_norm": 1.3189187049865723, + "learning_rate": 5.387113755403357e-06, + "loss": 0.1889, + "step": 4199 + }, + { + "epoch": 1.9887540692512578, + "grad_norm": 1.8392877578735352, + "learning_rate": 5.3825768714269745e-06, + "loss": 0.2115, + "step": 4200 + }, + { + "epoch": 1.989227582124889, + "grad_norm": 0.9645484685897827, + "learning_rate": 5.378041195093063e-06, + "loss": 0.1919, + "step": 4201 + }, + { + "epoch": 1.9897010949985203, + "grad_norm": 1.2029494047164917, + "learning_rate": 5.3735067275878915e-06, + "loss": 0.2015, + "step": 4202 + }, + { + "epoch": 1.9901746078721514, + "grad_norm": 1.8107424974441528, + "learning_rate": 5.368973470097401e-06, + "loss": 0.209, + "step": 4203 + }, + { + "epoch": 1.990648120745783, + "grad_norm": 1.4543911218643188, + "learning_rate": 5.364441423807224e-06, + "loss": 0.2173, + "step": 4204 + }, + { + "epoch": 1.991121633619414, + "grad_norm": 1.1092660427093506, + "learning_rate": 5.359910589902674e-06, + "loss": 0.2003, + "step": 4205 + }, + { + "epoch": 1.9915951464930453, + "grad_norm": 1.1251716613769531, + "learning_rate": 5.355380969568742e-06, + "loss": 0.2065, + "step": 4206 + }, + { + "epoch": 1.9920686593666765, + "grad_norm": 1.116974949836731, + "learning_rate": 5.350852563990112e-06, + "loss": 0.2168, + "step": 4207 + }, + { + "epoch": 1.9925421722403078, + "grad_norm": 1.312705397605896, + "learning_rate": 5.34632537435114e-06, + "loss": 0.2283, + "step": 4208 + }, + { + "epoch": 1.9930156851139391, + "grad_norm": 1.2690129280090332, + "learning_rate": 5.341799401835877e-06, + "loss": 0.2151, + "step": 4209 + }, + { + "epoch": 1.9934891979875702, + "grad_norm": 0.9621937870979309, + "learning_rate": 5.3372746476280366e-06, + "loss": 0.2054, + "step": 4210 + }, + { + "epoch": 1.9939627108612017, + "grad_norm": 1.264472246170044, + "learning_rate": 5.3327511129110344e-06, + "loss": 0.2327, + "step": 4211 + }, + { + "epoch": 1.9944362237348328, + "grad_norm": 1.66592276096344, + "learning_rate": 5.328228798867947e-06, + "loss": 0.2133, + "step": 4212 + }, + { + "epoch": 1.994909736608464, + "grad_norm": 1.2937157154083252, + "learning_rate": 5.323707706681553e-06, + "loss": 0.2025, + "step": 4213 + }, + { + "epoch": 1.9953832494820953, + "grad_norm": 1.0050321817398071, + "learning_rate": 5.319187837534292e-06, + "loss": 0.2058, + "step": 4214 + }, + { + "epoch": 1.9958567623557264, + "grad_norm": 1.160569190979004, + "learning_rate": 5.314669192608296e-06, + "loss": 0.205, + "step": 4215 + }, + { + "epoch": 1.996330275229358, + "grad_norm": 0.9353100657463074, + "learning_rate": 5.310151773085376e-06, + "loss": 0.1983, + "step": 4216 + }, + { + "epoch": 1.996803788102989, + "grad_norm": 1.170721173286438, + "learning_rate": 5.3056355801470114e-06, + "loss": 0.2015, + "step": 4217 + }, + { + "epoch": 1.9972773009766203, + "grad_norm": 1.126014232635498, + "learning_rate": 5.301120614974378e-06, + "loss": 0.1832, + "step": 4218 + }, + { + "epoch": 1.9977508138502516, + "grad_norm": 0.9776692390441895, + "learning_rate": 5.296606878748313e-06, + "loss": 0.1971, + "step": 4219 + }, + { + "epoch": 1.9982243267238828, + "grad_norm": 1.0567349195480347, + "learning_rate": 5.292094372649348e-06, + "loss": 0.2223, + "step": 4220 + }, + { + "epoch": 1.9986978395975141, + "grad_norm": 1.1440393924713135, + "learning_rate": 5.287583097857682e-06, + "loss": 0.2532, + "step": 4221 + }, + { + "epoch": 1.9991713524711452, + "grad_norm": 0.9647483229637146, + "learning_rate": 5.283073055553191e-06, + "loss": 0.2087, + "step": 4222 + }, + { + "epoch": 1.9996448653447767, + "grad_norm": 1.0528312921524048, + "learning_rate": 5.278564246915441e-06, + "loss": 0.2172, + "step": 4223 + }, + { + "epoch": 2.0001183782184078, + "grad_norm": 1.2826507091522217, + "learning_rate": 5.27405667312366e-06, + "loss": 0.1835, + "step": 4224 + }, + { + "epoch": 2.0005918910920393, + "grad_norm": 2.0795340538024902, + "learning_rate": 5.269550335356769e-06, + "loss": 0.1905, + "step": 4225 + }, + { + "epoch": 2.0010654039656703, + "grad_norm": 1.348443627357483, + "learning_rate": 5.265045234793348e-06, + "loss": 0.1877, + "step": 4226 + }, + { + "epoch": 2.0015389168393014, + "grad_norm": 0.9791116118431091, + "learning_rate": 5.260541372611669e-06, + "loss": 0.2081, + "step": 4227 + }, + { + "epoch": 2.002012429712933, + "grad_norm": 0.9848561882972717, + "learning_rate": 5.256038749989671e-06, + "loss": 0.18, + "step": 4228 + }, + { + "epoch": 2.002485942586564, + "grad_norm": 1.2935410737991333, + "learning_rate": 5.251537368104974e-06, + "loss": 0.1972, + "step": 4229 + }, + { + "epoch": 2.0029594554601955, + "grad_norm": 1.8192050457000732, + "learning_rate": 5.2470372281348695e-06, + "loss": 0.2058, + "step": 4230 + }, + { + "epoch": 2.0034329683338266, + "grad_norm": 1.1813068389892578, + "learning_rate": 5.242538331256322e-06, + "loss": 0.2309, + "step": 4231 + }, + { + "epoch": 2.0039064812074576, + "grad_norm": 1.1924341917037964, + "learning_rate": 5.238040678645981e-06, + "loss": 0.1972, + "step": 4232 + }, + { + "epoch": 2.004379994081089, + "grad_norm": 1.2242668867111206, + "learning_rate": 5.233544271480158e-06, + "loss": 0.1799, + "step": 4233 + }, + { + "epoch": 2.00485350695472, + "grad_norm": 1.3374933004379272, + "learning_rate": 5.22904911093485e-06, + "loss": 0.2176, + "step": 4234 + }, + { + "epoch": 2.0053270198283517, + "grad_norm": 1.1583667993545532, + "learning_rate": 5.224555198185719e-06, + "loss": 0.1999, + "step": 4235 + }, + { + "epoch": 2.0058005327019828, + "grad_norm": 1.0789278745651245, + "learning_rate": 5.220062534408109e-06, + "loss": 0.2103, + "step": 4236 + }, + { + "epoch": 2.0062740455756143, + "grad_norm": 1.0617800951004028, + "learning_rate": 5.215571120777027e-06, + "loss": 0.1883, + "step": 4237 + }, + { + "epoch": 2.0067475584492453, + "grad_norm": 1.34521484375, + "learning_rate": 5.211080958467166e-06, + "loss": 0.2037, + "step": 4238 + }, + { + "epoch": 2.0072210713228764, + "grad_norm": 1.116936445236206, + "learning_rate": 5.206592048652876e-06, + "loss": 0.1971, + "step": 4239 + }, + { + "epoch": 2.007694584196508, + "grad_norm": 2.06673264503479, + "learning_rate": 5.202104392508198e-06, + "loss": 0.2102, + "step": 4240 + }, + { + "epoch": 2.008168097070139, + "grad_norm": 0.9581062197685242, + "learning_rate": 5.19761799120683e-06, + "loss": 0.214, + "step": 4241 + }, + { + "epoch": 2.0086416099437705, + "grad_norm": 1.1789650917053223, + "learning_rate": 5.193132845922143e-06, + "loss": 0.2078, + "step": 4242 + }, + { + "epoch": 2.0091151228174016, + "grad_norm": 1.1819120645523071, + "learning_rate": 5.188648957827191e-06, + "loss": 0.1923, + "step": 4243 + }, + { + "epoch": 2.0095886356910326, + "grad_norm": 2.0509867668151855, + "learning_rate": 5.184166328094684e-06, + "loss": 0.2147, + "step": 4244 + }, + { + "epoch": 2.010062148564664, + "grad_norm": 1.0577517747879028, + "learning_rate": 5.17968495789702e-06, + "loss": 0.2028, + "step": 4245 + }, + { + "epoch": 2.010535661438295, + "grad_norm": 1.2950037717819214, + "learning_rate": 5.175204848406248e-06, + "loss": 0.1884, + "step": 4246 + }, + { + "epoch": 2.0110091743119267, + "grad_norm": 1.7854464054107666, + "learning_rate": 5.170726000794105e-06, + "loss": 0.1976, + "step": 4247 + }, + { + "epoch": 2.011482687185558, + "grad_norm": 1.12384831905365, + "learning_rate": 5.166248416231985e-06, + "loss": 0.1866, + "step": 4248 + }, + { + "epoch": 2.0119562000591893, + "grad_norm": 1.3171573877334595, + "learning_rate": 5.161772095890963e-06, + "loss": 0.1952, + "step": 4249 + }, + { + "epoch": 2.0124297129328204, + "grad_norm": 1.4037394523620605, + "learning_rate": 5.157297040941775e-06, + "loss": 0.1977, + "step": 4250 + }, + { + "epoch": 2.0129032258064514, + "grad_norm": 1.2727725505828857, + "learning_rate": 5.152823252554824e-06, + "loss": 0.2269, + "step": 4251 + }, + { + "epoch": 2.013376738680083, + "grad_norm": 1.1373170614242554, + "learning_rate": 5.1483507319001925e-06, + "loss": 0.2105, + "step": 4252 + }, + { + "epoch": 2.013850251553714, + "grad_norm": 1.0914908647537231, + "learning_rate": 5.143879480147616e-06, + "loss": 0.225, + "step": 4253 + }, + { + "epoch": 2.0143237644273455, + "grad_norm": 1.7061387300491333, + "learning_rate": 5.1394094984665185e-06, + "loss": 0.1815, + "step": 4254 + }, + { + "epoch": 2.0147972773009766, + "grad_norm": 1.8330706357955933, + "learning_rate": 5.134940788025978e-06, + "loss": 0.1946, + "step": 4255 + }, + { + "epoch": 2.015270790174608, + "grad_norm": 1.9821844100952148, + "learning_rate": 5.130473349994737e-06, + "loss": 0.2131, + "step": 4256 + }, + { + "epoch": 2.015744303048239, + "grad_norm": 1.2151135206222534, + "learning_rate": 5.1260071855412175e-06, + "loss": 0.1673, + "step": 4257 + }, + { + "epoch": 2.01621781592187, + "grad_norm": 1.4756810665130615, + "learning_rate": 5.121542295833493e-06, + "loss": 0.2035, + "step": 4258 + }, + { + "epoch": 2.0166913287955017, + "grad_norm": 1.570881724357605, + "learning_rate": 5.117078682039323e-06, + "loss": 0.2112, + "step": 4259 + }, + { + "epoch": 2.017164841669133, + "grad_norm": 1.0294634103775024, + "learning_rate": 5.112616345326114e-06, + "loss": 0.2062, + "step": 4260 + }, + { + "epoch": 2.0176383545427643, + "grad_norm": 0.987274169921875, + "learning_rate": 5.108155286860953e-06, + "loss": 0.2028, + "step": 4261 + }, + { + "epoch": 2.0181118674163954, + "grad_norm": 1.121518611907959, + "learning_rate": 5.10369550781058e-06, + "loss": 0.2018, + "step": 4262 + }, + { + "epoch": 2.0185853802900264, + "grad_norm": 0.953002393245697, + "learning_rate": 5.099237009341417e-06, + "loss": 0.1732, + "step": 4263 + }, + { + "epoch": 2.019058893163658, + "grad_norm": 0.955437183380127, + "learning_rate": 5.094779792619531e-06, + "loss": 0.1911, + "step": 4264 + }, + { + "epoch": 2.019532406037289, + "grad_norm": 1.8718898296356201, + "learning_rate": 5.0903238588106725e-06, + "loss": 0.1966, + "step": 4265 + }, + { + "epoch": 2.0200059189109205, + "grad_norm": 1.4193190336227417, + "learning_rate": 5.085869209080246e-06, + "loss": 0.2138, + "step": 4266 + }, + { + "epoch": 2.0204794317845516, + "grad_norm": 1.1836130619049072, + "learning_rate": 5.081415844593314e-06, + "loss": 0.1817, + "step": 4267 + }, + { + "epoch": 2.020952944658183, + "grad_norm": 2.136206865310669, + "learning_rate": 5.076963766514622e-06, + "loss": 0.21, + "step": 4268 + }, + { + "epoch": 2.021426457531814, + "grad_norm": 1.0202592611312866, + "learning_rate": 5.072512976008559e-06, + "loss": 0.223, + "step": 4269 + }, + { + "epoch": 2.0218999704054452, + "grad_norm": 1.1156057119369507, + "learning_rate": 5.068063474239195e-06, + "loss": 0.2055, + "step": 4270 + }, + { + "epoch": 2.0223734832790767, + "grad_norm": 1.3342862129211426, + "learning_rate": 5.063615262370247e-06, + "loss": 0.1872, + "step": 4271 + }, + { + "epoch": 2.022846996152708, + "grad_norm": 1.031692624092102, + "learning_rate": 5.059168341565109e-06, + "loss": 0.2132, + "step": 4272 + }, + { + "epoch": 2.0233205090263393, + "grad_norm": 1.07035493850708, + "learning_rate": 5.0547227129868225e-06, + "loss": 0.2238, + "step": 4273 + }, + { + "epoch": 2.0237940218999704, + "grad_norm": 1.6974101066589355, + "learning_rate": 5.050278377798105e-06, + "loss": 0.194, + "step": 4274 + }, + { + "epoch": 2.0242675347736014, + "grad_norm": 1.17416512966156, + "learning_rate": 5.045835337161329e-06, + "loss": 0.2137, + "step": 4275 + }, + { + "epoch": 2.024741047647233, + "grad_norm": 1.0354957580566406, + "learning_rate": 5.041393592238521e-06, + "loss": 0.2022, + "step": 4276 + }, + { + "epoch": 2.025214560520864, + "grad_norm": 1.328599452972412, + "learning_rate": 5.036953144191388e-06, + "loss": 0.2116, + "step": 4277 + }, + { + "epoch": 2.0256880733944955, + "grad_norm": 0.9613640308380127, + "learning_rate": 5.032513994181276e-06, + "loss": 0.1968, + "step": 4278 + }, + { + "epoch": 2.0261615862681266, + "grad_norm": 1.4717971086502075, + "learning_rate": 5.0280761433692095e-06, + "loss": 0.2161, + "step": 4279 + }, + { + "epoch": 2.026635099141758, + "grad_norm": 1.2015409469604492, + "learning_rate": 5.023639592915861e-06, + "loss": 0.2002, + "step": 4280 + }, + { + "epoch": 2.027108612015389, + "grad_norm": 0.9941813945770264, + "learning_rate": 5.019204343981572e-06, + "loss": 0.1973, + "step": 4281 + }, + { + "epoch": 2.0275821248890202, + "grad_norm": 1.1398041248321533, + "learning_rate": 5.0147703977263315e-06, + "loss": 0.2201, + "step": 4282 + }, + { + "epoch": 2.0280556377626517, + "grad_norm": 1.4286599159240723, + "learning_rate": 5.010337755309804e-06, + "loss": 0.1974, + "step": 4283 + }, + { + "epoch": 2.028529150636283, + "grad_norm": 1.7072778940200806, + "learning_rate": 5.005906417891302e-06, + "loss": 0.204, + "step": 4284 + }, + { + "epoch": 2.0290026635099143, + "grad_norm": 1.2885264158248901, + "learning_rate": 5.0014763866297935e-06, + "loss": 0.2257, + "step": 4285 + }, + { + "epoch": 2.0294761763835454, + "grad_norm": 1.2946069240570068, + "learning_rate": 4.997047662683917e-06, + "loss": 0.2029, + "step": 4286 + }, + { + "epoch": 2.029949689257177, + "grad_norm": 1.0730814933776855, + "learning_rate": 4.992620247211957e-06, + "loss": 0.208, + "step": 4287 + }, + { + "epoch": 2.030423202130808, + "grad_norm": 1.4278727769851685, + "learning_rate": 4.988194141371868e-06, + "loss": 0.1989, + "step": 4288 + }, + { + "epoch": 2.030896715004439, + "grad_norm": 0.9635195136070251, + "learning_rate": 4.9837693463212474e-06, + "loss": 0.2119, + "step": 4289 + }, + { + "epoch": 2.0313702278780705, + "grad_norm": 1.49040949344635, + "learning_rate": 4.979345863217366e-06, + "loss": 0.1987, + "step": 4290 + }, + { + "epoch": 2.0318437407517016, + "grad_norm": 1.3482121229171753, + "learning_rate": 4.974923693217135e-06, + "loss": 0.2184, + "step": 4291 + }, + { + "epoch": 2.032317253625333, + "grad_norm": 1.1210942268371582, + "learning_rate": 4.970502837477135e-06, + "loss": 0.211, + "step": 4292 + }, + { + "epoch": 2.032790766498964, + "grad_norm": 1.4026209115982056, + "learning_rate": 4.9660832971536014e-06, + "loss": 0.1975, + "step": 4293 + }, + { + "epoch": 2.0332642793725952, + "grad_norm": 1.885514259338379, + "learning_rate": 4.961665073402413e-06, + "loss": 0.2031, + "step": 4294 + }, + { + "epoch": 2.0337377922462267, + "grad_norm": 1.712158441543579, + "learning_rate": 4.957248167379124e-06, + "loss": 0.202, + "step": 4295 + }, + { + "epoch": 2.034211305119858, + "grad_norm": 1.4192404747009277, + "learning_rate": 4.9528325802389246e-06, + "loss": 0.2093, + "step": 4296 + }, + { + "epoch": 2.0346848179934893, + "grad_norm": 1.8846938610076904, + "learning_rate": 4.948418313136676e-06, + "loss": 0.2055, + "step": 4297 + }, + { + "epoch": 2.0351583308671204, + "grad_norm": 1.1656869649887085, + "learning_rate": 4.94400536722688e-06, + "loss": 0.2003, + "step": 4298 + }, + { + "epoch": 2.035631843740752, + "grad_norm": 1.072436809539795, + "learning_rate": 4.9395937436637096e-06, + "loss": 0.1975, + "step": 4299 + }, + { + "epoch": 2.036105356614383, + "grad_norm": 1.109155297279358, + "learning_rate": 4.935183443600976e-06, + "loss": 0.191, + "step": 4300 + }, + { + "epoch": 2.036578869488014, + "grad_norm": 1.9992730617523193, + "learning_rate": 4.930774468192147e-06, + "loss": 0.2152, + "step": 4301 + }, + { + "epoch": 2.0370523823616455, + "grad_norm": 1.2080824375152588, + "learning_rate": 4.926366818590358e-06, + "loss": 0.2105, + "step": 4302 + }, + { + "epoch": 2.0375258952352766, + "grad_norm": 1.5259896516799927, + "learning_rate": 4.921960495948377e-06, + "loss": 0.2042, + "step": 4303 + }, + { + "epoch": 2.037999408108908, + "grad_norm": 1.8273221254348755, + "learning_rate": 4.917555501418643e-06, + "loss": 0.2145, + "step": 4304 + }, + { + "epoch": 2.038472920982539, + "grad_norm": 1.103759765625, + "learning_rate": 4.913151836153232e-06, + "loss": 0.1831, + "step": 4305 + }, + { + "epoch": 2.0389464338561702, + "grad_norm": 1.3072751760482788, + "learning_rate": 4.908749501303889e-06, + "loss": 0.1974, + "step": 4306 + }, + { + "epoch": 2.0394199467298018, + "grad_norm": 1.1292860507965088, + "learning_rate": 4.904348498021993e-06, + "loss": 0.1865, + "step": 4307 + }, + { + "epoch": 2.039893459603433, + "grad_norm": 1.2447689771652222, + "learning_rate": 4.8999488274585935e-06, + "loss": 0.2132, + "step": 4308 + }, + { + "epoch": 2.0403669724770643, + "grad_norm": 0.8923916816711426, + "learning_rate": 4.895550490764377e-06, + "loss": 0.1893, + "step": 4309 + }, + { + "epoch": 2.0408404853506954, + "grad_norm": 1.4532053470611572, + "learning_rate": 4.891153489089681e-06, + "loss": 0.2084, + "step": 4310 + }, + { + "epoch": 2.041313998224327, + "grad_norm": 1.329358696937561, + "learning_rate": 4.886757823584511e-06, + "loss": 0.1988, + "step": 4311 + }, + { + "epoch": 2.041787511097958, + "grad_norm": 1.5044658184051514, + "learning_rate": 4.882363495398498e-06, + "loss": 0.1831, + "step": 4312 + }, + { + "epoch": 2.042261023971589, + "grad_norm": 1.3755611181259155, + "learning_rate": 4.877970505680946e-06, + "loss": 0.2147, + "step": 4313 + }, + { + "epoch": 2.0427345368452205, + "grad_norm": 0.9415785074234009, + "learning_rate": 4.8735788555807905e-06, + "loss": 0.1909, + "step": 4314 + }, + { + "epoch": 2.0432080497188516, + "grad_norm": 1.1154485940933228, + "learning_rate": 4.8691885462466345e-06, + "loss": 0.2137, + "step": 4315 + }, + { + "epoch": 2.043681562592483, + "grad_norm": 1.1781600713729858, + "learning_rate": 4.864799578826713e-06, + "loss": 0.2118, + "step": 4316 + }, + { + "epoch": 2.044155075466114, + "grad_norm": 1.8692867755889893, + "learning_rate": 4.860411954468925e-06, + "loss": 0.2269, + "step": 4317 + }, + { + "epoch": 2.0446285883397457, + "grad_norm": 1.1391620635986328, + "learning_rate": 4.856025674320803e-06, + "loss": 0.199, + "step": 4318 + }, + { + "epoch": 2.0451021012133768, + "grad_norm": 1.0855448246002197, + "learning_rate": 4.851640739529547e-06, + "loss": 0.191, + "step": 4319 + }, + { + "epoch": 2.045575614087008, + "grad_norm": 1.8946772813796997, + "learning_rate": 4.847257151241987e-06, + "loss": 0.2126, + "step": 4320 + }, + { + "epoch": 2.0460491269606393, + "grad_norm": 1.7969681024551392, + "learning_rate": 4.842874910604606e-06, + "loss": 0.2058, + "step": 4321 + }, + { + "epoch": 2.0465226398342704, + "grad_norm": 1.028253436088562, + "learning_rate": 4.838494018763546e-06, + "loss": 0.2044, + "step": 4322 + }, + { + "epoch": 2.046996152707902, + "grad_norm": 1.3800286054611206, + "learning_rate": 4.8341144768645754e-06, + "loss": 0.2053, + "step": 4323 + }, + { + "epoch": 2.047469665581533, + "grad_norm": 0.9476808905601501, + "learning_rate": 4.829736286053131e-06, + "loss": 0.2135, + "step": 4324 + }, + { + "epoch": 2.047943178455164, + "grad_norm": 1.1911259889602661, + "learning_rate": 4.82535944747428e-06, + "loss": 0.2038, + "step": 4325 + }, + { + "epoch": 2.0484166913287956, + "grad_norm": 0.8657131791114807, + "learning_rate": 4.820983962272748e-06, + "loss": 0.1959, + "step": 4326 + }, + { + "epoch": 2.0488902042024266, + "grad_norm": 2.2093346118927, + "learning_rate": 4.816609831592895e-06, + "loss": 0.208, + "step": 4327 + }, + { + "epoch": 2.049363717076058, + "grad_norm": 1.8499171733856201, + "learning_rate": 4.812237056578738e-06, + "loss": 0.206, + "step": 4328 + }, + { + "epoch": 2.049837229949689, + "grad_norm": 1.404910683631897, + "learning_rate": 4.807865638373932e-06, + "loss": 0.1919, + "step": 4329 + }, + { + "epoch": 2.0503107428233207, + "grad_norm": 0.8892547488212585, + "learning_rate": 4.803495578121775e-06, + "loss": 0.1778, + "step": 4330 + }, + { + "epoch": 2.0507842556969518, + "grad_norm": 1.3549368381500244, + "learning_rate": 4.799126876965219e-06, + "loss": 0.202, + "step": 4331 + }, + { + "epoch": 2.051257768570583, + "grad_norm": 1.4522606134414673, + "learning_rate": 4.794759536046854e-06, + "loss": 0.1813, + "step": 4332 + }, + { + "epoch": 2.0517312814442143, + "grad_norm": 1.1356416940689087, + "learning_rate": 4.790393556508918e-06, + "loss": 0.1793, + "step": 4333 + }, + { + "epoch": 2.0522047943178454, + "grad_norm": 1.0403809547424316, + "learning_rate": 4.786028939493292e-06, + "loss": 0.2073, + "step": 4334 + }, + { + "epoch": 2.052678307191477, + "grad_norm": 1.0235661268234253, + "learning_rate": 4.781665686141493e-06, + "loss": 0.1865, + "step": 4335 + }, + { + "epoch": 2.053151820065108, + "grad_norm": 1.1404122114181519, + "learning_rate": 4.777303797594694e-06, + "loss": 0.1997, + "step": 4336 + }, + { + "epoch": 2.053625332938739, + "grad_norm": 1.2667036056518555, + "learning_rate": 4.772943274993701e-06, + "loss": 0.2054, + "step": 4337 + }, + { + "epoch": 2.0540988458123706, + "grad_norm": 1.177870273590088, + "learning_rate": 4.768584119478971e-06, + "loss": 0.2103, + "step": 4338 + }, + { + "epoch": 2.0545723586860016, + "grad_norm": 0.9611978530883789, + "learning_rate": 4.764226332190591e-06, + "loss": 0.203, + "step": 4339 + }, + { + "epoch": 2.055045871559633, + "grad_norm": 1.3527143001556396, + "learning_rate": 4.759869914268308e-06, + "loss": 0.2133, + "step": 4340 + }, + { + "epoch": 2.055519384433264, + "grad_norm": 0.833387017250061, + "learning_rate": 4.7555148668514925e-06, + "loss": 0.1858, + "step": 4341 + }, + { + "epoch": 2.0559928973068957, + "grad_norm": 1.0796325206756592, + "learning_rate": 4.751161191079173e-06, + "loss": 0.21, + "step": 4342 + }, + { + "epoch": 2.056466410180527, + "grad_norm": 0.8967043161392212, + "learning_rate": 4.746808888090004e-06, + "loss": 0.1963, + "step": 4343 + }, + { + "epoch": 2.056939923054158, + "grad_norm": 1.2309699058532715, + "learning_rate": 4.742457959022296e-06, + "loss": 0.2024, + "step": 4344 + }, + { + "epoch": 2.0574134359277894, + "grad_norm": 1.511290431022644, + "learning_rate": 4.738108405013988e-06, + "loss": 0.2285, + "step": 4345 + }, + { + "epoch": 2.0578869488014204, + "grad_norm": 2.3464651107788086, + "learning_rate": 4.73376022720266e-06, + "loss": 0.2228, + "step": 4346 + }, + { + "epoch": 2.058360461675052, + "grad_norm": 1.3974965810775757, + "learning_rate": 4.729413426725546e-06, + "loss": 0.2022, + "step": 4347 + }, + { + "epoch": 2.058833974548683, + "grad_norm": 1.8854213953018188, + "learning_rate": 4.725068004719499e-06, + "loss": 0.1891, + "step": 4348 + }, + { + "epoch": 2.0593074874223145, + "grad_norm": 1.3588758707046509, + "learning_rate": 4.72072396232103e-06, + "loss": 0.2084, + "step": 4349 + }, + { + "epoch": 2.0597810002959456, + "grad_norm": 1.5690404176712036, + "learning_rate": 4.716381300666275e-06, + "loss": 0.1923, + "step": 4350 + }, + { + "epoch": 2.0602545131695766, + "grad_norm": 1.721727967262268, + "learning_rate": 4.7120400208910235e-06, + "loss": 0.1893, + "step": 4351 + }, + { + "epoch": 2.060728026043208, + "grad_norm": 1.5722174644470215, + "learning_rate": 4.707700124130686e-06, + "loss": 0.1797, + "step": 4352 + }, + { + "epoch": 2.061201538916839, + "grad_norm": 1.4766589403152466, + "learning_rate": 4.703361611520331e-06, + "loss": 0.2187, + "step": 4353 + }, + { + "epoch": 2.0616750517904707, + "grad_norm": 1.067948341369629, + "learning_rate": 4.699024484194648e-06, + "loss": 0.2299, + "step": 4354 + }, + { + "epoch": 2.062148564664102, + "grad_norm": 1.0318337678909302, + "learning_rate": 4.694688743287966e-06, + "loss": 0.2086, + "step": 4355 + }, + { + "epoch": 2.062622077537733, + "grad_norm": 1.2142385244369507, + "learning_rate": 4.6903543899342685e-06, + "loss": 0.2005, + "step": 4356 + }, + { + "epoch": 2.0630955904113644, + "grad_norm": 1.0302040576934814, + "learning_rate": 4.686021425267152e-06, + "loss": 0.2019, + "step": 4357 + }, + { + "epoch": 2.0635691032849954, + "grad_norm": 0.8725736737251282, + "learning_rate": 4.681689850419871e-06, + "loss": 0.1938, + "step": 4358 + }, + { + "epoch": 2.064042616158627, + "grad_norm": 0.9284090995788574, + "learning_rate": 4.677359666525299e-06, + "loss": 0.1988, + "step": 4359 + }, + { + "epoch": 2.064516129032258, + "grad_norm": 1.0550601482391357, + "learning_rate": 4.673030874715961e-06, + "loss": 0.2268, + "step": 4360 + }, + { + "epoch": 2.0649896419058895, + "grad_norm": 2.1118485927581787, + "learning_rate": 4.668703476124005e-06, + "loss": 0.2087, + "step": 4361 + }, + { + "epoch": 2.0654631547795206, + "grad_norm": 1.2798197269439697, + "learning_rate": 4.664377471881226e-06, + "loss": 0.2028, + "step": 4362 + }, + { + "epoch": 2.0659366676531516, + "grad_norm": 1.4490396976470947, + "learning_rate": 4.660052863119046e-06, + "loss": 0.2108, + "step": 4363 + }, + { + "epoch": 2.066410180526783, + "grad_norm": 1.0242908000946045, + "learning_rate": 4.65572965096852e-06, + "loss": 0.1924, + "step": 4364 + }, + { + "epoch": 2.0668836934004142, + "grad_norm": 1.2563238143920898, + "learning_rate": 4.651407836560351e-06, + "loss": 0.1882, + "step": 4365 + }, + { + "epoch": 2.0673572062740457, + "grad_norm": 1.3273645639419556, + "learning_rate": 4.6470874210248595e-06, + "loss": 0.2313, + "step": 4366 + }, + { + "epoch": 2.067830719147677, + "grad_norm": 2.203810453414917, + "learning_rate": 4.642768405492016e-06, + "loss": 0.206, + "step": 4367 + }, + { + "epoch": 2.068304232021308, + "grad_norm": 2.2822744846343994, + "learning_rate": 4.63845079109141e-06, + "loss": 0.2121, + "step": 4368 + }, + { + "epoch": 2.0687777448949394, + "grad_norm": 1.70283043384552, + "learning_rate": 4.63413457895228e-06, + "loss": 0.1918, + "step": 4369 + }, + { + "epoch": 2.0692512577685704, + "grad_norm": 1.6546306610107422, + "learning_rate": 4.629819770203482e-06, + "loss": 0.2086, + "step": 4370 + }, + { + "epoch": 2.069724770642202, + "grad_norm": 1.611094355583191, + "learning_rate": 4.625506365973515e-06, + "loss": 0.2005, + "step": 4371 + }, + { + "epoch": 2.070198283515833, + "grad_norm": 1.6032445430755615, + "learning_rate": 4.621194367390515e-06, + "loss": 0.1965, + "step": 4372 + }, + { + "epoch": 2.0706717963894645, + "grad_norm": 1.065320372581482, + "learning_rate": 4.6168837755822326e-06, + "loss": 0.213, + "step": 4373 + }, + { + "epoch": 2.0711453092630956, + "grad_norm": 1.1174391508102417, + "learning_rate": 4.612574591676071e-06, + "loss": 0.1863, + "step": 4374 + }, + { + "epoch": 2.0716188221367267, + "grad_norm": 1.169047236442566, + "learning_rate": 4.608266816799049e-06, + "loss": 0.2003, + "step": 4375 + }, + { + "epoch": 2.072092335010358, + "grad_norm": 1.628919005393982, + "learning_rate": 4.603960452077828e-06, + "loss": 0.21, + "step": 4376 + }, + { + "epoch": 2.0725658478839892, + "grad_norm": 1.1893731355667114, + "learning_rate": 4.599655498638691e-06, + "loss": 0.2092, + "step": 4377 + }, + { + "epoch": 2.0730393607576207, + "grad_norm": 1.3188472986221313, + "learning_rate": 4.595351957607564e-06, + "loss": 0.2084, + "step": 4378 + }, + { + "epoch": 2.073512873631252, + "grad_norm": 1.0750354528427124, + "learning_rate": 4.5910498301099935e-06, + "loss": 0.2228, + "step": 4379 + }, + { + "epoch": 2.073986386504883, + "grad_norm": 1.0731077194213867, + "learning_rate": 4.5867491172711546e-06, + "loss": 0.2001, + "step": 4380 + }, + { + "epoch": 2.0744598993785144, + "grad_norm": 0.9855295419692993, + "learning_rate": 4.582449820215865e-06, + "loss": 0.2041, + "step": 4381 + }, + { + "epoch": 2.0749334122521454, + "grad_norm": 1.3802636861801147, + "learning_rate": 4.578151940068558e-06, + "loss": 0.2164, + "step": 4382 + }, + { + "epoch": 2.075406925125777, + "grad_norm": 1.4265203475952148, + "learning_rate": 4.57385547795331e-06, + "loss": 0.2146, + "step": 4383 + }, + { + "epoch": 2.075880437999408, + "grad_norm": 1.3063853979110718, + "learning_rate": 4.569560434993809e-06, + "loss": 0.1938, + "step": 4384 + }, + { + "epoch": 2.0763539508730395, + "grad_norm": 1.087546467781067, + "learning_rate": 4.5652668123133925e-06, + "loss": 0.1883, + "step": 4385 + }, + { + "epoch": 2.0768274637466706, + "grad_norm": 1.0859131813049316, + "learning_rate": 4.560974611035007e-06, + "loss": 0.2257, + "step": 4386 + }, + { + "epoch": 2.0773009766203017, + "grad_norm": 1.2528398036956787, + "learning_rate": 4.556683832281246e-06, + "loss": 0.1905, + "step": 4387 + }, + { + "epoch": 2.077774489493933, + "grad_norm": 1.1462734937667847, + "learning_rate": 4.552394477174316e-06, + "loss": 0.191, + "step": 4388 + }, + { + "epoch": 2.0782480023675642, + "grad_norm": 1.486925721168518, + "learning_rate": 4.5481065468360515e-06, + "loss": 0.1935, + "step": 4389 + }, + { + "epoch": 2.0787215152411957, + "grad_norm": 1.0987601280212402, + "learning_rate": 4.5438200423879285e-06, + "loss": 0.213, + "step": 4390 + }, + { + "epoch": 2.079195028114827, + "grad_norm": 0.965355396270752, + "learning_rate": 4.539534964951033e-06, + "loss": 0.2278, + "step": 4391 + }, + { + "epoch": 2.0796685409884583, + "grad_norm": 1.4318578243255615, + "learning_rate": 4.535251315646093e-06, + "loss": 0.2028, + "step": 4392 + }, + { + "epoch": 2.0801420538620894, + "grad_norm": 1.0449013710021973, + "learning_rate": 4.53096909559345e-06, + "loss": 0.2047, + "step": 4393 + }, + { + "epoch": 2.0806155667357205, + "grad_norm": 2.591721296310425, + "learning_rate": 4.526688305913081e-06, + "loss": 0.2128, + "step": 4394 + }, + { + "epoch": 2.081089079609352, + "grad_norm": 1.750844955444336, + "learning_rate": 4.52240894772458e-06, + "loss": 0.2035, + "step": 4395 + }, + { + "epoch": 2.081562592482983, + "grad_norm": 1.5162577629089355, + "learning_rate": 4.51813102214718e-06, + "loss": 0.2092, + "step": 4396 + }, + { + "epoch": 2.0820361053566145, + "grad_norm": 2.782047986984253, + "learning_rate": 4.513854530299723e-06, + "loss": 0.1874, + "step": 4397 + }, + { + "epoch": 2.0825096182302456, + "grad_norm": 2.013601779937744, + "learning_rate": 4.50957947330069e-06, + "loss": 0.2174, + "step": 4398 + }, + { + "epoch": 2.0829831311038767, + "grad_norm": 1.1781798601150513, + "learning_rate": 4.50530585226818e-06, + "loss": 0.1953, + "step": 4399 + }, + { + "epoch": 2.083456643977508, + "grad_norm": 1.7047263383865356, + "learning_rate": 4.501033668319913e-06, + "loss": 0.202, + "step": 4400 + }, + { + "epoch": 2.0839301568511392, + "grad_norm": 1.3971747159957886, + "learning_rate": 4.496762922573244e-06, + "loss": 0.2062, + "step": 4401 + }, + { + "epoch": 2.0844036697247708, + "grad_norm": 1.4985466003417969, + "learning_rate": 4.492493616145137e-06, + "loss": 0.1795, + "step": 4402 + }, + { + "epoch": 2.084877182598402, + "grad_norm": 1.2991846799850464, + "learning_rate": 4.4882257501521975e-06, + "loss": 0.1858, + "step": 4403 + }, + { + "epoch": 2.0853506954720333, + "grad_norm": 1.194708228111267, + "learning_rate": 4.483959325710636e-06, + "loss": 0.1944, + "step": 4404 + }, + { + "epoch": 2.0858242083456644, + "grad_norm": 1.1843652725219727, + "learning_rate": 4.479694343936303e-06, + "loss": 0.2054, + "step": 4405 + }, + { + "epoch": 2.0862977212192955, + "grad_norm": 1.1956802606582642, + "learning_rate": 4.4754308059446546e-06, + "loss": 0.2074, + "step": 4406 + }, + { + "epoch": 2.086771234092927, + "grad_norm": 1.1742326021194458, + "learning_rate": 4.471168712850787e-06, + "loss": 0.2047, + "step": 4407 + }, + { + "epoch": 2.087244746966558, + "grad_norm": 1.7787115573883057, + "learning_rate": 4.466908065769404e-06, + "loss": 0.2243, + "step": 4408 + }, + { + "epoch": 2.0877182598401895, + "grad_norm": 1.4370203018188477, + "learning_rate": 4.46264886581483e-06, + "loss": 0.25, + "step": 4409 + }, + { + "epoch": 2.0881917727138206, + "grad_norm": 1.0106841325759888, + "learning_rate": 4.458391114101034e-06, + "loss": 0.2019, + "step": 4410 + }, + { + "epoch": 2.088665285587452, + "grad_norm": 0.9871804714202881, + "learning_rate": 4.454134811741577e-06, + "loss": 0.1807, + "step": 4411 + }, + { + "epoch": 2.089138798461083, + "grad_norm": 1.2592934370040894, + "learning_rate": 4.449879959849662e-06, + "loss": 0.215, + "step": 4412 + }, + { + "epoch": 2.0896123113347143, + "grad_norm": 1.557584285736084, + "learning_rate": 4.445626559538101e-06, + "loss": 0.1972, + "step": 4413 + }, + { + "epoch": 2.0900858242083458, + "grad_norm": 1.67927885055542, + "learning_rate": 4.4413746119193245e-06, + "loss": 0.1969, + "step": 4414 + }, + { + "epoch": 2.090559337081977, + "grad_norm": 1.2148725986480713, + "learning_rate": 4.437124118105397e-06, + "loss": 0.2133, + "step": 4415 + }, + { + "epoch": 2.0910328499556083, + "grad_norm": 1.0764367580413818, + "learning_rate": 4.4328750792079875e-06, + "loss": 0.2014, + "step": 4416 + }, + { + "epoch": 2.0915063628292394, + "grad_norm": 0.9037055373191833, + "learning_rate": 4.428627496338398e-06, + "loss": 0.1938, + "step": 4417 + }, + { + "epoch": 2.0919798757028705, + "grad_norm": 1.0375968217849731, + "learning_rate": 4.424381370607535e-06, + "loss": 0.2043, + "step": 4418 + }, + { + "epoch": 2.092453388576502, + "grad_norm": 1.5051581859588623, + "learning_rate": 4.420136703125938e-06, + "loss": 0.2058, + "step": 4419 + }, + { + "epoch": 2.092926901450133, + "grad_norm": 1.5662999153137207, + "learning_rate": 4.415893495003753e-06, + "loss": 0.2044, + "step": 4420 + }, + { + "epoch": 2.0934004143237646, + "grad_norm": 1.0691457986831665, + "learning_rate": 4.411651747350758e-06, + "loss": 0.197, + "step": 4421 + }, + { + "epoch": 2.0938739271973956, + "grad_norm": 1.773221731185913, + "learning_rate": 4.407411461276333e-06, + "loss": 0.1969, + "step": 4422 + }, + { + "epoch": 2.094347440071027, + "grad_norm": 1.03744637966156, + "learning_rate": 4.4031726378894915e-06, + "loss": 0.2462, + "step": 4423 + }, + { + "epoch": 2.094820952944658, + "grad_norm": 1.8114067316055298, + "learning_rate": 4.3989352782988525e-06, + "loss": 0.1918, + "step": 4424 + }, + { + "epoch": 2.0952944658182893, + "grad_norm": 0.8494598269462585, + "learning_rate": 4.394699383612653e-06, + "loss": 0.1993, + "step": 4425 + }, + { + "epoch": 2.0957679786919208, + "grad_norm": 1.0043613910675049, + "learning_rate": 4.390464954938759e-06, + "loss": 0.202, + "step": 4426 + }, + { + "epoch": 2.096241491565552, + "grad_norm": 1.178818941116333, + "learning_rate": 4.386231993384635e-06, + "loss": 0.1917, + "step": 4427 + }, + { + "epoch": 2.0967150044391833, + "grad_norm": 1.1056585311889648, + "learning_rate": 4.382000500057381e-06, + "loss": 0.1876, + "step": 4428 + }, + { + "epoch": 2.0971885173128144, + "grad_norm": 1.0591120719909668, + "learning_rate": 4.377770476063694e-06, + "loss": 0.2058, + "step": 4429 + }, + { + "epoch": 2.0976620301864455, + "grad_norm": 1.0878396034240723, + "learning_rate": 4.373541922509905e-06, + "loss": 0.1971, + "step": 4430 + }, + { + "epoch": 2.098135543060077, + "grad_norm": 1.087065577507019, + "learning_rate": 4.369314840501943e-06, + "loss": 0.2182, + "step": 4431 + }, + { + "epoch": 2.098609055933708, + "grad_norm": 1.2058556079864502, + "learning_rate": 4.365089231145367e-06, + "loss": 0.203, + "step": 4432 + }, + { + "epoch": 2.0990825688073396, + "grad_norm": 1.8395373821258545, + "learning_rate": 4.360865095545343e-06, + "loss": 0.1915, + "step": 4433 + }, + { + "epoch": 2.0995560816809706, + "grad_norm": 1.0700697898864746, + "learning_rate": 4.356642434806646e-06, + "loss": 0.2025, + "step": 4434 + }, + { + "epoch": 2.100029594554602, + "grad_norm": 1.068076491355896, + "learning_rate": 4.352421250033683e-06, + "loss": 0.1822, + "step": 4435 + }, + { + "epoch": 2.100503107428233, + "grad_norm": 1.1136810779571533, + "learning_rate": 4.348201542330455e-06, + "loss": 0.1865, + "step": 4436 + }, + { + "epoch": 2.1009766203018643, + "grad_norm": 1.1286970376968384, + "learning_rate": 4.3439833128005925e-06, + "loss": 0.1867, + "step": 4437 + }, + { + "epoch": 2.101450133175496, + "grad_norm": 1.4740508794784546, + "learning_rate": 4.339766562547326e-06, + "loss": 0.2043, + "step": 4438 + }, + { + "epoch": 2.101923646049127, + "grad_norm": 0.8717254996299744, + "learning_rate": 4.335551292673515e-06, + "loss": 0.1854, + "step": 4439 + }, + { + "epoch": 2.1023971589227584, + "grad_norm": 1.1837518215179443, + "learning_rate": 4.331337504281613e-06, + "loss": 0.183, + "step": 4440 + }, + { + "epoch": 2.1028706717963894, + "grad_norm": 1.649964690208435, + "learning_rate": 4.327125198473704e-06, + "loss": 0.2153, + "step": 4441 + }, + { + "epoch": 2.1033441846700205, + "grad_norm": 1.4881905317306519, + "learning_rate": 4.322914376351472e-06, + "loss": 0.2027, + "step": 4442 + }, + { + "epoch": 2.103817697543652, + "grad_norm": 1.6832579374313354, + "learning_rate": 4.318705039016215e-06, + "loss": 0.2197, + "step": 4443 + }, + { + "epoch": 2.104291210417283, + "grad_norm": 1.136296033859253, + "learning_rate": 4.314497187568848e-06, + "loss": 0.2201, + "step": 4444 + }, + { + "epoch": 2.1047647232909146, + "grad_norm": 0.9659789800643921, + "learning_rate": 4.310290823109889e-06, + "loss": 0.1773, + "step": 4445 + }, + { + "epoch": 2.1052382361645456, + "grad_norm": 1.2847449779510498, + "learning_rate": 4.306085946739481e-06, + "loss": 0.1805, + "step": 4446 + }, + { + "epoch": 2.105711749038177, + "grad_norm": 1.6592435836791992, + "learning_rate": 4.301882559557359e-06, + "loss": 0.1944, + "step": 4447 + }, + { + "epoch": 2.106185261911808, + "grad_norm": 1.5656230449676514, + "learning_rate": 4.297680662662882e-06, + "loss": 0.2252, + "step": 4448 + }, + { + "epoch": 2.1066587747854393, + "grad_norm": 1.2571629285812378, + "learning_rate": 4.293480257155022e-06, + "loss": 0.224, + "step": 4449 + }, + { + "epoch": 2.107132287659071, + "grad_norm": 1.8102307319641113, + "learning_rate": 4.289281344132344e-06, + "loss": 0.1945, + "step": 4450 + }, + { + "epoch": 2.107605800532702, + "grad_norm": 1.113694190979004, + "learning_rate": 4.285083924693041e-06, + "loss": 0.1983, + "step": 4451 + }, + { + "epoch": 2.1080793134063334, + "grad_norm": 1.0855345726013184, + "learning_rate": 4.280887999934902e-06, + "loss": 0.2136, + "step": 4452 + }, + { + "epoch": 2.1085528262799644, + "grad_norm": 1.6805202960968018, + "learning_rate": 4.276693570955337e-06, + "loss": 0.1893, + "step": 4453 + }, + { + "epoch": 2.109026339153596, + "grad_norm": 1.4205180406570435, + "learning_rate": 4.272500638851351e-06, + "loss": 0.2042, + "step": 4454 + }, + { + "epoch": 2.109499852027227, + "grad_norm": 1.77024245262146, + "learning_rate": 4.2683092047195725e-06, + "loss": 0.2161, + "step": 4455 + }, + { + "epoch": 2.109973364900858, + "grad_norm": 1.2872737646102905, + "learning_rate": 4.264119269656224e-06, + "loss": 0.199, + "step": 4456 + }, + { + "epoch": 2.1104468777744896, + "grad_norm": 1.008962631225586, + "learning_rate": 4.259930834757149e-06, + "loss": 0.2022, + "step": 4457 + }, + { + "epoch": 2.1109203906481206, + "grad_norm": 1.3520052433013916, + "learning_rate": 4.255743901117788e-06, + "loss": 0.2068, + "step": 4458 + }, + { + "epoch": 2.111393903521752, + "grad_norm": 1.200093388557434, + "learning_rate": 4.25155846983319e-06, + "loss": 0.2165, + "step": 4459 + }, + { + "epoch": 2.1118674163953832, + "grad_norm": 0.9952120780944824, + "learning_rate": 4.247374541998022e-06, + "loss": 0.2021, + "step": 4460 + }, + { + "epoch": 2.1123409292690143, + "grad_norm": 2.106011390686035, + "learning_rate": 4.243192118706543e-06, + "loss": 0.1916, + "step": 4461 + }, + { + "epoch": 2.112814442142646, + "grad_norm": 1.3868449926376343, + "learning_rate": 4.239011201052631e-06, + "loss": 0.2154, + "step": 4462 + }, + { + "epoch": 2.113287955016277, + "grad_norm": 1.1115097999572754, + "learning_rate": 4.234831790129759e-06, + "loss": 0.2044, + "step": 4463 + }, + { + "epoch": 2.1137614678899084, + "grad_norm": 1.1005358695983887, + "learning_rate": 4.2306538870310185e-06, + "loss": 0.2041, + "step": 4464 + }, + { + "epoch": 2.1142349807635394, + "grad_norm": 1.5084381103515625, + "learning_rate": 4.226477492849092e-06, + "loss": 0.2113, + "step": 4465 + }, + { + "epoch": 2.114708493637171, + "grad_norm": 1.4259034395217896, + "learning_rate": 4.222302608676283e-06, + "loss": 0.2041, + "step": 4466 + }, + { + "epoch": 2.115182006510802, + "grad_norm": 1.2917234897613525, + "learning_rate": 4.218129235604488e-06, + "loss": 0.2141, + "step": 4467 + }, + { + "epoch": 2.115655519384433, + "grad_norm": 0.9340530633926392, + "learning_rate": 4.21395737472521e-06, + "loss": 0.1878, + "step": 4468 + }, + { + "epoch": 2.1161290322580646, + "grad_norm": 1.0747179985046387, + "learning_rate": 4.209787027129563e-06, + "loss": 0.2098, + "step": 4469 + }, + { + "epoch": 2.1166025451316957, + "grad_norm": 1.3655407428741455, + "learning_rate": 4.2056181939082584e-06, + "loss": 0.1985, + "step": 4470 + }, + { + "epoch": 2.117076058005327, + "grad_norm": 1.1653110980987549, + "learning_rate": 4.201450876151619e-06, + "loss": 0.2072, + "step": 4471 + }, + { + "epoch": 2.1175495708789582, + "grad_norm": 1.0803667306900024, + "learning_rate": 4.19728507494956e-06, + "loss": 0.2158, + "step": 4472 + }, + { + "epoch": 2.1180230837525897, + "grad_norm": 1.5151382684707642, + "learning_rate": 4.193120791391612e-06, + "loss": 0.1998, + "step": 4473 + }, + { + "epoch": 2.118496596626221, + "grad_norm": 1.5851473808288574, + "learning_rate": 4.1889580265669e-06, + "loss": 0.199, + "step": 4474 + }, + { + "epoch": 2.118970109499852, + "grad_norm": 1.5319643020629883, + "learning_rate": 4.184796781564158e-06, + "loss": 0.1875, + "step": 4475 + }, + { + "epoch": 2.1194436223734834, + "grad_norm": 1.3403736352920532, + "learning_rate": 4.180637057471714e-06, + "loss": 0.1847, + "step": 4476 + }, + { + "epoch": 2.1199171352471144, + "grad_norm": 1.2243744134902954, + "learning_rate": 4.1764788553775105e-06, + "loss": 0.1827, + "step": 4477 + }, + { + "epoch": 2.120390648120746, + "grad_norm": 0.9514744877815247, + "learning_rate": 4.1723221763690826e-06, + "loss": 0.1932, + "step": 4478 + }, + { + "epoch": 2.120864160994377, + "grad_norm": 0.9540281295776367, + "learning_rate": 4.1681670215335646e-06, + "loss": 0.1746, + "step": 4479 + }, + { + "epoch": 2.121337673868008, + "grad_norm": 1.6678895950317383, + "learning_rate": 4.1640133919577065e-06, + "loss": 0.2004, + "step": 4480 + }, + { + "epoch": 2.1218111867416396, + "grad_norm": 1.186750054359436, + "learning_rate": 4.15986128872784e-06, + "loss": 0.2015, + "step": 4481 + }, + { + "epoch": 2.1222846996152707, + "grad_norm": 1.694643259048462, + "learning_rate": 4.155710712929916e-06, + "loss": 0.1983, + "step": 4482 + }, + { + "epoch": 2.122758212488902, + "grad_norm": 1.0623250007629395, + "learning_rate": 4.151561665649471e-06, + "loss": 0.2111, + "step": 4483 + }, + { + "epoch": 2.1232317253625332, + "grad_norm": 1.233841061592102, + "learning_rate": 4.147414147971655e-06, + "loss": 0.2121, + "step": 4484 + }, + { + "epoch": 2.1237052382361647, + "grad_norm": 1.1146776676177979, + "learning_rate": 4.143268160981204e-06, + "loss": 0.1958, + "step": 4485 + }, + { + "epoch": 2.124178751109796, + "grad_norm": 1.02981436252594, + "learning_rate": 4.139123705762469e-06, + "loss": 0.1988, + "step": 4486 + }, + { + "epoch": 2.124652263983427, + "grad_norm": 1.6232107877731323, + "learning_rate": 4.134980783399384e-06, + "loss": 0.2104, + "step": 4487 + }, + { + "epoch": 2.1251257768570584, + "grad_norm": 1.1073497533798218, + "learning_rate": 4.130839394975493e-06, + "loss": 0.1978, + "step": 4488 + }, + { + "epoch": 2.1255992897306895, + "grad_norm": 1.430740475654602, + "learning_rate": 4.126699541573943e-06, + "loss": 0.1999, + "step": 4489 + }, + { + "epoch": 2.126072802604321, + "grad_norm": 1.1626049280166626, + "learning_rate": 4.122561224277463e-06, + "loss": 0.1954, + "step": 4490 + }, + { + "epoch": 2.126546315477952, + "grad_norm": 0.9593620300292969, + "learning_rate": 4.1184244441683965e-06, + "loss": 0.1728, + "step": 4491 + }, + { + "epoch": 2.1270198283515835, + "grad_norm": 1.0123560428619385, + "learning_rate": 4.114289202328678e-06, + "loss": 0.2072, + "step": 4492 + }, + { + "epoch": 2.1274933412252146, + "grad_norm": 1.1748878955841064, + "learning_rate": 4.110155499839833e-06, + "loss": 0.2059, + "step": 4493 + }, + { + "epoch": 2.1279668540988457, + "grad_norm": 1.172324538230896, + "learning_rate": 4.106023337783e-06, + "loss": 0.1993, + "step": 4494 + }, + { + "epoch": 2.128440366972477, + "grad_norm": 1.9978069067001343, + "learning_rate": 4.1018927172389e-06, + "loss": 0.2031, + "step": 4495 + }, + { + "epoch": 2.1289138798461082, + "grad_norm": 1.4836291074752808, + "learning_rate": 4.097763639287864e-06, + "loss": 0.2432, + "step": 4496 + }, + { + "epoch": 2.1293873927197398, + "grad_norm": 1.3724143505096436, + "learning_rate": 4.093636105009804e-06, + "loss": 0.2108, + "step": 4497 + }, + { + "epoch": 2.129860905593371, + "grad_norm": 1.8259536027908325, + "learning_rate": 4.0895101154842444e-06, + "loss": 0.1977, + "step": 4498 + }, + { + "epoch": 2.130334418467002, + "grad_norm": 1.0505609512329102, + "learning_rate": 4.08538567179029e-06, + "loss": 0.1911, + "step": 4499 + }, + { + "epoch": 2.1308079313406334, + "grad_norm": 1.6359678506851196, + "learning_rate": 4.081262775006659e-06, + "loss": 0.2034, + "step": 4500 + }, + { + "epoch": 2.1312814442142645, + "grad_norm": 1.0886156558990479, + "learning_rate": 4.077141426211647e-06, + "loss": 0.2053, + "step": 4501 + }, + { + "epoch": 2.131754957087896, + "grad_norm": 1.12444269657135, + "learning_rate": 4.073021626483159e-06, + "loss": 0.2192, + "step": 4502 + }, + { + "epoch": 2.132228469961527, + "grad_norm": 0.8436000347137451, + "learning_rate": 4.0689033768986855e-06, + "loss": 0.2068, + "step": 4503 + }, + { + "epoch": 2.132701982835158, + "grad_norm": 1.2649866342544556, + "learning_rate": 4.064786678535313e-06, + "loss": 0.2072, + "step": 4504 + }, + { + "epoch": 2.1331754957087896, + "grad_norm": 1.0428705215454102, + "learning_rate": 4.0606715324697285e-06, + "loss": 0.2224, + "step": 4505 + }, + { + "epoch": 2.1336490085824207, + "grad_norm": 1.1856803894042969, + "learning_rate": 4.056557939778205e-06, + "loss": 0.1947, + "step": 4506 + }, + { + "epoch": 2.134122521456052, + "grad_norm": 2.003635883331299, + "learning_rate": 4.052445901536618e-06, + "loss": 0.2234, + "step": 4507 + }, + { + "epoch": 2.1345960343296833, + "grad_norm": 1.0833152532577515, + "learning_rate": 4.048335418820425e-06, + "loss": 0.1878, + "step": 4508 + }, + { + "epoch": 2.1350695472033148, + "grad_norm": 1.1753066778182983, + "learning_rate": 4.04422649270469e-06, + "loss": 0.1935, + "step": 4509 + }, + { + "epoch": 2.135543060076946, + "grad_norm": 1.2887529134750366, + "learning_rate": 4.040119124264056e-06, + "loss": 0.21, + "step": 4510 + }, + { + "epoch": 2.136016572950577, + "grad_norm": 1.0373543500900269, + "learning_rate": 4.036013314572772e-06, + "loss": 0.2036, + "step": 4511 + }, + { + "epoch": 2.1364900858242084, + "grad_norm": 1.2322744131088257, + "learning_rate": 4.0319090647046714e-06, + "loss": 0.2086, + "step": 4512 + }, + { + "epoch": 2.1369635986978395, + "grad_norm": 1.1571698188781738, + "learning_rate": 4.0278063757331745e-06, + "loss": 0.1731, + "step": 4513 + }, + { + "epoch": 2.137437111571471, + "grad_norm": 1.0368672609329224, + "learning_rate": 4.0237052487313084e-06, + "loss": 0.204, + "step": 4514 + }, + { + "epoch": 2.137910624445102, + "grad_norm": 1.494619369506836, + "learning_rate": 4.0196056847716795e-06, + "loss": 0.2129, + "step": 4515 + }, + { + "epoch": 2.1383841373187336, + "grad_norm": 1.6594982147216797, + "learning_rate": 4.015507684926491e-06, + "loss": 0.1958, + "step": 4516 + }, + { + "epoch": 2.1388576501923646, + "grad_norm": 1.1065243482589722, + "learning_rate": 4.0114112502675305e-06, + "loss": 0.1961, + "step": 4517 + }, + { + "epoch": 2.1393311630659957, + "grad_norm": 1.0827975273132324, + "learning_rate": 4.007316381866188e-06, + "loss": 0.2111, + "step": 4518 + }, + { + "epoch": 2.139804675939627, + "grad_norm": 1.3087358474731445, + "learning_rate": 4.003223080793432e-06, + "loss": 0.2031, + "step": 4519 + }, + { + "epoch": 2.1402781888132583, + "grad_norm": 1.1883478164672852, + "learning_rate": 3.999131348119829e-06, + "loss": 0.198, + "step": 4520 + }, + { + "epoch": 2.1407517016868898, + "grad_norm": 1.2664716243743896, + "learning_rate": 3.995041184915531e-06, + "loss": 0.2151, + "step": 4521 + }, + { + "epoch": 2.141225214560521, + "grad_norm": 1.1769858598709106, + "learning_rate": 3.990952592250277e-06, + "loss": 0.2112, + "step": 4522 + }, + { + "epoch": 2.141698727434152, + "grad_norm": 1.0426677465438843, + "learning_rate": 3.986865571193404e-06, + "loss": 0.1892, + "step": 4523 + }, + { + "epoch": 2.1421722403077834, + "grad_norm": 1.4847642183303833, + "learning_rate": 3.98278012281383e-06, + "loss": 0.209, + "step": 4524 + }, + { + "epoch": 2.1426457531814145, + "grad_norm": 1.9673742055892944, + "learning_rate": 3.978696248180069e-06, + "loss": 0.2053, + "step": 4525 + }, + { + "epoch": 2.143119266055046, + "grad_norm": 1.1002095937728882, + "learning_rate": 3.9746139483602095e-06, + "loss": 0.1944, + "step": 4526 + }, + { + "epoch": 2.143592778928677, + "grad_norm": 1.0705955028533936, + "learning_rate": 3.970533224421947e-06, + "loss": 0.1889, + "step": 4527 + }, + { + "epoch": 2.1440662918023086, + "grad_norm": 1.1432850360870361, + "learning_rate": 3.9664540774325545e-06, + "loss": 0.1978, + "step": 4528 + }, + { + "epoch": 2.1445398046759396, + "grad_norm": 1.0200495719909668, + "learning_rate": 3.962376508458887e-06, + "loss": 0.2121, + "step": 4529 + }, + { + "epoch": 2.1450133175495707, + "grad_norm": 1.4120169878005981, + "learning_rate": 3.958300518567403e-06, + "loss": 0.2158, + "step": 4530 + }, + { + "epoch": 2.145486830423202, + "grad_norm": 1.242547631263733, + "learning_rate": 3.954226108824129e-06, + "loss": 0.2045, + "step": 4531 + }, + { + "epoch": 2.1459603432968333, + "grad_norm": 1.3954753875732422, + "learning_rate": 3.950153280294695e-06, + "loss": 0.1992, + "step": 4532 + }, + { + "epoch": 2.146433856170465, + "grad_norm": 1.1535682678222656, + "learning_rate": 3.946082034044303e-06, + "loss": 0.1887, + "step": 4533 + }, + { + "epoch": 2.146907369044096, + "grad_norm": 1.6597424745559692, + "learning_rate": 3.942012371137755e-06, + "loss": 0.206, + "step": 4534 + }, + { + "epoch": 2.1473808819177274, + "grad_norm": 1.2386748790740967, + "learning_rate": 3.937944292639426e-06, + "loss": 0.2018, + "step": 4535 + }, + { + "epoch": 2.1478543947913584, + "grad_norm": 1.133917212486267, + "learning_rate": 3.9338777996132885e-06, + "loss": 0.2068, + "step": 4536 + }, + { + "epoch": 2.1483279076649895, + "grad_norm": 1.2553430795669556, + "learning_rate": 3.929812893122892e-06, + "loss": 0.2355, + "step": 4537 + }, + { + "epoch": 2.148801420538621, + "grad_norm": 1.0734045505523682, + "learning_rate": 3.9257495742313704e-06, + "loss": 0.2102, + "step": 4538 + }, + { + "epoch": 2.149274933412252, + "grad_norm": 1.530061960220337, + "learning_rate": 3.9216878440014506e-06, + "loss": 0.2079, + "step": 4539 + }, + { + "epoch": 2.1497484462858836, + "grad_norm": 1.4186937808990479, + "learning_rate": 3.917627703495434e-06, + "loss": 0.1909, + "step": 4540 + }, + { + "epoch": 2.1502219591595146, + "grad_norm": 0.925179123878479, + "learning_rate": 3.913569153775216e-06, + "loss": 0.1862, + "step": 4541 + }, + { + "epoch": 2.1506954720331457, + "grad_norm": 1.8251923322677612, + "learning_rate": 3.909512195902266e-06, + "loss": 0.1955, + "step": 4542 + }, + { + "epoch": 2.151168984906777, + "grad_norm": 1.1590983867645264, + "learning_rate": 3.905456830937651e-06, + "loss": 0.1839, + "step": 4543 + }, + { + "epoch": 2.1516424977804083, + "grad_norm": 1.3032938241958618, + "learning_rate": 3.901403059942e-06, + "loss": 0.1992, + "step": 4544 + }, + { + "epoch": 2.15211601065404, + "grad_norm": 1.323080062866211, + "learning_rate": 3.897350883975551e-06, + "loss": 0.1924, + "step": 4545 + }, + { + "epoch": 2.152589523527671, + "grad_norm": 1.6115025281906128, + "learning_rate": 3.893300304098102e-06, + "loss": 0.1954, + "step": 4546 + }, + { + "epoch": 2.1530630364013024, + "grad_norm": 1.2296570539474487, + "learning_rate": 3.889251321369044e-06, + "loss": 0.2111, + "step": 4547 + }, + { + "epoch": 2.1535365492749334, + "grad_norm": 1.6636604070663452, + "learning_rate": 3.885203936847355e-06, + "loss": 0.2111, + "step": 4548 + }, + { + "epoch": 2.1540100621485645, + "grad_norm": 1.3433752059936523, + "learning_rate": 3.881158151591583e-06, + "loss": 0.227, + "step": 4549 + }, + { + "epoch": 2.154483575022196, + "grad_norm": 1.0224047899246216, + "learning_rate": 3.877113966659869e-06, + "loss": 0.1997, + "step": 4550 + }, + { + "epoch": 2.154957087895827, + "grad_norm": 1.1871498823165894, + "learning_rate": 3.8730713831099265e-06, + "loss": 0.2111, + "step": 4551 + }, + { + "epoch": 2.1554306007694586, + "grad_norm": 1.0613186359405518, + "learning_rate": 3.869030401999059e-06, + "loss": 0.2026, + "step": 4552 + }, + { + "epoch": 2.1559041136430896, + "grad_norm": 1.2302757501602173, + "learning_rate": 3.8649910243841395e-06, + "loss": 0.212, + "step": 4553 + }, + { + "epoch": 2.156377626516721, + "grad_norm": 0.9007354974746704, + "learning_rate": 3.860953251321635e-06, + "loss": 0.1887, + "step": 4554 + }, + { + "epoch": 2.1568511393903522, + "grad_norm": 1.7830326557159424, + "learning_rate": 3.856917083867581e-06, + "loss": 0.1895, + "step": 4555 + }, + { + "epoch": 2.1573246522639833, + "grad_norm": 1.7577792406082153, + "learning_rate": 3.852882523077604e-06, + "loss": 0.222, + "step": 4556 + }, + { + "epoch": 2.157798165137615, + "grad_norm": 1.0963820219039917, + "learning_rate": 3.8488495700068994e-06, + "loss": 0.2307, + "step": 4557 + }, + { + "epoch": 2.158271678011246, + "grad_norm": 1.826006531715393, + "learning_rate": 3.844818225710246e-06, + "loss": 0.1977, + "step": 4558 + }, + { + "epoch": 2.1587451908848774, + "grad_norm": 1.689234972000122, + "learning_rate": 3.840788491242009e-06, + "loss": 0.204, + "step": 4559 + }, + { + "epoch": 2.1592187037585084, + "grad_norm": 1.8260334730148315, + "learning_rate": 3.8367603676561195e-06, + "loss": 0.2045, + "step": 4560 + }, + { + "epoch": 2.1596922166321395, + "grad_norm": 1.4265202283859253, + "learning_rate": 3.832733856006103e-06, + "loss": 0.2122, + "step": 4561 + }, + { + "epoch": 2.160165729505771, + "grad_norm": 1.1697028875350952, + "learning_rate": 3.8287089573450444e-06, + "loss": 0.1932, + "step": 4562 + }, + { + "epoch": 2.160639242379402, + "grad_norm": 1.2971158027648926, + "learning_rate": 3.824685672725626e-06, + "loss": 0.1934, + "step": 4563 + }, + { + "epoch": 2.1611127552530336, + "grad_norm": 0.9162940382957458, + "learning_rate": 3.820664003200092e-06, + "loss": 0.2034, + "step": 4564 + }, + { + "epoch": 2.1615862681266647, + "grad_norm": 2.122776985168457, + "learning_rate": 3.816643949820275e-06, + "loss": 0.2026, + "step": 4565 + }, + { + "epoch": 2.1620597810002957, + "grad_norm": 1.0466620922088623, + "learning_rate": 3.8126255136375857e-06, + "loss": 0.2109, + "step": 4566 + }, + { + "epoch": 2.1625332938739272, + "grad_norm": 0.9423272013664246, + "learning_rate": 3.808608695702999e-06, + "loss": 0.1851, + "step": 4567 + }, + { + "epoch": 2.1630068067475583, + "grad_norm": 1.181909203529358, + "learning_rate": 3.8045934970670808e-06, + "loss": 0.2037, + "step": 4568 + }, + { + "epoch": 2.16348031962119, + "grad_norm": 1.0272080898284912, + "learning_rate": 3.800579918779963e-06, + "loss": 0.1763, + "step": 4569 + }, + { + "epoch": 2.163953832494821, + "grad_norm": 1.413796305656433, + "learning_rate": 3.796567961891363e-06, + "loss": 0.1911, + "step": 4570 + }, + { + "epoch": 2.1644273453684524, + "grad_norm": 1.3752124309539795, + "learning_rate": 3.792557627450568e-06, + "loss": 0.193, + "step": 4571 + }, + { + "epoch": 2.1649008582420834, + "grad_norm": 1.488099217414856, + "learning_rate": 3.788548916506437e-06, + "loss": 0.1938, + "step": 4572 + }, + { + "epoch": 2.1653743711157145, + "grad_norm": 1.0083427429199219, + "learning_rate": 3.7845418301074176e-06, + "loss": 0.2191, + "step": 4573 + }, + { + "epoch": 2.165847883989346, + "grad_norm": 1.5522511005401611, + "learning_rate": 3.7805363693015172e-06, + "loss": 0.1947, + "step": 4574 + }, + { + "epoch": 2.166321396862977, + "grad_norm": 1.5290082693099976, + "learning_rate": 3.7765325351363335e-06, + "loss": 0.1975, + "step": 4575 + }, + { + "epoch": 2.1667949097366086, + "grad_norm": 0.8700765371322632, + "learning_rate": 3.772530328659023e-06, + "loss": 0.2067, + "step": 4576 + }, + { + "epoch": 2.1672684226102397, + "grad_norm": 0.9798673987388611, + "learning_rate": 3.7685297509163297e-06, + "loss": 0.2168, + "step": 4577 + }, + { + "epoch": 2.167741935483871, + "grad_norm": 1.7563843727111816, + "learning_rate": 3.7645308029545623e-06, + "loss": 0.1976, + "step": 4578 + }, + { + "epoch": 2.1682154483575022, + "grad_norm": 1.0324368476867676, + "learning_rate": 3.7605334858196107e-06, + "loss": 0.204, + "step": 4579 + }, + { + "epoch": 2.1686889612311333, + "grad_norm": 1.488629937171936, + "learning_rate": 3.75653780055693e-06, + "loss": 0.2303, + "step": 4580 + }, + { + "epoch": 2.169162474104765, + "grad_norm": 1.2473211288452148, + "learning_rate": 3.752543748211559e-06, + "loss": 0.2015, + "step": 4581 + }, + { + "epoch": 2.169635986978396, + "grad_norm": 1.4931015968322754, + "learning_rate": 3.748551329828101e-06, + "loss": 0.2084, + "step": 4582 + }, + { + "epoch": 2.1701094998520274, + "grad_norm": 1.2696232795715332, + "learning_rate": 3.7445605464507295e-06, + "loss": 0.2173, + "step": 4583 + }, + { + "epoch": 2.1705830127256585, + "grad_norm": 0.9876007437705994, + "learning_rate": 3.740571399123204e-06, + "loss": 0.2009, + "step": 4584 + }, + { + "epoch": 2.1710565255992895, + "grad_norm": 1.6325695514678955, + "learning_rate": 3.7365838888888395e-06, + "loss": 0.2295, + "step": 4585 + }, + { + "epoch": 2.171530038472921, + "grad_norm": 1.6730865240097046, + "learning_rate": 3.732598016790537e-06, + "loss": 0.2208, + "step": 4586 + }, + { + "epoch": 2.172003551346552, + "grad_norm": 1.2315953969955444, + "learning_rate": 3.728613783870759e-06, + "loss": 0.1985, + "step": 4587 + }, + { + "epoch": 2.1724770642201836, + "grad_norm": 1.5150845050811768, + "learning_rate": 3.724631191171547e-06, + "loss": 0.194, + "step": 4588 + }, + { + "epoch": 2.1729505770938147, + "grad_norm": 1.2721232175827026, + "learning_rate": 3.7206502397345044e-06, + "loss": 0.1883, + "step": 4589 + }, + { + "epoch": 2.173424089967446, + "grad_norm": 1.0581114292144775, + "learning_rate": 3.7166709306008163e-06, + "loss": 0.2121, + "step": 4590 + }, + { + "epoch": 2.1738976028410772, + "grad_norm": 0.9920486211776733, + "learning_rate": 3.712693264811231e-06, + "loss": 0.192, + "step": 4591 + }, + { + "epoch": 2.1743711157147083, + "grad_norm": 1.3474196195602417, + "learning_rate": 3.7087172434060635e-06, + "loss": 0.2062, + "step": 4592 + }, + { + "epoch": 2.17484462858834, + "grad_norm": 1.098014235496521, + "learning_rate": 3.704742867425212e-06, + "loss": 0.2018, + "step": 4593 + }, + { + "epoch": 2.175318141461971, + "grad_norm": 1.0982664823532104, + "learning_rate": 3.7007701379081275e-06, + "loss": 0.204, + "step": 4594 + }, + { + "epoch": 2.1757916543356024, + "grad_norm": 1.2308850288391113, + "learning_rate": 3.6967990558938484e-06, + "loss": 0.2105, + "step": 4595 + }, + { + "epoch": 2.1762651672092335, + "grad_norm": 1.5977511405944824, + "learning_rate": 3.6928296224209636e-06, + "loss": 0.2215, + "step": 4596 + }, + { + "epoch": 2.176738680082865, + "grad_norm": 1.0719974040985107, + "learning_rate": 3.688861838527649e-06, + "loss": 0.196, + "step": 4597 + }, + { + "epoch": 2.177212192956496, + "grad_norm": 1.4752261638641357, + "learning_rate": 3.684895705251632e-06, + "loss": 0.1891, + "step": 4598 + }, + { + "epoch": 2.177685705830127, + "grad_norm": 1.9501125812530518, + "learning_rate": 3.6809312236302243e-06, + "loss": 0.208, + "step": 4599 + }, + { + "epoch": 2.1781592187037586, + "grad_norm": 1.51497483253479, + "learning_rate": 3.6769683947002934e-06, + "loss": 0.2276, + "step": 4600 + }, + { + "epoch": 2.1786327315773897, + "grad_norm": 1.680430293083191, + "learning_rate": 3.673007219498276e-06, + "loss": 0.1995, + "step": 4601 + }, + { + "epoch": 2.179106244451021, + "grad_norm": 1.073860764503479, + "learning_rate": 3.6690476990601866e-06, + "loss": 0.217, + "step": 4602 + }, + { + "epoch": 2.1795797573246523, + "grad_norm": 1.1040340662002563, + "learning_rate": 3.665089834421588e-06, + "loss": 0.2057, + "step": 4603 + }, + { + "epoch": 2.1800532701982833, + "grad_norm": 1.3993034362792969, + "learning_rate": 3.661133626617638e-06, + "loss": 0.2013, + "step": 4604 + }, + { + "epoch": 2.180526783071915, + "grad_norm": 1.149754524230957, + "learning_rate": 3.657179076683034e-06, + "loss": 0.1923, + "step": 4605 + }, + { + "epoch": 2.181000295945546, + "grad_norm": 1.6532198190689087, + "learning_rate": 3.653226185652049e-06, + "loss": 0.1997, + "step": 4606 + }, + { + "epoch": 2.1814738088191774, + "grad_norm": 1.3920501470565796, + "learning_rate": 3.6492749545585313e-06, + "loss": 0.2051, + "step": 4607 + }, + { + "epoch": 2.1819473216928085, + "grad_norm": 1.0357420444488525, + "learning_rate": 3.6453253844358783e-06, + "loss": 0.2199, + "step": 4608 + }, + { + "epoch": 2.18242083456644, + "grad_norm": 1.4226856231689453, + "learning_rate": 3.6413774763170707e-06, + "loss": 0.2089, + "step": 4609 + }, + { + "epoch": 2.182894347440071, + "grad_norm": 1.0830601453781128, + "learning_rate": 3.637431231234637e-06, + "loss": 0.2125, + "step": 4610 + }, + { + "epoch": 2.183367860313702, + "grad_norm": 1.3604761362075806, + "learning_rate": 3.6334866502206877e-06, + "loss": 0.203, + "step": 4611 + }, + { + "epoch": 2.1838413731873336, + "grad_norm": 1.3618247509002686, + "learning_rate": 3.6295437343068828e-06, + "loss": 0.1979, + "step": 4612 + }, + { + "epoch": 2.1843148860609647, + "grad_norm": 1.374878168106079, + "learning_rate": 3.625602484524461e-06, + "loss": 0.2205, + "step": 4613 + }, + { + "epoch": 2.184788398934596, + "grad_norm": 0.9489659667015076, + "learning_rate": 3.6216629019042106e-06, + "loss": 0.1847, + "step": 4614 + }, + { + "epoch": 2.1852619118082273, + "grad_norm": 1.074998378753662, + "learning_rate": 3.6177249874764986e-06, + "loss": 0.1974, + "step": 4615 + }, + { + "epoch": 2.1857354246818583, + "grad_norm": 1.1851049661636353, + "learning_rate": 3.613788742271246e-06, + "loss": 0.2032, + "step": 4616 + }, + { + "epoch": 2.18620893755549, + "grad_norm": 0.9041722416877747, + "learning_rate": 3.6098541673179353e-06, + "loss": 0.2215, + "step": 4617 + }, + { + "epoch": 2.186682450429121, + "grad_norm": 1.1576639413833618, + "learning_rate": 3.605921263645623e-06, + "loss": 0.21, + "step": 4618 + }, + { + "epoch": 2.1871559633027524, + "grad_norm": 1.2321959733963013, + "learning_rate": 3.6019900322829173e-06, + "loss": 0.2291, + "step": 4619 + }, + { + "epoch": 2.1876294761763835, + "grad_norm": 1.1647976636886597, + "learning_rate": 3.5980604742579985e-06, + "loss": 0.1897, + "step": 4620 + }, + { + "epoch": 2.188102989050015, + "grad_norm": 1.6139419078826904, + "learning_rate": 3.594132590598599e-06, + "loss": 0.2243, + "step": 4621 + }, + { + "epoch": 2.188576501923646, + "grad_norm": 1.1993911266326904, + "learning_rate": 3.5902063823320255e-06, + "loss": 0.2209, + "step": 4622 + }, + { + "epoch": 2.189050014797277, + "grad_norm": 1.2088009119033813, + "learning_rate": 3.5862818504851325e-06, + "loss": 0.1982, + "step": 4623 + }, + { + "epoch": 2.1895235276709086, + "grad_norm": 1.2603254318237305, + "learning_rate": 3.5823589960843506e-06, + "loss": 0.1903, + "step": 4624 + }, + { + "epoch": 2.1899970405445397, + "grad_norm": 1.01237952709198, + "learning_rate": 3.5784378201556612e-06, + "loss": 0.2093, + "step": 4625 + }, + { + "epoch": 2.190470553418171, + "grad_norm": 1.6355202198028564, + "learning_rate": 3.5745183237246074e-06, + "loss": 0.1958, + "step": 4626 + }, + { + "epoch": 2.1909440662918023, + "grad_norm": 1.0540543794631958, + "learning_rate": 3.570600507816301e-06, + "loss": 0.2108, + "step": 4627 + }, + { + "epoch": 2.1914175791654333, + "grad_norm": 2.3014087677001953, + "learning_rate": 3.5666843734554022e-06, + "loss": 0.1943, + "step": 4628 + }, + { + "epoch": 2.191891092039065, + "grad_norm": 1.5514744520187378, + "learning_rate": 3.562769921666147e-06, + "loss": 0.2027, + "step": 4629 + }, + { + "epoch": 2.192364604912696, + "grad_norm": 1.1093729734420776, + "learning_rate": 3.558857153472314e-06, + "loss": 0.1883, + "step": 4630 + }, + { + "epoch": 2.1928381177863274, + "grad_norm": 1.1554844379425049, + "learning_rate": 3.554946069897256e-06, + "loss": 0.1939, + "step": 4631 + }, + { + "epoch": 2.1933116306599585, + "grad_norm": 1.6279453039169312, + "learning_rate": 3.5510366719638745e-06, + "loss": 0.2058, + "step": 4632 + }, + { + "epoch": 2.19378514353359, + "grad_norm": 1.1952141523361206, + "learning_rate": 3.54712896069464e-06, + "loss": 0.1856, + "step": 4633 + }, + { + "epoch": 2.194258656407221, + "grad_norm": 1.0594127178192139, + "learning_rate": 3.543222937111571e-06, + "loss": 0.2095, + "step": 4634 + }, + { + "epoch": 2.194732169280852, + "grad_norm": 1.2939765453338623, + "learning_rate": 3.539318602236257e-06, + "loss": 0.1998, + "step": 4635 + }, + { + "epoch": 2.1952056821544836, + "grad_norm": 1.392008900642395, + "learning_rate": 3.535415957089835e-06, + "loss": 0.1864, + "step": 4636 + }, + { + "epoch": 2.1956791950281147, + "grad_norm": 1.3421459197998047, + "learning_rate": 3.531515002693e-06, + "loss": 0.2139, + "step": 4637 + }, + { + "epoch": 2.196152707901746, + "grad_norm": 1.5865654945373535, + "learning_rate": 3.5276157400660184e-06, + "loss": 0.2222, + "step": 4638 + }, + { + "epoch": 2.1966262207753773, + "grad_norm": 0.9927922487258911, + "learning_rate": 3.523718170228696e-06, + "loss": 0.1997, + "step": 4639 + }, + { + "epoch": 2.197099733649009, + "grad_norm": 1.2386114597320557, + "learning_rate": 3.5198222942004113e-06, + "loss": 0.1857, + "step": 4640 + }, + { + "epoch": 2.19757324652264, + "grad_norm": 0.8716875910758972, + "learning_rate": 3.5159281130000867e-06, + "loss": 0.1986, + "step": 4641 + }, + { + "epoch": 2.198046759396271, + "grad_norm": 1.1668813228607178, + "learning_rate": 3.512035627646211e-06, + "loss": 0.2301, + "step": 4642 + }, + { + "epoch": 2.1985202722699024, + "grad_norm": 1.0498262643814087, + "learning_rate": 3.5081448391568307e-06, + "loss": 0.2091, + "step": 4643 + }, + { + "epoch": 2.1989937851435335, + "grad_norm": 1.0276434421539307, + "learning_rate": 3.5042557485495355e-06, + "loss": 0.1967, + "step": 4644 + }, + { + "epoch": 2.199467298017165, + "grad_norm": 2.1473894119262695, + "learning_rate": 3.500368356841487e-06, + "loss": 0.2149, + "step": 4645 + }, + { + "epoch": 2.199940810890796, + "grad_norm": 1.263840913772583, + "learning_rate": 3.496482665049389e-06, + "loss": 0.2281, + "step": 4646 + }, + { + "epoch": 2.200414323764427, + "grad_norm": 1.1061254739761353, + "learning_rate": 3.4925986741895125e-06, + "loss": 0.1913, + "step": 4647 + }, + { + "epoch": 2.2008878366380586, + "grad_norm": 1.3180168867111206, + "learning_rate": 3.4887163852776716e-06, + "loss": 0.2162, + "step": 4648 + }, + { + "epoch": 2.2013613495116897, + "grad_norm": 1.3046200275421143, + "learning_rate": 3.484835799329248e-06, + "loss": 0.1811, + "step": 4649 + }, + { + "epoch": 2.2018348623853212, + "grad_norm": 1.533521056175232, + "learning_rate": 3.4809569173591683e-06, + "loss": 0.2237, + "step": 4650 + }, + { + "epoch": 2.2023083752589523, + "grad_norm": 1.1395829916000366, + "learning_rate": 3.4770797403819122e-06, + "loss": 0.1978, + "step": 4651 + }, + { + "epoch": 2.202781888132584, + "grad_norm": 1.0285067558288574, + "learning_rate": 3.4732042694115265e-06, + "loss": 0.1905, + "step": 4652 + }, + { + "epoch": 2.203255401006215, + "grad_norm": 1.173947811126709, + "learning_rate": 3.4693305054615957e-06, + "loss": 0.1928, + "step": 4653 + }, + { + "epoch": 2.203728913879846, + "grad_norm": 1.178647518157959, + "learning_rate": 3.4654584495452716e-06, + "loss": 0.1898, + "step": 4654 + }, + { + "epoch": 2.2042024267534774, + "grad_norm": 1.0134490728378296, + "learning_rate": 3.4615881026752473e-06, + "loss": 0.2047, + "step": 4655 + }, + { + "epoch": 2.2046759396271085, + "grad_norm": 2.0668044090270996, + "learning_rate": 3.4577194658637815e-06, + "loss": 0.2035, + "step": 4656 + }, + { + "epoch": 2.20514945250074, + "grad_norm": 1.4281859397888184, + "learning_rate": 3.4538525401226697e-06, + "loss": 0.1914, + "step": 4657 + }, + { + "epoch": 2.205622965374371, + "grad_norm": 1.2419672012329102, + "learning_rate": 3.4499873264632787e-06, + "loss": 0.2086, + "step": 4658 + }, + { + "epoch": 2.2060964782480026, + "grad_norm": 1.1350500583648682, + "learning_rate": 3.44612382589651e-06, + "loss": 0.1957, + "step": 4659 + }, + { + "epoch": 2.2065699911216337, + "grad_norm": 1.8303426504135132, + "learning_rate": 3.4422620394328322e-06, + "loss": 0.1979, + "step": 4660 + }, + { + "epoch": 2.2070435039952647, + "grad_norm": 1.1737298965454102, + "learning_rate": 3.438401968082253e-06, + "loss": 0.1961, + "step": 4661 + }, + { + "epoch": 2.2075170168688962, + "grad_norm": 1.198386311531067, + "learning_rate": 3.434543612854336e-06, + "loss": 0.2124, + "step": 4662 + }, + { + "epoch": 2.2079905297425273, + "grad_norm": 1.2552683353424072, + "learning_rate": 3.4306869747582016e-06, + "loss": 0.1846, + "step": 4663 + }, + { + "epoch": 2.208464042616159, + "grad_norm": 0.9571728110313416, + "learning_rate": 3.426832054802511e-06, + "loss": 0.1961, + "step": 4664 + }, + { + "epoch": 2.20893755548979, + "grad_norm": 1.5007789134979248, + "learning_rate": 3.422978853995487e-06, + "loss": 0.2061, + "step": 4665 + }, + { + "epoch": 2.209411068363421, + "grad_norm": 1.0542176961898804, + "learning_rate": 3.4191273733448916e-06, + "loss": 0.2199, + "step": 4666 + }, + { + "epoch": 2.2098845812370524, + "grad_norm": 1.1434391736984253, + "learning_rate": 3.4152776138580466e-06, + "loss": 0.2185, + "step": 4667 + }, + { + "epoch": 2.2103580941106835, + "grad_norm": 1.2754579782485962, + "learning_rate": 3.411429576541815e-06, + "loss": 0.2134, + "step": 4668 + }, + { + "epoch": 2.210831606984315, + "grad_norm": 0.9875036478042603, + "learning_rate": 3.4075832624026204e-06, + "loss": 0.1884, + "step": 4669 + }, + { + "epoch": 2.211305119857946, + "grad_norm": 1.2512601613998413, + "learning_rate": 3.403738672446425e-06, + "loss": 0.1965, + "step": 4670 + }, + { + "epoch": 2.211778632731577, + "grad_norm": 1.0288127660751343, + "learning_rate": 3.3998958076787415e-06, + "loss": 0.1922, + "step": 4671 + }, + { + "epoch": 2.2122521456052087, + "grad_norm": 1.011583924293518, + "learning_rate": 3.3960546691046405e-06, + "loss": 0.1961, + "step": 4672 + }, + { + "epoch": 2.2127256584788397, + "grad_norm": 1.268002986907959, + "learning_rate": 3.3922152577287284e-06, + "loss": 0.183, + "step": 4673 + }, + { + "epoch": 2.2131991713524712, + "grad_norm": 1.4242584705352783, + "learning_rate": 3.388377574555172e-06, + "loss": 0.2272, + "step": 4674 + }, + { + "epoch": 2.2136726842261023, + "grad_norm": 1.1874171495437622, + "learning_rate": 3.3845416205876737e-06, + "loss": 0.2049, + "step": 4675 + }, + { + "epoch": 2.214146197099734, + "grad_norm": 0.9578942060470581, + "learning_rate": 3.380707396829498e-06, + "loss": 0.2171, + "step": 4676 + }, + { + "epoch": 2.214619709973365, + "grad_norm": 1.165828824043274, + "learning_rate": 3.3768749042834416e-06, + "loss": 0.2164, + "step": 4677 + }, + { + "epoch": 2.215093222846996, + "grad_norm": 1.2874082326889038, + "learning_rate": 3.3730441439518637e-06, + "loss": 0.1935, + "step": 4678 + }, + { + "epoch": 2.2155667357206275, + "grad_norm": 1.277848720550537, + "learning_rate": 3.3692151168366573e-06, + "loss": 0.1781, + "step": 4679 + }, + { + "epoch": 2.2160402485942585, + "grad_norm": 2.349771499633789, + "learning_rate": 3.3653878239392668e-06, + "loss": 0.2103, + "step": 4680 + }, + { + "epoch": 2.21651376146789, + "grad_norm": 1.3279833793640137, + "learning_rate": 3.3615622662606852e-06, + "loss": 0.2103, + "step": 4681 + }, + { + "epoch": 2.216987274341521, + "grad_norm": 1.032465934753418, + "learning_rate": 3.357738444801449e-06, + "loss": 0.1824, + "step": 4682 + }, + { + "epoch": 2.2174607872151526, + "grad_norm": 1.4926496744155884, + "learning_rate": 3.35391636056165e-06, + "loss": 0.1904, + "step": 4683 + }, + { + "epoch": 2.2179343000887837, + "grad_norm": 0.948499858379364, + "learning_rate": 3.350096014540909e-06, + "loss": 0.1849, + "step": 4684 + }, + { + "epoch": 2.2184078129624147, + "grad_norm": 1.0163851976394653, + "learning_rate": 3.3462774077383996e-06, + "loss": 0.209, + "step": 4685 + }, + { + "epoch": 2.2188813258360462, + "grad_norm": 1.0877399444580078, + "learning_rate": 3.3424605411528476e-06, + "loss": 0.2033, + "step": 4686 + }, + { + "epoch": 2.2193548387096773, + "grad_norm": 1.5267970561981201, + "learning_rate": 3.338645415782512e-06, + "loss": 0.207, + "step": 4687 + }, + { + "epoch": 2.219828351583309, + "grad_norm": 1.1469260454177856, + "learning_rate": 3.334832032625208e-06, + "loss": 0.1791, + "step": 4688 + }, + { + "epoch": 2.22030186445694, + "grad_norm": 1.130159854888916, + "learning_rate": 3.3310203926782826e-06, + "loss": 0.1979, + "step": 4689 + }, + { + "epoch": 2.220775377330571, + "grad_norm": 0.9808071851730347, + "learning_rate": 3.3272104969386388e-06, + "loss": 0.178, + "step": 4690 + }, + { + "epoch": 2.2212488902042025, + "grad_norm": 1.1407606601715088, + "learning_rate": 3.3234023464027143e-06, + "loss": 0.1997, + "step": 4691 + }, + { + "epoch": 2.2217224030778335, + "grad_norm": 1.0201860666275024, + "learning_rate": 3.319595942066498e-06, + "loss": 0.2017, + "step": 4692 + }, + { + "epoch": 2.222195915951465, + "grad_norm": 1.1310131549835205, + "learning_rate": 3.3157912849255137e-06, + "loss": 0.2015, + "step": 4693 + }, + { + "epoch": 2.222669428825096, + "grad_norm": 1.1622865200042725, + "learning_rate": 3.311988375974837e-06, + "loss": 0.2173, + "step": 4694 + }, + { + "epoch": 2.2231429416987276, + "grad_norm": 1.5009393692016602, + "learning_rate": 3.308187216209082e-06, + "loss": 0.2147, + "step": 4695 + }, + { + "epoch": 2.2236164545723587, + "grad_norm": 1.1541341543197632, + "learning_rate": 3.304387806622399e-06, + "loss": 0.1975, + "step": 4696 + }, + { + "epoch": 2.2240899674459897, + "grad_norm": 1.204338788986206, + "learning_rate": 3.3005901482084947e-06, + "loss": 0.2123, + "step": 4697 + }, + { + "epoch": 2.2245634803196213, + "grad_norm": 1.4117945432662964, + "learning_rate": 3.2967942419606045e-06, + "loss": 0.2167, + "step": 4698 + }, + { + "epoch": 2.2250369931932523, + "grad_norm": 1.0476959943771362, + "learning_rate": 3.293000088871515e-06, + "loss": 0.2071, + "step": 4699 + }, + { + "epoch": 2.225510506066884, + "grad_norm": 1.0269898176193237, + "learning_rate": 3.289207689933547e-06, + "loss": 0.1831, + "step": 4700 + }, + { + "epoch": 2.225984018940515, + "grad_norm": 1.0802260637283325, + "learning_rate": 3.2854170461385705e-06, + "loss": 0.2284, + "step": 4701 + }, + { + "epoch": 2.2264575318141464, + "grad_norm": 1.3022472858428955, + "learning_rate": 3.2816281584779853e-06, + "loss": 0.2124, + "step": 4702 + }, + { + "epoch": 2.2269310446877775, + "grad_norm": 1.3747221231460571, + "learning_rate": 3.277841027942745e-06, + "loss": 0.203, + "step": 4703 + }, + { + "epoch": 2.2274045575614085, + "grad_norm": 1.0478261709213257, + "learning_rate": 3.274055655523335e-06, + "loss": 0.1948, + "step": 4704 + }, + { + "epoch": 2.22787807043504, + "grad_norm": 1.0842070579528809, + "learning_rate": 3.2702720422097777e-06, + "loss": 0.1837, + "step": 4705 + }, + { + "epoch": 2.228351583308671, + "grad_norm": 1.1079217195510864, + "learning_rate": 3.2664901889916477e-06, + "loss": 0.1974, + "step": 4706 + }, + { + "epoch": 2.2288250961823026, + "grad_norm": 1.3619805574417114, + "learning_rate": 3.2627100968580472e-06, + "loss": 0.1895, + "step": 4707 + }, + { + "epoch": 2.2292986090559337, + "grad_norm": 1.1514424085617065, + "learning_rate": 3.2589317667976286e-06, + "loss": 0.1885, + "step": 4708 + }, + { + "epoch": 2.2297721219295648, + "grad_norm": 1.3855830430984497, + "learning_rate": 3.25515519979857e-06, + "loss": 0.1978, + "step": 4709 + }, + { + "epoch": 2.2302456348031963, + "grad_norm": 1.361132025718689, + "learning_rate": 3.2513803968486037e-06, + "loss": 0.201, + "step": 4710 + }, + { + "epoch": 2.2307191476768273, + "grad_norm": 1.0529563426971436, + "learning_rate": 3.2476073589349866e-06, + "loss": 0.2019, + "step": 4711 + }, + { + "epoch": 2.231192660550459, + "grad_norm": 1.162382960319519, + "learning_rate": 3.2438360870445263e-06, + "loss": 0.2119, + "step": 4712 + }, + { + "epoch": 2.23166617342409, + "grad_norm": 0.9659311175346375, + "learning_rate": 3.2400665821635568e-06, + "loss": 0.2018, + "step": 4713 + }, + { + "epoch": 2.2321396862977214, + "grad_norm": 1.2184617519378662, + "learning_rate": 3.236298845277961e-06, + "loss": 0.2146, + "step": 4714 + }, + { + "epoch": 2.2326131991713525, + "grad_norm": 1.6080515384674072, + "learning_rate": 3.2325328773731524e-06, + "loss": 0.2, + "step": 4715 + }, + { + "epoch": 2.2330867120449835, + "grad_norm": 1.5760236978530884, + "learning_rate": 3.228768679434079e-06, + "loss": 0.2092, + "step": 4716 + }, + { + "epoch": 2.233560224918615, + "grad_norm": 1.4468612670898438, + "learning_rate": 3.2250062524452376e-06, + "loss": 0.2187, + "step": 4717 + }, + { + "epoch": 2.234033737792246, + "grad_norm": 1.2942707538604736, + "learning_rate": 3.2212455973906477e-06, + "loss": 0.1937, + "step": 4718 + }, + { + "epoch": 2.2345072506658776, + "grad_norm": 1.237175703048706, + "learning_rate": 3.2174867152538802e-06, + "loss": 0.213, + "step": 4719 + }, + { + "epoch": 2.2349807635395087, + "grad_norm": 1.146020770072937, + "learning_rate": 3.213729607018026e-06, + "loss": 0.2081, + "step": 4720 + }, + { + "epoch": 2.23545427641314, + "grad_norm": 1.1109809875488281, + "learning_rate": 3.209974273665726e-06, + "loss": 0.2109, + "step": 4721 + }, + { + "epoch": 2.2359277892867713, + "grad_norm": 1.0821495056152344, + "learning_rate": 3.2062207161791526e-06, + "loss": 0.2124, + "step": 4722 + }, + { + "epoch": 2.2364013021604023, + "grad_norm": 1.1807968616485596, + "learning_rate": 3.2024689355400063e-06, + "loss": 0.2362, + "step": 4723 + }, + { + "epoch": 2.236874815034034, + "grad_norm": 1.4384002685546875, + "learning_rate": 3.1987189327295377e-06, + "loss": 0.1825, + "step": 4724 + }, + { + "epoch": 2.237348327907665, + "grad_norm": 1.1353758573532104, + "learning_rate": 3.1949707087285144e-06, + "loss": 0.2115, + "step": 4725 + }, + { + "epoch": 2.2378218407812964, + "grad_norm": 2.6835150718688965, + "learning_rate": 3.1912242645172576e-06, + "loss": 0.2054, + "step": 4726 + }, + { + "epoch": 2.2382953536549275, + "grad_norm": 0.9744576811790466, + "learning_rate": 3.187479601075605e-06, + "loss": 0.1937, + "step": 4727 + }, + { + "epoch": 2.2387688665285586, + "grad_norm": 1.31288480758667, + "learning_rate": 3.183736719382944e-06, + "loss": 0.2199, + "step": 4728 + }, + { + "epoch": 2.23924237940219, + "grad_norm": 0.9159910678863525, + "learning_rate": 3.179995620418187e-06, + "loss": 0.2009, + "step": 4729 + }, + { + "epoch": 2.239715892275821, + "grad_norm": 1.6688588857650757, + "learning_rate": 3.176256305159778e-06, + "loss": 0.1832, + "step": 4730 + }, + { + "epoch": 2.2401894051494526, + "grad_norm": 0.9921554923057556, + "learning_rate": 3.1725187745857066e-06, + "loss": 0.2143, + "step": 4731 + }, + { + "epoch": 2.2406629180230837, + "grad_norm": 0.9964454770088196, + "learning_rate": 3.16878302967348e-06, + "loss": 0.1916, + "step": 4732 + }, + { + "epoch": 2.2411364308967148, + "grad_norm": 1.3738844394683838, + "learning_rate": 3.1650490714001536e-06, + "loss": 0.181, + "step": 4733 + }, + { + "epoch": 2.2416099437703463, + "grad_norm": 1.0650712251663208, + "learning_rate": 3.1613169007423016e-06, + "loss": 0.1985, + "step": 4734 + }, + { + "epoch": 2.2420834566439773, + "grad_norm": 0.9678623080253601, + "learning_rate": 3.1575865186760425e-06, + "loss": 0.1875, + "step": 4735 + }, + { + "epoch": 2.242556969517609, + "grad_norm": 1.2123451232910156, + "learning_rate": 3.1538579261770177e-06, + "loss": 0.2132, + "step": 4736 + }, + { + "epoch": 2.24303048239124, + "grad_norm": 1.060880422592163, + "learning_rate": 3.150131124220408e-06, + "loss": 0.1999, + "step": 4737 + }, + { + "epoch": 2.2435039952648714, + "grad_norm": 1.4341163635253906, + "learning_rate": 3.1464061137809187e-06, + "loss": 0.209, + "step": 4738 + }, + { + "epoch": 2.2439775081385025, + "grad_norm": 1.1810160875320435, + "learning_rate": 3.142682895832796e-06, + "loss": 0.2031, + "step": 4739 + }, + { + "epoch": 2.2444510210121336, + "grad_norm": 1.1386125087738037, + "learning_rate": 3.1389614713498073e-06, + "loss": 0.1958, + "step": 4740 + }, + { + "epoch": 2.244924533885765, + "grad_norm": 1.4858670234680176, + "learning_rate": 3.1352418413052543e-06, + "loss": 0.1942, + "step": 4741 + }, + { + "epoch": 2.245398046759396, + "grad_norm": 1.1537786722183228, + "learning_rate": 3.131524006671974e-06, + "loss": 0.1929, + "step": 4742 + }, + { + "epoch": 2.2458715596330276, + "grad_norm": 1.0491985082626343, + "learning_rate": 3.127807968422326e-06, + "loss": 0.2021, + "step": 4743 + }, + { + "epoch": 2.2463450725066587, + "grad_norm": 1.1256279945373535, + "learning_rate": 3.1240937275282103e-06, + "loss": 0.2091, + "step": 4744 + }, + { + "epoch": 2.2468185853802902, + "grad_norm": 1.0205121040344238, + "learning_rate": 3.120381284961043e-06, + "loss": 0.2042, + "step": 4745 + }, + { + "epoch": 2.2472920982539213, + "grad_norm": 1.3747820854187012, + "learning_rate": 3.116670641691785e-06, + "loss": 0.2037, + "step": 4746 + }, + { + "epoch": 2.2477656111275524, + "grad_norm": 1.127557396888733, + "learning_rate": 3.112961798690913e-06, + "loss": 0.2081, + "step": 4747 + }, + { + "epoch": 2.248239124001184, + "grad_norm": 1.0771230459213257, + "learning_rate": 3.109254756928445e-06, + "loss": 0.1914, + "step": 4748 + }, + { + "epoch": 2.248712636874815, + "grad_norm": 1.6226811408996582, + "learning_rate": 3.105549517373919e-06, + "loss": 0.2036, + "step": 4749 + }, + { + "epoch": 2.2491861497484464, + "grad_norm": 1.0237318277359009, + "learning_rate": 3.1018460809964025e-06, + "loss": 0.2105, + "step": 4750 + }, + { + "epoch": 2.2496596626220775, + "grad_norm": 1.0179623365402222, + "learning_rate": 3.0981444487644984e-06, + "loss": 0.2184, + "step": 4751 + }, + { + "epoch": 2.2501331754957086, + "grad_norm": 1.6776036024093628, + "learning_rate": 3.0944446216463276e-06, + "loss": 0.2038, + "step": 4752 + }, + { + "epoch": 2.25060668836934, + "grad_norm": 1.2936748266220093, + "learning_rate": 3.09074660060955e-06, + "loss": 0.2189, + "step": 4753 + }, + { + "epoch": 2.251080201242971, + "grad_norm": 0.9462767839431763, + "learning_rate": 3.087050386621341e-06, + "loss": 0.2057, + "step": 4754 + }, + { + "epoch": 2.2515537141166027, + "grad_norm": 1.1547857522964478, + "learning_rate": 3.083355980648416e-06, + "loss": 0.2095, + "step": 4755 + }, + { + "epoch": 2.2520272269902337, + "grad_norm": 2.062722682952881, + "learning_rate": 3.0796633836570055e-06, + "loss": 0.1905, + "step": 4756 + }, + { + "epoch": 2.2525007398638652, + "grad_norm": 1.355178952217102, + "learning_rate": 3.0759725966128774e-06, + "loss": 0.2056, + "step": 4757 + }, + { + "epoch": 2.2529742527374963, + "grad_norm": 1.0428045988082886, + "learning_rate": 3.072283620481321e-06, + "loss": 0.1885, + "step": 4758 + }, + { + "epoch": 2.2534477656111274, + "grad_norm": 1.1466599702835083, + "learning_rate": 3.068596456227143e-06, + "loss": 0.2027, + "step": 4759 + }, + { + "epoch": 2.253921278484759, + "grad_norm": 1.3236037492752075, + "learning_rate": 3.0649111048147006e-06, + "loss": 0.2144, + "step": 4760 + }, + { + "epoch": 2.25439479135839, + "grad_norm": 1.5951242446899414, + "learning_rate": 3.061227567207852e-06, + "loss": 0.1973, + "step": 4761 + }, + { + "epoch": 2.2548683042320214, + "grad_norm": 1.3796095848083496, + "learning_rate": 3.0575458443699957e-06, + "loss": 0.1869, + "step": 4762 + }, + { + "epoch": 2.2553418171056525, + "grad_norm": 1.0382217168807983, + "learning_rate": 3.053865937264049e-06, + "loss": 0.1942, + "step": 4763 + }, + { + "epoch": 2.255815329979284, + "grad_norm": 0.9059621095657349, + "learning_rate": 3.0501878468524525e-06, + "loss": 0.1961, + "step": 4764 + }, + { + "epoch": 2.256288842852915, + "grad_norm": 1.007012963294983, + "learning_rate": 3.046511574097183e-06, + "loss": 0.2033, + "step": 4765 + }, + { + "epoch": 2.256762355726546, + "grad_norm": 0.9761565327644348, + "learning_rate": 3.042837119959726e-06, + "loss": 0.2006, + "step": 4766 + }, + { + "epoch": 2.2572358686001777, + "grad_norm": 1.1975845098495483, + "learning_rate": 3.039164485401106e-06, + "loss": 0.2131, + "step": 4767 + }, + { + "epoch": 2.2577093814738087, + "grad_norm": 1.2189717292785645, + "learning_rate": 3.0354936713818594e-06, + "loss": 0.1828, + "step": 4768 + }, + { + "epoch": 2.2581828943474402, + "grad_norm": 1.35395085811615, + "learning_rate": 3.0318246788620588e-06, + "loss": 0.1928, + "step": 4769 + }, + { + "epoch": 2.2586564072210713, + "grad_norm": 0.9459072947502136, + "learning_rate": 3.028157508801287e-06, + "loss": 0.2134, + "step": 4770 + }, + { + "epoch": 2.2591299200947024, + "grad_norm": 1.2507688999176025, + "learning_rate": 3.0244921621586643e-06, + "loss": 0.207, + "step": 4771 + }, + { + "epoch": 2.259603432968334, + "grad_norm": 1.5289151668548584, + "learning_rate": 3.020828639892818e-06, + "loss": 0.2006, + "step": 4772 + }, + { + "epoch": 2.260076945841965, + "grad_norm": 1.1962858438491821, + "learning_rate": 3.0171669429619154e-06, + "loss": 0.2226, + "step": 4773 + }, + { + "epoch": 2.2605504587155965, + "grad_norm": 1.2389626502990723, + "learning_rate": 3.0135070723236346e-06, + "loss": 0.2018, + "step": 4774 + }, + { + "epoch": 2.2610239715892275, + "grad_norm": 1.1596938371658325, + "learning_rate": 3.0098490289351756e-06, + "loss": 0.1978, + "step": 4775 + }, + { + "epoch": 2.2614974844628586, + "grad_norm": 1.206876516342163, + "learning_rate": 3.0061928137532713e-06, + "loss": 0.1913, + "step": 4776 + }, + { + "epoch": 2.26197099733649, + "grad_norm": 1.2051137685775757, + "learning_rate": 3.002538427734163e-06, + "loss": 0.2041, + "step": 4777 + }, + { + "epoch": 2.262444510210121, + "grad_norm": 0.8974770903587341, + "learning_rate": 2.9988858718336256e-06, + "loss": 0.2054, + "step": 4778 + }, + { + "epoch": 2.2629180230837527, + "grad_norm": 1.372613787651062, + "learning_rate": 2.995235147006945e-06, + "loss": 0.1953, + "step": 4779 + }, + { + "epoch": 2.2633915359573837, + "grad_norm": 1.115798830986023, + "learning_rate": 2.991586254208939e-06, + "loss": 0.213, + "step": 4780 + }, + { + "epoch": 2.2638650488310152, + "grad_norm": 1.4372373819351196, + "learning_rate": 2.987939194393933e-06, + "loss": 0.2302, + "step": 4781 + }, + { + "epoch": 2.2643385617046463, + "grad_norm": 0.9313597679138184, + "learning_rate": 2.984293968515788e-06, + "loss": 0.2021, + "step": 4782 + }, + { + "epoch": 2.264812074578278, + "grad_norm": 1.0332454442977905, + "learning_rate": 2.9806505775278738e-06, + "loss": 0.1874, + "step": 4783 + }, + { + "epoch": 2.265285587451909, + "grad_norm": 1.0937074422836304, + "learning_rate": 2.9770090223830803e-06, + "loss": 0.199, + "step": 4784 + }, + { + "epoch": 2.26575910032554, + "grad_norm": 1.022531509399414, + "learning_rate": 2.9733693040338286e-06, + "loss": 0.1963, + "step": 4785 + }, + { + "epoch": 2.2662326131991715, + "grad_norm": 1.1225355863571167, + "learning_rate": 2.969731423432045e-06, + "loss": 0.2178, + "step": 4786 + }, + { + "epoch": 2.2667061260728025, + "grad_norm": 1.8823810815811157, + "learning_rate": 2.9660953815291893e-06, + "loss": 0.2103, + "step": 4787 + }, + { + "epoch": 2.267179638946434, + "grad_norm": 1.7231241464614868, + "learning_rate": 2.962461179276225e-06, + "loss": 0.1777, + "step": 4788 + }, + { + "epoch": 2.267653151820065, + "grad_norm": 1.254069447517395, + "learning_rate": 2.9588288176236502e-06, + "loss": 0.2163, + "step": 4789 + }, + { + "epoch": 2.268126664693696, + "grad_norm": 1.0961583852767944, + "learning_rate": 2.955198297521469e-06, + "loss": 0.2097, + "step": 4790 + }, + { + "epoch": 2.2686001775673277, + "grad_norm": 1.0620346069335938, + "learning_rate": 2.9515696199192123e-06, + "loss": 0.1878, + "step": 4791 + }, + { + "epoch": 2.2690736904409587, + "grad_norm": 1.0803302526474, + "learning_rate": 2.9479427857659213e-06, + "loss": 0.2128, + "step": 4792 + }, + { + "epoch": 2.2695472033145903, + "grad_norm": 1.3696365356445312, + "learning_rate": 2.9443177960101653e-06, + "loss": 0.2047, + "step": 4793 + }, + { + "epoch": 2.2700207161882213, + "grad_norm": 1.0663868188858032, + "learning_rate": 2.9406946516000236e-06, + "loss": 0.1987, + "step": 4794 + }, + { + "epoch": 2.2704942290618524, + "grad_norm": 1.149569034576416, + "learning_rate": 2.9370733534830887e-06, + "loss": 0.1871, + "step": 4795 + }, + { + "epoch": 2.270967741935484, + "grad_norm": 1.892318606376648, + "learning_rate": 2.9334539026064847e-06, + "loss": 0.1883, + "step": 4796 + }, + { + "epoch": 2.271441254809115, + "grad_norm": 1.3495607376098633, + "learning_rate": 2.9298362999168373e-06, + "loss": 0.2193, + "step": 4797 + }, + { + "epoch": 2.2719147676827465, + "grad_norm": 1.4502182006835938, + "learning_rate": 2.926220546360299e-06, + "loss": 0.1985, + "step": 4798 + }, + { + "epoch": 2.2723882805563775, + "grad_norm": 1.812647819519043, + "learning_rate": 2.922606642882537e-06, + "loss": 0.2017, + "step": 4799 + }, + { + "epoch": 2.272861793430009, + "grad_norm": 1.912377953529358, + "learning_rate": 2.9189945904287287e-06, + "loss": 0.2218, + "step": 4800 + }, + { + "epoch": 2.27333530630364, + "grad_norm": 1.0828890800476074, + "learning_rate": 2.915384389943576e-06, + "loss": 0.1887, + "step": 4801 + }, + { + "epoch": 2.2738088191772716, + "grad_norm": 1.1113115549087524, + "learning_rate": 2.911776042371286e-06, + "loss": 0.1963, + "step": 4802 + }, + { + "epoch": 2.2742823320509027, + "grad_norm": 1.5018105506896973, + "learning_rate": 2.908169548655595e-06, + "loss": 0.1994, + "step": 4803 + }, + { + "epoch": 2.2747558449245338, + "grad_norm": 1.3521610498428345, + "learning_rate": 2.9045649097397386e-06, + "loss": 0.2026, + "step": 4804 + }, + { + "epoch": 2.2752293577981653, + "grad_norm": 1.7627965211868286, + "learning_rate": 2.9009621265664832e-06, + "loss": 0.1797, + "step": 4805 + }, + { + "epoch": 2.2757028706717963, + "grad_norm": 1.138641357421875, + "learning_rate": 2.8973612000780937e-06, + "loss": 0.2053, + "step": 4806 + }, + { + "epoch": 2.276176383545428, + "grad_norm": 1.8081318140029907, + "learning_rate": 2.8937621312163653e-06, + "loss": 0.1976, + "step": 4807 + }, + { + "epoch": 2.276649896419059, + "grad_norm": 1.7190033197402954, + "learning_rate": 2.890164920922597e-06, + "loss": 0.2069, + "step": 4808 + }, + { + "epoch": 2.27712340929269, + "grad_norm": 1.0892248153686523, + "learning_rate": 2.8865695701376005e-06, + "loss": 0.2048, + "step": 4809 + }, + { + "epoch": 2.2775969221663215, + "grad_norm": 1.098090648651123, + "learning_rate": 2.8829760798017115e-06, + "loss": 0.193, + "step": 4810 + }, + { + "epoch": 2.2780704350399525, + "grad_norm": 1.2340128421783447, + "learning_rate": 2.8793844508547664e-06, + "loss": 0.1822, + "step": 4811 + }, + { + "epoch": 2.278543947913584, + "grad_norm": 0.9539898037910461, + "learning_rate": 2.875794684236127e-06, + "loss": 0.1829, + "step": 4812 + }, + { + "epoch": 2.279017460787215, + "grad_norm": 1.3292118310928345, + "learning_rate": 2.8722067808846575e-06, + "loss": 0.2284, + "step": 4813 + }, + { + "epoch": 2.279490973660846, + "grad_norm": 1.208977460861206, + "learning_rate": 2.8686207417387446e-06, + "loss": 0.1792, + "step": 4814 + }, + { + "epoch": 2.2799644865344777, + "grad_norm": 1.084511637687683, + "learning_rate": 2.8650365677362756e-06, + "loss": 0.1716, + "step": 4815 + }, + { + "epoch": 2.2804379994081088, + "grad_norm": 1.649825930595398, + "learning_rate": 2.861454259814662e-06, + "loss": 0.2078, + "step": 4816 + }, + { + "epoch": 2.2809115122817403, + "grad_norm": 1.3843883275985718, + "learning_rate": 2.857873818910821e-06, + "loss": 0.219, + "step": 4817 + }, + { + "epoch": 2.2813850251553713, + "grad_norm": 1.0740282535552979, + "learning_rate": 2.854295245961178e-06, + "loss": 0.1984, + "step": 4818 + }, + { + "epoch": 2.281858538029003, + "grad_norm": 1.2983367443084717, + "learning_rate": 2.8507185419016813e-06, + "loss": 0.213, + "step": 4819 + }, + { + "epoch": 2.282332050902634, + "grad_norm": 1.1610127687454224, + "learning_rate": 2.8471437076677767e-06, + "loss": 0.2434, + "step": 4820 + }, + { + "epoch": 2.282805563776265, + "grad_norm": 1.033072829246521, + "learning_rate": 2.8435707441944325e-06, + "loss": 0.195, + "step": 4821 + }, + { + "epoch": 2.2832790766498965, + "grad_norm": 1.0944322347640991, + "learning_rate": 2.839999652416119e-06, + "loss": 0.2027, + "step": 4822 + }, + { + "epoch": 2.2837525895235276, + "grad_norm": 1.1411755084991455, + "learning_rate": 2.8364304332668245e-06, + "loss": 0.2242, + "step": 4823 + }, + { + "epoch": 2.284226102397159, + "grad_norm": 1.6203806400299072, + "learning_rate": 2.83286308768004e-06, + "loss": 0.2057, + "step": 4824 + }, + { + "epoch": 2.28469961527079, + "grad_norm": 1.048741102218628, + "learning_rate": 2.829297616588775e-06, + "loss": 0.2284, + "step": 4825 + }, + { + "epoch": 2.2851731281444216, + "grad_norm": 1.2415732145309448, + "learning_rate": 2.825734020925538e-06, + "loss": 0.1923, + "step": 4826 + }, + { + "epoch": 2.2856466410180527, + "grad_norm": 1.5212392807006836, + "learning_rate": 2.82217230162236e-06, + "loss": 0.2163, + "step": 4827 + }, + { + "epoch": 2.2861201538916838, + "grad_norm": 1.1868995428085327, + "learning_rate": 2.8186124596107713e-06, + "loss": 0.201, + "step": 4828 + }, + { + "epoch": 2.2865936667653153, + "grad_norm": 1.1392196416854858, + "learning_rate": 2.8150544958218097e-06, + "loss": 0.2098, + "step": 4829 + }, + { + "epoch": 2.2870671796389463, + "grad_norm": 1.0334511995315552, + "learning_rate": 2.8114984111860334e-06, + "loss": 0.2125, + "step": 4830 + }, + { + "epoch": 2.287540692512578, + "grad_norm": 1.3680204153060913, + "learning_rate": 2.8079442066334963e-06, + "loss": 0.1946, + "step": 4831 + }, + { + "epoch": 2.288014205386209, + "grad_norm": 1.040935754776001, + "learning_rate": 2.8043918830937713e-06, + "loss": 0.1835, + "step": 4832 + }, + { + "epoch": 2.28848771825984, + "grad_norm": 0.9355210661888123, + "learning_rate": 2.8008414414959295e-06, + "loss": 0.1898, + "step": 4833 + }, + { + "epoch": 2.2889612311334715, + "grad_norm": 1.3512656688690186, + "learning_rate": 2.7972928827685597e-06, + "loss": 0.1793, + "step": 4834 + }, + { + "epoch": 2.2894347440071026, + "grad_norm": 1.103294849395752, + "learning_rate": 2.793746207839748e-06, + "loss": 0.2012, + "step": 4835 + }, + { + "epoch": 2.289908256880734, + "grad_norm": 1.275721549987793, + "learning_rate": 2.7902014176370996e-06, + "loss": 0.2176, + "step": 4836 + }, + { + "epoch": 2.290381769754365, + "grad_norm": 1.1361020803451538, + "learning_rate": 2.786658513087712e-06, + "loss": 0.2, + "step": 4837 + }, + { + "epoch": 2.290855282627996, + "grad_norm": 1.2212600708007812, + "learning_rate": 2.7831174951182027e-06, + "loss": 0.2056, + "step": 4838 + }, + { + "epoch": 2.2913287955016277, + "grad_norm": 1.6602025032043457, + "learning_rate": 2.7795783646546924e-06, + "loss": 0.2165, + "step": 4839 + }, + { + "epoch": 2.291802308375259, + "grad_norm": 1.3278297185897827, + "learning_rate": 2.7760411226228022e-06, + "loss": 0.193, + "step": 4840 + }, + { + "epoch": 2.2922758212488903, + "grad_norm": 1.0122475624084473, + "learning_rate": 2.7725057699476674e-06, + "loss": 0.1895, + "step": 4841 + }, + { + "epoch": 2.2927493341225214, + "grad_norm": 1.3454642295837402, + "learning_rate": 2.7689723075539245e-06, + "loss": 0.1943, + "step": 4842 + }, + { + "epoch": 2.293222846996153, + "grad_norm": 1.102555274963379, + "learning_rate": 2.765440736365712e-06, + "loss": 0.2111, + "step": 4843 + }, + { + "epoch": 2.293696359869784, + "grad_norm": 1.9383556842803955, + "learning_rate": 2.7619110573066856e-06, + "loss": 0.1992, + "step": 4844 + }, + { + "epoch": 2.2941698727434154, + "grad_norm": 1.0931776762008667, + "learning_rate": 2.7583832712999912e-06, + "loss": 0.2254, + "step": 4845 + }, + { + "epoch": 2.2946433856170465, + "grad_norm": 1.0424646139144897, + "learning_rate": 2.754857379268294e-06, + "loss": 0.2012, + "step": 4846 + }, + { + "epoch": 2.2951168984906776, + "grad_norm": 1.0645688772201538, + "learning_rate": 2.7513333821337516e-06, + "loss": 0.1864, + "step": 4847 + }, + { + "epoch": 2.295590411364309, + "grad_norm": 1.6237589120864868, + "learning_rate": 2.7478112808180378e-06, + "loss": 0.175, + "step": 4848 + }, + { + "epoch": 2.29606392423794, + "grad_norm": 1.3338507413864136, + "learning_rate": 2.744291076242317e-06, + "loss": 0.1883, + "step": 4849 + }, + { + "epoch": 2.2965374371115717, + "grad_norm": 2.062040328979492, + "learning_rate": 2.74077276932727e-06, + "loss": 0.1894, + "step": 4850 + }, + { + "epoch": 2.2970109499852027, + "grad_norm": 1.056857705116272, + "learning_rate": 2.7372563609930726e-06, + "loss": 0.1875, + "step": 4851 + }, + { + "epoch": 2.297484462858834, + "grad_norm": 1.0261149406433105, + "learning_rate": 2.7337418521594107e-06, + "loss": 0.19, + "step": 4852 + }, + { + "epoch": 2.2979579757324653, + "grad_norm": 1.3251570463180542, + "learning_rate": 2.730229243745469e-06, + "loss": 0.215, + "step": 4853 + }, + { + "epoch": 2.2984314886060964, + "grad_norm": 0.9587031602859497, + "learning_rate": 2.726718536669933e-06, + "loss": 0.2052, + "step": 4854 + }, + { + "epoch": 2.298905001479728, + "grad_norm": 0.987009584903717, + "learning_rate": 2.7232097318510007e-06, + "loss": 0.1793, + "step": 4855 + }, + { + "epoch": 2.299378514353359, + "grad_norm": 1.4788868427276611, + "learning_rate": 2.7197028302063587e-06, + "loss": 0.2226, + "step": 4856 + }, + { + "epoch": 2.29985202722699, + "grad_norm": 1.2045164108276367, + "learning_rate": 2.716197832653211e-06, + "loss": 0.1932, + "step": 4857 + }, + { + "epoch": 2.3003255401006215, + "grad_norm": 1.0435082912445068, + "learning_rate": 2.7126947401082494e-06, + "loss": 0.2083, + "step": 4858 + }, + { + "epoch": 2.3007990529742526, + "grad_norm": 1.5519726276397705, + "learning_rate": 2.709193553487679e-06, + "loss": 0.2008, + "step": 4859 + }, + { + "epoch": 2.301272565847884, + "grad_norm": 1.805208444595337, + "learning_rate": 2.7056942737071955e-06, + "loss": 0.2021, + "step": 4860 + }, + { + "epoch": 2.301746078721515, + "grad_norm": 1.2867292165756226, + "learning_rate": 2.702196901682009e-06, + "loss": 0.1852, + "step": 4861 + }, + { + "epoch": 2.3022195915951467, + "grad_norm": 1.190337061882019, + "learning_rate": 2.6987014383268196e-06, + "loss": 0.1956, + "step": 4862 + }, + { + "epoch": 2.3026931044687777, + "grad_norm": 1.159690499305725, + "learning_rate": 2.6952078845558292e-06, + "loss": 0.1994, + "step": 4863 + }, + { + "epoch": 2.3031666173424092, + "grad_norm": 1.3060331344604492, + "learning_rate": 2.691716241282748e-06, + "loss": 0.2027, + "step": 4864 + }, + { + "epoch": 2.3036401302160403, + "grad_norm": 1.173386812210083, + "learning_rate": 2.6882265094207783e-06, + "loss": 0.1967, + "step": 4865 + }, + { + "epoch": 2.3041136430896714, + "grad_norm": 1.255037784576416, + "learning_rate": 2.684738689882629e-06, + "loss": 0.1947, + "step": 4866 + }, + { + "epoch": 2.304587155963303, + "grad_norm": 1.0719294548034668, + "learning_rate": 2.6812527835805013e-06, + "loss": 0.2067, + "step": 4867 + }, + { + "epoch": 2.305060668836934, + "grad_norm": 1.0298558473587036, + "learning_rate": 2.6777687914261054e-06, + "loss": 0.1918, + "step": 4868 + }, + { + "epoch": 2.3055341817105655, + "grad_norm": 1.2399965524673462, + "learning_rate": 2.6742867143306404e-06, + "loss": 0.2119, + "step": 4869 + }, + { + "epoch": 2.3060076945841965, + "grad_norm": 1.3060729503631592, + "learning_rate": 2.6708065532048167e-06, + "loss": 0.2244, + "step": 4870 + }, + { + "epoch": 2.3064812074578276, + "grad_norm": 1.0601017475128174, + "learning_rate": 2.6673283089588286e-06, + "loss": 0.1678, + "step": 4871 + }, + { + "epoch": 2.306954720331459, + "grad_norm": 1.461519718170166, + "learning_rate": 2.6638519825023855e-06, + "loss": 0.2043, + "step": 4872 + }, + { + "epoch": 2.30742823320509, + "grad_norm": 1.202025055885315, + "learning_rate": 2.6603775747446836e-06, + "loss": 0.219, + "step": 4873 + }, + { + "epoch": 2.3079017460787217, + "grad_norm": 1.5408061742782593, + "learning_rate": 2.6569050865944168e-06, + "loss": 0.2058, + "step": 4874 + }, + { + "epoch": 2.3083752589523527, + "grad_norm": 1.1081178188323975, + "learning_rate": 2.653434518959788e-06, + "loss": 0.2056, + "step": 4875 + }, + { + "epoch": 2.308848771825984, + "grad_norm": 1.2208150625228882, + "learning_rate": 2.649965872748481e-06, + "loss": 0.2119, + "step": 4876 + }, + { + "epoch": 2.3093222846996153, + "grad_norm": 0.9531561732292175, + "learning_rate": 2.6464991488676996e-06, + "loss": 0.175, + "step": 4877 + }, + { + "epoch": 2.3097957975732464, + "grad_norm": 1.0431057214736938, + "learning_rate": 2.6430343482241237e-06, + "loss": 0.2357, + "step": 4878 + }, + { + "epoch": 2.310269310446878, + "grad_norm": 1.079884648323059, + "learning_rate": 2.6395714717239384e-06, + "loss": 0.2059, + "step": 4879 + }, + { + "epoch": 2.310742823320509, + "grad_norm": 1.2502458095550537, + "learning_rate": 2.63611052027283e-06, + "loss": 0.2051, + "step": 4880 + }, + { + "epoch": 2.3112163361941405, + "grad_norm": 1.1873927116394043, + "learning_rate": 2.6326514947759718e-06, + "loss": 0.2142, + "step": 4881 + }, + { + "epoch": 2.3116898490677715, + "grad_norm": 1.1415022611618042, + "learning_rate": 2.629194396138044e-06, + "loss": 0.1781, + "step": 4882 + }, + { + "epoch": 2.3121633619414026, + "grad_norm": 1.0206491947174072, + "learning_rate": 2.625739225263211e-06, + "loss": 0.1994, + "step": 4883 + }, + { + "epoch": 2.312636874815034, + "grad_norm": 1.136160969734192, + "learning_rate": 2.622285983055146e-06, + "loss": 0.1869, + "step": 4884 + }, + { + "epoch": 2.313110387688665, + "grad_norm": 1.2701917886734009, + "learning_rate": 2.6188346704170053e-06, + "loss": 0.2147, + "step": 4885 + }, + { + "epoch": 2.3135839005622967, + "grad_norm": 1.050111174583435, + "learning_rate": 2.615385288251452e-06, + "loss": 0.2092, + "step": 4886 + }, + { + "epoch": 2.3140574134359277, + "grad_norm": 1.1938689947128296, + "learning_rate": 2.6119378374606354e-06, + "loss": 0.2094, + "step": 4887 + }, + { + "epoch": 2.3145309263095593, + "grad_norm": 1.2285950183868408, + "learning_rate": 2.608492318946201e-06, + "loss": 0.1953, + "step": 4888 + }, + { + "epoch": 2.3150044391831903, + "grad_norm": 2.1110057830810547, + "learning_rate": 2.6050487336092967e-06, + "loss": 0.2099, + "step": 4889 + }, + { + "epoch": 2.3154779520568214, + "grad_norm": 1.1653103828430176, + "learning_rate": 2.6016070823505525e-06, + "loss": 0.1689, + "step": 4890 + }, + { + "epoch": 2.315951464930453, + "grad_norm": 1.4252010583877563, + "learning_rate": 2.5981673660701055e-06, + "loss": 0.1917, + "step": 4891 + }, + { + "epoch": 2.316424977804084, + "grad_norm": 1.0870301723480225, + "learning_rate": 2.5947295856675737e-06, + "loss": 0.1839, + "step": 4892 + }, + { + "epoch": 2.3168984906777155, + "grad_norm": 1.1563901901245117, + "learning_rate": 2.5912937420420823e-06, + "loss": 0.1985, + "step": 4893 + }, + { + "epoch": 2.3173720035513465, + "grad_norm": 0.9440892338752747, + "learning_rate": 2.587859836092237e-06, + "loss": 0.1894, + "step": 4894 + }, + { + "epoch": 2.3178455164249776, + "grad_norm": 1.289351463317871, + "learning_rate": 2.5844278687161474e-06, + "loss": 0.2051, + "step": 4895 + }, + { + "epoch": 2.318319029298609, + "grad_norm": 1.7012381553649902, + "learning_rate": 2.58099784081141e-06, + "loss": 0.1951, + "step": 4896 + }, + { + "epoch": 2.31879254217224, + "grad_norm": 0.9597797393798828, + "learning_rate": 2.577569753275112e-06, + "loss": 0.2101, + "step": 4897 + }, + { + "epoch": 2.3192660550458717, + "grad_norm": 1.141579270362854, + "learning_rate": 2.574143607003843e-06, + "loss": 0.2081, + "step": 4898 + }, + { + "epoch": 2.3197395679195028, + "grad_norm": 1.3628860712051392, + "learning_rate": 2.570719402893671e-06, + "loss": 0.2154, + "step": 4899 + }, + { + "epoch": 2.320213080793134, + "grad_norm": 1.7435754537582397, + "learning_rate": 2.5672971418401716e-06, + "loss": 0.1822, + "step": 4900 + }, + { + "epoch": 2.3206865936667653, + "grad_norm": 1.5572841167449951, + "learning_rate": 2.5638768247383962e-06, + "loss": 0.1956, + "step": 4901 + }, + { + "epoch": 2.3211601065403964, + "grad_norm": 1.8090298175811768, + "learning_rate": 2.5604584524829036e-06, + "loss": 0.1927, + "step": 4902 + }, + { + "epoch": 2.321633619414028, + "grad_norm": 1.089689016342163, + "learning_rate": 2.5570420259677285e-06, + "loss": 0.2198, + "step": 4903 + }, + { + "epoch": 2.322107132287659, + "grad_norm": 0.9589508175849915, + "learning_rate": 2.553627546086411e-06, + "loss": 0.2007, + "step": 4904 + }, + { + "epoch": 2.3225806451612905, + "grad_norm": 0.8565977811813354, + "learning_rate": 2.55021501373197e-06, + "loss": 0.1745, + "step": 4905 + }, + { + "epoch": 2.3230541580349215, + "grad_norm": 1.0628879070281982, + "learning_rate": 2.5468044297969265e-06, + "loss": 0.1977, + "step": 4906 + }, + { + "epoch": 2.323527670908553, + "grad_norm": 1.0389453172683716, + "learning_rate": 2.543395795173281e-06, + "loss": 0.1992, + "step": 4907 + }, + { + "epoch": 2.324001183782184, + "grad_norm": 1.0597020387649536, + "learning_rate": 2.5399891107525277e-06, + "loss": 0.2128, + "step": 4908 + }, + { + "epoch": 2.324474696655815, + "grad_norm": 1.1213783025741577, + "learning_rate": 2.5365843774256573e-06, + "loss": 0.1916, + "step": 4909 + }, + { + "epoch": 2.3249482095294467, + "grad_norm": 1.2535953521728516, + "learning_rate": 2.5331815960831387e-06, + "loss": 0.1868, + "step": 4910 + }, + { + "epoch": 2.3254217224030778, + "grad_norm": 1.0827935934066772, + "learning_rate": 2.5297807676149435e-06, + "loss": 0.2103, + "step": 4911 + }, + { + "epoch": 2.3258952352767093, + "grad_norm": 1.4444578886032104, + "learning_rate": 2.526381892910519e-06, + "loss": 0.2075, + "step": 4912 + }, + { + "epoch": 2.3263687481503403, + "grad_norm": 1.0366746187210083, + "learning_rate": 2.5229849728588143e-06, + "loss": 0.2126, + "step": 4913 + }, + { + "epoch": 2.3268422610239714, + "grad_norm": 1.0630645751953125, + "learning_rate": 2.519590008348255e-06, + "loss": 0.2182, + "step": 4914 + }, + { + "epoch": 2.327315773897603, + "grad_norm": 1.4311585426330566, + "learning_rate": 2.5161970002667636e-06, + "loss": 0.2224, + "step": 4915 + }, + { + "epoch": 2.327789286771234, + "grad_norm": 1.9269888401031494, + "learning_rate": 2.512805949501752e-06, + "loss": 0.1924, + "step": 4916 + }, + { + "epoch": 2.3282627996448655, + "grad_norm": 1.697356939315796, + "learning_rate": 2.5094168569401123e-06, + "loss": 0.2131, + "step": 4917 + }, + { + "epoch": 2.3287363125184966, + "grad_norm": 1.6500767469406128, + "learning_rate": 2.5060297234682328e-06, + "loss": 0.2155, + "step": 4918 + }, + { + "epoch": 2.3292098253921276, + "grad_norm": 1.0592091083526611, + "learning_rate": 2.502644549971981e-06, + "loss": 0.209, + "step": 4919 + }, + { + "epoch": 2.329683338265759, + "grad_norm": 1.3273735046386719, + "learning_rate": 2.499261337336721e-06, + "loss": 0.2025, + "step": 4920 + }, + { + "epoch": 2.33015685113939, + "grad_norm": 2.2133803367614746, + "learning_rate": 2.4958800864472974e-06, + "loss": 0.1929, + "step": 4921 + }, + { + "epoch": 2.3306303640130217, + "grad_norm": 1.3577215671539307, + "learning_rate": 2.492500798188042e-06, + "loss": 0.2091, + "step": 4922 + }, + { + "epoch": 2.3311038768866528, + "grad_norm": 1.0294976234436035, + "learning_rate": 2.4891234734427784e-06, + "loss": 0.226, + "step": 4923 + }, + { + "epoch": 2.3315773897602843, + "grad_norm": 1.083409070968628, + "learning_rate": 2.485748113094809e-06, + "loss": 0.208, + "step": 4924 + }, + { + "epoch": 2.3320509026339153, + "grad_norm": 1.1658552885055542, + "learning_rate": 2.4823747180269332e-06, + "loss": 0.2045, + "step": 4925 + }, + { + "epoch": 2.332524415507547, + "grad_norm": 1.3609403371810913, + "learning_rate": 2.479003289121422e-06, + "loss": 0.2003, + "step": 4926 + }, + { + "epoch": 2.332997928381178, + "grad_norm": 0.9218862652778625, + "learning_rate": 2.4756338272600476e-06, + "loss": 0.1993, + "step": 4927 + }, + { + "epoch": 2.333471441254809, + "grad_norm": 1.0133236646652222, + "learning_rate": 2.472266333324054e-06, + "loss": 0.2027, + "step": 4928 + }, + { + "epoch": 2.3339449541284405, + "grad_norm": 0.9589722752571106, + "learning_rate": 2.4689008081941825e-06, + "loss": 0.1872, + "step": 4929 + }, + { + "epoch": 2.3344184670020716, + "grad_norm": 1.2544256448745728, + "learning_rate": 2.4655372527506473e-06, + "loss": 0.2071, + "step": 4930 + }, + { + "epoch": 2.334891979875703, + "grad_norm": 1.001051902770996, + "learning_rate": 2.462175667873161e-06, + "loss": 0.2062, + "step": 4931 + }, + { + "epoch": 2.335365492749334, + "grad_norm": 1.142121434211731, + "learning_rate": 2.45881605444091e-06, + "loss": 0.1953, + "step": 4932 + }, + { + "epoch": 2.335839005622965, + "grad_norm": 1.1744766235351562, + "learning_rate": 2.4554584133325653e-06, + "loss": 0.1917, + "step": 4933 + }, + { + "epoch": 2.3363125184965967, + "grad_norm": 1.3197706937789917, + "learning_rate": 2.4521027454262925e-06, + "loss": 0.1952, + "step": 4934 + }, + { + "epoch": 2.336786031370228, + "grad_norm": 1.6744234561920166, + "learning_rate": 2.4487490515997282e-06, + "loss": 0.2136, + "step": 4935 + }, + { + "epoch": 2.3372595442438593, + "grad_norm": 1.0961428880691528, + "learning_rate": 2.445397332730004e-06, + "loss": 0.2046, + "step": 4936 + }, + { + "epoch": 2.3377330571174904, + "grad_norm": 0.8409469127655029, + "learning_rate": 2.4420475896937236e-06, + "loss": 0.1953, + "step": 4937 + }, + { + "epoch": 2.3382065699911214, + "grad_norm": 1.0160478353500366, + "learning_rate": 2.4386998233669867e-06, + "loss": 0.1908, + "step": 4938 + }, + { + "epoch": 2.338680082864753, + "grad_norm": 1.2315906286239624, + "learning_rate": 2.4353540346253622e-06, + "loss": 0.212, + "step": 4939 + }, + { + "epoch": 2.339153595738384, + "grad_norm": 1.112743616104126, + "learning_rate": 2.4320102243439157e-06, + "loss": 0.2025, + "step": 4940 + }, + { + "epoch": 2.3396271086120155, + "grad_norm": 1.1238675117492676, + "learning_rate": 2.4286683933971853e-06, + "loss": 0.2032, + "step": 4941 + }, + { + "epoch": 2.3401006214856466, + "grad_norm": 1.2250267267227173, + "learning_rate": 2.4253285426591923e-06, + "loss": 0.2298, + "step": 4942 + }, + { + "epoch": 2.340574134359278, + "grad_norm": 1.391561508178711, + "learning_rate": 2.4219906730034472e-06, + "loss": 0.2103, + "step": 4943 + }, + { + "epoch": 2.341047647232909, + "grad_norm": 1.2246688604354858, + "learning_rate": 2.418654785302933e-06, + "loss": 0.202, + "step": 4944 + }, + { + "epoch": 2.34152116010654, + "grad_norm": 1.2264790534973145, + "learning_rate": 2.4153208804301232e-06, + "loss": 0.2202, + "step": 4945 + }, + { + "epoch": 2.3419946729801717, + "grad_norm": 0.9517511129379272, + "learning_rate": 2.4119889592569633e-06, + "loss": 0.1852, + "step": 4946 + }, + { + "epoch": 2.342468185853803, + "grad_norm": 0.9769893288612366, + "learning_rate": 2.408659022654892e-06, + "loss": 0.1959, + "step": 4947 + }, + { + "epoch": 2.3429416987274343, + "grad_norm": 1.0385048389434814, + "learning_rate": 2.4053310714948153e-06, + "loss": 0.1984, + "step": 4948 + }, + { + "epoch": 2.3434152116010654, + "grad_norm": 0.9590575098991394, + "learning_rate": 2.402005106647133e-06, + "loss": 0.1941, + "step": 4949 + }, + { + "epoch": 2.343888724474697, + "grad_norm": 1.5553905963897705, + "learning_rate": 2.3986811289817126e-06, + "loss": 0.2009, + "step": 4950 + }, + { + "epoch": 2.344362237348328, + "grad_norm": 1.0020135641098022, + "learning_rate": 2.3953591393679143e-06, + "loss": 0.1865, + "step": 4951 + }, + { + "epoch": 2.344835750221959, + "grad_norm": 1.1729081869125366, + "learning_rate": 2.39203913867457e-06, + "loss": 0.2014, + "step": 4952 + }, + { + "epoch": 2.3453092630955905, + "grad_norm": 1.1632317304611206, + "learning_rate": 2.3887211277699883e-06, + "loss": 0.2208, + "step": 4953 + }, + { + "epoch": 2.3457827759692216, + "grad_norm": 1.0477259159088135, + "learning_rate": 2.3854051075219744e-06, + "loss": 0.2035, + "step": 4954 + }, + { + "epoch": 2.346256288842853, + "grad_norm": 1.2778170108795166, + "learning_rate": 2.382091078797791e-06, + "loss": 0.2128, + "step": 4955 + }, + { + "epoch": 2.346729801716484, + "grad_norm": 1.322662115097046, + "learning_rate": 2.3787790424641986e-06, + "loss": 0.2053, + "step": 4956 + }, + { + "epoch": 2.347203314590115, + "grad_norm": 1.149414300918579, + "learning_rate": 2.3754689993874247e-06, + "loss": 0.2045, + "step": 4957 + }, + { + "epoch": 2.3476768274637467, + "grad_norm": 1.00741446018219, + "learning_rate": 2.3721609504331755e-06, + "loss": 0.1899, + "step": 4958 + }, + { + "epoch": 2.348150340337378, + "grad_norm": 1.745548963546753, + "learning_rate": 2.3688548964666446e-06, + "loss": 0.1886, + "step": 4959 + }, + { + "epoch": 2.3486238532110093, + "grad_norm": 1.0258804559707642, + "learning_rate": 2.365550838352495e-06, + "loss": 0.1878, + "step": 4960 + }, + { + "epoch": 2.3490973660846404, + "grad_norm": 1.286632776260376, + "learning_rate": 2.3622487769548754e-06, + "loss": 0.2032, + "step": 4961 + }, + { + "epoch": 2.3495708789582714, + "grad_norm": 1.0202562808990479, + "learning_rate": 2.3589487131374023e-06, + "loss": 0.2097, + "step": 4962 + }, + { + "epoch": 2.350044391831903, + "grad_norm": 1.0657576322555542, + "learning_rate": 2.3556506477631826e-06, + "loss": 0.1977, + "step": 4963 + }, + { + "epoch": 2.350517904705534, + "grad_norm": 1.0342962741851807, + "learning_rate": 2.3523545816947856e-06, + "loss": 0.1915, + "step": 4964 + }, + { + "epoch": 2.3509914175791655, + "grad_norm": 1.115942120552063, + "learning_rate": 2.3490605157942726e-06, + "loss": 0.2007, + "step": 4965 + }, + { + "epoch": 2.3514649304527966, + "grad_norm": 2.0979726314544678, + "learning_rate": 2.3457684509231725e-06, + "loss": 0.1993, + "step": 4966 + }, + { + "epoch": 2.351938443326428, + "grad_norm": 1.075711965560913, + "learning_rate": 2.3424783879424894e-06, + "loss": 0.1756, + "step": 4967 + }, + { + "epoch": 2.352411956200059, + "grad_norm": 1.1330883502960205, + "learning_rate": 2.3391903277127127e-06, + "loss": 0.2115, + "step": 4968 + }, + { + "epoch": 2.3528854690736907, + "grad_norm": 1.1201201677322388, + "learning_rate": 2.3359042710937986e-06, + "loss": 0.1995, + "step": 4969 + }, + { + "epoch": 2.3533589819473217, + "grad_norm": 1.2727965116500854, + "learning_rate": 2.3326202189451873e-06, + "loss": 0.2007, + "step": 4970 + }, + { + "epoch": 2.353832494820953, + "grad_norm": 1.3274377584457397, + "learning_rate": 2.3293381721257868e-06, + "loss": 0.1829, + "step": 4971 + }, + { + "epoch": 2.3543060076945843, + "grad_norm": 0.9202346205711365, + "learning_rate": 2.326058131493991e-06, + "loss": 0.1861, + "step": 4972 + }, + { + "epoch": 2.3547795205682154, + "grad_norm": 1.0923426151275635, + "learning_rate": 2.3227800979076552e-06, + "loss": 0.2093, + "step": 4973 + }, + { + "epoch": 2.355253033441847, + "grad_norm": 1.1253713369369507, + "learning_rate": 2.319504072224125e-06, + "loss": 0.2014, + "step": 4974 + }, + { + "epoch": 2.355726546315478, + "grad_norm": 1.5804685354232788, + "learning_rate": 2.316230055300208e-06, + "loss": 0.1789, + "step": 4975 + }, + { + "epoch": 2.356200059189109, + "grad_norm": 2.09173321723938, + "learning_rate": 2.312958047992192e-06, + "loss": 0.1968, + "step": 4976 + }, + { + "epoch": 2.3566735720627405, + "grad_norm": 1.405809760093689, + "learning_rate": 2.3096880511558427e-06, + "loss": 0.2299, + "step": 4977 + }, + { + "epoch": 2.3571470849363716, + "grad_norm": 1.0536633729934692, + "learning_rate": 2.306420065646392e-06, + "loss": 0.2048, + "step": 4978 + }, + { + "epoch": 2.357620597810003, + "grad_norm": 1.2741084098815918, + "learning_rate": 2.3031540923185536e-06, + "loss": 0.2172, + "step": 4979 + }, + { + "epoch": 2.358094110683634, + "grad_norm": 0.9234102964401245, + "learning_rate": 2.299890132026508e-06, + "loss": 0.2006, + "step": 4980 + }, + { + "epoch": 2.3585676235572652, + "grad_norm": 1.0131012201309204, + "learning_rate": 2.296628185623915e-06, + "loss": 0.1732, + "step": 4981 + }, + { + "epoch": 2.3590411364308967, + "grad_norm": 1.1941182613372803, + "learning_rate": 2.2933682539639026e-06, + "loss": 0.2364, + "step": 4982 + }, + { + "epoch": 2.359514649304528, + "grad_norm": 1.0108896493911743, + "learning_rate": 2.2901103378990785e-06, + "loss": 0.1966, + "step": 4983 + }, + { + "epoch": 2.3599881621781593, + "grad_norm": 0.8393535017967224, + "learning_rate": 2.286854438281515e-06, + "loss": 0.194, + "step": 4984 + }, + { + "epoch": 2.3604616750517904, + "grad_norm": 1.4110928773880005, + "learning_rate": 2.283600555962765e-06, + "loss": 0.1968, + "step": 4985 + }, + { + "epoch": 2.360935187925422, + "grad_norm": 1.11850905418396, + "learning_rate": 2.2803486917938487e-06, + "loss": 0.2094, + "step": 4986 + }, + { + "epoch": 2.361408700799053, + "grad_norm": 1.1148712635040283, + "learning_rate": 2.2770988466252565e-06, + "loss": 0.2258, + "step": 4987 + }, + { + "epoch": 2.3618822136726845, + "grad_norm": 1.2502351999282837, + "learning_rate": 2.273851021306959e-06, + "loss": 0.1848, + "step": 4988 + }, + { + "epoch": 2.3623557265463155, + "grad_norm": 1.252803087234497, + "learning_rate": 2.27060521668839e-06, + "loss": 0.2035, + "step": 4989 + }, + { + "epoch": 2.3628292394199466, + "grad_norm": 1.0564323663711548, + "learning_rate": 2.26736143361846e-06, + "loss": 0.1949, + "step": 4990 + }, + { + "epoch": 2.363302752293578, + "grad_norm": 1.3723267316818237, + "learning_rate": 2.2641196729455482e-06, + "loss": 0.211, + "step": 4991 + }, + { + "epoch": 2.363776265167209, + "grad_norm": 1.104577660560608, + "learning_rate": 2.2608799355175058e-06, + "loss": 0.1932, + "step": 4992 + }, + { + "epoch": 2.3642497780408407, + "grad_norm": 1.5187643766403198, + "learning_rate": 2.2576422221816596e-06, + "loss": 0.1772, + "step": 4993 + }, + { + "epoch": 2.3647232909144718, + "grad_norm": 1.233651041984558, + "learning_rate": 2.254406533784794e-06, + "loss": 0.2034, + "step": 4994 + }, + { + "epoch": 2.365196803788103, + "grad_norm": 1.5177334547042847, + "learning_rate": 2.2511728711731806e-06, + "loss": 0.2064, + "step": 4995 + }, + { + "epoch": 2.3656703166617343, + "grad_norm": 1.8442257642745972, + "learning_rate": 2.2479412351925444e-06, + "loss": 0.2119, + "step": 4996 + }, + { + "epoch": 2.3661438295353654, + "grad_norm": 1.1477844715118408, + "learning_rate": 2.2447116266880964e-06, + "loss": 0.1973, + "step": 4997 + }, + { + "epoch": 2.366617342408997, + "grad_norm": 1.1657443046569824, + "learning_rate": 2.241484046504503e-06, + "loss": 0.2081, + "step": 4998 + }, + { + "epoch": 2.367090855282628, + "grad_norm": 1.501852035522461, + "learning_rate": 2.238258495485912e-06, + "loss": 0.2391, + "step": 4999 + }, + { + "epoch": 2.367564368156259, + "grad_norm": 1.438940167427063, + "learning_rate": 2.2350349744759324e-06, + "loss": 0.2117, + "step": 5000 + }, + { + "epoch": 2.3680378810298905, + "grad_norm": 2.076133966445923, + "learning_rate": 2.231813484317643e-06, + "loss": 0.1915, + "step": 5001 + }, + { + "epoch": 2.3685113939035216, + "grad_norm": 0.9920544624328613, + "learning_rate": 2.2285940258535987e-06, + "loss": 0.2254, + "step": 5002 + }, + { + "epoch": 2.368984906777153, + "grad_norm": 0.9885861873626709, + "learning_rate": 2.2253765999258115e-06, + "loss": 0.2179, + "step": 5003 + }, + { + "epoch": 2.369458419650784, + "grad_norm": 1.199271559715271, + "learning_rate": 2.222161207375775e-06, + "loss": 0.1974, + "step": 5004 + }, + { + "epoch": 2.3699319325244157, + "grad_norm": 0.865847110748291, + "learning_rate": 2.2189478490444373e-06, + "loss": 0.1935, + "step": 5005 + }, + { + "epoch": 2.3704054453980468, + "grad_norm": 1.360724925994873, + "learning_rate": 2.2157365257722273e-06, + "loss": 0.1836, + "step": 5006 + }, + { + "epoch": 2.370878958271678, + "grad_norm": 1.0131431818008423, + "learning_rate": 2.2125272383990304e-06, + "loss": 0.1864, + "step": 5007 + }, + { + "epoch": 2.3713524711453093, + "grad_norm": 0.978959858417511, + "learning_rate": 2.209319987764209e-06, + "loss": 0.2075, + "step": 5008 + }, + { + "epoch": 2.3718259840189404, + "grad_norm": 1.2233644723892212, + "learning_rate": 2.2061147747065847e-06, + "loss": 0.2237, + "step": 5009 + }, + { + "epoch": 2.372299496892572, + "grad_norm": 1.8459240198135376, + "learning_rate": 2.2029116000644544e-06, + "loss": 0.1897, + "step": 5010 + }, + { + "epoch": 2.372773009766203, + "grad_norm": 1.2090134620666504, + "learning_rate": 2.1997104646755763e-06, + "loss": 0.2038, + "step": 5011 + }, + { + "epoch": 2.3732465226398345, + "grad_norm": 0.9758379459381104, + "learning_rate": 2.1965113693771725e-06, + "loss": 0.1924, + "step": 5012 + }, + { + "epoch": 2.3737200355134656, + "grad_norm": 1.6864513158798218, + "learning_rate": 2.193314315005941e-06, + "loss": 0.2004, + "step": 5013 + }, + { + "epoch": 2.3741935483870966, + "grad_norm": 1.3247301578521729, + "learning_rate": 2.190119302398035e-06, + "loss": 0.1929, + "step": 5014 + }, + { + "epoch": 2.374667061260728, + "grad_norm": 1.4092978239059448, + "learning_rate": 2.186926332389084e-06, + "loss": 0.2078, + "step": 5015 + }, + { + "epoch": 2.375140574134359, + "grad_norm": 0.9686800241470337, + "learning_rate": 2.1837354058141756e-06, + "loss": 0.1993, + "step": 5016 + }, + { + "epoch": 2.3756140870079907, + "grad_norm": 1.527159571647644, + "learning_rate": 2.1805465235078695e-06, + "loss": 0.191, + "step": 5017 + }, + { + "epoch": 2.3760875998816218, + "grad_norm": 1.4575700759887695, + "learning_rate": 2.177359686304181e-06, + "loss": 0.203, + "step": 5018 + }, + { + "epoch": 2.376561112755253, + "grad_norm": 1.0823571681976318, + "learning_rate": 2.1741748950366036e-06, + "loss": 0.2032, + "step": 5019 + }, + { + "epoch": 2.3770346256288843, + "grad_norm": 1.1299647092819214, + "learning_rate": 2.170992150538085e-06, + "loss": 0.1946, + "step": 5020 + }, + { + "epoch": 2.3775081385025154, + "grad_norm": 1.379338264465332, + "learning_rate": 2.167811453641039e-06, + "loss": 0.2056, + "step": 5021 + }, + { + "epoch": 2.377981651376147, + "grad_norm": 1.223885416984558, + "learning_rate": 2.1646328051773512e-06, + "loss": 0.2183, + "step": 5022 + }, + { + "epoch": 2.378455164249778, + "grad_norm": 1.0248559713363647, + "learning_rate": 2.1614562059783627e-06, + "loss": 0.1936, + "step": 5023 + }, + { + "epoch": 2.378928677123409, + "grad_norm": 1.0929266214370728, + "learning_rate": 2.1582816568748856e-06, + "loss": 0.1954, + "step": 5024 + }, + { + "epoch": 2.3794021899970406, + "grad_norm": 1.3425700664520264, + "learning_rate": 2.155109158697187e-06, + "loss": 0.1968, + "step": 5025 + }, + { + "epoch": 2.3798757028706716, + "grad_norm": 1.4846118688583374, + "learning_rate": 2.151938712275011e-06, + "loss": 0.1819, + "step": 5026 + }, + { + "epoch": 2.380349215744303, + "grad_norm": 1.0532817840576172, + "learning_rate": 2.14877031843755e-06, + "loss": 0.1961, + "step": 5027 + }, + { + "epoch": 2.380822728617934, + "grad_norm": 1.0059374570846558, + "learning_rate": 2.145603978013473e-06, + "loss": 0.2051, + "step": 5028 + }, + { + "epoch": 2.3812962414915657, + "grad_norm": 1.0236760377883911, + "learning_rate": 2.1424396918309e-06, + "loss": 0.2084, + "step": 5029 + }, + { + "epoch": 2.381769754365197, + "grad_norm": 1.142202377319336, + "learning_rate": 2.139277460717425e-06, + "loss": 0.1947, + "step": 5030 + }, + { + "epoch": 2.3822432672388283, + "grad_norm": 1.1363552808761597, + "learning_rate": 2.1361172855000943e-06, + "loss": 0.2102, + "step": 5031 + }, + { + "epoch": 2.3827167801124594, + "grad_norm": 1.0805346965789795, + "learning_rate": 2.1329591670054227e-06, + "loss": 0.1846, + "step": 5032 + }, + { + "epoch": 2.3831902929860904, + "grad_norm": 1.1634663343429565, + "learning_rate": 2.1298031060593893e-06, + "loss": 0.2285, + "step": 5033 + }, + { + "epoch": 2.383663805859722, + "grad_norm": 0.9185308814048767, + "learning_rate": 2.1266491034874247e-06, + "loss": 0.1965, + "step": 5034 + }, + { + "epoch": 2.384137318733353, + "grad_norm": 0.9834035038948059, + "learning_rate": 2.1234971601144362e-06, + "loss": 0.2108, + "step": 5035 + }, + { + "epoch": 2.3846108316069845, + "grad_norm": 1.1302801370620728, + "learning_rate": 2.1203472767647782e-06, + "loss": 0.1893, + "step": 5036 + }, + { + "epoch": 2.3850843444806156, + "grad_norm": 1.2327231168746948, + "learning_rate": 2.117199454262271e-06, + "loss": 0.1993, + "step": 5037 + }, + { + "epoch": 2.3855578573542466, + "grad_norm": 0.890938401222229, + "learning_rate": 2.1140536934302024e-06, + "loss": 0.1921, + "step": 5038 + }, + { + "epoch": 2.386031370227878, + "grad_norm": 0.9080513715744019, + "learning_rate": 2.1109099950913105e-06, + "loss": 0.1937, + "step": 5039 + }, + { + "epoch": 2.386504883101509, + "grad_norm": 0.9706762433052063, + "learning_rate": 2.107768360067806e-06, + "loss": 0.2035, + "step": 5040 + }, + { + "epoch": 2.3869783959751407, + "grad_norm": 1.112209439277649, + "learning_rate": 2.1046287891813445e-06, + "loss": 0.2072, + "step": 5041 + }, + { + "epoch": 2.387451908848772, + "grad_norm": 1.2673587799072266, + "learning_rate": 2.1014912832530587e-06, + "loss": 0.1955, + "step": 5042 + }, + { + "epoch": 2.387925421722403, + "grad_norm": 1.109052300453186, + "learning_rate": 2.0983558431035266e-06, + "loss": 0.1937, + "step": 5043 + }, + { + "epoch": 2.3883989345960344, + "grad_norm": 0.9777413606643677, + "learning_rate": 2.095222469552799e-06, + "loss": 0.2019, + "step": 5044 + }, + { + "epoch": 2.3888724474696654, + "grad_norm": 1.3833292722702026, + "learning_rate": 2.0920911634203745e-06, + "loss": 0.1948, + "step": 5045 + }, + { + "epoch": 2.389345960343297, + "grad_norm": 1.2488656044006348, + "learning_rate": 2.0889619255252147e-06, + "loss": 0.1998, + "step": 5046 + }, + { + "epoch": 2.389819473216928, + "grad_norm": 1.1769556999206543, + "learning_rate": 2.085834756685747e-06, + "loss": 0.2172, + "step": 5047 + }, + { + "epoch": 2.3902929860905595, + "grad_norm": 1.6330255270004272, + "learning_rate": 2.082709657719848e-06, + "loss": 0.1885, + "step": 5048 + }, + { + "epoch": 2.3907664989641906, + "grad_norm": 1.0924394130706787, + "learning_rate": 2.0795866294448596e-06, + "loss": 0.2175, + "step": 5049 + }, + { + "epoch": 2.391240011837822, + "grad_norm": 1.4790942668914795, + "learning_rate": 2.0764656726775767e-06, + "loss": 0.1987, + "step": 5050 + }, + { + "epoch": 2.391713524711453, + "grad_norm": 0.9769918918609619, + "learning_rate": 2.0733467882342597e-06, + "loss": 0.2065, + "step": 5051 + }, + { + "epoch": 2.392187037585084, + "grad_norm": 1.1905534267425537, + "learning_rate": 2.0702299769306177e-06, + "loss": 0.2126, + "step": 5052 + }, + { + "epoch": 2.3926605504587157, + "grad_norm": 1.0014641284942627, + "learning_rate": 2.067115239581828e-06, + "loss": 0.1905, + "step": 5053 + }, + { + "epoch": 2.393134063332347, + "grad_norm": 1.234556794166565, + "learning_rate": 2.0640025770025186e-06, + "loss": 0.2117, + "step": 5054 + }, + { + "epoch": 2.3936075762059783, + "grad_norm": 1.1088292598724365, + "learning_rate": 2.0608919900067716e-06, + "loss": 0.2065, + "step": 5055 + }, + { + "epoch": 2.3940810890796094, + "grad_norm": 0.9730679988861084, + "learning_rate": 2.0577834794081377e-06, + "loss": 0.1976, + "step": 5056 + }, + { + "epoch": 2.3945546019532404, + "grad_norm": 1.591340184211731, + "learning_rate": 2.0546770460196117e-06, + "loss": 0.1804, + "step": 5057 + }, + { + "epoch": 2.395028114826872, + "grad_norm": 1.2157362699508667, + "learning_rate": 2.051572690653658e-06, + "loss": 0.1855, + "step": 5058 + }, + { + "epoch": 2.395501627700503, + "grad_norm": 1.2474086284637451, + "learning_rate": 2.0484704141221845e-06, + "loss": 0.1692, + "step": 5059 + }, + { + "epoch": 2.3959751405741345, + "grad_norm": 0.999910831451416, + "learning_rate": 2.0453702172365663e-06, + "loss": 0.1992, + "step": 5060 + }, + { + "epoch": 2.3964486534477656, + "grad_norm": 1.1098980903625488, + "learning_rate": 2.0422721008076264e-06, + "loss": 0.2495, + "step": 5061 + }, + { + "epoch": 2.3969221663213967, + "grad_norm": 1.1341627836227417, + "learning_rate": 2.0391760656456506e-06, + "loss": 0.1914, + "step": 5062 + }, + { + "epoch": 2.397395679195028, + "grad_norm": 0.9773929119110107, + "learning_rate": 2.0360821125603726e-06, + "loss": 0.204, + "step": 5063 + }, + { + "epoch": 2.3978691920686592, + "grad_norm": 1.09663724899292, + "learning_rate": 2.0329902423609926e-06, + "loss": 0.2104, + "step": 5064 + }, + { + "epoch": 2.3983427049422907, + "grad_norm": 1.1711968183517456, + "learning_rate": 2.0299004558561554e-06, + "loss": 0.2043, + "step": 5065 + }, + { + "epoch": 2.398816217815922, + "grad_norm": 1.7636314630508423, + "learning_rate": 2.026812753853962e-06, + "loss": 0.1984, + "step": 5066 + }, + { + "epoch": 2.3992897306895533, + "grad_norm": 0.9198000431060791, + "learning_rate": 2.023727137161976e-06, + "loss": 0.1887, + "step": 5067 + }, + { + "epoch": 2.3997632435631844, + "grad_norm": 1.0493696928024292, + "learning_rate": 2.020643606587207e-06, + "loss": 0.1972, + "step": 5068 + }, + { + "epoch": 2.4002367564368154, + "grad_norm": 1.075859785079956, + "learning_rate": 2.0175621629361274e-06, + "loss": 0.1963, + "step": 5069 + }, + { + "epoch": 2.400710269310447, + "grad_norm": 1.5328881740570068, + "learning_rate": 2.0144828070146528e-06, + "loss": 0.1893, + "step": 5070 + }, + { + "epoch": 2.401183782184078, + "grad_norm": 1.1692156791687012, + "learning_rate": 2.011405539628163e-06, + "loss": 0.181, + "step": 5071 + }, + { + "epoch": 2.4016572950577095, + "grad_norm": 2.3541417121887207, + "learning_rate": 2.0083303615814896e-06, + "loss": 0.2215, + "step": 5072 + }, + { + "epoch": 2.4021308079313406, + "grad_norm": 1.5428338050842285, + "learning_rate": 2.00525727367891e-06, + "loss": 0.1991, + "step": 5073 + }, + { + "epoch": 2.402604320804972, + "grad_norm": 0.920405924320221, + "learning_rate": 2.002186276724166e-06, + "loss": 0.1992, + "step": 5074 + }, + { + "epoch": 2.403077833678603, + "grad_norm": 1.1372102499008179, + "learning_rate": 1.999117371520444e-06, + "loss": 0.2079, + "step": 5075 + }, + { + "epoch": 2.4035513465522342, + "grad_norm": 1.1284953355789185, + "learning_rate": 1.9960505588703893e-06, + "loss": 0.2023, + "step": 5076 + }, + { + "epoch": 2.4040248594258657, + "grad_norm": 1.1499290466308594, + "learning_rate": 1.9929858395760927e-06, + "loss": 0.1872, + "step": 5077 + }, + { + "epoch": 2.404498372299497, + "grad_norm": 1.525108814239502, + "learning_rate": 1.9899232144391077e-06, + "loss": 0.2028, + "step": 5078 + }, + { + "epoch": 2.4049718851731283, + "grad_norm": 1.109409213066101, + "learning_rate": 1.986862684260431e-06, + "loss": 0.2014, + "step": 5079 + }, + { + "epoch": 2.4054453980467594, + "grad_norm": 1.8481568098068237, + "learning_rate": 1.983804249840513e-06, + "loss": 0.1663, + "step": 5080 + }, + { + "epoch": 2.4059189109203905, + "grad_norm": 0.8443739414215088, + "learning_rate": 1.9807479119792618e-06, + "loss": 0.1905, + "step": 5081 + }, + { + "epoch": 2.406392423794022, + "grad_norm": 1.2149391174316406, + "learning_rate": 1.9776936714760297e-06, + "loss": 0.1994, + "step": 5082 + }, + { + "epoch": 2.406865936667653, + "grad_norm": 1.261169672012329, + "learning_rate": 1.974641529129626e-06, + "loss": 0.2054, + "step": 5083 + }, + { + "epoch": 2.4073394495412845, + "grad_norm": 1.2343029975891113, + "learning_rate": 1.971591485738308e-06, + "loss": 0.1861, + "step": 5084 + }, + { + "epoch": 2.4078129624149156, + "grad_norm": 1.0303642749786377, + "learning_rate": 1.968543542099787e-06, + "loss": 0.2034, + "step": 5085 + }, + { + "epoch": 2.4082864752885467, + "grad_norm": 1.204028606414795, + "learning_rate": 1.9654976990112184e-06, + "loss": 0.1955, + "step": 5086 + }, + { + "epoch": 2.408759988162178, + "grad_norm": 1.6389491558074951, + "learning_rate": 1.9624539572692193e-06, + "loss": 0.2175, + "step": 5087 + }, + { + "epoch": 2.4092335010358092, + "grad_norm": 1.201258897781372, + "learning_rate": 1.9594123176698467e-06, + "loss": 0.1972, + "step": 5088 + }, + { + "epoch": 2.4097070139094408, + "grad_norm": 1.1844000816345215, + "learning_rate": 1.9563727810086155e-06, + "loss": 0.193, + "step": 5089 + }, + { + "epoch": 2.410180526783072, + "grad_norm": 1.0625587701797485, + "learning_rate": 1.953335348080484e-06, + "loss": 0.2036, + "step": 5090 + }, + { + "epoch": 2.4106540396567033, + "grad_norm": 1.2302863597869873, + "learning_rate": 1.950300019679863e-06, + "loss": 0.2189, + "step": 5091 + }, + { + "epoch": 2.4111275525303344, + "grad_norm": 0.8708613514900208, + "learning_rate": 1.9472667966006177e-06, + "loss": 0.1779, + "step": 5092 + }, + { + "epoch": 2.411601065403966, + "grad_norm": 1.382256031036377, + "learning_rate": 1.944235679636053e-06, + "loss": 0.1904, + "step": 5093 + }, + { + "epoch": 2.412074578277597, + "grad_norm": 1.4713544845581055, + "learning_rate": 1.941206669578933e-06, + "loss": 0.218, + "step": 5094 + }, + { + "epoch": 2.412548091151228, + "grad_norm": 1.513837456703186, + "learning_rate": 1.9381797672214618e-06, + "loss": 0.201, + "step": 5095 + }, + { + "epoch": 2.4130216040248595, + "grad_norm": 1.2420634031295776, + "learning_rate": 1.935154973355302e-06, + "loss": 0.225, + "step": 5096 + }, + { + "epoch": 2.4134951168984906, + "grad_norm": 1.2236741781234741, + "learning_rate": 1.9321322887715533e-06, + "loss": 0.1928, + "step": 5097 + }, + { + "epoch": 2.413968629772122, + "grad_norm": 1.1855653524398804, + "learning_rate": 1.929111714260774e-06, + "loss": 0.2099, + "step": 5098 + }, + { + "epoch": 2.414442142645753, + "grad_norm": 1.738868236541748, + "learning_rate": 1.9260932506129647e-06, + "loss": 0.1922, + "step": 5099 + }, + { + "epoch": 2.4149156555193843, + "grad_norm": 1.1198941469192505, + "learning_rate": 1.923076898617574e-06, + "loss": 0.2124, + "step": 5100 + }, + { + "epoch": 2.4153891683930158, + "grad_norm": 0.9823029041290283, + "learning_rate": 1.920062659063503e-06, + "loss": 0.2041, + "step": 5101 + }, + { + "epoch": 2.415862681266647, + "grad_norm": 1.4813225269317627, + "learning_rate": 1.917050532739092e-06, + "loss": 0.2001, + "step": 5102 + }, + { + "epoch": 2.4163361941402783, + "grad_norm": 1.1506394147872925, + "learning_rate": 1.9140405204321387e-06, + "loss": 0.1975, + "step": 5103 + }, + { + "epoch": 2.4168097070139094, + "grad_norm": 1.1538875102996826, + "learning_rate": 1.911032622929879e-06, + "loss": 0.1927, + "step": 5104 + }, + { + "epoch": 2.4172832198875405, + "grad_norm": 1.162630558013916, + "learning_rate": 1.908026841019003e-06, + "loss": 0.2028, + "step": 5105 + }, + { + "epoch": 2.417756732761172, + "grad_norm": 1.90775465965271, + "learning_rate": 1.9050231754856407e-06, + "loss": 0.2086, + "step": 5106 + }, + { + "epoch": 2.418230245634803, + "grad_norm": 1.8542652130126953, + "learning_rate": 1.9020216271153747e-06, + "loss": 0.2132, + "step": 5107 + }, + { + "epoch": 2.4187037585084346, + "grad_norm": 1.7880303859710693, + "learning_rate": 1.8990221966932266e-06, + "loss": 0.207, + "step": 5108 + }, + { + "epoch": 2.4191772713820656, + "grad_norm": 1.4888118505477905, + "learning_rate": 1.8960248850036722e-06, + "loss": 0.234, + "step": 5109 + }, + { + "epoch": 2.419650784255697, + "grad_norm": 1.0973647832870483, + "learning_rate": 1.8930296928306313e-06, + "loss": 0.1932, + "step": 5110 + }, + { + "epoch": 2.420124297129328, + "grad_norm": 1.4137020111083984, + "learning_rate": 1.8900366209574627e-06, + "loss": 0.199, + "step": 5111 + }, + { + "epoch": 2.4205978100029597, + "grad_norm": 1.0636194944381714, + "learning_rate": 1.8870456701669792e-06, + "loss": 0.1843, + "step": 5112 + }, + { + "epoch": 2.4210713228765908, + "grad_norm": 0.9531354308128357, + "learning_rate": 1.8840568412414318e-06, + "loss": 0.2034, + "step": 5113 + }, + { + "epoch": 2.421544835750222, + "grad_norm": 1.3613364696502686, + "learning_rate": 1.8810701349625237e-06, + "loss": 0.1957, + "step": 5114 + }, + { + "epoch": 2.4220183486238533, + "grad_norm": 1.619372844696045, + "learning_rate": 1.8780855521113983e-06, + "loss": 0.1774, + "step": 5115 + }, + { + "epoch": 2.4224918614974844, + "grad_norm": 1.429699182510376, + "learning_rate": 1.8751030934686398e-06, + "loss": 0.2106, + "step": 5116 + }, + { + "epoch": 2.422965374371116, + "grad_norm": 1.0113288164138794, + "learning_rate": 1.8721227598142876e-06, + "loss": 0.2062, + "step": 5117 + }, + { + "epoch": 2.423438887244747, + "grad_norm": 1.9973174333572388, + "learning_rate": 1.869144551927814e-06, + "loss": 0.2193, + "step": 5118 + }, + { + "epoch": 2.423912400118378, + "grad_norm": 1.051254153251648, + "learning_rate": 1.8661684705881456e-06, + "loss": 0.198, + "step": 5119 + }, + { + "epoch": 2.4243859129920096, + "grad_norm": 1.627144455909729, + "learning_rate": 1.863194516573642e-06, + "loss": 0.1883, + "step": 5120 + }, + { + "epoch": 2.4248594258656406, + "grad_norm": 1.1323554515838623, + "learning_rate": 1.860222690662119e-06, + "loss": 0.2051, + "step": 5121 + }, + { + "epoch": 2.425332938739272, + "grad_norm": 1.0321543216705322, + "learning_rate": 1.8572529936308225e-06, + "loss": 0.2016, + "step": 5122 + }, + { + "epoch": 2.425806451612903, + "grad_norm": 1.293243646621704, + "learning_rate": 1.8542854262564537e-06, + "loss": 0.2199, + "step": 5123 + }, + { + "epoch": 2.4262799644865343, + "grad_norm": 1.043289065361023, + "learning_rate": 1.851319989315149e-06, + "loss": 0.1967, + "step": 5124 + }, + { + "epoch": 2.426753477360166, + "grad_norm": 1.376531720161438, + "learning_rate": 1.8483566835824862e-06, + "loss": 0.2124, + "step": 5125 + }, + { + "epoch": 2.427226990233797, + "grad_norm": 1.3981205224990845, + "learning_rate": 1.8453955098334953e-06, + "loss": 0.2134, + "step": 5126 + }, + { + "epoch": 2.4277005031074284, + "grad_norm": 1.1886367797851562, + "learning_rate": 1.8424364688426365e-06, + "loss": 0.1731, + "step": 5127 + }, + { + "epoch": 2.4281740159810594, + "grad_norm": 0.9211165308952332, + "learning_rate": 1.8394795613838256e-06, + "loss": 0.1854, + "step": 5128 + }, + { + "epoch": 2.428647528854691, + "grad_norm": 1.1072946786880493, + "learning_rate": 1.8365247882304061e-06, + "loss": 0.1827, + "step": 5129 + }, + { + "epoch": 2.429121041728322, + "grad_norm": 0.8516340851783752, + "learning_rate": 1.8335721501551774e-06, + "loss": 0.2105, + "step": 5130 + }, + { + "epoch": 2.429594554601953, + "grad_norm": 1.1309757232666016, + "learning_rate": 1.8306216479303663e-06, + "loss": 0.195, + "step": 5131 + }, + { + "epoch": 2.4300680674755846, + "grad_norm": 1.1593215465545654, + "learning_rate": 1.8276732823276556e-06, + "loss": 0.2171, + "step": 5132 + }, + { + "epoch": 2.4305415803492156, + "grad_norm": 0.949464738368988, + "learning_rate": 1.8247270541181572e-06, + "loss": 0.2146, + "step": 5133 + }, + { + "epoch": 2.431015093222847, + "grad_norm": 1.4666881561279297, + "learning_rate": 1.8217829640724271e-06, + "loss": 0.2085, + "step": 5134 + }, + { + "epoch": 2.431488606096478, + "grad_norm": 1.130859136581421, + "learning_rate": 1.8188410129604684e-06, + "loss": 0.2297, + "step": 5135 + }, + { + "epoch": 2.4319621189701097, + "grad_norm": 1.016351342201233, + "learning_rate": 1.8159012015517152e-06, + "loss": 0.1865, + "step": 5136 + }, + { + "epoch": 2.432435631843741, + "grad_norm": 1.0443476438522339, + "learning_rate": 1.8129635306150517e-06, + "loss": 0.2097, + "step": 5137 + }, + { + "epoch": 2.432909144717372, + "grad_norm": 1.0505341291427612, + "learning_rate": 1.8100280009187931e-06, + "loss": 0.1951, + "step": 5138 + }, + { + "epoch": 2.4333826575910034, + "grad_norm": 1.0258119106292725, + "learning_rate": 1.8070946132307033e-06, + "loss": 0.2119, + "step": 5139 + }, + { + "epoch": 2.4338561704646344, + "grad_norm": 1.3897591829299927, + "learning_rate": 1.804163368317976e-06, + "loss": 0.2081, + "step": 5140 + }, + { + "epoch": 2.434329683338266, + "grad_norm": 1.1299906969070435, + "learning_rate": 1.801234266947256e-06, + "loss": 0.187, + "step": 5141 + }, + { + "epoch": 2.434803196211897, + "grad_norm": 1.4262118339538574, + "learning_rate": 1.798307309884616e-06, + "loss": 0.2022, + "step": 5142 + }, + { + "epoch": 2.435276709085528, + "grad_norm": 1.6670175790786743, + "learning_rate": 1.795382497895578e-06, + "loss": 0.2016, + "step": 5143 + }, + { + "epoch": 2.4357502219591596, + "grad_norm": 1.0870014429092407, + "learning_rate": 1.792459831745097e-06, + "loss": 0.205, + "step": 5144 + }, + { + "epoch": 2.4362237348327906, + "grad_norm": 1.5366194248199463, + "learning_rate": 1.7895393121975646e-06, + "loss": 0.1978, + "step": 5145 + }, + { + "epoch": 2.436697247706422, + "grad_norm": 1.1896754503250122, + "learning_rate": 1.7866209400168211e-06, + "loss": 0.2016, + "step": 5146 + }, + { + "epoch": 2.437170760580053, + "grad_norm": 1.7006498575210571, + "learning_rate": 1.7837047159661302e-06, + "loss": 0.1941, + "step": 5147 + }, + { + "epoch": 2.4376442734536843, + "grad_norm": 1.0235669612884521, + "learning_rate": 1.7807906408082087e-06, + "loss": 0.2014, + "step": 5148 + }, + { + "epoch": 2.438117786327316, + "grad_norm": 1.008035659790039, + "learning_rate": 1.7778787153052045e-06, + "loss": 0.2089, + "step": 5149 + }, + { + "epoch": 2.438591299200947, + "grad_norm": 1.0255095958709717, + "learning_rate": 1.7749689402186998e-06, + "loss": 0.1944, + "step": 5150 + }, + { + "epoch": 2.4390648120745784, + "grad_norm": 1.1309661865234375, + "learning_rate": 1.7720613163097233e-06, + "loss": 0.1999, + "step": 5151 + }, + { + "epoch": 2.4395383249482094, + "grad_norm": 1.1396714448928833, + "learning_rate": 1.7691558443387302e-06, + "loss": 0.1979, + "step": 5152 + }, + { + "epoch": 2.440011837821841, + "grad_norm": 1.2275151014328003, + "learning_rate": 1.766252525065625e-06, + "loss": 0.1792, + "step": 5153 + }, + { + "epoch": 2.440485350695472, + "grad_norm": 1.1925066709518433, + "learning_rate": 1.7633513592497354e-06, + "loss": 0.2024, + "step": 5154 + }, + { + "epoch": 2.4409588635691035, + "grad_norm": 0.9196386933326721, + "learning_rate": 1.7604523476498413e-06, + "loss": 0.1739, + "step": 5155 + }, + { + "epoch": 2.4414323764427346, + "grad_norm": 1.4410473108291626, + "learning_rate": 1.7575554910241444e-06, + "loss": 0.1839, + "step": 5156 + }, + { + "epoch": 2.4419058893163657, + "grad_norm": 1.2430617809295654, + "learning_rate": 1.7546607901302948e-06, + "loss": 0.2111, + "step": 5157 + }, + { + "epoch": 2.442379402189997, + "grad_norm": 1.0924980640411377, + "learning_rate": 1.7517682457253715e-06, + "loss": 0.218, + "step": 5158 + }, + { + "epoch": 2.4428529150636282, + "grad_norm": 1.1421464681625366, + "learning_rate": 1.7488778585658894e-06, + "loss": 0.2173, + "step": 5159 + }, + { + "epoch": 2.4433264279372597, + "grad_norm": 1.0318294763565063, + "learning_rate": 1.745989629407806e-06, + "loss": 0.2207, + "step": 5160 + }, + { + "epoch": 2.443799940810891, + "grad_norm": 1.0395057201385498, + "learning_rate": 1.7431035590065037e-06, + "loss": 0.174, + "step": 5161 + }, + { + "epoch": 2.444273453684522, + "grad_norm": 1.2233282327651978, + "learning_rate": 1.7402196481168132e-06, + "loss": 0.2084, + "step": 5162 + }, + { + "epoch": 2.4447469665581534, + "grad_norm": 1.3540483713150024, + "learning_rate": 1.7373378974929878e-06, + "loss": 0.1778, + "step": 5163 + }, + { + "epoch": 2.4452204794317844, + "grad_norm": 1.648417353630066, + "learning_rate": 1.7344583078887255e-06, + "loss": 0.2016, + "step": 5164 + }, + { + "epoch": 2.445693992305416, + "grad_norm": 1.0285402536392212, + "learning_rate": 1.731580880057152e-06, + "loss": 0.2081, + "step": 5165 + }, + { + "epoch": 2.446167505179047, + "grad_norm": 1.1315886974334717, + "learning_rate": 1.7287056147508353e-06, + "loss": 0.2134, + "step": 5166 + }, + { + "epoch": 2.446641018052678, + "grad_norm": 1.0791397094726562, + "learning_rate": 1.7258325127217668e-06, + "loss": 0.1921, + "step": 5167 + }, + { + "epoch": 2.4471145309263096, + "grad_norm": 1.0540249347686768, + "learning_rate": 1.7229615747213858e-06, + "loss": 0.2018, + "step": 5168 + }, + { + "epoch": 2.4475880437999407, + "grad_norm": 0.8960132002830505, + "learning_rate": 1.7200928015005546e-06, + "loss": 0.1999, + "step": 5169 + }, + { + "epoch": 2.448061556673572, + "grad_norm": 1.0253756046295166, + "learning_rate": 1.7172261938095713e-06, + "loss": 0.202, + "step": 5170 + }, + { + "epoch": 2.4485350695472032, + "grad_norm": 1.6087161302566528, + "learning_rate": 1.7143617523981737e-06, + "loss": 0.208, + "step": 5171 + }, + { + "epoch": 2.4490085824208347, + "grad_norm": 1.3046700954437256, + "learning_rate": 1.7114994780155236e-06, + "loss": 0.2251, + "step": 5172 + }, + { + "epoch": 2.449482095294466, + "grad_norm": 0.9480929374694824, + "learning_rate": 1.7086393714102278e-06, + "loss": 0.1973, + "step": 5173 + }, + { + "epoch": 2.449955608168097, + "grad_norm": 1.570953369140625, + "learning_rate": 1.7057814333303146e-06, + "loss": 0.1994, + "step": 5174 + }, + { + "epoch": 2.4504291210417284, + "grad_norm": 0.91864013671875, + "learning_rate": 1.7029256645232529e-06, + "loss": 0.1762, + "step": 5175 + }, + { + "epoch": 2.4509026339153595, + "grad_norm": 1.4908578395843506, + "learning_rate": 1.7000720657359383e-06, + "loss": 0.2058, + "step": 5176 + }, + { + "epoch": 2.451376146788991, + "grad_norm": 1.3648159503936768, + "learning_rate": 1.6972206377147072e-06, + "loss": 0.2089, + "step": 5177 + }, + { + "epoch": 2.451849659662622, + "grad_norm": 1.1475952863693237, + "learning_rate": 1.6943713812053185e-06, + "loss": 0.2, + "step": 5178 + }, + { + "epoch": 2.4523231725362535, + "grad_norm": 1.2770181894302368, + "learning_rate": 1.6915242969529676e-06, + "loss": 0.2011, + "step": 5179 + }, + { + "epoch": 2.4527966854098846, + "grad_norm": 0.8871325850486755, + "learning_rate": 1.6886793857022866e-06, + "loss": 0.1983, + "step": 5180 + }, + { + "epoch": 2.4532701982835157, + "grad_norm": 1.2607122659683228, + "learning_rate": 1.6858366481973288e-06, + "loss": 0.1899, + "step": 5181 + }, + { + "epoch": 2.453743711157147, + "grad_norm": 0.8917462825775146, + "learning_rate": 1.6829960851815896e-06, + "loss": 0.1821, + "step": 5182 + }, + { + "epoch": 2.4542172240307782, + "grad_norm": 0.9355318546295166, + "learning_rate": 1.680157697397986e-06, + "loss": 0.1975, + "step": 5183 + }, + { + "epoch": 2.4546907369044098, + "grad_norm": 2.099505662918091, + "learning_rate": 1.6773214855888765e-06, + "loss": 0.1737, + "step": 5184 + }, + { + "epoch": 2.455164249778041, + "grad_norm": 1.3527806997299194, + "learning_rate": 1.6744874504960395e-06, + "loss": 0.1821, + "step": 5185 + }, + { + "epoch": 2.455637762651672, + "grad_norm": 1.080901026725769, + "learning_rate": 1.6716555928606959e-06, + "loss": 0.1998, + "step": 5186 + }, + { + "epoch": 2.4561112755253034, + "grad_norm": 1.0348796844482422, + "learning_rate": 1.668825913423483e-06, + "loss": 0.204, + "step": 5187 + }, + { + "epoch": 2.4565847883989345, + "grad_norm": 1.0990900993347168, + "learning_rate": 1.665998412924481e-06, + "loss": 0.1958, + "step": 5188 + }, + { + "epoch": 2.457058301272566, + "grad_norm": 1.2333770990371704, + "learning_rate": 1.6631730921031964e-06, + "loss": 0.2101, + "step": 5189 + }, + { + "epoch": 2.457531814146197, + "grad_norm": 1.7082300186157227, + "learning_rate": 1.660349951698561e-06, + "loss": 0.1913, + "step": 5190 + }, + { + "epoch": 2.4580053270198285, + "grad_norm": 1.0094937086105347, + "learning_rate": 1.6575289924489435e-06, + "loss": 0.1842, + "step": 5191 + }, + { + "epoch": 2.4584788398934596, + "grad_norm": 1.1216864585876465, + "learning_rate": 1.6547102150921346e-06, + "loss": 0.2108, + "step": 5192 + }, + { + "epoch": 2.4589523527670907, + "grad_norm": 2.20843505859375, + "learning_rate": 1.6518936203653636e-06, + "loss": 0.184, + "step": 5193 + }, + { + "epoch": 2.459425865640722, + "grad_norm": 0.8962819576263428, + "learning_rate": 1.6490792090052799e-06, + "loss": 0.2049, + "step": 5194 + }, + { + "epoch": 2.4598993785143533, + "grad_norm": 1.1314330101013184, + "learning_rate": 1.6462669817479638e-06, + "loss": 0.21, + "step": 5195 + }, + { + "epoch": 2.4603728913879848, + "grad_norm": 1.110121726989746, + "learning_rate": 1.6434569393289313e-06, + "loss": 0.1899, + "step": 5196 + }, + { + "epoch": 2.460846404261616, + "grad_norm": 1.1242990493774414, + "learning_rate": 1.6406490824831166e-06, + "loss": 0.2234, + "step": 5197 + }, + { + "epoch": 2.4613199171352473, + "grad_norm": 0.946533739566803, + "learning_rate": 1.6378434119448939e-06, + "loss": 0.1964, + "step": 5198 + }, + { + "epoch": 2.4617934300088784, + "grad_norm": 1.0624065399169922, + "learning_rate": 1.6350399284480523e-06, + "loss": 0.1787, + "step": 5199 + }, + { + "epoch": 2.4622669428825095, + "grad_norm": 0.9491204619407654, + "learning_rate": 1.632238632725821e-06, + "loss": 0.1927, + "step": 5200 + }, + { + "epoch": 2.462740455756141, + "grad_norm": 1.2071702480316162, + "learning_rate": 1.6294395255108487e-06, + "loss": 0.1941, + "step": 5201 + }, + { + "epoch": 2.463213968629772, + "grad_norm": 1.1479326486587524, + "learning_rate": 1.6266426075352182e-06, + "loss": 0.2133, + "step": 5202 + }, + { + "epoch": 2.4636874815034036, + "grad_norm": 0.9430690407752991, + "learning_rate": 1.6238478795304346e-06, + "loss": 0.2007, + "step": 5203 + }, + { + "epoch": 2.4641609943770346, + "grad_norm": 1.0291746854782104, + "learning_rate": 1.62105534222743e-06, + "loss": 0.1892, + "step": 5204 + }, + { + "epoch": 2.4646345072506657, + "grad_norm": 1.4124772548675537, + "learning_rate": 1.61826499635657e-06, + "loss": 0.1997, + "step": 5205 + }, + { + "epoch": 2.465108020124297, + "grad_norm": 1.0831562280654907, + "learning_rate": 1.6154768426476375e-06, + "loss": 0.2015, + "step": 5206 + }, + { + "epoch": 2.4655815329979283, + "grad_norm": 1.5308729410171509, + "learning_rate": 1.6126908818298514e-06, + "loss": 0.1886, + "step": 5207 + }, + { + "epoch": 2.4660550458715598, + "grad_norm": 1.0939233303070068, + "learning_rate": 1.6099071146318502e-06, + "loss": 0.2037, + "step": 5208 + }, + { + "epoch": 2.466528558745191, + "grad_norm": 1.4337128400802612, + "learning_rate": 1.6071255417817045e-06, + "loss": 0.1942, + "step": 5209 + }, + { + "epoch": 2.467002071618822, + "grad_norm": 1.2987715005874634, + "learning_rate": 1.6043461640069025e-06, + "loss": 0.1953, + "step": 5210 + }, + { + "epoch": 2.4674755844924534, + "grad_norm": 1.0721392631530762, + "learning_rate": 1.6015689820343705e-06, + "loss": 0.2063, + "step": 5211 + }, + { + "epoch": 2.4679490973660845, + "grad_norm": 1.0339511632919312, + "learning_rate": 1.5987939965904498e-06, + "loss": 0.1766, + "step": 5212 + }, + { + "epoch": 2.468422610239716, + "grad_norm": 1.3787801265716553, + "learning_rate": 1.5960212084009097e-06, + "loss": 0.2037, + "step": 5213 + }, + { + "epoch": 2.468896123113347, + "grad_norm": 1.1688666343688965, + "learning_rate": 1.593250618190949e-06, + "loss": 0.1851, + "step": 5214 + }, + { + "epoch": 2.4693696359869786, + "grad_norm": 1.2239266633987427, + "learning_rate": 1.590482226685186e-06, + "loss": 0.2122, + "step": 5215 + }, + { + "epoch": 2.4698431488606096, + "grad_norm": 1.1540725231170654, + "learning_rate": 1.5877160346076714e-06, + "loss": 0.213, + "step": 5216 + }, + { + "epoch": 2.470316661734241, + "grad_norm": 0.8362421989440918, + "learning_rate": 1.584952042681871e-06, + "loss": 0.1734, + "step": 5217 + }, + { + "epoch": 2.470790174607872, + "grad_norm": 1.1025753021240234, + "learning_rate": 1.5821902516306842e-06, + "loss": 0.1877, + "step": 5218 + }, + { + "epoch": 2.4712636874815033, + "grad_norm": 1.23174250125885, + "learning_rate": 1.5794306621764265e-06, + "loss": 0.1998, + "step": 5219 + }, + { + "epoch": 2.471737200355135, + "grad_norm": 1.2430462837219238, + "learning_rate": 1.5766732750408465e-06, + "loss": 0.2024, + "step": 5220 + }, + { + "epoch": 2.472210713228766, + "grad_norm": 1.0162687301635742, + "learning_rate": 1.573918090945109e-06, + "loss": 0.2102, + "step": 5221 + }, + { + "epoch": 2.4726842261023974, + "grad_norm": 1.1504173278808594, + "learning_rate": 1.571165110609808e-06, + "loss": 0.2077, + "step": 5222 + }, + { + "epoch": 2.4731577389760284, + "grad_norm": 0.9471516609191895, + "learning_rate": 1.5684143347549586e-06, + "loss": 0.1864, + "step": 5223 + }, + { + "epoch": 2.4736312518496595, + "grad_norm": 1.431763768196106, + "learning_rate": 1.5656657640999973e-06, + "loss": 0.1995, + "step": 5224 + }, + { + "epoch": 2.474104764723291, + "grad_norm": 1.2276103496551514, + "learning_rate": 1.562919399363787e-06, + "loss": 0.2024, + "step": 5225 + }, + { + "epoch": 2.474578277596922, + "grad_norm": 1.1695070266723633, + "learning_rate": 1.5601752412646143e-06, + "loss": 0.1863, + "step": 5226 + }, + { + "epoch": 2.4750517904705536, + "grad_norm": 1.6259397268295288, + "learning_rate": 1.5574332905201883e-06, + "loss": 0.1961, + "step": 5227 + }, + { + "epoch": 2.4755253033441846, + "grad_norm": 1.0487055778503418, + "learning_rate": 1.554693547847639e-06, + "loss": 0.1864, + "step": 5228 + }, + { + "epoch": 2.4759988162178157, + "grad_norm": 1.0676391124725342, + "learning_rate": 1.551956013963517e-06, + "loss": 0.2179, + "step": 5229 + }, + { + "epoch": 2.476472329091447, + "grad_norm": 1.1580907106399536, + "learning_rate": 1.5492206895838013e-06, + "loss": 0.198, + "step": 5230 + }, + { + "epoch": 2.4769458419650783, + "grad_norm": 1.0551162958145142, + "learning_rate": 1.546487575423886e-06, + "loss": 0.1941, + "step": 5231 + }, + { + "epoch": 2.47741935483871, + "grad_norm": 1.4372506141662598, + "learning_rate": 1.5437566721985952e-06, + "loss": 0.1876, + "step": 5232 + }, + { + "epoch": 2.477892867712341, + "grad_norm": 1.0641188621520996, + "learning_rate": 1.5410279806221662e-06, + "loss": 0.2156, + "step": 5233 + }, + { + "epoch": 2.4783663805859724, + "grad_norm": 1.8951334953308105, + "learning_rate": 1.5383015014082659e-06, + "loss": 0.1862, + "step": 5234 + }, + { + "epoch": 2.4788398934596034, + "grad_norm": 1.3256562948226929, + "learning_rate": 1.5355772352699738e-06, + "loss": 0.1894, + "step": 5235 + }, + { + "epoch": 2.4793134063332345, + "grad_norm": 0.990827739238739, + "learning_rate": 1.5328551829198e-06, + "loss": 0.1896, + "step": 5236 + }, + { + "epoch": 2.479786919206866, + "grad_norm": 1.4543442726135254, + "learning_rate": 1.53013534506967e-06, + "loss": 0.2072, + "step": 5237 + }, + { + "epoch": 2.480260432080497, + "grad_norm": 1.17643141746521, + "learning_rate": 1.5274177224309273e-06, + "loss": 0.2118, + "step": 5238 + }, + { + "epoch": 2.4807339449541286, + "grad_norm": 1.1279579401016235, + "learning_rate": 1.5247023157143459e-06, + "loss": 0.2043, + "step": 5239 + }, + { + "epoch": 2.4812074578277596, + "grad_norm": 1.2052768468856812, + "learning_rate": 1.5219891256301079e-06, + "loss": 0.2013, + "step": 5240 + }, + { + "epoch": 2.481680970701391, + "grad_norm": 1.2631618976593018, + "learning_rate": 1.5192781528878285e-06, + "loss": 0.2011, + "step": 5241 + }, + { + "epoch": 2.482154483575022, + "grad_norm": 1.0374186038970947, + "learning_rate": 1.5165693981965302e-06, + "loss": 0.1748, + "step": 5242 + }, + { + "epoch": 2.4826279964486533, + "grad_norm": 1.1341975927352905, + "learning_rate": 1.513862862264668e-06, + "loss": 0.214, + "step": 5243 + }, + { + "epoch": 2.483101509322285, + "grad_norm": 0.9370356202125549, + "learning_rate": 1.5111585458001032e-06, + "loss": 0.1778, + "step": 5244 + }, + { + "epoch": 2.483575022195916, + "grad_norm": 1.78024423122406, + "learning_rate": 1.5084564495101306e-06, + "loss": 0.1897, + "step": 5245 + }, + { + "epoch": 2.4840485350695474, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.5057565741014513e-06, + "loss": 0.2309, + "step": 5246 + }, + { + "epoch": 2.4845220479431784, + "grad_norm": 1.8832025527954102, + "learning_rate": 1.5030589202801982e-06, + "loss": 0.2073, + "step": 5247 + }, + { + "epoch": 2.4849955608168095, + "grad_norm": 1.0782551765441895, + "learning_rate": 1.5003634887519126e-06, + "loss": 0.2115, + "step": 5248 + }, + { + "epoch": 2.485469073690441, + "grad_norm": 2.393939733505249, + "learning_rate": 1.497670280221556e-06, + "loss": 0.202, + "step": 5249 + }, + { + "epoch": 2.485942586564072, + "grad_norm": 1.293050765991211, + "learning_rate": 1.4949792953935172e-06, + "loss": 0.1859, + "step": 5250 + }, + { + "epoch": 2.4864160994377036, + "grad_norm": 1.0516685247421265, + "learning_rate": 1.4922905349715922e-06, + "loss": 0.2269, + "step": 5251 + }, + { + "epoch": 2.4868896123113347, + "grad_norm": 0.9482864141464233, + "learning_rate": 1.489603999659004e-06, + "loss": 0.1947, + "step": 5252 + }, + { + "epoch": 2.4873631251849657, + "grad_norm": 1.2541604042053223, + "learning_rate": 1.486919690158386e-06, + "loss": 0.2006, + "step": 5253 + }, + { + "epoch": 2.4878366380585972, + "grad_norm": 1.3161418437957764, + "learning_rate": 1.4842376071717989e-06, + "loss": 0.2067, + "step": 5254 + }, + { + "epoch": 2.4883101509322283, + "grad_norm": 1.2702239751815796, + "learning_rate": 1.4815577514007106e-06, + "loss": 0.1866, + "step": 5255 + }, + { + "epoch": 2.48878366380586, + "grad_norm": 1.2881168127059937, + "learning_rate": 1.478880123546015e-06, + "loss": 0.1928, + "step": 5256 + }, + { + "epoch": 2.489257176679491, + "grad_norm": 1.0193349123001099, + "learning_rate": 1.476204724308019e-06, + "loss": 0.2026, + "step": 5257 + }, + { + "epoch": 2.4897306895531224, + "grad_norm": 1.3195552825927734, + "learning_rate": 1.4735315543864436e-06, + "loss": 0.1803, + "step": 5258 + }, + { + "epoch": 2.4902042024267534, + "grad_norm": 1.2184456586837769, + "learning_rate": 1.4708606144804371e-06, + "loss": 0.2057, + "step": 5259 + }, + { + "epoch": 2.490677715300385, + "grad_norm": 1.221314549446106, + "learning_rate": 1.468191905288553e-06, + "loss": 0.1912, + "step": 5260 + }, + { + "epoch": 2.491151228174016, + "grad_norm": 1.2005162239074707, + "learning_rate": 1.4655254275087693e-06, + "loss": 0.2101, + "step": 5261 + }, + { + "epoch": 2.491624741047647, + "grad_norm": 2.1400723457336426, + "learning_rate": 1.4628611818384753e-06, + "loss": 0.1901, + "step": 5262 + }, + { + "epoch": 2.4920982539212786, + "grad_norm": 1.6290489435195923, + "learning_rate": 1.460199168974481e-06, + "loss": 0.1929, + "step": 5263 + }, + { + "epoch": 2.4925717667949097, + "grad_norm": 1.020664095878601, + "learning_rate": 1.4575393896130073e-06, + "loss": 0.1879, + "step": 5264 + }, + { + "epoch": 2.493045279668541, + "grad_norm": 0.9219028353691101, + "learning_rate": 1.454881844449697e-06, + "loss": 0.2014, + "step": 5265 + }, + { + "epoch": 2.4935187925421722, + "grad_norm": 1.7500580549240112, + "learning_rate": 1.4522265341796048e-06, + "loss": 0.2044, + "step": 5266 + }, + { + "epoch": 2.4939923054158033, + "grad_norm": 1.1425151824951172, + "learning_rate": 1.4495734594971988e-06, + "loss": 0.2053, + "step": 5267 + }, + { + "epoch": 2.494465818289435, + "grad_norm": 1.0946452617645264, + "learning_rate": 1.4469226210963693e-06, + "loss": 0.2082, + "step": 5268 + }, + { + "epoch": 2.494939331163066, + "grad_norm": 1.198346495628357, + "learning_rate": 1.444274019670413e-06, + "loss": 0.1991, + "step": 5269 + }, + { + "epoch": 2.4954128440366974, + "grad_norm": 1.0651274919509888, + "learning_rate": 1.4416276559120511e-06, + "loss": 0.2051, + "step": 5270 + }, + { + "epoch": 2.4958863569103285, + "grad_norm": 1.0308165550231934, + "learning_rate": 1.4389835305134092e-06, + "loss": 0.1973, + "step": 5271 + }, + { + "epoch": 2.4963598697839595, + "grad_norm": 1.7552647590637207, + "learning_rate": 1.436341644166037e-06, + "loss": 0.2061, + "step": 5272 + }, + { + "epoch": 2.496833382657591, + "grad_norm": 1.7813093662261963, + "learning_rate": 1.4337019975608934e-06, + "loss": 0.1912, + "step": 5273 + }, + { + "epoch": 2.497306895531222, + "grad_norm": 1.5826293230056763, + "learning_rate": 1.4310645913883493e-06, + "loss": 0.2167, + "step": 5274 + }, + { + "epoch": 2.4977804084048536, + "grad_norm": 0.9368717670440674, + "learning_rate": 1.4284294263381982e-06, + "loss": 0.2067, + "step": 5275 + }, + { + "epoch": 2.4982539212784847, + "grad_norm": 1.000964879989624, + "learning_rate": 1.4257965030996357e-06, + "loss": 0.1868, + "step": 5276 + }, + { + "epoch": 2.498727434152116, + "grad_norm": 1.1660606861114502, + "learning_rate": 1.4231658223612842e-06, + "loss": 0.2016, + "step": 5277 + }, + { + "epoch": 2.4992009470257472, + "grad_norm": 1.216518521308899, + "learning_rate": 1.420537384811167e-06, + "loss": 0.2274, + "step": 5278 + }, + { + "epoch": 2.4996744598993788, + "grad_norm": 1.3850187063217163, + "learning_rate": 1.4179111911367315e-06, + "loss": 0.2104, + "step": 5279 + }, + { + "epoch": 2.50014797277301, + "grad_norm": 1.306488275527954, + "learning_rate": 1.4152872420248288e-06, + "loss": 0.2175, + "step": 5280 + }, + { + "epoch": 2.500621485646641, + "grad_norm": 1.0250792503356934, + "learning_rate": 1.4126655381617327e-06, + "loss": 0.181, + "step": 5281 + }, + { + "epoch": 2.5010949985202724, + "grad_norm": 1.5189791917800903, + "learning_rate": 1.4100460802331205e-06, + "loss": 0.2075, + "step": 5282 + }, + { + "epoch": 2.5015685113939035, + "grad_norm": 1.1778138875961304, + "learning_rate": 1.4074288689240856e-06, + "loss": 0.1985, + "step": 5283 + }, + { + "epoch": 2.502042024267535, + "grad_norm": 1.0283559560775757, + "learning_rate": 1.4048139049191389e-06, + "loss": 0.2098, + "step": 5284 + }, + { + "epoch": 2.502515537141166, + "grad_norm": 0.9769028425216675, + "learning_rate": 1.4022011889021936e-06, + "loss": 0.2068, + "step": 5285 + }, + { + "epoch": 2.502989050014797, + "grad_norm": 1.1810842752456665, + "learning_rate": 1.399590721556584e-06, + "loss": 0.2083, + "step": 5286 + }, + { + "epoch": 2.5034625628884286, + "grad_norm": 1.018278956413269, + "learning_rate": 1.396982503565051e-06, + "loss": 0.2003, + "step": 5287 + }, + { + "epoch": 2.5039360757620597, + "grad_norm": 1.1597832441329956, + "learning_rate": 1.3943765356097505e-06, + "loss": 0.1902, + "step": 5288 + }, + { + "epoch": 2.504409588635691, + "grad_norm": 1.001120686531067, + "learning_rate": 1.3917728183722456e-06, + "loss": 0.1733, + "step": 5289 + }, + { + "epoch": 2.5048831015093223, + "grad_norm": 1.1091008186340332, + "learning_rate": 1.389171352533517e-06, + "loss": 0.2251, + "step": 5290 + }, + { + "epoch": 2.5053566143829533, + "grad_norm": 0.9414849877357483, + "learning_rate": 1.3865721387739507e-06, + "loss": 0.1797, + "step": 5291 + }, + { + "epoch": 2.505830127256585, + "grad_norm": 1.9510407447814941, + "learning_rate": 1.3839751777733445e-06, + "loss": 0.1881, + "step": 5292 + }, + { + "epoch": 2.506303640130216, + "grad_norm": 1.3246495723724365, + "learning_rate": 1.3813804702109124e-06, + "loss": 0.2084, + "step": 5293 + }, + { + "epoch": 2.5067771530038474, + "grad_norm": 1.0592890977859497, + "learning_rate": 1.37878801676527e-06, + "loss": 0.2072, + "step": 5294 + }, + { + "epoch": 2.5072506658774785, + "grad_norm": 1.3438637256622314, + "learning_rate": 1.3761978181144542e-06, + "loss": 0.2262, + "step": 5295 + }, + { + "epoch": 2.5077241787511095, + "grad_norm": 1.0453280210494995, + "learning_rate": 1.373609874935903e-06, + "loss": 0.1857, + "step": 5296 + }, + { + "epoch": 2.508197691624741, + "grad_norm": 1.3020319938659668, + "learning_rate": 1.3710241879064689e-06, + "loss": 0.1875, + "step": 5297 + }, + { + "epoch": 2.5086712044983726, + "grad_norm": 1.245324969291687, + "learning_rate": 1.3684407577024116e-06, + "loss": 0.1978, + "step": 5298 + }, + { + "epoch": 2.5091447173720036, + "grad_norm": 1.0261917114257812, + "learning_rate": 1.3658595849994072e-06, + "loss": 0.199, + "step": 5299 + }, + { + "epoch": 2.5096182302456347, + "grad_norm": 1.699816107749939, + "learning_rate": 1.36328067047253e-06, + "loss": 0.2069, + "step": 5300 + }, + { + "epoch": 2.510091743119266, + "grad_norm": 2.2417402267456055, + "learning_rate": 1.360704014796277e-06, + "loss": 0.1876, + "step": 5301 + }, + { + "epoch": 2.5105652559928973, + "grad_norm": 0.9877926707267761, + "learning_rate": 1.3581296186445426e-06, + "loss": 0.1888, + "step": 5302 + }, + { + "epoch": 2.5110387688665288, + "grad_norm": 1.183518648147583, + "learning_rate": 1.3555574826906337e-06, + "loss": 0.1995, + "step": 5303 + }, + { + "epoch": 2.51151228174016, + "grad_norm": 1.0759626626968384, + "learning_rate": 1.3529876076072746e-06, + "loss": 0.1998, + "step": 5304 + }, + { + "epoch": 2.511985794613791, + "grad_norm": 1.130675196647644, + "learning_rate": 1.3504199940665852e-06, + "loss": 0.1963, + "step": 5305 + }, + { + "epoch": 2.5124593074874224, + "grad_norm": 1.4707733392715454, + "learning_rate": 1.347854642740104e-06, + "loss": 0.216, + "step": 5306 + }, + { + "epoch": 2.5129328203610535, + "grad_norm": 1.2437856197357178, + "learning_rate": 1.3452915542987732e-06, + "loss": 0.2187, + "step": 5307 + }, + { + "epoch": 2.513406333234685, + "grad_norm": 1.0514782667160034, + "learning_rate": 1.3427307294129411e-06, + "loss": 0.2178, + "step": 5308 + }, + { + "epoch": 2.513879846108316, + "grad_norm": 1.6245321035385132, + "learning_rate": 1.3401721687523706e-06, + "loss": 0.2095, + "step": 5309 + }, + { + "epoch": 2.514353358981947, + "grad_norm": 1.0891262292861938, + "learning_rate": 1.3376158729862232e-06, + "loss": 0.1807, + "step": 5310 + }, + { + "epoch": 2.5148268718555786, + "grad_norm": 1.054901361465454, + "learning_rate": 1.3350618427830796e-06, + "loss": 0.2074, + "step": 5311 + }, + { + "epoch": 2.5153003847292097, + "grad_norm": 1.5967611074447632, + "learning_rate": 1.3325100788109168e-06, + "loss": 0.1997, + "step": 5312 + }, + { + "epoch": 2.515773897602841, + "grad_norm": 1.1578254699707031, + "learning_rate": 1.3299605817371285e-06, + "loss": 0.2056, + "step": 5313 + }, + { + "epoch": 2.5162474104764723, + "grad_norm": 0.9452877044677734, + "learning_rate": 1.327413352228506e-06, + "loss": 0.2101, + "step": 5314 + }, + { + "epoch": 2.5167209233501033, + "grad_norm": 1.0103105306625366, + "learning_rate": 1.3248683909512584e-06, + "loss": 0.1875, + "step": 5315 + }, + { + "epoch": 2.517194436223735, + "grad_norm": 1.4317337274551392, + "learning_rate": 1.322325698570992e-06, + "loss": 0.1962, + "step": 5316 + }, + { + "epoch": 2.5176679490973664, + "grad_norm": 1.23985755443573, + "learning_rate": 1.3197852757527219e-06, + "loss": 0.2054, + "step": 5317 + }, + { + "epoch": 2.5181414619709974, + "grad_norm": 1.0402408838272095, + "learning_rate": 1.3172471231608753e-06, + "loss": 0.1856, + "step": 5318 + }, + { + "epoch": 2.5186149748446285, + "grad_norm": 1.3654496669769287, + "learning_rate": 1.3147112414592777e-06, + "loss": 0.2111, + "step": 5319 + }, + { + "epoch": 2.51908848771826, + "grad_norm": 1.1006624698638916, + "learning_rate": 1.312177631311169e-06, + "loss": 0.2167, + "step": 5320 + }, + { + "epoch": 2.519562000591891, + "grad_norm": 0.9989845752716064, + "learning_rate": 1.3096462933791853e-06, + "loss": 0.1995, + "step": 5321 + }, + { + "epoch": 2.5200355134655226, + "grad_norm": 1.1869349479675293, + "learning_rate": 1.3071172283253786e-06, + "loss": 0.2215, + "step": 5322 + }, + { + "epoch": 2.5205090263391536, + "grad_norm": 0.9278461933135986, + "learning_rate": 1.3045904368111973e-06, + "loss": 0.194, + "step": 5323 + }, + { + "epoch": 2.5209825392127847, + "grad_norm": 1.345420241355896, + "learning_rate": 1.3020659194975028e-06, + "loss": 0.2158, + "step": 5324 + }, + { + "epoch": 2.521456052086416, + "grad_norm": 1.0108526945114136, + "learning_rate": 1.2995436770445547e-06, + "loss": 0.1979, + "step": 5325 + }, + { + "epoch": 2.5219295649600473, + "grad_norm": 1.0445326566696167, + "learning_rate": 1.2970237101120253e-06, + "loss": 0.1861, + "step": 5326 + }, + { + "epoch": 2.522403077833679, + "grad_norm": 1.2074165344238281, + "learning_rate": 1.2945060193589852e-06, + "loss": 0.2149, + "step": 5327 + }, + { + "epoch": 2.52287659070731, + "grad_norm": 0.880549430847168, + "learning_rate": 1.2919906054439103e-06, + "loss": 0.1945, + "step": 5328 + }, + { + "epoch": 2.523350103580941, + "grad_norm": 1.272971749305725, + "learning_rate": 1.289477469024687e-06, + "loss": 0.1983, + "step": 5329 + }, + { + "epoch": 2.5238236164545724, + "grad_norm": 1.3522940874099731, + "learning_rate": 1.2869666107585975e-06, + "loss": 0.2175, + "step": 5330 + }, + { + "epoch": 2.5242971293282035, + "grad_norm": 1.0059926509857178, + "learning_rate": 1.2844580313023368e-06, + "loss": 0.1978, + "step": 5331 + }, + { + "epoch": 2.524770642201835, + "grad_norm": 0.781419038772583, + "learning_rate": 1.2819517313119956e-06, + "loss": 0.1766, + "step": 5332 + }, + { + "epoch": 2.525244155075466, + "grad_norm": 1.2181997299194336, + "learning_rate": 1.279447711443077e-06, + "loss": 0.1921, + "step": 5333 + }, + { + "epoch": 2.525717667949097, + "grad_norm": 1.028064489364624, + "learning_rate": 1.2769459723504795e-06, + "loss": 0.1777, + "step": 5334 + }, + { + "epoch": 2.5261911808227286, + "grad_norm": 1.0090110301971436, + "learning_rate": 1.274446514688511e-06, + "loss": 0.2018, + "step": 5335 + }, + { + "epoch": 2.5266646936963597, + "grad_norm": 1.4929478168487549, + "learning_rate": 1.2719493391108806e-06, + "loss": 0.1938, + "step": 5336 + }, + { + "epoch": 2.527138206569991, + "grad_norm": 1.0513087511062622, + "learning_rate": 1.2694544462706959e-06, + "loss": 0.1879, + "step": 5337 + }, + { + "epoch": 2.5276117194436223, + "grad_norm": 1.1161446571350098, + "learning_rate": 1.2669618368204795e-06, + "loss": 0.1997, + "step": 5338 + }, + { + "epoch": 2.5280852323172534, + "grad_norm": 1.209606647491455, + "learning_rate": 1.2644715114121432e-06, + "loss": 0.1928, + "step": 5339 + }, + { + "epoch": 2.528558745190885, + "grad_norm": 1.125483512878418, + "learning_rate": 1.2619834706970113e-06, + "loss": 0.1892, + "step": 5340 + }, + { + "epoch": 2.5290322580645164, + "grad_norm": 1.3125486373901367, + "learning_rate": 1.2594977153258036e-06, + "loss": 0.2068, + "step": 5341 + }, + { + "epoch": 2.5295057709381474, + "grad_norm": 1.0961823463439941, + "learning_rate": 1.2570142459486478e-06, + "loss": 0.1934, + "step": 5342 + }, + { + "epoch": 2.5299792838117785, + "grad_norm": 1.161319375038147, + "learning_rate": 1.254533063215072e-06, + "loss": 0.1815, + "step": 5343 + }, + { + "epoch": 2.53045279668541, + "grad_norm": 1.1228846311569214, + "learning_rate": 1.2520541677740038e-06, + "loss": 0.1877, + "step": 5344 + }, + { + "epoch": 2.530926309559041, + "grad_norm": 1.2616682052612305, + "learning_rate": 1.2495775602737759e-06, + "loss": 0.2004, + "step": 5345 + }, + { + "epoch": 2.5313998224326726, + "grad_norm": 1.3314753770828247, + "learning_rate": 1.2471032413621188e-06, + "loss": 0.223, + "step": 5346 + }, + { + "epoch": 2.5318733353063037, + "grad_norm": 1.303731918334961, + "learning_rate": 1.2446312116861703e-06, + "loss": 0.1988, + "step": 5347 + }, + { + "epoch": 2.5323468481799347, + "grad_norm": 1.0389883518218994, + "learning_rate": 1.2421614718924623e-06, + "loss": 0.194, + "step": 5348 + }, + { + "epoch": 2.5328203610535662, + "grad_norm": 1.4113802909851074, + "learning_rate": 1.239694022626935e-06, + "loss": 0.1868, + "step": 5349 + }, + { + "epoch": 2.5332938739271973, + "grad_norm": 1.1160602569580078, + "learning_rate": 1.2372288645349207e-06, + "loss": 0.1914, + "step": 5350 + }, + { + "epoch": 2.533767386800829, + "grad_norm": 1.1180630922317505, + "learning_rate": 1.2347659982611637e-06, + "loss": 0.187, + "step": 5351 + }, + { + "epoch": 2.53424089967446, + "grad_norm": 2.2258567810058594, + "learning_rate": 1.2323054244498001e-06, + "loss": 0.1949, + "step": 5352 + }, + { + "epoch": 2.534714412548091, + "grad_norm": 1.0925301313400269, + "learning_rate": 1.2298471437443671e-06, + "loss": 0.1731, + "step": 5353 + }, + { + "epoch": 2.5351879254217224, + "grad_norm": 1.3025660514831543, + "learning_rate": 1.2273911567878095e-06, + "loss": 0.2104, + "step": 5354 + }, + { + "epoch": 2.5356614382953535, + "grad_norm": 1.7932027578353882, + "learning_rate": 1.224937464222461e-06, + "loss": 0.202, + "step": 5355 + }, + { + "epoch": 2.536134951168985, + "grad_norm": 1.089361310005188, + "learning_rate": 1.222486066690066e-06, + "loss": 0.2003, + "step": 5356 + }, + { + "epoch": 2.536608464042616, + "grad_norm": 0.9862686395645142, + "learning_rate": 1.22003696483176e-06, + "loss": 0.218, + "step": 5357 + }, + { + "epoch": 2.537081976916247, + "grad_norm": 1.0599945783615112, + "learning_rate": 1.2175901592880867e-06, + "loss": 0.2115, + "step": 5358 + }, + { + "epoch": 2.5375554897898787, + "grad_norm": 1.217585563659668, + "learning_rate": 1.21514565069898e-06, + "loss": 0.1972, + "step": 5359 + }, + { + "epoch": 2.53802900266351, + "grad_norm": 1.0531421899795532, + "learning_rate": 1.2127034397037808e-06, + "loss": 0.1863, + "step": 5360 + }, + { + "epoch": 2.5385025155371412, + "grad_norm": 1.6633493900299072, + "learning_rate": 1.2102635269412244e-06, + "loss": 0.184, + "step": 5361 + }, + { + "epoch": 2.5389760284107723, + "grad_norm": 1.1086888313293457, + "learning_rate": 1.207825913049445e-06, + "loss": 0.2196, + "step": 5362 + }, + { + "epoch": 2.539449541284404, + "grad_norm": 1.2374646663665771, + "learning_rate": 1.2053905986659798e-06, + "loss": 0.2041, + "step": 5363 + }, + { + "epoch": 2.539923054158035, + "grad_norm": 0.9073995351791382, + "learning_rate": 1.2029575844277585e-06, + "loss": 0.1934, + "step": 5364 + }, + { + "epoch": 2.5403965670316664, + "grad_norm": 1.31955885887146, + "learning_rate": 1.2005268709711172e-06, + "loss": 0.1862, + "step": 5365 + }, + { + "epoch": 2.5408700799052975, + "grad_norm": 0.9501339793205261, + "learning_rate": 1.1980984589317802e-06, + "loss": 0.1849, + "step": 5366 + }, + { + "epoch": 2.5413435927789285, + "grad_norm": 1.0616928339004517, + "learning_rate": 1.1956723489448796e-06, + "loss": 0.2111, + "step": 5367 + }, + { + "epoch": 2.54181710565256, + "grad_norm": 1.4748432636260986, + "learning_rate": 1.1932485416449369e-06, + "loss": 0.1916, + "step": 5368 + }, + { + "epoch": 2.542290618526191, + "grad_norm": 1.1778125762939453, + "learning_rate": 1.1908270376658804e-06, + "loss": 0.2055, + "step": 5369 + }, + { + "epoch": 2.5427641313998226, + "grad_norm": 1.297905445098877, + "learning_rate": 1.1884078376410291e-06, + "loss": 0.2062, + "step": 5370 + }, + { + "epoch": 2.5432376442734537, + "grad_norm": 0.9151560664176941, + "learning_rate": 1.1859909422030991e-06, + "loss": 0.1974, + "step": 5371 + }, + { + "epoch": 2.5437111571470847, + "grad_norm": 1.6805721521377563, + "learning_rate": 1.1835763519842092e-06, + "loss": 0.2103, + "step": 5372 + }, + { + "epoch": 2.5441846700207162, + "grad_norm": 1.0770697593688965, + "learning_rate": 1.1811640676158686e-06, + "loss": 0.1965, + "step": 5373 + }, + { + "epoch": 2.5446581828943473, + "grad_norm": 1.3352454900741577, + "learning_rate": 1.1787540897289918e-06, + "loss": 0.2031, + "step": 5374 + }, + { + "epoch": 2.545131695767979, + "grad_norm": 1.8438594341278076, + "learning_rate": 1.176346418953881e-06, + "loss": 0.1951, + "step": 5375 + }, + { + "epoch": 2.54560520864161, + "grad_norm": 1.525089144706726, + "learning_rate": 1.1739410559202425e-06, + "loss": 0.2543, + "step": 5376 + }, + { + "epoch": 2.546078721515241, + "grad_norm": 1.3220195770263672, + "learning_rate": 1.171538001257172e-06, + "loss": 0.2211, + "step": 5377 + }, + { + "epoch": 2.5465522343888725, + "grad_norm": 1.0386881828308105, + "learning_rate": 1.16913725559317e-06, + "loss": 0.2002, + "step": 5378 + }, + { + "epoch": 2.547025747262504, + "grad_norm": 1.009458303451538, + "learning_rate": 1.1667388195561247e-06, + "loss": 0.1873, + "step": 5379 + }, + { + "epoch": 2.547499260136135, + "grad_norm": 1.3089097738265991, + "learning_rate": 1.164342693773326e-06, + "loss": 0.1998, + "step": 5380 + }, + { + "epoch": 2.547972773009766, + "grad_norm": 1.061797022819519, + "learning_rate": 1.161948878871455e-06, + "loss": 0.2146, + "step": 5381 + }, + { + "epoch": 2.5484462858833976, + "grad_norm": 0.9451477527618408, + "learning_rate": 1.1595573754765932e-06, + "loss": 0.2215, + "step": 5382 + }, + { + "epoch": 2.5489197987570287, + "grad_norm": 1.7291032075881958, + "learning_rate": 1.1571681842142158e-06, + "loss": 0.1936, + "step": 5383 + }, + { + "epoch": 2.54939331163066, + "grad_norm": 1.9363996982574463, + "learning_rate": 1.1547813057091906e-06, + "loss": 0.2136, + "step": 5384 + }, + { + "epoch": 2.5498668245042913, + "grad_norm": 0.9909132122993469, + "learning_rate": 1.1523967405857838e-06, + "loss": 0.1925, + "step": 5385 + }, + { + "epoch": 2.5503403373779223, + "grad_norm": 1.205962896347046, + "learning_rate": 1.1500144894676568e-06, + "loss": 0.2009, + "step": 5386 + }, + { + "epoch": 2.550813850251554, + "grad_norm": 0.9973340034484863, + "learning_rate": 1.14763455297786e-06, + "loss": 0.1876, + "step": 5387 + }, + { + "epoch": 2.551287363125185, + "grad_norm": 1.2037270069122314, + "learning_rate": 1.1452569317388474e-06, + "loss": 0.2058, + "step": 5388 + }, + { + "epoch": 2.5517608759988164, + "grad_norm": 0.9326767325401306, + "learning_rate": 1.1428816263724596e-06, + "loss": 0.1998, + "step": 5389 + }, + { + "epoch": 2.5522343888724475, + "grad_norm": 1.0340579748153687, + "learning_rate": 1.1405086374999386e-06, + "loss": 0.1951, + "step": 5390 + }, + { + "epoch": 2.5527079017460785, + "grad_norm": 1.1469632387161255, + "learning_rate": 1.1381379657419112e-06, + "loss": 0.2013, + "step": 5391 + }, + { + "epoch": 2.55318141461971, + "grad_norm": 1.0741297006607056, + "learning_rate": 1.1357696117184103e-06, + "loss": 0.183, + "step": 5392 + }, + { + "epoch": 2.553654927493341, + "grad_norm": 1.3129887580871582, + "learning_rate": 1.1334035760488493e-06, + "loss": 0.2213, + "step": 5393 + }, + { + "epoch": 2.5541284403669726, + "grad_norm": 1.0129112005233765, + "learning_rate": 1.1310398593520488e-06, + "loss": 0.1879, + "step": 5394 + }, + { + "epoch": 2.5546019532406037, + "grad_norm": 0.8904241323471069, + "learning_rate": 1.128678462246212e-06, + "loss": 0.2156, + "step": 5395 + }, + { + "epoch": 2.5550754661142348, + "grad_norm": 1.214227557182312, + "learning_rate": 1.1263193853489384e-06, + "loss": 0.2161, + "step": 5396 + }, + { + "epoch": 2.5555489789878663, + "grad_norm": 1.472237229347229, + "learning_rate": 1.1239626292772254e-06, + "loss": 0.2028, + "step": 5397 + }, + { + "epoch": 2.5560224918614973, + "grad_norm": 1.0698444843292236, + "learning_rate": 1.1216081946474566e-06, + "loss": 0.2018, + "step": 5398 + }, + { + "epoch": 2.556496004735129, + "grad_norm": 1.198815107345581, + "learning_rate": 1.1192560820754151e-06, + "loss": 0.2144, + "step": 5399 + }, + { + "epoch": 2.55696951760876, + "grad_norm": 2.1452925205230713, + "learning_rate": 1.1169062921762686e-06, + "loss": 0.2089, + "step": 5400 + }, + { + "epoch": 2.557443030482391, + "grad_norm": 1.0623699426651, + "learning_rate": 1.1145588255645868e-06, + "loss": 0.2058, + "step": 5401 + }, + { + "epoch": 2.5579165433560225, + "grad_norm": 1.0359307527542114, + "learning_rate": 1.112213682854323e-06, + "loss": 0.2041, + "step": 5402 + }, + { + "epoch": 2.558390056229654, + "grad_norm": 1.2944672107696533, + "learning_rate": 1.1098708646588308e-06, + "loss": 0.211, + "step": 5403 + }, + { + "epoch": 2.558863569103285, + "grad_norm": 1.033818006515503, + "learning_rate": 1.107530371590847e-06, + "loss": 0.1941, + "step": 5404 + }, + { + "epoch": 2.559337081976916, + "grad_norm": 1.210862398147583, + "learning_rate": 1.1051922042625096e-06, + "loss": 0.2018, + "step": 5405 + }, + { + "epoch": 2.5598105948505476, + "grad_norm": 0.9782442450523376, + "learning_rate": 1.1028563632853407e-06, + "loss": 0.197, + "step": 5406 + }, + { + "epoch": 2.5602841077241787, + "grad_norm": 1.218672752380371, + "learning_rate": 1.1005228492702557e-06, + "loss": 0.2224, + "step": 5407 + }, + { + "epoch": 2.56075762059781, + "grad_norm": 1.1376310586929321, + "learning_rate": 1.0981916628275679e-06, + "loss": 0.2009, + "step": 5408 + }, + { + "epoch": 2.5612311334714413, + "grad_norm": 1.071939468383789, + "learning_rate": 1.0958628045669705e-06, + "loss": 0.194, + "step": 5409 + }, + { + "epoch": 2.5617046463450723, + "grad_norm": 1.157542109489441, + "learning_rate": 1.0935362750975597e-06, + "loss": 0.2161, + "step": 5410 + }, + { + "epoch": 2.562178159218704, + "grad_norm": 1.7687501907348633, + "learning_rate": 1.091212075027811e-06, + "loss": 0.1928, + "step": 5411 + }, + { + "epoch": 2.562651672092335, + "grad_norm": 1.131037950515747, + "learning_rate": 1.0888902049656014e-06, + "loss": 0.2192, + "step": 5412 + }, + { + "epoch": 2.5631251849659664, + "grad_norm": 1.0506967306137085, + "learning_rate": 1.0865706655181907e-06, + "loss": 0.1816, + "step": 5413 + }, + { + "epoch": 2.5635986978395975, + "grad_norm": 1.2665033340454102, + "learning_rate": 1.0842534572922348e-06, + "loss": 0.1969, + "step": 5414 + }, + { + "epoch": 2.5640722107132286, + "grad_norm": 1.2768964767456055, + "learning_rate": 1.0819385808937743e-06, + "loss": 0.2108, + "step": 5415 + }, + { + "epoch": 2.56454572358686, + "grad_norm": 1.3844908475875854, + "learning_rate": 1.0796260369282429e-06, + "loss": 0.2119, + "step": 5416 + }, + { + "epoch": 2.565019236460491, + "grad_norm": 1.3508644104003906, + "learning_rate": 1.0773158260004668e-06, + "loss": 0.1835, + "step": 5417 + }, + { + "epoch": 2.5654927493341226, + "grad_norm": 1.0464918613433838, + "learning_rate": 1.0750079487146558e-06, + "loss": 0.2183, + "step": 5418 + }, + { + "epoch": 2.5659662622077537, + "grad_norm": 1.4549388885498047, + "learning_rate": 1.0727024056744172e-06, + "loss": 0.2108, + "step": 5419 + }, + { + "epoch": 2.5664397750813848, + "grad_norm": 1.0538500547409058, + "learning_rate": 1.0703991974827399e-06, + "loss": 0.2007, + "step": 5420 + }, + { + "epoch": 2.5669132879550163, + "grad_norm": 1.0848989486694336, + "learning_rate": 1.0680983247420062e-06, + "loss": 0.2185, + "step": 5421 + }, + { + "epoch": 2.567386800828648, + "grad_norm": 0.9970195889472961, + "learning_rate": 1.0657997880539894e-06, + "loss": 0.2014, + "step": 5422 + }, + { + "epoch": 2.567860313702279, + "grad_norm": 1.1892081499099731, + "learning_rate": 1.0635035880198474e-06, + "loss": 0.1993, + "step": 5423 + }, + { + "epoch": 2.56833382657591, + "grad_norm": 1.3775814771652222, + "learning_rate": 1.061209725240132e-06, + "loss": 0.1976, + "step": 5424 + }, + { + "epoch": 2.5688073394495414, + "grad_norm": 1.3159923553466797, + "learning_rate": 1.0589182003147758e-06, + "loss": 0.2057, + "step": 5425 + }, + { + "epoch": 2.5692808523231725, + "grad_norm": 1.09856116771698, + "learning_rate": 1.0566290138431113e-06, + "loss": 0.1962, + "step": 5426 + }, + { + "epoch": 2.569754365196804, + "grad_norm": 1.0620033740997314, + "learning_rate": 1.0543421664238473e-06, + "loss": 0.2097, + "step": 5427 + }, + { + "epoch": 2.570227878070435, + "grad_norm": 1.6413558721542358, + "learning_rate": 1.0520576586550923e-06, + "loss": 0.2114, + "step": 5428 + }, + { + "epoch": 2.570701390944066, + "grad_norm": 1.1933810710906982, + "learning_rate": 1.0497754911343316e-06, + "loss": 0.2145, + "step": 5429 + }, + { + "epoch": 2.5711749038176976, + "grad_norm": 1.7124356031417847, + "learning_rate": 1.0474956644584488e-06, + "loss": 0.1995, + "step": 5430 + }, + { + "epoch": 2.5716484166913287, + "grad_norm": 0.9946605563163757, + "learning_rate": 1.0452181792237092e-06, + "loss": 0.2177, + "step": 5431 + }, + { + "epoch": 2.57212192956496, + "grad_norm": 1.332301139831543, + "learning_rate": 1.0429430360257642e-06, + "loss": 0.1792, + "step": 5432 + }, + { + "epoch": 2.5725954424385913, + "grad_norm": 1.497750163078308, + "learning_rate": 1.0406702354596598e-06, + "loss": 0.196, + "step": 5433 + }, + { + "epoch": 2.5730689553122224, + "grad_norm": 1.034303069114685, + "learning_rate": 1.0383997781198218e-06, + "loss": 0.1817, + "step": 5434 + }, + { + "epoch": 2.573542468185854, + "grad_norm": 0.7837559580802917, + "learning_rate": 1.0361316646000686e-06, + "loss": 0.1871, + "step": 5435 + }, + { + "epoch": 2.574015981059485, + "grad_norm": 1.18108069896698, + "learning_rate": 1.0338658954936008e-06, + "loss": 0.1877, + "step": 5436 + }, + { + "epoch": 2.5744894939331164, + "grad_norm": 0.9967142939567566, + "learning_rate": 1.0316024713930129e-06, + "loss": 0.1927, + "step": 5437 + }, + { + "epoch": 2.5749630068067475, + "grad_norm": 1.073191523551941, + "learning_rate": 1.0293413928902761e-06, + "loss": 0.2007, + "step": 5438 + }, + { + "epoch": 2.5754365196803786, + "grad_norm": 1.0731115341186523, + "learning_rate": 1.0270826605767592e-06, + "loss": 0.2134, + "step": 5439 + }, + { + "epoch": 2.57591003255401, + "grad_norm": 1.0234590768814087, + "learning_rate": 1.024826275043209e-06, + "loss": 0.1902, + "step": 5440 + }, + { + "epoch": 2.5763835454276416, + "grad_norm": 1.0381513833999634, + "learning_rate": 1.0225722368797598e-06, + "loss": 0.1957, + "step": 5441 + }, + { + "epoch": 2.5768570583012727, + "grad_norm": 1.2362014055252075, + "learning_rate": 1.020320546675937e-06, + "loss": 0.186, + "step": 5442 + }, + { + "epoch": 2.5773305711749037, + "grad_norm": 0.9183345437049866, + "learning_rate": 1.0180712050206442e-06, + "loss": 0.1847, + "step": 5443 + }, + { + "epoch": 2.5778040840485352, + "grad_norm": 1.0637094974517822, + "learning_rate": 1.01582421250218e-06, + "loss": 0.1914, + "step": 5444 + }, + { + "epoch": 2.5782775969221663, + "grad_norm": 1.023342490196228, + "learning_rate": 1.0135795697082195e-06, + "loss": 0.2049, + "step": 5445 + }, + { + "epoch": 2.578751109795798, + "grad_norm": 0.9113678336143494, + "learning_rate": 1.0113372772258302e-06, + "loss": 0.185, + "step": 5446 + }, + { + "epoch": 2.579224622669429, + "grad_norm": 1.1002506017684937, + "learning_rate": 1.009097335641459e-06, + "loss": 0.2028, + "step": 5447 + }, + { + "epoch": 2.57969813554306, + "grad_norm": 1.069926142692566, + "learning_rate": 1.0068597455409458e-06, + "loss": 0.1777, + "step": 5448 + }, + { + "epoch": 2.5801716484166914, + "grad_norm": 1.127890944480896, + "learning_rate": 1.0046245075095074e-06, + "loss": 0.1816, + "step": 5449 + }, + { + "epoch": 2.5806451612903225, + "grad_norm": 0.9867877960205078, + "learning_rate": 1.0023916221317465e-06, + "loss": 0.199, + "step": 5450 + }, + { + "epoch": 2.581118674163954, + "grad_norm": 1.3227022886276245, + "learning_rate": 1.000161089991658e-06, + "loss": 0.196, + "step": 5451 + }, + { + "epoch": 2.581592187037585, + "grad_norm": 1.1155279874801636, + "learning_rate": 9.979329116726111e-07, + "loss": 0.1934, + "step": 5452 + }, + { + "epoch": 2.582065699911216, + "grad_norm": 1.0326663255691528, + "learning_rate": 9.957070877573682e-07, + "loss": 0.1942, + "step": 5453 + }, + { + "epoch": 2.5825392127848477, + "grad_norm": 1.2042585611343384, + "learning_rate": 9.934836188280693e-07, + "loss": 0.189, + "step": 5454 + }, + { + "epoch": 2.5830127256584787, + "grad_norm": 1.3072971105575562, + "learning_rate": 9.91262505466245e-07, + "loss": 0.1837, + "step": 5455 + }, + { + "epoch": 2.5834862385321102, + "grad_norm": 1.351815938949585, + "learning_rate": 9.890437482528004e-07, + "loss": 0.1906, + "step": 5456 + }, + { + "epoch": 2.5839597514057413, + "grad_norm": 1.0295909643173218, + "learning_rate": 9.868273477680357e-07, + "loss": 0.2168, + "step": 5457 + }, + { + "epoch": 2.5844332642793724, + "grad_norm": 0.970486044883728, + "learning_rate": 9.84613304591625e-07, + "loss": 0.1817, + "step": 5458 + }, + { + "epoch": 2.584906777153004, + "grad_norm": 1.0597081184387207, + "learning_rate": 9.824016193026308e-07, + "loss": 0.1955, + "step": 5459 + }, + { + "epoch": 2.585380290026635, + "grad_norm": 1.491175889968872, + "learning_rate": 9.801922924795004e-07, + "loss": 0.2162, + "step": 5460 + }, + { + "epoch": 2.5858538029002665, + "grad_norm": 1.3303380012512207, + "learning_rate": 9.779853247000593e-07, + "loss": 0.1997, + "step": 5461 + }, + { + "epoch": 2.5863273157738975, + "grad_norm": 1.9732062816619873, + "learning_rate": 9.757807165415213e-07, + "loss": 0.2225, + "step": 5462 + }, + { + "epoch": 2.5868008286475286, + "grad_norm": 1.0555258989334106, + "learning_rate": 9.735784685804773e-07, + "loss": 0.1806, + "step": 5463 + }, + { + "epoch": 2.58727434152116, + "grad_norm": 1.026889681816101, + "learning_rate": 9.713785813929056e-07, + "loss": 0.2107, + "step": 5464 + }, + { + "epoch": 2.5877478543947916, + "grad_norm": 1.0289431810379028, + "learning_rate": 9.69181055554167e-07, + "loss": 0.1958, + "step": 5465 + }, + { + "epoch": 2.5882213672684227, + "grad_norm": 0.9716818332672119, + "learning_rate": 9.669858916389985e-07, + "loss": 0.1865, + "step": 5466 + }, + { + "epoch": 2.5886948801420537, + "grad_norm": 1.6020885705947876, + "learning_rate": 9.647930902215296e-07, + "loss": 0.1944, + "step": 5467 + }, + { + "epoch": 2.5891683930156852, + "grad_norm": 1.1589189767837524, + "learning_rate": 9.626026518752619e-07, + "loss": 0.1926, + "step": 5468 + }, + { + "epoch": 2.5896419058893163, + "grad_norm": 1.059776782989502, + "learning_rate": 9.604145771730865e-07, + "loss": 0.1716, + "step": 5469 + }, + { + "epoch": 2.590115418762948, + "grad_norm": 1.3090096712112427, + "learning_rate": 9.582288666872708e-07, + "loss": 0.2115, + "step": 5470 + }, + { + "epoch": 2.590588931636579, + "grad_norm": 1.174440860748291, + "learning_rate": 9.560455209894691e-07, + "loss": 0.1893, + "step": 5471 + }, + { + "epoch": 2.59106244451021, + "grad_norm": 1.4484925270080566, + "learning_rate": 9.538645406507108e-07, + "loss": 0.2198, + "step": 5472 + }, + { + "epoch": 2.5915359573838415, + "grad_norm": 1.0521667003631592, + "learning_rate": 9.516859262414147e-07, + "loss": 0.1962, + "step": 5473 + }, + { + "epoch": 2.5920094702574725, + "grad_norm": 1.0996438264846802, + "learning_rate": 9.495096783313729e-07, + "loss": 0.2061, + "step": 5474 + }, + { + "epoch": 2.592482983131104, + "grad_norm": 1.0292750597000122, + "learning_rate": 9.473357974897623e-07, + "loss": 0.1941, + "step": 5475 + }, + { + "epoch": 2.592956496004735, + "grad_norm": 1.131027340888977, + "learning_rate": 9.451642842851427e-07, + "loss": 0.1894, + "step": 5476 + }, + { + "epoch": 2.593430008878366, + "grad_norm": 1.0126436948776245, + "learning_rate": 9.429951392854486e-07, + "loss": 0.1936, + "step": 5477 + }, + { + "epoch": 2.5939035217519977, + "grad_norm": 1.0096503496170044, + "learning_rate": 9.40828363058004e-07, + "loss": 0.1973, + "step": 5478 + }, + { + "epoch": 2.5943770346256287, + "grad_norm": 1.16969895362854, + "learning_rate": 9.386639561695043e-07, + "loss": 0.2078, + "step": 5479 + }, + { + "epoch": 2.5948505474992603, + "grad_norm": 1.0769869089126587, + "learning_rate": 9.365019191860314e-07, + "loss": 0.2057, + "step": 5480 + }, + { + "epoch": 2.5953240603728913, + "grad_norm": 1.0034693479537964, + "learning_rate": 9.343422526730428e-07, + "loss": 0.199, + "step": 5481 + }, + { + "epoch": 2.5957975732465224, + "grad_norm": 1.0947496891021729, + "learning_rate": 9.321849571953822e-07, + "loss": 0.1851, + "step": 5482 + }, + { + "epoch": 2.596271086120154, + "grad_norm": 1.386936068534851, + "learning_rate": 9.300300333172652e-07, + "loss": 0.2034, + "step": 5483 + }, + { + "epoch": 2.5967445989937854, + "grad_norm": 1.0438120365142822, + "learning_rate": 9.27877481602295e-07, + "loss": 0.2129, + "step": 5484 + }, + { + "epoch": 2.5972181118674165, + "grad_norm": 1.0546690225601196, + "learning_rate": 9.257273026134494e-07, + "loss": 0.2108, + "step": 5485 + }, + { + "epoch": 2.5976916247410475, + "grad_norm": 1.3479506969451904, + "learning_rate": 9.235794969130851e-07, + "loss": 0.184, + "step": 5486 + }, + { + "epoch": 2.598165137614679, + "grad_norm": 1.1528756618499756, + "learning_rate": 9.214340650629439e-07, + "loss": 0.1933, + "step": 5487 + }, + { + "epoch": 2.59863865048831, + "grad_norm": 1.3162215948104858, + "learning_rate": 9.192910076241379e-07, + "loss": 0.1848, + "step": 5488 + }, + { + "epoch": 2.5991121633619416, + "grad_norm": 1.3679094314575195, + "learning_rate": 9.171503251571678e-07, + "loss": 0.2207, + "step": 5489 + }, + { + "epoch": 2.5995856762355727, + "grad_norm": 1.234590768814087, + "learning_rate": 9.150120182219046e-07, + "loss": 0.1839, + "step": 5490 + }, + { + "epoch": 2.6000591891092037, + "grad_norm": 1.105852484703064, + "learning_rate": 9.128760873776054e-07, + "loss": 0.2014, + "step": 5491 + }, + { + "epoch": 2.6005327019828353, + "grad_norm": 1.1004148721694946, + "learning_rate": 9.107425331828989e-07, + "loss": 0.1932, + "step": 5492 + }, + { + "epoch": 2.6010062148564663, + "grad_norm": 0.9159629940986633, + "learning_rate": 9.086113561957987e-07, + "loss": 0.2114, + "step": 5493 + }, + { + "epoch": 2.601479727730098, + "grad_norm": 1.061231255531311, + "learning_rate": 9.064825569736924e-07, + "loss": 0.1762, + "step": 5494 + }, + { + "epoch": 2.601953240603729, + "grad_norm": 1.741815209388733, + "learning_rate": 9.043561360733444e-07, + "loss": 0.187, + "step": 5495 + }, + { + "epoch": 2.60242675347736, + "grad_norm": 1.0610990524291992, + "learning_rate": 9.022320940509033e-07, + "loss": 0.1988, + "step": 5496 + }, + { + "epoch": 2.6029002663509915, + "grad_norm": 1.0070830583572388, + "learning_rate": 9.001104314618892e-07, + "loss": 0.2234, + "step": 5497 + }, + { + "epoch": 2.6033737792246225, + "grad_norm": 0.9660130143165588, + "learning_rate": 8.979911488612037e-07, + "loss": 0.2007, + "step": 5498 + }, + { + "epoch": 2.603847292098254, + "grad_norm": 1.0970488786697388, + "learning_rate": 8.958742468031257e-07, + "loss": 0.2136, + "step": 5499 + }, + { + "epoch": 2.604320804971885, + "grad_norm": 0.9795224666595459, + "learning_rate": 8.937597258413078e-07, + "loss": 0.2162, + "step": 5500 + }, + { + "epoch": 2.604794317845516, + "grad_norm": 1.2171189785003662, + "learning_rate": 8.916475865287855e-07, + "loss": 0.2051, + "step": 5501 + }, + { + "epoch": 2.6052678307191477, + "grad_norm": 1.032201886177063, + "learning_rate": 8.895378294179658e-07, + "loss": 0.1856, + "step": 5502 + }, + { + "epoch": 2.605741343592779, + "grad_norm": 1.1588233709335327, + "learning_rate": 8.874304550606383e-07, + "loss": 0.2028, + "step": 5503 + }, + { + "epoch": 2.6062148564664103, + "grad_norm": 0.9060316681861877, + "learning_rate": 8.853254640079633e-07, + "loss": 0.1907, + "step": 5504 + }, + { + "epoch": 2.6066883693400413, + "grad_norm": 1.5966558456420898, + "learning_rate": 8.832228568104839e-07, + "loss": 0.2036, + "step": 5505 + }, + { + "epoch": 2.607161882213673, + "grad_norm": 1.2059063911437988, + "learning_rate": 8.811226340181133e-07, + "loss": 0.2239, + "step": 5506 + }, + { + "epoch": 2.607635395087304, + "grad_norm": 1.165710210800171, + "learning_rate": 8.79024796180149e-07, + "loss": 0.2087, + "step": 5507 + }, + { + "epoch": 2.6081089079609354, + "grad_norm": 1.2334532737731934, + "learning_rate": 8.769293438452553e-07, + "loss": 0.203, + "step": 5508 + }, + { + "epoch": 2.6085824208345665, + "grad_norm": 1.1886234283447266, + "learning_rate": 8.748362775614816e-07, + "loss": 0.2152, + "step": 5509 + }, + { + "epoch": 2.6090559337081975, + "grad_norm": 1.4193661212921143, + "learning_rate": 8.727455978762478e-07, + "loss": 0.1982, + "step": 5510 + }, + { + "epoch": 2.609529446581829, + "grad_norm": 1.0052508115768433, + "learning_rate": 8.706573053363487e-07, + "loss": 0.2157, + "step": 5511 + }, + { + "epoch": 2.61000295945546, + "grad_norm": 1.1085777282714844, + "learning_rate": 8.685714004879608e-07, + "loss": 0.1973, + "step": 5512 + }, + { + "epoch": 2.6104764723290916, + "grad_norm": 1.2764101028442383, + "learning_rate": 8.664878838766289e-07, + "loss": 0.1935, + "step": 5513 + }, + { + "epoch": 2.6109499852027227, + "grad_norm": 0.94977867603302, + "learning_rate": 8.644067560472802e-07, + "loss": 0.1907, + "step": 5514 + }, + { + "epoch": 2.6114234980763538, + "grad_norm": 0.928570032119751, + "learning_rate": 8.623280175442094e-07, + "loss": 0.1929, + "step": 5515 + }, + { + "epoch": 2.6118970109499853, + "grad_norm": 1.3341639041900635, + "learning_rate": 8.602516689110952e-07, + "loss": 0.1987, + "step": 5516 + }, + { + "epoch": 2.6123705238236163, + "grad_norm": 1.2463881969451904, + "learning_rate": 8.581777106909827e-07, + "loss": 0.1788, + "step": 5517 + }, + { + "epoch": 2.612844036697248, + "grad_norm": 1.0751234292984009, + "learning_rate": 8.561061434262996e-07, + "loss": 0.2133, + "step": 5518 + }, + { + "epoch": 2.613317549570879, + "grad_norm": 0.9469509124755859, + "learning_rate": 8.540369676588411e-07, + "loss": 0.1925, + "step": 5519 + }, + { + "epoch": 2.61379106244451, + "grad_norm": 1.0061715841293335, + "learning_rate": 8.51970183929779e-07, + "loss": 0.2128, + "step": 5520 + }, + { + "epoch": 2.6142645753181415, + "grad_norm": 1.019024133682251, + "learning_rate": 8.49905792779665e-07, + "loss": 0.2089, + "step": 5521 + }, + { + "epoch": 2.6147380881917726, + "grad_norm": 2.2128031253814697, + "learning_rate": 8.478437947484164e-07, + "loss": 0.2122, + "step": 5522 + }, + { + "epoch": 2.615211601065404, + "grad_norm": 1.6629959344863892, + "learning_rate": 8.457841903753327e-07, + "loss": 0.2341, + "step": 5523 + }, + { + "epoch": 2.615685113939035, + "grad_norm": 1.15640127658844, + "learning_rate": 8.4372698019908e-07, + "loss": 0.2024, + "step": 5524 + }, + { + "epoch": 2.616158626812666, + "grad_norm": 0.9604775905609131, + "learning_rate": 8.416721647577053e-07, + "loss": 0.1944, + "step": 5525 + }, + { + "epoch": 2.6166321396862977, + "grad_norm": 0.9418638348579407, + "learning_rate": 8.396197445886223e-07, + "loss": 0.1688, + "step": 5526 + }, + { + "epoch": 2.617105652559929, + "grad_norm": 1.1123701333999634, + "learning_rate": 8.375697202286248e-07, + "loss": 0.2019, + "step": 5527 + }, + { + "epoch": 2.6175791654335603, + "grad_norm": 1.4473880529403687, + "learning_rate": 8.355220922138762e-07, + "loss": 0.2034, + "step": 5528 + }, + { + "epoch": 2.6180526783071913, + "grad_norm": 1.3960907459259033, + "learning_rate": 8.334768610799104e-07, + "loss": 0.1939, + "step": 5529 + }, + { + "epoch": 2.618526191180823, + "grad_norm": 0.9807230830192566, + "learning_rate": 8.314340273616428e-07, + "loss": 0.2027, + "step": 5530 + }, + { + "epoch": 2.618999704054454, + "grad_norm": 1.4325486421585083, + "learning_rate": 8.293935915933526e-07, + "loss": 0.2103, + "step": 5531 + }, + { + "epoch": 2.6194732169280854, + "grad_norm": 1.228115200996399, + "learning_rate": 8.273555543087009e-07, + "loss": 0.1993, + "step": 5532 + }, + { + "epoch": 2.6199467298017165, + "grad_norm": 1.0208629369735718, + "learning_rate": 8.25319916040711e-07, + "loss": 0.1961, + "step": 5533 + }, + { + "epoch": 2.6204202426753476, + "grad_norm": 1.7419202327728271, + "learning_rate": 8.232866773217896e-07, + "loss": 0.2002, + "step": 5534 + }, + { + "epoch": 2.620893755548979, + "grad_norm": 1.0411862134933472, + "learning_rate": 8.212558386837067e-07, + "loss": 0.1958, + "step": 5535 + }, + { + "epoch": 2.62136726842261, + "grad_norm": 1.4005458354949951, + "learning_rate": 8.192274006576095e-07, + "loss": 0.2136, + "step": 5536 + }, + { + "epoch": 2.6218407812962417, + "grad_norm": 1.0770565271377563, + "learning_rate": 8.172013637740195e-07, + "loss": 0.2002, + "step": 5537 + }, + { + "epoch": 2.6223142941698727, + "grad_norm": 1.2719935178756714, + "learning_rate": 8.151777285628226e-07, + "loss": 0.2034, + "step": 5538 + }, + { + "epoch": 2.622787807043504, + "grad_norm": 1.5015056133270264, + "learning_rate": 8.131564955532856e-07, + "loss": 0.1889, + "step": 5539 + }, + { + "epoch": 2.6232613199171353, + "grad_norm": 1.192428469657898, + "learning_rate": 8.111376652740388e-07, + "loss": 0.1872, + "step": 5540 + }, + { + "epoch": 2.6237348327907664, + "grad_norm": 1.306041955947876, + "learning_rate": 8.091212382530899e-07, + "loss": 0.2135, + "step": 5541 + }, + { + "epoch": 2.624208345664398, + "grad_norm": 1.4788362979888916, + "learning_rate": 8.071072150178138e-07, + "loss": 0.2017, + "step": 5542 + }, + { + "epoch": 2.624681858538029, + "grad_norm": 1.2858426570892334, + "learning_rate": 8.050955960949625e-07, + "loss": 0.2031, + "step": 5543 + }, + { + "epoch": 2.62515537141166, + "grad_norm": 1.2146573066711426, + "learning_rate": 8.030863820106527e-07, + "loss": 0.1757, + "step": 5544 + }, + { + "epoch": 2.6256288842852915, + "grad_norm": 1.650189995765686, + "learning_rate": 8.010795732903731e-07, + "loss": 0.2143, + "step": 5545 + }, + { + "epoch": 2.626102397158923, + "grad_norm": 1.076585292816162, + "learning_rate": 7.990751704589906e-07, + "loss": 0.1987, + "step": 5546 + }, + { + "epoch": 2.626575910032554, + "grad_norm": 1.0550071001052856, + "learning_rate": 7.970731740407311e-07, + "loss": 0.1886, + "step": 5547 + }, + { + "epoch": 2.627049422906185, + "grad_norm": 1.0130494832992554, + "learning_rate": 7.950735845592039e-07, + "loss": 0.2068, + "step": 5548 + }, + { + "epoch": 2.6275229357798167, + "grad_norm": 1.2154542207717896, + "learning_rate": 7.93076402537376e-07, + "loss": 0.2018, + "step": 5549 + }, + { + "epoch": 2.6279964486534477, + "grad_norm": 0.8742989897727966, + "learning_rate": 7.910816284975975e-07, + "loss": 0.1784, + "step": 5550 + }, + { + "epoch": 2.6284699615270792, + "grad_norm": 1.0601682662963867, + "learning_rate": 7.890892629615765e-07, + "loss": 0.1972, + "step": 5551 + }, + { + "epoch": 2.6289434744007103, + "grad_norm": 1.270751953125, + "learning_rate": 7.870993064504018e-07, + "loss": 0.2142, + "step": 5552 + }, + { + "epoch": 2.6294169872743414, + "grad_norm": 1.1302870512008667, + "learning_rate": 7.851117594845237e-07, + "loss": 0.2138, + "step": 5553 + }, + { + "epoch": 2.629890500147973, + "grad_norm": 1.3007394075393677, + "learning_rate": 7.831266225837675e-07, + "loss": 0.1978, + "step": 5554 + }, + { + "epoch": 2.630364013021604, + "grad_norm": 0.9886184930801392, + "learning_rate": 7.811438962673268e-07, + "loss": 0.1897, + "step": 5555 + }, + { + "epoch": 2.6308375258952355, + "grad_norm": 1.188515305519104, + "learning_rate": 7.791635810537624e-07, + "loss": 0.2089, + "step": 5556 + }, + { + "epoch": 2.6313110387688665, + "grad_norm": 1.18173086643219, + "learning_rate": 7.771856774610109e-07, + "loss": 0.2251, + "step": 5557 + }, + { + "epoch": 2.6317845516424976, + "grad_norm": 1.0726786851882935, + "learning_rate": 7.752101860063687e-07, + "loss": 0.2004, + "step": 5558 + }, + { + "epoch": 2.632258064516129, + "grad_norm": 1.0728563070297241, + "learning_rate": 7.732371072065126e-07, + "loss": 0.1939, + "step": 5559 + }, + { + "epoch": 2.63273157738976, + "grad_norm": 1.0991253852844238, + "learning_rate": 7.712664415774762e-07, + "loss": 0.1699, + "step": 5560 + }, + { + "epoch": 2.6332050902633917, + "grad_norm": 0.9829975962638855, + "learning_rate": 7.692981896346718e-07, + "loss": 0.199, + "step": 5561 + }, + { + "epoch": 2.6336786031370227, + "grad_norm": 0.8773795366287231, + "learning_rate": 7.673323518928755e-07, + "loss": 0.1894, + "step": 5562 + }, + { + "epoch": 2.634152116010654, + "grad_norm": 1.0769257545471191, + "learning_rate": 7.653689288662335e-07, + "loss": 0.2144, + "step": 5563 + }, + { + "epoch": 2.6346256288842853, + "grad_norm": 1.1146119832992554, + "learning_rate": 7.63407921068261e-07, + "loss": 0.2237, + "step": 5564 + }, + { + "epoch": 2.635099141757917, + "grad_norm": 0.9974523782730103, + "learning_rate": 7.614493290118386e-07, + "loss": 0.2062, + "step": 5565 + }, + { + "epoch": 2.635572654631548, + "grad_norm": 1.053133249282837, + "learning_rate": 7.594931532092198e-07, + "loss": 0.2005, + "step": 5566 + }, + { + "epoch": 2.636046167505179, + "grad_norm": 1.3148210048675537, + "learning_rate": 7.575393941720199e-07, + "loss": 0.2097, + "step": 5567 + }, + { + "epoch": 2.6365196803788105, + "grad_norm": 1.292978286743164, + "learning_rate": 7.555880524112291e-07, + "loss": 0.1951, + "step": 5568 + }, + { + "epoch": 2.6369931932524415, + "grad_norm": 1.3344711065292358, + "learning_rate": 7.536391284372002e-07, + "loss": 0.1978, + "step": 5569 + }, + { + "epoch": 2.637466706126073, + "grad_norm": 1.8182041645050049, + "learning_rate": 7.516926227596566e-07, + "loss": 0.2056, + "step": 5570 + }, + { + "epoch": 2.637940218999704, + "grad_norm": 1.400888204574585, + "learning_rate": 7.497485358876866e-07, + "loss": 0.2021, + "step": 5571 + }, + { + "epoch": 2.638413731873335, + "grad_norm": 0.8689830303192139, + "learning_rate": 7.478068683297501e-07, + "loss": 0.1753, + "step": 5572 + }, + { + "epoch": 2.6388872447469667, + "grad_norm": 1.108304738998413, + "learning_rate": 7.458676205936688e-07, + "loss": 0.1851, + "step": 5573 + }, + { + "epoch": 2.6393607576205977, + "grad_norm": 1.0863293409347534, + "learning_rate": 7.439307931866346e-07, + "loss": 0.2266, + "step": 5574 + }, + { + "epoch": 2.6398342704942293, + "grad_norm": 1.085361361503601, + "learning_rate": 7.419963866152058e-07, + "loss": 0.2166, + "step": 5575 + }, + { + "epoch": 2.6403077833678603, + "grad_norm": 1.4498753547668457, + "learning_rate": 7.400644013853087e-07, + "loss": 0.1799, + "step": 5576 + }, + { + "epoch": 2.6407812962414914, + "grad_norm": 1.1030598878860474, + "learning_rate": 7.381348380022368e-07, + "loss": 0.1678, + "step": 5577 + }, + { + "epoch": 2.641254809115123, + "grad_norm": 1.5014503002166748, + "learning_rate": 7.362076969706478e-07, + "loss": 0.2074, + "step": 5578 + }, + { + "epoch": 2.641728321988754, + "grad_norm": 1.086707353591919, + "learning_rate": 7.342829787945638e-07, + "loss": 0.2029, + "step": 5579 + }, + { + "epoch": 2.6422018348623855, + "grad_norm": 1.4167083501815796, + "learning_rate": 7.323606839773811e-07, + "loss": 0.1902, + "step": 5580 + }, + { + "epoch": 2.6426753477360165, + "grad_norm": 0.9145298600196838, + "learning_rate": 7.304408130218532e-07, + "loss": 0.2039, + "step": 5581 + }, + { + "epoch": 2.6431488606096476, + "grad_norm": 2.068638563156128, + "learning_rate": 7.285233664301073e-07, + "loss": 0.2075, + "step": 5582 + }, + { + "epoch": 2.643622373483279, + "grad_norm": 1.0407607555389404, + "learning_rate": 7.266083447036287e-07, + "loss": 0.1838, + "step": 5583 + }, + { + "epoch": 2.64409588635691, + "grad_norm": 1.2109540700912476, + "learning_rate": 7.246957483432782e-07, + "loss": 0.2107, + "step": 5584 + }, + { + "epoch": 2.6445693992305417, + "grad_norm": 1.51192045211792, + "learning_rate": 7.227855778492732e-07, + "loss": 0.1798, + "step": 5585 + }, + { + "epoch": 2.6450429121041727, + "grad_norm": 1.015613317489624, + "learning_rate": 7.208778337212019e-07, + "loss": 0.2107, + "step": 5586 + }, + { + "epoch": 2.645516424977804, + "grad_norm": 1.1152918338775635, + "learning_rate": 7.189725164580152e-07, + "loss": 0.2313, + "step": 5587 + }, + { + "epoch": 2.6459899378514353, + "grad_norm": 1.4424123764038086, + "learning_rate": 7.170696265580323e-07, + "loss": 0.1907, + "step": 5588 + }, + { + "epoch": 2.646463450725067, + "grad_norm": 1.2000633478164673, + "learning_rate": 7.15169164518934e-07, + "loss": 0.1903, + "step": 5589 + }, + { + "epoch": 2.646936963598698, + "grad_norm": 0.9677756428718567, + "learning_rate": 7.132711308377682e-07, + "loss": 0.2042, + "step": 5590 + }, + { + "epoch": 2.647410476472329, + "grad_norm": 1.519376516342163, + "learning_rate": 7.113755260109478e-07, + "loss": 0.1914, + "step": 5591 + }, + { + "epoch": 2.6478839893459605, + "grad_norm": 0.9879403114318848, + "learning_rate": 7.094823505342485e-07, + "loss": 0.2147, + "step": 5592 + }, + { + "epoch": 2.6483575022195915, + "grad_norm": 1.1486883163452148, + "learning_rate": 7.075916049028142e-07, + "loss": 0.2034, + "step": 5593 + }, + { + "epoch": 2.648831015093223, + "grad_norm": 0.992047905921936, + "learning_rate": 7.057032896111494e-07, + "loss": 0.1918, + "step": 5594 + }, + { + "epoch": 2.649304527966854, + "grad_norm": 1.395175576210022, + "learning_rate": 7.038174051531266e-07, + "loss": 0.1973, + "step": 5595 + }, + { + "epoch": 2.649778040840485, + "grad_norm": 0.9354866743087769, + "learning_rate": 7.019339520219793e-07, + "loss": 0.1899, + "step": 5596 + }, + { + "epoch": 2.6502515537141167, + "grad_norm": 1.0331776142120361, + "learning_rate": 7.000529307103066e-07, + "loss": 0.1847, + "step": 5597 + }, + { + "epoch": 2.6507250665877478, + "grad_norm": 1.2936712503433228, + "learning_rate": 6.981743417100728e-07, + "loss": 0.1875, + "step": 5598 + }, + { + "epoch": 2.6511985794613793, + "grad_norm": 1.1697713136672974, + "learning_rate": 6.962981855126017e-07, + "loss": 0.1885, + "step": 5599 + }, + { + "epoch": 2.6516720923350103, + "grad_norm": 1.2552714347839355, + "learning_rate": 6.944244626085872e-07, + "loss": 0.2045, + "step": 5600 + }, + { + "epoch": 2.6521456052086414, + "grad_norm": 1.1409389972686768, + "learning_rate": 6.925531734880808e-07, + "loss": 0.2088, + "step": 5601 + }, + { + "epoch": 2.652619118082273, + "grad_norm": 1.1405903100967407, + "learning_rate": 6.906843186405032e-07, + "loss": 0.2132, + "step": 5602 + }, + { + "epoch": 2.653092630955904, + "grad_norm": 1.2994462251663208, + "learning_rate": 6.888178985546312e-07, + "loss": 0.201, + "step": 5603 + }, + { + "epoch": 2.6535661438295355, + "grad_norm": 1.3358666896820068, + "learning_rate": 6.869539137186132e-07, + "loss": 0.208, + "step": 5604 + }, + { + "epoch": 2.6540396567031665, + "grad_norm": 1.0983041524887085, + "learning_rate": 6.850923646199526e-07, + "loss": 0.2056, + "step": 5605 + }, + { + "epoch": 2.6545131695767976, + "grad_norm": 1.1728966236114502, + "learning_rate": 6.832332517455242e-07, + "loss": 0.2267, + "step": 5606 + }, + { + "epoch": 2.654986682450429, + "grad_norm": 1.0750831365585327, + "learning_rate": 6.813765755815571e-07, + "loss": 0.204, + "step": 5607 + }, + { + "epoch": 2.6554601953240606, + "grad_norm": 1.2680811882019043, + "learning_rate": 6.795223366136471e-07, + "loss": 0.1889, + "step": 5608 + }, + { + "epoch": 2.6559337081976917, + "grad_norm": 1.0505774021148682, + "learning_rate": 6.776705353267554e-07, + "loss": 0.1937, + "step": 5609 + }, + { + "epoch": 2.6564072210713228, + "grad_norm": 1.042385220527649, + "learning_rate": 6.758211722052e-07, + "loss": 0.2031, + "step": 5610 + }, + { + "epoch": 2.6568807339449543, + "grad_norm": 1.3553048372268677, + "learning_rate": 6.73974247732666e-07, + "loss": 0.2098, + "step": 5611 + }, + { + "epoch": 2.6573542468185853, + "grad_norm": 1.1963160037994385, + "learning_rate": 6.721297623921963e-07, + "loss": 0.1899, + "step": 5612 + }, + { + "epoch": 2.657827759692217, + "grad_norm": 1.010690450668335, + "learning_rate": 6.702877166662014e-07, + "loss": 0.2022, + "step": 5613 + }, + { + "epoch": 2.658301272565848, + "grad_norm": 1.4362280368804932, + "learning_rate": 6.684481110364471e-07, + "loss": 0.1826, + "step": 5614 + }, + { + "epoch": 2.658774785439479, + "grad_norm": 1.1894739866256714, + "learning_rate": 6.666109459840664e-07, + "loss": 0.1979, + "step": 5615 + }, + { + "epoch": 2.6592482983131105, + "grad_norm": 1.0000640153884888, + "learning_rate": 6.647762219895526e-07, + "loss": 0.1917, + "step": 5616 + }, + { + "epoch": 2.6597218111867416, + "grad_norm": 1.6517083644866943, + "learning_rate": 6.629439395327597e-07, + "loss": 0.1963, + "step": 5617 + }, + { + "epoch": 2.660195324060373, + "grad_norm": 1.1130201816558838, + "learning_rate": 6.611140990929032e-07, + "loss": 0.1901, + "step": 5618 + }, + { + "epoch": 2.660668836934004, + "grad_norm": 1.1720373630523682, + "learning_rate": 6.592867011485593e-07, + "loss": 0.1891, + "step": 5619 + }, + { + "epoch": 2.661142349807635, + "grad_norm": 1.1503329277038574, + "learning_rate": 6.574617461776689e-07, + "loss": 0.211, + "step": 5620 + }, + { + "epoch": 2.6616158626812667, + "grad_norm": 1.2841466665267944, + "learning_rate": 6.55639234657528e-07, + "loss": 0.192, + "step": 5621 + }, + { + "epoch": 2.6620893755548978, + "grad_norm": 1.003335952758789, + "learning_rate": 6.538191670648008e-07, + "loss": 0.1809, + "step": 5622 + }, + { + "epoch": 2.6625628884285293, + "grad_norm": 1.1069647073745728, + "learning_rate": 6.520015438755056e-07, + "loss": 0.2017, + "step": 5623 + }, + { + "epoch": 2.6630364013021603, + "grad_norm": 1.0702106952667236, + "learning_rate": 6.501863655650243e-07, + "loss": 0.2038, + "step": 5624 + }, + { + "epoch": 2.6635099141757914, + "grad_norm": 0.9489043951034546, + "learning_rate": 6.483736326081003e-07, + "loss": 0.1976, + "step": 5625 + }, + { + "epoch": 2.663983427049423, + "grad_norm": 1.254457712173462, + "learning_rate": 6.465633454788345e-07, + "loss": 0.2198, + "step": 5626 + }, + { + "epoch": 2.6644569399230544, + "grad_norm": 1.0318970680236816, + "learning_rate": 6.447555046506937e-07, + "loss": 0.1741, + "step": 5627 + }, + { + "epoch": 2.6649304527966855, + "grad_norm": 1.3176510334014893, + "learning_rate": 6.429501105964964e-07, + "loss": 0.2041, + "step": 5628 + }, + { + "epoch": 2.6654039656703166, + "grad_norm": 1.1123440265655518, + "learning_rate": 6.411471637884315e-07, + "loss": 0.2024, + "step": 5629 + }, + { + "epoch": 2.665877478543948, + "grad_norm": 0.9631196856498718, + "learning_rate": 6.393466646980362e-07, + "loss": 0.1872, + "step": 5630 + }, + { + "epoch": 2.666350991417579, + "grad_norm": 1.3509668111801147, + "learning_rate": 6.375486137962194e-07, + "loss": 0.1934, + "step": 5631 + }, + { + "epoch": 2.6668245042912107, + "grad_norm": 1.3398867845535278, + "learning_rate": 6.357530115532417e-07, + "loss": 0.1883, + "step": 5632 + }, + { + "epoch": 2.6672980171648417, + "grad_norm": 1.4973032474517822, + "learning_rate": 6.339598584387241e-07, + "loss": 0.2354, + "step": 5633 + }, + { + "epoch": 2.667771530038473, + "grad_norm": 1.0023930072784424, + "learning_rate": 6.321691549216502e-07, + "loss": 0.2074, + "step": 5634 + }, + { + "epoch": 2.6682450429121043, + "grad_norm": 1.0620249509811401, + "learning_rate": 6.303809014703599e-07, + "loss": 0.1999, + "step": 5635 + }, + { + "epoch": 2.6687185557857354, + "grad_norm": 1.0061266422271729, + "learning_rate": 6.285950985525569e-07, + "loss": 0.2038, + "step": 5636 + }, + { + "epoch": 2.669192068659367, + "grad_norm": 0.9382733106613159, + "learning_rate": 6.268117466352952e-07, + "loss": 0.1932, + "step": 5637 + }, + { + "epoch": 2.669665581532998, + "grad_norm": 1.1017144918441772, + "learning_rate": 6.250308461849986e-07, + "loss": 0.2087, + "step": 5638 + }, + { + "epoch": 2.670139094406629, + "grad_norm": 1.4511491060256958, + "learning_rate": 6.232523976674409e-07, + "loss": 0.2105, + "step": 5639 + }, + { + "epoch": 2.6706126072802605, + "grad_norm": 1.016521692276001, + "learning_rate": 6.214764015477614e-07, + "loss": 0.1869, + "step": 5640 + }, + { + "epoch": 2.6710861201538916, + "grad_norm": 1.1885292530059814, + "learning_rate": 6.197028582904507e-07, + "loss": 0.2107, + "step": 5641 + }, + { + "epoch": 2.671559633027523, + "grad_norm": 1.1228227615356445, + "learning_rate": 6.179317683593656e-07, + "loss": 0.2178, + "step": 5642 + }, + { + "epoch": 2.672033145901154, + "grad_norm": 1.4119621515274048, + "learning_rate": 6.161631322177164e-07, + "loss": 0.2027, + "step": 5643 + }, + { + "epoch": 2.672506658774785, + "grad_norm": 1.3242872953414917, + "learning_rate": 6.1439695032807e-07, + "loss": 0.2273, + "step": 5644 + }, + { + "epoch": 2.6729801716484167, + "grad_norm": 1.2599126100540161, + "learning_rate": 6.126332231523591e-07, + "loss": 0.2258, + "step": 5645 + }, + { + "epoch": 2.673453684522048, + "grad_norm": 1.113526701927185, + "learning_rate": 6.108719511518658e-07, + "loss": 0.1929, + "step": 5646 + }, + { + "epoch": 2.6739271973956793, + "grad_norm": 1.0311851501464844, + "learning_rate": 6.09113134787237e-07, + "loss": 0.2111, + "step": 5647 + }, + { + "epoch": 2.6744007102693104, + "grad_norm": 1.3474483489990234, + "learning_rate": 6.073567745184694e-07, + "loss": 0.1973, + "step": 5648 + }, + { + "epoch": 2.6748742231429414, + "grad_norm": 1.5055440664291382, + "learning_rate": 6.056028708049278e-07, + "loss": 0.2036, + "step": 5649 + }, + { + "epoch": 2.675347736016573, + "grad_norm": 1.0800116062164307, + "learning_rate": 6.038514241053239e-07, + "loss": 0.1864, + "step": 5650 + }, + { + "epoch": 2.6758212488902045, + "grad_norm": 1.0215858221054077, + "learning_rate": 6.021024348777349e-07, + "loss": 0.1721, + "step": 5651 + }, + { + "epoch": 2.6762947617638355, + "grad_norm": 1.2525755167007446, + "learning_rate": 6.003559035795914e-07, + "loss": 0.1724, + "step": 5652 + }, + { + "epoch": 2.6767682746374666, + "grad_norm": 1.2781281471252441, + "learning_rate": 5.986118306676791e-07, + "loss": 0.2081, + "step": 5653 + }, + { + "epoch": 2.677241787511098, + "grad_norm": 1.0916087627410889, + "learning_rate": 5.968702165981477e-07, + "loss": 0.1801, + "step": 5654 + }, + { + "epoch": 2.677715300384729, + "grad_norm": 0.9678347706794739, + "learning_rate": 5.95131061826496e-07, + "loss": 0.1771, + "step": 5655 + }, + { + "epoch": 2.6781888132583607, + "grad_norm": 1.1609114408493042, + "learning_rate": 5.933943668075869e-07, + "loss": 0.1954, + "step": 5656 + }, + { + "epoch": 2.6786623261319917, + "grad_norm": 1.1677464246749878, + "learning_rate": 5.916601319956339e-07, + "loss": 0.1781, + "step": 5657 + }, + { + "epoch": 2.679135839005623, + "grad_norm": 1.1138585805892944, + "learning_rate": 5.899283578442073e-07, + "loss": 0.1877, + "step": 5658 + }, + { + "epoch": 2.6796093518792543, + "grad_norm": 1.0214651823043823, + "learning_rate": 5.881990448062402e-07, + "loss": 0.2018, + "step": 5659 + }, + { + "epoch": 2.6800828647528854, + "grad_norm": 1.117795467376709, + "learning_rate": 5.864721933340145e-07, + "loss": 0.2012, + "step": 5660 + }, + { + "epoch": 2.680556377626517, + "grad_norm": 1.546120524406433, + "learning_rate": 5.847478038791732e-07, + "loss": 0.2145, + "step": 5661 + }, + { + "epoch": 2.681029890500148, + "grad_norm": 1.232239842414856, + "learning_rate": 5.830258768927122e-07, + "loss": 0.181, + "step": 5662 + }, + { + "epoch": 2.681503403373779, + "grad_norm": 1.0880526304244995, + "learning_rate": 5.813064128249879e-07, + "loss": 0.1968, + "step": 5663 + }, + { + "epoch": 2.6819769162474105, + "grad_norm": 1.078931450843811, + "learning_rate": 5.795894121257062e-07, + "loss": 0.2053, + "step": 5664 + }, + { + "epoch": 2.6824504291210416, + "grad_norm": 2.163848400115967, + "learning_rate": 5.778748752439345e-07, + "loss": 0.1973, + "step": 5665 + }, + { + "epoch": 2.682923941994673, + "grad_norm": 1.1884369850158691, + "learning_rate": 5.761628026280908e-07, + "loss": 0.2006, + "step": 5666 + }, + { + "epoch": 2.683397454868304, + "grad_norm": 1.1050399541854858, + "learning_rate": 5.744531947259535e-07, + "loss": 0.1786, + "step": 5667 + }, + { + "epoch": 2.6838709677419352, + "grad_norm": 1.53378427028656, + "learning_rate": 5.727460519846539e-07, + "loss": 0.2083, + "step": 5668 + }, + { + "epoch": 2.6843444806155667, + "grad_norm": 1.0245704650878906, + "learning_rate": 5.710413748506772e-07, + "loss": 0.2315, + "step": 5669 + }, + { + "epoch": 2.6848179934891983, + "grad_norm": 1.2128366231918335, + "learning_rate": 5.693391637698664e-07, + "loss": 0.2033, + "step": 5670 + }, + { + "epoch": 2.6852915063628293, + "grad_norm": 1.1133742332458496, + "learning_rate": 5.676394191874179e-07, + "loss": 0.1781, + "step": 5671 + }, + { + "epoch": 2.6857650192364604, + "grad_norm": 1.095633864402771, + "learning_rate": 5.65942141547885e-07, + "loss": 0.2086, + "step": 5672 + }, + { + "epoch": 2.686238532110092, + "grad_norm": 1.5314477682113647, + "learning_rate": 5.642473312951713e-07, + "loss": 0.2064, + "step": 5673 + }, + { + "epoch": 2.686712044983723, + "grad_norm": 1.4954949617385864, + "learning_rate": 5.625549888725401e-07, + "loss": 0.1821, + "step": 5674 + }, + { + "epoch": 2.6871855578573545, + "grad_norm": 1.1045236587524414, + "learning_rate": 5.608651147226074e-07, + "loss": 0.1925, + "step": 5675 + }, + { + "epoch": 2.6876590707309855, + "grad_norm": 1.7516658306121826, + "learning_rate": 5.591777092873429e-07, + "loss": 0.2061, + "step": 5676 + }, + { + "epoch": 2.6881325836046166, + "grad_norm": 1.2861231565475464, + "learning_rate": 5.574927730080725e-07, + "loss": 0.2016, + "step": 5677 + }, + { + "epoch": 2.688606096478248, + "grad_norm": 1.7489197254180908, + "learning_rate": 5.558103063254716e-07, + "loss": 0.2002, + "step": 5678 + }, + { + "epoch": 2.689079609351879, + "grad_norm": 1.1831798553466797, + "learning_rate": 5.541303096795769e-07, + "loss": 0.2127, + "step": 5679 + }, + { + "epoch": 2.6895531222255107, + "grad_norm": 1.17523992061615, + "learning_rate": 5.524527835097726e-07, + "loss": 0.1985, + "step": 5680 + }, + { + "epoch": 2.6900266350991417, + "grad_norm": 1.547934651374817, + "learning_rate": 5.507777282548021e-07, + "loss": 0.2032, + "step": 5681 + }, + { + "epoch": 2.690500147972773, + "grad_norm": 1.073725700378418, + "learning_rate": 5.491051443527573e-07, + "loss": 0.2342, + "step": 5682 + }, + { + "epoch": 2.6909736608464043, + "grad_norm": 0.9526923298835754, + "learning_rate": 5.474350322410882e-07, + "loss": 0.1952, + "step": 5683 + }, + { + "epoch": 2.6914471737200354, + "grad_norm": 1.3431147336959839, + "learning_rate": 5.457673923565954e-07, + "loss": 0.1967, + "step": 5684 + }, + { + "epoch": 2.691920686593667, + "grad_norm": 1.127583622932434, + "learning_rate": 5.441022251354355e-07, + "loss": 0.2074, + "step": 5685 + }, + { + "epoch": 2.692394199467298, + "grad_norm": 0.9880527257919312, + "learning_rate": 5.424395310131159e-07, + "loss": 0.2143, + "step": 5686 + }, + { + "epoch": 2.692867712340929, + "grad_norm": 1.1635633707046509, + "learning_rate": 5.407793104244963e-07, + "loss": 0.2018, + "step": 5687 + }, + { + "epoch": 2.6933412252145605, + "grad_norm": 1.3094615936279297, + "learning_rate": 5.391215638037961e-07, + "loss": 0.229, + "step": 5688 + }, + { + "epoch": 2.6938147380881916, + "grad_norm": 1.3654303550720215, + "learning_rate": 5.374662915845774e-07, + "loss": 0.2072, + "step": 5689 + }, + { + "epoch": 2.694288250961823, + "grad_norm": 1.0598773956298828, + "learning_rate": 5.358134941997661e-07, + "loss": 0.219, + "step": 5690 + }, + { + "epoch": 2.694761763835454, + "grad_norm": 1.110864281654358, + "learning_rate": 5.341631720816309e-07, + "loss": 0.192, + "step": 5691 + }, + { + "epoch": 2.6952352767090857, + "grad_norm": 1.1458567380905151, + "learning_rate": 5.325153256617988e-07, + "loss": 0.1928, + "step": 5692 + }, + { + "epoch": 2.6957087895827168, + "grad_norm": 1.2874677181243896, + "learning_rate": 5.308699553712515e-07, + "loss": 0.2226, + "step": 5693 + }, + { + "epoch": 2.6961823024563483, + "grad_norm": 1.3333238363265991, + "learning_rate": 5.29227061640315e-07, + "loss": 0.1834, + "step": 5694 + }, + { + "epoch": 2.6966558153299793, + "grad_norm": 1.119017481803894, + "learning_rate": 5.275866448986755e-07, + "loss": 0.1743, + "step": 5695 + }, + { + "epoch": 2.6971293282036104, + "grad_norm": 1.0952125787734985, + "learning_rate": 5.259487055753653e-07, + "loss": 0.213, + "step": 5696 + }, + { + "epoch": 2.697602841077242, + "grad_norm": 1.2849918603897095, + "learning_rate": 5.243132440987752e-07, + "loss": 0.1902, + "step": 5697 + }, + { + "epoch": 2.698076353950873, + "grad_norm": 1.2609485387802124, + "learning_rate": 5.226802608966419e-07, + "loss": 0.2232, + "step": 5698 + }, + { + "epoch": 2.6985498668245045, + "grad_norm": 1.236099123954773, + "learning_rate": 5.210497563960581e-07, + "loss": 0.2236, + "step": 5699 + }, + { + "epoch": 2.6990233796981355, + "grad_norm": 1.6993776559829712, + "learning_rate": 5.19421731023464e-07, + "loss": 0.1883, + "step": 5700 + }, + { + "epoch": 2.6994968925717666, + "grad_norm": 2.0070505142211914, + "learning_rate": 5.177961852046565e-07, + "loss": 0.202, + "step": 5701 + }, + { + "epoch": 2.699970405445398, + "grad_norm": 1.0293316841125488, + "learning_rate": 5.161731193647801e-07, + "loss": 0.1863, + "step": 5702 + }, + { + "epoch": 2.700443918319029, + "grad_norm": 0.9846745133399963, + "learning_rate": 5.145525339283308e-07, + "loss": 0.1895, + "step": 5703 + }, + { + "epoch": 2.7009174311926607, + "grad_norm": 1.1934609413146973, + "learning_rate": 5.129344293191607e-07, + "loss": 0.1918, + "step": 5704 + }, + { + "epoch": 2.7013909440662918, + "grad_norm": 1.1649199724197388, + "learning_rate": 5.113188059604657e-07, + "loss": 0.2288, + "step": 5705 + }, + { + "epoch": 2.701864456939923, + "grad_norm": 1.2630609273910522, + "learning_rate": 5.097056642747988e-07, + "loss": 0.1989, + "step": 5706 + }, + { + "epoch": 2.7023379698135543, + "grad_norm": 1.5064605474472046, + "learning_rate": 5.080950046840594e-07, + "loss": 0.2033, + "step": 5707 + }, + { + "epoch": 2.7028114826871854, + "grad_norm": 1.2008765935897827, + "learning_rate": 5.064868276095036e-07, + "loss": 0.2044, + "step": 5708 + }, + { + "epoch": 2.703284995560817, + "grad_norm": 1.0486986637115479, + "learning_rate": 5.048811334717307e-07, + "loss": 0.1992, + "step": 5709 + }, + { + "epoch": 2.703758508434448, + "grad_norm": 1.0270819664001465, + "learning_rate": 5.032779226906981e-07, + "loss": 0.2147, + "step": 5710 + }, + { + "epoch": 2.704232021308079, + "grad_norm": 0.9577698707580566, + "learning_rate": 5.016771956857081e-07, + "loss": 0.1697, + "step": 5711 + }, + { + "epoch": 2.7047055341817106, + "grad_norm": 1.1315481662750244, + "learning_rate": 5.000789528754147e-07, + "loss": 0.201, + "step": 5712 + }, + { + "epoch": 2.705179047055342, + "grad_norm": 1.1655607223510742, + "learning_rate": 4.984831946778246e-07, + "loss": 0.2057, + "step": 5713 + }, + { + "epoch": 2.705652559928973, + "grad_norm": 1.2996375560760498, + "learning_rate": 4.968899215102907e-07, + "loss": 0.2162, + "step": 5714 + }, + { + "epoch": 2.706126072802604, + "grad_norm": 1.351178765296936, + "learning_rate": 4.952991337895219e-07, + "loss": 0.1751, + "step": 5715 + }, + { + "epoch": 2.7065995856762357, + "grad_norm": 0.9919388890266418, + "learning_rate": 4.937108319315687e-07, + "loss": 0.2041, + "step": 5716 + }, + { + "epoch": 2.7070730985498668, + "grad_norm": 1.0616121292114258, + "learning_rate": 4.92125016351841e-07, + "loss": 0.2077, + "step": 5717 + }, + { + "epoch": 2.7075466114234983, + "grad_norm": 1.1046313047409058, + "learning_rate": 4.905416874650892e-07, + "loss": 0.2069, + "step": 5718 + }, + { + "epoch": 2.7080201242971293, + "grad_norm": 1.44788658618927, + "learning_rate": 4.889608456854211e-07, + "loss": 0.1919, + "step": 5719 + }, + { + "epoch": 2.7084936371707604, + "grad_norm": 0.9813434481620789, + "learning_rate": 4.873824914262882e-07, + "loss": 0.1997, + "step": 5720 + }, + { + "epoch": 2.708967150044392, + "grad_norm": 1.2912452220916748, + "learning_rate": 4.858066251004956e-07, + "loss": 0.2067, + "step": 5721 + }, + { + "epoch": 2.709440662918023, + "grad_norm": 0.9664787650108337, + "learning_rate": 4.842332471201961e-07, + "loss": 0.1958, + "step": 5722 + }, + { + "epoch": 2.7099141757916545, + "grad_norm": 1.488752841949463, + "learning_rate": 4.826623578968881e-07, + "loss": 0.2134, + "step": 5723 + }, + { + "epoch": 2.7103876886652856, + "grad_norm": 1.1605441570281982, + "learning_rate": 4.810939578414265e-07, + "loss": 0.2028, + "step": 5724 + }, + { + "epoch": 2.7108612015389166, + "grad_norm": 1.2621954679489136, + "learning_rate": 4.795280473640085e-07, + "loss": 0.2198, + "step": 5725 + }, + { + "epoch": 2.711334714412548, + "grad_norm": 1.275181531906128, + "learning_rate": 4.779646268741866e-07, + "loss": 0.1786, + "step": 5726 + }, + { + "epoch": 2.711808227286179, + "grad_norm": 0.8626628518104553, + "learning_rate": 4.7640369678085275e-07, + "loss": 0.1957, + "step": 5727 + }, + { + "epoch": 2.7122817401598107, + "grad_norm": 1.1933006048202515, + "learning_rate": 4.7484525749225907e-07, + "loss": 0.1729, + "step": 5728 + }, + { + "epoch": 2.712755253033442, + "grad_norm": 0.9497954249382019, + "learning_rate": 4.7328930941599514e-07, + "loss": 0.1848, + "step": 5729 + }, + { + "epoch": 2.713228765907073, + "grad_norm": 1.1757985353469849, + "learning_rate": 4.717358529590077e-07, + "loss": 0.2026, + "step": 5730 + }, + { + "epoch": 2.7137022787807044, + "grad_norm": 1.2278982400894165, + "learning_rate": 4.7018488852758727e-07, + "loss": 0.2011, + "step": 5731 + }, + { + "epoch": 2.714175791654336, + "grad_norm": 1.4735335111618042, + "learning_rate": 4.6863641652737157e-07, + "loss": 0.2165, + "step": 5732 + }, + { + "epoch": 2.714649304527967, + "grad_norm": 1.0341819524765015, + "learning_rate": 4.6709043736335334e-07, + "loss": 0.2221, + "step": 5733 + }, + { + "epoch": 2.715122817401598, + "grad_norm": 1.042711853981018, + "learning_rate": 4.655469514398636e-07, + "loss": 0.208, + "step": 5734 + }, + { + "epoch": 2.7155963302752295, + "grad_norm": 1.383939504623413, + "learning_rate": 4.6400595916058944e-07, + "loss": 0.2037, + "step": 5735 + }, + { + "epoch": 2.7160698431488606, + "grad_norm": 0.9150691032409668, + "learning_rate": 4.6246746092856176e-07, + "loss": 0.1982, + "step": 5736 + }, + { + "epoch": 2.716543356022492, + "grad_norm": 1.098061203956604, + "learning_rate": 4.6093145714615763e-07, + "loss": 0.2108, + "step": 5737 + }, + { + "epoch": 2.717016868896123, + "grad_norm": 0.9107375144958496, + "learning_rate": 4.5939794821510785e-07, + "loss": 0.1822, + "step": 5738 + }, + { + "epoch": 2.717490381769754, + "grad_norm": 1.2272454500198364, + "learning_rate": 4.578669345364828e-07, + "loss": 0.1883, + "step": 5739 + }, + { + "epoch": 2.7179638946433857, + "grad_norm": 1.202950119972229, + "learning_rate": 4.5633841651070766e-07, + "loss": 0.1952, + "step": 5740 + }, + { + "epoch": 2.718437407517017, + "grad_norm": 1.1974061727523804, + "learning_rate": 4.548123945375493e-07, + "loss": 0.2013, + "step": 5741 + }, + { + "epoch": 2.7189109203906483, + "grad_norm": 1.0299830436706543, + "learning_rate": 4.5328886901612743e-07, + "loss": 0.1803, + "step": 5742 + }, + { + "epoch": 2.7193844332642794, + "grad_norm": 0.9163728356361389, + "learning_rate": 4.5176784034489993e-07, + "loss": 0.1797, + "step": 5743 + }, + { + "epoch": 2.7198579461379104, + "grad_norm": 1.6618443727493286, + "learning_rate": 4.5024930892168305e-07, + "loss": 0.2081, + "step": 5744 + }, + { + "epoch": 2.720331459011542, + "grad_norm": 1.2780410051345825, + "learning_rate": 4.487332751436302e-07, + "loss": 0.1836, + "step": 5745 + }, + { + "epoch": 2.720804971885173, + "grad_norm": 0.9704563617706299, + "learning_rate": 4.472197394072464e-07, + "loss": 0.1998, + "step": 5746 + }, + { + "epoch": 2.7212784847588045, + "grad_norm": 1.1087617874145508, + "learning_rate": 4.457087021083839e-07, + "loss": 0.2173, + "step": 5747 + }, + { + "epoch": 2.7217519976324356, + "grad_norm": 1.3192325830459595, + "learning_rate": 4.442001636422366e-07, + "loss": 0.2108, + "step": 5748 + }, + { + "epoch": 2.7222255105060666, + "grad_norm": 0.8854762315750122, + "learning_rate": 4.4269412440335114e-07, + "loss": 0.2035, + "step": 5749 + }, + { + "epoch": 2.722699023379698, + "grad_norm": 1.296974778175354, + "learning_rate": 4.411905847856157e-07, + "loss": 0.1926, + "step": 5750 + }, + { + "epoch": 2.7231725362533292, + "grad_norm": 1.4468486309051514, + "learning_rate": 4.39689545182268e-07, + "loss": 0.198, + "step": 5751 + }, + { + "epoch": 2.7236460491269607, + "grad_norm": 1.2393492460250854, + "learning_rate": 4.381910059858896e-07, + "loss": 0.1975, + "step": 5752 + }, + { + "epoch": 2.724119562000592, + "grad_norm": 1.3274880647659302, + "learning_rate": 4.366949675884091e-07, + "loss": 0.201, + "step": 5753 + }, + { + "epoch": 2.7245930748742233, + "grad_norm": 1.3509947061538696, + "learning_rate": 4.352014303811003e-07, + "loss": 0.1904, + "step": 5754 + }, + { + "epoch": 2.7250665877478544, + "grad_norm": 0.9841063022613525, + "learning_rate": 4.337103947545862e-07, + "loss": 0.1994, + "step": 5755 + }, + { + "epoch": 2.725540100621486, + "grad_norm": 1.1662771701812744, + "learning_rate": 4.3222186109882933e-07, + "loss": 0.2036, + "step": 5756 + }, + { + "epoch": 2.726013613495117, + "grad_norm": 0.9167471528053284, + "learning_rate": 4.307358298031428e-07, + "loss": 0.1971, + "step": 5757 + }, + { + "epoch": 2.726487126368748, + "grad_norm": 1.2513035535812378, + "learning_rate": 4.2925230125618336e-07, + "loss": 0.2269, + "step": 5758 + }, + { + "epoch": 2.7269606392423795, + "grad_norm": 1.0162153244018555, + "learning_rate": 4.2777127584595403e-07, + "loss": 0.1965, + "step": 5759 + }, + { + "epoch": 2.7274341521160106, + "grad_norm": 1.4992437362670898, + "learning_rate": 4.2629275395980275e-07, + "loss": 0.1795, + "step": 5760 + }, + { + "epoch": 2.727907664989642, + "grad_norm": 1.2534233331680298, + "learning_rate": 4.248167359844224e-07, + "loss": 0.2045, + "step": 5761 + }, + { + "epoch": 2.728381177863273, + "grad_norm": 1.2920854091644287, + "learning_rate": 4.23343222305852e-07, + "loss": 0.1998, + "step": 5762 + }, + { + "epoch": 2.7288546907369042, + "grad_norm": 0.9584261775016785, + "learning_rate": 4.2187221330947216e-07, + "loss": 0.1905, + "step": 5763 + }, + { + "epoch": 2.7293282036105357, + "grad_norm": 1.5315953493118286, + "learning_rate": 4.2040370938001507e-07, + "loss": 0.21, + "step": 5764 + }, + { + "epoch": 2.729801716484167, + "grad_norm": 1.48392915725708, + "learning_rate": 4.1893771090155246e-07, + "loss": 0.1877, + "step": 5765 + }, + { + "epoch": 2.7302752293577983, + "grad_norm": 0.9685987830162048, + "learning_rate": 4.174742182574998e-07, + "loss": 0.2019, + "step": 5766 + }, + { + "epoch": 2.7307487422314294, + "grad_norm": 1.2201647758483887, + "learning_rate": 4.1601323183062205e-07, + "loss": 0.1967, + "step": 5767 + }, + { + "epoch": 2.7312222551050604, + "grad_norm": 0.9604336023330688, + "learning_rate": 4.1455475200302353e-07, + "loss": 0.1898, + "step": 5768 + }, + { + "epoch": 2.731695767978692, + "grad_norm": 0.9989910125732422, + "learning_rate": 4.1309877915615913e-07, + "loss": 0.2053, + "step": 5769 + }, + { + "epoch": 2.732169280852323, + "grad_norm": 1.192683219909668, + "learning_rate": 4.116453136708187e-07, + "loss": 0.2246, + "step": 5770 + }, + { + "epoch": 2.7326427937259545, + "grad_norm": 1.0321522951126099, + "learning_rate": 4.101943559271504e-07, + "loss": 0.2037, + "step": 5771 + }, + { + "epoch": 2.7331163065995856, + "grad_norm": 1.1789295673370361, + "learning_rate": 4.0874590630463283e-07, + "loss": 0.2316, + "step": 5772 + }, + { + "epoch": 2.7335898194732167, + "grad_norm": 1.1018298864364624, + "learning_rate": 4.072999651820941e-07, + "loss": 0.2042, + "step": 5773 + }, + { + "epoch": 2.734063332346848, + "grad_norm": 0.9121476411819458, + "learning_rate": 4.058565329377073e-07, + "loss": 0.1769, + "step": 5774 + }, + { + "epoch": 2.7345368452204797, + "grad_norm": 1.112981915473938, + "learning_rate": 4.044156099489882e-07, + "loss": 0.189, + "step": 5775 + }, + { + "epoch": 2.7350103580941107, + "grad_norm": 1.084700345993042, + "learning_rate": 4.0297719659279645e-07, + "loss": 0.2051, + "step": 5776 + }, + { + "epoch": 2.735483870967742, + "grad_norm": 1.0018030405044556, + "learning_rate": 4.015412932453333e-07, + "loss": 0.2046, + "step": 5777 + }, + { + "epoch": 2.7359573838413733, + "grad_norm": 1.5083683729171753, + "learning_rate": 4.0010790028214843e-07, + "loss": 0.1895, + "step": 5778 + }, + { + "epoch": 2.7364308967150044, + "grad_norm": 1.0385921001434326, + "learning_rate": 3.9867701807812963e-07, + "loss": 0.2093, + "step": 5779 + }, + { + "epoch": 2.736904409588636, + "grad_norm": 1.0134193897247314, + "learning_rate": 3.9724864700751207e-07, + "loss": 0.2117, + "step": 5780 + }, + { + "epoch": 2.737377922462267, + "grad_norm": 1.1653934717178345, + "learning_rate": 3.9582278744387137e-07, + "loss": 0.2079, + "step": 5781 + }, + { + "epoch": 2.737851435335898, + "grad_norm": 1.1373720169067383, + "learning_rate": 3.9439943976012696e-07, + "loss": 0.2118, + "step": 5782 + }, + { + "epoch": 2.7383249482095295, + "grad_norm": 1.2817039489746094, + "learning_rate": 3.929786043285433e-07, + "loss": 0.2094, + "step": 5783 + }, + { + "epoch": 2.7387984610831606, + "grad_norm": 0.9910814166069031, + "learning_rate": 3.915602815207231e-07, + "loss": 0.2151, + "step": 5784 + }, + { + "epoch": 2.739271973956792, + "grad_norm": 1.1655786037445068, + "learning_rate": 3.901444717076186e-07, + "loss": 0.2014, + "step": 5785 + }, + { + "epoch": 2.739745486830423, + "grad_norm": 1.3065234422683716, + "learning_rate": 3.8873117525951797e-07, + "loss": 0.1806, + "step": 5786 + }, + { + "epoch": 2.7402189997040542, + "grad_norm": 1.1026694774627686, + "learning_rate": 3.873203925460589e-07, + "loss": 0.183, + "step": 5787 + }, + { + "epoch": 2.7406925125776858, + "grad_norm": 1.210478663444519, + "learning_rate": 3.8591212393621405e-07, + "loss": 0.216, + "step": 5788 + }, + { + "epoch": 2.741166025451317, + "grad_norm": 1.6913577318191528, + "learning_rate": 3.845063697983065e-07, + "loss": 0.2123, + "step": 5789 + }, + { + "epoch": 2.7416395383249483, + "grad_norm": 1.2592370510101318, + "learning_rate": 3.8310313049999546e-07, + "loss": 0.1888, + "step": 5790 + }, + { + "epoch": 2.7421130511985794, + "grad_norm": 0.9264201521873474, + "learning_rate": 3.8170240640828304e-07, + "loss": 0.1949, + "step": 5791 + }, + { + "epoch": 2.7425865640722105, + "grad_norm": 1.0950411558151245, + "learning_rate": 3.8030419788951834e-07, + "loss": 0.215, + "step": 5792 + }, + { + "epoch": 2.743060076945842, + "grad_norm": 1.2804242372512817, + "learning_rate": 3.789085053093866e-07, + "loss": 0.2031, + "step": 5793 + }, + { + "epoch": 2.7435335898194735, + "grad_norm": 1.092449426651001, + "learning_rate": 3.775153290329203e-07, + "loss": 0.1747, + "step": 5794 + }, + { + "epoch": 2.7440071026931045, + "grad_norm": 0.9500091075897217, + "learning_rate": 3.7612466942448797e-07, + "loss": 0.2102, + "step": 5795 + }, + { + "epoch": 2.7444806155667356, + "grad_norm": 1.2651680707931519, + "learning_rate": 3.747365268478076e-07, + "loss": 0.1895, + "step": 5796 + }, + { + "epoch": 2.744954128440367, + "grad_norm": 1.0188058614730835, + "learning_rate": 3.733509016659298e-07, + "loss": 0.2146, + "step": 5797 + }, + { + "epoch": 2.745427641313998, + "grad_norm": 1.008729338645935, + "learning_rate": 3.7196779424125585e-07, + "loss": 0.2023, + "step": 5798 + }, + { + "epoch": 2.7459011541876297, + "grad_norm": 1.0331710577011108, + "learning_rate": 3.705872049355208e-07, + "loss": 0.1947, + "step": 5799 + }, + { + "epoch": 2.7463746670612608, + "grad_norm": 0.8746491074562073, + "learning_rate": 3.6920913410980585e-07, + "loss": 0.1955, + "step": 5800 + }, + { + "epoch": 2.746848179934892, + "grad_norm": 0.9344222545623779, + "learning_rate": 3.678335821245327e-07, + "loss": 0.1811, + "step": 5801 + }, + { + "epoch": 2.7473216928085233, + "grad_norm": 0.9031004309654236, + "learning_rate": 3.664605493394624e-07, + "loss": 0.1813, + "step": 5802 + }, + { + "epoch": 2.7477952056821544, + "grad_norm": 1.2377285957336426, + "learning_rate": 3.6509003611369884e-07, + "loss": 0.2265, + "step": 5803 + }, + { + "epoch": 2.748268718555786, + "grad_norm": 1.0772106647491455, + "learning_rate": 3.6372204280568644e-07, + "loss": 0.2122, + "step": 5804 + }, + { + "epoch": 2.748742231429417, + "grad_norm": 0.8862806558609009, + "learning_rate": 3.623565697732123e-07, + "loss": 0.1984, + "step": 5805 + }, + { + "epoch": 2.749215744303048, + "grad_norm": 1.0365098714828491, + "learning_rate": 3.6099361737339965e-07, + "loss": 0.1737, + "step": 5806 + }, + { + "epoch": 2.7496892571766796, + "grad_norm": 1.0085512399673462, + "learning_rate": 3.596331859627189e-07, + "loss": 0.1949, + "step": 5807 + }, + { + "epoch": 2.7501627700503106, + "grad_norm": 1.1064403057098389, + "learning_rate": 3.582752758969743e-07, + "loss": 0.1957, + "step": 5808 + }, + { + "epoch": 2.750636282923942, + "grad_norm": 0.9437001347541809, + "learning_rate": 3.5691988753131625e-07, + "loss": 0.1861, + "step": 5809 + }, + { + "epoch": 2.751109795797573, + "grad_norm": 0.9680258631706238, + "learning_rate": 3.5556702122023444e-07, + "loss": 0.2232, + "step": 5810 + }, + { + "epoch": 2.7515833086712043, + "grad_norm": 0.9630969762802124, + "learning_rate": 3.5421667731755484e-07, + "loss": 0.196, + "step": 5811 + }, + { + "epoch": 2.7520568215448358, + "grad_norm": 1.0958917140960693, + "learning_rate": 3.528688561764515e-07, + "loss": 0.2042, + "step": 5812 + }, + { + "epoch": 2.752530334418467, + "grad_norm": 1.1072863340377808, + "learning_rate": 3.5152355814942916e-07, + "loss": 0.2031, + "step": 5813 + }, + { + "epoch": 2.7530038472920983, + "grad_norm": 1.231067180633545, + "learning_rate": 3.5018078358834084e-07, + "loss": 0.1989, + "step": 5814 + }, + { + "epoch": 2.7534773601657294, + "grad_norm": 1.1398643255233765, + "learning_rate": 3.4884053284437444e-07, + "loss": 0.2065, + "step": 5815 + }, + { + "epoch": 2.7539508730393605, + "grad_norm": 1.0876073837280273, + "learning_rate": 3.4750280626805964e-07, + "loss": 0.1859, + "step": 5816 + }, + { + "epoch": 2.754424385912992, + "grad_norm": 1.428344964981079, + "learning_rate": 3.461676042092688e-07, + "loss": 0.2038, + "step": 5817 + }, + { + "epoch": 2.7548978987866235, + "grad_norm": 0.8772413730621338, + "learning_rate": 3.4483492701720687e-07, + "loss": 0.1925, + "step": 5818 + }, + { + "epoch": 2.7553714116602546, + "grad_norm": 1.0749726295471191, + "learning_rate": 3.435047750404252e-07, + "loss": 0.2179, + "step": 5819 + }, + { + "epoch": 2.7558449245338856, + "grad_norm": 1.8996601104736328, + "learning_rate": 3.4217714862681215e-07, + "loss": 0.1664, + "step": 5820 + }, + { + "epoch": 2.756318437407517, + "grad_norm": 1.382574200630188, + "learning_rate": 3.408520481235955e-07, + "loss": 0.2156, + "step": 5821 + }, + { + "epoch": 2.756791950281148, + "grad_norm": 1.1029624938964844, + "learning_rate": 3.395294738773403e-07, + "loss": 0.2294, + "step": 5822 + }, + { + "epoch": 2.7572654631547797, + "grad_norm": 1.0061315298080444, + "learning_rate": 3.382094262339575e-07, + "loss": 0.1968, + "step": 5823 + }, + { + "epoch": 2.757738976028411, + "grad_norm": 1.1826341152191162, + "learning_rate": 3.368919055386888e-07, + "loss": 0.1868, + "step": 5824 + }, + { + "epoch": 2.758212488902042, + "grad_norm": 1.690619707107544, + "learning_rate": 3.3557691213612075e-07, + "loss": 0.2114, + "step": 5825 + }, + { + "epoch": 2.7586860017756734, + "grad_norm": 1.034387469291687, + "learning_rate": 3.342644463701783e-07, + "loss": 0.2105, + "step": 5826 + }, + { + "epoch": 2.7591595146493044, + "grad_norm": 0.9920729398727417, + "learning_rate": 3.3295450858412125e-07, + "loss": 0.1833, + "step": 5827 + }, + { + "epoch": 2.759633027522936, + "grad_norm": 1.3329393863677979, + "learning_rate": 3.316470991205534e-07, + "loss": 0.2245, + "step": 5828 + }, + { + "epoch": 2.760106540396567, + "grad_norm": 1.0018433332443237, + "learning_rate": 3.3034221832141446e-07, + "loss": 0.1915, + "step": 5829 + }, + { + "epoch": 2.760580053270198, + "grad_norm": 1.5985726118087769, + "learning_rate": 3.2903986652798367e-07, + "loss": 0.1633, + "step": 5830 + }, + { + "epoch": 2.7610535661438296, + "grad_norm": 2.1785120964050293, + "learning_rate": 3.2774004408087756e-07, + "loss": 0.1907, + "step": 5831 + }, + { + "epoch": 2.7615270790174606, + "grad_norm": 0.9680808782577515, + "learning_rate": 3.264427513200552e-07, + "loss": 0.1905, + "step": 5832 + }, + { + "epoch": 2.762000591891092, + "grad_norm": 1.224849820137024, + "learning_rate": 3.2514798858480635e-07, + "loss": 0.192, + "step": 5833 + }, + { + "epoch": 2.762474104764723, + "grad_norm": 1.091199517250061, + "learning_rate": 3.238557562137679e-07, + "loss": 0.212, + "step": 5834 + }, + { + "epoch": 2.7629476176383543, + "grad_norm": 1.2063572406768799, + "learning_rate": 3.225660545449083e-07, + "loss": 0.1971, + "step": 5835 + }, + { + "epoch": 2.763421130511986, + "grad_norm": 1.4341157674789429, + "learning_rate": 3.2127888391553674e-07, + "loss": 0.1935, + "step": 5836 + }, + { + "epoch": 2.7638946433856173, + "grad_norm": 1.0425024032592773, + "learning_rate": 3.1999424466230166e-07, + "loss": 0.2098, + "step": 5837 + }, + { + "epoch": 2.7643681562592484, + "grad_norm": 1.3151661157608032, + "learning_rate": 3.187121371211854e-07, + "loss": 0.1904, + "step": 5838 + }, + { + "epoch": 2.7648416691328794, + "grad_norm": 1.4269354343414307, + "learning_rate": 3.1743256162751534e-07, + "loss": 0.2162, + "step": 5839 + }, + { + "epoch": 2.765315182006511, + "grad_norm": 0.9791456460952759, + "learning_rate": 3.161555185159471e-07, + "loss": 0.177, + "step": 5840 + }, + { + "epoch": 2.765788694880142, + "grad_norm": 1.1203821897506714, + "learning_rate": 3.1488100812048337e-07, + "loss": 0.2033, + "step": 5841 + }, + { + "epoch": 2.7662622077537735, + "grad_norm": 1.1284160614013672, + "learning_rate": 3.136090307744555e-07, + "loss": 0.1727, + "step": 5842 + }, + { + "epoch": 2.7667357206274046, + "grad_norm": 1.2004402875900269, + "learning_rate": 3.1233958681054164e-07, + "loss": 0.183, + "step": 5843 + }, + { + "epoch": 2.7672092335010356, + "grad_norm": 2.777414083480835, + "learning_rate": 3.110726765607497e-07, + "loss": 0.2068, + "step": 5844 + }, + { + "epoch": 2.767682746374667, + "grad_norm": 1.0925432443618774, + "learning_rate": 3.0980830035642783e-07, + "loss": 0.2027, + "step": 5845 + }, + { + "epoch": 2.7681562592482982, + "grad_norm": 0.955273449420929, + "learning_rate": 3.085464585282627e-07, + "loss": 0.2024, + "step": 5846 + }, + { + "epoch": 2.7686297721219297, + "grad_norm": 1.7570418119430542, + "learning_rate": 3.0728715140627584e-07, + "loss": 0.199, + "step": 5847 + }, + { + "epoch": 2.769103284995561, + "grad_norm": 1.181026577949524, + "learning_rate": 3.0603037931982603e-07, + "loss": 0.2072, + "step": 5848 + }, + { + "epoch": 2.769576797869192, + "grad_norm": 1.4937407970428467, + "learning_rate": 3.0477614259761254e-07, + "loss": 0.2191, + "step": 5849 + }, + { + "epoch": 2.7700503107428234, + "grad_norm": 0.9318932890892029, + "learning_rate": 3.035244415676675e-07, + "loss": 0.1817, + "step": 5850 + }, + { + "epoch": 2.7705238236164544, + "grad_norm": 1.0921509265899658, + "learning_rate": 3.0227527655736223e-07, + "loss": 0.2072, + "step": 5851 + }, + { + "epoch": 2.770997336490086, + "grad_norm": 1.1498230695724487, + "learning_rate": 3.01028647893401e-07, + "loss": 0.2132, + "step": 5852 + }, + { + "epoch": 2.771470849363717, + "grad_norm": 1.4725837707519531, + "learning_rate": 2.9978455590182974e-07, + "loss": 0.2007, + "step": 5853 + }, + { + "epoch": 2.771944362237348, + "grad_norm": 1.1440794467926025, + "learning_rate": 2.985430009080281e-07, + "loss": 0.2044, + "step": 5854 + }, + { + "epoch": 2.7724178751109796, + "grad_norm": 1.0810843706130981, + "learning_rate": 2.9730398323671415e-07, + "loss": 0.175, + "step": 5855 + }, + { + "epoch": 2.772891387984611, + "grad_norm": 1.1769734621047974, + "learning_rate": 2.960675032119387e-07, + "loss": 0.2085, + "step": 5856 + }, + { + "epoch": 2.773364900858242, + "grad_norm": 0.9242314100265503, + "learning_rate": 2.9483356115709295e-07, + "loss": 0.2023, + "step": 5857 + }, + { + "epoch": 2.7738384137318732, + "grad_norm": 0.8873565793037415, + "learning_rate": 2.936021573949011e-07, + "loss": 0.2078, + "step": 5858 + }, + { + "epoch": 2.7743119266055047, + "grad_norm": 1.0390340089797974, + "learning_rate": 2.923732922474265e-07, + "loss": 0.2173, + "step": 5859 + }, + { + "epoch": 2.774785439479136, + "grad_norm": 1.056854248046875, + "learning_rate": 2.911469660360655e-07, + "loss": 0.2255, + "step": 5860 + }, + { + "epoch": 2.7752589523527673, + "grad_norm": 0.9877737164497375, + "learning_rate": 2.8992317908155263e-07, + "loss": 0.1992, + "step": 5861 + }, + { + "epoch": 2.7757324652263984, + "grad_norm": 1.1483983993530273, + "learning_rate": 2.8870193170395855e-07, + "loss": 0.2159, + "step": 5862 + }, + { + "epoch": 2.7762059781000294, + "grad_norm": 1.4070454835891724, + "learning_rate": 2.874832242226866e-07, + "loss": 0.2035, + "step": 5863 + }, + { + "epoch": 2.776679490973661, + "grad_norm": 0.9695731401443481, + "learning_rate": 2.862670569564796e-07, + "loss": 0.2004, + "step": 5864 + }, + { + "epoch": 2.777153003847292, + "grad_norm": 1.5455920696258545, + "learning_rate": 2.850534302234131e-07, + "loss": 0.1964, + "step": 5865 + }, + { + "epoch": 2.7776265167209235, + "grad_norm": 1.2403937578201294, + "learning_rate": 2.838423443409011e-07, + "loss": 0.22, + "step": 5866 + }, + { + "epoch": 2.7781000295945546, + "grad_norm": 1.3792948722839355, + "learning_rate": 2.8263379962568894e-07, + "loss": 0.2188, + "step": 5867 + }, + { + "epoch": 2.7785735424681857, + "grad_norm": 0.9905843138694763, + "learning_rate": 2.8142779639386275e-07, + "loss": 0.1821, + "step": 5868 + }, + { + "epoch": 2.779047055341817, + "grad_norm": 0.9501548409461975, + "learning_rate": 2.8022433496084024e-07, + "loss": 0.2016, + "step": 5869 + }, + { + "epoch": 2.7795205682154482, + "grad_norm": 0.990218460559845, + "learning_rate": 2.7902341564137294e-07, + "loss": 0.1896, + "step": 5870 + }, + { + "epoch": 2.7799940810890797, + "grad_norm": 0.9677179455757141, + "learning_rate": 2.778250387495529e-07, + "loss": 0.21, + "step": 5871 + }, + { + "epoch": 2.780467593962711, + "grad_norm": 1.4591187238693237, + "learning_rate": 2.766292045988006e-07, + "loss": 0.2014, + "step": 5872 + }, + { + "epoch": 2.780941106836342, + "grad_norm": 1.055142879486084, + "learning_rate": 2.754359135018791e-07, + "loss": 0.1694, + "step": 5873 + }, + { + "epoch": 2.7814146197099734, + "grad_norm": 1.0192638635635376, + "learning_rate": 2.742451657708778e-07, + "loss": 0.1877, + "step": 5874 + }, + { + "epoch": 2.7818881325836045, + "grad_norm": 1.215868353843689, + "learning_rate": 2.730569617172296e-07, + "loss": 0.1723, + "step": 5875 + }, + { + "epoch": 2.782361645457236, + "grad_norm": 1.0110760927200317, + "learning_rate": 2.7187130165169383e-07, + "loss": 0.2039, + "step": 5876 + }, + { + "epoch": 2.782835158330867, + "grad_norm": 1.146170735359192, + "learning_rate": 2.706881858843702e-07, + "loss": 0.1955, + "step": 5877 + }, + { + "epoch": 2.783308671204498, + "grad_norm": 0.9560944437980652, + "learning_rate": 2.695076147246911e-07, + "loss": 0.1919, + "step": 5878 + }, + { + "epoch": 2.7837821840781296, + "grad_norm": 1.2928173542022705, + "learning_rate": 2.683295884814252e-07, + "loss": 0.1987, + "step": 5879 + }, + { + "epoch": 2.784255696951761, + "grad_norm": 1.1882565021514893, + "learning_rate": 2.6715410746267155e-07, + "loss": 0.2041, + "step": 5880 + }, + { + "epoch": 2.784729209825392, + "grad_norm": 1.3022156953811646, + "learning_rate": 2.6598117197586646e-07, + "loss": 0.1777, + "step": 5881 + }, + { + "epoch": 2.7852027226990232, + "grad_norm": 1.2867157459259033, + "learning_rate": 2.6481078232778013e-07, + "loss": 0.1863, + "step": 5882 + }, + { + "epoch": 2.7856762355726548, + "grad_norm": 1.3041218519210815, + "learning_rate": 2.6364293882451655e-07, + "loss": 0.1834, + "step": 5883 + }, + { + "epoch": 2.786149748446286, + "grad_norm": 1.1009517908096313, + "learning_rate": 2.624776417715147e-07, + "loss": 0.1964, + "step": 5884 + }, + { + "epoch": 2.7866232613199173, + "grad_norm": 1.0136332511901855, + "learning_rate": 2.6131489147354527e-07, + "loss": 0.1895, + "step": 5885 + }, + { + "epoch": 2.7870967741935484, + "grad_norm": 1.4136160612106323, + "learning_rate": 2.60154688234715e-07, + "loss": 0.2031, + "step": 5886 + }, + { + "epoch": 2.7875702870671795, + "grad_norm": 1.1578394174575806, + "learning_rate": 2.589970323584645e-07, + "loss": 0.2127, + "step": 5887 + }, + { + "epoch": 2.788043799940811, + "grad_norm": 1.2432137727737427, + "learning_rate": 2.5784192414756714e-07, + "loss": 0.1898, + "step": 5888 + }, + { + "epoch": 2.788517312814442, + "grad_norm": 1.029590368270874, + "learning_rate": 2.5668936390413014e-07, + "loss": 0.1919, + "step": 5889 + }, + { + "epoch": 2.7889908256880735, + "grad_norm": 1.444937825202942, + "learning_rate": 2.5553935192959457e-07, + "loss": 0.2137, + "step": 5890 + }, + { + "epoch": 2.7894643385617046, + "grad_norm": 1.5030622482299805, + "learning_rate": 2.543918885247354e-07, + "loss": 0.1897, + "step": 5891 + }, + { + "epoch": 2.7899378514353357, + "grad_norm": 0.9852031469345093, + "learning_rate": 2.5324697398965927e-07, + "loss": 0.2015, + "step": 5892 + }, + { + "epoch": 2.790411364308967, + "grad_norm": 1.4873343706130981, + "learning_rate": 2.5210460862380993e-07, + "loss": 0.1801, + "step": 5893 + }, + { + "epoch": 2.7908848771825983, + "grad_norm": 1.5172432661056519, + "learning_rate": 2.5096479272596064e-07, + "loss": 0.2038, + "step": 5894 + }, + { + "epoch": 2.7913583900562298, + "grad_norm": 1.1348522901535034, + "learning_rate": 2.4982752659421736e-07, + "loss": 0.1754, + "step": 5895 + }, + { + "epoch": 2.791831902929861, + "grad_norm": 0.959349513053894, + "learning_rate": 2.4869281052602447e-07, + "loss": 0.1908, + "step": 5896 + }, + { + "epoch": 2.792305415803492, + "grad_norm": 1.4271739721298218, + "learning_rate": 2.4756064481815445e-07, + "loss": 0.1959, + "step": 5897 + }, + { + "epoch": 2.7927789286771234, + "grad_norm": 1.5378594398498535, + "learning_rate": 2.4643102976671383e-07, + "loss": 0.2231, + "step": 5898 + }, + { + "epoch": 2.793252441550755, + "grad_norm": 1.1620539426803589, + "learning_rate": 2.45303965667143e-07, + "loss": 0.2145, + "step": 5899 + }, + { + "epoch": 2.793725954424386, + "grad_norm": 1.0995843410491943, + "learning_rate": 2.4417945281421495e-07, + "loss": 0.204, + "step": 5900 + }, + { + "epoch": 2.794199467298017, + "grad_norm": 0.9468101859092712, + "learning_rate": 2.430574915020345e-07, + "loss": 0.1756, + "step": 5901 + }, + { + "epoch": 2.7946729801716486, + "grad_norm": 1.0497629642486572, + "learning_rate": 2.419380820240413e-07, + "loss": 0.2112, + "step": 5902 + }, + { + "epoch": 2.7951464930452796, + "grad_norm": 1.391728401184082, + "learning_rate": 2.408212246730035e-07, + "loss": 0.2024, + "step": 5903 + }, + { + "epoch": 2.795620005918911, + "grad_norm": 0.9138317108154297, + "learning_rate": 2.397069197410273e-07, + "loss": 0.2109, + "step": 5904 + }, + { + "epoch": 2.796093518792542, + "grad_norm": 0.9192736148834229, + "learning_rate": 2.3859516751954637e-07, + "loss": 0.2016, + "step": 5905 + }, + { + "epoch": 2.7965670316661733, + "grad_norm": 1.567175030708313, + "learning_rate": 2.3748596829932914e-07, + "loss": 0.2126, + "step": 5906 + }, + { + "epoch": 2.7970405445398048, + "grad_norm": 1.9273465871810913, + "learning_rate": 2.36379322370478e-07, + "loss": 0.2062, + "step": 5907 + }, + { + "epoch": 2.797514057413436, + "grad_norm": 1.1850330829620361, + "learning_rate": 2.3527523002242147e-07, + "loss": 0.2089, + "step": 5908 + }, + { + "epoch": 2.7979875702870673, + "grad_norm": 1.0225036144256592, + "learning_rate": 2.3417369154392854e-07, + "loss": 0.2057, + "step": 5909 + }, + { + "epoch": 2.7984610831606984, + "grad_norm": 0.9364516735076904, + "learning_rate": 2.330747072230921e-07, + "loss": 0.1966, + "step": 5910 + }, + { + "epoch": 2.7989345960343295, + "grad_norm": 1.2841675281524658, + "learning_rate": 2.3197827734734446e-07, + "loss": 0.1784, + "step": 5911 + }, + { + "epoch": 2.799408108907961, + "grad_norm": 1.0790499448776245, + "learning_rate": 2.3088440220344288e-07, + "loss": 0.1821, + "step": 5912 + }, + { + "epoch": 2.799881621781592, + "grad_norm": 1.2959651947021484, + "learning_rate": 2.2979308207748295e-07, + "loss": 0.1877, + "step": 5913 + }, + { + "epoch": 2.8003551346552236, + "grad_norm": 1.3200701475143433, + "learning_rate": 2.287043172548875e-07, + "loss": 0.2125, + "step": 5914 + }, + { + "epoch": 2.8008286475288546, + "grad_norm": 1.6399303674697876, + "learning_rate": 2.2761810802041205e-07, + "loss": 0.2024, + "step": 5915 + }, + { + "epoch": 2.8013021604024857, + "grad_norm": 1.029131293296814, + "learning_rate": 2.2653445465814493e-07, + "loss": 0.2218, + "step": 5916 + }, + { + "epoch": 2.801775673276117, + "grad_norm": 1.1296693086624146, + "learning_rate": 2.2545335745150387e-07, + "loss": 0.1993, + "step": 5917 + }, + { + "epoch": 2.8022491861497487, + "grad_norm": 1.231637954711914, + "learning_rate": 2.2437481668324268e-07, + "loss": 0.1831, + "step": 5918 + }, + { + "epoch": 2.80272269902338, + "grad_norm": 1.1062796115875244, + "learning_rate": 2.2329883263543905e-07, + "loss": 0.19, + "step": 5919 + }, + { + "epoch": 2.803196211897011, + "grad_norm": 0.9966011643409729, + "learning_rate": 2.22225405589509e-07, + "loss": 0.1985, + "step": 5920 + }, + { + "epoch": 2.8036697247706424, + "grad_norm": 0.9549378752708435, + "learning_rate": 2.2115453582619573e-07, + "loss": 0.1719, + "step": 5921 + }, + { + "epoch": 2.8041432376442734, + "grad_norm": 1.1001967191696167, + "learning_rate": 2.200862236255763e-07, + "loss": 0.2117, + "step": 5922 + }, + { + "epoch": 2.804616750517905, + "grad_norm": 1.617583990097046, + "learning_rate": 2.1902046926705611e-07, + "loss": 0.1837, + "step": 5923 + }, + { + "epoch": 2.805090263391536, + "grad_norm": 1.1403623819351196, + "learning_rate": 2.179572730293733e-07, + "loss": 0.2129, + "step": 5924 + }, + { + "epoch": 2.805563776265167, + "grad_norm": 1.1201438903808594, + "learning_rate": 2.1689663519059545e-07, + "loss": 0.1948, + "step": 5925 + }, + { + "epoch": 2.8060372891387986, + "grad_norm": 1.2484549283981323, + "learning_rate": 2.158385560281251e-07, + "loss": 0.197, + "step": 5926 + }, + { + "epoch": 2.8065108020124296, + "grad_norm": 1.149869441986084, + "learning_rate": 2.1478303581869087e-07, + "loss": 0.1816, + "step": 5927 + }, + { + "epoch": 2.806984314886061, + "grad_norm": 1.0050691366195679, + "learning_rate": 2.1373007483835306e-07, + "loss": 0.1902, + "step": 5928 + }, + { + "epoch": 2.807457827759692, + "grad_norm": 1.3803904056549072, + "learning_rate": 2.1267967336250472e-07, + "loss": 0.2059, + "step": 5929 + }, + { + "epoch": 2.8079313406333233, + "grad_norm": 1.0127631425857544, + "learning_rate": 2.1163183166586943e-07, + "loss": 0.1983, + "step": 5930 + }, + { + "epoch": 2.808404853506955, + "grad_norm": 1.1045966148376465, + "learning_rate": 2.1058655002249573e-07, + "loss": 0.2302, + "step": 5931 + }, + { + "epoch": 2.808878366380586, + "grad_norm": 1.2346396446228027, + "learning_rate": 2.0954382870577162e-07, + "loss": 0.193, + "step": 5932 + }, + { + "epoch": 2.8093518792542174, + "grad_norm": 1.0287126302719116, + "learning_rate": 2.0850366798840892e-07, + "loss": 0.2041, + "step": 5933 + }, + { + "epoch": 2.8098253921278484, + "grad_norm": 1.2126495838165283, + "learning_rate": 2.074660681424512e-07, + "loss": 0.198, + "step": 5934 + }, + { + "epoch": 2.8102989050014795, + "grad_norm": 0.9380123615264893, + "learning_rate": 2.064310294392724e-07, + "loss": 0.1903, + "step": 5935 + }, + { + "epoch": 2.810772417875111, + "grad_norm": 1.0035574436187744, + "learning_rate": 2.0539855214957828e-07, + "loss": 0.2078, + "step": 5936 + }, + { + "epoch": 2.811245930748742, + "grad_norm": 1.22806715965271, + "learning_rate": 2.043686365434028e-07, + "loss": 0.2211, + "step": 5937 + }, + { + "epoch": 2.8117194436223736, + "grad_norm": 1.3754026889801025, + "learning_rate": 2.0334128289011046e-07, + "loss": 0.1877, + "step": 5938 + }, + { + "epoch": 2.8121929564960046, + "grad_norm": 1.7814515829086304, + "learning_rate": 2.0231649145839528e-07, + "loss": 0.2241, + "step": 5939 + }, + { + "epoch": 2.8126664693696357, + "grad_norm": 1.2736109495162964, + "learning_rate": 2.012942625162817e-07, + "loss": 0.186, + "step": 5940 + }, + { + "epoch": 2.8131399822432672, + "grad_norm": 1.2346388101577759, + "learning_rate": 2.002745963311248e-07, + "loss": 0.1886, + "step": 5941 + }, + { + "epoch": 2.8136134951168987, + "grad_norm": 0.9936687350273132, + "learning_rate": 1.9925749316960563e-07, + "loss": 0.2186, + "step": 5942 + }, + { + "epoch": 2.81408700799053, + "grad_norm": 0.9427998065948486, + "learning_rate": 1.9824295329774145e-07, + "loss": 0.1811, + "step": 5943 + }, + { + "epoch": 2.814560520864161, + "grad_norm": 1.290932059288025, + "learning_rate": 1.9723097698087336e-07, + "loss": 0.1865, + "step": 5944 + }, + { + "epoch": 2.8150340337377924, + "grad_norm": 1.3386746644973755, + "learning_rate": 1.9622156448367403e-07, + "loss": 0.1974, + "step": 5945 + }, + { + "epoch": 2.8155075466114234, + "grad_norm": 0.9645776748657227, + "learning_rate": 1.9521471607014565e-07, + "loss": 0.1782, + "step": 5946 + }, + { + "epoch": 2.815981059485055, + "grad_norm": 1.2050148248672485, + "learning_rate": 1.9421043200361976e-07, + "loss": 0.1887, + "step": 5947 + }, + { + "epoch": 2.816454572358686, + "grad_norm": 1.0068618059158325, + "learning_rate": 1.9320871254675745e-07, + "loss": 0.2107, + "step": 5948 + }, + { + "epoch": 2.816928085232317, + "grad_norm": 2.5484302043914795, + "learning_rate": 1.9220955796154794e-07, + "loss": 0.1889, + "step": 5949 + }, + { + "epoch": 2.8174015981059486, + "grad_norm": 1.0873286724090576, + "learning_rate": 1.9121296850931225e-07, + "loss": 0.2036, + "step": 5950 + }, + { + "epoch": 2.8178751109795797, + "grad_norm": 1.162097454071045, + "learning_rate": 1.9021894445069744e-07, + "loss": 0.2023, + "step": 5951 + }, + { + "epoch": 2.818348623853211, + "grad_norm": 1.1545616388320923, + "learning_rate": 1.892274860456811e-07, + "loss": 0.2182, + "step": 5952 + }, + { + "epoch": 2.8188221367268422, + "grad_norm": 1.2992243766784668, + "learning_rate": 1.8823859355356798e-07, + "loss": 0.1967, + "step": 5953 + }, + { + "epoch": 2.8192956496004733, + "grad_norm": 0.9885454177856445, + "learning_rate": 1.8725226723299682e-07, + "loss": 0.1857, + "step": 5954 + }, + { + "epoch": 2.819769162474105, + "grad_norm": 1.3030709028244019, + "learning_rate": 1.8626850734192904e-07, + "loss": 0.1846, + "step": 5955 + }, + { + "epoch": 2.820242675347736, + "grad_norm": 1.0009846687316895, + "learning_rate": 1.8528731413765877e-07, + "loss": 0.2014, + "step": 5956 + }, + { + "epoch": 2.8207161882213674, + "grad_norm": 1.367098331451416, + "learning_rate": 1.8430868787680633e-07, + "loss": 0.2086, + "step": 5957 + }, + { + "epoch": 2.8211897010949984, + "grad_norm": 1.151480793952942, + "learning_rate": 1.8333262881532476e-07, + "loss": 0.2091, + "step": 5958 + }, + { + "epoch": 2.8216632139686295, + "grad_norm": 1.2363460063934326, + "learning_rate": 1.8235913720848985e-07, + "loss": 0.2001, + "step": 5959 + }, + { + "epoch": 2.822136726842261, + "grad_norm": 1.2438163757324219, + "learning_rate": 1.8138821331091015e-07, + "loss": 0.2125, + "step": 5960 + }, + { + "epoch": 2.8226102397158925, + "grad_norm": 1.2525054216384888, + "learning_rate": 1.8041985737652256e-07, + "loss": 0.2103, + "step": 5961 + }, + { + "epoch": 2.8230837525895236, + "grad_norm": 1.3402167558670044, + "learning_rate": 1.7945406965858892e-07, + "loss": 0.1944, + "step": 5962 + }, + { + "epoch": 2.8235572654631547, + "grad_norm": 1.0227385759353638, + "learning_rate": 1.784908504097027e-07, + "loss": 0.2063, + "step": 5963 + }, + { + "epoch": 2.824030778336786, + "grad_norm": 1.5239243507385254, + "learning_rate": 1.7753019988178577e-07, + "loss": 0.1862, + "step": 5964 + }, + { + "epoch": 2.8245042912104172, + "grad_norm": 1.001605749130249, + "learning_rate": 1.765721183260849e-07, + "loss": 0.1981, + "step": 5965 + }, + { + "epoch": 2.8249778040840487, + "grad_norm": 1.0575411319732666, + "learning_rate": 1.7561660599317853e-07, + "loss": 0.2199, + "step": 5966 + }, + { + "epoch": 2.82545131695768, + "grad_norm": 1.230980396270752, + "learning_rate": 1.7466366313297123e-07, + "loss": 0.2134, + "step": 5967 + }, + { + "epoch": 2.825924829831311, + "grad_norm": 1.0976228713989258, + "learning_rate": 1.7371328999469695e-07, + "loss": 0.2227, + "step": 5968 + }, + { + "epoch": 2.8263983427049424, + "grad_norm": 1.2216429710388184, + "learning_rate": 1.7276548682691463e-07, + "loss": 0.1866, + "step": 5969 + }, + { + "epoch": 2.8268718555785735, + "grad_norm": 1.946909785270691, + "learning_rate": 1.71820253877516e-07, + "loss": 0.2028, + "step": 5970 + }, + { + "epoch": 2.827345368452205, + "grad_norm": 1.2762550115585327, + "learning_rate": 1.7087759139371442e-07, + "loss": 0.1854, + "step": 5971 + }, + { + "epoch": 2.827818881325836, + "grad_norm": 1.4644784927368164, + "learning_rate": 1.6993749962205597e-07, + "loss": 0.1905, + "step": 5972 + }, + { + "epoch": 2.828292394199467, + "grad_norm": 1.5430243015289307, + "learning_rate": 1.689999788084129e-07, + "loss": 0.2004, + "step": 5973 + }, + { + "epoch": 2.8287659070730986, + "grad_norm": 1.2102375030517578, + "learning_rate": 1.680650291979824e-07, + "loss": 0.1786, + "step": 5974 + }, + { + "epoch": 2.8292394199467297, + "grad_norm": 1.029317855834961, + "learning_rate": 1.671326510352944e-07, + "loss": 0.1895, + "step": 5975 + }, + { + "epoch": 2.829712932820361, + "grad_norm": 1.2381408214569092, + "learning_rate": 1.6620284456420167e-07, + "loss": 0.2229, + "step": 5976 + }, + { + "epoch": 2.8301864456939922, + "grad_norm": 0.9947028160095215, + "learning_rate": 1.652756100278874e-07, + "loss": 0.204, + "step": 5977 + }, + { + "epoch": 2.8306599585676233, + "grad_norm": 1.2137975692749023, + "learning_rate": 1.6435094766885873e-07, + "loss": 0.2174, + "step": 5978 + }, + { + "epoch": 2.831133471441255, + "grad_norm": 1.7809674739837646, + "learning_rate": 1.6342885772895445e-07, + "loss": 0.1848, + "step": 5979 + }, + { + "epoch": 2.8316069843148863, + "grad_norm": 1.0932118892669678, + "learning_rate": 1.625093404493372e-07, + "loss": 0.182, + "step": 5980 + }, + { + "epoch": 2.8320804971885174, + "grad_norm": 1.009130835533142, + "learning_rate": 1.6159239607049793e-07, + "loss": 0.187, + "step": 5981 + }, + { + "epoch": 2.8325540100621485, + "grad_norm": 1.0555768013000488, + "learning_rate": 1.606780248322548e-07, + "loss": 0.218, + "step": 5982 + }, + { + "epoch": 2.83302752293578, + "grad_norm": 1.0417250394821167, + "learning_rate": 1.597662269737521e-07, + "loss": 0.2098, + "step": 5983 + }, + { + "epoch": 2.833501035809411, + "grad_norm": 1.1635463237762451, + "learning_rate": 1.588570027334635e-07, + "loss": 0.2166, + "step": 5984 + }, + { + "epoch": 2.8339745486830425, + "grad_norm": 0.8822041749954224, + "learning_rate": 1.5795035234918543e-07, + "loss": 0.1992, + "step": 5985 + }, + { + "epoch": 2.8344480615566736, + "grad_norm": 1.387373685836792, + "learning_rate": 1.5704627605804601e-07, + "loss": 0.2067, + "step": 5986 + }, + { + "epoch": 2.8349215744303047, + "grad_norm": 1.6232830286026, + "learning_rate": 1.5614477409649497e-07, + "loss": 0.1954, + "step": 5987 + }, + { + "epoch": 2.835395087303936, + "grad_norm": 1.3384531736373901, + "learning_rate": 1.5524584670031372e-07, + "loss": 0.2028, + "step": 5988 + }, + { + "epoch": 2.8358686001775673, + "grad_norm": 1.060403823852539, + "learning_rate": 1.5434949410460753e-07, + "loss": 0.1862, + "step": 5989 + }, + { + "epoch": 2.8363421130511988, + "grad_norm": 1.383388876914978, + "learning_rate": 1.5345571654380775e-07, + "loss": 0.2104, + "step": 5990 + }, + { + "epoch": 2.83681562592483, + "grad_norm": 1.2764064073562622, + "learning_rate": 1.5256451425167406e-07, + "loss": 0.2042, + "step": 5991 + }, + { + "epoch": 2.837289138798461, + "grad_norm": 1.2864024639129639, + "learning_rate": 1.5167588746129224e-07, + "loss": 0.209, + "step": 5992 + }, + { + "epoch": 2.8377626516720924, + "grad_norm": 0.9310190081596375, + "learning_rate": 1.5078983640507416e-07, + "loss": 0.1899, + "step": 5993 + }, + { + "epoch": 2.8382361645457235, + "grad_norm": 1.218672752380371, + "learning_rate": 1.4990636131475554e-07, + "loss": 0.1884, + "step": 5994 + }, + { + "epoch": 2.838709677419355, + "grad_norm": 1.2877182960510254, + "learning_rate": 1.490254624214049e-07, + "loss": 0.1916, + "step": 5995 + }, + { + "epoch": 2.839183190292986, + "grad_norm": 1.661100149154663, + "learning_rate": 1.4814713995540797e-07, + "loss": 0.1996, + "step": 5996 + }, + { + "epoch": 2.839656703166617, + "grad_norm": 1.07488214969635, + "learning_rate": 1.472713941464865e-07, + "loss": 0.2045, + "step": 5997 + }, + { + "epoch": 2.8401302160402486, + "grad_norm": 1.1992322206497192, + "learning_rate": 1.4639822522367952e-07, + "loss": 0.1849, + "step": 5998 + }, + { + "epoch": 2.8406037289138797, + "grad_norm": 1.5754666328430176, + "learning_rate": 1.455276334153577e-07, + "loss": 0.21, + "step": 5999 + }, + { + "epoch": 2.841077241787511, + "grad_norm": 1.114166021347046, + "learning_rate": 1.4465961894921555e-07, + "loss": 0.2, + "step": 6000 + }, + { + "epoch": 2.8415507546611423, + "grad_norm": 1.2283138036727905, + "learning_rate": 1.4379418205227368e-07, + "loss": 0.1895, + "step": 6001 + }, + { + "epoch": 2.8420242675347733, + "grad_norm": 1.11212158203125, + "learning_rate": 1.4293132295087998e-07, + "loss": 0.205, + "step": 6002 + }, + { + "epoch": 2.842497780408405, + "grad_norm": 1.7816588878631592, + "learning_rate": 1.4207104187070386e-07, + "loss": 0.235, + "step": 6003 + }, + { + "epoch": 2.8429712932820363, + "grad_norm": 1.1824668645858765, + "learning_rate": 1.4121333903674762e-07, + "loss": 0.2125, + "step": 6004 + }, + { + "epoch": 2.8434448061556674, + "grad_norm": 1.4136723279953003, + "learning_rate": 1.4035821467333177e-07, + "loss": 0.2042, + "step": 6005 + }, + { + "epoch": 2.8439183190292985, + "grad_norm": 1.4217491149902344, + "learning_rate": 1.3950566900410856e-07, + "loss": 0.1737, + "step": 6006 + }, + { + "epoch": 2.84439183190293, + "grad_norm": 1.3726348876953125, + "learning_rate": 1.3865570225205072e-07, + "loss": 0.2252, + "step": 6007 + }, + { + "epoch": 2.844865344776561, + "grad_norm": 0.8112472891807556, + "learning_rate": 1.3780831463946042e-07, + "loss": 0.1636, + "step": 6008 + }, + { + "epoch": 2.8453388576501926, + "grad_norm": 1.479446530342102, + "learning_rate": 1.3696350638796263e-07, + "loss": 0.2228, + "step": 6009 + }, + { + "epoch": 2.8458123705238236, + "grad_norm": 1.054337501525879, + "learning_rate": 1.3612127771850947e-07, + "loss": 0.2159, + "step": 6010 + }, + { + "epoch": 2.8462858833974547, + "grad_norm": 1.3831230401992798, + "learning_rate": 1.3528162885137919e-07, + "loss": 0.1956, + "step": 6011 + }, + { + "epoch": 2.846759396271086, + "grad_norm": 0.9735446572303772, + "learning_rate": 1.3444456000617056e-07, + "loss": 0.1992, + "step": 6012 + }, + { + "epoch": 2.8472329091447173, + "grad_norm": 1.2813206911087036, + "learning_rate": 1.3361007140181293e-07, + "loss": 0.1998, + "step": 6013 + }, + { + "epoch": 2.847706422018349, + "grad_norm": 1.0976970195770264, + "learning_rate": 1.3277816325655835e-07, + "loss": 0.1954, + "step": 6014 + }, + { + "epoch": 2.84817993489198, + "grad_norm": 1.1628068685531616, + "learning_rate": 1.319488357879839e-07, + "loss": 0.2002, + "step": 6015 + }, + { + "epoch": 2.848653447765611, + "grad_norm": 1.114010214805603, + "learning_rate": 1.3112208921299274e-07, + "loss": 0.1749, + "step": 6016 + }, + { + "epoch": 2.8491269606392424, + "grad_norm": 1.5268833637237549, + "learning_rate": 1.3029792374781413e-07, + "loss": 0.2106, + "step": 6017 + }, + { + "epoch": 2.8496004735128735, + "grad_norm": 0.991341769695282, + "learning_rate": 1.294763396079979e-07, + "loss": 0.1836, + "step": 6018 + }, + { + "epoch": 2.850073986386505, + "grad_norm": 1.0029536485671997, + "learning_rate": 1.2865733700842098e-07, + "loss": 0.2071, + "step": 6019 + }, + { + "epoch": 2.850547499260136, + "grad_norm": 2.5262866020202637, + "learning_rate": 1.2784091616328876e-07, + "loss": 0.1866, + "step": 6020 + }, + { + "epoch": 2.851021012133767, + "grad_norm": 1.2454925775527954, + "learning_rate": 1.2702707728612596e-07, + "loss": 0.175, + "step": 6021 + }, + { + "epoch": 2.8514945250073986, + "grad_norm": 1.1130245923995972, + "learning_rate": 1.2621582058978455e-07, + "loss": 0.196, + "step": 6022 + }, + { + "epoch": 2.85196803788103, + "grad_norm": 1.1060099601745605, + "learning_rate": 1.2540714628644146e-07, + "loss": 0.191, + "step": 6023 + }, + { + "epoch": 2.852441550754661, + "grad_norm": 0.9595826268196106, + "learning_rate": 1.2460105458759753e-07, + "loss": 0.207, + "step": 6024 + }, + { + "epoch": 2.8529150636282923, + "grad_norm": 1.54879891872406, + "learning_rate": 1.2379754570407742e-07, + "loss": 0.2039, + "step": 6025 + }, + { + "epoch": 2.853388576501924, + "grad_norm": 1.1590107679367065, + "learning_rate": 1.2299661984603307e-07, + "loss": 0.2101, + "step": 6026 + }, + { + "epoch": 2.853862089375555, + "grad_norm": 1.0808899402618408, + "learning_rate": 1.2219827722293687e-07, + "loss": 0.1841, + "step": 6027 + }, + { + "epoch": 2.8543356022491864, + "grad_norm": 1.6629842519760132, + "learning_rate": 1.214025180435885e-07, + "loss": 0.2059, + "step": 6028 + }, + { + "epoch": 2.8548091151228174, + "grad_norm": 0.9478086829185486, + "learning_rate": 1.2060934251611146e-07, + "loss": 0.1861, + "step": 6029 + }, + { + "epoch": 2.8552826279964485, + "grad_norm": 1.5418405532836914, + "learning_rate": 1.1981875084795202e-07, + "loss": 0.1955, + "step": 6030 + }, + { + "epoch": 2.85575614087008, + "grad_norm": 1.2840735912322998, + "learning_rate": 1.190307432458826e-07, + "loss": 0.2217, + "step": 6031 + }, + { + "epoch": 2.856229653743711, + "grad_norm": 0.9639061689376831, + "learning_rate": 1.1824531991599831e-07, + "loss": 0.1806, + "step": 6032 + }, + { + "epoch": 2.8567031666173426, + "grad_norm": 1.2272212505340576, + "learning_rate": 1.1746248106372149e-07, + "loss": 0.1941, + "step": 6033 + }, + { + "epoch": 2.8571766794909736, + "grad_norm": 1.424233078956604, + "learning_rate": 1.1668222689379172e-07, + "loss": 0.1933, + "step": 6034 + }, + { + "epoch": 2.8576501923646047, + "grad_norm": 1.2118334770202637, + "learning_rate": 1.159045576102813e-07, + "loss": 0.2095, + "step": 6035 + }, + { + "epoch": 2.8581237052382362, + "grad_norm": 1.1151028871536255, + "learning_rate": 1.1512947341657976e-07, + "loss": 0.1909, + "step": 6036 + }, + { + "epoch": 2.8585972181118673, + "grad_norm": 1.1803522109985352, + "learning_rate": 1.1435697451540385e-07, + "loss": 0.1931, + "step": 6037 + }, + { + "epoch": 2.859070730985499, + "grad_norm": 1.8160289525985718, + "learning_rate": 1.1358706110879302e-07, + "loss": 0.2031, + "step": 6038 + }, + { + "epoch": 2.85954424385913, + "grad_norm": 1.0799731016159058, + "learning_rate": 1.1281973339810847e-07, + "loss": 0.1992, + "step": 6039 + }, + { + "epoch": 2.860017756732761, + "grad_norm": 1.1362674236297607, + "learning_rate": 1.1205499158404187e-07, + "loss": 0.1917, + "step": 6040 + }, + { + "epoch": 2.8604912696063924, + "grad_norm": 2.0242366790771484, + "learning_rate": 1.1129283586659988e-07, + "loss": 0.2059, + "step": 6041 + }, + { + "epoch": 2.860964782480024, + "grad_norm": 0.9954387545585632, + "learning_rate": 1.1053326644511863e-07, + "loss": 0.1961, + "step": 6042 + }, + { + "epoch": 2.861438295353655, + "grad_norm": 1.4265193939208984, + "learning_rate": 1.0977628351825697e-07, + "loss": 0.2002, + "step": 6043 + }, + { + "epoch": 2.861911808227286, + "grad_norm": 1.1525962352752686, + "learning_rate": 1.0902188728399433e-07, + "loss": 0.2124, + "step": 6044 + }, + { + "epoch": 2.8623853211009176, + "grad_norm": 0.9790692925453186, + "learning_rate": 1.082700779396384e-07, + "loss": 0.1971, + "step": 6045 + }, + { + "epoch": 2.8628588339745487, + "grad_norm": 1.0439430475234985, + "learning_rate": 1.0752085568181524e-07, + "loss": 0.191, + "step": 6046 + }, + { + "epoch": 2.86333234684818, + "grad_norm": 0.9470370411872864, + "learning_rate": 1.0677422070647924e-07, + "loss": 0.1869, + "step": 6047 + }, + { + "epoch": 2.8638058597218112, + "grad_norm": 0.9897739291191101, + "learning_rate": 1.0603017320890307e-07, + "loss": 0.1836, + "step": 6048 + }, + { + "epoch": 2.8642793725954423, + "grad_norm": 0.8592694997787476, + "learning_rate": 1.0528871338368773e-07, + "loss": 0.1808, + "step": 6049 + }, + { + "epoch": 2.864752885469074, + "grad_norm": 1.6525866985321045, + "learning_rate": 1.0454984142475145e-07, + "loss": 0.2002, + "step": 6050 + }, + { + "epoch": 2.865226398342705, + "grad_norm": 1.4348740577697754, + "learning_rate": 1.0381355752534295e-07, + "loss": 0.179, + "step": 6051 + }, + { + "epoch": 2.8656999112163364, + "grad_norm": 0.9945125579833984, + "learning_rate": 1.0307986187802709e-07, + "loss": 0.1853, + "step": 6052 + }, + { + "epoch": 2.8661734240899674, + "grad_norm": 1.0377004146575928, + "learning_rate": 1.0234875467469707e-07, + "loss": 0.1957, + "step": 6053 + }, + { + "epoch": 2.8666469369635985, + "grad_norm": 1.3555017709732056, + "learning_rate": 1.0162023610656547e-07, + "loss": 0.177, + "step": 6054 + }, + { + "epoch": 2.86712044983723, + "grad_norm": 1.3089735507965088, + "learning_rate": 1.0089430636416875e-07, + "loss": 0.1857, + "step": 6055 + }, + { + "epoch": 2.867593962710861, + "grad_norm": 1.1990454196929932, + "learning_rate": 1.001709656373695e-07, + "loss": 0.1987, + "step": 6056 + }, + { + "epoch": 2.8680674755844926, + "grad_norm": 0.9925453066825867, + "learning_rate": 9.94502141153475e-08, + "loss": 0.1858, + "step": 6057 + }, + { + "epoch": 2.8685409884581237, + "grad_norm": 1.205861210823059, + "learning_rate": 9.873205198660974e-08, + "loss": 0.182, + "step": 6058 + }, + { + "epoch": 2.8690145013317547, + "grad_norm": 1.1431076526641846, + "learning_rate": 9.801647943898484e-08, + "loss": 0.1988, + "step": 6059 + }, + { + "epoch": 2.8694880142053862, + "grad_norm": 1.1238492727279663, + "learning_rate": 9.730349665962424e-08, + "loss": 0.2117, + "step": 6060 + }, + { + "epoch": 2.8699615270790173, + "grad_norm": 1.089830994606018, + "learning_rate": 9.659310383499986e-08, + "loss": 0.2003, + "step": 6061 + }, + { + "epoch": 2.870435039952649, + "grad_norm": 1.0016067028045654, + "learning_rate": 9.588530115091088e-08, + "loss": 0.1881, + "step": 6062 + }, + { + "epoch": 2.87090855282628, + "grad_norm": 1.2619209289550781, + "learning_rate": 9.518008879247365e-08, + "loss": 0.2057, + "step": 6063 + }, + { + "epoch": 2.871382065699911, + "grad_norm": 1.600156307220459, + "learning_rate": 9.447746694413063e-08, + "loss": 0.1807, + "step": 6064 + }, + { + "epoch": 2.8718555785735425, + "grad_norm": 1.0998058319091797, + "learning_rate": 9.377743578964704e-08, + "loss": 0.2085, + "step": 6065 + }, + { + "epoch": 2.872329091447174, + "grad_norm": 1.203202486038208, + "learning_rate": 9.307999551210645e-08, + "loss": 0.1866, + "step": 6066 + }, + { + "epoch": 2.872802604320805, + "grad_norm": 1.0956839323043823, + "learning_rate": 9.23851462939207e-08, + "loss": 0.2027, + "step": 6067 + }, + { + "epoch": 2.873276117194436, + "grad_norm": 1.0432785749435425, + "learning_rate": 9.169288831681889e-08, + "loss": 0.1942, + "step": 6068 + }, + { + "epoch": 2.8737496300680676, + "grad_norm": 1.4329001903533936, + "learning_rate": 9.100322176185505e-08, + "loss": 0.1827, + "step": 6069 + }, + { + "epoch": 2.8742231429416987, + "grad_norm": 0.974470317363739, + "learning_rate": 9.031614680940381e-08, + "loss": 0.2002, + "step": 6070 + }, + { + "epoch": 2.87469665581533, + "grad_norm": 1.5563040971755981, + "learning_rate": 8.963166363916586e-08, + "loss": 0.188, + "step": 6071 + }, + { + "epoch": 2.8751701686889612, + "grad_norm": 1.2367172241210938, + "learning_rate": 8.894977243015801e-08, + "loss": 0.2312, + "step": 6072 + }, + { + "epoch": 2.8756436815625923, + "grad_norm": 1.5719220638275146, + "learning_rate": 8.827047336072426e-08, + "loss": 0.1985, + "step": 6073 + }, + { + "epoch": 2.876117194436224, + "grad_norm": 1.091109037399292, + "learning_rate": 8.759376660852803e-08, + "loss": 0.1942, + "step": 6074 + }, + { + "epoch": 2.876590707309855, + "grad_norm": 0.9197636842727661, + "learning_rate": 8.691965235055444e-08, + "loss": 0.1919, + "step": 6075 + }, + { + "epoch": 2.8770642201834864, + "grad_norm": 1.4586553573608398, + "learning_rate": 8.624813076311356e-08, + "loss": 0.2025, + "step": 6076 + }, + { + "epoch": 2.8775377330571175, + "grad_norm": 1.1022119522094727, + "learning_rate": 8.557920202183379e-08, + "loss": 0.2004, + "step": 6077 + }, + { + "epoch": 2.8780112459307485, + "grad_norm": 1.0540403127670288, + "learning_rate": 8.491286630166851e-08, + "loss": 0.2178, + "step": 6078 + }, + { + "epoch": 2.87848475880438, + "grad_norm": 1.0437299013137817, + "learning_rate": 8.424912377688943e-08, + "loss": 0.1937, + "step": 6079 + }, + { + "epoch": 2.878958271678011, + "grad_norm": 1.272997498512268, + "learning_rate": 8.358797462109325e-08, + "loss": 0.186, + "step": 6080 + }, + { + "epoch": 2.8794317845516426, + "grad_norm": 1.2749963998794556, + "learning_rate": 8.29294190071972e-08, + "loss": 0.2155, + "step": 6081 + }, + { + "epoch": 2.8799052974252737, + "grad_norm": 0.9653557538986206, + "learning_rate": 8.227345710744018e-08, + "loss": 0.1891, + "step": 6082 + }, + { + "epoch": 2.8803788102989047, + "grad_norm": 1.1792298555374146, + "learning_rate": 8.16200890933827e-08, + "loss": 0.1945, + "step": 6083 + }, + { + "epoch": 2.8808523231725363, + "grad_norm": 1.3401811122894287, + "learning_rate": 8.096931513590589e-08, + "loss": 0.1901, + "step": 6084 + }, + { + "epoch": 2.8813258360461678, + "grad_norm": 0.9627504944801331, + "learning_rate": 8.03211354052147e-08, + "loss": 0.2, + "step": 6085 + }, + { + "epoch": 2.881799348919799, + "grad_norm": 1.163562536239624, + "learning_rate": 7.967555007083239e-08, + "loss": 0.1979, + "step": 6086 + }, + { + "epoch": 2.88227286179343, + "grad_norm": 1.035914421081543, + "learning_rate": 7.903255930160836e-08, + "loss": 0.1956, + "step": 6087 + }, + { + "epoch": 2.8827463746670614, + "grad_norm": 1.300705075263977, + "learning_rate": 7.83921632657092e-08, + "loss": 0.2012, + "step": 6088 + }, + { + "epoch": 2.8832198875406925, + "grad_norm": 1.1752310991287231, + "learning_rate": 7.775436213062426e-08, + "loss": 0.1685, + "step": 6089 + }, + { + "epoch": 2.883693400414324, + "grad_norm": 0.986862301826477, + "learning_rate": 7.711915606316345e-08, + "loss": 0.1726, + "step": 6090 + }, + { + "epoch": 2.884166913287955, + "grad_norm": 1.0843511819839478, + "learning_rate": 7.648654522946053e-08, + "loss": 0.1928, + "step": 6091 + }, + { + "epoch": 2.884640426161586, + "grad_norm": 1.3774464130401611, + "learning_rate": 7.58565297949676e-08, + "loss": 0.1992, + "step": 6092 + }, + { + "epoch": 2.8851139390352176, + "grad_norm": 0.9635332822799683, + "learning_rate": 7.522910992445842e-08, + "loss": 0.2011, + "step": 6093 + }, + { + "epoch": 2.8855874519088487, + "grad_norm": 0.9744061827659607, + "learning_rate": 7.46042857820306e-08, + "loss": 0.1843, + "step": 6094 + }, + { + "epoch": 2.88606096478248, + "grad_norm": 1.0627609491348267, + "learning_rate": 7.39820575311001e-08, + "loss": 0.2044, + "step": 6095 + }, + { + "epoch": 2.8865344776561113, + "grad_norm": 1.7792444229125977, + "learning_rate": 7.33624253344034e-08, + "loss": 0.1901, + "step": 6096 + }, + { + "epoch": 2.8870079905297423, + "grad_norm": 1.1715210676193237, + "learning_rate": 7.274538935400199e-08, + "loss": 0.2155, + "step": 6097 + }, + { + "epoch": 2.887481503403374, + "grad_norm": 1.223122000694275, + "learning_rate": 7.213094975127233e-08, + "loss": 0.1888, + "step": 6098 + }, + { + "epoch": 2.887955016277005, + "grad_norm": 1.16334068775177, + "learning_rate": 7.151910668691808e-08, + "loss": 0.209, + "step": 6099 + }, + { + "epoch": 2.8884285291506364, + "grad_norm": 1.090516209602356, + "learning_rate": 7.090986032095903e-08, + "loss": 0.1873, + "step": 6100 + }, + { + "epoch": 2.8889020420242675, + "grad_norm": 0.9286671876907349, + "learning_rate": 7.030321081273883e-08, + "loss": 0.1888, + "step": 6101 + }, + { + "epoch": 2.8893755548978985, + "grad_norm": 0.9373717904090881, + "learning_rate": 6.969915832092056e-08, + "loss": 0.1875, + "step": 6102 + }, + { + "epoch": 2.88984906777153, + "grad_norm": 1.0975648164749146, + "learning_rate": 6.909770300348784e-08, + "loss": 0.1918, + "step": 6103 + }, + { + "epoch": 2.8903225806451616, + "grad_norm": 1.237648844718933, + "learning_rate": 6.849884501774484e-08, + "loss": 0.2088, + "step": 6104 + }, + { + "epoch": 2.8907960935187926, + "grad_norm": 1.114311933517456, + "learning_rate": 6.790258452031962e-08, + "loss": 0.1929, + "step": 6105 + }, + { + "epoch": 2.8912696063924237, + "grad_norm": 1.3131335973739624, + "learning_rate": 6.73089216671563e-08, + "loss": 0.1966, + "step": 6106 + }, + { + "epoch": 2.891743119266055, + "grad_norm": 1.0691511631011963, + "learning_rate": 6.671785661352182e-08, + "loss": 0.195, + "step": 6107 + }, + { + "epoch": 2.8922166321396863, + "grad_norm": 1.2618228197097778, + "learning_rate": 6.612938951400472e-08, + "loss": 0.1809, + "step": 6108 + }, + { + "epoch": 2.892690145013318, + "grad_norm": 1.2614898681640625, + "learning_rate": 6.554352052251079e-08, + "loss": 0.2123, + "step": 6109 + }, + { + "epoch": 2.893163657886949, + "grad_norm": 1.0897921323776245, + "learning_rate": 6.496024979226967e-08, + "loss": 0.2032, + "step": 6110 + }, + { + "epoch": 2.89363717076058, + "grad_norm": 1.0847673416137695, + "learning_rate": 6.437957747583046e-08, + "loss": 0.1998, + "step": 6111 + }, + { + "epoch": 2.8941106836342114, + "grad_norm": 1.2519115209579468, + "learning_rate": 6.380150372506277e-08, + "loss": 0.1941, + "step": 6112 + }, + { + "epoch": 2.8945841965078425, + "grad_norm": 1.4796113967895508, + "learning_rate": 6.322602869115568e-08, + "loss": 0.2203, + "step": 6113 + }, + { + "epoch": 2.895057709381474, + "grad_norm": 1.2136837244033813, + "learning_rate": 6.265315252461878e-08, + "loss": 0.1934, + "step": 6114 + }, + { + "epoch": 2.895531222255105, + "grad_norm": 1.1030209064483643, + "learning_rate": 6.208287537528223e-08, + "loss": 0.2029, + "step": 6115 + }, + { + "epoch": 2.896004735128736, + "grad_norm": 0.975267231464386, + "learning_rate": 6.151519739229672e-08, + "loss": 0.1909, + "step": 6116 + }, + { + "epoch": 2.8964782480023676, + "grad_norm": 1.1102718114852905, + "learning_rate": 6.095011872413347e-08, + "loss": 0.1771, + "step": 6117 + }, + { + "epoch": 2.8969517608759987, + "grad_norm": 1.001387596130371, + "learning_rate": 6.038763951858206e-08, + "loss": 0.1903, + "step": 6118 + }, + { + "epoch": 2.89742527374963, + "grad_norm": 0.9320979118347168, + "learning_rate": 5.982775992275592e-08, + "loss": 0.2005, + "step": 6119 + }, + { + "epoch": 2.8978987866232613, + "grad_norm": 1.0721495151519775, + "learning_rate": 5.9270480083083445e-08, + "loss": 0.2199, + "step": 6120 + }, + { + "epoch": 2.8983722994968923, + "grad_norm": 1.3128505945205688, + "learning_rate": 5.871580014531697e-08, + "loss": 0.2027, + "step": 6121 + }, + { + "epoch": 2.898845812370524, + "grad_norm": 0.9453513622283936, + "learning_rate": 5.816372025452821e-08, + "loss": 0.1975, + "step": 6122 + }, + { + "epoch": 2.899319325244155, + "grad_norm": 1.4196256399154663, + "learning_rate": 5.7614240555107224e-08, + "loss": 0.2019, + "step": 6123 + }, + { + "epoch": 2.8997928381177864, + "grad_norm": 1.2253470420837402, + "learning_rate": 5.706736119076683e-08, + "loss": 0.2197, + "step": 6124 + }, + { + "epoch": 2.9002663509914175, + "grad_norm": 1.5370304584503174, + "learning_rate": 5.652308230453596e-08, + "loss": 0.198, + "step": 6125 + }, + { + "epoch": 2.9007398638650486, + "grad_norm": 1.08665931224823, + "learning_rate": 5.5981404038767394e-08, + "loss": 0.1892, + "step": 6126 + }, + { + "epoch": 2.90121337673868, + "grad_norm": 1.2362626791000366, + "learning_rate": 5.5442326535130044e-08, + "loss": 0.19, + "step": 6127 + }, + { + "epoch": 2.9016868896123116, + "grad_norm": 1.4645034074783325, + "learning_rate": 5.490584993461556e-08, + "loss": 0.1764, + "step": 6128 + }, + { + "epoch": 2.9021604024859426, + "grad_norm": 1.3126651048660278, + "learning_rate": 5.4371974377533944e-08, + "loss": 0.18, + "step": 6129 + }, + { + "epoch": 2.9026339153595737, + "grad_norm": 0.9817326068878174, + "learning_rate": 5.384070000351571e-08, + "loss": 0.1861, + "step": 6130 + }, + { + "epoch": 2.9031074282332052, + "grad_norm": 1.1825339794158936, + "learning_rate": 5.331202695151083e-08, + "loss": 0.1834, + "step": 6131 + }, + { + "epoch": 2.9035809411068363, + "grad_norm": 1.0101470947265625, + "learning_rate": 5.278595535978648e-08, + "loss": 0.2065, + "step": 6132 + }, + { + "epoch": 2.904054453980468, + "grad_norm": 0.9827751517295837, + "learning_rate": 5.22624853659337e-08, + "loss": 0.2021, + "step": 6133 + }, + { + "epoch": 2.904527966854099, + "grad_norm": 1.8256947994232178, + "learning_rate": 5.174161710685965e-08, + "loss": 0.2104, + "step": 6134 + }, + { + "epoch": 2.90500147972773, + "grad_norm": 1.0786854028701782, + "learning_rate": 5.122335071879425e-08, + "loss": 0.2024, + "step": 6135 + }, + { + "epoch": 2.9054749926013614, + "grad_norm": 1.2459017038345337, + "learning_rate": 5.0707686337282404e-08, + "loss": 0.1871, + "step": 6136 + }, + { + "epoch": 2.9059485054749925, + "grad_norm": 1.1489919424057007, + "learning_rate": 5.0194624097194e-08, + "loss": 0.1885, + "step": 6137 + }, + { + "epoch": 2.906422018348624, + "grad_norm": 1.2657074928283691, + "learning_rate": 4.968416413271393e-08, + "loss": 0.1907, + "step": 6138 + }, + { + "epoch": 2.906895531222255, + "grad_norm": 1.3893004655838013, + "learning_rate": 4.9176306577347624e-08, + "loss": 0.2081, + "step": 6139 + }, + { + "epoch": 2.907369044095886, + "grad_norm": 1.1752687692642212, + "learning_rate": 4.8671051563922156e-08, + "loss": 0.2268, + "step": 6140 + }, + { + "epoch": 2.9078425569695177, + "grad_norm": 0.9716362953186035, + "learning_rate": 4.816839922457961e-08, + "loss": 0.188, + "step": 6141 + }, + { + "epoch": 2.9083160698431487, + "grad_norm": 1.7129836082458496, + "learning_rate": 4.766834969078704e-08, + "loss": 0.2392, + "step": 6142 + }, + { + "epoch": 2.9087895827167802, + "grad_norm": 1.019442081451416, + "learning_rate": 4.717090309332428e-08, + "loss": 0.1874, + "step": 6143 + }, + { + "epoch": 2.9092630955904113, + "grad_norm": 1.5065118074417114, + "learning_rate": 4.667605956229615e-08, + "loss": 0.1921, + "step": 6144 + }, + { + "epoch": 2.9097366084640424, + "grad_norm": 0.9485598206520081, + "learning_rate": 4.618381922712245e-08, + "loss": 0.1998, + "step": 6145 + }, + { + "epoch": 2.910210121337674, + "grad_norm": 0.9879134297370911, + "learning_rate": 4.5694182216544645e-08, + "loss": 0.2106, + "step": 6146 + }, + { + "epoch": 2.9106836342113054, + "grad_norm": 1.3310444355010986, + "learning_rate": 4.520714865862252e-08, + "loss": 0.2027, + "step": 6147 + }, + { + "epoch": 2.9111571470849364, + "grad_norm": 1.6184104681015015, + "learning_rate": 4.47227186807353e-08, + "loss": 0.1938, + "step": 6148 + }, + { + "epoch": 2.9116306599585675, + "grad_norm": 0.9067788124084473, + "learning_rate": 4.4240892409580516e-08, + "loss": 0.1856, + "step": 6149 + }, + { + "epoch": 2.912104172832199, + "grad_norm": 1.6381840705871582, + "learning_rate": 4.3761669971176255e-08, + "loss": 0.2039, + "step": 6150 + }, + { + "epoch": 2.91257768570583, + "grad_norm": 1.0911681652069092, + "learning_rate": 4.328505149085782e-08, + "loss": 0.1867, + "step": 6151 + }, + { + "epoch": 2.9130511985794616, + "grad_norm": 0.9727244973182678, + "learning_rate": 4.281103709327883e-08, + "loss": 0.2022, + "step": 6152 + }, + { + "epoch": 2.9135247114530927, + "grad_norm": 0.9916009306907654, + "learning_rate": 4.233962690241567e-08, + "loss": 0.1753, + "step": 6153 + }, + { + "epoch": 2.9139982243267237, + "grad_norm": 1.301115870475769, + "learning_rate": 4.18708210415586e-08, + "loss": 0.1829, + "step": 6154 + }, + { + "epoch": 2.9144717372003552, + "grad_norm": 1.1094136238098145, + "learning_rate": 4.140461963332065e-08, + "loss": 0.2055, + "step": 6155 + }, + { + "epoch": 2.9149452500739863, + "grad_norm": 0.9840229749679565, + "learning_rate": 4.094102279963319e-08, + "loss": 0.2242, + "step": 6156 + }, + { + "epoch": 2.915418762947618, + "grad_norm": 1.3108941316604614, + "learning_rate": 4.048003066174366e-08, + "loss": 0.1853, + "step": 6157 + }, + { + "epoch": 2.915892275821249, + "grad_norm": 1.0700182914733887, + "learning_rate": 4.002164334022118e-08, + "loss": 0.2149, + "step": 6158 + }, + { + "epoch": 2.91636578869488, + "grad_norm": 1.2526887655258179, + "learning_rate": 3.956586095495207e-08, + "loss": 0.1841, + "step": 6159 + }, + { + "epoch": 2.9168393015685115, + "grad_norm": 1.097798466682434, + "learning_rate": 3.911268362514209e-08, + "loss": 0.1977, + "step": 6160 + }, + { + "epoch": 2.9173128144421425, + "grad_norm": 1.293502688407898, + "learning_rate": 3.866211146931531e-08, + "loss": 0.2091, + "step": 6161 + }, + { + "epoch": 2.917786327315774, + "grad_norm": 1.0824062824249268, + "learning_rate": 3.821414460531414e-08, + "loss": 0.2008, + "step": 6162 + }, + { + "epoch": 2.918259840189405, + "grad_norm": 1.2927944660186768, + "learning_rate": 3.776878315030042e-08, + "loss": 0.2031, + "step": 6163 + }, + { + "epoch": 2.918733353063036, + "grad_norm": 1.0434043407440186, + "learning_rate": 3.73260272207554e-08, + "loss": 0.204, + "step": 6164 + }, + { + "epoch": 2.9192068659366677, + "grad_norm": 0.9378446936607361, + "learning_rate": 3.688587693247536e-08, + "loss": 0.1847, + "step": 6165 + }, + { + "epoch": 2.919680378810299, + "grad_norm": 1.1560747623443604, + "learning_rate": 3.644833240057821e-08, + "loss": 0.2163, + "step": 6166 + }, + { + "epoch": 2.9201538916839302, + "grad_norm": 0.9858490228652954, + "learning_rate": 3.601339373950019e-08, + "loss": 0.1798, + "step": 6167 + }, + { + "epoch": 2.9206274045575613, + "grad_norm": 1.1007888317108154, + "learning_rate": 3.558106106299475e-08, + "loss": 0.1843, + "step": 6168 + }, + { + "epoch": 2.921100917431193, + "grad_norm": 1.1641054153442383, + "learning_rate": 3.515133448413366e-08, + "loss": 0.187, + "step": 6169 + }, + { + "epoch": 2.921574430304824, + "grad_norm": 1.4323322772979736, + "learning_rate": 3.472421411530924e-08, + "loss": 0.1945, + "step": 6170 + }, + { + "epoch": 2.9220479431784554, + "grad_norm": 1.1009232997894287, + "learning_rate": 3.429970006822991e-08, + "loss": 0.1778, + "step": 6171 + }, + { + "epoch": 2.9225214560520865, + "grad_norm": 1.0793761014938354, + "learning_rate": 3.387779245392242e-08, + "loss": 0.1852, + "step": 6172 + }, + { + "epoch": 2.9229949689257175, + "grad_norm": 1.315026879310608, + "learning_rate": 3.345849138273405e-08, + "loss": 0.2095, + "step": 6173 + }, + { + "epoch": 2.923468481799349, + "grad_norm": 1.0471131801605225, + "learning_rate": 3.304179696432708e-08, + "loss": 0.2058, + "step": 6174 + }, + { + "epoch": 2.92394199467298, + "grad_norm": 1.2734088897705078, + "learning_rate": 3.262770930768655e-08, + "loss": 0.1976, + "step": 6175 + }, + { + "epoch": 2.9244155075466116, + "grad_norm": 1.2894588708877563, + "learning_rate": 3.2216228521111393e-08, + "loss": 0.2161, + "step": 6176 + }, + { + "epoch": 2.9248890204202427, + "grad_norm": 0.9765024781227112, + "learning_rate": 3.180735471222107e-08, + "loss": 0.2171, + "step": 6177 + }, + { + "epoch": 2.9253625332938737, + "grad_norm": 0.9731594324111938, + "learning_rate": 3.140108798795227e-08, + "loss": 0.1879, + "step": 6178 + }, + { + "epoch": 2.9258360461675053, + "grad_norm": 1.8532750606536865, + "learning_rate": 3.099742845455889e-08, + "loss": 0.1931, + "step": 6179 + }, + { + "epoch": 2.9263095590411363, + "grad_norm": 1.2193810939788818, + "learning_rate": 3.059637621761646e-08, + "loss": 0.1973, + "step": 6180 + }, + { + "epoch": 2.926783071914768, + "grad_norm": 1.1351593732833862, + "learning_rate": 3.019793138201554e-08, + "loss": 0.2097, + "step": 6181 + }, + { + "epoch": 2.927256584788399, + "grad_norm": 1.9396110773086548, + "learning_rate": 2.9802094051964993e-08, + "loss": 0.2438, + "step": 6182 + }, + { + "epoch": 2.92773009766203, + "grad_norm": 0.9938279986381531, + "learning_rate": 2.9408864330991993e-08, + "loss": 0.198, + "step": 6183 + }, + { + "epoch": 2.9282036105356615, + "grad_norm": 1.4137035608291626, + "learning_rate": 2.9018242321943168e-08, + "loss": 0.2003, + "step": 6184 + }, + { + "epoch": 2.9286771234092925, + "grad_norm": 1.076438546180725, + "learning_rate": 2.8630228126981242e-08, + "loss": 0.2066, + "step": 6185 + }, + { + "epoch": 2.929150636282924, + "grad_norm": 1.398909330368042, + "learning_rate": 2.8244821847587256e-08, + "loss": 0.1752, + "step": 6186 + }, + { + "epoch": 2.929624149156555, + "grad_norm": 0.9109712243080139, + "learning_rate": 2.7862023584561692e-08, + "loss": 0.196, + "step": 6187 + }, + { + "epoch": 2.930097662030186, + "grad_norm": 1.3671661615371704, + "learning_rate": 2.748183343802002e-08, + "loss": 0.2127, + "step": 6188 + }, + { + "epoch": 2.9305711749038177, + "grad_norm": 1.2796677350997925, + "learning_rate": 2.7104251507398262e-08, + "loss": 0.2206, + "step": 6189 + }, + { + "epoch": 2.931044687777449, + "grad_norm": 0.9807784557342529, + "learning_rate": 2.6729277891449634e-08, + "loss": 0.1877, + "step": 6190 + }, + { + "epoch": 2.9315182006510803, + "grad_norm": 0.9959766864776611, + "learning_rate": 2.6356912688244585e-08, + "loss": 0.187, + "step": 6191 + }, + { + "epoch": 2.9319917135247113, + "grad_norm": 1.6035785675048828, + "learning_rate": 2.5987155995171876e-08, + "loss": 0.1968, + "step": 6192 + }, + { + "epoch": 2.932465226398343, + "grad_norm": 1.377586841583252, + "learning_rate": 2.5620007908937483e-08, + "loss": 0.2178, + "step": 6193 + }, + { + "epoch": 2.932938739271974, + "grad_norm": 1.7171190977096558, + "learning_rate": 2.5255468525564598e-08, + "loss": 0.2138, + "step": 6194 + }, + { + "epoch": 2.9334122521456054, + "grad_norm": 1.3508435487747192, + "learning_rate": 2.489353794039695e-08, + "loss": 0.2017, + "step": 6195 + }, + { + "epoch": 2.9338857650192365, + "grad_norm": 1.0717988014221191, + "learning_rate": 2.4534216248092158e-08, + "loss": 0.1961, + "step": 6196 + }, + { + "epoch": 2.9343592778928675, + "grad_norm": 1.060861349105835, + "learning_rate": 2.4177503542627266e-08, + "loss": 0.2179, + "step": 6197 + }, + { + "epoch": 2.934832790766499, + "grad_norm": 1.0644173622131348, + "learning_rate": 2.382339991729987e-08, + "loss": 0.2089, + "step": 6198 + }, + { + "epoch": 2.93530630364013, + "grad_norm": 1.3779876232147217, + "learning_rate": 2.3471905464719226e-08, + "loss": 0.2026, + "step": 6199 + }, + { + "epoch": 2.9357798165137616, + "grad_norm": 1.585129737854004, + "learning_rate": 2.312302027681623e-08, + "loss": 0.2053, + "step": 6200 + }, + { + "epoch": 2.9362533293873927, + "grad_norm": 1.6859186887741089, + "learning_rate": 2.2776744444839017e-08, + "loss": 0.2124, + "step": 6201 + }, + { + "epoch": 2.9367268422610238, + "grad_norm": 1.0570391416549683, + "learning_rate": 2.24330780593518e-08, + "loss": 0.1986, + "step": 6202 + }, + { + "epoch": 2.9372003551346553, + "grad_norm": 1.2987060546875, + "learning_rate": 2.2092021210238233e-08, + "loss": 0.2184, + "step": 6203 + }, + { + "epoch": 2.9376738680082863, + "grad_norm": 1.2632862329483032, + "learning_rate": 2.1753573986698086e-08, + "loss": 0.2145, + "step": 6204 + }, + { + "epoch": 2.938147380881918, + "grad_norm": 0.9195615649223328, + "learning_rate": 2.141773647724832e-08, + "loss": 0.1825, + "step": 6205 + }, + { + "epoch": 2.938620893755549, + "grad_norm": 1.0756436586380005, + "learning_rate": 2.1084508769725344e-08, + "loss": 0.2099, + "step": 6206 + }, + { + "epoch": 2.93909440662918, + "grad_norm": 1.0231488943099976, + "learning_rate": 2.0753890951280554e-08, + "loss": 0.2167, + "step": 6207 + }, + { + "epoch": 2.9395679195028115, + "grad_norm": 1.5014095306396484, + "learning_rate": 2.0425883108383672e-08, + "loss": 0.1996, + "step": 6208 + }, + { + "epoch": 2.940041432376443, + "grad_norm": 1.0937631130218506, + "learning_rate": 2.010048532682274e-08, + "loss": 0.223, + "step": 6209 + }, + { + "epoch": 2.940514945250074, + "grad_norm": 1.103412389755249, + "learning_rate": 1.9777697691701904e-08, + "loss": 0.1888, + "step": 6210 + }, + { + "epoch": 2.940988458123705, + "grad_norm": 1.1209019422531128, + "learning_rate": 1.945752028744252e-08, + "loss": 0.1824, + "step": 6211 + }, + { + "epoch": 2.9414619709973366, + "grad_norm": 0.9927660226821899, + "learning_rate": 1.913995319778539e-08, + "loss": 0.186, + "step": 6212 + }, + { + "epoch": 2.9419354838709677, + "grad_norm": 1.021451473236084, + "learning_rate": 1.8824996505787398e-08, + "loss": 0.1983, + "step": 6213 + }, + { + "epoch": 2.942408996744599, + "grad_norm": 1.1407397985458374, + "learning_rate": 1.8512650293820433e-08, + "loss": 0.1997, + "step": 6214 + }, + { + "epoch": 2.9428825096182303, + "grad_norm": 0.8883314728736877, + "learning_rate": 1.820291464357693e-08, + "loss": 0.1719, + "step": 6215 + }, + { + "epoch": 2.9433560224918613, + "grad_norm": 1.1217479705810547, + "learning_rate": 1.789578963606431e-08, + "loss": 0.1944, + "step": 6216 + }, + { + "epoch": 2.943829535365493, + "grad_norm": 1.5166805982589722, + "learning_rate": 1.7591275351609428e-08, + "loss": 0.215, + "step": 6217 + }, + { + "epoch": 2.944303048239124, + "grad_norm": 1.0720276832580566, + "learning_rate": 1.7289371869854132e-08, + "loss": 0.1917, + "step": 6218 + }, + { + "epoch": 2.9447765611127554, + "grad_norm": 1.4796210527420044, + "learning_rate": 1.699007926975971e-08, + "loss": 0.2059, + "step": 6219 + }, + { + "epoch": 2.9452500739863865, + "grad_norm": 1.3808337450027466, + "learning_rate": 1.6693397629601317e-08, + "loss": 0.1986, + "step": 6220 + }, + { + "epoch": 2.9457235868600176, + "grad_norm": 1.275921106338501, + "learning_rate": 1.6399327026974666e-08, + "loss": 0.1888, + "step": 6221 + }, + { + "epoch": 2.946197099733649, + "grad_norm": 1.051120638847351, + "learning_rate": 1.6107867538790456e-08, + "loss": 0.2187, + "step": 6222 + }, + { + "epoch": 2.94667061260728, + "grad_norm": 1.0114779472351074, + "learning_rate": 1.58190192412766e-08, + "loss": 0.2084, + "step": 6223 + }, + { + "epoch": 2.9471441254809116, + "grad_norm": 1.3596932888031006, + "learning_rate": 1.5532782209979336e-08, + "loss": 0.1849, + "step": 6224 + }, + { + "epoch": 2.9476176383545427, + "grad_norm": 2.326784372329712, + "learning_rate": 1.524915651976211e-08, + "loss": 0.184, + "step": 6225 + }, + { + "epoch": 2.948091151228174, + "grad_norm": 1.3456201553344727, + "learning_rate": 1.4968142244802254e-08, + "loss": 0.2014, + "step": 6226 + }, + { + "epoch": 2.9485646641018053, + "grad_norm": 1.592707872390747, + "learning_rate": 1.4689739458597641e-08, + "loss": 0.193, + "step": 6227 + }, + { + "epoch": 2.9490381769754364, + "grad_norm": 1.3673793077468872, + "learning_rate": 1.4413948233961138e-08, + "loss": 0.2131, + "step": 6228 + }, + { + "epoch": 2.949511689849068, + "grad_norm": 1.1322108507156372, + "learning_rate": 1.414076864302505e-08, + "loss": 0.2056, + "step": 6229 + }, + { + "epoch": 2.949985202722699, + "grad_norm": 0.8551841974258423, + "learning_rate": 1.3870200757235552e-08, + "loss": 0.1825, + "step": 6230 + }, + { + "epoch": 2.9504587155963304, + "grad_norm": 0.9732584357261658, + "learning_rate": 1.3602244647356044e-08, + "loss": 0.208, + "step": 6231 + }, + { + "epoch": 2.9509322284699615, + "grad_norm": 1.1727527379989624, + "learning_rate": 1.3336900383469353e-08, + "loss": 0.1963, + "step": 6232 + }, + { + "epoch": 2.951405741343593, + "grad_norm": 0.9997616410255432, + "learning_rate": 1.30741680349733e-08, + "loss": 0.1923, + "step": 6233 + }, + { + "epoch": 2.951879254217224, + "grad_norm": 1.2932562828063965, + "learning_rate": 1.2814047670584028e-08, + "loss": 0.1968, + "step": 6234 + }, + { + "epoch": 2.952352767090855, + "grad_norm": 1.0885539054870605, + "learning_rate": 1.2556539358331566e-08, + "loss": 0.1931, + "step": 6235 + }, + { + "epoch": 2.9528262799644867, + "grad_norm": 1.0896440744400024, + "learning_rate": 1.230164316556537e-08, + "loss": 0.1941, + "step": 6236 + }, + { + "epoch": 2.9532997928381177, + "grad_norm": 1.0477862358093262, + "learning_rate": 1.2049359158952111e-08, + "loss": 0.192, + "step": 6237 + }, + { + "epoch": 2.9537733057117492, + "grad_norm": 0.844715416431427, + "learning_rate": 1.1799687404473458e-08, + "loss": 0.1807, + "step": 6238 + }, + { + "epoch": 2.9542468185853803, + "grad_norm": 1.3414781093597412, + "learning_rate": 1.1552627967428288e-08, + "loss": 0.2202, + "step": 6239 + }, + { + "epoch": 2.9547203314590114, + "grad_norm": 0.9424867630004883, + "learning_rate": 1.1308180912432688e-08, + "loss": 0.1943, + "step": 6240 + }, + { + "epoch": 2.955193844332643, + "grad_norm": 1.0376644134521484, + "learning_rate": 1.1066346303421071e-08, + "loss": 0.1946, + "step": 6241 + }, + { + "epoch": 2.955667357206274, + "grad_norm": 1.2721221446990967, + "learning_rate": 1.0827124203640627e-08, + "loss": 0.1955, + "step": 6242 + }, + { + "epoch": 2.9561408700799054, + "grad_norm": 1.0730029344558716, + "learning_rate": 1.059051467565797e-08, + "loss": 0.1921, + "step": 6243 + }, + { + "epoch": 2.9566143829535365, + "grad_norm": 1.2354124784469604, + "learning_rate": 1.0356517781358044e-08, + "loss": 0.2074, + "step": 6244 + }, + { + "epoch": 2.9570878958271676, + "grad_norm": 0.9237876534461975, + "learning_rate": 1.0125133581938562e-08, + "loss": 0.2075, + "step": 6245 + }, + { + "epoch": 2.957561408700799, + "grad_norm": 1.4493162631988525, + "learning_rate": 9.896362137916672e-09, + "loss": 0.2207, + "step": 6246 + }, + { + "epoch": 2.95803492157443, + "grad_norm": 1.3988970518112183, + "learning_rate": 9.670203509124509e-09, + "loss": 0.197, + "step": 6247 + }, + { + "epoch": 2.9585084344480617, + "grad_norm": 1.1738380193710327, + "learning_rate": 9.44665775471254e-09, + "loss": 0.2102, + "step": 6248 + }, + { + "epoch": 2.9589819473216927, + "grad_norm": 1.2801897525787354, + "learning_rate": 9.225724933146218e-09, + "loss": 0.2061, + "step": 6249 + }, + { + "epoch": 2.959455460195324, + "grad_norm": 1.5334160327911377, + "learning_rate": 9.007405102209321e-09, + "loss": 0.1896, + "step": 6250 + }, + { + "epoch": 2.9599289730689553, + "grad_norm": 1.0199025869369507, + "learning_rate": 8.791698318999508e-09, + "loss": 0.2034, + "step": 6251 + }, + { + "epoch": 2.960402485942587, + "grad_norm": 0.8134233951568604, + "learning_rate": 8.578604639936095e-09, + "loss": 0.1722, + "step": 6252 + }, + { + "epoch": 2.960875998816218, + "grad_norm": 1.0595088005065918, + "learning_rate": 8.368124120747833e-09, + "loss": 0.1856, + "step": 6253 + }, + { + "epoch": 2.961349511689849, + "grad_norm": 0.9498534202575684, + "learning_rate": 8.160256816487355e-09, + "loss": 0.1602, + "step": 6254 + }, + { + "epoch": 2.9618230245634805, + "grad_norm": 0.8854544162750244, + "learning_rate": 7.95500278151784e-09, + "loss": 0.1959, + "step": 6255 + }, + { + "epoch": 2.9622965374371115, + "grad_norm": 1.1654853820800781, + "learning_rate": 7.752362069523012e-09, + "loss": 0.2078, + "step": 6256 + }, + { + "epoch": 2.962770050310743, + "grad_norm": 0.9548454880714417, + "learning_rate": 7.552334733500477e-09, + "loss": 0.1876, + "step": 6257 + }, + { + "epoch": 2.963243563184374, + "grad_norm": 1.966020107269287, + "learning_rate": 7.354920825766166e-09, + "loss": 0.1918, + "step": 6258 + }, + { + "epoch": 2.963717076058005, + "grad_norm": 1.6142258644104004, + "learning_rate": 7.160120397950998e-09, + "loss": 0.2173, + "step": 6259 + }, + { + "epoch": 2.9641905889316367, + "grad_norm": 0.9371702075004578, + "learning_rate": 6.967933501004221e-09, + "loss": 0.1948, + "step": 6260 + }, + { + "epoch": 2.9646641018052677, + "grad_norm": 1.0012017488479614, + "learning_rate": 6.778360185190069e-09, + "loss": 0.196, + "step": 6261 + }, + { + "epoch": 2.9651376146788992, + "grad_norm": 1.5582655668258667, + "learning_rate": 6.591400500088885e-09, + "loss": 0.197, + "step": 6262 + }, + { + "epoch": 2.9656111275525303, + "grad_norm": 1.4498084783554077, + "learning_rate": 6.407054494599329e-09, + "loss": 0.2011, + "step": 6263 + }, + { + "epoch": 2.9660846404261614, + "grad_norm": 1.6158671379089355, + "learning_rate": 6.2253222169339485e-09, + "loss": 0.1903, + "step": 6264 + }, + { + "epoch": 2.966558153299793, + "grad_norm": 1.1202692985534668, + "learning_rate": 6.046203714624721e-09, + "loss": 0.2186, + "step": 6265 + }, + { + "epoch": 2.967031666173424, + "grad_norm": 1.2885193824768066, + "learning_rate": 5.8696990345175064e-09, + "loss": 0.2111, + "step": 6266 + }, + { + "epoch": 2.9675051790470555, + "grad_norm": 0.9069783687591553, + "learning_rate": 5.695808222775379e-09, + "loss": 0.2006, + "step": 6267 + }, + { + "epoch": 2.9679786919206865, + "grad_norm": 1.5070737600326538, + "learning_rate": 5.524531324877513e-09, + "loss": 0.1914, + "step": 6268 + }, + { + "epoch": 2.9684522047943176, + "grad_norm": 1.019160509109497, + "learning_rate": 5.3558683856203e-09, + "loss": 0.21, + "step": 6269 + }, + { + "epoch": 2.968925717667949, + "grad_norm": 1.3049955368041992, + "learning_rate": 5.189819449116229e-09, + "loss": 0.1916, + "step": 6270 + }, + { + "epoch": 2.9693992305415806, + "grad_norm": 1.0356775522232056, + "learning_rate": 5.026384558792785e-09, + "loss": 0.2048, + "step": 6271 + }, + { + "epoch": 2.9698727434152117, + "grad_norm": 1.2920750379562378, + "learning_rate": 4.865563757394665e-09, + "loss": 0.2184, + "step": 6272 + }, + { + "epoch": 2.9703462562888427, + "grad_norm": 1.0191415548324585, + "learning_rate": 4.707357086983777e-09, + "loss": 0.1999, + "step": 6273 + }, + { + "epoch": 2.9708197691624743, + "grad_norm": 1.0479013919830322, + "learning_rate": 4.5517645889381346e-09, + "loss": 0.2006, + "step": 6274 + }, + { + "epoch": 2.9712932820361053, + "grad_norm": 1.0281206369400024, + "learning_rate": 4.398786303949632e-09, + "loss": 0.1796, + "step": 6275 + }, + { + "epoch": 2.971766794909737, + "grad_norm": 1.0202572345733643, + "learning_rate": 4.248422272029596e-09, + "loss": 0.2112, + "step": 6276 + }, + { + "epoch": 2.972240307783368, + "grad_norm": 1.0091017484664917, + "learning_rate": 4.100672532504346e-09, + "loss": 0.1999, + "step": 6277 + }, + { + "epoch": 2.972713820656999, + "grad_norm": 0.9518880844116211, + "learning_rate": 3.955537124016306e-09, + "loss": 0.1934, + "step": 6278 + }, + { + "epoch": 2.9731873335306305, + "grad_norm": 1.1970912218093872, + "learning_rate": 3.813016084522892e-09, + "loss": 0.1956, + "step": 6279 + }, + { + "epoch": 2.9736608464042615, + "grad_norm": 1.150801181793213, + "learning_rate": 3.673109451300949e-09, + "loss": 0.1675, + "step": 6280 + }, + { + "epoch": 2.974134359277893, + "grad_norm": 1.025273323059082, + "learning_rate": 3.53581726094121e-09, + "loss": 0.1821, + "step": 6281 + }, + { + "epoch": 2.974607872151524, + "grad_norm": 1.1569122076034546, + "learning_rate": 3.4011395493505073e-09, + "loss": 0.1819, + "step": 6282 + }, + { + "epoch": 2.975081385025155, + "grad_norm": 2.0140018463134766, + "learning_rate": 3.269076351752887e-09, + "loss": 0.1852, + "step": 6283 + }, + { + "epoch": 2.9755548978987867, + "grad_norm": 1.07260000705719, + "learning_rate": 3.139627702688497e-09, + "loss": 0.2121, + "step": 6284 + }, + { + "epoch": 2.9760284107724178, + "grad_norm": 1.5674619674682617, + "learning_rate": 3.0127936360124786e-09, + "loss": 0.1916, + "step": 6285 + }, + { + "epoch": 2.9765019236460493, + "grad_norm": 1.289867639541626, + "learning_rate": 2.888574184898296e-09, + "loss": 0.1823, + "step": 6286 + }, + { + "epoch": 2.9769754365196803, + "grad_norm": 1.5313868522644043, + "learning_rate": 2.7669693818332954e-09, + "loss": 0.1996, + "step": 6287 + }, + { + "epoch": 2.9774489493933114, + "grad_norm": 1.160865306854248, + "learning_rate": 2.6479792586220356e-09, + "loss": 0.211, + "step": 6288 + }, + { + "epoch": 2.977922462266943, + "grad_norm": 1.0990283489227295, + "learning_rate": 2.531603846386288e-09, + "loss": 0.1845, + "step": 6289 + }, + { + "epoch": 2.978395975140574, + "grad_norm": 1.126089096069336, + "learning_rate": 2.417843175561707e-09, + "loss": 0.189, + "step": 6290 + }, + { + "epoch": 2.9788694880142055, + "grad_norm": 1.5801364183425903, + "learning_rate": 2.30669727590227e-09, + "loss": 0.214, + "step": 6291 + }, + { + "epoch": 2.9793430008878365, + "grad_norm": 0.9013964533805847, + "learning_rate": 2.1981661764769456e-09, + "loss": 0.2052, + "step": 6292 + }, + { + "epoch": 2.979816513761468, + "grad_norm": 1.815775752067566, + "learning_rate": 2.0922499056708066e-09, + "loss": 0.1886, + "step": 6293 + }, + { + "epoch": 2.980290026635099, + "grad_norm": 1.0103025436401367, + "learning_rate": 1.9889484911850276e-09, + "loss": 0.1864, + "step": 6294 + }, + { + "epoch": 2.9807635395087306, + "grad_norm": 1.127058744430542, + "learning_rate": 1.8882619600368855e-09, + "loss": 0.2151, + "step": 6295 + }, + { + "epoch": 2.9812370523823617, + "grad_norm": 1.0886855125427246, + "learning_rate": 1.7901903385597607e-09, + "loss": 0.1955, + "step": 6296 + }, + { + "epoch": 2.9817105652559928, + "grad_norm": 1.1773751974105835, + "learning_rate": 1.694733652405356e-09, + "loss": 0.172, + "step": 6297 + }, + { + "epoch": 2.9821840781296243, + "grad_norm": 0.9003830552101135, + "learning_rate": 1.601891926537036e-09, + "loss": 0.2013, + "step": 6298 + }, + { + "epoch": 2.9826575910032553, + "grad_norm": 1.0472919940948486, + "learning_rate": 1.5116651852375985e-09, + "loss": 0.1943, + "step": 6299 + }, + { + "epoch": 2.983131103876887, + "grad_norm": 1.1670929193496704, + "learning_rate": 1.4240534521059447e-09, + "loss": 0.1955, + "step": 6300 + }, + { + "epoch": 2.983604616750518, + "grad_norm": 1.7558685541152954, + "learning_rate": 1.3390567500537466e-09, + "loss": 0.2085, + "step": 6301 + }, + { + "epoch": 2.984078129624149, + "grad_norm": 1.4146206378936768, + "learning_rate": 1.2566751013132205e-09, + "loss": 0.2048, + "step": 6302 + }, + { + "epoch": 2.9845516424977805, + "grad_norm": 1.2582781314849854, + "learning_rate": 1.1769085274304648e-09, + "loss": 0.2005, + "step": 6303 + }, + { + "epoch": 2.9850251553714116, + "grad_norm": 1.0818257331848145, + "learning_rate": 1.0997570492654597e-09, + "loss": 0.2016, + "step": 6304 + }, + { + "epoch": 2.985498668245043, + "grad_norm": 1.0899266004562378, + "learning_rate": 1.025220686998729e-09, + "loss": 0.2178, + "step": 6305 + }, + { + "epoch": 2.985972181118674, + "grad_norm": 0.9373279809951782, + "learning_rate": 9.53299460123569e-10, + "loss": 0.1966, + "step": 6306 + }, + { + "epoch": 2.986445693992305, + "grad_norm": 1.2788419723510742, + "learning_rate": 8.83993387450488e-10, + "loss": 0.2131, + "step": 6307 + }, + { + "epoch": 2.9869192068659367, + "grad_norm": 1.247537612915039, + "learning_rate": 8.173024871060974e-10, + "loss": 0.1796, + "step": 6308 + }, + { + "epoch": 2.9873927197395678, + "grad_norm": 1.1506285667419434, + "learning_rate": 7.532267765320012e-10, + "loss": 0.1883, + "step": 6309 + }, + { + "epoch": 2.9878662326131993, + "grad_norm": 1.517842173576355, + "learning_rate": 6.917662724870155e-10, + "loss": 0.2005, + "step": 6310 + }, + { + "epoch": 2.9883397454868303, + "grad_norm": 1.6192688941955566, + "learning_rate": 6.329209910460598e-10, + "loss": 0.1934, + "step": 6311 + }, + { + "epoch": 2.9888132583604614, + "grad_norm": 1.194815754890442, + "learning_rate": 5.766909475979354e-10, + "loss": 0.2053, + "step": 6312 + }, + { + "epoch": 2.989286771234093, + "grad_norm": 0.9704394340515137, + "learning_rate": 5.230761568508769e-10, + "loss": 0.1886, + "step": 6313 + }, + { + "epoch": 2.9897602841077244, + "grad_norm": 1.4745657444000244, + "learning_rate": 4.72076632827001e-10, + "loss": 0.2025, + "step": 6314 + }, + { + "epoch": 2.9902337969813555, + "grad_norm": 1.8984787464141846, + "learning_rate": 4.2369238886341704e-10, + "loss": 0.2104, + "step": 6315 + }, + { + "epoch": 2.9907073098549866, + "grad_norm": 1.1849009990692139, + "learning_rate": 3.7792343761555717e-10, + "loss": 0.1879, + "step": 6316 + }, + { + "epoch": 2.991180822728618, + "grad_norm": 1.43865168094635, + "learning_rate": 3.347697910538461e-10, + "loss": 0.2133, + "step": 6317 + }, + { + "epoch": 2.991654335602249, + "grad_norm": 1.0743141174316406, + "learning_rate": 2.942314604648111e-10, + "loss": 0.1879, + "step": 6318 + }, + { + "epoch": 2.9921278484758806, + "grad_norm": 0.9571679830551147, + "learning_rate": 2.5630845645108207e-10, + "loss": 0.1998, + "step": 6319 + }, + { + "epoch": 2.9926013613495117, + "grad_norm": 0.9296308755874634, + "learning_rate": 2.210007889302812e-10, + "loss": 0.1982, + "step": 6320 + }, + { + "epoch": 2.993074874223143, + "grad_norm": 1.0719934701919556, + "learning_rate": 1.883084671372437e-10, + "loss": 0.1833, + "step": 6321 + }, + { + "epoch": 2.9935483870967743, + "grad_norm": 1.165964961051941, + "learning_rate": 1.5823149962179707e-10, + "loss": 0.198, + "step": 6322 + }, + { + "epoch": 2.9940218999704054, + "grad_norm": 1.052899956703186, + "learning_rate": 1.3076989425098162e-10, + "loss": 0.1998, + "step": 6323 + }, + { + "epoch": 2.994495412844037, + "grad_norm": 1.0535532236099243, + "learning_rate": 1.0592365820683015e-10, + "loss": 0.2141, + "step": 6324 + }, + { + "epoch": 2.994968925717668, + "grad_norm": 0.9778185486793518, + "learning_rate": 8.369279798747798e-11, + "loss": 0.1918, + "step": 6325 + }, + { + "epoch": 2.995442438591299, + "grad_norm": 1.1342120170593262, + "learning_rate": 6.407731940827333e-11, + "loss": 0.1983, + "step": 6326 + }, + { + "epoch": 2.9959159514649305, + "grad_norm": 1.020653247833252, + "learning_rate": 4.707722759733635e-11, + "loss": 0.1911, + "step": 6327 + }, + { + "epoch": 2.9963894643385616, + "grad_norm": 0.9396451115608215, + "learning_rate": 3.26925270033307e-11, + "loss": 0.1879, + "step": 6328 + }, + { + "epoch": 2.996862977212193, + "grad_norm": 0.9087719321250916, + "learning_rate": 2.0923221385471538e-11, + "loss": 0.1948, + "step": 6329 + }, + { + "epoch": 2.997336490085824, + "grad_norm": 1.0705500841140747, + "learning_rate": 1.1769313825737983e-11, + "loss": 0.1777, + "step": 6330 + }, + { + "epoch": 2.997810002959455, + "grad_norm": 1.0305906534194946, + "learning_rate": 5.230806714440206e-12, + "loss": 0.1944, + "step": 6331 + }, + { + "epoch": 2.9982835158330867, + "grad_norm": 1.1944894790649414, + "learning_rate": 1.307701764652336e-12, + "loss": 0.1901, + "step": 6332 + }, + { + "epoch": 2.9987570287067182, + "grad_norm": 1.1254169940948486, + "learning_rate": 0.0, + "loss": 0.1845, + "step": 6333 + }, + { + "epoch": 2.9987570287067182, + "step": 6333, + "total_flos": 6.209261276653158e+16, + "train_loss": 0.25655047147971993, + "train_runtime": 12932.3665, + "train_samples_per_second": 62.704, + "train_steps_per_second": 0.49 + } + ], + "logging_steps": 1.0, + "max_steps": 6333, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.209261276653158e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}