diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17262 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9983753046303816, + "eval_steps": 500, + "global_step": 2460, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012185215272136475, + "grad_norm": 6.834534852371434, + "learning_rate": 4.0650406504065046e-08, + "loss": 1.0814, + "step": 1 + }, + { + "epoch": 0.002437043054427295, + "grad_norm": 6.434639482368721, + "learning_rate": 8.130081300813009e-08, + "loss": 1.0829, + "step": 2 + }, + { + "epoch": 0.0036555645816409425, + "grad_norm": 6.863769364155968, + "learning_rate": 1.2195121951219514e-07, + "loss": 1.1046, + "step": 3 + }, + { + "epoch": 0.00487408610885459, + "grad_norm": 6.5857528488136925, + "learning_rate": 1.6260162601626018e-07, + "loss": 1.0853, + "step": 4 + }, + { + "epoch": 0.006092607636068237, + "grad_norm": 6.521091645053231, + "learning_rate": 2.0325203252032523e-07, + "loss": 1.0773, + "step": 5 + }, + { + "epoch": 0.007311129163281885, + "grad_norm": 6.29456329359191, + "learning_rate": 2.439024390243903e-07, + "loss": 1.0569, + "step": 6 + }, + { + "epoch": 0.008529650690495532, + "grad_norm": 6.6897100511742575, + "learning_rate": 2.845528455284553e-07, + "loss": 1.0615, + "step": 7 + }, + { + "epoch": 0.00974817221770918, + "grad_norm": 6.645918686532055, + "learning_rate": 3.2520325203252037e-07, + "loss": 1.11, + "step": 8 + }, + { + "epoch": 0.010966693744922826, + "grad_norm": 6.5860967765132505, + "learning_rate": 3.6585365853658536e-07, + "loss": 1.0968, + "step": 9 + }, + { + "epoch": 0.012185215272136474, + "grad_norm": 6.316550148573923, + "learning_rate": 4.0650406504065046e-07, + "loss": 1.0402, + "step": 10 + }, + { + "epoch": 0.013403736799350122, + "grad_norm": 6.589233946221034, + "learning_rate": 4.471544715447155e-07, + "loss": 1.0883, + "step": 11 + }, + { + "epoch": 0.01462225832656377, + "grad_norm": 5.923561630066145, + "learning_rate": 4.878048780487805e-07, + "loss": 1.0577, + "step": 12 + }, + { + "epoch": 0.015840779853777416, + "grad_norm": 6.107921911359583, + "learning_rate": 5.284552845528456e-07, + "loss": 1.0624, + "step": 13 + }, + { + "epoch": 0.017059301380991064, + "grad_norm": 5.923788461868563, + "learning_rate": 5.691056910569106e-07, + "loss": 1.0736, + "step": 14 + }, + { + "epoch": 0.018277822908204712, + "grad_norm": 5.874057009305477, + "learning_rate": 6.097560975609757e-07, + "loss": 1.0447, + "step": 15 + }, + { + "epoch": 0.01949634443541836, + "grad_norm": 5.031342070027601, + "learning_rate": 6.504065040650407e-07, + "loss": 1.0433, + "step": 16 + }, + { + "epoch": 0.020714865962632008, + "grad_norm": 4.869478502925938, + "learning_rate": 6.910569105691058e-07, + "loss": 1.0411, + "step": 17 + }, + { + "epoch": 0.021933387489845652, + "grad_norm": 4.534685569961031, + "learning_rate": 7.317073170731707e-07, + "loss": 0.9874, + "step": 18 + }, + { + "epoch": 0.0231519090170593, + "grad_norm": 4.357854034553876, + "learning_rate": 7.723577235772359e-07, + "loss": 1.0014, + "step": 19 + }, + { + "epoch": 0.024370430544272948, + "grad_norm": 4.315329088535952, + "learning_rate": 8.130081300813009e-07, + "loss": 0.9925, + "step": 20 + }, + { + "epoch": 0.025588952071486596, + "grad_norm": 3.3054166230275337, + "learning_rate": 8.53658536585366e-07, + "loss": 1.003, + "step": 21 + }, + { + "epoch": 0.026807473598700244, + "grad_norm": 2.726952834638993, + "learning_rate": 8.94308943089431e-07, + "loss": 0.9692, + "step": 22 + }, + { + "epoch": 0.028025995125913892, + "grad_norm": 2.7243029466264, + "learning_rate": 9.349593495934959e-07, + "loss": 1.0212, + "step": 23 + }, + { + "epoch": 0.02924451665312754, + "grad_norm": 2.5896267051583313, + "learning_rate": 9.75609756097561e-07, + "loss": 0.9757, + "step": 24 + }, + { + "epoch": 0.030463038180341188, + "grad_norm": 2.6706226110708498, + "learning_rate": 1.0162601626016261e-06, + "loss": 0.9801, + "step": 25 + }, + { + "epoch": 0.03168155970755483, + "grad_norm": 2.437924378367497, + "learning_rate": 1.0569105691056912e-06, + "loss": 0.9768, + "step": 26 + }, + { + "epoch": 0.03290008123476848, + "grad_norm": 2.5803827822332397, + "learning_rate": 1.0975609756097562e-06, + "loss": 0.9895, + "step": 27 + }, + { + "epoch": 0.03411860276198213, + "grad_norm": 2.203043465720474, + "learning_rate": 1.1382113821138213e-06, + "loss": 0.9726, + "step": 28 + }, + { + "epoch": 0.035337124289195776, + "grad_norm": 1.805085101507344, + "learning_rate": 1.1788617886178863e-06, + "loss": 0.9237, + "step": 29 + }, + { + "epoch": 0.036555645816409424, + "grad_norm": 2.1165116479858215, + "learning_rate": 1.2195121951219514e-06, + "loss": 0.9338, + "step": 30 + }, + { + "epoch": 0.03777416734362307, + "grad_norm": 2.3435725524236735, + "learning_rate": 1.2601626016260162e-06, + "loss": 0.9365, + "step": 31 + }, + { + "epoch": 0.03899268887083672, + "grad_norm": 2.2985182006567326, + "learning_rate": 1.3008130081300815e-06, + "loss": 0.9135, + "step": 32 + }, + { + "epoch": 0.04021121039805037, + "grad_norm": 2.1356048436615036, + "learning_rate": 1.3414634146341465e-06, + "loss": 0.9196, + "step": 33 + }, + { + "epoch": 0.041429731925264016, + "grad_norm": 2.028116965668269, + "learning_rate": 1.3821138211382116e-06, + "loss": 0.9074, + "step": 34 + }, + { + "epoch": 0.042648253452477664, + "grad_norm": 1.7266339650438713, + "learning_rate": 1.4227642276422766e-06, + "loss": 0.8969, + "step": 35 + }, + { + "epoch": 0.043866774979691305, + "grad_norm": 1.548462619619361, + "learning_rate": 1.4634146341463414e-06, + "loss": 0.9009, + "step": 36 + }, + { + "epoch": 0.04508529650690495, + "grad_norm": 1.2325234503287605, + "learning_rate": 1.5040650406504067e-06, + "loss": 0.8942, + "step": 37 + }, + { + "epoch": 0.0463038180341186, + "grad_norm": 1.119008985117944, + "learning_rate": 1.5447154471544717e-06, + "loss": 0.8869, + "step": 38 + }, + { + "epoch": 0.04752233956133225, + "grad_norm": 1.232748685157114, + "learning_rate": 1.5853658536585368e-06, + "loss": 0.8542, + "step": 39 + }, + { + "epoch": 0.048740861088545896, + "grad_norm": 1.6677472249438465, + "learning_rate": 1.6260162601626018e-06, + "loss": 0.8594, + "step": 40 + }, + { + "epoch": 0.049959382615759544, + "grad_norm": 1.5964345757340976, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.8481, + "step": 41 + }, + { + "epoch": 0.05117790414297319, + "grad_norm": 1.5241421915515152, + "learning_rate": 1.707317073170732e-06, + "loss": 0.8478, + "step": 42 + }, + { + "epoch": 0.05239642567018684, + "grad_norm": 1.350659230101709, + "learning_rate": 1.747967479674797e-06, + "loss": 0.8669, + "step": 43 + }, + { + "epoch": 0.05361494719740049, + "grad_norm": 1.0459886477510316, + "learning_rate": 1.788617886178862e-06, + "loss": 0.8229, + "step": 44 + }, + { + "epoch": 0.054833468724614136, + "grad_norm": 0.9790437025811132, + "learning_rate": 1.8292682926829268e-06, + "loss": 0.8214, + "step": 45 + }, + { + "epoch": 0.056051990251827784, + "grad_norm": 1.1555035661921353, + "learning_rate": 1.8699186991869919e-06, + "loss": 0.8106, + "step": 46 + }, + { + "epoch": 0.05727051177904143, + "grad_norm": 1.246916947603362, + "learning_rate": 1.9105691056910574e-06, + "loss": 0.8193, + "step": 47 + }, + { + "epoch": 0.05848903330625508, + "grad_norm": 1.1905103203002494, + "learning_rate": 1.951219512195122e-06, + "loss": 0.8075, + "step": 48 + }, + { + "epoch": 0.05970755483346873, + "grad_norm": 0.918493036902164, + "learning_rate": 1.991869918699187e-06, + "loss": 0.7842, + "step": 49 + }, + { + "epoch": 0.060926076360682375, + "grad_norm": 0.961427277586569, + "learning_rate": 2.0325203252032523e-06, + "loss": 0.8206, + "step": 50 + }, + { + "epoch": 0.062144597887896016, + "grad_norm": 0.7135293393398392, + "learning_rate": 2.073170731707317e-06, + "loss": 0.7784, + "step": 51 + }, + { + "epoch": 0.06336311941510966, + "grad_norm": 0.8687421281930399, + "learning_rate": 2.1138211382113824e-06, + "loss": 0.7953, + "step": 52 + }, + { + "epoch": 0.06458164094232331, + "grad_norm": 0.8575700781368814, + "learning_rate": 2.154471544715447e-06, + "loss": 0.7926, + "step": 53 + }, + { + "epoch": 0.06580016246953696, + "grad_norm": 0.9435599171162209, + "learning_rate": 2.1951219512195125e-06, + "loss": 0.7766, + "step": 54 + }, + { + "epoch": 0.06701868399675061, + "grad_norm": 0.7215369508734659, + "learning_rate": 2.2357723577235773e-06, + "loss": 0.7686, + "step": 55 + }, + { + "epoch": 0.06823720552396426, + "grad_norm": 0.6329822213923535, + "learning_rate": 2.2764227642276426e-06, + "loss": 0.7868, + "step": 56 + }, + { + "epoch": 0.0694557270511779, + "grad_norm": 0.6559439915679887, + "learning_rate": 2.317073170731708e-06, + "loss": 0.7672, + "step": 57 + }, + { + "epoch": 0.07067424857839155, + "grad_norm": 0.6424064072265188, + "learning_rate": 2.3577235772357727e-06, + "loss": 0.7876, + "step": 58 + }, + { + "epoch": 0.0718927701056052, + "grad_norm": 0.5670684781739027, + "learning_rate": 2.3983739837398375e-06, + "loss": 0.7545, + "step": 59 + }, + { + "epoch": 0.07311129163281885, + "grad_norm": 0.62119888744641, + "learning_rate": 2.4390243902439027e-06, + "loss": 0.7778, + "step": 60 + }, + { + "epoch": 0.0743298131600325, + "grad_norm": 0.5945888357559133, + "learning_rate": 2.4796747967479676e-06, + "loss": 0.7593, + "step": 61 + }, + { + "epoch": 0.07554833468724614, + "grad_norm": 0.5566344615882963, + "learning_rate": 2.5203252032520324e-06, + "loss": 0.7783, + "step": 62 + }, + { + "epoch": 0.07676685621445979, + "grad_norm": 0.6010029344681969, + "learning_rate": 2.5609756097560977e-06, + "loss": 0.7824, + "step": 63 + }, + { + "epoch": 0.07798537774167344, + "grad_norm": 0.5620208665027641, + "learning_rate": 2.601626016260163e-06, + "loss": 0.7538, + "step": 64 + }, + { + "epoch": 0.07920389926888709, + "grad_norm": 0.629839488738847, + "learning_rate": 2.6422764227642278e-06, + "loss": 0.7478, + "step": 65 + }, + { + "epoch": 0.08042242079610074, + "grad_norm": 0.5843721125393191, + "learning_rate": 2.682926829268293e-06, + "loss": 0.7551, + "step": 66 + }, + { + "epoch": 0.08164094232331438, + "grad_norm": 0.6807297637633912, + "learning_rate": 2.723577235772358e-06, + "loss": 0.7643, + "step": 67 + }, + { + "epoch": 0.08285946385052803, + "grad_norm": 0.5337293638802343, + "learning_rate": 2.764227642276423e-06, + "loss": 0.7581, + "step": 68 + }, + { + "epoch": 0.08407798537774168, + "grad_norm": 0.5557859370743894, + "learning_rate": 2.8048780487804884e-06, + "loss": 0.7485, + "step": 69 + }, + { + "epoch": 0.08529650690495533, + "grad_norm": 0.5518435051656209, + "learning_rate": 2.845528455284553e-06, + "loss": 0.7503, + "step": 70 + }, + { + "epoch": 0.08651502843216897, + "grad_norm": 0.5585475097227505, + "learning_rate": 2.8861788617886185e-06, + "loss": 0.7456, + "step": 71 + }, + { + "epoch": 0.08773354995938261, + "grad_norm": 0.5842062776799712, + "learning_rate": 2.926829268292683e-06, + "loss": 0.7457, + "step": 72 + }, + { + "epoch": 0.08895207148659626, + "grad_norm": 0.5691413037940362, + "learning_rate": 2.967479674796748e-06, + "loss": 0.7316, + "step": 73 + }, + { + "epoch": 0.0901705930138099, + "grad_norm": 0.592000953180703, + "learning_rate": 3.0081300813008134e-06, + "loss": 0.7379, + "step": 74 + }, + { + "epoch": 0.09138911454102355, + "grad_norm": 0.4986095202064355, + "learning_rate": 3.0487804878048782e-06, + "loss": 0.7026, + "step": 75 + }, + { + "epoch": 0.0926076360682372, + "grad_norm": 0.5610598503101445, + "learning_rate": 3.0894308943089435e-06, + "loss": 0.7366, + "step": 76 + }, + { + "epoch": 0.09382615759545085, + "grad_norm": 0.5189286785140359, + "learning_rate": 3.1300813008130083e-06, + "loss": 0.7343, + "step": 77 + }, + { + "epoch": 0.0950446791226645, + "grad_norm": 0.5352050364602269, + "learning_rate": 3.1707317073170736e-06, + "loss": 0.7128, + "step": 78 + }, + { + "epoch": 0.09626320064987814, + "grad_norm": 0.589544520130887, + "learning_rate": 3.211382113821139e-06, + "loss": 0.7377, + "step": 79 + }, + { + "epoch": 0.09748172217709179, + "grad_norm": 0.5170292516124821, + "learning_rate": 3.2520325203252037e-06, + "loss": 0.751, + "step": 80 + }, + { + "epoch": 0.09870024370430544, + "grad_norm": 0.5178115988752247, + "learning_rate": 3.292682926829269e-06, + "loss": 0.7263, + "step": 81 + }, + { + "epoch": 0.09991876523151909, + "grad_norm": 0.5758324455305359, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.7204, + "step": 82 + }, + { + "epoch": 0.10113728675873274, + "grad_norm": 0.5191922407454059, + "learning_rate": 3.3739837398373986e-06, + "loss": 0.7323, + "step": 83 + }, + { + "epoch": 0.10235580828594638, + "grad_norm": 0.5706404216543195, + "learning_rate": 3.414634146341464e-06, + "loss": 0.7343, + "step": 84 + }, + { + "epoch": 0.10357432981316003, + "grad_norm": 0.5166974408338545, + "learning_rate": 3.4552845528455287e-06, + "loss": 0.7347, + "step": 85 + }, + { + "epoch": 0.10479285134037368, + "grad_norm": 0.575076347441057, + "learning_rate": 3.495934959349594e-06, + "loss": 0.729, + "step": 86 + }, + { + "epoch": 0.10601137286758733, + "grad_norm": 0.5503219216241421, + "learning_rate": 3.5365853658536588e-06, + "loss": 0.7247, + "step": 87 + }, + { + "epoch": 0.10722989439480098, + "grad_norm": 0.5315644262103328, + "learning_rate": 3.577235772357724e-06, + "loss": 0.7188, + "step": 88 + }, + { + "epoch": 0.10844841592201462, + "grad_norm": 0.5283688627559194, + "learning_rate": 3.6178861788617893e-06, + "loss": 0.7328, + "step": 89 + }, + { + "epoch": 0.10966693744922827, + "grad_norm": 0.5657078936164937, + "learning_rate": 3.6585365853658537e-06, + "loss": 0.7317, + "step": 90 + }, + { + "epoch": 0.11088545897644192, + "grad_norm": 0.5285271136308272, + "learning_rate": 3.699186991869919e-06, + "loss": 0.7259, + "step": 91 + }, + { + "epoch": 0.11210398050365557, + "grad_norm": 0.5665339591374581, + "learning_rate": 3.7398373983739838e-06, + "loss": 0.71, + "step": 92 + }, + { + "epoch": 0.11332250203086922, + "grad_norm": 0.5408789367861271, + "learning_rate": 3.780487804878049e-06, + "loss": 0.7287, + "step": 93 + }, + { + "epoch": 0.11454102355808286, + "grad_norm": 0.530024765222071, + "learning_rate": 3.821138211382115e-06, + "loss": 0.7158, + "step": 94 + }, + { + "epoch": 0.11575954508529651, + "grad_norm": 0.525972820727265, + "learning_rate": 3.861788617886179e-06, + "loss": 0.6953, + "step": 95 + }, + { + "epoch": 0.11697806661251016, + "grad_norm": 0.5538892198758983, + "learning_rate": 3.902439024390244e-06, + "loss": 0.7213, + "step": 96 + }, + { + "epoch": 0.11819658813972381, + "grad_norm": 0.544929946076996, + "learning_rate": 3.943089430894309e-06, + "loss": 0.6996, + "step": 97 + }, + { + "epoch": 0.11941510966693746, + "grad_norm": 0.5734775021447369, + "learning_rate": 3.983739837398374e-06, + "loss": 0.7154, + "step": 98 + }, + { + "epoch": 0.1206336311941511, + "grad_norm": 0.5045397032963369, + "learning_rate": 4.024390243902439e-06, + "loss": 0.7024, + "step": 99 + }, + { + "epoch": 0.12185215272136475, + "grad_norm": 0.5236114609222794, + "learning_rate": 4.0650406504065046e-06, + "loss": 0.7226, + "step": 100 + }, + { + "epoch": 0.12307067424857839, + "grad_norm": 0.5641036987903533, + "learning_rate": 4.10569105691057e-06, + "loss": 0.7054, + "step": 101 + }, + { + "epoch": 0.12428919577579203, + "grad_norm": 0.508465869676476, + "learning_rate": 4.146341463414634e-06, + "loss": 0.7003, + "step": 102 + }, + { + "epoch": 0.1255077173030057, + "grad_norm": 0.5202630797257376, + "learning_rate": 4.1869918699186995e-06, + "loss": 0.7204, + "step": 103 + }, + { + "epoch": 0.12672623883021933, + "grad_norm": 0.5552933325377176, + "learning_rate": 4.227642276422765e-06, + "loss": 0.7279, + "step": 104 + }, + { + "epoch": 0.127944760357433, + "grad_norm": 0.5416012915563714, + "learning_rate": 4.268292682926829e-06, + "loss": 0.7093, + "step": 105 + }, + { + "epoch": 0.12916328188464662, + "grad_norm": 0.5104004921064896, + "learning_rate": 4.308943089430894e-06, + "loss": 0.7013, + "step": 106 + }, + { + "epoch": 0.1303818034118603, + "grad_norm": 0.5765782104977603, + "learning_rate": 4.34959349593496e-06, + "loss": 0.7045, + "step": 107 + }, + { + "epoch": 0.13160032493907392, + "grad_norm": 0.5017287673543831, + "learning_rate": 4.390243902439025e-06, + "loss": 0.6997, + "step": 108 + }, + { + "epoch": 0.13281884646628758, + "grad_norm": 0.48808935518722807, + "learning_rate": 4.43089430894309e-06, + "loss": 0.7184, + "step": 109 + }, + { + "epoch": 0.13403736799350122, + "grad_norm": 0.531216252027469, + "learning_rate": 4.471544715447155e-06, + "loss": 0.7069, + "step": 110 + }, + { + "epoch": 0.13525588952071488, + "grad_norm": 0.5149543102282697, + "learning_rate": 4.51219512195122e-06, + "loss": 0.7023, + "step": 111 + }, + { + "epoch": 0.1364744110479285, + "grad_norm": 0.5291257352871406, + "learning_rate": 4.552845528455285e-06, + "loss": 0.7185, + "step": 112 + }, + { + "epoch": 0.13769293257514217, + "grad_norm": 0.47858897961333036, + "learning_rate": 4.59349593495935e-06, + "loss": 0.7149, + "step": 113 + }, + { + "epoch": 0.1389114541023558, + "grad_norm": 0.5359903721383661, + "learning_rate": 4.634146341463416e-06, + "loss": 0.7118, + "step": 114 + }, + { + "epoch": 0.14012997562956944, + "grad_norm": 0.5023325811416011, + "learning_rate": 4.67479674796748e-06, + "loss": 0.6986, + "step": 115 + }, + { + "epoch": 0.1413484971567831, + "grad_norm": 0.507102678949565, + "learning_rate": 4.715447154471545e-06, + "loss": 0.7053, + "step": 116 + }, + { + "epoch": 0.14256701868399674, + "grad_norm": 0.5162377887307996, + "learning_rate": 4.75609756097561e-06, + "loss": 0.6971, + "step": 117 + }, + { + "epoch": 0.1437855402112104, + "grad_norm": 0.5228780554370066, + "learning_rate": 4.796747967479675e-06, + "loss": 0.6915, + "step": 118 + }, + { + "epoch": 0.14500406173842403, + "grad_norm": 0.5539538888660016, + "learning_rate": 4.83739837398374e-06, + "loss": 0.6979, + "step": 119 + }, + { + "epoch": 0.1462225832656377, + "grad_norm": 0.6135340785022244, + "learning_rate": 4.8780487804878055e-06, + "loss": 0.7197, + "step": 120 + }, + { + "epoch": 0.14744110479285133, + "grad_norm": 0.5261935119823417, + "learning_rate": 4.918699186991871e-06, + "loss": 0.6957, + "step": 121 + }, + { + "epoch": 0.148659626320065, + "grad_norm": 0.5941876044718514, + "learning_rate": 4.959349593495935e-06, + "loss": 0.7031, + "step": 122 + }, + { + "epoch": 0.14987814784727863, + "grad_norm": 0.5436866255986976, + "learning_rate": 5e-06, + "loss": 0.7068, + "step": 123 + }, + { + "epoch": 0.1510966693744923, + "grad_norm": 0.5295736343510782, + "learning_rate": 5.040650406504065e-06, + "loss": 0.686, + "step": 124 + }, + { + "epoch": 0.15231519090170592, + "grad_norm": 0.5536691790810129, + "learning_rate": 5.081300813008131e-06, + "loss": 0.681, + "step": 125 + }, + { + "epoch": 0.15353371242891958, + "grad_norm": 0.6057295035493935, + "learning_rate": 5.121951219512195e-06, + "loss": 0.7195, + "step": 126 + }, + { + "epoch": 0.15475223395613322, + "grad_norm": 0.49006287650569935, + "learning_rate": 5.162601626016261e-06, + "loss": 0.6846, + "step": 127 + }, + { + "epoch": 0.15597075548334688, + "grad_norm": 0.5312531193234717, + "learning_rate": 5.203252032520326e-06, + "loss": 0.7038, + "step": 128 + }, + { + "epoch": 0.1571892770105605, + "grad_norm": 0.5581018411430491, + "learning_rate": 5.243902439024391e-06, + "loss": 0.7184, + "step": 129 + }, + { + "epoch": 0.15840779853777417, + "grad_norm": 0.5133234429893759, + "learning_rate": 5.2845528455284555e-06, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.1596263200649878, + "grad_norm": 0.5191238918744202, + "learning_rate": 5.32520325203252e-06, + "loss": 0.6921, + "step": 131 + }, + { + "epoch": 0.16084484159220147, + "grad_norm": 0.6134860482308477, + "learning_rate": 5.365853658536586e-06, + "loss": 0.6764, + "step": 132 + }, + { + "epoch": 0.1620633631194151, + "grad_norm": 0.6271296104201189, + "learning_rate": 5.4065040650406504e-06, + "loss": 0.6829, + "step": 133 + }, + { + "epoch": 0.16328188464662877, + "grad_norm": 0.6385973016730124, + "learning_rate": 5.447154471544716e-06, + "loss": 0.6957, + "step": 134 + }, + { + "epoch": 0.1645004061738424, + "grad_norm": 0.5686127235373731, + "learning_rate": 5.487804878048781e-06, + "loss": 0.698, + "step": 135 + }, + { + "epoch": 0.16571892770105606, + "grad_norm": 0.6833156687934043, + "learning_rate": 5.528455284552846e-06, + "loss": 0.6851, + "step": 136 + }, + { + "epoch": 0.1669374492282697, + "grad_norm": 0.700308205780514, + "learning_rate": 5.569105691056911e-06, + "loss": 0.6806, + "step": 137 + }, + { + "epoch": 0.16815597075548336, + "grad_norm": 0.5462617932277235, + "learning_rate": 5.609756097560977e-06, + "loss": 0.7091, + "step": 138 + }, + { + "epoch": 0.169374492282697, + "grad_norm": 0.7852807393490868, + "learning_rate": 5.650406504065041e-06, + "loss": 0.7123, + "step": 139 + }, + { + "epoch": 0.17059301380991065, + "grad_norm": 0.6057177569541422, + "learning_rate": 5.691056910569106e-06, + "loss": 0.7078, + "step": 140 + }, + { + "epoch": 0.1718115353371243, + "grad_norm": 0.6623630085315475, + "learning_rate": 5.731707317073171e-06, + "loss": 0.6685, + "step": 141 + }, + { + "epoch": 0.17303005686433795, + "grad_norm": 0.6456913396682378, + "learning_rate": 5.772357723577237e-06, + "loss": 0.7076, + "step": 142 + }, + { + "epoch": 0.17424857839155158, + "grad_norm": 0.5490338737139909, + "learning_rate": 5.813008130081301e-06, + "loss": 0.6679, + "step": 143 + }, + { + "epoch": 0.17546709991876522, + "grad_norm": 0.6684035267057585, + "learning_rate": 5.853658536585366e-06, + "loss": 0.6862, + "step": 144 + }, + { + "epoch": 0.17668562144597888, + "grad_norm": 0.5217072648450134, + "learning_rate": 5.894308943089432e-06, + "loss": 0.6846, + "step": 145 + }, + { + "epoch": 0.17790414297319251, + "grad_norm": 0.5999701195487821, + "learning_rate": 5.934959349593496e-06, + "loss": 0.6971, + "step": 146 + }, + { + "epoch": 0.17912266450040618, + "grad_norm": 0.6672016415742844, + "learning_rate": 5.9756097560975615e-06, + "loss": 0.7023, + "step": 147 + }, + { + "epoch": 0.1803411860276198, + "grad_norm": 0.5638253505945849, + "learning_rate": 6.016260162601627e-06, + "loss": 0.6718, + "step": 148 + }, + { + "epoch": 0.18155970755483347, + "grad_norm": 0.5443164616899534, + "learning_rate": 6.056910569105692e-06, + "loss": 0.678, + "step": 149 + }, + { + "epoch": 0.1827782290820471, + "grad_norm": 0.5515636708718161, + "learning_rate": 6.0975609756097564e-06, + "loss": 0.6914, + "step": 150 + }, + { + "epoch": 0.18399675060926077, + "grad_norm": 0.6385969983161707, + "learning_rate": 6.138211382113821e-06, + "loss": 0.6796, + "step": 151 + }, + { + "epoch": 0.1852152721364744, + "grad_norm": 0.6113406592810082, + "learning_rate": 6.178861788617887e-06, + "loss": 0.682, + "step": 152 + }, + { + "epoch": 0.18643379366368806, + "grad_norm": 0.6906350808865743, + "learning_rate": 6.219512195121951e-06, + "loss": 0.6671, + "step": 153 + }, + { + "epoch": 0.1876523151909017, + "grad_norm": 0.7020113339328089, + "learning_rate": 6.260162601626017e-06, + "loss": 0.6835, + "step": 154 + }, + { + "epoch": 0.18887083671811536, + "grad_norm": 0.5548828056807938, + "learning_rate": 6.300813008130082e-06, + "loss": 0.6809, + "step": 155 + }, + { + "epoch": 0.190089358245329, + "grad_norm": 0.8352572415357199, + "learning_rate": 6.341463414634147e-06, + "loss": 0.6809, + "step": 156 + }, + { + "epoch": 0.19130787977254265, + "grad_norm": 0.6517742914384106, + "learning_rate": 6.3821138211382115e-06, + "loss": 0.6791, + "step": 157 + }, + { + "epoch": 0.1925264012997563, + "grad_norm": 0.6204344146843959, + "learning_rate": 6.422764227642278e-06, + "loss": 0.6766, + "step": 158 + }, + { + "epoch": 0.19374492282696995, + "grad_norm": 0.8219899409754744, + "learning_rate": 6.463414634146342e-06, + "loss": 0.6726, + "step": 159 + }, + { + "epoch": 0.19496344435418358, + "grad_norm": 0.6541183549209502, + "learning_rate": 6.504065040650407e-06, + "loss": 0.6781, + "step": 160 + }, + { + "epoch": 0.19618196588139725, + "grad_norm": 0.566310879262149, + "learning_rate": 6.544715447154472e-06, + "loss": 0.6702, + "step": 161 + }, + { + "epoch": 0.19740048740861088, + "grad_norm": 0.775755089339994, + "learning_rate": 6.585365853658538e-06, + "loss": 0.6905, + "step": 162 + }, + { + "epoch": 0.19861900893582454, + "grad_norm": 0.6288678821845954, + "learning_rate": 6.626016260162602e-06, + "loss": 0.6881, + "step": 163 + }, + { + "epoch": 0.19983753046303818, + "grad_norm": 0.7640676261377178, + "learning_rate": 6.666666666666667e-06, + "loss": 0.6737, + "step": 164 + }, + { + "epoch": 0.20105605199025184, + "grad_norm": 0.5731637259066372, + "learning_rate": 6.707317073170733e-06, + "loss": 0.674, + "step": 165 + }, + { + "epoch": 0.20227457351746547, + "grad_norm": 0.7761516718399211, + "learning_rate": 6.747967479674797e-06, + "loss": 0.6928, + "step": 166 + }, + { + "epoch": 0.20349309504467913, + "grad_norm": 0.7100095841961804, + "learning_rate": 6.788617886178862e-06, + "loss": 0.6727, + "step": 167 + }, + { + "epoch": 0.20471161657189277, + "grad_norm": 0.569930635478734, + "learning_rate": 6.829268292682928e-06, + "loss": 0.689, + "step": 168 + }, + { + "epoch": 0.20593013809910643, + "grad_norm": 0.7691268212355195, + "learning_rate": 6.869918699186993e-06, + "loss": 0.6973, + "step": 169 + }, + { + "epoch": 0.20714865962632006, + "grad_norm": 0.560097362553805, + "learning_rate": 6.910569105691057e-06, + "loss": 0.6681, + "step": 170 + }, + { + "epoch": 0.20836718115353373, + "grad_norm": 0.6849837037178143, + "learning_rate": 6.951219512195122e-06, + "loss": 0.6592, + "step": 171 + }, + { + "epoch": 0.20958570268074736, + "grad_norm": 0.7951681541297303, + "learning_rate": 6.991869918699188e-06, + "loss": 0.6812, + "step": 172 + }, + { + "epoch": 0.210804224207961, + "grad_norm": 0.5428585266109707, + "learning_rate": 7.032520325203252e-06, + "loss": 0.696, + "step": 173 + }, + { + "epoch": 0.21202274573517466, + "grad_norm": 0.7462142080092842, + "learning_rate": 7.0731707317073175e-06, + "loss": 0.6793, + "step": 174 + }, + { + "epoch": 0.2132412672623883, + "grad_norm": 0.6370138105851062, + "learning_rate": 7.113821138211383e-06, + "loss": 0.6717, + "step": 175 + }, + { + "epoch": 0.21445978878960195, + "grad_norm": 0.566025113423941, + "learning_rate": 7.154471544715448e-06, + "loss": 0.6694, + "step": 176 + }, + { + "epoch": 0.21567831031681559, + "grad_norm": 0.6632467338949928, + "learning_rate": 7.1951219512195125e-06, + "loss": 0.679, + "step": 177 + }, + { + "epoch": 0.21689683184402925, + "grad_norm": 0.5775437329049822, + "learning_rate": 7.2357723577235786e-06, + "loss": 0.6738, + "step": 178 + }, + { + "epoch": 0.21811535337124288, + "grad_norm": 0.6763254821774859, + "learning_rate": 7.276422764227643e-06, + "loss": 0.6885, + "step": 179 + }, + { + "epoch": 0.21933387489845654, + "grad_norm": 0.6525555364778458, + "learning_rate": 7.317073170731707e-06, + "loss": 0.6829, + "step": 180 + }, + { + "epoch": 0.22055239642567018, + "grad_norm": 0.6376488223620492, + "learning_rate": 7.357723577235773e-06, + "loss": 0.6611, + "step": 181 + }, + { + "epoch": 0.22177091795288384, + "grad_norm": 0.6135443136132807, + "learning_rate": 7.398373983739838e-06, + "loss": 0.6875, + "step": 182 + }, + { + "epoch": 0.22298943948009747, + "grad_norm": 0.6616707267536054, + "learning_rate": 7.439024390243903e-06, + "loss": 0.6637, + "step": 183 + }, + { + "epoch": 0.22420796100731114, + "grad_norm": 0.6601543949811752, + "learning_rate": 7.4796747967479676e-06, + "loss": 0.6714, + "step": 184 + }, + { + "epoch": 0.22542648253452477, + "grad_norm": 0.689531862633905, + "learning_rate": 7.520325203252034e-06, + "loss": 0.6717, + "step": 185 + }, + { + "epoch": 0.22664500406173843, + "grad_norm": 0.6693067594219624, + "learning_rate": 7.560975609756098e-06, + "loss": 0.6538, + "step": 186 + }, + { + "epoch": 0.22786352558895206, + "grad_norm": 0.6877489613732909, + "learning_rate": 7.601626016260163e-06, + "loss": 0.6791, + "step": 187 + }, + { + "epoch": 0.22908204711616573, + "grad_norm": 0.6102935004924294, + "learning_rate": 7.64227642276423e-06, + "loss": 0.6697, + "step": 188 + }, + { + "epoch": 0.23030056864337936, + "grad_norm": 0.7109056322063843, + "learning_rate": 7.682926829268293e-06, + "loss": 0.679, + "step": 189 + }, + { + "epoch": 0.23151909017059302, + "grad_norm": 0.6183616410187914, + "learning_rate": 7.723577235772358e-06, + "loss": 0.662, + "step": 190 + }, + { + "epoch": 0.23273761169780666, + "grad_norm": 0.6117992409555106, + "learning_rate": 7.764227642276424e-06, + "loss": 0.6671, + "step": 191 + }, + { + "epoch": 0.23395613322502032, + "grad_norm": 0.7288006220266883, + "learning_rate": 7.804878048780489e-06, + "loss": 0.7049, + "step": 192 + }, + { + "epoch": 0.23517465475223395, + "grad_norm": 0.6795369812377434, + "learning_rate": 7.845528455284554e-06, + "loss": 0.6638, + "step": 193 + }, + { + "epoch": 0.23639317627944761, + "grad_norm": 0.6336366896960686, + "learning_rate": 7.886178861788618e-06, + "loss": 0.6738, + "step": 194 + }, + { + "epoch": 0.23761169780666125, + "grad_norm": 0.7184491761674651, + "learning_rate": 7.926829268292685e-06, + "loss": 0.6628, + "step": 195 + }, + { + "epoch": 0.2388302193338749, + "grad_norm": 0.659177288266525, + "learning_rate": 7.967479674796748e-06, + "loss": 0.6805, + "step": 196 + }, + { + "epoch": 0.24004874086108854, + "grad_norm": 0.578465157473097, + "learning_rate": 8.008130081300813e-06, + "loss": 0.6894, + "step": 197 + }, + { + "epoch": 0.2412672623883022, + "grad_norm": 0.6868825593189901, + "learning_rate": 8.048780487804879e-06, + "loss": 0.6847, + "step": 198 + }, + { + "epoch": 0.24248578391551584, + "grad_norm": 0.6185564483778565, + "learning_rate": 8.089430894308944e-06, + "loss": 0.6667, + "step": 199 + }, + { + "epoch": 0.2437043054427295, + "grad_norm": 0.7195682220690447, + "learning_rate": 8.130081300813009e-06, + "loss": 0.6681, + "step": 200 + }, + { + "epoch": 0.24492282696994314, + "grad_norm": 0.5988887592571761, + "learning_rate": 8.170731707317073e-06, + "loss": 0.663, + "step": 201 + }, + { + "epoch": 0.24614134849715677, + "grad_norm": 0.6728160874756142, + "learning_rate": 8.21138211382114e-06, + "loss": 0.6688, + "step": 202 + }, + { + "epoch": 0.24735987002437043, + "grad_norm": 0.6167676512934452, + "learning_rate": 8.252032520325203e-06, + "loss": 0.6597, + "step": 203 + }, + { + "epoch": 0.24857839155158407, + "grad_norm": 0.6963420894322225, + "learning_rate": 8.292682926829268e-06, + "loss": 0.6739, + "step": 204 + }, + { + "epoch": 0.24979691307879773, + "grad_norm": 0.6374482813383724, + "learning_rate": 8.333333333333334e-06, + "loss": 0.6782, + "step": 205 + }, + { + "epoch": 0.2510154346060114, + "grad_norm": 0.7061607202293944, + "learning_rate": 8.373983739837399e-06, + "loss": 0.6784, + "step": 206 + }, + { + "epoch": 0.252233956133225, + "grad_norm": 0.6588145062673179, + "learning_rate": 8.414634146341464e-06, + "loss": 0.6785, + "step": 207 + }, + { + "epoch": 0.25345247766043866, + "grad_norm": 0.5575233234513415, + "learning_rate": 8.45528455284553e-06, + "loss": 0.659, + "step": 208 + }, + { + "epoch": 0.2546709991876523, + "grad_norm": 0.660881658687861, + "learning_rate": 8.495934959349595e-06, + "loss": 0.672, + "step": 209 + }, + { + "epoch": 0.255889520714866, + "grad_norm": 0.8528802704630485, + "learning_rate": 8.536585365853658e-06, + "loss": 0.6758, + "step": 210 + }, + { + "epoch": 0.2571080422420796, + "grad_norm": 0.5423728405429434, + "learning_rate": 8.577235772357724e-06, + "loss": 0.6526, + "step": 211 + }, + { + "epoch": 0.25832656376929325, + "grad_norm": 0.7828577081027186, + "learning_rate": 8.617886178861789e-06, + "loss": 0.6619, + "step": 212 + }, + { + "epoch": 0.2595450852965069, + "grad_norm": 0.683409796151077, + "learning_rate": 8.658536585365854e-06, + "loss": 0.6609, + "step": 213 + }, + { + "epoch": 0.2607636068237206, + "grad_norm": 0.7965663856791516, + "learning_rate": 8.69918699186992e-06, + "loss": 0.6771, + "step": 214 + }, + { + "epoch": 0.2619821283509342, + "grad_norm": 0.5296355193224814, + "learning_rate": 8.739837398373985e-06, + "loss": 0.6631, + "step": 215 + }, + { + "epoch": 0.26320064987814784, + "grad_norm": 0.6996343517682662, + "learning_rate": 8.78048780487805e-06, + "loss": 0.6778, + "step": 216 + }, + { + "epoch": 0.2644191714053615, + "grad_norm": 0.5685534992796657, + "learning_rate": 8.821138211382113e-06, + "loss": 0.6542, + "step": 217 + }, + { + "epoch": 0.26563769293257516, + "grad_norm": 0.5812188798996168, + "learning_rate": 8.86178861788618e-06, + "loss": 0.6665, + "step": 218 + }, + { + "epoch": 0.2668562144597888, + "grad_norm": 0.5872098802829594, + "learning_rate": 8.902439024390244e-06, + "loss": 0.6793, + "step": 219 + }, + { + "epoch": 0.26807473598700243, + "grad_norm": 0.6138295440628797, + "learning_rate": 8.94308943089431e-06, + "loss": 0.6845, + "step": 220 + }, + { + "epoch": 0.26929325751421607, + "grad_norm": 0.5555943793492026, + "learning_rate": 8.983739837398374e-06, + "loss": 0.6456, + "step": 221 + }, + { + "epoch": 0.27051177904142976, + "grad_norm": 0.5716563368438368, + "learning_rate": 9.02439024390244e-06, + "loss": 0.6881, + "step": 222 + }, + { + "epoch": 0.2717303005686434, + "grad_norm": 0.5657678117161605, + "learning_rate": 9.065040650406505e-06, + "loss": 0.6721, + "step": 223 + }, + { + "epoch": 0.272948822095857, + "grad_norm": 0.6494717196083141, + "learning_rate": 9.10569105691057e-06, + "loss": 0.6856, + "step": 224 + }, + { + "epoch": 0.27416734362307066, + "grad_norm": 0.5769888402008441, + "learning_rate": 9.146341463414635e-06, + "loss": 0.65, + "step": 225 + }, + { + "epoch": 0.27538586515028435, + "grad_norm": 0.6372976291357912, + "learning_rate": 9.1869918699187e-06, + "loss": 0.679, + "step": 226 + }, + { + "epoch": 0.276604386677498, + "grad_norm": 0.61020413240267, + "learning_rate": 9.227642276422764e-06, + "loss": 0.6462, + "step": 227 + }, + { + "epoch": 0.2778229082047116, + "grad_norm": 0.7347279477946881, + "learning_rate": 9.268292682926831e-06, + "loss": 0.6502, + "step": 228 + }, + { + "epoch": 0.27904142973192525, + "grad_norm": 0.6513241840116742, + "learning_rate": 9.308943089430895e-06, + "loss": 0.6707, + "step": 229 + }, + { + "epoch": 0.2802599512591389, + "grad_norm": 0.6578226137183896, + "learning_rate": 9.34959349593496e-06, + "loss": 0.6559, + "step": 230 + }, + { + "epoch": 0.2814784727863526, + "grad_norm": 0.6443126179924461, + "learning_rate": 9.390243902439025e-06, + "loss": 0.6815, + "step": 231 + }, + { + "epoch": 0.2826969943135662, + "grad_norm": 0.5681908489104979, + "learning_rate": 9.43089430894309e-06, + "loss": 0.6483, + "step": 232 + }, + { + "epoch": 0.28391551584077984, + "grad_norm": 0.638868530973396, + "learning_rate": 9.471544715447156e-06, + "loss": 0.6663, + "step": 233 + }, + { + "epoch": 0.2851340373679935, + "grad_norm": 0.5345735736702238, + "learning_rate": 9.51219512195122e-06, + "loss": 0.6507, + "step": 234 + }, + { + "epoch": 0.28635255889520717, + "grad_norm": 0.6170557049684545, + "learning_rate": 9.552845528455286e-06, + "loss": 0.6533, + "step": 235 + }, + { + "epoch": 0.2875710804224208, + "grad_norm": 0.6282001318911594, + "learning_rate": 9.59349593495935e-06, + "loss": 0.6715, + "step": 236 + }, + { + "epoch": 0.28878960194963443, + "grad_norm": 0.548783110101442, + "learning_rate": 9.634146341463415e-06, + "loss": 0.6536, + "step": 237 + }, + { + "epoch": 0.29000812347684807, + "grad_norm": 0.6300302160047813, + "learning_rate": 9.67479674796748e-06, + "loss": 0.657, + "step": 238 + }, + { + "epoch": 0.29122664500406176, + "grad_norm": 0.5955216072274768, + "learning_rate": 9.715447154471546e-06, + "loss": 0.6767, + "step": 239 + }, + { + "epoch": 0.2924451665312754, + "grad_norm": 0.6216921714562351, + "learning_rate": 9.756097560975611e-06, + "loss": 0.6492, + "step": 240 + }, + { + "epoch": 0.293663688058489, + "grad_norm": 0.6909539613975563, + "learning_rate": 9.796747967479675e-06, + "loss": 0.6618, + "step": 241 + }, + { + "epoch": 0.29488220958570266, + "grad_norm": 0.8137292747107515, + "learning_rate": 9.837398373983741e-06, + "loss": 0.6614, + "step": 242 + }, + { + "epoch": 0.29610073111291635, + "grad_norm": 0.5855911517789665, + "learning_rate": 9.878048780487805e-06, + "loss": 0.6561, + "step": 243 + }, + { + "epoch": 0.29731925264013, + "grad_norm": 0.8851136874577217, + "learning_rate": 9.91869918699187e-06, + "loss": 0.6498, + "step": 244 + }, + { + "epoch": 0.2985377741673436, + "grad_norm": 0.57227502230073, + "learning_rate": 9.959349593495936e-06, + "loss": 0.6606, + "step": 245 + }, + { + "epoch": 0.29975629569455725, + "grad_norm": 0.9576157821693805, + "learning_rate": 1e-05, + "loss": 0.648, + "step": 246 + }, + { + "epoch": 0.30097481722177094, + "grad_norm": 0.574426873878406, + "learning_rate": 9.999994966333388e-06, + "loss": 0.6543, + "step": 247 + }, + { + "epoch": 0.3021933387489846, + "grad_norm": 0.7230465083023617, + "learning_rate": 9.99997986534369e-06, + "loss": 0.6654, + "step": 248 + }, + { + "epoch": 0.3034118602761982, + "grad_norm": 0.5421626680587527, + "learning_rate": 9.999954697061305e-06, + "loss": 0.6343, + "step": 249 + }, + { + "epoch": 0.30463038180341184, + "grad_norm": 0.6129301937842085, + "learning_rate": 9.999919461536915e-06, + "loss": 0.6449, + "step": 250 + }, + { + "epoch": 0.30584890333062553, + "grad_norm": 0.563497786259594, + "learning_rate": 9.999874158841462e-06, + "loss": 0.66, + "step": 251 + }, + { + "epoch": 0.30706742485783917, + "grad_norm": 0.6709530297921161, + "learning_rate": 9.999818789066164e-06, + "loss": 0.6575, + "step": 252 + }, + { + "epoch": 0.3082859463850528, + "grad_norm": 0.6033112191541231, + "learning_rate": 9.999753352322502e-06, + "loss": 0.6745, + "step": 253 + }, + { + "epoch": 0.30950446791226643, + "grad_norm": 0.7085418197042371, + "learning_rate": 9.999677848742238e-06, + "loss": 0.645, + "step": 254 + }, + { + "epoch": 0.3107229894394801, + "grad_norm": 0.6149439429340515, + "learning_rate": 9.999592278477389e-06, + "loss": 0.6553, + "step": 255 + }, + { + "epoch": 0.31194151096669376, + "grad_norm": 0.5361824485289747, + "learning_rate": 9.999496641700249e-06, + "loss": 0.6394, + "step": 256 + }, + { + "epoch": 0.3131600324939074, + "grad_norm": 0.7876266919973667, + "learning_rate": 9.99939093860338e-06, + "loss": 0.651, + "step": 257 + }, + { + "epoch": 0.314378554021121, + "grad_norm": 0.5240336550865616, + "learning_rate": 9.999275169399614e-06, + "loss": 0.6445, + "step": 258 + }, + { + "epoch": 0.31559707554833466, + "grad_norm": 0.9003012478867778, + "learning_rate": 9.999149334322047e-06, + "loss": 0.6759, + "step": 259 + }, + { + "epoch": 0.31681559707554835, + "grad_norm": 0.520552428762164, + "learning_rate": 9.999013433624042e-06, + "loss": 0.6656, + "step": 260 + }, + { + "epoch": 0.318034118602762, + "grad_norm": 0.8451285058918907, + "learning_rate": 9.998867467579234e-06, + "loss": 0.6393, + "step": 261 + }, + { + "epoch": 0.3192526401299756, + "grad_norm": 0.6368634173244008, + "learning_rate": 9.998711436481519e-06, + "loss": 0.6544, + "step": 262 + }, + { + "epoch": 0.32047116165718925, + "grad_norm": 0.690099709138949, + "learning_rate": 9.998545340645058e-06, + "loss": 0.6609, + "step": 263 + }, + { + "epoch": 0.32168968318440294, + "grad_norm": 0.7144861500132949, + "learning_rate": 9.998369180404283e-06, + "loss": 0.6647, + "step": 264 + }, + { + "epoch": 0.3229082047116166, + "grad_norm": 0.6362319514002672, + "learning_rate": 9.998182956113885e-06, + "loss": 0.6533, + "step": 265 + }, + { + "epoch": 0.3241267262388302, + "grad_norm": 0.6488964510495924, + "learning_rate": 9.99798666814882e-06, + "loss": 0.6504, + "step": 266 + }, + { + "epoch": 0.32534524776604384, + "grad_norm": 0.6063198470537309, + "learning_rate": 9.99778031690431e-06, + "loss": 0.6563, + "step": 267 + }, + { + "epoch": 0.32656376929325753, + "grad_norm": 0.5938533025522102, + "learning_rate": 9.997563902795834e-06, + "loss": 0.6675, + "step": 268 + }, + { + "epoch": 0.32778229082047117, + "grad_norm": 0.7515871090930308, + "learning_rate": 9.997337426259134e-06, + "loss": 0.6792, + "step": 269 + }, + { + "epoch": 0.3290008123476848, + "grad_norm": 0.703279934707329, + "learning_rate": 9.997100887750215e-06, + "loss": 0.6635, + "step": 270 + }, + { + "epoch": 0.33021933387489844, + "grad_norm": 0.695544945955001, + "learning_rate": 9.996854287745337e-06, + "loss": 0.645, + "step": 271 + }, + { + "epoch": 0.3314378554021121, + "grad_norm": 0.7462833994362996, + "learning_rate": 9.996597626741023e-06, + "loss": 0.6478, + "step": 272 + }, + { + "epoch": 0.33265637692932576, + "grad_norm": 0.6876699055946316, + "learning_rate": 9.99633090525405e-06, + "loss": 0.6495, + "step": 273 + }, + { + "epoch": 0.3338748984565394, + "grad_norm": 0.6161949269900944, + "learning_rate": 9.996054123821455e-06, + "loss": 0.6477, + "step": 274 + }, + { + "epoch": 0.335093419983753, + "grad_norm": 0.6992818714334844, + "learning_rate": 9.995767283000526e-06, + "loss": 0.6471, + "step": 275 + }, + { + "epoch": 0.3363119415109667, + "grad_norm": 0.6649545633189144, + "learning_rate": 9.995470383368808e-06, + "loss": 0.6526, + "step": 276 + }, + { + "epoch": 0.33753046303818035, + "grad_norm": 0.7069772548058584, + "learning_rate": 9.995163425524097e-06, + "loss": 0.6622, + "step": 277 + }, + { + "epoch": 0.338748984565394, + "grad_norm": 0.7343365884623839, + "learning_rate": 9.994846410084447e-06, + "loss": 0.6401, + "step": 278 + }, + { + "epoch": 0.3399675060926076, + "grad_norm": 0.7666383023534878, + "learning_rate": 9.994519337688152e-06, + "loss": 0.6351, + "step": 279 + }, + { + "epoch": 0.3411860276198213, + "grad_norm": 0.7101687784996984, + "learning_rate": 9.994182208993766e-06, + "loss": 0.6686, + "step": 280 + }, + { + "epoch": 0.34240454914703494, + "grad_norm": 0.794098416336116, + "learning_rate": 9.993835024680084e-06, + "loss": 0.6534, + "step": 281 + }, + { + "epoch": 0.3436230706742486, + "grad_norm": 0.6476191969862704, + "learning_rate": 9.993477785446151e-06, + "loss": 0.6321, + "step": 282 + }, + { + "epoch": 0.3448415922014622, + "grad_norm": 0.7027462161925977, + "learning_rate": 9.993110492011256e-06, + "loss": 0.6677, + "step": 283 + }, + { + "epoch": 0.3460601137286759, + "grad_norm": 0.7368948502336647, + "learning_rate": 9.992733145114932e-06, + "loss": 0.6332, + "step": 284 + }, + { + "epoch": 0.34727863525588953, + "grad_norm": 0.769793462172428, + "learning_rate": 9.992345745516954e-06, + "loss": 0.6627, + "step": 285 + }, + { + "epoch": 0.34849715678310317, + "grad_norm": 0.6391657112532801, + "learning_rate": 9.99194829399734e-06, + "loss": 0.6364, + "step": 286 + }, + { + "epoch": 0.3497156783103168, + "grad_norm": 0.8671328129231476, + "learning_rate": 9.991540791356342e-06, + "loss": 0.6558, + "step": 287 + }, + { + "epoch": 0.35093419983753044, + "grad_norm": 0.6143371180878986, + "learning_rate": 9.991123238414455e-06, + "loss": 0.6725, + "step": 288 + }, + { + "epoch": 0.3521527213647441, + "grad_norm": 0.7114612477683598, + "learning_rate": 9.99069563601241e-06, + "loss": 0.6386, + "step": 289 + }, + { + "epoch": 0.35337124289195776, + "grad_norm": 0.5910112375855043, + "learning_rate": 9.990257985011168e-06, + "loss": 0.6648, + "step": 290 + }, + { + "epoch": 0.3545897644191714, + "grad_norm": 0.6709399619542642, + "learning_rate": 9.989810286291923e-06, + "loss": 0.6641, + "step": 291 + }, + { + "epoch": 0.35580828594638503, + "grad_norm": 0.5876086675256037, + "learning_rate": 9.989352540756103e-06, + "loss": 0.6519, + "step": 292 + }, + { + "epoch": 0.3570268074735987, + "grad_norm": 0.4993245470857056, + "learning_rate": 9.988884749325366e-06, + "loss": 0.6409, + "step": 293 + }, + { + "epoch": 0.35824532900081235, + "grad_norm": 0.6361394412220084, + "learning_rate": 9.988406912941591e-06, + "loss": 0.6543, + "step": 294 + }, + { + "epoch": 0.359463850528026, + "grad_norm": 0.5972665446098098, + "learning_rate": 9.987919032566885e-06, + "loss": 0.6379, + "step": 295 + }, + { + "epoch": 0.3606823720552396, + "grad_norm": 0.5332779981117456, + "learning_rate": 9.987421109183581e-06, + "loss": 0.6362, + "step": 296 + }, + { + "epoch": 0.3619008935824533, + "grad_norm": 0.6057994457076236, + "learning_rate": 9.986913143794232e-06, + "loss": 0.6455, + "step": 297 + }, + { + "epoch": 0.36311941510966694, + "grad_norm": 0.6075132715056499, + "learning_rate": 9.986395137421607e-06, + "loss": 0.6624, + "step": 298 + }, + { + "epoch": 0.3643379366368806, + "grad_norm": 0.5258247408109219, + "learning_rate": 9.985867091108697e-06, + "loss": 0.638, + "step": 299 + }, + { + "epoch": 0.3655564581640942, + "grad_norm": 0.5267906230313797, + "learning_rate": 9.985329005918702e-06, + "loss": 0.6362, + "step": 300 + }, + { + "epoch": 0.3667749796913079, + "grad_norm": 0.5638416352250496, + "learning_rate": 9.984780882935043e-06, + "loss": 0.6301, + "step": 301 + }, + { + "epoch": 0.36799350121852153, + "grad_norm": 0.545011579464239, + "learning_rate": 9.984222723261344e-06, + "loss": 0.6599, + "step": 302 + }, + { + "epoch": 0.36921202274573517, + "grad_norm": 0.5606014722546357, + "learning_rate": 9.983654528021442e-06, + "loss": 0.6542, + "step": 303 + }, + { + "epoch": 0.3704305442729488, + "grad_norm": 0.6018343388636366, + "learning_rate": 9.98307629835938e-06, + "loss": 0.6368, + "step": 304 + }, + { + "epoch": 0.3716490658001625, + "grad_norm": 0.6118602452372705, + "learning_rate": 9.982488035439401e-06, + "loss": 0.6513, + "step": 305 + }, + { + "epoch": 0.3728675873273761, + "grad_norm": 0.6022653337990805, + "learning_rate": 9.981889740445958e-06, + "loss": 0.6496, + "step": 306 + }, + { + "epoch": 0.37408610885458976, + "grad_norm": 0.569004250440184, + "learning_rate": 9.981281414583693e-06, + "loss": 0.6598, + "step": 307 + }, + { + "epoch": 0.3753046303818034, + "grad_norm": 0.5713014740165444, + "learning_rate": 9.980663059077453e-06, + "loss": 0.6613, + "step": 308 + }, + { + "epoch": 0.3765231519090171, + "grad_norm": 0.6154580840564017, + "learning_rate": 9.980034675172274e-06, + "loss": 0.6442, + "step": 309 + }, + { + "epoch": 0.3777416734362307, + "grad_norm": 0.5917553562402863, + "learning_rate": 9.979396264133388e-06, + "loss": 0.6431, + "step": 310 + }, + { + "epoch": 0.37896019496344435, + "grad_norm": 0.578864320620872, + "learning_rate": 9.978747827246214e-06, + "loss": 0.6589, + "step": 311 + }, + { + "epoch": 0.380178716490658, + "grad_norm": 0.6460070122884725, + "learning_rate": 9.978089365816357e-06, + "loss": 0.6267, + "step": 312 + }, + { + "epoch": 0.3813972380178717, + "grad_norm": 0.6165901634865715, + "learning_rate": 9.977420881169607e-06, + "loss": 0.6357, + "step": 313 + }, + { + "epoch": 0.3826157595450853, + "grad_norm": 0.6862027434641219, + "learning_rate": 9.976742374651936e-06, + "loss": 0.6607, + "step": 314 + }, + { + "epoch": 0.38383428107229894, + "grad_norm": 0.6447789605505084, + "learning_rate": 9.976053847629496e-06, + "loss": 0.6464, + "step": 315 + }, + { + "epoch": 0.3850528025995126, + "grad_norm": 0.597882927094437, + "learning_rate": 9.97535530148861e-06, + "loss": 0.6337, + "step": 316 + }, + { + "epoch": 0.3862713241267262, + "grad_norm": 0.6296819593414332, + "learning_rate": 9.974646737635781e-06, + "loss": 0.6474, + "step": 317 + }, + { + "epoch": 0.3874898456539399, + "grad_norm": 0.6313838311506389, + "learning_rate": 9.973928157497675e-06, + "loss": 0.6289, + "step": 318 + }, + { + "epoch": 0.38870836718115354, + "grad_norm": 0.6255452790127047, + "learning_rate": 9.97319956252113e-06, + "loss": 0.6418, + "step": 319 + }, + { + "epoch": 0.38992688870836717, + "grad_norm": 0.501125482187719, + "learning_rate": 9.972460954173149e-06, + "loss": 0.6469, + "step": 320 + }, + { + "epoch": 0.3911454102355808, + "grad_norm": 0.5644277137540713, + "learning_rate": 9.971712333940896e-06, + "loss": 0.6431, + "step": 321 + }, + { + "epoch": 0.3923639317627945, + "grad_norm": 0.5401625089221826, + "learning_rate": 9.970953703331692e-06, + "loss": 0.6399, + "step": 322 + }, + { + "epoch": 0.3935824532900081, + "grad_norm": 0.6126970579614653, + "learning_rate": 9.970185063873012e-06, + "loss": 0.6312, + "step": 323 + }, + { + "epoch": 0.39480097481722176, + "grad_norm": 0.6237167355625934, + "learning_rate": 9.969406417112489e-06, + "loss": 0.6492, + "step": 324 + }, + { + "epoch": 0.3960194963444354, + "grad_norm": 0.6083530680570769, + "learning_rate": 9.9686177646179e-06, + "loss": 0.6404, + "step": 325 + }, + { + "epoch": 0.3972380178716491, + "grad_norm": 0.6156210783234582, + "learning_rate": 9.967819107977175e-06, + "loss": 0.626, + "step": 326 + }, + { + "epoch": 0.3984565393988627, + "grad_norm": 0.6913246389420981, + "learning_rate": 9.967010448798376e-06, + "loss": 0.6464, + "step": 327 + }, + { + "epoch": 0.39967506092607635, + "grad_norm": 0.6430895031047548, + "learning_rate": 9.966191788709716e-06, + "loss": 0.6482, + "step": 328 + }, + { + "epoch": 0.40089358245329, + "grad_norm": 0.670581453307023, + "learning_rate": 9.965363129359537e-06, + "loss": 0.649, + "step": 329 + }, + { + "epoch": 0.4021121039805037, + "grad_norm": 0.6373745499675882, + "learning_rate": 9.964524472416319e-06, + "loss": 0.6231, + "step": 330 + }, + { + "epoch": 0.4033306255077173, + "grad_norm": 0.5729524017518108, + "learning_rate": 9.96367581956867e-06, + "loss": 0.639, + "step": 331 + }, + { + "epoch": 0.40454914703493094, + "grad_norm": 0.60528048612915, + "learning_rate": 9.962817172525323e-06, + "loss": 0.6412, + "step": 332 + }, + { + "epoch": 0.4057676685621446, + "grad_norm": 0.5439146819119978, + "learning_rate": 9.961948533015135e-06, + "loss": 0.6463, + "step": 333 + }, + { + "epoch": 0.40698619008935827, + "grad_norm": 0.6696342043794363, + "learning_rate": 9.961069902787082e-06, + "loss": 0.6559, + "step": 334 + }, + { + "epoch": 0.4082047116165719, + "grad_norm": 0.6137113821251218, + "learning_rate": 9.96018128361026e-06, + "loss": 0.6186, + "step": 335 + }, + { + "epoch": 0.40942323314378554, + "grad_norm": 0.7521896228588043, + "learning_rate": 9.959282677273869e-06, + "loss": 0.6585, + "step": 336 + }, + { + "epoch": 0.41064175467099917, + "grad_norm": 0.6161644621872354, + "learning_rate": 9.958374085587228e-06, + "loss": 0.6511, + "step": 337 + }, + { + "epoch": 0.41186027619821286, + "grad_norm": 0.6232166791838529, + "learning_rate": 9.957455510379753e-06, + "loss": 0.6421, + "step": 338 + }, + { + "epoch": 0.4130787977254265, + "grad_norm": 0.6575837363786434, + "learning_rate": 9.956526953500965e-06, + "loss": 0.6288, + "step": 339 + }, + { + "epoch": 0.41429731925264013, + "grad_norm": 0.624761687952515, + "learning_rate": 9.955588416820482e-06, + "loss": 0.6397, + "step": 340 + }, + { + "epoch": 0.41551584077985376, + "grad_norm": 0.6332930756907055, + "learning_rate": 9.954639902228018e-06, + "loss": 0.6444, + "step": 341 + }, + { + "epoch": 0.41673436230706745, + "grad_norm": 0.5746664206825376, + "learning_rate": 9.953681411633376e-06, + "loss": 0.6414, + "step": 342 + }, + { + "epoch": 0.4179528838342811, + "grad_norm": 0.6762777021979247, + "learning_rate": 9.952712946966441e-06, + "loss": 0.6306, + "step": 343 + }, + { + "epoch": 0.4191714053614947, + "grad_norm": 0.6244129529931802, + "learning_rate": 9.951734510177187e-06, + "loss": 0.6366, + "step": 344 + }, + { + "epoch": 0.42038992688870835, + "grad_norm": 0.6226787509569254, + "learning_rate": 9.950746103235663e-06, + "loss": 0.6302, + "step": 345 + }, + { + "epoch": 0.421608448415922, + "grad_norm": 0.6520199261370837, + "learning_rate": 9.949747728131994e-06, + "loss": 0.6816, + "step": 346 + }, + { + "epoch": 0.4228269699431357, + "grad_norm": 0.6026134976644628, + "learning_rate": 9.948739386876376e-06, + "loss": 0.6385, + "step": 347 + }, + { + "epoch": 0.4240454914703493, + "grad_norm": 0.6012466224483265, + "learning_rate": 9.947721081499068e-06, + "loss": 0.6458, + "step": 348 + }, + { + "epoch": 0.42526401299756295, + "grad_norm": 0.5524226925373649, + "learning_rate": 9.946692814050396e-06, + "loss": 0.6281, + "step": 349 + }, + { + "epoch": 0.4264825345247766, + "grad_norm": 0.6055953304742949, + "learning_rate": 9.945654586600741e-06, + "loss": 0.6467, + "step": 350 + }, + { + "epoch": 0.42770105605199027, + "grad_norm": 0.586137745210729, + "learning_rate": 9.944606401240538e-06, + "loss": 0.6379, + "step": 351 + }, + { + "epoch": 0.4289195775792039, + "grad_norm": 0.5125599093697626, + "learning_rate": 9.943548260080277e-06, + "loss": 0.6523, + "step": 352 + }, + { + "epoch": 0.43013809910641754, + "grad_norm": 0.6305973658118967, + "learning_rate": 9.942480165250487e-06, + "loss": 0.6389, + "step": 353 + }, + { + "epoch": 0.43135662063363117, + "grad_norm": 0.5220411272087411, + "learning_rate": 9.941402118901743e-06, + "loss": 0.6425, + "step": 354 + }, + { + "epoch": 0.43257514216084486, + "grad_norm": 0.5753441957701829, + "learning_rate": 9.940314123204656e-06, + "loss": 0.6441, + "step": 355 + }, + { + "epoch": 0.4337936636880585, + "grad_norm": 0.584328279121849, + "learning_rate": 9.939216180349864e-06, + "loss": 0.6359, + "step": 356 + }, + { + "epoch": 0.43501218521527213, + "grad_norm": 0.6135441335146246, + "learning_rate": 9.938108292548044e-06, + "loss": 0.6267, + "step": 357 + }, + { + "epoch": 0.43623070674248576, + "grad_norm": 0.5429972724232678, + "learning_rate": 9.93699046202989e-06, + "loss": 0.611, + "step": 358 + }, + { + "epoch": 0.43744922826969945, + "grad_norm": 0.6487815842031103, + "learning_rate": 9.935862691046114e-06, + "loss": 0.6395, + "step": 359 + }, + { + "epoch": 0.4386677497969131, + "grad_norm": 0.5638558609882317, + "learning_rate": 9.934724981867447e-06, + "loss": 0.6398, + "step": 360 + }, + { + "epoch": 0.4398862713241267, + "grad_norm": 0.7915256825394801, + "learning_rate": 9.93357733678463e-06, + "loss": 0.6275, + "step": 361 + }, + { + "epoch": 0.44110479285134035, + "grad_norm": 0.6072564790199728, + "learning_rate": 9.932419758108403e-06, + "loss": 0.6313, + "step": 362 + }, + { + "epoch": 0.44232331437855404, + "grad_norm": 0.7829204972438968, + "learning_rate": 9.931252248169518e-06, + "loss": 0.6334, + "step": 363 + }, + { + "epoch": 0.4435418359057677, + "grad_norm": 0.6029448727505217, + "learning_rate": 9.930074809318714e-06, + "loss": 0.6469, + "step": 364 + }, + { + "epoch": 0.4447603574329813, + "grad_norm": 0.6793840267075067, + "learning_rate": 9.928887443926725e-06, + "loss": 0.6334, + "step": 365 + }, + { + "epoch": 0.44597887896019495, + "grad_norm": 0.5488302948299049, + "learning_rate": 9.927690154384273e-06, + "loss": 0.6213, + "step": 366 + }, + { + "epoch": 0.44719740048740864, + "grad_norm": 0.7346734434148855, + "learning_rate": 9.92648294310206e-06, + "loss": 0.6295, + "step": 367 + }, + { + "epoch": 0.44841592201462227, + "grad_norm": 0.7457059967309784, + "learning_rate": 9.925265812510767e-06, + "loss": 0.6379, + "step": 368 + }, + { + "epoch": 0.4496344435418359, + "grad_norm": 0.621543177481449, + "learning_rate": 9.924038765061042e-06, + "loss": 0.641, + "step": 369 + }, + { + "epoch": 0.45085296506904954, + "grad_norm": 0.8188643504363363, + "learning_rate": 9.922801803223506e-06, + "loss": 0.6481, + "step": 370 + }, + { + "epoch": 0.45207148659626323, + "grad_norm": 0.6040894853255576, + "learning_rate": 9.921554929488741e-06, + "loss": 0.6493, + "step": 371 + }, + { + "epoch": 0.45329000812347686, + "grad_norm": 0.8455545003582287, + "learning_rate": 9.920298146367287e-06, + "loss": 0.6436, + "step": 372 + }, + { + "epoch": 0.4545085296506905, + "grad_norm": 0.626392939964308, + "learning_rate": 9.919031456389632e-06, + "loss": 0.6303, + "step": 373 + }, + { + "epoch": 0.45572705117790413, + "grad_norm": 0.7483260656404666, + "learning_rate": 9.917754862106216e-06, + "loss": 0.6306, + "step": 374 + }, + { + "epoch": 0.45694557270511776, + "grad_norm": 0.6122181327172058, + "learning_rate": 9.916468366087418e-06, + "loss": 0.6409, + "step": 375 + }, + { + "epoch": 0.45816409423233145, + "grad_norm": 0.5593648989087618, + "learning_rate": 9.915171970923556e-06, + "loss": 0.6583, + "step": 376 + }, + { + "epoch": 0.4593826157595451, + "grad_norm": 0.7626157086282944, + "learning_rate": 9.913865679224876e-06, + "loss": 0.648, + "step": 377 + }, + { + "epoch": 0.4606011372867587, + "grad_norm": 0.5027545868887003, + "learning_rate": 9.912549493621555e-06, + "loss": 0.6378, + "step": 378 + }, + { + "epoch": 0.46181965881397236, + "grad_norm": 0.6593540069533284, + "learning_rate": 9.911223416763689e-06, + "loss": 0.6487, + "step": 379 + }, + { + "epoch": 0.46303818034118605, + "grad_norm": 0.7507657782021496, + "learning_rate": 9.909887451321288e-06, + "loss": 0.6628, + "step": 380 + }, + { + "epoch": 0.4642567018683997, + "grad_norm": 0.5963371403892291, + "learning_rate": 9.908541599984276e-06, + "loss": 0.6304, + "step": 381 + }, + { + "epoch": 0.4654752233956133, + "grad_norm": 0.7456866534587581, + "learning_rate": 9.907185865462476e-06, + "loss": 0.6362, + "step": 382 + }, + { + "epoch": 0.46669374492282695, + "grad_norm": 0.5547991254906135, + "learning_rate": 9.905820250485619e-06, + "loss": 0.631, + "step": 383 + }, + { + "epoch": 0.46791226645004064, + "grad_norm": 0.7089080919365149, + "learning_rate": 9.904444757803322e-06, + "loss": 0.6281, + "step": 384 + }, + { + "epoch": 0.46913078797725427, + "grad_norm": 0.5003916403857714, + "learning_rate": 9.903059390185093e-06, + "loss": 0.6412, + "step": 385 + }, + { + "epoch": 0.4703493095044679, + "grad_norm": 0.6729850093918749, + "learning_rate": 9.901664150420328e-06, + "loss": 0.6329, + "step": 386 + }, + { + "epoch": 0.47156783103168154, + "grad_norm": 0.5557718878026181, + "learning_rate": 9.90025904131829e-06, + "loss": 0.6226, + "step": 387 + }, + { + "epoch": 0.47278635255889523, + "grad_norm": 0.6260971706778755, + "learning_rate": 9.898844065708121e-06, + "loss": 0.6257, + "step": 388 + }, + { + "epoch": 0.47400487408610886, + "grad_norm": 0.5411961675821981, + "learning_rate": 9.89741922643883e-06, + "loss": 0.6517, + "step": 389 + }, + { + "epoch": 0.4752233956133225, + "grad_norm": 0.5597130938499267, + "learning_rate": 9.895984526379282e-06, + "loss": 0.6157, + "step": 390 + }, + { + "epoch": 0.47644191714053613, + "grad_norm": 0.58052501455543, + "learning_rate": 9.894539968418195e-06, + "loss": 0.6322, + "step": 391 + }, + { + "epoch": 0.4776604386677498, + "grad_norm": 0.5211161945377233, + "learning_rate": 9.893085555464143e-06, + "loss": 0.6089, + "step": 392 + }, + { + "epoch": 0.47887896019496345, + "grad_norm": 0.6838111314182518, + "learning_rate": 9.891621290445534e-06, + "loss": 0.632, + "step": 393 + }, + { + "epoch": 0.4800974817221771, + "grad_norm": 0.5785699696283433, + "learning_rate": 9.890147176310618e-06, + "loss": 0.623, + "step": 394 + }, + { + "epoch": 0.4813160032493907, + "grad_norm": 0.6260781225868985, + "learning_rate": 9.888663216027477e-06, + "loss": 0.6433, + "step": 395 + }, + { + "epoch": 0.4825345247766044, + "grad_norm": 0.5634389513735794, + "learning_rate": 9.887169412584012e-06, + "loss": 0.6359, + "step": 396 + }, + { + "epoch": 0.48375304630381805, + "grad_norm": 0.576861556797157, + "learning_rate": 9.885665768987947e-06, + "loss": 0.6289, + "step": 397 + }, + { + "epoch": 0.4849715678310317, + "grad_norm": 0.5991685983326442, + "learning_rate": 9.88415228826682e-06, + "loss": 0.6345, + "step": 398 + }, + { + "epoch": 0.4861900893582453, + "grad_norm": 0.5331826337156919, + "learning_rate": 9.882628973467972e-06, + "loss": 0.6282, + "step": 399 + }, + { + "epoch": 0.487408610885459, + "grad_norm": 0.5052439699487477, + "learning_rate": 9.881095827658548e-06, + "loss": 0.629, + "step": 400 + }, + { + "epoch": 0.48862713241267264, + "grad_norm": 0.5842564825983466, + "learning_rate": 9.879552853925486e-06, + "loss": 0.6518, + "step": 401 + }, + { + "epoch": 0.48984565393988627, + "grad_norm": 0.5538659465643975, + "learning_rate": 9.878000055375512e-06, + "loss": 0.6333, + "step": 402 + }, + { + "epoch": 0.4910641754670999, + "grad_norm": 0.5200827864775698, + "learning_rate": 9.876437435135133e-06, + "loss": 0.6348, + "step": 403 + }, + { + "epoch": 0.49228269699431354, + "grad_norm": 0.6043127912027646, + "learning_rate": 9.874864996350633e-06, + "loss": 0.6136, + "step": 404 + }, + { + "epoch": 0.49350121852152723, + "grad_norm": 0.4948272003142496, + "learning_rate": 9.873282742188066e-06, + "loss": 0.6301, + "step": 405 + }, + { + "epoch": 0.49471974004874086, + "grad_norm": 0.5983030540970795, + "learning_rate": 9.871690675833248e-06, + "loss": 0.6354, + "step": 406 + }, + { + "epoch": 0.4959382615759545, + "grad_norm": 0.5309927588463559, + "learning_rate": 9.87008880049175e-06, + "loss": 0.6316, + "step": 407 + }, + { + "epoch": 0.49715678310316813, + "grad_norm": 0.46510544628039285, + "learning_rate": 9.868477119388897e-06, + "loss": 0.641, + "step": 408 + }, + { + "epoch": 0.4983753046303818, + "grad_norm": 0.4745237655389145, + "learning_rate": 9.866855635769753e-06, + "loss": 0.6484, + "step": 409 + }, + { + "epoch": 0.49959382615759546, + "grad_norm": 0.562173043770555, + "learning_rate": 9.86522435289912e-06, + "loss": 0.6263, + "step": 410 + }, + { + "epoch": 0.5008123476848091, + "grad_norm": 0.5419982591023096, + "learning_rate": 9.863583274061535e-06, + "loss": 0.6197, + "step": 411 + }, + { + "epoch": 0.5020308692120228, + "grad_norm": 0.5709095665576734, + "learning_rate": 9.861932402561253e-06, + "loss": 0.6253, + "step": 412 + }, + { + "epoch": 0.5032493907392364, + "grad_norm": 0.5575561882923015, + "learning_rate": 9.86027174172225e-06, + "loss": 0.6257, + "step": 413 + }, + { + "epoch": 0.50446791226645, + "grad_norm": 0.5818761313113621, + "learning_rate": 9.858601294888212e-06, + "loss": 0.6375, + "step": 414 + }, + { + "epoch": 0.5056864337936637, + "grad_norm": 0.55560278003152, + "learning_rate": 9.856921065422527e-06, + "loss": 0.6327, + "step": 415 + }, + { + "epoch": 0.5069049553208773, + "grad_norm": 0.5142680238787152, + "learning_rate": 9.855231056708281e-06, + "loss": 0.6347, + "step": 416 + }, + { + "epoch": 0.508123476848091, + "grad_norm": 0.5468260799033448, + "learning_rate": 9.853531272148248e-06, + "loss": 0.6165, + "step": 417 + }, + { + "epoch": 0.5093419983753046, + "grad_norm": 0.5366215405716666, + "learning_rate": 9.851821715164891e-06, + "loss": 0.6232, + "step": 418 + }, + { + "epoch": 0.5105605199025183, + "grad_norm": 0.6815769917483668, + "learning_rate": 9.850102389200346e-06, + "loss": 0.6375, + "step": 419 + }, + { + "epoch": 0.511779041429732, + "grad_norm": 0.5766636790628379, + "learning_rate": 9.848373297716414e-06, + "loss": 0.6411, + "step": 420 + }, + { + "epoch": 0.5129975629569455, + "grad_norm": 0.6508434213004275, + "learning_rate": 9.846634444194568e-06, + "loss": 0.6277, + "step": 421 + }, + { + "epoch": 0.5142160844841592, + "grad_norm": 0.5654811023161467, + "learning_rate": 9.844885832135928e-06, + "loss": 0.6192, + "step": 422 + }, + { + "epoch": 0.5154346060113729, + "grad_norm": 0.6220408843438429, + "learning_rate": 9.84312746506127e-06, + "loss": 0.6254, + "step": 423 + }, + { + "epoch": 0.5166531275385865, + "grad_norm": 0.5550144456923615, + "learning_rate": 9.841359346511004e-06, + "loss": 0.6288, + "step": 424 + }, + { + "epoch": 0.5178716490658002, + "grad_norm": 0.5804117244385404, + "learning_rate": 9.83958148004518e-06, + "loss": 0.6244, + "step": 425 + }, + { + "epoch": 0.5190901705930138, + "grad_norm": 0.6245742605810847, + "learning_rate": 9.837793869243468e-06, + "loss": 0.6209, + "step": 426 + }, + { + "epoch": 0.5203086921202275, + "grad_norm": 0.5661037548895256, + "learning_rate": 9.83599651770517e-06, + "loss": 0.6279, + "step": 427 + }, + { + "epoch": 0.5215272136474411, + "grad_norm": 0.5358603369119569, + "learning_rate": 9.834189429049188e-06, + "loss": 0.6307, + "step": 428 + }, + { + "epoch": 0.5227457351746547, + "grad_norm": 0.6122007731857034, + "learning_rate": 9.832372606914038e-06, + "loss": 0.6158, + "step": 429 + }, + { + "epoch": 0.5239642567018684, + "grad_norm": 0.5972271574369769, + "learning_rate": 9.830546054957828e-06, + "loss": 0.6204, + "step": 430 + }, + { + "epoch": 0.525182778229082, + "grad_norm": 0.5443858161988891, + "learning_rate": 9.82870977685826e-06, + "loss": 0.621, + "step": 431 + }, + { + "epoch": 0.5264012997562957, + "grad_norm": 0.6250123596443754, + "learning_rate": 9.826863776312621e-06, + "loss": 0.6408, + "step": 432 + }, + { + "epoch": 0.5276198212835094, + "grad_norm": 0.5933038352389216, + "learning_rate": 9.825008057037769e-06, + "loss": 0.6588, + "step": 433 + }, + { + "epoch": 0.528838342810723, + "grad_norm": 0.6567920347058966, + "learning_rate": 9.823142622770135e-06, + "loss": 0.625, + "step": 434 + }, + { + "epoch": 0.5300568643379366, + "grad_norm": 0.5779776066299945, + "learning_rate": 9.821267477265705e-06, + "loss": 0.6387, + "step": 435 + }, + { + "epoch": 0.5312753858651503, + "grad_norm": 0.570082080677981, + "learning_rate": 9.819382624300027e-06, + "loss": 0.6324, + "step": 436 + }, + { + "epoch": 0.5324939073923639, + "grad_norm": 0.5818606827175574, + "learning_rate": 9.817488067668186e-06, + "loss": 0.644, + "step": 437 + }, + { + "epoch": 0.5337124289195776, + "grad_norm": 0.5476824827124901, + "learning_rate": 9.815583811184809e-06, + "loss": 0.6189, + "step": 438 + }, + { + "epoch": 0.5349309504467912, + "grad_norm": 0.5768267508522074, + "learning_rate": 9.813669858684054e-06, + "loss": 0.6222, + "step": 439 + }, + { + "epoch": 0.5361494719740049, + "grad_norm": 0.5120867453918215, + "learning_rate": 9.8117462140196e-06, + "loss": 0.6204, + "step": 440 + }, + { + "epoch": 0.5373679935012186, + "grad_norm": 0.5186146607717382, + "learning_rate": 9.80981288106464e-06, + "loss": 0.6195, + "step": 441 + }, + { + "epoch": 0.5385865150284321, + "grad_norm": 0.5895698622661449, + "learning_rate": 9.807869863711878e-06, + "loss": 0.6205, + "step": 442 + }, + { + "epoch": 0.5398050365556458, + "grad_norm": 0.5421346973971489, + "learning_rate": 9.805917165873515e-06, + "loss": 0.6303, + "step": 443 + }, + { + "epoch": 0.5410235580828595, + "grad_norm": 0.5227058266380313, + "learning_rate": 9.803954791481239e-06, + "loss": 0.6196, + "step": 444 + }, + { + "epoch": 0.5422420796100731, + "grad_norm": 0.4665750459631165, + "learning_rate": 9.801982744486229e-06, + "loss": 0.628, + "step": 445 + }, + { + "epoch": 0.5434606011372868, + "grad_norm": 0.5340051839015313, + "learning_rate": 9.800001028859135e-06, + "loss": 0.6321, + "step": 446 + }, + { + "epoch": 0.5446791226645004, + "grad_norm": 0.49569443009344233, + "learning_rate": 9.798009648590073e-06, + "loss": 0.6295, + "step": 447 + }, + { + "epoch": 0.545897644191714, + "grad_norm": 0.5589394978947685, + "learning_rate": 9.796008607688624e-06, + "loss": 0.6458, + "step": 448 + }, + { + "epoch": 0.5471161657189277, + "grad_norm": 0.5349825334411198, + "learning_rate": 9.793997910183815e-06, + "loss": 0.6348, + "step": 449 + }, + { + "epoch": 0.5483346872461413, + "grad_norm": 0.5406756193824626, + "learning_rate": 9.79197756012412e-06, + "loss": 0.6352, + "step": 450 + }, + { + "epoch": 0.549553208773355, + "grad_norm": 0.5590939249326192, + "learning_rate": 9.789947561577445e-06, + "loss": 0.6345, + "step": 451 + }, + { + "epoch": 0.5507717303005687, + "grad_norm": 0.5138272981689205, + "learning_rate": 9.787907918631125e-06, + "loss": 0.6457, + "step": 452 + }, + { + "epoch": 0.5519902518277823, + "grad_norm": 0.5967975071520696, + "learning_rate": 9.785858635391913e-06, + "loss": 0.6059, + "step": 453 + }, + { + "epoch": 0.553208773354996, + "grad_norm": 0.4912288949887055, + "learning_rate": 9.783799715985973e-06, + "loss": 0.6254, + "step": 454 + }, + { + "epoch": 0.5544272948822095, + "grad_norm": 0.5903941074513651, + "learning_rate": 9.78173116455887e-06, + "loss": 0.6108, + "step": 455 + }, + { + "epoch": 0.5556458164094232, + "grad_norm": 0.5632794329839387, + "learning_rate": 9.779652985275562e-06, + "loss": 0.6187, + "step": 456 + }, + { + "epoch": 0.5568643379366369, + "grad_norm": 0.5941486268629673, + "learning_rate": 9.777565182320396e-06, + "loss": 0.6184, + "step": 457 + }, + { + "epoch": 0.5580828594638505, + "grad_norm": 0.6416650599158464, + "learning_rate": 9.775467759897092e-06, + "loss": 0.6331, + "step": 458 + }, + { + "epoch": 0.5593013809910642, + "grad_norm": 0.5651281069823211, + "learning_rate": 9.773360722228742e-06, + "loss": 0.6307, + "step": 459 + }, + { + "epoch": 0.5605199025182778, + "grad_norm": 0.6620891236551917, + "learning_rate": 9.771244073557792e-06, + "loss": 0.6078, + "step": 460 + }, + { + "epoch": 0.5617384240454915, + "grad_norm": 0.6015785675867341, + "learning_rate": 9.769117818146048e-06, + "loss": 0.6237, + "step": 461 + }, + { + "epoch": 0.5629569455727051, + "grad_norm": 0.8038047522794796, + "learning_rate": 9.766981960274653e-06, + "loss": 0.6173, + "step": 462 + }, + { + "epoch": 0.5641754670999187, + "grad_norm": 0.6163269598618792, + "learning_rate": 9.764836504244086e-06, + "loss": 0.6264, + "step": 463 + }, + { + "epoch": 0.5653939886271324, + "grad_norm": 0.6244153487192251, + "learning_rate": 9.762681454374148e-06, + "loss": 0.6112, + "step": 464 + }, + { + "epoch": 0.5666125101543461, + "grad_norm": 0.724456218504814, + "learning_rate": 9.760516815003965e-06, + "loss": 0.6255, + "step": 465 + }, + { + "epoch": 0.5678310316815597, + "grad_norm": 0.580652096091434, + "learning_rate": 9.758342590491961e-06, + "loss": 0.6342, + "step": 466 + }, + { + "epoch": 0.5690495532087734, + "grad_norm": 0.6644456071205537, + "learning_rate": 9.756158785215866e-06, + "loss": 0.6127, + "step": 467 + }, + { + "epoch": 0.570268074735987, + "grad_norm": 0.5736293156748269, + "learning_rate": 9.753965403572703e-06, + "loss": 0.6313, + "step": 468 + }, + { + "epoch": 0.5714865962632006, + "grad_norm": 0.6178186373387958, + "learning_rate": 9.751762449978767e-06, + "loss": 0.643, + "step": 469 + }, + { + "epoch": 0.5727051177904143, + "grad_norm": 0.584712916385393, + "learning_rate": 9.749549928869636e-06, + "loss": 0.5948, + "step": 470 + }, + { + "epoch": 0.5739236393176279, + "grad_norm": 0.6116917271773714, + "learning_rate": 9.747327844700147e-06, + "loss": 0.6297, + "step": 471 + }, + { + "epoch": 0.5751421608448416, + "grad_norm": 0.4903955649085751, + "learning_rate": 9.745096201944391e-06, + "loss": 0.6251, + "step": 472 + }, + { + "epoch": 0.5763606823720553, + "grad_norm": 0.6968313476556924, + "learning_rate": 9.742855005095706e-06, + "loss": 0.6117, + "step": 473 + }, + { + "epoch": 0.5775792038992689, + "grad_norm": 0.48897486873959584, + "learning_rate": 9.740604258666668e-06, + "loss": 0.6058, + "step": 474 + }, + { + "epoch": 0.5787977254264826, + "grad_norm": 0.7217629239411762, + "learning_rate": 9.73834396718908e-06, + "loss": 0.6265, + "step": 475 + }, + { + "epoch": 0.5800162469536961, + "grad_norm": 0.613354162646377, + "learning_rate": 9.736074135213962e-06, + "loss": 0.6399, + "step": 476 + }, + { + "epoch": 0.5812347684809098, + "grad_norm": 0.6447055703309105, + "learning_rate": 9.733794767311545e-06, + "loss": 0.6335, + "step": 477 + }, + { + "epoch": 0.5824532900081235, + "grad_norm": 0.5823448015058018, + "learning_rate": 9.731505868071262e-06, + "loss": 0.6262, + "step": 478 + }, + { + "epoch": 0.5836718115353371, + "grad_norm": 0.5125864553684497, + "learning_rate": 9.729207442101736e-06, + "loss": 0.6101, + "step": 479 + }, + { + "epoch": 0.5848903330625508, + "grad_norm": 0.6147081791430226, + "learning_rate": 9.726899494030768e-06, + "loss": 0.6411, + "step": 480 + }, + { + "epoch": 0.5861088545897645, + "grad_norm": 0.5467046907537908, + "learning_rate": 9.724582028505336e-06, + "loss": 0.6203, + "step": 481 + }, + { + "epoch": 0.587327376116978, + "grad_norm": 0.5741960101018327, + "learning_rate": 9.72225505019158e-06, + "loss": 0.624, + "step": 482 + }, + { + "epoch": 0.5885458976441917, + "grad_norm": 0.6709034274446143, + "learning_rate": 9.719918563774793e-06, + "loss": 0.6316, + "step": 483 + }, + { + "epoch": 0.5897644191714053, + "grad_norm": 0.5633926121392079, + "learning_rate": 9.71757257395941e-06, + "loss": 0.6205, + "step": 484 + }, + { + "epoch": 0.590982940698619, + "grad_norm": 0.5752003286544818, + "learning_rate": 9.715217085469009e-06, + "loss": 0.601, + "step": 485 + }, + { + "epoch": 0.5922014622258327, + "grad_norm": 0.6676085473844594, + "learning_rate": 9.712852103046281e-06, + "loss": 0.6425, + "step": 486 + }, + { + "epoch": 0.5934199837530463, + "grad_norm": 0.43714860457984767, + "learning_rate": 9.710477631453044e-06, + "loss": 0.6264, + "step": 487 + }, + { + "epoch": 0.59463850528026, + "grad_norm": 0.7834186015627101, + "learning_rate": 9.708093675470214e-06, + "loss": 0.6294, + "step": 488 + }, + { + "epoch": 0.5958570268074735, + "grad_norm": 0.5229823852593044, + "learning_rate": 9.705700239897809e-06, + "loss": 0.6253, + "step": 489 + }, + { + "epoch": 0.5970755483346872, + "grad_norm": 0.6641427142623177, + "learning_rate": 9.70329732955493e-06, + "loss": 0.6208, + "step": 490 + }, + { + "epoch": 0.5982940698619009, + "grad_norm": 0.5777300627058165, + "learning_rate": 9.70088494927976e-06, + "loss": 0.62, + "step": 491 + }, + { + "epoch": 0.5995125913891145, + "grad_norm": 0.47427848956457735, + "learning_rate": 9.698463103929542e-06, + "loss": 0.6168, + "step": 492 + }, + { + "epoch": 0.6007311129163282, + "grad_norm": 0.6176694192284208, + "learning_rate": 9.696031798380586e-06, + "loss": 0.6192, + "step": 493 + }, + { + "epoch": 0.6019496344435419, + "grad_norm": 0.5380294280704867, + "learning_rate": 9.693591037528239e-06, + "loss": 0.6324, + "step": 494 + }, + { + "epoch": 0.6031681559707555, + "grad_norm": 0.5270092433580651, + "learning_rate": 9.691140826286893e-06, + "loss": 0.6275, + "step": 495 + }, + { + "epoch": 0.6043866774979691, + "grad_norm": 0.5928211370503502, + "learning_rate": 9.688681169589971e-06, + "loss": 0.6295, + "step": 496 + }, + { + "epoch": 0.6056051990251827, + "grad_norm": 0.487281690093329, + "learning_rate": 9.686212072389904e-06, + "loss": 0.6157, + "step": 497 + }, + { + "epoch": 0.6068237205523964, + "grad_norm": 0.5179266059337351, + "learning_rate": 9.68373353965814e-06, + "loss": 0.6098, + "step": 498 + }, + { + "epoch": 0.6080422420796101, + "grad_norm": 0.5314913870970437, + "learning_rate": 9.68124557638512e-06, + "loss": 0.6173, + "step": 499 + }, + { + "epoch": 0.6092607636068237, + "grad_norm": 0.4844744555610714, + "learning_rate": 9.678748187580278e-06, + "loss": 0.6186, + "step": 500 + }, + { + "epoch": 0.6104792851340374, + "grad_norm": 0.5188776477142794, + "learning_rate": 9.676241378272022e-06, + "loss": 0.6168, + "step": 501 + }, + { + "epoch": 0.6116978066612511, + "grad_norm": 0.49668970689497427, + "learning_rate": 9.673725153507727e-06, + "loss": 0.6128, + "step": 502 + }, + { + "epoch": 0.6129163281884646, + "grad_norm": 0.5049088012633238, + "learning_rate": 9.67119951835373e-06, + "loss": 0.6204, + "step": 503 + }, + { + "epoch": 0.6141348497156783, + "grad_norm": 0.5286755135827618, + "learning_rate": 9.66866447789531e-06, + "loss": 0.6321, + "step": 504 + }, + { + "epoch": 0.6153533712428919, + "grad_norm": 0.5414829955250333, + "learning_rate": 9.666120037236692e-06, + "loss": 0.6073, + "step": 505 + }, + { + "epoch": 0.6165718927701056, + "grad_norm": 0.5929807296645003, + "learning_rate": 9.663566201501017e-06, + "loss": 0.6219, + "step": 506 + }, + { + "epoch": 0.6177904142973193, + "grad_norm": 0.565513002212362, + "learning_rate": 9.66100297583035e-06, + "loss": 0.6218, + "step": 507 + }, + { + "epoch": 0.6190089358245329, + "grad_norm": 0.48043459347807704, + "learning_rate": 9.65843036538566e-06, + "loss": 0.607, + "step": 508 + }, + { + "epoch": 0.6202274573517466, + "grad_norm": 0.6289509926942585, + "learning_rate": 9.655848375346812e-06, + "loss": 0.6396, + "step": 509 + }, + { + "epoch": 0.6214459788789602, + "grad_norm": 0.5609440147588081, + "learning_rate": 9.65325701091256e-06, + "loss": 0.6303, + "step": 510 + }, + { + "epoch": 0.6226645004061738, + "grad_norm": 0.5893573188478602, + "learning_rate": 9.650656277300525e-06, + "loss": 0.6166, + "step": 511 + }, + { + "epoch": 0.6238830219333875, + "grad_norm": 0.5628137809478111, + "learning_rate": 9.6480461797472e-06, + "loss": 0.6291, + "step": 512 + }, + { + "epoch": 0.6251015434606011, + "grad_norm": 0.5493464215154626, + "learning_rate": 9.645426723507929e-06, + "loss": 0.6222, + "step": 513 + }, + { + "epoch": 0.6263200649878148, + "grad_norm": 0.5629698357909129, + "learning_rate": 9.6427979138569e-06, + "loss": 0.6317, + "step": 514 + }, + { + "epoch": 0.6275385865150285, + "grad_norm": 0.6664927672498832, + "learning_rate": 9.640159756087136e-06, + "loss": 0.6382, + "step": 515 + }, + { + "epoch": 0.628757108042242, + "grad_norm": 0.5522749634660304, + "learning_rate": 9.637512255510475e-06, + "loss": 0.6143, + "step": 516 + }, + { + "epoch": 0.6299756295694557, + "grad_norm": 0.5532267628661862, + "learning_rate": 9.63485541745757e-06, + "loss": 0.6374, + "step": 517 + }, + { + "epoch": 0.6311941510966693, + "grad_norm": 0.6876124654936631, + "learning_rate": 9.632189247277885e-06, + "loss": 0.6392, + "step": 518 + }, + { + "epoch": 0.632412672623883, + "grad_norm": 0.653192030137328, + "learning_rate": 9.629513750339656e-06, + "loss": 0.6146, + "step": 519 + }, + { + "epoch": 0.6336311941510967, + "grad_norm": 0.5264590327684809, + "learning_rate": 9.626828932029907e-06, + "loss": 0.6187, + "step": 520 + }, + { + "epoch": 0.6348497156783103, + "grad_norm": 0.6140627235902801, + "learning_rate": 9.624134797754437e-06, + "loss": 0.5948, + "step": 521 + }, + { + "epoch": 0.636068237205524, + "grad_norm": 0.715948251788629, + "learning_rate": 9.62143135293779e-06, + "loss": 0.6221, + "step": 522 + }, + { + "epoch": 0.6372867587327377, + "grad_norm": 0.6814424426040064, + "learning_rate": 9.618718603023261e-06, + "loss": 0.6279, + "step": 523 + }, + { + "epoch": 0.6385052802599512, + "grad_norm": 0.600168318088034, + "learning_rate": 9.615996553472885e-06, + "loss": 0.6267, + "step": 524 + }, + { + "epoch": 0.6397238017871649, + "grad_norm": 0.5619413500131725, + "learning_rate": 9.613265209767417e-06, + "loss": 0.6288, + "step": 525 + }, + { + "epoch": 0.6409423233143785, + "grad_norm": 0.5903652755615201, + "learning_rate": 9.610524577406325e-06, + "loss": 0.6305, + "step": 526 + }, + { + "epoch": 0.6421608448415922, + "grad_norm": 0.5087861988940737, + "learning_rate": 9.607774661907783e-06, + "loss": 0.6192, + "step": 527 + }, + { + "epoch": 0.6433793663688059, + "grad_norm": 0.6555944853088764, + "learning_rate": 9.605015468808651e-06, + "loss": 0.6255, + "step": 528 + }, + { + "epoch": 0.6445978878960195, + "grad_norm": 0.6123139168204214, + "learning_rate": 9.602247003664476e-06, + "loss": 0.6185, + "step": 529 + }, + { + "epoch": 0.6458164094232332, + "grad_norm": 0.5503960050113602, + "learning_rate": 9.599469272049468e-06, + "loss": 0.6385, + "step": 530 + }, + { + "epoch": 0.6470349309504468, + "grad_norm": 0.5823472571150912, + "learning_rate": 9.596682279556499e-06, + "loss": 0.6241, + "step": 531 + }, + { + "epoch": 0.6482534524776604, + "grad_norm": 0.5840631388468679, + "learning_rate": 9.593886031797081e-06, + "loss": 0.625, + "step": 532 + }, + { + "epoch": 0.6494719740048741, + "grad_norm": 0.5622117171111194, + "learning_rate": 9.591080534401371e-06, + "loss": 0.6192, + "step": 533 + }, + { + "epoch": 0.6506904955320877, + "grad_norm": 0.5707745901206253, + "learning_rate": 9.588265793018141e-06, + "loss": 0.6391, + "step": 534 + }, + { + "epoch": 0.6519090170593014, + "grad_norm": 0.5896800585312665, + "learning_rate": 9.58544181331478e-06, + "loss": 0.6339, + "step": 535 + }, + { + "epoch": 0.6531275385865151, + "grad_norm": 0.5209906229065117, + "learning_rate": 9.582608600977276e-06, + "loss": 0.601, + "step": 536 + }, + { + "epoch": 0.6543460601137286, + "grad_norm": 0.5155011577582275, + "learning_rate": 9.579766161710209e-06, + "loss": 0.6015, + "step": 537 + }, + { + "epoch": 0.6555645816409423, + "grad_norm": 0.48807425767261786, + "learning_rate": 9.576914501236734e-06, + "loss": 0.6167, + "step": 538 + }, + { + "epoch": 0.656783103168156, + "grad_norm": 0.5579148908182612, + "learning_rate": 9.574053625298577e-06, + "loss": 0.6193, + "step": 539 + }, + { + "epoch": 0.6580016246953696, + "grad_norm": 0.5287053319535842, + "learning_rate": 9.571183539656011e-06, + "loss": 0.6291, + "step": 540 + }, + { + "epoch": 0.6592201462225833, + "grad_norm": 0.6191360016551267, + "learning_rate": 9.568304250087864e-06, + "loss": 0.6139, + "step": 541 + }, + { + "epoch": 0.6604386677497969, + "grad_norm": 0.5099069268786582, + "learning_rate": 9.565415762391485e-06, + "loss": 0.6013, + "step": 542 + }, + { + "epoch": 0.6616571892770106, + "grad_norm": 0.5421293076141, + "learning_rate": 9.562518082382751e-06, + "loss": 0.5907, + "step": 543 + }, + { + "epoch": 0.6628757108042242, + "grad_norm": 0.5498541039203616, + "learning_rate": 9.559611215896041e-06, + "loss": 0.627, + "step": 544 + }, + { + "epoch": 0.6640942323314378, + "grad_norm": 0.5680961983046815, + "learning_rate": 9.556695168784236e-06, + "loss": 0.5952, + "step": 545 + }, + { + "epoch": 0.6653127538586515, + "grad_norm": 0.5218060004228549, + "learning_rate": 9.553769946918698e-06, + "loss": 0.6228, + "step": 546 + }, + { + "epoch": 0.6665312753858651, + "grad_norm": 0.5543031912725007, + "learning_rate": 9.550835556189264e-06, + "loss": 0.6338, + "step": 547 + }, + { + "epoch": 0.6677497969130788, + "grad_norm": 0.5668524593324846, + "learning_rate": 9.547892002504233e-06, + "loss": 0.6219, + "step": 548 + }, + { + "epoch": 0.6689683184402925, + "grad_norm": 0.5873694380478705, + "learning_rate": 9.544939291790352e-06, + "loss": 0.624, + "step": 549 + }, + { + "epoch": 0.670186839967506, + "grad_norm": 0.5399986226537774, + "learning_rate": 9.541977429992803e-06, + "loss": 0.6385, + "step": 550 + }, + { + "epoch": 0.6714053614947197, + "grad_norm": 0.7171400926799747, + "learning_rate": 9.5390064230752e-06, + "loss": 0.621, + "step": 551 + }, + { + "epoch": 0.6726238830219334, + "grad_norm": 0.6092647452638789, + "learning_rate": 9.536026277019562e-06, + "loss": 0.6223, + "step": 552 + }, + { + "epoch": 0.673842404549147, + "grad_norm": 0.683988747327427, + "learning_rate": 9.533036997826315e-06, + "loss": 0.6199, + "step": 553 + }, + { + "epoch": 0.6750609260763607, + "grad_norm": 0.5791819914636441, + "learning_rate": 9.530038591514275e-06, + "loss": 0.6328, + "step": 554 + }, + { + "epoch": 0.6762794476035743, + "grad_norm": 0.6782628719672897, + "learning_rate": 9.527031064120632e-06, + "loss": 0.6127, + "step": 555 + }, + { + "epoch": 0.677497969130788, + "grad_norm": 0.6767775073979123, + "learning_rate": 9.524014421700942e-06, + "loss": 0.6186, + "step": 556 + }, + { + "epoch": 0.6787164906580017, + "grad_norm": 0.5114857558759379, + "learning_rate": 9.520988670329114e-06, + "loss": 0.63, + "step": 557 + }, + { + "epoch": 0.6799350121852152, + "grad_norm": 0.5501380880007342, + "learning_rate": 9.517953816097396e-06, + "loss": 0.5915, + "step": 558 + }, + { + "epoch": 0.6811535337124289, + "grad_norm": 0.6714746829201106, + "learning_rate": 9.514909865116368e-06, + "loss": 0.6067, + "step": 559 + }, + { + "epoch": 0.6823720552396426, + "grad_norm": 0.5375092336126965, + "learning_rate": 9.511856823514924e-06, + "loss": 0.596, + "step": 560 + }, + { + "epoch": 0.6835905767668562, + "grad_norm": 0.6176188040728243, + "learning_rate": 9.508794697440257e-06, + "loss": 0.6333, + "step": 561 + }, + { + "epoch": 0.6848090982940699, + "grad_norm": 0.6212303271054956, + "learning_rate": 9.505723493057862e-06, + "loss": 0.6178, + "step": 562 + }, + { + "epoch": 0.6860276198212835, + "grad_norm": 0.5377188134801542, + "learning_rate": 9.502643216551502e-06, + "loss": 0.6017, + "step": 563 + }, + { + "epoch": 0.6872461413484972, + "grad_norm": 0.6362000539969834, + "learning_rate": 9.499553874123213e-06, + "loss": 0.6392, + "step": 564 + }, + { + "epoch": 0.6884646628757108, + "grad_norm": 0.5480382319562058, + "learning_rate": 9.496455471993284e-06, + "loss": 0.6113, + "step": 565 + }, + { + "epoch": 0.6896831844029244, + "grad_norm": 0.6994517506614581, + "learning_rate": 9.49334801640024e-06, + "loss": 0.6327, + "step": 566 + }, + { + "epoch": 0.6909017059301381, + "grad_norm": 0.5335729160289857, + "learning_rate": 9.490231513600842e-06, + "loss": 0.6218, + "step": 567 + }, + { + "epoch": 0.6921202274573518, + "grad_norm": 0.6063268804347564, + "learning_rate": 9.487105969870068e-06, + "loss": 0.6174, + "step": 568 + }, + { + "epoch": 0.6933387489845654, + "grad_norm": 0.6267394635949436, + "learning_rate": 9.48397139150109e-06, + "loss": 0.605, + "step": 569 + }, + { + "epoch": 0.6945572705117791, + "grad_norm": 0.48229350211609867, + "learning_rate": 9.480827784805278e-06, + "loss": 0.6138, + "step": 570 + }, + { + "epoch": 0.6957757920389926, + "grad_norm": 0.6094361236823382, + "learning_rate": 9.477675156112183e-06, + "loss": 0.616, + "step": 571 + }, + { + "epoch": 0.6969943135662063, + "grad_norm": 0.5646668548267415, + "learning_rate": 9.474513511769513e-06, + "loss": 0.6257, + "step": 572 + }, + { + "epoch": 0.69821283509342, + "grad_norm": 0.5605266691062354, + "learning_rate": 9.47134285814314e-06, + "loss": 0.623, + "step": 573 + }, + { + "epoch": 0.6994313566206336, + "grad_norm": 0.5976205093855237, + "learning_rate": 9.468163201617063e-06, + "loss": 0.6182, + "step": 574 + }, + { + "epoch": 0.7006498781478473, + "grad_norm": 0.5736754942220608, + "learning_rate": 9.464974548593415e-06, + "loss": 0.5973, + "step": 575 + }, + { + "epoch": 0.7018683996750609, + "grad_norm": 0.5782971035374301, + "learning_rate": 9.461776905492446e-06, + "loss": 0.6021, + "step": 576 + }, + { + "epoch": 0.7030869212022746, + "grad_norm": 0.5094228164183464, + "learning_rate": 9.458570278752501e-06, + "loss": 0.6028, + "step": 577 + }, + { + "epoch": 0.7043054427294883, + "grad_norm": 0.5803305530484321, + "learning_rate": 9.455354674830016e-06, + "loss": 0.6224, + "step": 578 + }, + { + "epoch": 0.7055239642567018, + "grad_norm": 0.5229464149205902, + "learning_rate": 9.452130100199504e-06, + "loss": 0.6157, + "step": 579 + }, + { + "epoch": 0.7067424857839155, + "grad_norm": 0.5965075801420928, + "learning_rate": 9.448896561353536e-06, + "loss": 0.6062, + "step": 580 + }, + { + "epoch": 0.7079610073111292, + "grad_norm": 0.5275236801559984, + "learning_rate": 9.445654064802738e-06, + "loss": 0.611, + "step": 581 + }, + { + "epoch": 0.7091795288383428, + "grad_norm": 0.511555457965572, + "learning_rate": 9.442402617075765e-06, + "loss": 0.6263, + "step": 582 + }, + { + "epoch": 0.7103980503655565, + "grad_norm": 0.5490562182756723, + "learning_rate": 9.439142224719302e-06, + "loss": 0.6236, + "step": 583 + }, + { + "epoch": 0.7116165718927701, + "grad_norm": 0.5258200584782562, + "learning_rate": 9.435872894298037e-06, + "loss": 0.6106, + "step": 584 + }, + { + "epoch": 0.7128350934199837, + "grad_norm": 0.5189357566107585, + "learning_rate": 9.43259463239466e-06, + "loss": 0.636, + "step": 585 + }, + { + "epoch": 0.7140536149471974, + "grad_norm": 0.5097577073371684, + "learning_rate": 9.429307445609841e-06, + "loss": 0.6337, + "step": 586 + }, + { + "epoch": 0.715272136474411, + "grad_norm": 0.6069103268356187, + "learning_rate": 9.426011340562222e-06, + "loss": 0.6177, + "step": 587 + }, + { + "epoch": 0.7164906580016247, + "grad_norm": 0.48842546371203027, + "learning_rate": 9.422706323888398e-06, + "loss": 0.6011, + "step": 588 + }, + { + "epoch": 0.7177091795288384, + "grad_norm": 0.5365657101299985, + "learning_rate": 9.419392402242912e-06, + "loss": 0.6007, + "step": 589 + }, + { + "epoch": 0.718927701056052, + "grad_norm": 0.5101507591790149, + "learning_rate": 9.416069582298236e-06, + "loss": 0.6175, + "step": 590 + }, + { + "epoch": 0.7201462225832657, + "grad_norm": 0.4516555710559031, + "learning_rate": 9.412737870744752e-06, + "loss": 0.6107, + "step": 591 + }, + { + "epoch": 0.7213647441104792, + "grad_norm": 0.4881759934731241, + "learning_rate": 9.409397274290756e-06, + "loss": 0.6224, + "step": 592 + }, + { + "epoch": 0.7225832656376929, + "grad_norm": 0.45459978443672416, + "learning_rate": 9.406047799662426e-06, + "loss": 0.6089, + "step": 593 + }, + { + "epoch": 0.7238017871649066, + "grad_norm": 0.505751917086364, + "learning_rate": 9.402689453603815e-06, + "loss": 0.6244, + "step": 594 + }, + { + "epoch": 0.7250203086921202, + "grad_norm": 0.5110751597586063, + "learning_rate": 9.399322242876843e-06, + "loss": 0.601, + "step": 595 + }, + { + "epoch": 0.7262388302193339, + "grad_norm": 0.504579475445371, + "learning_rate": 9.395946174261274e-06, + "loss": 0.6216, + "step": 596 + }, + { + "epoch": 0.7274573517465476, + "grad_norm": 0.534595723022526, + "learning_rate": 9.392561254554712e-06, + "loss": 0.6067, + "step": 597 + }, + { + "epoch": 0.7286758732737612, + "grad_norm": 0.5583009202449097, + "learning_rate": 9.38916749057258e-06, + "loss": 0.6249, + "step": 598 + }, + { + "epoch": 0.7298943948009748, + "grad_norm": 0.5059716144312469, + "learning_rate": 9.385764889148107e-06, + "loss": 0.6115, + "step": 599 + }, + { + "epoch": 0.7311129163281884, + "grad_norm": 0.6121449401534393, + "learning_rate": 9.382353457132318e-06, + "loss": 0.6077, + "step": 600 + }, + { + "epoch": 0.7323314378554021, + "grad_norm": 0.4829522546788395, + "learning_rate": 9.378933201394019e-06, + "loss": 0.6216, + "step": 601 + }, + { + "epoch": 0.7335499593826158, + "grad_norm": 0.5436028145378481, + "learning_rate": 9.375504128819779e-06, + "loss": 0.6185, + "step": 602 + }, + { + "epoch": 0.7347684809098294, + "grad_norm": 0.5172970082009579, + "learning_rate": 9.372066246313922e-06, + "loss": 0.644, + "step": 603 + }, + { + "epoch": 0.7359870024370431, + "grad_norm": 0.4738987982796835, + "learning_rate": 9.368619560798511e-06, + "loss": 0.6246, + "step": 604 + }, + { + "epoch": 0.7372055239642566, + "grad_norm": 0.4525040495867516, + "learning_rate": 9.36516407921333e-06, + "loss": 0.6109, + "step": 605 + }, + { + "epoch": 0.7384240454914703, + "grad_norm": 0.5076237716007553, + "learning_rate": 9.361699808515877e-06, + "loss": 0.6151, + "step": 606 + }, + { + "epoch": 0.739642567018684, + "grad_norm": 0.5074655977130175, + "learning_rate": 9.358226755681342e-06, + "loss": 0.6082, + "step": 607 + }, + { + "epoch": 0.7408610885458976, + "grad_norm": 0.4840933107276308, + "learning_rate": 9.354744927702607e-06, + "loss": 0.615, + "step": 608 + }, + { + "epoch": 0.7420796100731113, + "grad_norm": 0.5219253787729252, + "learning_rate": 9.351254331590216e-06, + "loss": 0.5996, + "step": 609 + }, + { + "epoch": 0.743298131600325, + "grad_norm": 0.5601150249253273, + "learning_rate": 9.347754974372365e-06, + "loss": 0.6032, + "step": 610 + }, + { + "epoch": 0.7445166531275386, + "grad_norm": 0.4986838680038737, + "learning_rate": 9.344246863094893e-06, + "loss": 0.5976, + "step": 611 + }, + { + "epoch": 0.7457351746547523, + "grad_norm": 0.4948788586568317, + "learning_rate": 9.340730004821266e-06, + "loss": 0.6085, + "step": 612 + }, + { + "epoch": 0.7469536961819658, + "grad_norm": 0.5238689007424114, + "learning_rate": 9.33720440663256e-06, + "loss": 0.6129, + "step": 613 + }, + { + "epoch": 0.7481722177091795, + "grad_norm": 0.47607891045094536, + "learning_rate": 9.33367007562745e-06, + "loss": 0.6199, + "step": 614 + }, + { + "epoch": 0.7493907392363932, + "grad_norm": 0.4955962984701164, + "learning_rate": 9.330127018922195e-06, + "loss": 0.5949, + "step": 615 + }, + { + "epoch": 0.7506092607636068, + "grad_norm": 0.6100851359106775, + "learning_rate": 9.326575243650618e-06, + "loss": 0.6143, + "step": 616 + }, + { + "epoch": 0.7518277822908205, + "grad_norm": 0.48084331485799453, + "learning_rate": 9.323014756964104e-06, + "loss": 0.6064, + "step": 617 + }, + { + "epoch": 0.7530463038180342, + "grad_norm": 0.6768728956598579, + "learning_rate": 9.31944556603157e-06, + "loss": 0.6229, + "step": 618 + }, + { + "epoch": 0.7542648253452477, + "grad_norm": 0.6664441895394185, + "learning_rate": 9.315867678039469e-06, + "loss": 0.631, + "step": 619 + }, + { + "epoch": 0.7554833468724614, + "grad_norm": 0.6265982250759069, + "learning_rate": 9.312281100191752e-06, + "loss": 0.63, + "step": 620 + }, + { + "epoch": 0.756701868399675, + "grad_norm": 0.6297592873763573, + "learning_rate": 9.308685839709878e-06, + "loss": 0.6264, + "step": 621 + }, + { + "epoch": 0.7579203899268887, + "grad_norm": 0.5583877292859594, + "learning_rate": 9.305081903832784e-06, + "loss": 0.5974, + "step": 622 + }, + { + "epoch": 0.7591389114541024, + "grad_norm": 0.5001555304308823, + "learning_rate": 9.301469299816874e-06, + "loss": 0.6117, + "step": 623 + }, + { + "epoch": 0.760357432981316, + "grad_norm": 0.5390093336249369, + "learning_rate": 9.297848034936007e-06, + "loss": 0.6088, + "step": 624 + }, + { + "epoch": 0.7615759545085297, + "grad_norm": 0.5678848176997396, + "learning_rate": 9.294218116481476e-06, + "loss": 0.6018, + "step": 625 + }, + { + "epoch": 0.7627944760357434, + "grad_norm": 0.5844799796481355, + "learning_rate": 9.290579551762002e-06, + "loss": 0.604, + "step": 626 + }, + { + "epoch": 0.7640129975629569, + "grad_norm": 0.5159143134307803, + "learning_rate": 9.286932348103716e-06, + "loss": 0.6083, + "step": 627 + }, + { + "epoch": 0.7652315190901706, + "grad_norm": 0.5326620021016965, + "learning_rate": 9.283276512850137e-06, + "loss": 0.6206, + "step": 628 + }, + { + "epoch": 0.7664500406173842, + "grad_norm": 0.5963411548189359, + "learning_rate": 9.27961205336217e-06, + "loss": 0.6108, + "step": 629 + }, + { + "epoch": 0.7676685621445979, + "grad_norm": 0.5014319447503888, + "learning_rate": 9.275938977018082e-06, + "loss": 0.6034, + "step": 630 + }, + { + "epoch": 0.7688870836718116, + "grad_norm": 0.5126870488620024, + "learning_rate": 9.272257291213488e-06, + "loss": 0.6176, + "step": 631 + }, + { + "epoch": 0.7701056051990252, + "grad_norm": 0.4787184158365945, + "learning_rate": 9.268567003361341e-06, + "loss": 0.607, + "step": 632 + }, + { + "epoch": 0.7713241267262388, + "grad_norm": 0.557057771330538, + "learning_rate": 9.264868120891913e-06, + "loss": 0.6318, + "step": 633 + }, + { + "epoch": 0.7725426482534524, + "grad_norm": 0.535409561474859, + "learning_rate": 9.261160651252778e-06, + "loss": 0.62, + "step": 634 + }, + { + "epoch": 0.7737611697806661, + "grad_norm": 0.4814507650875912, + "learning_rate": 9.257444601908806e-06, + "loss": 0.6074, + "step": 635 + }, + { + "epoch": 0.7749796913078798, + "grad_norm": 0.6101990877396614, + "learning_rate": 9.253719980342134e-06, + "loss": 0.6208, + "step": 636 + }, + { + "epoch": 0.7761982128350934, + "grad_norm": 0.5403900228621851, + "learning_rate": 9.249986794052168e-06, + "loss": 0.5968, + "step": 637 + }, + { + "epoch": 0.7774167343623071, + "grad_norm": 0.5703352381203307, + "learning_rate": 9.24624505055555e-06, + "loss": 0.626, + "step": 638 + }, + { + "epoch": 0.7786352558895208, + "grad_norm": 0.5241053254774348, + "learning_rate": 9.24249475738616e-06, + "loss": 0.5959, + "step": 639 + }, + { + "epoch": 0.7798537774167343, + "grad_norm": 0.5780889050780196, + "learning_rate": 9.238735922095083e-06, + "loss": 0.5783, + "step": 640 + }, + { + "epoch": 0.781072298943948, + "grad_norm": 0.5164354758896532, + "learning_rate": 9.234968552250612e-06, + "loss": 0.6192, + "step": 641 + }, + { + "epoch": 0.7822908204711616, + "grad_norm": 0.5672667605052139, + "learning_rate": 9.231192655438222e-06, + "loss": 0.6003, + "step": 642 + }, + { + "epoch": 0.7835093419983753, + "grad_norm": 0.5135255221881695, + "learning_rate": 9.22740823926055e-06, + "loss": 0.6082, + "step": 643 + }, + { + "epoch": 0.784727863525589, + "grad_norm": 0.5584536390516718, + "learning_rate": 9.223615311337395e-06, + "loss": 0.614, + "step": 644 + }, + { + "epoch": 0.7859463850528026, + "grad_norm": 0.5216134140261057, + "learning_rate": 9.219813879305692e-06, + "loss": 0.6012, + "step": 645 + }, + { + "epoch": 0.7871649065800163, + "grad_norm": 0.5736410922364097, + "learning_rate": 9.216003950819497e-06, + "loss": 0.6194, + "step": 646 + }, + { + "epoch": 0.7883834281072299, + "grad_norm": 0.5049300976776431, + "learning_rate": 9.21218553354997e-06, + "loss": 0.6115, + "step": 647 + }, + { + "epoch": 0.7896019496344435, + "grad_norm": 0.5596092247163901, + "learning_rate": 9.208358635185372e-06, + "loss": 0.6002, + "step": 648 + }, + { + "epoch": 0.7908204711616572, + "grad_norm": 0.6492697062225624, + "learning_rate": 9.204523263431034e-06, + "loss": 0.6087, + "step": 649 + }, + { + "epoch": 0.7920389926888708, + "grad_norm": 0.5493287831302429, + "learning_rate": 9.200679426009347e-06, + "loss": 0.6134, + "step": 650 + }, + { + "epoch": 0.7932575142160845, + "grad_norm": 0.5393423473357866, + "learning_rate": 9.196827130659752e-06, + "loss": 0.6077, + "step": 651 + }, + { + "epoch": 0.7944760357432982, + "grad_norm": 0.4822437257768845, + "learning_rate": 9.192966385138714e-06, + "loss": 0.6206, + "step": 652 + }, + { + "epoch": 0.7956945572705117, + "grad_norm": 0.5489723911011465, + "learning_rate": 9.189097197219718e-06, + "loss": 0.6237, + "step": 653 + }, + { + "epoch": 0.7969130787977254, + "grad_norm": 0.465446021569481, + "learning_rate": 9.185219574693242e-06, + "loss": 0.5969, + "step": 654 + }, + { + "epoch": 0.7981316003249391, + "grad_norm": 0.5608574163560325, + "learning_rate": 9.181333525366756e-06, + "loss": 0.6116, + "step": 655 + }, + { + "epoch": 0.7993501218521527, + "grad_norm": 0.47338894132856235, + "learning_rate": 9.177439057064684e-06, + "loss": 0.5898, + "step": 656 + }, + { + "epoch": 0.8005686433793664, + "grad_norm": 0.5538432939088667, + "learning_rate": 9.17353617762841e-06, + "loss": 0.6042, + "step": 657 + }, + { + "epoch": 0.80178716490658, + "grad_norm": 0.5129997268787104, + "learning_rate": 9.169624894916252e-06, + "loss": 0.6045, + "step": 658 + }, + { + "epoch": 0.8030056864337937, + "grad_norm": 0.491484979669411, + "learning_rate": 9.165705216803446e-06, + "loss": 0.6159, + "step": 659 + }, + { + "epoch": 0.8042242079610074, + "grad_norm": 0.4865407913972347, + "learning_rate": 9.161777151182137e-06, + "loss": 0.6095, + "step": 660 + }, + { + "epoch": 0.8054427294882209, + "grad_norm": 0.5482167186016993, + "learning_rate": 9.15784070596135e-06, + "loss": 0.6063, + "step": 661 + }, + { + "epoch": 0.8066612510154346, + "grad_norm": 0.4899874123032885, + "learning_rate": 9.153895889066988e-06, + "loss": 0.5993, + "step": 662 + }, + { + "epoch": 0.8078797725426482, + "grad_norm": 0.4971658879090838, + "learning_rate": 9.149942708441808e-06, + "loss": 0.6349, + "step": 663 + }, + { + "epoch": 0.8090982940698619, + "grad_norm": 0.4774943646678603, + "learning_rate": 9.145981172045407e-06, + "loss": 0.5937, + "step": 664 + }, + { + "epoch": 0.8103168155970756, + "grad_norm": 0.5239506111079297, + "learning_rate": 9.142011287854206e-06, + "loss": 0.596, + "step": 665 + }, + { + "epoch": 0.8115353371242892, + "grad_norm": 0.49171964255133527, + "learning_rate": 9.138033063861436e-06, + "loss": 0.5866, + "step": 666 + }, + { + "epoch": 0.8127538586515028, + "grad_norm": 0.5198610207245239, + "learning_rate": 9.134046508077116e-06, + "loss": 0.6022, + "step": 667 + }, + { + "epoch": 0.8139723801787165, + "grad_norm": 0.4768598644726109, + "learning_rate": 9.130051628528046e-06, + "loss": 0.6057, + "step": 668 + }, + { + "epoch": 0.8151909017059301, + "grad_norm": 0.539806947114795, + "learning_rate": 9.12604843325778e-06, + "loss": 0.6175, + "step": 669 + }, + { + "epoch": 0.8164094232331438, + "grad_norm": 0.49480984634291075, + "learning_rate": 9.122036930326618e-06, + "loss": 0.6214, + "step": 670 + }, + { + "epoch": 0.8176279447603574, + "grad_norm": 0.5006857848218066, + "learning_rate": 9.118017127811591e-06, + "loss": 0.6084, + "step": 671 + }, + { + "epoch": 0.8188464662875711, + "grad_norm": 0.4713529456554149, + "learning_rate": 9.113989033806434e-06, + "loss": 0.6177, + "step": 672 + }, + { + "epoch": 0.8200649878147848, + "grad_norm": 0.5234744664186434, + "learning_rate": 9.10995265642158e-06, + "loss": 0.623, + "step": 673 + }, + { + "epoch": 0.8212835093419983, + "grad_norm": 0.46959588708419714, + "learning_rate": 9.105908003784142e-06, + "loss": 0.6223, + "step": 674 + }, + { + "epoch": 0.822502030869212, + "grad_norm": 0.483130564646199, + "learning_rate": 9.101855084037893e-06, + "loss": 0.6079, + "step": 675 + }, + { + "epoch": 0.8237205523964257, + "grad_norm": 0.4707432015389284, + "learning_rate": 9.097793905343251e-06, + "loss": 0.6246, + "step": 676 + }, + { + "epoch": 0.8249390739236393, + "grad_norm": 0.5109208158836949, + "learning_rate": 9.093724475877262e-06, + "loss": 0.6223, + "step": 677 + }, + { + "epoch": 0.826157595450853, + "grad_norm": 0.524528742300806, + "learning_rate": 9.089646803833589e-06, + "loss": 0.6054, + "step": 678 + }, + { + "epoch": 0.8273761169780666, + "grad_norm": 0.48479589382874644, + "learning_rate": 9.085560897422487e-06, + "loss": 0.5978, + "step": 679 + }, + { + "epoch": 0.8285946385052803, + "grad_norm": 0.520310530932384, + "learning_rate": 9.081466764870795e-06, + "loss": 0.6141, + "step": 680 + }, + { + "epoch": 0.829813160032494, + "grad_norm": 0.5320998645898771, + "learning_rate": 9.07736441442191e-06, + "loss": 0.5952, + "step": 681 + }, + { + "epoch": 0.8310316815597075, + "grad_norm": 0.522944143229052, + "learning_rate": 9.073253854335777e-06, + "loss": 0.5966, + "step": 682 + }, + { + "epoch": 0.8322502030869212, + "grad_norm": 0.5438608445694643, + "learning_rate": 9.069135092888874e-06, + "loss": 0.6036, + "step": 683 + }, + { + "epoch": 0.8334687246141349, + "grad_norm": 0.4929729088140395, + "learning_rate": 9.06500813837419e-06, + "loss": 0.603, + "step": 684 + }, + { + "epoch": 0.8346872461413485, + "grad_norm": 0.5376420120613337, + "learning_rate": 9.060872999101206e-06, + "loss": 0.6151, + "step": 685 + }, + { + "epoch": 0.8359057676685622, + "grad_norm": 0.52471690520972, + "learning_rate": 9.056729683395892e-06, + "loss": 0.581, + "step": 686 + }, + { + "epoch": 0.8371242891957758, + "grad_norm": 0.49865247625736375, + "learning_rate": 9.052578199600675e-06, + "loss": 0.6067, + "step": 687 + }, + { + "epoch": 0.8383428107229894, + "grad_norm": 0.5035636474694776, + "learning_rate": 9.048418556074425e-06, + "loss": 0.605, + "step": 688 + }, + { + "epoch": 0.8395613322502031, + "grad_norm": 0.5460518150855164, + "learning_rate": 9.04425076119245e-06, + "loss": 0.6008, + "step": 689 + }, + { + "epoch": 0.8407798537774167, + "grad_norm": 0.5154326591857874, + "learning_rate": 9.040074823346466e-06, + "loss": 0.612, + "step": 690 + }, + { + "epoch": 0.8419983753046304, + "grad_norm": 0.41895451726050503, + "learning_rate": 9.035890750944583e-06, + "loss": 0.5947, + "step": 691 + }, + { + "epoch": 0.843216896831844, + "grad_norm": 0.49674088276174516, + "learning_rate": 9.03169855241129e-06, + "loss": 0.625, + "step": 692 + }, + { + "epoch": 0.8444354183590577, + "grad_norm": 0.5650371934623263, + "learning_rate": 9.02749823618744e-06, + "loss": 0.5954, + "step": 693 + }, + { + "epoch": 0.8456539398862714, + "grad_norm": 0.5010709938981562, + "learning_rate": 9.02328981073023e-06, + "loss": 0.6071, + "step": 694 + }, + { + "epoch": 0.8468724614134849, + "grad_norm": 0.5831039880668286, + "learning_rate": 9.019073284513184e-06, + "loss": 0.5989, + "step": 695 + }, + { + "epoch": 0.8480909829406986, + "grad_norm": 0.5796544622455602, + "learning_rate": 9.014848666026138e-06, + "loss": 0.6328, + "step": 696 + }, + { + "epoch": 0.8493095044679123, + "grad_norm": 0.5898423233515925, + "learning_rate": 9.01061596377522e-06, + "loss": 0.6316, + "step": 697 + }, + { + "epoch": 0.8505280259951259, + "grad_norm": 0.576717321636104, + "learning_rate": 9.006375186282832e-06, + "loss": 0.6129, + "step": 698 + }, + { + "epoch": 0.8517465475223396, + "grad_norm": 0.5274725251295577, + "learning_rate": 9.002126342087643e-06, + "loss": 0.6103, + "step": 699 + }, + { + "epoch": 0.8529650690495532, + "grad_norm": 0.5405289062395403, + "learning_rate": 8.997869439744555e-06, + "loss": 0.6252, + "step": 700 + }, + { + "epoch": 0.8541835905767668, + "grad_norm": 0.5521347732238037, + "learning_rate": 8.993604487824701e-06, + "loss": 0.6008, + "step": 701 + }, + { + "epoch": 0.8554021121039805, + "grad_norm": 0.5196724445810474, + "learning_rate": 8.989331494915417e-06, + "loss": 0.6185, + "step": 702 + }, + { + "epoch": 0.8566206336311941, + "grad_norm": 0.5683878673891257, + "learning_rate": 8.985050469620236e-06, + "loss": 0.6245, + "step": 703 + }, + { + "epoch": 0.8578391551584078, + "grad_norm": 0.5407694973000146, + "learning_rate": 8.980761420558855e-06, + "loss": 0.6142, + "step": 704 + }, + { + "epoch": 0.8590576766856215, + "grad_norm": 0.5649995760138024, + "learning_rate": 8.976464356367133e-06, + "loss": 0.5985, + "step": 705 + }, + { + "epoch": 0.8602761982128351, + "grad_norm": 0.4922853729727254, + "learning_rate": 8.972159285697066e-06, + "loss": 0.6128, + "step": 706 + }, + { + "epoch": 0.8614947197400488, + "grad_norm": 0.5653149236554849, + "learning_rate": 8.967846217216771e-06, + "loss": 0.6085, + "step": 707 + }, + { + "epoch": 0.8627132412672623, + "grad_norm": 0.5367471044143063, + "learning_rate": 8.963525159610465e-06, + "loss": 0.6148, + "step": 708 + }, + { + "epoch": 0.863931762794476, + "grad_norm": 0.6165337631503633, + "learning_rate": 8.959196121578455e-06, + "loss": 0.6152, + "step": 709 + }, + { + "epoch": 0.8651502843216897, + "grad_norm": 0.4805242301641202, + "learning_rate": 8.954859111837115e-06, + "loss": 0.6012, + "step": 710 + }, + { + "epoch": 0.8663688058489033, + "grad_norm": 0.5673830583367931, + "learning_rate": 8.950514139118868e-06, + "loss": 0.6137, + "step": 711 + }, + { + "epoch": 0.867587327376117, + "grad_norm": 0.6116666852593193, + "learning_rate": 8.946161212172172e-06, + "loss": 0.6067, + "step": 712 + }, + { + "epoch": 0.8688058489033307, + "grad_norm": 0.4787324171983748, + "learning_rate": 8.941800339761503e-06, + "loss": 0.6229, + "step": 713 + }, + { + "epoch": 0.8700243704305443, + "grad_norm": 0.5603801815973803, + "learning_rate": 8.937431530667329e-06, + "loss": 0.6105, + "step": 714 + }, + { + "epoch": 0.871242891957758, + "grad_norm": 0.5681506397184968, + "learning_rate": 8.933054793686102e-06, + "loss": 0.6196, + "step": 715 + }, + { + "epoch": 0.8724614134849715, + "grad_norm": 0.4745590461881841, + "learning_rate": 8.928670137630236e-06, + "loss": 0.6041, + "step": 716 + }, + { + "epoch": 0.8736799350121852, + "grad_norm": 0.5290850478804046, + "learning_rate": 8.924277571328091e-06, + "loss": 0.5968, + "step": 717 + }, + { + "epoch": 0.8748984565393989, + "grad_norm": 0.4724468577981056, + "learning_rate": 8.919877103623949e-06, + "loss": 0.5888, + "step": 718 + }, + { + "epoch": 0.8761169780666125, + "grad_norm": 0.4710021425585232, + "learning_rate": 8.915468743378009e-06, + "loss": 0.6039, + "step": 719 + }, + { + "epoch": 0.8773354995938262, + "grad_norm": 0.5615817507996624, + "learning_rate": 8.911052499466358e-06, + "loss": 0.611, + "step": 720 + }, + { + "epoch": 0.8785540211210398, + "grad_norm": 0.5372617716587773, + "learning_rate": 8.906628380780951e-06, + "loss": 0.5853, + "step": 721 + }, + { + "epoch": 0.8797725426482534, + "grad_norm": 0.4671881493526463, + "learning_rate": 8.902196396229605e-06, + "loss": 0.6135, + "step": 722 + }, + { + "epoch": 0.8809910641754671, + "grad_norm": 0.6571538751607443, + "learning_rate": 8.897756554735976e-06, + "loss": 0.6166, + "step": 723 + }, + { + "epoch": 0.8822095857026807, + "grad_norm": 0.5407143640334066, + "learning_rate": 8.893308865239536e-06, + "loss": 0.5946, + "step": 724 + }, + { + "epoch": 0.8834281072298944, + "grad_norm": 0.53845654868447, + "learning_rate": 8.888853336695558e-06, + "loss": 0.6056, + "step": 725 + }, + { + "epoch": 0.8846466287571081, + "grad_norm": 0.5501103328024185, + "learning_rate": 8.884389978075098e-06, + "loss": 0.5983, + "step": 726 + }, + { + "epoch": 0.8858651502843217, + "grad_norm": 0.5308109296782529, + "learning_rate": 8.879918798364984e-06, + "loss": 0.5777, + "step": 727 + }, + { + "epoch": 0.8870836718115354, + "grad_norm": 0.5017325039220928, + "learning_rate": 8.875439806567786e-06, + "loss": 0.6045, + "step": 728 + }, + { + "epoch": 0.8883021933387489, + "grad_norm": 0.5901206372277947, + "learning_rate": 8.870953011701804e-06, + "loss": 0.604, + "step": 729 + }, + { + "epoch": 0.8895207148659626, + "grad_norm": 0.45439896535640995, + "learning_rate": 8.866458422801048e-06, + "loss": 0.6073, + "step": 730 + }, + { + "epoch": 0.8907392363931763, + "grad_norm": 0.5577426986098635, + "learning_rate": 8.861956048915225e-06, + "loss": 0.5915, + "step": 731 + }, + { + "epoch": 0.8919577579203899, + "grad_norm": 0.6016567936834477, + "learning_rate": 8.857445899109716e-06, + "loss": 0.6046, + "step": 732 + }, + { + "epoch": 0.8931762794476036, + "grad_norm": 0.5445868957449489, + "learning_rate": 8.852927982465553e-06, + "loss": 0.6106, + "step": 733 + }, + { + "epoch": 0.8943948009748173, + "grad_norm": 0.74687623190731, + "learning_rate": 8.848402308079415e-06, + "loss": 0.6106, + "step": 734 + }, + { + "epoch": 0.8956133225020309, + "grad_norm": 0.5720296451679941, + "learning_rate": 8.843868885063594e-06, + "loss": 0.6051, + "step": 735 + }, + { + "epoch": 0.8968318440292445, + "grad_norm": 0.6556133763306434, + "learning_rate": 8.839327722545985e-06, + "loss": 0.6167, + "step": 736 + }, + { + "epoch": 0.8980503655564581, + "grad_norm": 0.564067584928174, + "learning_rate": 8.83477882967007e-06, + "loss": 0.5994, + "step": 737 + }, + { + "epoch": 0.8992688870836718, + "grad_norm": 0.7349456844478599, + "learning_rate": 8.83022221559489e-06, + "loss": 0.6114, + "step": 738 + }, + { + "epoch": 0.9004874086108855, + "grad_norm": 0.5690040907358448, + "learning_rate": 8.82565788949504e-06, + "loss": 0.5881, + "step": 739 + }, + { + "epoch": 0.9017059301380991, + "grad_norm": 0.6984688918514965, + "learning_rate": 8.821085860560633e-06, + "loss": 0.5983, + "step": 740 + }, + { + "epoch": 0.9029244516653128, + "grad_norm": 0.5870268436598589, + "learning_rate": 8.8165061379973e-06, + "loss": 0.6158, + "step": 741 + }, + { + "epoch": 0.9041429731925265, + "grad_norm": 0.730806962459982, + "learning_rate": 8.81191873102616e-06, + "loss": 0.6058, + "step": 742 + }, + { + "epoch": 0.90536149471974, + "grad_norm": 0.5520509944838993, + "learning_rate": 8.807323648883802e-06, + "loss": 0.6076, + "step": 743 + }, + { + "epoch": 0.9065800162469537, + "grad_norm": 0.5674479495642151, + "learning_rate": 8.80272090082227e-06, + "loss": 0.6017, + "step": 744 + }, + { + "epoch": 0.9077985377741673, + "grad_norm": 0.6471015570221698, + "learning_rate": 8.798110496109047e-06, + "loss": 0.6114, + "step": 745 + }, + { + "epoch": 0.909017059301381, + "grad_norm": 0.5077905529540144, + "learning_rate": 8.793492444027027e-06, + "loss": 0.6086, + "step": 746 + }, + { + "epoch": 0.9102355808285947, + "grad_norm": 0.5684591151412205, + "learning_rate": 8.788866753874504e-06, + "loss": 0.5939, + "step": 747 + }, + { + "epoch": 0.9114541023558083, + "grad_norm": 0.5373473945369368, + "learning_rate": 8.784233434965149e-06, + "loss": 0.605, + "step": 748 + }, + { + "epoch": 0.912672623883022, + "grad_norm": 0.4922150085749876, + "learning_rate": 8.779592496627998e-06, + "loss": 0.6016, + "step": 749 + }, + { + "epoch": 0.9138911454102355, + "grad_norm": 0.5346368247367626, + "learning_rate": 8.774943948207427e-06, + "loss": 0.5894, + "step": 750 + }, + { + "epoch": 0.9151096669374492, + "grad_norm": 0.5910293461390073, + "learning_rate": 8.770287799063128e-06, + "loss": 0.5928, + "step": 751 + }, + { + "epoch": 0.9163281884646629, + "grad_norm": 0.45941353467858154, + "learning_rate": 8.765624058570106e-06, + "loss": 0.606, + "step": 752 + }, + { + "epoch": 0.9175467099918765, + "grad_norm": 0.5187731411231332, + "learning_rate": 8.760952736118645e-06, + "loss": 0.6128, + "step": 753 + }, + { + "epoch": 0.9187652315190902, + "grad_norm": 0.5257713049314863, + "learning_rate": 8.756273841114297e-06, + "loss": 0.5954, + "step": 754 + }, + { + "epoch": 0.9199837530463039, + "grad_norm": 0.5158216045537021, + "learning_rate": 8.751587382977862e-06, + "loss": 0.6016, + "step": 755 + }, + { + "epoch": 0.9212022745735174, + "grad_norm": 0.48265635843326626, + "learning_rate": 8.746893371145367e-06, + "loss": 0.6023, + "step": 756 + }, + { + "epoch": 0.9224207961007311, + "grad_norm": 0.56635290896361, + "learning_rate": 8.742191815068048e-06, + "loss": 0.6168, + "step": 757 + }, + { + "epoch": 0.9236393176279447, + "grad_norm": 0.5246869149929032, + "learning_rate": 8.737482724212331e-06, + "loss": 0.6073, + "step": 758 + }, + { + "epoch": 0.9248578391551584, + "grad_norm": 0.5675558144411569, + "learning_rate": 8.732766108059814e-06, + "loss": 0.6089, + "step": 759 + }, + { + "epoch": 0.9260763606823721, + "grad_norm": 0.5373680000020842, + "learning_rate": 8.728041976107247e-06, + "loss": 0.6229, + "step": 760 + }, + { + "epoch": 0.9272948822095857, + "grad_norm": 0.4781724675355625, + "learning_rate": 8.723310337866508e-06, + "loss": 0.6109, + "step": 761 + }, + { + "epoch": 0.9285134037367994, + "grad_norm": 0.5425148864348092, + "learning_rate": 8.718571202864598e-06, + "loss": 0.6135, + "step": 762 + }, + { + "epoch": 0.929731925264013, + "grad_norm": 0.5848574183660457, + "learning_rate": 8.713824580643606e-06, + "loss": 0.5856, + "step": 763 + }, + { + "epoch": 0.9309504467912266, + "grad_norm": 0.5359644668976268, + "learning_rate": 8.709070480760696e-06, + "loss": 0.6005, + "step": 764 + }, + { + "epoch": 0.9321689683184403, + "grad_norm": 0.620026762890768, + "learning_rate": 8.70430891278809e-06, + "loss": 0.6068, + "step": 765 + }, + { + "epoch": 0.9333874898456539, + "grad_norm": 0.47117448230839937, + "learning_rate": 8.699539886313047e-06, + "loss": 0.6252, + "step": 766 + }, + { + "epoch": 0.9346060113728676, + "grad_norm": 0.5057879313386596, + "learning_rate": 8.69476341093784e-06, + "loss": 0.6043, + "step": 767 + }, + { + "epoch": 0.9358245329000813, + "grad_norm": 0.5719864673466165, + "learning_rate": 8.689979496279747e-06, + "loss": 0.6021, + "step": 768 + }, + { + "epoch": 0.9370430544272949, + "grad_norm": 0.4550279435061135, + "learning_rate": 8.685188151971018e-06, + "loss": 0.5903, + "step": 769 + }, + { + "epoch": 0.9382615759545085, + "grad_norm": 0.5815823584929373, + "learning_rate": 8.680389387658866e-06, + "loss": 0.5994, + "step": 770 + }, + { + "epoch": 0.9394800974817222, + "grad_norm": 0.5037028317625714, + "learning_rate": 8.675583213005443e-06, + "loss": 0.619, + "step": 771 + }, + { + "epoch": 0.9406986190089358, + "grad_norm": 0.5242690261886358, + "learning_rate": 8.67076963768782e-06, + "loss": 0.6048, + "step": 772 + }, + { + "epoch": 0.9419171405361495, + "grad_norm": 0.6218367099845817, + "learning_rate": 8.66594867139797e-06, + "loss": 0.5839, + "step": 773 + }, + { + "epoch": 0.9431356620633631, + "grad_norm": 0.47012822627564055, + "learning_rate": 8.661120323842751e-06, + "loss": 0.5901, + "step": 774 + }, + { + "epoch": 0.9443541835905768, + "grad_norm": 0.5922308137676237, + "learning_rate": 8.656284604743877e-06, + "loss": 0.5949, + "step": 775 + }, + { + "epoch": 0.9455727051177905, + "grad_norm": 0.5371260230634575, + "learning_rate": 8.651441523837908e-06, + "loss": 0.623, + "step": 776 + }, + { + "epoch": 0.946791226645004, + "grad_norm": 0.5773759686297267, + "learning_rate": 8.646591090876225e-06, + "loss": 0.6234, + "step": 777 + }, + { + "epoch": 0.9480097481722177, + "grad_norm": 0.5887590407239388, + "learning_rate": 8.641733315625014e-06, + "loss": 0.6111, + "step": 778 + }, + { + "epoch": 0.9492282696994313, + "grad_norm": 0.5226241995561731, + "learning_rate": 8.636868207865244e-06, + "loss": 0.6206, + "step": 779 + }, + { + "epoch": 0.950446791226645, + "grad_norm": 0.6014897765265561, + "learning_rate": 8.631995777392645e-06, + "loss": 0.6098, + "step": 780 + }, + { + "epoch": 0.9516653127538587, + "grad_norm": 0.4728664792789181, + "learning_rate": 8.627116034017697e-06, + "loss": 0.6175, + "step": 781 + }, + { + "epoch": 0.9528838342810723, + "grad_norm": 0.5599521599776955, + "learning_rate": 8.622228987565597e-06, + "loss": 0.6121, + "step": 782 + }, + { + "epoch": 0.954102355808286, + "grad_norm": 0.45561297167703785, + "learning_rate": 8.61733464787625e-06, + "loss": 0.585, + "step": 783 + }, + { + "epoch": 0.9553208773354996, + "grad_norm": 0.4965712938546266, + "learning_rate": 8.612433024804246e-06, + "loss": 0.5844, + "step": 784 + }, + { + "epoch": 0.9565393988627132, + "grad_norm": 0.49923609484853176, + "learning_rate": 8.607524128218842e-06, + "loss": 0.6056, + "step": 785 + }, + { + "epoch": 0.9577579203899269, + "grad_norm": 0.5194489854997212, + "learning_rate": 8.602607968003935e-06, + "loss": 0.6157, + "step": 786 + }, + { + "epoch": 0.9589764419171405, + "grad_norm": 0.45374807644787585, + "learning_rate": 8.597684554058053e-06, + "loss": 0.6131, + "step": 787 + }, + { + "epoch": 0.9601949634443542, + "grad_norm": 0.48980331599376176, + "learning_rate": 8.59275389629432e-06, + "loss": 0.6277, + "step": 788 + }, + { + "epoch": 0.9614134849715679, + "grad_norm": 0.512984376262805, + "learning_rate": 8.587816004640456e-06, + "loss": 0.6079, + "step": 789 + }, + { + "epoch": 0.9626320064987814, + "grad_norm": 0.46938679490869983, + "learning_rate": 8.58287088903874e-06, + "loss": 0.6024, + "step": 790 + }, + { + "epoch": 0.9638505280259951, + "grad_norm": 0.5727370279954419, + "learning_rate": 8.577918559445994e-06, + "loss": 0.6133, + "step": 791 + }, + { + "epoch": 0.9650690495532088, + "grad_norm": 0.46813355754433694, + "learning_rate": 8.572959025833573e-06, + "loss": 0.6091, + "step": 792 + }, + { + "epoch": 0.9662875710804224, + "grad_norm": 0.5352006872401892, + "learning_rate": 8.56799229818733e-06, + "loss": 0.5926, + "step": 793 + }, + { + "epoch": 0.9675060926076361, + "grad_norm": 0.5423797070420179, + "learning_rate": 8.563018386507607e-06, + "loss": 0.6055, + "step": 794 + }, + { + "epoch": 0.9687246141348497, + "grad_norm": 0.5598760717169532, + "learning_rate": 8.558037300809209e-06, + "loss": 0.601, + "step": 795 + }, + { + "epoch": 0.9699431356620634, + "grad_norm": 0.5899307915518814, + "learning_rate": 8.553049051121383e-06, + "loss": 0.5925, + "step": 796 + }, + { + "epoch": 0.971161657189277, + "grad_norm": 0.5817700253793735, + "learning_rate": 8.548053647487808e-06, + "loss": 0.5794, + "step": 797 + }, + { + "epoch": 0.9723801787164906, + "grad_norm": 0.6684891953193655, + "learning_rate": 8.543051099966558e-06, + "loss": 0.6158, + "step": 798 + }, + { + "epoch": 0.9735987002437043, + "grad_norm": 0.6186641627844115, + "learning_rate": 8.538041418630099e-06, + "loss": 0.6045, + "step": 799 + }, + { + "epoch": 0.974817221770918, + "grad_norm": 0.5620245115548018, + "learning_rate": 8.533024613565256e-06, + "loss": 0.6074, + "step": 800 + }, + { + "epoch": 0.9760357432981316, + "grad_norm": 0.5360734619477909, + "learning_rate": 8.5280006948732e-06, + "loss": 0.5781, + "step": 801 + }, + { + "epoch": 0.9772542648253453, + "grad_norm": 0.5649861774930516, + "learning_rate": 8.522969672669419e-06, + "loss": 0.603, + "step": 802 + }, + { + "epoch": 0.9784727863525589, + "grad_norm": 0.5524388375136041, + "learning_rate": 8.517931557083713e-06, + "loss": 0.5927, + "step": 803 + }, + { + "epoch": 0.9796913078797725, + "grad_norm": 0.5048333497363491, + "learning_rate": 8.512886358260162e-06, + "loss": 0.6218, + "step": 804 + }, + { + "epoch": 0.9809098294069862, + "grad_norm": 0.5532699810235799, + "learning_rate": 8.5078340863571e-06, + "loss": 0.5935, + "step": 805 + }, + { + "epoch": 0.9821283509341998, + "grad_norm": 0.482227106626454, + "learning_rate": 8.502774751547108e-06, + "loss": 0.5946, + "step": 806 + }, + { + "epoch": 0.9833468724614135, + "grad_norm": 0.5612628853741157, + "learning_rate": 8.49770836401699e-06, + "loss": 0.6174, + "step": 807 + }, + { + "epoch": 0.9845653939886271, + "grad_norm": 0.5165079876207431, + "learning_rate": 8.492634933967749e-06, + "loss": 0.586, + "step": 808 + }, + { + "epoch": 0.9857839155158408, + "grad_norm": 0.5243350260461674, + "learning_rate": 8.487554471614568e-06, + "loss": 0.598, + "step": 809 + }, + { + "epoch": 0.9870024370430545, + "grad_norm": 0.5334138693548346, + "learning_rate": 8.482466987186785e-06, + "loss": 0.6156, + "step": 810 + }, + { + "epoch": 0.988220958570268, + "grad_norm": 0.5183630888601999, + "learning_rate": 8.477372490927882e-06, + "loss": 0.6043, + "step": 811 + }, + { + "epoch": 0.9894394800974817, + "grad_norm": 0.5064511107410842, + "learning_rate": 8.47227099309546e-06, + "loss": 0.618, + "step": 812 + }, + { + "epoch": 0.9906580016246954, + "grad_norm": 0.502910387382079, + "learning_rate": 8.467162503961209e-06, + "loss": 0.5921, + "step": 813 + }, + { + "epoch": 0.991876523151909, + "grad_norm": 0.6360985673189292, + "learning_rate": 8.462047033810906e-06, + "loss": 0.6196, + "step": 814 + }, + { + "epoch": 0.9930950446791227, + "grad_norm": 0.48804000994343705, + "learning_rate": 8.456924592944377e-06, + "loss": 0.5874, + "step": 815 + }, + { + "epoch": 0.9943135662063363, + "grad_norm": 0.5525784026778128, + "learning_rate": 8.451795191675488e-06, + "loss": 0.6121, + "step": 816 + }, + { + "epoch": 0.99553208773355, + "grad_norm": 0.6244758885512404, + "learning_rate": 8.446658840332115e-06, + "loss": 0.6117, + "step": 817 + }, + { + "epoch": 0.9967506092607636, + "grad_norm": 0.5125354504575084, + "learning_rate": 8.441515549256134e-06, + "loss": 0.6029, + "step": 818 + }, + { + "epoch": 0.9979691307879772, + "grad_norm": 0.48689738688414835, + "learning_rate": 8.436365328803386e-06, + "loss": 0.6118, + "step": 819 + }, + { + "epoch": 0.9991876523151909, + "grad_norm": 0.6498259018985348, + "learning_rate": 8.43120818934367e-06, + "loss": 0.6102, + "step": 820 + }, + { + "epoch": 1.0008123476848092, + "grad_norm": 0.9638337013283915, + "learning_rate": 8.426044141260712e-06, + "loss": 0.9573, + "step": 821 + }, + { + "epoch": 1.0020308692120228, + "grad_norm": 0.49843778312392245, + "learning_rate": 8.420873194952153e-06, + "loss": 0.5312, + "step": 822 + }, + { + "epoch": 1.0032493907392364, + "grad_norm": 0.5736142039977695, + "learning_rate": 8.415695360829521e-06, + "loss": 0.5481, + "step": 823 + }, + { + "epoch": 1.00446791226645, + "grad_norm": 0.5588125856539439, + "learning_rate": 8.410510649318211e-06, + "loss": 0.6112, + "step": 824 + }, + { + "epoch": 1.0056864337936637, + "grad_norm": 0.5238730088532109, + "learning_rate": 8.405319070857466e-06, + "loss": 0.5738, + "step": 825 + }, + { + "epoch": 1.0069049553208773, + "grad_norm": 0.5729923093577028, + "learning_rate": 8.40012063590036e-06, + "loss": 0.563, + "step": 826 + }, + { + "epoch": 1.008123476848091, + "grad_norm": 0.542308645982848, + "learning_rate": 8.394915354913763e-06, + "loss": 0.5825, + "step": 827 + }, + { + "epoch": 1.0093419983753047, + "grad_norm": 0.5635399800755453, + "learning_rate": 8.38970323837834e-06, + "loss": 0.5596, + "step": 828 + }, + { + "epoch": 1.0105605199025183, + "grad_norm": 0.5412240812641438, + "learning_rate": 8.384484296788509e-06, + "loss": 0.583, + "step": 829 + }, + { + "epoch": 1.0117790414297319, + "grad_norm": 0.4985722523246039, + "learning_rate": 8.379258540652438e-06, + "loss": 0.5269, + "step": 830 + }, + { + "epoch": 1.0129975629569457, + "grad_norm": 0.5577073237880519, + "learning_rate": 8.37402598049201e-06, + "loss": 0.5971, + "step": 831 + }, + { + "epoch": 1.0142160844841592, + "grad_norm": 0.5397320632233633, + "learning_rate": 8.368786626842815e-06, + "loss": 0.576, + "step": 832 + }, + { + "epoch": 1.0154346060113728, + "grad_norm": 0.5446374373068642, + "learning_rate": 8.363540490254111e-06, + "loss": 0.5604, + "step": 833 + }, + { + "epoch": 1.0166531275385866, + "grad_norm": 0.5916157265480478, + "learning_rate": 8.358287581288824e-06, + "loss": 0.5977, + "step": 834 + }, + { + "epoch": 1.0178716490658002, + "grad_norm": 0.44317757053413465, + "learning_rate": 8.353027910523506e-06, + "loss": 0.5386, + "step": 835 + }, + { + "epoch": 1.0190901705930138, + "grad_norm": 0.5306644900080113, + "learning_rate": 8.347761488548334e-06, + "loss": 0.5685, + "step": 836 + }, + { + "epoch": 1.0203086921202273, + "grad_norm": 0.554634789156319, + "learning_rate": 8.342488325967068e-06, + "loss": 0.5906, + "step": 837 + }, + { + "epoch": 1.0215272136474411, + "grad_norm": 0.46926134132806735, + "learning_rate": 8.337208433397051e-06, + "loss": 0.5518, + "step": 838 + }, + { + "epoch": 1.0227457351746547, + "grad_norm": 0.5223237573306092, + "learning_rate": 8.331921821469164e-06, + "loss": 0.5482, + "step": 839 + }, + { + "epoch": 1.0239642567018683, + "grad_norm": 0.6456110639127597, + "learning_rate": 8.326628500827826e-06, + "loss": 0.5533, + "step": 840 + }, + { + "epoch": 1.025182778229082, + "grad_norm": 0.49817045727119846, + "learning_rate": 8.321328482130967e-06, + "loss": 0.5828, + "step": 841 + }, + { + "epoch": 1.0264012997562957, + "grad_norm": 0.6439926526967455, + "learning_rate": 8.31602177604999e-06, + "loss": 0.5445, + "step": 842 + }, + { + "epoch": 1.0276198212835093, + "grad_norm": 0.5597217590326287, + "learning_rate": 8.310708393269773e-06, + "loss": 0.5919, + "step": 843 + }, + { + "epoch": 1.028838342810723, + "grad_norm": 0.5067484108191753, + "learning_rate": 8.305388344488636e-06, + "loss": 0.5119, + "step": 844 + }, + { + "epoch": 1.0300568643379366, + "grad_norm": 0.6138111359383427, + "learning_rate": 8.300061640418322e-06, + "loss": 0.5819, + "step": 845 + }, + { + "epoch": 1.0312753858651502, + "grad_norm": 0.5228439245226578, + "learning_rate": 8.294728291783967e-06, + "loss": 0.5488, + "step": 846 + }, + { + "epoch": 1.032493907392364, + "grad_norm": 0.5069119735333029, + "learning_rate": 8.289388309324094e-06, + "loss": 0.5531, + "step": 847 + }, + { + "epoch": 1.0337124289195776, + "grad_norm": 0.6055259774711721, + "learning_rate": 8.284041703790578e-06, + "loss": 0.6323, + "step": 848 + }, + { + "epoch": 1.0349309504467912, + "grad_norm": 0.40577407124920994, + "learning_rate": 8.278688485948634e-06, + "loss": 0.5171, + "step": 849 + }, + { + "epoch": 1.036149471974005, + "grad_norm": 0.5480653507617855, + "learning_rate": 8.273328666576783e-06, + "loss": 0.5708, + "step": 850 + }, + { + "epoch": 1.0373679935012186, + "grad_norm": 0.5332307457846426, + "learning_rate": 8.267962256466845e-06, + "loss": 0.5802, + "step": 851 + }, + { + "epoch": 1.0385865150284321, + "grad_norm": 0.45617231239236866, + "learning_rate": 8.262589266423908e-06, + "loss": 0.5367, + "step": 852 + }, + { + "epoch": 1.0398050365556457, + "grad_norm": 0.4487718203264924, + "learning_rate": 8.257209707266308e-06, + "loss": 0.5412, + "step": 853 + }, + { + "epoch": 1.0410235580828595, + "grad_norm": 0.49617901681065096, + "learning_rate": 8.251823589825608e-06, + "loss": 0.582, + "step": 854 + }, + { + "epoch": 1.042242079610073, + "grad_norm": 0.47465221989539974, + "learning_rate": 8.246430924946575e-06, + "loss": 0.5377, + "step": 855 + }, + { + "epoch": 1.0434606011372867, + "grad_norm": 0.4988725203576914, + "learning_rate": 8.24103172348716e-06, + "loss": 0.6148, + "step": 856 + }, + { + "epoch": 1.0446791226645005, + "grad_norm": 0.4769299659284957, + "learning_rate": 8.235625996318475e-06, + "loss": 0.5376, + "step": 857 + }, + { + "epoch": 1.045897644191714, + "grad_norm": 0.5418879737499556, + "learning_rate": 8.230213754324773e-06, + "loss": 0.5688, + "step": 858 + }, + { + "epoch": 1.0471161657189276, + "grad_norm": 0.4361367720716124, + "learning_rate": 8.22479500840342e-06, + "loss": 0.5337, + "step": 859 + }, + { + "epoch": 1.0483346872461414, + "grad_norm": 0.5323815827851344, + "learning_rate": 8.219369769464883e-06, + "loss": 0.6055, + "step": 860 + }, + { + "epoch": 1.049553208773355, + "grad_norm": 0.5879673529136081, + "learning_rate": 8.213938048432697e-06, + "loss": 0.5415, + "step": 861 + }, + { + "epoch": 1.0507717303005686, + "grad_norm": 0.4684259408064238, + "learning_rate": 8.208499856243453e-06, + "loss": 0.5515, + "step": 862 + }, + { + "epoch": 1.0519902518277824, + "grad_norm": 0.5196995774290054, + "learning_rate": 8.20305520384677e-06, + "loss": 0.5934, + "step": 863 + }, + { + "epoch": 1.053208773354996, + "grad_norm": 0.555821404956024, + "learning_rate": 8.19760410220527e-06, + "loss": 0.5608, + "step": 864 + }, + { + "epoch": 1.0544272948822095, + "grad_norm": 0.49067810902195214, + "learning_rate": 8.19214656229457e-06, + "loss": 0.5338, + "step": 865 + }, + { + "epoch": 1.0556458164094233, + "grad_norm": 0.5035110725818862, + "learning_rate": 8.186682595103241e-06, + "loss": 0.579, + "step": 866 + }, + { + "epoch": 1.056864337936637, + "grad_norm": 0.5005979772843533, + "learning_rate": 8.1812122116328e-06, + "loss": 0.5824, + "step": 867 + }, + { + "epoch": 1.0580828594638505, + "grad_norm": 0.5504829458164456, + "learning_rate": 8.175735422897682e-06, + "loss": 0.5574, + "step": 868 + }, + { + "epoch": 1.059301380991064, + "grad_norm": 0.5207101568397476, + "learning_rate": 8.170252239925215e-06, + "loss": 0.5894, + "step": 869 + }, + { + "epoch": 1.0605199025182779, + "grad_norm": 0.41793216877614997, + "learning_rate": 8.16476267375561e-06, + "loss": 0.509, + "step": 870 + }, + { + "epoch": 1.0617384240454915, + "grad_norm": 0.5270083025323902, + "learning_rate": 8.159266735441922e-06, + "loss": 0.584, + "step": 871 + }, + { + "epoch": 1.062956945572705, + "grad_norm": 0.4966922910229618, + "learning_rate": 8.15376443605004e-06, + "loss": 0.5269, + "step": 872 + }, + { + "epoch": 1.0641754670999188, + "grad_norm": 0.4961677071135526, + "learning_rate": 8.148255786658661e-06, + "loss": 0.6035, + "step": 873 + }, + { + "epoch": 1.0653939886271324, + "grad_norm": 0.4946533201405728, + "learning_rate": 8.142740798359268e-06, + "loss": 0.5932, + "step": 874 + }, + { + "epoch": 1.066612510154346, + "grad_norm": 0.49312465250267673, + "learning_rate": 8.137219482256102e-06, + "loss": 0.5337, + "step": 875 + }, + { + "epoch": 1.0678310316815598, + "grad_norm": 0.5074238436289318, + "learning_rate": 8.131691849466154e-06, + "loss": 0.5536, + "step": 876 + }, + { + "epoch": 1.0690495532087734, + "grad_norm": 0.5179722934326702, + "learning_rate": 8.126157911119124e-06, + "loss": 0.5781, + "step": 877 + }, + { + "epoch": 1.070268074735987, + "grad_norm": 0.42106727984073683, + "learning_rate": 8.120617678357415e-06, + "loss": 0.5364, + "step": 878 + }, + { + "epoch": 1.0714865962632008, + "grad_norm": 0.5619541047984238, + "learning_rate": 8.115071162336099e-06, + "loss": 0.6302, + "step": 879 + }, + { + "epoch": 1.0727051177904143, + "grad_norm": 0.48218497269212, + "learning_rate": 8.109518374222902e-06, + "loss": 0.5081, + "step": 880 + }, + { + "epoch": 1.073923639317628, + "grad_norm": 0.5288776434466912, + "learning_rate": 8.103959325198178e-06, + "loss": 0.6161, + "step": 881 + }, + { + "epoch": 1.0751421608448415, + "grad_norm": 0.4396305550189922, + "learning_rate": 8.098394026454886e-06, + "loss": 0.5269, + "step": 882 + }, + { + "epoch": 1.0763606823720553, + "grad_norm": 0.5705187563085431, + "learning_rate": 8.09282248919857e-06, + "loss": 0.5918, + "step": 883 + }, + { + "epoch": 1.0775792038992689, + "grad_norm": 0.5173394574008403, + "learning_rate": 8.087244724647333e-06, + "loss": 0.55, + "step": 884 + }, + { + "epoch": 1.0787977254264824, + "grad_norm": 0.5259195540857357, + "learning_rate": 8.081660744031818e-06, + "loss": 0.5587, + "step": 885 + }, + { + "epoch": 1.0800162469536962, + "grad_norm": 0.5013768900277689, + "learning_rate": 8.076070558595188e-06, + "loss": 0.5847, + "step": 886 + }, + { + "epoch": 1.0812347684809098, + "grad_norm": 0.5113716323758455, + "learning_rate": 8.070474179593088e-06, + "loss": 0.5841, + "step": 887 + }, + { + "epoch": 1.0824532900081234, + "grad_norm": 0.4304893769830929, + "learning_rate": 8.064871618293647e-06, + "loss": 0.474, + "step": 888 + }, + { + "epoch": 1.0836718115353372, + "grad_norm": 0.5581590870053381, + "learning_rate": 8.05926288597743e-06, + "loss": 0.5883, + "step": 889 + }, + { + "epoch": 1.0848903330625508, + "grad_norm": 0.5966885478295298, + "learning_rate": 8.053647993937436e-06, + "loss": 0.6114, + "step": 890 + }, + { + "epoch": 1.0861088545897644, + "grad_norm": 0.45798182910038504, + "learning_rate": 8.048026953479062e-06, + "loss": 0.5349, + "step": 891 + }, + { + "epoch": 1.0873273761169782, + "grad_norm": 0.5977190234288519, + "learning_rate": 8.042399775920084e-06, + "loss": 0.5822, + "step": 892 + }, + { + "epoch": 1.0885458976441917, + "grad_norm": 0.5579549068887683, + "learning_rate": 8.036766472590636e-06, + "loss": 0.5892, + "step": 893 + }, + { + "epoch": 1.0897644191714053, + "grad_norm": 0.5035624965150097, + "learning_rate": 8.031127054833192e-06, + "loss": 0.5278, + "step": 894 + }, + { + "epoch": 1.090982940698619, + "grad_norm": 0.569184764093924, + "learning_rate": 8.025481534002524e-06, + "loss": 0.5904, + "step": 895 + }, + { + "epoch": 1.0922014622258327, + "grad_norm": 0.47339482033152885, + "learning_rate": 8.019829921465703e-06, + "loss": 0.5598, + "step": 896 + }, + { + "epoch": 1.0934199837530463, + "grad_norm": 0.4510131001279952, + "learning_rate": 8.014172228602063e-06, + "loss": 0.5218, + "step": 897 + }, + { + "epoch": 1.0946385052802599, + "grad_norm": 0.5778676271124781, + "learning_rate": 8.00850846680318e-06, + "loss": 0.6047, + "step": 898 + }, + { + "epoch": 1.0958570268074737, + "grad_norm": 0.437095810398411, + "learning_rate": 8.002838647472848e-06, + "loss": 0.5497, + "step": 899 + }, + { + "epoch": 1.0970755483346872, + "grad_norm": 0.5562520913467127, + "learning_rate": 7.997162782027061e-06, + "loss": 0.5555, + "step": 900 + }, + { + "epoch": 1.0982940698619008, + "grad_norm": 0.49447252137766545, + "learning_rate": 7.991480881893982e-06, + "loss": 0.5282, + "step": 901 + }, + { + "epoch": 1.0995125913891146, + "grad_norm": 0.5223776301957348, + "learning_rate": 7.985792958513932e-06, + "loss": 0.5936, + "step": 902 + }, + { + "epoch": 1.1007311129163282, + "grad_norm": 0.43743454592876513, + "learning_rate": 7.98009902333935e-06, + "loss": 0.5209, + "step": 903 + }, + { + "epoch": 1.1019496344435418, + "grad_norm": 0.48630293369462313, + "learning_rate": 7.974399087834786e-06, + "loss": 0.5629, + "step": 904 + }, + { + "epoch": 1.1031681559707556, + "grad_norm": 0.4518898797022784, + "learning_rate": 7.968693163476872e-06, + "loss": 0.5469, + "step": 905 + }, + { + "epoch": 1.1043866774979691, + "grad_norm": 0.5599257334925746, + "learning_rate": 7.962981261754295e-06, + "loss": 0.6093, + "step": 906 + }, + { + "epoch": 1.1056051990251827, + "grad_norm": 0.508379851023288, + "learning_rate": 7.957263394167778e-06, + "loss": 0.5502, + "step": 907 + }, + { + "epoch": 1.1068237205523965, + "grad_norm": 0.46905549399423435, + "learning_rate": 7.951539572230058e-06, + "loss": 0.5498, + "step": 908 + }, + { + "epoch": 1.10804224207961, + "grad_norm": 0.5331570716206057, + "learning_rate": 7.945809807465857e-06, + "loss": 0.5936, + "step": 909 + }, + { + "epoch": 1.1092607636068237, + "grad_norm": 0.43447287523932976, + "learning_rate": 7.940074111411869e-06, + "loss": 0.5205, + "step": 910 + }, + { + "epoch": 1.1104792851340373, + "grad_norm": 0.4675250634574423, + "learning_rate": 7.934332495616723e-06, + "loss": 0.5921, + "step": 911 + }, + { + "epoch": 1.111697806661251, + "grad_norm": 0.5710382430607513, + "learning_rate": 7.928584971640974e-06, + "loss": 0.5528, + "step": 912 + }, + { + "epoch": 1.1129163281884646, + "grad_norm": 0.43616129376419555, + "learning_rate": 7.922831551057068e-06, + "loss": 0.5304, + "step": 913 + }, + { + "epoch": 1.1141348497156782, + "grad_norm": 0.4931780007557348, + "learning_rate": 7.917072245449327e-06, + "loss": 0.5667, + "step": 914 + }, + { + "epoch": 1.115353371242892, + "grad_norm": 0.46266355232192513, + "learning_rate": 7.91130706641392e-06, + "loss": 0.557, + "step": 915 + }, + { + "epoch": 1.1165718927701056, + "grad_norm": 0.4769121004651534, + "learning_rate": 7.90553602555884e-06, + "loss": 0.5761, + "step": 916 + }, + { + "epoch": 1.1177904142973192, + "grad_norm": 0.4543130942521957, + "learning_rate": 7.899759134503888e-06, + "loss": 0.5667, + "step": 917 + }, + { + "epoch": 1.119008935824533, + "grad_norm": 0.4622820207175306, + "learning_rate": 7.893976404880643e-06, + "loss": 0.5217, + "step": 918 + }, + { + "epoch": 1.1202274573517466, + "grad_norm": 0.45946359941638926, + "learning_rate": 7.888187848332434e-06, + "loss": 0.552, + "step": 919 + }, + { + "epoch": 1.1214459788789601, + "grad_norm": 0.5221530283186372, + "learning_rate": 7.88239347651433e-06, + "loss": 0.6037, + "step": 920 + }, + { + "epoch": 1.122664500406174, + "grad_norm": 0.490304437758209, + "learning_rate": 7.876593301093104e-06, + "loss": 0.5435, + "step": 921 + }, + { + "epoch": 1.1238830219333875, + "grad_norm": 0.5353872887084351, + "learning_rate": 7.870787333747216e-06, + "loss": 0.5465, + "step": 922 + }, + { + "epoch": 1.125101543460601, + "grad_norm": 0.5305459219097892, + "learning_rate": 7.864975586166788e-06, + "loss": 0.5401, + "step": 923 + }, + { + "epoch": 1.126320064987815, + "grad_norm": 0.4522121891276298, + "learning_rate": 7.859158070053578e-06, + "loss": 0.56, + "step": 924 + }, + { + "epoch": 1.1275385865150285, + "grad_norm": 0.5400674612069138, + "learning_rate": 7.853334797120961e-06, + "loss": 0.5938, + "step": 925 + }, + { + "epoch": 1.128757108042242, + "grad_norm": 0.4735679351556697, + "learning_rate": 7.847505779093906e-06, + "loss": 0.5517, + "step": 926 + }, + { + "epoch": 1.1299756295694556, + "grad_norm": 0.48850903658646466, + "learning_rate": 7.841671027708945e-06, + "loss": 0.5805, + "step": 927 + }, + { + "epoch": 1.1311941510966694, + "grad_norm": 0.4465079826503964, + "learning_rate": 7.835830554714153e-06, + "loss": 0.5332, + "step": 928 + }, + { + "epoch": 1.132412672623883, + "grad_norm": 0.5630070888376983, + "learning_rate": 7.82998437186913e-06, + "loss": 0.5744, + "step": 929 + }, + { + "epoch": 1.1336311941510966, + "grad_norm": 0.4850227941162986, + "learning_rate": 7.824132490944968e-06, + "loss": 0.5284, + "step": 930 + }, + { + "epoch": 1.1348497156783104, + "grad_norm": 0.5473017535296978, + "learning_rate": 7.818274923724237e-06, + "loss": 0.5853, + "step": 931 + }, + { + "epoch": 1.136068237205524, + "grad_norm": 0.6180360857968815, + "learning_rate": 7.81241168200095e-06, + "loss": 0.6005, + "step": 932 + }, + { + "epoch": 1.1372867587327375, + "grad_norm": 0.606221772548701, + "learning_rate": 7.80654277758055e-06, + "loss": 0.5534, + "step": 933 + }, + { + "epoch": 1.1385052802599513, + "grad_norm": 0.4683974906247182, + "learning_rate": 7.80066822227988e-06, + "loss": 0.5557, + "step": 934 + }, + { + "epoch": 1.139723801787165, + "grad_norm": 0.5733918926578689, + "learning_rate": 7.794788027927165e-06, + "loss": 0.5617, + "step": 935 + }, + { + "epoch": 1.1409423233143785, + "grad_norm": 0.5394769205967501, + "learning_rate": 7.788902206361974e-06, + "loss": 0.5949, + "step": 936 + }, + { + "epoch": 1.1421608448415923, + "grad_norm": 0.4616046919338432, + "learning_rate": 7.783010769435216e-06, + "loss": 0.5173, + "step": 937 + }, + { + "epoch": 1.1433793663688059, + "grad_norm": 0.5796955213884182, + "learning_rate": 7.7771137290091e-06, + "loss": 0.5924, + "step": 938 + }, + { + "epoch": 1.1445978878960195, + "grad_norm": 0.5847720129488866, + "learning_rate": 7.771211096957125e-06, + "loss": 0.5562, + "step": 939 + }, + { + "epoch": 1.145816409423233, + "grad_norm": 0.5171314095714995, + "learning_rate": 7.765302885164038e-06, + "loss": 0.5548, + "step": 940 + }, + { + "epoch": 1.1470349309504468, + "grad_norm": 0.49901458608547633, + "learning_rate": 7.759389105525832e-06, + "loss": 0.5725, + "step": 941 + }, + { + "epoch": 1.1482534524776604, + "grad_norm": 0.5352472484551857, + "learning_rate": 7.753469769949701e-06, + "loss": 0.5582, + "step": 942 + }, + { + "epoch": 1.149471974004874, + "grad_norm": 0.6669984026812862, + "learning_rate": 7.747544890354031e-06, + "loss": 0.6313, + "step": 943 + }, + { + "epoch": 1.1506904955320878, + "grad_norm": 0.4640017618478166, + "learning_rate": 7.74161447866837e-06, + "loss": 0.5275, + "step": 944 + }, + { + "epoch": 1.1519090170593014, + "grad_norm": 0.5032260303359475, + "learning_rate": 7.735678546833403e-06, + "loss": 0.5405, + "step": 945 + }, + { + "epoch": 1.153127538586515, + "grad_norm": 0.545384651096698, + "learning_rate": 7.729737106800932e-06, + "loss": 0.5856, + "step": 946 + }, + { + "epoch": 1.1543460601137288, + "grad_norm": 0.5735240939112272, + "learning_rate": 7.723790170533848e-06, + "loss": 0.571, + "step": 947 + }, + { + "epoch": 1.1555645816409423, + "grad_norm": 0.4552234746793405, + "learning_rate": 7.717837750006106e-06, + "loss": 0.5067, + "step": 948 + }, + { + "epoch": 1.156783103168156, + "grad_norm": 0.49406048197174507, + "learning_rate": 7.71187985720271e-06, + "loss": 0.592, + "step": 949 + }, + { + "epoch": 1.1580016246953697, + "grad_norm": 0.5489847996831881, + "learning_rate": 7.705916504119679e-06, + "loss": 0.5716, + "step": 950 + }, + { + "epoch": 1.1592201462225833, + "grad_norm": 0.48074624532511123, + "learning_rate": 7.699947702764021e-06, + "loss": 0.5287, + "step": 951 + }, + { + "epoch": 1.1604386677497969, + "grad_norm": 0.4833115004977427, + "learning_rate": 7.693973465153724e-06, + "loss": 0.5667, + "step": 952 + }, + { + "epoch": 1.1616571892770104, + "grad_norm": 0.5472052571967937, + "learning_rate": 7.68799380331771e-06, + "loss": 0.5806, + "step": 953 + }, + { + "epoch": 1.1628757108042242, + "grad_norm": 0.4381241429842595, + "learning_rate": 7.682008729295834e-06, + "loss": 0.5448, + "step": 954 + }, + { + "epoch": 1.1640942323314378, + "grad_norm": 0.6129536550799662, + "learning_rate": 7.676018255138841e-06, + "loss": 0.6091, + "step": 955 + }, + { + "epoch": 1.1653127538586514, + "grad_norm": 0.524234969513479, + "learning_rate": 7.67002239290835e-06, + "loss": 0.5363, + "step": 956 + }, + { + "epoch": 1.1665312753858652, + "grad_norm": 0.43755065750263256, + "learning_rate": 7.664021154676828e-06, + "loss": 0.5683, + "step": 957 + }, + { + "epoch": 1.1677497969130788, + "grad_norm": 0.4767439220213808, + "learning_rate": 7.658014552527572e-06, + "loss": 0.5201, + "step": 958 + }, + { + "epoch": 1.1689683184402924, + "grad_norm": 0.6051473086713034, + "learning_rate": 7.652002598554675e-06, + "loss": 0.6148, + "step": 959 + }, + { + "epoch": 1.1701868399675062, + "grad_norm": 0.442810424258257, + "learning_rate": 7.645985304863004e-06, + "loss": 0.5089, + "step": 960 + }, + { + "epoch": 1.1714053614947197, + "grad_norm": 0.5212534237408961, + "learning_rate": 7.639962683568178e-06, + "loss": 0.6398, + "step": 961 + }, + { + "epoch": 1.1726238830219333, + "grad_norm": 0.4782128214916858, + "learning_rate": 7.633934746796545e-06, + "loss": 0.5247, + "step": 962 + }, + { + "epoch": 1.1738424045491471, + "grad_norm": 0.555997733569589, + "learning_rate": 7.627901506685157e-06, + "loss": 0.57, + "step": 963 + }, + { + "epoch": 1.1750609260763607, + "grad_norm": 0.4524690440478936, + "learning_rate": 7.621862975381739e-06, + "loss": 0.5032, + "step": 964 + }, + { + "epoch": 1.1762794476035743, + "grad_norm": 0.5558207018565952, + "learning_rate": 7.615819165044671e-06, + "loss": 0.6055, + "step": 965 + }, + { + "epoch": 1.1774979691307879, + "grad_norm": 0.5285401986639633, + "learning_rate": 7.609770087842969e-06, + "loss": 0.5232, + "step": 966 + }, + { + "epoch": 1.1787164906580017, + "grad_norm": 0.4906926197877719, + "learning_rate": 7.603715755956243e-06, + "loss": 0.6184, + "step": 967 + }, + { + "epoch": 1.1799350121852152, + "grad_norm": 0.5453800647325697, + "learning_rate": 7.597656181574691e-06, + "loss": 0.5449, + "step": 968 + }, + { + "epoch": 1.181153533712429, + "grad_norm": 0.532023507332386, + "learning_rate": 7.5915913768990615e-06, + "loss": 0.574, + "step": 969 + }, + { + "epoch": 1.1823720552396426, + "grad_norm": 0.46068002444123424, + "learning_rate": 7.585521354140638e-06, + "loss": 0.5616, + "step": 970 + }, + { + "epoch": 1.1835905767668562, + "grad_norm": 0.45366600351939207, + "learning_rate": 7.57944612552121e-06, + "loss": 0.5576, + "step": 971 + }, + { + "epoch": 1.1848090982940698, + "grad_norm": 0.5035963241142227, + "learning_rate": 7.573365703273045e-06, + "loss": 0.5842, + "step": 972 + }, + { + "epoch": 1.1860276198212836, + "grad_norm": 0.46429524269523453, + "learning_rate": 7.567280099638874e-06, + "loss": 0.5603, + "step": 973 + }, + { + "epoch": 1.1872461413484972, + "grad_norm": 0.4391995658392802, + "learning_rate": 7.561189326871854e-06, + "loss": 0.5483, + "step": 974 + }, + { + "epoch": 1.1884646628757107, + "grad_norm": 0.5688078918566764, + "learning_rate": 7.555093397235553e-06, + "loss": 0.6145, + "step": 975 + }, + { + "epoch": 1.1896831844029245, + "grad_norm": 0.4535069143341333, + "learning_rate": 7.548992323003923e-06, + "loss": 0.529, + "step": 976 + }, + { + "epoch": 1.190901705930138, + "grad_norm": 0.5610828923463264, + "learning_rate": 7.542886116461272e-06, + "loss": 0.5604, + "step": 977 + }, + { + "epoch": 1.1921202274573517, + "grad_norm": 0.49771566362561265, + "learning_rate": 7.536774789902246e-06, + "loss": 0.5339, + "step": 978 + }, + { + "epoch": 1.1933387489845655, + "grad_norm": 0.5055933911391732, + "learning_rate": 7.530658355631795e-06, + "loss": 0.5307, + "step": 979 + }, + { + "epoch": 1.194557270511779, + "grad_norm": 0.5075577294535538, + "learning_rate": 7.524536825965154e-06, + "loss": 0.5604, + "step": 980 + }, + { + "epoch": 1.1957757920389926, + "grad_norm": 0.5520230309503728, + "learning_rate": 7.518410213227823e-06, + "loss": 0.6162, + "step": 981 + }, + { + "epoch": 1.1969943135662064, + "grad_norm": 0.5218152039597276, + "learning_rate": 7.512278529755529e-06, + "loss": 0.5613, + "step": 982 + }, + { + "epoch": 1.19821283509342, + "grad_norm": 0.4971095496314555, + "learning_rate": 7.506141787894214e-06, + "loss": 0.5643, + "step": 983 + }, + { + "epoch": 1.1994313566206336, + "grad_norm": 0.5351931771239321, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5365, + "step": 984 + }, + { + "epoch": 1.2006498781478472, + "grad_norm": 0.49713221603010127, + "learning_rate": 7.493853178439177e-06, + "loss": 0.5276, + "step": 985 + }, + { + "epoch": 1.201868399675061, + "grad_norm": 0.49687942243856253, + "learning_rate": 7.48770133558816e-06, + "loss": 0.5705, + "step": 986 + }, + { + "epoch": 1.2030869212022746, + "grad_norm": 0.4638420387813551, + "learning_rate": 7.481544483833485e-06, + "loss": 0.5143, + "step": 987 + }, + { + "epoch": 1.2043054427294881, + "grad_norm": 0.5737984880330318, + "learning_rate": 7.475382635571761e-06, + "loss": 0.6105, + "step": 988 + }, + { + "epoch": 1.205523964256702, + "grad_norm": 0.4548720894167483, + "learning_rate": 7.4692158032096706e-06, + "loss": 0.5409, + "step": 989 + }, + { + "epoch": 1.2067424857839155, + "grad_norm": 0.49711497244164915, + "learning_rate": 7.463043999163919e-06, + "loss": 0.5803, + "step": 990 + }, + { + "epoch": 1.207961007311129, + "grad_norm": 0.47268020267724503, + "learning_rate": 7.456867235861231e-06, + "loss": 0.563, + "step": 991 + }, + { + "epoch": 1.209179528838343, + "grad_norm": 0.4431695796449243, + "learning_rate": 7.450685525738315e-06, + "loss": 0.5458, + "step": 992 + }, + { + "epoch": 1.2103980503655565, + "grad_norm": 0.5514220959709781, + "learning_rate": 7.444498881241835e-06, + "loss": 0.5719, + "step": 993 + }, + { + "epoch": 1.21161657189277, + "grad_norm": 0.48730651156910637, + "learning_rate": 7.4383073148283945e-06, + "loss": 0.5547, + "step": 994 + }, + { + "epoch": 1.2128350934199839, + "grad_norm": 0.48026701020561735, + "learning_rate": 7.432110838964508e-06, + "loss": 0.5446, + "step": 995 + }, + { + "epoch": 1.2140536149471974, + "grad_norm": 0.49526550877005804, + "learning_rate": 7.4259094661265685e-06, + "loss": 0.5539, + "step": 996 + }, + { + "epoch": 1.215272136474411, + "grad_norm": 0.5033075517007225, + "learning_rate": 7.419703208800839e-06, + "loss": 0.5885, + "step": 997 + }, + { + "epoch": 1.2164906580016246, + "grad_norm": 0.4591330610679407, + "learning_rate": 7.413492079483405e-06, + "loss": 0.4958, + "step": 998 + }, + { + "epoch": 1.2177091795288384, + "grad_norm": 0.5435516527726211, + "learning_rate": 7.407276090680173e-06, + "loss": 0.5941, + "step": 999 + }, + { + "epoch": 1.218927701056052, + "grad_norm": 0.5014818934661753, + "learning_rate": 7.401055254906829e-06, + "loss": 0.5674, + "step": 1000 + }, + { + "epoch": 1.2201462225832655, + "grad_norm": 0.5506374382220622, + "learning_rate": 7.394829584688816e-06, + "loss": 0.5623, + "step": 1001 + }, + { + "epoch": 1.2213647441104794, + "grad_norm": 0.47988582460651985, + "learning_rate": 7.388599092561315e-06, + "loss": 0.579, + "step": 1002 + }, + { + "epoch": 1.222583265637693, + "grad_norm": 0.5116646928435937, + "learning_rate": 7.382363791069214e-06, + "loss": 0.5789, + "step": 1003 + }, + { + "epoch": 1.2238017871649065, + "grad_norm": 0.5815639981335669, + "learning_rate": 7.376123692767084e-06, + "loss": 0.5306, + "step": 1004 + }, + { + "epoch": 1.2250203086921203, + "grad_norm": 0.47545875532554605, + "learning_rate": 7.369878810219154e-06, + "loss": 0.574, + "step": 1005 + }, + { + "epoch": 1.2262388302193339, + "grad_norm": 0.5843762256050973, + "learning_rate": 7.363629155999289e-06, + "loss": 0.5835, + "step": 1006 + }, + { + "epoch": 1.2274573517465475, + "grad_norm": 0.49038029629420044, + "learning_rate": 7.357374742690956e-06, + "loss": 0.5277, + "step": 1007 + }, + { + "epoch": 1.2286758732737613, + "grad_norm": 0.4825203440227731, + "learning_rate": 7.351115582887212e-06, + "loss": 0.5749, + "step": 1008 + }, + { + "epoch": 1.2298943948009748, + "grad_norm": 0.5230621508499962, + "learning_rate": 7.344851689190662e-06, + "loss": 0.5494, + "step": 1009 + }, + { + "epoch": 1.2311129163281884, + "grad_norm": 0.49942387299855917, + "learning_rate": 7.33858307421345e-06, + "loss": 0.5684, + "step": 1010 + }, + { + "epoch": 1.232331437855402, + "grad_norm": 0.5550781071831415, + "learning_rate": 7.3323097505772225e-06, + "loss": 0.5552, + "step": 1011 + }, + { + "epoch": 1.2335499593826158, + "grad_norm": 0.5160851429477965, + "learning_rate": 7.326031730913107e-06, + "loss": 0.5365, + "step": 1012 + }, + { + "epoch": 1.2347684809098294, + "grad_norm": 0.5594132080926748, + "learning_rate": 7.319749027861687e-06, + "loss": 0.5805, + "step": 1013 + }, + { + "epoch": 1.235987002437043, + "grad_norm": 0.5035664881102385, + "learning_rate": 7.313461654072974e-06, + "loss": 0.5572, + "step": 1014 + }, + { + "epoch": 1.2372055239642568, + "grad_norm": 0.5011647298301126, + "learning_rate": 7.3071696222063874e-06, + "loss": 0.5736, + "step": 1015 + }, + { + "epoch": 1.2384240454914703, + "grad_norm": 0.5003447796526637, + "learning_rate": 7.300872944930724e-06, + "loss": 0.5724, + "step": 1016 + }, + { + "epoch": 1.239642567018684, + "grad_norm": 0.4488541730554654, + "learning_rate": 7.2945716349241305e-06, + "loss": 0.5271, + "step": 1017 + }, + { + "epoch": 1.2408610885458977, + "grad_norm": 0.48397897498100484, + "learning_rate": 7.288265704874089e-06, + "loss": 0.5702, + "step": 1018 + }, + { + "epoch": 1.2420796100731113, + "grad_norm": 0.46076984680494393, + "learning_rate": 7.281955167477372e-06, + "loss": 0.5235, + "step": 1019 + }, + { + "epoch": 1.2432981316003249, + "grad_norm": 0.46851694123351845, + "learning_rate": 7.2756400354400445e-06, + "loss": 0.5237, + "step": 1020 + }, + { + "epoch": 1.2445166531275387, + "grad_norm": 0.48677378118465786, + "learning_rate": 7.2693203214774084e-06, + "loss": 0.6109, + "step": 1021 + }, + { + "epoch": 1.2457351746547523, + "grad_norm": 0.4780766187805638, + "learning_rate": 7.262996038314001e-06, + "loss": 0.5765, + "step": 1022 + }, + { + "epoch": 1.2469536961819658, + "grad_norm": 0.4640167779478858, + "learning_rate": 7.2566671986835515e-06, + "loss": 0.5642, + "step": 1023 + }, + { + "epoch": 1.2481722177091794, + "grad_norm": 0.48778459720464146, + "learning_rate": 7.25033381532897e-06, + "loss": 0.4946, + "step": 1024 + }, + { + "epoch": 1.2493907392363932, + "grad_norm": 0.4659728876017271, + "learning_rate": 7.243995901002312e-06, + "loss": 0.5638, + "step": 1025 + }, + { + "epoch": 1.2506092607636068, + "grad_norm": 0.4038916973792116, + "learning_rate": 7.237653468464756e-06, + "loss": 0.5607, + "step": 1026 + }, + { + "epoch": 1.2518277822908206, + "grad_norm": 0.5567339438269147, + "learning_rate": 7.231306530486579e-06, + "loss": 0.5561, + "step": 1027 + }, + { + "epoch": 1.2530463038180342, + "grad_norm": 0.4641852200663108, + "learning_rate": 7.224955099847129e-06, + "loss": 0.6096, + "step": 1028 + }, + { + "epoch": 1.2542648253452477, + "grad_norm": 0.4411515265169084, + "learning_rate": 7.218599189334799e-06, + "loss": 0.4709, + "step": 1029 + }, + { + "epoch": 1.2554833468724613, + "grad_norm": 0.5058133934757223, + "learning_rate": 7.212238811747003e-06, + "loss": 0.5904, + "step": 1030 + }, + { + "epoch": 1.2567018683996751, + "grad_norm": 0.41291563737696013, + "learning_rate": 7.205873979890151e-06, + "loss": 0.5436, + "step": 1031 + }, + { + "epoch": 1.2579203899268887, + "grad_norm": 0.4994662597356207, + "learning_rate": 7.199504706579617e-06, + "loss": 0.6102, + "step": 1032 + }, + { + "epoch": 1.2591389114541023, + "grad_norm": 0.419031706073167, + "learning_rate": 7.193131004639722e-06, + "loss": 0.5104, + "step": 1033 + }, + { + "epoch": 1.260357432981316, + "grad_norm": 0.4373098819276125, + "learning_rate": 7.186752886903702e-06, + "loss": 0.5539, + "step": 1034 + }, + { + "epoch": 1.2615759545085297, + "grad_norm": 0.42312469752099624, + "learning_rate": 7.180370366213684e-06, + "loss": 0.5685, + "step": 1035 + }, + { + "epoch": 1.2627944760357432, + "grad_norm": 0.4976440200214435, + "learning_rate": 7.173983455420659e-06, + "loss": 0.5886, + "step": 1036 + }, + { + "epoch": 1.2640129975629568, + "grad_norm": 0.4458571719063019, + "learning_rate": 7.167592167384461e-06, + "loss": 0.5481, + "step": 1037 + }, + { + "epoch": 1.2652315190901706, + "grad_norm": 0.5011046191959967, + "learning_rate": 7.161196514973735e-06, + "loss": 0.591, + "step": 1038 + }, + { + "epoch": 1.2664500406173842, + "grad_norm": 0.49133842958144974, + "learning_rate": 7.154796511065914e-06, + "loss": 0.5523, + "step": 1039 + }, + { + "epoch": 1.267668562144598, + "grad_norm": 0.47022131838731085, + "learning_rate": 7.148392168547191e-06, + "loss": 0.5736, + "step": 1040 + }, + { + "epoch": 1.2688870836718116, + "grad_norm": 0.41386960779050597, + "learning_rate": 7.141983500312498e-06, + "loss": 0.5529, + "step": 1041 + }, + { + "epoch": 1.2701056051990252, + "grad_norm": 0.44977069875020453, + "learning_rate": 7.135570519265473e-06, + "loss": 0.548, + "step": 1042 + }, + { + "epoch": 1.2713241267262387, + "grad_norm": 0.505607270978524, + "learning_rate": 7.129153238318441e-06, + "loss": 0.5685, + "step": 1043 + }, + { + "epoch": 1.2725426482534525, + "grad_norm": 0.4473291490790123, + "learning_rate": 7.122731670392381e-06, + "loss": 0.5914, + "step": 1044 + }, + { + "epoch": 1.2737611697806661, + "grad_norm": 0.42761462653683685, + "learning_rate": 7.116305828416907e-06, + "loss": 0.5596, + "step": 1045 + }, + { + "epoch": 1.2749796913078797, + "grad_norm": 0.5367569602527996, + "learning_rate": 7.109875725330239e-06, + "loss": 0.5705, + "step": 1046 + }, + { + "epoch": 1.2761982128350935, + "grad_norm": 0.4239534982631823, + "learning_rate": 7.1034413740791705e-06, + "loss": 0.4988, + "step": 1047 + }, + { + "epoch": 1.277416734362307, + "grad_norm": 0.5193109373280052, + "learning_rate": 7.097002787619059e-06, + "loss": 0.5812, + "step": 1048 + }, + { + "epoch": 1.2786352558895206, + "grad_norm": 0.5147411712979314, + "learning_rate": 7.090559978913781e-06, + "loss": 0.5916, + "step": 1049 + }, + { + "epoch": 1.2798537774167342, + "grad_norm": 0.4224143215053458, + "learning_rate": 7.0841129609357165e-06, + "loss": 0.4905, + "step": 1050 + }, + { + "epoch": 1.281072298943948, + "grad_norm": 0.47217055541643876, + "learning_rate": 7.0776617466657196e-06, + "loss": 0.5592, + "step": 1051 + }, + { + "epoch": 1.2822908204711616, + "grad_norm": 0.4826081486423026, + "learning_rate": 7.071206349093097e-06, + "loss": 0.5822, + "step": 1052 + }, + { + "epoch": 1.2835093419983754, + "grad_norm": 0.42489592319050484, + "learning_rate": 7.064746781215578e-06, + "loss": 0.539, + "step": 1053 + }, + { + "epoch": 1.284727863525589, + "grad_norm": 0.4378036437269882, + "learning_rate": 7.058283056039283e-06, + "loss": 0.5224, + "step": 1054 + }, + { + "epoch": 1.2859463850528026, + "grad_norm": 0.5090205584091956, + "learning_rate": 7.051815186578711e-06, + "loss": 0.6022, + "step": 1055 + }, + { + "epoch": 1.2871649065800161, + "grad_norm": 0.421460820182392, + "learning_rate": 7.045343185856701e-06, + "loss": 0.5371, + "step": 1056 + }, + { + "epoch": 1.28838342810723, + "grad_norm": 0.45568572401745694, + "learning_rate": 7.038867066904407e-06, + "loss": 0.5549, + "step": 1057 + }, + { + "epoch": 1.2896019496344435, + "grad_norm": 0.4249363344861208, + "learning_rate": 7.032386842761282e-06, + "loss": 0.5434, + "step": 1058 + }, + { + "epoch": 1.2908204711616573, + "grad_norm": 0.4562034562178344, + "learning_rate": 7.025902526475039e-06, + "loss": 0.5494, + "step": 1059 + }, + { + "epoch": 1.292038992688871, + "grad_norm": 0.5341880271433396, + "learning_rate": 7.0194141311016336e-06, + "loss": 0.613, + "step": 1060 + }, + { + "epoch": 1.2932575142160845, + "grad_norm": 0.4504428137448532, + "learning_rate": 7.0129216697052345e-06, + "loss": 0.5016, + "step": 1061 + }, + { + "epoch": 1.294476035743298, + "grad_norm": 0.48710310604219204, + "learning_rate": 7.006425155358195e-06, + "loss": 0.5966, + "step": 1062 + }, + { + "epoch": 1.2956945572705119, + "grad_norm": 0.4178638324054384, + "learning_rate": 6.99992460114103e-06, + "loss": 0.518, + "step": 1063 + }, + { + "epoch": 1.2969130787977254, + "grad_norm": 0.4592904842250764, + "learning_rate": 6.993420020142389e-06, + "loss": 0.5731, + "step": 1064 + }, + { + "epoch": 1.298131600324939, + "grad_norm": 0.44542709276757847, + "learning_rate": 6.986911425459028e-06, + "loss": 0.5713, + "step": 1065 + }, + { + "epoch": 1.2993501218521528, + "grad_norm": 0.43271038208431817, + "learning_rate": 6.980398830195785e-06, + "loss": 0.5394, + "step": 1066 + }, + { + "epoch": 1.3005686433793664, + "grad_norm": 0.42858083262689106, + "learning_rate": 6.9738822474655555e-06, + "loss": 0.5593, + "step": 1067 + }, + { + "epoch": 1.30178716490658, + "grad_norm": 0.45958843226910784, + "learning_rate": 6.967361690389258e-06, + "loss": 0.6054, + "step": 1068 + }, + { + "epoch": 1.3030056864337936, + "grad_norm": 0.4289960695158536, + "learning_rate": 6.960837172095822e-06, + "loss": 0.5548, + "step": 1069 + }, + { + "epoch": 1.3042242079610074, + "grad_norm": 0.47468738466334404, + "learning_rate": 6.954308705722142e-06, + "loss": 0.572, + "step": 1070 + }, + { + "epoch": 1.305442729488221, + "grad_norm": 0.47013938140744177, + "learning_rate": 6.947776304413072e-06, + "loss": 0.5705, + "step": 1071 + }, + { + "epoch": 1.3066612510154347, + "grad_norm": 0.42486037624655837, + "learning_rate": 6.941239981321379e-06, + "loss": 0.5541, + "step": 1072 + }, + { + "epoch": 1.3078797725426483, + "grad_norm": 0.49246997027712336, + "learning_rate": 6.9346997496077365e-06, + "loss": 0.5955, + "step": 1073 + }, + { + "epoch": 1.309098294069862, + "grad_norm": 0.4472253157123058, + "learning_rate": 6.92815562244068e-06, + "loss": 0.5347, + "step": 1074 + }, + { + "epoch": 1.3103168155970755, + "grad_norm": 0.4795845777067209, + "learning_rate": 6.921607612996591e-06, + "loss": 0.544, + "step": 1075 + }, + { + "epoch": 1.3115353371242893, + "grad_norm": 0.4858592748082412, + "learning_rate": 6.915055734459669e-06, + "loss": 0.5825, + "step": 1076 + }, + { + "epoch": 1.3127538586515028, + "grad_norm": 0.440529958757846, + "learning_rate": 6.908500000021905e-06, + "loss": 0.4894, + "step": 1077 + }, + { + "epoch": 1.3139723801787164, + "grad_norm": 0.49777302193386763, + "learning_rate": 6.9019404228830465e-06, + "loss": 0.6143, + "step": 1078 + }, + { + "epoch": 1.3151909017059302, + "grad_norm": 0.42490987305110145, + "learning_rate": 6.895377016250589e-06, + "loss": 0.5383, + "step": 1079 + }, + { + "epoch": 1.3164094232331438, + "grad_norm": 0.4250241686101231, + "learning_rate": 6.888809793339729e-06, + "loss": 0.5343, + "step": 1080 + }, + { + "epoch": 1.3176279447603574, + "grad_norm": 0.47908573640303, + "learning_rate": 6.882238767373352e-06, + "loss": 0.5766, + "step": 1081 + }, + { + "epoch": 1.318846466287571, + "grad_norm": 0.4393923199378749, + "learning_rate": 6.875663951582e-06, + "loss": 0.518, + "step": 1082 + }, + { + "epoch": 1.3200649878147848, + "grad_norm": 0.5298761025962999, + "learning_rate": 6.869085359203844e-06, + "loss": 0.5687, + "step": 1083 + }, + { + "epoch": 1.3212835093419983, + "grad_norm": 0.4742825873608696, + "learning_rate": 6.862503003484662e-06, + "loss": 0.5804, + "step": 1084 + }, + { + "epoch": 1.3225020308692121, + "grad_norm": 0.4633225475847929, + "learning_rate": 6.855916897677806e-06, + "loss": 0.556, + "step": 1085 + }, + { + "epoch": 1.3237205523964257, + "grad_norm": 0.5225783981999679, + "learning_rate": 6.849327055044182e-06, + "loss": 0.5814, + "step": 1086 + }, + { + "epoch": 1.3249390739236393, + "grad_norm": 0.4288152429153542, + "learning_rate": 6.842733488852218e-06, + "loss": 0.5576, + "step": 1087 + }, + { + "epoch": 1.3261575954508529, + "grad_norm": 0.5221719185878941, + "learning_rate": 6.836136212377839e-06, + "loss": 0.5535, + "step": 1088 + }, + { + "epoch": 1.3273761169780667, + "grad_norm": 0.5296939222461858, + "learning_rate": 6.82953523890444e-06, + "loss": 0.5367, + "step": 1089 + }, + { + "epoch": 1.3285946385052803, + "grad_norm": 0.4975997883807605, + "learning_rate": 6.822930581722864e-06, + "loss": 0.5888, + "step": 1090 + }, + { + "epoch": 1.3298131600324938, + "grad_norm": 0.5680495533922292, + "learning_rate": 6.8163222541313646e-06, + "loss": 0.5797, + "step": 1091 + }, + { + "epoch": 1.3310316815597076, + "grad_norm": 0.4587905010305772, + "learning_rate": 6.80971026943559e-06, + "loss": 0.5202, + "step": 1092 + }, + { + "epoch": 1.3322502030869212, + "grad_norm": 0.551574996506335, + "learning_rate": 6.803094640948553e-06, + "loss": 0.5777, + "step": 1093 + }, + { + "epoch": 1.3334687246141348, + "grad_norm": 0.5703735684360373, + "learning_rate": 6.796475381990598e-06, + "loss": 0.5764, + "step": 1094 + }, + { + "epoch": 1.3346872461413484, + "grad_norm": 0.4925036270565778, + "learning_rate": 6.789852505889384e-06, + "loss": 0.528, + "step": 1095 + }, + { + "epoch": 1.3359057676685622, + "grad_norm": 0.47585637004253845, + "learning_rate": 6.78322602597985e-06, + "loss": 0.5379, + "step": 1096 + }, + { + "epoch": 1.3371242891957758, + "grad_norm": 0.5098349120934949, + "learning_rate": 6.776595955604192e-06, + "loss": 0.5564, + "step": 1097 + }, + { + "epoch": 1.3383428107229896, + "grad_norm": 0.45580609051116194, + "learning_rate": 6.769962308111839e-06, + "loss": 0.5753, + "step": 1098 + }, + { + "epoch": 1.3395613322502031, + "grad_norm": 0.5171674920493432, + "learning_rate": 6.7633250968594145e-06, + "loss": 0.5949, + "step": 1099 + }, + { + "epoch": 1.3407798537774167, + "grad_norm": 0.4877120256762604, + "learning_rate": 6.756684335210724e-06, + "loss": 0.515, + "step": 1100 + }, + { + "epoch": 1.3419983753046303, + "grad_norm": 0.4814845112101113, + "learning_rate": 6.750040036536718e-06, + "loss": 0.5684, + "step": 1101 + }, + { + "epoch": 1.343216896831844, + "grad_norm": 0.5705372014720597, + "learning_rate": 6.743392214215473e-06, + "loss": 0.6171, + "step": 1102 + }, + { + "epoch": 1.3444354183590577, + "grad_norm": 0.41955386315882853, + "learning_rate": 6.736740881632156e-06, + "loss": 0.5509, + "step": 1103 + }, + { + "epoch": 1.3456539398862712, + "grad_norm": 0.5028763983027598, + "learning_rate": 6.7300860521790034e-06, + "loss": 0.5519, + "step": 1104 + }, + { + "epoch": 1.346872461413485, + "grad_norm": 0.4751712922206779, + "learning_rate": 6.723427739255291e-06, + "loss": 0.5871, + "step": 1105 + }, + { + "epoch": 1.3480909829406986, + "grad_norm": 0.44250278427343415, + "learning_rate": 6.716765956267313e-06, + "loss": 0.5563, + "step": 1106 + }, + { + "epoch": 1.3493095044679122, + "grad_norm": 0.42447760271061347, + "learning_rate": 6.710100716628345e-06, + "loss": 0.5246, + "step": 1107 + }, + { + "epoch": 1.3505280259951258, + "grad_norm": 0.4884332973463199, + "learning_rate": 6.7034320337586236e-06, + "loss": 0.5906, + "step": 1108 + }, + { + "epoch": 1.3517465475223396, + "grad_norm": 0.47868995347975324, + "learning_rate": 6.696759921085321e-06, + "loss": 0.56, + "step": 1109 + }, + { + "epoch": 1.3529650690495532, + "grad_norm": 0.4891201962969247, + "learning_rate": 6.690084392042514e-06, + "loss": 0.5387, + "step": 1110 + }, + { + "epoch": 1.354183590576767, + "grad_norm": 0.4799279297524152, + "learning_rate": 6.683405460071158e-06, + "loss": 0.5584, + "step": 1111 + }, + { + "epoch": 1.3554021121039805, + "grad_norm": 0.468730614409241, + "learning_rate": 6.676723138619056e-06, + "loss": 0.5639, + "step": 1112 + }, + { + "epoch": 1.3566206336311941, + "grad_norm": 0.4691028535874034, + "learning_rate": 6.670037441140844e-06, + "loss": 0.5249, + "step": 1113 + }, + { + "epoch": 1.3578391551584077, + "grad_norm": 0.5055139224683095, + "learning_rate": 6.663348381097949e-06, + "loss": 0.5668, + "step": 1114 + }, + { + "epoch": 1.3590576766856215, + "grad_norm": 0.4641835440622289, + "learning_rate": 6.656655971958569e-06, + "loss": 0.5168, + "step": 1115 + }, + { + "epoch": 1.360276198212835, + "grad_norm": 0.5446202821644559, + "learning_rate": 6.649960227197648e-06, + "loss": 0.613, + "step": 1116 + }, + { + "epoch": 1.3614947197400489, + "grad_norm": 0.4947120887114883, + "learning_rate": 6.6432611602968445e-06, + "loss": 0.5567, + "step": 1117 + }, + { + "epoch": 1.3627132412672625, + "grad_norm": 0.43139439093199355, + "learning_rate": 6.636558784744507e-06, + "loss": 0.5242, + "step": 1118 + }, + { + "epoch": 1.363931762794476, + "grad_norm": 0.5614131203778131, + "learning_rate": 6.629853114035643e-06, + "loss": 0.5333, + "step": 1119 + }, + { + "epoch": 1.3651502843216896, + "grad_norm": 0.47984259019139797, + "learning_rate": 6.623144161671899e-06, + "loss": 0.6073, + "step": 1120 + }, + { + "epoch": 1.3663688058489034, + "grad_norm": 0.48772092734746075, + "learning_rate": 6.616431941161525e-06, + "loss": 0.519, + "step": 1121 + }, + { + "epoch": 1.367587327376117, + "grad_norm": 0.4944650047147664, + "learning_rate": 6.609716466019356e-06, + "loss": 0.5982, + "step": 1122 + }, + { + "epoch": 1.3688058489033306, + "grad_norm": 0.4514730750801606, + "learning_rate": 6.602997749766773e-06, + "loss": 0.5215, + "step": 1123 + }, + { + "epoch": 1.3700243704305444, + "grad_norm": 0.4806270554361702, + "learning_rate": 6.596275805931691e-06, + "loss": 0.6507, + "step": 1124 + }, + { + "epoch": 1.371242891957758, + "grad_norm": 0.42879599863826967, + "learning_rate": 6.589550648048517e-06, + "loss": 0.5263, + "step": 1125 + }, + { + "epoch": 1.3724614134849715, + "grad_norm": 0.5002076010149914, + "learning_rate": 6.582822289658134e-06, + "loss": 0.544, + "step": 1126 + }, + { + "epoch": 1.373679935012185, + "grad_norm": 0.49996651647577767, + "learning_rate": 6.576090744307866e-06, + "loss": 0.6115, + "step": 1127 + }, + { + "epoch": 1.374898456539399, + "grad_norm": 0.47192523752862847, + "learning_rate": 6.569356025551454e-06, + "loss": 0.5044, + "step": 1128 + }, + { + "epoch": 1.3761169780666125, + "grad_norm": 0.5486850848812702, + "learning_rate": 6.562618146949033e-06, + "loss": 0.5963, + "step": 1129 + }, + { + "epoch": 1.3773354995938263, + "grad_norm": 0.44516782959863155, + "learning_rate": 6.5558771220670935e-06, + "loss": 0.5424, + "step": 1130 + }, + { + "epoch": 1.3785540211210399, + "grad_norm": 0.49271550503516953, + "learning_rate": 6.5491329644784655e-06, + "loss": 0.5241, + "step": 1131 + }, + { + "epoch": 1.3797725426482534, + "grad_norm": 0.5660845065509308, + "learning_rate": 6.542385687762287e-06, + "loss": 0.6154, + "step": 1132 + }, + { + "epoch": 1.380991064175467, + "grad_norm": 0.4271740206518289, + "learning_rate": 6.53563530550397e-06, + "loss": 0.4689, + "step": 1133 + }, + { + "epoch": 1.3822095857026808, + "grad_norm": 0.5195908868358481, + "learning_rate": 6.5288818312951886e-06, + "loss": 0.5462, + "step": 1134 + }, + { + "epoch": 1.3834281072298944, + "grad_norm": 0.5034196032593611, + "learning_rate": 6.5221252787338365e-06, + "loss": 0.587, + "step": 1135 + }, + { + "epoch": 1.384646628757108, + "grad_norm": 0.5196583715973591, + "learning_rate": 6.515365661424007e-06, + "loss": 0.577, + "step": 1136 + }, + { + "epoch": 1.3858651502843218, + "grad_norm": 0.47148796040432117, + "learning_rate": 6.508602992975963e-06, + "loss": 0.5516, + "step": 1137 + }, + { + "epoch": 1.3870836718115354, + "grad_norm": 0.47240263639853314, + "learning_rate": 6.501837287006112e-06, + "loss": 0.5017, + "step": 1138 + }, + { + "epoch": 1.388302193338749, + "grad_norm": 0.4848195827079731, + "learning_rate": 6.495068557136979e-06, + "loss": 0.6068, + "step": 1139 + }, + { + "epoch": 1.3895207148659625, + "grad_norm": 0.464916968432065, + "learning_rate": 6.4882968169971734e-06, + "loss": 0.5114, + "step": 1140 + }, + { + "epoch": 1.3907392363931763, + "grad_norm": 0.4672169290921844, + "learning_rate": 6.4815220802213705e-06, + "loss": 0.571, + "step": 1141 + }, + { + "epoch": 1.39195775792039, + "grad_norm": 0.45354629086847004, + "learning_rate": 6.474744360450274e-06, + "loss": 0.559, + "step": 1142 + }, + { + "epoch": 1.3931762794476037, + "grad_norm": 0.49697460412752753, + "learning_rate": 6.467963671330602e-06, + "loss": 0.5712, + "step": 1143 + }, + { + "epoch": 1.3943948009748173, + "grad_norm": 0.42597106705666193, + "learning_rate": 6.461180026515038e-06, + "loss": 0.4836, + "step": 1144 + }, + { + "epoch": 1.3956133225020309, + "grad_norm": 0.5696838757187256, + "learning_rate": 6.45439343966223e-06, + "loss": 0.6293, + "step": 1145 + }, + { + "epoch": 1.3968318440292444, + "grad_norm": 0.44015111009766694, + "learning_rate": 6.447603924436744e-06, + "loss": 0.5672, + "step": 1146 + }, + { + "epoch": 1.3980503655564582, + "grad_norm": 0.5171923824405892, + "learning_rate": 6.44081149450904e-06, + "loss": 0.543, + "step": 1147 + }, + { + "epoch": 1.3992688870836718, + "grad_norm": 0.4861104307146921, + "learning_rate": 6.434016163555452e-06, + "loss": 0.5536, + "step": 1148 + }, + { + "epoch": 1.4004874086108854, + "grad_norm": 0.4672428316707098, + "learning_rate": 6.4272179452581505e-06, + "loss": 0.5513, + "step": 1149 + }, + { + "epoch": 1.4017059301380992, + "grad_norm": 0.5041699642923018, + "learning_rate": 6.42041685330512e-06, + "loss": 0.5579, + "step": 1150 + }, + { + "epoch": 1.4029244516653128, + "grad_norm": 0.5689659183656529, + "learning_rate": 6.413612901390136e-06, + "loss": 0.5171, + "step": 1151 + }, + { + "epoch": 1.4041429731925263, + "grad_norm": 0.4852858521398993, + "learning_rate": 6.406806103212725e-06, + "loss": 0.619, + "step": 1152 + }, + { + "epoch": 1.40536149471974, + "grad_norm": 0.5193110473839417, + "learning_rate": 6.39999647247815e-06, + "loss": 0.549, + "step": 1153 + }, + { + "epoch": 1.4065800162469537, + "grad_norm": 0.4769214566829931, + "learning_rate": 6.393184022897375e-06, + "loss": 0.526, + "step": 1154 + }, + { + "epoch": 1.4077985377741673, + "grad_norm": 0.4334565428180597, + "learning_rate": 6.38636876818704e-06, + "loss": 0.5511, + "step": 1155 + }, + { + "epoch": 1.409017059301381, + "grad_norm": 0.7019468412425452, + "learning_rate": 6.3795507220694335e-06, + "loss": 0.6058, + "step": 1156 + }, + { + "epoch": 1.4102355808285947, + "grad_norm": 0.4559746858499104, + "learning_rate": 6.372729898272463e-06, + "loss": 0.5623, + "step": 1157 + }, + { + "epoch": 1.4114541023558083, + "grad_norm": 0.4873072450531043, + "learning_rate": 6.365906310529631e-06, + "loss": 0.526, + "step": 1158 + }, + { + "epoch": 1.4126726238830218, + "grad_norm": 0.4930877948197653, + "learning_rate": 6.359079972580001e-06, + "loss": 0.5417, + "step": 1159 + }, + { + "epoch": 1.4138911454102356, + "grad_norm": 0.4585436187425684, + "learning_rate": 6.352250898168181e-06, + "loss": 0.5558, + "step": 1160 + }, + { + "epoch": 1.4151096669374492, + "grad_norm": 0.4843507493218003, + "learning_rate": 6.345419101044281e-06, + "loss": 0.6178, + "step": 1161 + }, + { + "epoch": 1.4163281884646628, + "grad_norm": 0.4103238225687281, + "learning_rate": 6.338584594963898e-06, + "loss": 0.486, + "step": 1162 + }, + { + "epoch": 1.4175467099918766, + "grad_norm": 0.4339709319145015, + "learning_rate": 6.3317473936880814e-06, + "loss": 0.5516, + "step": 1163 + }, + { + "epoch": 1.4187652315190902, + "grad_norm": 0.5006035176222503, + "learning_rate": 6.32490751098331e-06, + "loss": 0.5893, + "step": 1164 + }, + { + "epoch": 1.4199837530463038, + "grad_norm": 0.43944118536778887, + "learning_rate": 6.318064960621456e-06, + "loss": 0.554, + "step": 1165 + }, + { + "epoch": 1.4212022745735173, + "grad_norm": 0.4205988668702698, + "learning_rate": 6.31121975637977e-06, + "loss": 0.5705, + "step": 1166 + }, + { + "epoch": 1.4224207961007311, + "grad_norm": 0.42492946208091176, + "learning_rate": 6.30437191204084e-06, + "loss": 0.5382, + "step": 1167 + }, + { + "epoch": 1.4236393176279447, + "grad_norm": 0.4782081072972405, + "learning_rate": 6.297521441392572e-06, + "loss": 0.6081, + "step": 1168 + }, + { + "epoch": 1.4248578391551585, + "grad_norm": 0.4056428801301219, + "learning_rate": 6.290668358228162e-06, + "loss": 0.5448, + "step": 1169 + }, + { + "epoch": 1.426076360682372, + "grad_norm": 0.4346131086300656, + "learning_rate": 6.2838126763460635e-06, + "loss": 0.5339, + "step": 1170 + }, + { + "epoch": 1.4272948822095857, + "grad_norm": 0.4104447709327887, + "learning_rate": 6.276954409549963e-06, + "loss": 0.5399, + "step": 1171 + }, + { + "epoch": 1.4285134037367992, + "grad_norm": 0.46444896204069186, + "learning_rate": 6.270093571648752e-06, + "loss": 0.5941, + "step": 1172 + }, + { + "epoch": 1.429731925264013, + "grad_norm": 0.4451786529645794, + "learning_rate": 6.263230176456497e-06, + "loss": 0.5384, + "step": 1173 + }, + { + "epoch": 1.4309504467912266, + "grad_norm": 0.47981749578622157, + "learning_rate": 6.256364237792419e-06, + "loss": 0.5765, + "step": 1174 + }, + { + "epoch": 1.4321689683184404, + "grad_norm": 0.4367054717344673, + "learning_rate": 6.249495769480856e-06, + "loss": 0.5124, + "step": 1175 + }, + { + "epoch": 1.433387489845654, + "grad_norm": 0.42899069684022384, + "learning_rate": 6.2426247853512355e-06, + "loss": 0.5524, + "step": 1176 + }, + { + "epoch": 1.4346060113728676, + "grad_norm": 0.4904917718170387, + "learning_rate": 6.23575129923806e-06, + "loss": 0.5613, + "step": 1177 + }, + { + "epoch": 1.4358245329000812, + "grad_norm": 0.7624825038153906, + "learning_rate": 6.228875324980862e-06, + "loss": 0.5469, + "step": 1178 + }, + { + "epoch": 1.437043054427295, + "grad_norm": 0.48032758007828885, + "learning_rate": 6.221996876424186e-06, + "loss": 0.6088, + "step": 1179 + }, + { + "epoch": 1.4382615759545085, + "grad_norm": 0.4261676949483954, + "learning_rate": 6.21511596741756e-06, + "loss": 0.5269, + "step": 1180 + }, + { + "epoch": 1.4394800974817221, + "grad_norm": 0.44938704101588606, + "learning_rate": 6.208232611815463e-06, + "loss": 0.5497, + "step": 1181 + }, + { + "epoch": 1.440698619008936, + "grad_norm": 0.47843420481431187, + "learning_rate": 6.2013468234773034e-06, + "loss": 0.5673, + "step": 1182 + }, + { + "epoch": 1.4419171405361495, + "grad_norm": 0.4143118724051908, + "learning_rate": 6.194458616267388e-06, + "loss": 0.5561, + "step": 1183 + }, + { + "epoch": 1.443135662063363, + "grad_norm": 0.4687400706518928, + "learning_rate": 6.187568004054888e-06, + "loss": 0.5599, + "step": 1184 + }, + { + "epoch": 1.4443541835905767, + "grad_norm": 0.43117586360472987, + "learning_rate": 6.180675000713825e-06, + "loss": 0.5579, + "step": 1185 + }, + { + "epoch": 1.4455727051177905, + "grad_norm": 0.4677168526332838, + "learning_rate": 6.173779620123028e-06, + "loss": 0.5377, + "step": 1186 + }, + { + "epoch": 1.446791226645004, + "grad_norm": 0.4684613773900322, + "learning_rate": 6.166881876166119e-06, + "loss": 0.5505, + "step": 1187 + }, + { + "epoch": 1.4480097481722178, + "grad_norm": 0.45099302981330264, + "learning_rate": 6.1599817827314744e-06, + "loss": 0.5349, + "step": 1188 + }, + { + "epoch": 1.4492282696994314, + "grad_norm": 0.44725643758516653, + "learning_rate": 6.153079353712201e-06, + "loss": 0.5445, + "step": 1189 + }, + { + "epoch": 1.450446791226645, + "grad_norm": 0.509822041445509, + "learning_rate": 6.14617460300611e-06, + "loss": 0.6048, + "step": 1190 + }, + { + "epoch": 1.4516653127538586, + "grad_norm": 0.48251767820083963, + "learning_rate": 6.139267544515689e-06, + "loss": 0.5214, + "step": 1191 + }, + { + "epoch": 1.4528838342810724, + "grad_norm": 0.462469865969966, + "learning_rate": 6.132358192148065e-06, + "loss": 0.5628, + "step": 1192 + }, + { + "epoch": 1.454102355808286, + "grad_norm": 0.42592720114566546, + "learning_rate": 6.125446559814994e-06, + "loss": 0.4844, + "step": 1193 + }, + { + "epoch": 1.4553208773354995, + "grad_norm": 0.49275532261036237, + "learning_rate": 6.118532661432812e-06, + "loss": 0.5944, + "step": 1194 + }, + { + "epoch": 1.4565393988627133, + "grad_norm": 0.4649906751266784, + "learning_rate": 6.111616510922426e-06, + "loss": 0.5493, + "step": 1195 + }, + { + "epoch": 1.457757920389927, + "grad_norm": 0.46291320623399196, + "learning_rate": 6.104698122209274e-06, + "loss": 0.5172, + "step": 1196 + }, + { + "epoch": 1.4589764419171405, + "grad_norm": 0.5426739834419568, + "learning_rate": 6.097777509223299e-06, + "loss": 0.5666, + "step": 1197 + }, + { + "epoch": 1.460194963444354, + "grad_norm": 0.45093365966871296, + "learning_rate": 6.090854685898928e-06, + "loss": 0.5357, + "step": 1198 + }, + { + "epoch": 1.4614134849715679, + "grad_norm": 0.46357917186858966, + "learning_rate": 6.083929666175031e-06, + "loss": 0.5102, + "step": 1199 + }, + { + "epoch": 1.4626320064987814, + "grad_norm": 0.42735860218881255, + "learning_rate": 6.077002463994908e-06, + "loss": 0.5353, + "step": 1200 + }, + { + "epoch": 1.4638505280259952, + "grad_norm": 0.48773472737225, + "learning_rate": 6.070073093306246e-06, + "loss": 0.5969, + "step": 1201 + }, + { + "epoch": 1.4650690495532088, + "grad_norm": 0.45583834308371346, + "learning_rate": 6.063141568061104e-06, + "loss": 0.5501, + "step": 1202 + }, + { + "epoch": 1.4662875710804224, + "grad_norm": 0.48230795906015783, + "learning_rate": 6.056207902215874e-06, + "loss": 0.5943, + "step": 1203 + }, + { + "epoch": 1.467506092607636, + "grad_norm": 0.48530024356797447, + "learning_rate": 6.049272109731266e-06, + "loss": 0.535, + "step": 1204 + }, + { + "epoch": 1.4687246141348498, + "grad_norm": 0.39847364405399893, + "learning_rate": 6.042334204572261e-06, + "loss": 0.5088, + "step": 1205 + }, + { + "epoch": 1.4699431356620634, + "grad_norm": 0.4192802944065179, + "learning_rate": 6.035394200708104e-06, + "loss": 0.5541, + "step": 1206 + }, + { + "epoch": 1.471161657189277, + "grad_norm": 0.5095459416726968, + "learning_rate": 6.02845211211226e-06, + "loss": 0.6044, + "step": 1207 + }, + { + "epoch": 1.4723801787164907, + "grad_norm": 0.4834365213328995, + "learning_rate": 6.021507952762392e-06, + "loss": 0.5698, + "step": 1208 + }, + { + "epoch": 1.4735987002437043, + "grad_norm": 0.43629510532697163, + "learning_rate": 6.014561736640334e-06, + "loss": 0.536, + "step": 1209 + }, + { + "epoch": 1.474817221770918, + "grad_norm": 0.469188019208721, + "learning_rate": 6.007613477732061e-06, + "loss": 0.5495, + "step": 1210 + }, + { + "epoch": 1.4760357432981315, + "grad_norm": 0.4901471440756352, + "learning_rate": 6.000663190027658e-06, + "loss": 0.5661, + "step": 1211 + }, + { + "epoch": 1.4772542648253453, + "grad_norm": 0.4686562631964871, + "learning_rate": 5.993710887521302e-06, + "loss": 0.5812, + "step": 1212 + }, + { + "epoch": 1.4784727863525589, + "grad_norm": 0.48734085024012297, + "learning_rate": 5.986756584211217e-06, + "loss": 0.5335, + "step": 1213 + }, + { + "epoch": 1.4796913078797727, + "grad_norm": 0.5326878131009583, + "learning_rate": 5.979800294099666e-06, + "loss": 0.5689, + "step": 1214 + }, + { + "epoch": 1.4809098294069862, + "grad_norm": 0.4253596342133157, + "learning_rate": 5.972842031192901e-06, + "loss": 0.5265, + "step": 1215 + }, + { + "epoch": 1.4821283509341998, + "grad_norm": 0.4985627825685433, + "learning_rate": 5.965881809501158e-06, + "loss": 0.5632, + "step": 1216 + }, + { + "epoch": 1.4833468724614134, + "grad_norm": 0.45204140138324095, + "learning_rate": 5.958919643038609e-06, + "loss": 0.5569, + "step": 1217 + }, + { + "epoch": 1.4845653939886272, + "grad_norm": 0.4483567522219748, + "learning_rate": 5.951955545823342e-06, + "loss": 0.5731, + "step": 1218 + }, + { + "epoch": 1.4857839155158408, + "grad_norm": 0.4426846776302582, + "learning_rate": 5.944989531877337e-06, + "loss": 0.528, + "step": 1219 + }, + { + "epoch": 1.4870024370430543, + "grad_norm": 0.44399265576382146, + "learning_rate": 5.938021615226431e-06, + "loss": 0.5489, + "step": 1220 + }, + { + "epoch": 1.4882209585702681, + "grad_norm": 0.4581264239244066, + "learning_rate": 5.93105180990029e-06, + "loss": 0.5794, + "step": 1221 + }, + { + "epoch": 1.4894394800974817, + "grad_norm": 0.4198932355319627, + "learning_rate": 5.924080129932386e-06, + "loss": 0.5179, + "step": 1222 + }, + { + "epoch": 1.4906580016246953, + "grad_norm": 0.47789225286091797, + "learning_rate": 5.9171065893599625e-06, + "loss": 0.5638, + "step": 1223 + }, + { + "epoch": 1.4918765231519089, + "grad_norm": 0.4321558944637844, + "learning_rate": 5.910131202224011e-06, + "loss": 0.5057, + "step": 1224 + }, + { + "epoch": 1.4930950446791227, + "grad_norm": 0.4450307808888705, + "learning_rate": 5.903153982569243e-06, + "loss": 0.5421, + "step": 1225 + }, + { + "epoch": 1.4943135662063363, + "grad_norm": 0.5226652256866916, + "learning_rate": 5.8961749444440555e-06, + "loss": 0.576, + "step": 1226 + }, + { + "epoch": 1.49553208773355, + "grad_norm": 0.42973467257152986, + "learning_rate": 5.8891941019005095e-06, + "loss": 0.6013, + "step": 1227 + }, + { + "epoch": 1.4967506092607636, + "grad_norm": 0.40820805235816976, + "learning_rate": 5.882211468994299e-06, + "loss": 0.5175, + "step": 1228 + }, + { + "epoch": 1.4979691307879772, + "grad_norm": 0.4801201178398426, + "learning_rate": 5.87522705978472e-06, + "loss": 0.5833, + "step": 1229 + }, + { + "epoch": 1.4991876523151908, + "grad_norm": 0.45266942815985767, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.5284, + "step": 1230 + }, + { + "epoch": 1.5004061738424046, + "grad_norm": 0.4353303644229792, + "learning_rate": 5.8612529687105156e-06, + "loss": 0.526, + "step": 1231 + }, + { + "epoch": 1.5016246953696182, + "grad_norm": 0.48827760996687436, + "learning_rate": 5.854263314982252e-06, + "loss": 0.5955, + "step": 1232 + }, + { + "epoch": 1.502843216896832, + "grad_norm": 0.4255509928321056, + "learning_rate": 5.847271941223301e-06, + "loss": 0.5442, + "step": 1233 + }, + { + "epoch": 1.5040617384240456, + "grad_norm": 0.43808076740492513, + "learning_rate": 5.840278861510555e-06, + "loss": 0.5433, + "step": 1234 + }, + { + "epoch": 1.5052802599512591, + "grad_norm": 0.4760892817675488, + "learning_rate": 5.83328408992435e-06, + "loss": 0.5702, + "step": 1235 + }, + { + "epoch": 1.5064987814784727, + "grad_norm": 0.4481600783932247, + "learning_rate": 5.826287640548425e-06, + "loss": 0.5946, + "step": 1236 + }, + { + "epoch": 1.5077173030056863, + "grad_norm": 0.42699536589107157, + "learning_rate": 5.819289527469897e-06, + "loss": 0.5642, + "step": 1237 + }, + { + "epoch": 1.5089358245329, + "grad_norm": 0.4633211620631564, + "learning_rate": 5.812289764779232e-06, + "loss": 0.4845, + "step": 1238 + }, + { + "epoch": 1.510154346060114, + "grad_norm": 0.4571770115639661, + "learning_rate": 5.80528836657022e-06, + "loss": 0.5513, + "step": 1239 + }, + { + "epoch": 1.5113728675873275, + "grad_norm": 0.4701604947751102, + "learning_rate": 5.798285346939942e-06, + "loss": 0.559, + "step": 1240 + }, + { + "epoch": 1.512591389114541, + "grad_norm": 0.4865903026982408, + "learning_rate": 5.791280719988747e-06, + "loss": 0.5878, + "step": 1241 + }, + { + "epoch": 1.5138099106417546, + "grad_norm": 0.4412282841651163, + "learning_rate": 5.784274499820214e-06, + "loss": 0.5197, + "step": 1242 + }, + { + "epoch": 1.5150284321689682, + "grad_norm": 0.5747443137859876, + "learning_rate": 5.777266700541134e-06, + "loss": 0.6011, + "step": 1243 + }, + { + "epoch": 1.516246953696182, + "grad_norm": 0.4564303892112499, + "learning_rate": 5.770257336261482e-06, + "loss": 0.5279, + "step": 1244 + }, + { + "epoch": 1.5174654752233956, + "grad_norm": 0.45997367471162187, + "learning_rate": 5.763246421094373e-06, + "loss": 0.5255, + "step": 1245 + }, + { + "epoch": 1.5186839967506094, + "grad_norm": 0.4695480650402549, + "learning_rate": 5.7562339691560556e-06, + "loss": 0.5885, + "step": 1246 + }, + { + "epoch": 1.519902518277823, + "grad_norm": 0.5356612979245375, + "learning_rate": 5.749219994565863e-06, + "loss": 0.5569, + "step": 1247 + }, + { + "epoch": 1.5211210398050365, + "grad_norm": 0.5813954013182587, + "learning_rate": 5.742204511446203e-06, + "loss": 0.5544, + "step": 1248 + }, + { + "epoch": 1.5223395613322501, + "grad_norm": 0.43618938610834346, + "learning_rate": 5.7351875339225164e-06, + "loss": 0.5374, + "step": 1249 + }, + { + "epoch": 1.5235580828594637, + "grad_norm": 0.4937073666394837, + "learning_rate": 5.7281690761232515e-06, + "loss": 0.5162, + "step": 1250 + }, + { + "epoch": 1.5247766043866775, + "grad_norm": 0.4780704238400619, + "learning_rate": 5.72114915217984e-06, + "loss": 0.542, + "step": 1251 + }, + { + "epoch": 1.5259951259138913, + "grad_norm": 0.458787226662822, + "learning_rate": 5.714127776226667e-06, + "loss": 0.5708, + "step": 1252 + }, + { + "epoch": 1.5272136474411049, + "grad_norm": 0.4727970564003603, + "learning_rate": 5.707104962401034e-06, + "loss": 0.5678, + "step": 1253 + }, + { + "epoch": 1.5284321689683185, + "grad_norm": 0.42019947987975415, + "learning_rate": 5.7000807248431466e-06, + "loss": 0.4449, + "step": 1254 + }, + { + "epoch": 1.529650690495532, + "grad_norm": 0.5313576192243948, + "learning_rate": 5.693055077696069e-06, + "loss": 0.62, + "step": 1255 + }, + { + "epoch": 1.5308692120227456, + "grad_norm": 0.4133150947222481, + "learning_rate": 5.686028035105711e-06, + "loss": 0.5446, + "step": 1256 + }, + { + "epoch": 1.5320877335499594, + "grad_norm": 0.5182558216138413, + "learning_rate": 5.6789996112207865e-06, + "loss": 0.5589, + "step": 1257 + }, + { + "epoch": 1.533306255077173, + "grad_norm": 0.5235310986043601, + "learning_rate": 5.671969820192794e-06, + "loss": 0.5516, + "step": 1258 + }, + { + "epoch": 1.5345247766043868, + "grad_norm": 0.43543733715186, + "learning_rate": 5.664938676175982e-06, + "loss": 0.5463, + "step": 1259 + }, + { + "epoch": 1.5357432981316004, + "grad_norm": 0.542523163223611, + "learning_rate": 5.657906193327325e-06, + "loss": 0.5289, + "step": 1260 + }, + { + "epoch": 1.536961819658814, + "grad_norm": 0.6705586961902954, + "learning_rate": 5.650872385806492e-06, + "loss": 0.6, + "step": 1261 + }, + { + "epoch": 1.5381803411860275, + "grad_norm": 0.4252405119039053, + "learning_rate": 5.64383726777582e-06, + "loss": 0.5558, + "step": 1262 + }, + { + "epoch": 1.5393988627132411, + "grad_norm": 0.5168792668343379, + "learning_rate": 5.636800853400285e-06, + "loss": 0.5427, + "step": 1263 + }, + { + "epoch": 1.540617384240455, + "grad_norm": 0.56808607878734, + "learning_rate": 5.6297631568474705e-06, + "loss": 0.5785, + "step": 1264 + }, + { + "epoch": 1.5418359057676687, + "grad_norm": 0.4194312889852155, + "learning_rate": 5.622724192287548e-06, + "loss": 0.5061, + "step": 1265 + }, + { + "epoch": 1.5430544272948823, + "grad_norm": 0.46739113422443607, + "learning_rate": 5.615683973893235e-06, + "loss": 0.5543, + "step": 1266 + }, + { + "epoch": 1.5442729488220959, + "grad_norm": 0.4711436329274137, + "learning_rate": 5.608642515839777e-06, + "loss": 0.5468, + "step": 1267 + }, + { + "epoch": 1.5454914703493094, + "grad_norm": 0.45925976085714865, + "learning_rate": 5.601599832304915e-06, + "loss": 0.5533, + "step": 1268 + }, + { + "epoch": 1.546709991876523, + "grad_norm": 0.4629430310984532, + "learning_rate": 5.594555937468856e-06, + "loss": 0.6238, + "step": 1269 + }, + { + "epoch": 1.5479285134037368, + "grad_norm": 0.4430875583116207, + "learning_rate": 5.587510845514249e-06, + "loss": 0.5334, + "step": 1270 + }, + { + "epoch": 1.5491470349309504, + "grad_norm": 0.4964005647402626, + "learning_rate": 5.5804645706261515e-06, + "loss": 0.5563, + "step": 1271 + }, + { + "epoch": 1.5503655564581642, + "grad_norm": 0.47908339446690995, + "learning_rate": 5.573417126992004e-06, + "loss": 0.5761, + "step": 1272 + }, + { + "epoch": 1.5515840779853778, + "grad_norm": 0.4320719099995596, + "learning_rate": 5.5663685288015955e-06, + "loss": 0.5519, + "step": 1273 + }, + { + "epoch": 1.5528025995125914, + "grad_norm": 0.45893470814872167, + "learning_rate": 5.5593187902470465e-06, + "loss": 0.5122, + "step": 1274 + }, + { + "epoch": 1.554021121039805, + "grad_norm": 0.47949830848407404, + "learning_rate": 5.55226792552277e-06, + "loss": 0.5839, + "step": 1275 + }, + { + "epoch": 1.5552396425670185, + "grad_norm": 0.415731009852519, + "learning_rate": 5.545215948825447e-06, + "loss": 0.5378, + "step": 1276 + }, + { + "epoch": 1.5564581640942323, + "grad_norm": 0.466056698108541, + "learning_rate": 5.538162874353994e-06, + "loss": 0.4983, + "step": 1277 + }, + { + "epoch": 1.5576766856214461, + "grad_norm": 0.5916240577351891, + "learning_rate": 5.5311087163095475e-06, + "loss": 0.6251, + "step": 1278 + }, + { + "epoch": 1.5588952071486597, + "grad_norm": 0.44367509738450317, + "learning_rate": 5.524053488895413e-06, + "loss": 0.5488, + "step": 1279 + }, + { + "epoch": 1.5601137286758733, + "grad_norm": 0.47062048808194906, + "learning_rate": 5.516997206317061e-06, + "loss": 0.5563, + "step": 1280 + }, + { + "epoch": 1.5613322502030869, + "grad_norm": 0.5420478722656378, + "learning_rate": 5.509939882782077e-06, + "loss": 0.5416, + "step": 1281 + }, + { + "epoch": 1.5625507717303004, + "grad_norm": 0.5222284367927739, + "learning_rate": 5.502881532500149e-06, + "loss": 0.5965, + "step": 1282 + }, + { + "epoch": 1.5637692932575142, + "grad_norm": 0.42208342526415327, + "learning_rate": 5.49582216968303e-06, + "loss": 0.5467, + "step": 1283 + }, + { + "epoch": 1.5649878147847278, + "grad_norm": 0.4294650898913376, + "learning_rate": 5.4887618085445094e-06, + "loss": 0.5287, + "step": 1284 + }, + { + "epoch": 1.5662063363119416, + "grad_norm": 0.46855647055671784, + "learning_rate": 5.48170046330039e-06, + "loss": 0.5628, + "step": 1285 + }, + { + "epoch": 1.5674248578391552, + "grad_norm": 0.4699651333558714, + "learning_rate": 5.474638148168456e-06, + "loss": 0.5574, + "step": 1286 + }, + { + "epoch": 1.5686433793663688, + "grad_norm": 0.5135379339848296, + "learning_rate": 5.467574877368441e-06, + "loss": 0.547, + "step": 1287 + }, + { + "epoch": 1.5698619008935824, + "grad_norm": 0.4810680839376017, + "learning_rate": 5.460510665122007e-06, + "loss": 0.557, + "step": 1288 + }, + { + "epoch": 1.5710804224207962, + "grad_norm": 0.4098166771088161, + "learning_rate": 5.453445525652711e-06, + "loss": 0.5418, + "step": 1289 + }, + { + "epoch": 1.5722989439480097, + "grad_norm": 0.450215288957951, + "learning_rate": 5.446379473185972e-06, + "loss": 0.5357, + "step": 1290 + }, + { + "epoch": 1.5735174654752235, + "grad_norm": 0.5294521431799539, + "learning_rate": 5.4393125219490536e-06, + "loss": 0.5643, + "step": 1291 + }, + { + "epoch": 1.574735987002437, + "grad_norm": 0.4592328236388863, + "learning_rate": 5.432244686171025e-06, + "loss": 0.5579, + "step": 1292 + }, + { + "epoch": 1.5759545085296507, + "grad_norm": 0.43283051916010107, + "learning_rate": 5.42517598008274e-06, + "loss": 0.5045, + "step": 1293 + }, + { + "epoch": 1.5771730300568643, + "grad_norm": 0.5659434705667795, + "learning_rate": 5.418106417916799e-06, + "loss": 0.6214, + "step": 1294 + }, + { + "epoch": 1.5783915515840778, + "grad_norm": 0.43767902318474483, + "learning_rate": 5.411036013907534e-06, + "loss": 0.4785, + "step": 1295 + }, + { + "epoch": 1.5796100731112916, + "grad_norm": 0.49107247160929135, + "learning_rate": 5.403964782290962e-06, + "loss": 0.6033, + "step": 1296 + }, + { + "epoch": 1.5808285946385054, + "grad_norm": 0.4941184832970728, + "learning_rate": 5.396892737304779e-06, + "loss": 0.5625, + "step": 1297 + }, + { + "epoch": 1.582047116165719, + "grad_norm": 0.45207210705440176, + "learning_rate": 5.389819893188304e-06, + "loss": 0.5955, + "step": 1298 + }, + { + "epoch": 1.5832656376929326, + "grad_norm": 0.41624551022025036, + "learning_rate": 5.38274626418248e-06, + "loss": 0.4859, + "step": 1299 + }, + { + "epoch": 1.5844841592201462, + "grad_norm": 0.5355211526596017, + "learning_rate": 5.375671864529817e-06, + "loss": 0.5847, + "step": 1300 + }, + { + "epoch": 1.5857026807473598, + "grad_norm": 0.4975201469339488, + "learning_rate": 5.368596708474388e-06, + "loss": 0.5338, + "step": 1301 + }, + { + "epoch": 1.5869212022745736, + "grad_norm": 0.4863357216575736, + "learning_rate": 5.361520810261779e-06, + "loss": 0.5535, + "step": 1302 + }, + { + "epoch": 1.5881397238017871, + "grad_norm": 0.4458515473467672, + "learning_rate": 5.354444184139077e-06, + "loss": 0.5457, + "step": 1303 + }, + { + "epoch": 1.589358245329001, + "grad_norm": 0.4614906452198629, + "learning_rate": 5.347366844354833e-06, + "loss": 0.5398, + "step": 1304 + }, + { + "epoch": 1.5905767668562145, + "grad_norm": 0.4685010422012627, + "learning_rate": 5.340288805159037e-06, + "loss": 0.5407, + "step": 1305 + }, + { + "epoch": 1.591795288383428, + "grad_norm": 0.48804182586096323, + "learning_rate": 5.33321008080308e-06, + "loss": 0.547, + "step": 1306 + }, + { + "epoch": 1.5930138099106417, + "grad_norm": 0.44694564705893386, + "learning_rate": 5.3261306855397395e-06, + "loss": 0.5459, + "step": 1307 + }, + { + "epoch": 1.5942323314378553, + "grad_norm": 0.4139859944920655, + "learning_rate": 5.319050633623141e-06, + "loss": 0.5519, + "step": 1308 + }, + { + "epoch": 1.595450852965069, + "grad_norm": 0.5097755056565069, + "learning_rate": 5.311969939308736e-06, + "loss": 0.5901, + "step": 1309 + }, + { + "epoch": 1.5966693744922829, + "grad_norm": 0.47592489399723925, + "learning_rate": 5.304888616853265e-06, + "loss": 0.5324, + "step": 1310 + }, + { + "epoch": 1.5978878960194964, + "grad_norm": 0.4276883892776071, + "learning_rate": 5.297806680514731e-06, + "loss": 0.5106, + "step": 1311 + }, + { + "epoch": 1.59910641754671, + "grad_norm": 0.4681244968477927, + "learning_rate": 5.290724144552379e-06, + "loss": 0.6054, + "step": 1312 + }, + { + "epoch": 1.6003249390739236, + "grad_norm": 0.4896701927637777, + "learning_rate": 5.283641023226661e-06, + "loss": 0.5455, + "step": 1313 + }, + { + "epoch": 1.6015434606011372, + "grad_norm": 0.4245053792739156, + "learning_rate": 5.276557330799203e-06, + "loss": 0.5471, + "step": 1314 + }, + { + "epoch": 1.602761982128351, + "grad_norm": 0.4874649206218259, + "learning_rate": 5.269473081532785e-06, + "loss": 0.5782, + "step": 1315 + }, + { + "epoch": 1.6039805036555645, + "grad_norm": 0.47549962008011226, + "learning_rate": 5.262388289691303e-06, + "loss": 0.575, + "step": 1316 + }, + { + "epoch": 1.6051990251827783, + "grad_norm": 0.42642213924678707, + "learning_rate": 5.255302969539753e-06, + "loss": 0.5805, + "step": 1317 + }, + { + "epoch": 1.606417546709992, + "grad_norm": 0.42684200856960786, + "learning_rate": 5.248217135344191e-06, + "loss": 0.5072, + "step": 1318 + }, + { + "epoch": 1.6076360682372055, + "grad_norm": 0.4365701459872912, + "learning_rate": 5.241130801371704e-06, + "loss": 0.5658, + "step": 1319 + }, + { + "epoch": 1.608854589764419, + "grad_norm": 0.42471390001052695, + "learning_rate": 5.234043981890395e-06, + "loss": 0.5698, + "step": 1320 + }, + { + "epoch": 1.6100731112916327, + "grad_norm": 0.4535238587027896, + "learning_rate": 5.226956691169332e-06, + "loss": 0.5839, + "step": 1321 + }, + { + "epoch": 1.6112916328188465, + "grad_norm": 0.4247946464572348, + "learning_rate": 5.219868943478542e-06, + "loss": 0.5577, + "step": 1322 + }, + { + "epoch": 1.6125101543460603, + "grad_norm": 0.43376338736220743, + "learning_rate": 5.212780753088968e-06, + "loss": 0.5449, + "step": 1323 + }, + { + "epoch": 1.6137286758732738, + "grad_norm": 0.4061841147634886, + "learning_rate": 5.205692134272445e-06, + "loss": 0.5179, + "step": 1324 + }, + { + "epoch": 1.6149471974004874, + "grad_norm": 0.4596996267175098, + "learning_rate": 5.1986031013016706e-06, + "loss": 0.5818, + "step": 1325 + }, + { + "epoch": 1.616165718927701, + "grad_norm": 0.43123766272618486, + "learning_rate": 5.191513668450178e-06, + "loss": 0.5687, + "step": 1326 + }, + { + "epoch": 1.6173842404549146, + "grad_norm": 0.4329937345499755, + "learning_rate": 5.184423849992299e-06, + "loss": 0.5348, + "step": 1327 + }, + { + "epoch": 1.6186027619821284, + "grad_norm": 0.49663961496101067, + "learning_rate": 5.177333660203153e-06, + "loss": 0.5956, + "step": 1328 + }, + { + "epoch": 1.619821283509342, + "grad_norm": 0.3924685962518714, + "learning_rate": 5.170243113358594e-06, + "loss": 0.5125, + "step": 1329 + }, + { + "epoch": 1.6210398050365558, + "grad_norm": 0.4856207429888876, + "learning_rate": 5.163152223735206e-06, + "loss": 0.5778, + "step": 1330 + }, + { + "epoch": 1.6222583265637693, + "grad_norm": 0.45002527423182, + "learning_rate": 5.156061005610258e-06, + "loss": 0.5584, + "step": 1331 + }, + { + "epoch": 1.623476848090983, + "grad_norm": 0.4310106517218945, + "learning_rate": 5.1489694732616805e-06, + "loss": 0.5377, + "step": 1332 + }, + { + "epoch": 1.6246953696181965, + "grad_norm": 0.49448879444066074, + "learning_rate": 5.141877640968037e-06, + "loss": 0.623, + "step": 1333 + }, + { + "epoch": 1.62591389114541, + "grad_norm": 0.40362533961876157, + "learning_rate": 5.134785523008496e-06, + "loss": 0.5014, + "step": 1334 + }, + { + "epoch": 1.6271324126726239, + "grad_norm": 0.4269483197368071, + "learning_rate": 5.127693133662801e-06, + "loss": 0.573, + "step": 1335 + }, + { + "epoch": 1.6283509341998377, + "grad_norm": 0.4258879503760348, + "learning_rate": 5.12060048721124e-06, + "loss": 0.5314, + "step": 1336 + }, + { + "epoch": 1.6295694557270513, + "grad_norm": 0.44120462268057764, + "learning_rate": 5.11350759793462e-06, + "loss": 0.5373, + "step": 1337 + }, + { + "epoch": 1.6307879772542648, + "grad_norm": 0.4276083907367786, + "learning_rate": 5.106414480114238e-06, + "loss": 0.5276, + "step": 1338 + }, + { + "epoch": 1.6320064987814784, + "grad_norm": 0.4517524664721021, + "learning_rate": 5.099321148031851e-06, + "loss": 0.5504, + "step": 1339 + }, + { + "epoch": 1.633225020308692, + "grad_norm": 0.44913374968040776, + "learning_rate": 5.092227615969643e-06, + "loss": 0.553, + "step": 1340 + }, + { + "epoch": 1.6344435418359058, + "grad_norm": 0.49845971138611844, + "learning_rate": 5.085133898210208e-06, + "loss": 0.5653, + "step": 1341 + }, + { + "epoch": 1.6356620633631194, + "grad_norm": 0.4427260322632497, + "learning_rate": 5.078040009036509e-06, + "loss": 0.5213, + "step": 1342 + }, + { + "epoch": 1.6368805848903332, + "grad_norm": 0.4177253316358593, + "learning_rate": 5.070945962731854e-06, + "loss": 0.5397, + "step": 1343 + }, + { + "epoch": 1.6380991064175467, + "grad_norm": 0.47651126334983296, + "learning_rate": 5.06385177357987e-06, + "loss": 0.5595, + "step": 1344 + }, + { + "epoch": 1.6393176279447603, + "grad_norm": 0.5627892918210755, + "learning_rate": 5.056757455864469e-06, + "loss": 0.6096, + "step": 1345 + }, + { + "epoch": 1.640536149471974, + "grad_norm": 0.44180856064958623, + "learning_rate": 5.049663023869824e-06, + "loss": 0.5025, + "step": 1346 + }, + { + "epoch": 1.6417546709991877, + "grad_norm": 0.460979656039155, + "learning_rate": 5.042568491880338e-06, + "loss": 0.5982, + "step": 1347 + }, + { + "epoch": 1.6429731925264013, + "grad_norm": 0.4821324897781787, + "learning_rate": 5.035473874180612e-06, + "loss": 0.5598, + "step": 1348 + }, + { + "epoch": 1.644191714053615, + "grad_norm": 0.45517260087056105, + "learning_rate": 5.028379185055424e-06, + "loss": 0.5246, + "step": 1349 + }, + { + "epoch": 1.6454102355808287, + "grad_norm": 0.4413055629736707, + "learning_rate": 5.021284438789694e-06, + "loss": 0.5341, + "step": 1350 + }, + { + "epoch": 1.6466287571080422, + "grad_norm": 0.4614955719864221, + "learning_rate": 5.014189649668456e-06, + "loss": 0.5578, + "step": 1351 + }, + { + "epoch": 1.6478472786352558, + "grad_norm": 0.4953936356649888, + "learning_rate": 5.007094831976832e-06, + "loss": 0.5765, + "step": 1352 + }, + { + "epoch": 1.6490658001624694, + "grad_norm": 0.39648893153136167, + "learning_rate": 5e-06, + "loss": 0.5342, + "step": 1353 + }, + { + "epoch": 1.6502843216896832, + "grad_norm": 0.43855043725681864, + "learning_rate": 4.992905168023169e-06, + "loss": 0.543, + "step": 1354 + }, + { + "epoch": 1.6515028432168968, + "grad_norm": 0.5301209980205615, + "learning_rate": 4.985810350331544e-06, + "loss": 0.6293, + "step": 1355 + }, + { + "epoch": 1.6527213647441106, + "grad_norm": 0.38590596359640195, + "learning_rate": 4.9787155612103076e-06, + "loss": 0.5296, + "step": 1356 + }, + { + "epoch": 1.6539398862713242, + "grad_norm": 0.42738095322238806, + "learning_rate": 4.9716208149445776e-06, + "loss": 0.5308, + "step": 1357 + }, + { + "epoch": 1.6551584077985377, + "grad_norm": 0.4555041349632123, + "learning_rate": 4.96452612581939e-06, + "loss": 0.5788, + "step": 1358 + }, + { + "epoch": 1.6563769293257513, + "grad_norm": 0.4558921081759917, + "learning_rate": 4.9574315081196634e-06, + "loss": 0.5609, + "step": 1359 + }, + { + "epoch": 1.6575954508529651, + "grad_norm": 0.4503929824518257, + "learning_rate": 4.950336976130176e-06, + "loss": 0.5341, + "step": 1360 + }, + { + "epoch": 1.6588139723801787, + "grad_norm": 0.43711031728275695, + "learning_rate": 4.9432425441355334e-06, + "loss": 0.5793, + "step": 1361 + }, + { + "epoch": 1.6600324939073925, + "grad_norm": 0.39568528756580684, + "learning_rate": 4.936148226420133e-06, + "loss": 0.5069, + "step": 1362 + }, + { + "epoch": 1.661251015434606, + "grad_norm": 0.4309659404250017, + "learning_rate": 4.929054037268147e-06, + "loss": 0.5872, + "step": 1363 + }, + { + "epoch": 1.6624695369618196, + "grad_norm": 0.482908985444469, + "learning_rate": 4.921959990963493e-06, + "loss": 0.5583, + "step": 1364 + }, + { + "epoch": 1.6636880584890332, + "grad_norm": 0.4133363420753277, + "learning_rate": 4.914866101789793e-06, + "loss": 0.484, + "step": 1365 + }, + { + "epoch": 1.6649065800162468, + "grad_norm": 0.46336848283664533, + "learning_rate": 4.907772384030357e-06, + "loss": 0.6055, + "step": 1366 + }, + { + "epoch": 1.6661251015434606, + "grad_norm": 0.4021280914849084, + "learning_rate": 4.900678851968152e-06, + "loss": 0.4953, + "step": 1367 + }, + { + "epoch": 1.6673436230706744, + "grad_norm": 0.4496122068891948, + "learning_rate": 4.893585519885764e-06, + "loss": 0.5631, + "step": 1368 + }, + { + "epoch": 1.668562144597888, + "grad_norm": 0.4386416975070193, + "learning_rate": 4.886492402065381e-06, + "loss": 0.5632, + "step": 1369 + }, + { + "epoch": 1.6697806661251016, + "grad_norm": 0.4335033691327931, + "learning_rate": 4.8793995127887615e-06, + "loss": 0.5377, + "step": 1370 + }, + { + "epoch": 1.6709991876523151, + "grad_norm": 0.4639132609070873, + "learning_rate": 4.8723068663372005e-06, + "loss": 0.5658, + "step": 1371 + }, + { + "epoch": 1.6722177091795287, + "grad_norm": 0.4186533135703324, + "learning_rate": 4.865214476991506e-06, + "loss": 0.538, + "step": 1372 + }, + { + "epoch": 1.6734362307067425, + "grad_norm": 0.5100673554858591, + "learning_rate": 4.858122359031964e-06, + "loss": 0.5977, + "step": 1373 + }, + { + "epoch": 1.674654752233956, + "grad_norm": 0.4284001466166066, + "learning_rate": 4.851030526738321e-06, + "loss": 0.5325, + "step": 1374 + }, + { + "epoch": 1.67587327376117, + "grad_norm": 0.4048773843920905, + "learning_rate": 4.843938994389744e-06, + "loss": 0.4975, + "step": 1375 + }, + { + "epoch": 1.6770917952883835, + "grad_norm": 0.4074001135895807, + "learning_rate": 4.836847776264794e-06, + "loss": 0.5762, + "step": 1376 + }, + { + "epoch": 1.678310316815597, + "grad_norm": 0.41740364142746117, + "learning_rate": 4.829756886641408e-06, + "loss": 0.5731, + "step": 1377 + }, + { + "epoch": 1.6795288383428106, + "grad_norm": 0.4812773839220182, + "learning_rate": 4.82266633979685e-06, + "loss": 0.5849, + "step": 1378 + }, + { + "epoch": 1.6807473598700242, + "grad_norm": 0.39560445425868235, + "learning_rate": 4.815576150007702e-06, + "loss": 0.4699, + "step": 1379 + }, + { + "epoch": 1.681965881397238, + "grad_norm": 0.4414471548591453, + "learning_rate": 4.808486331549824e-06, + "loss": 0.5626, + "step": 1380 + }, + { + "epoch": 1.6831844029244518, + "grad_norm": 0.38187499198826846, + "learning_rate": 4.801396898698329e-06, + "loss": 0.5071, + "step": 1381 + }, + { + "epoch": 1.6844029244516654, + "grad_norm": 0.4892251033230591, + "learning_rate": 4.794307865727555e-06, + "loss": 0.5552, + "step": 1382 + }, + { + "epoch": 1.685621445978879, + "grad_norm": 0.482903388217794, + "learning_rate": 4.787219246911034e-06, + "loss": 0.5492, + "step": 1383 + }, + { + "epoch": 1.6868399675060926, + "grad_norm": 0.45801551724996875, + "learning_rate": 4.78013105652146e-06, + "loss": 0.5838, + "step": 1384 + }, + { + "epoch": 1.6880584890333061, + "grad_norm": 0.42866796779932836, + "learning_rate": 4.77304330883067e-06, + "loss": 0.5085, + "step": 1385 + }, + { + "epoch": 1.68927701056052, + "grad_norm": 0.4475021559493066, + "learning_rate": 4.765956018109607e-06, + "loss": 0.5505, + "step": 1386 + }, + { + "epoch": 1.6904955320877335, + "grad_norm": 0.4697192585313218, + "learning_rate": 4.758869198628296e-06, + "loss": 0.5479, + "step": 1387 + }, + { + "epoch": 1.6917140536149473, + "grad_norm": 0.465791753930643, + "learning_rate": 4.7517828646558115e-06, + "loss": 0.56, + "step": 1388 + }, + { + "epoch": 1.692932575142161, + "grad_norm": 0.4015038202394012, + "learning_rate": 4.744697030460248e-06, + "loss": 0.5492, + "step": 1389 + }, + { + "epoch": 1.6941510966693745, + "grad_norm": 0.5232226854854597, + "learning_rate": 4.7376117103086974e-06, + "loss": 0.5464, + "step": 1390 + }, + { + "epoch": 1.695369618196588, + "grad_norm": 0.4518351945360455, + "learning_rate": 4.730526918467217e-06, + "loss": 0.533, + "step": 1391 + }, + { + "epoch": 1.6965881397238016, + "grad_norm": 0.4769324521614458, + "learning_rate": 4.7234426692007985e-06, + "loss": 0.6265, + "step": 1392 + }, + { + "epoch": 1.6978066612510154, + "grad_norm": 0.39722631525643654, + "learning_rate": 4.716358976773342e-06, + "loss": 0.4616, + "step": 1393 + }, + { + "epoch": 1.6990251827782292, + "grad_norm": 0.5143439560883679, + "learning_rate": 4.7092758554476215e-06, + "loss": 0.5927, + "step": 1394 + }, + { + "epoch": 1.7002437043054428, + "grad_norm": 0.4893384326011186, + "learning_rate": 4.702193319485271e-06, + "loss": 0.581, + "step": 1395 + }, + { + "epoch": 1.7014622258326564, + "grad_norm": 0.40330674171655206, + "learning_rate": 4.695111383146738e-06, + "loss": 0.5152, + "step": 1396 + }, + { + "epoch": 1.70268074735987, + "grad_norm": 0.4812638116299566, + "learning_rate": 4.688030060691264e-06, + "loss": 0.6068, + "step": 1397 + }, + { + "epoch": 1.7038992688870835, + "grad_norm": 0.42808075960070036, + "learning_rate": 4.680949366376858e-06, + "loss": 0.5232, + "step": 1398 + }, + { + "epoch": 1.7051177904142973, + "grad_norm": 0.4186184139760809, + "learning_rate": 4.673869314460262e-06, + "loss": 0.5375, + "step": 1399 + }, + { + "epoch": 1.706336311941511, + "grad_norm": 0.4351340155979422, + "learning_rate": 4.666789919196923e-06, + "loss": 0.5493, + "step": 1400 + }, + { + "epoch": 1.7075548334687247, + "grad_norm": 0.5600164408896984, + "learning_rate": 4.659711194840964e-06, + "loss": 0.587, + "step": 1401 + }, + { + "epoch": 1.7087733549959383, + "grad_norm": 0.43365827783641364, + "learning_rate": 4.6526331556451674e-06, + "loss": 0.519, + "step": 1402 + }, + { + "epoch": 1.7099918765231519, + "grad_norm": 0.44015645831753214, + "learning_rate": 4.645555815860923e-06, + "loss": 0.5523, + "step": 1403 + }, + { + "epoch": 1.7112103980503655, + "grad_norm": 0.4552471646368589, + "learning_rate": 4.638479189738224e-06, + "loss": 0.5404, + "step": 1404 + }, + { + "epoch": 1.7124289195775793, + "grad_norm": 0.4535728417437257, + "learning_rate": 4.631403291525615e-06, + "loss": 0.5368, + "step": 1405 + }, + { + "epoch": 1.7136474411047928, + "grad_norm": 0.4734624014107752, + "learning_rate": 4.624328135470184e-06, + "loss": 0.5778, + "step": 1406 + }, + { + "epoch": 1.7148659626320066, + "grad_norm": 0.4934447889274217, + "learning_rate": 4.617253735817522e-06, + "loss": 0.5476, + "step": 1407 + }, + { + "epoch": 1.7160844841592202, + "grad_norm": 0.4984539363836997, + "learning_rate": 4.610180106811696e-06, + "loss": 0.5649, + "step": 1408 + }, + { + "epoch": 1.7173030056864338, + "grad_norm": 0.4848858968212611, + "learning_rate": 4.603107262695225e-06, + "loss": 0.5111, + "step": 1409 + }, + { + "epoch": 1.7185215272136474, + "grad_norm": 0.47036832640121645, + "learning_rate": 4.596035217709039e-06, + "loss": 0.5948, + "step": 1410 + }, + { + "epoch": 1.719740048740861, + "grad_norm": 0.44168165703224904, + "learning_rate": 4.588963986092468e-06, + "loss": 0.5941, + "step": 1411 + }, + { + "epoch": 1.7209585702680747, + "grad_norm": 0.39666220117961165, + "learning_rate": 4.5818935820832014e-06, + "loss": 0.4913, + "step": 1412 + }, + { + "epoch": 1.7221770917952883, + "grad_norm": 0.5025801254491269, + "learning_rate": 4.574824019917262e-06, + "loss": 0.5932, + "step": 1413 + }, + { + "epoch": 1.7233956133225021, + "grad_norm": 0.3845664023510723, + "learning_rate": 4.5677553138289764e-06, + "loss": 0.5369, + "step": 1414 + }, + { + "epoch": 1.7246141348497157, + "grad_norm": 0.42320355598590065, + "learning_rate": 4.560687478050947e-06, + "loss": 0.5294, + "step": 1415 + }, + { + "epoch": 1.7258326563769293, + "grad_norm": 0.4096157422530506, + "learning_rate": 4.553620526814029e-06, + "loss": 0.519, + "step": 1416 + }, + { + "epoch": 1.7270511779041429, + "grad_norm": 0.48631875630001814, + "learning_rate": 4.546554474347291e-06, + "loss": 0.6101, + "step": 1417 + }, + { + "epoch": 1.7282696994313567, + "grad_norm": 0.4768787594020578, + "learning_rate": 4.539489334877992e-06, + "loss": 0.5629, + "step": 1418 + }, + { + "epoch": 1.7294882209585702, + "grad_norm": 0.41978448851594347, + "learning_rate": 4.532425122631559e-06, + "loss": 0.5365, + "step": 1419 + }, + { + "epoch": 1.730706742485784, + "grad_norm": 0.4298141402145644, + "learning_rate": 4.5253618518315455e-06, + "loss": 0.5346, + "step": 1420 + }, + { + "epoch": 1.7319252640129976, + "grad_norm": 0.43330287443239485, + "learning_rate": 4.5182995366996115e-06, + "loss": 0.565, + "step": 1421 + }, + { + "epoch": 1.7331437855402112, + "grad_norm": 0.4618063094916825, + "learning_rate": 4.511238191455491e-06, + "loss": 0.5669, + "step": 1422 + }, + { + "epoch": 1.7343623070674248, + "grad_norm": 0.4330349372337, + "learning_rate": 4.504177830316971e-06, + "loss": 0.5563, + "step": 1423 + }, + { + "epoch": 1.7355808285946384, + "grad_norm": 0.4061046490367817, + "learning_rate": 4.497118467499852e-06, + "loss": 0.5371, + "step": 1424 + }, + { + "epoch": 1.7367993501218522, + "grad_norm": 0.4524064658816882, + "learning_rate": 4.490060117217925e-06, + "loss": 0.5273, + "step": 1425 + }, + { + "epoch": 1.738017871649066, + "grad_norm": 0.4153684807216417, + "learning_rate": 4.483002793682941e-06, + "loss": 0.5202, + "step": 1426 + }, + { + "epoch": 1.7392363931762795, + "grad_norm": 0.5126499568306361, + "learning_rate": 4.475946511104588e-06, + "loss": 0.5964, + "step": 1427 + }, + { + "epoch": 1.7404549147034931, + "grad_norm": 0.442175693450011, + "learning_rate": 4.468891283690454e-06, + "loss": 0.514, + "step": 1428 + }, + { + "epoch": 1.7416734362307067, + "grad_norm": 0.421309384527005, + "learning_rate": 4.461837125646007e-06, + "loss": 0.6091, + "step": 1429 + }, + { + "epoch": 1.7428919577579203, + "grad_norm": 0.4380243684629681, + "learning_rate": 4.4547840511745565e-06, + "loss": 0.4913, + "step": 1430 + }, + { + "epoch": 1.744110479285134, + "grad_norm": 0.4812216276097867, + "learning_rate": 4.447732074477233e-06, + "loss": 0.5582, + "step": 1431 + }, + { + "epoch": 1.7453290008123477, + "grad_norm": 0.40488056766666325, + "learning_rate": 4.440681209752955e-06, + "loss": 0.5758, + "step": 1432 + }, + { + "epoch": 1.7465475223395615, + "grad_norm": 0.4732265653920416, + "learning_rate": 4.433631471198406e-06, + "loss": 0.5962, + "step": 1433 + }, + { + "epoch": 1.747766043866775, + "grad_norm": 0.42539261148413177, + "learning_rate": 4.426582873007999e-06, + "loss": 0.4769, + "step": 1434 + }, + { + "epoch": 1.7489845653939886, + "grad_norm": 0.512036705158376, + "learning_rate": 4.4195354293738484e-06, + "loss": 0.582, + "step": 1435 + }, + { + "epoch": 1.7502030869212022, + "grad_norm": 0.4305096055128761, + "learning_rate": 4.412489154485752e-06, + "loss": 0.5326, + "step": 1436 + }, + { + "epoch": 1.7514216084484158, + "grad_norm": 0.5036201316708941, + "learning_rate": 4.405444062531145e-06, + "loss": 0.579, + "step": 1437 + }, + { + "epoch": 1.7526401299756296, + "grad_norm": 0.42814700978676606, + "learning_rate": 4.3984001676950875e-06, + "loss": 0.5706, + "step": 1438 + }, + { + "epoch": 1.7538586515028434, + "grad_norm": 0.4336700472324628, + "learning_rate": 4.391357484160223e-06, + "loss": 0.5429, + "step": 1439 + }, + { + "epoch": 1.755077173030057, + "grad_norm": 0.4197620066836796, + "learning_rate": 4.384316026106766e-06, + "loss": 0.5312, + "step": 1440 + }, + { + "epoch": 1.7562956945572705, + "grad_norm": 0.4358185412850227, + "learning_rate": 4.377275807712453e-06, + "loss": 0.5601, + "step": 1441 + }, + { + "epoch": 1.757514216084484, + "grad_norm": 0.4593898380941711, + "learning_rate": 4.37023684315253e-06, + "loss": 0.5522, + "step": 1442 + }, + { + "epoch": 1.7587327376116977, + "grad_norm": 0.41694136338662585, + "learning_rate": 4.363199146599717e-06, + "loss": 0.5436, + "step": 1443 + }, + { + "epoch": 1.7599512591389115, + "grad_norm": 0.41051974887386045, + "learning_rate": 4.3561627322241815e-06, + "loss": 0.5484, + "step": 1444 + }, + { + "epoch": 1.761169780666125, + "grad_norm": 0.49654495683052513, + "learning_rate": 4.34912761419351e-06, + "loss": 0.5471, + "step": 1445 + }, + { + "epoch": 1.7623883021933389, + "grad_norm": 0.46755105267929675, + "learning_rate": 4.342093806672678e-06, + "loss": 0.5675, + "step": 1446 + }, + { + "epoch": 1.7636068237205524, + "grad_norm": 0.4560949973440655, + "learning_rate": 4.335061323824019e-06, + "loss": 0.5921, + "step": 1447 + }, + { + "epoch": 1.764825345247766, + "grad_norm": 0.4254462067059595, + "learning_rate": 4.328030179807207e-06, + "loss": 0.4801, + "step": 1448 + }, + { + "epoch": 1.7660438667749796, + "grad_norm": 0.43590945113760904, + "learning_rate": 4.321000388779214e-06, + "loss": 0.55, + "step": 1449 + }, + { + "epoch": 1.7672623883021932, + "grad_norm": 0.45385792985801476, + "learning_rate": 4.313971964894289e-06, + "loss": 0.5936, + "step": 1450 + }, + { + "epoch": 1.768480909829407, + "grad_norm": 0.45173148922198614, + "learning_rate": 4.306944922303932e-06, + "loss": 0.5198, + "step": 1451 + }, + { + "epoch": 1.7696994313566208, + "grad_norm": 0.4738870866999846, + "learning_rate": 4.299919275156857e-06, + "loss": 0.5695, + "step": 1452 + }, + { + "epoch": 1.7709179528838344, + "grad_norm": 0.4308222859015806, + "learning_rate": 4.292895037598968e-06, + "loss": 0.5302, + "step": 1453 + }, + { + "epoch": 1.772136474411048, + "grad_norm": 0.43194034641229945, + "learning_rate": 4.285872223773336e-06, + "loss": 0.5277, + "step": 1454 + }, + { + "epoch": 1.7733549959382615, + "grad_norm": 0.44969920461261864, + "learning_rate": 4.278850847820161e-06, + "loss": 0.5552, + "step": 1455 + }, + { + "epoch": 1.774573517465475, + "grad_norm": 0.45716879761679, + "learning_rate": 4.2718309238767485e-06, + "loss": 0.5785, + "step": 1456 + }, + { + "epoch": 1.775792038992689, + "grad_norm": 0.4340108500215334, + "learning_rate": 4.264812466077486e-06, + "loss": 0.5973, + "step": 1457 + }, + { + "epoch": 1.7770105605199025, + "grad_norm": 0.40605446125162264, + "learning_rate": 4.2577954885537985e-06, + "loss": 0.5262, + "step": 1458 + }, + { + "epoch": 1.7782290820471163, + "grad_norm": 0.4862703366986213, + "learning_rate": 4.2507800054341385e-06, + "loss": 0.576, + "step": 1459 + }, + { + "epoch": 1.7794476035743299, + "grad_norm": 0.48359582578678745, + "learning_rate": 4.243766030843947e-06, + "loss": 0.5998, + "step": 1460 + }, + { + "epoch": 1.7806661251015434, + "grad_norm": 0.3661204928776939, + "learning_rate": 4.236753578905627e-06, + "loss": 0.4968, + "step": 1461 + }, + { + "epoch": 1.781884646628757, + "grad_norm": 0.43106534453803774, + "learning_rate": 4.229742663738521e-06, + "loss": 0.5418, + "step": 1462 + }, + { + "epoch": 1.7831031681559708, + "grad_norm": 0.48202616303627543, + "learning_rate": 4.2227332994588666e-06, + "loss": 0.5486, + "step": 1463 + }, + { + "epoch": 1.7843216896831844, + "grad_norm": 0.46787213059943966, + "learning_rate": 4.215725500179788e-06, + "loss": 0.5394, + "step": 1464 + }, + { + "epoch": 1.7855402112103982, + "grad_norm": 0.4786631164932734, + "learning_rate": 4.208719280011255e-06, + "loss": 0.6512, + "step": 1465 + }, + { + "epoch": 1.7867587327376118, + "grad_norm": 0.5007334603891063, + "learning_rate": 4.2017146530600585e-06, + "loss": 0.5262, + "step": 1466 + }, + { + "epoch": 1.7879772542648253, + "grad_norm": 0.4766688726563304, + "learning_rate": 4.194711633429782e-06, + "loss": 0.4996, + "step": 1467 + }, + { + "epoch": 1.789195775792039, + "grad_norm": 0.46491040345633633, + "learning_rate": 4.1877102352207695e-06, + "loss": 0.5968, + "step": 1468 + }, + { + "epoch": 1.7904142973192525, + "grad_norm": 0.4085352403505702, + "learning_rate": 4.180710472530105e-06, + "loss": 0.5262, + "step": 1469 + }, + { + "epoch": 1.7916328188464663, + "grad_norm": 0.43126540204858976, + "learning_rate": 4.173712359451576e-06, + "loss": 0.5407, + "step": 1470 + }, + { + "epoch": 1.7928513403736799, + "grad_norm": 0.5202009737302775, + "learning_rate": 4.16671591007565e-06, + "loss": 0.5644, + "step": 1471 + }, + { + "epoch": 1.7940698619008937, + "grad_norm": 0.43231572065106405, + "learning_rate": 4.159721138489445e-06, + "loss": 0.5143, + "step": 1472 + }, + { + "epoch": 1.7952883834281073, + "grad_norm": 0.4626044446914442, + "learning_rate": 4.152728058776701e-06, + "loss": 0.5853, + "step": 1473 + }, + { + "epoch": 1.7965069049553208, + "grad_norm": 0.43407883748754916, + "learning_rate": 4.145736685017749e-06, + "loss": 0.5239, + "step": 1474 + }, + { + "epoch": 1.7977254264825344, + "grad_norm": 0.4267857290356126, + "learning_rate": 4.138747031289485e-06, + "loss": 0.5558, + "step": 1475 + }, + { + "epoch": 1.7989439480097482, + "grad_norm": 0.4315508799133083, + "learning_rate": 4.131759111665349e-06, + "loss": 0.5807, + "step": 1476 + }, + { + "epoch": 1.8001624695369618, + "grad_norm": 0.4048772175153014, + "learning_rate": 4.124772940215279e-06, + "loss": 0.508, + "step": 1477 + }, + { + "epoch": 1.8013809910641756, + "grad_norm": 0.4268392177126804, + "learning_rate": 4.1177885310057045e-06, + "loss": 0.552, + "step": 1478 + }, + { + "epoch": 1.8025995125913892, + "grad_norm": 0.4495564163997895, + "learning_rate": 4.110805898099492e-06, + "loss": 0.5669, + "step": 1479 + }, + { + "epoch": 1.8038180341186028, + "grad_norm": 0.4570284740109343, + "learning_rate": 4.103825055555947e-06, + "loss": 0.5503, + "step": 1480 + }, + { + "epoch": 1.8050365556458163, + "grad_norm": 0.45712926339273185, + "learning_rate": 4.096846017430758e-06, + "loss": 0.5861, + "step": 1481 + }, + { + "epoch": 1.80625507717303, + "grad_norm": 0.4363450699012883, + "learning_rate": 4.0898687977759895e-06, + "loss": 0.5698, + "step": 1482 + }, + { + "epoch": 1.8074735987002437, + "grad_norm": 0.36642253412778386, + "learning_rate": 4.08289341064004e-06, + "loss": 0.4882, + "step": 1483 + }, + { + "epoch": 1.8086921202274575, + "grad_norm": 0.4626576143609871, + "learning_rate": 4.075919870067617e-06, + "loss": 0.5695, + "step": 1484 + }, + { + "epoch": 1.809910641754671, + "grad_norm": 0.46018408439267183, + "learning_rate": 4.068948190099711e-06, + "loss": 0.5529, + "step": 1485 + }, + { + "epoch": 1.8111291632818847, + "grad_norm": 0.4119449731431994, + "learning_rate": 4.06197838477357e-06, + "loss": 0.5024, + "step": 1486 + }, + { + "epoch": 1.8123476848090982, + "grad_norm": 0.4015730766144408, + "learning_rate": 4.0550104681226635e-06, + "loss": 0.5656, + "step": 1487 + }, + { + "epoch": 1.8135662063363118, + "grad_norm": 0.4503343260237984, + "learning_rate": 4.048044454176658e-06, + "loss": 0.5661, + "step": 1488 + }, + { + "epoch": 1.8147847278635256, + "grad_norm": 0.43240190245880916, + "learning_rate": 4.041080356961393e-06, + "loss": 0.4974, + "step": 1489 + }, + { + "epoch": 1.8160032493907392, + "grad_norm": 0.4734473361008657, + "learning_rate": 4.034118190498843e-06, + "loss": 0.5663, + "step": 1490 + }, + { + "epoch": 1.817221770917953, + "grad_norm": 0.43362890265223014, + "learning_rate": 4.0271579688071e-06, + "loss": 0.5531, + "step": 1491 + }, + { + "epoch": 1.8184402924451666, + "grad_norm": 0.46894586845233727, + "learning_rate": 4.020199705900335e-06, + "loss": 0.5534, + "step": 1492 + }, + { + "epoch": 1.8196588139723802, + "grad_norm": 0.5328522267534698, + "learning_rate": 4.013243415788783e-06, + "loss": 0.6018, + "step": 1493 + }, + { + "epoch": 1.8208773354995937, + "grad_norm": 0.41829831723127575, + "learning_rate": 4.0062891124787e-06, + "loss": 0.5562, + "step": 1494 + }, + { + "epoch": 1.8220958570268073, + "grad_norm": 0.45791268251247896, + "learning_rate": 3.999336809972343e-06, + "loss": 0.5226, + "step": 1495 + }, + { + "epoch": 1.8233143785540211, + "grad_norm": 0.52749121116633, + "learning_rate": 3.99238652226794e-06, + "loss": 0.5885, + "step": 1496 + }, + { + "epoch": 1.824532900081235, + "grad_norm": 0.4102080465654976, + "learning_rate": 3.985438263359667e-06, + "loss": 0.4996, + "step": 1497 + }, + { + "epoch": 1.8257514216084485, + "grad_norm": 0.453636908099918, + "learning_rate": 3.978492047237608e-06, + "loss": 0.568, + "step": 1498 + }, + { + "epoch": 1.826969943135662, + "grad_norm": 0.49160181331071584, + "learning_rate": 3.971547887887742e-06, + "loss": 0.574, + "step": 1499 + }, + { + "epoch": 1.8281884646628757, + "grad_norm": 0.4128170671886108, + "learning_rate": 3.964605799291897e-06, + "loss": 0.4792, + "step": 1500 + }, + { + "epoch": 1.8294069861900892, + "grad_norm": 0.4714468421227976, + "learning_rate": 3.9576657954277406e-06, + "loss": 0.5527, + "step": 1501 + }, + { + "epoch": 1.830625507717303, + "grad_norm": 0.4759788646029719, + "learning_rate": 3.950727890268736e-06, + "loss": 0.564, + "step": 1502 + }, + { + "epoch": 1.8318440292445166, + "grad_norm": 0.4269752606026449, + "learning_rate": 3.943792097784126e-06, + "loss": 0.5733, + "step": 1503 + }, + { + "epoch": 1.8330625507717304, + "grad_norm": 0.4407801675115479, + "learning_rate": 3.936858431938899e-06, + "loss": 0.501, + "step": 1504 + }, + { + "epoch": 1.834281072298944, + "grad_norm": 0.417785240435009, + "learning_rate": 3.929926906693757e-06, + "loss": 0.5292, + "step": 1505 + }, + { + "epoch": 1.8354995938261576, + "grad_norm": 0.4954357818413886, + "learning_rate": 3.922997536005094e-06, + "loss": 0.5834, + "step": 1506 + }, + { + "epoch": 1.8367181153533712, + "grad_norm": 0.4520246987276718, + "learning_rate": 3.91607033382497e-06, + "loss": 0.601, + "step": 1507 + }, + { + "epoch": 1.8379366368805847, + "grad_norm": 0.41894543258809586, + "learning_rate": 3.909145314101074e-06, + "loss": 0.5201, + "step": 1508 + }, + { + "epoch": 1.8391551584077985, + "grad_norm": 0.48207218741173996, + "learning_rate": 3.9022224907767e-06, + "loss": 0.5478, + "step": 1509 + }, + { + "epoch": 1.8403736799350123, + "grad_norm": 0.45664125938234335, + "learning_rate": 3.895301877790728e-06, + "loss": 0.5646, + "step": 1510 + }, + { + "epoch": 1.841592201462226, + "grad_norm": 0.4171767656165807, + "learning_rate": 3.888383489077576e-06, + "loss": 0.511, + "step": 1511 + }, + { + "epoch": 1.8428107229894395, + "grad_norm": 0.43354385082610986, + "learning_rate": 3.88146733856719e-06, + "loss": 0.526, + "step": 1512 + }, + { + "epoch": 1.844029244516653, + "grad_norm": 0.4920523523977966, + "learning_rate": 3.874553440185008e-06, + "loss": 0.5767, + "step": 1513 + }, + { + "epoch": 1.8452477660438666, + "grad_norm": 0.46705637995124133, + "learning_rate": 3.867641807851935e-06, + "loss": 0.5835, + "step": 1514 + }, + { + "epoch": 1.8464662875710804, + "grad_norm": 0.4461828469934018, + "learning_rate": 3.860732455484314e-06, + "loss": 0.4961, + "step": 1515 + }, + { + "epoch": 1.847684809098294, + "grad_norm": 0.4633901834358105, + "learning_rate": 3.853825396993891e-06, + "loss": 0.5811, + "step": 1516 + }, + { + "epoch": 1.8489033306255078, + "grad_norm": 0.4438313726356196, + "learning_rate": 3.8469206462878e-06, + "loss": 0.5655, + "step": 1517 + }, + { + "epoch": 1.8501218521527214, + "grad_norm": 0.4546281191367206, + "learning_rate": 3.840018217268527e-06, + "loss": 0.5556, + "step": 1518 + }, + { + "epoch": 1.851340373679935, + "grad_norm": 0.39692829259522316, + "learning_rate": 3.833118123833881e-06, + "loss": 0.5083, + "step": 1519 + }, + { + "epoch": 1.8525588952071486, + "grad_norm": 0.4367611773412124, + "learning_rate": 3.826220379876974e-06, + "loss": 0.5621, + "step": 1520 + }, + { + "epoch": 1.8537774167343624, + "grad_norm": 0.46250207668673327, + "learning_rate": 3.819324999286177e-06, + "loss": 0.5502, + "step": 1521 + }, + { + "epoch": 1.854995938261576, + "grad_norm": 0.42252748085937586, + "learning_rate": 3.8124319959451133e-06, + "loss": 0.5428, + "step": 1522 + }, + { + "epoch": 1.8562144597887897, + "grad_norm": 0.42520604252823624, + "learning_rate": 3.8055413837326133e-06, + "loss": 0.5484, + "step": 1523 + }, + { + "epoch": 1.8574329813160033, + "grad_norm": 0.41242969185302275, + "learning_rate": 3.7986531765226965e-06, + "loss": 0.521, + "step": 1524 + }, + { + "epoch": 1.858651502843217, + "grad_norm": 0.43001901614946786, + "learning_rate": 3.7917673881845373e-06, + "loss": 0.5943, + "step": 1525 + }, + { + "epoch": 1.8598700243704305, + "grad_norm": 0.4097185174805625, + "learning_rate": 3.7848840325824428e-06, + "loss": 0.5407, + "step": 1526 + }, + { + "epoch": 1.861088545897644, + "grad_norm": 0.4509948723964594, + "learning_rate": 3.778003123575815e-06, + "loss": 0.5526, + "step": 1527 + }, + { + "epoch": 1.8623070674248579, + "grad_norm": 0.458525992527633, + "learning_rate": 3.77112467501914e-06, + "loss": 0.5546, + "step": 1528 + }, + { + "epoch": 1.8635255889520714, + "grad_norm": 0.407821722457764, + "learning_rate": 3.7642487007619417e-06, + "loss": 0.5205, + "step": 1529 + }, + { + "epoch": 1.8647441104792852, + "grad_norm": 0.4630986805415289, + "learning_rate": 3.757375214648764e-06, + "loss": 0.5733, + "step": 1530 + }, + { + "epoch": 1.8659626320064988, + "grad_norm": 0.46336209457236627, + "learning_rate": 3.7505042305191463e-06, + "loss": 0.5653, + "step": 1531 + }, + { + "epoch": 1.8671811535337124, + "grad_norm": 0.39311369649530103, + "learning_rate": 3.743635762207582e-06, + "loss": 0.5342, + "step": 1532 + }, + { + "epoch": 1.868399675060926, + "grad_norm": 0.42025451422654897, + "learning_rate": 3.7367698235435036e-06, + "loss": 0.5474, + "step": 1533 + }, + { + "epoch": 1.8696181965881398, + "grad_norm": 0.42303104824107784, + "learning_rate": 3.72990642835125e-06, + "loss": 0.52, + "step": 1534 + }, + { + "epoch": 1.8708367181153533, + "grad_norm": 0.40431884043673944, + "learning_rate": 3.7230455904500385e-06, + "loss": 0.5468, + "step": 1535 + }, + { + "epoch": 1.8720552396425671, + "grad_norm": 0.4312111249145443, + "learning_rate": 3.716187323653939e-06, + "loss": 0.5888, + "step": 1536 + }, + { + "epoch": 1.8732737611697807, + "grad_norm": 0.41823157797464644, + "learning_rate": 3.7093316417718407e-06, + "loss": 0.5638, + "step": 1537 + }, + { + "epoch": 1.8744922826969943, + "grad_norm": 0.42521945264285177, + "learning_rate": 3.702478558607429e-06, + "loss": 0.5357, + "step": 1538 + }, + { + "epoch": 1.8757108042242079, + "grad_norm": 0.4652010212406273, + "learning_rate": 3.695628087959162e-06, + "loss": 0.5809, + "step": 1539 + }, + { + "epoch": 1.8769293257514215, + "grad_norm": 0.399398106227539, + "learning_rate": 3.6887802436202307e-06, + "loss": 0.5233, + "step": 1540 + }, + { + "epoch": 1.8781478472786353, + "grad_norm": 0.40874149994794573, + "learning_rate": 3.6819350393785445e-06, + "loss": 0.5534, + "step": 1541 + }, + { + "epoch": 1.879366368805849, + "grad_norm": 0.4553334343530874, + "learning_rate": 3.675092489016693e-06, + "loss": 0.5369, + "step": 1542 + }, + { + "epoch": 1.8805848903330626, + "grad_norm": 0.40028666583281924, + "learning_rate": 3.6682526063119206e-06, + "loss": 0.5209, + "step": 1543 + }, + { + "epoch": 1.8818034118602762, + "grad_norm": 0.41838120396321116, + "learning_rate": 3.661415405036103e-06, + "loss": 0.5752, + "step": 1544 + }, + { + "epoch": 1.8830219333874898, + "grad_norm": 0.4136128207763801, + "learning_rate": 3.654580898955721e-06, + "loss": 0.5277, + "step": 1545 + }, + { + "epoch": 1.8842404549147034, + "grad_norm": 0.4024921294917515, + "learning_rate": 3.647749101831821e-06, + "loss": 0.5239, + "step": 1546 + }, + { + "epoch": 1.8854589764419172, + "grad_norm": 0.4141269485652032, + "learning_rate": 3.640920027420001e-06, + "loss": 0.5508, + "step": 1547 + }, + { + "epoch": 1.8866774979691308, + "grad_norm": 0.440719562642394, + "learning_rate": 3.6340936894703717e-06, + "loss": 0.5702, + "step": 1548 + }, + { + "epoch": 1.8878960194963446, + "grad_norm": 0.4786206622488289, + "learning_rate": 3.6272701017275385e-06, + "loss": 0.5721, + "step": 1549 + }, + { + "epoch": 1.8891145410235581, + "grad_norm": 0.4129960293133917, + "learning_rate": 3.6204492779305678e-06, + "loss": 0.5382, + "step": 1550 + }, + { + "epoch": 1.8903330625507717, + "grad_norm": 0.425696733632996, + "learning_rate": 3.61363123181296e-06, + "loss": 0.546, + "step": 1551 + }, + { + "epoch": 1.8915515840779853, + "grad_norm": 0.4653679268493305, + "learning_rate": 3.6068159771026267e-06, + "loss": 0.5789, + "step": 1552 + }, + { + "epoch": 1.8927701056051989, + "grad_norm": 0.4522003365392251, + "learning_rate": 3.6000035275218515e-06, + "loss": 0.5224, + "step": 1553 + }, + { + "epoch": 1.8939886271324127, + "grad_norm": 0.3920499944414549, + "learning_rate": 3.593193896787277e-06, + "loss": 0.4976, + "step": 1554 + }, + { + "epoch": 1.8952071486596265, + "grad_norm": 0.45058258754829983, + "learning_rate": 3.5863870986098655e-06, + "loss": 0.5745, + "step": 1555 + }, + { + "epoch": 1.89642567018684, + "grad_norm": 0.4244815509498337, + "learning_rate": 3.5795831466948805e-06, + "loss": 0.5414, + "step": 1556 + }, + { + "epoch": 1.8976441917140536, + "grad_norm": 0.4331016099950002, + "learning_rate": 3.5727820547418525e-06, + "loss": 0.539, + "step": 1557 + }, + { + "epoch": 1.8988627132412672, + "grad_norm": 0.417995629833821, + "learning_rate": 3.5659838364445505e-06, + "loss": 0.5156, + "step": 1558 + }, + { + "epoch": 1.9000812347684808, + "grad_norm": 0.4734474494296379, + "learning_rate": 3.5591885054909605e-06, + "loss": 0.5925, + "step": 1559 + }, + { + "epoch": 1.9012997562956946, + "grad_norm": 0.46115486081361745, + "learning_rate": 3.5523960755632573e-06, + "loss": 0.5091, + "step": 1560 + }, + { + "epoch": 1.9025182778229082, + "grad_norm": 0.40875274875883305, + "learning_rate": 3.5456065603377697e-06, + "loss": 0.5567, + "step": 1561 + }, + { + "epoch": 1.903736799350122, + "grad_norm": 0.45386074829816486, + "learning_rate": 3.5388199734849626e-06, + "loss": 0.5578, + "step": 1562 + }, + { + "epoch": 1.9049553208773355, + "grad_norm": 0.38709828752403175, + "learning_rate": 3.5320363286694015e-06, + "loss": 0.5179, + "step": 1563 + }, + { + "epoch": 1.9061738424045491, + "grad_norm": 0.42735900716697534, + "learning_rate": 3.5252556395497274e-06, + "loss": 0.5712, + "step": 1564 + }, + { + "epoch": 1.9073923639317627, + "grad_norm": 0.4181700954501109, + "learning_rate": 3.518477919778631e-06, + "loss": 0.5781, + "step": 1565 + }, + { + "epoch": 1.9086108854589763, + "grad_norm": 0.421517198534864, + "learning_rate": 3.5117031830028274e-06, + "loss": 0.5214, + "step": 1566 + }, + { + "epoch": 1.90982940698619, + "grad_norm": 0.44082135382564575, + "learning_rate": 3.504931442863023e-06, + "loss": 0.5929, + "step": 1567 + }, + { + "epoch": 1.9110479285134039, + "grad_norm": 0.3829707161093217, + "learning_rate": 3.49816271299389e-06, + "loss": 0.4973, + "step": 1568 + }, + { + "epoch": 1.9122664500406175, + "grad_norm": 0.4241401054720212, + "learning_rate": 3.4913970070240388e-06, + "loss": 0.5694, + "step": 1569 + }, + { + "epoch": 1.913484971567831, + "grad_norm": 0.4287983948624245, + "learning_rate": 3.484634338575995e-06, + "loss": 0.5123, + "step": 1570 + }, + { + "epoch": 1.9147034930950446, + "grad_norm": 0.40950888500677163, + "learning_rate": 3.4778747212661647e-06, + "loss": 0.5595, + "step": 1571 + }, + { + "epoch": 1.9159220146222582, + "grad_norm": 0.4272739781741268, + "learning_rate": 3.4711181687048114e-06, + "loss": 0.5609, + "step": 1572 + }, + { + "epoch": 1.917140536149472, + "grad_norm": 0.41564421693161757, + "learning_rate": 3.464364694496031e-06, + "loss": 0.5336, + "step": 1573 + }, + { + "epoch": 1.9183590576766856, + "grad_norm": 0.4359864387668293, + "learning_rate": 3.457614312237716e-06, + "loss": 0.5371, + "step": 1574 + }, + { + "epoch": 1.9195775792038994, + "grad_norm": 0.4799831871173569, + "learning_rate": 3.450867035521536e-06, + "loss": 0.5299, + "step": 1575 + }, + { + "epoch": 1.920796100731113, + "grad_norm": 0.4453426614702043, + "learning_rate": 3.4441228779329073e-06, + "loss": 0.5502, + "step": 1576 + }, + { + "epoch": 1.9220146222583265, + "grad_norm": 0.4238694285973907, + "learning_rate": 3.4373818530509686e-06, + "loss": 0.5275, + "step": 1577 + }, + { + "epoch": 1.9232331437855401, + "grad_norm": 0.41917397616804203, + "learning_rate": 3.4306439744485453e-06, + "loss": 0.5761, + "step": 1578 + }, + { + "epoch": 1.924451665312754, + "grad_norm": 0.425933885933589, + "learning_rate": 3.423909255692137e-06, + "loss": 0.515, + "step": 1579 + }, + { + "epoch": 1.9256701868399675, + "grad_norm": 0.4456890458176112, + "learning_rate": 3.417177710341868e-06, + "loss": 0.5522, + "step": 1580 + }, + { + "epoch": 1.9268887083671813, + "grad_norm": 0.41653994234849523, + "learning_rate": 3.4104493519514844e-06, + "loss": 0.5675, + "step": 1581 + }, + { + "epoch": 1.9281072298943949, + "grad_norm": 0.4065472343645043, + "learning_rate": 3.40372419406831e-06, + "loss": 0.5016, + "step": 1582 + }, + { + "epoch": 1.9293257514216084, + "grad_norm": 0.4554171323483848, + "learning_rate": 3.3970022502332273e-06, + "loss": 0.5919, + "step": 1583 + }, + { + "epoch": 1.930544272948822, + "grad_norm": 0.4247465501155384, + "learning_rate": 3.3902835339806463e-06, + "loss": 0.565, + "step": 1584 + }, + { + "epoch": 1.9317627944760356, + "grad_norm": 0.44295772231878283, + "learning_rate": 3.3835680588384767e-06, + "loss": 0.5046, + "step": 1585 + }, + { + "epoch": 1.9329813160032494, + "grad_norm": 0.4268447826543325, + "learning_rate": 3.3768558383281024e-06, + "loss": 0.5193, + "step": 1586 + }, + { + "epoch": 1.934199837530463, + "grad_norm": 0.44799827489237776, + "learning_rate": 3.3701468859643583e-06, + "loss": 0.5631, + "step": 1587 + }, + { + "epoch": 1.9354183590576768, + "grad_norm": 0.4332023943083594, + "learning_rate": 3.363441215255495e-06, + "loss": 0.5724, + "step": 1588 + }, + { + "epoch": 1.9366368805848904, + "grad_norm": 0.3996520998473681, + "learning_rate": 3.356738839703158e-06, + "loss": 0.5255, + "step": 1589 + }, + { + "epoch": 1.937855402112104, + "grad_norm": 0.44409520862327806, + "learning_rate": 3.3500397728023536e-06, + "loss": 0.5425, + "step": 1590 + }, + { + "epoch": 1.9390739236393175, + "grad_norm": 0.4405722390495154, + "learning_rate": 3.343344028041433e-06, + "loss": 0.6053, + "step": 1591 + }, + { + "epoch": 1.9402924451665313, + "grad_norm": 0.40826303676438436, + "learning_rate": 3.336651618902054e-06, + "loss": 0.5324, + "step": 1592 + }, + { + "epoch": 1.941510966693745, + "grad_norm": 0.3851707449193732, + "learning_rate": 3.3299625588591568e-06, + "loss": 0.5088, + "step": 1593 + }, + { + "epoch": 1.9427294882209587, + "grad_norm": 0.40859643518575894, + "learning_rate": 3.3232768613809453e-06, + "loss": 0.581, + "step": 1594 + }, + { + "epoch": 1.9439480097481723, + "grad_norm": 0.3722542671220263, + "learning_rate": 3.316594539928845e-06, + "loss": 0.4977, + "step": 1595 + }, + { + "epoch": 1.9451665312753859, + "grad_norm": 0.4383598182132238, + "learning_rate": 3.309915607957487e-06, + "loss": 0.6137, + "step": 1596 + }, + { + "epoch": 1.9463850528025994, + "grad_norm": 0.4011690274677538, + "learning_rate": 3.303240078914679e-06, + "loss": 0.563, + "step": 1597 + }, + { + "epoch": 1.947603574329813, + "grad_norm": 0.36889683412600394, + "learning_rate": 3.2965679662413772e-06, + "loss": 0.4968, + "step": 1598 + }, + { + "epoch": 1.9488220958570268, + "grad_norm": 0.43411779671306283, + "learning_rate": 3.289899283371657e-06, + "loss": 0.5802, + "step": 1599 + }, + { + "epoch": 1.9500406173842406, + "grad_norm": 0.3803158460715809, + "learning_rate": 3.283234043732689e-06, + "loss": 0.5093, + "step": 1600 + }, + { + "epoch": 1.9512591389114542, + "grad_norm": 0.4391648613289349, + "learning_rate": 3.276572260744709e-06, + "loss": 0.565, + "step": 1601 + }, + { + "epoch": 1.9524776604386678, + "grad_norm": 0.40723374350566227, + "learning_rate": 3.2699139478209987e-06, + "loss": 0.514, + "step": 1602 + }, + { + "epoch": 1.9536961819658814, + "grad_norm": 0.454313203169353, + "learning_rate": 3.263259118367845e-06, + "loss": 0.6135, + "step": 1603 + }, + { + "epoch": 1.954914703493095, + "grad_norm": 0.4178433726812962, + "learning_rate": 3.256607785784527e-06, + "loss": 0.5301, + "step": 1604 + }, + { + "epoch": 1.9561332250203087, + "grad_norm": 0.42422765865196377, + "learning_rate": 3.249959963463283e-06, + "loss": 0.5278, + "step": 1605 + }, + { + "epoch": 1.9573517465475223, + "grad_norm": 0.4507513531340492, + "learning_rate": 3.2433156647892784e-06, + "loss": 0.5154, + "step": 1606 + }, + { + "epoch": 1.958570268074736, + "grad_norm": 0.4155636470893334, + "learning_rate": 3.2366749031405875e-06, + "loss": 0.5627, + "step": 1607 + }, + { + "epoch": 1.9597887896019497, + "grad_norm": 0.41760402999256324, + "learning_rate": 3.2300376918881628e-06, + "loss": 0.5779, + "step": 1608 + }, + { + "epoch": 1.9610073111291633, + "grad_norm": 0.4262673425446038, + "learning_rate": 3.223404044395808e-06, + "loss": 0.5939, + "step": 1609 + }, + { + "epoch": 1.9622258326563768, + "grad_norm": 0.39002137904137807, + "learning_rate": 3.216773974020152e-06, + "loss": 0.4796, + "step": 1610 + }, + { + "epoch": 1.9634443541835904, + "grad_norm": 0.48341233875628054, + "learning_rate": 3.210147494110618e-06, + "loss": 0.5623, + "step": 1611 + }, + { + "epoch": 1.9646628757108042, + "grad_norm": 0.4263996506849499, + "learning_rate": 3.203524618009403e-06, + "loss": 0.5771, + "step": 1612 + }, + { + "epoch": 1.965881397238018, + "grad_norm": 0.39778372039996124, + "learning_rate": 3.1969053590514487e-06, + "loss": 0.5291, + "step": 1613 + }, + { + "epoch": 1.9670999187652316, + "grad_norm": 0.4303659167460693, + "learning_rate": 3.19028973056441e-06, + "loss": 0.5488, + "step": 1614 + }, + { + "epoch": 1.9683184402924452, + "grad_norm": 0.4137612167679553, + "learning_rate": 3.1836777458686363e-06, + "loss": 0.5619, + "step": 1615 + }, + { + "epoch": 1.9695369618196588, + "grad_norm": 0.36766081180510835, + "learning_rate": 3.177069418277139e-06, + "loss": 0.4946, + "step": 1616 + }, + { + "epoch": 1.9707554833468723, + "grad_norm": 0.4182717579515874, + "learning_rate": 3.1704647610955618e-06, + "loss": 0.5297, + "step": 1617 + }, + { + "epoch": 1.9719740048740861, + "grad_norm": 0.4584959850560608, + "learning_rate": 3.163863787622162e-06, + "loss": 0.6143, + "step": 1618 + }, + { + "epoch": 1.9731925264012997, + "grad_norm": 0.4458263983902489, + "learning_rate": 3.157266511147783e-06, + "loss": 0.5079, + "step": 1619 + }, + { + "epoch": 1.9744110479285135, + "grad_norm": 0.43613330489917596, + "learning_rate": 3.150672944955818e-06, + "loss": 0.5714, + "step": 1620 + }, + { + "epoch": 1.975629569455727, + "grad_norm": 0.3858901721024831, + "learning_rate": 3.1440831023221952e-06, + "loss": 0.5283, + "step": 1621 + }, + { + "epoch": 1.9768480909829407, + "grad_norm": 0.40043306380620164, + "learning_rate": 3.137496996515339e-06, + "loss": 0.5618, + "step": 1622 + }, + { + "epoch": 1.9780666125101543, + "grad_norm": 0.4155561403542389, + "learning_rate": 3.1309146407961565e-06, + "loss": 0.5793, + "step": 1623 + }, + { + "epoch": 1.9792851340373678, + "grad_norm": 0.48452491106537393, + "learning_rate": 3.1243360484180012e-06, + "loss": 0.5955, + "step": 1624 + }, + { + "epoch": 1.9805036555645816, + "grad_norm": 0.4052625054606517, + "learning_rate": 3.117761232626648e-06, + "loss": 0.5113, + "step": 1625 + }, + { + "epoch": 1.9817221770917954, + "grad_norm": 0.42003854375542504, + "learning_rate": 3.111190206660273e-06, + "loss": 0.5462, + "step": 1626 + }, + { + "epoch": 1.982940698619009, + "grad_norm": 0.425058799574285, + "learning_rate": 3.1046229837494123e-06, + "loss": 0.5244, + "step": 1627 + }, + { + "epoch": 1.9841592201462226, + "grad_norm": 0.4113830672023175, + "learning_rate": 3.0980595771169543e-06, + "loss": 0.5297, + "step": 1628 + }, + { + "epoch": 1.9853777416734362, + "grad_norm": 0.4015669403567964, + "learning_rate": 3.091499999978097e-06, + "loss": 0.5261, + "step": 1629 + }, + { + "epoch": 1.9865962632006497, + "grad_norm": 0.4283955622994288, + "learning_rate": 3.0849442655403315e-06, + "loss": 0.5755, + "step": 1630 + }, + { + "epoch": 1.9878147847278635, + "grad_norm": 0.41898536957171045, + "learning_rate": 3.0783923870034094e-06, + "loss": 0.5468, + "step": 1631 + }, + { + "epoch": 1.9890333062550771, + "grad_norm": 0.39218815049699407, + "learning_rate": 3.0718443775593233e-06, + "loss": 0.5094, + "step": 1632 + }, + { + "epoch": 1.990251827782291, + "grad_norm": 0.4279916922662056, + "learning_rate": 3.065300250392265e-06, + "loss": 0.5914, + "step": 1633 + }, + { + "epoch": 1.9914703493095045, + "grad_norm": 0.41267484908021385, + "learning_rate": 3.058760018678622e-06, + "loss": 0.5182, + "step": 1634 + }, + { + "epoch": 1.992688870836718, + "grad_norm": 0.44135796853573733, + "learning_rate": 3.0522236955869293e-06, + "loss": 0.5306, + "step": 1635 + }, + { + "epoch": 1.9939073923639317, + "grad_norm": 0.47924737111572846, + "learning_rate": 3.0456912942778585e-06, + "loss": 0.5286, + "step": 1636 + }, + { + "epoch": 1.9951259138911455, + "grad_norm": 0.42225974588467396, + "learning_rate": 3.0391628279041797e-06, + "loss": 0.5143, + "step": 1637 + }, + { + "epoch": 1.996344435418359, + "grad_norm": 0.443505986438843, + "learning_rate": 3.0326383096107424e-06, + "loss": 0.603, + "step": 1638 + }, + { + "epoch": 1.9975629569455728, + "grad_norm": 0.43300461200802, + "learning_rate": 3.0261177525344458e-06, + "loss": 0.529, + "step": 1639 + }, + { + "epoch": 1.9987814784727864, + "grad_norm": 0.44310730809971327, + "learning_rate": 3.019601169804216e-06, + "loss": 0.5712, + "step": 1640 + }, + { + "epoch": 2.0004061738424044, + "grad_norm": 0.9468576561635692, + "learning_rate": 3.0130885745409744e-06, + "loss": 0.9149, + "step": 1641 + }, + { + "epoch": 2.0016246953696184, + "grad_norm": 0.4706438333540491, + "learning_rate": 3.0065799798576146e-06, + "loss": 0.4931, + "step": 1642 + }, + { + "epoch": 2.002843216896832, + "grad_norm": 0.4720816823665595, + "learning_rate": 3.0000753988589717e-06, + "loss": 0.4837, + "step": 1643 + }, + { + "epoch": 2.0040617384240456, + "grad_norm": 0.47408797235071, + "learning_rate": 2.993574844641807e-06, + "loss": 0.4923, + "step": 1644 + }, + { + "epoch": 2.005280259951259, + "grad_norm": 0.4519771428113875, + "learning_rate": 2.987078330294767e-06, + "loss": 0.5211, + "step": 1645 + }, + { + "epoch": 2.0064987814784727, + "grad_norm": 0.519886504494184, + "learning_rate": 2.9805858688983656e-06, + "loss": 0.5746, + "step": 1646 + }, + { + "epoch": 2.0077173030056863, + "grad_norm": 0.42316406477644747, + "learning_rate": 2.9740974735249627e-06, + "loss": 0.4762, + "step": 1647 + }, + { + "epoch": 2.0089358245329, + "grad_norm": 0.4459084131070543, + "learning_rate": 2.96761315723872e-06, + "loss": 0.517, + "step": 1648 + }, + { + "epoch": 2.010154346060114, + "grad_norm": 0.4787475311586957, + "learning_rate": 2.961132933095595e-06, + "loss": 0.5475, + "step": 1649 + }, + { + "epoch": 2.0113728675873275, + "grad_norm": 0.44287904587306054, + "learning_rate": 2.9546568141433007e-06, + "loss": 0.513, + "step": 1650 + }, + { + "epoch": 2.012591389114541, + "grad_norm": 0.3984732881743886, + "learning_rate": 2.94818481342129e-06, + "loss": 0.5093, + "step": 1651 + }, + { + "epoch": 2.0138099106417546, + "grad_norm": 0.4399121723080827, + "learning_rate": 2.941716943960716e-06, + "loss": 0.511, + "step": 1652 + }, + { + "epoch": 2.015028432168968, + "grad_norm": 0.44831302014249225, + "learning_rate": 2.9352532187844254e-06, + "loss": 0.4984, + "step": 1653 + }, + { + "epoch": 2.016246953696182, + "grad_norm": 0.42996322667286735, + "learning_rate": 2.9287936509069036e-06, + "loss": 0.5191, + "step": 1654 + }, + { + "epoch": 2.017465475223396, + "grad_norm": 0.4183124555440696, + "learning_rate": 2.9223382533342825e-06, + "loss": 0.545, + "step": 1655 + }, + { + "epoch": 2.0186839967506094, + "grad_norm": 0.40446852524401855, + "learning_rate": 2.915887039064287e-06, + "loss": 0.503, + "step": 1656 + }, + { + "epoch": 2.019902518277823, + "grad_norm": 0.46166747184685086, + "learning_rate": 2.9094400210862206e-06, + "loss": 0.5397, + "step": 1657 + }, + { + "epoch": 2.0211210398050365, + "grad_norm": 0.44952433573058875, + "learning_rate": 2.9029972123809425e-06, + "loss": 0.5055, + "step": 1658 + }, + { + "epoch": 2.02233956133225, + "grad_norm": 0.4250306396302766, + "learning_rate": 2.8965586259208295e-06, + "loss": 0.521, + "step": 1659 + }, + { + "epoch": 2.0235580828594637, + "grad_norm": 0.4183203719527761, + "learning_rate": 2.890124274669764e-06, + "loss": 0.4974, + "step": 1660 + }, + { + "epoch": 2.0247766043866773, + "grad_norm": 0.4411815617611041, + "learning_rate": 2.8836941715830943e-06, + "loss": 0.5129, + "step": 1661 + }, + { + "epoch": 2.0259951259138913, + "grad_norm": 0.44795060381325624, + "learning_rate": 2.8772683296076197e-06, + "loss": 0.5142, + "step": 1662 + }, + { + "epoch": 2.027213647441105, + "grad_norm": 0.41579452280547835, + "learning_rate": 2.8708467616815606e-06, + "loss": 0.4951, + "step": 1663 + }, + { + "epoch": 2.0284321689683185, + "grad_norm": 0.4012708682220002, + "learning_rate": 2.864429480734529e-06, + "loss": 0.512, + "step": 1664 + }, + { + "epoch": 2.029650690495532, + "grad_norm": 0.4273458204316277, + "learning_rate": 2.858016499687503e-06, + "loss": 0.5401, + "step": 1665 + }, + { + "epoch": 2.0308692120227456, + "grad_norm": 0.45185500116453636, + "learning_rate": 2.8516078314528082e-06, + "loss": 0.4782, + "step": 1666 + }, + { + "epoch": 2.032087733549959, + "grad_norm": 0.49808257517442245, + "learning_rate": 2.8452034889340874e-06, + "loss": 0.5078, + "step": 1667 + }, + { + "epoch": 2.033306255077173, + "grad_norm": 0.4147857957964427, + "learning_rate": 2.838803485026265e-06, + "loss": 0.5092, + "step": 1668 + }, + { + "epoch": 2.034524776604387, + "grad_norm": 0.4270575824323386, + "learning_rate": 2.8324078326155403e-06, + "loss": 0.5239, + "step": 1669 + }, + { + "epoch": 2.0357432981316004, + "grad_norm": 0.4396795083275381, + "learning_rate": 2.8260165445793417e-06, + "loss": 0.5106, + "step": 1670 + }, + { + "epoch": 2.036961819658814, + "grad_norm": 0.42042014253641513, + "learning_rate": 2.819629633786319e-06, + "loss": 0.4699, + "step": 1671 + }, + { + "epoch": 2.0381803411860275, + "grad_norm": 0.4430726691393502, + "learning_rate": 2.8132471130962997e-06, + "loss": 0.4899, + "step": 1672 + }, + { + "epoch": 2.039398862713241, + "grad_norm": 0.40352990345385337, + "learning_rate": 2.806868995360278e-06, + "loss": 0.5271, + "step": 1673 + }, + { + "epoch": 2.0406173842404547, + "grad_norm": 0.4264717440525784, + "learning_rate": 2.800495293420384e-06, + "loss": 0.5358, + "step": 1674 + }, + { + "epoch": 2.0418359057676687, + "grad_norm": 0.406701314146554, + "learning_rate": 2.7941260201098513e-06, + "loss": 0.5347, + "step": 1675 + }, + { + "epoch": 2.0430544272948823, + "grad_norm": 0.3789120115073334, + "learning_rate": 2.7877611882529978e-06, + "loss": 0.5291, + "step": 1676 + }, + { + "epoch": 2.044272948822096, + "grad_norm": 0.376105306550124, + "learning_rate": 2.781400810665201e-06, + "loss": 0.4798, + "step": 1677 + }, + { + "epoch": 2.0454914703493094, + "grad_norm": 0.41689803288099314, + "learning_rate": 2.775044900152873e-06, + "loss": 0.5603, + "step": 1678 + }, + { + "epoch": 2.046709991876523, + "grad_norm": 0.39084573445190973, + "learning_rate": 2.7686934695134237e-06, + "loss": 0.5172, + "step": 1679 + }, + { + "epoch": 2.0479285134037366, + "grad_norm": 0.3983625559018197, + "learning_rate": 2.762346531535246e-06, + "loss": 0.5169, + "step": 1680 + }, + { + "epoch": 2.0491470349309506, + "grad_norm": 0.4214853676308127, + "learning_rate": 2.7560040989976894e-06, + "loss": 0.4956, + "step": 1681 + }, + { + "epoch": 2.050365556458164, + "grad_norm": 0.3999213184120299, + "learning_rate": 2.749666184671032e-06, + "loss": 0.4772, + "step": 1682 + }, + { + "epoch": 2.051584077985378, + "grad_norm": 0.47662956549783836, + "learning_rate": 2.7433328013164493e-06, + "loss": 0.5384, + "step": 1683 + }, + { + "epoch": 2.0528025995125914, + "grad_norm": 0.41183336935775333, + "learning_rate": 2.737003961686e-06, + "loss": 0.5383, + "step": 1684 + }, + { + "epoch": 2.054021121039805, + "grad_norm": 0.4162288645157988, + "learning_rate": 2.730679678522592e-06, + "loss": 0.4879, + "step": 1685 + }, + { + "epoch": 2.0552396425670185, + "grad_norm": 0.4208621537377079, + "learning_rate": 2.724359964559958e-06, + "loss": 0.5302, + "step": 1686 + }, + { + "epoch": 2.0564581640942325, + "grad_norm": 0.46940669999316137, + "learning_rate": 2.7180448325226283e-06, + "loss": 0.5038, + "step": 1687 + }, + { + "epoch": 2.057676685621446, + "grad_norm": 0.42207329602275184, + "learning_rate": 2.711734295125913e-06, + "loss": 0.5136, + "step": 1688 + }, + { + "epoch": 2.0588952071486597, + "grad_norm": 0.42923148362073005, + "learning_rate": 2.705428365075868e-06, + "loss": 0.4974, + "step": 1689 + }, + { + "epoch": 2.0601137286758733, + "grad_norm": 0.4486303504796547, + "learning_rate": 2.6991270550692794e-06, + "loss": 0.4896, + "step": 1690 + }, + { + "epoch": 2.061332250203087, + "grad_norm": 0.4450084773476215, + "learning_rate": 2.692830377793614e-06, + "loss": 0.5368, + "step": 1691 + }, + { + "epoch": 2.0625507717303004, + "grad_norm": 0.41459790491115805, + "learning_rate": 2.686538345927027e-06, + "loss": 0.5181, + "step": 1692 + }, + { + "epoch": 2.063769293257514, + "grad_norm": 0.39428684445951, + "learning_rate": 2.680250972138314e-06, + "loss": 0.5002, + "step": 1693 + }, + { + "epoch": 2.064987814784728, + "grad_norm": 0.46824872444922333, + "learning_rate": 2.6739682690868947e-06, + "loss": 0.5303, + "step": 1694 + }, + { + "epoch": 2.0662063363119416, + "grad_norm": 0.4328466707209096, + "learning_rate": 2.6676902494227795e-06, + "loss": 0.5603, + "step": 1695 + }, + { + "epoch": 2.067424857839155, + "grad_norm": 0.3839348433674553, + "learning_rate": 2.6614169257865513e-06, + "loss": 0.4682, + "step": 1696 + }, + { + "epoch": 2.0686433793663688, + "grad_norm": 0.4527195063930813, + "learning_rate": 2.6551483108093378e-06, + "loss": 0.5468, + "step": 1697 + }, + { + "epoch": 2.0698619008935824, + "grad_norm": 0.3864719481645061, + "learning_rate": 2.6488844171127903e-06, + "loss": 0.4596, + "step": 1698 + }, + { + "epoch": 2.071080422420796, + "grad_norm": 0.46426343352179134, + "learning_rate": 2.6426252573090437e-06, + "loss": 0.56, + "step": 1699 + }, + { + "epoch": 2.07229894394801, + "grad_norm": 0.4288286079073576, + "learning_rate": 2.6363708440007136e-06, + "loss": 0.5161, + "step": 1700 + }, + { + "epoch": 2.0735174654752235, + "grad_norm": 0.40637110346427036, + "learning_rate": 2.6301211897808463e-06, + "loss": 0.5389, + "step": 1701 + }, + { + "epoch": 2.074735987002437, + "grad_norm": 0.3951734512163483, + "learning_rate": 2.623876307232919e-06, + "loss": 0.526, + "step": 1702 + }, + { + "epoch": 2.0759545085296507, + "grad_norm": 0.3885989062888163, + "learning_rate": 2.6176362089307873e-06, + "loss": 0.4725, + "step": 1703 + }, + { + "epoch": 2.0771730300568643, + "grad_norm": 0.43823376795203267, + "learning_rate": 2.611400907438685e-06, + "loss": 0.5124, + "step": 1704 + }, + { + "epoch": 2.078391551584078, + "grad_norm": 0.39530623859105424, + "learning_rate": 2.6051704153111847e-06, + "loss": 0.4934, + "step": 1705 + }, + { + "epoch": 2.0796100731112914, + "grad_norm": 0.3703827995949618, + "learning_rate": 2.598944745093174e-06, + "loss": 0.477, + "step": 1706 + }, + { + "epoch": 2.0808285946385054, + "grad_norm": 0.399727045865617, + "learning_rate": 2.5927239093198273e-06, + "loss": 0.5887, + "step": 1707 + }, + { + "epoch": 2.082047116165719, + "grad_norm": 0.366776126773729, + "learning_rate": 2.5865079205165953e-06, + "loss": 0.4682, + "step": 1708 + }, + { + "epoch": 2.0832656376929326, + "grad_norm": 0.4208917158039958, + "learning_rate": 2.5802967911991637e-06, + "loss": 0.5203, + "step": 1709 + }, + { + "epoch": 2.084484159220146, + "grad_norm": 0.4493230756017366, + "learning_rate": 2.574090533873431e-06, + "loss": 0.5273, + "step": 1710 + }, + { + "epoch": 2.0857026807473598, + "grad_norm": 0.4464314211780269, + "learning_rate": 2.567889161035494e-06, + "loss": 0.589, + "step": 1711 + }, + { + "epoch": 2.0869212022745733, + "grad_norm": 0.37995030061127644, + "learning_rate": 2.5616926851716055e-06, + "loss": 0.4443, + "step": 1712 + }, + { + "epoch": 2.0881397238017874, + "grad_norm": 0.43366495148118606, + "learning_rate": 2.555501118758167e-06, + "loss": 0.5068, + "step": 1713 + }, + { + "epoch": 2.089358245329001, + "grad_norm": 0.43859783988060524, + "learning_rate": 2.549314474261686e-06, + "loss": 0.5061, + "step": 1714 + }, + { + "epoch": 2.0905767668562145, + "grad_norm": 0.41699646136345586, + "learning_rate": 2.5431327641387682e-06, + "loss": 0.5149, + "step": 1715 + }, + { + "epoch": 2.091795288383428, + "grad_norm": 0.4456011191053756, + "learning_rate": 2.5369560008360826e-06, + "loss": 0.521, + "step": 1716 + }, + { + "epoch": 2.0930138099106417, + "grad_norm": 0.3959554858313093, + "learning_rate": 2.5307841967903337e-06, + "loss": 0.5048, + "step": 1717 + }, + { + "epoch": 2.0942323314378553, + "grad_norm": 0.41396649662012225, + "learning_rate": 2.52461736442824e-06, + "loss": 0.5162, + "step": 1718 + }, + { + "epoch": 2.095450852965069, + "grad_norm": 0.42258000014128283, + "learning_rate": 2.518455516166517e-06, + "loss": 0.5517, + "step": 1719 + }, + { + "epoch": 2.096669374492283, + "grad_norm": 0.39450373632220187, + "learning_rate": 2.512298664411841e-06, + "loss": 0.4964, + "step": 1720 + }, + { + "epoch": 2.0978878960194964, + "grad_norm": 0.3723209543555369, + "learning_rate": 2.5061468215608243e-06, + "loss": 0.5218, + "step": 1721 + }, + { + "epoch": 2.09910641754671, + "grad_norm": 0.41901342453157836, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.5245, + "step": 1722 + }, + { + "epoch": 2.1003249390739236, + "grad_norm": 0.40321870380750857, + "learning_rate": 2.493858212105788e-06, + "loss": 0.5008, + "step": 1723 + }, + { + "epoch": 2.101543460601137, + "grad_norm": 0.41094871301137714, + "learning_rate": 2.487721470244473e-06, + "loss": 0.5255, + "step": 1724 + }, + { + "epoch": 2.1027619821283507, + "grad_norm": 0.38906496669749396, + "learning_rate": 2.481589786772178e-06, + "loss": 0.5077, + "step": 1725 + }, + { + "epoch": 2.1039805036555648, + "grad_norm": 0.4041745743019385, + "learning_rate": 2.4754631740348455e-06, + "loss": 0.5387, + "step": 1726 + }, + { + "epoch": 2.1051990251827783, + "grad_norm": 0.393377125185753, + "learning_rate": 2.4693416443682074e-06, + "loss": 0.5206, + "step": 1727 + }, + { + "epoch": 2.106417546709992, + "grad_norm": 0.43228252852381843, + "learning_rate": 2.4632252100977567e-06, + "loss": 0.5457, + "step": 1728 + }, + { + "epoch": 2.1076360682372055, + "grad_norm": 0.3665069578741783, + "learning_rate": 2.4571138835387293e-06, + "loss": 0.4513, + "step": 1729 + }, + { + "epoch": 2.108854589764419, + "grad_norm": 0.40575471379817674, + "learning_rate": 2.4510076769960784e-06, + "loss": 0.486, + "step": 1730 + }, + { + "epoch": 2.1100731112916327, + "grad_norm": 0.43487942611154445, + "learning_rate": 2.4449066027644473e-06, + "loss": 0.542, + "step": 1731 + }, + { + "epoch": 2.1112916328188467, + "grad_norm": 0.41618547734069145, + "learning_rate": 2.4388106731281496e-06, + "loss": 0.5405, + "step": 1732 + }, + { + "epoch": 2.1125101543460603, + "grad_norm": 0.37250571753343925, + "learning_rate": 2.4327199003611285e-06, + "loss": 0.5298, + "step": 1733 + }, + { + "epoch": 2.113728675873274, + "grad_norm": 0.38311544589645186, + "learning_rate": 2.426634296726955e-06, + "loss": 0.4806, + "step": 1734 + }, + { + "epoch": 2.1149471974004874, + "grad_norm": 0.41137106387325834, + "learning_rate": 2.4205538744787904e-06, + "loss": 0.5201, + "step": 1735 + }, + { + "epoch": 2.116165718927701, + "grad_norm": 0.39341688405639397, + "learning_rate": 2.4144786458593635e-06, + "loss": 0.4973, + "step": 1736 + }, + { + "epoch": 2.1173842404549146, + "grad_norm": 0.4261520463046671, + "learning_rate": 2.40840862310094e-06, + "loss": 0.5574, + "step": 1737 + }, + { + "epoch": 2.118602761982128, + "grad_norm": 0.4270827914830235, + "learning_rate": 2.4023438184253115e-06, + "loss": 0.5011, + "step": 1738 + }, + { + "epoch": 2.119821283509342, + "grad_norm": 0.37282235236530675, + "learning_rate": 2.3962842440437584e-06, + "loss": 0.4675, + "step": 1739 + }, + { + "epoch": 2.1210398050365558, + "grad_norm": 0.44674425439539434, + "learning_rate": 2.3902299121570332e-06, + "loss": 0.5741, + "step": 1740 + }, + { + "epoch": 2.1222583265637693, + "grad_norm": 0.41462268384604345, + "learning_rate": 2.384180834955329e-06, + "loss": 0.4876, + "step": 1741 + }, + { + "epoch": 2.123476848090983, + "grad_norm": 0.4534841107739354, + "learning_rate": 2.378137024618262e-06, + "loss": 0.5135, + "step": 1742 + }, + { + "epoch": 2.1246953696181965, + "grad_norm": 0.3978485819501479, + "learning_rate": 2.3720984933148443e-06, + "loss": 0.5208, + "step": 1743 + }, + { + "epoch": 2.12591389114541, + "grad_norm": 0.37184067874428883, + "learning_rate": 2.366065253203456e-06, + "loss": 0.5007, + "step": 1744 + }, + { + "epoch": 2.1271324126726237, + "grad_norm": 0.4276632980042562, + "learning_rate": 2.360037316431823e-06, + "loss": 0.5317, + "step": 1745 + }, + { + "epoch": 2.1283509341998377, + "grad_norm": 0.4617864367024641, + "learning_rate": 2.354014695136997e-06, + "loss": 0.5064, + "step": 1746 + }, + { + "epoch": 2.1295694557270513, + "grad_norm": 0.3859544135248513, + "learning_rate": 2.3479974014453255e-06, + "loss": 0.4865, + "step": 1747 + }, + { + "epoch": 2.130787977254265, + "grad_norm": 0.39298854898729213, + "learning_rate": 2.3419854474724284e-06, + "loss": 0.5399, + "step": 1748 + }, + { + "epoch": 2.1320064987814784, + "grad_norm": 0.4537583444692293, + "learning_rate": 2.3359788453231723e-06, + "loss": 0.5134, + "step": 1749 + }, + { + "epoch": 2.133225020308692, + "grad_norm": 0.4143013070479495, + "learning_rate": 2.329977607091652e-06, + "loss": 0.5128, + "step": 1750 + }, + { + "epoch": 2.1344435418359056, + "grad_norm": 0.37360321916937234, + "learning_rate": 2.323981744861162e-06, + "loss": 0.5181, + "step": 1751 + }, + { + "epoch": 2.1356620633631196, + "grad_norm": 0.4044550908338376, + "learning_rate": 2.317991270704167e-06, + "loss": 0.5197, + "step": 1752 + }, + { + "epoch": 2.136880584890333, + "grad_norm": 0.4067716384869161, + "learning_rate": 2.3120061966822915e-06, + "loss": 0.4899, + "step": 1753 + }, + { + "epoch": 2.1380991064175467, + "grad_norm": 0.4141293721178242, + "learning_rate": 2.3060265348462777e-06, + "loss": 0.5499, + "step": 1754 + }, + { + "epoch": 2.1393176279447603, + "grad_norm": 0.4005349519648887, + "learning_rate": 2.3000522972359803e-06, + "loss": 0.5395, + "step": 1755 + }, + { + "epoch": 2.140536149471974, + "grad_norm": 0.39248292555402464, + "learning_rate": 2.2940834958803228e-06, + "loss": 0.4931, + "step": 1756 + }, + { + "epoch": 2.1417546709991875, + "grad_norm": 0.38822332167948, + "learning_rate": 2.2881201427972894e-06, + "loss": 0.4722, + "step": 1757 + }, + { + "epoch": 2.1429731925264015, + "grad_norm": 0.38077200885472606, + "learning_rate": 2.282162249993895e-06, + "loss": 0.5326, + "step": 1758 + }, + { + "epoch": 2.144191714053615, + "grad_norm": 0.38524212575031547, + "learning_rate": 2.2762098294661556e-06, + "loss": 0.5109, + "step": 1759 + }, + { + "epoch": 2.1454102355808287, + "grad_norm": 0.40347299946589055, + "learning_rate": 2.27026289319907e-06, + "loss": 0.5579, + "step": 1760 + }, + { + "epoch": 2.1466287571080422, + "grad_norm": 0.3924233257276901, + "learning_rate": 2.264321453166598e-06, + "loss": 0.5165, + "step": 1761 + }, + { + "epoch": 2.147847278635256, + "grad_norm": 0.37575656828172116, + "learning_rate": 2.2583855213316326e-06, + "loss": 0.4895, + "step": 1762 + }, + { + "epoch": 2.1490658001624694, + "grad_norm": 0.3937169968667044, + "learning_rate": 2.2524551096459703e-06, + "loss": 0.53, + "step": 1763 + }, + { + "epoch": 2.150284321689683, + "grad_norm": 0.39457712101518455, + "learning_rate": 2.2465302300503012e-06, + "loss": 0.4689, + "step": 1764 + }, + { + "epoch": 2.151502843216897, + "grad_norm": 0.4393918889033505, + "learning_rate": 2.2406108944741696e-06, + "loss": 0.5178, + "step": 1765 + }, + { + "epoch": 2.1527213647441106, + "grad_norm": 0.4147988916944454, + "learning_rate": 2.234697114835963e-06, + "loss": 0.5385, + "step": 1766 + }, + { + "epoch": 2.153939886271324, + "grad_norm": 0.40920290551320604, + "learning_rate": 2.228788903042877e-06, + "loss": 0.5229, + "step": 1767 + }, + { + "epoch": 2.1551584077985377, + "grad_norm": 0.3841912028808192, + "learning_rate": 2.2228862709909e-06, + "loss": 0.4859, + "step": 1768 + }, + { + "epoch": 2.1563769293257513, + "grad_norm": 0.4017528120034774, + "learning_rate": 2.2169892305647865e-06, + "loss": 0.5067, + "step": 1769 + }, + { + "epoch": 2.157595450852965, + "grad_norm": 0.4106356542533839, + "learning_rate": 2.211097793638029e-06, + "loss": 0.511, + "step": 1770 + }, + { + "epoch": 2.158813972380179, + "grad_norm": 0.3922497877364856, + "learning_rate": 2.2052119720728375e-06, + "loss": 0.5213, + "step": 1771 + }, + { + "epoch": 2.1600324939073925, + "grad_norm": 0.41142355231120237, + "learning_rate": 2.1993317777201197e-06, + "loss": 0.55, + "step": 1772 + }, + { + "epoch": 2.161251015434606, + "grad_norm": 0.3793501040249683, + "learning_rate": 2.19345722241945e-06, + "loss": 0.4942, + "step": 1773 + }, + { + "epoch": 2.1624695369618196, + "grad_norm": 0.4181558266950597, + "learning_rate": 2.1875883179990515e-06, + "loss": 0.5179, + "step": 1774 + }, + { + "epoch": 2.1636880584890332, + "grad_norm": 0.4101546221225696, + "learning_rate": 2.1817250762757657e-06, + "loss": 0.4854, + "step": 1775 + }, + { + "epoch": 2.164906580016247, + "grad_norm": 0.40370317236247594, + "learning_rate": 2.175867509055033e-06, + "loss": 0.5675, + "step": 1776 + }, + { + "epoch": 2.166125101543461, + "grad_norm": 0.34161977537337374, + "learning_rate": 2.170015628130871e-06, + "loss": 0.4693, + "step": 1777 + }, + { + "epoch": 2.1673436230706744, + "grad_norm": 0.3929226138427561, + "learning_rate": 2.1641694452858486e-06, + "loss": 0.4932, + "step": 1778 + }, + { + "epoch": 2.168562144597888, + "grad_norm": 0.414139184233712, + "learning_rate": 2.158328972291056e-06, + "loss": 0.5428, + "step": 1779 + }, + { + "epoch": 2.1697806661251016, + "grad_norm": 0.4021702827253732, + "learning_rate": 2.1524942209060944e-06, + "loss": 0.553, + "step": 1780 + }, + { + "epoch": 2.170999187652315, + "grad_norm": 0.3914173100634304, + "learning_rate": 2.1466652028790384e-06, + "loss": 0.4846, + "step": 1781 + }, + { + "epoch": 2.1722177091795287, + "grad_norm": 0.4155952702393289, + "learning_rate": 2.1408419299464245e-06, + "loss": 0.5062, + "step": 1782 + }, + { + "epoch": 2.1734362307067423, + "grad_norm": 0.4029405381679447, + "learning_rate": 2.1350244138332143e-06, + "loss": 0.5543, + "step": 1783 + }, + { + "epoch": 2.1746547522339563, + "grad_norm": 0.3847467238608974, + "learning_rate": 2.1292126662527846e-06, + "loss": 0.4783, + "step": 1784 + }, + { + "epoch": 2.17587327376117, + "grad_norm": 0.3774561138941112, + "learning_rate": 2.1234066989068972e-06, + "loss": 0.5736, + "step": 1785 + }, + { + "epoch": 2.1770917952883835, + "grad_norm": 0.3791792062977483, + "learning_rate": 2.1176065234856725e-06, + "loss": 0.4782, + "step": 1786 + }, + { + "epoch": 2.178310316815597, + "grad_norm": 0.40849575227250023, + "learning_rate": 2.111812151667567e-06, + "loss": 0.498, + "step": 1787 + }, + { + "epoch": 2.1795288383428106, + "grad_norm": 0.36289578272484346, + "learning_rate": 2.106023595119358e-06, + "loss": 0.4866, + "step": 1788 + }, + { + "epoch": 2.180747359870024, + "grad_norm": 0.3869017558290611, + "learning_rate": 2.1002408654961124e-06, + "loss": 0.4643, + "step": 1789 + }, + { + "epoch": 2.181965881397238, + "grad_norm": 0.4954162333959639, + "learning_rate": 2.0944639744411627e-06, + "loss": 0.5415, + "step": 1790 + }, + { + "epoch": 2.183184402924452, + "grad_norm": 0.42167227063607843, + "learning_rate": 2.088692933586083e-06, + "loss": 0.5359, + "step": 1791 + }, + { + "epoch": 2.1844029244516654, + "grad_norm": 0.38236452945033045, + "learning_rate": 2.0829277545506736e-06, + "loss": 0.4971, + "step": 1792 + }, + { + "epoch": 2.185621445978879, + "grad_norm": 0.4285834277650781, + "learning_rate": 2.077168448942933e-06, + "loss": 0.5475, + "step": 1793 + }, + { + "epoch": 2.1868399675060926, + "grad_norm": 0.4023920796181869, + "learning_rate": 2.071415028359026e-06, + "loss": 0.4797, + "step": 1794 + }, + { + "epoch": 2.188058489033306, + "grad_norm": 0.4074540063011702, + "learning_rate": 2.065667504383276e-06, + "loss": 0.5254, + "step": 1795 + }, + { + "epoch": 2.1892770105605197, + "grad_norm": 0.39510353720495955, + "learning_rate": 2.0599258885881317e-06, + "loss": 0.4899, + "step": 1796 + }, + { + "epoch": 2.1904955320877337, + "grad_norm": 0.5548900318530214, + "learning_rate": 2.0541901925341446e-06, + "loss": 0.5198, + "step": 1797 + }, + { + "epoch": 2.1917140536149473, + "grad_norm": 0.3825302397550243, + "learning_rate": 2.0484604277699437e-06, + "loss": 0.5098, + "step": 1798 + }, + { + "epoch": 2.192932575142161, + "grad_norm": 0.40564085390755233, + "learning_rate": 2.042736605832222e-06, + "loss": 0.5323, + "step": 1799 + }, + { + "epoch": 2.1941510966693745, + "grad_norm": 0.39704056600422283, + "learning_rate": 2.037018738245707e-06, + "loss": 0.5108, + "step": 1800 + }, + { + "epoch": 2.195369618196588, + "grad_norm": 0.41600542688505693, + "learning_rate": 2.0313068365231303e-06, + "loss": 0.4978, + "step": 1801 + }, + { + "epoch": 2.1965881397238016, + "grad_norm": 0.39387757095797193, + "learning_rate": 2.0256009121652147e-06, + "loss": 0.5273, + "step": 1802 + }, + { + "epoch": 2.1978066612510156, + "grad_norm": 0.36242925936592724, + "learning_rate": 2.019900976660651e-06, + "loss": 0.4982, + "step": 1803 + }, + { + "epoch": 2.1990251827782292, + "grad_norm": 0.35509245865119315, + "learning_rate": 2.0142070414860704e-06, + "loss": 0.4878, + "step": 1804 + }, + { + "epoch": 2.200243704305443, + "grad_norm": 0.38952269804673, + "learning_rate": 2.0085191181060176e-06, + "loss": 0.5369, + "step": 1805 + }, + { + "epoch": 2.2014622258326564, + "grad_norm": 0.3866235545814812, + "learning_rate": 2.0028372179729405e-06, + "loss": 0.4802, + "step": 1806 + }, + { + "epoch": 2.20268074735987, + "grad_norm": 0.3922622961361272, + "learning_rate": 1.9971613525271523e-06, + "loss": 0.5284, + "step": 1807 + }, + { + "epoch": 2.2038992688870835, + "grad_norm": 0.38718886845940065, + "learning_rate": 1.9914915331968217e-06, + "loss": 0.4846, + "step": 1808 + }, + { + "epoch": 2.205117790414297, + "grad_norm": 0.39994907283167946, + "learning_rate": 1.985827771397938e-06, + "loss": 0.5433, + "step": 1809 + }, + { + "epoch": 2.206336311941511, + "grad_norm": 0.37905307147188894, + "learning_rate": 1.980170078534297e-06, + "loss": 0.5145, + "step": 1810 + }, + { + "epoch": 2.2075548334687247, + "grad_norm": 0.4192247244192786, + "learning_rate": 1.9745184659974764e-06, + "loss": 0.5118, + "step": 1811 + }, + { + "epoch": 2.2087733549959383, + "grad_norm": 0.3522566177407128, + "learning_rate": 1.9688729451668116e-06, + "loss": 0.4751, + "step": 1812 + }, + { + "epoch": 2.209991876523152, + "grad_norm": 0.3772830661699162, + "learning_rate": 1.9632335274093645e-06, + "loss": 0.4859, + "step": 1813 + }, + { + "epoch": 2.2112103980503655, + "grad_norm": 0.41564321254010056, + "learning_rate": 1.957600224079917e-06, + "loss": 0.5474, + "step": 1814 + }, + { + "epoch": 2.212428919577579, + "grad_norm": 0.40463784380961515, + "learning_rate": 1.9519730465209384e-06, + "loss": 0.5135, + "step": 1815 + }, + { + "epoch": 2.213647441104793, + "grad_norm": 0.42048597280314337, + "learning_rate": 1.9463520060625647e-06, + "loss": 0.51, + "step": 1816 + }, + { + "epoch": 2.2148659626320066, + "grad_norm": 0.4057972852875916, + "learning_rate": 1.940737114022572e-06, + "loss": 0.5291, + "step": 1817 + }, + { + "epoch": 2.21608448415922, + "grad_norm": 0.3860966909234876, + "learning_rate": 1.935128381706355e-06, + "loss": 0.4638, + "step": 1818 + }, + { + "epoch": 2.217303005686434, + "grad_norm": 0.3992185398314597, + "learning_rate": 1.9295258204069116e-06, + "loss": 0.4846, + "step": 1819 + }, + { + "epoch": 2.2185215272136474, + "grad_norm": 0.4414225404994573, + "learning_rate": 1.9239294414048143e-06, + "loss": 0.5729, + "step": 1820 + }, + { + "epoch": 2.219740048740861, + "grad_norm": 0.38820997327095225, + "learning_rate": 1.9183392559681812e-06, + "loss": 0.4883, + "step": 1821 + }, + { + "epoch": 2.2209585702680745, + "grad_norm": 0.3830961932249294, + "learning_rate": 1.9127552753526683e-06, + "loss": 0.4959, + "step": 1822 + }, + { + "epoch": 2.2221770917952886, + "grad_norm": 0.40371127197935186, + "learning_rate": 1.907177510801431e-06, + "loss": 0.5322, + "step": 1823 + }, + { + "epoch": 2.223395613322502, + "grad_norm": 0.4422584980389884, + "learning_rate": 1.901605973545116e-06, + "loss": 0.544, + "step": 1824 + }, + { + "epoch": 2.2246141348497157, + "grad_norm": 0.3671460120788879, + "learning_rate": 1.8960406748018229e-06, + "loss": 0.447, + "step": 1825 + }, + { + "epoch": 2.2258326563769293, + "grad_norm": 0.41787679838261416, + "learning_rate": 1.8904816257770976e-06, + "loss": 0.4837, + "step": 1826 + }, + { + "epoch": 2.227051177904143, + "grad_norm": 0.40739753945636065, + "learning_rate": 1.884928837663902e-06, + "loss": 0.5215, + "step": 1827 + }, + { + "epoch": 2.2282696994313564, + "grad_norm": 0.418750898184457, + "learning_rate": 1.8793823216425872e-06, + "loss": 0.5042, + "step": 1828 + }, + { + "epoch": 2.2294882209585705, + "grad_norm": 0.42352381285835144, + "learning_rate": 1.8738420888808767e-06, + "loss": 0.5266, + "step": 1829 + }, + { + "epoch": 2.230706742485784, + "grad_norm": 0.3766712142861196, + "learning_rate": 1.8683081505338468e-06, + "loss": 0.4898, + "step": 1830 + }, + { + "epoch": 2.2319252640129976, + "grad_norm": 0.3646915065525184, + "learning_rate": 1.8627805177438984e-06, + "loss": 0.5102, + "step": 1831 + }, + { + "epoch": 2.233143785540211, + "grad_norm": 0.3728256406154505, + "learning_rate": 1.8572592016407337e-06, + "loss": 0.5124, + "step": 1832 + }, + { + "epoch": 2.234362307067425, + "grad_norm": 0.4171588204608096, + "learning_rate": 1.8517442133413405e-06, + "loss": 0.543, + "step": 1833 + }, + { + "epoch": 2.2355808285946384, + "grad_norm": 0.38601375413922817, + "learning_rate": 1.8462355639499614e-06, + "loss": 0.4802, + "step": 1834 + }, + { + "epoch": 2.236799350121852, + "grad_norm": 0.41191751363191975, + "learning_rate": 1.8407332645580805e-06, + "loss": 0.498, + "step": 1835 + }, + { + "epoch": 2.238017871649066, + "grad_norm": 0.36858808021561745, + "learning_rate": 1.8352373262443918e-06, + "loss": 0.5308, + "step": 1836 + }, + { + "epoch": 2.2392363931762795, + "grad_norm": 0.39319359572416623, + "learning_rate": 1.8297477600747854e-06, + "loss": 0.5147, + "step": 1837 + }, + { + "epoch": 2.240454914703493, + "grad_norm": 0.4146287760930502, + "learning_rate": 1.8242645771023205e-06, + "loss": 0.4951, + "step": 1838 + }, + { + "epoch": 2.2416734362307067, + "grad_norm": 0.42074375031523503, + "learning_rate": 1.8187877883672024e-06, + "loss": 0.5238, + "step": 1839 + }, + { + "epoch": 2.2428919577579203, + "grad_norm": 0.3933564686288827, + "learning_rate": 1.81331740489676e-06, + "loss": 0.5319, + "step": 1840 + }, + { + "epoch": 2.244110479285134, + "grad_norm": 0.39931640514560685, + "learning_rate": 1.8078534377054303e-06, + "loss": 0.4921, + "step": 1841 + }, + { + "epoch": 2.245329000812348, + "grad_norm": 0.43198490048399485, + "learning_rate": 1.8023958977947303e-06, + "loss": 0.55, + "step": 1842 + }, + { + "epoch": 2.2465475223395615, + "grad_norm": 0.4043002892149948, + "learning_rate": 1.7969447961532333e-06, + "loss": 0.4992, + "step": 1843 + }, + { + "epoch": 2.247766043866775, + "grad_norm": 0.4118718760088592, + "learning_rate": 1.7915001437565481e-06, + "loss": 0.4981, + "step": 1844 + }, + { + "epoch": 2.2489845653939886, + "grad_norm": 0.4099852822242898, + "learning_rate": 1.7860619515673034e-06, + "loss": 0.5036, + "step": 1845 + }, + { + "epoch": 2.250203086921202, + "grad_norm": 0.4061790060023943, + "learning_rate": 1.7806302305351191e-06, + "loss": 0.518, + "step": 1846 + }, + { + "epoch": 2.2514216084484158, + "grad_norm": 0.38920893319388733, + "learning_rate": 1.7752049915965807e-06, + "loss": 0.5347, + "step": 1847 + }, + { + "epoch": 2.25264012997563, + "grad_norm": 0.378445552638885, + "learning_rate": 1.7697862456752273e-06, + "loss": 0.4489, + "step": 1848 + }, + { + "epoch": 2.2538586515028434, + "grad_norm": 0.43624250871726583, + "learning_rate": 1.764374003681526e-06, + "loss": 0.5076, + "step": 1849 + }, + { + "epoch": 2.255077173030057, + "grad_norm": 0.4301972293121319, + "learning_rate": 1.7589682765128424e-06, + "loss": 0.5106, + "step": 1850 + }, + { + "epoch": 2.2562956945572705, + "grad_norm": 0.4206525298070121, + "learning_rate": 1.7535690750534268e-06, + "loss": 0.5224, + "step": 1851 + }, + { + "epoch": 2.257514216084484, + "grad_norm": 0.3846425486629517, + "learning_rate": 1.7481764101743925e-06, + "loss": 0.4962, + "step": 1852 + }, + { + "epoch": 2.2587327376116977, + "grad_norm": 0.3994656170665921, + "learning_rate": 1.7427902927336932e-06, + "loss": 0.5142, + "step": 1853 + }, + { + "epoch": 2.2599512591389113, + "grad_norm": 0.4156779226062884, + "learning_rate": 1.7374107335760937e-06, + "loss": 0.5224, + "step": 1854 + }, + { + "epoch": 2.2611697806661253, + "grad_norm": 0.44546616602212363, + "learning_rate": 1.732037743533156e-06, + "loss": 0.49, + "step": 1855 + }, + { + "epoch": 2.262388302193339, + "grad_norm": 0.41287367815029863, + "learning_rate": 1.7266713334232177e-06, + "loss": 0.5125, + "step": 1856 + }, + { + "epoch": 2.2636068237205524, + "grad_norm": 0.4171809094817276, + "learning_rate": 1.7213115140513687e-06, + "loss": 0.4866, + "step": 1857 + }, + { + "epoch": 2.264825345247766, + "grad_norm": 0.4055921589949916, + "learning_rate": 1.7159582962094224e-06, + "loss": 0.5221, + "step": 1858 + }, + { + "epoch": 2.2660438667749796, + "grad_norm": 0.37574794466013317, + "learning_rate": 1.710611690675908e-06, + "loss": 0.5475, + "step": 1859 + }, + { + "epoch": 2.267262388302193, + "grad_norm": 0.3923703686975919, + "learning_rate": 1.7052717082160348e-06, + "loss": 0.502, + "step": 1860 + }, + { + "epoch": 2.2684809098294068, + "grad_norm": 0.4104360688359129, + "learning_rate": 1.6999383595816816e-06, + "loss": 0.4915, + "step": 1861 + }, + { + "epoch": 2.2696994313566208, + "grad_norm": 0.42448276721497596, + "learning_rate": 1.694611655511365e-06, + "loss": 0.5187, + "step": 1862 + }, + { + "epoch": 2.2709179528838344, + "grad_norm": 0.4182208678556554, + "learning_rate": 1.6892916067302279e-06, + "loss": 0.5431, + "step": 1863 + }, + { + "epoch": 2.272136474411048, + "grad_norm": 0.3809524496490651, + "learning_rate": 1.6839782239500114e-06, + "loss": 0.4962, + "step": 1864 + }, + { + "epoch": 2.2733549959382615, + "grad_norm": 0.4044653821113984, + "learning_rate": 1.6786715178690372e-06, + "loss": 0.5455, + "step": 1865 + }, + { + "epoch": 2.274573517465475, + "grad_norm": 0.4017956989739632, + "learning_rate": 1.6733714991721738e-06, + "loss": 0.5124, + "step": 1866 + }, + { + "epoch": 2.275792038992689, + "grad_norm": 0.3948697674235281, + "learning_rate": 1.668078178530837e-06, + "loss": 0.5121, + "step": 1867 + }, + { + "epoch": 2.2770105605199027, + "grad_norm": 0.406458521824727, + "learning_rate": 1.6627915666029503e-06, + "loss": 0.5111, + "step": 1868 + }, + { + "epoch": 2.2782290820471163, + "grad_norm": 0.3778951735019038, + "learning_rate": 1.6575116740329316e-06, + "loss": 0.4983, + "step": 1869 + }, + { + "epoch": 2.27944760357433, + "grad_norm": 0.3577602501518659, + "learning_rate": 1.6522385114516681e-06, + "loss": 0.4748, + "step": 1870 + }, + { + "epoch": 2.2806661251015434, + "grad_norm": 0.39147597379268634, + "learning_rate": 1.6469720894764945e-06, + "loss": 0.5167, + "step": 1871 + }, + { + "epoch": 2.281884646628757, + "grad_norm": 0.41526652630800426, + "learning_rate": 1.6417124187111778e-06, + "loss": 0.4856, + "step": 1872 + }, + { + "epoch": 2.2831031681559706, + "grad_norm": 0.46476506582341715, + "learning_rate": 1.6364595097458901e-06, + "loss": 0.5541, + "step": 1873 + }, + { + "epoch": 2.2843216896831846, + "grad_norm": 0.4413380147809054, + "learning_rate": 1.6312133731571867e-06, + "loss": 0.5681, + "step": 1874 + }, + { + "epoch": 2.285540211210398, + "grad_norm": 0.41316580395580427, + "learning_rate": 1.6259740195079903e-06, + "loss": 0.4902, + "step": 1875 + }, + { + "epoch": 2.2867587327376118, + "grad_norm": 0.375223833084769, + "learning_rate": 1.6207414593475634e-06, + "loss": 0.5059, + "step": 1876 + }, + { + "epoch": 2.2879772542648253, + "grad_norm": 0.4191217491734361, + "learning_rate": 1.6155157032114926e-06, + "loss": 0.4903, + "step": 1877 + }, + { + "epoch": 2.289195775792039, + "grad_norm": 0.4104399365050538, + "learning_rate": 1.610296761621662e-06, + "loss": 0.4978, + "step": 1878 + }, + { + "epoch": 2.2904142973192525, + "grad_norm": 0.452132557586679, + "learning_rate": 1.6050846450862368e-06, + "loss": 0.5529, + "step": 1879 + }, + { + "epoch": 2.291632818846466, + "grad_norm": 0.38189023256593707, + "learning_rate": 1.5998793640996418e-06, + "loss": 0.4534, + "step": 1880 + }, + { + "epoch": 2.29285134037368, + "grad_norm": 0.4105896104811722, + "learning_rate": 1.5946809291425352e-06, + "loss": 0.5157, + "step": 1881 + }, + { + "epoch": 2.2940698619008937, + "grad_norm": 0.39415858749113886, + "learning_rate": 1.589489350681791e-06, + "loss": 0.504, + "step": 1882 + }, + { + "epoch": 2.2952883834281073, + "grad_norm": 0.35980183287414685, + "learning_rate": 1.5843046391704802e-06, + "loss": 0.5077, + "step": 1883 + }, + { + "epoch": 2.296506904955321, + "grad_norm": 0.38396469649464077, + "learning_rate": 1.5791268050478487e-06, + "loss": 0.5051, + "step": 1884 + }, + { + "epoch": 2.2977254264825344, + "grad_norm": 0.3821275005707828, + "learning_rate": 1.573955858739289e-06, + "loss": 0.5345, + "step": 1885 + }, + { + "epoch": 2.298943948009748, + "grad_norm": 0.39018524126022086, + "learning_rate": 1.5687918106563326e-06, + "loss": 0.4713, + "step": 1886 + }, + { + "epoch": 2.3001624695369616, + "grad_norm": 0.4269492782915114, + "learning_rate": 1.5636346711966154e-06, + "loss": 0.5396, + "step": 1887 + }, + { + "epoch": 2.3013809910641756, + "grad_norm": 0.40024049549443624, + "learning_rate": 1.5584844507438678e-06, + "loss": 0.5119, + "step": 1888 + }, + { + "epoch": 2.302599512591389, + "grad_norm": 0.3769373466661974, + "learning_rate": 1.5533411596678843e-06, + "loss": 0.4858, + "step": 1889 + }, + { + "epoch": 2.3038180341186028, + "grad_norm": 0.40957300518215145, + "learning_rate": 1.5482048083245116e-06, + "loss": 0.5299, + "step": 1890 + }, + { + "epoch": 2.3050365556458163, + "grad_norm": 0.3887283184505036, + "learning_rate": 1.543075407055623e-06, + "loss": 0.5276, + "step": 1891 + }, + { + "epoch": 2.30625507717303, + "grad_norm": 0.3944358213197462, + "learning_rate": 1.5379529661890956e-06, + "loss": 0.512, + "step": 1892 + }, + { + "epoch": 2.307473598700244, + "grad_norm": 0.38335955851758596, + "learning_rate": 1.532837496038792e-06, + "loss": 0.4802, + "step": 1893 + }, + { + "epoch": 2.3086921202274575, + "grad_norm": 0.40294432984687933, + "learning_rate": 1.5277290069045414e-06, + "loss": 0.5171, + "step": 1894 + }, + { + "epoch": 2.309910641754671, + "grad_norm": 0.39774386298303005, + "learning_rate": 1.5226275090721183e-06, + "loss": 0.4993, + "step": 1895 + }, + { + "epoch": 2.3111291632818847, + "grad_norm": 0.4200593782198228, + "learning_rate": 1.517533012813217e-06, + "loss": 0.5606, + "step": 1896 + }, + { + "epoch": 2.3123476848090982, + "grad_norm": 0.4204431416124692, + "learning_rate": 1.512445528385434e-06, + "loss": 0.5538, + "step": 1897 + }, + { + "epoch": 2.313566206336312, + "grad_norm": 0.33671871586550156, + "learning_rate": 1.5073650660322509e-06, + "loss": 0.4575, + "step": 1898 + }, + { + "epoch": 2.3147847278635254, + "grad_norm": 0.42276990891040056, + "learning_rate": 1.5022916359830114e-06, + "loss": 0.5744, + "step": 1899 + }, + { + "epoch": 2.3160032493907394, + "grad_norm": 0.35916851603197664, + "learning_rate": 1.4972252484528938e-06, + "loss": 0.4721, + "step": 1900 + }, + { + "epoch": 2.317221770917953, + "grad_norm": 0.40308654798116644, + "learning_rate": 1.4921659136429022e-06, + "loss": 0.5283, + "step": 1901 + }, + { + "epoch": 2.3184402924451666, + "grad_norm": 0.36536184604215355, + "learning_rate": 1.4871136417398407e-06, + "loss": 0.4748, + "step": 1902 + }, + { + "epoch": 2.31965881397238, + "grad_norm": 0.42296472427505094, + "learning_rate": 1.4820684429162879e-06, + "loss": 0.6, + "step": 1903 + }, + { + "epoch": 2.3208773354995937, + "grad_norm": 0.3598564220383708, + "learning_rate": 1.477030327330582e-06, + "loss": 0.4422, + "step": 1904 + }, + { + "epoch": 2.3220958570268073, + "grad_norm": 0.3948776090594206, + "learning_rate": 1.4719993051268023e-06, + "loss": 0.5343, + "step": 1905 + }, + { + "epoch": 2.323314378554021, + "grad_norm": 0.4028197151468667, + "learning_rate": 1.466975386434744e-06, + "loss": 0.5355, + "step": 1906 + }, + { + "epoch": 2.324532900081235, + "grad_norm": 0.3958394776231179, + "learning_rate": 1.4619585813699032e-06, + "loss": 0.5119, + "step": 1907 + }, + { + "epoch": 2.3257514216084485, + "grad_norm": 0.3640898960169131, + "learning_rate": 1.4569489000334435e-06, + "loss": 0.4749, + "step": 1908 + }, + { + "epoch": 2.326969943135662, + "grad_norm": 0.42215058629769103, + "learning_rate": 1.4519463525121934e-06, + "loss": 0.5157, + "step": 1909 + }, + { + "epoch": 2.3281884646628757, + "grad_norm": 0.38263832424003186, + "learning_rate": 1.4469509488786165e-06, + "loss": 0.509, + "step": 1910 + }, + { + "epoch": 2.3294069861900892, + "grad_norm": 0.41466783476778923, + "learning_rate": 1.4419626991907925e-06, + "loss": 0.5222, + "step": 1911 + }, + { + "epoch": 2.330625507717303, + "grad_norm": 0.3818440846598212, + "learning_rate": 1.436981613492394e-06, + "loss": 0.5153, + "step": 1912 + }, + { + "epoch": 2.331844029244517, + "grad_norm": 0.3744448281484142, + "learning_rate": 1.4320077018126704e-06, + "loss": 0.4932, + "step": 1913 + }, + { + "epoch": 2.3330625507717304, + "grad_norm": 0.35650000061694337, + "learning_rate": 1.427040974166427e-06, + "loss": 0.4711, + "step": 1914 + }, + { + "epoch": 2.334281072298944, + "grad_norm": 0.44375949024201433, + "learning_rate": 1.4220814405540067e-06, + "loss": 0.6081, + "step": 1915 + }, + { + "epoch": 2.3354995938261576, + "grad_norm": 0.3632417016214969, + "learning_rate": 1.4171291109612618e-06, + "loss": 0.4439, + "step": 1916 + }, + { + "epoch": 2.336718115353371, + "grad_norm": 0.391995578036673, + "learning_rate": 1.412183995359544e-06, + "loss": 0.5208, + "step": 1917 + }, + { + "epoch": 2.3379366368805847, + "grad_norm": 0.4106831300879619, + "learning_rate": 1.4072461037056806e-06, + "loss": 0.5185, + "step": 1918 + }, + { + "epoch": 2.3391551584077988, + "grad_norm": 0.37156715583934985, + "learning_rate": 1.4023154459419497e-06, + "loss": 0.492, + "step": 1919 + }, + { + "epoch": 2.3403736799350123, + "grad_norm": 0.4194748560314305, + "learning_rate": 1.3973920319960654e-06, + "loss": 0.5387, + "step": 1920 + }, + { + "epoch": 2.341592201462226, + "grad_norm": 0.4089237007134858, + "learning_rate": 1.3924758717811582e-06, + "loss": 0.5258, + "step": 1921 + }, + { + "epoch": 2.3428107229894395, + "grad_norm": 0.3694012500954815, + "learning_rate": 1.3875669751957548e-06, + "loss": 0.4604, + "step": 1922 + }, + { + "epoch": 2.344029244516653, + "grad_norm": 0.4054490233784107, + "learning_rate": 1.3826653521237526e-06, + "loss": 0.5113, + "step": 1923 + }, + { + "epoch": 2.3452477660438666, + "grad_norm": 0.39405086162290015, + "learning_rate": 1.3777710124344058e-06, + "loss": 0.5753, + "step": 1924 + }, + { + "epoch": 2.3464662875710802, + "grad_norm": 0.3708763755034275, + "learning_rate": 1.3728839659823045e-06, + "loss": 0.5154, + "step": 1925 + }, + { + "epoch": 2.3476848090982942, + "grad_norm": 0.3772580609058518, + "learning_rate": 1.3680042226073554e-06, + "loss": 0.4871, + "step": 1926 + }, + { + "epoch": 2.348903330625508, + "grad_norm": 0.39403743871876235, + "learning_rate": 1.3631317921347564e-06, + "loss": 0.5306, + "step": 1927 + }, + { + "epoch": 2.3501218521527214, + "grad_norm": 0.3873906953232042, + "learning_rate": 1.358266684374987e-06, + "loss": 0.5123, + "step": 1928 + }, + { + "epoch": 2.351340373679935, + "grad_norm": 0.37942846145056086, + "learning_rate": 1.3534089091237757e-06, + "loss": 0.5054, + "step": 1929 + }, + { + "epoch": 2.3525588952071486, + "grad_norm": 0.3607773079027305, + "learning_rate": 1.348558476162094e-06, + "loss": 0.481, + "step": 1930 + }, + { + "epoch": 2.353777416734362, + "grad_norm": 0.4005225791492384, + "learning_rate": 1.343715395256124e-06, + "loss": 0.5331, + "step": 1931 + }, + { + "epoch": 2.3549959382615757, + "grad_norm": 0.36782048392498773, + "learning_rate": 1.3388796761572493e-06, + "loss": 0.4872, + "step": 1932 + }, + { + "epoch": 2.3562144597887897, + "grad_norm": 0.38694592195175675, + "learning_rate": 1.3340513286020307e-06, + "loss": 0.5245, + "step": 1933 + }, + { + "epoch": 2.3574329813160033, + "grad_norm": 0.3951746347940695, + "learning_rate": 1.3292303623121828e-06, + "loss": 0.5296, + "step": 1934 + }, + { + "epoch": 2.358651502843217, + "grad_norm": 0.4201044387513612, + "learning_rate": 1.324416786994559e-06, + "loss": 0.5284, + "step": 1935 + }, + { + "epoch": 2.3598700243704305, + "grad_norm": 0.4088086029813002, + "learning_rate": 1.3196106123411345e-06, + "loss": 0.5212, + "step": 1936 + }, + { + "epoch": 2.361088545897644, + "grad_norm": 0.38621048916855116, + "learning_rate": 1.3148118480289834e-06, + "loss": 0.5078, + "step": 1937 + }, + { + "epoch": 2.362307067424858, + "grad_norm": 0.38423444639816645, + "learning_rate": 1.310020503720254e-06, + "loss": 0.5363, + "step": 1938 + }, + { + "epoch": 2.3635255889520717, + "grad_norm": 0.3807864286378039, + "learning_rate": 1.3052365890621615e-06, + "loss": 0.5349, + "step": 1939 + }, + { + "epoch": 2.3647441104792852, + "grad_norm": 0.3969153122233334, + "learning_rate": 1.3004601136869555e-06, + "loss": 0.5245, + "step": 1940 + }, + { + "epoch": 2.365962632006499, + "grad_norm": 0.3706070359987908, + "learning_rate": 1.295691087211912e-06, + "loss": 0.4639, + "step": 1941 + }, + { + "epoch": 2.3671811535337124, + "grad_norm": 0.4076566869065765, + "learning_rate": 1.2909295192393057e-06, + "loss": 0.5623, + "step": 1942 + }, + { + "epoch": 2.368399675060926, + "grad_norm": 0.3609321931094141, + "learning_rate": 1.2861754193563948e-06, + "loss": 0.4532, + "step": 1943 + }, + { + "epoch": 2.3696181965881395, + "grad_norm": 0.3775351942907534, + "learning_rate": 1.2814287971354023e-06, + "loss": 0.5515, + "step": 1944 + }, + { + "epoch": 2.3708367181153536, + "grad_norm": 0.37390993961577534, + "learning_rate": 1.2766896621334928e-06, + "loss": 0.5097, + "step": 1945 + }, + { + "epoch": 2.372055239642567, + "grad_norm": 0.37384134850125694, + "learning_rate": 1.2719580238927553e-06, + "loss": 0.5557, + "step": 1946 + }, + { + "epoch": 2.3732737611697807, + "grad_norm": 0.3775237757952146, + "learning_rate": 1.2672338919401866e-06, + "loss": 0.5197, + "step": 1947 + }, + { + "epoch": 2.3744922826969943, + "grad_norm": 0.3845839355688612, + "learning_rate": 1.2625172757876691e-06, + "loss": 0.5175, + "step": 1948 + }, + { + "epoch": 2.375710804224208, + "grad_norm": 0.3930000499088082, + "learning_rate": 1.2578081849319547e-06, + "loss": 0.4908, + "step": 1949 + }, + { + "epoch": 2.3769293257514215, + "grad_norm": 0.3679733160971826, + "learning_rate": 1.253106628854635e-06, + "loss": 0.4807, + "step": 1950 + }, + { + "epoch": 2.378147847278635, + "grad_norm": 0.41138237630333274, + "learning_rate": 1.2484126170221388e-06, + "loss": 0.5494, + "step": 1951 + }, + { + "epoch": 2.379366368805849, + "grad_norm": 0.3644013554869518, + "learning_rate": 1.2437261588857037e-06, + "loss": 0.4715, + "step": 1952 + }, + { + "epoch": 2.3805848903330626, + "grad_norm": 0.3754357671998505, + "learning_rate": 1.2390472638813572e-06, + "loss": 0.5106, + "step": 1953 + }, + { + "epoch": 2.381803411860276, + "grad_norm": 0.4210050485232648, + "learning_rate": 1.2343759414298955e-06, + "loss": 0.5755, + "step": 1954 + }, + { + "epoch": 2.38302193338749, + "grad_norm": 0.3648248351254956, + "learning_rate": 1.229712200936874e-06, + "loss": 0.4928, + "step": 1955 + }, + { + "epoch": 2.3842404549147034, + "grad_norm": 0.34960648428885643, + "learning_rate": 1.2250560517925747e-06, + "loss": 0.4643, + "step": 1956 + }, + { + "epoch": 2.385458976441917, + "grad_norm": 0.3790545819611439, + "learning_rate": 1.2204075033720025e-06, + "loss": 0.4949, + "step": 1957 + }, + { + "epoch": 2.386677497969131, + "grad_norm": 0.3711764526859297, + "learning_rate": 1.2157665650348516e-06, + "loss": 0.4838, + "step": 1958 + }, + { + "epoch": 2.3878960194963446, + "grad_norm": 0.4040275413347584, + "learning_rate": 1.211133246125497e-06, + "loss": 0.5255, + "step": 1959 + }, + { + "epoch": 2.389114541023558, + "grad_norm": 0.40676437477424116, + "learning_rate": 1.2065075559729749e-06, + "loss": 0.5417, + "step": 1960 + }, + { + "epoch": 2.3903330625507717, + "grad_norm": 0.3828497235073974, + "learning_rate": 1.201889503890955e-06, + "loss": 0.5003, + "step": 1961 + }, + { + "epoch": 2.3915515840779853, + "grad_norm": 0.40024563629448995, + "learning_rate": 1.197279099177731e-06, + "loss": 0.5627, + "step": 1962 + }, + { + "epoch": 2.392770105605199, + "grad_norm": 0.3508330713937016, + "learning_rate": 1.1926763511161993e-06, + "loss": 0.4607, + "step": 1963 + }, + { + "epoch": 2.393988627132413, + "grad_norm": 0.4280688001820654, + "learning_rate": 1.188081268973842e-06, + "loss": 0.5389, + "step": 1964 + }, + { + "epoch": 2.3952071486596265, + "grad_norm": 0.3853051982812674, + "learning_rate": 1.183493862002702e-06, + "loss": 0.4576, + "step": 1965 + }, + { + "epoch": 2.39642567018684, + "grad_norm": 0.4045347572603379, + "learning_rate": 1.1789141394393683e-06, + "loss": 0.5698, + "step": 1966 + }, + { + "epoch": 2.3976441917140536, + "grad_norm": 0.3983176516664499, + "learning_rate": 1.1743421105049612e-06, + "loss": 0.4725, + "step": 1967 + }, + { + "epoch": 2.398862713241267, + "grad_norm": 0.39056153824479495, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.5365, + "step": 1968 + }, + { + "epoch": 2.400081234768481, + "grad_norm": 0.38372959754674263, + "learning_rate": 1.165221170329931e-06, + "loss": 0.5051, + "step": 1969 + }, + { + "epoch": 2.4012997562956944, + "grad_norm": 0.3938860376877866, + "learning_rate": 1.1606722774540146e-06, + "loss": 0.4948, + "step": 1970 + }, + { + "epoch": 2.4025182778229084, + "grad_norm": 0.38806865450446354, + "learning_rate": 1.1561311149364075e-06, + "loss": 0.5132, + "step": 1971 + }, + { + "epoch": 2.403736799350122, + "grad_norm": 0.41405852900105905, + "learning_rate": 1.1515976919205869e-06, + "loss": 0.5287, + "step": 1972 + }, + { + "epoch": 2.4049553208773355, + "grad_norm": 0.41486030319490325, + "learning_rate": 1.1470720175344473e-06, + "loss": 0.4826, + "step": 1973 + }, + { + "epoch": 2.406173842404549, + "grad_norm": 0.4073479693226464, + "learning_rate": 1.1425541008902852e-06, + "loss": 0.5061, + "step": 1974 + }, + { + "epoch": 2.4073923639317627, + "grad_norm": 0.3698628863486537, + "learning_rate": 1.1380439510847757e-06, + "loss": 0.4822, + "step": 1975 + }, + { + "epoch": 2.4086108854589763, + "grad_norm": 0.40735700660846696, + "learning_rate": 1.1335415771989538e-06, + "loss": 0.5198, + "step": 1976 + }, + { + "epoch": 2.40982940698619, + "grad_norm": 0.39248680656683244, + "learning_rate": 1.1290469882981987e-06, + "loss": 0.5513, + "step": 1977 + }, + { + "epoch": 2.411047928513404, + "grad_norm": 0.36602544238604245, + "learning_rate": 1.1245601934322148e-06, + "loss": 0.5042, + "step": 1978 + }, + { + "epoch": 2.4122664500406175, + "grad_norm": 0.3765733599147946, + "learning_rate": 1.1200812016350172e-06, + "loss": 0.5031, + "step": 1979 + }, + { + "epoch": 2.413484971567831, + "grad_norm": 0.34319770025526913, + "learning_rate": 1.1156100219249022e-06, + "loss": 0.5049, + "step": 1980 + }, + { + "epoch": 2.4147034930950446, + "grad_norm": 0.4344345273336274, + "learning_rate": 1.1111466633044448e-06, + "loss": 0.6097, + "step": 1981 + }, + { + "epoch": 2.415922014622258, + "grad_norm": 0.4011208632543373, + "learning_rate": 1.1066911347604653e-06, + "loss": 0.4408, + "step": 1982 + }, + { + "epoch": 2.417140536149472, + "grad_norm": 0.3602299607841865, + "learning_rate": 1.1022434452640252e-06, + "loss": 0.4878, + "step": 1983 + }, + { + "epoch": 2.418359057676686, + "grad_norm": 0.3938174382324399, + "learning_rate": 1.0978036037703955e-06, + "loss": 0.5246, + "step": 1984 + }, + { + "epoch": 2.4195775792038994, + "grad_norm": 0.40627392150384173, + "learning_rate": 1.0933716192190502e-06, + "loss": 0.5191, + "step": 1985 + }, + { + "epoch": 2.420796100731113, + "grad_norm": 0.3960581508676551, + "learning_rate": 1.0889475005336447e-06, + "loss": 0.4755, + "step": 1986 + }, + { + "epoch": 2.4220146222583265, + "grad_norm": 0.3909127795418108, + "learning_rate": 1.0845312566219924e-06, + "loss": 0.5128, + "step": 1987 + }, + { + "epoch": 2.42323314378554, + "grad_norm": 0.39827799729251556, + "learning_rate": 1.0801228963760518e-06, + "loss": 0.5425, + "step": 1988 + }, + { + "epoch": 2.4244516653127537, + "grad_norm": 0.37511159704629343, + "learning_rate": 1.075722428671911e-06, + "loss": 0.4761, + "step": 1989 + }, + { + "epoch": 2.4256701868399677, + "grad_norm": 0.3982398641015277, + "learning_rate": 1.0713298623697654e-06, + "loss": 0.5386, + "step": 1990 + }, + { + "epoch": 2.4268887083671813, + "grad_norm": 0.3947556572315106, + "learning_rate": 1.0669452063138992e-06, + "loss": 0.4842, + "step": 1991 + }, + { + "epoch": 2.428107229894395, + "grad_norm": 0.40576214681574757, + "learning_rate": 1.0625684693326727e-06, + "loss": 0.5423, + "step": 1992 + }, + { + "epoch": 2.4293257514216084, + "grad_norm": 0.40693906455637163, + "learning_rate": 1.0581996602384975e-06, + "loss": 0.5159, + "step": 1993 + }, + { + "epoch": 2.430544272948822, + "grad_norm": 0.3488770060857356, + "learning_rate": 1.0538387878278283e-06, + "loss": 0.5187, + "step": 1994 + }, + { + "epoch": 2.4317627944760356, + "grad_norm": 0.4158444319209436, + "learning_rate": 1.0494858608811326e-06, + "loss": 0.5313, + "step": 1995 + }, + { + "epoch": 2.432981316003249, + "grad_norm": 0.4169970056929273, + "learning_rate": 1.0451408881628855e-06, + "loss": 0.4866, + "step": 1996 + }, + { + "epoch": 2.434199837530463, + "grad_norm": 0.3823144671122315, + "learning_rate": 1.0408038784215462e-06, + "loss": 0.4871, + "step": 1997 + }, + { + "epoch": 2.435418359057677, + "grad_norm": 0.38435379244248535, + "learning_rate": 1.0364748403895368e-06, + "loss": 0.5276, + "step": 1998 + }, + { + "epoch": 2.4366368805848904, + "grad_norm": 0.39978814004238356, + "learning_rate": 1.0321537827832311e-06, + "loss": 0.5374, + "step": 1999 + }, + { + "epoch": 2.437855402112104, + "grad_norm": 0.3858037469760484, + "learning_rate": 1.0278407143029346e-06, + "loss": 0.4967, + "step": 2000 + }, + { + "epoch": 2.4390739236393175, + "grad_norm": 0.36427416648269983, + "learning_rate": 1.0235356436328675e-06, + "loss": 0.5147, + "step": 2001 + }, + { + "epoch": 2.440292445166531, + "grad_norm": 0.42530639031865014, + "learning_rate": 1.019238579441148e-06, + "loss": 0.4949, + "step": 2002 + }, + { + "epoch": 2.4415109666937447, + "grad_norm": 0.4047119130435624, + "learning_rate": 1.014949530379767e-06, + "loss": 0.491, + "step": 2003 + }, + { + "epoch": 2.4427294882209587, + "grad_norm": 0.38798880943669517, + "learning_rate": 1.0106685050845838e-06, + "loss": 0.5433, + "step": 2004 + }, + { + "epoch": 2.4439480097481723, + "grad_norm": 0.4009909590310544, + "learning_rate": 1.0063955121752999e-06, + "loss": 0.5113, + "step": 2005 + }, + { + "epoch": 2.445166531275386, + "grad_norm": 0.35796810794196016, + "learning_rate": 1.0021305602554459e-06, + "loss": 0.5113, + "step": 2006 + }, + { + "epoch": 2.4463850528025994, + "grad_norm": 0.3755800564081513, + "learning_rate": 9.978736579123577e-07, + "loss": 0.5004, + "step": 2007 + }, + { + "epoch": 2.447603574329813, + "grad_norm": 0.3683007765287416, + "learning_rate": 9.936248137171684e-07, + "loss": 0.4974, + "step": 2008 + }, + { + "epoch": 2.448822095857027, + "grad_norm": 0.3952937239349372, + "learning_rate": 9.893840362247809e-07, + "loss": 0.4971, + "step": 2009 + }, + { + "epoch": 2.4500406173842406, + "grad_norm": 0.4284741934979282, + "learning_rate": 9.851513339738627e-07, + "loss": 0.561, + "step": 2010 + }, + { + "epoch": 2.451259138911454, + "grad_norm": 0.4056720067131079, + "learning_rate": 9.809267154868163e-07, + "loss": 0.5179, + "step": 2011 + }, + { + "epoch": 2.4524776604386678, + "grad_norm": 0.35255993479863745, + "learning_rate": 9.7671018926977e-07, + "loss": 0.4424, + "step": 2012 + }, + { + "epoch": 2.4536961819658814, + "grad_norm": 0.40547305130915784, + "learning_rate": 9.725017638125612e-07, + "loss": 0.5524, + "step": 2013 + }, + { + "epoch": 2.454914703493095, + "grad_norm": 0.35656299866798635, + "learning_rate": 9.683014475887126e-07, + "loss": 0.4676, + "step": 2014 + }, + { + "epoch": 2.4561332250203085, + "grad_norm": 0.37086670515583137, + "learning_rate": 9.641092490554195e-07, + "loss": 0.5398, + "step": 2015 + }, + { + "epoch": 2.4573517465475225, + "grad_norm": 0.3791246421084454, + "learning_rate": 9.599251766535344e-07, + "loss": 0.4933, + "step": 2016 + }, + { + "epoch": 2.458570268074736, + "grad_norm": 0.40912338906986023, + "learning_rate": 9.5574923880755e-07, + "loss": 0.562, + "step": 2017 + }, + { + "epoch": 2.4597887896019497, + "grad_norm": 0.41181426146954847, + "learning_rate": 9.51581443925576e-07, + "loss": 0.4892, + "step": 2018 + }, + { + "epoch": 2.4610073111291633, + "grad_norm": 0.4026513287049664, + "learning_rate": 9.474218003993275e-07, + "loss": 0.5278, + "step": 2019 + }, + { + "epoch": 2.462225832656377, + "grad_norm": 0.38613963899490084, + "learning_rate": 9.432703166041085e-07, + "loss": 0.4996, + "step": 2020 + }, + { + "epoch": 2.4634443541835904, + "grad_norm": 0.38482050851065364, + "learning_rate": 9.391270008987946e-07, + "loss": 0.5189, + "step": 2021 + }, + { + "epoch": 2.464662875710804, + "grad_norm": 0.39758696168412205, + "learning_rate": 9.349918616258113e-07, + "loss": 0.5078, + "step": 2022 + }, + { + "epoch": 2.465881397238018, + "grad_norm": 0.38614047610686164, + "learning_rate": 9.308649071111259e-07, + "loss": 0.4729, + "step": 2023 + }, + { + "epoch": 2.4670999187652316, + "grad_norm": 0.37024789300038397, + "learning_rate": 9.267461456642235e-07, + "loss": 0.5187, + "step": 2024 + }, + { + "epoch": 2.468318440292445, + "grad_norm": 0.393874443470908, + "learning_rate": 9.226355855780922e-07, + "loss": 0.5266, + "step": 2025 + }, + { + "epoch": 2.4695369618196588, + "grad_norm": 0.39515255892810097, + "learning_rate": 9.185332351292059e-07, + "loss": 0.4979, + "step": 2026 + }, + { + "epoch": 2.4707554833468723, + "grad_norm": 0.36209573390534977, + "learning_rate": 9.144391025775123e-07, + "loss": 0.4685, + "step": 2027 + }, + { + "epoch": 2.471974004874086, + "grad_norm": 0.3690675939364142, + "learning_rate": 9.10353196166412e-07, + "loss": 0.5109, + "step": 2028 + }, + { + "epoch": 2.4731925264013, + "grad_norm": 0.4126364096164172, + "learning_rate": 9.0627552412274e-07, + "loss": 0.551, + "step": 2029 + }, + { + "epoch": 2.4744110479285135, + "grad_norm": 0.39808305056022897, + "learning_rate": 9.022060946567512e-07, + "loss": 0.4829, + "step": 2030 + }, + { + "epoch": 2.475629569455727, + "grad_norm": 0.3791592284080857, + "learning_rate": 8.981449159621075e-07, + "loss": 0.4993, + "step": 2031 + }, + { + "epoch": 2.4768480909829407, + "grad_norm": 0.3890390516980624, + "learning_rate": 8.940919962158584e-07, + "loss": 0.5213, + "step": 2032 + }, + { + "epoch": 2.4780666125101543, + "grad_norm": 0.42524657999992466, + "learning_rate": 8.900473435784196e-07, + "loss": 0.5666, + "step": 2033 + }, + { + "epoch": 2.479285134037368, + "grad_norm": 0.3815964696084361, + "learning_rate": 8.860109661935673e-07, + "loss": 0.4625, + "step": 2034 + }, + { + "epoch": 2.480503655564582, + "grad_norm": 0.42469861666467223, + "learning_rate": 8.819828721884094e-07, + "loss": 0.5373, + "step": 2035 + }, + { + "epoch": 2.4817221770917954, + "grad_norm": 0.38320361649924684, + "learning_rate": 8.779630696733821e-07, + "loss": 0.5375, + "step": 2036 + }, + { + "epoch": 2.482940698619009, + "grad_norm": 0.3687214848508832, + "learning_rate": 8.739515667422211e-07, + "loss": 0.4435, + "step": 2037 + }, + { + "epoch": 2.4841592201462226, + "grad_norm": 0.40061711007827416, + "learning_rate": 8.699483714719547e-07, + "loss": 0.5467, + "step": 2038 + }, + { + "epoch": 2.485377741673436, + "grad_norm": 0.40521522379814523, + "learning_rate": 8.659534919228845e-07, + "loss": 0.536, + "step": 2039 + }, + { + "epoch": 2.4865962632006497, + "grad_norm": 0.3672113864753048, + "learning_rate": 8.619669361385663e-07, + "loss": 0.4978, + "step": 2040 + }, + { + "epoch": 2.4878147847278633, + "grad_norm": 0.3620893091593676, + "learning_rate": 8.579887121457952e-07, + "loss": 0.5038, + "step": 2041 + }, + { + "epoch": 2.4890333062550773, + "grad_norm": 0.3663778220669876, + "learning_rate": 8.540188279545942e-07, + "loss": 0.4862, + "step": 2042 + }, + { + "epoch": 2.490251827782291, + "grad_norm": 0.38043441191253624, + "learning_rate": 8.500572915581923e-07, + "loss": 0.5152, + "step": 2043 + }, + { + "epoch": 2.4914703493095045, + "grad_norm": 0.3942635437659399, + "learning_rate": 8.461041109330132e-07, + "loss": 0.5055, + "step": 2044 + }, + { + "epoch": 2.492688870836718, + "grad_norm": 0.3729970950643679, + "learning_rate": 8.421592940386514e-07, + "loss": 0.5022, + "step": 2045 + }, + { + "epoch": 2.4939073923639317, + "grad_norm": 0.40364708615741257, + "learning_rate": 8.382228488178639e-07, + "loss": 0.5297, + "step": 2046 + }, + { + "epoch": 2.4951259138911452, + "grad_norm": 0.3841836875471797, + "learning_rate": 8.342947831965537e-07, + "loss": 0.4594, + "step": 2047 + }, + { + "epoch": 2.496344435418359, + "grad_norm": 0.39167222446559674, + "learning_rate": 8.3037510508375e-07, + "loss": 0.538, + "step": 2048 + }, + { + "epoch": 2.497562956945573, + "grad_norm": 0.36017475560838597, + "learning_rate": 8.264638223715916e-07, + "loss": 0.4904, + "step": 2049 + }, + { + "epoch": 2.4987814784727864, + "grad_norm": 0.38494521342543364, + "learning_rate": 8.225609429353187e-07, + "loss": 0.5098, + "step": 2050 + }, + { + "epoch": 2.5, + "grad_norm": 0.3915568133305174, + "learning_rate": 8.186664746332457e-07, + "loss": 0.5479, + "step": 2051 + }, + { + "epoch": 2.5012185215272136, + "grad_norm": 0.3653001722783512, + "learning_rate": 8.147804253067581e-07, + "loss": 0.5505, + "step": 2052 + }, + { + "epoch": 2.502437043054427, + "grad_norm": 0.38529539896383097, + "learning_rate": 8.109028027802834e-07, + "loss": 0.5075, + "step": 2053 + }, + { + "epoch": 2.503655564581641, + "grad_norm": 0.32985269566739706, + "learning_rate": 8.070336148612873e-07, + "loss": 0.4737, + "step": 2054 + }, + { + "epoch": 2.5048740861088543, + "grad_norm": 0.3688596078684635, + "learning_rate": 8.031728693402502e-07, + "loss": 0.4933, + "step": 2055 + }, + { + "epoch": 2.5060926076360683, + "grad_norm": 0.3574147764462336, + "learning_rate": 7.993205739906551e-07, + "loss": 0.5036, + "step": 2056 + }, + { + "epoch": 2.507311129163282, + "grad_norm": 0.3933673997370336, + "learning_rate": 7.954767365689675e-07, + "loss": 0.5284, + "step": 2057 + }, + { + "epoch": 2.5085296506904955, + "grad_norm": 0.3804892757598497, + "learning_rate": 7.916413648146282e-07, + "loss": 0.5314, + "step": 2058 + }, + { + "epoch": 2.509748172217709, + "grad_norm": 0.3972280482401795, + "learning_rate": 7.878144664500304e-07, + "loss": 0.5042, + "step": 2059 + }, + { + "epoch": 2.5109666937449227, + "grad_norm": 0.4139170649517036, + "learning_rate": 7.839960491805048e-07, + "loss": 0.513, + "step": 2060 + }, + { + "epoch": 2.5121852152721367, + "grad_norm": 0.3682306221980358, + "learning_rate": 7.80186120694309e-07, + "loss": 0.5082, + "step": 2061 + }, + { + "epoch": 2.5134037367993503, + "grad_norm": 0.40743403282060575, + "learning_rate": 7.763846886626048e-07, + "loss": 0.4982, + "step": 2062 + }, + { + "epoch": 2.514622258326564, + "grad_norm": 0.3807959558438016, + "learning_rate": 7.725917607394512e-07, + "loss": 0.4893, + "step": 2063 + }, + { + "epoch": 2.5158407798537774, + "grad_norm": 0.3774151979891591, + "learning_rate": 7.6880734456178e-07, + "loss": 0.5308, + "step": 2064 + }, + { + "epoch": 2.517059301380991, + "grad_norm": 0.39200277210093626, + "learning_rate": 7.650314477493875e-07, + "loss": 0.5221, + "step": 2065 + }, + { + "epoch": 2.5182778229082046, + "grad_norm": 0.3987158423288902, + "learning_rate": 7.612640779049174e-07, + "loss": 0.5387, + "step": 2066 + }, + { + "epoch": 2.519496344435418, + "grad_norm": 0.3432299316334845, + "learning_rate": 7.575052426138424e-07, + "loss": 0.448, + "step": 2067 + }, + { + "epoch": 2.520714865962632, + "grad_norm": 0.40306877146829656, + "learning_rate": 7.537549494444502e-07, + "loss": 0.5319, + "step": 2068 + }, + { + "epoch": 2.5219333874898457, + "grad_norm": 0.3624054180666312, + "learning_rate": 7.500132059478327e-07, + "loss": 0.4755, + "step": 2069 + }, + { + "epoch": 2.5231519090170593, + "grad_norm": 0.3943720013643357, + "learning_rate": 7.462800196578662e-07, + "loss": 0.5517, + "step": 2070 + }, + { + "epoch": 2.524370430544273, + "grad_norm": 0.3760692184644974, + "learning_rate": 7.425553980911959e-07, + "loss": 0.5198, + "step": 2071 + }, + { + "epoch": 2.5255889520714865, + "grad_norm": 0.36875663183404517, + "learning_rate": 7.388393487472223e-07, + "loss": 0.5099, + "step": 2072 + }, + { + "epoch": 2.5268074735987005, + "grad_norm": 0.3765385801804941, + "learning_rate": 7.351318791080881e-07, + "loss": 0.4877, + "step": 2073 + }, + { + "epoch": 2.5280259951259136, + "grad_norm": 0.3880867322532877, + "learning_rate": 7.314329966386596e-07, + "loss": 0.5191, + "step": 2074 + }, + { + "epoch": 2.5292445166531277, + "grad_norm": 0.38480762192630874, + "learning_rate": 7.277427087865124e-07, + "loss": 0.5367, + "step": 2075 + }, + { + "epoch": 2.5304630381803412, + "grad_norm": 0.37367690807549686, + "learning_rate": 7.240610229819195e-07, + "loss": 0.4796, + "step": 2076 + }, + { + "epoch": 2.531681559707555, + "grad_norm": 0.356459470205227, + "learning_rate": 7.203879466378311e-07, + "loss": 0.4846, + "step": 2077 + }, + { + "epoch": 2.5329000812347684, + "grad_norm": 0.368312803237026, + "learning_rate": 7.167234871498646e-07, + "loss": 0.512, + "step": 2078 + }, + { + "epoch": 2.534118602761982, + "grad_norm": 0.42790949260764394, + "learning_rate": 7.130676518962859e-07, + "loss": 0.5199, + "step": 2079 + }, + { + "epoch": 2.535337124289196, + "grad_norm": 0.3760245356587111, + "learning_rate": 7.094204482379985e-07, + "loss": 0.5206, + "step": 2080 + }, + { + "epoch": 2.5365556458164096, + "grad_norm": 0.36529563925832975, + "learning_rate": 7.057818835185243e-07, + "loss": 0.5169, + "step": 2081 + }, + { + "epoch": 2.537774167343623, + "grad_norm": 0.37415123963436103, + "learning_rate": 7.021519650639952e-07, + "loss": 0.4682, + "step": 2082 + }, + { + "epoch": 2.5389926888708367, + "grad_norm": 0.3599256024573686, + "learning_rate": 6.985307001831266e-07, + "loss": 0.5237, + "step": 2083 + }, + { + "epoch": 2.5402112103980503, + "grad_norm": 0.37172969261280475, + "learning_rate": 6.949180961672159e-07, + "loss": 0.5229, + "step": 2084 + }, + { + "epoch": 2.541429731925264, + "grad_norm": 0.3692464609849223, + "learning_rate": 6.913141602901213e-07, + "loss": 0.4967, + "step": 2085 + }, + { + "epoch": 2.5426482534524775, + "grad_norm": 0.41481021551912467, + "learning_rate": 6.877188998082484e-07, + "loss": 0.5364, + "step": 2086 + }, + { + "epoch": 2.5438667749796915, + "grad_norm": 0.3587567944310898, + "learning_rate": 6.841323219605333e-07, + "loss": 0.477, + "step": 2087 + }, + { + "epoch": 2.545085296506905, + "grad_norm": 0.36227017983644627, + "learning_rate": 6.805544339684295e-07, + "loss": 0.5186, + "step": 2088 + }, + { + "epoch": 2.5463038180341186, + "grad_norm": 0.3848961894752312, + "learning_rate": 6.769852430358969e-07, + "loss": 0.494, + "step": 2089 + }, + { + "epoch": 2.5475223395613322, + "grad_norm": 0.400827672871941, + "learning_rate": 6.734247563493829e-07, + "loss": 0.5104, + "step": 2090 + }, + { + "epoch": 2.548740861088546, + "grad_norm": 0.3858206572812583, + "learning_rate": 6.698729810778065e-07, + "loss": 0.5203, + "step": 2091 + }, + { + "epoch": 2.5499593826157594, + "grad_norm": 0.39420570104347397, + "learning_rate": 6.663299243725512e-07, + "loss": 0.514, + "step": 2092 + }, + { + "epoch": 2.551177904142973, + "grad_norm": 0.37623344903141814, + "learning_rate": 6.627955933674412e-07, + "loss": 0.4675, + "step": 2093 + }, + { + "epoch": 2.552396425670187, + "grad_norm": 0.37984856280561025, + "learning_rate": 6.592699951787362e-07, + "loss": 0.5349, + "step": 2094 + }, + { + "epoch": 2.5536149471974006, + "grad_norm": 0.38942296808421134, + "learning_rate": 6.55753136905109e-07, + "loss": 0.5222, + "step": 2095 + }, + { + "epoch": 2.554833468724614, + "grad_norm": 0.38744941426091656, + "learning_rate": 6.522450256276363e-07, + "loss": 0.4997, + "step": 2096 + }, + { + "epoch": 2.5560519902518277, + "grad_norm": 0.40862429991424404, + "learning_rate": 6.487456684097848e-07, + "loss": 0.5409, + "step": 2097 + }, + { + "epoch": 2.5572705117790413, + "grad_norm": 0.37635062650001033, + "learning_rate": 6.452550722973927e-07, + "loss": 0.4627, + "step": 2098 + }, + { + "epoch": 2.5584890333062553, + "grad_norm": 0.4221777822228316, + "learning_rate": 6.417732443186575e-07, + "loss": 0.5358, + "step": 2099 + }, + { + "epoch": 2.5597075548334685, + "grad_norm": 0.39847174733267055, + "learning_rate": 6.383001914841252e-07, + "loss": 0.5012, + "step": 2100 + }, + { + "epoch": 2.5609260763606825, + "grad_norm": 0.3748715416676312, + "learning_rate": 6.348359207866722e-07, + "loss": 0.4956, + "step": 2101 + }, + { + "epoch": 2.562144597887896, + "grad_norm": 0.37750025006496746, + "learning_rate": 6.313804392014905e-07, + "loss": 0.4854, + "step": 2102 + }, + { + "epoch": 2.5633631194151096, + "grad_norm": 0.3998375296968308, + "learning_rate": 6.279337536860786e-07, + "loss": 0.5143, + "step": 2103 + }, + { + "epoch": 2.564581640942323, + "grad_norm": 0.3710721048856582, + "learning_rate": 6.244958711802213e-07, + "loss": 0.5591, + "step": 2104 + }, + { + "epoch": 2.565800162469537, + "grad_norm": 0.34868738151134687, + "learning_rate": 6.210667986059821e-07, + "loss": 0.4551, + "step": 2105 + }, + { + "epoch": 2.567018683996751, + "grad_norm": 0.35595641503961983, + "learning_rate": 6.17646542867682e-07, + "loss": 0.5152, + "step": 2106 + }, + { + "epoch": 2.5682372055239644, + "grad_norm": 0.36663979047928985, + "learning_rate": 6.142351108518929e-07, + "loss": 0.503, + "step": 2107 + }, + { + "epoch": 2.569455727051178, + "grad_norm": 0.34787252687208675, + "learning_rate": 6.108325094274209e-07, + "loss": 0.5031, + "step": 2108 + }, + { + "epoch": 2.5706742485783916, + "grad_norm": 0.39033263561688103, + "learning_rate": 6.074387454452891e-07, + "loss": 0.5214, + "step": 2109 + }, + { + "epoch": 2.571892770105605, + "grad_norm": 0.38512927731883373, + "learning_rate": 6.040538257387268e-07, + "loss": 0.5198, + "step": 2110 + }, + { + "epoch": 2.5731112916328187, + "grad_norm": 0.3590301126097114, + "learning_rate": 6.006777571231587e-07, + "loss": 0.5027, + "step": 2111 + }, + { + "epoch": 2.5743298131600323, + "grad_norm": 0.3732504638805604, + "learning_rate": 5.973105463961864e-07, + "loss": 0.5066, + "step": 2112 + }, + { + "epoch": 2.5755483346872463, + "grad_norm": 0.3729739011338398, + "learning_rate": 5.939522003375753e-07, + "loss": 0.4958, + "step": 2113 + }, + { + "epoch": 2.57676685621446, + "grad_norm": 0.37186730911837346, + "learning_rate": 5.906027257092444e-07, + "loss": 0.4761, + "step": 2114 + }, + { + "epoch": 2.5779853777416735, + "grad_norm": 0.3661760756265481, + "learning_rate": 5.872621292552477e-07, + "loss": 0.5327, + "step": 2115 + }, + { + "epoch": 2.579203899268887, + "grad_norm": 0.40542839626324956, + "learning_rate": 5.839304177017663e-07, + "loss": 0.5512, + "step": 2116 + }, + { + "epoch": 2.5804224207961006, + "grad_norm": 0.3840467276263846, + "learning_rate": 5.806075977570886e-07, + "loss": 0.4793, + "step": 2117 + }, + { + "epoch": 2.5816409423233146, + "grad_norm": 0.37820337565321277, + "learning_rate": 5.772936761116027e-07, + "loss": 0.506, + "step": 2118 + }, + { + "epoch": 2.582859463850528, + "grad_norm": 0.3797306170789339, + "learning_rate": 5.739886594377803e-07, + "loss": 0.508, + "step": 2119 + }, + { + "epoch": 2.584077985377742, + "grad_norm": 0.3828935693851265, + "learning_rate": 5.706925543901609e-07, + "loss": 0.5097, + "step": 2120 + }, + { + "epoch": 2.5852965069049554, + "grad_norm": 0.3900080504691436, + "learning_rate": 5.674053676053415e-07, + "loss": 0.5168, + "step": 2121 + }, + { + "epoch": 2.586515028432169, + "grad_norm": 0.3587725291460617, + "learning_rate": 5.641271057019637e-07, + "loss": 0.4565, + "step": 2122 + }, + { + "epoch": 2.5877335499593825, + "grad_norm": 0.3939424632788925, + "learning_rate": 5.608577752806987e-07, + "loss": 0.5494, + "step": 2123 + }, + { + "epoch": 2.588952071486596, + "grad_norm": 0.3725432276278501, + "learning_rate": 5.575973829242365e-07, + "loss": 0.4588, + "step": 2124 + }, + { + "epoch": 2.59017059301381, + "grad_norm": 0.38604468058456287, + "learning_rate": 5.543459351972635e-07, + "loss": 0.529, + "step": 2125 + }, + { + "epoch": 2.5913891145410237, + "grad_norm": 0.36341318860508387, + "learning_rate": 5.511034386464642e-07, + "loss": 0.494, + "step": 2126 + }, + { + "epoch": 2.5926076360682373, + "grad_norm": 0.35625493095798805, + "learning_rate": 5.478698998004967e-07, + "loss": 0.5456, + "step": 2127 + }, + { + "epoch": 2.593826157595451, + "grad_norm": 0.36227564286221264, + "learning_rate": 5.446453251699851e-07, + "loss": 0.514, + "step": 2128 + }, + { + "epoch": 2.5950446791226645, + "grad_norm": 0.3662431166869742, + "learning_rate": 5.414297212475012e-07, + "loss": 0.5157, + "step": 2129 + }, + { + "epoch": 2.596263200649878, + "grad_norm": 0.3558072452798451, + "learning_rate": 5.382230945075556e-07, + "loss": 0.4961, + "step": 2130 + }, + { + "epoch": 2.5974817221770916, + "grad_norm": 0.3795263836967965, + "learning_rate": 5.350254514065856e-07, + "loss": 0.5127, + "step": 2131 + }, + { + "epoch": 2.5987002437043056, + "grad_norm": 0.3690040036136185, + "learning_rate": 5.318367983829393e-07, + "loss": 0.4908, + "step": 2132 + }, + { + "epoch": 2.599918765231519, + "grad_norm": 0.3608821461773019, + "learning_rate": 5.286571418568615e-07, + "loss": 0.5289, + "step": 2133 + }, + { + "epoch": 2.601137286758733, + "grad_norm": 0.4006495491671045, + "learning_rate": 5.254864882304855e-07, + "loss": 0.5254, + "step": 2134 + }, + { + "epoch": 2.6023558082859464, + "grad_norm": 0.38150929128537214, + "learning_rate": 5.223248438878176e-07, + "loss": 0.4622, + "step": 2135 + }, + { + "epoch": 2.60357432981316, + "grad_norm": 0.400783680185111, + "learning_rate": 5.191722151947227e-07, + "loss": 0.5474, + "step": 2136 + }, + { + "epoch": 2.6047928513403735, + "grad_norm": 0.3662412318337768, + "learning_rate": 5.160286084989119e-07, + "loss": 0.536, + "step": 2137 + }, + { + "epoch": 2.606011372867587, + "grad_norm": 0.37308572148487257, + "learning_rate": 5.128940301299334e-07, + "loss": 0.4731, + "step": 2138 + }, + { + "epoch": 2.607229894394801, + "grad_norm": 0.39187078975715245, + "learning_rate": 5.097684863991575e-07, + "loss": 0.5249, + "step": 2139 + }, + { + "epoch": 2.6084484159220147, + "grad_norm": 0.3885064721528569, + "learning_rate": 5.066519835997613e-07, + "loss": 0.5225, + "step": 2140 + }, + { + "epoch": 2.6096669374492283, + "grad_norm": 0.41543896829402627, + "learning_rate": 5.03544528006718e-07, + "loss": 0.5476, + "step": 2141 + }, + { + "epoch": 2.610885458976442, + "grad_norm": 0.33915812403705176, + "learning_rate": 5.004461258767873e-07, + "loss": 0.4825, + "step": 2142 + }, + { + "epoch": 2.6121039805036554, + "grad_norm": 0.39963867108256157, + "learning_rate": 4.973567834484988e-07, + "loss": 0.4868, + "step": 2143 + }, + { + "epoch": 2.6133225020308695, + "grad_norm": 0.4052069661227251, + "learning_rate": 4.942765069421384e-07, + "loss": 0.5707, + "step": 2144 + }, + { + "epoch": 2.6145410235580826, + "grad_norm": 0.3744715850855104, + "learning_rate": 4.91205302559743e-07, + "loss": 0.4698, + "step": 2145 + }, + { + "epoch": 2.6157595450852966, + "grad_norm": 0.39172802789195654, + "learning_rate": 4.881431764850775e-07, + "loss": 0.5429, + "step": 2146 + }, + { + "epoch": 2.61697806661251, + "grad_norm": 0.3617617734796279, + "learning_rate": 4.850901348836328e-07, + "loss": 0.5195, + "step": 2147 + }, + { + "epoch": 2.618196588139724, + "grad_norm": 0.3582182319101665, + "learning_rate": 4.820461839026047e-07, + "loss": 0.5237, + "step": 2148 + }, + { + "epoch": 2.6194151096669374, + "grad_norm": 0.382565389265081, + "learning_rate": 4.79011329670887e-07, + "loss": 0.508, + "step": 2149 + }, + { + "epoch": 2.620633631194151, + "grad_norm": 0.36371999280375944, + "learning_rate": 4.7598557829905913e-07, + "loss": 0.5138, + "step": 2150 + }, + { + "epoch": 2.621852152721365, + "grad_norm": 0.36372813807546805, + "learning_rate": 4.729689358793693e-07, + "loss": 0.4863, + "step": 2151 + }, + { + "epoch": 2.6230706742485785, + "grad_norm": 0.4358272328748702, + "learning_rate": 4.699614084857257e-07, + "loss": 0.5501, + "step": 2152 + }, + { + "epoch": 2.624289195775792, + "grad_norm": 0.40082789201202496, + "learning_rate": 4.669630021736854e-07, + "loss": 0.4957, + "step": 2153 + }, + { + "epoch": 2.6255077173030057, + "grad_norm": 0.38531826765138316, + "learning_rate": 4.639737229804403e-07, + "loss": 0.5189, + "step": 2154 + }, + { + "epoch": 2.6267262388302193, + "grad_norm": 0.3510117904392168, + "learning_rate": 4.609935769248025e-07, + "loss": 0.4438, + "step": 2155 + }, + { + "epoch": 2.627944760357433, + "grad_norm": 0.3854632098940677, + "learning_rate": 4.5802257000719885e-07, + "loss": 0.5672, + "step": 2156 + }, + { + "epoch": 2.6291632818846464, + "grad_norm": 0.356713590588076, + "learning_rate": 4.5506070820964973e-07, + "loss": 0.4941, + "step": 2157 + }, + { + "epoch": 2.6303818034118605, + "grad_norm": 0.37107534196018116, + "learning_rate": 4.5210799749576815e-07, + "loss": 0.537, + "step": 2158 + }, + { + "epoch": 2.631600324939074, + "grad_norm": 0.36951174703750844, + "learning_rate": 4.4916444381073674e-07, + "loss": 0.487, + "step": 2159 + }, + { + "epoch": 2.6328188464662876, + "grad_norm": 0.3737744583819628, + "learning_rate": 4.4623005308130243e-07, + "loss": 0.5047, + "step": 2160 + }, + { + "epoch": 2.634037367993501, + "grad_norm": 0.41814109045623277, + "learning_rate": 4.433048312157651e-07, + "loss": 0.4921, + "step": 2161 + }, + { + "epoch": 2.6352558895207148, + "grad_norm": 0.38314084064991044, + "learning_rate": 4.4038878410396003e-07, + "loss": 0.545, + "step": 2162 + }, + { + "epoch": 2.636474411047929, + "grad_norm": 0.34232486717400545, + "learning_rate": 4.374819176172501e-07, + "loss": 0.451, + "step": 2163 + }, + { + "epoch": 2.637692932575142, + "grad_norm": 0.4161225048009829, + "learning_rate": 4.3458423760851523e-07, + "loss": 0.5468, + "step": 2164 + }, + { + "epoch": 2.638911454102356, + "grad_norm": 0.3670515864821956, + "learning_rate": 4.316957499121377e-07, + "loss": 0.5067, + "step": 2165 + }, + { + "epoch": 2.6401299756295695, + "grad_norm": 0.3624129319596192, + "learning_rate": 4.2881646034398926e-07, + "loss": 0.4816, + "step": 2166 + }, + { + "epoch": 2.641348497156783, + "grad_norm": 0.3972920967019095, + "learning_rate": 4.2594637470142587e-07, + "loss": 0.5452, + "step": 2167 + }, + { + "epoch": 2.6425670186839967, + "grad_norm": 0.36647997443354524, + "learning_rate": 4.230854987632671e-07, + "loss": 0.4962, + "step": 2168 + }, + { + "epoch": 2.6437855402112103, + "grad_norm": 0.38616087967711843, + "learning_rate": 4.2023383828979305e-07, + "loss": 0.5471, + "step": 2169 + }, + { + "epoch": 2.6450040617384243, + "grad_norm": 0.35103710867257426, + "learning_rate": 4.173913990227252e-07, + "loss": 0.4679, + "step": 2170 + }, + { + "epoch": 2.6462225832656374, + "grad_norm": 0.39309483512948734, + "learning_rate": 4.145581866852211e-07, + "loss": 0.5224, + "step": 2171 + }, + { + "epoch": 2.6474411047928514, + "grad_norm": 0.38439655446848475, + "learning_rate": 4.1173420698186027e-07, + "loss": 0.504, + "step": 2172 + }, + { + "epoch": 2.648659626320065, + "grad_norm": 0.3577220323312346, + "learning_rate": 4.089194655986306e-07, + "loss": 0.5131, + "step": 2173 + }, + { + "epoch": 2.6498781478472786, + "grad_norm": 0.36326136971896916, + "learning_rate": 4.0611396820291915e-07, + "loss": 0.5451, + "step": 2174 + }, + { + "epoch": 2.651096669374492, + "grad_norm": 0.36056618051796013, + "learning_rate": 4.0331772044350235e-07, + "loss": 0.5175, + "step": 2175 + }, + { + "epoch": 2.6523151909017058, + "grad_norm": 0.35979010393998906, + "learning_rate": 4.0053072795053163e-07, + "loss": 0.5057, + "step": 2176 + }, + { + "epoch": 2.6535337124289198, + "grad_norm": 0.38026873249239673, + "learning_rate": 3.9775299633552535e-07, + "loss": 0.5173, + "step": 2177 + }, + { + "epoch": 2.6547522339561334, + "grad_norm": 0.34812609416605766, + "learning_rate": 3.9498453119134917e-07, + "loss": 0.4774, + "step": 2178 + }, + { + "epoch": 2.655970755483347, + "grad_norm": 0.37873267778767, + "learning_rate": 3.9222533809221864e-07, + "loss": 0.5171, + "step": 2179 + }, + { + "epoch": 2.6571892770105605, + "grad_norm": 0.3876778226332137, + "learning_rate": 3.894754225936753e-07, + "loss": 0.5367, + "step": 2180 + }, + { + "epoch": 2.658407798537774, + "grad_norm": 0.37800746580583733, + "learning_rate": 3.8673479023258464e-07, + "loss": 0.5366, + "step": 2181 + }, + { + "epoch": 2.6596263200649877, + "grad_norm": 0.3592372247497727, + "learning_rate": 3.840034465271164e-07, + "loss": 0.4612, + "step": 2182 + }, + { + "epoch": 2.6608448415922012, + "grad_norm": 0.37676898244480056, + "learning_rate": 3.812813969767398e-07, + "loss": 0.5335, + "step": 2183 + }, + { + "epoch": 2.6620633631194153, + "grad_norm": 0.3889687505966972, + "learning_rate": 3.7856864706221187e-07, + "loss": 0.5379, + "step": 2184 + }, + { + "epoch": 2.663281884646629, + "grad_norm": 0.3429621452435135, + "learning_rate": 3.7586520224556444e-07, + "loss": 0.4249, + "step": 2185 + }, + { + "epoch": 2.6645004061738424, + "grad_norm": 0.4100593265019823, + "learning_rate": 3.731710679700923e-07, + "loss": 0.5571, + "step": 2186 + }, + { + "epoch": 2.665718927701056, + "grad_norm": 0.3754827320099358, + "learning_rate": 3.7048624966034506e-07, + "loss": 0.4772, + "step": 2187 + }, + { + "epoch": 2.6669374492282696, + "grad_norm": 0.506362418039737, + "learning_rate": 3.6781075272211643e-07, + "loss": 0.4898, + "step": 2188 + }, + { + "epoch": 2.6681559707554836, + "grad_norm": 0.39960584269392463, + "learning_rate": 3.6514458254242936e-07, + "loss": 0.5355, + "step": 2189 + }, + { + "epoch": 2.6693744922826967, + "grad_norm": 0.38884516821746157, + "learning_rate": 3.6248774448952695e-07, + "loss": 0.4607, + "step": 2190 + }, + { + "epoch": 2.6705930138099108, + "grad_norm": 0.38681697956869593, + "learning_rate": 3.598402439128656e-07, + "loss": 0.5662, + "step": 2191 + }, + { + "epoch": 2.6718115353371243, + "grad_norm": 0.3756082857300239, + "learning_rate": 3.572020861430997e-07, + "loss": 0.5143, + "step": 2192 + }, + { + "epoch": 2.673030056864338, + "grad_norm": 0.40002189934283794, + "learning_rate": 3.545732764920717e-07, + "loss": 0.5061, + "step": 2193 + }, + { + "epoch": 2.6742485783915515, + "grad_norm": 0.36384776166641386, + "learning_rate": 3.519538202528011e-07, + "loss": 0.504, + "step": 2194 + }, + { + "epoch": 2.675467099918765, + "grad_norm": 0.3788979703828696, + "learning_rate": 3.4934372269947613e-07, + "loss": 0.4801, + "step": 2195 + }, + { + "epoch": 2.676685621445979, + "grad_norm": 0.3818268978478761, + "learning_rate": 3.467429890874424e-07, + "loss": 0.5279, + "step": 2196 + }, + { + "epoch": 2.6779041429731927, + "grad_norm": 0.35141288719000796, + "learning_rate": 3.4415162465318843e-07, + "loss": 0.4803, + "step": 2197 + }, + { + "epoch": 2.6791226645004063, + "grad_norm": 0.39220258510601774, + "learning_rate": 3.4156963461434156e-07, + "loss": 0.5009, + "step": 2198 + }, + { + "epoch": 2.68034118602762, + "grad_norm": 0.4103479084725928, + "learning_rate": 3.3899702416965166e-07, + "loss": 0.6119, + "step": 2199 + }, + { + "epoch": 2.6815597075548334, + "grad_norm": 0.3797647584117213, + "learning_rate": 3.364337984989846e-07, + "loss": 0.4665, + "step": 2200 + }, + { + "epoch": 2.682778229082047, + "grad_norm": 0.3540938338082574, + "learning_rate": 3.3387996276330934e-07, + "loss": 0.4382, + "step": 2201 + }, + { + "epoch": 2.6839967506092606, + "grad_norm": 0.3743322734466896, + "learning_rate": 3.313355221046888e-07, + "loss": 0.5334, + "step": 2202 + }, + { + "epoch": 2.6852152721364746, + "grad_norm": 0.38933035048539233, + "learning_rate": 3.2880048164627087e-07, + "loss": 0.5351, + "step": 2203 + }, + { + "epoch": 2.686433793663688, + "grad_norm": 0.37060820135278527, + "learning_rate": 3.262748464922738e-07, + "loss": 0.5097, + "step": 2204 + }, + { + "epoch": 2.6876523151909018, + "grad_norm": 0.38495794293474345, + "learning_rate": 3.2375862172797866e-07, + "loss": 0.5678, + "step": 2205 + }, + { + "epoch": 2.6888708367181153, + "grad_norm": 0.36198556723986514, + "learning_rate": 3.212518124197217e-07, + "loss": 0.4704, + "step": 2206 + }, + { + "epoch": 2.690089358245329, + "grad_norm": 0.36538988897114305, + "learning_rate": 3.1875442361487987e-07, + "loss": 0.5394, + "step": 2207 + }, + { + "epoch": 2.6913078797725425, + "grad_norm": 0.3529695149923601, + "learning_rate": 3.1626646034186084e-07, + "loss": 0.4924, + "step": 2208 + }, + { + "epoch": 2.692526401299756, + "grad_norm": 0.3589469252207183, + "learning_rate": 3.1378792761009745e-07, + "loss": 0.5141, + "step": 2209 + }, + { + "epoch": 2.69374492282697, + "grad_norm": 0.3716457330306638, + "learning_rate": 3.1131883041003065e-07, + "loss": 0.5162, + "step": 2210 + }, + { + "epoch": 2.6949634443541837, + "grad_norm": 0.39002214118541273, + "learning_rate": 3.0885917371310745e-07, + "loss": 0.5371, + "step": 2211 + }, + { + "epoch": 2.6961819658813972, + "grad_norm": 0.38634067585511356, + "learning_rate": 3.0640896247176257e-07, + "loss": 0.5303, + "step": 2212 + }, + { + "epoch": 2.697400487408611, + "grad_norm": 0.3679719670190458, + "learning_rate": 3.039682016194162e-07, + "loss": 0.4844, + "step": 2213 + }, + { + "epoch": 2.6986190089358244, + "grad_norm": 0.3650561363535968, + "learning_rate": 3.015368960704584e-07, + "loss": 0.5491, + "step": 2214 + }, + { + "epoch": 2.6998375304630384, + "grad_norm": 0.35589244778023654, + "learning_rate": 2.9911505072024173e-07, + "loss": 0.4435, + "step": 2215 + }, + { + "epoch": 2.7010560519902516, + "grad_norm": 0.3971577742524068, + "learning_rate": 2.967026704450704e-07, + "loss": 0.5417, + "step": 2216 + }, + { + "epoch": 2.7022745735174656, + "grad_norm": 0.36768123246070666, + "learning_rate": 2.942997601021924e-07, + "loss": 0.4946, + "step": 2217 + }, + { + "epoch": 2.703493095044679, + "grad_norm": 0.3868406198179559, + "learning_rate": 2.9190632452978706e-07, + "loss": 0.5273, + "step": 2218 + }, + { + "epoch": 2.7047116165718927, + "grad_norm": 0.3708200192787325, + "learning_rate": 2.895223685469578e-07, + "loss": 0.5005, + "step": 2219 + }, + { + "epoch": 2.7059301380991063, + "grad_norm": 0.37953093384871284, + "learning_rate": 2.871478969537206e-07, + "loss": 0.5435, + "step": 2220 + }, + { + "epoch": 2.70714865962632, + "grad_norm": 0.3614383708651899, + "learning_rate": 2.847829145309933e-07, + "loss": 0.4749, + "step": 2221 + }, + { + "epoch": 2.708367181153534, + "grad_norm": 0.3737932290872502, + "learning_rate": 2.824274260405896e-07, + "loss": 0.5178, + "step": 2222 + }, + { + "epoch": 2.7095857026807475, + "grad_norm": 0.3678542573451642, + "learning_rate": 2.800814362252091e-07, + "loss": 0.5328, + "step": 2223 + }, + { + "epoch": 2.710804224207961, + "grad_norm": 0.3590587724355208, + "learning_rate": 2.7774494980842117e-07, + "loss": 0.488, + "step": 2224 + }, + { + "epoch": 2.7120227457351747, + "grad_norm": 0.37585088629389257, + "learning_rate": 2.754179714946653e-07, + "loss": 0.4925, + "step": 2225 + }, + { + "epoch": 2.7132412672623882, + "grad_norm": 0.3719950620904676, + "learning_rate": 2.7310050596923323e-07, + "loss": 0.4999, + "step": 2226 + }, + { + "epoch": 2.714459788789602, + "grad_norm": 0.3568715514712545, + "learning_rate": 2.7079255789826565e-07, + "loss": 0.4807, + "step": 2227 + }, + { + "epoch": 2.7156783103168154, + "grad_norm": 0.35482907121703094, + "learning_rate": 2.6849413192873816e-07, + "loss": 0.4793, + "step": 2228 + }, + { + "epoch": 2.7168968318440294, + "grad_norm": 0.395651827257008, + "learning_rate": 2.662052326884551e-07, + "loss": 0.544, + "step": 2229 + }, + { + "epoch": 2.718115353371243, + "grad_norm": 0.3935098863850654, + "learning_rate": 2.639258647860399e-07, + "loss": 0.5635, + "step": 2230 + }, + { + "epoch": 2.7193338748984566, + "grad_norm": 0.40165173172581203, + "learning_rate": 2.616560328109219e-07, + "loss": 0.4864, + "step": 2231 + }, + { + "epoch": 2.72055239642567, + "grad_norm": 0.37629992760618575, + "learning_rate": 2.593957413333331e-07, + "loss": 0.4642, + "step": 2232 + }, + { + "epoch": 2.7217709179528837, + "grad_norm": 0.39061948229389115, + "learning_rate": 2.571449949042942e-07, + "loss": 0.4931, + "step": 2233 + }, + { + "epoch": 2.7229894394800978, + "grad_norm": 0.37406505289417313, + "learning_rate": 2.549037980556096e-07, + "loss": 0.5149, + "step": 2234 + }, + { + "epoch": 2.724207961007311, + "grad_norm": 0.3865469301953249, + "learning_rate": 2.5267215529985346e-07, + "loss": 0.5662, + "step": 2235 + }, + { + "epoch": 2.725426482534525, + "grad_norm": 0.4176157521247648, + "learning_rate": 2.5045007113036315e-07, + "loss": 0.4846, + "step": 2236 + }, + { + "epoch": 2.7266450040617385, + "grad_norm": 0.3593515844357923, + "learning_rate": 2.4823755002123253e-07, + "loss": 0.5028, + "step": 2237 + }, + { + "epoch": 2.727863525588952, + "grad_norm": 0.3763091273664034, + "learning_rate": 2.4603459642729867e-07, + "loss": 0.4883, + "step": 2238 + }, + { + "epoch": 2.7290820471161656, + "grad_norm": 0.3254080082849582, + "learning_rate": 2.4384121478413403e-07, + "loss": 0.4552, + "step": 2239 + }, + { + "epoch": 2.7303005686433792, + "grad_norm": 0.3704714874750766, + "learning_rate": 2.416574095080404e-07, + "loss": 0.5491, + "step": 2240 + }, + { + "epoch": 2.7315190901705932, + "grad_norm": 0.36236149508058013, + "learning_rate": 2.394831849960377e-07, + "loss": 0.5425, + "step": 2241 + }, + { + "epoch": 2.732737611697807, + "grad_norm": 0.3749697833768894, + "learning_rate": 2.373185456258531e-07, + "loss": 0.5278, + "step": 2242 + }, + { + "epoch": 2.7339561332250204, + "grad_norm": 0.34972552754271036, + "learning_rate": 2.3516349575591568e-07, + "loss": 0.4618, + "step": 2243 + }, + { + "epoch": 2.735174654752234, + "grad_norm": 0.375574385039016, + "learning_rate": 2.330180397253473e-07, + "loss": 0.5175, + "step": 2244 + }, + { + "epoch": 2.7363931762794476, + "grad_norm": 0.37141386139233595, + "learning_rate": 2.3088218185395195e-07, + "loss": 0.5511, + "step": 2245 + }, + { + "epoch": 2.737611697806661, + "grad_norm": 0.3813893728380543, + "learning_rate": 2.2875592644220846e-07, + "loss": 0.4508, + "step": 2246 + }, + { + "epoch": 2.7388302193338747, + "grad_norm": 0.3775198226604257, + "learning_rate": 2.266392777712595e-07, + "loss": 0.4983, + "step": 2247 + }, + { + "epoch": 2.7400487408610887, + "grad_norm": 0.3750870423354922, + "learning_rate": 2.245322401029082e-07, + "loss": 0.5044, + "step": 2248 + }, + { + "epoch": 2.7412672623883023, + "grad_norm": 0.40047002764422285, + "learning_rate": 2.2243481767960483e-07, + "loss": 0.5827, + "step": 2249 + }, + { + "epoch": 2.742485783915516, + "grad_norm": 0.36949294277977374, + "learning_rate": 2.2034701472443854e-07, + "loss": 0.4752, + "step": 2250 + }, + { + "epoch": 2.7437043054427295, + "grad_norm": 0.3824346198409812, + "learning_rate": 2.1826883544113165e-07, + "loss": 0.5286, + "step": 2251 + }, + { + "epoch": 2.744922826969943, + "grad_norm": 0.3417914549036838, + "learning_rate": 2.1620028401402815e-07, + "loss": 0.4697, + "step": 2252 + }, + { + "epoch": 2.7461413484971566, + "grad_norm": 0.41461070303616865, + "learning_rate": 2.141413646080881e-07, + "loss": 0.5349, + "step": 2253 + }, + { + "epoch": 2.74735987002437, + "grad_norm": 0.37741611628599325, + "learning_rate": 2.1209208136887593e-07, + "loss": 0.5375, + "step": 2254 + }, + { + "epoch": 2.7485783915515842, + "grad_norm": 0.39321969452161015, + "learning_rate": 2.1005243842255552e-07, + "loss": 0.5025, + "step": 2255 + }, + { + "epoch": 2.749796913078798, + "grad_norm": 0.36043353231485903, + "learning_rate": 2.0802243987588068e-07, + "loss": 0.479, + "step": 2256 + }, + { + "epoch": 2.7510154346060114, + "grad_norm": 0.3771749654256085, + "learning_rate": 2.060020898161863e-07, + "loss": 0.5296, + "step": 2257 + }, + { + "epoch": 2.752233956133225, + "grad_norm": 0.36344961189872743, + "learning_rate": 2.0399139231137731e-07, + "loss": 0.513, + "step": 2258 + }, + { + "epoch": 2.7534524776604385, + "grad_norm": 0.37387207502104647, + "learning_rate": 2.019903514099275e-07, + "loss": 0.4837, + "step": 2259 + }, + { + "epoch": 2.7546709991876526, + "grad_norm": 0.4074258198058105, + "learning_rate": 1.999989711408662e-07, + "loss": 0.5165, + "step": 2260 + }, + { + "epoch": 2.7558895207148657, + "grad_norm": 0.40163838399796564, + "learning_rate": 1.9801725551377217e-07, + "loss": 0.484, + "step": 2261 + }, + { + "epoch": 2.7571080422420797, + "grad_norm": 0.39748068787202046, + "learning_rate": 1.9604520851876196e-07, + "loss": 0.5346, + "step": 2262 + }, + { + "epoch": 2.7583265637692933, + "grad_norm": 0.3837696844219795, + "learning_rate": 1.940828341264861e-07, + "loss": 0.5195, + "step": 2263 + }, + { + "epoch": 2.759545085296507, + "grad_norm": 0.372851636737616, + "learning_rate": 1.9213013628812173e-07, + "loss": 0.5025, + "step": 2264 + }, + { + "epoch": 2.7607636068237205, + "grad_norm": 0.3824295932103617, + "learning_rate": 1.9018711893535991e-07, + "loss": 0.4982, + "step": 2265 + }, + { + "epoch": 2.761982128350934, + "grad_norm": 0.3941575511286188, + "learning_rate": 1.8825378598040067e-07, + "loss": 0.4943, + "step": 2266 + }, + { + "epoch": 2.763200649878148, + "grad_norm": 0.3942171147413845, + "learning_rate": 1.863301413159474e-07, + "loss": 0.5597, + "step": 2267 + }, + { + "epoch": 2.7644191714053616, + "grad_norm": 0.3934183991195872, + "learning_rate": 1.8441618881519186e-07, + "loss": 0.483, + "step": 2268 + }, + { + "epoch": 2.765637692932575, + "grad_norm": 0.37982935450069527, + "learning_rate": 1.825119323318153e-07, + "loss": 0.4977, + "step": 2269 + }, + { + "epoch": 2.766856214459789, + "grad_norm": 0.35859063843934896, + "learning_rate": 1.8061737569997407e-07, + "loss": 0.5082, + "step": 2270 + }, + { + "epoch": 2.7680747359870024, + "grad_norm": 0.3882067848746523, + "learning_rate": 1.787325227342951e-07, + "loss": 0.5204, + "step": 2271 + }, + { + "epoch": 2.769293257514216, + "grad_norm": 0.3679939619950258, + "learning_rate": 1.768573772298665e-07, + "loss": 0.5395, + "step": 2272 + }, + { + "epoch": 2.7705117790414295, + "grad_norm": 0.3720953279482073, + "learning_rate": 1.7499194296223209e-07, + "loss": 0.5176, + "step": 2273 + }, + { + "epoch": 2.7717303005686436, + "grad_norm": 0.3858227060687811, + "learning_rate": 1.7313622368738014e-07, + "loss": 0.5067, + "step": 2274 + }, + { + "epoch": 2.772948822095857, + "grad_norm": 0.3747485160463101, + "learning_rate": 1.7129022314174015e-07, + "loss": 0.4811, + "step": 2275 + }, + { + "epoch": 2.7741673436230707, + "grad_norm": 0.38326441088479096, + "learning_rate": 1.694539450421734e-07, + "loss": 0.4991, + "step": 2276 + }, + { + "epoch": 2.7753858651502843, + "grad_norm": 0.34922626985376715, + "learning_rate": 1.6762739308596343e-07, + "loss": 0.5068, + "step": 2277 + }, + { + "epoch": 2.776604386677498, + "grad_norm": 0.37804059096779785, + "learning_rate": 1.6581057095081288e-07, + "loss": 0.4969, + "step": 2278 + }, + { + "epoch": 2.777822908204712, + "grad_norm": 0.39482724733491026, + "learning_rate": 1.640034822948311e-07, + "loss": 0.5356, + "step": 2279 + }, + { + "epoch": 2.779041429731925, + "grad_norm": 0.3564142364029908, + "learning_rate": 1.6220613075653201e-07, + "loss": 0.5082, + "step": 2280 + }, + { + "epoch": 2.780259951259139, + "grad_norm": 0.3808785336291618, + "learning_rate": 1.604185199548225e-07, + "loss": 0.5012, + "step": 2281 + }, + { + "epoch": 2.7814784727863526, + "grad_norm": 0.3466837847337903, + "learning_rate": 1.586406534889967e-07, + "loss": 0.5215, + "step": 2282 + }, + { + "epoch": 2.782696994313566, + "grad_norm": 0.35748226035408104, + "learning_rate": 1.5687253493873068e-07, + "loss": 0.4975, + "step": 2283 + }, + { + "epoch": 2.78391551584078, + "grad_norm": 0.38342498504946887, + "learning_rate": 1.5511416786407164e-07, + "loss": 0.499, + "step": 2284 + }, + { + "epoch": 2.7851340373679934, + "grad_norm": 0.38368966272625266, + "learning_rate": 1.5336555580543256e-07, + "loss": 0.5289, + "step": 2285 + }, + { + "epoch": 2.7863525588952074, + "grad_norm": 0.3761350556362916, + "learning_rate": 1.51626702283586e-07, + "loss": 0.5334, + "step": 2286 + }, + { + "epoch": 2.7875710804224205, + "grad_norm": 0.34488391672914276, + "learning_rate": 1.4989761079965583e-07, + "loss": 0.4731, + "step": 2287 + }, + { + "epoch": 2.7887896019496345, + "grad_norm": 0.3711824659401226, + "learning_rate": 1.4817828483510933e-07, + "loss": 0.5647, + "step": 2288 + }, + { + "epoch": 2.790008123476848, + "grad_norm": 0.36364946680537624, + "learning_rate": 1.4646872785175182e-07, + "loss": 0.5068, + "step": 2289 + }, + { + "epoch": 2.7912266450040617, + "grad_norm": 0.3640609104391418, + "learning_rate": 1.4476894329172042e-07, + "loss": 0.5129, + "step": 2290 + }, + { + "epoch": 2.7924451665312753, + "grad_norm": 0.36422428829864073, + "learning_rate": 1.4307893457747358e-07, + "loss": 0.5234, + "step": 2291 + }, + { + "epoch": 2.793663688058489, + "grad_norm": 0.3541446608840156, + "learning_rate": 1.4139870511178767e-07, + "loss": 0.5035, + "step": 2292 + }, + { + "epoch": 2.794882209585703, + "grad_norm": 0.3772886381428785, + "learning_rate": 1.3972825827774928e-07, + "loss": 0.5069, + "step": 2293 + }, + { + "epoch": 2.7961007311129165, + "grad_norm": 0.39173775427502877, + "learning_rate": 1.3806759743874688e-07, + "loss": 0.5421, + "step": 2294 + }, + { + "epoch": 2.79731925264013, + "grad_norm": 0.3652175638097129, + "learning_rate": 1.3641672593846632e-07, + "loss": 0.5213, + "step": 2295 + }, + { + "epoch": 2.7985377741673436, + "grad_norm": 0.36544225086563453, + "learning_rate": 1.3477564710088097e-07, + "loss": 0.4687, + "step": 2296 + }, + { + "epoch": 2.799756295694557, + "grad_norm": 0.3740249931778571, + "learning_rate": 1.3314436423024935e-07, + "loss": 0.518, + "step": 2297 + }, + { + "epoch": 2.8009748172217708, + "grad_norm": 0.37454883087183666, + "learning_rate": 1.3152288061110518e-07, + "loss": 0.4902, + "step": 2298 + }, + { + "epoch": 2.8021933387489844, + "grad_norm": 0.3629763422238737, + "learning_rate": 1.2991119950825138e-07, + "loss": 0.5329, + "step": 2299 + }, + { + "epoch": 2.8034118602761984, + "grad_norm": 0.3825610104170137, + "learning_rate": 1.2830932416675323e-07, + "loss": 0.5217, + "step": 2300 + }, + { + "epoch": 2.804630381803412, + "grad_norm": 0.35458779804689705, + "learning_rate": 1.2671725781193467e-07, + "loss": 0.482, + "step": 2301 + }, + { + "epoch": 2.8058489033306255, + "grad_norm": 0.4060620936449498, + "learning_rate": 1.251350036493676e-07, + "loss": 0.5396, + "step": 2302 + }, + { + "epoch": 2.807067424857839, + "grad_norm": 0.37363725202431575, + "learning_rate": 1.2356256486486806e-07, + "loss": 0.4898, + "step": 2303 + }, + { + "epoch": 2.8082859463850527, + "grad_norm": 0.35835154484517495, + "learning_rate": 1.2199994462448906e-07, + "loss": 0.493, + "step": 2304 + }, + { + "epoch": 2.8095044679122667, + "grad_norm": 0.42120102584212404, + "learning_rate": 1.2044714607451436e-07, + "loss": 0.5257, + "step": 2305 + }, + { + "epoch": 2.81072298943948, + "grad_norm": 0.3556943551485984, + "learning_rate": 1.1890417234145246e-07, + "loss": 0.5095, + "step": 2306 + }, + { + "epoch": 2.811941510966694, + "grad_norm": 0.38386985507916965, + "learning_rate": 1.1737102653202825e-07, + "loss": 0.5279, + "step": 2307 + }, + { + "epoch": 2.8131600324939074, + "grad_norm": 0.36317152822127746, + "learning_rate": 1.1584771173318076e-07, + "loss": 0.4927, + "step": 2308 + }, + { + "epoch": 2.814378554021121, + "grad_norm": 0.4011884968558589, + "learning_rate": 1.1433423101205321e-07, + "loss": 0.5282, + "step": 2309 + }, + { + "epoch": 2.8155970755483346, + "grad_norm": 0.3642862846860186, + "learning_rate": 1.1283058741598962e-07, + "loss": 0.4734, + "step": 2310 + }, + { + "epoch": 2.816815597075548, + "grad_norm": 0.38393727230766617, + "learning_rate": 1.1133678397252434e-07, + "loss": 0.5357, + "step": 2311 + }, + { + "epoch": 2.818034118602762, + "grad_norm": 0.3837474969152823, + "learning_rate": 1.0985282368938199e-07, + "loss": 0.5024, + "step": 2312 + }, + { + "epoch": 2.819252640129976, + "grad_norm": 0.4030179784599761, + "learning_rate": 1.0837870955446639e-07, + "loss": 0.5339, + "step": 2313 + }, + { + "epoch": 2.8204711616571894, + "grad_norm": 0.3539980646211641, + "learning_rate": 1.0691444453585775e-07, + "loss": 0.4979, + "step": 2314 + }, + { + "epoch": 2.821689683184403, + "grad_norm": 0.3490699508092425, + "learning_rate": 1.0546003158180496e-07, + "loss": 0.4861, + "step": 2315 + }, + { + "epoch": 2.8229082047116165, + "grad_norm": 0.3504210969211817, + "learning_rate": 1.0401547362071939e-07, + "loss": 0.4995, + "step": 2316 + }, + { + "epoch": 2.82412672623883, + "grad_norm": 0.3626756496837975, + "learning_rate": 1.0258077356117057e-07, + "loss": 0.5019, + "step": 2317 + }, + { + "epoch": 2.8253452477660437, + "grad_norm": 0.39120905301617104, + "learning_rate": 1.0115593429187942e-07, + "loss": 0.5056, + "step": 2318 + }, + { + "epoch": 2.8265637692932577, + "grad_norm": 0.38360724438244953, + "learning_rate": 9.974095868171164e-08, + "loss": 0.4574, + "step": 2319 + }, + { + "epoch": 2.8277822908204713, + "grad_norm": 0.4154087623735947, + "learning_rate": 9.833584957967491e-08, + "loss": 0.5459, + "step": 2320 + }, + { + "epoch": 2.829000812347685, + "grad_norm": 0.3684945590836955, + "learning_rate": 9.694060981490783e-08, + "loss": 0.5044, + "step": 2321 + }, + { + "epoch": 2.8302193338748984, + "grad_norm": 0.3759625212141255, + "learning_rate": 9.555524219667989e-08, + "loss": 0.5045, + "step": 2322 + }, + { + "epoch": 2.831437855402112, + "grad_norm": 0.3711414030240098, + "learning_rate": 9.417974951438203e-08, + "loss": 0.4909, + "step": 2323 + }, + { + "epoch": 2.8326563769293256, + "grad_norm": 0.40793520425182356, + "learning_rate": 9.281413453752386e-08, + "loss": 0.5911, + "step": 2324 + }, + { + "epoch": 2.833874898456539, + "grad_norm": 0.35038615442889337, + "learning_rate": 9.145840001572537e-08, + "loss": 0.5061, + "step": 2325 + }, + { + "epoch": 2.835093419983753, + "grad_norm": 0.33298280777144973, + "learning_rate": 9.011254867871244e-08, + "loss": 0.4843, + "step": 2326 + }, + { + "epoch": 2.8363119415109668, + "grad_norm": 0.37732042868872595, + "learning_rate": 8.877658323631188e-08, + "loss": 0.5434, + "step": 2327 + }, + { + "epoch": 2.8375304630381804, + "grad_norm": 0.3989852512203333, + "learning_rate": 8.745050637844532e-08, + "loss": 0.5179, + "step": 2328 + }, + { + "epoch": 2.838748984565394, + "grad_norm": 0.38524826151374814, + "learning_rate": 8.613432077512474e-08, + "loss": 0.5135, + "step": 2329 + }, + { + "epoch": 2.8399675060926075, + "grad_norm": 0.3561681050422304, + "learning_rate": 8.482802907644528e-08, + "loss": 0.5332, + "step": 2330 + }, + { + "epoch": 2.8411860276198215, + "grad_norm": 0.35905357929351883, + "learning_rate": 8.353163391258302e-08, + "loss": 0.4736, + "step": 2331 + }, + { + "epoch": 2.8424045491470347, + "grad_norm": 0.3771943557678179, + "learning_rate": 8.224513789378497e-08, + "loss": 0.4974, + "step": 2332 + }, + { + "epoch": 2.8436230706742487, + "grad_norm": 0.3689008459698471, + "learning_rate": 8.09685436103691e-08, + "loss": 0.5007, + "step": 2333 + }, + { + "epoch": 2.8448415922014623, + "grad_norm": 0.3946551692601427, + "learning_rate": 7.970185363271432e-08, + "loss": 0.5555, + "step": 2334 + }, + { + "epoch": 2.846060113728676, + "grad_norm": 0.3814135760411076, + "learning_rate": 7.844507051125937e-08, + "loss": 0.4953, + "step": 2335 + }, + { + "epoch": 2.8472786352558894, + "grad_norm": 0.3877741958314108, + "learning_rate": 7.71981967764951e-08, + "loss": 0.5059, + "step": 2336 + }, + { + "epoch": 2.848497156783103, + "grad_norm": 0.364018962944753, + "learning_rate": 7.59612349389599e-08, + "loss": 0.4948, + "step": 2337 + }, + { + "epoch": 2.849715678310317, + "grad_norm": 0.4075558874034061, + "learning_rate": 7.473418748923545e-08, + "loss": 0.5948, + "step": 2338 + }, + { + "epoch": 2.8509341998375306, + "grad_norm": 0.37855730911487784, + "learning_rate": 7.351705689794042e-08, + "loss": 0.424, + "step": 2339 + }, + { + "epoch": 2.852152721364744, + "grad_norm": 0.3738096901546095, + "learning_rate": 7.230984561572729e-08, + "loss": 0.5434, + "step": 2340 + }, + { + "epoch": 2.8533712428919578, + "grad_norm": 0.37627980072639083, + "learning_rate": 7.11125560732756e-08, + "loss": 0.4934, + "step": 2341 + }, + { + "epoch": 2.8545897644191713, + "grad_norm": 0.39211088076356204, + "learning_rate": 6.992519068128701e-08, + "loss": 0.4979, + "step": 2342 + }, + { + "epoch": 2.855808285946385, + "grad_norm": 0.36180712823693784, + "learning_rate": 6.8747751830483e-08, + "loss": 0.54, + "step": 2343 + }, + { + "epoch": 2.8570268074735985, + "grad_norm": 0.3422753121701595, + "learning_rate": 6.758024189159718e-08, + "loss": 0.4674, + "step": 2344 + }, + { + "epoch": 2.8582453290008125, + "grad_norm": 0.3670191252106066, + "learning_rate": 6.64226632153725e-08, + "loss": 0.5208, + "step": 2345 + }, + { + "epoch": 2.859463850528026, + "grad_norm": 0.3796974952937155, + "learning_rate": 6.527501813255344e-08, + "loss": 0.5399, + "step": 2346 + }, + { + "epoch": 2.8606823720552397, + "grad_norm": 0.36481445761250786, + "learning_rate": 6.413730895388714e-08, + "loss": 0.5072, + "step": 2347 + }, + { + "epoch": 2.8619008935824533, + "grad_norm": 0.3636784036139616, + "learning_rate": 6.300953797011178e-08, + "loss": 0.5291, + "step": 2348 + }, + { + "epoch": 2.863119415109667, + "grad_norm": 0.35701639845422045, + "learning_rate": 6.18917074519565e-08, + "loss": 0.503, + "step": 2349 + }, + { + "epoch": 2.864337936636881, + "grad_norm": 0.3780638398451471, + "learning_rate": 6.078381965013646e-08, + "loss": 0.526, + "step": 2350 + }, + { + "epoch": 2.865556458164094, + "grad_norm": 0.3863922669183168, + "learning_rate": 5.968587679534621e-08, + "loss": 0.4887, + "step": 2351 + }, + { + "epoch": 2.866774979691308, + "grad_norm": 0.36868437000440496, + "learning_rate": 5.8597881098257924e-08, + "loss": 0.535, + "step": 2352 + }, + { + "epoch": 2.8679935012185216, + "grad_norm": 0.3667806587451281, + "learning_rate": 5.751983474951317e-08, + "loss": 0.5357, + "step": 2353 + }, + { + "epoch": 2.869212022745735, + "grad_norm": 0.35160456413985003, + "learning_rate": 5.6451739919723417e-08, + "loss": 0.4966, + "step": 2354 + }, + { + "epoch": 2.8704305442729487, + "grad_norm": 0.3702286255153397, + "learning_rate": 5.539359875946171e-08, + "loss": 0.5364, + "step": 2355 + }, + { + "epoch": 2.8716490658001623, + "grad_norm": 0.3545425220241147, + "learning_rate": 5.434541339926047e-08, + "loss": 0.4989, + "step": 2356 + }, + { + "epoch": 2.8728675873273763, + "grad_norm": 0.37190280510667345, + "learning_rate": 5.3307185949605935e-08, + "loss": 0.5177, + "step": 2357 + }, + { + "epoch": 2.87408610885459, + "grad_norm": 0.39562598350439043, + "learning_rate": 5.227891850093314e-08, + "loss": 0.5489, + "step": 2358 + }, + { + "epoch": 2.8753046303818035, + "grad_norm": 0.3653775053445267, + "learning_rate": 5.12606131236254e-08, + "loss": 0.485, + "step": 2359 + }, + { + "epoch": 2.876523151909017, + "grad_norm": 0.3641584823261951, + "learning_rate": 5.025227186800652e-08, + "loss": 0.5217, + "step": 2360 + }, + { + "epoch": 2.8777416734362307, + "grad_norm": 0.38104906167780134, + "learning_rate": 4.925389676433745e-08, + "loss": 0.485, + "step": 2361 + }, + { + "epoch": 2.8789601949634442, + "grad_norm": 0.39160093264511175, + "learning_rate": 4.8265489822814094e-08, + "loss": 0.515, + "step": 2362 + }, + { + "epoch": 2.880178716490658, + "grad_norm": 0.37954553391255796, + "learning_rate": 4.728705303356007e-08, + "loss": 0.4743, + "step": 2363 + }, + { + "epoch": 2.881397238017872, + "grad_norm": 0.3786501167734631, + "learning_rate": 4.631858836662562e-08, + "loss": 0.5282, + "step": 2364 + }, + { + "epoch": 2.8826157595450854, + "grad_norm": 0.3847521896579275, + "learning_rate": 4.536009777198203e-08, + "loss": 0.4954, + "step": 2365 + }, + { + "epoch": 2.883834281072299, + "grad_norm": 0.3686885036028533, + "learning_rate": 4.441158317951777e-08, + "loss": 0.5, + "step": 2366 + }, + { + "epoch": 2.8850528025995126, + "grad_norm": 0.3476414945698473, + "learning_rate": 4.347304649903572e-08, + "loss": 0.5112, + "step": 2367 + }, + { + "epoch": 2.886271324126726, + "grad_norm": 0.3501811924875589, + "learning_rate": 4.2544489620248155e-08, + "loss": 0.5212, + "step": 2368 + }, + { + "epoch": 2.8874898456539397, + "grad_norm": 0.3522575951304983, + "learning_rate": 4.162591441277341e-08, + "loss": 0.5216, + "step": 2369 + }, + { + "epoch": 2.8887083671811533, + "grad_norm": 0.34261605478893353, + "learning_rate": 4.071732272613149e-08, + "loss": 0.4688, + "step": 2370 + }, + { + "epoch": 2.8899268887083673, + "grad_norm": 0.3647353980991927, + "learning_rate": 3.981871638974177e-08, + "loss": 0.5131, + "step": 2371 + }, + { + "epoch": 2.891145410235581, + "grad_norm": 0.37709939404151627, + "learning_rate": 3.8930097212918625e-08, + "loss": 0.5103, + "step": 2372 + }, + { + "epoch": 2.8923639317627945, + "grad_norm": 0.35808464221731223, + "learning_rate": 3.805146698486695e-08, + "loss": 0.4684, + "step": 2373 + }, + { + "epoch": 2.893582453290008, + "grad_norm": 0.3812340713874895, + "learning_rate": 3.7182827474678273e-08, + "loss": 0.5575, + "step": 2374 + }, + { + "epoch": 2.8948009748172217, + "grad_norm": 0.36259528565916516, + "learning_rate": 3.632418043133079e-08, + "loss": 0.513, + "step": 2375 + }, + { + "epoch": 2.8960194963444357, + "grad_norm": 0.3422911070110535, + "learning_rate": 3.5475527583681005e-08, + "loss": 0.4727, + "step": 2376 + }, + { + "epoch": 2.897238017871649, + "grad_norm": 0.3609331496351682, + "learning_rate": 3.463687064046317e-08, + "loss": 0.529, + "step": 2377 + }, + { + "epoch": 2.898456539398863, + "grad_norm": 0.3819585041721209, + "learning_rate": 3.3808211290284886e-08, + "loss": 0.481, + "step": 2378 + }, + { + "epoch": 2.8996750609260764, + "grad_norm": 0.3731434598411168, + "learning_rate": 3.2989551201624836e-08, + "loss": 0.5226, + "step": 2379 + }, + { + "epoch": 2.90089358245329, + "grad_norm": 0.3885400700621752, + "learning_rate": 3.2180892022826705e-08, + "loss": 0.5329, + "step": 2380 + }, + { + "epoch": 2.9021121039805036, + "grad_norm": 0.3572845398778449, + "learning_rate": 3.138223538209973e-08, + "loss": 0.4753, + "step": 2381 + }, + { + "epoch": 2.903330625507717, + "grad_norm": 0.3614383781834413, + "learning_rate": 3.059358288751202e-08, + "loss": 0.5409, + "step": 2382 + }, + { + "epoch": 2.904549147034931, + "grad_norm": 0.37189556032401827, + "learning_rate": 2.981493612698838e-08, + "loss": 0.5195, + "step": 2383 + }, + { + "epoch": 2.9057676685621447, + "grad_norm": 0.3789215584826568, + "learning_rate": 2.9046296668309716e-08, + "loss": 0.5074, + "step": 2384 + }, + { + "epoch": 2.9069861900893583, + "grad_norm": 0.370512893367381, + "learning_rate": 2.8287666059104713e-08, + "loss": 0.5191, + "step": 2385 + }, + { + "epoch": 2.908204711616572, + "grad_norm": 0.38297199689289846, + "learning_rate": 2.753904582685096e-08, + "loss": 0.4737, + "step": 2386 + }, + { + "epoch": 2.9094232331437855, + "grad_norm": 0.3959947838736986, + "learning_rate": 2.6800437478870512e-08, + "loss": 0.5115, + "step": 2387 + }, + { + "epoch": 2.910641754670999, + "grad_norm": 0.3957470998942836, + "learning_rate": 2.6071842502326526e-08, + "loss": 0.5063, + "step": 2388 + }, + { + "epoch": 2.9118602761982126, + "grad_norm": 0.3723580924084453, + "learning_rate": 2.535326236422053e-08, + "loss": 0.4892, + "step": 2389 + }, + { + "epoch": 2.9130787977254267, + "grad_norm": 0.37260731194295516, + "learning_rate": 2.464469851139073e-08, + "loss": 0.5542, + "step": 2390 + }, + { + "epoch": 2.9142973192526402, + "grad_norm": 0.35868024153034395, + "learning_rate": 2.394615237050535e-08, + "loss": 0.523, + "step": 2391 + }, + { + "epoch": 2.915515840779854, + "grad_norm": 0.36371587526669685, + "learning_rate": 2.3257625348064306e-08, + "loss": 0.4825, + "step": 2392 + }, + { + "epoch": 2.9167343623070674, + "grad_norm": 0.38007578965240923, + "learning_rate": 2.2579118830393654e-08, + "loss": 0.5096, + "step": 2393 + }, + { + "epoch": 2.917952883834281, + "grad_norm": 0.3894212168804138, + "learning_rate": 2.1910634183644475e-08, + "loss": 0.4839, + "step": 2394 + }, + { + "epoch": 2.9191714053614946, + "grad_norm": 0.40052317739398724, + "learning_rate": 2.1252172753787324e-08, + "loss": 0.5651, + "step": 2395 + }, + { + "epoch": 2.920389926888708, + "grad_norm": 0.35188112107988123, + "learning_rate": 2.060373586661224e-08, + "loss": 0.4977, + "step": 2396 + }, + { + "epoch": 2.921608448415922, + "grad_norm": 0.3499426602627878, + "learning_rate": 1.996532482772595e-08, + "loss": 0.4519, + "step": 2397 + }, + { + "epoch": 2.9228269699431357, + "grad_norm": 0.3812985888964239, + "learning_rate": 1.933694092254801e-08, + "loss": 0.5197, + "step": 2398 + }, + { + "epoch": 2.9240454914703493, + "grad_norm": 0.37294376101878496, + "learning_rate": 1.8718585416307443e-08, + "loss": 0.5252, + "step": 2399 + }, + { + "epoch": 2.925264012997563, + "grad_norm": 0.390284947297633, + "learning_rate": 1.811025955404333e-08, + "loss": 0.4939, + "step": 2400 + }, + { + "epoch": 2.9264825345247765, + "grad_norm": 0.38299656914528474, + "learning_rate": 1.751196456059867e-08, + "loss": 0.5282, + "step": 2401 + }, + { + "epoch": 2.9277010560519905, + "grad_norm": 0.4033256657794055, + "learning_rate": 1.6923701640621514e-08, + "loss": 0.5516, + "step": 2402 + }, + { + "epoch": 2.9289195775792036, + "grad_norm": 0.3564131129340729, + "learning_rate": 1.6345471978558847e-08, + "loss": 0.4492, + "step": 2403 + }, + { + "epoch": 2.9301380991064176, + "grad_norm": 0.3696818550866779, + "learning_rate": 1.577727673865659e-08, + "loss": 0.5282, + "step": 2404 + }, + { + "epoch": 2.9313566206336312, + "grad_norm": 0.3511572891694367, + "learning_rate": 1.5219117064957934e-08, + "loss": 0.5573, + "step": 2405 + }, + { + "epoch": 2.932575142160845, + "grad_norm": 0.3777779794080296, + "learning_rate": 1.4670994081297796e-08, + "loss": 0.4964, + "step": 2406 + }, + { + "epoch": 2.9337936636880584, + "grad_norm": 0.3408974732458375, + "learning_rate": 1.413290889130392e-08, + "loss": 0.5008, + "step": 2407 + }, + { + "epoch": 2.935012185215272, + "grad_norm": 0.35326628190060844, + "learning_rate": 1.3604862578392996e-08, + "loss": 0.4734, + "step": 2408 + }, + { + "epoch": 2.936230706742486, + "grad_norm": 0.3885332399006649, + "learning_rate": 1.3086856205768439e-08, + "loss": 0.5695, + "step": 2409 + }, + { + "epoch": 2.9374492282696996, + "grad_norm": 0.36720005893901214, + "learning_rate": 1.257889081641872e-08, + "loss": 0.4626, + "step": 2410 + }, + { + "epoch": 2.938667749796913, + "grad_norm": 0.38654367810832296, + "learning_rate": 1.208096743311571e-08, + "loss": 0.5201, + "step": 2411 + }, + { + "epoch": 2.9398862713241267, + "grad_norm": 0.35989398437475195, + "learning_rate": 1.159308705841078e-08, + "loss": 0.524, + "step": 2412 + }, + { + "epoch": 2.9411047928513403, + "grad_norm": 0.36447060978418805, + "learning_rate": 1.111525067463537e-08, + "loss": 0.4977, + "step": 2413 + }, + { + "epoch": 2.942323314378554, + "grad_norm": 0.3941566187065989, + "learning_rate": 1.0647459243897095e-08, + "loss": 0.5241, + "step": 2414 + }, + { + "epoch": 2.9435418359057675, + "grad_norm": 0.3971195756945423, + "learning_rate": 1.0189713708078086e-08, + "loss": 0.5083, + "step": 2415 + }, + { + "epoch": 2.9447603574329815, + "grad_norm": 0.3617746907444414, + "learning_rate": 9.74201498883387e-09, + "loss": 0.4824, + "step": 2416 + }, + { + "epoch": 2.945978878960195, + "grad_norm": 0.38983833247818245, + "learning_rate": 9.304363987591158e-09, + "loss": 0.5426, + "step": 2417 + }, + { + "epoch": 2.9471974004874086, + "grad_norm": 0.3739423997155145, + "learning_rate": 8.87676158554507e-09, + "loss": 0.4452, + "step": 2418 + }, + { + "epoch": 2.948415922014622, + "grad_norm": 0.37952476055522766, + "learning_rate": 8.459208643659122e-09, + "loss": 0.5432, + "step": 2419 + }, + { + "epoch": 2.949634443541836, + "grad_norm": 0.3634354801661458, + "learning_rate": 8.051706002661919e-09, + "loss": 0.5223, + "step": 2420 + }, + { + "epoch": 2.95085296506905, + "grad_norm": 0.3655917645054435, + "learning_rate": 7.65425448304713e-09, + "loss": 0.4784, + "step": 2421 + }, + { + "epoch": 2.952071486596263, + "grad_norm": 0.3878857225849821, + "learning_rate": 7.266854885069619e-09, + "loss": 0.536, + "step": 2422 + }, + { + "epoch": 2.953290008123477, + "grad_norm": 0.3860010124815134, + "learning_rate": 6.889507988745436e-09, + "loss": 0.5343, + "step": 2423 + }, + { + "epoch": 2.9545085296506906, + "grad_norm": 0.37137826994221546, + "learning_rate": 6.5222145538501595e-09, + "loss": 0.4683, + "step": 2424 + }, + { + "epoch": 2.955727051177904, + "grad_norm": 0.40358716261973293, + "learning_rate": 6.164975319917221e-09, + "loss": 0.5118, + "step": 2425 + }, + { + "epoch": 2.9569455727051177, + "grad_norm": 0.3805519912768825, + "learning_rate": 5.817791006235141e-09, + "loss": 0.5446, + "step": 2426 + }, + { + "epoch": 2.9581640942323313, + "grad_norm": 0.3645099929908209, + "learning_rate": 5.480662311848628e-09, + "loss": 0.4789, + "step": 2427 + }, + { + "epoch": 2.9593826157595453, + "grad_norm": 0.36366977745298745, + "learning_rate": 5.153589915554702e-09, + "loss": 0.5268, + "step": 2428 + }, + { + "epoch": 2.960601137286759, + "grad_norm": 0.35084949812297817, + "learning_rate": 4.836574475903244e-09, + "loss": 0.4545, + "step": 2429 + }, + { + "epoch": 2.9618196588139725, + "grad_norm": 0.3688342331523407, + "learning_rate": 4.5296166311931125e-09, + "loss": 0.5512, + "step": 2430 + }, + { + "epoch": 2.963038180341186, + "grad_norm": 0.3521398793576215, + "learning_rate": 4.232716999474917e-09, + "loss": 0.5379, + "step": 2431 + }, + { + "epoch": 2.9642567018683996, + "grad_norm": 0.3706937567678196, + "learning_rate": 3.9458761785460266e-09, + "loss": 0.5445, + "step": 2432 + }, + { + "epoch": 2.965475223395613, + "grad_norm": 0.35060638658108007, + "learning_rate": 3.669094745950008e-09, + "loss": 0.5027, + "step": 2433 + }, + { + "epoch": 2.966693744922827, + "grad_norm": 0.33178253347322945, + "learning_rate": 3.4023732589777426e-09, + "loss": 0.4681, + "step": 2434 + }, + { + "epoch": 2.967912266450041, + "grad_norm": 0.35617115102694386, + "learning_rate": 3.1457122546635353e-09, + "loss": 0.5019, + "step": 2435 + }, + { + "epoch": 2.9691307879772544, + "grad_norm": 0.3548326105732938, + "learning_rate": 2.899112249786229e-09, + "loss": 0.5276, + "step": 2436 + }, + { + "epoch": 2.970349309504468, + "grad_norm": 0.3984987781160192, + "learning_rate": 2.6625737408669804e-09, + "loss": 0.5172, + "step": 2437 + }, + { + "epoch": 2.9715678310316815, + "grad_norm": 0.362754815960633, + "learning_rate": 2.436097204167043e-09, + "loss": 0.5206, + "step": 2438 + }, + { + "epoch": 2.972786352558895, + "grad_norm": 0.38356526868895263, + "learning_rate": 2.2196830956905392e-09, + "loss": 0.4762, + "step": 2439 + }, + { + "epoch": 2.9740048740861087, + "grad_norm": 0.37390570550389457, + "learning_rate": 2.0133318511800227e-09, + "loss": 0.5343, + "step": 2440 + }, + { + "epoch": 2.9752233956133223, + "grad_norm": 0.35679383760064753, + "learning_rate": 1.8170438861159212e-09, + "loss": 0.4894, + "step": 2441 + }, + { + "epoch": 2.9764419171405363, + "grad_norm": 0.3592057260000064, + "learning_rate": 1.6308195957182028e-09, + "loss": 0.5362, + "step": 2442 + }, + { + "epoch": 2.97766043866775, + "grad_norm": 0.3259038266407775, + "learning_rate": 1.4546593549424892e-09, + "loss": 0.4727, + "step": 2443 + }, + { + "epoch": 2.9788789601949635, + "grad_norm": 0.37363652956982973, + "learning_rate": 1.2885635184828326e-09, + "loss": 0.5277, + "step": 2444 + }, + { + "epoch": 2.980097481722177, + "grad_norm": 0.35379174534673236, + "learning_rate": 1.1325324207667187e-09, + "loss": 0.4837, + "step": 2445 + }, + { + "epoch": 2.9813160032493906, + "grad_norm": 0.3675995235345453, + "learning_rate": 9.865663759578426e-10, + "loss": 0.5461, + "step": 2446 + }, + { + "epoch": 2.9825345247766046, + "grad_norm": 0.37636771913858935, + "learning_rate": 8.50665677953888e-10, + "loss": 0.4862, + "step": 2447 + }, + { + "epoch": 2.9837530463038178, + "grad_norm": 0.3681022019205161, + "learning_rate": 7.24830600386528e-10, + "loss": 0.4963, + "step": 2448 + }, + { + "epoch": 2.984971567831032, + "grad_norm": 0.3746370138837488, + "learning_rate": 6.09061396620314e-10, + "loss": 0.534, + "step": 2449 + }, + { + "epoch": 2.9861900893582454, + "grad_norm": 0.36486933688431966, + "learning_rate": 5.033582997526765e-10, + "loss": 0.577, + "step": 2450 + }, + { + "epoch": 2.987408610885459, + "grad_norm": 0.3650821349016925, + "learning_rate": 4.0772152261336906e-10, + "loss": 0.4508, + "step": 2451 + }, + { + "epoch": 2.9886271324126725, + "grad_norm": 0.380070504025212, + "learning_rate": 3.221512577639141e-10, + "loss": 0.5051, + "step": 2452 + }, + { + "epoch": 2.989845653939886, + "grad_norm": 0.3786185974021647, + "learning_rate": 2.466476774970472e-10, + "loss": 0.4845, + "step": 2453 + }, + { + "epoch": 2.9910641754671, + "grad_norm": 0.360482935976555, + "learning_rate": 1.812109338367174e-10, + "loss": 0.5414, + "step": 2454 + }, + { + "epoch": 2.9922826969943137, + "grad_norm": 0.372228898631232, + "learning_rate": 1.2584115853808697e-10, + "loss": 0.5194, + "step": 2455 + }, + { + "epoch": 2.9935012185215273, + "grad_norm": 0.38381176412541346, + "learning_rate": 8.053846308531122e-11, + "loss": 0.4912, + "step": 2456 + }, + { + "epoch": 2.994719740048741, + "grad_norm": 0.3934048198873252, + "learning_rate": 4.53029386948689e-11, + "loss": 0.5137, + "step": 2457 + }, + { + "epoch": 2.9959382615759544, + "grad_norm": 0.38117458951656574, + "learning_rate": 2.0134656311676658e-11, + "loss": 0.5353, + "step": 2458 + }, + { + "epoch": 2.997156783103168, + "grad_norm": 0.38135142008825745, + "learning_rate": 5.033666611864441e-12, + "loss": 0.4967, + "step": 2459 + }, + { + "epoch": 2.9983753046303816, + "grad_norm": 0.36779717909800114, + "learning_rate": 0.0, + "loss": 0.5127, + "step": 2460 + }, + { + "epoch": 2.9983753046303816, + "step": 2460, + "total_flos": 2708356203970560.0, + "train_loss": 0.5759637464110444, + "train_runtime": 38974.7056, + "train_samples_per_second": 6.062, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1, + "max_steps": 2460, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2708356203970560.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}