{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2930, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017064846416382253, "grad_norm": 5.5382735829047816, "learning_rate": 1.3651877133105803e-07, "loss": 0.9275, "step": 1 }, { "epoch": 0.0034129692832764505, "grad_norm": 5.560911534512389, "learning_rate": 2.7303754266211607e-07, "loss": 0.8735, "step": 2 }, { "epoch": 0.005119453924914676, "grad_norm": 5.571629079857404, "learning_rate": 4.0955631399317407e-07, "loss": 0.8948, "step": 3 }, { "epoch": 0.006825938566552901, "grad_norm": 5.399904062260569, "learning_rate": 5.460750853242321e-07, "loss": 0.8989, "step": 4 }, { "epoch": 0.008532423208191127, "grad_norm": 5.520885319552546, "learning_rate": 6.825938566552902e-07, "loss": 0.8903, "step": 5 }, { "epoch": 0.010238907849829351, "grad_norm": 5.516912402763377, "learning_rate": 8.191126279863481e-07, "loss": 0.8867, "step": 6 }, { "epoch": 0.011945392491467578, "grad_norm": 5.557317289645866, "learning_rate": 9.556313993174062e-07, "loss": 0.8591, "step": 7 }, { "epoch": 0.013651877133105802, "grad_norm": 5.045508456607984, "learning_rate": 1.0921501706484643e-06, "loss": 0.8438, "step": 8 }, { "epoch": 0.015358361774744027, "grad_norm": 4.867775053276732, "learning_rate": 1.2286689419795223e-06, "loss": 0.8324, "step": 9 }, { "epoch": 0.017064846416382253, "grad_norm": 4.17269521078131, "learning_rate": 1.3651877133105804e-06, "loss": 0.7919, "step": 10 }, { "epoch": 0.01877133105802048, "grad_norm": 3.8684320532409604, "learning_rate": 1.5017064846416384e-06, "loss": 0.7654, "step": 11 }, { "epoch": 0.020477815699658702, "grad_norm": 4.0237028768787875, "learning_rate": 1.6382252559726963e-06, "loss": 0.8797, "step": 12 }, { "epoch": 0.02218430034129693, "grad_norm": 2.3381353337879407, "learning_rate": 1.7747440273037543e-06, "loss": 0.7645, "step": 13 }, { "epoch": 0.023890784982935155, "grad_norm": 2.2908419509729003, "learning_rate": 1.9112627986348124e-06, "loss": 0.7652, "step": 14 }, { "epoch": 0.025597269624573378, "grad_norm": 2.1380170510125645, "learning_rate": 2.0477815699658705e-06, "loss": 0.8098, "step": 15 }, { "epoch": 0.027303754266211604, "grad_norm": 1.9451355783607645, "learning_rate": 2.1843003412969285e-06, "loss": 0.7624, "step": 16 }, { "epoch": 0.02901023890784983, "grad_norm": 2.4088929607724725, "learning_rate": 2.3208191126279866e-06, "loss": 0.747, "step": 17 }, { "epoch": 0.030716723549488054, "grad_norm": 3.075601225318945, "learning_rate": 2.4573378839590446e-06, "loss": 0.7657, "step": 18 }, { "epoch": 0.032423208191126277, "grad_norm": 3.216111415255342, "learning_rate": 2.5938566552901023e-06, "loss": 0.7787, "step": 19 }, { "epoch": 0.034129692832764506, "grad_norm": 2.93934194565305, "learning_rate": 2.7303754266211608e-06, "loss": 0.7258, "step": 20 }, { "epoch": 0.03583617747440273, "grad_norm": 2.8243675906534116, "learning_rate": 2.8668941979522184e-06, "loss": 0.7321, "step": 21 }, { "epoch": 0.03754266211604096, "grad_norm": 2.782095780866637, "learning_rate": 3.003412969283277e-06, "loss": 0.7147, "step": 22 }, { "epoch": 0.03924914675767918, "grad_norm": 2.1270711160975204, "learning_rate": 3.139931740614335e-06, "loss": 0.6972, "step": 23 }, { "epoch": 0.040955631399317405, "grad_norm": 1.6607845981555187, "learning_rate": 3.2764505119453926e-06, "loss": 0.7115, "step": 24 }, { "epoch": 0.042662116040955635, "grad_norm": 1.4607261467117063, "learning_rate": 3.412969283276451e-06, "loss": 0.6944, "step": 25 }, { "epoch": 0.04436860068259386, "grad_norm": 1.1103212511747629, "learning_rate": 3.5494880546075087e-06, "loss": 0.6621, "step": 26 }, { "epoch": 0.04607508532423208, "grad_norm": 1.0962073209819554, "learning_rate": 3.6860068259385667e-06, "loss": 0.633, "step": 27 }, { "epoch": 0.04778156996587031, "grad_norm": 1.2994991639042826, "learning_rate": 3.822525597269625e-06, "loss": 0.6254, "step": 28 }, { "epoch": 0.04948805460750853, "grad_norm": 1.24110998413488, "learning_rate": 3.959044368600683e-06, "loss": 0.6478, "step": 29 }, { "epoch": 0.051194539249146756, "grad_norm": 1.377261198618834, "learning_rate": 4.095563139931741e-06, "loss": 0.692, "step": 30 }, { "epoch": 0.052901023890784986, "grad_norm": 1.0661079292122833, "learning_rate": 4.232081911262799e-06, "loss": 0.6809, "step": 31 }, { "epoch": 0.05460750853242321, "grad_norm": 0.9607749961051216, "learning_rate": 4.368600682593857e-06, "loss": 0.6222, "step": 32 }, { "epoch": 0.05631399317406143, "grad_norm": 0.8427434655009416, "learning_rate": 4.505119453924915e-06, "loss": 0.6109, "step": 33 }, { "epoch": 0.05802047781569966, "grad_norm": 0.7136777520094485, "learning_rate": 4.641638225255973e-06, "loss": 0.6084, "step": 34 }, { "epoch": 0.059726962457337884, "grad_norm": 0.8210756508341455, "learning_rate": 4.778156996587031e-06, "loss": 0.6158, "step": 35 }, { "epoch": 0.06143344709897611, "grad_norm": 1.0150335798080576, "learning_rate": 4.914675767918089e-06, "loss": 0.6199, "step": 36 }, { "epoch": 0.06313993174061433, "grad_norm": 0.9063806967726155, "learning_rate": 5.051194539249147e-06, "loss": 0.6453, "step": 37 }, { "epoch": 0.06484641638225255, "grad_norm": 0.7068201160652776, "learning_rate": 5.1877133105802046e-06, "loss": 0.6104, "step": 38 }, { "epoch": 0.06655290102389079, "grad_norm": 0.638263361178943, "learning_rate": 5.324232081911264e-06, "loss": 0.59, "step": 39 }, { "epoch": 0.06825938566552901, "grad_norm": 0.6916379175425308, "learning_rate": 5.4607508532423215e-06, "loss": 0.5712, "step": 40 }, { "epoch": 0.06996587030716724, "grad_norm": 0.8611281068730197, "learning_rate": 5.597269624573379e-06, "loss": 0.6005, "step": 41 }, { "epoch": 0.07167235494880546, "grad_norm": 0.8014386934383825, "learning_rate": 5.733788395904437e-06, "loss": 0.573, "step": 42 }, { "epoch": 0.07337883959044368, "grad_norm": 0.6815974545922857, "learning_rate": 5.870307167235495e-06, "loss": 0.6091, "step": 43 }, { "epoch": 0.07508532423208192, "grad_norm": 0.601937526209857, "learning_rate": 6.006825938566554e-06, "loss": 0.5869, "step": 44 }, { "epoch": 0.07679180887372014, "grad_norm": 0.6872760313892801, "learning_rate": 6.143344709897611e-06, "loss": 0.6399, "step": 45 }, { "epoch": 0.07849829351535836, "grad_norm": 0.5584926915208507, "learning_rate": 6.27986348122867e-06, "loss": 0.6031, "step": 46 }, { "epoch": 0.08020477815699659, "grad_norm": 0.7413461854532756, "learning_rate": 6.4163822525597275e-06, "loss": 0.6285, "step": 47 }, { "epoch": 0.08191126279863481, "grad_norm": 0.7484828621746611, "learning_rate": 6.552901023890785e-06, "loss": 0.6211, "step": 48 }, { "epoch": 0.08361774744027303, "grad_norm": 0.59460985132373, "learning_rate": 6.689419795221843e-06, "loss": 0.6042, "step": 49 }, { "epoch": 0.08532423208191127, "grad_norm": 0.5303831678889555, "learning_rate": 6.825938566552902e-06, "loss": 0.5623, "step": 50 }, { "epoch": 0.08703071672354949, "grad_norm": 0.5993589782434914, "learning_rate": 6.96245733788396e-06, "loss": 0.6152, "step": 51 }, { "epoch": 0.08873720136518772, "grad_norm": 0.7099366121307583, "learning_rate": 7.098976109215017e-06, "loss": 0.5996, "step": 52 }, { "epoch": 0.09044368600682594, "grad_norm": 0.5195563613459195, "learning_rate": 7.235494880546076e-06, "loss": 0.586, "step": 53 }, { "epoch": 0.09215017064846416, "grad_norm": 0.5328364263084424, "learning_rate": 7.3720136518771335e-06, "loss": 0.5986, "step": 54 }, { "epoch": 0.09385665529010238, "grad_norm": 0.59504684976116, "learning_rate": 7.508532423208191e-06, "loss": 0.5781, "step": 55 }, { "epoch": 0.09556313993174062, "grad_norm": 0.5423417430735504, "learning_rate": 7.64505119453925e-06, "loss": 0.5317, "step": 56 }, { "epoch": 0.09726962457337884, "grad_norm": 0.6595692463709092, "learning_rate": 7.781569965870308e-06, "loss": 0.5683, "step": 57 }, { "epoch": 0.09897610921501707, "grad_norm": 0.7062945077387907, "learning_rate": 7.918088737201367e-06, "loss": 0.6021, "step": 58 }, { "epoch": 0.10068259385665529, "grad_norm": 0.5117294569725397, "learning_rate": 8.054607508532423e-06, "loss": 0.573, "step": 59 }, { "epoch": 0.10238907849829351, "grad_norm": 0.5628977910968079, "learning_rate": 8.191126279863482e-06, "loss": 0.5784, "step": 60 }, { "epoch": 0.10409556313993173, "grad_norm": 0.561096384808619, "learning_rate": 8.327645051194539e-06, "loss": 0.5733, "step": 61 }, { "epoch": 0.10580204778156997, "grad_norm": 0.6314272293109521, "learning_rate": 8.464163822525599e-06, "loss": 0.5638, "step": 62 }, { "epoch": 0.1075085324232082, "grad_norm": 0.6462442946319007, "learning_rate": 8.600682593856656e-06, "loss": 0.5847, "step": 63 }, { "epoch": 0.10921501706484642, "grad_norm": 0.4906413352665859, "learning_rate": 8.737201365187714e-06, "loss": 0.5353, "step": 64 }, { "epoch": 0.11092150170648464, "grad_norm": 0.5876681283539944, "learning_rate": 8.873720136518773e-06, "loss": 0.5841, "step": 65 }, { "epoch": 0.11262798634812286, "grad_norm": 0.5448387276272628, "learning_rate": 9.01023890784983e-06, "loss": 0.5838, "step": 66 }, { "epoch": 0.11433447098976109, "grad_norm": 0.5664187342399059, "learning_rate": 9.146757679180888e-06, "loss": 0.5755, "step": 67 }, { "epoch": 0.11604095563139932, "grad_norm": 0.5701448535010463, "learning_rate": 9.283276450511946e-06, "loss": 0.6011, "step": 68 }, { "epoch": 0.11774744027303755, "grad_norm": 0.5958601312411905, "learning_rate": 9.419795221843005e-06, "loss": 0.5972, "step": 69 }, { "epoch": 0.11945392491467577, "grad_norm": 0.5505177998669957, "learning_rate": 9.556313993174062e-06, "loss": 0.5856, "step": 70 }, { "epoch": 0.12116040955631399, "grad_norm": 0.6211369988588423, "learning_rate": 9.69283276450512e-06, "loss": 0.5607, "step": 71 }, { "epoch": 0.12286689419795221, "grad_norm": 0.5830089207049569, "learning_rate": 9.829351535836179e-06, "loss": 0.5609, "step": 72 }, { "epoch": 0.12457337883959044, "grad_norm": 0.5771202366609306, "learning_rate": 9.965870307167235e-06, "loss": 0.5753, "step": 73 }, { "epoch": 0.12627986348122866, "grad_norm": 0.5512514871122207, "learning_rate": 1.0102389078498294e-05, "loss": 0.5501, "step": 74 }, { "epoch": 0.12798634812286688, "grad_norm": 0.5921576744642906, "learning_rate": 1.0238907849829352e-05, "loss": 0.588, "step": 75 }, { "epoch": 0.1296928327645051, "grad_norm": 0.6279629362229029, "learning_rate": 1.0375426621160409e-05, "loss": 0.596, "step": 76 }, { "epoch": 0.13139931740614336, "grad_norm": 0.5328651531767897, "learning_rate": 1.051194539249147e-05, "loss": 0.6041, "step": 77 }, { "epoch": 0.13310580204778158, "grad_norm": 0.5245734699897833, "learning_rate": 1.0648464163822528e-05, "loss": 0.5617, "step": 78 }, { "epoch": 0.1348122866894198, "grad_norm": 0.5820231240535302, "learning_rate": 1.0784982935153585e-05, "loss": 0.5996, "step": 79 }, { "epoch": 0.13651877133105803, "grad_norm": 0.5787551695285894, "learning_rate": 1.0921501706484643e-05, "loss": 0.5913, "step": 80 }, { "epoch": 0.13822525597269625, "grad_norm": 0.5775924449404031, "learning_rate": 1.1058020477815702e-05, "loss": 0.5504, "step": 81 }, { "epoch": 0.13993174061433447, "grad_norm": 0.6000730341856517, "learning_rate": 1.1194539249146758e-05, "loss": 0.5753, "step": 82 }, { "epoch": 0.1416382252559727, "grad_norm": 0.5027518145184813, "learning_rate": 1.1331058020477817e-05, "loss": 0.5215, "step": 83 }, { "epoch": 0.14334470989761092, "grad_norm": 0.665396255158122, "learning_rate": 1.1467576791808874e-05, "loss": 0.5909, "step": 84 }, { "epoch": 0.14505119453924914, "grad_norm": 0.6507897383778756, "learning_rate": 1.1604095563139932e-05, "loss": 0.5719, "step": 85 }, { "epoch": 0.14675767918088736, "grad_norm": 0.5608661072691324, "learning_rate": 1.174061433447099e-05, "loss": 0.5437, "step": 86 }, { "epoch": 0.14846416382252559, "grad_norm": 0.5915603711419299, "learning_rate": 1.1877133105802047e-05, "loss": 0.5533, "step": 87 }, { "epoch": 0.15017064846416384, "grad_norm": 0.6332333514132811, "learning_rate": 1.2013651877133108e-05, "loss": 0.5432, "step": 88 }, { "epoch": 0.15187713310580206, "grad_norm": 0.5868221722267082, "learning_rate": 1.2150170648464166e-05, "loss": 0.5495, "step": 89 }, { "epoch": 0.15358361774744028, "grad_norm": 0.7081815015621716, "learning_rate": 1.2286689419795223e-05, "loss": 0.5258, "step": 90 }, { "epoch": 0.1552901023890785, "grad_norm": 0.71392633614486, "learning_rate": 1.2423208191126281e-05, "loss": 0.5762, "step": 91 }, { "epoch": 0.15699658703071673, "grad_norm": 0.6981523026906576, "learning_rate": 1.255972696245734e-05, "loss": 0.582, "step": 92 }, { "epoch": 0.15870307167235495, "grad_norm": 0.6171926262629204, "learning_rate": 1.2696245733788397e-05, "loss": 0.5607, "step": 93 }, { "epoch": 0.16040955631399317, "grad_norm": 0.5545169364183319, "learning_rate": 1.2832764505119455e-05, "loss": 0.5247, "step": 94 }, { "epoch": 0.1621160409556314, "grad_norm": 0.7302617590838634, "learning_rate": 1.2969283276450513e-05, "loss": 0.5377, "step": 95 }, { "epoch": 0.16382252559726962, "grad_norm": 0.6212142146844655, "learning_rate": 1.310580204778157e-05, "loss": 0.5573, "step": 96 }, { "epoch": 0.16552901023890784, "grad_norm": 0.7259534885617567, "learning_rate": 1.3242320819112629e-05, "loss": 0.5702, "step": 97 }, { "epoch": 0.16723549488054607, "grad_norm": 0.690308401698924, "learning_rate": 1.3378839590443686e-05, "loss": 0.4928, "step": 98 }, { "epoch": 0.1689419795221843, "grad_norm": 0.5541121051324966, "learning_rate": 1.3515358361774744e-05, "loss": 0.5138, "step": 99 }, { "epoch": 0.17064846416382254, "grad_norm": 0.6225627341053582, "learning_rate": 1.3651877133105804e-05, "loss": 0.5467, "step": 100 }, { "epoch": 0.17235494880546076, "grad_norm": 0.6549365302580259, "learning_rate": 1.3788395904436863e-05, "loss": 0.5784, "step": 101 }, { "epoch": 0.17406143344709898, "grad_norm": 0.5738375615533333, "learning_rate": 1.392491467576792e-05, "loss": 0.5478, "step": 102 }, { "epoch": 0.1757679180887372, "grad_norm": 0.7174652077251786, "learning_rate": 1.4061433447098978e-05, "loss": 0.5154, "step": 103 }, { "epoch": 0.17747440273037543, "grad_norm": 0.6259420624867417, "learning_rate": 1.4197952218430035e-05, "loss": 0.5328, "step": 104 }, { "epoch": 0.17918088737201365, "grad_norm": 0.6980024100865326, "learning_rate": 1.4334470989761093e-05, "loss": 0.5543, "step": 105 }, { "epoch": 0.18088737201365188, "grad_norm": 0.592418945044328, "learning_rate": 1.4470989761092152e-05, "loss": 0.5487, "step": 106 }, { "epoch": 0.1825938566552901, "grad_norm": 0.7138884575404962, "learning_rate": 1.4607508532423209e-05, "loss": 0.5359, "step": 107 }, { "epoch": 0.18430034129692832, "grad_norm": 0.601790646337058, "learning_rate": 1.4744027303754267e-05, "loss": 0.6079, "step": 108 }, { "epoch": 0.18600682593856654, "grad_norm": 0.7129207866642714, "learning_rate": 1.4880546075085325e-05, "loss": 0.5701, "step": 109 }, { "epoch": 0.18771331058020477, "grad_norm": 0.6382641762018992, "learning_rate": 1.5017064846416382e-05, "loss": 0.5662, "step": 110 }, { "epoch": 0.189419795221843, "grad_norm": 0.697222342232993, "learning_rate": 1.515358361774744e-05, "loss": 0.5631, "step": 111 }, { "epoch": 0.19112627986348124, "grad_norm": 0.627950035362892, "learning_rate": 1.52901023890785e-05, "loss": 0.5299, "step": 112 }, { "epoch": 0.19283276450511946, "grad_norm": 0.7254030376716648, "learning_rate": 1.5426621160409558e-05, "loss": 0.5587, "step": 113 }, { "epoch": 0.1945392491467577, "grad_norm": 0.6339029349597561, "learning_rate": 1.5563139931740616e-05, "loss": 0.5233, "step": 114 }, { "epoch": 0.1962457337883959, "grad_norm": 0.6136035997518353, "learning_rate": 1.5699658703071675e-05, "loss": 0.5658, "step": 115 }, { "epoch": 0.19795221843003413, "grad_norm": 0.638647805167733, "learning_rate": 1.5836177474402733e-05, "loss": 0.5164, "step": 116 }, { "epoch": 0.19965870307167236, "grad_norm": 0.5708056648144988, "learning_rate": 1.5972696245733788e-05, "loss": 0.5721, "step": 117 }, { "epoch": 0.20136518771331058, "grad_norm": 0.582867595107491, "learning_rate": 1.6109215017064847e-05, "loss": 0.5087, "step": 118 }, { "epoch": 0.2030716723549488, "grad_norm": 0.7041608007844309, "learning_rate": 1.6245733788395905e-05, "loss": 0.5429, "step": 119 }, { "epoch": 0.20477815699658702, "grad_norm": 0.7296029016815011, "learning_rate": 1.6382252559726964e-05, "loss": 0.523, "step": 120 }, { "epoch": 0.20648464163822525, "grad_norm": 0.7187410525850392, "learning_rate": 1.6518771331058022e-05, "loss": 0.5277, "step": 121 }, { "epoch": 0.20819112627986347, "grad_norm": 0.7371160136610254, "learning_rate": 1.6655290102389077e-05, "loss": 0.5416, "step": 122 }, { "epoch": 0.2098976109215017, "grad_norm": 0.705271247094977, "learning_rate": 1.6791808873720136e-05, "loss": 0.5919, "step": 123 }, { "epoch": 0.21160409556313994, "grad_norm": 0.6215027705274428, "learning_rate": 1.6928327645051198e-05, "loss": 0.5458, "step": 124 }, { "epoch": 0.21331058020477817, "grad_norm": 0.5530765715474965, "learning_rate": 1.7064846416382256e-05, "loss": 0.5468, "step": 125 }, { "epoch": 0.2150170648464164, "grad_norm": 0.6835601219469564, "learning_rate": 1.720136518771331e-05, "loss": 0.506, "step": 126 }, { "epoch": 0.2167235494880546, "grad_norm": 0.677833683372293, "learning_rate": 1.733788395904437e-05, "loss": 0.5408, "step": 127 }, { "epoch": 0.21843003412969283, "grad_norm": 0.6494275011141496, "learning_rate": 1.7474402730375428e-05, "loss": 0.555, "step": 128 }, { "epoch": 0.22013651877133106, "grad_norm": 0.726392896816137, "learning_rate": 1.7610921501706487e-05, "loss": 0.5315, "step": 129 }, { "epoch": 0.22184300341296928, "grad_norm": 0.5611722977626344, "learning_rate": 1.7747440273037545e-05, "loss": 0.5089, "step": 130 }, { "epoch": 0.2235494880546075, "grad_norm": 0.6652759423536284, "learning_rate": 1.78839590443686e-05, "loss": 0.5568, "step": 131 }, { "epoch": 0.22525597269624573, "grad_norm": 0.7259929707728502, "learning_rate": 1.802047781569966e-05, "loss": 0.5351, "step": 132 }, { "epoch": 0.22696245733788395, "grad_norm": 0.6196339136069524, "learning_rate": 1.8156996587030717e-05, "loss": 0.5078, "step": 133 }, { "epoch": 0.22866894197952217, "grad_norm": 0.6352424805125334, "learning_rate": 1.8293515358361776e-05, "loss": 0.5305, "step": 134 }, { "epoch": 0.23037542662116042, "grad_norm": 0.6756053919720748, "learning_rate": 1.8430034129692834e-05, "loss": 0.595, "step": 135 }, { "epoch": 0.23208191126279865, "grad_norm": 0.7257648478517666, "learning_rate": 1.8566552901023893e-05, "loss": 0.5047, "step": 136 }, { "epoch": 0.23378839590443687, "grad_norm": 0.6020515565047796, "learning_rate": 1.870307167235495e-05, "loss": 0.51, "step": 137 }, { "epoch": 0.2354948805460751, "grad_norm": 0.6805295832228313, "learning_rate": 1.883959044368601e-05, "loss": 0.5564, "step": 138 }, { "epoch": 0.23720136518771331, "grad_norm": 0.6016742179261152, "learning_rate": 1.8976109215017068e-05, "loss": 0.5324, "step": 139 }, { "epoch": 0.23890784982935154, "grad_norm": 0.7159623015806224, "learning_rate": 1.9112627986348123e-05, "loss": 0.5604, "step": 140 }, { "epoch": 0.24061433447098976, "grad_norm": 0.5731729805511779, "learning_rate": 1.924914675767918e-05, "loss": 0.5536, "step": 141 }, { "epoch": 0.24232081911262798, "grad_norm": 0.6701475664152404, "learning_rate": 1.938566552901024e-05, "loss": 0.5562, "step": 142 }, { "epoch": 0.2440273037542662, "grad_norm": 0.6546936591609688, "learning_rate": 1.95221843003413e-05, "loss": 0.5069, "step": 143 }, { "epoch": 0.24573378839590443, "grad_norm": 0.6563483278900194, "learning_rate": 1.9658703071672357e-05, "loss": 0.5073, "step": 144 }, { "epoch": 0.24744027303754265, "grad_norm": 0.6528440297406726, "learning_rate": 1.9795221843003412e-05, "loss": 0.5237, "step": 145 }, { "epoch": 0.24914675767918087, "grad_norm": 0.8771673210309611, "learning_rate": 1.993174061433447e-05, "loss": 0.5513, "step": 146 }, { "epoch": 0.2508532423208191, "grad_norm": 0.6642617286794137, "learning_rate": 2.0068259385665533e-05, "loss": 0.5514, "step": 147 }, { "epoch": 0.2525597269624573, "grad_norm": 0.6735728295163484, "learning_rate": 2.0204778156996588e-05, "loss": 0.4947, "step": 148 }, { "epoch": 0.25426621160409557, "grad_norm": 0.705298539331165, "learning_rate": 2.0341296928327646e-05, "loss": 0.5239, "step": 149 }, { "epoch": 0.25597269624573377, "grad_norm": 0.8255167567788236, "learning_rate": 2.0477815699658705e-05, "loss": 0.5451, "step": 150 }, { "epoch": 0.257679180887372, "grad_norm": 0.6253757777768068, "learning_rate": 2.0614334470989763e-05, "loss": 0.5471, "step": 151 }, { "epoch": 0.2593856655290102, "grad_norm": 0.6655165402469364, "learning_rate": 2.0750853242320818e-05, "loss": 0.5169, "step": 152 }, { "epoch": 0.26109215017064846, "grad_norm": 0.7135634503327206, "learning_rate": 2.088737201365188e-05, "loss": 0.5764, "step": 153 }, { "epoch": 0.2627986348122867, "grad_norm": 0.557034065124935, "learning_rate": 2.102389078498294e-05, "loss": 0.5264, "step": 154 }, { "epoch": 0.2645051194539249, "grad_norm": 0.7903317566711652, "learning_rate": 2.1160409556313994e-05, "loss": 0.5408, "step": 155 }, { "epoch": 0.26621160409556316, "grad_norm": 0.5705963001825619, "learning_rate": 2.1296928327645056e-05, "loss": 0.5145, "step": 156 }, { "epoch": 0.26791808873720135, "grad_norm": 0.7379094063463202, "learning_rate": 2.143344709897611e-05, "loss": 0.5544, "step": 157 }, { "epoch": 0.2696245733788396, "grad_norm": 0.5092582654098079, "learning_rate": 2.156996587030717e-05, "loss": 0.5334, "step": 158 }, { "epoch": 0.2713310580204778, "grad_norm": 0.7637584549009019, "learning_rate": 2.1706484641638224e-05, "loss": 0.5629, "step": 159 }, { "epoch": 0.27303754266211605, "grad_norm": 0.6039638458104883, "learning_rate": 2.1843003412969286e-05, "loss": 0.4974, "step": 160 }, { "epoch": 0.27474402730375425, "grad_norm": 0.6339412678654934, "learning_rate": 2.197952218430034e-05, "loss": 0.5186, "step": 161 }, { "epoch": 0.2764505119453925, "grad_norm": 0.6411901060597484, "learning_rate": 2.2116040955631403e-05, "loss": 0.5544, "step": 162 }, { "epoch": 0.2781569965870307, "grad_norm": 0.5670224727349668, "learning_rate": 2.2252559726962458e-05, "loss": 0.5144, "step": 163 }, { "epoch": 0.27986348122866894, "grad_norm": 0.6239260800233626, "learning_rate": 2.2389078498293517e-05, "loss": 0.5236, "step": 164 }, { "epoch": 0.2815699658703072, "grad_norm": 0.6347144073249235, "learning_rate": 2.252559726962458e-05, "loss": 0.5449, "step": 165 }, { "epoch": 0.2832764505119454, "grad_norm": 0.5902846060642897, "learning_rate": 2.2662116040955634e-05, "loss": 0.5467, "step": 166 }, { "epoch": 0.28498293515358364, "grad_norm": 0.6702489702179578, "learning_rate": 2.2798634812286692e-05, "loss": 0.5508, "step": 167 }, { "epoch": 0.28668941979522183, "grad_norm": 0.521860586936388, "learning_rate": 2.2935153583617747e-05, "loss": 0.4819, "step": 168 }, { "epoch": 0.2883959044368601, "grad_norm": 0.6757634905743073, "learning_rate": 2.307167235494881e-05, "loss": 0.5019, "step": 169 }, { "epoch": 0.2901023890784983, "grad_norm": 0.643206610737337, "learning_rate": 2.3208191126279864e-05, "loss": 0.5483, "step": 170 }, { "epoch": 0.29180887372013653, "grad_norm": 0.6980931391123527, "learning_rate": 2.3344709897610926e-05, "loss": 0.5236, "step": 171 }, { "epoch": 0.2935153583617747, "grad_norm": 0.7843824537207991, "learning_rate": 2.348122866894198e-05, "loss": 0.5377, "step": 172 }, { "epoch": 0.295221843003413, "grad_norm": 0.6537672086818195, "learning_rate": 2.361774744027304e-05, "loss": 0.5315, "step": 173 }, { "epoch": 0.29692832764505117, "grad_norm": 1.0082111848546023, "learning_rate": 2.3754266211604095e-05, "loss": 0.5339, "step": 174 }, { "epoch": 0.2986348122866894, "grad_norm": 0.9355730184429999, "learning_rate": 2.3890784982935157e-05, "loss": 0.545, "step": 175 }, { "epoch": 0.3003412969283277, "grad_norm": 0.8312045452721931, "learning_rate": 2.4027303754266215e-05, "loss": 0.5138, "step": 176 }, { "epoch": 0.30204778156996587, "grad_norm": 0.7766276707556691, "learning_rate": 2.416382252559727e-05, "loss": 0.5596, "step": 177 }, { "epoch": 0.3037542662116041, "grad_norm": 0.7036513475468814, "learning_rate": 2.4300341296928332e-05, "loss": 0.5588, "step": 178 }, { "epoch": 0.3054607508532423, "grad_norm": 0.8199422623918116, "learning_rate": 2.4436860068259387e-05, "loss": 0.5156, "step": 179 }, { "epoch": 0.30716723549488056, "grad_norm": 0.718906109811401, "learning_rate": 2.4573378839590446e-05, "loss": 0.5513, "step": 180 }, { "epoch": 0.30887372013651876, "grad_norm": 0.8086075980393629, "learning_rate": 2.4709897610921504e-05, "loss": 0.5273, "step": 181 }, { "epoch": 0.310580204778157, "grad_norm": 0.6153099660722545, "learning_rate": 2.4846416382252563e-05, "loss": 0.5369, "step": 182 }, { "epoch": 0.3122866894197952, "grad_norm": 0.8095994740813184, "learning_rate": 2.4982935153583618e-05, "loss": 0.5493, "step": 183 }, { "epoch": 0.31399317406143346, "grad_norm": 0.5922586913811912, "learning_rate": 2.511945392491468e-05, "loss": 0.5209, "step": 184 }, { "epoch": 0.31569965870307165, "grad_norm": 0.7562463377738635, "learning_rate": 2.5255972696245735e-05, "loss": 0.5187, "step": 185 }, { "epoch": 0.3174061433447099, "grad_norm": 0.6423657920096462, "learning_rate": 2.5392491467576793e-05, "loss": 0.4866, "step": 186 }, { "epoch": 0.3191126279863481, "grad_norm": 0.7239640355125908, "learning_rate": 2.5529010238907848e-05, "loss": 0.5409, "step": 187 }, { "epoch": 0.32081911262798635, "grad_norm": 0.7326794140013588, "learning_rate": 2.566552901023891e-05, "loss": 0.5832, "step": 188 }, { "epoch": 0.3225255972696246, "grad_norm": 0.6853080082888039, "learning_rate": 2.580204778156997e-05, "loss": 0.5526, "step": 189 }, { "epoch": 0.3242320819112628, "grad_norm": 0.7850430606444814, "learning_rate": 2.5938566552901027e-05, "loss": 0.5489, "step": 190 }, { "epoch": 0.32593856655290104, "grad_norm": 0.6849175762613122, "learning_rate": 2.6075085324232085e-05, "loss": 0.5107, "step": 191 }, { "epoch": 0.32764505119453924, "grad_norm": 0.7255246675949937, "learning_rate": 2.621160409556314e-05, "loss": 0.5333, "step": 192 }, { "epoch": 0.3293515358361775, "grad_norm": 0.6490845962032434, "learning_rate": 2.6348122866894202e-05, "loss": 0.4867, "step": 193 }, { "epoch": 0.3310580204778157, "grad_norm": 0.6075482567970028, "learning_rate": 2.6484641638225258e-05, "loss": 0.5384, "step": 194 }, { "epoch": 0.33276450511945393, "grad_norm": 0.6837704922056123, "learning_rate": 2.6621160409556316e-05, "loss": 0.5357, "step": 195 }, { "epoch": 0.33447098976109213, "grad_norm": 0.569359613071419, "learning_rate": 2.675767918088737e-05, "loss": 0.5543, "step": 196 }, { "epoch": 0.3361774744027304, "grad_norm": 0.6092108859734203, "learning_rate": 2.6894197952218433e-05, "loss": 0.5334, "step": 197 }, { "epoch": 0.3378839590443686, "grad_norm": 0.524514989396669, "learning_rate": 2.7030716723549488e-05, "loss": 0.6771, "step": 198 }, { "epoch": 0.3395904436860068, "grad_norm": 0.5628727302131958, "learning_rate": 2.716723549488055e-05, "loss": 0.5679, "step": 199 }, { "epoch": 0.3412969283276451, "grad_norm": 0.5947009175582536, "learning_rate": 2.730375426621161e-05, "loss": 0.5075, "step": 200 }, { "epoch": 0.3430034129692833, "grad_norm": 0.5624210026982975, "learning_rate": 2.7440273037542664e-05, "loss": 0.5353, "step": 201 }, { "epoch": 0.3447098976109215, "grad_norm": 0.6721813831160073, "learning_rate": 2.7576791808873725e-05, "loss": 0.5301, "step": 202 }, { "epoch": 0.3464163822525597, "grad_norm": 0.5492717552749656, "learning_rate": 2.771331058020478e-05, "loss": 0.5593, "step": 203 }, { "epoch": 0.34812286689419797, "grad_norm": 0.5813072784903112, "learning_rate": 2.784982935153584e-05, "loss": 0.5434, "step": 204 }, { "epoch": 0.34982935153583616, "grad_norm": 0.6527862562597682, "learning_rate": 2.7986348122866894e-05, "loss": 0.532, "step": 205 }, { "epoch": 0.3515358361774744, "grad_norm": 0.682141809401472, "learning_rate": 2.8122866894197956e-05, "loss": 0.5652, "step": 206 }, { "epoch": 0.3532423208191126, "grad_norm": 0.7797701437737451, "learning_rate": 2.825938566552901e-05, "loss": 0.5081, "step": 207 }, { "epoch": 0.35494880546075086, "grad_norm": 0.7324241925774362, "learning_rate": 2.839590443686007e-05, "loss": 0.5356, "step": 208 }, { "epoch": 0.35665529010238906, "grad_norm": 0.6576419396573902, "learning_rate": 2.8532423208191128e-05, "loss": 0.4907, "step": 209 }, { "epoch": 0.3583617747440273, "grad_norm": 0.6867342503402145, "learning_rate": 2.8668941979522186e-05, "loss": 0.5129, "step": 210 }, { "epoch": 0.36006825938566556, "grad_norm": 0.6549644929381108, "learning_rate": 2.880546075085325e-05, "loss": 0.5486, "step": 211 }, { "epoch": 0.36177474402730375, "grad_norm": 0.662278431802467, "learning_rate": 2.8941979522184303e-05, "loss": 0.5403, "step": 212 }, { "epoch": 0.363481228668942, "grad_norm": 0.526648439460945, "learning_rate": 2.9078498293515362e-05, "loss": 0.4915, "step": 213 }, { "epoch": 0.3651877133105802, "grad_norm": 0.6386994912227945, "learning_rate": 2.9215017064846417e-05, "loss": 0.5583, "step": 214 }, { "epoch": 0.36689419795221845, "grad_norm": 0.5749334384233534, "learning_rate": 2.935153583617748e-05, "loss": 0.5254, "step": 215 }, { "epoch": 0.36860068259385664, "grad_norm": 0.7850556328751596, "learning_rate": 2.9488054607508534e-05, "loss": 0.574, "step": 216 }, { "epoch": 0.3703071672354949, "grad_norm": 0.5715196045398925, "learning_rate": 2.9624573378839592e-05, "loss": 0.5269, "step": 217 }, { "epoch": 0.3720136518771331, "grad_norm": 0.7461124241882047, "learning_rate": 2.976109215017065e-05, "loss": 0.5514, "step": 218 }, { "epoch": 0.37372013651877134, "grad_norm": 0.6191886408557776, "learning_rate": 2.989761092150171e-05, "loss": 0.5224, "step": 219 }, { "epoch": 0.37542662116040953, "grad_norm": 0.6005041736662892, "learning_rate": 3.0034129692832765e-05, "loss": 0.4765, "step": 220 }, { "epoch": 0.3771331058020478, "grad_norm": 0.724603231134017, "learning_rate": 3.0170648464163826e-05, "loss": 0.5405, "step": 221 }, { "epoch": 0.378839590443686, "grad_norm": 0.5826856216983127, "learning_rate": 3.030716723549488e-05, "loss": 0.5345, "step": 222 }, { "epoch": 0.38054607508532423, "grad_norm": 0.7198123114976283, "learning_rate": 3.044368600682594e-05, "loss": 0.5247, "step": 223 }, { "epoch": 0.3822525597269625, "grad_norm": 0.5762779778908053, "learning_rate": 3.0580204778157e-05, "loss": 0.5119, "step": 224 }, { "epoch": 0.3839590443686007, "grad_norm": 0.7830870037164162, "learning_rate": 3.0716723549488054e-05, "loss": 0.5465, "step": 225 }, { "epoch": 0.3856655290102389, "grad_norm": 0.5420792016735733, "learning_rate": 3.0853242320819115e-05, "loss": 0.4986, "step": 226 }, { "epoch": 0.3873720136518771, "grad_norm": 0.7105286095689686, "learning_rate": 3.098976109215017e-05, "loss": 0.4984, "step": 227 }, { "epoch": 0.3890784982935154, "grad_norm": 0.6107287514130099, "learning_rate": 3.112627986348123e-05, "loss": 0.5422, "step": 228 }, { "epoch": 0.39078498293515357, "grad_norm": 0.6725313318953229, "learning_rate": 3.126279863481229e-05, "loss": 0.5472, "step": 229 }, { "epoch": 0.3924914675767918, "grad_norm": 0.6667927283774585, "learning_rate": 3.139931740614335e-05, "loss": 0.5205, "step": 230 }, { "epoch": 0.39419795221843, "grad_norm": 0.5771264923474617, "learning_rate": 3.1535836177474404e-05, "loss": 0.5335, "step": 231 }, { "epoch": 0.39590443686006827, "grad_norm": 0.5743686892022483, "learning_rate": 3.1672354948805466e-05, "loss": 0.567, "step": 232 }, { "epoch": 0.39761092150170646, "grad_norm": 0.642334479475538, "learning_rate": 3.180887372013652e-05, "loss": 0.5661, "step": 233 }, { "epoch": 0.3993174061433447, "grad_norm": 0.5708609452890699, "learning_rate": 3.1945392491467577e-05, "loss": 0.5227, "step": 234 }, { "epoch": 0.40102389078498296, "grad_norm": 0.6687187653936159, "learning_rate": 3.208191126279864e-05, "loss": 0.5339, "step": 235 }, { "epoch": 0.40273037542662116, "grad_norm": 0.6645444871635297, "learning_rate": 3.2218430034129693e-05, "loss": 0.5457, "step": 236 }, { "epoch": 0.4044368600682594, "grad_norm": 0.5313583267845954, "learning_rate": 3.2354948805460755e-05, "loss": 0.5193, "step": 237 }, { "epoch": 0.4061433447098976, "grad_norm": 0.6166169566756322, "learning_rate": 3.249146757679181e-05, "loss": 0.509, "step": 238 }, { "epoch": 0.40784982935153585, "grad_norm": 0.4855449343720997, "learning_rate": 3.262798634812287e-05, "loss": 0.5065, "step": 239 }, { "epoch": 0.40955631399317405, "grad_norm": 0.6096286686967779, "learning_rate": 3.276450511945393e-05, "loss": 0.5372, "step": 240 }, { "epoch": 0.4112627986348123, "grad_norm": 0.6881439297443295, "learning_rate": 3.290102389078499e-05, "loss": 0.5462, "step": 241 }, { "epoch": 0.4129692832764505, "grad_norm": 0.6556036894082107, "learning_rate": 3.3037542662116044e-05, "loss": 0.546, "step": 242 }, { "epoch": 0.41467576791808874, "grad_norm": 0.5420472627486871, "learning_rate": 3.31740614334471e-05, "loss": 0.5107, "step": 243 }, { "epoch": 0.41638225255972694, "grad_norm": 0.55567844718282, "learning_rate": 3.3310580204778155e-05, "loss": 0.5174, "step": 244 }, { "epoch": 0.4180887372013652, "grad_norm": 0.6166867575282067, "learning_rate": 3.3447098976109216e-05, "loss": 0.5108, "step": 245 }, { "epoch": 0.4197952218430034, "grad_norm": 0.5185587182303334, "learning_rate": 3.358361774744027e-05, "loss": 0.5152, "step": 246 }, { "epoch": 0.42150170648464164, "grad_norm": 0.6043195051507727, "learning_rate": 3.3720136518771333e-05, "loss": 0.5329, "step": 247 }, { "epoch": 0.4232081911262799, "grad_norm": 0.6320652628525706, "learning_rate": 3.3856655290102395e-05, "loss": 0.5258, "step": 248 }, { "epoch": 0.4249146757679181, "grad_norm": 0.5445988520895617, "learning_rate": 3.399317406143345e-05, "loss": 0.5106, "step": 249 }, { "epoch": 0.42662116040955633, "grad_norm": 0.6001675298572623, "learning_rate": 3.412969283276451e-05, "loss": 0.5319, "step": 250 }, { "epoch": 0.4283276450511945, "grad_norm": 0.5259131674323454, "learning_rate": 3.426621160409557e-05, "loss": 0.4957, "step": 251 }, { "epoch": 0.4300341296928328, "grad_norm": 0.723456289561426, "learning_rate": 3.440273037542662e-05, "loss": 0.5187, "step": 252 }, { "epoch": 0.431740614334471, "grad_norm": 0.6062601161437798, "learning_rate": 3.453924914675768e-05, "loss": 0.5288, "step": 253 }, { "epoch": 0.4334470989761092, "grad_norm": 0.6357304201610898, "learning_rate": 3.467576791808874e-05, "loss": 0.4921, "step": 254 }, { "epoch": 0.4351535836177474, "grad_norm": 0.6726290740589141, "learning_rate": 3.4812286689419794e-05, "loss": 0.5185, "step": 255 }, { "epoch": 0.43686006825938567, "grad_norm": 0.7087160989243053, "learning_rate": 3.4948805460750856e-05, "loss": 0.533, "step": 256 }, { "epoch": 0.43856655290102387, "grad_norm": 0.666647645226124, "learning_rate": 3.508532423208191e-05, "loss": 0.5353, "step": 257 }, { "epoch": 0.4402730375426621, "grad_norm": 0.6270896669729984, "learning_rate": 3.522184300341297e-05, "loss": 0.4978, "step": 258 }, { "epoch": 0.44197952218430037, "grad_norm": 0.6759682019764348, "learning_rate": 3.5358361774744035e-05, "loss": 0.5251, "step": 259 }, { "epoch": 0.44368600682593856, "grad_norm": 0.6676073872340431, "learning_rate": 3.549488054607509e-05, "loss": 0.4391, "step": 260 }, { "epoch": 0.4453924914675768, "grad_norm": 0.6956499802081617, "learning_rate": 3.5631399317406145e-05, "loss": 0.5426, "step": 261 }, { "epoch": 0.447098976109215, "grad_norm": 0.8039012535909295, "learning_rate": 3.57679180887372e-05, "loss": 0.5619, "step": 262 }, { "epoch": 0.44880546075085326, "grad_norm": 0.6701469728153912, "learning_rate": 3.590443686006826e-05, "loss": 0.5252, "step": 263 }, { "epoch": 0.45051194539249145, "grad_norm": 0.6490618284705951, "learning_rate": 3.604095563139932e-05, "loss": 0.5219, "step": 264 }, { "epoch": 0.4522184300341297, "grad_norm": 0.721560807463733, "learning_rate": 3.617747440273038e-05, "loss": 0.5359, "step": 265 }, { "epoch": 0.4539249146757679, "grad_norm": 0.5561673893746749, "learning_rate": 3.6313993174061434e-05, "loss": 0.6168, "step": 266 }, { "epoch": 0.45563139931740615, "grad_norm": 0.7584229071720511, "learning_rate": 3.6450511945392496e-05, "loss": 0.5315, "step": 267 }, { "epoch": 0.45733788395904434, "grad_norm": 0.5649763445593471, "learning_rate": 3.658703071672355e-05, "loss": 0.5245, "step": 268 }, { "epoch": 0.4590443686006826, "grad_norm": 0.6056674738757325, "learning_rate": 3.672354948805461e-05, "loss": 0.4945, "step": 269 }, { "epoch": 0.46075085324232085, "grad_norm": 0.5421799420542381, "learning_rate": 3.686006825938567e-05, "loss": 0.518, "step": 270 }, { "epoch": 0.46245733788395904, "grad_norm": 0.5751187796420072, "learning_rate": 3.6996587030716723e-05, "loss": 0.5393, "step": 271 }, { "epoch": 0.4641638225255973, "grad_norm": 0.5859129942902703, "learning_rate": 3.7133105802047785e-05, "loss": 0.5293, "step": 272 }, { "epoch": 0.4658703071672355, "grad_norm": 0.5938721485777587, "learning_rate": 3.726962457337884e-05, "loss": 0.5388, "step": 273 }, { "epoch": 0.46757679180887374, "grad_norm": 0.5218225688533202, "learning_rate": 3.74061433447099e-05, "loss": 0.5415, "step": 274 }, { "epoch": 0.46928327645051193, "grad_norm": 0.6449052599396903, "learning_rate": 3.754266211604096e-05, "loss": 0.5264, "step": 275 }, { "epoch": 0.4709897610921502, "grad_norm": 0.732037053998216, "learning_rate": 3.767918088737202e-05, "loss": 0.571, "step": 276 }, { "epoch": 0.4726962457337884, "grad_norm": 0.5440753439319131, "learning_rate": 3.7815699658703074e-05, "loss": 0.5385, "step": 277 }, { "epoch": 0.47440273037542663, "grad_norm": 0.6329161957095896, "learning_rate": 3.7952218430034136e-05, "loss": 0.5456, "step": 278 }, { "epoch": 0.4761092150170648, "grad_norm": 0.6528500511006126, "learning_rate": 3.808873720136519e-05, "loss": 0.5448, "step": 279 }, { "epoch": 0.4778156996587031, "grad_norm": 0.5545510586675965, "learning_rate": 3.8225255972696246e-05, "loss": 0.4856, "step": 280 }, { "epoch": 0.47952218430034127, "grad_norm": 0.5786537016897892, "learning_rate": 3.83617747440273e-05, "loss": 0.536, "step": 281 }, { "epoch": 0.4812286689419795, "grad_norm": 0.641252312841796, "learning_rate": 3.849829351535836e-05, "loss": 0.5591, "step": 282 }, { "epoch": 0.48293515358361777, "grad_norm": 0.5838397942330007, "learning_rate": 3.8634812286689425e-05, "loss": 0.5596, "step": 283 }, { "epoch": 0.48464163822525597, "grad_norm": 0.6148987590733005, "learning_rate": 3.877133105802048e-05, "loss": 0.4954, "step": 284 }, { "epoch": 0.4863481228668942, "grad_norm": 0.6518937049519072, "learning_rate": 3.890784982935154e-05, "loss": 0.5341, "step": 285 }, { "epoch": 0.4880546075085324, "grad_norm": 0.5410865955347726, "learning_rate": 3.90443686006826e-05, "loss": 0.5382, "step": 286 }, { "epoch": 0.48976109215017066, "grad_norm": 0.7101085537879301, "learning_rate": 3.918088737201366e-05, "loss": 0.592, "step": 287 }, { "epoch": 0.49146757679180886, "grad_norm": 0.6240221422741028, "learning_rate": 3.9317406143344714e-05, "loss": 0.5299, "step": 288 }, { "epoch": 0.4931740614334471, "grad_norm": 0.7583190153817212, "learning_rate": 3.945392491467577e-05, "loss": 0.5454, "step": 289 }, { "epoch": 0.4948805460750853, "grad_norm": 0.6085094169710514, "learning_rate": 3.9590443686006824e-05, "loss": 0.5159, "step": 290 }, { "epoch": 0.49658703071672355, "grad_norm": 0.549608958818555, "learning_rate": 3.9726962457337886e-05, "loss": 0.5009, "step": 291 }, { "epoch": 0.49829351535836175, "grad_norm": 0.6214117294853628, "learning_rate": 3.986348122866894e-05, "loss": 0.5484, "step": 292 }, { "epoch": 0.5, "grad_norm": 0.5328158045651903, "learning_rate": 4e-05, "loss": 0.5332, "step": 293 }, { "epoch": 0.5017064846416383, "grad_norm": 0.5554681326449042, "learning_rate": 3.9999985806829025e-05, "loss": 0.5124, "step": 294 }, { "epoch": 0.5034129692832765, "grad_norm": 0.5918511463290707, "learning_rate": 3.999994322733625e-05, "loss": 0.5466, "step": 295 }, { "epoch": 0.5051194539249146, "grad_norm": 0.6742018044203787, "learning_rate": 3.99998722615821e-05, "loss": 0.5973, "step": 296 }, { "epoch": 0.5068259385665529, "grad_norm": 0.6783398500413388, "learning_rate": 3.999977290966729e-05, "loss": 0.5627, "step": 297 }, { "epoch": 0.5085324232081911, "grad_norm": 0.6443965373732842, "learning_rate": 3.999964517173286e-05, "loss": 0.502, "step": 298 }, { "epoch": 0.5102389078498294, "grad_norm": 0.75173297356595, "learning_rate": 3.999948904796009e-05, "loss": 0.5753, "step": 299 }, { "epoch": 0.5119453924914675, "grad_norm": 0.5578886470083074, "learning_rate": 3.9999304538570564e-05, "loss": 0.5828, "step": 300 }, { "epoch": 0.5136518771331058, "grad_norm": 0.646565069167516, "learning_rate": 3.9999091643826175e-05, "loss": 0.5104, "step": 301 }, { "epoch": 0.515358361774744, "grad_norm": 0.601549331413928, "learning_rate": 3.999885036402908e-05, "loss": 0.5578, "step": 302 }, { "epoch": 0.5170648464163823, "grad_norm": 0.6385380164309451, "learning_rate": 3.999858069952173e-05, "loss": 0.542, "step": 303 }, { "epoch": 0.5187713310580204, "grad_norm": 0.6533143947463091, "learning_rate": 3.999828265068687e-05, "loss": 0.4904, "step": 304 }, { "epoch": 0.5204778156996587, "grad_norm": 0.6452412797735231, "learning_rate": 3.9997956217947525e-05, "loss": 0.5721, "step": 305 }, { "epoch": 0.5221843003412969, "grad_norm": 0.7700498239283877, "learning_rate": 3.999760140176701e-05, "loss": 0.5299, "step": 306 }, { "epoch": 0.5238907849829352, "grad_norm": 0.5347847319347675, "learning_rate": 3.999721820264891e-05, "loss": 0.4806, "step": 307 }, { "epoch": 0.5255972696245734, "grad_norm": 0.7201943625961811, "learning_rate": 3.999680662113711e-05, "loss": 0.5431, "step": 308 }, { "epoch": 0.5273037542662116, "grad_norm": 0.6587459195924609, "learning_rate": 3.9996366657815784e-05, "loss": 0.4872, "step": 309 }, { "epoch": 0.5290102389078498, "grad_norm": 0.568648667150652, "learning_rate": 3.999589831330937e-05, "loss": 0.5303, "step": 310 }, { "epoch": 0.5307167235494881, "grad_norm": 0.755987795579431, "learning_rate": 3.99954015882826e-05, "loss": 0.5326, "step": 311 }, { "epoch": 0.5324232081911263, "grad_norm": 0.7495800369506979, "learning_rate": 3.9994876483440483e-05, "loss": 0.5329, "step": 312 }, { "epoch": 0.5341296928327645, "grad_norm": 0.5368426553441042, "learning_rate": 3.999432299952831e-05, "loss": 0.5349, "step": 313 }, { "epoch": 0.5358361774744027, "grad_norm": 0.5423808226328886, "learning_rate": 3.999374113733165e-05, "loss": 0.4707, "step": 314 }, { "epoch": 0.537542662116041, "grad_norm": 0.4979739764128441, "learning_rate": 3.999313089767635e-05, "loss": 0.5378, "step": 315 }, { "epoch": 0.5392491467576792, "grad_norm": 0.5607008750404271, "learning_rate": 3.999249228142854e-05, "loss": 0.5719, "step": 316 }, { "epoch": 0.5409556313993175, "grad_norm": 0.5635028320952156, "learning_rate": 3.999182528949462e-05, "loss": 0.5007, "step": 317 }, { "epoch": 0.5426621160409556, "grad_norm": 0.4774574283561093, "learning_rate": 3.9991129922821244e-05, "loss": 0.5356, "step": 318 }, { "epoch": 0.5443686006825939, "grad_norm": 0.6841699434711654, "learning_rate": 3.999040618239537e-05, "loss": 0.5723, "step": 319 }, { "epoch": 0.5460750853242321, "grad_norm": 0.536155548533752, "learning_rate": 3.998965406924422e-05, "loss": 0.5195, "step": 320 }, { "epoch": 0.5477815699658704, "grad_norm": 0.5995649286161172, "learning_rate": 3.998887358443528e-05, "loss": 0.5412, "step": 321 }, { "epoch": 0.5494880546075085, "grad_norm": 0.5112550518958299, "learning_rate": 3.99880647290763e-05, "loss": 0.5296, "step": 322 }, { "epoch": 0.5511945392491467, "grad_norm": 0.5700086851182184, "learning_rate": 3.9987227504315295e-05, "loss": 0.593, "step": 323 }, { "epoch": 0.552901023890785, "grad_norm": 0.5291855952380958, "learning_rate": 3.998636191134057e-05, "loss": 0.542, "step": 324 }, { "epoch": 0.5546075085324232, "grad_norm": 0.5119590767300913, "learning_rate": 3.9985467951380666e-05, "loss": 0.584, "step": 325 }, { "epoch": 0.5563139931740614, "grad_norm": 0.5594566122074516, "learning_rate": 3.9984545625704396e-05, "loss": 0.5336, "step": 326 }, { "epoch": 0.5580204778156996, "grad_norm": 0.4637946091135965, "learning_rate": 3.9983594935620835e-05, "loss": 0.5618, "step": 327 }, { "epoch": 0.5597269624573379, "grad_norm": 0.6359265452032311, "learning_rate": 3.998261588247931e-05, "loss": 0.5676, "step": 328 }, { "epoch": 0.5614334470989761, "grad_norm": 0.47797621922130384, "learning_rate": 3.998160846766941e-05, "loss": 0.5135, "step": 329 }, { "epoch": 0.5631399317406144, "grad_norm": 0.5906982722443385, "learning_rate": 3.998057269262099e-05, "loss": 0.5358, "step": 330 }, { "epoch": 0.5648464163822525, "grad_norm": 0.5203977398556296, "learning_rate": 3.997950855880411e-05, "loss": 0.5072, "step": 331 }, { "epoch": 0.5665529010238908, "grad_norm": 0.6067712351750322, "learning_rate": 3.997841606772914e-05, "loss": 0.5626, "step": 332 }, { "epoch": 0.568259385665529, "grad_norm": 0.6157080556517592, "learning_rate": 3.997729522094667e-05, "loss": 0.5352, "step": 333 }, { "epoch": 0.5699658703071673, "grad_norm": 0.590679814892138, "learning_rate": 3.997614602004752e-05, "loss": 0.4935, "step": 334 }, { "epoch": 0.5716723549488054, "grad_norm": 0.6323221642452715, "learning_rate": 3.997496846666279e-05, "loss": 0.5341, "step": 335 }, { "epoch": 0.5733788395904437, "grad_norm": 0.6151594886516842, "learning_rate": 3.997376256246379e-05, "loss": 0.5615, "step": 336 }, { "epoch": 0.5750853242320819, "grad_norm": 0.5846274003132382, "learning_rate": 3.9972528309162086e-05, "loss": 0.5096, "step": 337 }, { "epoch": 0.5767918088737202, "grad_norm": 0.6245632779110738, "learning_rate": 3.997126570850947e-05, "loss": 0.5144, "step": 338 }, { "epoch": 0.5784982935153583, "grad_norm": 0.6498707890711793, "learning_rate": 3.9969974762297974e-05, "loss": 0.5371, "step": 339 }, { "epoch": 0.5802047781569966, "grad_norm": 0.6551387671037868, "learning_rate": 3.996865547235987e-05, "loss": 0.5, "step": 340 }, { "epoch": 0.5819112627986348, "grad_norm": 0.5854419220167429, "learning_rate": 3.996730784056763e-05, "loss": 0.5355, "step": 341 }, { "epoch": 0.5836177474402731, "grad_norm": 0.6766082992863565, "learning_rate": 3.9965931868833984e-05, "loss": 0.5334, "step": 342 }, { "epoch": 0.5853242320819113, "grad_norm": 0.5653804448656627, "learning_rate": 3.996452755911187e-05, "loss": 0.492, "step": 343 }, { "epoch": 0.5870307167235495, "grad_norm": 0.6505973904692599, "learning_rate": 3.996309491339445e-05, "loss": 0.5765, "step": 344 }, { "epoch": 0.5887372013651877, "grad_norm": 0.5830522080149915, "learning_rate": 3.99616339337151e-05, "loss": 0.5502, "step": 345 }, { "epoch": 0.590443686006826, "grad_norm": 0.6184398266691763, "learning_rate": 3.996014462214741e-05, "loss": 0.5188, "step": 346 }, { "epoch": 0.5921501706484642, "grad_norm": 0.5602745636382449, "learning_rate": 3.99586269808052e-05, "loss": 0.579, "step": 347 }, { "epoch": 0.5938566552901023, "grad_norm": 0.5417892827926399, "learning_rate": 3.995708101184246e-05, "loss": 0.5066, "step": 348 }, { "epoch": 0.5955631399317406, "grad_norm": 0.5635084844223573, "learning_rate": 3.995550671745343e-05, "loss": 0.5273, "step": 349 }, { "epoch": 0.5972696245733788, "grad_norm": 0.49850321668504116, "learning_rate": 3.9953904099872525e-05, "loss": 0.5225, "step": 350 }, { "epoch": 0.5989761092150171, "grad_norm": 0.5897835179612174, "learning_rate": 3.9952273161374366e-05, "loss": 0.5169, "step": 351 }, { "epoch": 0.6006825938566553, "grad_norm": 0.5115903280203131, "learning_rate": 3.9950613904273786e-05, "loss": 0.5376, "step": 352 }, { "epoch": 0.6023890784982935, "grad_norm": 0.5655155488320919, "learning_rate": 3.9948926330925775e-05, "loss": 0.5826, "step": 353 }, { "epoch": 0.6040955631399317, "grad_norm": 0.5548295358049624, "learning_rate": 3.994721044372555e-05, "loss": 0.5233, "step": 354 }, { "epoch": 0.60580204778157, "grad_norm": 0.49586722317702214, "learning_rate": 3.994546624510849e-05, "loss": 0.4997, "step": 355 }, { "epoch": 0.6075085324232082, "grad_norm": 0.7159231199083884, "learning_rate": 3.994369373755018e-05, "loss": 0.5276, "step": 356 }, { "epoch": 0.6092150170648464, "grad_norm": 0.5695196393270819, "learning_rate": 3.9941892923566354e-05, "loss": 0.5091, "step": 357 }, { "epoch": 0.6109215017064846, "grad_norm": 0.63179271946944, "learning_rate": 3.994006380571295e-05, "loss": 0.4945, "step": 358 }, { "epoch": 0.6126279863481229, "grad_norm": 0.4900888065252049, "learning_rate": 3.993820638658606e-05, "loss": 0.4921, "step": 359 }, { "epoch": 0.6143344709897611, "grad_norm": 0.5130463209318009, "learning_rate": 3.993632066882195e-05, "loss": 0.5001, "step": 360 }, { "epoch": 0.6160409556313993, "grad_norm": 0.5950557347893138, "learning_rate": 3.9934406655097055e-05, "loss": 0.5462, "step": 361 }, { "epoch": 0.6177474402730375, "grad_norm": 0.48765122887867945, "learning_rate": 3.9932464348127965e-05, "loss": 0.5383, "step": 362 }, { "epoch": 0.6194539249146758, "grad_norm": 0.6614230862754317, "learning_rate": 3.993049375067143e-05, "loss": 0.5632, "step": 363 }, { "epoch": 0.621160409556314, "grad_norm": 0.5271955351184203, "learning_rate": 3.992849486552435e-05, "loss": 0.5603, "step": 364 }, { "epoch": 0.6228668941979523, "grad_norm": 0.6424373509828155, "learning_rate": 3.992646769552379e-05, "loss": 0.5454, "step": 365 }, { "epoch": 0.6245733788395904, "grad_norm": 0.5912704776713205, "learning_rate": 3.992441224354693e-05, "loss": 0.5441, "step": 366 }, { "epoch": 0.6262798634812287, "grad_norm": 0.5659728833578781, "learning_rate": 3.9922328512511114e-05, "loss": 0.533, "step": 367 }, { "epoch": 0.6279863481228669, "grad_norm": 0.5759290370965763, "learning_rate": 3.992021650537382e-05, "loss": 0.5595, "step": 368 }, { "epoch": 0.6296928327645052, "grad_norm": 0.47616510524297306, "learning_rate": 3.991807622513266e-05, "loss": 0.5279, "step": 369 }, { "epoch": 0.6313993174061433, "grad_norm": 0.5825601686484897, "learning_rate": 3.9915907674825356e-05, "loss": 0.5413, "step": 370 }, { "epoch": 0.6331058020477816, "grad_norm": 0.4483345812815246, "learning_rate": 3.9913710857529784e-05, "loss": 0.4962, "step": 371 }, { "epoch": 0.6348122866894198, "grad_norm": 0.6775345195695525, "learning_rate": 3.991148577636391e-05, "loss": 0.5157, "step": 372 }, { "epoch": 0.636518771331058, "grad_norm": 0.4624283233342401, "learning_rate": 3.9909232434485836e-05, "loss": 0.5205, "step": 373 }, { "epoch": 0.6382252559726962, "grad_norm": 0.6699100743983422, "learning_rate": 3.990695083509378e-05, "loss": 0.5256, "step": 374 }, { "epoch": 0.6399317406143344, "grad_norm": 0.47189183222271325, "learning_rate": 3.990464098142604e-05, "loss": 0.5237, "step": 375 }, { "epoch": 0.6416382252559727, "grad_norm": 0.7343933179790109, "learning_rate": 3.990230287676103e-05, "loss": 0.5695, "step": 376 }, { "epoch": 0.643344709897611, "grad_norm": 0.47194980144112525, "learning_rate": 3.9899936524417274e-05, "loss": 0.4663, "step": 377 }, { "epoch": 0.6450511945392492, "grad_norm": 0.7087714839868563, "learning_rate": 3.9897541927753365e-05, "loss": 0.5266, "step": 378 }, { "epoch": 0.6467576791808873, "grad_norm": 0.5193346775206192, "learning_rate": 3.9895119090168e-05, "loss": 0.528, "step": 379 }, { "epoch": 0.6484641638225256, "grad_norm": 0.6482737816606934, "learning_rate": 3.989266801509996e-05, "loss": 0.5674, "step": 380 }, { "epoch": 0.6501706484641638, "grad_norm": 0.5638383348296886, "learning_rate": 3.9890188706028084e-05, "loss": 0.4937, "step": 381 }, { "epoch": 0.6518771331058021, "grad_norm": 0.4752960584294783, "learning_rate": 3.988768116647131e-05, "loss": 0.5186, "step": 382 }, { "epoch": 0.6535836177474402, "grad_norm": 0.5731454087459066, "learning_rate": 3.988514539998862e-05, "loss": 0.5342, "step": 383 }, { "epoch": 0.6552901023890785, "grad_norm": 0.616267780212079, "learning_rate": 3.988258141017909e-05, "loss": 0.5393, "step": 384 }, { "epoch": 0.6569965870307167, "grad_norm": 0.5773096287704862, "learning_rate": 3.987998920068181e-05, "loss": 0.536, "step": 385 }, { "epoch": 0.658703071672355, "grad_norm": 0.6696008091283474, "learning_rate": 3.987736877517597e-05, "loss": 0.5336, "step": 386 }, { "epoch": 0.6604095563139932, "grad_norm": 0.5434061720204441, "learning_rate": 3.987472013738076e-05, "loss": 0.5243, "step": 387 }, { "epoch": 0.6621160409556314, "grad_norm": 0.6591616426149787, "learning_rate": 3.987204329105547e-05, "loss": 0.4863, "step": 388 }, { "epoch": 0.6638225255972696, "grad_norm": 0.5679402238967721, "learning_rate": 3.986933823999936e-05, "loss": 0.4915, "step": 389 }, { "epoch": 0.6655290102389079, "grad_norm": 0.556965760174508, "learning_rate": 3.986660498805177e-05, "loss": 0.524, "step": 390 }, { "epoch": 0.6672354948805461, "grad_norm": 0.6852120576628149, "learning_rate": 3.986384353909205e-05, "loss": 0.5152, "step": 391 }, { "epoch": 0.6689419795221843, "grad_norm": 0.46228477287702213, "learning_rate": 3.9861053897039585e-05, "loss": 0.5128, "step": 392 }, { "epoch": 0.6706484641638225, "grad_norm": 0.5520827992904221, "learning_rate": 3.9858236065853745e-05, "loss": 0.5262, "step": 393 }, { "epoch": 0.6723549488054608, "grad_norm": 0.5637411541812409, "learning_rate": 3.985539004953393e-05, "loss": 0.5347, "step": 394 }, { "epoch": 0.674061433447099, "grad_norm": 0.48720007300892015, "learning_rate": 3.9852515852119535e-05, "loss": 0.4894, "step": 395 }, { "epoch": 0.6757679180887372, "grad_norm": 0.5142995634230677, "learning_rate": 3.9849613477689964e-05, "loss": 0.5536, "step": 396 }, { "epoch": 0.6774744027303754, "grad_norm": 0.5518178426385149, "learning_rate": 3.9846682930364614e-05, "loss": 0.5666, "step": 397 }, { "epoch": 0.6791808873720137, "grad_norm": 0.46116045794249394, "learning_rate": 3.9843724214302844e-05, "loss": 0.5038, "step": 398 }, { "epoch": 0.6808873720136519, "grad_norm": 0.4856736298162217, "learning_rate": 3.984073733370402e-05, "loss": 0.4917, "step": 399 }, { "epoch": 0.6825938566552902, "grad_norm": 0.4968308292207791, "learning_rate": 3.9837722292807465e-05, "loss": 0.4928, "step": 400 }, { "epoch": 0.6843003412969283, "grad_norm": 0.4375164806803662, "learning_rate": 3.9834679095892494e-05, "loss": 0.5173, "step": 401 }, { "epoch": 0.6860068259385665, "grad_norm": 0.4632871905277133, "learning_rate": 3.983160774727836e-05, "loss": 0.5085, "step": 402 }, { "epoch": 0.6877133105802048, "grad_norm": 0.5600303956613327, "learning_rate": 3.982850825132428e-05, "loss": 0.5597, "step": 403 }, { "epoch": 0.689419795221843, "grad_norm": 0.4806062870610982, "learning_rate": 3.982538061242941e-05, "loss": 0.5102, "step": 404 }, { "epoch": 0.6911262798634812, "grad_norm": 0.6085239375371078, "learning_rate": 3.982222483503288e-05, "loss": 0.4918, "step": 405 }, { "epoch": 0.6928327645051194, "grad_norm": 0.530816198322567, "learning_rate": 3.9819040923613734e-05, "loss": 0.5419, "step": 406 }, { "epoch": 0.6945392491467577, "grad_norm": 0.5500182844719029, "learning_rate": 3.981582888269094e-05, "loss": 0.5077, "step": 407 }, { "epoch": 0.6962457337883959, "grad_norm": 0.5420512450147166, "learning_rate": 3.9812588716823424e-05, "loss": 0.5057, "step": 408 }, { "epoch": 0.6979522184300341, "grad_norm": 0.5157060237685485, "learning_rate": 3.980932043060999e-05, "loss": 0.5346, "step": 409 }, { "epoch": 0.6996587030716723, "grad_norm": 0.651581330036418, "learning_rate": 3.9806024028689376e-05, "loss": 0.5202, "step": 410 }, { "epoch": 0.7013651877133106, "grad_norm": 0.5661892497248172, "learning_rate": 3.980269951574022e-05, "loss": 0.5427, "step": 411 }, { "epoch": 0.7030716723549488, "grad_norm": 0.5967504503872553, "learning_rate": 3.979934689648108e-05, "loss": 0.5249, "step": 412 }, { "epoch": 0.7047781569965871, "grad_norm": 0.5592298812958022, "learning_rate": 3.979596617567036e-05, "loss": 0.5261, "step": 413 }, { "epoch": 0.7064846416382252, "grad_norm": 0.558091676610057, "learning_rate": 3.9792557358106385e-05, "loss": 0.4912, "step": 414 }, { "epoch": 0.7081911262798635, "grad_norm": 0.5022628834006392, "learning_rate": 3.978912044862735e-05, "loss": 0.4906, "step": 415 }, { "epoch": 0.7098976109215017, "grad_norm": 0.5017534110957725, "learning_rate": 3.978565545211132e-05, "loss": 0.5587, "step": 416 }, { "epoch": 0.71160409556314, "grad_norm": 0.46967662094610146, "learning_rate": 3.978216237347622e-05, "loss": 0.5136, "step": 417 }, { "epoch": 0.7133105802047781, "grad_norm": 0.4744131857049406, "learning_rate": 3.977864121767985e-05, "loss": 0.5402, "step": 418 }, { "epoch": 0.7150170648464164, "grad_norm": 0.4959077190551545, "learning_rate": 3.977509198971982e-05, "loss": 0.5232, "step": 419 }, { "epoch": 0.7167235494880546, "grad_norm": 0.45485104221578665, "learning_rate": 3.977151469463363e-05, "loss": 0.5463, "step": 420 }, { "epoch": 0.7184300341296929, "grad_norm": 0.4693252104059602, "learning_rate": 3.9767909337498584e-05, "loss": 0.5141, "step": 421 }, { "epoch": 0.7201365187713311, "grad_norm": 0.38234318138230633, "learning_rate": 3.9764275923431836e-05, "loss": 0.4884, "step": 422 }, { "epoch": 0.7218430034129693, "grad_norm": 0.483426578271764, "learning_rate": 3.976061445759035e-05, "loss": 0.5446, "step": 423 }, { "epoch": 0.7235494880546075, "grad_norm": 0.4313890083656951, "learning_rate": 3.9756924945170914e-05, "loss": 0.4995, "step": 424 }, { "epoch": 0.7252559726962458, "grad_norm": 0.45736306408385247, "learning_rate": 3.97532073914101e-05, "loss": 0.4908, "step": 425 }, { "epoch": 0.726962457337884, "grad_norm": 0.4431863764164522, "learning_rate": 3.974946180158431e-05, "loss": 0.5272, "step": 426 }, { "epoch": 0.7286689419795221, "grad_norm": 0.5360713837969308, "learning_rate": 3.9745688181009716e-05, "loss": 0.537, "step": 427 }, { "epoch": 0.7303754266211604, "grad_norm": 0.45208450010601464, "learning_rate": 3.974188653504229e-05, "loss": 0.5191, "step": 428 }, { "epoch": 0.7320819112627986, "grad_norm": 0.5989479913873644, "learning_rate": 3.973805686907777e-05, "loss": 0.5399, "step": 429 }, { "epoch": 0.7337883959044369, "grad_norm": 0.4208920353633511, "learning_rate": 3.9734199188551655e-05, "loss": 0.5136, "step": 430 }, { "epoch": 0.735494880546075, "grad_norm": 0.5169436211718849, "learning_rate": 3.9730313498939225e-05, "loss": 0.5047, "step": 431 }, { "epoch": 0.7372013651877133, "grad_norm": 0.45544132069369514, "learning_rate": 3.972639980575552e-05, "loss": 0.5041, "step": 432 }, { "epoch": 0.7389078498293515, "grad_norm": 0.45687457737266457, "learning_rate": 3.972245811455529e-05, "loss": 0.4646, "step": 433 }, { "epoch": 0.7406143344709898, "grad_norm": 0.5039590590912099, "learning_rate": 3.971848843093305e-05, "loss": 0.5065, "step": 434 }, { "epoch": 0.742320819112628, "grad_norm": 0.4656271645725468, "learning_rate": 3.971449076052305e-05, "loss": 0.5204, "step": 435 }, { "epoch": 0.7440273037542662, "grad_norm": 0.4755396482692981, "learning_rate": 3.9710465108999245e-05, "loss": 0.5004, "step": 436 }, { "epoch": 0.7457337883959044, "grad_norm": 0.5280369390574324, "learning_rate": 3.9706411482075304e-05, "loss": 0.5469, "step": 437 }, { "epoch": 0.7474402730375427, "grad_norm": 0.4588996567849872, "learning_rate": 3.970232988550462e-05, "loss": 0.5059, "step": 438 }, { "epoch": 0.7491467576791809, "grad_norm": 0.5067536606016981, "learning_rate": 3.9698220325080275e-05, "loss": 0.5132, "step": 439 }, { "epoch": 0.7508532423208191, "grad_norm": 0.4701245104251668, "learning_rate": 3.9694082806635026e-05, "loss": 0.4901, "step": 440 }, { "epoch": 0.7525597269624573, "grad_norm": 0.46968979252829884, "learning_rate": 3.9689917336041336e-05, "loss": 0.5426, "step": 441 }, { "epoch": 0.7542662116040956, "grad_norm": 0.49805975671632624, "learning_rate": 3.9685723919211316e-05, "loss": 0.5136, "step": 442 }, { "epoch": 0.7559726962457338, "grad_norm": 0.4726176407036461, "learning_rate": 3.9681502562096764e-05, "loss": 0.5295, "step": 443 }, { "epoch": 0.757679180887372, "grad_norm": 0.5199428138549627, "learning_rate": 3.9677253270689116e-05, "loss": 0.4818, "step": 444 }, { "epoch": 0.7593856655290102, "grad_norm": 0.40331331997993186, "learning_rate": 3.9672976051019477e-05, "loss": 0.5205, "step": 445 }, { "epoch": 0.7610921501706485, "grad_norm": 0.5327355482683795, "learning_rate": 3.9668670909158565e-05, "loss": 0.5009, "step": 446 }, { "epoch": 0.7627986348122867, "grad_norm": 0.43032587423532564, "learning_rate": 3.966433785121675e-05, "loss": 0.5193, "step": 447 }, { "epoch": 0.764505119453925, "grad_norm": 0.4506217825601562, "learning_rate": 3.965997688334401e-05, "loss": 0.5237, "step": 448 }, { "epoch": 0.7662116040955631, "grad_norm": 0.4492241327004955, "learning_rate": 3.965558801172994e-05, "loss": 0.5027, "step": 449 }, { "epoch": 0.7679180887372014, "grad_norm": 0.5212413159647389, "learning_rate": 3.9651171242603746e-05, "loss": 0.6042, "step": 450 }, { "epoch": 0.7696245733788396, "grad_norm": 0.5075927470071195, "learning_rate": 3.964672658223422e-05, "loss": 0.5733, "step": 451 }, { "epoch": 0.7713310580204779, "grad_norm": 0.5239354700021109, "learning_rate": 3.964225403692975e-05, "loss": 0.5156, "step": 452 }, { "epoch": 0.773037542662116, "grad_norm": 0.4531072066408163, "learning_rate": 3.963775361303829e-05, "loss": 0.5297, "step": 453 }, { "epoch": 0.7747440273037542, "grad_norm": 0.4599191961695215, "learning_rate": 3.963322531694737e-05, "loss": 0.5257, "step": 454 }, { "epoch": 0.7764505119453925, "grad_norm": 0.5001588299554403, "learning_rate": 3.962866915508408e-05, "loss": 0.4549, "step": 455 }, { "epoch": 0.7781569965870307, "grad_norm": 0.5038191454985079, "learning_rate": 3.962408513391505e-05, "loss": 0.5941, "step": 456 }, { "epoch": 0.7798634812286689, "grad_norm": 0.5772536453395402, "learning_rate": 3.961947325994648e-05, "loss": 0.6, "step": 457 }, { "epoch": 0.7815699658703071, "grad_norm": 0.4475196746073441, "learning_rate": 3.961483353972406e-05, "loss": 0.535, "step": 458 }, { "epoch": 0.7832764505119454, "grad_norm": 0.5310562522442643, "learning_rate": 3.961016597983303e-05, "loss": 0.5483, "step": 459 }, { "epoch": 0.7849829351535836, "grad_norm": 0.4589968743514119, "learning_rate": 3.960547058689814e-05, "loss": 0.5876, "step": 460 }, { "epoch": 0.7866894197952219, "grad_norm": 0.5334530981422132, "learning_rate": 3.960074736758365e-05, "loss": 0.5141, "step": 461 }, { "epoch": 0.78839590443686, "grad_norm": 0.530776151808494, "learning_rate": 3.9595996328593293e-05, "loss": 0.5312, "step": 462 }, { "epoch": 0.7901023890784983, "grad_norm": 0.4857358421044376, "learning_rate": 3.9591217476670306e-05, "loss": 0.5749, "step": 463 }, { "epoch": 0.7918088737201365, "grad_norm": 0.48944429444835524, "learning_rate": 3.958641081859739e-05, "loss": 0.4869, "step": 464 }, { "epoch": 0.7935153583617748, "grad_norm": 0.5512252561788058, "learning_rate": 3.958157636119672e-05, "loss": 0.5265, "step": 465 }, { "epoch": 0.7952218430034129, "grad_norm": 0.5493204923836218, "learning_rate": 3.9576714111329926e-05, "loss": 0.479, "step": 466 }, { "epoch": 0.7969283276450512, "grad_norm": 0.42907462218706693, "learning_rate": 3.957182407589809e-05, "loss": 0.5077, "step": 467 }, { "epoch": 0.7986348122866894, "grad_norm": 0.5833402640110258, "learning_rate": 3.9566906261841694e-05, "loss": 0.5156, "step": 468 }, { "epoch": 0.8003412969283277, "grad_norm": 0.5027207242408734, "learning_rate": 3.956196067614071e-05, "loss": 0.5389, "step": 469 }, { "epoch": 0.8020477815699659, "grad_norm": 0.5903775732262893, "learning_rate": 3.9556987325814474e-05, "loss": 0.5127, "step": 470 }, { "epoch": 0.8037542662116041, "grad_norm": 0.4489790026763905, "learning_rate": 3.9551986217921755e-05, "loss": 0.5033, "step": 471 }, { "epoch": 0.8054607508532423, "grad_norm": 0.6737608327501232, "learning_rate": 3.9546957359560704e-05, "loss": 0.4755, "step": 472 }, { "epoch": 0.8071672354948806, "grad_norm": 0.4866463991441708, "learning_rate": 3.954190075786887e-05, "loss": 0.5537, "step": 473 }, { "epoch": 0.8088737201365188, "grad_norm": 0.6150681127724358, "learning_rate": 3.953681642002317e-05, "loss": 0.5031, "step": 474 }, { "epoch": 0.810580204778157, "grad_norm": 0.43728211056504895, "learning_rate": 3.9531704353239895e-05, "loss": 0.5052, "step": 475 }, { "epoch": 0.8122866894197952, "grad_norm": 0.510925067172793, "learning_rate": 3.9526564564774685e-05, "loss": 0.5382, "step": 476 }, { "epoch": 0.8139931740614335, "grad_norm": 0.6163408488743702, "learning_rate": 3.9521397061922536e-05, "loss": 0.5657, "step": 477 }, { "epoch": 0.8156996587030717, "grad_norm": 0.5176686859913134, "learning_rate": 3.951620185201777e-05, "loss": 0.521, "step": 478 }, { "epoch": 0.8174061433447098, "grad_norm": 0.4271267095214705, "learning_rate": 3.951097894243404e-05, "loss": 0.5036, "step": 479 }, { "epoch": 0.8191126279863481, "grad_norm": 0.6514774607052338, "learning_rate": 3.9505728340584305e-05, "loss": 0.5813, "step": 480 }, { "epoch": 0.8208191126279863, "grad_norm": 0.4368536071383127, "learning_rate": 3.950045005392084e-05, "loss": 0.5416, "step": 481 }, { "epoch": 0.8225255972696246, "grad_norm": 0.5044516773539471, "learning_rate": 3.94951440899352e-05, "loss": 0.4848, "step": 482 }, { "epoch": 0.8242320819112628, "grad_norm": 0.5344943071649257, "learning_rate": 3.948981045615823e-05, "loss": 0.6499, "step": 483 }, { "epoch": 0.825938566552901, "grad_norm": 0.515701849186856, "learning_rate": 3.9484449160160064e-05, "loss": 0.5208, "step": 484 }, { "epoch": 0.8276450511945392, "grad_norm": 0.518510086224629, "learning_rate": 3.9479060209550066e-05, "loss": 0.5357, "step": 485 }, { "epoch": 0.8293515358361775, "grad_norm": 0.4916579823521775, "learning_rate": 3.947364361197687e-05, "loss": 0.5195, "step": 486 }, { "epoch": 0.8310580204778157, "grad_norm": 0.5022037731108061, "learning_rate": 3.946819937512835e-05, "loss": 0.6218, "step": 487 }, { "epoch": 0.8327645051194539, "grad_norm": 0.5325920446887586, "learning_rate": 3.9462727506731584e-05, "loss": 0.5368, "step": 488 }, { "epoch": 0.8344709897610921, "grad_norm": 0.47314747320582723, "learning_rate": 3.9457228014552916e-05, "loss": 0.4832, "step": 489 }, { "epoch": 0.8361774744027304, "grad_norm": 0.5165856181868046, "learning_rate": 3.9451700906397855e-05, "loss": 0.5393, "step": 490 }, { "epoch": 0.8378839590443686, "grad_norm": 0.604855358248186, "learning_rate": 3.944614619011112e-05, "loss": 0.5501, "step": 491 }, { "epoch": 0.8395904436860068, "grad_norm": 0.5374469090634256, "learning_rate": 3.944056387357662e-05, "loss": 0.5336, "step": 492 }, { "epoch": 0.841296928327645, "grad_norm": 0.4921963833425323, "learning_rate": 3.9434953964717424e-05, "loss": 0.5294, "step": 493 }, { "epoch": 0.8430034129692833, "grad_norm": 0.5319944552023902, "learning_rate": 3.9429316471495777e-05, "loss": 0.4934, "step": 494 }, { "epoch": 0.8447098976109215, "grad_norm": 0.4481641231154688, "learning_rate": 3.9423651401913074e-05, "loss": 0.5038, "step": 495 }, { "epoch": 0.8464163822525598, "grad_norm": 0.6046057595807798, "learning_rate": 3.941795876400984e-05, "loss": 0.5661, "step": 496 }, { "epoch": 0.8481228668941979, "grad_norm": 0.4588428131899579, "learning_rate": 3.941223856586573e-05, "loss": 0.5179, "step": 497 }, { "epoch": 0.8498293515358362, "grad_norm": 0.49445101667923497, "learning_rate": 3.940649081559953e-05, "loss": 0.5383, "step": 498 }, { "epoch": 0.8515358361774744, "grad_norm": 0.47792535762506255, "learning_rate": 3.9400715521369106e-05, "loss": 0.494, "step": 499 }, { "epoch": 0.8532423208191127, "grad_norm": 0.4709617414602991, "learning_rate": 3.939491269137144e-05, "loss": 0.5158, "step": 500 }, { "epoch": 0.8549488054607508, "grad_norm": 0.4287421664931594, "learning_rate": 3.938908233384259e-05, "loss": 0.5037, "step": 501 }, { "epoch": 0.856655290102389, "grad_norm": 0.4554952824428464, "learning_rate": 3.9383224457057676e-05, "loss": 0.5316, "step": 502 }, { "epoch": 0.8583617747440273, "grad_norm": 0.47415170222164305, "learning_rate": 3.937733906933089e-05, "loss": 0.5595, "step": 503 }, { "epoch": 0.8600682593856656, "grad_norm": 0.4784593159132243, "learning_rate": 3.937142617901545e-05, "loss": 0.5256, "step": 504 }, { "epoch": 0.8617747440273038, "grad_norm": 0.5224405316117319, "learning_rate": 3.936548579450364e-05, "loss": 0.5356, "step": 505 }, { "epoch": 0.863481228668942, "grad_norm": 0.5062461175533289, "learning_rate": 3.9359517924226734e-05, "loss": 0.4793, "step": 506 }, { "epoch": 0.8651877133105802, "grad_norm": 0.46137996691684924, "learning_rate": 3.9353522576655045e-05, "loss": 0.5226, "step": 507 }, { "epoch": 0.8668941979522184, "grad_norm": 0.57463265864944, "learning_rate": 3.9347499760297864e-05, "loss": 0.541, "step": 508 }, { "epoch": 0.8686006825938567, "grad_norm": 0.446650800221027, "learning_rate": 3.9341449483703474e-05, "loss": 0.5278, "step": 509 }, { "epoch": 0.8703071672354948, "grad_norm": 0.6374975100826807, "learning_rate": 3.933537175545914e-05, "loss": 0.487, "step": 510 }, { "epoch": 0.8720136518771331, "grad_norm": 0.5051408760304202, "learning_rate": 3.93292665841911e-05, "loss": 0.5465, "step": 511 }, { "epoch": 0.8737201365187713, "grad_norm": 0.4877550532041799, "learning_rate": 3.9323133978564506e-05, "loss": 0.4792, "step": 512 }, { "epoch": 0.8754266211604096, "grad_norm": 0.47071774112823295, "learning_rate": 3.931697394728348e-05, "loss": 0.5222, "step": 513 }, { "epoch": 0.8771331058020477, "grad_norm": 0.4832616488734822, "learning_rate": 3.9310786499091055e-05, "loss": 0.5238, "step": 514 }, { "epoch": 0.878839590443686, "grad_norm": 0.4622626510648489, "learning_rate": 3.9304571642769194e-05, "loss": 0.5331, "step": 515 }, { "epoch": 0.8805460750853242, "grad_norm": 0.49609456268044955, "learning_rate": 3.9298329387138735e-05, "loss": 0.5017, "step": 516 }, { "epoch": 0.8822525597269625, "grad_norm": 0.3894121360741194, "learning_rate": 3.9292059741059426e-05, "loss": 0.5245, "step": 517 }, { "epoch": 0.8839590443686007, "grad_norm": 0.5017588035273783, "learning_rate": 3.928576271342988e-05, "loss": 0.5081, "step": 518 }, { "epoch": 0.8856655290102389, "grad_norm": 0.4077781952167856, "learning_rate": 3.927943831318757e-05, "loss": 0.4843, "step": 519 }, { "epoch": 0.8873720136518771, "grad_norm": 0.47941716917477173, "learning_rate": 3.927308654930884e-05, "loss": 0.5113, "step": 520 }, { "epoch": 0.8890784982935154, "grad_norm": 0.48013869845261975, "learning_rate": 3.9266707430808845e-05, "loss": 0.5065, "step": 521 }, { "epoch": 0.8907849829351536, "grad_norm": 0.469542715885672, "learning_rate": 3.926030096674159e-05, "loss": 0.5461, "step": 522 }, { "epoch": 0.8924914675767918, "grad_norm": 0.4060262709657794, "learning_rate": 3.925386716619986e-05, "loss": 0.502, "step": 523 }, { "epoch": 0.89419795221843, "grad_norm": 0.48621669422457514, "learning_rate": 3.9247406038315274e-05, "loss": 0.5422, "step": 524 }, { "epoch": 0.8959044368600683, "grad_norm": 0.4199510705050333, "learning_rate": 3.924091759225821e-05, "loss": 0.5113, "step": 525 }, { "epoch": 0.8976109215017065, "grad_norm": 0.450553534259471, "learning_rate": 3.9234401837237846e-05, "loss": 0.5366, "step": 526 }, { "epoch": 0.8993174061433447, "grad_norm": 0.40428967397574317, "learning_rate": 3.9227858782502084e-05, "loss": 0.4976, "step": 527 }, { "epoch": 0.9010238907849829, "grad_norm": 0.4956441669290184, "learning_rate": 3.92212884373376e-05, "loss": 0.4949, "step": 528 }, { "epoch": 0.9027303754266212, "grad_norm": 0.421791557031393, "learning_rate": 3.9214690811069814e-05, "loss": 0.485, "step": 529 }, { "epoch": 0.9044368600682594, "grad_norm": 0.5055936934279345, "learning_rate": 3.9208065913062824e-05, "loss": 0.5163, "step": 530 }, { "epoch": 0.9061433447098977, "grad_norm": 0.5545843080358152, "learning_rate": 3.9201413752719484e-05, "loss": 0.556, "step": 531 }, { "epoch": 0.9078498293515358, "grad_norm": 0.4589640114774441, "learning_rate": 3.9194734339481304e-05, "loss": 0.4826, "step": 532 }, { "epoch": 0.909556313993174, "grad_norm": 0.5207402038709883, "learning_rate": 3.9188027682828494e-05, "loss": 0.5142, "step": 533 }, { "epoch": 0.9112627986348123, "grad_norm": 0.44635435968918563, "learning_rate": 3.918129379227992e-05, "loss": 0.5301, "step": 534 }, { "epoch": 0.9129692832764505, "grad_norm": 0.4126548402669452, "learning_rate": 3.917453267739313e-05, "loss": 0.4616, "step": 535 }, { "epoch": 0.9146757679180887, "grad_norm": 0.40596036295652865, "learning_rate": 3.916774434776426e-05, "loss": 0.5175, "step": 536 }, { "epoch": 0.9163822525597269, "grad_norm": 0.4071549783983065, "learning_rate": 3.916092881302812e-05, "loss": 0.4828, "step": 537 }, { "epoch": 0.9180887372013652, "grad_norm": 0.41815613090929515, "learning_rate": 3.915408608285812e-05, "loss": 0.5181, "step": 538 }, { "epoch": 0.9197952218430034, "grad_norm": 0.4646522949101528, "learning_rate": 3.914721616696625e-05, "loss": 0.5372, "step": 539 }, { "epoch": 0.9215017064846417, "grad_norm": 0.386392680741229, "learning_rate": 3.9140319075103105e-05, "loss": 0.4834, "step": 540 }, { "epoch": 0.9232081911262798, "grad_norm": 0.4248500201352696, "learning_rate": 3.9133394817057844e-05, "loss": 0.5204, "step": 541 }, { "epoch": 0.9249146757679181, "grad_norm": 0.4582581997897516, "learning_rate": 3.912644340265819e-05, "loss": 0.4969, "step": 542 }, { "epoch": 0.9266211604095563, "grad_norm": 0.3959557100960288, "learning_rate": 3.91194648417704e-05, "loss": 0.5118, "step": 543 }, { "epoch": 0.9283276450511946, "grad_norm": 0.47939921367511734, "learning_rate": 3.9112459144299255e-05, "loss": 0.5207, "step": 544 }, { "epoch": 0.9300341296928327, "grad_norm": 0.43658501873671096, "learning_rate": 3.910542632018808e-05, "loss": 0.502, "step": 545 }, { "epoch": 0.931740614334471, "grad_norm": 0.4608285168964317, "learning_rate": 3.909836637941867e-05, "loss": 0.4967, "step": 546 }, { "epoch": 0.9334470989761092, "grad_norm": 0.4452967998329967, "learning_rate": 3.909127933201133e-05, "loss": 0.5175, "step": 547 }, { "epoch": 0.9351535836177475, "grad_norm": 0.42903249907341506, "learning_rate": 3.908416518802481e-05, "loss": 0.4919, "step": 548 }, { "epoch": 0.9368600682593856, "grad_norm": 0.4983362738620335, "learning_rate": 3.907702395755636e-05, "loss": 0.5544, "step": 549 }, { "epoch": 0.9385665529010239, "grad_norm": 0.48137760016486825, "learning_rate": 3.906985565074163e-05, "loss": 0.4978, "step": 550 }, { "epoch": 0.9402730375426621, "grad_norm": 0.42814076114002747, "learning_rate": 3.9062660277754726e-05, "loss": 0.5696, "step": 551 }, { "epoch": 0.9419795221843004, "grad_norm": 0.480834195450671, "learning_rate": 3.905543784880817e-05, "loss": 0.5441, "step": 552 }, { "epoch": 0.9436860068259386, "grad_norm": 0.46675322544792136, "learning_rate": 3.9048188374152875e-05, "loss": 0.5334, "step": 553 }, { "epoch": 0.9453924914675768, "grad_norm": 0.459122809136642, "learning_rate": 3.904091186407815e-05, "loss": 0.4805, "step": 554 }, { "epoch": 0.947098976109215, "grad_norm": 0.45818196676392814, "learning_rate": 3.9033608328911655e-05, "loss": 0.5095, "step": 555 }, { "epoch": 0.9488054607508533, "grad_norm": 0.441105921685137, "learning_rate": 3.9026277779019434e-05, "loss": 0.4756, "step": 556 }, { "epoch": 0.9505119453924915, "grad_norm": 0.5008988029978901, "learning_rate": 3.901892022480586e-05, "loss": 0.5224, "step": 557 }, { "epoch": 0.9522184300341296, "grad_norm": 0.47988835378586664, "learning_rate": 3.9011535676713636e-05, "loss": 0.4853, "step": 558 }, { "epoch": 0.9539249146757679, "grad_norm": 0.4372032012177308, "learning_rate": 3.900412414522378e-05, "loss": 0.4946, "step": 559 }, { "epoch": 0.9556313993174061, "grad_norm": 0.5358214314599237, "learning_rate": 3.899668564085559e-05, "loss": 0.5006, "step": 560 }, { "epoch": 0.9573378839590444, "grad_norm": 0.45911660732913245, "learning_rate": 3.898922017416668e-05, "loss": 0.5374, "step": 561 }, { "epoch": 0.9590443686006825, "grad_norm": 0.44658419169074315, "learning_rate": 3.898172775575291e-05, "loss": 0.5209, "step": 562 }, { "epoch": 0.9607508532423208, "grad_norm": 0.4310985352672766, "learning_rate": 3.89742083962484e-05, "loss": 0.4737, "step": 563 }, { "epoch": 0.962457337883959, "grad_norm": 0.4122180714819184, "learning_rate": 3.8966662106325495e-05, "loss": 0.5104, "step": 564 }, { "epoch": 0.9641638225255973, "grad_norm": 0.44427959014053525, "learning_rate": 3.8959088896694785e-05, "loss": 0.4915, "step": 565 }, { "epoch": 0.9658703071672355, "grad_norm": 0.44001548707309424, "learning_rate": 3.8951488778105054e-05, "loss": 0.5102, "step": 566 }, { "epoch": 0.9675767918088737, "grad_norm": 0.47205644918158723, "learning_rate": 3.894386176134327e-05, "loss": 0.5043, "step": 567 }, { "epoch": 0.9692832764505119, "grad_norm": 0.4799588163560951, "learning_rate": 3.89362078572346e-05, "loss": 0.5007, "step": 568 }, { "epoch": 0.9709897610921502, "grad_norm": 0.4371472093668799, "learning_rate": 3.892852707664235e-05, "loss": 0.5157, "step": 569 }, { "epoch": 0.9726962457337884, "grad_norm": 0.45620099061021785, "learning_rate": 3.892081943046799e-05, "loss": 0.5083, "step": 570 }, { "epoch": 0.9744027303754266, "grad_norm": 0.4902069995205142, "learning_rate": 3.891308492965112e-05, "loss": 0.5263, "step": 571 }, { "epoch": 0.9761092150170648, "grad_norm": 0.37562437603564053, "learning_rate": 3.890532358516944e-05, "loss": 0.4775, "step": 572 }, { "epoch": 0.9778156996587031, "grad_norm": 0.45149513920661594, "learning_rate": 3.889753540803876e-05, "loss": 0.5118, "step": 573 }, { "epoch": 0.9795221843003413, "grad_norm": 0.45070569518442194, "learning_rate": 3.888972040931299e-05, "loss": 0.4998, "step": 574 }, { "epoch": 0.9812286689419796, "grad_norm": 0.4554162489722807, "learning_rate": 3.8881878600084053e-05, "loss": 0.5143, "step": 575 }, { "epoch": 0.9829351535836177, "grad_norm": 0.46808574128865893, "learning_rate": 3.8874009991482e-05, "loss": 0.5106, "step": 576 }, { "epoch": 0.984641638225256, "grad_norm": 0.44264827285152525, "learning_rate": 3.8866114594674865e-05, "loss": 0.5096, "step": 577 }, { "epoch": 0.9863481228668942, "grad_norm": 0.38689828803482745, "learning_rate": 3.885819242086872e-05, "loss": 0.4652, "step": 578 }, { "epoch": 0.9880546075085325, "grad_norm": 0.44863248742872985, "learning_rate": 3.885024348130765e-05, "loss": 0.5021, "step": 579 }, { "epoch": 0.9897610921501706, "grad_norm": 0.4376711926299938, "learning_rate": 3.884226778727371e-05, "loss": 0.4894, "step": 580 }, { "epoch": 0.9914675767918089, "grad_norm": 0.4836618691935011, "learning_rate": 3.883426535008694e-05, "loss": 0.4912, "step": 581 }, { "epoch": 0.9931740614334471, "grad_norm": 0.4634093982781147, "learning_rate": 3.8826236181105344e-05, "loss": 0.5249, "step": 582 }, { "epoch": 0.9948805460750854, "grad_norm": 0.45182596219251353, "learning_rate": 3.8818180291724855e-05, "loss": 0.4644, "step": 583 }, { "epoch": 0.9965870307167235, "grad_norm": 0.4524376905770041, "learning_rate": 3.8810097693379336e-05, "loss": 0.5234, "step": 584 }, { "epoch": 0.9982935153583617, "grad_norm": 0.41527674638945133, "learning_rate": 3.8801988397540554e-05, "loss": 0.5019, "step": 585 }, { "epoch": 1.0, "grad_norm": 0.45259487503676415, "learning_rate": 3.879385241571817e-05, "loss": 0.527, "step": 586 }, { "epoch": 1.0017064846416381, "grad_norm": 0.5439507243208022, "learning_rate": 3.878568975945973e-05, "loss": 0.4537, "step": 587 }, { "epoch": 1.0034129692832765, "grad_norm": 0.5332516474258484, "learning_rate": 3.877750044035062e-05, "loss": 0.4203, "step": 588 }, { "epoch": 1.0051194539249146, "grad_norm": 0.48766492268460837, "learning_rate": 3.876928447001409e-05, "loss": 0.4501, "step": 589 }, { "epoch": 1.006825938566553, "grad_norm": 0.4831041742336393, "learning_rate": 3.8761041860111206e-05, "loss": 0.3917, "step": 590 }, { "epoch": 1.0085324232081911, "grad_norm": 0.5381540681172681, "learning_rate": 3.875277262234083e-05, "loss": 0.4446, "step": 591 }, { "epoch": 1.0102389078498293, "grad_norm": 0.5320198688078498, "learning_rate": 3.874447676843966e-05, "loss": 0.4561, "step": 592 }, { "epoch": 1.0119453924914676, "grad_norm": 0.46283902661104653, "learning_rate": 3.873615431018213e-05, "loss": 0.4123, "step": 593 }, { "epoch": 1.0136518771331058, "grad_norm": 0.4538227503029545, "learning_rate": 3.872780525938044e-05, "loss": 0.4069, "step": 594 }, { "epoch": 1.015358361774744, "grad_norm": 0.44200735879492625, "learning_rate": 3.8719429627884544e-05, "loss": 0.4557, "step": 595 }, { "epoch": 1.0170648464163823, "grad_norm": 0.46052859455189543, "learning_rate": 3.8711027427582126e-05, "loss": 0.3977, "step": 596 }, { "epoch": 1.0187713310580204, "grad_norm": 0.46912938323050846, "learning_rate": 3.870259867039857e-05, "loss": 0.4253, "step": 597 }, { "epoch": 1.0204778156996588, "grad_norm": 0.48104790336408815, "learning_rate": 3.869414336829695e-05, "loss": 0.4317, "step": 598 }, { "epoch": 1.022184300341297, "grad_norm": 0.5070473741322639, "learning_rate": 3.8685661533278026e-05, "loss": 0.4017, "step": 599 }, { "epoch": 1.023890784982935, "grad_norm": 0.5337220776817938, "learning_rate": 3.8677153177380206e-05, "loss": 0.3987, "step": 600 }, { "epoch": 1.0255972696245734, "grad_norm": 0.5046087234277966, "learning_rate": 3.8668618312679556e-05, "loss": 0.4662, "step": 601 }, { "epoch": 1.0273037542662116, "grad_norm": 0.5209637119395667, "learning_rate": 3.866005695128974e-05, "loss": 0.4637, "step": 602 }, { "epoch": 1.02901023890785, "grad_norm": 0.4826434962536877, "learning_rate": 3.865146910536206e-05, "loss": 0.4091, "step": 603 }, { "epoch": 1.030716723549488, "grad_norm": 0.47968781942819244, "learning_rate": 3.864285478708538e-05, "loss": 0.4089, "step": 604 }, { "epoch": 1.0324232081911262, "grad_norm": 0.46153618460883944, "learning_rate": 3.8634214008686155e-05, "loss": 0.4268, "step": 605 }, { "epoch": 1.0341296928327646, "grad_norm": 0.4255108453767918, "learning_rate": 3.862554678242839e-05, "loss": 0.4401, "step": 606 }, { "epoch": 1.0358361774744027, "grad_norm": 0.4091807950411335, "learning_rate": 3.8616853120613634e-05, "loss": 0.4352, "step": 607 }, { "epoch": 1.0375426621160408, "grad_norm": 0.4837900126036535, "learning_rate": 3.860813303558093e-05, "loss": 0.4419, "step": 608 }, { "epoch": 1.0392491467576792, "grad_norm": 0.43491684501985955, "learning_rate": 3.8599386539706866e-05, "loss": 0.4003, "step": 609 }, { "epoch": 1.0409556313993173, "grad_norm": 0.4400347750814093, "learning_rate": 3.859061364540548e-05, "loss": 0.4547, "step": 610 }, { "epoch": 1.0426621160409557, "grad_norm": 0.48834670025862204, "learning_rate": 3.858181436512829e-05, "loss": 0.4637, "step": 611 }, { "epoch": 1.0443686006825939, "grad_norm": 0.5126681768767233, "learning_rate": 3.8572988711364275e-05, "loss": 0.4379, "step": 612 }, { "epoch": 1.046075085324232, "grad_norm": 0.4626678296479312, "learning_rate": 3.8564136696639826e-05, "loss": 0.4137, "step": 613 }, { "epoch": 1.0477815699658704, "grad_norm": 0.5528573683773701, "learning_rate": 3.855525833351876e-05, "loss": 0.4286, "step": 614 }, { "epoch": 1.0494880546075085, "grad_norm": 0.495307267501836, "learning_rate": 3.85463536346023e-05, "loss": 0.4075, "step": 615 }, { "epoch": 1.0511945392491469, "grad_norm": 0.5650705271490135, "learning_rate": 3.8537422612529025e-05, "loss": 0.4348, "step": 616 }, { "epoch": 1.052901023890785, "grad_norm": 0.5586265260826336, "learning_rate": 3.85284652799749e-05, "loss": 0.4509, "step": 617 }, { "epoch": 1.0546075085324231, "grad_norm": 0.5266504778950317, "learning_rate": 3.851948164965321e-05, "loss": 0.4733, "step": 618 }, { "epoch": 1.0563139931740615, "grad_norm": 0.5712865595016923, "learning_rate": 3.851047173431458e-05, "loss": 0.4183, "step": 619 }, { "epoch": 1.0580204778156996, "grad_norm": 0.471589194011068, "learning_rate": 3.8501435546746926e-05, "loss": 0.4552, "step": 620 }, { "epoch": 1.0597269624573378, "grad_norm": 0.46799408444546003, "learning_rate": 3.849237309977548e-05, "loss": 0.3926, "step": 621 }, { "epoch": 1.0614334470989761, "grad_norm": 0.512683696291688, "learning_rate": 3.848328440626271e-05, "loss": 0.423, "step": 622 }, { "epoch": 1.0631399317406143, "grad_norm": 0.40009463640214077, "learning_rate": 3.847416947910837e-05, "loss": 0.4254, "step": 623 }, { "epoch": 1.0648464163822526, "grad_norm": 0.46965072279488435, "learning_rate": 3.846502833124943e-05, "loss": 0.5496, "step": 624 }, { "epoch": 1.0665529010238908, "grad_norm": 0.4447820975915501, "learning_rate": 3.8455860975660073e-05, "loss": 0.4308, "step": 625 }, { "epoch": 1.068259385665529, "grad_norm": 0.482882463484246, "learning_rate": 3.844666742535168e-05, "loss": 0.4017, "step": 626 }, { "epoch": 1.0699658703071673, "grad_norm": 0.46911565381234427, "learning_rate": 3.843744769337282e-05, "loss": 0.4043, "step": 627 }, { "epoch": 1.0716723549488054, "grad_norm": 0.4615084891322283, "learning_rate": 3.8428201792809213e-05, "loss": 0.431, "step": 628 }, { "epoch": 1.0733788395904438, "grad_norm": 0.4654575319131965, "learning_rate": 3.841892973678373e-05, "loss": 0.4445, "step": 629 }, { "epoch": 1.075085324232082, "grad_norm": 0.4556300734049713, "learning_rate": 3.840963153845635e-05, "loss": 0.4035, "step": 630 }, { "epoch": 1.07679180887372, "grad_norm": 0.44366473510398396, "learning_rate": 3.840030721102417e-05, "loss": 0.4491, "step": 631 }, { "epoch": 1.0784982935153584, "grad_norm": 0.482008116003779, "learning_rate": 3.839095676772137e-05, "loss": 0.4003, "step": 632 }, { "epoch": 1.0802047781569966, "grad_norm": 0.48158786376442975, "learning_rate": 3.838158022181918e-05, "loss": 0.424, "step": 633 }, { "epoch": 1.0819112627986347, "grad_norm": 0.4672233450530056, "learning_rate": 3.837217758662592e-05, "loss": 0.4267, "step": 634 }, { "epoch": 1.083617747440273, "grad_norm": 0.624999999598963, "learning_rate": 3.836274887548688e-05, "loss": 0.4857, "step": 635 }, { "epoch": 1.0853242320819112, "grad_norm": 0.4778073909596797, "learning_rate": 3.83532941017844e-05, "loss": 0.4125, "step": 636 }, { "epoch": 1.0870307167235496, "grad_norm": 0.5669499222940174, "learning_rate": 3.8343813278937815e-05, "loss": 0.4105, "step": 637 }, { "epoch": 1.0887372013651877, "grad_norm": 0.5457371005641741, "learning_rate": 3.8334306420403404e-05, "loss": 0.4825, "step": 638 }, { "epoch": 1.0904436860068258, "grad_norm": 0.3923472466345296, "learning_rate": 3.832477353967442e-05, "loss": 0.4099, "step": 639 }, { "epoch": 1.0921501706484642, "grad_norm": 0.6081290992214772, "learning_rate": 3.8315214650281045e-05, "loss": 0.4539, "step": 640 }, { "epoch": 1.0938566552901023, "grad_norm": 0.5100099873776106, "learning_rate": 3.830562976579038e-05, "loss": 0.4124, "step": 641 }, { "epoch": 1.0955631399317407, "grad_norm": 0.5143473347132098, "learning_rate": 3.82960188998064e-05, "loss": 0.4049, "step": 642 }, { "epoch": 1.0972696245733788, "grad_norm": 0.5584803377698523, "learning_rate": 3.828638206596998e-05, "loss": 0.4204, "step": 643 }, { "epoch": 1.098976109215017, "grad_norm": 0.5338983168276245, "learning_rate": 3.8276719277958847e-05, "loss": 0.4217, "step": 644 }, { "epoch": 1.1006825938566553, "grad_norm": 0.4978895498293814, "learning_rate": 3.8267030549487546e-05, "loss": 0.4365, "step": 645 }, { "epoch": 1.1023890784982935, "grad_norm": 0.5306887611710531, "learning_rate": 3.8257315894307474e-05, "loss": 0.4129, "step": 646 }, { "epoch": 1.1040955631399316, "grad_norm": 0.5311987023404002, "learning_rate": 3.8247575326206795e-05, "loss": 0.4556, "step": 647 }, { "epoch": 1.10580204778157, "grad_norm": 0.42657444240974196, "learning_rate": 3.823780885901047e-05, "loss": 0.5134, "step": 648 }, { "epoch": 1.1075085324232081, "grad_norm": 0.5998945716495959, "learning_rate": 3.8228016506580215e-05, "loss": 0.4621, "step": 649 }, { "epoch": 1.1092150170648465, "grad_norm": 0.5196911656376497, "learning_rate": 3.821819828281447e-05, "loss": 0.4487, "step": 650 }, { "epoch": 1.1109215017064846, "grad_norm": 0.5323892395331172, "learning_rate": 3.820835420164842e-05, "loss": 0.4337, "step": 651 }, { "epoch": 1.1126279863481228, "grad_norm": 0.49936369160051053, "learning_rate": 3.819848427705393e-05, "loss": 0.4733, "step": 652 }, { "epoch": 1.1143344709897611, "grad_norm": 0.43757305047961276, "learning_rate": 3.8188588523039575e-05, "loss": 0.4172, "step": 653 }, { "epoch": 1.1160409556313993, "grad_norm": 0.5284302122594836, "learning_rate": 3.817866695365053e-05, "loss": 0.4687, "step": 654 }, { "epoch": 1.1177474402730376, "grad_norm": 0.5129813630203046, "learning_rate": 3.8168719582968676e-05, "loss": 0.4799, "step": 655 }, { "epoch": 1.1194539249146758, "grad_norm": 0.5120020068358729, "learning_rate": 3.8158746425112484e-05, "loss": 0.4151, "step": 656 }, { "epoch": 1.121160409556314, "grad_norm": 0.526318315169048, "learning_rate": 3.814874749423701e-05, "loss": 0.4262, "step": 657 }, { "epoch": 1.1228668941979523, "grad_norm": 0.4801676547137466, "learning_rate": 3.8138722804533924e-05, "loss": 0.4371, "step": 658 }, { "epoch": 1.1245733788395904, "grad_norm": 0.5183019208351913, "learning_rate": 3.8128672370231437e-05, "loss": 0.4571, "step": 659 }, { "epoch": 1.1262798634812285, "grad_norm": 0.5112906104349612, "learning_rate": 3.811859620559429e-05, "loss": 0.4344, "step": 660 }, { "epoch": 1.127986348122867, "grad_norm": 0.4419751955606906, "learning_rate": 3.8108494324923776e-05, "loss": 0.4553, "step": 661 }, { "epoch": 1.129692832764505, "grad_norm": 0.46052945892140157, "learning_rate": 3.8098366742557655e-05, "loss": 0.4115, "step": 662 }, { "epoch": 1.1313993174061434, "grad_norm": 0.41139701881709145, "learning_rate": 3.8088213472870184e-05, "loss": 0.4382, "step": 663 }, { "epoch": 1.1331058020477816, "grad_norm": 0.42760230770614577, "learning_rate": 3.8078034530272064e-05, "loss": 0.4446, "step": 664 }, { "epoch": 1.13481228668942, "grad_norm": 0.42098905402498127, "learning_rate": 3.806782992921044e-05, "loss": 0.4481, "step": 665 }, { "epoch": 1.136518771331058, "grad_norm": 0.4730154695502598, "learning_rate": 3.8057599684168885e-05, "loss": 0.4374, "step": 666 }, { "epoch": 1.1382252559726962, "grad_norm": 0.40018138361332156, "learning_rate": 3.8047343809667364e-05, "loss": 0.4039, "step": 667 }, { "epoch": 1.1399317406143346, "grad_norm": 0.4506043190981934, "learning_rate": 3.803706232026221e-05, "loss": 0.4259, "step": 668 }, { "epoch": 1.1416382252559727, "grad_norm": 0.4233013092094586, "learning_rate": 3.802675523054611e-05, "loss": 0.4213, "step": 669 }, { "epoch": 1.1433447098976108, "grad_norm": 0.5067727131656333, "learning_rate": 3.8016422555148095e-05, "loss": 0.4307, "step": 670 }, { "epoch": 1.1450511945392492, "grad_norm": 0.443308195447026, "learning_rate": 3.8006064308733525e-05, "loss": 0.4743, "step": 671 }, { "epoch": 1.1467576791808873, "grad_norm": 0.4638145022775026, "learning_rate": 3.7995680506004016e-05, "loss": 0.4194, "step": 672 }, { "epoch": 1.1484641638225255, "grad_norm": 0.43737972177867845, "learning_rate": 3.7985271161697476e-05, "loss": 0.4589, "step": 673 }, { "epoch": 1.1501706484641638, "grad_norm": 0.38056511468491, "learning_rate": 3.797483629058809e-05, "loss": 0.421, "step": 674 }, { "epoch": 1.151877133105802, "grad_norm": 0.42431704532297027, "learning_rate": 3.796437590748622e-05, "loss": 0.4154, "step": 675 }, { "epoch": 1.1535836177474403, "grad_norm": 0.41259449298798845, "learning_rate": 3.795389002723848e-05, "loss": 0.4534, "step": 676 }, { "epoch": 1.1552901023890785, "grad_norm": 0.43351259341884885, "learning_rate": 3.7943378664727665e-05, "loss": 0.3991, "step": 677 }, { "epoch": 1.1569965870307168, "grad_norm": 0.4144554120400783, "learning_rate": 3.7932841834872714e-05, "loss": 0.457, "step": 678 }, { "epoch": 1.158703071672355, "grad_norm": 0.4493500592051183, "learning_rate": 3.792227955262875e-05, "loss": 0.4329, "step": 679 }, { "epoch": 1.1604095563139931, "grad_norm": 0.46932545805628595, "learning_rate": 3.7911691832986986e-05, "loss": 0.4137, "step": 680 }, { "epoch": 1.1621160409556315, "grad_norm": 0.4339728498300188, "learning_rate": 3.790107869097475e-05, "loss": 0.4419, "step": 681 }, { "epoch": 1.1638225255972696, "grad_norm": 0.6040022644435886, "learning_rate": 3.789044014165548e-05, "loss": 0.4257, "step": 682 }, { "epoch": 1.1655290102389078, "grad_norm": 0.47726214656604243, "learning_rate": 3.787977620012863e-05, "loss": 0.4569, "step": 683 }, { "epoch": 1.1672354948805461, "grad_norm": 0.3996425683267564, "learning_rate": 3.786908688152971e-05, "loss": 0.419, "step": 684 }, { "epoch": 1.1689419795221843, "grad_norm": 0.5629467403716398, "learning_rate": 3.785837220103027e-05, "loss": 0.4321, "step": 685 }, { "epoch": 1.1706484641638226, "grad_norm": 0.3980783427094222, "learning_rate": 3.784763217383783e-05, "loss": 0.4458, "step": 686 }, { "epoch": 1.1723549488054608, "grad_norm": 0.4265806939898511, "learning_rate": 3.7836866815195896e-05, "loss": 0.4646, "step": 687 }, { "epoch": 1.174061433447099, "grad_norm": 0.4434867718245713, "learning_rate": 3.782607614038393e-05, "loss": 0.464, "step": 688 }, { "epoch": 1.1757679180887373, "grad_norm": 0.44452649741611283, "learning_rate": 3.7815260164717314e-05, "loss": 0.4209, "step": 689 }, { "epoch": 1.1774744027303754, "grad_norm": 0.41431565381119073, "learning_rate": 3.780441890354735e-05, "loss": 0.4593, "step": 690 }, { "epoch": 1.1791808873720138, "grad_norm": 0.41728408825524926, "learning_rate": 3.779355237226122e-05, "loss": 0.4554, "step": 691 }, { "epoch": 1.180887372013652, "grad_norm": 0.4321526604917808, "learning_rate": 3.778266058628199e-05, "loss": 0.4583, "step": 692 }, { "epoch": 1.18259385665529, "grad_norm": 0.37239568412353885, "learning_rate": 3.7771743561068546e-05, "loss": 0.4296, "step": 693 }, { "epoch": 1.1843003412969284, "grad_norm": 0.4197587486467048, "learning_rate": 3.776080131211561e-05, "loss": 0.4455, "step": 694 }, { "epoch": 1.1860068259385665, "grad_norm": 0.3726077401897924, "learning_rate": 3.7749833854953714e-05, "loss": 0.4437, "step": 695 }, { "epoch": 1.1877133105802047, "grad_norm": 0.5031733990320283, "learning_rate": 3.773884120514915e-05, "loss": 0.4137, "step": 696 }, { "epoch": 1.189419795221843, "grad_norm": 0.4265025413297216, "learning_rate": 3.7727823378303974e-05, "loss": 0.4479, "step": 697 }, { "epoch": 1.1911262798634812, "grad_norm": 0.4569892386107676, "learning_rate": 3.771678039005597e-05, "loss": 0.4362, "step": 698 }, { "epoch": 1.1928327645051195, "grad_norm": 0.424109578056102, "learning_rate": 3.770571225607865e-05, "loss": 0.4381, "step": 699 }, { "epoch": 1.1945392491467577, "grad_norm": 0.42906844196264826, "learning_rate": 3.76946189920812e-05, "loss": 0.3963, "step": 700 }, { "epoch": 1.1962457337883958, "grad_norm": 0.4890031891369991, "learning_rate": 3.768350061380848e-05, "loss": 0.4269, "step": 701 }, { "epoch": 1.1979522184300342, "grad_norm": 0.4350188302553573, "learning_rate": 3.7672357137041e-05, "loss": 0.4631, "step": 702 }, { "epoch": 1.1996587030716723, "grad_norm": 0.4852383658977233, "learning_rate": 3.7661188577594875e-05, "loss": 0.4529, "step": 703 }, { "epoch": 1.2013651877133107, "grad_norm": 0.4697744180206133, "learning_rate": 3.764999495132185e-05, "loss": 0.4469, "step": 704 }, { "epoch": 1.2030716723549488, "grad_norm": 0.4445829932767305, "learning_rate": 3.763877627410921e-05, "loss": 0.4167, "step": 705 }, { "epoch": 1.204778156996587, "grad_norm": 0.4676689122099299, "learning_rate": 3.7627532561879833e-05, "loss": 0.4339, "step": 706 }, { "epoch": 1.2064846416382253, "grad_norm": 0.47678804282414183, "learning_rate": 3.761626383059209e-05, "loss": 0.4514, "step": 707 }, { "epoch": 1.2081911262798635, "grad_norm": 0.4433655984117378, "learning_rate": 3.760497009623991e-05, "loss": 0.4444, "step": 708 }, { "epoch": 1.2098976109215016, "grad_norm": 0.41479676956479195, "learning_rate": 3.759365137485267e-05, "loss": 0.4388, "step": 709 }, { "epoch": 1.21160409556314, "grad_norm": 0.5205221037848315, "learning_rate": 3.7582307682495225e-05, "loss": 0.4375, "step": 710 }, { "epoch": 1.213310580204778, "grad_norm": 0.40919927859417016, "learning_rate": 3.757093903526788e-05, "loss": 0.4171, "step": 711 }, { "epoch": 1.2150170648464165, "grad_norm": 0.5071053572893132, "learning_rate": 3.755954544930633e-05, "loss": 0.4026, "step": 712 }, { "epoch": 1.2167235494880546, "grad_norm": 0.4573968991390098, "learning_rate": 3.754812694078171e-05, "loss": 0.455, "step": 713 }, { "epoch": 1.2184300341296928, "grad_norm": 0.39043408323060524, "learning_rate": 3.753668352590049e-05, "loss": 0.4048, "step": 714 }, { "epoch": 1.2201365187713311, "grad_norm": 0.4590394037207356, "learning_rate": 3.752521522090451e-05, "loss": 0.4259, "step": 715 }, { "epoch": 1.2218430034129693, "grad_norm": 0.42435704513968003, "learning_rate": 3.751372204207093e-05, "loss": 0.4332, "step": 716 }, { "epoch": 1.2235494880546076, "grad_norm": 0.40530298060100095, "learning_rate": 3.750220400571221e-05, "loss": 0.4482, "step": 717 }, { "epoch": 1.2252559726962458, "grad_norm": 0.4190752701415436, "learning_rate": 3.7490661128176105e-05, "loss": 0.4168, "step": 718 }, { "epoch": 1.226962457337884, "grad_norm": 0.40080779836791597, "learning_rate": 3.747909342584561e-05, "loss": 0.5669, "step": 719 }, { "epoch": 1.2286689419795223, "grad_norm": 0.3987684782960706, "learning_rate": 3.746750091513897e-05, "loss": 0.4324, "step": 720 }, { "epoch": 1.2303754266211604, "grad_norm": 0.40562391090602207, "learning_rate": 3.745588361250963e-05, "loss": 0.4643, "step": 721 }, { "epoch": 1.2320819112627985, "grad_norm": 0.4474485919797283, "learning_rate": 3.744424153444623e-05, "loss": 0.4573, "step": 722 }, { "epoch": 1.233788395904437, "grad_norm": 0.4004606169913817, "learning_rate": 3.7432574697472564e-05, "loss": 0.4734, "step": 723 }, { "epoch": 1.235494880546075, "grad_norm": 0.41481424422596996, "learning_rate": 3.742088311814758e-05, "loss": 0.4689, "step": 724 }, { "epoch": 1.2372013651877134, "grad_norm": 0.4613744985595309, "learning_rate": 3.740916681306533e-05, "loss": 0.421, "step": 725 }, { "epoch": 1.2389078498293515, "grad_norm": 0.4349483795031661, "learning_rate": 3.7397425798854964e-05, "loss": 0.4414, "step": 726 }, { "epoch": 1.2406143344709897, "grad_norm": 0.4444615372530873, "learning_rate": 3.738566009218071e-05, "loss": 0.4778, "step": 727 }, { "epoch": 1.242320819112628, "grad_norm": 0.5550194560618841, "learning_rate": 3.737386970974185e-05, "loss": 0.4493, "step": 728 }, { "epoch": 1.2440273037542662, "grad_norm": 0.4731437171419562, "learning_rate": 3.736205466827265e-05, "loss": 0.4664, "step": 729 }, { "epoch": 1.2457337883959045, "grad_norm": 0.47858053424822616, "learning_rate": 3.7350214984542416e-05, "loss": 0.4302, "step": 730 }, { "epoch": 1.2474402730375427, "grad_norm": 0.5045909061179693, "learning_rate": 3.73383506753554e-05, "loss": 0.4638, "step": 731 }, { "epoch": 1.2491467576791808, "grad_norm": 0.42161732820421655, "learning_rate": 3.732646175755084e-05, "loss": 0.4389, "step": 732 }, { "epoch": 1.2508532423208192, "grad_norm": 0.5085164211620015, "learning_rate": 3.731454824800286e-05, "loss": 0.4667, "step": 733 }, { "epoch": 1.2525597269624573, "grad_norm": 0.5141788654510248, "learning_rate": 3.730261016362052e-05, "loss": 0.4422, "step": 734 }, { "epoch": 1.2542662116040955, "grad_norm": 0.44945335088525795, "learning_rate": 3.729064752134774e-05, "loss": 0.4203, "step": 735 }, { "epoch": 1.2559726962457338, "grad_norm": 0.46033952481751483, "learning_rate": 3.727866033816331e-05, "loss": 0.4015, "step": 736 }, { "epoch": 1.257679180887372, "grad_norm": 0.4558626734368758, "learning_rate": 3.726664863108084e-05, "loss": 0.3963, "step": 737 }, { "epoch": 1.25938566552901, "grad_norm": 0.40734766802181244, "learning_rate": 3.7254612417148744e-05, "loss": 0.4098, "step": 738 }, { "epoch": 1.2610921501706485, "grad_norm": 0.5049229183639216, "learning_rate": 3.724255171345024e-05, "loss": 0.4601, "step": 739 }, { "epoch": 1.2627986348122868, "grad_norm": 0.3723550240432109, "learning_rate": 3.723046653710329e-05, "loss": 0.435, "step": 740 }, { "epoch": 1.264505119453925, "grad_norm": 0.4834866392138083, "learning_rate": 3.7218356905260576e-05, "loss": 0.4851, "step": 741 }, { "epoch": 1.266211604095563, "grad_norm": 0.47059437880819743, "learning_rate": 3.7206222835109525e-05, "loss": 0.4371, "step": 742 }, { "epoch": 1.2679180887372015, "grad_norm": 0.4488476614201091, "learning_rate": 3.719406434387221e-05, "loss": 0.4293, "step": 743 }, { "epoch": 1.2696245733788396, "grad_norm": 0.4769670725825094, "learning_rate": 3.7181881448805407e-05, "loss": 0.4587, "step": 744 }, { "epoch": 1.2713310580204777, "grad_norm": 0.4558559954017582, "learning_rate": 3.716967416720049e-05, "loss": 0.4418, "step": 745 }, { "epoch": 1.273037542662116, "grad_norm": 0.45087258818774806, "learning_rate": 3.715744251638347e-05, "loss": 0.4146, "step": 746 }, { "epoch": 1.2747440273037542, "grad_norm": 0.4534888156553091, "learning_rate": 3.714518651371494e-05, "loss": 0.4686, "step": 747 }, { "epoch": 1.2764505119453924, "grad_norm": 0.462002171339421, "learning_rate": 3.713290617659005e-05, "loss": 0.4693, "step": 748 }, { "epoch": 1.2781569965870307, "grad_norm": 0.5096654338817803, "learning_rate": 3.712060152243849e-05, "loss": 0.4496, "step": 749 }, { "epoch": 1.2798634812286689, "grad_norm": 0.4369904424706895, "learning_rate": 3.710827256872447e-05, "loss": 0.4306, "step": 750 }, { "epoch": 1.2815699658703072, "grad_norm": 0.4775228575976149, "learning_rate": 3.7095919332946693e-05, "loss": 0.4427, "step": 751 }, { "epoch": 1.2832764505119454, "grad_norm": 0.4394095447413763, "learning_rate": 3.7083541832638304e-05, "loss": 0.4592, "step": 752 }, { "epoch": 1.2849829351535837, "grad_norm": 0.4461061424192823, "learning_rate": 3.70711400853669e-05, "loss": 0.4175, "step": 753 }, { "epoch": 1.286689419795222, "grad_norm": 0.4143666448961568, "learning_rate": 3.7058714108734503e-05, "loss": 0.4174, "step": 754 }, { "epoch": 1.28839590443686, "grad_norm": 0.3840367032155938, "learning_rate": 3.704626392037751e-05, "loss": 0.4461, "step": 755 }, { "epoch": 1.2901023890784984, "grad_norm": 0.46008333986661104, "learning_rate": 3.703378953796669e-05, "loss": 0.4372, "step": 756 }, { "epoch": 1.2918088737201365, "grad_norm": 0.4913797925551799, "learning_rate": 3.702129097920715e-05, "loss": 0.4848, "step": 757 }, { "epoch": 1.2935153583617747, "grad_norm": 0.4137229879559694, "learning_rate": 3.700876826183829e-05, "loss": 0.4168, "step": 758 }, { "epoch": 1.295221843003413, "grad_norm": 0.44606644673173096, "learning_rate": 3.699622140363383e-05, "loss": 0.4635, "step": 759 }, { "epoch": 1.2969283276450512, "grad_norm": 0.4237165781520206, "learning_rate": 3.6983650422401744e-05, "loss": 0.4529, "step": 760 }, { "epoch": 1.2986348122866893, "grad_norm": 0.42451416478377646, "learning_rate": 3.697105533598423e-05, "loss": 0.4914, "step": 761 }, { "epoch": 1.3003412969283277, "grad_norm": 0.3878869863717339, "learning_rate": 3.695843616225772e-05, "loss": 0.4716, "step": 762 }, { "epoch": 1.3020477815699658, "grad_norm": 0.4134415686037431, "learning_rate": 3.694579291913282e-05, "loss": 0.4601, "step": 763 }, { "epoch": 1.3037542662116042, "grad_norm": 0.8003369338048761, "learning_rate": 3.693312562455429e-05, "loss": 0.4911, "step": 764 }, { "epoch": 1.3054607508532423, "grad_norm": 0.37537128276066495, "learning_rate": 3.692043429650105e-05, "loss": 0.4469, "step": 765 }, { "epoch": 1.3071672354948807, "grad_norm": 0.37529715858826523, "learning_rate": 3.690771895298612e-05, "loss": 0.4242, "step": 766 }, { "epoch": 1.3088737201365188, "grad_norm": 0.38984349190821227, "learning_rate": 3.6894979612056596e-05, "loss": 0.4187, "step": 767 }, { "epoch": 1.310580204778157, "grad_norm": 0.4055181076986435, "learning_rate": 3.688221629179365e-05, "loss": 0.4613, "step": 768 }, { "epoch": 1.3122866894197953, "grad_norm": 0.4029344250787094, "learning_rate": 3.686942901031247e-05, "loss": 0.4316, "step": 769 }, { "epoch": 1.3139931740614335, "grad_norm": 0.4079829615017142, "learning_rate": 3.6856617785762286e-05, "loss": 0.4329, "step": 770 }, { "epoch": 1.3156996587030716, "grad_norm": 0.4069157151626814, "learning_rate": 3.6843782636326256e-05, "loss": 0.3958, "step": 771 }, { "epoch": 1.31740614334471, "grad_norm": 0.47706876658562686, "learning_rate": 3.6830923580221556e-05, "loss": 0.4329, "step": 772 }, { "epoch": 1.319112627986348, "grad_norm": 1.2357596289802686, "learning_rate": 3.6818040635699245e-05, "loss": 0.4568, "step": 773 }, { "epoch": 1.3208191126279862, "grad_norm": 0.43601104388944756, "learning_rate": 3.680513382104432e-05, "loss": 0.4458, "step": 774 }, { "epoch": 1.3225255972696246, "grad_norm": 0.39840852817000055, "learning_rate": 3.679220315457563e-05, "loss": 0.449, "step": 775 }, { "epoch": 1.3242320819112627, "grad_norm": 0.44289749853193855, "learning_rate": 3.67792486546459e-05, "loss": 0.4745, "step": 776 }, { "epoch": 1.325938566552901, "grad_norm": 0.38501530038108883, "learning_rate": 3.676627033964167e-05, "loss": 0.4284, "step": 777 }, { "epoch": 1.3276450511945392, "grad_norm": 0.4436655072824138, "learning_rate": 3.675326822798329e-05, "loss": 0.4182, "step": 778 }, { "epoch": 1.3293515358361776, "grad_norm": 0.3984948609435545, "learning_rate": 3.674024233812487e-05, "loss": 0.4206, "step": 779 }, { "epoch": 1.3310580204778157, "grad_norm": 0.4387932025256469, "learning_rate": 3.672719268855429e-05, "loss": 0.4298, "step": 780 }, { "epoch": 1.3327645051194539, "grad_norm": 0.4090726444133126, "learning_rate": 3.671411929779313e-05, "loss": 0.4216, "step": 781 }, { "epoch": 1.3344709897610922, "grad_norm": 0.5003333111778476, "learning_rate": 3.670102218439669e-05, "loss": 0.4496, "step": 782 }, { "epoch": 1.3361774744027304, "grad_norm": 0.3999097982807462, "learning_rate": 3.66879013669539e-05, "loss": 0.4106, "step": 783 }, { "epoch": 1.3378839590443685, "grad_norm": 0.39528289174297737, "learning_rate": 3.667475686408739e-05, "loss": 0.4192, "step": 784 }, { "epoch": 1.3395904436860069, "grad_norm": 0.3530174875714856, "learning_rate": 3.666158869445336e-05, "loss": 0.4202, "step": 785 }, { "epoch": 1.341296928327645, "grad_norm": 0.42240313951429387, "learning_rate": 3.664839687674163e-05, "loss": 0.461, "step": 786 }, { "epoch": 1.3430034129692832, "grad_norm": 0.37962678593801646, "learning_rate": 3.663518142967557e-05, "loss": 0.4435, "step": 787 }, { "epoch": 1.3447098976109215, "grad_norm": 0.4508809400632154, "learning_rate": 3.662194237201208e-05, "loss": 0.4367, "step": 788 }, { "epoch": 1.3464163822525597, "grad_norm": 0.517558585482028, "learning_rate": 3.660867972254159e-05, "loss": 0.4544, "step": 789 }, { "epoch": 1.348122866894198, "grad_norm": 0.42198247927274773, "learning_rate": 3.6595393500088e-05, "loss": 0.4426, "step": 790 }, { "epoch": 1.3498293515358362, "grad_norm": 0.38044452785973637, "learning_rate": 3.658208372350868e-05, "loss": 0.4183, "step": 791 }, { "epoch": 1.3515358361774745, "grad_norm": 0.38931030207281575, "learning_rate": 3.656875041169442e-05, "loss": 0.419, "step": 792 }, { "epoch": 1.3532423208191127, "grad_norm": 0.3871251663984767, "learning_rate": 3.655539358356941e-05, "loss": 0.4161, "step": 793 }, { "epoch": 1.3549488054607508, "grad_norm": 0.41111760528386887, "learning_rate": 3.6542013258091236e-05, "loss": 0.3992, "step": 794 }, { "epoch": 1.3566552901023892, "grad_norm": 0.39052845005499515, "learning_rate": 3.652860945425082e-05, "loss": 0.4208, "step": 795 }, { "epoch": 1.3583617747440273, "grad_norm": 0.38865687702951424, "learning_rate": 3.65151821910724e-05, "loss": 0.4049, "step": 796 }, { "epoch": 1.3600682593856654, "grad_norm": 0.40903580121063793, "learning_rate": 3.650173148761353e-05, "loss": 0.4149, "step": 797 }, { "epoch": 1.3617747440273038, "grad_norm": 0.3868920258274055, "learning_rate": 3.6488257362965026e-05, "loss": 0.4377, "step": 798 }, { "epoch": 1.363481228668942, "grad_norm": 0.4033212911143925, "learning_rate": 3.6474759836250936e-05, "loss": 0.4143, "step": 799 }, { "epoch": 1.36518771331058, "grad_norm": 0.45515120908056866, "learning_rate": 3.646123892662854e-05, "loss": 0.4009, "step": 800 }, { "epoch": 1.3668941979522184, "grad_norm": 0.4308414454379085, "learning_rate": 3.644769465328828e-05, "loss": 0.4518, "step": 801 }, { "epoch": 1.3686006825938566, "grad_norm": 0.4619818515515126, "learning_rate": 3.643412703545378e-05, "loss": 0.4365, "step": 802 }, { "epoch": 1.370307167235495, "grad_norm": 0.38985140685038283, "learning_rate": 3.642053609238181e-05, "loss": 0.44, "step": 803 }, { "epoch": 1.372013651877133, "grad_norm": 0.4499721307861487, "learning_rate": 3.640692184336221e-05, "loss": 0.4532, "step": 804 }, { "epoch": 1.3737201365187715, "grad_norm": 0.4228258907033133, "learning_rate": 3.639328430771792e-05, "loss": 0.4286, "step": 805 }, { "epoch": 1.3754266211604096, "grad_norm": 0.43820754146061625, "learning_rate": 3.637962350480492e-05, "loss": 0.4308, "step": 806 }, { "epoch": 1.3771331058020477, "grad_norm": 0.4104514343792071, "learning_rate": 3.636593945401224e-05, "loss": 0.4435, "step": 807 }, { "epoch": 1.378839590443686, "grad_norm": 0.4523514499643776, "learning_rate": 3.6352232174761865e-05, "loss": 0.4238, "step": 808 }, { "epoch": 1.3805460750853242, "grad_norm": 0.4055406278240278, "learning_rate": 3.633850168650879e-05, "loss": 0.4741, "step": 809 }, { "epoch": 1.3822525597269624, "grad_norm": 0.43571497578619445, "learning_rate": 3.6324748008740925e-05, "loss": 0.4182, "step": 810 }, { "epoch": 1.3839590443686007, "grad_norm": 0.38896961713941225, "learning_rate": 3.63109711609791e-05, "loss": 0.4329, "step": 811 }, { "epoch": 1.3856655290102389, "grad_norm": 0.4011016710632721, "learning_rate": 3.629717116277702e-05, "loss": 0.4561, "step": 812 }, { "epoch": 1.387372013651877, "grad_norm": 0.3774279212199751, "learning_rate": 3.628334803372127e-05, "loss": 0.4574, "step": 813 }, { "epoch": 1.3890784982935154, "grad_norm": 0.3854368012109252, "learning_rate": 3.626950179343126e-05, "loss": 0.4419, "step": 814 }, { "epoch": 1.3907849829351535, "grad_norm": 0.4029903450946548, "learning_rate": 3.6255632461559176e-05, "loss": 0.4679, "step": 815 }, { "epoch": 1.3924914675767919, "grad_norm": 0.4146348888054304, "learning_rate": 3.624174005779002e-05, "loss": 0.4171, "step": 816 }, { "epoch": 1.39419795221843, "grad_norm": 0.42837264640372247, "learning_rate": 3.62278246018415e-05, "loss": 0.4318, "step": 817 }, { "epoch": 1.3959044368600684, "grad_norm": 0.4144169325480012, "learning_rate": 3.621388611346407e-05, "loss": 0.4142, "step": 818 }, { "epoch": 1.3976109215017065, "grad_norm": 0.42446552773541035, "learning_rate": 3.6199924612440855e-05, "loss": 0.4448, "step": 819 }, { "epoch": 1.3993174061433447, "grad_norm": 0.3969940864973572, "learning_rate": 3.6185940118587673e-05, "loss": 0.4332, "step": 820 }, { "epoch": 1.401023890784983, "grad_norm": 0.41046129721730956, "learning_rate": 3.617193265175293e-05, "loss": 0.4448, "step": 821 }, { "epoch": 1.4027303754266212, "grad_norm": 0.3735310996078358, "learning_rate": 3.615790223181768e-05, "loss": 0.3887, "step": 822 }, { "epoch": 1.4044368600682593, "grad_norm": 0.4665973159613071, "learning_rate": 3.614384887869553e-05, "loss": 0.4463, "step": 823 }, { "epoch": 1.4061433447098977, "grad_norm": 0.3937368085046607, "learning_rate": 3.612977261233265e-05, "loss": 0.4331, "step": 824 }, { "epoch": 1.4078498293515358, "grad_norm": 0.4617674156449415, "learning_rate": 3.611567345270772e-05, "loss": 0.4387, "step": 825 }, { "epoch": 1.409556313993174, "grad_norm": 0.4203672742229765, "learning_rate": 3.610155141983192e-05, "loss": 0.4434, "step": 826 }, { "epoch": 1.4112627986348123, "grad_norm": 0.40848430637835254, "learning_rate": 3.608740653374889e-05, "loss": 0.4229, "step": 827 }, { "epoch": 1.4129692832764504, "grad_norm": 0.47426540651765864, "learning_rate": 3.607323881453472e-05, "loss": 0.4154, "step": 828 }, { "epoch": 1.4146757679180888, "grad_norm": 0.41797039048076445, "learning_rate": 3.6059048282297887e-05, "loss": 0.4241, "step": 829 }, { "epoch": 1.416382252559727, "grad_norm": 0.4135512178844431, "learning_rate": 3.604483495717926e-05, "loss": 0.4031, "step": 830 }, { "epoch": 1.4180887372013653, "grad_norm": 0.41837197750420363, "learning_rate": 3.603059885935205e-05, "loss": 0.4146, "step": 831 }, { "epoch": 1.4197952218430034, "grad_norm": 0.40027620826967797, "learning_rate": 3.601634000902179e-05, "loss": 0.423, "step": 832 }, { "epoch": 1.4215017064846416, "grad_norm": 0.4256842466443175, "learning_rate": 3.600205842642632e-05, "loss": 0.4551, "step": 833 }, { "epoch": 1.42320819112628, "grad_norm": 0.4550854880984585, "learning_rate": 3.598775413183573e-05, "loss": 0.4599, "step": 834 }, { "epoch": 1.424914675767918, "grad_norm": 0.4815447932080825, "learning_rate": 3.597342714555235e-05, "loss": 0.4036, "step": 835 }, { "epoch": 1.4266211604095562, "grad_norm": 0.43871354888787345, "learning_rate": 3.595907748791071e-05, "loss": 0.4119, "step": 836 }, { "epoch": 1.4283276450511946, "grad_norm": 0.46235121968022785, "learning_rate": 3.594470517927755e-05, "loss": 0.4477, "step": 837 }, { "epoch": 1.4300341296928327, "grad_norm": 0.5716499129913892, "learning_rate": 3.59303102400517e-05, "loss": 0.4315, "step": 838 }, { "epoch": 1.4317406143344709, "grad_norm": 0.4513547768347061, "learning_rate": 3.591589269066416e-05, "loss": 0.4293, "step": 839 }, { "epoch": 1.4334470989761092, "grad_norm": 0.5258567217520957, "learning_rate": 3.5901452551578e-05, "loss": 0.4333, "step": 840 }, { "epoch": 1.4351535836177474, "grad_norm": 0.4446607367154777, "learning_rate": 3.5886989843288364e-05, "loss": 0.4607, "step": 841 }, { "epoch": 1.4368600682593857, "grad_norm": 0.4557637105087839, "learning_rate": 3.587250458632241e-05, "loss": 0.4472, "step": 842 }, { "epoch": 1.4385665529010239, "grad_norm": 0.4577223794812873, "learning_rate": 3.585799680123932e-05, "loss": 0.4487, "step": 843 }, { "epoch": 1.4402730375426622, "grad_norm": 0.42430168888349257, "learning_rate": 3.584346650863024e-05, "loss": 0.4452, "step": 844 }, { "epoch": 1.4419795221843004, "grad_norm": 0.37398876259118063, "learning_rate": 3.582891372911825e-05, "loss": 0.3806, "step": 845 }, { "epoch": 1.4436860068259385, "grad_norm": 0.416566462619093, "learning_rate": 3.581433848335838e-05, "loss": 0.4733, "step": 846 }, { "epoch": 1.4453924914675769, "grad_norm": 0.36234070654329864, "learning_rate": 3.5799740792037515e-05, "loss": 0.4224, "step": 847 }, { "epoch": 1.447098976109215, "grad_norm": 0.4129583494033684, "learning_rate": 3.578512067587441e-05, "loss": 0.4305, "step": 848 }, { "epoch": 1.4488054607508531, "grad_norm": 0.40331761774448877, "learning_rate": 3.5770478155619636e-05, "loss": 0.4727, "step": 849 }, { "epoch": 1.4505119453924915, "grad_norm": 0.5056379260288237, "learning_rate": 3.575581325205558e-05, "loss": 0.4231, "step": 850 }, { "epoch": 1.4522184300341296, "grad_norm": 0.3654321179751699, "learning_rate": 3.574112598599639e-05, "loss": 0.4269, "step": 851 }, { "epoch": 1.4539249146757678, "grad_norm": 0.4250546017967828, "learning_rate": 3.5726416378287965e-05, "loss": 0.4627, "step": 852 }, { "epoch": 1.4556313993174061, "grad_norm": 0.3678010879072778, "learning_rate": 3.571168444980788e-05, "loss": 0.3916, "step": 853 }, { "epoch": 1.4573378839590443, "grad_norm": 0.39832118428914814, "learning_rate": 3.5696930221465427e-05, "loss": 0.4594, "step": 854 }, { "epoch": 1.4590443686006827, "grad_norm": 0.35873588194403805, "learning_rate": 3.568215371420153e-05, "loss": 0.4414, "step": 855 }, { "epoch": 1.4607508532423208, "grad_norm": 0.42738781031288464, "learning_rate": 3.566735494898875e-05, "loss": 0.4259, "step": 856 }, { "epoch": 1.4624573378839592, "grad_norm": 0.4421005033450358, "learning_rate": 3.565253394683121e-05, "loss": 0.4236, "step": 857 }, { "epoch": 1.4641638225255973, "grad_norm": 0.4006965058599362, "learning_rate": 3.563769072876463e-05, "loss": 0.4778, "step": 858 }, { "epoch": 1.4658703071672354, "grad_norm": 0.42058567843765865, "learning_rate": 3.5622825315856223e-05, "loss": 0.4762, "step": 859 }, { "epoch": 1.4675767918088738, "grad_norm": 0.3584682257159212, "learning_rate": 3.560793772920474e-05, "loss": 0.4568, "step": 860 }, { "epoch": 1.469283276450512, "grad_norm": 0.40671444966184894, "learning_rate": 3.559302798994038e-05, "loss": 0.5237, "step": 861 }, { "epoch": 1.47098976109215, "grad_norm": 0.4258033641750018, "learning_rate": 3.557809611922479e-05, "loss": 0.4448, "step": 862 }, { "epoch": 1.4726962457337884, "grad_norm": 0.49700500782089985, "learning_rate": 3.556314213825103e-05, "loss": 0.4191, "step": 863 }, { "epoch": 1.4744027303754266, "grad_norm": 0.4343913869278087, "learning_rate": 3.5548166068243554e-05, "loss": 0.4241, "step": 864 }, { "epoch": 1.4761092150170647, "grad_norm": 0.4111341986973055, "learning_rate": 3.553316793045813e-05, "loss": 0.4802, "step": 865 }, { "epoch": 1.477815699658703, "grad_norm": 0.4258209373245728, "learning_rate": 3.551814774618189e-05, "loss": 0.4434, "step": 866 }, { "epoch": 1.4795221843003412, "grad_norm": 0.46306345542355654, "learning_rate": 3.550310553673323e-05, "loss": 0.4619, "step": 867 }, { "epoch": 1.4812286689419796, "grad_norm": 0.4581807591999424, "learning_rate": 3.548804132346182e-05, "loss": 0.4149, "step": 868 }, { "epoch": 1.4829351535836177, "grad_norm": 0.3949912398526118, "learning_rate": 3.547295512774855e-05, "loss": 0.4335, "step": 869 }, { "epoch": 1.484641638225256, "grad_norm": 0.4220703031918266, "learning_rate": 3.545784697100551e-05, "loss": 0.3977, "step": 870 }, { "epoch": 1.4863481228668942, "grad_norm": 0.46599454689196135, "learning_rate": 3.544271687467599e-05, "loss": 0.4523, "step": 871 }, { "epoch": 1.4880546075085324, "grad_norm": 0.401862878134278, "learning_rate": 3.542756486023437e-05, "loss": 0.4362, "step": 872 }, { "epoch": 1.4897610921501707, "grad_norm": 0.4813492793507341, "learning_rate": 3.541239094918617e-05, "loss": 0.4271, "step": 873 }, { "epoch": 1.4914675767918089, "grad_norm": 0.4182023307175445, "learning_rate": 3.5397195163067985e-05, "loss": 0.4476, "step": 874 }, { "epoch": 1.493174061433447, "grad_norm": 0.42276483733512216, "learning_rate": 3.5381977523447454e-05, "loss": 0.4163, "step": 875 }, { "epoch": 1.4948805460750854, "grad_norm": 0.47632383003430656, "learning_rate": 3.536673805192323e-05, "loss": 0.4698, "step": 876 }, { "epoch": 1.4965870307167235, "grad_norm": 0.36931128067340063, "learning_rate": 3.535147677012495e-05, "loss": 0.4486, "step": 877 }, { "epoch": 1.4982935153583616, "grad_norm": 0.4604522960110994, "learning_rate": 3.533619369971322e-05, "loss": 0.4119, "step": 878 }, { "epoch": 1.5, "grad_norm": 0.3975754189073352, "learning_rate": 3.532088886237956e-05, "loss": 0.4479, "step": 879 }, { "epoch": 1.5017064846416384, "grad_norm": 0.37769858386666094, "learning_rate": 3.530556227984639e-05, "loss": 0.4059, "step": 880 }, { "epoch": 1.5034129692832765, "grad_norm": 0.43939814983028674, "learning_rate": 3.5290213973867e-05, "loss": 0.448, "step": 881 }, { "epoch": 1.5051194539249146, "grad_norm": 0.3868394972875524, "learning_rate": 3.527484396622548e-05, "loss": 0.4237, "step": 882 }, { "epoch": 1.506825938566553, "grad_norm": 0.3887541284770232, "learning_rate": 3.525945227873676e-05, "loss": 0.4658, "step": 883 }, { "epoch": 1.5085324232081911, "grad_norm": 0.37052252214316006, "learning_rate": 3.524403893324653e-05, "loss": 0.4039, "step": 884 }, { "epoch": 1.5102389078498293, "grad_norm": 0.39225305033832947, "learning_rate": 3.52286039516312e-05, "loss": 0.4296, "step": 885 }, { "epoch": 1.5119453924914676, "grad_norm": 0.4668921452833352, "learning_rate": 3.52131473557979e-05, "loss": 0.4836, "step": 886 }, { "epoch": 1.5136518771331058, "grad_norm": 0.41657018354420566, "learning_rate": 3.519766916768447e-05, "loss": 0.4585, "step": 887 }, { "epoch": 1.515358361774744, "grad_norm": 0.4013173259210855, "learning_rate": 3.518216940925934e-05, "loss": 0.4553, "step": 888 }, { "epoch": 1.5170648464163823, "grad_norm": 0.5024901693039066, "learning_rate": 3.516664810252159e-05, "loss": 0.4347, "step": 889 }, { "epoch": 1.5187713310580204, "grad_norm": 0.37789225448719616, "learning_rate": 3.5151105269500876e-05, "loss": 0.4156, "step": 890 }, { "epoch": 1.5204778156996586, "grad_norm": 0.508454307999391, "learning_rate": 3.513554093225741e-05, "loss": 0.4029, "step": 891 }, { "epoch": 1.522184300341297, "grad_norm": 0.45808430716907134, "learning_rate": 3.511995511288191e-05, "loss": 0.4261, "step": 892 }, { "epoch": 1.5238907849829353, "grad_norm": 0.48105430594252147, "learning_rate": 3.510434783349562e-05, "loss": 0.4382, "step": 893 }, { "epoch": 1.5255972696245734, "grad_norm": 0.40020881894592886, "learning_rate": 3.50887191162502e-05, "loss": 0.4129, "step": 894 }, { "epoch": 1.5273037542662116, "grad_norm": 0.46194774910126624, "learning_rate": 3.507306898332775e-05, "loss": 0.4352, "step": 895 }, { "epoch": 1.52901023890785, "grad_norm": 0.4170030314335261, "learning_rate": 3.5057397456940786e-05, "loss": 0.417, "step": 896 }, { "epoch": 1.530716723549488, "grad_norm": 0.5223695395475298, "learning_rate": 3.504170455933216e-05, "loss": 0.4974, "step": 897 }, { "epoch": 1.5324232081911262, "grad_norm": 0.40890728841524154, "learning_rate": 3.502599031277509e-05, "loss": 0.4783, "step": 898 }, { "epoch": 1.5341296928327646, "grad_norm": 0.4870710591339736, "learning_rate": 3.501025473957305e-05, "loss": 0.4178, "step": 899 }, { "epoch": 1.5358361774744027, "grad_norm": 0.42576048970858366, "learning_rate": 3.4994497862059824e-05, "loss": 0.4206, "step": 900 }, { "epoch": 1.5375426621160408, "grad_norm": 0.38954421729882216, "learning_rate": 3.497871970259942e-05, "loss": 0.4397, "step": 901 }, { "epoch": 1.5392491467576792, "grad_norm": 0.41905274040841317, "learning_rate": 3.496292028358604e-05, "loss": 0.4368, "step": 902 }, { "epoch": 1.5409556313993176, "grad_norm": 0.43303839375027164, "learning_rate": 3.4947099627444074e-05, "loss": 0.4106, "step": 903 }, { "epoch": 1.5426621160409555, "grad_norm": 0.450215779773176, "learning_rate": 3.493125775662805e-05, "loss": 0.4352, "step": 904 }, { "epoch": 1.5443686006825939, "grad_norm": 0.47692891229434176, "learning_rate": 3.49153946936226e-05, "loss": 0.4565, "step": 905 }, { "epoch": 1.5460750853242322, "grad_norm": 0.4373631729864461, "learning_rate": 3.489951046094245e-05, "loss": 0.4176, "step": 906 }, { "epoch": 1.5477815699658704, "grad_norm": 0.45636172992483093, "learning_rate": 3.488360508113235e-05, "loss": 0.4953, "step": 907 }, { "epoch": 1.5494880546075085, "grad_norm": 0.39474307753636256, "learning_rate": 3.4867678576767093e-05, "loss": 0.4087, "step": 908 }, { "epoch": 1.5511945392491469, "grad_norm": 0.4661296639102082, "learning_rate": 3.4851730970451434e-05, "loss": 0.4117, "step": 909 }, { "epoch": 1.552901023890785, "grad_norm": 0.45467468666026545, "learning_rate": 3.483576228482008e-05, "loss": 0.4424, "step": 910 }, { "epoch": 1.5546075085324231, "grad_norm": 0.48820547483802595, "learning_rate": 3.481977254253765e-05, "loss": 0.4964, "step": 911 }, { "epoch": 1.5563139931740615, "grad_norm": 0.4204745027701614, "learning_rate": 3.480376176629868e-05, "loss": 0.4365, "step": 912 }, { "epoch": 1.5580204778156996, "grad_norm": 0.42272646261560387, "learning_rate": 3.478772997882753e-05, "loss": 0.4425, "step": 913 }, { "epoch": 1.5597269624573378, "grad_norm": 0.4468729774453571, "learning_rate": 3.4771677202878385e-05, "loss": 0.5113, "step": 914 }, { "epoch": 1.5614334470989761, "grad_norm": 0.4375932249905856, "learning_rate": 3.475560346123523e-05, "loss": 0.4551, "step": 915 }, { "epoch": 1.5631399317406145, "grad_norm": 0.34994039025065354, "learning_rate": 3.473950877671179e-05, "loss": 0.4147, "step": 916 }, { "epoch": 1.5648464163822524, "grad_norm": 0.4038765449860422, "learning_rate": 3.472339317215154e-05, "loss": 0.4225, "step": 917 }, { "epoch": 1.5665529010238908, "grad_norm": 0.4761160218072004, "learning_rate": 3.4707256670427627e-05, "loss": 0.4501, "step": 918 }, { "epoch": 1.5682593856655291, "grad_norm": 0.3461893030643718, "learning_rate": 3.4691099294442864e-05, "loss": 0.4178, "step": 919 }, { "epoch": 1.5699658703071673, "grad_norm": 0.39888863696078497, "learning_rate": 3.467492106712969e-05, "loss": 0.4386, "step": 920 }, { "epoch": 1.5716723549488054, "grad_norm": 0.4003656769181341, "learning_rate": 3.4658722011450145e-05, "loss": 0.481, "step": 921 }, { "epoch": 1.5733788395904438, "grad_norm": 0.42337446898808284, "learning_rate": 3.464250215039582e-05, "loss": 0.3887, "step": 922 }, { "epoch": 1.575085324232082, "grad_norm": 0.4076560575146216, "learning_rate": 3.4626261506987834e-05, "loss": 0.4886, "step": 923 }, { "epoch": 1.57679180887372, "grad_norm": 0.4039618681457414, "learning_rate": 3.461000010427683e-05, "loss": 0.4292, "step": 924 }, { "epoch": 1.5784982935153584, "grad_norm": 0.41161267348770914, "learning_rate": 3.4593717965342884e-05, "loss": 0.4227, "step": 925 }, { "epoch": 1.5802047781569966, "grad_norm": 0.4363220450461537, "learning_rate": 3.457741511329551e-05, "loss": 0.3945, "step": 926 }, { "epoch": 1.5819112627986347, "grad_norm": 0.4212142249085022, "learning_rate": 3.4561091571273625e-05, "loss": 0.4064, "step": 927 }, { "epoch": 1.583617747440273, "grad_norm": 0.4073724699295551, "learning_rate": 3.4544747362445524e-05, "loss": 0.4875, "step": 928 }, { "epoch": 1.5853242320819114, "grad_norm": 0.48761202784212476, "learning_rate": 3.45283825100088e-05, "loss": 0.4615, "step": 929 }, { "epoch": 1.5870307167235493, "grad_norm": 0.3988491731909571, "learning_rate": 3.451199703719039e-05, "loss": 0.4145, "step": 930 }, { "epoch": 1.5887372013651877, "grad_norm": 0.42670855330547913, "learning_rate": 3.449559096724646e-05, "loss": 0.452, "step": 931 }, { "epoch": 1.590443686006826, "grad_norm": 0.389587683305919, "learning_rate": 3.4479164323462436e-05, "loss": 0.4067, "step": 932 }, { "epoch": 1.5921501706484642, "grad_norm": 0.41461434900801203, "learning_rate": 3.446271712915294e-05, "loss": 0.4555, "step": 933 }, { "epoch": 1.5938566552901023, "grad_norm": 0.40604885019880904, "learning_rate": 3.444624940766173e-05, "loss": 0.4417, "step": 934 }, { "epoch": 1.5955631399317407, "grad_norm": 0.3909617504686526, "learning_rate": 3.442976118236175e-05, "loss": 0.4226, "step": 935 }, { "epoch": 1.5972696245733788, "grad_norm": 0.4369790065778818, "learning_rate": 3.4413252476655e-05, "loss": 0.4459, "step": 936 }, { "epoch": 1.598976109215017, "grad_norm": 0.4160896979655948, "learning_rate": 3.439672331397259e-05, "loss": 0.4324, "step": 937 }, { "epoch": 1.6006825938566553, "grad_norm": 0.40068228702102676, "learning_rate": 3.4380173717774635e-05, "loss": 0.4231, "step": 938 }, { "epoch": 1.6023890784982935, "grad_norm": 0.5049147324987331, "learning_rate": 3.436360371155025e-05, "loss": 0.4489, "step": 939 }, { "epoch": 1.6040955631399316, "grad_norm": 0.41968825833903556, "learning_rate": 3.434701331881754e-05, "loss": 0.4231, "step": 940 }, { "epoch": 1.60580204778157, "grad_norm": 0.45615630046040506, "learning_rate": 3.433040256312352e-05, "loss": 0.4533, "step": 941 }, { "epoch": 1.6075085324232083, "grad_norm": 0.42769386304041224, "learning_rate": 3.431377146804414e-05, "loss": 0.4653, "step": 942 }, { "epoch": 1.6092150170648463, "grad_norm": 0.41982860108854264, "learning_rate": 3.429712005718417e-05, "loss": 0.4273, "step": 943 }, { "epoch": 1.6109215017064846, "grad_norm": 0.4127398132688913, "learning_rate": 3.4280448354177275e-05, "loss": 0.4324, "step": 944 }, { "epoch": 1.612627986348123, "grad_norm": 0.4386242108058767, "learning_rate": 3.426375638268586e-05, "loss": 0.4409, "step": 945 }, { "epoch": 1.6143344709897611, "grad_norm": 0.41443492371586427, "learning_rate": 3.424704416640115e-05, "loss": 0.4001, "step": 946 }, { "epoch": 1.6160409556313993, "grad_norm": 0.41419820104479615, "learning_rate": 3.423031172904305e-05, "loss": 0.4275, "step": 947 }, { "epoch": 1.6177474402730376, "grad_norm": 0.4171958425823162, "learning_rate": 3.421355909436022e-05, "loss": 0.4578, "step": 948 }, { "epoch": 1.6194539249146758, "grad_norm": 0.35257128369438767, "learning_rate": 3.4196786286129945e-05, "loss": 0.4287, "step": 949 }, { "epoch": 1.621160409556314, "grad_norm": 0.4523630638066935, "learning_rate": 3.417999332815817e-05, "loss": 0.4475, "step": 950 }, { "epoch": 1.6228668941979523, "grad_norm": 0.38733410491674525, "learning_rate": 3.416318024427942e-05, "loss": 0.4415, "step": 951 }, { "epoch": 1.6245733788395904, "grad_norm": 0.4853033858137463, "learning_rate": 3.414634705835679e-05, "loss": 0.4385, "step": 952 }, { "epoch": 1.6262798634812285, "grad_norm": 0.398748208356677, "learning_rate": 3.412949379428192e-05, "loss": 0.4077, "step": 953 }, { "epoch": 1.627986348122867, "grad_norm": 0.4275818403560876, "learning_rate": 3.411262047597492e-05, "loss": 0.4107, "step": 954 }, { "epoch": 1.6296928327645053, "grad_norm": 0.4535435912627168, "learning_rate": 3.40957271273844e-05, "loss": 0.4263, "step": 955 }, { "epoch": 1.6313993174061432, "grad_norm": 0.40462878667784596, "learning_rate": 3.407881377248736e-05, "loss": 0.4792, "step": 956 }, { "epoch": 1.6331058020477816, "grad_norm": 0.37436342835477465, "learning_rate": 3.4061880435289214e-05, "loss": 0.4293, "step": 957 }, { "epoch": 1.63481228668942, "grad_norm": 0.41145182800772445, "learning_rate": 3.404492713982375e-05, "loss": 0.4532, "step": 958 }, { "epoch": 1.636518771331058, "grad_norm": 0.47069268076938797, "learning_rate": 3.402795391015307e-05, "loss": 0.4421, "step": 959 }, { "epoch": 1.6382252559726962, "grad_norm": 0.3894286690190037, "learning_rate": 3.401096077036755e-05, "loss": 0.4508, "step": 960 }, { "epoch": 1.6399317406143346, "grad_norm": 0.4354063816331828, "learning_rate": 3.399394774458586e-05, "loss": 0.464, "step": 961 }, { "epoch": 1.6416382252559727, "grad_norm": 0.4388846590151156, "learning_rate": 3.3976914856954876e-05, "loss": 0.4518, "step": 962 }, { "epoch": 1.6433447098976108, "grad_norm": 0.45091502302612824, "learning_rate": 3.3959862131649665e-05, "loss": 0.4192, "step": 963 }, { "epoch": 1.6450511945392492, "grad_norm": 0.4651907434306189, "learning_rate": 3.3942789592873454e-05, "loss": 0.5461, "step": 964 }, { "epoch": 1.6467576791808873, "grad_norm": 0.4245109764212635, "learning_rate": 3.392569726485759e-05, "loss": 0.4212, "step": 965 }, { "epoch": 1.6484641638225255, "grad_norm": 0.4288455867724874, "learning_rate": 3.390858517186149e-05, "loss": 0.421, "step": 966 }, { "epoch": 1.6501706484641638, "grad_norm": 0.4864298431309527, "learning_rate": 3.389145333817266e-05, "loss": 0.4871, "step": 967 }, { "epoch": 1.6518771331058022, "grad_norm": 0.4545088336033929, "learning_rate": 3.387430178810661e-05, "loss": 0.4202, "step": 968 }, { "epoch": 1.6535836177474401, "grad_norm": 0.37090624906992437, "learning_rate": 3.38571305460068e-05, "loss": 0.4182, "step": 969 }, { "epoch": 1.6552901023890785, "grad_norm": 0.3815866609881445, "learning_rate": 3.383993963624469e-05, "loss": 0.4115, "step": 970 }, { "epoch": 1.6569965870307168, "grad_norm": 0.39516260065390635, "learning_rate": 3.3822729083219635e-05, "loss": 0.4106, "step": 971 }, { "epoch": 1.658703071672355, "grad_norm": 0.41165425795653404, "learning_rate": 3.380549891135884e-05, "loss": 0.4509, "step": 972 }, { "epoch": 1.6604095563139931, "grad_norm": 0.4106639445516941, "learning_rate": 3.378824914511741e-05, "loss": 0.4139, "step": 973 }, { "epoch": 1.6621160409556315, "grad_norm": 0.42566925025520447, "learning_rate": 3.3770979808978225e-05, "loss": 0.456, "step": 974 }, { "epoch": 1.6638225255972696, "grad_norm": 0.39512181124894374, "learning_rate": 3.375369092745195e-05, "loss": 0.4079, "step": 975 }, { "epoch": 1.6655290102389078, "grad_norm": 0.38230396858721344, "learning_rate": 3.373638252507698e-05, "loss": 0.4228, "step": 976 }, { "epoch": 1.6672354948805461, "grad_norm": 0.47063681118009787, "learning_rate": 3.371905462641944e-05, "loss": 0.4472, "step": 977 }, { "epoch": 1.6689419795221843, "grad_norm": 0.4447077825481116, "learning_rate": 3.3701707256073105e-05, "loss": 0.4496, "step": 978 }, { "epoch": 1.6706484641638224, "grad_norm": 0.4060212414485413, "learning_rate": 3.3684340438659405e-05, "loss": 0.4272, "step": 979 }, { "epoch": 1.6723549488054608, "grad_norm": 0.3795742306274751, "learning_rate": 3.366695419882734e-05, "loss": 0.4231, "step": 980 }, { "epoch": 1.6740614334470991, "grad_norm": 0.43991563290482527, "learning_rate": 3.364954856125351e-05, "loss": 0.4345, "step": 981 }, { "epoch": 1.675767918088737, "grad_norm": 0.3877476024110567, "learning_rate": 3.363212355064205e-05, "loss": 0.4349, "step": 982 }, { "epoch": 1.6774744027303754, "grad_norm": 0.4179531198525114, "learning_rate": 3.361467919172454e-05, "loss": 0.4507, "step": 983 }, { "epoch": 1.6791808873720138, "grad_norm": 0.39344144979568285, "learning_rate": 3.3597215509260086e-05, "loss": 0.444, "step": 984 }, { "epoch": 1.680887372013652, "grad_norm": 0.3724567972038973, "learning_rate": 3.357973252803518e-05, "loss": 0.4416, "step": 985 }, { "epoch": 1.68259385665529, "grad_norm": 0.3595142334695815, "learning_rate": 3.356223027286372e-05, "loss": 0.4182, "step": 986 }, { "epoch": 1.6843003412969284, "grad_norm": 0.4257994094890164, "learning_rate": 3.354470876858695e-05, "loss": 0.4591, "step": 987 }, { "epoch": 1.6860068259385665, "grad_norm": 0.38949058168413975, "learning_rate": 3.3527168040073446e-05, "loss": 0.4545, "step": 988 }, { "epoch": 1.6877133105802047, "grad_norm": 0.42007113519918304, "learning_rate": 3.3509608112219055e-05, "loss": 0.4369, "step": 989 }, { "epoch": 1.689419795221843, "grad_norm": 0.36248815732923834, "learning_rate": 3.34920290099469e-05, "loss": 0.4129, "step": 990 }, { "epoch": 1.6911262798634812, "grad_norm": 0.4458775071227498, "learning_rate": 3.347443075820729e-05, "loss": 0.4485, "step": 991 }, { "epoch": 1.6928327645051193, "grad_norm": 0.3893563280856113, "learning_rate": 3.345681338197772e-05, "loss": 0.432, "step": 992 }, { "epoch": 1.6945392491467577, "grad_norm": 0.3739107046460591, "learning_rate": 3.3439176906262835e-05, "loss": 0.4681, "step": 993 }, { "epoch": 1.696245733788396, "grad_norm": 0.36502308371838477, "learning_rate": 3.34215213560944e-05, "loss": 0.4231, "step": 994 }, { "epoch": 1.697952218430034, "grad_norm": 0.40461580892457993, "learning_rate": 3.340384675653123e-05, "loss": 0.4812, "step": 995 }, { "epoch": 1.6996587030716723, "grad_norm": 0.4621930292777579, "learning_rate": 3.3386153132659184e-05, "loss": 0.4498, "step": 996 }, { "epoch": 1.7013651877133107, "grad_norm": 0.3505765117737797, "learning_rate": 3.336844050959113e-05, "loss": 0.4362, "step": 997 }, { "epoch": 1.7030716723549488, "grad_norm": 0.4085279897692518, "learning_rate": 3.335070891246689e-05, "loss": 0.5195, "step": 998 }, { "epoch": 1.704778156996587, "grad_norm": 0.4011492088215556, "learning_rate": 3.3332958366453225e-05, "loss": 0.429, "step": 999 }, { "epoch": 1.7064846416382253, "grad_norm": 0.3873576769554521, "learning_rate": 3.3315188896743796e-05, "loss": 0.4301, "step": 1000 }, { "epoch": 1.7081911262798635, "grad_norm": 0.43132909483485776, "learning_rate": 3.32974005285591e-05, "loss": 0.4484, "step": 1001 }, { "epoch": 1.7098976109215016, "grad_norm": 0.34576970317395267, "learning_rate": 3.327959328714649e-05, "loss": 0.4332, "step": 1002 }, { "epoch": 1.71160409556314, "grad_norm": 0.42511892675815227, "learning_rate": 3.326176719778008e-05, "loss": 0.4389, "step": 1003 }, { "epoch": 1.713310580204778, "grad_norm": 0.3821873635784118, "learning_rate": 3.3243922285760736e-05, "loss": 0.4691, "step": 1004 }, { "epoch": 1.7150170648464163, "grad_norm": 0.3892273970414259, "learning_rate": 3.322605857641606e-05, "loss": 0.406, "step": 1005 }, { "epoch": 1.7167235494880546, "grad_norm": 0.4004541387857402, "learning_rate": 3.320817609510032e-05, "loss": 0.3913, "step": 1006 }, { "epoch": 1.718430034129693, "grad_norm": 0.4130824978333927, "learning_rate": 3.319027486719441e-05, "loss": 0.4311, "step": 1007 }, { "epoch": 1.7201365187713311, "grad_norm": 0.35282306819720366, "learning_rate": 3.3172354918105864e-05, "loss": 0.4356, "step": 1008 }, { "epoch": 1.7218430034129693, "grad_norm": 0.4109825192006032, "learning_rate": 3.3154416273268766e-05, "loss": 0.4107, "step": 1009 }, { "epoch": 1.7235494880546076, "grad_norm": 0.3951503042835757, "learning_rate": 3.313645895814375e-05, "loss": 0.4129, "step": 1010 }, { "epoch": 1.7252559726962458, "grad_norm": 0.381791539552012, "learning_rate": 3.311848299821793e-05, "loss": 0.4365, "step": 1011 }, { "epoch": 1.726962457337884, "grad_norm": 0.47677304464409714, "learning_rate": 3.31004884190049e-05, "loss": 0.4561, "step": 1012 }, { "epoch": 1.7286689419795223, "grad_norm": 0.3537629215873041, "learning_rate": 3.3082475246044666e-05, "loss": 0.4624, "step": 1013 }, { "epoch": 1.7303754266211604, "grad_norm": 0.37454400532288973, "learning_rate": 3.306444350490364e-05, "loss": 0.4184, "step": 1014 }, { "epoch": 1.7320819112627985, "grad_norm": 0.4527335732070691, "learning_rate": 3.3046393221174584e-05, "loss": 0.4619, "step": 1015 }, { "epoch": 1.733788395904437, "grad_norm": 0.34280779884824736, "learning_rate": 3.302832442047656e-05, "loss": 0.3938, "step": 1016 }, { "epoch": 1.735494880546075, "grad_norm": 0.39422292233838113, "learning_rate": 3.301023712845494e-05, "loss": 0.4844, "step": 1017 }, { "epoch": 1.7372013651877132, "grad_norm": 0.40200103116961866, "learning_rate": 3.2992131370781324e-05, "loss": 0.4306, "step": 1018 }, { "epoch": 1.7389078498293515, "grad_norm": 0.3432667076140728, "learning_rate": 3.297400717315351e-05, "loss": 0.4444, "step": 1019 }, { "epoch": 1.74061433447099, "grad_norm": 0.39592905119657307, "learning_rate": 3.29558645612955e-05, "loss": 0.4496, "step": 1020 }, { "epoch": 1.742320819112628, "grad_norm": 0.3732006670729543, "learning_rate": 3.2937703560957405e-05, "loss": 0.4519, "step": 1021 }, { "epoch": 1.7440273037542662, "grad_norm": 0.4016176887258834, "learning_rate": 3.2919524197915436e-05, "loss": 0.4372, "step": 1022 }, { "epoch": 1.7457337883959045, "grad_norm": 0.4392872292719323, "learning_rate": 3.290132649797188e-05, "loss": 0.4682, "step": 1023 }, { "epoch": 1.7474402730375427, "grad_norm": 0.3769716755751727, "learning_rate": 3.288311048695506e-05, "loss": 0.416, "step": 1024 }, { "epoch": 1.7491467576791808, "grad_norm": 0.4341939032109703, "learning_rate": 3.2864876190719245e-05, "loss": 0.4574, "step": 1025 }, { "epoch": 1.7508532423208192, "grad_norm": 0.38497051556421485, "learning_rate": 3.28466236351447e-05, "loss": 0.4247, "step": 1026 }, { "epoch": 1.7525597269624573, "grad_norm": 0.38460545318593387, "learning_rate": 3.282835284613759e-05, "loss": 0.4178, "step": 1027 }, { "epoch": 1.7542662116040955, "grad_norm": 0.44963309509926697, "learning_rate": 3.281006384962994e-05, "loss": 0.4662, "step": 1028 }, { "epoch": 1.7559726962457338, "grad_norm": 0.39444392040125487, "learning_rate": 3.279175667157966e-05, "loss": 0.4409, "step": 1029 }, { "epoch": 1.757679180887372, "grad_norm": 0.379860254515413, "learning_rate": 3.277343133797042e-05, "loss": 0.4346, "step": 1030 }, { "epoch": 1.75938566552901, "grad_norm": 0.3928991006625797, "learning_rate": 3.2755087874811696e-05, "loss": 0.4365, "step": 1031 }, { "epoch": 1.7610921501706485, "grad_norm": 0.3758416937166645, "learning_rate": 3.2736726308138666e-05, "loss": 0.436, "step": 1032 }, { "epoch": 1.7627986348122868, "grad_norm": 0.3984699724279319, "learning_rate": 3.271834666401222e-05, "loss": 0.4159, "step": 1033 }, { "epoch": 1.764505119453925, "grad_norm": 0.40347824679120803, "learning_rate": 3.2699948968518905e-05, "loss": 0.4212, "step": 1034 }, { "epoch": 1.766211604095563, "grad_norm": 0.41223483424837204, "learning_rate": 3.268153324777088e-05, "loss": 0.4342, "step": 1035 }, { "epoch": 1.7679180887372015, "grad_norm": 0.37001044900354435, "learning_rate": 3.26630995279059e-05, "loss": 0.434, "step": 1036 }, { "epoch": 1.7696245733788396, "grad_norm": 0.39536129614409976, "learning_rate": 3.264464783508724e-05, "loss": 0.4762, "step": 1037 }, { "epoch": 1.7713310580204777, "grad_norm": 0.386527238328391, "learning_rate": 3.2626178195503725e-05, "loss": 0.4104, "step": 1038 }, { "epoch": 1.773037542662116, "grad_norm": 0.415262031466017, "learning_rate": 3.260769063536962e-05, "loss": 0.477, "step": 1039 }, { "epoch": 1.7747440273037542, "grad_norm": 0.37361364115284673, "learning_rate": 3.2589185180924634e-05, "loss": 0.4211, "step": 1040 }, { "epoch": 1.7764505119453924, "grad_norm": 0.41212711102387606, "learning_rate": 3.257066185843388e-05, "loss": 0.4284, "step": 1041 }, { "epoch": 1.7781569965870307, "grad_norm": 0.369270748557867, "learning_rate": 3.255212069418782e-05, "loss": 0.4463, "step": 1042 }, { "epoch": 1.7798634812286689, "grad_norm": 0.3712719038302196, "learning_rate": 3.253356171450225e-05, "loss": 0.4596, "step": 1043 }, { "epoch": 1.781569965870307, "grad_norm": 0.3792687716129807, "learning_rate": 3.251498494571825e-05, "loss": 0.4536, "step": 1044 }, { "epoch": 1.7832764505119454, "grad_norm": 0.3548862654831631, "learning_rate": 3.249639041420214e-05, "loss": 0.4211, "step": 1045 }, { "epoch": 1.7849829351535837, "grad_norm": 0.35616337807709136, "learning_rate": 3.247777814634545e-05, "loss": 0.4152, "step": 1046 }, { "epoch": 1.786689419795222, "grad_norm": 0.38087475817301647, "learning_rate": 3.245914816856491e-05, "loss": 0.398, "step": 1047 }, { "epoch": 1.78839590443686, "grad_norm": 0.4399343175733236, "learning_rate": 3.244050050730235e-05, "loss": 0.4572, "step": 1048 }, { "epoch": 1.7901023890784984, "grad_norm": 0.33778622101738565, "learning_rate": 3.242183518902471e-05, "loss": 0.4066, "step": 1049 }, { "epoch": 1.7918088737201365, "grad_norm": 0.34462130214733205, "learning_rate": 3.2403152240224016e-05, "loss": 0.4171, "step": 1050 }, { "epoch": 1.7935153583617747, "grad_norm": 0.3696951944918031, "learning_rate": 3.238445168741728e-05, "loss": 0.4322, "step": 1051 }, { "epoch": 1.795221843003413, "grad_norm": 0.34184919207550546, "learning_rate": 3.2365733557146524e-05, "loss": 0.4431, "step": 1052 }, { "epoch": 1.7969283276450512, "grad_norm": 0.38993811879144025, "learning_rate": 3.23469978759787e-05, "loss": 0.4215, "step": 1053 }, { "epoch": 1.7986348122866893, "grad_norm": 0.327127168964827, "learning_rate": 3.232824467050569e-05, "loss": 0.4107, "step": 1054 }, { "epoch": 1.8003412969283277, "grad_norm": 0.3996487580489501, "learning_rate": 3.2309473967344246e-05, "loss": 0.4827, "step": 1055 }, { "epoch": 1.802047781569966, "grad_norm": 0.3674756187328674, "learning_rate": 3.229068579313593e-05, "loss": 0.4398, "step": 1056 }, { "epoch": 1.803754266211604, "grad_norm": 0.41368242038508635, "learning_rate": 3.227188017454713e-05, "loss": 0.4392, "step": 1057 }, { "epoch": 1.8054607508532423, "grad_norm": 0.39890941941714153, "learning_rate": 3.225305713826898e-05, "loss": 0.4428, "step": 1058 }, { "epoch": 1.8071672354948807, "grad_norm": 0.41038117710239236, "learning_rate": 3.223421671101734e-05, "loss": 0.4228, "step": 1059 }, { "epoch": 1.8088737201365188, "grad_norm": 0.4226904975833523, "learning_rate": 3.2215358919532735e-05, "loss": 0.4105, "step": 1060 }, { "epoch": 1.810580204778157, "grad_norm": 0.3882678446191072, "learning_rate": 3.219648379058037e-05, "loss": 0.4476, "step": 1061 }, { "epoch": 1.8122866894197953, "grad_norm": 0.4087205311385779, "learning_rate": 3.217759135095004e-05, "loss": 0.429, "step": 1062 }, { "epoch": 1.8139931740614335, "grad_norm": 0.3340537469358327, "learning_rate": 3.215868162745609e-05, "loss": 0.4568, "step": 1063 }, { "epoch": 1.8156996587030716, "grad_norm": 0.37739535451555306, "learning_rate": 3.213975464693743e-05, "loss": 0.4448, "step": 1064 }, { "epoch": 1.81740614334471, "grad_norm": 0.359159589987258, "learning_rate": 3.2120810436257435e-05, "loss": 0.4423, "step": 1065 }, { "epoch": 1.819112627986348, "grad_norm": 0.3493876724755677, "learning_rate": 3.2101849022303955e-05, "loss": 0.4196, "step": 1066 }, { "epoch": 1.8208191126279862, "grad_norm": 0.3574344041797343, "learning_rate": 3.2082870431989245e-05, "loss": 0.4215, "step": 1067 }, { "epoch": 1.8225255972696246, "grad_norm": 0.3449199113564368, "learning_rate": 3.2063874692249947e-05, "loss": 0.4101, "step": 1068 }, { "epoch": 1.824232081911263, "grad_norm": 0.3778493662630658, "learning_rate": 3.204486183004703e-05, "loss": 0.4136, "step": 1069 }, { "epoch": 1.8259385665529009, "grad_norm": 0.3524828063713649, "learning_rate": 3.2025831872365784e-05, "loss": 0.4536, "step": 1070 }, { "epoch": 1.8276450511945392, "grad_norm": 0.33278250688476424, "learning_rate": 3.200678484621575e-05, "loss": 0.4134, "step": 1071 }, { "epoch": 1.8293515358361776, "grad_norm": 0.40248605788959163, "learning_rate": 3.19877207786307e-05, "loss": 0.4232, "step": 1072 }, { "epoch": 1.8310580204778157, "grad_norm": 0.3703730711217583, "learning_rate": 3.1968639696668584e-05, "loss": 0.4307, "step": 1073 }, { "epoch": 1.8327645051194539, "grad_norm": 0.3614945164676251, "learning_rate": 3.194954162741152e-05, "loss": 0.4467, "step": 1074 }, { "epoch": 1.8344709897610922, "grad_norm": 0.3540166492601531, "learning_rate": 3.1930426597965714e-05, "loss": 0.4411, "step": 1075 }, { "epoch": 1.8361774744027304, "grad_norm": 0.3696926170561594, "learning_rate": 3.1911294635461455e-05, "loss": 0.4348, "step": 1076 }, { "epoch": 1.8378839590443685, "grad_norm": 0.3662622788229689, "learning_rate": 3.189214576705307e-05, "loss": 0.4277, "step": 1077 }, { "epoch": 1.8395904436860069, "grad_norm": 0.3895871706871898, "learning_rate": 3.1872980019918864e-05, "loss": 0.4163, "step": 1078 }, { "epoch": 1.841296928327645, "grad_norm": 0.3438737423599471, "learning_rate": 3.1853797421261125e-05, "loss": 0.4037, "step": 1079 }, { "epoch": 1.8430034129692832, "grad_norm": 0.40485789953961415, "learning_rate": 3.183459799830603e-05, "loss": 0.4358, "step": 1080 }, { "epoch": 1.8447098976109215, "grad_norm": 0.3678614520421831, "learning_rate": 3.181538177830366e-05, "loss": 0.4076, "step": 1081 }, { "epoch": 1.8464163822525599, "grad_norm": 0.3891439721848829, "learning_rate": 3.179614878852792e-05, "loss": 0.4644, "step": 1082 }, { "epoch": 1.8481228668941978, "grad_norm": 0.5129982908158024, "learning_rate": 3.177689905627651e-05, "loss": 0.4325, "step": 1083 }, { "epoch": 1.8498293515358362, "grad_norm": 0.43424400708864125, "learning_rate": 3.1757632608870915e-05, "loss": 0.4242, "step": 1084 }, { "epoch": 1.8515358361774745, "grad_norm": 0.4108442271574386, "learning_rate": 3.173834947365634e-05, "loss": 0.4156, "step": 1085 }, { "epoch": 1.8532423208191127, "grad_norm": 0.3703682959673418, "learning_rate": 3.171904967800166e-05, "loss": 0.423, "step": 1086 }, { "epoch": 1.8549488054607508, "grad_norm": 0.3731841091953381, "learning_rate": 3.1699733249299395e-05, "loss": 0.4447, "step": 1087 }, { "epoch": 1.8566552901023892, "grad_norm": 0.3924445032580443, "learning_rate": 3.16804002149657e-05, "loss": 0.4099, "step": 1088 }, { "epoch": 1.8583617747440273, "grad_norm": 0.36438230574683184, "learning_rate": 3.166105060244029e-05, "loss": 0.4359, "step": 1089 }, { "epoch": 1.8600682593856654, "grad_norm": 0.3672867920210704, "learning_rate": 3.164168443918636e-05, "loss": 0.4371, "step": 1090 }, { "epoch": 1.8617747440273038, "grad_norm": 0.3619548462607363, "learning_rate": 3.1622301752690675e-05, "loss": 0.4287, "step": 1091 }, { "epoch": 1.863481228668942, "grad_norm": 0.3989562928398588, "learning_rate": 3.1602902570463396e-05, "loss": 0.3808, "step": 1092 }, { "epoch": 1.86518771331058, "grad_norm": 0.3400880684976596, "learning_rate": 3.158348692003812e-05, "loss": 0.4552, "step": 1093 }, { "epoch": 1.8668941979522184, "grad_norm": 0.3884228545028214, "learning_rate": 3.156405482897181e-05, "loss": 0.4424, "step": 1094 }, { "epoch": 1.8686006825938568, "grad_norm": 0.47001264022373496, "learning_rate": 3.154460632484477e-05, "loss": 0.4518, "step": 1095 }, { "epoch": 1.8703071672354947, "grad_norm": 0.3709605969510673, "learning_rate": 3.152514143526058e-05, "loss": 0.4298, "step": 1096 }, { "epoch": 1.872013651877133, "grad_norm": 0.4383716509647053, "learning_rate": 3.15056601878461e-05, "loss": 0.4678, "step": 1097 }, { "epoch": 1.8737201365187715, "grad_norm": 0.38594310190286124, "learning_rate": 3.1486162610251405e-05, "loss": 0.4393, "step": 1098 }, { "epoch": 1.8754266211604096, "grad_norm": 0.40721100356765266, "learning_rate": 3.146664873014973e-05, "loss": 0.435, "step": 1099 }, { "epoch": 1.8771331058020477, "grad_norm": 0.49506906473357676, "learning_rate": 3.144711857523746e-05, "loss": 0.4317, "step": 1100 }, { "epoch": 1.878839590443686, "grad_norm": 0.4090982341013302, "learning_rate": 3.142757217323408e-05, "loss": 0.4682, "step": 1101 }, { "epoch": 1.8805460750853242, "grad_norm": 0.39453830650395644, "learning_rate": 3.140800955188213e-05, "loss": 0.4127, "step": 1102 }, { "epoch": 1.8822525597269624, "grad_norm": 0.3727138401061236, "learning_rate": 3.138843073894717e-05, "loss": 0.4639, "step": 1103 }, { "epoch": 1.8839590443686007, "grad_norm": 0.3867992352973858, "learning_rate": 3.1368835762217755e-05, "loss": 0.4368, "step": 1104 }, { "epoch": 1.8856655290102389, "grad_norm": 0.3946509268348863, "learning_rate": 3.1349224649505366e-05, "loss": 0.4406, "step": 1105 }, { "epoch": 1.887372013651877, "grad_norm": 0.36077305576822366, "learning_rate": 3.132959742864438e-05, "loss": 0.4051, "step": 1106 }, { "epoch": 1.8890784982935154, "grad_norm": 0.40475493268433543, "learning_rate": 3.130995412749206e-05, "loss": 0.4724, "step": 1107 }, { "epoch": 1.8907849829351537, "grad_norm": 0.3787893148385328, "learning_rate": 3.129029477392848e-05, "loss": 0.4264, "step": 1108 }, { "epoch": 1.8924914675767917, "grad_norm": 0.3886535714745907, "learning_rate": 3.127061939585649e-05, "loss": 0.4208, "step": 1109 }, { "epoch": 1.89419795221843, "grad_norm": 0.40410505742416974, "learning_rate": 3.125092802120169e-05, "loss": 0.4343, "step": 1110 }, { "epoch": 1.8959044368600684, "grad_norm": 0.3793707170377886, "learning_rate": 3.123122067791238e-05, "loss": 0.3967, "step": 1111 }, { "epoch": 1.8976109215017065, "grad_norm": 0.38968057558625735, "learning_rate": 3.1211497393959546e-05, "loss": 0.3915, "step": 1112 }, { "epoch": 1.8993174061433447, "grad_norm": 0.43184149778702685, "learning_rate": 3.119175819733677e-05, "loss": 0.4112, "step": 1113 }, { "epoch": 1.901023890784983, "grad_norm": 0.4547391887609193, "learning_rate": 3.117200311606023e-05, "loss": 0.4073, "step": 1114 }, { "epoch": 1.9027303754266212, "grad_norm": 0.4023504384319604, "learning_rate": 3.1152232178168655e-05, "loss": 0.4562, "step": 1115 }, { "epoch": 1.9044368600682593, "grad_norm": 0.43798414339531605, "learning_rate": 3.113244541172328e-05, "loss": 0.4463, "step": 1116 }, { "epoch": 1.9061433447098977, "grad_norm": 0.43205765378414585, "learning_rate": 3.111264284480779e-05, "loss": 0.4811, "step": 1117 }, { "epoch": 1.9078498293515358, "grad_norm": 0.40707407192136424, "learning_rate": 3.109282450552831e-05, "loss": 0.4366, "step": 1118 }, { "epoch": 1.909556313993174, "grad_norm": 0.41441282044999433, "learning_rate": 3.1072990422013354e-05, "loss": 0.4344, "step": 1119 }, { "epoch": 1.9112627986348123, "grad_norm": 0.3471608850863164, "learning_rate": 3.105314062241377e-05, "loss": 0.4064, "step": 1120 }, { "epoch": 1.9129692832764507, "grad_norm": 0.407103583087269, "learning_rate": 3.1033275134902714e-05, "loss": 0.4307, "step": 1121 }, { "epoch": 1.9146757679180886, "grad_norm": 0.37090540027247704, "learning_rate": 3.1013393987675624e-05, "loss": 0.4482, "step": 1122 }, { "epoch": 1.916382252559727, "grad_norm": 0.3905360275094505, "learning_rate": 3.099349720895015e-05, "loss": 0.4102, "step": 1123 }, { "epoch": 1.9180887372013653, "grad_norm": 0.44609768346190243, "learning_rate": 3.0973584826966114e-05, "loss": 0.4424, "step": 1124 }, { "epoch": 1.9197952218430034, "grad_norm": 0.3742218681777803, "learning_rate": 3.095365686998552e-05, "loss": 0.4411, "step": 1125 }, { "epoch": 1.9215017064846416, "grad_norm": 0.4251254043386818, "learning_rate": 3.093371336629245e-05, "loss": 0.4514, "step": 1126 }, { "epoch": 1.92320819112628, "grad_norm": 0.36785879492426965, "learning_rate": 3.091375434419306e-05, "loss": 0.4343, "step": 1127 }, { "epoch": 1.924914675767918, "grad_norm": 0.3703783175452118, "learning_rate": 3.089377983201553e-05, "loss": 0.4405, "step": 1128 }, { "epoch": 1.9266211604095562, "grad_norm": 0.3659750932274629, "learning_rate": 3.0873789858110037e-05, "loss": 0.4056, "step": 1129 }, { "epoch": 1.9283276450511946, "grad_norm": 0.3853425455044206, "learning_rate": 3.085378445084868e-05, "loss": 0.4262, "step": 1130 }, { "epoch": 1.9300341296928327, "grad_norm": 0.3851004913621087, "learning_rate": 3.0833763638625466e-05, "loss": 0.447, "step": 1131 }, { "epoch": 1.9317406143344709, "grad_norm": 0.4442090248664147, "learning_rate": 3.0813727449856305e-05, "loss": 0.467, "step": 1132 }, { "epoch": 1.9334470989761092, "grad_norm": 0.3786585427874933, "learning_rate": 3.0793675912978875e-05, "loss": 0.4297, "step": 1133 }, { "epoch": 1.9351535836177476, "grad_norm": 0.38519490340606855, "learning_rate": 3.0773609056452683e-05, "loss": 0.4627, "step": 1134 }, { "epoch": 1.9368600682593855, "grad_norm": 0.35724071458801504, "learning_rate": 3.0753526908758956e-05, "loss": 0.4645, "step": 1135 }, { "epoch": 1.9385665529010239, "grad_norm": 0.426621147363609, "learning_rate": 3.073342949840063e-05, "loss": 0.4512, "step": 1136 }, { "epoch": 1.9402730375426622, "grad_norm": 0.37380802533576596, "learning_rate": 3.0713316853902296e-05, "loss": 0.4411, "step": 1137 }, { "epoch": 1.9419795221843004, "grad_norm": 0.4066743908256873, "learning_rate": 3.069318900381019e-05, "loss": 0.4259, "step": 1138 }, { "epoch": 1.9436860068259385, "grad_norm": 0.39421066591401493, "learning_rate": 3.0673045976692095e-05, "loss": 0.4505, "step": 1139 }, { "epoch": 1.9453924914675769, "grad_norm": 0.4285964084137661, "learning_rate": 3.0652887801137365e-05, "loss": 0.6663, "step": 1140 }, { "epoch": 1.947098976109215, "grad_norm": 0.39704888749269823, "learning_rate": 3.063271450575685e-05, "loss": 0.4357, "step": 1141 }, { "epoch": 1.9488054607508531, "grad_norm": 0.40234313660149934, "learning_rate": 3.061252611918283e-05, "loss": 0.4256, "step": 1142 }, { "epoch": 1.9505119453924915, "grad_norm": 0.39051818078690365, "learning_rate": 3.0592322670069044e-05, "loss": 0.4429, "step": 1143 }, { "epoch": 1.9522184300341296, "grad_norm": 0.4453280616407959, "learning_rate": 3.05721041870906e-05, "loss": 0.4346, "step": 1144 }, { "epoch": 1.9539249146757678, "grad_norm": 0.39418177898309475, "learning_rate": 3.055187069894392e-05, "loss": 0.4532, "step": 1145 }, { "epoch": 1.9556313993174061, "grad_norm": 0.40025147717303106, "learning_rate": 3.0531622234346747e-05, "loss": 0.444, "step": 1146 }, { "epoch": 1.9573378839590445, "grad_norm": 0.37661192218548234, "learning_rate": 3.0511358822038075e-05, "loss": 0.4127, "step": 1147 }, { "epoch": 1.9590443686006824, "grad_norm": 0.38856559022753606, "learning_rate": 3.0491080490778105e-05, "loss": 0.4829, "step": 1148 }, { "epoch": 1.9607508532423208, "grad_norm": 0.3888783912215676, "learning_rate": 3.0470787269348218e-05, "loss": 0.4567, "step": 1149 }, { "epoch": 1.9624573378839592, "grad_norm": 0.41945768107269354, "learning_rate": 3.0450479186550948e-05, "loss": 0.451, "step": 1150 }, { "epoch": 1.9641638225255973, "grad_norm": 0.4513414837155506, "learning_rate": 3.043015627120989e-05, "loss": 0.4557, "step": 1151 }, { "epoch": 1.9658703071672354, "grad_norm": 0.3882336660858084, "learning_rate": 3.04098185521697e-05, "loss": 0.4433, "step": 1152 }, { "epoch": 1.9675767918088738, "grad_norm": 0.4117897053320961, "learning_rate": 3.038946605829606e-05, "loss": 0.427, "step": 1153 }, { "epoch": 1.969283276450512, "grad_norm": 0.39225920778552686, "learning_rate": 3.0369098818475612e-05, "loss": 0.4669, "step": 1154 }, { "epoch": 1.97098976109215, "grad_norm": 0.3642477857528382, "learning_rate": 3.0348716861615917e-05, "loss": 0.4248, "step": 1155 }, { "epoch": 1.9726962457337884, "grad_norm": 0.35814160996825456, "learning_rate": 3.032832021664544e-05, "loss": 0.4495, "step": 1156 }, { "epoch": 1.9744027303754266, "grad_norm": 0.3724344732843097, "learning_rate": 3.0307908912513507e-05, "loss": 0.4421, "step": 1157 }, { "epoch": 1.9761092150170647, "grad_norm": 0.3981452207001675, "learning_rate": 3.0287482978190207e-05, "loss": 0.4369, "step": 1158 }, { "epoch": 1.977815699658703, "grad_norm": 0.3830660314911436, "learning_rate": 3.0267042442666423e-05, "loss": 0.4401, "step": 1159 }, { "epoch": 1.9795221843003414, "grad_norm": 0.37721324646779664, "learning_rate": 3.0246587334953772e-05, "loss": 0.3923, "step": 1160 }, { "epoch": 1.9812286689419796, "grad_norm": 0.37845672699312793, "learning_rate": 3.022611768408451e-05, "loss": 0.4174, "step": 1161 }, { "epoch": 1.9829351535836177, "grad_norm": 0.43344727294576857, "learning_rate": 3.0205633519111583e-05, "loss": 0.4488, "step": 1162 }, { "epoch": 1.984641638225256, "grad_norm": 0.4359501540294188, "learning_rate": 3.018513486910852e-05, "loss": 0.4113, "step": 1163 }, { "epoch": 1.9863481228668942, "grad_norm": 0.35369746565652127, "learning_rate": 3.0164621763169384e-05, "loss": 0.4292, "step": 1164 }, { "epoch": 1.9880546075085324, "grad_norm": 0.40236101912475597, "learning_rate": 3.0144094230408796e-05, "loss": 0.427, "step": 1165 }, { "epoch": 1.9897610921501707, "grad_norm": 0.4326007117434534, "learning_rate": 3.012355229996183e-05, "loss": 0.4336, "step": 1166 }, { "epoch": 1.9914675767918089, "grad_norm": 0.4036010077074162, "learning_rate": 3.0102996000983993e-05, "loss": 0.4203, "step": 1167 }, { "epoch": 1.993174061433447, "grad_norm": 0.39351860542297634, "learning_rate": 3.0082425362651197e-05, "loss": 0.4511, "step": 1168 }, { "epoch": 1.9948805460750854, "grad_norm": 0.38401207247759234, "learning_rate": 3.00618404141597e-05, "loss": 0.4477, "step": 1169 }, { "epoch": 1.9965870307167235, "grad_norm": 0.36355677275483467, "learning_rate": 3.004124118472607e-05, "loss": 0.4213, "step": 1170 }, { "epoch": 1.9982935153583616, "grad_norm": 0.38159766177942894, "learning_rate": 3.0020627703587154e-05, "loss": 0.4753, "step": 1171 }, { "epoch": 2.0, "grad_norm": 0.43975034858095224, "learning_rate": 3.0000000000000004e-05, "loss": 0.4433, "step": 1172 }, { "epoch": 2.0017064846416384, "grad_norm": 0.457128745992015, "learning_rate": 2.997935810324188e-05, "loss": 0.3122, "step": 1173 }, { "epoch": 2.0034129692832763, "grad_norm": 0.4139908575686257, "learning_rate": 2.9958702042610176e-05, "loss": 0.3422, "step": 1174 }, { "epoch": 2.0051194539249146, "grad_norm": 0.6828806786379927, "learning_rate": 2.9938031847422395e-05, "loss": 0.3496, "step": 1175 }, { "epoch": 2.006825938566553, "grad_norm": 0.4453379166984815, "learning_rate": 2.99173475470161e-05, "loss": 0.3221, "step": 1176 }, { "epoch": 2.008532423208191, "grad_norm": 0.481091150644822, "learning_rate": 2.9896649170748864e-05, "loss": 0.2986, "step": 1177 }, { "epoch": 2.0102389078498293, "grad_norm": 0.4273047300781224, "learning_rate": 2.987593674799826e-05, "loss": 0.3229, "step": 1178 }, { "epoch": 2.0119453924914676, "grad_norm": 0.43016109425591, "learning_rate": 2.985521030816177e-05, "loss": 0.3332, "step": 1179 }, { "epoch": 2.013651877133106, "grad_norm": 0.43062017675061975, "learning_rate": 2.983446988065679e-05, "loss": 0.3085, "step": 1180 }, { "epoch": 2.015358361774744, "grad_norm": 0.42147762229991387, "learning_rate": 2.9813715494920568e-05, "loss": 0.3715, "step": 1181 }, { "epoch": 2.0170648464163823, "grad_norm": 0.411732748179639, "learning_rate": 2.9792947180410146e-05, "loss": 0.3308, "step": 1182 }, { "epoch": 2.0187713310580206, "grad_norm": 0.43182507847266327, "learning_rate": 2.9772164966602362e-05, "loss": 0.3562, "step": 1183 }, { "epoch": 2.0204778156996586, "grad_norm": 0.39303469490960924, "learning_rate": 2.9751368882993765e-05, "loss": 0.3442, "step": 1184 }, { "epoch": 2.022184300341297, "grad_norm": 0.44901601334467767, "learning_rate": 2.9730558959100585e-05, "loss": 0.3258, "step": 1185 }, { "epoch": 2.0238907849829353, "grad_norm": 0.4234197234589005, "learning_rate": 2.9709735224458703e-05, "loss": 0.3233, "step": 1186 }, { "epoch": 2.025597269624573, "grad_norm": 0.4145126381242284, "learning_rate": 2.968889770862361e-05, "loss": 0.3139, "step": 1187 }, { "epoch": 2.0273037542662116, "grad_norm": 0.4650028395470894, "learning_rate": 2.9668046441170338e-05, "loss": 0.3258, "step": 1188 }, { "epoch": 2.02901023890785, "grad_norm": 0.4138016958262979, "learning_rate": 2.9647181451693456e-05, "loss": 0.3418, "step": 1189 }, { "epoch": 2.030716723549488, "grad_norm": 0.44527009359826686, "learning_rate": 2.962630276980699e-05, "loss": 0.3341, "step": 1190 }, { "epoch": 2.032423208191126, "grad_norm": 0.449841389006582, "learning_rate": 2.960541042514443e-05, "loss": 0.3972, "step": 1191 }, { "epoch": 2.0341296928327646, "grad_norm": 0.4114068718397478, "learning_rate": 2.9584504447358617e-05, "loss": 0.3244, "step": 1192 }, { "epoch": 2.035836177474403, "grad_norm": 0.44429247596201654, "learning_rate": 2.956358486612177e-05, "loss": 0.3535, "step": 1193 }, { "epoch": 2.037542662116041, "grad_norm": 0.44617564995683495, "learning_rate": 2.9542651711125413e-05, "loss": 0.2977, "step": 1194 }, { "epoch": 2.039249146757679, "grad_norm": 0.4096953046125742, "learning_rate": 2.9521705012080326e-05, "loss": 0.3218, "step": 1195 }, { "epoch": 2.0409556313993176, "grad_norm": 0.3979622530922502, "learning_rate": 2.9500744798716515e-05, "loss": 0.3509, "step": 1196 }, { "epoch": 2.0426621160409555, "grad_norm": 0.40047495999040483, "learning_rate": 2.947977110078317e-05, "loss": 0.3017, "step": 1197 }, { "epoch": 2.044368600682594, "grad_norm": 0.3847757055516993, "learning_rate": 2.945878394804863e-05, "loss": 0.3071, "step": 1198 }, { "epoch": 2.046075085324232, "grad_norm": 0.3600892843081187, "learning_rate": 2.9437783370300302e-05, "loss": 0.3573, "step": 1199 }, { "epoch": 2.04778156996587, "grad_norm": 0.38126019571864667, "learning_rate": 2.9416769397344685e-05, "loss": 0.3506, "step": 1200 }, { "epoch": 2.0494880546075085, "grad_norm": 0.3554100311173552, "learning_rate": 2.939574205900725e-05, "loss": 0.3072, "step": 1201 }, { "epoch": 2.051194539249147, "grad_norm": 0.38334979368635375, "learning_rate": 2.9374701385132472e-05, "loss": 0.3259, "step": 1202 }, { "epoch": 2.0529010238907848, "grad_norm": 0.3703136644266881, "learning_rate": 2.935364740558373e-05, "loss": 0.315, "step": 1203 }, { "epoch": 2.054607508532423, "grad_norm": 0.40500653297603784, "learning_rate": 2.93325801502433e-05, "loss": 0.3408, "step": 1204 }, { "epoch": 2.0563139931740615, "grad_norm": 0.4062486109717856, "learning_rate": 2.9311499649012304e-05, "loss": 0.2987, "step": 1205 }, { "epoch": 2.0580204778157, "grad_norm": 0.3616658692205193, "learning_rate": 2.929040593181065e-05, "loss": 0.3149, "step": 1206 }, { "epoch": 2.0597269624573378, "grad_norm": 0.3848905374239776, "learning_rate": 2.9269299028577016e-05, "loss": 0.3253, "step": 1207 }, { "epoch": 2.061433447098976, "grad_norm": 0.4298981718447302, "learning_rate": 2.924817896926879e-05, "loss": 0.3256, "step": 1208 }, { "epoch": 2.0631399317406145, "grad_norm": 0.3534847095397029, "learning_rate": 2.9227045783862026e-05, "loss": 0.3597, "step": 1209 }, { "epoch": 2.0648464163822524, "grad_norm": 0.33638127804053347, "learning_rate": 2.9205899502351427e-05, "loss": 0.3273, "step": 1210 }, { "epoch": 2.0665529010238908, "grad_norm": 0.3855231208992533, "learning_rate": 2.9184740154750265e-05, "loss": 0.3138, "step": 1211 }, { "epoch": 2.068259385665529, "grad_norm": 0.381774736275018, "learning_rate": 2.9163567771090368e-05, "loss": 0.3043, "step": 1212 }, { "epoch": 2.069965870307167, "grad_norm": 0.37464619845398417, "learning_rate": 2.9142382381422058e-05, "loss": 0.316, "step": 1213 }, { "epoch": 2.0716723549488054, "grad_norm": 0.4126972583021004, "learning_rate": 2.912118401581412e-05, "loss": 0.2867, "step": 1214 }, { "epoch": 2.073378839590444, "grad_norm": 0.4314536077963715, "learning_rate": 2.9099972704353763e-05, "loss": 0.327, "step": 1215 }, { "epoch": 2.0750853242320817, "grad_norm": 0.36047470235701123, "learning_rate": 2.9078748477146552e-05, "loss": 0.3445, "step": 1216 }, { "epoch": 2.07679180887372, "grad_norm": 0.4301966987859649, "learning_rate": 2.905751136431641e-05, "loss": 0.3275, "step": 1217 }, { "epoch": 2.0784982935153584, "grad_norm": 0.40268655007936277, "learning_rate": 2.9036261396005526e-05, "loss": 0.3288, "step": 1218 }, { "epoch": 2.080204778156997, "grad_norm": 0.3992363790721564, "learning_rate": 2.9014998602374345e-05, "loss": 0.363, "step": 1219 }, { "epoch": 2.0819112627986347, "grad_norm": 0.3883051261851283, "learning_rate": 2.899372301360152e-05, "loss": 0.3166, "step": 1220 }, { "epoch": 2.083617747440273, "grad_norm": 0.4050737657795587, "learning_rate": 2.8972434659883847e-05, "loss": 0.3298, "step": 1221 }, { "epoch": 2.0853242320819114, "grad_norm": 0.3984900692632608, "learning_rate": 2.8951133571436255e-05, "loss": 0.3272, "step": 1222 }, { "epoch": 2.0870307167235493, "grad_norm": 0.35897624852421484, "learning_rate": 2.8929819778491736e-05, "loss": 0.3574, "step": 1223 }, { "epoch": 2.0887372013651877, "grad_norm": 0.37441217352575806, "learning_rate": 2.8908493311301336e-05, "loss": 0.3392, "step": 1224 }, { "epoch": 2.090443686006826, "grad_norm": 0.3806877568546792, "learning_rate": 2.8887154200134066e-05, "loss": 0.3405, "step": 1225 }, { "epoch": 2.092150170648464, "grad_norm": 0.3635010398502254, "learning_rate": 2.8865802475276888e-05, "loss": 0.3308, "step": 1226 }, { "epoch": 2.0938566552901023, "grad_norm": 0.393071249205216, "learning_rate": 2.8844438167034675e-05, "loss": 0.3389, "step": 1227 }, { "epoch": 2.0955631399317407, "grad_norm": 0.3623276745352252, "learning_rate": 2.8823061305730154e-05, "loss": 0.3172, "step": 1228 }, { "epoch": 2.0972696245733786, "grad_norm": 0.3840492877615427, "learning_rate": 2.8801671921703875e-05, "loss": 0.385, "step": 1229 }, { "epoch": 2.098976109215017, "grad_norm": 0.35674517742092077, "learning_rate": 2.878027004531414e-05, "loss": 0.3354, "step": 1230 }, { "epoch": 2.1006825938566553, "grad_norm": 0.3724437131382105, "learning_rate": 2.8758855706937015e-05, "loss": 0.3244, "step": 1231 }, { "epoch": 2.1023890784982937, "grad_norm": 0.42648734108694186, "learning_rate": 2.873742893696623e-05, "loss": 0.3322, "step": 1232 }, { "epoch": 2.1040955631399316, "grad_norm": 0.3731360928602656, "learning_rate": 2.871598976581317e-05, "loss": 0.2982, "step": 1233 }, { "epoch": 2.10580204778157, "grad_norm": 0.40421291050408936, "learning_rate": 2.8694538223906812e-05, "loss": 0.354, "step": 1234 }, { "epoch": 2.1075085324232083, "grad_norm": 0.3816196487587043, "learning_rate": 2.8673074341693698e-05, "loss": 0.3645, "step": 1235 }, { "epoch": 2.1092150170648463, "grad_norm": 0.37710300585084694, "learning_rate": 2.865159814963788e-05, "loss": 0.3484, "step": 1236 }, { "epoch": 2.1109215017064846, "grad_norm": 0.43153582922337635, "learning_rate": 2.863010967822089e-05, "loss": 0.3541, "step": 1237 }, { "epoch": 2.112627986348123, "grad_norm": 0.38541541269540536, "learning_rate": 2.8608608957941677e-05, "loss": 0.3148, "step": 1238 }, { "epoch": 2.114334470989761, "grad_norm": 0.3976182070732549, "learning_rate": 2.8587096019316588e-05, "loss": 0.3631, "step": 1239 }, { "epoch": 2.1160409556313993, "grad_norm": 0.4170776432996827, "learning_rate": 2.8565570892879308e-05, "loss": 0.3037, "step": 1240 }, { "epoch": 2.1177474402730376, "grad_norm": 0.4067765892230644, "learning_rate": 2.8544033609180797e-05, "loss": 0.3303, "step": 1241 }, { "epoch": 2.1194539249146755, "grad_norm": 0.35639261403736644, "learning_rate": 2.8522484198789308e-05, "loss": 0.3323, "step": 1242 }, { "epoch": 2.121160409556314, "grad_norm": 0.39487236200655806, "learning_rate": 2.8500922692290284e-05, "loss": 0.3783, "step": 1243 }, { "epoch": 2.1228668941979523, "grad_norm": 0.35821615465041945, "learning_rate": 2.8479349120286337e-05, "loss": 0.3407, "step": 1244 }, { "epoch": 2.1245733788395906, "grad_norm": 0.35341164274325937, "learning_rate": 2.8457763513397206e-05, "loss": 0.3313, "step": 1245 }, { "epoch": 2.1262798634812285, "grad_norm": 0.3957886110355856, "learning_rate": 2.8436165902259717e-05, "loss": 0.38, "step": 1246 }, { "epoch": 2.127986348122867, "grad_norm": 0.36024142111095514, "learning_rate": 2.8414556317527722e-05, "loss": 0.3296, "step": 1247 }, { "epoch": 2.1296928327645053, "grad_norm": 0.39155019349124676, "learning_rate": 2.839293478987208e-05, "loss": 0.3411, "step": 1248 }, { "epoch": 2.131399317406143, "grad_norm": 0.39297329760609745, "learning_rate": 2.8371301349980593e-05, "loss": 0.3202, "step": 1249 }, { "epoch": 2.1331058020477816, "grad_norm": 0.41730207596559865, "learning_rate": 2.834965602855797e-05, "loss": 0.3689, "step": 1250 }, { "epoch": 2.13481228668942, "grad_norm": 0.3628482280926389, "learning_rate": 2.8327998856325788e-05, "loss": 0.345, "step": 1251 }, { "epoch": 2.136518771331058, "grad_norm": 0.3744238919178526, "learning_rate": 2.8306329864022446e-05, "loss": 0.3209, "step": 1252 }, { "epoch": 2.138225255972696, "grad_norm": 0.3947466710094417, "learning_rate": 2.8284649082403107e-05, "loss": 0.345, "step": 1253 }, { "epoch": 2.1399317406143346, "grad_norm": 0.38032210318900517, "learning_rate": 2.8262956542239678e-05, "loss": 0.3365, "step": 1254 }, { "epoch": 2.1416382252559725, "grad_norm": 0.34228417774558967, "learning_rate": 2.8241252274320753e-05, "loss": 0.3256, "step": 1255 }, { "epoch": 2.143344709897611, "grad_norm": 0.42236716597469687, "learning_rate": 2.8219536309451566e-05, "loss": 0.3158, "step": 1256 }, { "epoch": 2.145051194539249, "grad_norm": 0.3638834569248063, "learning_rate": 2.8197808678453965e-05, "loss": 0.3294, "step": 1257 }, { "epoch": 2.1467576791808876, "grad_norm": 0.38632598483966657, "learning_rate": 2.8176069412166345e-05, "loss": 0.3295, "step": 1258 }, { "epoch": 2.1484641638225255, "grad_norm": 0.4170935336781794, "learning_rate": 2.815431854144362e-05, "loss": 0.3097, "step": 1259 }, { "epoch": 2.150170648464164, "grad_norm": 0.3525829213286291, "learning_rate": 2.813255609715717e-05, "loss": 0.353, "step": 1260 }, { "epoch": 2.151877133105802, "grad_norm": 0.3854869447902711, "learning_rate": 2.81107821101948e-05, "loss": 0.3359, "step": 1261 }, { "epoch": 2.15358361774744, "grad_norm": 0.3413451536237944, "learning_rate": 2.808899661146072e-05, "loss": 0.2912, "step": 1262 }, { "epoch": 2.1552901023890785, "grad_norm": 0.3819099640985915, "learning_rate": 2.806719963187543e-05, "loss": 0.3383, "step": 1263 }, { "epoch": 2.156996587030717, "grad_norm": 0.37496052907993016, "learning_rate": 2.804539120237578e-05, "loss": 0.3249, "step": 1264 }, { "epoch": 2.1587030716723548, "grad_norm": 0.3831512033150945, "learning_rate": 2.8023571353914846e-05, "loss": 0.319, "step": 1265 }, { "epoch": 2.160409556313993, "grad_norm": 0.396726145433138, "learning_rate": 2.80017401174619e-05, "loss": 0.3739, "step": 1266 }, { "epoch": 2.1621160409556315, "grad_norm": 0.40561507479220216, "learning_rate": 2.79798975240024e-05, "loss": 0.3481, "step": 1267 }, { "epoch": 2.1638225255972694, "grad_norm": 0.3275781604015692, "learning_rate": 2.795804360453791e-05, "loss": 0.2998, "step": 1268 }, { "epoch": 2.1655290102389078, "grad_norm": 0.3522773371443922, "learning_rate": 2.793617839008606e-05, "loss": 0.3093, "step": 1269 }, { "epoch": 2.167235494880546, "grad_norm": 0.3887568155863699, "learning_rate": 2.7914301911680535e-05, "loss": 0.2977, "step": 1270 }, { "epoch": 2.1689419795221845, "grad_norm": 0.3648550969703528, "learning_rate": 2.7892414200371e-05, "loss": 0.3187, "step": 1271 }, { "epoch": 2.1706484641638224, "grad_norm": 0.43367590800895783, "learning_rate": 2.7870515287223043e-05, "loss": 0.3514, "step": 1272 }, { "epoch": 2.1723549488054608, "grad_norm": 0.417822649774299, "learning_rate": 2.7848605203318177e-05, "loss": 0.3289, "step": 1273 }, { "epoch": 2.174061433447099, "grad_norm": 0.3626889608693872, "learning_rate": 2.7826683979753753e-05, "loss": 0.4058, "step": 1274 }, { "epoch": 2.175767918088737, "grad_norm": 0.3782938440184352, "learning_rate": 2.780475164764294e-05, "loss": 0.3226, "step": 1275 }, { "epoch": 2.1774744027303754, "grad_norm": 0.40971833875259145, "learning_rate": 2.778280823811467e-05, "loss": 0.3582, "step": 1276 }, { "epoch": 2.1791808873720138, "grad_norm": 0.3757058724125517, "learning_rate": 2.7760853782313598e-05, "loss": 0.3343, "step": 1277 }, { "epoch": 2.1808873720136517, "grad_norm": 0.37586921864323336, "learning_rate": 2.7738888311400066e-05, "loss": 0.3334, "step": 1278 }, { "epoch": 2.18259385665529, "grad_norm": 0.336446479477883, "learning_rate": 2.7716911856550036e-05, "loss": 0.3061, "step": 1279 }, { "epoch": 2.1843003412969284, "grad_norm": 0.4029523350484026, "learning_rate": 2.7694924448955072e-05, "loss": 0.3611, "step": 1280 }, { "epoch": 2.1860068259385663, "grad_norm": 0.39868971446555357, "learning_rate": 2.7672926119822272e-05, "loss": 0.3715, "step": 1281 }, { "epoch": 2.1877133105802047, "grad_norm": 0.3642945128867239, "learning_rate": 2.7650916900374238e-05, "loss": 0.3316, "step": 1282 }, { "epoch": 2.189419795221843, "grad_norm": 0.38044114907318405, "learning_rate": 2.762889682184904e-05, "loss": 0.3496, "step": 1283 }, { "epoch": 2.1911262798634814, "grad_norm": 0.3756040633783301, "learning_rate": 2.7606865915500148e-05, "loss": 0.3428, "step": 1284 }, { "epoch": 2.1928327645051193, "grad_norm": 0.3766036687251031, "learning_rate": 2.7584824212596396e-05, "loss": 0.33, "step": 1285 }, { "epoch": 2.1945392491467577, "grad_norm": 0.3793415403883485, "learning_rate": 2.7562771744421974e-05, "loss": 0.3257, "step": 1286 }, { "epoch": 2.196245733788396, "grad_norm": 0.38676018867366924, "learning_rate": 2.7540708542276297e-05, "loss": 0.3334, "step": 1287 }, { "epoch": 2.197952218430034, "grad_norm": 0.34957567057729205, "learning_rate": 2.7518634637474063e-05, "loss": 0.3429, "step": 1288 }, { "epoch": 2.1996587030716723, "grad_norm": 0.3686311749994957, "learning_rate": 2.7496550061345138e-05, "loss": 0.3222, "step": 1289 }, { "epoch": 2.2013651877133107, "grad_norm": 0.35075494112369326, "learning_rate": 2.7474454845234534e-05, "loss": 0.3607, "step": 1290 }, { "epoch": 2.2030716723549486, "grad_norm": 0.3490460231118274, "learning_rate": 2.7452349020502377e-05, "loss": 0.3203, "step": 1291 }, { "epoch": 2.204778156996587, "grad_norm": 0.4159082349899918, "learning_rate": 2.7430232618523846e-05, "loss": 0.3329, "step": 1292 }, { "epoch": 2.2064846416382253, "grad_norm": 0.3935325819376942, "learning_rate": 2.7408105670689114e-05, "loss": 0.3626, "step": 1293 }, { "epoch": 2.2081911262798632, "grad_norm": 0.356625458585148, "learning_rate": 2.7385968208403343e-05, "loss": 0.3621, "step": 1294 }, { "epoch": 2.2098976109215016, "grad_norm": 0.3823241707511623, "learning_rate": 2.7363820263086616e-05, "loss": 0.3767, "step": 1295 }, { "epoch": 2.21160409556314, "grad_norm": 0.35899165701810964, "learning_rate": 2.7341661866173882e-05, "loss": 0.321, "step": 1296 }, { "epoch": 2.2133105802047783, "grad_norm": 0.3291105534040846, "learning_rate": 2.7319493049114937e-05, "loss": 0.2973, "step": 1297 }, { "epoch": 2.2150170648464163, "grad_norm": 0.41428730038665074, "learning_rate": 2.7297313843374364e-05, "loss": 0.3517, "step": 1298 }, { "epoch": 2.2167235494880546, "grad_norm": 0.6282577075784936, "learning_rate": 2.7275124280431492e-05, "loss": 0.3486, "step": 1299 }, { "epoch": 2.218430034129693, "grad_norm": 0.3819749469181485, "learning_rate": 2.7252924391780338e-05, "loss": 0.358, "step": 1300 }, { "epoch": 2.220136518771331, "grad_norm": 0.3734912146512999, "learning_rate": 2.723071420892959e-05, "loss": 0.3751, "step": 1301 }, { "epoch": 2.2218430034129693, "grad_norm": 0.4060889025671746, "learning_rate": 2.7208493763402538e-05, "loss": 0.3393, "step": 1302 }, { "epoch": 2.2235494880546076, "grad_norm": 0.3542469612727659, "learning_rate": 2.7186263086737034e-05, "loss": 0.3743, "step": 1303 }, { "epoch": 2.2252559726962455, "grad_norm": 0.3690400547180425, "learning_rate": 2.7164022210485468e-05, "loss": 0.3269, "step": 1304 }, { "epoch": 2.226962457337884, "grad_norm": 0.4127396663588053, "learning_rate": 2.7141771166214694e-05, "loss": 0.3322, "step": 1305 }, { "epoch": 2.2286689419795223, "grad_norm": 0.3606825518166459, "learning_rate": 2.7119509985505997e-05, "loss": 0.3413, "step": 1306 }, { "epoch": 2.2303754266211606, "grad_norm": 0.37041989688669297, "learning_rate": 2.709723869995505e-05, "loss": 0.3264, "step": 1307 }, { "epoch": 2.2320819112627985, "grad_norm": 0.3960536252124662, "learning_rate": 2.7074957341171874e-05, "loss": 0.3375, "step": 1308 }, { "epoch": 2.233788395904437, "grad_norm": 0.3818893032667079, "learning_rate": 2.705266594078078e-05, "loss": 0.368, "step": 1309 }, { "epoch": 2.2354948805460753, "grad_norm": 0.3664428592188226, "learning_rate": 2.703036453042033e-05, "loss": 0.3223, "step": 1310 }, { "epoch": 2.237201365187713, "grad_norm": 0.3561441401803114, "learning_rate": 2.7008053141743298e-05, "loss": 0.3245, "step": 1311 }, { "epoch": 2.2389078498293515, "grad_norm": 0.3485024959150887, "learning_rate": 2.6985731806416623e-05, "loss": 0.3191, "step": 1312 }, { "epoch": 2.24061433447099, "grad_norm": 0.3809419114703872, "learning_rate": 2.6963400556121362e-05, "loss": 0.3182, "step": 1313 }, { "epoch": 2.242320819112628, "grad_norm": 0.41760905832690415, "learning_rate": 2.6941059422552635e-05, "loss": 0.3032, "step": 1314 }, { "epoch": 2.244027303754266, "grad_norm": 0.39218141888484626, "learning_rate": 2.691870843741959e-05, "loss": 0.335, "step": 1315 }, { "epoch": 2.2457337883959045, "grad_norm": 0.3728659579751814, "learning_rate": 2.689634763244537e-05, "loss": 0.3345, "step": 1316 }, { "epoch": 2.2474402730375425, "grad_norm": 0.37450328090186463, "learning_rate": 2.687397703936704e-05, "loss": 0.3302, "step": 1317 }, { "epoch": 2.249146757679181, "grad_norm": 0.36789679345836473, "learning_rate": 2.6851596689935574e-05, "loss": 0.333, "step": 1318 }, { "epoch": 2.250853242320819, "grad_norm": 0.390239075495621, "learning_rate": 2.682920661591578e-05, "loss": 0.3322, "step": 1319 }, { "epoch": 2.252559726962457, "grad_norm": 0.39697227505570915, "learning_rate": 2.6806806849086276e-05, "loss": 0.3276, "step": 1320 }, { "epoch": 2.2542662116040955, "grad_norm": 0.41570925076951337, "learning_rate": 2.678439742123943e-05, "loss": 0.3717, "step": 1321 }, { "epoch": 2.255972696245734, "grad_norm": 0.38928174172708657, "learning_rate": 2.6761978364181323e-05, "loss": 0.3765, "step": 1322 }, { "epoch": 2.257679180887372, "grad_norm": 0.37549967121944744, "learning_rate": 2.673954970973172e-05, "loss": 0.3589, "step": 1323 }, { "epoch": 2.25938566552901, "grad_norm": 0.37697727591598484, "learning_rate": 2.671711148972398e-05, "loss": 0.3661, "step": 1324 }, { "epoch": 2.2610921501706485, "grad_norm": 0.38798438711320105, "learning_rate": 2.6694663736005054e-05, "loss": 0.3421, "step": 1325 }, { "epoch": 2.262798634812287, "grad_norm": 0.3406417472710425, "learning_rate": 2.6672206480435433e-05, "loss": 0.3245, "step": 1326 }, { "epoch": 2.2645051194539247, "grad_norm": 0.3639364853861221, "learning_rate": 2.664973975488907e-05, "loss": 0.36, "step": 1327 }, { "epoch": 2.266211604095563, "grad_norm": 0.38183582673797606, "learning_rate": 2.6627263591253382e-05, "loss": 0.3161, "step": 1328 }, { "epoch": 2.2679180887372015, "grad_norm": 0.3596190854379249, "learning_rate": 2.6604778021429164e-05, "loss": 0.3381, "step": 1329 }, { "epoch": 2.26962457337884, "grad_norm": 0.380270558721218, "learning_rate": 2.6582283077330582e-05, "loss": 0.3403, "step": 1330 }, { "epoch": 2.2713310580204777, "grad_norm": 0.3745524915536513, "learning_rate": 2.6559778790885084e-05, "loss": 0.3428, "step": 1331 }, { "epoch": 2.273037542662116, "grad_norm": 0.35135954321568064, "learning_rate": 2.653726519403339e-05, "loss": 0.3196, "step": 1332 }, { "epoch": 2.274744027303754, "grad_norm": 0.327555590634362, "learning_rate": 2.6514742318729445e-05, "loss": 0.351, "step": 1333 }, { "epoch": 2.2764505119453924, "grad_norm": 0.36208360425157576, "learning_rate": 2.649221019694033e-05, "loss": 0.3515, "step": 1334 }, { "epoch": 2.2781569965870307, "grad_norm": 0.3848771308126638, "learning_rate": 2.646966886064629e-05, "loss": 0.3119, "step": 1335 }, { "epoch": 2.279863481228669, "grad_norm": 0.3621027257971774, "learning_rate": 2.644711834184062e-05, "loss": 0.3243, "step": 1336 }, { "epoch": 2.281569965870307, "grad_norm": 0.39052460845925924, "learning_rate": 2.6424558672529648e-05, "loss": 0.3249, "step": 1337 }, { "epoch": 2.2832764505119454, "grad_norm": 0.362282332219868, "learning_rate": 2.6401989884732716e-05, "loss": 0.3268, "step": 1338 }, { "epoch": 2.2849829351535837, "grad_norm": 0.35499716624898975, "learning_rate": 2.6379412010482087e-05, "loss": 0.3052, "step": 1339 }, { "epoch": 2.2866894197952217, "grad_norm": 0.38850349063964607, "learning_rate": 2.635682508182291e-05, "loss": 0.3151, "step": 1340 }, { "epoch": 2.28839590443686, "grad_norm": 0.5053926075264994, "learning_rate": 2.6334229130813212e-05, "loss": 0.3476, "step": 1341 }, { "epoch": 2.2901023890784984, "grad_norm": 0.445190693428244, "learning_rate": 2.6311624189523818e-05, "loss": 0.357, "step": 1342 }, { "epoch": 2.2918088737201368, "grad_norm": 0.37232420913577263, "learning_rate": 2.6289010290038287e-05, "loss": 0.3304, "step": 1343 }, { "epoch": 2.2935153583617747, "grad_norm": 0.4092725423806077, "learning_rate": 2.6266387464452926e-05, "loss": 0.3307, "step": 1344 }, { "epoch": 2.295221843003413, "grad_norm": 0.44424690257197885, "learning_rate": 2.6243755744876706e-05, "loss": 0.3268, "step": 1345 }, { "epoch": 2.296928327645051, "grad_norm": 0.3537529521792447, "learning_rate": 2.62211151634312e-05, "loss": 0.3202, "step": 1346 }, { "epoch": 2.2986348122866893, "grad_norm": 0.353289356438286, "learning_rate": 2.6198465752250575e-05, "loss": 0.3319, "step": 1347 }, { "epoch": 2.3003412969283277, "grad_norm": 0.42972592961031836, "learning_rate": 2.6175807543481533e-05, "loss": 0.5392, "step": 1348 }, { "epoch": 2.302047781569966, "grad_norm": 0.4006275659437903, "learning_rate": 2.615314056928325e-05, "loss": 0.311, "step": 1349 }, { "epoch": 2.303754266211604, "grad_norm": 0.3513733934426921, "learning_rate": 2.6130464861827355e-05, "loss": 0.2919, "step": 1350 }, { "epoch": 2.3054607508532423, "grad_norm": 0.35056098323764556, "learning_rate": 2.6107780453297867e-05, "loss": 0.3157, "step": 1351 }, { "epoch": 2.3071672354948807, "grad_norm": 0.4774862257969973, "learning_rate": 2.6085087375891148e-05, "loss": 0.3239, "step": 1352 }, { "epoch": 2.3088737201365186, "grad_norm": 0.4243137448825834, "learning_rate": 2.6062385661815883e-05, "loss": 0.3867, "step": 1353 }, { "epoch": 2.310580204778157, "grad_norm": 0.4005873582549145, "learning_rate": 2.6039675343293e-05, "loss": 0.3492, "step": 1354 }, { "epoch": 2.3122866894197953, "grad_norm": 0.4082423081612512, "learning_rate": 2.6016956452555634e-05, "loss": 0.513, "step": 1355 }, { "epoch": 2.3139931740614337, "grad_norm": 0.3992966329238439, "learning_rate": 2.5994229021849098e-05, "loss": 0.352, "step": 1356 }, { "epoch": 2.3156996587030716, "grad_norm": 0.42991402649117133, "learning_rate": 2.597149308343083e-05, "loss": 0.3406, "step": 1357 }, { "epoch": 2.31740614334471, "grad_norm": 0.3662392973111248, "learning_rate": 2.5948748669570325e-05, "loss": 0.3365, "step": 1358 }, { "epoch": 2.319112627986348, "grad_norm": 0.3885234377213803, "learning_rate": 2.5925995812549126e-05, "loss": 0.3511, "step": 1359 }, { "epoch": 2.3208191126279862, "grad_norm": 0.35696602212521295, "learning_rate": 2.5903234544660755e-05, "loss": 0.2986, "step": 1360 }, { "epoch": 2.3225255972696246, "grad_norm": 0.40269050230313447, "learning_rate": 2.588046489821066e-05, "loss": 0.3347, "step": 1361 }, { "epoch": 2.324232081911263, "grad_norm": 0.38282631602250244, "learning_rate": 2.5857686905516195e-05, "loss": 0.3403, "step": 1362 }, { "epoch": 2.325938566552901, "grad_norm": 0.3630691576269162, "learning_rate": 2.5834900598906557e-05, "loss": 0.2834, "step": 1363 }, { "epoch": 2.3276450511945392, "grad_norm": 0.3654111923242123, "learning_rate": 2.5812106010722732e-05, "loss": 0.386, "step": 1364 }, { "epoch": 2.3293515358361776, "grad_norm": 0.3962095056625424, "learning_rate": 2.578930317331747e-05, "loss": 0.4225, "step": 1365 }, { "epoch": 2.3310580204778155, "grad_norm": 0.4003552112952632, "learning_rate": 2.5766492119055237e-05, "loss": 0.3027, "step": 1366 }, { "epoch": 2.332764505119454, "grad_norm": 0.3867896985438215, "learning_rate": 2.5743672880312152e-05, "loss": 0.3704, "step": 1367 }, { "epoch": 2.3344709897610922, "grad_norm": 0.3886665243250387, "learning_rate": 2.5720845489475935e-05, "loss": 0.3538, "step": 1368 }, { "epoch": 2.3361774744027306, "grad_norm": 0.3986068443722344, "learning_rate": 2.569800997894591e-05, "loss": 0.3632, "step": 1369 }, { "epoch": 2.3378839590443685, "grad_norm": 0.35079424216905164, "learning_rate": 2.5675166381132895e-05, "loss": 0.3129, "step": 1370 }, { "epoch": 2.339590443686007, "grad_norm": 0.3548076543694736, "learning_rate": 2.5652314728459207e-05, "loss": 0.3136, "step": 1371 }, { "epoch": 2.3412969283276452, "grad_norm": 0.38468021466593455, "learning_rate": 2.5629455053358582e-05, "loss": 0.3387, "step": 1372 }, { "epoch": 2.343003412969283, "grad_norm": 0.39646317065531794, "learning_rate": 2.5606587388276153e-05, "loss": 0.3569, "step": 1373 }, { "epoch": 2.3447098976109215, "grad_norm": 0.34637816706673646, "learning_rate": 2.558371176566839e-05, "loss": 0.3191, "step": 1374 }, { "epoch": 2.34641638225256, "grad_norm": 0.3455783144141557, "learning_rate": 2.556082821800304e-05, "loss": 0.3044, "step": 1375 }, { "epoch": 2.348122866894198, "grad_norm": 0.39109613972413526, "learning_rate": 2.5537936777759137e-05, "loss": 0.3628, "step": 1376 }, { "epoch": 2.349829351535836, "grad_norm": 0.36307973446643577, "learning_rate": 2.5515037477426865e-05, "loss": 0.2975, "step": 1377 }, { "epoch": 2.3515358361774745, "grad_norm": 0.3938124454749116, "learning_rate": 2.5492130349507615e-05, "loss": 0.3196, "step": 1378 }, { "epoch": 2.3532423208191124, "grad_norm": 0.4033717777247042, "learning_rate": 2.546921542651386e-05, "loss": 0.3157, "step": 1379 }, { "epoch": 2.354948805460751, "grad_norm": 0.353514246210565, "learning_rate": 2.5446292740969137e-05, "loss": 0.3563, "step": 1380 }, { "epoch": 2.356655290102389, "grad_norm": 0.3874177630384359, "learning_rate": 2.5423362325408012e-05, "loss": 0.342, "step": 1381 }, { "epoch": 2.3583617747440275, "grad_norm": 0.38665862860940753, "learning_rate": 2.5400424212376016e-05, "loss": 0.3283, "step": 1382 }, { "epoch": 2.3600682593856654, "grad_norm": 0.3899975436857683, "learning_rate": 2.5377478434429597e-05, "loss": 0.3314, "step": 1383 }, { "epoch": 2.361774744027304, "grad_norm": 0.39935415423996007, "learning_rate": 2.535452502413609e-05, "loss": 0.3253, "step": 1384 }, { "epoch": 2.363481228668942, "grad_norm": 0.3971800997252506, "learning_rate": 2.533156401407367e-05, "loss": 0.3645, "step": 1385 }, { "epoch": 2.36518771331058, "grad_norm": 0.3712698641390973, "learning_rate": 2.5308595436831293e-05, "loss": 0.3369, "step": 1386 }, { "epoch": 2.3668941979522184, "grad_norm": 0.4292161442786262, "learning_rate": 2.5285619325008642e-05, "loss": 0.3321, "step": 1387 }, { "epoch": 2.368600682593857, "grad_norm": 0.32124725800414833, "learning_rate": 2.526263571121612e-05, "loss": 0.3016, "step": 1388 }, { "epoch": 2.3703071672354947, "grad_norm": 0.42628150818210847, "learning_rate": 2.5239644628074753e-05, "loss": 0.3302, "step": 1389 }, { "epoch": 2.372013651877133, "grad_norm": 0.41628538728162245, "learning_rate": 2.5216646108216178e-05, "loss": 0.3614, "step": 1390 }, { "epoch": 2.3737201365187715, "grad_norm": 0.3644672453853354, "learning_rate": 2.519364018428259e-05, "loss": 0.3288, "step": 1391 }, { "epoch": 2.3754266211604094, "grad_norm": 0.4152115385707028, "learning_rate": 2.517062688892669e-05, "loss": 0.3291, "step": 1392 }, { "epoch": 2.3771331058020477, "grad_norm": 0.3594864619523619, "learning_rate": 2.5147606254811644e-05, "loss": 0.3225, "step": 1393 }, { "epoch": 2.378839590443686, "grad_norm": 0.38511608243923456, "learning_rate": 2.5124578314611028e-05, "loss": 0.3375, "step": 1394 }, { "epoch": 2.3805460750853245, "grad_norm": 0.3547306658802128, "learning_rate": 2.5101543101008795e-05, "loss": 0.3311, "step": 1395 }, { "epoch": 2.3822525597269624, "grad_norm": 0.348365710623611, "learning_rate": 2.507850064669921e-05, "loss": 0.3309, "step": 1396 }, { "epoch": 2.3839590443686007, "grad_norm": 0.39395625941786366, "learning_rate": 2.5055450984386828e-05, "loss": 0.3446, "step": 1397 }, { "epoch": 2.385665529010239, "grad_norm": 0.37869370932349095, "learning_rate": 2.5032394146786434e-05, "loss": 0.3721, "step": 1398 }, { "epoch": 2.387372013651877, "grad_norm": 0.416001557948223, "learning_rate": 2.500933016662298e-05, "loss": 0.3184, "step": 1399 }, { "epoch": 2.3890784982935154, "grad_norm": 0.3708619882788283, "learning_rate": 2.498625907663158e-05, "loss": 0.3231, "step": 1400 }, { "epoch": 2.3907849829351537, "grad_norm": 0.3734903623021443, "learning_rate": 2.4963180909557413e-05, "loss": 0.337, "step": 1401 }, { "epoch": 2.3924914675767917, "grad_norm": 0.3619258193640416, "learning_rate": 2.4940095698155728e-05, "loss": 0.3325, "step": 1402 }, { "epoch": 2.39419795221843, "grad_norm": 0.43457933401171456, "learning_rate": 2.4917003475191752e-05, "loss": 0.3566, "step": 1403 }, { "epoch": 2.3959044368600684, "grad_norm": 0.36789493194294154, "learning_rate": 2.4893904273440677e-05, "loss": 0.3305, "step": 1404 }, { "epoch": 2.3976109215017063, "grad_norm": 0.3702806744907535, "learning_rate": 2.487079812568759e-05, "loss": 0.339, "step": 1405 }, { "epoch": 2.3993174061433447, "grad_norm": 0.34765241619067994, "learning_rate": 2.484768506472745e-05, "loss": 0.3303, "step": 1406 }, { "epoch": 2.401023890784983, "grad_norm": 0.3691851322616644, "learning_rate": 2.4824565123365013e-05, "loss": 0.3561, "step": 1407 }, { "epoch": 2.4027303754266214, "grad_norm": 0.33489601093114274, "learning_rate": 2.4801438334414808e-05, "loss": 0.3086, "step": 1408 }, { "epoch": 2.4044368600682593, "grad_norm": 0.4886959502080782, "learning_rate": 2.477830473070108e-05, "loss": 0.3248, "step": 1409 }, { "epoch": 2.4061433447098977, "grad_norm": 0.4439476834476966, "learning_rate": 2.4755164345057754e-05, "loss": 0.3346, "step": 1410 }, { "epoch": 2.407849829351536, "grad_norm": 0.34873223446808826, "learning_rate": 2.473201721032837e-05, "loss": 0.3368, "step": 1411 }, { "epoch": 2.409556313993174, "grad_norm": 0.41435991243131887, "learning_rate": 2.4708863359366056e-05, "loss": 0.3307, "step": 1412 }, { "epoch": 2.4112627986348123, "grad_norm": 0.36887177211123173, "learning_rate": 2.4685702825033464e-05, "loss": 0.3461, "step": 1413 }, { "epoch": 2.4129692832764507, "grad_norm": 0.3812688309491389, "learning_rate": 2.4662535640202737e-05, "loss": 0.329, "step": 1414 }, { "epoch": 2.4146757679180886, "grad_norm": 0.37755571817285283, "learning_rate": 2.4639361837755463e-05, "loss": 0.3127, "step": 1415 }, { "epoch": 2.416382252559727, "grad_norm": 0.38208082202786353, "learning_rate": 2.4616181450582613e-05, "loss": 0.3714, "step": 1416 }, { "epoch": 2.4180887372013653, "grad_norm": 0.424806909408548, "learning_rate": 2.459299451158449e-05, "loss": 0.3207, "step": 1417 }, { "epoch": 2.419795221843003, "grad_norm": 0.394188998633754, "learning_rate": 2.456980105367074e-05, "loss": 0.3147, "step": 1418 }, { "epoch": 2.4215017064846416, "grad_norm": 0.35728493684993823, "learning_rate": 2.4546601109760223e-05, "loss": 0.3321, "step": 1419 }, { "epoch": 2.42320819112628, "grad_norm": 0.3390171300960544, "learning_rate": 2.4523394712781014e-05, "loss": 0.3382, "step": 1420 }, { "epoch": 2.4249146757679183, "grad_norm": 0.41077102645216684, "learning_rate": 2.4500181895670353e-05, "loss": 0.311, "step": 1421 }, { "epoch": 2.426621160409556, "grad_norm": 0.3670602580780559, "learning_rate": 2.4476962691374582e-05, "loss": 0.314, "step": 1422 }, { "epoch": 2.4283276450511946, "grad_norm": 0.3537707122343819, "learning_rate": 2.445373713284912e-05, "loss": 0.3366, "step": 1423 }, { "epoch": 2.430034129692833, "grad_norm": 0.3337392371509326, "learning_rate": 2.4430505253058394e-05, "loss": 0.3547, "step": 1424 }, { "epoch": 2.431740614334471, "grad_norm": 0.3933018784100391, "learning_rate": 2.4407267084975815e-05, "loss": 0.314, "step": 1425 }, { "epoch": 2.4334470989761092, "grad_norm": 0.37056767427899595, "learning_rate": 2.4384022661583705e-05, "loss": 0.3111, "step": 1426 }, { "epoch": 2.4351535836177476, "grad_norm": 0.4441057268863796, "learning_rate": 2.4360772015873274e-05, "loss": 0.3791, "step": 1427 }, { "epoch": 2.4368600682593855, "grad_norm": 0.3429781436405533, "learning_rate": 2.4337515180844573e-05, "loss": 0.3561, "step": 1428 }, { "epoch": 2.438566552901024, "grad_norm": 0.36718679917444935, "learning_rate": 2.4314252189506408e-05, "loss": 0.3323, "step": 1429 }, { "epoch": 2.4402730375426622, "grad_norm": 0.31672706515917426, "learning_rate": 2.429098307487635e-05, "loss": 0.3236, "step": 1430 }, { "epoch": 2.4419795221843, "grad_norm": 0.3919414240234656, "learning_rate": 2.4267707869980646e-05, "loss": 0.3345, "step": 1431 }, { "epoch": 2.4436860068259385, "grad_norm": 0.3372894842089935, "learning_rate": 2.4244426607854193e-05, "loss": 0.3024, "step": 1432 }, { "epoch": 2.445392491467577, "grad_norm": 0.3816449925186258, "learning_rate": 2.422113932154049e-05, "loss": 0.3394, "step": 1433 }, { "epoch": 2.4470989761092152, "grad_norm": 0.3669988886690805, "learning_rate": 2.4197846044091585e-05, "loss": 0.3791, "step": 1434 }, { "epoch": 2.448805460750853, "grad_norm": 0.34702738131934063, "learning_rate": 2.417454680856801e-05, "loss": 0.3294, "step": 1435 }, { "epoch": 2.4505119453924915, "grad_norm": 0.36254456020321973, "learning_rate": 2.415124164803877e-05, "loss": 0.3303, "step": 1436 }, { "epoch": 2.45221843003413, "grad_norm": 0.3419956969338929, "learning_rate": 2.4127930595581285e-05, "loss": 0.3223, "step": 1437 }, { "epoch": 2.453924914675768, "grad_norm": 0.37916769686625507, "learning_rate": 2.410461368428133e-05, "loss": 0.2957, "step": 1438 }, { "epoch": 2.455631399317406, "grad_norm": 0.39593285539957496, "learning_rate": 2.4081290947232993e-05, "loss": 0.3332, "step": 1439 }, { "epoch": 2.4573378839590445, "grad_norm": 0.3641399475866781, "learning_rate": 2.405796241753864e-05, "loss": 0.3252, "step": 1440 }, { "epoch": 2.4590443686006824, "grad_norm": 0.37308343106539177, "learning_rate": 2.4034628128308844e-05, "loss": 0.296, "step": 1441 }, { "epoch": 2.460750853242321, "grad_norm": 0.3569807734609345, "learning_rate": 2.4011288112662364e-05, "loss": 0.2946, "step": 1442 }, { "epoch": 2.462457337883959, "grad_norm": 0.4090553354996803, "learning_rate": 2.398794240372608e-05, "loss": 0.3319, "step": 1443 }, { "epoch": 2.464163822525597, "grad_norm": 0.3730810745041935, "learning_rate": 2.396459103463496e-05, "loss": 0.3222, "step": 1444 }, { "epoch": 2.4658703071672354, "grad_norm": 0.3379437485115968, "learning_rate": 2.3941234038532e-05, "loss": 0.3389, "step": 1445 }, { "epoch": 2.467576791808874, "grad_norm": 0.3638076547668745, "learning_rate": 2.3917871448568182e-05, "loss": 0.3443, "step": 1446 }, { "epoch": 2.469283276450512, "grad_norm": 0.37797090219784407, "learning_rate": 2.3894503297902437e-05, "loss": 0.3314, "step": 1447 }, { "epoch": 2.47098976109215, "grad_norm": 0.3454814124459251, "learning_rate": 2.387112961970157e-05, "loss": 0.3179, "step": 1448 }, { "epoch": 2.4726962457337884, "grad_norm": 0.3890717934183557, "learning_rate": 2.384775044714025e-05, "loss": 0.3438, "step": 1449 }, { "epoch": 2.474402730375427, "grad_norm": 0.3985238454597805, "learning_rate": 2.3824365813400934e-05, "loss": 0.3945, "step": 1450 }, { "epoch": 2.4761092150170647, "grad_norm": 0.41221527808061414, "learning_rate": 2.3800975751673825e-05, "loss": 0.3986, "step": 1451 }, { "epoch": 2.477815699658703, "grad_norm": 0.3969985176310822, "learning_rate": 2.377758029515685e-05, "loss": 0.3191, "step": 1452 }, { "epoch": 2.4795221843003414, "grad_norm": 0.3880421847396879, "learning_rate": 2.3754179477055576e-05, "loss": 0.3541, "step": 1453 }, { "epoch": 2.4812286689419794, "grad_norm": 0.3715763075289722, "learning_rate": 2.3730773330583183e-05, "loss": 0.3288, "step": 1454 }, { "epoch": 2.4829351535836177, "grad_norm": 0.4031941085825435, "learning_rate": 2.3707361888960413e-05, "loss": 0.3153, "step": 1455 }, { "epoch": 2.484641638225256, "grad_norm": 0.33837450641052536, "learning_rate": 2.3683945185415528e-05, "loss": 0.2986, "step": 1456 }, { "epoch": 2.486348122866894, "grad_norm": 0.39223149797213314, "learning_rate": 2.3660523253184254e-05, "loss": 0.3654, "step": 1457 }, { "epoch": 2.4880546075085324, "grad_norm": 0.4584458087319988, "learning_rate": 2.3637096125509737e-05, "loss": 0.3326, "step": 1458 }, { "epoch": 2.4897610921501707, "grad_norm": 0.35742942089729385, "learning_rate": 2.3613663835642515e-05, "loss": 0.3503, "step": 1459 }, { "epoch": 2.491467576791809, "grad_norm": 0.4003287666095139, "learning_rate": 2.3590226416840415e-05, "loss": 0.3748, "step": 1460 }, { "epoch": 2.493174061433447, "grad_norm": 0.38386986612118523, "learning_rate": 2.3566783902368586e-05, "loss": 0.3567, "step": 1461 }, { "epoch": 2.4948805460750854, "grad_norm": 0.3898899370166114, "learning_rate": 2.354333632549938e-05, "loss": 0.3233, "step": 1462 }, { "epoch": 2.4965870307167237, "grad_norm": 0.34310571632457215, "learning_rate": 2.3519883719512345e-05, "loss": 0.3556, "step": 1463 }, { "epoch": 2.4982935153583616, "grad_norm": 0.3367770889277449, "learning_rate": 2.349642611769416e-05, "loss": 0.3526, "step": 1464 }, { "epoch": 2.5, "grad_norm": 0.38653236875272007, "learning_rate": 2.3472963553338614e-05, "loss": 0.4039, "step": 1465 }, { "epoch": 2.5017064846416384, "grad_norm": 0.3797790080161611, "learning_rate": 2.3449496059746508e-05, "loss": 0.3071, "step": 1466 }, { "epoch": 2.5034129692832767, "grad_norm": 0.4026662760948025, "learning_rate": 2.3426023670225674e-05, "loss": 0.4287, "step": 1467 }, { "epoch": 2.5051194539249146, "grad_norm": 0.36107542161294814, "learning_rate": 2.340254641809087e-05, "loss": 0.297, "step": 1468 }, { "epoch": 2.506825938566553, "grad_norm": 0.35554618094449175, "learning_rate": 2.3379064336663754e-05, "loss": 0.3542, "step": 1469 }, { "epoch": 2.508532423208191, "grad_norm": 0.36658748821399895, "learning_rate": 2.3355577459272856e-05, "loss": 0.352, "step": 1470 }, { "epoch": 2.5102389078498293, "grad_norm": 0.40957016863765, "learning_rate": 2.3332085819253494e-05, "loss": 0.3212, "step": 1471 }, { "epoch": 2.5119453924914676, "grad_norm": 0.3330358802773706, "learning_rate": 2.330858944994776e-05, "loss": 0.2995, "step": 1472 }, { "epoch": 2.513651877133106, "grad_norm": 0.37169872025122885, "learning_rate": 2.328508838470445e-05, "loss": 0.3471, "step": 1473 }, { "epoch": 2.515358361774744, "grad_norm": 0.4068903182174979, "learning_rate": 2.326158265687903e-05, "loss": 0.3515, "step": 1474 }, { "epoch": 2.5170648464163823, "grad_norm": 0.40512073054173614, "learning_rate": 2.3238072299833584e-05, "loss": 0.3025, "step": 1475 }, { "epoch": 2.51877133105802, "grad_norm": 0.35816508402342645, "learning_rate": 2.3214557346936755e-05, "loss": 0.3154, "step": 1476 }, { "epoch": 2.5204778156996586, "grad_norm": 0.45982283442918426, "learning_rate": 2.3191037831563727e-05, "loss": 0.345, "step": 1477 }, { "epoch": 2.522184300341297, "grad_norm": 0.4173484682436333, "learning_rate": 2.316751378709614e-05, "loss": 0.3406, "step": 1478 }, { "epoch": 2.5238907849829353, "grad_norm": 0.36961090311820594, "learning_rate": 2.3143985246922077e-05, "loss": 0.3778, "step": 1479 }, { "epoch": 2.5255972696245736, "grad_norm": 0.37553258774310944, "learning_rate": 2.3120452244436e-05, "loss": 0.3119, "step": 1480 }, { "epoch": 2.5273037542662116, "grad_norm": 0.3750417092831229, "learning_rate": 2.309691481303871e-05, "loss": 0.3246, "step": 1481 }, { "epoch": 2.52901023890785, "grad_norm": 0.3141087844075222, "learning_rate": 2.3073372986137274e-05, "loss": 0.344, "step": 1482 }, { "epoch": 2.530716723549488, "grad_norm": 0.3378382304761785, "learning_rate": 2.3049826797145002e-05, "loss": 0.3533, "step": 1483 }, { "epoch": 2.532423208191126, "grad_norm": 0.3202302348119175, "learning_rate": 2.302627627948142e-05, "loss": 0.332, "step": 1484 }, { "epoch": 2.5341296928327646, "grad_norm": 0.3417342609194838, "learning_rate": 2.3002721466572168e-05, "loss": 0.3396, "step": 1485 }, { "epoch": 2.535836177474403, "grad_norm": 0.35991105559347536, "learning_rate": 2.2979162391849003e-05, "loss": 0.3366, "step": 1486 }, { "epoch": 2.537542662116041, "grad_norm": 0.3355428033489997, "learning_rate": 2.2955599088749722e-05, "loss": 0.4948, "step": 1487 }, { "epoch": 2.539249146757679, "grad_norm": 0.36609471776900404, "learning_rate": 2.2932031590718116e-05, "loss": 0.3213, "step": 1488 }, { "epoch": 2.5409556313993176, "grad_norm": 0.35296771491544104, "learning_rate": 2.2908459931203947e-05, "loss": 0.3242, "step": 1489 }, { "epoch": 2.5426621160409555, "grad_norm": 0.3853064302037463, "learning_rate": 2.2884884143662867e-05, "loss": 0.3418, "step": 1490 }, { "epoch": 2.544368600682594, "grad_norm": 0.3185441446222499, "learning_rate": 2.2861304261556393e-05, "loss": 0.3416, "step": 1491 }, { "epoch": 2.546075085324232, "grad_norm": 0.3427064024986631, "learning_rate": 2.2837720318351866e-05, "loss": 0.348, "step": 1492 }, { "epoch": 2.5477815699658706, "grad_norm": 0.36539764520827644, "learning_rate": 2.2814132347522375e-05, "loss": 0.327, "step": 1493 }, { "epoch": 2.5494880546075085, "grad_norm": 0.34389922932611533, "learning_rate": 2.2790540382546724e-05, "loss": 0.3053, "step": 1494 }, { "epoch": 2.551194539249147, "grad_norm": 0.39591972206641485, "learning_rate": 2.27669444569094e-05, "loss": 0.3255, "step": 1495 }, { "epoch": 2.5529010238907848, "grad_norm": 0.36868469745591775, "learning_rate": 2.27433446041005e-05, "loss": 0.3016, "step": 1496 }, { "epoch": 2.554607508532423, "grad_norm": 0.37489919340094, "learning_rate": 2.2719740857615697e-05, "loss": 0.3004, "step": 1497 }, { "epoch": 2.5563139931740615, "grad_norm": 0.3511340577664287, "learning_rate": 2.2696133250956192e-05, "loss": 0.3285, "step": 1498 }, { "epoch": 2.5580204778157, "grad_norm": 0.3454939870308825, "learning_rate": 2.267252181762867e-05, "loss": 0.347, "step": 1499 }, { "epoch": 2.5597269624573378, "grad_norm": 0.4154911702244027, "learning_rate": 2.2648906591145238e-05, "loss": 0.3419, "step": 1500 }, { "epoch": 2.561433447098976, "grad_norm": 0.3570111180021174, "learning_rate": 2.2625287605023392e-05, "loss": 0.3769, "step": 1501 }, { "epoch": 2.5631399317406145, "grad_norm": 0.3401737285458632, "learning_rate": 2.260166489278597e-05, "loss": 0.3474, "step": 1502 }, { "epoch": 2.5648464163822524, "grad_norm": 0.44603488244193745, "learning_rate": 2.2578038487961075e-05, "loss": 0.3603, "step": 1503 }, { "epoch": 2.5665529010238908, "grad_norm": 0.33278179482227377, "learning_rate": 2.2554408424082075e-05, "loss": 0.3017, "step": 1504 }, { "epoch": 2.568259385665529, "grad_norm": 0.33587061726853057, "learning_rate": 2.2530774734687525e-05, "loss": 0.3323, "step": 1505 }, { "epoch": 2.5699658703071675, "grad_norm": 0.4400712644895227, "learning_rate": 2.2507137453321125e-05, "loss": 0.356, "step": 1506 }, { "epoch": 2.5716723549488054, "grad_norm": 0.3571140421651242, "learning_rate": 2.248349661353167e-05, "loss": 0.3155, "step": 1507 }, { "epoch": 2.573378839590444, "grad_norm": 0.364079170575246, "learning_rate": 2.2459852248873012e-05, "loss": 0.3482, "step": 1508 }, { "epoch": 2.5750853242320817, "grad_norm": 0.42970346416624294, "learning_rate": 2.2436204392904006e-05, "loss": 0.3525, "step": 1509 }, { "epoch": 2.57679180887372, "grad_norm": 0.39056825360322367, "learning_rate": 2.241255307918844e-05, "loss": 0.3062, "step": 1510 }, { "epoch": 2.5784982935153584, "grad_norm": 0.34620813919722987, "learning_rate": 2.2388898341295053e-05, "loss": 0.4253, "step": 1511 }, { "epoch": 2.580204778156997, "grad_norm": 0.39111293003093417, "learning_rate": 2.2365240212797397e-05, "loss": 0.3122, "step": 1512 }, { "epoch": 2.5819112627986347, "grad_norm": 0.36781307725420737, "learning_rate": 2.234157872727387e-05, "loss": 0.3157, "step": 1513 }, { "epoch": 2.583617747440273, "grad_norm": 0.3509137695917543, "learning_rate": 2.2317913918307616e-05, "loss": 0.32, "step": 1514 }, { "epoch": 2.5853242320819114, "grad_norm": 0.36137579661930413, "learning_rate": 2.2294245819486515e-05, "loss": 0.3439, "step": 1515 }, { "epoch": 2.5870307167235493, "grad_norm": 0.37850530442502667, "learning_rate": 2.227057446440309e-05, "loss": 0.3305, "step": 1516 }, { "epoch": 2.5887372013651877, "grad_norm": 0.3450905118912204, "learning_rate": 2.2246899886654512e-05, "loss": 0.332, "step": 1517 }, { "epoch": 2.590443686006826, "grad_norm": 0.3540659388313431, "learning_rate": 2.2223222119842505e-05, "loss": 0.3285, "step": 1518 }, { "epoch": 2.5921501706484644, "grad_norm": 0.33810482101335076, "learning_rate": 2.219954119757333e-05, "loss": 0.3101, "step": 1519 }, { "epoch": 2.5938566552901023, "grad_norm": 0.4027706707970301, "learning_rate": 2.2175857153457733e-05, "loss": 0.3008, "step": 1520 }, { "epoch": 2.5955631399317407, "grad_norm": 0.3850159272249956, "learning_rate": 2.2152170021110876e-05, "loss": 0.3737, "step": 1521 }, { "epoch": 2.5972696245733786, "grad_norm": 0.3420595026622498, "learning_rate": 2.2128479834152303e-05, "loss": 0.3571, "step": 1522 }, { "epoch": 2.598976109215017, "grad_norm": 0.36231382375440196, "learning_rate": 2.2104786626205915e-05, "loss": 0.3179, "step": 1523 }, { "epoch": 2.6006825938566553, "grad_norm": 0.35324028732062923, "learning_rate": 2.208109043089988e-05, "loss": 0.3779, "step": 1524 }, { "epoch": 2.6023890784982937, "grad_norm": 0.3599439627039347, "learning_rate": 2.2057391281866617e-05, "loss": 0.3225, "step": 1525 }, { "epoch": 2.6040955631399316, "grad_norm": 0.4297972491076275, "learning_rate": 2.203368921274273e-05, "loss": 0.3475, "step": 1526 }, { "epoch": 2.60580204778157, "grad_norm": 0.32920065294422945, "learning_rate": 2.2009984257168978e-05, "loss": 0.3201, "step": 1527 }, { "epoch": 2.6075085324232083, "grad_norm": 0.36500147382442766, "learning_rate": 2.19862764487902e-05, "loss": 0.3312, "step": 1528 }, { "epoch": 2.6092150170648463, "grad_norm": 0.3520609399732075, "learning_rate": 2.196256582125529e-05, "loss": 0.2805, "step": 1529 }, { "epoch": 2.6109215017064846, "grad_norm": 0.35033065080394465, "learning_rate": 2.1938852408217168e-05, "loss": 0.37, "step": 1530 }, { "epoch": 2.612627986348123, "grad_norm": 0.34259172335469495, "learning_rate": 2.1915136243332662e-05, "loss": 0.3181, "step": 1531 }, { "epoch": 2.6143344709897613, "grad_norm": 0.37189251013844604, "learning_rate": 2.189141736026255e-05, "loss": 0.3168, "step": 1532 }, { "epoch": 2.6160409556313993, "grad_norm": 0.3721659335744151, "learning_rate": 2.186769579267144e-05, "loss": 0.3202, "step": 1533 }, { "epoch": 2.6177474402730376, "grad_norm": 0.3639098058840575, "learning_rate": 2.1843971574227755e-05, "loss": 0.3074, "step": 1534 }, { "epoch": 2.6194539249146755, "grad_norm": 0.3769605832514304, "learning_rate": 2.1820244738603686e-05, "loss": 0.356, "step": 1535 }, { "epoch": 2.621160409556314, "grad_norm": 0.3655217175141363, "learning_rate": 2.1796515319475144e-05, "loss": 0.3438, "step": 1536 }, { "epoch": 2.6228668941979523, "grad_norm": 0.35103512956620203, "learning_rate": 2.177278335052169e-05, "loss": 0.3687, "step": 1537 }, { "epoch": 2.6245733788395906, "grad_norm": 0.3645165351023154, "learning_rate": 2.174904886542651e-05, "loss": 0.3403, "step": 1538 }, { "epoch": 2.6262798634812285, "grad_norm": 0.33120848612975773, "learning_rate": 2.172531189787638e-05, "loss": 0.3352, "step": 1539 }, { "epoch": 2.627986348122867, "grad_norm": 0.34308548415086215, "learning_rate": 2.1701572481561574e-05, "loss": 0.3373, "step": 1540 }, { "epoch": 2.6296928327645053, "grad_norm": 0.3624467272308664, "learning_rate": 2.167783065017585e-05, "loss": 0.3129, "step": 1541 }, { "epoch": 2.631399317406143, "grad_norm": 0.3275048899706405, "learning_rate": 2.1654086437416394e-05, "loss": 0.376, "step": 1542 }, { "epoch": 2.6331058020477816, "grad_norm": 0.34876972572695425, "learning_rate": 2.1630339876983783e-05, "loss": 0.3044, "step": 1543 }, { "epoch": 2.63481228668942, "grad_norm": 0.354065295382463, "learning_rate": 2.160659100258191e-05, "loss": 0.3273, "step": 1544 }, { "epoch": 2.6365187713310583, "grad_norm": 0.3605089626247716, "learning_rate": 2.1582839847917954e-05, "loss": 0.3437, "step": 1545 }, { "epoch": 2.638225255972696, "grad_norm": 0.39176557422172903, "learning_rate": 2.155908644670234e-05, "loss": 0.3292, "step": 1546 }, { "epoch": 2.6399317406143346, "grad_norm": 0.34142414891699385, "learning_rate": 2.1535330832648677e-05, "loss": 0.3296, "step": 1547 }, { "epoch": 2.6416382252559725, "grad_norm": 0.3628720766562326, "learning_rate": 2.151157303947371e-05, "loss": 0.3651, "step": 1548 }, { "epoch": 2.643344709897611, "grad_norm": 0.35653396070721, "learning_rate": 2.1487813100897287e-05, "loss": 0.3165, "step": 1549 }, { "epoch": 2.645051194539249, "grad_norm": 0.37500779656869304, "learning_rate": 2.146405105064229e-05, "loss": 0.3008, "step": 1550 }, { "epoch": 2.6467576791808876, "grad_norm": 0.3638063646129447, "learning_rate": 2.1440286922434604e-05, "loss": 0.3527, "step": 1551 }, { "epoch": 2.6484641638225255, "grad_norm": 0.3975840120709893, "learning_rate": 2.1416520750003065e-05, "loss": 0.339, "step": 1552 }, { "epoch": 2.650170648464164, "grad_norm": 0.3997896639599607, "learning_rate": 2.139275256707941e-05, "loss": 0.3032, "step": 1553 }, { "epoch": 2.651877133105802, "grad_norm": 0.3379880488943614, "learning_rate": 2.1368982407398218e-05, "loss": 0.3407, "step": 1554 }, { "epoch": 2.65358361774744, "grad_norm": 0.3584110053295769, "learning_rate": 2.13452103046969e-05, "loss": 0.2919, "step": 1555 }, { "epoch": 2.6552901023890785, "grad_norm": 0.32809341209082954, "learning_rate": 2.1321436292715587e-05, "loss": 0.3373, "step": 1556 }, { "epoch": 2.656996587030717, "grad_norm": 0.34693589171988704, "learning_rate": 2.1297660405197155e-05, "loss": 0.302, "step": 1557 }, { "epoch": 2.658703071672355, "grad_norm": 0.417036926378175, "learning_rate": 2.1273882675887122e-05, "loss": 0.3456, "step": 1558 }, { "epoch": 2.660409556313993, "grad_norm": 0.3987269943492533, "learning_rate": 2.125010313853362e-05, "loss": 0.3279, "step": 1559 }, { "epoch": 2.6621160409556315, "grad_norm": 0.3725601004877647, "learning_rate": 2.1226321826887368e-05, "loss": 0.3287, "step": 1560 }, { "epoch": 2.6638225255972694, "grad_norm": 0.3603161553659546, "learning_rate": 2.120253877470158e-05, "loss": 0.3253, "step": 1561 }, { "epoch": 2.6655290102389078, "grad_norm": 0.37497768836582135, "learning_rate": 2.1178754015731945e-05, "loss": 0.3633, "step": 1562 }, { "epoch": 2.667235494880546, "grad_norm": 0.34696377080955054, "learning_rate": 2.1154967583736584e-05, "loss": 0.344, "step": 1563 }, { "epoch": 2.6689419795221845, "grad_norm": 0.3839859277931613, "learning_rate": 2.113117951247598e-05, "loss": 0.3626, "step": 1564 }, { "epoch": 2.6706484641638224, "grad_norm": 0.3575518356176394, "learning_rate": 2.1107389835712955e-05, "loss": 0.324, "step": 1565 }, { "epoch": 2.6723549488054608, "grad_norm": 0.30616261327690975, "learning_rate": 2.1083598587212605e-05, "loss": 0.3274, "step": 1566 }, { "epoch": 2.674061433447099, "grad_norm": 0.34876946647688606, "learning_rate": 2.105980580074226e-05, "loss": 0.3529, "step": 1567 }, { "epoch": 2.675767918088737, "grad_norm": 0.36759236932565875, "learning_rate": 2.1036011510071416e-05, "loss": 0.2979, "step": 1568 }, { "epoch": 2.6774744027303754, "grad_norm": 0.36493991987989527, "learning_rate": 2.101221574897172e-05, "loss": 0.3081, "step": 1569 }, { "epoch": 2.6791808873720138, "grad_norm": 0.37896500199069055, "learning_rate": 2.0988418551216912e-05, "loss": 0.3458, "step": 1570 }, { "epoch": 2.680887372013652, "grad_norm": 1.1167198478547995, "learning_rate": 2.0964619950582747e-05, "loss": 0.3232, "step": 1571 }, { "epoch": 2.68259385665529, "grad_norm": 0.35455315066160753, "learning_rate": 2.0940819980846992e-05, "loss": 0.3542, "step": 1572 }, { "epoch": 2.6843003412969284, "grad_norm": 0.4023373744928067, "learning_rate": 2.091701867578936e-05, "loss": 0.3243, "step": 1573 }, { "epoch": 2.6860068259385663, "grad_norm": 0.36866692733463213, "learning_rate": 2.0893216069191437e-05, "loss": 0.3368, "step": 1574 }, { "epoch": 2.6877133105802047, "grad_norm": 0.38032693691732394, "learning_rate": 2.0869412194836677e-05, "loss": 0.3318, "step": 1575 }, { "epoch": 2.689419795221843, "grad_norm": 0.3310774792170441, "learning_rate": 2.084560708651033e-05, "loss": 0.3083, "step": 1576 }, { "epoch": 2.6911262798634814, "grad_norm": 0.34387431070918356, "learning_rate": 2.082180077799938e-05, "loss": 0.3443, "step": 1577 }, { "epoch": 2.6928327645051193, "grad_norm": 0.4166481093910801, "learning_rate": 2.079799330309254e-05, "loss": 0.3494, "step": 1578 }, { "epoch": 2.6945392491467577, "grad_norm": 0.3728399532458675, "learning_rate": 2.077418469558015e-05, "loss": 0.3397, "step": 1579 }, { "epoch": 2.696245733788396, "grad_norm": 0.33987847638407687, "learning_rate": 2.07503749892542e-05, "loss": 0.3516, "step": 1580 }, { "epoch": 2.697952218430034, "grad_norm": 0.3625117258574951, "learning_rate": 2.0726564217908188e-05, "loss": 0.3135, "step": 1581 }, { "epoch": 2.6996587030716723, "grad_norm": 0.3215747834204709, "learning_rate": 2.070275241533716e-05, "loss": 0.3459, "step": 1582 }, { "epoch": 2.7013651877133107, "grad_norm": 0.36407703958316107, "learning_rate": 2.0678939615337625e-05, "loss": 0.3124, "step": 1583 }, { "epoch": 2.703071672354949, "grad_norm": 0.3841532131926999, "learning_rate": 2.065512585170747e-05, "loss": 0.3196, "step": 1584 }, { "epoch": 2.704778156996587, "grad_norm": 0.3984499240967021, "learning_rate": 2.0631311158246002e-05, "loss": 0.2921, "step": 1585 }, { "epoch": 2.7064846416382253, "grad_norm": 0.36633412889607153, "learning_rate": 2.060749556875381e-05, "loss": 0.3278, "step": 1586 }, { "epoch": 2.7081911262798632, "grad_norm": 0.34262907420461297, "learning_rate": 2.058367911703277e-05, "loss": 0.3335, "step": 1587 }, { "epoch": 2.7098976109215016, "grad_norm": 0.32516339887309825, "learning_rate": 2.055986183688598e-05, "loss": 0.3561, "step": 1588 }, { "epoch": 2.71160409556314, "grad_norm": 0.36094019810397276, "learning_rate": 2.0536043762117717e-05, "loss": 0.3347, "step": 1589 }, { "epoch": 2.7133105802047783, "grad_norm": 0.3678192871584571, "learning_rate": 2.0512224926533375e-05, "loss": 0.317, "step": 1590 }, { "epoch": 2.7150170648464163, "grad_norm": 0.35626547301694567, "learning_rate": 2.0488405363939434e-05, "loss": 0.3646, "step": 1591 }, { "epoch": 2.7167235494880546, "grad_norm": 0.3876448802387714, "learning_rate": 2.046458510814341e-05, "loss": 0.3623, "step": 1592 }, { "epoch": 2.718430034129693, "grad_norm": 0.3737984481229139, "learning_rate": 2.0440764192953805e-05, "loss": 0.357, "step": 1593 }, { "epoch": 2.720136518771331, "grad_norm": 0.47120557756429443, "learning_rate": 2.0416942652180037e-05, "loss": 0.3219, "step": 1594 }, { "epoch": 2.7218430034129693, "grad_norm": 0.34760756372777724, "learning_rate": 2.0393120519632444e-05, "loss": 0.3269, "step": 1595 }, { "epoch": 2.7235494880546076, "grad_norm": 0.3365726738286948, "learning_rate": 2.0369297829122168e-05, "loss": 0.3386, "step": 1596 }, { "epoch": 2.725255972696246, "grad_norm": 0.3602677916560659, "learning_rate": 2.034547461446117e-05, "loss": 0.3206, "step": 1597 }, { "epoch": 2.726962457337884, "grad_norm": 0.3730257272181104, "learning_rate": 2.0321650909462144e-05, "loss": 0.3596, "step": 1598 }, { "epoch": 2.7286689419795223, "grad_norm": 0.30881143810988376, "learning_rate": 2.0297826747938483e-05, "loss": 0.3277, "step": 1599 }, { "epoch": 2.73037542662116, "grad_norm": 0.36938060678999873, "learning_rate": 2.0274002163704226e-05, "loss": 0.3485, "step": 1600 }, { "epoch": 2.7320819112627985, "grad_norm": 0.3369892272185491, "learning_rate": 2.0250177190574023e-05, "loss": 0.342, "step": 1601 }, { "epoch": 2.733788395904437, "grad_norm": 0.34992532725689274, "learning_rate": 2.0226351862363043e-05, "loss": 0.3325, "step": 1602 }, { "epoch": 2.7354948805460753, "grad_norm": 0.3501014851111137, "learning_rate": 2.0202526212887003e-05, "loss": 0.3738, "step": 1603 }, { "epoch": 2.737201365187713, "grad_norm": 0.3476924097993854, "learning_rate": 2.0178700275962044e-05, "loss": 0.3377, "step": 1604 }, { "epoch": 2.7389078498293515, "grad_norm": 0.33874329422942734, "learning_rate": 2.0154874085404724e-05, "loss": 0.3583, "step": 1605 }, { "epoch": 2.74061433447099, "grad_norm": 0.36032833957102245, "learning_rate": 2.013104767503197e-05, "loss": 0.3462, "step": 1606 }, { "epoch": 2.742320819112628, "grad_norm": 0.3326759957576823, "learning_rate": 2.0107221078661016e-05, "loss": 0.3117, "step": 1607 }, { "epoch": 2.744027303754266, "grad_norm": 0.40181150515698444, "learning_rate": 2.008339433010934e-05, "loss": 0.3226, "step": 1608 }, { "epoch": 2.7457337883959045, "grad_norm": 0.3620467965946172, "learning_rate": 2.0059567463194675e-05, "loss": 0.3106, "step": 1609 }, { "epoch": 2.747440273037543, "grad_norm": 0.3574953575687224, "learning_rate": 2.0035740511734892e-05, "loss": 0.3368, "step": 1610 }, { "epoch": 2.749146757679181, "grad_norm": 0.3794964523542467, "learning_rate": 2.0011913509547983e-05, "loss": 0.3577, "step": 1611 }, { "epoch": 2.750853242320819, "grad_norm": 0.3306234888248439, "learning_rate": 1.9988086490452027e-05, "loss": 0.3071, "step": 1612 }, { "epoch": 2.752559726962457, "grad_norm": 0.3293412215545551, "learning_rate": 1.996425948826512e-05, "loss": 0.3205, "step": 1613 }, { "epoch": 2.7542662116040955, "grad_norm": 0.3504073187990615, "learning_rate": 1.9940432536805332e-05, "loss": 0.3686, "step": 1614 }, { "epoch": 2.755972696245734, "grad_norm": 0.38665914863114975, "learning_rate": 1.9916605669890662e-05, "loss": 0.3348, "step": 1615 }, { "epoch": 2.757679180887372, "grad_norm": 0.351258212938779, "learning_rate": 1.9892778921338994e-05, "loss": 0.3601, "step": 1616 }, { "epoch": 2.75938566552901, "grad_norm": 0.3397611048333248, "learning_rate": 1.986895232496803e-05, "loss": 0.3313, "step": 1617 }, { "epoch": 2.7610921501706485, "grad_norm": 0.3458145745845894, "learning_rate": 1.9845125914595283e-05, "loss": 0.3112, "step": 1618 }, { "epoch": 2.762798634812287, "grad_norm": 0.3659654358977534, "learning_rate": 1.9821299724037963e-05, "loss": 0.313, "step": 1619 }, { "epoch": 2.7645051194539247, "grad_norm": 0.38214107263163416, "learning_rate": 1.9797473787113004e-05, "loss": 0.3483, "step": 1620 }, { "epoch": 2.766211604095563, "grad_norm": 0.364102572664005, "learning_rate": 1.977364813763696e-05, "loss": 0.3597, "step": 1621 }, { "epoch": 2.7679180887372015, "grad_norm": 0.34318240802525257, "learning_rate": 1.9749822809425984e-05, "loss": 0.3345, "step": 1622 }, { "epoch": 2.76962457337884, "grad_norm": 0.3389410304007314, "learning_rate": 1.9725997836295774e-05, "loss": 0.341, "step": 1623 }, { "epoch": 2.7713310580204777, "grad_norm": 0.35781642598500735, "learning_rate": 1.9702173252061523e-05, "loss": 0.3224, "step": 1624 }, { "epoch": 2.773037542662116, "grad_norm": 0.3323242259119811, "learning_rate": 1.9678349090537863e-05, "loss": 0.3277, "step": 1625 }, { "epoch": 2.774744027303754, "grad_norm": 0.3864060483057039, "learning_rate": 1.9654525385538835e-05, "loss": 0.3512, "step": 1626 }, { "epoch": 2.7764505119453924, "grad_norm": 0.335503887449825, "learning_rate": 1.963070217087784e-05, "loss": 0.3151, "step": 1627 }, { "epoch": 2.7781569965870307, "grad_norm": 0.3273278255736187, "learning_rate": 1.9606879480367566e-05, "loss": 0.3143, "step": 1628 }, { "epoch": 2.779863481228669, "grad_norm": 0.31165697054945707, "learning_rate": 1.9583057347819966e-05, "loss": 0.356, "step": 1629 }, { "epoch": 2.781569965870307, "grad_norm": 0.7722818020380438, "learning_rate": 1.95592358070462e-05, "loss": 0.3595, "step": 1630 }, { "epoch": 2.7832764505119454, "grad_norm": 0.3569523719652423, "learning_rate": 1.9535414891856594e-05, "loss": 0.3354, "step": 1631 }, { "epoch": 2.7849829351535837, "grad_norm": 0.41552819027089377, "learning_rate": 1.9511594636060572e-05, "loss": 0.3636, "step": 1632 }, { "epoch": 2.7866894197952217, "grad_norm": 0.37048344346462647, "learning_rate": 1.9487775073466632e-05, "loss": 0.354, "step": 1633 }, { "epoch": 2.78839590443686, "grad_norm": 0.34630389475085166, "learning_rate": 1.9463956237882286e-05, "loss": 0.2968, "step": 1634 }, { "epoch": 2.7901023890784984, "grad_norm": 0.3428750144611517, "learning_rate": 1.944013816311402e-05, "loss": 0.3554, "step": 1635 }, { "epoch": 2.7918088737201368, "grad_norm": 0.35617944498917076, "learning_rate": 1.941632088296723e-05, "loss": 0.3153, "step": 1636 }, { "epoch": 2.7935153583617747, "grad_norm": 0.3612441828462749, "learning_rate": 1.93925044312462e-05, "loss": 0.3321, "step": 1637 }, { "epoch": 2.795221843003413, "grad_norm": 0.3656782507777216, "learning_rate": 1.9368688841754004e-05, "loss": 0.332, "step": 1638 }, { "epoch": 2.796928327645051, "grad_norm": 0.3753492388272683, "learning_rate": 1.9344874148292535e-05, "loss": 0.3284, "step": 1639 }, { "epoch": 2.7986348122866893, "grad_norm": 0.3832829508141039, "learning_rate": 1.9321060384662386e-05, "loss": 0.3678, "step": 1640 }, { "epoch": 2.8003412969283277, "grad_norm": 0.3266071892760176, "learning_rate": 1.929724758466284e-05, "loss": 0.3722, "step": 1641 }, { "epoch": 2.802047781569966, "grad_norm": 0.35389760032277373, "learning_rate": 1.9273435782091815e-05, "loss": 0.3357, "step": 1642 }, { "epoch": 2.803754266211604, "grad_norm": 0.40707048073722657, "learning_rate": 1.9249625010745814e-05, "loss": 0.3267, "step": 1643 }, { "epoch": 2.8054607508532423, "grad_norm": 0.3310891060298458, "learning_rate": 1.9225815304419856e-05, "loss": 0.3418, "step": 1644 }, { "epoch": 2.8071672354948807, "grad_norm": 0.4085721022507227, "learning_rate": 1.920200669690747e-05, "loss": 0.3156, "step": 1645 }, { "epoch": 2.8088737201365186, "grad_norm": 0.37154394826080867, "learning_rate": 1.9178199222000626e-05, "loss": 0.3323, "step": 1646 }, { "epoch": 2.810580204778157, "grad_norm": 0.36333177012949025, "learning_rate": 1.9154392913489677e-05, "loss": 0.3432, "step": 1647 }, { "epoch": 2.8122866894197953, "grad_norm": 0.382956873945717, "learning_rate": 1.9130587805163323e-05, "loss": 0.3371, "step": 1648 }, { "epoch": 2.8139931740614337, "grad_norm": 0.40629424241154394, "learning_rate": 1.9106783930808573e-05, "loss": 0.3458, "step": 1649 }, { "epoch": 2.8156996587030716, "grad_norm": 0.35089230236120816, "learning_rate": 1.908298132421065e-05, "loss": 0.3479, "step": 1650 }, { "epoch": 2.81740614334471, "grad_norm": 0.39715671994578994, "learning_rate": 1.9059180019153015e-05, "loss": 0.3408, "step": 1651 }, { "epoch": 2.819112627986348, "grad_norm": 0.34745493026514485, "learning_rate": 1.9035380049417257e-05, "loss": 0.3659, "step": 1652 }, { "epoch": 2.8208191126279862, "grad_norm": 0.3389434273750528, "learning_rate": 1.9011581448783098e-05, "loss": 0.3319, "step": 1653 }, { "epoch": 2.8225255972696246, "grad_norm": 0.38343909047401437, "learning_rate": 1.8987784251028284e-05, "loss": 0.337, "step": 1654 }, { "epoch": 2.824232081911263, "grad_norm": 0.35147212457480564, "learning_rate": 1.8963988489928594e-05, "loss": 0.3379, "step": 1655 }, { "epoch": 2.825938566552901, "grad_norm": 0.38603576817115554, "learning_rate": 1.894019419925775e-05, "loss": 0.3364, "step": 1656 }, { "epoch": 2.8276450511945392, "grad_norm": 0.37816149358219, "learning_rate": 1.89164014127874e-05, "loss": 0.3402, "step": 1657 }, { "epoch": 2.8293515358361776, "grad_norm": 0.3744403182409511, "learning_rate": 1.8892610164287048e-05, "loss": 0.3789, "step": 1658 }, { "epoch": 2.8310580204778155, "grad_norm": 0.34421138287281045, "learning_rate": 1.8868820487524022e-05, "loss": 0.3275, "step": 1659 }, { "epoch": 2.832764505119454, "grad_norm": 0.3817176866198686, "learning_rate": 1.884503241626342e-05, "loss": 0.3106, "step": 1660 }, { "epoch": 2.8344709897610922, "grad_norm": 0.38572147672185014, "learning_rate": 1.8821245984268065e-05, "loss": 0.3052, "step": 1661 }, { "epoch": 2.8361774744027306, "grad_norm": 0.3575545479916945, "learning_rate": 1.879746122529843e-05, "loss": 0.3366, "step": 1662 }, { "epoch": 2.8378839590443685, "grad_norm": 0.37463725801175146, "learning_rate": 1.877367817311264e-05, "loss": 0.2992, "step": 1663 }, { "epoch": 2.839590443686007, "grad_norm": 0.36923264203118955, "learning_rate": 1.8749896861466382e-05, "loss": 0.3733, "step": 1664 }, { "epoch": 2.841296928327645, "grad_norm": 0.7918302733905488, "learning_rate": 1.8726117324112888e-05, "loss": 0.415, "step": 1665 }, { "epoch": 2.843003412969283, "grad_norm": 0.3842100597264359, "learning_rate": 1.870233959480285e-05, "loss": 0.3697, "step": 1666 }, { "epoch": 2.8447098976109215, "grad_norm": 0.3682276116500696, "learning_rate": 1.8678563707284413e-05, "loss": 0.3252, "step": 1667 }, { "epoch": 2.84641638225256, "grad_norm": 0.34451700978454036, "learning_rate": 1.865478969530311e-05, "loss": 0.3486, "step": 1668 }, { "epoch": 2.848122866894198, "grad_norm": 0.3724225173806301, "learning_rate": 1.8631017592601785e-05, "loss": 0.3416, "step": 1669 }, { "epoch": 2.849829351535836, "grad_norm": 0.38269679188125283, "learning_rate": 1.8607247432920595e-05, "loss": 0.355, "step": 1670 }, { "epoch": 2.8515358361774745, "grad_norm": 0.3332971620180318, "learning_rate": 1.8583479249996938e-05, "loss": 0.3152, "step": 1671 }, { "epoch": 2.8532423208191124, "grad_norm": 0.3815614723642687, "learning_rate": 1.85597130775654e-05, "loss": 0.3375, "step": 1672 }, { "epoch": 2.854948805460751, "grad_norm": 0.37556318578572173, "learning_rate": 1.8535948949357713e-05, "loss": 0.3649, "step": 1673 }, { "epoch": 2.856655290102389, "grad_norm": 0.33362137910336215, "learning_rate": 1.8512186899102723e-05, "loss": 0.3464, "step": 1674 }, { "epoch": 2.8583617747440275, "grad_norm": 0.37871549210871974, "learning_rate": 1.8488426960526297e-05, "loss": 0.3092, "step": 1675 }, { "epoch": 2.8600682593856654, "grad_norm": 0.3569331714652457, "learning_rate": 1.8464669167351333e-05, "loss": 0.3316, "step": 1676 }, { "epoch": 2.861774744027304, "grad_norm": 0.3996301598780719, "learning_rate": 1.8440913553297666e-05, "loss": 0.3309, "step": 1677 }, { "epoch": 2.8634812286689417, "grad_norm": 0.3716298479462723, "learning_rate": 1.8417160152082053e-05, "loss": 0.3446, "step": 1678 }, { "epoch": 2.86518771331058, "grad_norm": 0.33360159229287817, "learning_rate": 1.8393408997418098e-05, "loss": 0.3164, "step": 1679 }, { "epoch": 2.8668941979522184, "grad_norm": 0.3524761121384744, "learning_rate": 1.8369660123016227e-05, "loss": 0.3283, "step": 1680 }, { "epoch": 2.868600682593857, "grad_norm": 0.3101343794372301, "learning_rate": 1.834591356258361e-05, "loss": 0.3213, "step": 1681 }, { "epoch": 2.8703071672354947, "grad_norm": 0.38322792602580813, "learning_rate": 1.8322169349824157e-05, "loss": 0.3386, "step": 1682 }, { "epoch": 2.872013651877133, "grad_norm": 0.3526255439636646, "learning_rate": 1.8298427518438433e-05, "loss": 0.3619, "step": 1683 }, { "epoch": 2.8737201365187715, "grad_norm": 0.34664419024831267, "learning_rate": 1.8274688102123622e-05, "loss": 0.333, "step": 1684 }, { "epoch": 2.8754266211604094, "grad_norm": 0.3696462343494238, "learning_rate": 1.825095113457349e-05, "loss": 0.327, "step": 1685 }, { "epoch": 2.8771331058020477, "grad_norm": 0.35472191617978754, "learning_rate": 1.822721664947832e-05, "loss": 0.3333, "step": 1686 }, { "epoch": 2.878839590443686, "grad_norm": 0.33826733032433637, "learning_rate": 1.8203484680524863e-05, "loss": 0.3613, "step": 1687 }, { "epoch": 2.8805460750853245, "grad_norm": 0.3063764356850511, "learning_rate": 1.8179755261396318e-05, "loss": 0.3553, "step": 1688 }, { "epoch": 2.8822525597269624, "grad_norm": 0.35193178090811733, "learning_rate": 1.815602842577225e-05, "loss": 0.3063, "step": 1689 }, { "epoch": 2.8839590443686007, "grad_norm": 0.3516029177930882, "learning_rate": 1.8132304207328566e-05, "loss": 0.3261, "step": 1690 }, { "epoch": 2.8856655290102387, "grad_norm": 0.3456046758424527, "learning_rate": 1.8108582639737455e-05, "loss": 0.2941, "step": 1691 }, { "epoch": 2.887372013651877, "grad_norm": 0.334817789772746, "learning_rate": 1.808486375666734e-05, "loss": 0.3105, "step": 1692 }, { "epoch": 2.8890784982935154, "grad_norm": 0.38880289358555925, "learning_rate": 1.8061147591782842e-05, "loss": 0.2991, "step": 1693 }, { "epoch": 2.8907849829351537, "grad_norm": 0.38970751626981504, "learning_rate": 1.8037434178744712e-05, "loss": 0.3139, "step": 1694 }, { "epoch": 2.8924914675767917, "grad_norm": 0.36346955054676744, "learning_rate": 1.801372355120981e-05, "loss": 0.3581, "step": 1695 }, { "epoch": 2.89419795221843, "grad_norm": 0.3554126585708228, "learning_rate": 1.799001574283103e-05, "loss": 0.3126, "step": 1696 }, { "epoch": 2.8959044368600684, "grad_norm": 0.35115634959577685, "learning_rate": 1.796631078725727e-05, "loss": 0.3256, "step": 1697 }, { "epoch": 2.8976109215017063, "grad_norm": 0.3568065378240726, "learning_rate": 1.794260871813339e-05, "loss": 0.3343, "step": 1698 }, { "epoch": 2.8993174061433447, "grad_norm": 0.34053271734915225, "learning_rate": 1.7918909569100126e-05, "loss": 0.3137, "step": 1699 }, { "epoch": 2.901023890784983, "grad_norm": 0.2999742909350015, "learning_rate": 1.789521337379409e-05, "loss": 0.3283, "step": 1700 }, { "epoch": 2.9027303754266214, "grad_norm": 0.379850473840393, "learning_rate": 1.7871520165847704e-05, "loss": 0.3117, "step": 1701 }, { "epoch": 2.9044368600682593, "grad_norm": 0.4541116141212034, "learning_rate": 1.7847829978889134e-05, "loss": 0.346, "step": 1702 }, { "epoch": 2.9061433447098977, "grad_norm": 0.3513194920970446, "learning_rate": 1.782414284654227e-05, "loss": 0.3692, "step": 1703 }, { "epoch": 2.9078498293515356, "grad_norm": 0.3433079443982604, "learning_rate": 1.780045880242667e-05, "loss": 0.3309, "step": 1704 }, { "epoch": 2.909556313993174, "grad_norm": 0.3711508304255128, "learning_rate": 1.77767778801575e-05, "loss": 0.3495, "step": 1705 }, { "epoch": 2.9112627986348123, "grad_norm": 0.33974962485669125, "learning_rate": 1.7753100113345495e-05, "loss": 0.3435, "step": 1706 }, { "epoch": 2.9129692832764507, "grad_norm": 0.35878700876429676, "learning_rate": 1.7729425535596915e-05, "loss": 0.3442, "step": 1707 }, { "epoch": 2.9146757679180886, "grad_norm": 0.3514467106273154, "learning_rate": 1.7705754180513492e-05, "loss": 0.3625, "step": 1708 }, { "epoch": 2.916382252559727, "grad_norm": 0.3266246564680988, "learning_rate": 1.7682086081692384e-05, "loss": 0.3608, "step": 1709 }, { "epoch": 2.9180887372013653, "grad_norm": 0.3870515552127354, "learning_rate": 1.7658421272726135e-05, "loss": 0.328, "step": 1710 }, { "epoch": 2.919795221843003, "grad_norm": 0.3295969332551737, "learning_rate": 1.7634759787202616e-05, "loss": 0.3068, "step": 1711 }, { "epoch": 2.9215017064846416, "grad_norm": 0.3576330018796421, "learning_rate": 1.7611101658704957e-05, "loss": 0.3187, "step": 1712 }, { "epoch": 2.92320819112628, "grad_norm": 0.3390664369715398, "learning_rate": 1.7587446920811563e-05, "loss": 0.3558, "step": 1713 }, { "epoch": 2.9249146757679183, "grad_norm": 0.34732048934447574, "learning_rate": 1.7563795607096e-05, "loss": 0.3278, "step": 1714 }, { "epoch": 2.926621160409556, "grad_norm": 0.35650434645488815, "learning_rate": 1.7540147751126988e-05, "loss": 0.329, "step": 1715 }, { "epoch": 2.9283276450511946, "grad_norm": 0.36947783277043117, "learning_rate": 1.7516503386468332e-05, "loss": 0.3957, "step": 1716 }, { "epoch": 2.9300341296928325, "grad_norm": 0.3782148284432186, "learning_rate": 1.7492862546678885e-05, "loss": 0.3592, "step": 1717 }, { "epoch": 2.931740614334471, "grad_norm": 0.35540209250899896, "learning_rate": 1.7469225265312485e-05, "loss": 0.3571, "step": 1718 }, { "epoch": 2.9334470989761092, "grad_norm": 0.34653886642231185, "learning_rate": 1.744559157591793e-05, "loss": 0.3562, "step": 1719 }, { "epoch": 2.9351535836177476, "grad_norm": 0.3258636850661726, "learning_rate": 1.7421961512038935e-05, "loss": 0.3267, "step": 1720 }, { "epoch": 2.9368600682593855, "grad_norm": 0.34482993782625104, "learning_rate": 1.739833510721404e-05, "loss": 0.3169, "step": 1721 }, { "epoch": 2.938566552901024, "grad_norm": 0.37757768964280464, "learning_rate": 1.737471239497661e-05, "loss": 0.322, "step": 1722 }, { "epoch": 2.9402730375426622, "grad_norm": 0.34239014533985274, "learning_rate": 1.7351093408854772e-05, "loss": 0.33, "step": 1723 }, { "epoch": 2.9419795221843, "grad_norm": 0.3646764896000102, "learning_rate": 1.7327478182371336e-05, "loss": 0.3115, "step": 1724 }, { "epoch": 2.9436860068259385, "grad_norm": 0.3231747953703291, "learning_rate": 1.7303866749043814e-05, "loss": 0.3576, "step": 1725 }, { "epoch": 2.945392491467577, "grad_norm": 0.344845352925776, "learning_rate": 1.728025914238431e-05, "loss": 0.3588, "step": 1726 }, { "epoch": 2.9470989761092152, "grad_norm": 0.35901698400371646, "learning_rate": 1.7256655395899504e-05, "loss": 0.3351, "step": 1727 }, { "epoch": 2.948805460750853, "grad_norm": 0.35284111975516075, "learning_rate": 1.7233055543090603e-05, "loss": 0.3909, "step": 1728 }, { "epoch": 2.9505119453924915, "grad_norm": 0.32083376472315694, "learning_rate": 1.7209459617453286e-05, "loss": 0.3374, "step": 1729 }, { "epoch": 2.9522184300341294, "grad_norm": 0.36171096746041753, "learning_rate": 1.7185867652477635e-05, "loss": 0.3213, "step": 1730 }, { "epoch": 2.953924914675768, "grad_norm": 0.36505654380300484, "learning_rate": 1.716227968164814e-05, "loss": 0.3379, "step": 1731 }, { "epoch": 2.955631399317406, "grad_norm": 0.40374314394985816, "learning_rate": 1.713869573844361e-05, "loss": 0.3448, "step": 1732 }, { "epoch": 2.9573378839590445, "grad_norm": 0.33805725343076615, "learning_rate": 1.7115115856337136e-05, "loss": 0.3998, "step": 1733 }, { "epoch": 2.9590443686006824, "grad_norm": 0.34102697790356706, "learning_rate": 1.7091540068796057e-05, "loss": 0.3498, "step": 1734 }, { "epoch": 2.960750853242321, "grad_norm": 0.3040015821061206, "learning_rate": 1.7067968409281884e-05, "loss": 0.3166, "step": 1735 }, { "epoch": 2.962457337883959, "grad_norm": 0.3590008059595252, "learning_rate": 1.704440091125029e-05, "loss": 0.3588, "step": 1736 }, { "epoch": 2.964163822525597, "grad_norm": 0.3493592992297364, "learning_rate": 1.7020837608151e-05, "loss": 0.3254, "step": 1737 }, { "epoch": 2.9658703071672354, "grad_norm": 0.31100361374726077, "learning_rate": 1.6997278533427835e-05, "loss": 0.3252, "step": 1738 }, { "epoch": 2.967576791808874, "grad_norm": 0.3598789653436826, "learning_rate": 1.6973723720518588e-05, "loss": 0.3709, "step": 1739 }, { "epoch": 2.969283276450512, "grad_norm": 0.34279502689764196, "learning_rate": 1.6950173202854998e-05, "loss": 0.2954, "step": 1740 }, { "epoch": 2.97098976109215, "grad_norm": 0.32542441687699925, "learning_rate": 1.692662701386273e-05, "loss": 0.3274, "step": 1741 }, { "epoch": 2.9726962457337884, "grad_norm": 0.3601029282375435, "learning_rate": 1.69030851869613e-05, "loss": 0.3437, "step": 1742 }, { "epoch": 2.9744027303754264, "grad_norm": 0.34962987836091736, "learning_rate": 1.6879547755564002e-05, "loss": 0.3137, "step": 1743 }, { "epoch": 2.9761092150170647, "grad_norm": 0.34961152926425815, "learning_rate": 1.6856014753077926e-05, "loss": 0.3414, "step": 1744 }, { "epoch": 2.977815699658703, "grad_norm": 0.36859741563002496, "learning_rate": 1.6832486212903866e-05, "loss": 0.3391, "step": 1745 }, { "epoch": 2.9795221843003414, "grad_norm": 0.3765022977379053, "learning_rate": 1.6808962168436283e-05, "loss": 0.3483, "step": 1746 }, { "epoch": 2.98122866894198, "grad_norm": 0.3677976039002658, "learning_rate": 1.6785442653063248e-05, "loss": 0.3081, "step": 1747 }, { "epoch": 2.9829351535836177, "grad_norm": 0.3467096282103713, "learning_rate": 1.6761927700166426e-05, "loss": 0.2911, "step": 1748 }, { "epoch": 2.984641638225256, "grad_norm": 0.3324479185783913, "learning_rate": 1.6738417343120977e-05, "loss": 0.3371, "step": 1749 }, { "epoch": 2.986348122866894, "grad_norm": 0.3853773281218314, "learning_rate": 1.6714911615295556e-05, "loss": 0.3381, "step": 1750 }, { "epoch": 2.9880546075085324, "grad_norm": 0.3832177388719795, "learning_rate": 1.6691410550052247e-05, "loss": 0.3019, "step": 1751 }, { "epoch": 2.9897610921501707, "grad_norm": 0.4141979351193689, "learning_rate": 1.6667914180746512e-05, "loss": 0.296, "step": 1752 }, { "epoch": 2.991467576791809, "grad_norm": 0.3230747420619479, "learning_rate": 1.664442254072715e-05, "loss": 0.3179, "step": 1753 }, { "epoch": 2.993174061433447, "grad_norm": 0.3696078043835276, "learning_rate": 1.6620935663336256e-05, "loss": 0.3791, "step": 1754 }, { "epoch": 2.9948805460750854, "grad_norm": 0.3503901883425271, "learning_rate": 1.659745358190914e-05, "loss": 0.3818, "step": 1755 }, { "epoch": 2.9965870307167233, "grad_norm": 0.31827258026790567, "learning_rate": 1.6573976329774333e-05, "loss": 0.316, "step": 1756 }, { "epoch": 2.9982935153583616, "grad_norm": 0.35080157401751505, "learning_rate": 1.6550503940253495e-05, "loss": 0.318, "step": 1757 }, { "epoch": 3.0, "grad_norm": 0.3239978521112306, "learning_rate": 1.6527036446661396e-05, "loss": 0.3189, "step": 1758 }, { "epoch": 3.0017064846416384, "grad_norm": 0.5023542824981905, "learning_rate": 1.6503573882305844e-05, "loss": 0.2291, "step": 1759 }, { "epoch": 3.0034129692832763, "grad_norm": 0.3622111307251838, "learning_rate": 1.6480116280487668e-05, "loss": 0.261, "step": 1760 }, { "epoch": 3.0051194539249146, "grad_norm": 0.4661124368191468, "learning_rate": 1.6456663674500627e-05, "loss": 0.2145, "step": 1761 }, { "epoch": 3.006825938566553, "grad_norm": 0.5106325649604367, "learning_rate": 1.643321609763142e-05, "loss": 0.2429, "step": 1762 }, { "epoch": 3.008532423208191, "grad_norm": 0.3938572896434494, "learning_rate": 1.6409773583159588e-05, "loss": 0.2594, "step": 1763 }, { "epoch": 3.0102389078498293, "grad_norm": 0.430218824863444, "learning_rate": 1.638633616435749e-05, "loss": 0.2301, "step": 1764 }, { "epoch": 3.0119453924914676, "grad_norm": 0.44749877614295497, "learning_rate": 1.6362903874490263e-05, "loss": 0.2239, "step": 1765 }, { "epoch": 3.013651877133106, "grad_norm": 0.3865086318406251, "learning_rate": 1.6339476746815756e-05, "loss": 0.2755, "step": 1766 }, { "epoch": 3.015358361774744, "grad_norm": 0.3662040320567563, "learning_rate": 1.6316054814584483e-05, "loss": 0.2271, "step": 1767 }, { "epoch": 3.0170648464163823, "grad_norm": 0.39800939298770655, "learning_rate": 1.6292638111039597e-05, "loss": 0.2185, "step": 1768 }, { "epoch": 3.0187713310580206, "grad_norm": 0.4240591084534608, "learning_rate": 1.6269226669416824e-05, "loss": 0.2504, "step": 1769 }, { "epoch": 3.0204778156996586, "grad_norm": 0.37156845230020247, "learning_rate": 1.6245820522944427e-05, "loss": 0.2239, "step": 1770 }, { "epoch": 3.022184300341297, "grad_norm": 0.37450054665593224, "learning_rate": 1.6222419704843154e-05, "loss": 0.2453, "step": 1771 }, { "epoch": 3.0238907849829353, "grad_norm": 0.3278205522266145, "learning_rate": 1.6199024248326175e-05, "loss": 0.2295, "step": 1772 }, { "epoch": 3.025597269624573, "grad_norm": 0.36357733046965546, "learning_rate": 1.6175634186599076e-05, "loss": 0.2129, "step": 1773 }, { "epoch": 3.0273037542662116, "grad_norm": 0.3563255311678045, "learning_rate": 1.6152249552859758e-05, "loss": 0.2638, "step": 1774 }, { "epoch": 3.02901023890785, "grad_norm": 0.3420762406482644, "learning_rate": 1.6128870380298436e-05, "loss": 0.2645, "step": 1775 }, { "epoch": 3.030716723549488, "grad_norm": 0.493384909929465, "learning_rate": 1.610549670209757e-05, "loss": 0.2558, "step": 1776 }, { "epoch": 3.032423208191126, "grad_norm": 0.3565257496434803, "learning_rate": 1.6082128551431818e-05, "loss": 0.265, "step": 1777 }, { "epoch": 3.0341296928327646, "grad_norm": 0.333066759624748, "learning_rate": 1.6058765961468e-05, "loss": 0.2147, "step": 1778 }, { "epoch": 3.035836177474403, "grad_norm": 0.46879563123055723, "learning_rate": 1.6035408965365043e-05, "loss": 0.2278, "step": 1779 }, { "epoch": 3.037542662116041, "grad_norm": 0.35351271163216785, "learning_rate": 1.6012057596273923e-05, "loss": 0.2514, "step": 1780 }, { "epoch": 3.039249146757679, "grad_norm": 0.33527172164589053, "learning_rate": 1.5988711887337643e-05, "loss": 0.2607, "step": 1781 }, { "epoch": 3.0409556313993176, "grad_norm": 0.39044995830132523, "learning_rate": 1.596537187169116e-05, "loss": 0.2226, "step": 1782 }, { "epoch": 3.0426621160409555, "grad_norm": 0.3892517718830978, "learning_rate": 1.594203758246136e-05, "loss": 0.243, "step": 1783 }, { "epoch": 3.044368600682594, "grad_norm": 0.3495506564595455, "learning_rate": 1.5918709052767004e-05, "loss": 0.2549, "step": 1784 }, { "epoch": 3.046075085324232, "grad_norm": 0.3506167624324571, "learning_rate": 1.5895386315718675e-05, "loss": 0.2154, "step": 1785 }, { "epoch": 3.04778156996587, "grad_norm": 0.390251454799263, "learning_rate": 1.587206940441872e-05, "loss": 0.2167, "step": 1786 }, { "epoch": 3.0494880546075085, "grad_norm": 0.32700113070011627, "learning_rate": 1.584875835196124e-05, "loss": 0.2425, "step": 1787 }, { "epoch": 3.051194539249147, "grad_norm": 0.36416392434798417, "learning_rate": 1.5825453191432e-05, "loss": 0.23, "step": 1788 }, { "epoch": 3.0529010238907848, "grad_norm": 0.3494229842343675, "learning_rate": 1.5802153955908425e-05, "loss": 0.2101, "step": 1789 }, { "epoch": 3.054607508532423, "grad_norm": 0.35812200045695386, "learning_rate": 1.577886067845951e-05, "loss": 0.2645, "step": 1790 }, { "epoch": 3.0563139931740615, "grad_norm": 0.3669156298724796, "learning_rate": 1.5755573392145814e-05, "loss": 0.245, "step": 1791 }, { "epoch": 3.0580204778157, "grad_norm": 0.3750095044042606, "learning_rate": 1.573229213001936e-05, "loss": 0.2709, "step": 1792 }, { "epoch": 3.0597269624573378, "grad_norm": 0.3634012555607957, "learning_rate": 1.5709016925123658e-05, "loss": 0.2636, "step": 1793 }, { "epoch": 3.061433447098976, "grad_norm": 0.3292429504701851, "learning_rate": 1.5685747810493596e-05, "loss": 0.2754, "step": 1794 }, { "epoch": 3.0631399317406145, "grad_norm": 0.34796794426658706, "learning_rate": 1.5662484819155434e-05, "loss": 0.2431, "step": 1795 }, { "epoch": 3.0648464163822524, "grad_norm": 0.3916271190395842, "learning_rate": 1.5639227984126722e-05, "loss": 0.2253, "step": 1796 }, { "epoch": 3.0665529010238908, "grad_norm": 0.3617107429258443, "learning_rate": 1.5615977338416305e-05, "loss": 0.2331, "step": 1797 }, { "epoch": 3.068259385665529, "grad_norm": 0.33604907347977464, "learning_rate": 1.55927329150242e-05, "loss": 0.2338, "step": 1798 }, { "epoch": 3.069965870307167, "grad_norm": 0.35007626103547385, "learning_rate": 1.5569494746941613e-05, "loss": 0.2087, "step": 1799 }, { "epoch": 3.0716723549488054, "grad_norm": 0.33924243027213385, "learning_rate": 1.5546262867150888e-05, "loss": 0.2349, "step": 1800 }, { "epoch": 3.073378839590444, "grad_norm": 0.3241741854770072, "learning_rate": 1.5523037308625424e-05, "loss": 0.2304, "step": 1801 }, { "epoch": 3.0750853242320817, "grad_norm": 0.3261934992254656, "learning_rate": 1.549981810432965e-05, "loss": 0.2494, "step": 1802 }, { "epoch": 3.07679180887372, "grad_norm": 0.3559673508201443, "learning_rate": 1.5476605287218997e-05, "loss": 0.2463, "step": 1803 }, { "epoch": 3.0784982935153584, "grad_norm": 0.3685695289688876, "learning_rate": 1.5453398890239784e-05, "loss": 0.239, "step": 1804 }, { "epoch": 3.080204778156997, "grad_norm": 0.4127238074822048, "learning_rate": 1.5430198946329266e-05, "loss": 0.256, "step": 1805 }, { "epoch": 3.0819112627986347, "grad_norm": 0.35952788137173813, "learning_rate": 1.540700548841551e-05, "loss": 0.2453, "step": 1806 }, { "epoch": 3.083617747440273, "grad_norm": 0.369170728501119, "learning_rate": 1.5383818549417397e-05, "loss": 0.2218, "step": 1807 }, { "epoch": 3.0853242320819114, "grad_norm": 0.3575433147530976, "learning_rate": 1.536063816224454e-05, "loss": 0.2065, "step": 1808 }, { "epoch": 3.0870307167235493, "grad_norm": 0.3343538027842782, "learning_rate": 1.533746435979726e-05, "loss": 0.2187, "step": 1809 }, { "epoch": 3.0887372013651877, "grad_norm": 0.3685384119739101, "learning_rate": 1.5314297174966543e-05, "loss": 0.2275, "step": 1810 }, { "epoch": 3.090443686006826, "grad_norm": 0.37265269793468075, "learning_rate": 1.529113664063395e-05, "loss": 0.2544, "step": 1811 }, { "epoch": 3.092150170648464, "grad_norm": 0.34520980463000883, "learning_rate": 1.5267982789671636e-05, "loss": 0.2696, "step": 1812 }, { "epoch": 3.0938566552901023, "grad_norm": 0.3744550897419855, "learning_rate": 1.5244835654942252e-05, "loss": 0.2778, "step": 1813 }, { "epoch": 3.0955631399317407, "grad_norm": 0.3508486292671562, "learning_rate": 1.5221695269298918e-05, "loss": 0.2563, "step": 1814 }, { "epoch": 3.0972696245733786, "grad_norm": 0.33880674873198074, "learning_rate": 1.5198561665585192e-05, "loss": 0.2554, "step": 1815 }, { "epoch": 3.098976109215017, "grad_norm": 0.37550221100309866, "learning_rate": 1.5175434876634994e-05, "loss": 0.2367, "step": 1816 }, { "epoch": 3.1006825938566553, "grad_norm": 0.3547039204864811, "learning_rate": 1.5152314935272556e-05, "loss": 0.3123, "step": 1817 }, { "epoch": 3.1023890784982937, "grad_norm": 0.31596627546541817, "learning_rate": 1.5129201874312414e-05, "loss": 0.2216, "step": 1818 }, { "epoch": 3.1040955631399316, "grad_norm": 0.36454602271429526, "learning_rate": 1.5106095726559328e-05, "loss": 0.2528, "step": 1819 }, { "epoch": 3.10580204778157, "grad_norm": 0.3710957779992305, "learning_rate": 1.5082996524808251e-05, "loss": 0.234, "step": 1820 }, { "epoch": 3.1075085324232083, "grad_norm": 0.3640748644132522, "learning_rate": 1.5059904301844272e-05, "loss": 0.2506, "step": 1821 }, { "epoch": 3.1092150170648463, "grad_norm": 0.37604614593850916, "learning_rate": 1.5036819090442594e-05, "loss": 0.2482, "step": 1822 }, { "epoch": 3.1109215017064846, "grad_norm": 0.3408757814876876, "learning_rate": 1.501374092336843e-05, "loss": 0.2459, "step": 1823 }, { "epoch": 3.112627986348123, "grad_norm": 0.3591411983387152, "learning_rate": 1.4990669833377025e-05, "loss": 0.2352, "step": 1824 }, { "epoch": 3.114334470989761, "grad_norm": 0.32486352500740007, "learning_rate": 1.4967605853213573e-05, "loss": 0.2346, "step": 1825 }, { "epoch": 3.1160409556313993, "grad_norm": 0.36102677067398153, "learning_rate": 1.4944549015613175e-05, "loss": 0.2392, "step": 1826 }, { "epoch": 3.1177474402730376, "grad_norm": 0.32565962763060924, "learning_rate": 1.4921499353300795e-05, "loss": 0.2402, "step": 1827 }, { "epoch": 3.1194539249146755, "grad_norm": 0.35285773270868653, "learning_rate": 1.4898456898991216e-05, "loss": 0.2221, "step": 1828 }, { "epoch": 3.121160409556314, "grad_norm": 0.3509794023449979, "learning_rate": 1.487542168538898e-05, "loss": 0.2344, "step": 1829 }, { "epoch": 3.1228668941979523, "grad_norm": 0.3503379450918871, "learning_rate": 1.4852393745188365e-05, "loss": 0.2318, "step": 1830 }, { "epoch": 3.1245733788395906, "grad_norm": 0.3570192207186589, "learning_rate": 1.4829373111073318e-05, "loss": 0.2502, "step": 1831 }, { "epoch": 3.1262798634812285, "grad_norm": 0.36819319677663465, "learning_rate": 1.4806359815717416e-05, "loss": 0.2375, "step": 1832 }, { "epoch": 3.127986348122867, "grad_norm": 0.3421945346630767, "learning_rate": 1.4783353891783829e-05, "loss": 0.2418, "step": 1833 }, { "epoch": 3.1296928327645053, "grad_norm": 0.33862533559137625, "learning_rate": 1.4760355371925257e-05, "loss": 0.2483, "step": 1834 }, { "epoch": 3.131399317406143, "grad_norm": 0.38210124875299717, "learning_rate": 1.4737364288783888e-05, "loss": 0.2633, "step": 1835 }, { "epoch": 3.1331058020477816, "grad_norm": 0.3530735041752546, "learning_rate": 1.4714380674991362e-05, "loss": 0.2592, "step": 1836 }, { "epoch": 3.13481228668942, "grad_norm": 0.3502007412316067, "learning_rate": 1.4691404563168714e-05, "loss": 0.2427, "step": 1837 }, { "epoch": 3.136518771331058, "grad_norm": 0.33108415507707345, "learning_rate": 1.4668435985926333e-05, "loss": 0.2272, "step": 1838 }, { "epoch": 3.138225255972696, "grad_norm": 0.3837508654592469, "learning_rate": 1.4645474975863914e-05, "loss": 0.2858, "step": 1839 }, { "epoch": 3.1399317406143346, "grad_norm": 0.3649616719606301, "learning_rate": 1.4622521565570416e-05, "loss": 0.2362, "step": 1840 }, { "epoch": 3.1416382252559725, "grad_norm": 0.3615495063252914, "learning_rate": 1.4599575787623996e-05, "loss": 0.2286, "step": 1841 }, { "epoch": 3.143344709897611, "grad_norm": 0.3403965739967832, "learning_rate": 1.4576637674591994e-05, "loss": 0.2601, "step": 1842 }, { "epoch": 3.145051194539249, "grad_norm": 0.35068584616614407, "learning_rate": 1.4553707259030868e-05, "loss": 0.2581, "step": 1843 }, { "epoch": 3.1467576791808876, "grad_norm": 0.37626957059735494, "learning_rate": 1.4530784573486145e-05, "loss": 0.2172, "step": 1844 }, { "epoch": 3.1484641638225255, "grad_norm": 0.3379602035732661, "learning_rate": 1.4507869650492388e-05, "loss": 0.2283, "step": 1845 }, { "epoch": 3.150170648464164, "grad_norm": 0.3446202357655234, "learning_rate": 1.4484962522573139e-05, "loss": 0.2756, "step": 1846 }, { "epoch": 3.151877133105802, "grad_norm": 0.3587557392379511, "learning_rate": 1.4462063222240876e-05, "loss": 0.2863, "step": 1847 }, { "epoch": 3.15358361774744, "grad_norm": 0.34599281859684994, "learning_rate": 1.4439171781996963e-05, "loss": 0.2164, "step": 1848 }, { "epoch": 3.1552901023890785, "grad_norm": 0.36023909082037764, "learning_rate": 1.4416288234331619e-05, "loss": 0.2281, "step": 1849 }, { "epoch": 3.156996587030717, "grad_norm": 0.32250540221869584, "learning_rate": 1.439341261172385e-05, "loss": 0.2823, "step": 1850 }, { "epoch": 3.1587030716723548, "grad_norm": 0.36124174178421486, "learning_rate": 1.4370544946641417e-05, "loss": 0.2247, "step": 1851 }, { "epoch": 3.160409556313993, "grad_norm": 0.3452908730969287, "learning_rate": 1.4347685271540796e-05, "loss": 0.2418, "step": 1852 }, { "epoch": 3.1621160409556315, "grad_norm": 0.33620773915850727, "learning_rate": 1.4324833618867109e-05, "loss": 0.219, "step": 1853 }, { "epoch": 3.1638225255972694, "grad_norm": 0.33711611163939365, "learning_rate": 1.4301990021054097e-05, "loss": 0.2226, "step": 1854 }, { "epoch": 3.1655290102389078, "grad_norm": 0.3348941686990772, "learning_rate": 1.4279154510524067e-05, "loss": 0.2029, "step": 1855 }, { "epoch": 3.167235494880546, "grad_norm": 0.3653135426903386, "learning_rate": 1.4256327119687856e-05, "loss": 0.2376, "step": 1856 }, { "epoch": 3.1689419795221845, "grad_norm": 0.3361146834254551, "learning_rate": 1.4233507880944763e-05, "loss": 0.2553, "step": 1857 }, { "epoch": 3.1706484641638224, "grad_norm": 0.359233530077925, "learning_rate": 1.4210696826682528e-05, "loss": 0.271, "step": 1858 }, { "epoch": 3.1723549488054608, "grad_norm": 0.34714011682073453, "learning_rate": 1.4187893989277276e-05, "loss": 0.2314, "step": 1859 }, { "epoch": 3.174061433447099, "grad_norm": 0.34784780024400985, "learning_rate": 1.4165099401093451e-05, "loss": 0.2265, "step": 1860 }, { "epoch": 3.175767918088737, "grad_norm": 0.3348934881845762, "learning_rate": 1.4142313094483809e-05, "loss": 0.2383, "step": 1861 }, { "epoch": 3.1774744027303754, "grad_norm": 0.37631062261917697, "learning_rate": 1.4119535101789343e-05, "loss": 0.248, "step": 1862 }, { "epoch": 3.1791808873720138, "grad_norm": 0.35434134240920667, "learning_rate": 1.409676545533925e-05, "loss": 0.247, "step": 1863 }, { "epoch": 3.1808873720136517, "grad_norm": 0.31177329554546984, "learning_rate": 1.4074004187450875e-05, "loss": 0.2699, "step": 1864 }, { "epoch": 3.18259385665529, "grad_norm": 0.3692566348232966, "learning_rate": 1.4051251330429687e-05, "loss": 0.2614, "step": 1865 }, { "epoch": 3.1843003412969284, "grad_norm": 0.3591545979896418, "learning_rate": 1.402850691656918e-05, "loss": 0.222, "step": 1866 }, { "epoch": 3.1860068259385663, "grad_norm": 0.32658477622950105, "learning_rate": 1.4005770978150908e-05, "loss": 0.2384, "step": 1867 }, { "epoch": 3.1877133105802047, "grad_norm": 0.34270865867485484, "learning_rate": 1.3983043547444372e-05, "loss": 0.2669, "step": 1868 }, { "epoch": 3.189419795221843, "grad_norm": 0.3266859760124682, "learning_rate": 1.3960324656707007e-05, "loss": 0.235, "step": 1869 }, { "epoch": 3.1911262798634814, "grad_norm": 0.36711686303123836, "learning_rate": 1.3937614338184118e-05, "loss": 0.2142, "step": 1870 }, { "epoch": 3.1928327645051193, "grad_norm": 0.3429333059841405, "learning_rate": 1.3914912624108859e-05, "loss": 0.2263, "step": 1871 }, { "epoch": 3.1945392491467577, "grad_norm": 0.3384884011293629, "learning_rate": 1.3892219546702146e-05, "loss": 0.2171, "step": 1872 }, { "epoch": 3.196245733788396, "grad_norm": 0.3275230728976038, "learning_rate": 1.386953513817265e-05, "loss": 0.2254, "step": 1873 }, { "epoch": 3.197952218430034, "grad_norm": 0.33125861049669947, "learning_rate": 1.3846859430716754e-05, "loss": 0.2393, "step": 1874 }, { "epoch": 3.1996587030716723, "grad_norm": 0.3172722221260696, "learning_rate": 1.3824192456518473e-05, "loss": 0.2813, "step": 1875 }, { "epoch": 3.2013651877133107, "grad_norm": 0.3227587267254146, "learning_rate": 1.3801534247749429e-05, "loss": 0.2456, "step": 1876 }, { "epoch": 3.2030716723549486, "grad_norm": 0.3382551334984173, "learning_rate": 1.3778884836568805e-05, "loss": 0.2408, "step": 1877 }, { "epoch": 3.204778156996587, "grad_norm": 0.34909231303508853, "learning_rate": 1.3756244255123306e-05, "loss": 0.2433, "step": 1878 }, { "epoch": 3.2064846416382253, "grad_norm": 0.3525890311139006, "learning_rate": 1.3733612535547079e-05, "loss": 0.2273, "step": 1879 }, { "epoch": 3.2081911262798632, "grad_norm": 0.4012175842687751, "learning_rate": 1.3710989709961715e-05, "loss": 0.2228, "step": 1880 }, { "epoch": 3.2098976109215016, "grad_norm": 0.34042317206269157, "learning_rate": 1.3688375810476187e-05, "loss": 0.2442, "step": 1881 }, { "epoch": 3.21160409556314, "grad_norm": 0.2984314761610252, "learning_rate": 1.3665770869186786e-05, "loss": 0.2519, "step": 1882 }, { "epoch": 3.2133105802047783, "grad_norm": 0.36221064689138793, "learning_rate": 1.3643174918177087e-05, "loss": 0.2739, "step": 1883 }, { "epoch": 3.2150170648464163, "grad_norm": 0.34011365766330337, "learning_rate": 1.3620587989517923e-05, "loss": 0.2128, "step": 1884 }, { "epoch": 3.2167235494880546, "grad_norm": 0.3374640431132146, "learning_rate": 1.3598010115267291e-05, "loss": 0.2466, "step": 1885 }, { "epoch": 3.218430034129693, "grad_norm": 0.36025845239555393, "learning_rate": 1.3575441327470355e-05, "loss": 0.2444, "step": 1886 }, { "epoch": 3.220136518771331, "grad_norm": 0.34504373418480144, "learning_rate": 1.3552881658159387e-05, "loss": 0.2134, "step": 1887 }, { "epoch": 3.2218430034129693, "grad_norm": 0.3602740625682377, "learning_rate": 1.3530331139353714e-05, "loss": 0.2213, "step": 1888 }, { "epoch": 3.2235494880546076, "grad_norm": 0.3733850968683052, "learning_rate": 1.3507789803059668e-05, "loss": 0.2458, "step": 1889 }, { "epoch": 3.2252559726962455, "grad_norm": 0.34137926019999193, "learning_rate": 1.3485257681270566e-05, "loss": 0.2335, "step": 1890 }, { "epoch": 3.226962457337884, "grad_norm": 0.34130405558457455, "learning_rate": 1.3462734805966613e-05, "loss": 0.2215, "step": 1891 }, { "epoch": 3.2286689419795223, "grad_norm": 0.3575521878547094, "learning_rate": 1.3440221209114923e-05, "loss": 0.2184, "step": 1892 }, { "epoch": 3.2303754266211606, "grad_norm": 0.3551736749480817, "learning_rate": 1.3417716922669426e-05, "loss": 0.2428, "step": 1893 }, { "epoch": 3.2320819112627985, "grad_norm": 0.3446580440186442, "learning_rate": 1.3395221978570838e-05, "loss": 0.2775, "step": 1894 }, { "epoch": 3.233788395904437, "grad_norm": 0.3627323605040583, "learning_rate": 1.3372736408746621e-05, "loss": 0.2369, "step": 1895 }, { "epoch": 3.2354948805460753, "grad_norm": 0.3498572644209508, "learning_rate": 1.3350260245110937e-05, "loss": 0.2322, "step": 1896 }, { "epoch": 3.237201365187713, "grad_norm": 0.36722623239785107, "learning_rate": 1.3327793519564578e-05, "loss": 0.2467, "step": 1897 }, { "epoch": 3.2389078498293515, "grad_norm": 0.34721827634004465, "learning_rate": 1.330533626399495e-05, "loss": 0.2481, "step": 1898 }, { "epoch": 3.24061433447099, "grad_norm": 0.37345264955293433, "learning_rate": 1.3282888510276026e-05, "loss": 0.2369, "step": 1899 }, { "epoch": 3.242320819112628, "grad_norm": 0.29310257573596366, "learning_rate": 1.3260450290268287e-05, "loss": 0.2529, "step": 1900 }, { "epoch": 3.244027303754266, "grad_norm": 0.33081350524602654, "learning_rate": 1.3238021635818678e-05, "loss": 0.2188, "step": 1901 }, { "epoch": 3.2457337883959045, "grad_norm": 0.3465974170610297, "learning_rate": 1.3215602578760577e-05, "loss": 0.2382, "step": 1902 }, { "epoch": 3.2474402730375425, "grad_norm": 0.35455396116047055, "learning_rate": 1.3193193150913733e-05, "loss": 0.2152, "step": 1903 }, { "epoch": 3.249146757679181, "grad_norm": 0.37453236806803086, "learning_rate": 1.3170793384084225e-05, "loss": 0.2339, "step": 1904 }, { "epoch": 3.250853242320819, "grad_norm": 0.35469744803904324, "learning_rate": 1.3148403310064433e-05, "loss": 0.249, "step": 1905 }, { "epoch": 3.252559726962457, "grad_norm": 0.36018751994429793, "learning_rate": 1.3126022960632967e-05, "loss": 0.2274, "step": 1906 }, { "epoch": 3.2542662116040955, "grad_norm": 0.34554921252142795, "learning_rate": 1.3103652367554638e-05, "loss": 0.2548, "step": 1907 }, { "epoch": 3.255972696245734, "grad_norm": 0.3406967968970072, "learning_rate": 1.308129156258042e-05, "loss": 0.2647, "step": 1908 }, { "epoch": 3.257679180887372, "grad_norm": 0.34705004086797836, "learning_rate": 1.3058940577447377e-05, "loss": 0.2791, "step": 1909 }, { "epoch": 3.25938566552901, "grad_norm": 0.32784231106076733, "learning_rate": 1.3036599443878646e-05, "loss": 0.2373, "step": 1910 }, { "epoch": 3.2610921501706485, "grad_norm": 0.3069148460143671, "learning_rate": 1.3014268193583379e-05, "loss": 0.2348, "step": 1911 }, { "epoch": 3.262798634812287, "grad_norm": 0.337863834585334, "learning_rate": 1.2991946858256706e-05, "loss": 0.2419, "step": 1912 }, { "epoch": 3.2645051194539247, "grad_norm": 0.33560728930860845, "learning_rate": 1.2969635469579678e-05, "loss": 0.2447, "step": 1913 }, { "epoch": 3.266211604095563, "grad_norm": 0.35065106962297304, "learning_rate": 1.2947334059219228e-05, "loss": 0.2296, "step": 1914 }, { "epoch": 3.2679180887372015, "grad_norm": 0.34540691636026183, "learning_rate": 1.2925042658828133e-05, "loss": 0.2553, "step": 1915 }, { "epoch": 3.26962457337884, "grad_norm": 0.3371687676374983, "learning_rate": 1.2902761300044955e-05, "loss": 0.2365, "step": 1916 }, { "epoch": 3.2713310580204777, "grad_norm": 0.3113923658360549, "learning_rate": 1.2880490014494007e-05, "loss": 0.2299, "step": 1917 }, { "epoch": 3.273037542662116, "grad_norm": 0.3279319065598936, "learning_rate": 1.285822883378531e-05, "loss": 0.2393, "step": 1918 }, { "epoch": 3.274744027303754, "grad_norm": 0.33565710343698707, "learning_rate": 1.2835977789514534e-05, "loss": 0.2496, "step": 1919 }, { "epoch": 3.2764505119453924, "grad_norm": 0.3284855761396808, "learning_rate": 1.2813736913262966e-05, "loss": 0.245, "step": 1920 }, { "epoch": 3.2781569965870307, "grad_norm": 0.3520433933608297, "learning_rate": 1.279150623659747e-05, "loss": 0.2443, "step": 1921 }, { "epoch": 3.279863481228669, "grad_norm": 0.3064093157127476, "learning_rate": 1.2769285791070418e-05, "loss": 0.222, "step": 1922 }, { "epoch": 3.281569965870307, "grad_norm": 0.31084278254740383, "learning_rate": 1.2747075608219669e-05, "loss": 0.2589, "step": 1923 }, { "epoch": 3.2832764505119454, "grad_norm": 0.3346063879894871, "learning_rate": 1.2724875719568513e-05, "loss": 0.2461, "step": 1924 }, { "epoch": 3.2849829351535837, "grad_norm": 0.3297140693650807, "learning_rate": 1.270268615662564e-05, "loss": 0.2338, "step": 1925 }, { "epoch": 3.2866894197952217, "grad_norm": 0.3531062928841853, "learning_rate": 1.2680506950885065e-05, "loss": 0.2804, "step": 1926 }, { "epoch": 3.28839590443686, "grad_norm": 0.39237415889349203, "learning_rate": 1.2658338133826126e-05, "loss": 0.3519, "step": 1927 }, { "epoch": 3.2901023890784984, "grad_norm": 0.3655010611659869, "learning_rate": 1.2636179736913392e-05, "loss": 0.2332, "step": 1928 }, { "epoch": 3.2918088737201368, "grad_norm": 0.3526753256875661, "learning_rate": 1.2614031791596663e-05, "loss": 0.2602, "step": 1929 }, { "epoch": 3.2935153583617747, "grad_norm": 0.3363745793702122, "learning_rate": 1.2591894329310895e-05, "loss": 0.2583, "step": 1930 }, { "epoch": 3.295221843003413, "grad_norm": 0.3256636494750763, "learning_rate": 1.2569767381476161e-05, "loss": 0.2633, "step": 1931 }, { "epoch": 3.296928327645051, "grad_norm": 0.3502170397655713, "learning_rate": 1.2547650979497623e-05, "loss": 0.2397, "step": 1932 }, { "epoch": 3.2986348122866893, "grad_norm": 0.33003496997319587, "learning_rate": 1.2525545154765471e-05, "loss": 0.2458, "step": 1933 }, { "epoch": 3.3003412969283277, "grad_norm": 0.3579124619667584, "learning_rate": 1.250344993865487e-05, "loss": 0.235, "step": 1934 }, { "epoch": 3.302047781569966, "grad_norm": 0.34465714702561495, "learning_rate": 1.2481365362525944e-05, "loss": 0.2427, "step": 1935 }, { "epoch": 3.303754266211604, "grad_norm": 0.35744696182435814, "learning_rate": 1.2459291457723708e-05, "loss": 0.2204, "step": 1936 }, { "epoch": 3.3054607508532423, "grad_norm": 0.35339834478638066, "learning_rate": 1.2437228255578036e-05, "loss": 0.227, "step": 1937 }, { "epoch": 3.3071672354948807, "grad_norm": 0.3701262370914736, "learning_rate": 1.2415175787403602e-05, "loss": 0.237, "step": 1938 }, { "epoch": 3.3088737201365186, "grad_norm": 0.33875916075918705, "learning_rate": 1.239313408449986e-05, "loss": 0.239, "step": 1939 }, { "epoch": 3.310580204778157, "grad_norm": 0.34045558186040226, "learning_rate": 1.2371103178150965e-05, "loss": 0.3002, "step": 1940 }, { "epoch": 3.3122866894197953, "grad_norm": 0.338262407610433, "learning_rate": 1.2349083099625764e-05, "loss": 0.2131, "step": 1941 }, { "epoch": 3.3139931740614337, "grad_norm": 0.34534813428288663, "learning_rate": 1.2327073880177735e-05, "loss": 0.2468, "step": 1942 }, { "epoch": 3.3156996587030716, "grad_norm": 0.3854984150875541, "learning_rate": 1.2305075551044934e-05, "loss": 0.2383, "step": 1943 }, { "epoch": 3.31740614334471, "grad_norm": 0.33135716332895926, "learning_rate": 1.2283088143449966e-05, "loss": 0.2351, "step": 1944 }, { "epoch": 3.319112627986348, "grad_norm": 0.35284557516011333, "learning_rate": 1.2261111688599944e-05, "loss": 0.2827, "step": 1945 }, { "epoch": 3.3208191126279862, "grad_norm": 0.34011517179545575, "learning_rate": 1.223914621768641e-05, "loss": 0.2559, "step": 1946 }, { "epoch": 3.3225255972696246, "grad_norm": 0.3265219606050788, "learning_rate": 1.2217191761885339e-05, "loss": 0.2481, "step": 1947 }, { "epoch": 3.324232081911263, "grad_norm": 0.34878977356026486, "learning_rate": 1.2195248352357067e-05, "loss": 0.2245, "step": 1948 }, { "epoch": 3.325938566552901, "grad_norm": 0.3375538846496755, "learning_rate": 1.217331602024625e-05, "loss": 0.2398, "step": 1949 }, { "epoch": 3.3276450511945392, "grad_norm": 0.34580986789699825, "learning_rate": 1.2151394796681826e-05, "loss": 0.2367, "step": 1950 }, { "epoch": 3.3293515358361776, "grad_norm": 0.3442862008039689, "learning_rate": 1.2129484712776955e-05, "loss": 0.2309, "step": 1951 }, { "epoch": 3.3310580204778155, "grad_norm": 0.3541233175172183, "learning_rate": 1.2107585799629009e-05, "loss": 0.2557, "step": 1952 }, { "epoch": 3.332764505119454, "grad_norm": 0.3805049307635654, "learning_rate": 1.2085698088319468e-05, "loss": 0.2529, "step": 1953 }, { "epoch": 3.3344709897610922, "grad_norm": 0.3577101787428832, "learning_rate": 1.2063821609913941e-05, "loss": 0.2449, "step": 1954 }, { "epoch": 3.3361774744027306, "grad_norm": 0.32569582925826074, "learning_rate": 1.2041956395462098e-05, "loss": 0.2151, "step": 1955 }, { "epoch": 3.3378839590443685, "grad_norm": 0.3474692955307898, "learning_rate": 1.20201024759976e-05, "loss": 0.2637, "step": 1956 }, { "epoch": 3.339590443686007, "grad_norm": 0.34904041376292966, "learning_rate": 1.19982598825381e-05, "loss": 0.2235, "step": 1957 }, { "epoch": 3.3412969283276452, "grad_norm": 0.3842071665617674, "learning_rate": 1.1976428646085163e-05, "loss": 0.251, "step": 1958 }, { "epoch": 3.343003412969283, "grad_norm": 0.35094761850865375, "learning_rate": 1.1954608797624225e-05, "loss": 0.2362, "step": 1959 }, { "epoch": 3.3447098976109215, "grad_norm": 0.32792563908726285, "learning_rate": 1.1932800368124578e-05, "loss": 0.2448, "step": 1960 }, { "epoch": 3.34641638225256, "grad_norm": 0.36989155687533665, "learning_rate": 1.1911003388539291e-05, "loss": 0.2548, "step": 1961 }, { "epoch": 3.348122866894198, "grad_norm": 0.34510986347654204, "learning_rate": 1.18892178898052e-05, "loss": 0.2762, "step": 1962 }, { "epoch": 3.349829351535836, "grad_norm": 0.3267422575767667, "learning_rate": 1.1867443902842832e-05, "loss": 0.2522, "step": 1963 }, { "epoch": 3.3515358361774745, "grad_norm": 0.3044265017602334, "learning_rate": 1.1845681458556389e-05, "loss": 0.2503, "step": 1964 }, { "epoch": 3.3532423208191124, "grad_norm": 0.3319703442252075, "learning_rate": 1.1823930587833661e-05, "loss": 0.2703, "step": 1965 }, { "epoch": 3.354948805460751, "grad_norm": 0.3518177021180182, "learning_rate": 1.1802191321546042e-05, "loss": 0.2934, "step": 1966 }, { "epoch": 3.356655290102389, "grad_norm": 0.33236123352508423, "learning_rate": 1.1780463690548439e-05, "loss": 0.2333, "step": 1967 }, { "epoch": 3.3583617747440275, "grad_norm": 0.31062378619814635, "learning_rate": 1.1758747725679252e-05, "loss": 0.2506, "step": 1968 }, { "epoch": 3.3600682593856654, "grad_norm": 0.3582359855071637, "learning_rate": 1.1737043457760327e-05, "loss": 0.2527, "step": 1969 }, { "epoch": 3.361774744027304, "grad_norm": 0.3512082083980241, "learning_rate": 1.1715350917596905e-05, "loss": 0.2484, "step": 1970 }, { "epoch": 3.363481228668942, "grad_norm": 0.35249205053243116, "learning_rate": 1.1693670135977564e-05, "loss": 0.2408, "step": 1971 }, { "epoch": 3.36518771331058, "grad_norm": 0.3625026984233858, "learning_rate": 1.1672001143674212e-05, "loss": 0.2577, "step": 1972 }, { "epoch": 3.3668941979522184, "grad_norm": 0.3032013995413711, "learning_rate": 1.1650343971442035e-05, "loss": 0.2443, "step": 1973 }, { "epoch": 3.368600682593857, "grad_norm": 0.358853915448751, "learning_rate": 1.162869865001941e-05, "loss": 0.2429, "step": 1974 }, { "epoch": 3.3703071672354947, "grad_norm": 0.3595995927875013, "learning_rate": 1.1607065210127924e-05, "loss": 0.2487, "step": 1975 }, { "epoch": 3.372013651877133, "grad_norm": 0.3455297346187812, "learning_rate": 1.1585443682472286e-05, "loss": 0.2706, "step": 1976 }, { "epoch": 3.3737201365187715, "grad_norm": 0.32154204618152804, "learning_rate": 1.156383409774029e-05, "loss": 0.2409, "step": 1977 }, { "epoch": 3.3754266211604094, "grad_norm": 0.3249442534415948, "learning_rate": 1.1542236486602803e-05, "loss": 0.2343, "step": 1978 }, { "epoch": 3.3771331058020477, "grad_norm": 0.3597188145479468, "learning_rate": 1.1520650879713667e-05, "loss": 0.2506, "step": 1979 }, { "epoch": 3.378839590443686, "grad_norm": 0.32518884717067953, "learning_rate": 1.1499077307709723e-05, "loss": 0.2527, "step": 1980 }, { "epoch": 3.3805460750853245, "grad_norm": 0.3530381665715562, "learning_rate": 1.1477515801210695e-05, "loss": 0.244, "step": 1981 }, { "epoch": 3.3822525597269624, "grad_norm": 0.3463271542168002, "learning_rate": 1.1455966390819207e-05, "loss": 0.2601, "step": 1982 }, { "epoch": 3.3839590443686007, "grad_norm": 0.34753476438002995, "learning_rate": 1.1434429107120706e-05, "loss": 0.2605, "step": 1983 }, { "epoch": 3.385665529010239, "grad_norm": 0.3222606859771089, "learning_rate": 1.1412903980683412e-05, "loss": 0.2643, "step": 1984 }, { "epoch": 3.387372013651877, "grad_norm": 0.3235759469848687, "learning_rate": 1.1391391042058326e-05, "loss": 0.2409, "step": 1985 }, { "epoch": 3.3890784982935154, "grad_norm": 0.34987651338927844, "learning_rate": 1.1369890321779111e-05, "loss": 0.2952, "step": 1986 }, { "epoch": 3.3907849829351537, "grad_norm": 0.3546307720901437, "learning_rate": 1.1348401850362123e-05, "loss": 0.2693, "step": 1987 }, { "epoch": 3.3924914675767917, "grad_norm": 0.3197376481900684, "learning_rate": 1.1326925658306305e-05, "loss": 0.2303, "step": 1988 }, { "epoch": 3.39419795221843, "grad_norm": 0.3276668625330492, "learning_rate": 1.1305461776093201e-05, "loss": 0.2117, "step": 1989 }, { "epoch": 3.3959044368600684, "grad_norm": 0.36466775304996984, "learning_rate": 1.1284010234186837e-05, "loss": 0.2889, "step": 1990 }, { "epoch": 3.3976109215017063, "grad_norm": 0.3574113143457065, "learning_rate": 1.126257106303377e-05, "loss": 0.231, "step": 1991 }, { "epoch": 3.3993174061433447, "grad_norm": 0.3191063881050614, "learning_rate": 1.1241144293062987e-05, "loss": 0.2291, "step": 1992 }, { "epoch": 3.401023890784983, "grad_norm": 0.36727546588401, "learning_rate": 1.1219729954685859e-05, "loss": 0.2558, "step": 1993 }, { "epoch": 3.4027303754266214, "grad_norm": 0.3488165644525964, "learning_rate": 1.1198328078296132e-05, "loss": 0.2258, "step": 1994 }, { "epoch": 3.4044368600682593, "grad_norm": 0.3358403662067082, "learning_rate": 1.1176938694269852e-05, "loss": 0.2697, "step": 1995 }, { "epoch": 3.4061433447098977, "grad_norm": 0.33128698103955495, "learning_rate": 1.1155561832965333e-05, "loss": 0.2049, "step": 1996 }, { "epoch": 3.407849829351536, "grad_norm": 0.3380251770320218, "learning_rate": 1.1134197524723119e-05, "loss": 0.2352, "step": 1997 }, { "epoch": 3.409556313993174, "grad_norm": 0.323003213828888, "learning_rate": 1.1112845799865939e-05, "loss": 0.2637, "step": 1998 }, { "epoch": 3.4112627986348123, "grad_norm": 0.3582840796815567, "learning_rate": 1.1091506688698668e-05, "loss": 0.2423, "step": 1999 }, { "epoch": 3.4129692832764507, "grad_norm": 0.3531054241705733, "learning_rate": 1.1070180221508262e-05, "loss": 0.2426, "step": 2000 }, { "epoch": 3.4146757679180886, "grad_norm": 0.3712259718653561, "learning_rate": 1.104886642856376e-05, "loss": 0.206, "step": 2001 }, { "epoch": 3.416382252559727, "grad_norm": 0.3660185687278045, "learning_rate": 1.1027565340116161e-05, "loss": 0.328, "step": 2002 }, { "epoch": 3.4180887372013653, "grad_norm": 0.33910687317691934, "learning_rate": 1.1006276986398494e-05, "loss": 0.2502, "step": 2003 }, { "epoch": 3.419795221843003, "grad_norm": 0.3566924270411525, "learning_rate": 1.0985001397625656e-05, "loss": 0.2381, "step": 2004 }, { "epoch": 3.4215017064846416, "grad_norm": 0.3663963696457625, "learning_rate": 1.0963738603994472e-05, "loss": 0.2192, "step": 2005 }, { "epoch": 3.42320819112628, "grad_norm": 0.31861520786270336, "learning_rate": 1.0942488635683593e-05, "loss": 0.2296, "step": 2006 }, { "epoch": 3.4249146757679183, "grad_norm": 0.3480641618683181, "learning_rate": 1.0921251522853451e-05, "loss": 0.2474, "step": 2007 }, { "epoch": 3.426621160409556, "grad_norm": 0.3390906993385012, "learning_rate": 1.090002729564625e-05, "loss": 0.2539, "step": 2008 }, { "epoch": 3.4283276450511946, "grad_norm": 0.4183257078823607, "learning_rate": 1.0878815984185885e-05, "loss": 0.2501, "step": 2009 }, { "epoch": 3.430034129692833, "grad_norm": 0.34479899996859115, "learning_rate": 1.0857617618577952e-05, "loss": 0.2154, "step": 2010 }, { "epoch": 3.431740614334471, "grad_norm": 0.36386280275147787, "learning_rate": 1.0836432228909635e-05, "loss": 0.2516, "step": 2011 }, { "epoch": 3.4334470989761092, "grad_norm": 0.3302625894981507, "learning_rate": 1.0815259845249732e-05, "loss": 0.2189, "step": 2012 }, { "epoch": 3.4351535836177476, "grad_norm": 0.3762940139922422, "learning_rate": 1.0794100497648583e-05, "loss": 0.2359, "step": 2013 }, { "epoch": 3.4368600682593855, "grad_norm": 0.36364143241072366, "learning_rate": 1.0772954216137976e-05, "loss": 0.2126, "step": 2014 }, { "epoch": 3.438566552901024, "grad_norm": 0.3432791677901663, "learning_rate": 1.075182103073122e-05, "loss": 0.2494, "step": 2015 }, { "epoch": 3.4402730375426622, "grad_norm": 0.37294533024769344, "learning_rate": 1.0730700971422987e-05, "loss": 0.2829, "step": 2016 }, { "epoch": 3.4419795221843, "grad_norm": 0.3241303234593021, "learning_rate": 1.0709594068189358e-05, "loss": 0.2291, "step": 2017 }, { "epoch": 3.4436860068259385, "grad_norm": 0.33216601505329185, "learning_rate": 1.0688500350987698e-05, "loss": 0.2066, "step": 2018 }, { "epoch": 3.445392491467577, "grad_norm": 0.3116101017638262, "learning_rate": 1.0667419849756694e-05, "loss": 0.2605, "step": 2019 }, { "epoch": 3.4470989761092152, "grad_norm": 0.3460080728945283, "learning_rate": 1.0646352594416281e-05, "loss": 0.2388, "step": 2020 }, { "epoch": 3.448805460750853, "grad_norm": 0.3376151726400388, "learning_rate": 1.0625298614867536e-05, "loss": 0.2088, "step": 2021 }, { "epoch": 3.4505119453924915, "grad_norm": 0.33395702364836977, "learning_rate": 1.0604257940992757e-05, "loss": 0.2462, "step": 2022 }, { "epoch": 3.45221843003413, "grad_norm": 0.3668360281337767, "learning_rate": 1.0583230602655324e-05, "loss": 0.2518, "step": 2023 }, { "epoch": 3.453924914675768, "grad_norm": 0.34744879848627575, "learning_rate": 1.0562216629699701e-05, "loss": 0.2125, "step": 2024 }, { "epoch": 3.455631399317406, "grad_norm": 0.3826813486094946, "learning_rate": 1.0541216051951374e-05, "loss": 0.2667, "step": 2025 }, { "epoch": 3.4573378839590445, "grad_norm": 0.3377490570187433, "learning_rate": 1.052022889921683e-05, "loss": 0.2041, "step": 2026 }, { "epoch": 3.4590443686006824, "grad_norm": 0.3761261493220408, "learning_rate": 1.0499255201283493e-05, "loss": 0.2508, "step": 2027 }, { "epoch": 3.460750853242321, "grad_norm": 0.31098876312161366, "learning_rate": 1.047829498791968e-05, "loss": 0.2243, "step": 2028 }, { "epoch": 3.462457337883959, "grad_norm": 0.3636135752348594, "learning_rate": 1.0457348288874595e-05, "loss": 0.2202, "step": 2029 }, { "epoch": 3.464163822525597, "grad_norm": 0.32537791997034504, "learning_rate": 1.0436415133878233e-05, "loss": 0.2314, "step": 2030 }, { "epoch": 3.4658703071672354, "grad_norm": 0.3735986322438891, "learning_rate": 1.041549555264139e-05, "loss": 0.2142, "step": 2031 }, { "epoch": 3.467576791808874, "grad_norm": 0.3293558147258305, "learning_rate": 1.0394589574855583e-05, "loss": 0.2327, "step": 2032 }, { "epoch": 3.469283276450512, "grad_norm": 0.3476570831175653, "learning_rate": 1.037369723019301e-05, "loss": 0.2343, "step": 2033 }, { "epoch": 3.47098976109215, "grad_norm": 0.36212969651527965, "learning_rate": 1.0352818548306554e-05, "loss": 0.2288, "step": 2034 }, { "epoch": 3.4726962457337884, "grad_norm": 0.3856953437181421, "learning_rate": 1.0331953558829663e-05, "loss": 0.1916, "step": 2035 }, { "epoch": 3.474402730375427, "grad_norm": 0.34621353791171144, "learning_rate": 1.03111022913764e-05, "loss": 0.1997, "step": 2036 }, { "epoch": 3.4761092150170647, "grad_norm": 0.3663934082576361, "learning_rate": 1.0290264775541297e-05, "loss": 0.2376, "step": 2037 }, { "epoch": 3.477815699658703, "grad_norm": 0.35422542247938005, "learning_rate": 1.0269441040899422e-05, "loss": 0.2333, "step": 2038 }, { "epoch": 3.4795221843003414, "grad_norm": 0.3751510764701517, "learning_rate": 1.0248631117006243e-05, "loss": 0.2322, "step": 2039 }, { "epoch": 3.4812286689419794, "grad_norm": 0.31375231625716266, "learning_rate": 1.0227835033397638e-05, "loss": 0.2511, "step": 2040 }, { "epoch": 3.4829351535836177, "grad_norm": 0.31947992957786364, "learning_rate": 1.0207052819589855e-05, "loss": 0.2655, "step": 2041 }, { "epoch": 3.484641638225256, "grad_norm": 0.3254591679354409, "learning_rate": 1.0186284505079435e-05, "loss": 0.2314, "step": 2042 }, { "epoch": 3.486348122866894, "grad_norm": 0.3481138793380576, "learning_rate": 1.0165530119343214e-05, "loss": 0.264, "step": 2043 }, { "epoch": 3.4880546075085324, "grad_norm": 0.345451971216344, "learning_rate": 1.0144789691838239e-05, "loss": 0.2707, "step": 2044 }, { "epoch": 3.4897610921501707, "grad_norm": 0.3201471895581909, "learning_rate": 1.0124063252001745e-05, "loss": 0.2688, "step": 2045 }, { "epoch": 3.491467576791809, "grad_norm": 0.32945610263536973, "learning_rate": 1.010335082925114e-05, "loss": 0.2336, "step": 2046 }, { "epoch": 3.493174061433447, "grad_norm": 0.37693022505129287, "learning_rate": 1.0082652452983902e-05, "loss": 0.2251, "step": 2047 }, { "epoch": 3.4948805460750854, "grad_norm": 0.3724222324989047, "learning_rate": 1.006196815257761e-05, "loss": 0.2363, "step": 2048 }, { "epoch": 3.4965870307167237, "grad_norm": 0.3427774678713566, "learning_rate": 1.0041297957389826e-05, "loss": 0.249, "step": 2049 }, { "epoch": 3.4982935153583616, "grad_norm": 0.33088959732927303, "learning_rate": 1.0020641896758127e-05, "loss": 0.237, "step": 2050 }, { "epoch": 3.5, "grad_norm": 0.36103123726832864, "learning_rate": 1.0000000000000006e-05, "loss": 0.2115, "step": 2051 }, { "epoch": 3.5017064846416384, "grad_norm": 0.33607964075042507, "learning_rate": 9.97937229641285e-06, "loss": 0.2769, "step": 2052 }, { "epoch": 3.5034129692832767, "grad_norm": 0.3160307593162131, "learning_rate": 9.958758815273932e-06, "loss": 0.264, "step": 2053 }, { "epoch": 3.5051194539249146, "grad_norm": 0.3394635698134081, "learning_rate": 9.9381595858403e-06, "loss": 0.2331, "step": 2054 }, { "epoch": 3.506825938566553, "grad_norm": 0.34937552515141296, "learning_rate": 9.917574637348806e-06, "loss": 0.2599, "step": 2055 }, { "epoch": 3.508532423208191, "grad_norm": 0.4101937972209689, "learning_rate": 9.897003999016006e-06, "loss": 0.2768, "step": 2056 }, { "epoch": 3.5102389078498293, "grad_norm": 0.35787731128662253, "learning_rate": 9.876447700038175e-06, "loss": 0.2661, "step": 2057 }, { "epoch": 3.5119453924914676, "grad_norm": 0.3443731772940693, "learning_rate": 9.85590576959121e-06, "loss": 0.2497, "step": 2058 }, { "epoch": 3.513651877133106, "grad_norm": 0.3993109958736273, "learning_rate": 9.835378236830618e-06, "loss": 0.22, "step": 2059 }, { "epoch": 3.515358361774744, "grad_norm": 0.3605083597229052, "learning_rate": 9.814865130891489e-06, "loss": 0.2394, "step": 2060 }, { "epoch": 3.5170648464163823, "grad_norm": 0.3608372999361292, "learning_rate": 9.794366480888415e-06, "loss": 0.271, "step": 2061 }, { "epoch": 3.51877133105802, "grad_norm": 0.31714881311600096, "learning_rate": 9.773882315915494e-06, "loss": 0.2183, "step": 2062 }, { "epoch": 3.5204778156996586, "grad_norm": 0.3373085642605433, "learning_rate": 9.75341266504624e-06, "loss": 0.2594, "step": 2063 }, { "epoch": 3.522184300341297, "grad_norm": 0.3335303524801546, "learning_rate": 9.732957557333575e-06, "loss": 0.2658, "step": 2064 }, { "epoch": 3.5238907849829353, "grad_norm": 0.34937036857673354, "learning_rate": 9.712517021809798e-06, "loss": 0.2468, "step": 2065 }, { "epoch": 3.5255972696245736, "grad_norm": 0.3087690690490283, "learning_rate": 9.692091087486495e-06, "loss": 0.2542, "step": 2066 }, { "epoch": 3.5273037542662116, "grad_norm": 0.36445015588519614, "learning_rate": 9.671679783354557e-06, "loss": 0.2388, "step": 2067 }, { "epoch": 3.52901023890785, "grad_norm": 0.35192548669396095, "learning_rate": 9.651283138384084e-06, "loss": 0.2722, "step": 2068 }, { "epoch": 3.530716723549488, "grad_norm": 0.352238143899704, "learning_rate": 9.630901181524406e-06, "loss": 0.2539, "step": 2069 }, { "epoch": 3.532423208191126, "grad_norm": 0.3254943879594676, "learning_rate": 9.61053394170395e-06, "loss": 0.2523, "step": 2070 }, { "epoch": 3.5341296928327646, "grad_norm": 0.3471578549897025, "learning_rate": 9.590181447830305e-06, "loss": 0.2196, "step": 2071 }, { "epoch": 3.535836177474403, "grad_norm": 0.3428321981949603, "learning_rate": 9.56984372879012e-06, "loss": 0.2667, "step": 2072 }, { "epoch": 3.537542662116041, "grad_norm": 0.3351455769276011, "learning_rate": 9.549520813449053e-06, "loss": 0.2729, "step": 2073 }, { "epoch": 3.539249146757679, "grad_norm": 0.32223649268587223, "learning_rate": 9.52921273065178e-06, "loss": 0.2213, "step": 2074 }, { "epoch": 3.5409556313993176, "grad_norm": 0.3420021574309786, "learning_rate": 9.508919509221903e-06, "loss": 0.2326, "step": 2075 }, { "epoch": 3.5426621160409555, "grad_norm": 0.3453782665927399, "learning_rate": 9.488641177961939e-06, "loss": 0.2556, "step": 2076 }, { "epoch": 3.544368600682594, "grad_norm": 0.3203712108150496, "learning_rate": 9.46837776565326e-06, "loss": 0.2867, "step": 2077 }, { "epoch": 3.546075085324232, "grad_norm": 0.3287923565115119, "learning_rate": 9.448129301056083e-06, "loss": 0.2315, "step": 2078 }, { "epoch": 3.5477815699658706, "grad_norm": 0.3418083531162421, "learning_rate": 9.427895812909406e-06, "loss": 0.2489, "step": 2079 }, { "epoch": 3.5494880546075085, "grad_norm": 0.31476289542989877, "learning_rate": 9.407677329930953e-06, "loss": 0.2333, "step": 2080 }, { "epoch": 3.551194539249147, "grad_norm": 0.294170684685756, "learning_rate": 9.387473880817182e-06, "loss": 0.2683, "step": 2081 }, { "epoch": 3.5529010238907848, "grad_norm": 0.30412989872613483, "learning_rate": 9.367285494243164e-06, "loss": 0.2215, "step": 2082 }, { "epoch": 3.554607508532423, "grad_norm": 0.32414311515130295, "learning_rate": 9.347112198862645e-06, "loss": 0.2154, "step": 2083 }, { "epoch": 3.5563139931740615, "grad_norm": 0.32488273820018754, "learning_rate": 9.32695402330791e-06, "loss": 0.2352, "step": 2084 }, { "epoch": 3.5580204778157, "grad_norm": 0.3312057994129262, "learning_rate": 9.306810996189823e-06, "loss": 0.2377, "step": 2085 }, { "epoch": 3.5597269624573378, "grad_norm": 0.34094265625638326, "learning_rate": 9.286683146097705e-06, "loss": 0.2474, "step": 2086 }, { "epoch": 3.561433447098976, "grad_norm": 0.3714211231506183, "learning_rate": 9.266570501599372e-06, "loss": 0.2181, "step": 2087 }, { "epoch": 3.5631399317406145, "grad_norm": 0.33872908334548024, "learning_rate": 9.246473091241056e-06, "loss": 0.2087, "step": 2088 }, { "epoch": 3.5648464163822524, "grad_norm": 0.36716903367936693, "learning_rate": 9.226390943547322e-06, "loss": 0.2201, "step": 2089 }, { "epoch": 3.5665529010238908, "grad_norm": 0.32987230116430716, "learning_rate": 9.206324087021132e-06, "loss": 0.2371, "step": 2090 }, { "epoch": 3.568259385665529, "grad_norm": 0.3650003583650618, "learning_rate": 9.186272550143702e-06, "loss": 0.229, "step": 2091 }, { "epoch": 3.5699658703071675, "grad_norm": 0.3323941837267181, "learning_rate": 9.166236361374539e-06, "loss": 0.2271, "step": 2092 }, { "epoch": 3.5716723549488054, "grad_norm": 0.34853938656176586, "learning_rate": 9.14621554915133e-06, "loss": 0.2366, "step": 2093 }, { "epoch": 3.573378839590444, "grad_norm": 0.31455666309193503, "learning_rate": 9.126210141889974e-06, "loss": 0.247, "step": 2094 }, { "epoch": 3.5750853242320817, "grad_norm": 0.3179121796258658, "learning_rate": 9.106220167984474e-06, "loss": 0.2248, "step": 2095 }, { "epoch": 3.57679180887372, "grad_norm": 0.3216632641238737, "learning_rate": 9.08624565580694e-06, "loss": 0.2492, "step": 2096 }, { "epoch": 3.5784982935153584, "grad_norm": 0.32704473572032683, "learning_rate": 9.066286633707552e-06, "loss": 0.2413, "step": 2097 }, { "epoch": 3.580204778156997, "grad_norm": 0.4293241626059611, "learning_rate": 9.04634313001448e-06, "loss": 0.2424, "step": 2098 }, { "epoch": 3.5819112627986347, "grad_norm": 0.33355020607853786, "learning_rate": 9.026415173033886e-06, "loss": 0.2911, "step": 2099 }, { "epoch": 3.583617747440273, "grad_norm": 0.32899497678876044, "learning_rate": 9.006502791049861e-06, "loss": 0.2401, "step": 2100 }, { "epoch": 3.5853242320819114, "grad_norm": 0.33560992790127164, "learning_rate": 8.986606012324376e-06, "loss": 0.2224, "step": 2101 }, { "epoch": 3.5870307167235493, "grad_norm": 0.3149671347227518, "learning_rate": 8.96672486509729e-06, "loss": 0.2616, "step": 2102 }, { "epoch": 3.5887372013651877, "grad_norm": 0.3229912121146895, "learning_rate": 8.946859377586236e-06, "loss": 0.2551, "step": 2103 }, { "epoch": 3.590443686006826, "grad_norm": 0.33545725996995696, "learning_rate": 8.927009577986654e-06, "loss": 0.2136, "step": 2104 }, { "epoch": 3.5921501706484644, "grad_norm": 0.31534595869214316, "learning_rate": 8.907175494471693e-06, "loss": 0.2394, "step": 2105 }, { "epoch": 3.5938566552901023, "grad_norm": 0.3527301933284398, "learning_rate": 8.887357155192218e-06, "loss": 0.2323, "step": 2106 }, { "epoch": 3.5955631399317407, "grad_norm": 0.36778167133207873, "learning_rate": 8.867554588276732e-06, "loss": 0.2074, "step": 2107 }, { "epoch": 3.5972696245733786, "grad_norm": 0.3269001726303635, "learning_rate": 8.847767821831347e-06, "loss": 0.2668, "step": 2108 }, { "epoch": 3.598976109215017, "grad_norm": 0.3380568718247642, "learning_rate": 8.827996883939779e-06, "loss": 0.2431, "step": 2109 }, { "epoch": 3.6006825938566553, "grad_norm": 0.3529140072056332, "learning_rate": 8.808241802663236e-06, "loss": 0.2095, "step": 2110 }, { "epoch": 3.6023890784982937, "grad_norm": 0.31665187204189843, "learning_rate": 8.78850260604046e-06, "loss": 0.2277, "step": 2111 }, { "epoch": 3.6040955631399316, "grad_norm": 0.3412747364772627, "learning_rate": 8.768779322087626e-06, "loss": 0.2406, "step": 2112 }, { "epoch": 3.60580204778157, "grad_norm": 0.33124871502489983, "learning_rate": 8.749071978798319e-06, "loss": 0.2282, "step": 2113 }, { "epoch": 3.6075085324232083, "grad_norm": 0.35546659291499355, "learning_rate": 8.72938060414352e-06, "loss": 0.2705, "step": 2114 }, { "epoch": 3.6092150170648463, "grad_norm": 0.34706721829273074, "learning_rate": 8.709705226071526e-06, "loss": 0.2359, "step": 2115 }, { "epoch": 3.6109215017064846, "grad_norm": 0.35923389833000235, "learning_rate": 8.690045872507944e-06, "loss": 0.2538, "step": 2116 }, { "epoch": 3.612627986348123, "grad_norm": 0.30490467326500825, "learning_rate": 8.67040257135562e-06, "loss": 0.2699, "step": 2117 }, { "epoch": 3.6143344709897613, "grad_norm": 0.3216326103324856, "learning_rate": 8.650775350494643e-06, "loss": 0.2473, "step": 2118 }, { "epoch": 3.6160409556313993, "grad_norm": 0.337849661154815, "learning_rate": 8.631164237782253e-06, "loss": 0.2286, "step": 2119 }, { "epoch": 3.6177474402730376, "grad_norm": 0.3357868273033272, "learning_rate": 8.611569261052833e-06, "loss": 0.2234, "step": 2120 }, { "epoch": 3.6194539249146755, "grad_norm": 0.31790624894855846, "learning_rate": 8.59199044811788e-06, "loss": 0.249, "step": 2121 }, { "epoch": 3.621160409556314, "grad_norm": 0.3253336788323475, "learning_rate": 8.572427826765926e-06, "loss": 0.2522, "step": 2122 }, { "epoch": 3.6228668941979523, "grad_norm": 0.3437765066648847, "learning_rate": 8.55288142476255e-06, "loss": 0.2145, "step": 2123 }, { "epoch": 3.6245733788395906, "grad_norm": 0.3694147767833111, "learning_rate": 8.533351269850273e-06, "loss": 0.2315, "step": 2124 }, { "epoch": 3.6262798634812285, "grad_norm": 0.33413722060276174, "learning_rate": 8.5138373897486e-06, "loss": 0.259, "step": 2125 }, { "epoch": 3.627986348122867, "grad_norm": 0.32437481899148385, "learning_rate": 8.494339812153905e-06, "loss": 0.2442, "step": 2126 }, { "epoch": 3.6296928327645053, "grad_norm": 0.3611094623293024, "learning_rate": 8.474858564739423e-06, "loss": 0.2128, "step": 2127 }, { "epoch": 3.631399317406143, "grad_norm": 0.3138052807257615, "learning_rate": 8.455393675155239e-06, "loss": 0.2337, "step": 2128 }, { "epoch": 3.6331058020477816, "grad_norm": 0.33835867675852266, "learning_rate": 8.43594517102819e-06, "loss": 0.216, "step": 2129 }, { "epoch": 3.63481228668942, "grad_norm": 0.34307857648303824, "learning_rate": 8.41651307996188e-06, "loss": 0.2323, "step": 2130 }, { "epoch": 3.6365187713310583, "grad_norm": 0.31969100992776645, "learning_rate": 8.39709742953661e-06, "loss": 0.2518, "step": 2131 }, { "epoch": 3.638225255972696, "grad_norm": 0.34710848014672774, "learning_rate": 8.377698247309327e-06, "loss": 0.2374, "step": 2132 }, { "epoch": 3.6399317406143346, "grad_norm": 0.389195318887567, "learning_rate": 8.358315560813642e-06, "loss": 0.2039, "step": 2133 }, { "epoch": 3.6416382252559725, "grad_norm": 0.35978442598178156, "learning_rate": 8.33894939755972e-06, "loss": 0.2332, "step": 2134 }, { "epoch": 3.643344709897611, "grad_norm": 0.31619041833521977, "learning_rate": 8.319599785034296e-06, "loss": 0.2508, "step": 2135 }, { "epoch": 3.645051194539249, "grad_norm": 0.3851432754060637, "learning_rate": 8.300266750700598e-06, "loss": 0.3253, "step": 2136 }, { "epoch": 3.6467576791808876, "grad_norm": 0.3017952759413884, "learning_rate": 8.28095032199835e-06, "loss": 0.2465, "step": 2137 }, { "epoch": 3.6484641638225255, "grad_norm": 0.33636218269981427, "learning_rate": 8.261650526343665e-06, "loss": 0.212, "step": 2138 }, { "epoch": 3.650170648464164, "grad_norm": 0.4071704606431761, "learning_rate": 8.242367391129082e-06, "loss": 0.2115, "step": 2139 }, { "epoch": 3.651877133105802, "grad_norm": 0.3892103197274746, "learning_rate": 8.223100943723494e-06, "loss": 0.2438, "step": 2140 }, { "epoch": 3.65358361774744, "grad_norm": 0.34583333849310055, "learning_rate": 8.203851211472088e-06, "loss": 0.2224, "step": 2141 }, { "epoch": 3.6552901023890785, "grad_norm": 0.35043746125060526, "learning_rate": 8.184618221696346e-06, "loss": 0.2478, "step": 2142 }, { "epoch": 3.656996587030717, "grad_norm": 0.352112680488658, "learning_rate": 8.165402001693976e-06, "loss": 0.2115, "step": 2143 }, { "epoch": 3.658703071672355, "grad_norm": 0.31977179892520446, "learning_rate": 8.146202578738887e-06, "loss": 0.2366, "step": 2144 }, { "epoch": 3.660409556313993, "grad_norm": 0.38326623640423296, "learning_rate": 8.127019980081141e-06, "loss": 0.2525, "step": 2145 }, { "epoch": 3.6621160409556315, "grad_norm": 0.3179398491073991, "learning_rate": 8.107854232946937e-06, "loss": 0.2643, "step": 2146 }, { "epoch": 3.6638225255972694, "grad_norm": 0.3351679457817645, "learning_rate": 8.088705364538552e-06, "loss": 0.2265, "step": 2147 }, { "epoch": 3.6655290102389078, "grad_norm": 0.33559257148440325, "learning_rate": 8.06957340203429e-06, "loss": 0.2277, "step": 2148 }, { "epoch": 3.667235494880546, "grad_norm": 0.315222850177876, "learning_rate": 8.050458372588493e-06, "loss": 0.2481, "step": 2149 }, { "epoch": 3.6689419795221845, "grad_norm": 0.316354698233717, "learning_rate": 8.031360303331419e-06, "loss": 0.2461, "step": 2150 }, { "epoch": 3.6706484641638224, "grad_norm": 0.3299376154281702, "learning_rate": 8.012279221369308e-06, "loss": 0.2764, "step": 2151 }, { "epoch": 3.6723549488054608, "grad_norm": 0.3358898591519496, "learning_rate": 7.993215153784254e-06, "loss": 0.2412, "step": 2152 }, { "epoch": 3.674061433447099, "grad_norm": 0.34779848701641763, "learning_rate": 7.974168127634214e-06, "loss": 0.2123, "step": 2153 }, { "epoch": 3.675767918088737, "grad_norm": 0.34578906731774767, "learning_rate": 7.955138169952972e-06, "loss": 0.2425, "step": 2154 }, { "epoch": 3.6774744027303754, "grad_norm": 0.3179010754096789, "learning_rate": 7.936125307750062e-06, "loss": 0.2584, "step": 2155 }, { "epoch": 3.6791808873720138, "grad_norm": 0.3284611461610992, "learning_rate": 7.917129568010764e-06, "loss": 0.3071, "step": 2156 }, { "epoch": 3.680887372013652, "grad_norm": 0.3492684100554768, "learning_rate": 7.898150977696051e-06, "loss": 0.2251, "step": 2157 }, { "epoch": 3.68259385665529, "grad_norm": 0.33288502140176757, "learning_rate": 7.879189563742574e-06, "loss": 0.2009, "step": 2158 }, { "epoch": 3.6843003412969284, "grad_norm": 0.34974732061580127, "learning_rate": 7.860245353062575e-06, "loss": 0.2255, "step": 2159 }, { "epoch": 3.6860068259385663, "grad_norm": 0.3324104074612852, "learning_rate": 7.841318372543906e-06, "loss": 0.2708, "step": 2160 }, { "epoch": 3.6877133105802047, "grad_norm": 0.3081862433519352, "learning_rate": 7.822408649049963e-06, "loss": 0.2203, "step": 2161 }, { "epoch": 3.689419795221843, "grad_norm": 0.31222019978876575, "learning_rate": 7.803516209419631e-06, "loss": 0.2464, "step": 2162 }, { "epoch": 3.6911262798634814, "grad_norm": 0.3073306272647333, "learning_rate": 7.784641080467272e-06, "loss": 0.2685, "step": 2163 }, { "epoch": 3.6928327645051193, "grad_norm": 0.35335232359685936, "learning_rate": 7.76578328898267e-06, "loss": 0.2664, "step": 2164 }, { "epoch": 3.6945392491467577, "grad_norm": 0.3272278578330255, "learning_rate": 7.74694286173103e-06, "loss": 0.2718, "step": 2165 }, { "epoch": 3.696245733788396, "grad_norm": 0.3329431402767, "learning_rate": 7.728119825452875e-06, "loss": 0.2178, "step": 2166 }, { "epoch": 3.697952218430034, "grad_norm": 0.3365044939427897, "learning_rate": 7.709314206864071e-06, "loss": 0.2396, "step": 2167 }, { "epoch": 3.6996587030716723, "grad_norm": 0.3415383532921338, "learning_rate": 7.690526032655768e-06, "loss": 0.266, "step": 2168 }, { "epoch": 3.7013651877133107, "grad_norm": 0.3204323451928413, "learning_rate": 7.671755329494312e-06, "loss": 0.1995, "step": 2169 }, { "epoch": 3.703071672354949, "grad_norm": 0.32460920939744076, "learning_rate": 7.653002124021307e-06, "loss": 0.2149, "step": 2170 }, { "epoch": 3.704778156996587, "grad_norm": 0.33696350421488724, "learning_rate": 7.634266442853485e-06, "loss": 0.2153, "step": 2171 }, { "epoch": 3.7064846416382253, "grad_norm": 0.34418317167429263, "learning_rate": 7.615548312582728e-06, "loss": 0.2084, "step": 2172 }, { "epoch": 3.7081911262798632, "grad_norm": 0.34758003716187136, "learning_rate": 7.596847759775987e-06, "loss": 0.2628, "step": 2173 }, { "epoch": 3.7098976109215016, "grad_norm": 0.32713967444234604, "learning_rate": 7.5781648109752904e-06, "loss": 0.2353, "step": 2174 }, { "epoch": 3.71160409556314, "grad_norm": 0.4392955751004257, "learning_rate": 7.559499492697662e-06, "loss": 0.231, "step": 2175 }, { "epoch": 3.7133105802047783, "grad_norm": 0.37151134656871065, "learning_rate": 7.540851831435097e-06, "loss": 0.241, "step": 2176 }, { "epoch": 3.7150170648464163, "grad_norm": 0.3108353138523825, "learning_rate": 7.522221853654554e-06, "loss": 0.2597, "step": 2177 }, { "epoch": 3.7167235494880546, "grad_norm": 0.34212178021994255, "learning_rate": 7.503609585797866e-06, "loss": 0.2795, "step": 2178 }, { "epoch": 3.718430034129693, "grad_norm": 0.33501937974174234, "learning_rate": 7.485015054281757e-06, "loss": 0.2221, "step": 2179 }, { "epoch": 3.720136518771331, "grad_norm": 0.33226356966856707, "learning_rate": 7.4664382854977564e-06, "loss": 0.2105, "step": 2180 }, { "epoch": 3.7218430034129693, "grad_norm": 0.35084184784766803, "learning_rate": 7.447879305812184e-06, "loss": 0.2219, "step": 2181 }, { "epoch": 3.7235494880546076, "grad_norm": 0.370333932074055, "learning_rate": 7.429338141566129e-06, "loss": 0.2341, "step": 2182 }, { "epoch": 3.725255972696246, "grad_norm": 0.3485737212375025, "learning_rate": 7.410814819075369e-06, "loss": 0.2254, "step": 2183 }, { "epoch": 3.726962457337884, "grad_norm": 0.35474527253456567, "learning_rate": 7.392309364630388e-06, "loss": 0.2371, "step": 2184 }, { "epoch": 3.7286689419795223, "grad_norm": 0.3239195091455572, "learning_rate": 7.373821804496277e-06, "loss": 0.2137, "step": 2185 }, { "epoch": 3.73037542662116, "grad_norm": 0.37912144087539207, "learning_rate": 7.355352164912763e-06, "loss": 0.2174, "step": 2186 }, { "epoch": 3.7320819112627985, "grad_norm": 0.34093396600499953, "learning_rate": 7.336900472094113e-06, "loss": 0.2144, "step": 2187 }, { "epoch": 3.733788395904437, "grad_norm": 0.36912720858317266, "learning_rate": 7.318466752229123e-06, "loss": 0.2506, "step": 2188 }, { "epoch": 3.7354948805460753, "grad_norm": 0.3400048147976415, "learning_rate": 7.300051031481101e-06, "loss": 0.2558, "step": 2189 }, { "epoch": 3.737201365187713, "grad_norm": 0.3616647897221543, "learning_rate": 7.281653335987782e-06, "loss": 0.2323, "step": 2190 }, { "epoch": 3.7389078498293515, "grad_norm": 0.33049342906173645, "learning_rate": 7.263273691861341e-06, "loss": 0.2316, "step": 2191 }, { "epoch": 3.74061433447099, "grad_norm": 0.35189500703155224, "learning_rate": 7.244912125188308e-06, "loss": 0.2412, "step": 2192 }, { "epoch": 3.742320819112628, "grad_norm": 0.3864810014202174, "learning_rate": 7.226568662029583e-06, "loss": 0.226, "step": 2193 }, { "epoch": 3.744027303754266, "grad_norm": 0.3372212223722344, "learning_rate": 7.208243328420348e-06, "loss": 0.2394, "step": 2194 }, { "epoch": 3.7457337883959045, "grad_norm": 0.33764911560400823, "learning_rate": 7.189936150370061e-06, "loss": 0.2161, "step": 2195 }, { "epoch": 3.747440273037543, "grad_norm": 0.29995034049716035, "learning_rate": 7.17164715386242e-06, "loss": 0.2302, "step": 2196 }, { "epoch": 3.749146757679181, "grad_norm": 0.30065277674622454, "learning_rate": 7.153376364855298e-06, "loss": 0.2355, "step": 2197 }, { "epoch": 3.750853242320819, "grad_norm": 0.3233014807766729, "learning_rate": 7.135123809280755e-06, "loss": 0.2206, "step": 2198 }, { "epoch": 3.752559726962457, "grad_norm": 0.3085572665587782, "learning_rate": 7.116889513044947e-06, "loss": 0.2431, "step": 2199 }, { "epoch": 3.7542662116040955, "grad_norm": 0.3099328328660361, "learning_rate": 7.098673502028115e-06, "loss": 0.2432, "step": 2200 }, { "epoch": 3.755972696245734, "grad_norm": 0.3453231654873741, "learning_rate": 7.08047580208457e-06, "loss": 0.2463, "step": 2201 }, { "epoch": 3.757679180887372, "grad_norm": 0.34884193800760066, "learning_rate": 7.062296439042602e-06, "loss": 0.2295, "step": 2202 }, { "epoch": 3.75938566552901, "grad_norm": 0.3118987064147418, "learning_rate": 7.044135438704509e-06, "loss": 0.2682, "step": 2203 }, { "epoch": 3.7610921501706485, "grad_norm": 0.35472956867938155, "learning_rate": 7.025992826846493e-06, "loss": 0.2351, "step": 2204 }, { "epoch": 3.762798634812287, "grad_norm": 0.30905315147129464, "learning_rate": 7.007868629218686e-06, "loss": 0.2411, "step": 2205 }, { "epoch": 3.7645051194539247, "grad_norm": 0.336875372461944, "learning_rate": 6.989762871545069e-06, "loss": 0.2832, "step": 2206 }, { "epoch": 3.766211604095563, "grad_norm": 0.5530281983642463, "learning_rate": 6.971675579523443e-06, "loss": 0.3134, "step": 2207 }, { "epoch": 3.7679180887372015, "grad_norm": 0.3346356161529318, "learning_rate": 6.953606778825426e-06, "loss": 0.2121, "step": 2208 }, { "epoch": 3.76962457337884, "grad_norm": 0.3457212843139903, "learning_rate": 6.9355564950963606e-06, "loss": 0.2487, "step": 2209 }, { "epoch": 3.7713310580204777, "grad_norm": 0.30021566156978446, "learning_rate": 6.917524753955338e-06, "loss": 0.2632, "step": 2210 }, { "epoch": 3.773037542662116, "grad_norm": 0.37622942535137993, "learning_rate": 6.899511580995111e-06, "loss": 0.2448, "step": 2211 }, { "epoch": 3.774744027303754, "grad_norm": 0.33896457300845667, "learning_rate": 6.881517001782074e-06, "loss": 0.2577, "step": 2212 }, { "epoch": 3.7764505119453924, "grad_norm": 0.3362204559926037, "learning_rate": 6.8635410418562585e-06, "loss": 0.2014, "step": 2213 }, { "epoch": 3.7781569965870307, "grad_norm": 0.36751468114360947, "learning_rate": 6.845583726731236e-06, "loss": 0.2032, "step": 2214 }, { "epoch": 3.779863481228669, "grad_norm": 0.3390314224721075, "learning_rate": 6.827645081894141e-06, "loss": 0.2356, "step": 2215 }, { "epoch": 3.781569965870307, "grad_norm": 0.3571072257977754, "learning_rate": 6.809725132805591e-06, "loss": 0.2418, "step": 2216 }, { "epoch": 3.7832764505119454, "grad_norm": 0.350285070405977, "learning_rate": 6.791823904899695e-06, "loss": 0.2256, "step": 2217 }, { "epoch": 3.7849829351535837, "grad_norm": 0.32614851056606337, "learning_rate": 6.773941423583945e-06, "loss": 0.2377, "step": 2218 }, { "epoch": 3.7866894197952217, "grad_norm": 0.3467495156868907, "learning_rate": 6.756077714239264e-06, "loss": 0.2482, "step": 2219 }, { "epoch": 3.78839590443686, "grad_norm": 0.3416010808769771, "learning_rate": 6.7382328022199265e-06, "loss": 0.2546, "step": 2220 }, { "epoch": 3.7901023890784984, "grad_norm": 0.3177041264236061, "learning_rate": 6.720406712853511e-06, "loss": 0.2522, "step": 2221 }, { "epoch": 3.7918088737201368, "grad_norm": 0.34743021750533976, "learning_rate": 6.7025994714409004e-06, "loss": 0.2487, "step": 2222 }, { "epoch": 3.7935153583617747, "grad_norm": 0.3337937791850411, "learning_rate": 6.684811103256215e-06, "loss": 0.2148, "step": 2223 }, { "epoch": 3.795221843003413, "grad_norm": 0.32356246884646855, "learning_rate": 6.667041633546785e-06, "loss": 0.2401, "step": 2224 }, { "epoch": 3.796928327645051, "grad_norm": 0.3113785634712896, "learning_rate": 6.649291087533119e-06, "loss": 0.2329, "step": 2225 }, { "epoch": 3.7986348122866893, "grad_norm": 0.3424029121429665, "learning_rate": 6.631559490408874e-06, "loss": 0.2345, "step": 2226 }, { "epoch": 3.8003412969283277, "grad_norm": 0.299235607502441, "learning_rate": 6.613846867340821e-06, "loss": 0.2521, "step": 2227 }, { "epoch": 3.802047781569966, "grad_norm": 0.3321546834321874, "learning_rate": 6.5961532434687704e-06, "loss": 0.2202, "step": 2228 }, { "epoch": 3.803754266211604, "grad_norm": 0.3061966973035992, "learning_rate": 6.578478643905601e-06, "loss": 0.2475, "step": 2229 }, { "epoch": 3.8054607508532423, "grad_norm": 0.33110608821165544, "learning_rate": 6.560823093737165e-06, "loss": 0.248, "step": 2230 }, { "epoch": 3.8071672354948807, "grad_norm": 0.3080709292000393, "learning_rate": 6.54318661802229e-06, "loss": 0.2633, "step": 2231 }, { "epoch": 3.8088737201365186, "grad_norm": 0.35665467275094986, "learning_rate": 6.52556924179272e-06, "loss": 0.2292, "step": 2232 }, { "epoch": 3.810580204778157, "grad_norm": 0.33053205317040113, "learning_rate": 6.507970990053103e-06, "loss": 0.2418, "step": 2233 }, { "epoch": 3.8122866894197953, "grad_norm": 0.30625806374467224, "learning_rate": 6.490391887780947e-06, "loss": 0.2344, "step": 2234 }, { "epoch": 3.8139931740614337, "grad_norm": 0.30583391518437175, "learning_rate": 6.472831959926558e-06, "loss": 0.2452, "step": 2235 }, { "epoch": 3.8156996587030716, "grad_norm": 0.3135255557559026, "learning_rate": 6.4552912314130614e-06, "loss": 0.2172, "step": 2236 }, { "epoch": 3.81740614334471, "grad_norm": 0.3307861502183436, "learning_rate": 6.43776972713629e-06, "loss": 0.2244, "step": 2237 }, { "epoch": 3.819112627986348, "grad_norm": 0.3206168661451337, "learning_rate": 6.420267471964829e-06, "loss": 0.2305, "step": 2238 }, { "epoch": 3.8208191126279862, "grad_norm": 0.29036335019893844, "learning_rate": 6.40278449073992e-06, "loss": 0.2733, "step": 2239 }, { "epoch": 3.8225255972696246, "grad_norm": 0.3094904433899518, "learning_rate": 6.385320808275459e-06, "loss": 0.229, "step": 2240 }, { "epoch": 3.824232081911263, "grad_norm": 0.3121188827480497, "learning_rate": 6.36787644935796e-06, "loss": 0.2651, "step": 2241 }, { "epoch": 3.825938566552901, "grad_norm": 0.3329944582433921, "learning_rate": 6.3504514387464925e-06, "loss": 0.2161, "step": 2242 }, { "epoch": 3.8276450511945392, "grad_norm": 0.38151076966358494, "learning_rate": 6.333045801172668e-06, "loss": 0.2189, "step": 2243 }, { "epoch": 3.8293515358361776, "grad_norm": 0.34522340292194326, "learning_rate": 6.315659561340606e-06, "loss": 0.2924, "step": 2244 }, { "epoch": 3.8310580204778155, "grad_norm": 0.34706992737004416, "learning_rate": 6.298292743926901e-06, "loss": 0.2191, "step": 2245 }, { "epoch": 3.832764505119454, "grad_norm": 0.29693843677243775, "learning_rate": 6.280945373580563e-06, "loss": 0.244, "step": 2246 }, { "epoch": 3.8344709897610922, "grad_norm": 0.35326438823935363, "learning_rate": 6.263617474923019e-06, "loss": 0.2571, "step": 2247 }, { "epoch": 3.8361774744027306, "grad_norm": 0.33319605929871543, "learning_rate": 6.246309072548062e-06, "loss": 0.2292, "step": 2248 }, { "epoch": 3.8378839590443685, "grad_norm": 0.37060000034961466, "learning_rate": 6.22902019102178e-06, "loss": 0.2536, "step": 2249 }, { "epoch": 3.839590443686007, "grad_norm": 0.35893056819086877, "learning_rate": 6.211750854882594e-06, "loss": 0.327, "step": 2250 }, { "epoch": 3.841296928327645, "grad_norm": 0.354582764238136, "learning_rate": 6.19450108864116e-06, "loss": 0.22, "step": 2251 }, { "epoch": 3.843003412969283, "grad_norm": 0.3232038907232125, "learning_rate": 6.177270916780378e-06, "loss": 0.2261, "step": 2252 }, { "epoch": 3.8447098976109215, "grad_norm": 0.34937331755549195, "learning_rate": 6.160060363755311e-06, "loss": 0.2511, "step": 2253 }, { "epoch": 3.84641638225256, "grad_norm": 0.3288608821927333, "learning_rate": 6.142869453993203e-06, "loss": 0.229, "step": 2254 }, { "epoch": 3.848122866894198, "grad_norm": 0.3414056843048828, "learning_rate": 6.125698211893403e-06, "loss": 0.243, "step": 2255 }, { "epoch": 3.849829351535836, "grad_norm": 0.3665089696862573, "learning_rate": 6.108546661827339e-06, "loss": 0.2791, "step": 2256 }, { "epoch": 3.8515358361774745, "grad_norm": 0.33870992023890073, "learning_rate": 6.0914148281385134e-06, "loss": 0.2369, "step": 2257 }, { "epoch": 3.8532423208191124, "grad_norm": 0.37553628654683074, "learning_rate": 6.074302735142419e-06, "loss": 0.2446, "step": 2258 }, { "epoch": 3.854948805460751, "grad_norm": 0.350783647429083, "learning_rate": 6.057210407126552e-06, "loss": 0.2203, "step": 2259 }, { "epoch": 3.856655290102389, "grad_norm": 0.3459254788092183, "learning_rate": 6.040137868350342e-06, "loss": 0.2236, "step": 2260 }, { "epoch": 3.8583617747440275, "grad_norm": 0.3601623602081703, "learning_rate": 6.0230851430451265e-06, "loss": 0.2306, "step": 2261 }, { "epoch": 3.8600682593856654, "grad_norm": 0.33960158131789747, "learning_rate": 6.006052255414145e-06, "loss": 0.2158, "step": 2262 }, { "epoch": 3.861774744027304, "grad_norm": 0.305629609959642, "learning_rate": 5.989039229632454e-06, "loss": 0.2609, "step": 2263 }, { "epoch": 3.8634812286689417, "grad_norm": 0.332401760716566, "learning_rate": 5.972046089846941e-06, "loss": 0.2387, "step": 2264 }, { "epoch": 3.86518771331058, "grad_norm": 0.3265704471848609, "learning_rate": 5.95507286017625e-06, "loss": 0.2874, "step": 2265 }, { "epoch": 3.8668941979522184, "grad_norm": 0.3080184933089667, "learning_rate": 5.938119564710787e-06, "loss": 0.2034, "step": 2266 }, { "epoch": 3.868600682593857, "grad_norm": 0.3289585999305886, "learning_rate": 5.92118622751265e-06, "loss": 0.2406, "step": 2267 }, { "epoch": 3.8703071672354947, "grad_norm": 0.3088107402469846, "learning_rate": 5.904272872615606e-06, "loss": 0.2669, "step": 2268 }, { "epoch": 3.872013651877133, "grad_norm": 0.33557172655003475, "learning_rate": 5.887379524025083e-06, "loss": 0.2157, "step": 2269 }, { "epoch": 3.8737201365187715, "grad_norm": 0.30262976875520914, "learning_rate": 5.870506205718085e-06, "loss": 0.236, "step": 2270 }, { "epoch": 3.8754266211604094, "grad_norm": 0.343779664813241, "learning_rate": 5.853652941643213e-06, "loss": 0.2378, "step": 2271 }, { "epoch": 3.8771331058020477, "grad_norm": 0.36997023186316547, "learning_rate": 5.836819755720584e-06, "loss": 0.2414, "step": 2272 }, { "epoch": 3.878839590443686, "grad_norm": 0.32154972520442693, "learning_rate": 5.820006671841836e-06, "loss": 0.2221, "step": 2273 }, { "epoch": 3.8805460750853245, "grad_norm": 0.31894735071449115, "learning_rate": 5.803213713870059e-06, "loss": 0.2388, "step": 2274 }, { "epoch": 3.8822525597269624, "grad_norm": 0.32642540892236, "learning_rate": 5.786440905639785e-06, "loss": 0.2871, "step": 2275 }, { "epoch": 3.8839590443686007, "grad_norm": 0.34316752007851276, "learning_rate": 5.769688270956955e-06, "loss": 0.2549, "step": 2276 }, { "epoch": 3.8856655290102387, "grad_norm": 0.33117472072087556, "learning_rate": 5.7529558335988565e-06, "loss": 0.1963, "step": 2277 }, { "epoch": 3.887372013651877, "grad_norm": 0.3466509582332143, "learning_rate": 5.736243617314141e-06, "loss": 0.2527, "step": 2278 }, { "epoch": 3.8890784982935154, "grad_norm": 0.36290764683736154, "learning_rate": 5.719551645822732e-06, "loss": 0.2143, "step": 2279 }, { "epoch": 3.8907849829351537, "grad_norm": 0.3077642498955797, "learning_rate": 5.702879942815827e-06, "loss": 0.2556, "step": 2280 }, { "epoch": 3.8924914675767917, "grad_norm": 0.29458913299329875, "learning_rate": 5.686228531955868e-06, "loss": 0.2473, "step": 2281 }, { "epoch": 3.89419795221843, "grad_norm": 0.37471296755406247, "learning_rate": 5.6695974368764795e-06, "loss": 0.2266, "step": 2282 }, { "epoch": 3.8959044368600684, "grad_norm": 0.32745265445978655, "learning_rate": 5.652986681182469e-06, "loss": 0.2553, "step": 2283 }, { "epoch": 3.8976109215017063, "grad_norm": 0.32490435963202086, "learning_rate": 5.6363962884497525e-06, "loss": 0.2368, "step": 2284 }, { "epoch": 3.8993174061433447, "grad_norm": 0.3216794269515537, "learning_rate": 5.619826282225374e-06, "loss": 0.2542, "step": 2285 }, { "epoch": 3.901023890784983, "grad_norm": 0.3277939858056872, "learning_rate": 5.603276686027415e-06, "loss": 0.2526, "step": 2286 }, { "epoch": 3.9027303754266214, "grad_norm": 0.30229387775788785, "learning_rate": 5.586747523345e-06, "loss": 0.257, "step": 2287 }, { "epoch": 3.9044368600682593, "grad_norm": 0.2910221901714991, "learning_rate": 5.570238817638261e-06, "loss": 0.2414, "step": 2288 }, { "epoch": 3.9061433447098977, "grad_norm": 0.3176948209851626, "learning_rate": 5.553750592338274e-06, "loss": 0.2415, "step": 2289 }, { "epoch": 3.9078498293515356, "grad_norm": 0.37527095766258994, "learning_rate": 5.537282870847071e-06, "loss": 0.2223, "step": 2290 }, { "epoch": 3.909556313993174, "grad_norm": 0.3196489250939008, "learning_rate": 5.520835676537568e-06, "loss": 0.2302, "step": 2291 }, { "epoch": 3.9112627986348123, "grad_norm": 0.32730931571869526, "learning_rate": 5.504409032753539e-06, "loss": 0.2634, "step": 2292 }, { "epoch": 3.9129692832764507, "grad_norm": 0.31263443085071635, "learning_rate": 5.4880029628096154e-06, "loss": 0.297, "step": 2293 }, { "epoch": 3.9146757679180886, "grad_norm": 0.3062321902336411, "learning_rate": 5.471617489991199e-06, "loss": 0.2359, "step": 2294 }, { "epoch": 3.916382252559727, "grad_norm": 0.3064753602125174, "learning_rate": 5.455252637554485e-06, "loss": 0.2169, "step": 2295 }, { "epoch": 3.9180887372013653, "grad_norm": 0.3258559818378836, "learning_rate": 5.438908428726375e-06, "loss": 0.2459, "step": 2296 }, { "epoch": 3.919795221843003, "grad_norm": 0.31517624160939745, "learning_rate": 5.422584886704503e-06, "loss": 0.2418, "step": 2297 }, { "epoch": 3.9215017064846416, "grad_norm": 0.3254440422368787, "learning_rate": 5.406282034657124e-06, "loss": 0.2363, "step": 2298 }, { "epoch": 3.92320819112628, "grad_norm": 0.35719119206231137, "learning_rate": 5.389999895723171e-06, "loss": 0.2463, "step": 2299 }, { "epoch": 3.9249146757679183, "grad_norm": 0.34033229275445037, "learning_rate": 5.3737384930121664e-06, "loss": 0.2377, "step": 2300 }, { "epoch": 3.926621160409556, "grad_norm": 0.31523156058285745, "learning_rate": 5.357497849604185e-06, "loss": 0.269, "step": 2301 }, { "epoch": 3.9283276450511946, "grad_norm": 0.3346505640474263, "learning_rate": 5.341277988549863e-06, "loss": 0.2719, "step": 2302 }, { "epoch": 3.9300341296928325, "grad_norm": 0.3224474214926856, "learning_rate": 5.325078932870311e-06, "loss": 0.1977, "step": 2303 }, { "epoch": 3.931740614334471, "grad_norm": 0.3295470584186461, "learning_rate": 5.308900705557147e-06, "loss": 0.2422, "step": 2304 }, { "epoch": 3.9334470989761092, "grad_norm": 0.3411314928029215, "learning_rate": 5.292743329572381e-06, "loss": 0.3056, "step": 2305 }, { "epoch": 3.9351535836177476, "grad_norm": 0.31089877772154434, "learning_rate": 5.276606827848463e-06, "loss": 0.2731, "step": 2306 }, { "epoch": 3.9368600682593855, "grad_norm": 0.3202486943451624, "learning_rate": 5.2604912232882156e-06, "loss": 0.223, "step": 2307 }, { "epoch": 3.938566552901024, "grad_norm": 0.35034122491047537, "learning_rate": 5.244396538764775e-06, "loss": 0.2836, "step": 2308 }, { "epoch": 3.9402730375426622, "grad_norm": 0.3493013375936862, "learning_rate": 5.228322797121619e-06, "loss": 0.2349, "step": 2309 }, { "epoch": 3.9419795221843, "grad_norm": 0.34220479674079957, "learning_rate": 5.212270021172477e-06, "loss": 0.206, "step": 2310 }, { "epoch": 3.9436860068259385, "grad_norm": 0.3280333198375722, "learning_rate": 5.196238233701325e-06, "loss": 0.2146, "step": 2311 }, { "epoch": 3.945392491467577, "grad_norm": 0.3030451926278049, "learning_rate": 5.18022745746235e-06, "loss": 0.2659, "step": 2312 }, { "epoch": 3.9470989761092152, "grad_norm": 0.3034563225242389, "learning_rate": 5.164237715179925e-06, "loss": 0.266, "step": 2313 }, { "epoch": 3.948805460750853, "grad_norm": 0.33878900097053566, "learning_rate": 5.148269029548571e-06, "loss": 0.2592, "step": 2314 }, { "epoch": 3.9505119453924915, "grad_norm": 0.29709169171885424, "learning_rate": 5.132321423232906e-06, "loss": 0.2514, "step": 2315 }, { "epoch": 3.9522184300341294, "grad_norm": 0.30617037457440516, "learning_rate": 5.116394918867655e-06, "loss": 0.2784, "step": 2316 }, { "epoch": 3.953924914675768, "grad_norm": 0.3277167449507937, "learning_rate": 5.100489539057558e-06, "loss": 0.2547, "step": 2317 }, { "epoch": 3.955631399317406, "grad_norm": 0.3652591747948567, "learning_rate": 5.084605306377408e-06, "loss": 0.2121, "step": 2318 }, { "epoch": 3.9573378839590445, "grad_norm": 0.33869441229654995, "learning_rate": 5.068742243371958e-06, "loss": 0.2413, "step": 2319 }, { "epoch": 3.9590443686006824, "grad_norm": 0.3200939822375758, "learning_rate": 5.0529003725559336e-06, "loss": 0.2938, "step": 2320 }, { "epoch": 3.960750853242321, "grad_norm": 0.3255871580066888, "learning_rate": 5.037079716413962e-06, "loss": 0.2306, "step": 2321 }, { "epoch": 3.962457337883959, "grad_norm": 0.33548813018959317, "learning_rate": 5.021280297400584e-06, "loss": 0.2539, "step": 2322 }, { "epoch": 3.964163822525597, "grad_norm": 0.3237828408910613, "learning_rate": 5.005502137940179e-06, "loss": 0.2591, "step": 2323 }, { "epoch": 3.9658703071672354, "grad_norm": 0.3858997700192493, "learning_rate": 4.989745260426952e-06, "loss": 0.2894, "step": 2324 }, { "epoch": 3.967576791808874, "grad_norm": 0.3107408904982704, "learning_rate": 4.974009687224919e-06, "loss": 0.2434, "step": 2325 }, { "epoch": 3.969283276450512, "grad_norm": 0.2898058224896841, "learning_rate": 4.95829544066784e-06, "loss": 0.2727, "step": 2326 }, { "epoch": 3.97098976109215, "grad_norm": 0.3488440701342725, "learning_rate": 4.942602543059223e-06, "loss": 0.228, "step": 2327 }, { "epoch": 3.9726962457337884, "grad_norm": 0.3258586338371556, "learning_rate": 4.926931016672259e-06, "loss": 0.2544, "step": 2328 }, { "epoch": 3.9744027303754264, "grad_norm": 0.330967486944079, "learning_rate": 4.91128088374981e-06, "loss": 0.2427, "step": 2329 }, { "epoch": 3.9761092150170647, "grad_norm": 0.3520407541519079, "learning_rate": 4.895652166504388e-06, "loss": 0.3261, "step": 2330 }, { "epoch": 3.977815699658703, "grad_norm": 0.3209888995589713, "learning_rate": 4.880044887118087e-06, "loss": 0.2429, "step": 2331 }, { "epoch": 3.9795221843003414, "grad_norm": 0.33335280650284016, "learning_rate": 4.864459067742595e-06, "loss": 0.2401, "step": 2332 }, { "epoch": 3.98122866894198, "grad_norm": 0.3202836880283363, "learning_rate": 4.848894730499125e-06, "loss": 0.2485, "step": 2333 }, { "epoch": 3.9829351535836177, "grad_norm": 0.34117405364873554, "learning_rate": 4.833351897478413e-06, "loss": 0.2533, "step": 2334 }, { "epoch": 3.984641638225256, "grad_norm": 0.30368478133877486, "learning_rate": 4.817830590740666e-06, "loss": 0.23, "step": 2335 }, { "epoch": 3.986348122866894, "grad_norm": 0.34107955341438967, "learning_rate": 4.802330832315534e-06, "loss": 0.2074, "step": 2336 }, { "epoch": 3.9880546075085324, "grad_norm": 0.3952011415516192, "learning_rate": 4.786852644202098e-06, "loss": 0.2755, "step": 2337 }, { "epoch": 3.9897610921501707, "grad_norm": 0.32140344822370476, "learning_rate": 4.771396048368806e-06, "loss": 0.2435, "step": 2338 }, { "epoch": 3.991467576791809, "grad_norm": 0.33607156170289026, "learning_rate": 4.7559610667534806e-06, "loss": 0.243, "step": 2339 }, { "epoch": 3.993174061433447, "grad_norm": 0.309209732048636, "learning_rate": 4.7405477212632404e-06, "loss": 0.2509, "step": 2340 }, { "epoch": 3.9948805460750854, "grad_norm": 0.32793268229295297, "learning_rate": 4.725156033774523e-06, "loss": 0.2386, "step": 2341 }, { "epoch": 3.9965870307167233, "grad_norm": 0.33004739302554115, "learning_rate": 4.70978602613301e-06, "loss": 0.22, "step": 2342 }, { "epoch": 3.9982935153583616, "grad_norm": 0.3779354844844152, "learning_rate": 4.6944377201536085e-06, "loss": 0.2241, "step": 2343 }, { "epoch": 4.0, "grad_norm": 0.3290126249506407, "learning_rate": 4.679111137620442e-06, "loss": 0.2136, "step": 2344 }, { "epoch": 4.001706484641638, "grad_norm": 0.4195821381484528, "learning_rate": 4.663806300286781e-06, "loss": 0.181, "step": 2345 }, { "epoch": 4.003412969283277, "grad_norm": 0.380339643137854, "learning_rate": 4.648523229875057e-06, "loss": 0.1879, "step": 2346 }, { "epoch": 4.005119453924915, "grad_norm": 0.3186044596112882, "learning_rate": 4.633261948076782e-06, "loss": 0.1376, "step": 2347 }, { "epoch": 4.006825938566553, "grad_norm": 0.31859418181041815, "learning_rate": 4.618022476552553e-06, "loss": 0.1901, "step": 2348 }, { "epoch": 4.008532423208191, "grad_norm": 0.3635020562324159, "learning_rate": 4.6028048369320195e-06, "loss": 0.1927, "step": 2349 }, { "epoch": 4.010238907849829, "grad_norm": 0.39829087215604436, "learning_rate": 4.5876090508138305e-06, "loss": 0.1687, "step": 2350 }, { "epoch": 4.011945392491468, "grad_norm": 0.3747006339515895, "learning_rate": 4.572435139765637e-06, "loss": 0.1651, "step": 2351 }, { "epoch": 4.013651877133106, "grad_norm": 0.35305428459036114, "learning_rate": 4.557283125324012e-06, "loss": 0.1892, "step": 2352 }, { "epoch": 4.015358361774744, "grad_norm": 0.3267597738262304, "learning_rate": 4.542153028994487e-06, "loss": 0.1759, "step": 2353 }, { "epoch": 4.017064846416382, "grad_norm": 0.34994523827678176, "learning_rate": 4.527044872251458e-06, "loss": 0.1868, "step": 2354 }, { "epoch": 4.01877133105802, "grad_norm": 0.38266239654081796, "learning_rate": 4.511958676538186e-06, "loss": 0.1628, "step": 2355 }, { "epoch": 4.020477815699659, "grad_norm": 0.40574871016875586, "learning_rate": 4.4968944632667764e-06, "loss": 0.1634, "step": 2356 }, { "epoch": 4.022184300341297, "grad_norm": 0.3570647899059514, "learning_rate": 4.481852253818113e-06, "loss": 0.186, "step": 2357 }, { "epoch": 4.023890784982935, "grad_norm": 0.35037574886916495, "learning_rate": 4.4668320695418736e-06, "loss": 0.1886, "step": 2358 }, { "epoch": 4.025597269624574, "grad_norm": 0.3230046933285519, "learning_rate": 4.451833931756457e-06, "loss": 0.1969, "step": 2359 }, { "epoch": 4.027303754266212, "grad_norm": 0.3246664508384852, "learning_rate": 4.436857861748969e-06, "loss": 0.184, "step": 2360 }, { "epoch": 4.0290102389078495, "grad_norm": 0.3382136883702398, "learning_rate": 4.4219038807752135e-06, "loss": 0.1727, "step": 2361 }, { "epoch": 4.030716723549488, "grad_norm": 0.3574461598883982, "learning_rate": 4.406972010059623e-06, "loss": 0.1693, "step": 2362 }, { "epoch": 4.032423208191126, "grad_norm": 0.3159101539493147, "learning_rate": 4.3920622707952635e-06, "loss": 0.1863, "step": 2363 }, { "epoch": 4.034129692832765, "grad_norm": 0.3027313233728572, "learning_rate": 4.3771746841437765e-06, "loss": 0.1543, "step": 2364 }, { "epoch": 4.035836177474403, "grad_norm": 0.29884331247701995, "learning_rate": 4.362309271235374e-06, "loss": 0.1762, "step": 2365 }, { "epoch": 4.037542662116041, "grad_norm": 0.31625900200267343, "learning_rate": 4.3474660531687915e-06, "loss": 0.179, "step": 2366 }, { "epoch": 4.039249146757679, "grad_norm": 0.32403350412788073, "learning_rate": 4.332645051011253e-06, "loss": 0.2008, "step": 2367 }, { "epoch": 4.040955631399317, "grad_norm": 0.30085829152297205, "learning_rate": 4.3178462857984705e-06, "loss": 0.1471, "step": 2368 }, { "epoch": 4.0426621160409555, "grad_norm": 0.3434443386369388, "learning_rate": 4.303069778534574e-06, "loss": 0.1585, "step": 2369 }, { "epoch": 4.044368600682594, "grad_norm": 0.3194979721641485, "learning_rate": 4.288315550192126e-06, "loss": 0.164, "step": 2370 }, { "epoch": 4.046075085324232, "grad_norm": 0.31954843401356814, "learning_rate": 4.273583621712041e-06, "loss": 0.1791, "step": 2371 }, { "epoch": 4.047781569965871, "grad_norm": 0.30033373224280846, "learning_rate": 4.258874014003616e-06, "loss": 0.1512, "step": 2372 }, { "epoch": 4.049488054607509, "grad_norm": 0.33525231379828074, "learning_rate": 4.244186747944425e-06, "loss": 0.157, "step": 2373 }, { "epoch": 4.051194539249146, "grad_norm": 0.3391082338188415, "learning_rate": 4.2295218443803686e-06, "loss": 0.131, "step": 2374 }, { "epoch": 4.052901023890785, "grad_norm": 0.3236061516370624, "learning_rate": 4.214879324125601e-06, "loss": 0.1891, "step": 2375 }, { "epoch": 4.054607508532423, "grad_norm": 0.30222161809585246, "learning_rate": 4.200259207962487e-06, "loss": 0.1932, "step": 2376 }, { "epoch": 4.0563139931740615, "grad_norm": 0.2995732617456124, "learning_rate": 4.185661516641622e-06, "loss": 0.233, "step": 2377 }, { "epoch": 4.0580204778157, "grad_norm": 0.2935346367299693, "learning_rate": 4.171086270881752e-06, "loss": 0.1659, "step": 2378 }, { "epoch": 4.059726962457338, "grad_norm": 0.31560070678665797, "learning_rate": 4.156533491369772e-06, "loss": 0.1645, "step": 2379 }, { "epoch": 4.061433447098976, "grad_norm": 0.30346193217864753, "learning_rate": 4.142003198760685e-06, "loss": 0.1734, "step": 2380 }, { "epoch": 4.063139931740614, "grad_norm": 0.30241178077042385, "learning_rate": 4.127495413677592e-06, "loss": 0.1854, "step": 2381 }, { "epoch": 4.064846416382252, "grad_norm": 0.30940900115979975, "learning_rate": 4.1130101567116435e-06, "loss": 0.225, "step": 2382 }, { "epoch": 4.066552901023891, "grad_norm": 0.2980665181561457, "learning_rate": 4.0985474484219986e-06, "loss": 0.1982, "step": 2383 }, { "epoch": 4.068259385665529, "grad_norm": 0.29077509347783315, "learning_rate": 4.08410730933585e-06, "loss": 0.1957, "step": 2384 }, { "epoch": 4.0699658703071675, "grad_norm": 0.32745222882898095, "learning_rate": 4.069689759948308e-06, "loss": 0.1886, "step": 2385 }, { "epoch": 4.071672354948806, "grad_norm": 0.30247319005231443, "learning_rate": 4.055294820722462e-06, "loss": 0.1575, "step": 2386 }, { "epoch": 4.073378839590443, "grad_norm": 0.30760195988207906, "learning_rate": 4.040922512089287e-06, "loss": 0.1918, "step": 2387 }, { "epoch": 4.075085324232082, "grad_norm": 0.36117705419328205, "learning_rate": 4.02657285444765e-06, "loss": 0.1721, "step": 2388 }, { "epoch": 4.07679180887372, "grad_norm": 0.3094996833260368, "learning_rate": 4.012245868164273e-06, "loss": 0.198, "step": 2389 }, { "epoch": 4.078498293515358, "grad_norm": 0.317353138990636, "learning_rate": 3.997941573573685e-06, "loss": 0.2555, "step": 2390 }, { "epoch": 4.080204778156997, "grad_norm": 0.28796017944724317, "learning_rate": 3.983659990978217e-06, "loss": 0.1837, "step": 2391 }, { "epoch": 4.081911262798635, "grad_norm": 0.31493694678711875, "learning_rate": 3.969401140647957e-06, "loss": 0.1891, "step": 2392 }, { "epoch": 4.083617747440273, "grad_norm": 0.3113813772180343, "learning_rate": 3.955165042820748e-06, "loss": 0.1497, "step": 2393 }, { "epoch": 4.085324232081911, "grad_norm": 0.28613972465653337, "learning_rate": 3.940951717702115e-06, "loss": 0.1808, "step": 2394 }, { "epoch": 4.087030716723549, "grad_norm": 0.3173301121228953, "learning_rate": 3.926761185465277e-06, "loss": 0.1959, "step": 2395 }, { "epoch": 4.088737201365188, "grad_norm": 0.28874789354399394, "learning_rate": 3.912593466251111e-06, "loss": 0.2134, "step": 2396 }, { "epoch": 4.090443686006826, "grad_norm": 0.3179204985820542, "learning_rate": 3.898448580168084e-06, "loss": 0.2062, "step": 2397 }, { "epoch": 4.092150170648464, "grad_norm": 0.3498702154663877, "learning_rate": 3.8843265472922874e-06, "loss": 0.1824, "step": 2398 }, { "epoch": 4.093856655290103, "grad_norm": 0.3181203277230868, "learning_rate": 3.870227387667355e-06, "loss": 0.1751, "step": 2399 }, { "epoch": 4.09556313993174, "grad_norm": 0.32833596189353553, "learning_rate": 3.856151121304477e-06, "loss": 0.1827, "step": 2400 }, { "epoch": 4.097269624573379, "grad_norm": 0.3097861506682424, "learning_rate": 3.842097768182324e-06, "loss": 0.1596, "step": 2401 }, { "epoch": 4.098976109215017, "grad_norm": 0.29847487443923326, "learning_rate": 3.828067348247076e-06, "loss": 0.1888, "step": 2402 }, { "epoch": 4.100682593856655, "grad_norm": 0.32331562735973857, "learning_rate": 3.8140598814123374e-06, "loss": 0.1648, "step": 2403 }, { "epoch": 4.102389078498294, "grad_norm": 0.31911722607997095, "learning_rate": 3.8000753875591455e-06, "loss": 0.1648, "step": 2404 }, { "epoch": 4.104095563139932, "grad_norm": 0.3168256761024783, "learning_rate": 3.7861138865359383e-06, "loss": 0.1689, "step": 2405 }, { "epoch": 4.1058020477815695, "grad_norm": 0.3201312099698668, "learning_rate": 3.772175398158504e-06, "loss": 0.1677, "step": 2406 }, { "epoch": 4.107508532423208, "grad_norm": 0.3088327738845882, "learning_rate": 3.7582599422099873e-06, "loss": 0.1412, "step": 2407 }, { "epoch": 4.109215017064846, "grad_norm": 0.30505958330845273, "learning_rate": 3.744367538440823e-06, "loss": 0.1517, "step": 2408 }, { "epoch": 4.110921501706485, "grad_norm": 0.32653606387534573, "learning_rate": 3.7304982065687447e-06, "loss": 0.1906, "step": 2409 }, { "epoch": 4.112627986348123, "grad_norm": 0.332667019462028, "learning_rate": 3.7166519662787327e-06, "loss": 0.1678, "step": 2410 }, { "epoch": 4.114334470989761, "grad_norm": 0.2994498270660051, "learning_rate": 3.7028288372229825e-06, "loss": 0.1931, "step": 2411 }, { "epoch": 4.1160409556314, "grad_norm": 0.28551721653406226, "learning_rate": 3.6890288390209093e-06, "loss": 0.1698, "step": 2412 }, { "epoch": 4.117747440273037, "grad_norm": 0.31365733078967856, "learning_rate": 3.675251991259079e-06, "loss": 0.2001, "step": 2413 }, { "epoch": 4.1194539249146755, "grad_norm": 0.31358514885701966, "learning_rate": 3.661498313491214e-06, "loss": 0.1492, "step": 2414 }, { "epoch": 4.121160409556314, "grad_norm": 0.29088423041712363, "learning_rate": 3.6477678252381375e-06, "loss": 0.1716, "step": 2415 }, { "epoch": 4.122866894197952, "grad_norm": 0.31473294787636624, "learning_rate": 3.6340605459877675e-06, "loss": 0.1633, "step": 2416 }, { "epoch": 4.124573378839591, "grad_norm": 0.337747855502917, "learning_rate": 3.6203764951950836e-06, "loss": 0.184, "step": 2417 }, { "epoch": 4.126279863481229, "grad_norm": 0.3060822525059725, "learning_rate": 3.6067156922820877e-06, "loss": 0.1865, "step": 2418 }, { "epoch": 4.1279863481228665, "grad_norm": 0.3430301730706332, "learning_rate": 3.593078156637797e-06, "loss": 0.1935, "step": 2419 }, { "epoch": 4.129692832764505, "grad_norm": 0.30029683914160743, "learning_rate": 3.5794639076181924e-06, "loss": 0.1868, "step": 2420 }, { "epoch": 4.131399317406143, "grad_norm": 0.3452712998400332, "learning_rate": 3.5658729645462175e-06, "loss": 0.1679, "step": 2421 }, { "epoch": 4.1331058020477816, "grad_norm": 0.2948759494429693, "learning_rate": 3.5523053467117287e-06, "loss": 0.1866, "step": 2422 }, { "epoch": 4.13481228668942, "grad_norm": 0.346765414439053, "learning_rate": 3.5387610733714685e-06, "loss": 0.1677, "step": 2423 }, { "epoch": 4.136518771331058, "grad_norm": 0.3307964280323542, "learning_rate": 3.5252401637490683e-06, "loss": 0.1778, "step": 2424 }, { "epoch": 4.138225255972697, "grad_norm": 0.2995036414158578, "learning_rate": 3.5117426370349763e-06, "loss": 0.1825, "step": 2425 }, { "epoch": 4.139931740614334, "grad_norm": 0.32137369482037303, "learning_rate": 3.4982685123864712e-06, "loss": 0.1963, "step": 2426 }, { "epoch": 4.1416382252559725, "grad_norm": 0.30769668129711114, "learning_rate": 3.484817808927605e-06, "loss": 0.1787, "step": 2427 }, { "epoch": 4.143344709897611, "grad_norm": 0.33124297434962124, "learning_rate": 3.471390545749187e-06, "loss": 0.1649, "step": 2428 }, { "epoch": 4.145051194539249, "grad_norm": 0.3157941746181479, "learning_rate": 3.4579867419087696e-06, "loss": 0.1941, "step": 2429 }, { "epoch": 4.146757679180888, "grad_norm": 0.31393737449028575, "learning_rate": 3.444606416430594e-06, "loss": 0.1619, "step": 2430 }, { "epoch": 4.148464163822526, "grad_norm": 0.29530891100487067, "learning_rate": 3.4312495883055898e-06, "loss": 0.1539, "step": 2431 }, { "epoch": 4.150170648464163, "grad_norm": 0.3001566132614726, "learning_rate": 3.417916276491324e-06, "loss": 0.1701, "step": 2432 }, { "epoch": 4.151877133105802, "grad_norm": 0.31297145051268754, "learning_rate": 3.404606499912004e-06, "loss": 0.2238, "step": 2433 }, { "epoch": 4.15358361774744, "grad_norm": 0.3008453140731978, "learning_rate": 3.3913202774584187e-06, "loss": 0.1826, "step": 2434 }, { "epoch": 4.1552901023890785, "grad_norm": 0.33268967523986326, "learning_rate": 3.378057627987925e-06, "loss": 0.2026, "step": 2435 }, { "epoch": 4.156996587030717, "grad_norm": 0.31920950857284724, "learning_rate": 3.3648185703244396e-06, "loss": 0.1999, "step": 2436 }, { "epoch": 4.158703071672355, "grad_norm": 0.30205249689919317, "learning_rate": 3.3516031232583737e-06, "loss": 0.1965, "step": 2437 }, { "epoch": 4.160409556313994, "grad_norm": 0.3074081500263038, "learning_rate": 3.3384113055466428e-06, "loss": 0.2154, "step": 2438 }, { "epoch": 4.162116040955631, "grad_norm": 0.34729139465173037, "learning_rate": 3.3252431359126147e-06, "loss": 0.1595, "step": 2439 }, { "epoch": 4.163822525597269, "grad_norm": 0.3054807917727276, "learning_rate": 3.3120986330461036e-06, "loss": 0.149, "step": 2440 }, { "epoch": 4.165529010238908, "grad_norm": 0.30757842212638165, "learning_rate": 3.2989778156033257e-06, "loss": 0.1673, "step": 2441 }, { "epoch": 4.167235494880546, "grad_norm": 0.3142441653237269, "learning_rate": 3.285880702206874e-06, "loss": 0.1913, "step": 2442 }, { "epoch": 4.1689419795221845, "grad_norm": 0.3152934803543005, "learning_rate": 3.272807311445716e-06, "loss": 0.1665, "step": 2443 }, { "epoch": 4.170648464163823, "grad_norm": 0.3286907418748899, "learning_rate": 3.259757661875129e-06, "loss": 0.1797, "step": 2444 }, { "epoch": 4.172354948805461, "grad_norm": 0.30941258169800717, "learning_rate": 3.2467317720167135e-06, "loss": 0.1973, "step": 2445 }, { "epoch": 4.174061433447099, "grad_norm": 0.3055585061508135, "learning_rate": 3.2337296603583336e-06, "loss": 0.1947, "step": 2446 }, { "epoch": 4.175767918088737, "grad_norm": 0.34521820630003597, "learning_rate": 3.2207513453541027e-06, "loss": 0.176, "step": 2447 }, { "epoch": 4.177474402730375, "grad_norm": 0.2809607338036196, "learning_rate": 3.2077968454243757e-06, "loss": 0.181, "step": 2448 }, { "epoch": 4.179180887372014, "grad_norm": 0.33299427285978445, "learning_rate": 3.1948661789556844e-06, "loss": 0.1574, "step": 2449 }, { "epoch": 4.180887372013652, "grad_norm": 0.30328282365657666, "learning_rate": 3.1819593643007574e-06, "loss": 0.1883, "step": 2450 }, { "epoch": 4.1825938566552905, "grad_norm": 0.33795654538569, "learning_rate": 3.1690764197784453e-06, "loss": 0.1714, "step": 2451 }, { "epoch": 4.184300341296928, "grad_norm": 0.30990167385479767, "learning_rate": 3.156217363673748e-06, "loss": 0.1939, "step": 2452 }, { "epoch": 4.186006825938566, "grad_norm": 0.31593916081353907, "learning_rate": 3.1433822142377222e-06, "loss": 0.1479, "step": 2453 }, { "epoch": 4.187713310580205, "grad_norm": 0.30879416774852353, "learning_rate": 3.1305709896875267e-06, "loss": 0.2035, "step": 2454 }, { "epoch": 4.189419795221843, "grad_norm": 0.31106106097285646, "learning_rate": 3.1177837082063565e-06, "loss": 0.1769, "step": 2455 }, { "epoch": 4.191126279863481, "grad_norm": 0.3285079312818511, "learning_rate": 3.105020387943405e-06, "loss": 0.1665, "step": 2456 }, { "epoch": 4.19283276450512, "grad_norm": 0.35199684412836346, "learning_rate": 3.0922810470138854e-06, "loss": 0.1617, "step": 2457 }, { "epoch": 4.194539249146757, "grad_norm": 0.31623575446419255, "learning_rate": 3.079565703498957e-06, "loss": 0.1905, "step": 2458 }, { "epoch": 4.196245733788396, "grad_norm": 0.31649344891525233, "learning_rate": 3.0668743754457207e-06, "loss": 0.1663, "step": 2459 }, { "epoch": 4.197952218430034, "grad_norm": 0.3283474471234332, "learning_rate": 3.054207080867191e-06, "loss": 0.1781, "step": 2460 }, { "epoch": 4.199658703071672, "grad_norm": 0.30898068543856966, "learning_rate": 3.0415638377422853e-06, "loss": 0.2372, "step": 2461 }, { "epoch": 4.201365187713311, "grad_norm": 0.31695022179054894, "learning_rate": 3.0289446640157736e-06, "loss": 0.1655, "step": 2462 }, { "epoch": 4.203071672354949, "grad_norm": 0.2830330422400834, "learning_rate": 3.016349577598261e-06, "loss": 0.1886, "step": 2463 }, { "epoch": 4.204778156996587, "grad_norm": 0.2892447590865804, "learning_rate": 3.003778596366178e-06, "loss": 0.1727, "step": 2464 }, { "epoch": 4.206484641638225, "grad_norm": 0.3289191933004354, "learning_rate": 2.991231738161717e-06, "loss": 0.1761, "step": 2465 }, { "epoch": 4.208191126279863, "grad_norm": 0.29484730013264737, "learning_rate": 2.9787090207928627e-06, "loss": 0.1913, "step": 2466 }, { "epoch": 4.209897610921502, "grad_norm": 0.3409795446165211, "learning_rate": 2.9662104620333122e-06, "loss": 0.1706, "step": 2467 }, { "epoch": 4.21160409556314, "grad_norm": 0.29846831357916814, "learning_rate": 2.953736079622487e-06, "loss": 0.2099, "step": 2468 }, { "epoch": 4.213310580204778, "grad_norm": 0.32455537188833833, "learning_rate": 2.9412858912654973e-06, "loss": 0.1622, "step": 2469 }, { "epoch": 4.215017064846417, "grad_norm": 0.2924663053130245, "learning_rate": 2.9288599146331043e-06, "loss": 0.1695, "step": 2470 }, { "epoch": 4.216723549488055, "grad_norm": 0.31636765141272133, "learning_rate": 2.916458167361709e-06, "loss": 0.1685, "step": 2471 }, { "epoch": 4.2184300341296925, "grad_norm": 0.29890500727297004, "learning_rate": 2.904080667053315e-06, "loss": 0.1848, "step": 2472 }, { "epoch": 4.220136518771331, "grad_norm": 0.3159287635158706, "learning_rate": 2.891727431275535e-06, "loss": 0.166, "step": 2473 }, { "epoch": 4.221843003412969, "grad_norm": 0.32374950088405524, "learning_rate": 2.879398477561515e-06, "loss": 0.1511, "step": 2474 }, { "epoch": 4.223549488054608, "grad_norm": 0.29710815719678113, "learning_rate": 2.8670938234099544e-06, "loss": 0.216, "step": 2475 }, { "epoch": 4.225255972696246, "grad_norm": 0.30868037596993714, "learning_rate": 2.854813486285066e-06, "loss": 0.21, "step": 2476 }, { "epoch": 4.226962457337884, "grad_norm": 0.3152187498011452, "learning_rate": 2.8425574836165347e-06, "loss": 0.1402, "step": 2477 }, { "epoch": 4.228668941979522, "grad_norm": 0.31038840558797376, "learning_rate": 2.8303258327995164e-06, "loss": 0.1557, "step": 2478 }, { "epoch": 4.23037542662116, "grad_norm": 0.34017375830113183, "learning_rate": 2.8181185511945997e-06, "loss": 0.1758, "step": 2479 }, { "epoch": 4.2320819112627985, "grad_norm": 0.31419921149308355, "learning_rate": 2.805935656127794e-06, "loss": 0.179, "step": 2480 }, { "epoch": 4.233788395904437, "grad_norm": 0.3087576948893619, "learning_rate": 2.793777164890481e-06, "loss": 0.2254, "step": 2481 }, { "epoch": 4.235494880546075, "grad_norm": 0.2897950303673048, "learning_rate": 2.7816430947394234e-06, "loss": 0.1704, "step": 2482 }, { "epoch": 4.237201365187714, "grad_norm": 0.30271889939946317, "learning_rate": 2.7695334628967186e-06, "loss": 0.1715, "step": 2483 }, { "epoch": 4.238907849829351, "grad_norm": 0.32471275307318515, "learning_rate": 2.757448286549762e-06, "loss": 0.1731, "step": 2484 }, { "epoch": 4.2406143344709895, "grad_norm": 0.30425370243267175, "learning_rate": 2.74538758285126e-06, "loss": 0.1662, "step": 2485 }, { "epoch": 4.242320819112628, "grad_norm": 0.3310020579739023, "learning_rate": 2.733351368919166e-06, "loss": 0.1838, "step": 2486 }, { "epoch": 4.244027303754266, "grad_norm": 0.3229193120957275, "learning_rate": 2.7213396618366973e-06, "loss": 0.1635, "step": 2487 }, { "epoch": 4.2457337883959045, "grad_norm": 0.3057093282694775, "learning_rate": 2.709352478652263e-06, "loss": 0.1788, "step": 2488 }, { "epoch": 4.247440273037543, "grad_norm": 0.32147087828457116, "learning_rate": 2.697389836379487e-06, "loss": 0.1454, "step": 2489 }, { "epoch": 4.249146757679181, "grad_norm": 0.2910842326849101, "learning_rate": 2.685451751997148e-06, "loss": 0.178, "step": 2490 }, { "epoch": 4.250853242320819, "grad_norm": 0.3054245893841373, "learning_rate": 2.6735382424491675e-06, "loss": 0.1415, "step": 2491 }, { "epoch": 4.252559726962457, "grad_norm": 0.3052755561261774, "learning_rate": 2.661649324644604e-06, "loss": 0.1798, "step": 2492 }, { "epoch": 4.2542662116040955, "grad_norm": 0.3182082318814107, "learning_rate": 2.649785015457591e-06, "loss": 0.1748, "step": 2493 }, { "epoch": 4.255972696245734, "grad_norm": 0.29962345203407825, "learning_rate": 2.637945331727356e-06, "loss": 0.1745, "step": 2494 }, { "epoch": 4.257679180887372, "grad_norm": 0.32292803056798236, "learning_rate": 2.6261302902581597e-06, "loss": 0.1657, "step": 2495 }, { "epoch": 4.2593856655290105, "grad_norm": 0.3132811007902274, "learning_rate": 2.6143399078192877e-06, "loss": 0.1414, "step": 2496 }, { "epoch": 4.261092150170649, "grad_norm": 0.33403720279621085, "learning_rate": 2.6025742011450406e-06, "loss": 0.1657, "step": 2497 }, { "epoch": 4.262798634812286, "grad_norm": 0.32096632949756077, "learning_rate": 2.590833186934676e-06, "loss": 0.1731, "step": 2498 }, { "epoch": 4.264505119453925, "grad_norm": 0.29495802159678225, "learning_rate": 2.5791168818524283e-06, "loss": 0.1773, "step": 2499 }, { "epoch": 4.266211604095563, "grad_norm": 0.2944433363612561, "learning_rate": 2.5674253025274396e-06, "loss": 0.198, "step": 2500 }, { "epoch": 4.2679180887372015, "grad_norm": 0.3176713163870496, "learning_rate": 2.5557584655537746e-06, "loss": 0.1539, "step": 2501 }, { "epoch": 4.26962457337884, "grad_norm": 0.2853734440639459, "learning_rate": 2.5441163874903742e-06, "loss": 0.1919, "step": 2502 }, { "epoch": 4.271331058020478, "grad_norm": 0.32089791476011936, "learning_rate": 2.532499084861033e-06, "loss": 0.1641, "step": 2503 }, { "epoch": 4.273037542662116, "grad_norm": 0.3061287654995635, "learning_rate": 2.5209065741543936e-06, "loss": 0.1725, "step": 2504 }, { "epoch": 4.274744027303754, "grad_norm": 0.27879825597833974, "learning_rate": 2.5093388718238987e-06, "loss": 0.139, "step": 2505 }, { "epoch": 4.276450511945392, "grad_norm": 0.31504174633544063, "learning_rate": 2.497795994287795e-06, "loss": 0.1702, "step": 2506 }, { "epoch": 4.278156996587031, "grad_norm": 0.35568855400749266, "learning_rate": 2.4862779579290797e-06, "loss": 0.1431, "step": 2507 }, { "epoch": 4.279863481228669, "grad_norm": 0.31786679741994495, "learning_rate": 2.474784779095496e-06, "loss": 0.1591, "step": 2508 }, { "epoch": 4.2815699658703075, "grad_norm": 0.29747899647345816, "learning_rate": 2.4633164740995154e-06, "loss": 0.2301, "step": 2509 }, { "epoch": 4.283276450511945, "grad_norm": 0.29501765904829275, "learning_rate": 2.4518730592182926e-06, "loss": 0.1597, "step": 2510 }, { "epoch": 4.284982935153583, "grad_norm": 0.2955468959025003, "learning_rate": 2.4404545506936716e-06, "loss": 0.2034, "step": 2511 }, { "epoch": 4.286689419795222, "grad_norm": 0.30088377990607584, "learning_rate": 2.429060964732126e-06, "loss": 0.2104, "step": 2512 }, { "epoch": 4.28839590443686, "grad_norm": 0.3020198754204438, "learning_rate": 2.4176923175047763e-06, "loss": 0.1778, "step": 2513 }, { "epoch": 4.290102389078498, "grad_norm": 0.3102244929124343, "learning_rate": 2.4063486251473344e-06, "loss": 0.1793, "step": 2514 }, { "epoch": 4.291808873720137, "grad_norm": 0.3253556220298246, "learning_rate": 2.395029903760091e-06, "loss": 0.1489, "step": 2515 }, { "epoch": 4.293515358361775, "grad_norm": 0.3410550422933944, "learning_rate": 2.3837361694079107e-06, "loss": 0.1485, "step": 2516 }, { "epoch": 4.295221843003413, "grad_norm": 0.3130313366088956, "learning_rate": 2.372467438120174e-06, "loss": 0.1833, "step": 2517 }, { "epoch": 4.296928327645051, "grad_norm": 0.3120907160997788, "learning_rate": 2.3612237258907957e-06, "loss": 0.1779, "step": 2518 }, { "epoch": 4.298634812286689, "grad_norm": 0.2922644753111327, "learning_rate": 2.3500050486781566e-06, "loss": 0.1672, "step": 2519 }, { "epoch": 4.300341296928328, "grad_norm": 0.3227869521532051, "learning_rate": 2.338811422405127e-06, "loss": 0.1571, "step": 2520 }, { "epoch": 4.302047781569966, "grad_norm": 0.3034733437198407, "learning_rate": 2.3276428629590075e-06, "loss": 0.2009, "step": 2521 }, { "epoch": 4.303754266211604, "grad_norm": 0.3263899032863054, "learning_rate": 2.316499386191522e-06, "loss": 0.1496, "step": 2522 }, { "epoch": 4.305460750853243, "grad_norm": 0.30830203624371233, "learning_rate": 2.3053810079188057e-06, "loss": 0.1644, "step": 2523 }, { "epoch": 4.30716723549488, "grad_norm": 0.3050694718628497, "learning_rate": 2.2942877439213528e-06, "loss": 0.1994, "step": 2524 }, { "epoch": 4.308873720136519, "grad_norm": 0.29845586994255363, "learning_rate": 2.283219609944034e-06, "loss": 0.1786, "step": 2525 }, { "epoch": 4.310580204778157, "grad_norm": 0.3418072194790865, "learning_rate": 2.272176621696034e-06, "loss": 0.1735, "step": 2526 }, { "epoch": 4.312286689419795, "grad_norm": 0.29544505841648233, "learning_rate": 2.261158794850853e-06, "loss": 0.1902, "step": 2527 }, { "epoch": 4.313993174061434, "grad_norm": 0.31134334943651026, "learning_rate": 2.2501661450462886e-06, "loss": 0.179, "step": 2528 }, { "epoch": 4.315699658703072, "grad_norm": 0.301310338429921, "learning_rate": 2.2391986878843876e-06, "loss": 0.1682, "step": 2529 }, { "epoch": 4.3174061433447095, "grad_norm": 0.29330584727835124, "learning_rate": 2.2282564389314576e-06, "loss": 0.1794, "step": 2530 }, { "epoch": 4.319112627986348, "grad_norm": 0.3070215194233873, "learning_rate": 2.217339413718014e-06, "loss": 0.1417, "step": 2531 }, { "epoch": 4.320819112627986, "grad_norm": 0.30821668681842695, "learning_rate": 2.2064476277387858e-06, "loss": 0.1736, "step": 2532 }, { "epoch": 4.322525597269625, "grad_norm": 0.3184121253545697, "learning_rate": 2.1955810964526593e-06, "loss": 0.2005, "step": 2533 }, { "epoch": 4.324232081911263, "grad_norm": 0.31780555744096844, "learning_rate": 2.18473983528269e-06, "loss": 0.1924, "step": 2534 }, { "epoch": 4.325938566552901, "grad_norm": 0.3072965040016955, "learning_rate": 2.173923859616076e-06, "loss": 0.18, "step": 2535 }, { "epoch": 4.327645051194539, "grad_norm": 0.2824134182741914, "learning_rate": 2.1631331848041025e-06, "loss": 0.16, "step": 2536 }, { "epoch": 4.329351535836177, "grad_norm": 0.31297368541999177, "learning_rate": 2.1523678261621715e-06, "loss": 0.1593, "step": 2537 }, { "epoch": 4.3310580204778155, "grad_norm": 0.32119774641052234, "learning_rate": 2.1416277989697344e-06, "loss": 0.157, "step": 2538 }, { "epoch": 4.332764505119454, "grad_norm": 0.2886738863134529, "learning_rate": 2.130913118470297e-06, "loss": 0.2005, "step": 2539 }, { "epoch": 4.334470989761092, "grad_norm": 0.2872363431291567, "learning_rate": 2.1202237998713814e-06, "loss": 0.1568, "step": 2540 }, { "epoch": 4.336177474402731, "grad_norm": 0.2913090673455017, "learning_rate": 2.1095598583445255e-06, "loss": 0.1927, "step": 2541 }, { "epoch": 4.337883959044369, "grad_norm": 0.3110281755536224, "learning_rate": 2.09892130902525e-06, "loss": 0.1517, "step": 2542 }, { "epoch": 4.339590443686006, "grad_norm": 0.2896544715959582, "learning_rate": 2.0883081670130202e-06, "loss": 0.1644, "step": 2543 }, { "epoch": 4.341296928327645, "grad_norm": 0.29564797273531257, "learning_rate": 2.0777204473712564e-06, "loss": 0.1809, "step": 2544 }, { "epoch": 4.343003412969283, "grad_norm": 0.30223501178504436, "learning_rate": 2.06715816512729e-06, "loss": 0.1618, "step": 2545 }, { "epoch": 4.3447098976109215, "grad_norm": 0.2796922695719279, "learning_rate": 2.056621335272344e-06, "loss": 0.1909, "step": 2546 }, { "epoch": 4.34641638225256, "grad_norm": 0.3059516004912932, "learning_rate": 2.046109972761523e-06, "loss": 0.2023, "step": 2547 }, { "epoch": 4.348122866894198, "grad_norm": 0.2909794324825243, "learning_rate": 2.0356240925137816e-06, "loss": 0.14, "step": 2548 }, { "epoch": 4.349829351535837, "grad_norm": 0.30160162314385436, "learning_rate": 2.025163709411917e-06, "loss": 0.2013, "step": 2549 }, { "epoch": 4.351535836177474, "grad_norm": 0.2830562144601256, "learning_rate": 2.0147288383025197e-06, "loss": 0.1638, "step": 2550 }, { "epoch": 4.353242320819112, "grad_norm": 0.3332359895409523, "learning_rate": 2.004319493995992e-06, "loss": 0.1801, "step": 2551 }, { "epoch": 4.354948805460751, "grad_norm": 0.3047315291970305, "learning_rate": 1.993935691266482e-06, "loss": 0.1702, "step": 2552 }, { "epoch": 4.356655290102389, "grad_norm": 0.3091788010415166, "learning_rate": 1.9835774448519075e-06, "loss": 0.1949, "step": 2553 }, { "epoch": 4.3583617747440275, "grad_norm": 0.29617004681223497, "learning_rate": 1.973244769453897e-06, "loss": 0.1979, "step": 2554 }, { "epoch": 4.360068259385666, "grad_norm": 0.29985425485567585, "learning_rate": 1.9629376797378e-06, "loss": 0.2125, "step": 2555 }, { "epoch": 4.361774744027303, "grad_norm": 0.29373314664152933, "learning_rate": 1.95265619033264e-06, "loss": 0.1832, "step": 2556 }, { "epoch": 4.363481228668942, "grad_norm": 0.30556795498559286, "learning_rate": 1.9424003158311187e-06, "loss": 0.1862, "step": 2557 }, { "epoch": 4.36518771331058, "grad_norm": 0.3373225031070908, "learning_rate": 1.9321700707895672e-06, "loss": 0.1708, "step": 2558 }, { "epoch": 4.3668941979522184, "grad_norm": 0.31772294187974187, "learning_rate": 1.9219654697279443e-06, "loss": 0.2783, "step": 2559 }, { "epoch": 4.368600682593857, "grad_norm": 0.29033686690165184, "learning_rate": 1.9117865271298264e-06, "loss": 0.1807, "step": 2560 }, { "epoch": 4.370307167235495, "grad_norm": 0.2997392442069027, "learning_rate": 1.9016332574423479e-06, "loss": 0.1626, "step": 2561 }, { "epoch": 4.372013651877133, "grad_norm": 0.30622537739214434, "learning_rate": 1.8915056750762261e-06, "loss": 0.1916, "step": 2562 }, { "epoch": 4.373720136518771, "grad_norm": 0.31749443847272735, "learning_rate": 1.8814037944057117e-06, "loss": 0.1934, "step": 2563 }, { "epoch": 4.375426621160409, "grad_norm": 0.3081795435254109, "learning_rate": 1.8713276297685712e-06, "loss": 0.1489, "step": 2564 }, { "epoch": 4.377133105802048, "grad_norm": 0.3030137285901555, "learning_rate": 1.8612771954660825e-06, "loss": 0.1535, "step": 2565 }, { "epoch": 4.378839590443686, "grad_norm": 0.31679080381563246, "learning_rate": 1.851252505762995e-06, "loss": 0.1608, "step": 2566 }, { "epoch": 4.3805460750853245, "grad_norm": 0.3178725577997585, "learning_rate": 1.841253574887527e-06, "loss": 0.2116, "step": 2567 }, { "epoch": 4.382252559726963, "grad_norm": 0.3120407911523594, "learning_rate": 1.831280417031327e-06, "loss": 0.1739, "step": 2568 }, { "epoch": 4.3839590443686, "grad_norm": 0.3156116274518043, "learning_rate": 1.8213330463494738e-06, "loss": 0.134, "step": 2569 }, { "epoch": 4.385665529010239, "grad_norm": 0.3129024986980865, "learning_rate": 1.8114114769604363e-06, "loss": 0.1667, "step": 2570 }, { "epoch": 4.387372013651877, "grad_norm": 0.3251618265488506, "learning_rate": 1.8015157229460656e-06, "loss": 0.1775, "step": 2571 }, { "epoch": 4.389078498293515, "grad_norm": 0.30972584219226884, "learning_rate": 1.7916457983515822e-06, "loss": 0.1906, "step": 2572 }, { "epoch": 4.390784982935154, "grad_norm": 0.3167115292804992, "learning_rate": 1.7818017171855318e-06, "loss": 0.1734, "step": 2573 }, { "epoch": 4.392491467576792, "grad_norm": 0.3096780871319159, "learning_rate": 1.771983493419791e-06, "loss": 0.1772, "step": 2574 }, { "epoch": 4.3941979522184305, "grad_norm": 0.29489330937597885, "learning_rate": 1.7621911409895332e-06, "loss": 0.1867, "step": 2575 }, { "epoch": 4.395904436860068, "grad_norm": 0.3133041098716859, "learning_rate": 1.7524246737932072e-06, "loss": 0.1606, "step": 2576 }, { "epoch": 4.397610921501706, "grad_norm": 0.3117573213645619, "learning_rate": 1.7426841056925315e-06, "loss": 0.1984, "step": 2577 }, { "epoch": 4.399317406143345, "grad_norm": 0.29972628372325183, "learning_rate": 1.732969450512456e-06, "loss": 0.2101, "step": 2578 }, { "epoch": 4.401023890784983, "grad_norm": 0.3092356765885778, "learning_rate": 1.7232807220411629e-06, "loss": 0.1716, "step": 2579 }, { "epoch": 4.402730375426621, "grad_norm": 0.3062746650892749, "learning_rate": 1.713617934030023e-06, "loss": 0.14, "step": 2580 }, { "epoch": 4.40443686006826, "grad_norm": 0.2734822135973346, "learning_rate": 1.7039811001936056e-06, "loss": 0.2018, "step": 2581 }, { "epoch": 4.406143344709897, "grad_norm": 0.3173603903458222, "learning_rate": 1.694370234209628e-06, "loss": 0.1619, "step": 2582 }, { "epoch": 4.407849829351536, "grad_norm": 0.320973709722284, "learning_rate": 1.6847853497189538e-06, "loss": 0.2777, "step": 2583 }, { "epoch": 4.409556313993174, "grad_norm": 0.2884858345193101, "learning_rate": 1.675226460325583e-06, "loss": 0.1649, "step": 2584 }, { "epoch": 4.411262798634812, "grad_norm": 0.3112597061860702, "learning_rate": 1.6656935795965989e-06, "loss": 0.1983, "step": 2585 }, { "epoch": 4.412969283276451, "grad_norm": 0.3059564652787005, "learning_rate": 1.6561867210621918e-06, "loss": 0.1928, "step": 2586 }, { "epoch": 4.414675767918089, "grad_norm": 0.2982886465732699, "learning_rate": 1.6467058982156015e-06, "loss": 0.1839, "step": 2587 }, { "epoch": 4.4163822525597265, "grad_norm": 0.3149448537193366, "learning_rate": 1.6372511245131285e-06, "loss": 0.1489, "step": 2588 }, { "epoch": 4.418088737201365, "grad_norm": 0.31512048989873015, "learning_rate": 1.6278224133740917e-06, "loss": 0.193, "step": 2589 }, { "epoch": 4.419795221843003, "grad_norm": 0.3286822254843685, "learning_rate": 1.6184197781808197e-06, "loss": 0.1561, "step": 2590 }, { "epoch": 4.421501706484642, "grad_norm": 0.35473759580754277, "learning_rate": 1.6090432322786375e-06, "loss": 0.1887, "step": 2591 }, { "epoch": 4.42320819112628, "grad_norm": 0.3042147572286529, "learning_rate": 1.5996927889758307e-06, "loss": 0.1905, "step": 2592 }, { "epoch": 4.424914675767918, "grad_norm": 0.2966243777562204, "learning_rate": 1.5903684615436542e-06, "loss": 0.191, "step": 2593 }, { "epoch": 4.426621160409557, "grad_norm": 0.31376177368311275, "learning_rate": 1.5810702632162755e-06, "loss": 0.1643, "step": 2594 }, { "epoch": 4.428327645051194, "grad_norm": 0.31289484267866075, "learning_rate": 1.571798207190789e-06, "loss": 0.1781, "step": 2595 }, { "epoch": 4.4300341296928325, "grad_norm": 0.29694775918388083, "learning_rate": 1.5625523066271852e-06, "loss": 0.1659, "step": 2596 }, { "epoch": 4.431740614334471, "grad_norm": 0.29147605821295364, "learning_rate": 1.553332574648323e-06, "loss": 0.1593, "step": 2597 }, { "epoch": 4.433447098976109, "grad_norm": 0.3057342409234324, "learning_rate": 1.5441390243399345e-06, "loss": 0.1639, "step": 2598 }, { "epoch": 4.435153583617748, "grad_norm": 0.28642803934421673, "learning_rate": 1.5349716687505733e-06, "loss": 0.1626, "step": 2599 }, { "epoch": 4.436860068259386, "grad_norm": 0.285548146724235, "learning_rate": 1.5258305208916314e-06, "loss": 0.1606, "step": 2600 }, { "epoch": 4.438566552901024, "grad_norm": 0.32479682544844474, "learning_rate": 1.5167155937372947e-06, "loss": 0.1888, "step": 2601 }, { "epoch": 4.440273037542662, "grad_norm": 0.3378938837647975, "learning_rate": 1.5076269002245304e-06, "loss": 0.1761, "step": 2602 }, { "epoch": 4.4419795221843, "grad_norm": 0.29406065899669787, "learning_rate": 1.4985644532530819e-06, "loss": 0.2741, "step": 2603 }, { "epoch": 4.4436860068259385, "grad_norm": 0.291056615154014, "learning_rate": 1.4895282656854293e-06, "loss": 0.1709, "step": 2604 }, { "epoch": 4.445392491467577, "grad_norm": 0.3077417839945242, "learning_rate": 1.4805183503467979e-06, "loss": 0.201, "step": 2605 }, { "epoch": 4.447098976109215, "grad_norm": 0.31781018398972843, "learning_rate": 1.4715347200251052e-06, "loss": 0.1696, "step": 2606 }, { "epoch": 4.448805460750854, "grad_norm": 0.3163121595251067, "learning_rate": 1.462577387470978e-06, "loss": 0.1956, "step": 2607 }, { "epoch": 4.450511945392491, "grad_norm": 0.29257827516208407, "learning_rate": 1.4536463653977028e-06, "loss": 0.1598, "step": 2608 }, { "epoch": 4.452218430034129, "grad_norm": 0.31633914007976754, "learning_rate": 1.4447416664812374e-06, "loss": 0.1907, "step": 2609 }, { "epoch": 4.453924914675768, "grad_norm": 0.33350557899838706, "learning_rate": 1.4358633033601788e-06, "loss": 0.1561, "step": 2610 }, { "epoch": 4.455631399317406, "grad_norm": 0.3167449232019445, "learning_rate": 1.427011288635729e-06, "loss": 0.1559, "step": 2611 }, { "epoch": 4.4573378839590445, "grad_norm": 0.30530600953842574, "learning_rate": 1.418185634871716e-06, "loss": 0.1641, "step": 2612 }, { "epoch": 4.459044368600683, "grad_norm": 0.3131984591858436, "learning_rate": 1.4093863545945263e-06, "loss": 0.177, "step": 2613 }, { "epoch": 4.460750853242321, "grad_norm": 0.3380908183071507, "learning_rate": 1.4006134602931408e-06, "loss": 0.1825, "step": 2614 }, { "epoch": 4.462457337883959, "grad_norm": 0.28330310082518084, "learning_rate": 1.3918669644190708e-06, "loss": 0.1774, "step": 2615 }, { "epoch": 4.464163822525597, "grad_norm": 0.31826886606580584, "learning_rate": 1.3831468793863701e-06, "loss": 0.1836, "step": 2616 }, { "epoch": 4.465870307167235, "grad_norm": 0.28804015352123924, "learning_rate": 1.3744532175716098e-06, "loss": 0.241, "step": 2617 }, { "epoch": 4.467576791808874, "grad_norm": 0.3430602007229569, "learning_rate": 1.3657859913138437e-06, "loss": 0.2021, "step": 2618 }, { "epoch": 4.469283276450512, "grad_norm": 0.30126030775145457, "learning_rate": 1.3571452129146234e-06, "loss": 0.1724, "step": 2619 }, { "epoch": 4.4709897610921505, "grad_norm": 0.31609525596538085, "learning_rate": 1.348530894637945e-06, "loss": 0.1823, "step": 2620 }, { "epoch": 4.472696245733788, "grad_norm": 0.2911441143738094, "learning_rate": 1.3399430487102638e-06, "loss": 0.1721, "step": 2621 }, { "epoch": 4.474402730375426, "grad_norm": 0.29289498103999745, "learning_rate": 1.33138168732045e-06, "loss": 0.2007, "step": 2622 }, { "epoch": 4.476109215017065, "grad_norm": 0.3081640723691369, "learning_rate": 1.3228468226197944e-06, "loss": 0.1615, "step": 2623 }, { "epoch": 4.477815699658703, "grad_norm": 0.2785466270617825, "learning_rate": 1.3143384667219783e-06, "loss": 0.1776, "step": 2624 }, { "epoch": 4.479522184300341, "grad_norm": 0.2884258060547518, "learning_rate": 1.3058566317030551e-06, "loss": 0.1906, "step": 2625 }, { "epoch": 4.48122866894198, "grad_norm": 0.31444204672002174, "learning_rate": 1.2974013296014376e-06, "loss": 0.1669, "step": 2626 }, { "epoch": 4.482935153583618, "grad_norm": 0.3195633943847116, "learning_rate": 1.288972572417877e-06, "loss": 0.2045, "step": 2627 }, { "epoch": 4.484641638225256, "grad_norm": 0.31559512088158087, "learning_rate": 1.2805703721154594e-06, "loss": 0.1702, "step": 2628 }, { "epoch": 4.486348122866894, "grad_norm": 0.3166673710140947, "learning_rate": 1.2721947406195657e-06, "loss": 0.1788, "step": 2629 }, { "epoch": 4.488054607508532, "grad_norm": 0.3060831513316056, "learning_rate": 1.2638456898178752e-06, "loss": 0.1873, "step": 2630 }, { "epoch": 4.489761092150171, "grad_norm": 0.31154060582160964, "learning_rate": 1.2555232315603449e-06, "loss": 0.2374, "step": 2631 }, { "epoch": 4.491467576791809, "grad_norm": 0.3076884170379762, "learning_rate": 1.247227377659168e-06, "loss": 0.1839, "step": 2632 }, { "epoch": 4.493174061433447, "grad_norm": 0.2985982338062527, "learning_rate": 1.238958139888804e-06, "loss": 0.175, "step": 2633 }, { "epoch": 4.494880546075085, "grad_norm": 0.31920058660397355, "learning_rate": 1.2307155299859153e-06, "loss": 0.2152, "step": 2634 }, { "epoch": 4.496587030716723, "grad_norm": 0.34402587537787804, "learning_rate": 1.222499559649386e-06, "loss": 0.1808, "step": 2635 }, { "epoch": 4.498293515358362, "grad_norm": 0.317583860848277, "learning_rate": 1.2143102405402751e-06, "loss": 0.2015, "step": 2636 }, { "epoch": 4.5, "grad_norm": 0.3282545929788059, "learning_rate": 1.2061475842818337e-06, "loss": 0.1948, "step": 2637 }, { "epoch": 4.501706484641638, "grad_norm": 0.2838840587396505, "learning_rate": 1.1980116024594524e-06, "loss": 0.1838, "step": 2638 }, { "epoch": 4.503412969283277, "grad_norm": 0.2987084175711107, "learning_rate": 1.1899023066206671e-06, "loss": 0.1957, "step": 2639 }, { "epoch": 4.505119453924914, "grad_norm": 0.3013170240642986, "learning_rate": 1.1818197082751493e-06, "loss": 0.1787, "step": 2640 }, { "epoch": 4.506825938566553, "grad_norm": 0.31558048418216567, "learning_rate": 1.1737638188946577e-06, "loss": 0.1733, "step": 2641 }, { "epoch": 4.508532423208191, "grad_norm": 0.2906170661542559, "learning_rate": 1.165734649913064e-06, "loss": 0.1824, "step": 2642 }, { "epoch": 4.510238907849829, "grad_norm": 0.30098580797968205, "learning_rate": 1.157732212726299e-06, "loss": 0.1826, "step": 2643 }, { "epoch": 4.511945392491468, "grad_norm": 0.31994314706625765, "learning_rate": 1.1497565186923575e-06, "loss": 0.2408, "step": 2644 }, { "epoch": 4.513651877133106, "grad_norm": 0.3106424840527709, "learning_rate": 1.1418075791312843e-06, "loss": 0.1755, "step": 2645 }, { "epoch": 4.515358361774744, "grad_norm": 0.3027234786599201, "learning_rate": 1.133885405325139e-06, "loss": 0.2042, "step": 2646 }, { "epoch": 4.517064846416382, "grad_norm": 0.2923515699191684, "learning_rate": 1.1259900085180054e-06, "loss": 0.1585, "step": 2647 }, { "epoch": 4.51877133105802, "grad_norm": 0.2817260535186955, "learning_rate": 1.1181213999159458e-06, "loss": 0.1941, "step": 2648 }, { "epoch": 4.520477815699659, "grad_norm": 0.31735840851041724, "learning_rate": 1.1102795906870223e-06, "loss": 0.1669, "step": 2649 }, { "epoch": 4.522184300341297, "grad_norm": 0.3055142775670233, "learning_rate": 1.1024645919612386e-06, "loss": 0.1757, "step": 2650 }, { "epoch": 4.523890784982935, "grad_norm": 0.3314596012225001, "learning_rate": 1.09467641483056e-06, "loss": 0.149, "step": 2651 }, { "epoch": 4.525597269624574, "grad_norm": 0.2962199127617063, "learning_rate": 1.0869150703488818e-06, "loss": 0.1703, "step": 2652 }, { "epoch": 4.527303754266212, "grad_norm": 0.3144511395605482, "learning_rate": 1.079180569532008e-06, "loss": 0.2131, "step": 2653 }, { "epoch": 4.5290102389078495, "grad_norm": 0.3059647228514497, "learning_rate": 1.0714729233576526e-06, "loss": 0.1623, "step": 2654 }, { "epoch": 4.530716723549488, "grad_norm": 0.3198382002725939, "learning_rate": 1.0637921427654052e-06, "loss": 0.1648, "step": 2655 }, { "epoch": 4.532423208191126, "grad_norm": 0.313476882585119, "learning_rate": 1.0561382386567342e-06, "loss": 0.1728, "step": 2656 }, { "epoch": 4.534129692832765, "grad_norm": 0.32461608946860504, "learning_rate": 1.0485112218949544e-06, "loss": 0.1991, "step": 2657 }, { "epoch": 4.535836177474403, "grad_norm": 0.3067653776366247, "learning_rate": 1.0409111033052154e-06, "loss": 0.1965, "step": 2658 }, { "epoch": 4.537542662116041, "grad_norm": 0.3210134260484315, "learning_rate": 1.0333378936745064e-06, "loss": 0.1726, "step": 2659 }, { "epoch": 4.53924914675768, "grad_norm": 0.2937600173082321, "learning_rate": 1.0257916037516025e-06, "loss": 0.1459, "step": 2660 }, { "epoch": 4.540955631399317, "grad_norm": 0.28590331785936346, "learning_rate": 1.0182722442470894e-06, "loss": 0.1525, "step": 2661 }, { "epoch": 4.5426621160409555, "grad_norm": 0.2849589712836118, "learning_rate": 1.0107798258333213e-06, "loss": 0.1642, "step": 2662 }, { "epoch": 4.544368600682594, "grad_norm": 0.2850599390185159, "learning_rate": 1.0033143591444116e-06, "loss": 0.1999, "step": 2663 }, { "epoch": 4.546075085324232, "grad_norm": 0.3246837318250444, "learning_rate": 9.958758547762292e-07, "loss": 0.1728, "step": 2664 }, { "epoch": 4.547781569965871, "grad_norm": 0.31034182132595006, "learning_rate": 9.884643232863666e-07, "loss": 0.1603, "step": 2665 }, { "epoch": 4.549488054607508, "grad_norm": 0.2699307113298506, "learning_rate": 9.810797751941448e-07, "loss": 0.173, "step": 2666 }, { "epoch": 4.551194539249146, "grad_norm": 0.3164873422498817, "learning_rate": 9.737222209805686e-07, "loss": 0.1479, "step": 2667 }, { "epoch": 4.552901023890785, "grad_norm": 0.3101121665742394, "learning_rate": 9.663916710883493e-07, "loss": 0.1798, "step": 2668 }, { "epoch": 4.554607508532423, "grad_norm": 0.31326479172343186, "learning_rate": 9.590881359218595e-07, "loss": 0.1951, "step": 2669 }, { "epoch": 4.5563139931740615, "grad_norm": 0.29438538514221124, "learning_rate": 9.518116258471254e-07, "loss": 0.1652, "step": 2670 }, { "epoch": 4.5580204778157, "grad_norm": 0.2991309107110425, "learning_rate": 9.445621511918324e-07, "loss": 0.1897, "step": 2671 }, { "epoch": 4.559726962457338, "grad_norm": 0.2955512942850018, "learning_rate": 9.373397222452741e-07, "loss": 0.1792, "step": 2672 }, { "epoch": 4.561433447098976, "grad_norm": 0.31376758353537465, "learning_rate": 9.301443492583751e-07, "loss": 0.1648, "step": 2673 }, { "epoch": 4.563139931740614, "grad_norm": 0.3204276037498276, "learning_rate": 9.229760424436462e-07, "loss": 0.1761, "step": 2674 }, { "epoch": 4.564846416382252, "grad_norm": 0.3012058029999409, "learning_rate": 9.158348119751892e-07, "loss": 0.1626, "step": 2675 }, { "epoch": 4.566552901023891, "grad_norm": 0.287669382311409, "learning_rate": 9.087206679886762e-07, "loss": 0.1712, "step": 2676 }, { "epoch": 4.568259385665529, "grad_norm": 0.2784332470218125, "learning_rate": 9.016336205813303e-07, "loss": 0.1856, "step": 2677 }, { "epoch": 4.5699658703071675, "grad_norm": 0.3157487210825833, "learning_rate": 8.945736798119253e-07, "loss": 0.1801, "step": 2678 }, { "epoch": 4.571672354948806, "grad_norm": 0.33192547669048056, "learning_rate": 8.875408557007459e-07, "loss": 0.1917, "step": 2679 }, { "epoch": 4.573378839590443, "grad_norm": 0.29465625247378635, "learning_rate": 8.805351582296118e-07, "loss": 0.181, "step": 2680 }, { "epoch": 4.575085324232082, "grad_norm": 0.2975756901413667, "learning_rate": 8.735565973418181e-07, "loss": 0.1888, "step": 2681 }, { "epoch": 4.57679180887372, "grad_norm": 0.30854480493941716, "learning_rate": 8.666051829421596e-07, "loss": 0.159, "step": 2682 }, { "epoch": 4.578498293515358, "grad_norm": 0.3145222152129316, "learning_rate": 8.596809248968996e-07, "loss": 0.169, "step": 2683 }, { "epoch": 4.580204778156997, "grad_norm": 0.3063202595906734, "learning_rate": 8.527838330337524e-07, "loss": 0.1644, "step": 2684 }, { "epoch": 4.581911262798635, "grad_norm": 0.31189663379964455, "learning_rate": 8.459139171418851e-07, "loss": 0.1621, "step": 2685 }, { "epoch": 4.5836177474402735, "grad_norm": 0.2904213106630353, "learning_rate": 8.390711869718782e-07, "loss": 0.1588, "step": 2686 }, { "epoch": 4.585324232081911, "grad_norm": 0.3179560834185949, "learning_rate": 8.322556522357427e-07, "loss": 0.1677, "step": 2687 }, { "epoch": 4.587030716723549, "grad_norm": 0.31293041696083573, "learning_rate": 8.254673226068788e-07, "loss": 0.1641, "step": 2688 }, { "epoch": 4.588737201365188, "grad_norm": 0.33577700473897765, "learning_rate": 8.18706207720077e-07, "loss": 0.1561, "step": 2689 }, { "epoch": 4.590443686006826, "grad_norm": 0.3157403650864591, "learning_rate": 8.119723171715122e-07, "loss": 0.1641, "step": 2690 }, { "epoch": 4.592150170648464, "grad_norm": 0.2862777973290719, "learning_rate": 8.052656605187015e-07, "loss": 0.1494, "step": 2691 }, { "epoch": 4.593856655290102, "grad_norm": 0.30784437239644985, "learning_rate": 7.985862472805217e-07, "loss": 0.1907, "step": 2692 }, { "epoch": 4.59556313993174, "grad_norm": 0.28642816392311105, "learning_rate": 7.919340869371783e-07, "loss": 0.1514, "step": 2693 }, { "epoch": 4.597269624573379, "grad_norm": 0.29941313106236994, "learning_rate": 7.853091889301944e-07, "loss": 0.168, "step": 2694 }, { "epoch": 4.598976109215017, "grad_norm": 0.2916049076030227, "learning_rate": 7.78711562662402e-07, "loss": 0.1596, "step": 2695 }, { "epoch": 4.600682593856655, "grad_norm": 0.2919761829480753, "learning_rate": 7.721412174979214e-07, "loss": 0.1494, "step": 2696 }, { "epoch": 4.602389078498294, "grad_norm": 0.2860719533668131, "learning_rate": 7.655981627621645e-07, "loss": 0.1636, "step": 2697 }, { "epoch": 4.604095563139932, "grad_norm": 0.3015545961815166, "learning_rate": 7.590824077417913e-07, "loss": 0.2241, "step": 2698 }, { "epoch": 4.6058020477815695, "grad_norm": 0.2890766624921712, "learning_rate": 7.525939616847333e-07, "loss": 0.136, "step": 2699 }, { "epoch": 4.607508532423208, "grad_norm": 0.3042706431257455, "learning_rate": 7.461328338001417e-07, "loss": 0.146, "step": 2700 }, { "epoch": 4.609215017064846, "grad_norm": 0.2947006269527326, "learning_rate": 7.396990332584164e-07, "loss": 0.183, "step": 2701 }, { "epoch": 4.610921501706485, "grad_norm": 0.31901339926876754, "learning_rate": 7.33292569191153e-07, "loss": 0.1826, "step": 2702 }, { "epoch": 4.612627986348123, "grad_norm": 0.31810941062398185, "learning_rate": 7.269134506911579e-07, "loss": 0.1568, "step": 2703 }, { "epoch": 4.614334470989761, "grad_norm": 0.30449542970454335, "learning_rate": 7.205616868124288e-07, "loss": 0.1724, "step": 2704 }, { "epoch": 4.6160409556314, "grad_norm": 0.2955978987021187, "learning_rate": 7.142372865701253e-07, "loss": 0.1761, "step": 2705 }, { "epoch": 4.617747440273037, "grad_norm": 0.291363412093148, "learning_rate": 7.079402589405804e-07, "loss": 0.1659, "step": 2706 }, { "epoch": 4.6194539249146755, "grad_norm": 0.28902122490739873, "learning_rate": 7.016706128612694e-07, "loss": 0.2135, "step": 2707 }, { "epoch": 4.621160409556314, "grad_norm": 0.2915347981940905, "learning_rate": 6.954283572308118e-07, "loss": 0.1962, "step": 2708 }, { "epoch": 4.622866894197952, "grad_norm": 0.32533380821156654, "learning_rate": 6.892135009089451e-07, "loss": 0.1733, "step": 2709 }, { "epoch": 4.624573378839591, "grad_norm": 0.3271404140899551, "learning_rate": 6.830260527165222e-07, "loss": 0.196, "step": 2710 }, { "epoch": 4.626279863481229, "grad_norm": 0.31781236648785877, "learning_rate": 6.768660214355005e-07, "loss": 0.1554, "step": 2711 }, { "epoch": 4.627986348122867, "grad_norm": 0.3217830186925165, "learning_rate": 6.707334158089063e-07, "loss": 0.159, "step": 2712 }, { "epoch": 4.629692832764505, "grad_norm": 0.31911002436978775, "learning_rate": 6.646282445408591e-07, "loss": 0.2502, "step": 2713 }, { "epoch": 4.631399317406143, "grad_norm": 0.3143470280291693, "learning_rate": 6.5855051629653e-07, "loss": 0.1961, "step": 2714 }, { "epoch": 4.6331058020477816, "grad_norm": 0.30478601257389953, "learning_rate": 6.525002397021451e-07, "loss": 0.1847, "step": 2715 }, { "epoch": 4.63481228668942, "grad_norm": 0.27885189050541664, "learning_rate": 6.464774233449622e-07, "loss": 0.1727, "step": 2716 }, { "epoch": 4.636518771331058, "grad_norm": 0.2767544904678458, "learning_rate": 6.4048207577327e-07, "loss": 0.1964, "step": 2717 }, { "epoch": 4.638225255972696, "grad_norm": 0.3440955133555953, "learning_rate": 6.345142054963682e-07, "loss": 0.1421, "step": 2718 }, { "epoch": 4.639931740614334, "grad_norm": 0.3081120729111336, "learning_rate": 6.285738209845527e-07, "loss": 0.2058, "step": 2719 }, { "epoch": 4.6416382252559725, "grad_norm": 0.29037800407941694, "learning_rate": 6.226609306691189e-07, "loss": 0.1725, "step": 2720 }, { "epoch": 4.643344709897611, "grad_norm": 0.2972819480569761, "learning_rate": 6.167755429423272e-07, "loss": 0.174, "step": 2721 }, { "epoch": 4.645051194539249, "grad_norm": 0.31806866417226826, "learning_rate": 6.109176661574134e-07, "loss": 0.173, "step": 2722 }, { "epoch": 4.646757679180888, "grad_norm": 0.29740846813435423, "learning_rate": 6.050873086285602e-07, "loss": 0.214, "step": 2723 }, { "epoch": 4.648464163822526, "grad_norm": 0.3242311357095905, "learning_rate": 5.992844786308971e-07, "loss": 0.1989, "step": 2724 }, { "epoch": 4.650170648464163, "grad_norm": 0.31894716450438126, "learning_rate": 5.935091844004759e-07, "loss": 0.1722, "step": 2725 }, { "epoch": 4.651877133105802, "grad_norm": 0.28662564392079753, "learning_rate": 5.877614341342708e-07, "loss": 0.1811, "step": 2726 }, { "epoch": 4.65358361774744, "grad_norm": 0.304397679136295, "learning_rate": 5.820412359901629e-07, "loss": 0.1791, "step": 2727 }, { "epoch": 4.6552901023890785, "grad_norm": 0.29630041278839964, "learning_rate": 5.763485980869265e-07, "loss": 0.1843, "step": 2728 }, { "epoch": 4.656996587030717, "grad_norm": 0.3165607930599475, "learning_rate": 5.706835285042233e-07, "loss": 0.1731, "step": 2729 }, { "epoch": 4.658703071672355, "grad_norm": 0.29492874795878143, "learning_rate": 5.650460352825793e-07, "loss": 0.1825, "step": 2730 }, { "epoch": 4.660409556313994, "grad_norm": 0.3243112959246551, "learning_rate": 5.594361264233849e-07, "loss": 0.1783, "step": 2731 }, { "epoch": 4.662116040955631, "grad_norm": 0.3032385848030944, "learning_rate": 5.538538098888846e-07, "loss": 0.1832, "step": 2732 }, { "epoch": 4.663822525597269, "grad_norm": 0.31122906686810375, "learning_rate": 5.482990936021493e-07, "loss": 0.1775, "step": 2733 }, { "epoch": 4.665529010238908, "grad_norm": 0.3128666110482303, "learning_rate": 5.427719854470881e-07, "loss": 0.1573, "step": 2734 }, { "epoch": 4.667235494880546, "grad_norm": 0.3072523033104867, "learning_rate": 5.37272493268417e-07, "loss": 0.1822, "step": 2735 }, { "epoch": 4.6689419795221845, "grad_norm": 0.31487425009550746, "learning_rate": 5.318006248716589e-07, "loss": 0.1713, "step": 2736 }, { "epoch": 4.670648464163823, "grad_norm": 0.31277740188190023, "learning_rate": 5.263563880231348e-07, "loss": 0.16, "step": 2737 }, { "epoch": 4.672354948805461, "grad_norm": 0.31728623002589723, "learning_rate": 5.209397904499369e-07, "loss": 0.1844, "step": 2738 }, { "epoch": 4.674061433447099, "grad_norm": 0.31231543604810585, "learning_rate": 5.155508398399378e-07, "loss": 0.1937, "step": 2739 }, { "epoch": 4.675767918088737, "grad_norm": 0.29585155268736635, "learning_rate": 5.101895438417659e-07, "loss": 0.176, "step": 2740 }, { "epoch": 4.677474402730375, "grad_norm": 0.30976890927663414, "learning_rate": 5.048559100648054e-07, "loss": 0.1624, "step": 2741 }, { "epoch": 4.679180887372014, "grad_norm": 0.33309511940289294, "learning_rate": 4.995499460791675e-07, "loss": 0.1892, "step": 2742 }, { "epoch": 4.680887372013652, "grad_norm": 0.31102619428772654, "learning_rate": 4.942716594156993e-07, "loss": 0.1675, "step": 2743 }, { "epoch": 4.6825938566552905, "grad_norm": 0.31624748102064804, "learning_rate": 4.89021057565966e-07, "loss": 0.1485, "step": 2744 }, { "epoch": 4.684300341296928, "grad_norm": 0.298870727149146, "learning_rate": 4.837981479822307e-07, "loss": 0.2, "step": 2745 }, { "epoch": 4.686006825938566, "grad_norm": 0.30261983612884613, "learning_rate": 4.78602938077466e-07, "loss": 0.1727, "step": 2746 }, { "epoch": 4.687713310580205, "grad_norm": 0.29766006341673873, "learning_rate": 4.7343543522531563e-07, "loss": 0.1545, "step": 2747 }, { "epoch": 4.689419795221843, "grad_norm": 0.30366717370402896, "learning_rate": 4.6829564676011076e-07, "loss": 0.1776, "step": 2748 }, { "epoch": 4.691126279863481, "grad_norm": 0.3122046083113992, "learning_rate": 4.6318357997683583e-07, "loss": 0.1541, "step": 2749 }, { "epoch": 4.69283276450512, "grad_norm": 0.295379614278616, "learning_rate": 4.580992421311359e-07, "loss": 0.1702, "step": 2750 }, { "epoch": 4.694539249146757, "grad_norm": 0.28431269348999033, "learning_rate": 4.530426404393007e-07, "loss": 0.1764, "step": 2751 }, { "epoch": 4.696245733788396, "grad_norm": 0.3081064890767569, "learning_rate": 4.480137820782493e-07, "loss": 0.1709, "step": 2752 }, { "epoch": 4.697952218430034, "grad_norm": 0.331527140545914, "learning_rate": 4.4301267418552786e-07, "loss": 0.1802, "step": 2753 }, { "epoch": 4.699658703071672, "grad_norm": 0.2918451725241063, "learning_rate": 4.380393238592917e-07, "loss": 0.1924, "step": 2754 }, { "epoch": 4.701365187713311, "grad_norm": 0.3260580720198137, "learning_rate": 4.3309373815830334e-07, "loss": 0.175, "step": 2755 }, { "epoch": 4.703071672354949, "grad_norm": 0.2960709556526829, "learning_rate": 4.281759241019212e-07, "loss": 0.1755, "step": 2756 }, { "epoch": 4.704778156996587, "grad_norm": 0.306517715233806, "learning_rate": 4.2328588867007526e-07, "loss": 0.1767, "step": 2757 }, { "epoch": 4.706484641638225, "grad_norm": 0.3270552723774275, "learning_rate": 4.184236388032825e-07, "loss": 0.1829, "step": 2758 }, { "epoch": 4.708191126279863, "grad_norm": 0.3213406072411129, "learning_rate": 4.1358918140261385e-07, "loss": 0.1905, "step": 2759 }, { "epoch": 4.709897610921502, "grad_norm": 0.3154723719671582, "learning_rate": 4.0878252332970046e-07, "loss": 0.1803, "step": 2760 }, { "epoch": 4.71160409556314, "grad_norm": 0.30352072765361676, "learning_rate": 4.040036714067119e-07, "loss": 0.1895, "step": 2761 }, { "epoch": 4.713310580204778, "grad_norm": 0.2948796231530349, "learning_rate": 3.992526324163537e-07, "loss": 0.1802, "step": 2762 }, { "epoch": 4.715017064846417, "grad_norm": 0.31508476168355987, "learning_rate": 3.945294131018584e-07, "loss": 0.192, "step": 2763 }, { "epoch": 4.716723549488055, "grad_norm": 0.29702285597614453, "learning_rate": 3.898340201669726e-07, "loss": 0.1668, "step": 2764 }, { "epoch": 4.7184300341296925, "grad_norm": 0.31669356081958894, "learning_rate": 3.851664602759453e-07, "loss": 0.1841, "step": 2765 }, { "epoch": 4.720136518771331, "grad_norm": 0.3068568373032543, "learning_rate": 3.805267400535262e-07, "loss": 0.1633, "step": 2766 }, { "epoch": 4.721843003412969, "grad_norm": 0.2865610931038237, "learning_rate": 3.759148660849521e-07, "loss": 0.1592, "step": 2767 }, { "epoch": 4.723549488054608, "grad_norm": 0.31918610916972, "learning_rate": 3.71330844915927e-07, "loss": 0.1925, "step": 2768 }, { "epoch": 4.725255972696246, "grad_norm": 0.3350479762390741, "learning_rate": 3.667746830526331e-07, "loss": 0.1523, "step": 2769 }, { "epoch": 4.726962457337884, "grad_norm": 0.3294918911488467, "learning_rate": 3.622463869617154e-07, "loss": 0.1693, "step": 2770 }, { "epoch": 4.728668941979522, "grad_norm": 0.3373615262899913, "learning_rate": 3.577459630702551e-07, "loss": 0.1717, "step": 2771 }, { "epoch": 4.73037542662116, "grad_norm": 0.31649543808060154, "learning_rate": 3.5327341776578263e-07, "loss": 0.1744, "step": 2772 }, { "epoch": 4.7320819112627985, "grad_norm": 0.3015120607731921, "learning_rate": 3.488287573962601e-07, "loss": 0.1697, "step": 2773 }, { "epoch": 4.733788395904437, "grad_norm": 0.34245813217671023, "learning_rate": 3.444119882700658e-07, "loss": 0.1776, "step": 2774 }, { "epoch": 4.735494880546075, "grad_norm": 0.3141969796974187, "learning_rate": 3.400231166559986e-07, "loss": 0.1855, "step": 2775 }, { "epoch": 4.737201365187714, "grad_norm": 0.27139431295185523, "learning_rate": 3.3566214878325564e-07, "loss": 0.1515, "step": 2776 }, { "epoch": 4.738907849829351, "grad_norm": 0.2907069336102253, "learning_rate": 3.3132909084143906e-07, "loss": 0.1714, "step": 2777 }, { "epoch": 4.7406143344709895, "grad_norm": 0.2925091837910021, "learning_rate": 3.2702394898052936e-07, "loss": 0.1865, "step": 2778 }, { "epoch": 4.742320819112628, "grad_norm": 0.33924125096350244, "learning_rate": 3.2274672931088766e-07, "loss": 0.1804, "step": 2779 }, { "epoch": 4.744027303754266, "grad_norm": 0.3012524961675928, "learning_rate": 3.184974379032424e-07, "loss": 0.1508, "step": 2780 }, { "epoch": 4.7457337883959045, "grad_norm": 0.28754242174783323, "learning_rate": 3.1427608078869133e-07, "loss": 0.1791, "step": 2781 }, { "epoch": 4.747440273037543, "grad_norm": 0.30860274649893404, "learning_rate": 3.100826639586707e-07, "loss": 0.154, "step": 2782 }, { "epoch": 4.749146757679181, "grad_norm": 0.3113431887945359, "learning_rate": 3.059171933649752e-07, "loss": 0.1662, "step": 2783 }, { "epoch": 4.750853242320819, "grad_norm": 0.32115456934856906, "learning_rate": 3.0177967491972884e-07, "loss": 0.1577, "step": 2784 }, { "epoch": 4.752559726962457, "grad_norm": 0.3028379125307131, "learning_rate": 2.976701144953786e-07, "loss": 0.18, "step": 2785 }, { "epoch": 4.7542662116040955, "grad_norm": 0.28721084226302274, "learning_rate": 2.9358851792469665e-07, "loss": 0.2204, "step": 2786 }, { "epoch": 4.755972696245734, "grad_norm": 0.3092608536419546, "learning_rate": 2.8953489100076003e-07, "loss": 0.1989, "step": 2787 }, { "epoch": 4.757679180887372, "grad_norm": 0.3056259123470543, "learning_rate": 2.855092394769532e-07, "loss": 0.1785, "step": 2788 }, { "epoch": 4.7593856655290105, "grad_norm": 0.2815407935497288, "learning_rate": 2.815115690669501e-07, "loss": 0.1791, "step": 2789 }, { "epoch": 4.761092150170649, "grad_norm": 0.31139205545351045, "learning_rate": 2.7754188544471426e-07, "loss": 0.1717, "step": 2790 }, { "epoch": 4.762798634812286, "grad_norm": 0.30346067170537583, "learning_rate": 2.7360019424448545e-07, "loss": 0.1797, "step": 2791 }, { "epoch": 4.764505119453925, "grad_norm": 0.31944855771688185, "learning_rate": 2.6968650106077296e-07, "loss": 0.1546, "step": 2792 }, { "epoch": 4.766211604095563, "grad_norm": 0.320352778892074, "learning_rate": 2.6580081144834903e-07, "loss": 0.1647, "step": 2793 }, { "epoch": 4.7679180887372015, "grad_norm": 0.3221202846879678, "learning_rate": 2.6194313092223756e-07, "loss": 0.1541, "step": 2794 }, { "epoch": 4.76962457337884, "grad_norm": 0.31167416292528777, "learning_rate": 2.5811346495771436e-07, "loss": 0.194, "step": 2795 }, { "epoch": 4.771331058020478, "grad_norm": 0.2748895521787072, "learning_rate": 2.5431181899028267e-07, "loss": 0.2068, "step": 2796 }, { "epoch": 4.773037542662116, "grad_norm": 0.30999551577917733, "learning_rate": 2.5053819841569295e-07, "loss": 0.1688, "step": 2797 }, { "epoch": 4.774744027303754, "grad_norm": 0.2905927479723217, "learning_rate": 2.4679260858990306e-07, "loss": 0.1815, "step": 2798 }, { "epoch": 4.776450511945392, "grad_norm": 0.33157245543394975, "learning_rate": 2.4307505482909166e-07, "loss": 0.2107, "step": 2799 }, { "epoch": 4.778156996587031, "grad_norm": 0.2928265507497819, "learning_rate": 2.393855424096514e-07, "loss": 0.2077, "step": 2800 }, { "epoch": 4.779863481228669, "grad_norm": 0.32357915474682025, "learning_rate": 2.3572407656816676e-07, "loss": 0.1586, "step": 2801 }, { "epoch": 4.7815699658703075, "grad_norm": 0.32482969682952206, "learning_rate": 2.3209066250142077e-07, "loss": 0.1842, "step": 2802 }, { "epoch": 4.783276450511945, "grad_norm": 0.318429985615523, "learning_rate": 2.2848530536637713e-07, "loss": 0.157, "step": 2803 }, { "epoch": 4.784982935153583, "grad_norm": 0.3027980996848901, "learning_rate": 2.2490801028018704e-07, "loss": 0.1555, "step": 2804 }, { "epoch": 4.786689419795222, "grad_norm": 0.3243052592109917, "learning_rate": 2.2135878232016016e-07, "loss": 0.1785, "step": 2805 }, { "epoch": 4.78839590443686, "grad_norm": 0.28715838094365875, "learning_rate": 2.1783762652377806e-07, "loss": 0.2175, "step": 2806 }, { "epoch": 4.790102389078498, "grad_norm": 0.28236179247227394, "learning_rate": 2.1434454788867854e-07, "loss": 0.1893, "step": 2807 }, { "epoch": 4.791808873720137, "grad_norm": 0.30194532666071067, "learning_rate": 2.1087955137264694e-07, "loss": 0.1522, "step": 2808 }, { "epoch": 4.793515358361775, "grad_norm": 0.3300862918201864, "learning_rate": 2.0744264189361373e-07, "loss": 0.1923, "step": 2809 }, { "epoch": 4.795221843003413, "grad_norm": 0.28551516089001727, "learning_rate": 2.0403382432964358e-07, "loss": 0.1659, "step": 2810 }, { "epoch": 4.796928327645051, "grad_norm": 0.304688556538885, "learning_rate": 2.006531035189241e-07, "loss": 0.1434, "step": 2811 }, { "epoch": 4.798634812286689, "grad_norm": 0.2985728529439217, "learning_rate": 1.97300484259777e-07, "loss": 0.1734, "step": 2812 }, { "epoch": 4.800341296928328, "grad_norm": 0.28064308184830183, "learning_rate": 1.9397597131062929e-07, "loss": 0.1966, "step": 2813 }, { "epoch": 4.802047781569966, "grad_norm": 0.3211313780680713, "learning_rate": 1.9067956939001763e-07, "loss": 0.2012, "step": 2814 }, { "epoch": 4.803754266211604, "grad_norm": 0.29065414759287017, "learning_rate": 1.8741128317658176e-07, "loss": 0.1672, "step": 2815 }, { "epoch": 4.805460750853243, "grad_norm": 0.3228224201642899, "learning_rate": 1.841711173090599e-07, "loss": 0.1898, "step": 2816 }, { "epoch": 4.80716723549488, "grad_norm": 0.28289833310229995, "learning_rate": 1.809590763862712e-07, "loss": 0.1695, "step": 2817 }, { "epoch": 4.808873720136519, "grad_norm": 0.31084333570315364, "learning_rate": 1.777751649671222e-07, "loss": 0.208, "step": 2818 }, { "epoch": 4.810580204778157, "grad_norm": 0.3262453918891251, "learning_rate": 1.746193875705915e-07, "loss": 0.187, "step": 2819 }, { "epoch": 4.812286689419795, "grad_norm": 0.2989793802404444, "learning_rate": 1.7149174867572725e-07, "loss": 0.2378, "step": 2820 }, { "epoch": 4.813993174061434, "grad_norm": 0.3005972736647702, "learning_rate": 1.6839225272164306e-07, "loss": 0.182, "step": 2821 }, { "epoch": 4.815699658703072, "grad_norm": 0.28847734685992565, "learning_rate": 1.6532090410750656e-07, "loss": 0.1665, "step": 2822 }, { "epoch": 4.8174061433447095, "grad_norm": 0.27575510564221406, "learning_rate": 1.6227770719253299e-07, "loss": 0.1784, "step": 2823 }, { "epoch": 4.819112627986348, "grad_norm": 0.33633905780001094, "learning_rate": 1.5926266629598507e-07, "loss": 0.182, "step": 2824 }, { "epoch": 4.820819112627986, "grad_norm": 0.31140841186578966, "learning_rate": 1.5627578569715974e-07, "loss": 0.1858, "step": 2825 }, { "epoch": 4.822525597269625, "grad_norm": 0.3140449726136613, "learning_rate": 1.533170696353925e-07, "loss": 0.1453, "step": 2826 }, { "epoch": 4.824232081911263, "grad_norm": 0.3025865907438611, "learning_rate": 1.5038652231003759e-07, "loss": 0.2066, "step": 2827 }, { "epoch": 4.825938566552901, "grad_norm": 0.3117915855683816, "learning_rate": 1.4748414788046783e-07, "loss": 0.2041, "step": 2828 }, { "epoch": 4.827645051194539, "grad_norm": 0.3105049324517198, "learning_rate": 1.4460995046607694e-07, "loss": 0.167, "step": 2829 }, { "epoch": 4.829351535836177, "grad_norm": 0.30577258623016473, "learning_rate": 1.4176393414625956e-07, "loss": 0.2138, "step": 2830 }, { "epoch": 4.8310580204778155, "grad_norm": 0.2904177630274955, "learning_rate": 1.3894610296041776e-07, "loss": 0.1692, "step": 2831 }, { "epoch": 4.832764505119454, "grad_norm": 0.3126283626493536, "learning_rate": 1.3615646090794575e-07, "loss": 0.2796, "step": 2832 }, { "epoch": 4.834470989761092, "grad_norm": 0.3167202436833116, "learning_rate": 1.333950119482319e-07, "loss": 0.1588, "step": 2833 }, { "epoch": 4.836177474402731, "grad_norm": 0.30706499524181907, "learning_rate": 1.3066176000064545e-07, "loss": 0.1853, "step": 2834 }, { "epoch": 4.837883959044369, "grad_norm": 0.31719703891509415, "learning_rate": 1.279567089445388e-07, "loss": 0.1661, "step": 2835 }, { "epoch": 4.839590443686006, "grad_norm": 0.3105845229605263, "learning_rate": 1.2527986261923863e-07, "loss": 0.1982, "step": 2836 }, { "epoch": 4.841296928327645, "grad_norm": 0.29004040836338585, "learning_rate": 1.2263122482403688e-07, "loss": 0.175, "step": 2837 }, { "epoch": 4.843003412969283, "grad_norm": 0.31375723515635084, "learning_rate": 1.2001079931819093e-07, "loss": 0.2174, "step": 2838 }, { "epoch": 4.8447098976109215, "grad_norm": 0.2965738464238479, "learning_rate": 1.1741858982091459e-07, "loss": 0.1822, "step": 2839 }, { "epoch": 4.84641638225256, "grad_norm": 0.3146061658375514, "learning_rate": 1.1485460001137816e-07, "loss": 0.1661, "step": 2840 }, { "epoch": 4.848122866894198, "grad_norm": 0.3090491363527829, "learning_rate": 1.1231883352869288e-07, "loss": 0.1805, "step": 2841 }, { "epoch": 4.849829351535837, "grad_norm": 0.3111542712054954, "learning_rate": 1.0981129397191759e-07, "loss": 0.1617, "step": 2842 }, { "epoch": 4.851535836177474, "grad_norm": 0.3095343867608167, "learning_rate": 1.0733198490004537e-07, "loss": 0.1437, "step": 2843 }, { "epoch": 4.853242320819112, "grad_norm": 0.27464120927047125, "learning_rate": 1.0488090983199917e-07, "loss": 0.2008, "step": 2844 }, { "epoch": 4.854948805460751, "grad_norm": 0.2960470457039208, "learning_rate": 1.0245807224663839e-07, "loss": 0.1984, "step": 2845 }, { "epoch": 4.856655290102389, "grad_norm": 0.31142625347213476, "learning_rate": 1.0006347558273011e-07, "loss": 0.1721, "step": 2846 }, { "epoch": 4.8583617747440275, "grad_norm": 0.2941382931020921, "learning_rate": 9.76971232389734e-08, "loss": 0.1897, "step": 2847 }, { "epoch": 4.860068259385666, "grad_norm": 0.32342748889464856, "learning_rate": 9.535901857396612e-08, "loss": 0.1814, "step": 2848 }, { "epoch": 4.861774744027304, "grad_norm": 0.31555805635068296, "learning_rate": 9.304916490622484e-08, "loss": 0.1559, "step": 2849 }, { "epoch": 4.863481228668942, "grad_norm": 0.29670548942782626, "learning_rate": 9.076756551416266e-08, "loss": 0.1854, "step": 2850 }, { "epoch": 4.86518771331058, "grad_norm": 0.303231056138658, "learning_rate": 8.851422363609363e-08, "loss": 0.1793, "step": 2851 }, { "epoch": 4.8668941979522184, "grad_norm": 0.29573043410037203, "learning_rate": 8.628914247022168e-08, "loss": 0.1831, "step": 2852 }, { "epoch": 4.868600682593857, "grad_norm": 0.323917051878582, "learning_rate": 8.409232517464727e-08, "loss": 0.1927, "step": 2853 }, { "epoch": 4.870307167235495, "grad_norm": 0.3124109856277249, "learning_rate": 8.192377486734516e-08, "loss": 0.1396, "step": 2854 }, { "epoch": 4.872013651877133, "grad_norm": 0.2886688196130614, "learning_rate": 7.978349462617996e-08, "loss": 0.175, "step": 2855 }, { "epoch": 4.873720136518771, "grad_norm": 0.3114793111963149, "learning_rate": 7.76714874888862e-08, "loss": 0.2133, "step": 2856 }, { "epoch": 4.875426621160409, "grad_norm": 0.2938654172339554, "learning_rate": 7.55877564530727e-08, "loss": 0.1905, "step": 2857 }, { "epoch": 4.877133105802048, "grad_norm": 0.2840451874449184, "learning_rate": 7.353230447621373e-08, "loss": 0.1889, "step": 2858 }, { "epoch": 4.878839590443686, "grad_norm": 0.31742584514956634, "learning_rate": 7.1505134475649e-08, "loss": 0.1533, "step": 2859 }, { "epoch": 4.8805460750853245, "grad_norm": 0.3036746756978537, "learning_rate": 6.950624932857253e-08, "loss": 0.1991, "step": 2860 }, { "epoch": 4.882252559726963, "grad_norm": 0.4697302986930764, "learning_rate": 6.753565187203937e-08, "loss": 0.1602, "step": 2861 }, { "epoch": 4.8839590443686, "grad_norm": 0.2984718074892485, "learning_rate": 6.559334490294778e-08, "loss": 0.1782, "step": 2862 }, { "epoch": 4.885665529010239, "grad_norm": 0.2927837797491726, "learning_rate": 6.367933117805258e-08, "loss": 0.1844, "step": 2863 }, { "epoch": 4.887372013651877, "grad_norm": 0.32155411946981005, "learning_rate": 6.179361341394297e-08, "loss": 0.1636, "step": 2864 }, { "epoch": 4.889078498293515, "grad_norm": 0.3008982510208331, "learning_rate": 5.993619428705355e-08, "loss": 0.1883, "step": 2865 }, { "epoch": 4.890784982935154, "grad_norm": 0.3004924236775075, "learning_rate": 5.810707643364666e-08, "loss": 0.1823, "step": 2866 }, { "epoch": 4.892491467576792, "grad_norm": 0.28338462990858665, "learning_rate": 5.6306262449823403e-08, "loss": 0.1724, "step": 2867 }, { "epoch": 4.8941979522184305, "grad_norm": 0.30429095342862417, "learning_rate": 5.453375489150814e-08, "loss": 0.1987, "step": 2868 }, { "epoch": 4.895904436860068, "grad_norm": 0.3075393001944788, "learning_rate": 5.2789556274452925e-08, "loss": 0.1801, "step": 2869 }, { "epoch": 4.897610921501706, "grad_norm": 0.3030911501777876, "learning_rate": 5.1073669074228616e-08, "loss": 0.1859, "step": 2870 }, { "epoch": 4.899317406143345, "grad_norm": 0.283894859345853, "learning_rate": 4.938609572622044e-08, "loss": 0.1939, "step": 2871 }, { "epoch": 4.901023890784983, "grad_norm": 0.29741908715480087, "learning_rate": 4.772683862563465e-08, "loss": 0.1489, "step": 2872 }, { "epoch": 4.902730375426621, "grad_norm": 0.3081236236789463, "learning_rate": 4.609590012747856e-08, "loss": 0.1609, "step": 2873 }, { "epoch": 4.90443686006826, "grad_norm": 0.2912861896734738, "learning_rate": 4.4493282546573815e-08, "loss": 0.1732, "step": 2874 }, { "epoch": 4.906143344709898, "grad_norm": 0.29829247907729073, "learning_rate": 4.291898815754314e-08, "loss": 0.2248, "step": 2875 }, { "epoch": 4.907849829351536, "grad_norm": 0.32330446741770824, "learning_rate": 4.1373019194808074e-08, "loss": 0.1876, "step": 2876 }, { "epoch": 4.909556313993174, "grad_norm": 0.3082796970775283, "learning_rate": 3.985537785259119e-08, "loss": 0.1795, "step": 2877 }, { "epoch": 4.911262798634812, "grad_norm": 0.3089579069750301, "learning_rate": 3.83660662849028e-08, "loss": 0.1678, "step": 2878 }, { "epoch": 4.912969283276451, "grad_norm": 0.3021216791497954, "learning_rate": 3.690508660555203e-08, "loss": 0.1623, "step": 2879 }, { "epoch": 4.914675767918089, "grad_norm": 0.3155362336068173, "learning_rate": 3.547244088812907e-08, "loss": 0.1788, "step": 2880 }, { "epoch": 4.9163822525597265, "grad_norm": 0.31759061550143486, "learning_rate": 3.4068131166016264e-08, "loss": 0.149, "step": 2881 }, { "epoch": 4.918088737201365, "grad_norm": 0.30188078406571833, "learning_rate": 3.2692159432370364e-08, "loss": 0.1841, "step": 2882 }, { "epoch": 4.919795221843003, "grad_norm": 0.3105937487156836, "learning_rate": 3.134452764013363e-08, "loss": 0.1682, "step": 2883 }, { "epoch": 4.921501706484642, "grad_norm": 0.3280878154771452, "learning_rate": 3.002523770202492e-08, "loss": 0.1934, "step": 2884 }, { "epoch": 4.92320819112628, "grad_norm": 0.30362182221058903, "learning_rate": 2.8734291490530863e-08, "loss": 0.1708, "step": 2885 }, { "epoch": 4.924914675767918, "grad_norm": 0.31904967974094284, "learning_rate": 2.7471690837916897e-08, "loss": 0.1638, "step": 2886 }, { "epoch": 4.926621160409557, "grad_norm": 0.2850030617567953, "learning_rate": 2.6237437536211774e-08, "loss": 0.1527, "step": 2887 }, { "epoch": 4.928327645051194, "grad_norm": 0.30847599156420746, "learning_rate": 2.5031533337211978e-08, "loss": 0.1799, "step": 2888 }, { "epoch": 4.9300341296928325, "grad_norm": 0.3091155118863489, "learning_rate": 2.3853979952481733e-08, "loss": 0.1766, "step": 2889 }, { "epoch": 4.931740614334471, "grad_norm": 0.3172787205950717, "learning_rate": 2.2704779053337456e-08, "loss": 0.1852, "step": 2890 }, { "epoch": 4.933447098976109, "grad_norm": 0.29594053893996564, "learning_rate": 2.1583932270863307e-08, "loss": 0.159, "step": 2891 }, { "epoch": 4.935153583617748, "grad_norm": 0.3084174629664152, "learning_rate": 2.0491441195893412e-08, "loss": 0.2037, "step": 2892 }, { "epoch": 4.936860068259386, "grad_norm": 0.3192737298258064, "learning_rate": 1.9427307379020765e-08, "loss": 0.1704, "step": 2893 }, { "epoch": 4.938566552901024, "grad_norm": 0.3081067773374951, "learning_rate": 1.8391532330590544e-08, "loss": 0.2199, "step": 2894 }, { "epoch": 4.940273037542662, "grad_norm": 0.3143303371306659, "learning_rate": 1.7384117520691246e-08, "loss": 0.1593, "step": 2895 }, { "epoch": 4.9419795221843, "grad_norm": 0.2841289933841045, "learning_rate": 1.640506437917022e-08, "loss": 0.1685, "step": 2896 }, { "epoch": 4.9436860068259385, "grad_norm": 0.2947377064229445, "learning_rate": 1.545437429560703e-08, "loss": 0.1894, "step": 2897 }, { "epoch": 4.945392491467577, "grad_norm": 0.37895918120738786, "learning_rate": 1.453204861933788e-08, "loss": 0.2802, "step": 2898 }, { "epoch": 4.947098976109215, "grad_norm": 0.30593636302043853, "learning_rate": 1.363808865943339e-08, "loss": 0.1536, "step": 2899 }, { "epoch": 4.948805460750854, "grad_norm": 0.32220876524546754, "learning_rate": 1.277249568470751e-08, "loss": 0.1849, "step": 2900 }, { "epoch": 4.950511945392492, "grad_norm": 0.2937325209008791, "learning_rate": 1.1935270923708609e-08, "loss": 0.2105, "step": 2901 }, { "epoch": 4.952218430034129, "grad_norm": 0.31260924234270576, "learning_rate": 1.1126415564726157e-08, "loss": 0.1763, "step": 2902 }, { "epoch": 4.953924914675768, "grad_norm": 0.30275187594512826, "learning_rate": 1.034593075578183e-08, "loss": 0.1669, "step": 2903 }, { "epoch": 4.955631399317406, "grad_norm": 0.3368758053021886, "learning_rate": 9.59381760463174e-09, "loss": 0.1716, "step": 2904 }, { "epoch": 4.9573378839590445, "grad_norm": 0.29851385572075545, "learning_rate": 8.870077178761981e-09, "loss": 0.1782, "step": 2905 }, { "epoch": 4.959044368600683, "grad_norm": 0.2764806802342295, "learning_rate": 8.17471050538865e-09, "loss": 0.2035, "step": 2906 }, { "epoch": 4.96075085324232, "grad_norm": 0.31223657501904756, "learning_rate": 7.507718571460044e-09, "loss": 0.1478, "step": 2907 }, { "epoch": 4.962457337883959, "grad_norm": 0.32092822818481126, "learning_rate": 6.8691023236477914e-09, "loss": 0.18, "step": 2908 }, { "epoch": 4.964163822525597, "grad_norm": 0.3139808834377375, "learning_rate": 6.258862668351296e-09, "loss": 0.1992, "step": 2909 }, { "epoch": 4.965870307167235, "grad_norm": 0.36154629547941414, "learning_rate": 5.677000471693283e-09, "loss": 0.1739, "step": 2910 }, { "epoch": 4.967576791808874, "grad_norm": 0.29872209839584657, "learning_rate": 5.123516559522035e-09, "loss": 0.1777, "step": 2911 }, { "epoch": 4.969283276450512, "grad_norm": 0.2961551390547971, "learning_rate": 4.598411717404716e-09, "loss": 0.1676, "step": 2912 }, { "epoch": 4.9709897610921505, "grad_norm": 0.30619626425180235, "learning_rate": 4.1016866906340435e-09, "loss": 0.1828, "step": 2913 }, { "epoch": 4.972696245733788, "grad_norm": 0.3010614571479018, "learning_rate": 3.6333421842194015e-09, "loss": 0.1393, "step": 2914 }, { "epoch": 4.974402730375426, "grad_norm": 0.2809770847281617, "learning_rate": 3.193378862891283e-09, "loss": 0.1452, "step": 2915 }, { "epoch": 4.976109215017065, "grad_norm": 0.29638133049217613, "learning_rate": 2.7817973510946284e-09, "loss": 0.2233, "step": 2916 }, { "epoch": 4.977815699658703, "grad_norm": 0.30868966487380983, "learning_rate": 2.398598232995486e-09, "loss": 0.1718, "step": 2917 }, { "epoch": 4.979522184300341, "grad_norm": 0.2928606482705561, "learning_rate": 2.0437820524743524e-09, "loss": 0.1662, "step": 2918 }, { "epoch": 4.98122866894198, "grad_norm": 0.32612854136308245, "learning_rate": 1.7173493131283914e-09, "loss": 0.1693, "step": 2919 }, { "epoch": 4.982935153583618, "grad_norm": 0.28808533238959344, "learning_rate": 1.4193004782692144e-09, "loss": 0.1895, "step": 2920 }, { "epoch": 4.984641638225256, "grad_norm": 0.3318512216485675, "learning_rate": 1.1496359709228798e-09, "loss": 0.181, "step": 2921 }, { "epoch": 4.986348122866894, "grad_norm": 0.3745280144589005, "learning_rate": 9.083561738276736e-10, "loss": 0.2369, "step": 2922 }, { "epoch": 4.988054607508532, "grad_norm": 0.30803026547551043, "learning_rate": 6.954614294385486e-10, "loss": 0.1786, "step": 2923 }, { "epoch": 4.989761092150171, "grad_norm": 0.30026959439019923, "learning_rate": 5.109520399182443e-10, "loss": 0.1634, "step": 2924 }, { "epoch": 4.991467576791809, "grad_norm": 0.2966519292573069, "learning_rate": 3.5482826714394733e-10, "loss": 0.1842, "step": 2925 }, { "epoch": 4.993174061433447, "grad_norm": 0.29530756983955236, "learning_rate": 2.2709033270729154e-10, "loss": 0.1511, "step": 2926 }, { "epoch": 4.994880546075086, "grad_norm": 0.3529263677322087, "learning_rate": 1.2773841790769682e-10, "loss": 0.1369, "step": 2927 }, { "epoch": 4.996587030716723, "grad_norm": 0.3007431179010985, "learning_rate": 5.677266375458956e-11, "loss": 0.2011, "step": 2928 }, { "epoch": 4.998293515358362, "grad_norm": 0.32241652480938615, "learning_rate": 1.4193170974063918e-11, "loss": 0.1771, "step": 2929 }, { "epoch": 5.0, "grad_norm": 0.3247430104362262, "learning_rate": 0.0, "loss": 0.1839, "step": 2930 }, { "epoch": 5.0, "step": 2930, "total_flos": 2698707716407296.0, "train_loss": 0.34839511024463704, "train_runtime": 52686.9412, "train_samples_per_second": 7.118, "train_steps_per_second": 0.056 } ], "logging_steps": 1, "max_steps": 2930, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2698707716407296.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }