{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8212203334154554, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 21.56344104740925, "learning_rate": 1.0928961748633881e-07, "loss": 1.7357, "step": 1 }, { "epoch": 0.0, "grad_norm": 20.11687502096145, "learning_rate": 2.1857923497267762e-07, "loss": 1.7866, "step": 2 }, { "epoch": 0.0, "grad_norm": 18.30159876291587, "learning_rate": 3.278688524590164e-07, "loss": 1.6454, "step": 3 }, { "epoch": 0.0, "grad_norm": 16.396305199700034, "learning_rate": 4.3715846994535524e-07, "loss": 1.817, "step": 4 }, { "epoch": 0.0, "grad_norm": 25.631219006991078, "learning_rate": 5.46448087431694e-07, "loss": 1.7098, "step": 5 }, { "epoch": 0.0, "grad_norm": 21.690014370499565, "learning_rate": 6.557377049180328e-07, "loss": 1.7704, "step": 6 }, { "epoch": 0.0, "grad_norm": 16.582193649060613, "learning_rate": 7.650273224043716e-07, "loss": 1.7751, "step": 7 }, { "epoch": 0.0, "grad_norm": 22.99539975067837, "learning_rate": 8.743169398907105e-07, "loss": 1.6372, "step": 8 }, { "epoch": 0.0, "grad_norm": 15.637076316010969, "learning_rate": 9.836065573770493e-07, "loss": 1.6881, "step": 9 }, { "epoch": 0.0, "grad_norm": 18.16329007687177, "learning_rate": 1.092896174863388e-06, "loss": 1.6045, "step": 10 }, { "epoch": 0.0, "grad_norm": 25.25849457870173, "learning_rate": 1.2021857923497268e-06, "loss": 1.5906, "step": 11 }, { "epoch": 0.0, "grad_norm": 0.5387399324110798, "learning_rate": 1.3114754098360657e-06, "loss": 0.2826, "step": 12 }, { "epoch": 0.0, "grad_norm": 11.793190609244604, "learning_rate": 1.4207650273224043e-06, "loss": 1.4454, "step": 13 }, { "epoch": 0.0, "grad_norm": 0.5515674800008298, "learning_rate": 1.5300546448087432e-06, "loss": 0.27, "step": 14 }, { "epoch": 0.0, "grad_norm": 8.270044836161924, "learning_rate": 1.6393442622950819e-06, "loss": 1.3505, "step": 15 }, { "epoch": 0.0, "grad_norm": 8.84889443209007, "learning_rate": 1.748633879781421e-06, "loss": 1.5237, "step": 16 }, { "epoch": 0.0, "grad_norm": 9.903838625184653, "learning_rate": 1.8579234972677599e-06, "loss": 1.3763, "step": 17 }, { "epoch": 0.0, "grad_norm": 8.773525150036203, "learning_rate": 1.9672131147540985e-06, "loss": 1.3221, "step": 18 }, { "epoch": 0.0, "grad_norm": 13.778782940813231, "learning_rate": 2.0765027322404376e-06, "loss": 1.3933, "step": 19 }, { "epoch": 0.0, "grad_norm": 0.6118386137647269, "learning_rate": 2.185792349726776e-06, "loss": 0.2624, "step": 20 }, { "epoch": 0.0, "grad_norm": 7.2195401894441344, "learning_rate": 2.295081967213115e-06, "loss": 1.2896, "step": 21 }, { "epoch": 0.0, "grad_norm": 10.721019556959442, "learning_rate": 2.4043715846994536e-06, "loss": 1.4756, "step": 22 }, { "epoch": 0.0, "grad_norm": 0.6333878082669411, "learning_rate": 2.5136612021857927e-06, "loss": 0.2688, "step": 23 }, { "epoch": 0.0, "grad_norm": 5.3158931801248945, "learning_rate": 2.6229508196721314e-06, "loss": 1.4242, "step": 24 }, { "epoch": 0.0, "grad_norm": 0.6219536618228323, "learning_rate": 2.7322404371584705e-06, "loss": 0.2544, "step": 25 }, { "epoch": 0.0, "grad_norm": 6.52021585434475, "learning_rate": 2.8415300546448087e-06, "loss": 1.5057, "step": 26 }, { "epoch": 0.0, "grad_norm": 6.082574031056085, "learning_rate": 2.9508196721311478e-06, "loss": 1.3905, "step": 27 }, { "epoch": 0.0, "grad_norm": 4.037903693962823, "learning_rate": 3.0601092896174864e-06, "loss": 1.2729, "step": 28 }, { "epoch": 0.0, "grad_norm": 4.382135275594614, "learning_rate": 3.1693989071038255e-06, "loss": 1.271, "step": 29 }, { "epoch": 0.0, "grad_norm": 3.6937766044240212, "learning_rate": 3.2786885245901638e-06, "loss": 1.2002, "step": 30 }, { "epoch": 0.01, "grad_norm": 4.841626328542114, "learning_rate": 3.387978142076503e-06, "loss": 1.3136, "step": 31 }, { "epoch": 0.01, "grad_norm": 6.020789996047191, "learning_rate": 3.497267759562842e-06, "loss": 1.2522, "step": 32 }, { "epoch": 0.01, "grad_norm": 5.119108620203576, "learning_rate": 3.6065573770491806e-06, "loss": 1.1921, "step": 33 }, { "epoch": 0.01, "grad_norm": 3.5058084832559757, "learning_rate": 3.7158469945355197e-06, "loss": 1.2199, "step": 34 }, { "epoch": 0.01, "grad_norm": 3.1341213953306, "learning_rate": 3.825136612021858e-06, "loss": 1.1525, "step": 35 }, { "epoch": 0.01, "grad_norm": 6.432464037122764, "learning_rate": 3.934426229508197e-06, "loss": 1.2913, "step": 36 }, { "epoch": 0.01, "grad_norm": 6.309741731469468, "learning_rate": 4.043715846994536e-06, "loss": 1.2743, "step": 37 }, { "epoch": 0.01, "grad_norm": 4.172774753447052, "learning_rate": 4.153005464480875e-06, "loss": 1.2437, "step": 38 }, { "epoch": 0.01, "grad_norm": 4.101023489955477, "learning_rate": 4.2622950819672135e-06, "loss": 1.1682, "step": 39 }, { "epoch": 0.01, "grad_norm": 3.3672197680881983, "learning_rate": 4.371584699453552e-06, "loss": 1.1693, "step": 40 }, { "epoch": 0.01, "grad_norm": 3.170746254929017, "learning_rate": 4.480874316939891e-06, "loss": 1.1436, "step": 41 }, { "epoch": 0.01, "grad_norm": 2.8635357480322883, "learning_rate": 4.59016393442623e-06, "loss": 1.1501, "step": 42 }, { "epoch": 0.01, "grad_norm": 4.186359013695381, "learning_rate": 4.699453551912569e-06, "loss": 1.0662, "step": 43 }, { "epoch": 0.01, "grad_norm": 3.3970318563464375, "learning_rate": 4.808743169398907e-06, "loss": 1.0673, "step": 44 }, { "epoch": 0.01, "grad_norm": 3.189949697895489, "learning_rate": 4.918032786885246e-06, "loss": 1.1806, "step": 45 }, { "epoch": 0.01, "grad_norm": 3.911926429717794, "learning_rate": 5.027322404371585e-06, "loss": 1.1497, "step": 46 }, { "epoch": 0.01, "grad_norm": 0.9449909144292845, "learning_rate": 5.1366120218579245e-06, "loss": 0.266, "step": 47 }, { "epoch": 0.01, "grad_norm": 4.9756269852361, "learning_rate": 5.245901639344263e-06, "loss": 1.2885, "step": 48 }, { "epoch": 0.01, "grad_norm": 5.193194115360891, "learning_rate": 5.355191256830602e-06, "loss": 1.1986, "step": 49 }, { "epoch": 0.01, "grad_norm": 2.6932942152665036, "learning_rate": 5.464480874316941e-06, "loss": 1.099, "step": 50 }, { "epoch": 0.01, "grad_norm": 3.773497630557456, "learning_rate": 5.573770491803278e-06, "loss": 1.1132, "step": 51 }, { "epoch": 0.01, "grad_norm": 3.54903990691928, "learning_rate": 5.683060109289617e-06, "loss": 1.1954, "step": 52 }, { "epoch": 0.01, "grad_norm": 4.024121808447138, "learning_rate": 5.7923497267759565e-06, "loss": 1.1791, "step": 53 }, { "epoch": 0.01, "grad_norm": 2.726779876394771, "learning_rate": 5.9016393442622956e-06, "loss": 1.1151, "step": 54 }, { "epoch": 0.01, "grad_norm": 4.978281730573192, "learning_rate": 6.010928961748635e-06, "loss": 1.1925, "step": 55 }, { "epoch": 0.01, "grad_norm": 3.067515560118367, "learning_rate": 6.120218579234973e-06, "loss": 1.1231, "step": 56 }, { "epoch": 0.01, "grad_norm": 2.6926953468676573, "learning_rate": 6.229508196721312e-06, "loss": 0.9862, "step": 57 }, { "epoch": 0.01, "grad_norm": 3.337480329159369, "learning_rate": 6.338797814207651e-06, "loss": 1.1763, "step": 58 }, { "epoch": 0.01, "grad_norm": 3.0682974158129253, "learning_rate": 6.44808743169399e-06, "loss": 1.0417, "step": 59 }, { "epoch": 0.01, "grad_norm": 3.2399485869366105, "learning_rate": 6.5573770491803276e-06, "loss": 1.1232, "step": 60 }, { "epoch": 0.01, "grad_norm": 3.7880946454737576, "learning_rate": 6.666666666666667e-06, "loss": 1.051, "step": 61 }, { "epoch": 0.01, "grad_norm": 5.981167794799814, "learning_rate": 6.775956284153006e-06, "loss": 1.1327, "step": 62 }, { "epoch": 0.01, "grad_norm": 4.045144018106987, "learning_rate": 6.885245901639345e-06, "loss": 1.063, "step": 63 }, { "epoch": 0.01, "grad_norm": 3.566893500285689, "learning_rate": 6.994535519125684e-06, "loss": 1.127, "step": 64 }, { "epoch": 0.01, "grad_norm": 3.985013192981782, "learning_rate": 7.103825136612022e-06, "loss": 1.1675, "step": 65 }, { "epoch": 0.01, "grad_norm": 2.9338182291306345, "learning_rate": 7.213114754098361e-06, "loss": 1.0432, "step": 66 }, { "epoch": 0.01, "grad_norm": 2.5043560817586146, "learning_rate": 7.3224043715847e-06, "loss": 1.0447, "step": 67 }, { "epoch": 0.01, "grad_norm": 3.117782028831934, "learning_rate": 7.4316939890710394e-06, "loss": 1.1209, "step": 68 }, { "epoch": 0.01, "grad_norm": 3.7668022151541507, "learning_rate": 7.540983606557377e-06, "loss": 1.0765, "step": 69 }, { "epoch": 0.01, "grad_norm": 3.788523147709726, "learning_rate": 7.650273224043716e-06, "loss": 1.0993, "step": 70 }, { "epoch": 0.01, "grad_norm": 4.627669776471835, "learning_rate": 7.759562841530056e-06, "loss": 1.0482, "step": 71 }, { "epoch": 0.01, "grad_norm": 3.47296914015289, "learning_rate": 7.868852459016394e-06, "loss": 1.0769, "step": 72 }, { "epoch": 0.01, "grad_norm": 3.22829730642909, "learning_rate": 7.978142076502732e-06, "loss": 1.1183, "step": 73 }, { "epoch": 0.01, "grad_norm": 2.8910011111490985, "learning_rate": 8.087431693989072e-06, "loss": 1.0446, "step": 74 }, { "epoch": 0.01, "grad_norm": 3.1524835375166123, "learning_rate": 8.19672131147541e-06, "loss": 0.9776, "step": 75 }, { "epoch": 0.01, "grad_norm": 2.828276079528656, "learning_rate": 8.30601092896175e-06, "loss": 1.1086, "step": 76 }, { "epoch": 0.01, "grad_norm": 5.884559458469066, "learning_rate": 8.415300546448089e-06, "loss": 1.0978, "step": 77 }, { "epoch": 0.01, "grad_norm": 3.4568222058785927, "learning_rate": 8.524590163934427e-06, "loss": 1.0068, "step": 78 }, { "epoch": 0.01, "grad_norm": 3.2928244933606368, "learning_rate": 8.633879781420765e-06, "loss": 1.0622, "step": 79 }, { "epoch": 0.01, "grad_norm": 9.070211090671194, "learning_rate": 8.743169398907103e-06, "loss": 1.0817, "step": 80 }, { "epoch": 0.01, "grad_norm": 3.3776659247181184, "learning_rate": 8.852459016393443e-06, "loss": 1.1163, "step": 81 }, { "epoch": 0.01, "grad_norm": 4.995640597309245, "learning_rate": 8.961748633879782e-06, "loss": 1.0085, "step": 82 }, { "epoch": 0.01, "grad_norm": 4.676504965419237, "learning_rate": 9.071038251366122e-06, "loss": 1.0254, "step": 83 }, { "epoch": 0.01, "grad_norm": 4.318446009244394, "learning_rate": 9.18032786885246e-06, "loss": 1.1155, "step": 84 }, { "epoch": 0.01, "grad_norm": 3.2316804130850727, "learning_rate": 9.2896174863388e-06, "loss": 1.021, "step": 85 }, { "epoch": 0.01, "grad_norm": 2.592608820798961, "learning_rate": 9.398907103825138e-06, "loss": 1.0529, "step": 86 }, { "epoch": 0.01, "grad_norm": 2.4067979288783254, "learning_rate": 9.508196721311476e-06, "loss": 0.9943, "step": 87 }, { "epoch": 0.01, "grad_norm": 4.089169403715935, "learning_rate": 9.617486338797814e-06, "loss": 1.1293, "step": 88 }, { "epoch": 0.01, "grad_norm": 5.810154001403819, "learning_rate": 9.726775956284153e-06, "loss": 1.0625, "step": 89 }, { "epoch": 0.01, "grad_norm": 3.3253886636281176, "learning_rate": 9.836065573770493e-06, "loss": 1.0921, "step": 90 }, { "epoch": 0.01, "grad_norm": 3.2168719019324867, "learning_rate": 9.945355191256831e-06, "loss": 1.1041, "step": 91 }, { "epoch": 0.02, "grad_norm": 4.489558707554039, "learning_rate": 1.005464480874317e-05, "loss": 1.0337, "step": 92 }, { "epoch": 0.02, "grad_norm": 2.203154953864817, "learning_rate": 1.0163934426229509e-05, "loss": 0.9964, "step": 93 }, { "epoch": 0.02, "grad_norm": 4.841394563656923, "learning_rate": 1.0273224043715849e-05, "loss": 1.1216, "step": 94 }, { "epoch": 0.02, "grad_norm": 2.7664896268767447, "learning_rate": 1.0382513661202187e-05, "loss": 1.1051, "step": 95 }, { "epoch": 0.02, "grad_norm": 2.840591299203819, "learning_rate": 1.0491803278688525e-05, "loss": 1.0498, "step": 96 }, { "epoch": 0.02, "grad_norm": 3.3068995222309634, "learning_rate": 1.0601092896174865e-05, "loss": 1.0382, "step": 97 }, { "epoch": 0.02, "grad_norm": 4.273812248717692, "learning_rate": 1.0710382513661204e-05, "loss": 1.0898, "step": 98 }, { "epoch": 0.02, "grad_norm": 4.62437388459103, "learning_rate": 1.0819672131147544e-05, "loss": 1.071, "step": 99 }, { "epoch": 0.02, "grad_norm": 1.202416064692781, "learning_rate": 1.0928961748633882e-05, "loss": 0.2859, "step": 100 }, { "epoch": 0.02, "grad_norm": 3.895352912564396, "learning_rate": 1.1038251366120218e-05, "loss": 1.0498, "step": 101 }, { "epoch": 0.02, "grad_norm": 3.2611551399186607, "learning_rate": 1.1147540983606557e-05, "loss": 0.9913, "step": 102 }, { "epoch": 0.02, "grad_norm": 4.014111671396235, "learning_rate": 1.1256830601092897e-05, "loss": 1.0044, "step": 103 }, { "epoch": 0.02, "grad_norm": 3.533368022052666, "learning_rate": 1.1366120218579235e-05, "loss": 1.0804, "step": 104 }, { "epoch": 0.02, "grad_norm": 3.235046498395457, "learning_rate": 1.1475409836065575e-05, "loss": 1.0539, "step": 105 }, { "epoch": 0.02, "grad_norm": 2.781086724903625, "learning_rate": 1.1584699453551913e-05, "loss": 1.1119, "step": 106 }, { "epoch": 0.02, "grad_norm": 4.045531938028331, "learning_rate": 1.1693989071038251e-05, "loss": 0.9991, "step": 107 }, { "epoch": 0.02, "grad_norm": 3.75843694868663, "learning_rate": 1.1803278688524591e-05, "loss": 0.9962, "step": 108 }, { "epoch": 0.02, "grad_norm": 4.93512074060094, "learning_rate": 1.191256830601093e-05, "loss": 1.0793, "step": 109 }, { "epoch": 0.02, "grad_norm": 2.5997259519513602, "learning_rate": 1.202185792349727e-05, "loss": 0.9982, "step": 110 }, { "epoch": 0.02, "grad_norm": 4.419999889412088, "learning_rate": 1.2131147540983608e-05, "loss": 1.0521, "step": 111 }, { "epoch": 0.02, "grad_norm": 6.694167842489097, "learning_rate": 1.2240437158469946e-05, "loss": 0.9717, "step": 112 }, { "epoch": 0.02, "grad_norm": 2.710007516258926, "learning_rate": 1.2349726775956286e-05, "loss": 0.9978, "step": 113 }, { "epoch": 0.02, "grad_norm": 3.3134009530831294, "learning_rate": 1.2459016393442624e-05, "loss": 1.0299, "step": 114 }, { "epoch": 0.02, "grad_norm": 3.4249060820190547, "learning_rate": 1.2568306010928964e-05, "loss": 0.9837, "step": 115 }, { "epoch": 0.02, "grad_norm": 3.342855689982545, "learning_rate": 1.2677595628415302e-05, "loss": 1.0322, "step": 116 }, { "epoch": 0.02, "grad_norm": 3.2329700544932027, "learning_rate": 1.2786885245901642e-05, "loss": 1.0326, "step": 117 }, { "epoch": 0.02, "grad_norm": 3.4533757923451023, "learning_rate": 1.289617486338798e-05, "loss": 1.0201, "step": 118 }, { "epoch": 0.02, "grad_norm": 4.116104865199823, "learning_rate": 1.3005464480874317e-05, "loss": 1.0441, "step": 119 }, { "epoch": 0.02, "grad_norm": 3.1106118702426566, "learning_rate": 1.3114754098360655e-05, "loss": 1.0112, "step": 120 }, { "epoch": 0.02, "grad_norm": 2.7803913045359923, "learning_rate": 1.3224043715846995e-05, "loss": 1.0329, "step": 121 }, { "epoch": 0.02, "grad_norm": 3.8584409055224014, "learning_rate": 1.3333333333333333e-05, "loss": 1.0795, "step": 122 }, { "epoch": 0.02, "grad_norm": 2.924022682605421, "learning_rate": 1.3442622950819673e-05, "loss": 0.9788, "step": 123 }, { "epoch": 0.02, "grad_norm": 4.050237281732988, "learning_rate": 1.3551912568306011e-05, "loss": 1.0942, "step": 124 }, { "epoch": 0.02, "grad_norm": 3.8717790785894723, "learning_rate": 1.366120218579235e-05, "loss": 0.9535, "step": 125 }, { "epoch": 0.02, "grad_norm": 82.5513456823171, "learning_rate": 1.377049180327869e-05, "loss": 0.9913, "step": 126 }, { "epoch": 0.02, "grad_norm": 3.0823180561351, "learning_rate": 1.3879781420765028e-05, "loss": 1.0227, "step": 127 }, { "epoch": 0.02, "grad_norm": 3.4770932161354104, "learning_rate": 1.3989071038251368e-05, "loss": 0.9803, "step": 128 }, { "epoch": 0.02, "grad_norm": 64.62993094705931, "learning_rate": 1.4098360655737706e-05, "loss": 1.0557, "step": 129 }, { "epoch": 0.02, "grad_norm": 3.0244072712414143, "learning_rate": 1.4207650273224044e-05, "loss": 1.032, "step": 130 }, { "epoch": 0.02, "grad_norm": 3.7710858184334803, "learning_rate": 1.4316939890710384e-05, "loss": 1.0226, "step": 131 }, { "epoch": 0.02, "grad_norm": 7.631169409699158, "learning_rate": 1.4426229508196722e-05, "loss": 1.084, "step": 132 }, { "epoch": 0.02, "grad_norm": 4.47749481462741, "learning_rate": 1.4535519125683062e-05, "loss": 1.0178, "step": 133 }, { "epoch": 0.02, "grad_norm": 3.195939357617075, "learning_rate": 1.46448087431694e-05, "loss": 0.3777, "step": 134 }, { "epoch": 0.02, "grad_norm": 3.9160919271477193, "learning_rate": 1.4754098360655739e-05, "loss": 1.0219, "step": 135 }, { "epoch": 0.02, "grad_norm": 2.968478498154621, "learning_rate": 1.4863387978142079e-05, "loss": 1.0012, "step": 136 }, { "epoch": 0.02, "grad_norm": 2.25520291845516, "learning_rate": 1.4972677595628417e-05, "loss": 1.0035, "step": 137 }, { "epoch": 0.02, "grad_norm": 3.615356454087918, "learning_rate": 1.5081967213114754e-05, "loss": 1.0235, "step": 138 }, { "epoch": 0.02, "grad_norm": 3.269947833311223, "learning_rate": 1.5191256830601094e-05, "loss": 1.0132, "step": 139 }, { "epoch": 0.02, "grad_norm": 4.047926804304574, "learning_rate": 1.5300546448087432e-05, "loss": 0.9297, "step": 140 }, { "epoch": 0.02, "grad_norm": 4.000146739548202, "learning_rate": 1.5409836065573772e-05, "loss": 1.0575, "step": 141 }, { "epoch": 0.02, "grad_norm": 2.7654631026880114, "learning_rate": 1.551912568306011e-05, "loss": 1.0188, "step": 142 }, { "epoch": 0.02, "grad_norm": 3.407562704308582, "learning_rate": 1.5628415300546448e-05, "loss": 1.0258, "step": 143 }, { "epoch": 0.02, "grad_norm": 2.4864272917236803, "learning_rate": 1.5737704918032788e-05, "loss": 1.0526, "step": 144 }, { "epoch": 0.02, "grad_norm": 3.0442317873615194, "learning_rate": 1.5846994535519128e-05, "loss": 0.9707, "step": 145 }, { "epoch": 0.02, "grad_norm": 2.912255004967118, "learning_rate": 1.5956284153005465e-05, "loss": 1.0263, "step": 146 }, { "epoch": 0.02, "grad_norm": 3.172547334598689, "learning_rate": 1.6065573770491805e-05, "loss": 1.0409, "step": 147 }, { "epoch": 0.02, "grad_norm": 2.8949881398891337, "learning_rate": 1.6174863387978145e-05, "loss": 1.0293, "step": 148 }, { "epoch": 0.02, "grad_norm": 2.8752043265108655, "learning_rate": 1.628415300546448e-05, "loss": 0.9346, "step": 149 }, { "epoch": 0.02, "grad_norm": 2.7397819736310525, "learning_rate": 1.639344262295082e-05, "loss": 0.9663, "step": 150 }, { "epoch": 0.02, "grad_norm": 3.5172014440963717, "learning_rate": 1.650273224043716e-05, "loss": 0.9538, "step": 151 }, { "epoch": 0.02, "grad_norm": 3.157545310081949, "learning_rate": 1.66120218579235e-05, "loss": 1.026, "step": 152 }, { "epoch": 0.03, "grad_norm": 3.0248702725125933, "learning_rate": 1.6721311475409837e-05, "loss": 1.0739, "step": 153 }, { "epoch": 0.03, "grad_norm": 3.3502467163274408, "learning_rate": 1.6830601092896177e-05, "loss": 0.9418, "step": 154 }, { "epoch": 0.03, "grad_norm": 3.437985522013736, "learning_rate": 1.6939890710382517e-05, "loss": 1.042, "step": 155 }, { "epoch": 0.03, "grad_norm": 6.921395420365113, "learning_rate": 1.7049180327868854e-05, "loss": 0.969, "step": 156 }, { "epoch": 0.03, "grad_norm": 2.8085291076481496, "learning_rate": 1.715846994535519e-05, "loss": 1.0475, "step": 157 }, { "epoch": 0.03, "grad_norm": 3.247413228828877, "learning_rate": 1.726775956284153e-05, "loss": 0.9531, "step": 158 }, { "epoch": 0.03, "grad_norm": 2.7408796043945984, "learning_rate": 1.737704918032787e-05, "loss": 1.0192, "step": 159 }, { "epoch": 0.03, "grad_norm": 2.720800753641785, "learning_rate": 1.7486338797814207e-05, "loss": 1.0259, "step": 160 }, { "epoch": 0.03, "grad_norm": 2.5318336223403546, "learning_rate": 1.7595628415300547e-05, "loss": 1.0423, "step": 161 }, { "epoch": 0.03, "grad_norm": 2.560086424364925, "learning_rate": 1.7704918032786887e-05, "loss": 1.0016, "step": 162 }, { "epoch": 0.03, "grad_norm": 2.754379700649437, "learning_rate": 1.7814207650273227e-05, "loss": 1.1403, "step": 163 }, { "epoch": 0.03, "grad_norm": 4.311303777864817, "learning_rate": 1.7923497267759563e-05, "loss": 0.9571, "step": 164 }, { "epoch": 0.03, "grad_norm": 2.7669610336123855, "learning_rate": 1.8032786885245903e-05, "loss": 0.9874, "step": 165 }, { "epoch": 0.03, "grad_norm": 3.159819204072131, "learning_rate": 1.8142076502732243e-05, "loss": 1.0179, "step": 166 }, { "epoch": 0.03, "grad_norm": 3.7977256372949215, "learning_rate": 1.825136612021858e-05, "loss": 0.885, "step": 167 }, { "epoch": 0.03, "grad_norm": 2.9959512573471363, "learning_rate": 1.836065573770492e-05, "loss": 1.0336, "step": 168 }, { "epoch": 0.03, "grad_norm": 2.982968037964523, "learning_rate": 1.846994535519126e-05, "loss": 1.0033, "step": 169 }, { "epoch": 0.03, "grad_norm": 2.4892013304302654, "learning_rate": 1.85792349726776e-05, "loss": 0.9776, "step": 170 }, { "epoch": 0.03, "grad_norm": 4.804442038861977, "learning_rate": 1.8688524590163936e-05, "loss": 1.0248, "step": 171 }, { "epoch": 0.03, "grad_norm": 2.216313229708277, "learning_rate": 1.8797814207650276e-05, "loss": 1.0029, "step": 172 }, { "epoch": 0.03, "grad_norm": 4.22020821761182, "learning_rate": 1.8907103825136616e-05, "loss": 0.9555, "step": 173 }, { "epoch": 0.03, "grad_norm": 4.660417206443456, "learning_rate": 1.9016393442622952e-05, "loss": 0.9954, "step": 174 }, { "epoch": 0.03, "grad_norm": 2.614008451080058, "learning_rate": 1.912568306010929e-05, "loss": 0.9534, "step": 175 }, { "epoch": 0.03, "grad_norm": 4.257126945099854, "learning_rate": 1.923497267759563e-05, "loss": 0.9563, "step": 176 }, { "epoch": 0.03, "grad_norm": 4.0951575429203, "learning_rate": 1.934426229508197e-05, "loss": 0.9335, "step": 177 }, { "epoch": 0.03, "grad_norm": 5.45451684116769, "learning_rate": 1.9453551912568305e-05, "loss": 0.9953, "step": 178 }, { "epoch": 0.03, "grad_norm": 3.2260242966236583, "learning_rate": 1.9562841530054645e-05, "loss": 1.0368, "step": 179 }, { "epoch": 0.03, "grad_norm": 3.2511908963812965, "learning_rate": 1.9672131147540985e-05, "loss": 1.0317, "step": 180 }, { "epoch": 0.03, "grad_norm": 2.370040666840348, "learning_rate": 1.9781420765027325e-05, "loss": 1.0345, "step": 181 }, { "epoch": 0.03, "grad_norm": 3.1039223740999073, "learning_rate": 1.9890710382513662e-05, "loss": 1.0634, "step": 182 }, { "epoch": 0.03, "grad_norm": 4.147021327481347, "learning_rate": 2e-05, "loss": 0.9478, "step": 183 }, { "epoch": 0.03, "grad_norm": 3.1472763530012196, "learning_rate": 1.999999858476052e-05, "loss": 0.9637, "step": 184 }, { "epoch": 0.03, "grad_norm": 3.4971068623996433, "learning_rate": 1.999999433904248e-05, "loss": 1.016, "step": 185 }, { "epoch": 0.03, "grad_norm": 3.748687567526857, "learning_rate": 1.999998726284708e-05, "loss": 0.9845, "step": 186 }, { "epoch": 0.03, "grad_norm": 3.3510157827801743, "learning_rate": 1.999997735617632e-05, "loss": 1.0113, "step": 187 }, { "epoch": 0.03, "grad_norm": 3.3146832171955163, "learning_rate": 1.999996461903301e-05, "loss": 1.0631, "step": 188 }, { "epoch": 0.03, "grad_norm": 3.7805585217500823, "learning_rate": 1.999994905142075e-05, "loss": 0.964, "step": 189 }, { "epoch": 0.03, "grad_norm": 2.0922445813586092, "learning_rate": 1.999993065334395e-05, "loss": 1.0247, "step": 190 }, { "epoch": 0.03, "grad_norm": 2.236565519587067, "learning_rate": 1.999990942480782e-05, "loss": 0.9822, "step": 191 }, { "epoch": 0.03, "grad_norm": 2.298854631536318, "learning_rate": 1.999988536581836e-05, "loss": 0.9435, "step": 192 }, { "epoch": 0.03, "grad_norm": 3.2840909928596065, "learning_rate": 1.9999858476382388e-05, "loss": 0.9516, "step": 193 }, { "epoch": 0.03, "grad_norm": 4.2543474953984735, "learning_rate": 1.9999828756507512e-05, "loss": 1.0895, "step": 194 }, { "epoch": 0.03, "grad_norm": 2.8325326237923365, "learning_rate": 1.999979620620214e-05, "loss": 0.9454, "step": 195 }, { "epoch": 0.03, "grad_norm": 2.8026645329936484, "learning_rate": 1.9999760825475496e-05, "loss": 0.9888, "step": 196 }, { "epoch": 0.03, "grad_norm": 2.721635267866406, "learning_rate": 1.9999722614337585e-05, "loss": 1.0376, "step": 197 }, { "epoch": 0.03, "grad_norm": 3.203262728636024, "learning_rate": 1.9999681572799226e-05, "loss": 1.0376, "step": 198 }, { "epoch": 0.03, "grad_norm": 2.0435625011420058, "learning_rate": 1.9999637700872037e-05, "loss": 0.9933, "step": 199 }, { "epoch": 0.03, "grad_norm": 2.8227786890891053, "learning_rate": 1.9999590998568432e-05, "loss": 1.0181, "step": 200 }, { "epoch": 0.03, "grad_norm": 2.6403807822358885, "learning_rate": 1.9999541465901636e-05, "loss": 1.0372, "step": 201 }, { "epoch": 0.03, "grad_norm": 3.8779377998841373, "learning_rate": 1.9999489102885657e-05, "loss": 1.0043, "step": 202 }, { "epoch": 0.03, "grad_norm": 2.866930674438406, "learning_rate": 1.9999433909535333e-05, "loss": 1.0216, "step": 203 }, { "epoch": 0.03, "grad_norm": 3.490673893351854, "learning_rate": 1.9999375885866272e-05, "loss": 0.9465, "step": 204 }, { "epoch": 0.03, "grad_norm": 2.2031794232787654, "learning_rate": 1.9999315031894908e-05, "loss": 0.9958, "step": 205 }, { "epoch": 0.03, "grad_norm": 2.843942144085718, "learning_rate": 1.999925134763846e-05, "loss": 1.026, "step": 206 }, { "epoch": 0.03, "grad_norm": 2.562323430413834, "learning_rate": 1.9999184833114952e-05, "loss": 1.0252, "step": 207 }, { "epoch": 0.03, "grad_norm": 2.9627363607177783, "learning_rate": 1.9999115488343213e-05, "loss": 1.0352, "step": 208 }, { "epoch": 0.03, "grad_norm": 2.833921449957381, "learning_rate": 1.9999043313342875e-05, "loss": 0.9347, "step": 209 }, { "epoch": 0.03, "grad_norm": 3.1189728263933176, "learning_rate": 1.9998968308134362e-05, "loss": 1.0006, "step": 210 }, { "epoch": 0.03, "grad_norm": 6.6715162907214, "learning_rate": 1.9998890472738902e-05, "loss": 1.0086, "step": 211 }, { "epoch": 0.03, "grad_norm": 2.8289351504187055, "learning_rate": 1.9998809807178533e-05, "loss": 1.081, "step": 212 }, { "epoch": 0.03, "grad_norm": 2.7002725834796415, "learning_rate": 1.999872631147608e-05, "loss": 0.9275, "step": 213 }, { "epoch": 0.04, "grad_norm": 4.644746580409732, "learning_rate": 1.9998639985655183e-05, "loss": 0.9307, "step": 214 }, { "epoch": 0.04, "grad_norm": 3.0790755464435273, "learning_rate": 1.9998550829740277e-05, "loss": 0.9932, "step": 215 }, { "epoch": 0.04, "grad_norm": 3.423922572380181, "learning_rate": 1.9998458843756587e-05, "loss": 1.0319, "step": 216 }, { "epoch": 0.04, "grad_norm": 2.5702952117029807, "learning_rate": 1.999836402773016e-05, "loss": 1.0349, "step": 217 }, { "epoch": 0.04, "grad_norm": 2.985969129660244, "learning_rate": 1.999826638168783e-05, "loss": 0.8944, "step": 218 }, { "epoch": 0.04, "grad_norm": 2.6410692094433075, "learning_rate": 1.999816590565723e-05, "loss": 0.9962, "step": 219 }, { "epoch": 0.04, "grad_norm": 3.3799600466520205, "learning_rate": 1.999806259966681e-05, "loss": 1.0219, "step": 220 }, { "epoch": 0.04, "grad_norm": 2.6759976555500873, "learning_rate": 1.9997956463745806e-05, "loss": 0.9543, "step": 221 }, { "epoch": 0.04, "grad_norm": 3.0463547834077995, "learning_rate": 1.9997847497924258e-05, "loss": 1.0579, "step": 222 }, { "epoch": 0.04, "grad_norm": 2.1269854364898575, "learning_rate": 1.9997735702233006e-05, "loss": 0.9701, "step": 223 }, { "epoch": 0.04, "grad_norm": 3.9717777805943295, "learning_rate": 1.99976210767037e-05, "loss": 0.9725, "step": 224 }, { "epoch": 0.04, "grad_norm": 2.7258107651051118, "learning_rate": 1.999750362136878e-05, "loss": 0.9629, "step": 225 }, { "epoch": 0.04, "grad_norm": 15.726360607621805, "learning_rate": 1.999738333626149e-05, "loss": 0.8994, "step": 226 }, { "epoch": 0.04, "grad_norm": 2.9025259731061253, "learning_rate": 1.9997260221415883e-05, "loss": 0.9238, "step": 227 }, { "epoch": 0.04, "grad_norm": 2.732791549125479, "learning_rate": 1.99971342768668e-05, "loss": 1.0148, "step": 228 }, { "epoch": 0.04, "grad_norm": 4.283656539514148, "learning_rate": 1.99970055026499e-05, "loss": 0.9915, "step": 229 }, { "epoch": 0.04, "grad_norm": 2.6096732064689885, "learning_rate": 1.9996873898801617e-05, "loss": 0.8894, "step": 230 }, { "epoch": 0.04, "grad_norm": 2.6826919063196826, "learning_rate": 1.9996739465359208e-05, "loss": 0.983, "step": 231 }, { "epoch": 0.04, "grad_norm": 2.793093899299533, "learning_rate": 1.9996602202360728e-05, "loss": 0.9923, "step": 232 }, { "epoch": 0.04, "grad_norm": 2.409212706240295, "learning_rate": 1.999646210984502e-05, "loss": 0.9783, "step": 233 }, { "epoch": 0.04, "grad_norm": 2.7627246973091437, "learning_rate": 1.999631918785175e-05, "loss": 0.905, "step": 234 }, { "epoch": 0.04, "grad_norm": 3.314652523990065, "learning_rate": 1.999617343642136e-05, "loss": 0.9518, "step": 235 }, { "epoch": 0.04, "grad_norm": 3.2624405233292166, "learning_rate": 1.999602485559511e-05, "loss": 0.9832, "step": 236 }, { "epoch": 0.04, "grad_norm": 2.0919976780499003, "learning_rate": 1.9995873445415056e-05, "loss": 0.9322, "step": 237 }, { "epoch": 0.04, "grad_norm": 2.9108295979592937, "learning_rate": 1.999571920592405e-05, "loss": 1.0025, "step": 238 }, { "epoch": 0.04, "grad_norm": 2.5906352609190315, "learning_rate": 1.9995562137165752e-05, "loss": 0.9777, "step": 239 }, { "epoch": 0.04, "grad_norm": 3.3060976283293617, "learning_rate": 1.9995402239184622e-05, "loss": 0.9821, "step": 240 }, { "epoch": 0.04, "grad_norm": 2.5919514574609073, "learning_rate": 1.9995239512025916e-05, "loss": 0.9117, "step": 241 }, { "epoch": 0.04, "grad_norm": 2.7094117259534807, "learning_rate": 1.9995073955735695e-05, "loss": 0.9502, "step": 242 }, { "epoch": 0.04, "grad_norm": 3.5667441522912156, "learning_rate": 1.9994905570360817e-05, "loss": 0.8838, "step": 243 }, { "epoch": 0.04, "grad_norm": 2.5317038225374544, "learning_rate": 1.9994734355948943e-05, "loss": 1.0629, "step": 244 }, { "epoch": 0.04, "grad_norm": 2.809440625277568, "learning_rate": 1.9994560312548538e-05, "loss": 0.9693, "step": 245 }, { "epoch": 0.04, "grad_norm": 3.071436318952301, "learning_rate": 1.9994383440208867e-05, "loss": 0.9181, "step": 246 }, { "epoch": 0.04, "grad_norm": 3.261241086549945, "learning_rate": 1.9994203738979984e-05, "loss": 0.9403, "step": 247 }, { "epoch": 0.04, "grad_norm": 5.341444397229066, "learning_rate": 1.999402120891276e-05, "loss": 0.981, "step": 248 }, { "epoch": 0.04, "grad_norm": 3.4625338061490565, "learning_rate": 1.999383585005886e-05, "loss": 0.9231, "step": 249 }, { "epoch": 0.04, "grad_norm": 2.770169837218384, "learning_rate": 1.9993647662470746e-05, "loss": 1.0157, "step": 250 }, { "epoch": 0.04, "grad_norm": 5.038330697631086, "learning_rate": 1.999345664620169e-05, "loss": 0.9644, "step": 251 }, { "epoch": 0.04, "grad_norm": 2.9800003400454753, "learning_rate": 1.999326280130575e-05, "loss": 0.9619, "step": 252 }, { "epoch": 0.04, "grad_norm": 2.489313694325693, "learning_rate": 1.99930661278378e-05, "loss": 0.9441, "step": 253 }, { "epoch": 0.04, "grad_norm": 2.867247919600943, "learning_rate": 1.999286662585351e-05, "loss": 0.9039, "step": 254 }, { "epoch": 0.04, "grad_norm": 2.771365158060359, "learning_rate": 1.9992664295409338e-05, "loss": 0.9404, "step": 255 }, { "epoch": 0.04, "grad_norm": 3.8967199516416073, "learning_rate": 1.999245913656256e-05, "loss": 0.9786, "step": 256 }, { "epoch": 0.04, "grad_norm": 3.731857420313871, "learning_rate": 1.9992251149371253e-05, "loss": 0.957, "step": 257 }, { "epoch": 0.04, "grad_norm": 2.405267400406675, "learning_rate": 1.9992040333894273e-05, "loss": 0.9192, "step": 258 }, { "epoch": 0.04, "grad_norm": 2.490455563338981, "learning_rate": 1.9991826690191304e-05, "loss": 0.915, "step": 259 }, { "epoch": 0.04, "grad_norm": 2.036676661125802, "learning_rate": 1.9991610218322804e-05, "loss": 0.9605, "step": 260 }, { "epoch": 0.04, "grad_norm": 4.053945286004571, "learning_rate": 1.9991390918350054e-05, "loss": 1.0228, "step": 261 }, { "epoch": 0.04, "grad_norm": 2.3770066798718577, "learning_rate": 1.999116879033513e-05, "loss": 0.969, "step": 262 }, { "epoch": 0.04, "grad_norm": 2.866496204794114, "learning_rate": 1.9990943834340893e-05, "loss": 0.9256, "step": 263 }, { "epoch": 0.04, "grad_norm": 2.0630073620206, "learning_rate": 1.9990716050431026e-05, "loss": 0.9556, "step": 264 }, { "epoch": 0.04, "grad_norm": 3.2956584626945125, "learning_rate": 1.9990485438669998e-05, "loss": 0.9474, "step": 265 }, { "epoch": 0.04, "grad_norm": 2.414602844317342, "learning_rate": 1.9990251999123086e-05, "loss": 0.8911, "step": 266 }, { "epoch": 0.04, "grad_norm": 4.6920684023503245, "learning_rate": 1.9990015731856366e-05, "loss": 0.9453, "step": 267 }, { "epoch": 0.04, "grad_norm": 2.318331006385827, "learning_rate": 1.9989776636936705e-05, "loss": 0.9813, "step": 268 }, { "epoch": 0.04, "grad_norm": 3.0098553177969833, "learning_rate": 1.9989534714431788e-05, "loss": 0.9648, "step": 269 }, { "epoch": 0.04, "grad_norm": 3.1199514199765668, "learning_rate": 1.9989289964410082e-05, "loss": 0.8166, "step": 270 }, { "epoch": 0.04, "grad_norm": 2.138392411831753, "learning_rate": 1.9989042386940872e-05, "loss": 0.8533, "step": 271 }, { "epoch": 0.04, "grad_norm": 1.3910623183473396, "learning_rate": 1.9988791982094224e-05, "loss": 0.3855, "step": 272 }, { "epoch": 0.04, "grad_norm": 2.033045197940747, "learning_rate": 1.9988538749941024e-05, "loss": 0.9667, "step": 273 }, { "epoch": 0.05, "grad_norm": 2.3280038435123265, "learning_rate": 1.998828269055294e-05, "loss": 0.8555, "step": 274 }, { "epoch": 0.05, "grad_norm": 2.700676974268106, "learning_rate": 1.9988023804002463e-05, "loss": 0.9566, "step": 275 }, { "epoch": 0.05, "grad_norm": 4.610433052828829, "learning_rate": 1.9987762090362855e-05, "loss": 0.9528, "step": 276 }, { "epoch": 0.05, "grad_norm": 2.333988924645032, "learning_rate": 1.9987497549708197e-05, "loss": 0.9862, "step": 277 }, { "epoch": 0.05, "grad_norm": 3.3401299918028027, "learning_rate": 1.9987230182113374e-05, "loss": 1.0048, "step": 278 }, { "epoch": 0.05, "grad_norm": 2.918986461405097, "learning_rate": 1.998695998765406e-05, "loss": 0.9872, "step": 279 }, { "epoch": 0.05, "grad_norm": 2.1548861198250227, "learning_rate": 1.9986686966406728e-05, "loss": 0.9479, "step": 280 }, { "epoch": 0.05, "grad_norm": 3.1301700497032567, "learning_rate": 1.9986411118448666e-05, "loss": 0.9405, "step": 281 }, { "epoch": 0.05, "grad_norm": 3.6786057786564683, "learning_rate": 1.9986132443857942e-05, "loss": 0.882, "step": 282 }, { "epoch": 0.05, "grad_norm": 2.805624202319603, "learning_rate": 1.998585094271344e-05, "loss": 0.9276, "step": 283 }, { "epoch": 0.05, "grad_norm": 1.813795758980809, "learning_rate": 1.9985566615094836e-05, "loss": 1.002, "step": 284 }, { "epoch": 0.05, "grad_norm": 3.4577354722819345, "learning_rate": 1.998527946108261e-05, "loss": 0.9025, "step": 285 }, { "epoch": 0.05, "grad_norm": 4.08147091362666, "learning_rate": 1.9984989480758038e-05, "loss": 0.9509, "step": 286 }, { "epoch": 0.05, "grad_norm": 2.3533123390669037, "learning_rate": 1.9984696674203204e-05, "loss": 1.0019, "step": 287 }, { "epoch": 0.05, "grad_norm": 2.117475257103754, "learning_rate": 1.998440104150098e-05, "loss": 0.9846, "step": 288 }, { "epoch": 0.05, "grad_norm": 2.7421902838559995, "learning_rate": 1.998410258273505e-05, "loss": 0.9958, "step": 289 }, { "epoch": 0.05, "grad_norm": 2.8657402766227595, "learning_rate": 1.9983801297989883e-05, "loss": 0.9337, "step": 290 }, { "epoch": 0.05, "grad_norm": 2.3061765224181636, "learning_rate": 1.9983497187350767e-05, "loss": 0.9168, "step": 291 }, { "epoch": 0.05, "grad_norm": 1.8303094680268286, "learning_rate": 1.998319025090377e-05, "loss": 0.9659, "step": 292 }, { "epoch": 0.05, "grad_norm": 12.406666744412941, "learning_rate": 1.998288048873578e-05, "loss": 0.9408, "step": 293 }, { "epoch": 0.05, "grad_norm": 2.0756604087589285, "learning_rate": 1.998256790093447e-05, "loss": 0.9196, "step": 294 }, { "epoch": 0.05, "grad_norm": 2.0597161108307698, "learning_rate": 1.9982252487588315e-05, "loss": 0.9065, "step": 295 }, { "epoch": 0.05, "grad_norm": 2.376162993773039, "learning_rate": 1.9981934248786596e-05, "loss": 0.9027, "step": 296 }, { "epoch": 0.05, "grad_norm": 2.372759726920483, "learning_rate": 1.9981613184619387e-05, "loss": 0.9794, "step": 297 }, { "epoch": 0.05, "grad_norm": 2.360305257572119, "learning_rate": 1.9981289295177566e-05, "loss": 1.0333, "step": 298 }, { "epoch": 0.05, "grad_norm": 3.311066059915006, "learning_rate": 1.9980962580552808e-05, "loss": 0.954, "step": 299 }, { "epoch": 0.05, "grad_norm": 2.577831060576765, "learning_rate": 1.998063304083759e-05, "loss": 0.9523, "step": 300 }, { "epoch": 0.05, "grad_norm": 2.457734129830656, "learning_rate": 1.9980300676125188e-05, "loss": 0.9718, "step": 301 }, { "epoch": 0.05, "grad_norm": 2.2842587658977407, "learning_rate": 1.9979965486509676e-05, "loss": 0.9912, "step": 302 }, { "epoch": 0.05, "grad_norm": 2.0725025532790466, "learning_rate": 1.9979627472085927e-05, "loss": 0.9326, "step": 303 }, { "epoch": 0.05, "grad_norm": 2.9074198158826112, "learning_rate": 1.997928663294962e-05, "loss": 0.9724, "step": 304 }, { "epoch": 0.05, "grad_norm": 2.4522149982464914, "learning_rate": 1.9978942969197224e-05, "loss": 0.9528, "step": 305 }, { "epoch": 0.05, "grad_norm": 3.3987018761944277, "learning_rate": 1.9978596480926017e-05, "loss": 1.025, "step": 306 }, { "epoch": 0.05, "grad_norm": 2.677785166439524, "learning_rate": 1.9978247168234065e-05, "loss": 0.9417, "step": 307 }, { "epoch": 0.05, "grad_norm": 2.2969462604381117, "learning_rate": 1.997789503122025e-05, "loss": 0.9231, "step": 308 }, { "epoch": 0.05, "grad_norm": 2.419940352990441, "learning_rate": 1.9977540069984233e-05, "loss": 1.0193, "step": 309 }, { "epoch": 0.05, "grad_norm": 2.2423072307257597, "learning_rate": 1.9977182284626492e-05, "loss": 0.9207, "step": 310 }, { "epoch": 0.05, "grad_norm": 2.4266902849074663, "learning_rate": 1.9976821675248298e-05, "loss": 0.957, "step": 311 }, { "epoch": 0.05, "grad_norm": 2.9258908298406143, "learning_rate": 1.9976458241951715e-05, "loss": 0.9495, "step": 312 }, { "epoch": 0.05, "grad_norm": 3.3494334255248717, "learning_rate": 1.9976091984839616e-05, "loss": 0.912, "step": 313 }, { "epoch": 0.05, "grad_norm": 2.971496182843357, "learning_rate": 1.997572290401567e-05, "loss": 0.9503, "step": 314 }, { "epoch": 0.05, "grad_norm": 3.833648387382831, "learning_rate": 1.9975350999584342e-05, "loss": 0.9628, "step": 315 }, { "epoch": 0.05, "grad_norm": 3.1999157460639918, "learning_rate": 1.99749762716509e-05, "loss": 0.8768, "step": 316 }, { "epoch": 0.05, "grad_norm": 2.6175401284972035, "learning_rate": 1.9974598720321407e-05, "loss": 0.9177, "step": 317 }, { "epoch": 0.05, "grad_norm": 2.532150064553135, "learning_rate": 1.9974218345702733e-05, "loss": 0.9865, "step": 318 }, { "epoch": 0.05, "grad_norm": 2.4922588781721413, "learning_rate": 1.997383514790254e-05, "loss": 0.9832, "step": 319 }, { "epoch": 0.05, "grad_norm": 2.2380831674339405, "learning_rate": 1.9973449127029296e-05, "loss": 0.9498, "step": 320 }, { "epoch": 0.05, "grad_norm": 2.556255701403137, "learning_rate": 1.9973060283192253e-05, "loss": 0.9252, "step": 321 }, { "epoch": 0.05, "grad_norm": 2.0500122367799616, "learning_rate": 1.997266861650148e-05, "loss": 0.9107, "step": 322 }, { "epoch": 0.05, "grad_norm": 2.751743931870002, "learning_rate": 1.9972274127067838e-05, "loss": 0.9733, "step": 323 }, { "epoch": 0.05, "grad_norm": 2.6166080675702488, "learning_rate": 1.997187681500298e-05, "loss": 0.9784, "step": 324 }, { "epoch": 0.05, "grad_norm": 2.583990968431525, "learning_rate": 1.9971476680419372e-05, "loss": 0.9333, "step": 325 }, { "epoch": 0.05, "grad_norm": 3.826789403298811, "learning_rate": 1.9971073723430266e-05, "loss": 0.9922, "step": 326 }, { "epoch": 0.05, "grad_norm": 2.739987053358858, "learning_rate": 1.997066794414972e-05, "loss": 0.9209, "step": 327 }, { "epoch": 0.05, "grad_norm": 2.242079561182652, "learning_rate": 1.997025934269259e-05, "loss": 0.9469, "step": 328 }, { "epoch": 0.05, "grad_norm": 6.9527567044832645, "learning_rate": 1.9969847919174525e-05, "loss": 0.9345, "step": 329 }, { "epoch": 0.05, "grad_norm": 2.3570312115519605, "learning_rate": 1.9969433673711984e-05, "loss": 1.0304, "step": 330 }, { "epoch": 0.05, "grad_norm": 2.885896519878295, "learning_rate": 1.9969016606422215e-05, "loss": 0.9122, "step": 331 }, { "epoch": 0.05, "grad_norm": 2.7200344116943738, "learning_rate": 1.996859671742327e-05, "loss": 0.9615, "step": 332 }, { "epoch": 0.05, "grad_norm": 0.7710675967492197, "learning_rate": 1.9968174006833996e-05, "loss": 0.3791, "step": 333 }, { "epoch": 0.05, "grad_norm": 2.3908860854558736, "learning_rate": 1.996774847477404e-05, "loss": 0.9002, "step": 334 }, { "epoch": 0.06, "grad_norm": 2.923254174697558, "learning_rate": 1.9967320121363845e-05, "loss": 0.958, "step": 335 }, { "epoch": 0.06, "grad_norm": 2.605139707637925, "learning_rate": 1.9966888946724663e-05, "loss": 0.9377, "step": 336 }, { "epoch": 0.06, "grad_norm": 2.128951004613318, "learning_rate": 1.9966454950978534e-05, "loss": 0.9976, "step": 337 }, { "epoch": 0.06, "grad_norm": 3.8037484295265993, "learning_rate": 1.9966018134248296e-05, "loss": 0.9414, "step": 338 }, { "epoch": 0.06, "grad_norm": 2.7623083084776163, "learning_rate": 1.9965578496657593e-05, "loss": 1.0106, "step": 339 }, { "epoch": 0.06, "grad_norm": 2.5482763892990574, "learning_rate": 1.996513603833086e-05, "loss": 1.0415, "step": 340 }, { "epoch": 0.06, "grad_norm": 3.3262014787740246, "learning_rate": 1.996469075939334e-05, "loss": 0.9417, "step": 341 }, { "epoch": 0.06, "grad_norm": 2.3939401587905302, "learning_rate": 1.9964242659971063e-05, "loss": 0.8942, "step": 342 }, { "epoch": 0.06, "grad_norm": 2.447772702842197, "learning_rate": 1.9963791740190863e-05, "loss": 0.9025, "step": 343 }, { "epoch": 0.06, "grad_norm": 2.564194653340543, "learning_rate": 1.9963338000180374e-05, "loss": 0.9062, "step": 344 }, { "epoch": 0.06, "grad_norm": 5.115867780379545, "learning_rate": 1.9962881440068026e-05, "loss": 0.9312, "step": 345 }, { "epoch": 0.06, "grad_norm": 0.7602730344640769, "learning_rate": 1.9962422059983045e-05, "loss": 0.3421, "step": 346 }, { "epoch": 0.06, "grad_norm": 2.3556711881866517, "learning_rate": 1.9961959860055456e-05, "loss": 0.9639, "step": 347 }, { "epoch": 0.06, "grad_norm": 2.186576324122532, "learning_rate": 1.996149484041609e-05, "loss": 0.962, "step": 348 }, { "epoch": 0.06, "grad_norm": 0.7138428819634327, "learning_rate": 1.9961027001196568e-05, "loss": 0.3515, "step": 349 }, { "epoch": 0.06, "grad_norm": 0.6508953544807635, "learning_rate": 1.996055634252931e-05, "loss": 0.3293, "step": 350 }, { "epoch": 0.06, "grad_norm": 2.388903412434706, "learning_rate": 1.9960082864547528e-05, "loss": 0.9492, "step": 351 }, { "epoch": 0.06, "grad_norm": 3.8508418423954964, "learning_rate": 1.995960656738525e-05, "loss": 0.9282, "step": 352 }, { "epoch": 0.06, "grad_norm": 2.3308046184035076, "learning_rate": 1.9959127451177287e-05, "loss": 1.0054, "step": 353 }, { "epoch": 0.06, "grad_norm": 2.364608275986422, "learning_rate": 1.9958645516059247e-05, "loss": 0.9512, "step": 354 }, { "epoch": 0.06, "grad_norm": 1.6758130032868634, "learning_rate": 1.995816076216755e-05, "loss": 0.9734, "step": 355 }, { "epoch": 0.06, "grad_norm": 2.69935518404202, "learning_rate": 1.9957673189639398e-05, "loss": 1.0034, "step": 356 }, { "epoch": 0.06, "grad_norm": 3.206955905333736, "learning_rate": 1.9957182798612797e-05, "loss": 0.9771, "step": 357 }, { "epoch": 0.06, "grad_norm": 2.5112624273141444, "learning_rate": 1.9956689589226555e-05, "loss": 0.9715, "step": 358 }, { "epoch": 0.06, "grad_norm": 1.7830415893368885, "learning_rate": 1.9956193561620267e-05, "loss": 0.9648, "step": 359 }, { "epoch": 0.06, "grad_norm": 1.8485804144337692, "learning_rate": 1.9955694715934344e-05, "loss": 0.9259, "step": 360 }, { "epoch": 0.06, "grad_norm": 2.385759000475386, "learning_rate": 1.9955193052309972e-05, "loss": 0.9553, "step": 361 }, { "epoch": 0.06, "grad_norm": 2.749826348747719, "learning_rate": 1.9954688570889152e-05, "loss": 0.9562, "step": 362 }, { "epoch": 0.06, "grad_norm": 2.744518558505291, "learning_rate": 1.9954181271814673e-05, "loss": 0.9695, "step": 363 }, { "epoch": 0.06, "grad_norm": 2.7353267446730225, "learning_rate": 1.995367115523013e-05, "loss": 1.0537, "step": 364 }, { "epoch": 0.06, "grad_norm": 2.5056965789020476, "learning_rate": 1.9953158221279906e-05, "loss": 0.9401, "step": 365 }, { "epoch": 0.06, "grad_norm": 3.9104271333378287, "learning_rate": 1.9952642470109185e-05, "loss": 0.9477, "step": 366 }, { "epoch": 0.06, "grad_norm": 2.3360559010301705, "learning_rate": 1.9952123901863953e-05, "loss": 0.8572, "step": 367 }, { "epoch": 0.06, "grad_norm": 2.3467002506866423, "learning_rate": 1.9951602516690988e-05, "loss": 0.9727, "step": 368 }, { "epoch": 0.06, "grad_norm": 3.227897283190837, "learning_rate": 1.995107831473787e-05, "loss": 0.9552, "step": 369 }, { "epoch": 0.06, "grad_norm": 3.2397248345681375, "learning_rate": 1.9950551296152965e-05, "loss": 0.9243, "step": 370 }, { "epoch": 0.06, "grad_norm": 3.4526545389200742, "learning_rate": 1.9950021461085452e-05, "loss": 0.885, "step": 371 }, { "epoch": 0.06, "grad_norm": 2.279140097409667, "learning_rate": 1.99494888096853e-05, "loss": 0.9268, "step": 372 }, { "epoch": 0.06, "grad_norm": 2.045663050094848, "learning_rate": 1.9948953342103268e-05, "loss": 0.8886, "step": 373 }, { "epoch": 0.06, "grad_norm": 2.3591827581055784, "learning_rate": 1.9948415058490926e-05, "loss": 0.9089, "step": 374 }, { "epoch": 0.06, "grad_norm": 2.3769076849543507, "learning_rate": 1.994787395900063e-05, "loss": 0.9494, "step": 375 }, { "epoch": 0.06, "grad_norm": 2.4072704793162454, "learning_rate": 1.994733004378554e-05, "loss": 0.9807, "step": 376 }, { "epoch": 0.06, "grad_norm": 2.4267380653251385, "learning_rate": 1.9946783312999606e-05, "loss": 0.9886, "step": 377 }, { "epoch": 0.06, "grad_norm": 2.186852273222558, "learning_rate": 1.994623376679758e-05, "loss": 0.8922, "step": 378 }, { "epoch": 0.06, "grad_norm": 1.3292696923435667, "learning_rate": 1.9945681405335016e-05, "loss": 0.4224, "step": 379 }, { "epoch": 0.06, "grad_norm": 1.8635942601250588, "learning_rate": 1.9945126228768252e-05, "loss": 0.9144, "step": 380 }, { "epoch": 0.06, "grad_norm": 1.9867972662769078, "learning_rate": 1.994456823725443e-05, "loss": 0.9374, "step": 381 }, { "epoch": 0.06, "grad_norm": 10.925435604120235, "learning_rate": 1.994400743095149e-05, "loss": 0.949, "step": 382 }, { "epoch": 0.06, "grad_norm": 2.2109151144932198, "learning_rate": 1.9943443810018174e-05, "loss": 0.9639, "step": 383 }, { "epoch": 0.06, "grad_norm": 2.2233702989458313, "learning_rate": 1.9942877374614e-05, "loss": 0.9466, "step": 384 }, { "epoch": 0.06, "grad_norm": 2.1422153954061818, "learning_rate": 1.9942308124899307e-05, "loss": 0.8845, "step": 385 }, { "epoch": 0.06, "grad_norm": 2.226649703720001, "learning_rate": 1.9941736061035213e-05, "loss": 0.9378, "step": 386 }, { "epoch": 0.06, "grad_norm": 1.954871544299893, "learning_rate": 1.9941161183183643e-05, "loss": 0.9116, "step": 387 }, { "epoch": 0.06, "grad_norm": 1.8963263222196753, "learning_rate": 1.9940583491507314e-05, "loss": 0.8596, "step": 388 }, { "epoch": 0.06, "grad_norm": 2.391055801416908, "learning_rate": 1.994000298616974e-05, "loss": 0.927, "step": 389 }, { "epoch": 0.06, "grad_norm": 2.834827599232177, "learning_rate": 1.9939419667335236e-05, "loss": 0.9066, "step": 390 }, { "epoch": 0.06, "grad_norm": 2.875925087984668, "learning_rate": 1.99388335351689e-05, "loss": 0.8866, "step": 391 }, { "epoch": 0.06, "grad_norm": 2.1891839025154094, "learning_rate": 1.9938244589836646e-05, "loss": 0.876, "step": 392 }, { "epoch": 0.06, "grad_norm": 2.6025515355252646, "learning_rate": 1.993765283150517e-05, "loss": 0.892, "step": 393 }, { "epoch": 0.06, "grad_norm": 9.566750163545116, "learning_rate": 1.9937058260341967e-05, "loss": 0.8989, "step": 394 }, { "epoch": 0.06, "grad_norm": 7.651049590904792, "learning_rate": 1.9936460876515323e-05, "loss": 0.9244, "step": 395 }, { "epoch": 0.07, "grad_norm": 2.4504794712438156, "learning_rate": 1.993586068019434e-05, "loss": 0.9294, "step": 396 }, { "epoch": 0.07, "grad_norm": 2.5505408618679453, "learning_rate": 1.993525767154889e-05, "loss": 0.8617, "step": 397 }, { "epoch": 0.07, "grad_norm": 1.820939412831626, "learning_rate": 1.9934651850749663e-05, "loss": 0.9932, "step": 398 }, { "epoch": 0.07, "grad_norm": 1.7774290880925636, "learning_rate": 1.9934043217968124e-05, "loss": 0.9422, "step": 399 }, { "epoch": 0.07, "grad_norm": 5.730427388606489, "learning_rate": 1.9933431773376554e-05, "loss": 0.9476, "step": 400 }, { "epoch": 0.07, "grad_norm": 2.3681776175401943, "learning_rate": 1.993281751714802e-05, "loss": 0.9789, "step": 401 }, { "epoch": 0.07, "grad_norm": 2.0933880539136243, "learning_rate": 1.9932200449456385e-05, "loss": 0.8744, "step": 402 }, { "epoch": 0.07, "grad_norm": 2.745344436322149, "learning_rate": 1.9931580570476306e-05, "loss": 0.9648, "step": 403 }, { "epoch": 0.07, "grad_norm": 2.4654684675964678, "learning_rate": 1.993095788038324e-05, "loss": 0.9059, "step": 404 }, { "epoch": 0.07, "grad_norm": 1.948719453430558, "learning_rate": 1.993033237935344e-05, "loss": 0.8481, "step": 405 }, { "epoch": 0.07, "grad_norm": 2.2915469756769817, "learning_rate": 1.9929704067563957e-05, "loss": 0.9164, "step": 406 }, { "epoch": 0.07, "grad_norm": 2.4443796658372148, "learning_rate": 1.992907294519262e-05, "loss": 0.9595, "step": 407 }, { "epoch": 0.07, "grad_norm": 2.3008879323036466, "learning_rate": 1.9928439012418076e-05, "loss": 1.0079, "step": 408 }, { "epoch": 0.07, "grad_norm": 2.152644633899642, "learning_rate": 1.992780226941976e-05, "loss": 0.9266, "step": 409 }, { "epoch": 0.07, "grad_norm": 1.983551596458812, "learning_rate": 1.9927162716377896e-05, "loss": 0.9244, "step": 410 }, { "epoch": 0.07, "grad_norm": 2.513416641547377, "learning_rate": 1.992652035347351e-05, "loss": 0.9355, "step": 411 }, { "epoch": 0.07, "grad_norm": 1.3561509625134847, "learning_rate": 1.9925875180888426e-05, "loss": 0.4508, "step": 412 }, { "epoch": 0.07, "grad_norm": 2.2949095162244255, "learning_rate": 1.9925227198805247e-05, "loss": 0.8438, "step": 413 }, { "epoch": 0.07, "grad_norm": 2.3999939285312526, "learning_rate": 1.9924576407407398e-05, "loss": 0.9823, "step": 414 }, { "epoch": 0.07, "grad_norm": 2.4614605283753543, "learning_rate": 1.992392280687907e-05, "loss": 0.9981, "step": 415 }, { "epoch": 0.07, "grad_norm": 2.7101177794010938, "learning_rate": 1.9923266397405273e-05, "loss": 0.9373, "step": 416 }, { "epoch": 0.07, "grad_norm": 2.3444099314009827, "learning_rate": 1.9922607179171796e-05, "loss": 1.0369, "step": 417 }, { "epoch": 0.07, "grad_norm": 2.2864618070163423, "learning_rate": 1.9921945152365235e-05, "loss": 0.9368, "step": 418 }, { "epoch": 0.07, "grad_norm": 2.3533086016954345, "learning_rate": 1.992128031717297e-05, "loss": 0.9538, "step": 419 }, { "epoch": 0.07, "grad_norm": 2.5041119819371667, "learning_rate": 1.9920612673783186e-05, "loss": 0.9768, "step": 420 }, { "epoch": 0.07, "grad_norm": 2.378179946862738, "learning_rate": 1.9919942222384855e-05, "loss": 0.855, "step": 421 }, { "epoch": 0.07, "grad_norm": 2.7701445094160704, "learning_rate": 1.9919268963167747e-05, "loss": 0.8332, "step": 422 }, { "epoch": 0.07, "grad_norm": 2.293667582553478, "learning_rate": 1.9918592896322432e-05, "loss": 0.8761, "step": 423 }, { "epoch": 0.07, "grad_norm": 1.936329685656113, "learning_rate": 1.9917914022040258e-05, "loss": 0.8887, "step": 424 }, { "epoch": 0.07, "grad_norm": 2.3194136605031583, "learning_rate": 1.9917232340513388e-05, "loss": 0.9152, "step": 425 }, { "epoch": 0.07, "grad_norm": 2.087428242652799, "learning_rate": 1.9916547851934768e-05, "loss": 0.9275, "step": 426 }, { "epoch": 0.07, "grad_norm": 2.129613857360127, "learning_rate": 1.991586055649814e-05, "loss": 0.9088, "step": 427 }, { "epoch": 0.07, "grad_norm": 2.1454942545734634, "learning_rate": 1.9915170454398045e-05, "loss": 0.8885, "step": 428 }, { "epoch": 0.07, "grad_norm": 3.3469738149023835, "learning_rate": 1.991447754582981e-05, "loss": 0.9564, "step": 429 }, { "epoch": 0.07, "grad_norm": 2.4031126383506214, "learning_rate": 1.9913781830989568e-05, "loss": 0.8476, "step": 430 }, { "epoch": 0.07, "grad_norm": 3.170163102058831, "learning_rate": 1.991308331007423e-05, "loss": 0.9471, "step": 431 }, { "epoch": 0.07, "grad_norm": 2.1892507908848367, "learning_rate": 1.9912381983281518e-05, "loss": 0.9795, "step": 432 }, { "epoch": 0.07, "grad_norm": 2.361014782496613, "learning_rate": 1.9911677850809943e-05, "loss": 0.8869, "step": 433 }, { "epoch": 0.07, "grad_norm": 3.2194709279415137, "learning_rate": 1.9910970912858802e-05, "loss": 0.9346, "step": 434 }, { "epoch": 0.07, "grad_norm": 2.156032800068117, "learning_rate": 1.9910261169628195e-05, "loss": 0.9118, "step": 435 }, { "epoch": 0.07, "grad_norm": 2.3011767354290376, "learning_rate": 1.9909548621319014e-05, "loss": 0.9022, "step": 436 }, { "epoch": 0.07, "grad_norm": 7.424647088885393, "learning_rate": 1.9908833268132943e-05, "loss": 1.0043, "step": 437 }, { "epoch": 0.07, "grad_norm": 2.519712438323664, "learning_rate": 1.9908115110272463e-05, "loss": 0.921, "step": 438 }, { "epoch": 0.07, "grad_norm": 2.2553594125015533, "learning_rate": 1.9907394147940845e-05, "loss": 1.0053, "step": 439 }, { "epoch": 0.07, "grad_norm": 2.4897063185711583, "learning_rate": 1.9906670381342156e-05, "loss": 0.9322, "step": 440 }, { "epoch": 0.07, "grad_norm": 4.305866540214456, "learning_rate": 1.9905943810681257e-05, "loss": 1.0081, "step": 441 }, { "epoch": 0.07, "grad_norm": 2.1619224642112305, "learning_rate": 1.9905214436163806e-05, "loss": 0.902, "step": 442 }, { "epoch": 0.07, "grad_norm": 2.5935043947925225, "learning_rate": 1.9904482257996244e-05, "loss": 0.9258, "step": 443 }, { "epoch": 0.07, "grad_norm": 3.3489011755111586, "learning_rate": 1.9903747276385816e-05, "loss": 0.9092, "step": 444 }, { "epoch": 0.07, "grad_norm": 2.3938243319653467, "learning_rate": 1.9903009491540558e-05, "loss": 0.9179, "step": 445 }, { "epoch": 0.07, "grad_norm": 5.410545861805351, "learning_rate": 1.99022689036693e-05, "loss": 0.85, "step": 446 }, { "epoch": 0.07, "grad_norm": 1.9643914274323189, "learning_rate": 1.9901525512981656e-05, "loss": 0.9293, "step": 447 }, { "epoch": 0.07, "grad_norm": 2.7165451648386805, "learning_rate": 1.990077931968805e-05, "loss": 0.9087, "step": 448 }, { "epoch": 0.07, "grad_norm": 1.9662065109964186, "learning_rate": 1.990003032399969e-05, "loss": 1.0137, "step": 449 }, { "epoch": 0.07, "grad_norm": 4.297943729039644, "learning_rate": 1.989927852612857e-05, "loss": 0.8593, "step": 450 }, { "epoch": 0.07, "grad_norm": 2.4394815710685953, "learning_rate": 1.9898523926287488e-05, "loss": 0.9162, "step": 451 }, { "epoch": 0.07, "grad_norm": 1.8058582604807583, "learning_rate": 1.9897766524690038e-05, "loss": 0.9154, "step": 452 }, { "epoch": 0.07, "grad_norm": 2.4467041053970684, "learning_rate": 1.9897006321550592e-05, "loss": 0.9478, "step": 453 }, { "epoch": 0.07, "grad_norm": 2.861448009091516, "learning_rate": 1.9896243317084333e-05, "loss": 0.8923, "step": 454 }, { "epoch": 0.07, "grad_norm": 2.2050298437162605, "learning_rate": 1.989547751150722e-05, "loss": 0.816, "step": 455 }, { "epoch": 0.07, "grad_norm": 2.5186898593300073, "learning_rate": 1.9894708905036015e-05, "loss": 0.9656, "step": 456 }, { "epoch": 0.08, "grad_norm": 2.1652868432103083, "learning_rate": 1.989393749788827e-05, "loss": 0.9532, "step": 457 }, { "epoch": 0.08, "grad_norm": 2.5328429497324794, "learning_rate": 1.9893163290282335e-05, "loss": 0.9169, "step": 458 }, { "epoch": 0.08, "grad_norm": 2.4936485854489567, "learning_rate": 1.9892386282437344e-05, "loss": 0.9585, "step": 459 }, { "epoch": 0.08, "grad_norm": 3.255512583905065, "learning_rate": 1.9891606474573225e-05, "loss": 1.0436, "step": 460 }, { "epoch": 0.08, "grad_norm": 2.163337497059468, "learning_rate": 1.9890823866910702e-05, "loss": 0.9899, "step": 461 }, { "epoch": 0.08, "grad_norm": 2.5398242271572475, "learning_rate": 1.9890038459671297e-05, "loss": 0.9402, "step": 462 }, { "epoch": 0.08, "grad_norm": 2.3011781767974573, "learning_rate": 1.9889250253077306e-05, "loss": 0.976, "step": 463 }, { "epoch": 0.08, "grad_norm": 4.17065074650111, "learning_rate": 1.9888459247351844e-05, "loss": 0.9232, "step": 464 }, { "epoch": 0.08, "grad_norm": 2.033557468372626, "learning_rate": 1.988766544271879e-05, "loss": 0.8693, "step": 465 }, { "epoch": 0.08, "grad_norm": 2.3517791912920094, "learning_rate": 1.9886868839402837e-05, "loss": 0.8539, "step": 466 }, { "epoch": 0.08, "grad_norm": 4.966394063161802, "learning_rate": 1.9886069437629456e-05, "loss": 0.8992, "step": 467 }, { "epoch": 0.08, "grad_norm": 3.8788228629511714, "learning_rate": 1.9885267237624923e-05, "loss": 0.9767, "step": 468 }, { "epoch": 0.08, "grad_norm": 2.1384971797913894, "learning_rate": 1.9884462239616292e-05, "loss": 0.894, "step": 469 }, { "epoch": 0.08, "grad_norm": 11.71928771880798, "learning_rate": 1.988365444383142e-05, "loss": 0.9658, "step": 470 }, { "epoch": 0.08, "grad_norm": 1.9482585861125072, "learning_rate": 1.988284385049895e-05, "loss": 0.9633, "step": 471 }, { "epoch": 0.08, "grad_norm": 2.399927281325999, "learning_rate": 1.9882030459848325e-05, "loss": 0.9044, "step": 472 }, { "epoch": 0.08, "grad_norm": 2.323180992939217, "learning_rate": 1.988121427210976e-05, "loss": 0.8935, "step": 473 }, { "epoch": 0.08, "grad_norm": 4.1982902564807425, "learning_rate": 1.9880395287514292e-05, "loss": 0.9296, "step": 474 }, { "epoch": 0.08, "grad_norm": 2.0930789732185606, "learning_rate": 1.987957350629372e-05, "loss": 0.8169, "step": 475 }, { "epoch": 0.08, "grad_norm": 2.732108681119152, "learning_rate": 1.987874892868065e-05, "loss": 0.9215, "step": 476 }, { "epoch": 0.08, "grad_norm": 3.500684097805785, "learning_rate": 1.987792155490848e-05, "loss": 0.8983, "step": 477 }, { "epoch": 0.08, "grad_norm": 3.2940709701237227, "learning_rate": 1.98770913852114e-05, "loss": 0.8308, "step": 478 }, { "epoch": 0.08, "grad_norm": 2.5414156183065657, "learning_rate": 1.9876258419824375e-05, "loss": 0.9183, "step": 479 }, { "epoch": 0.08, "grad_norm": 3.1898135252741584, "learning_rate": 1.987542265898319e-05, "loss": 0.9488, "step": 480 }, { "epoch": 0.08, "grad_norm": 2.639272899657891, "learning_rate": 1.9874584102924394e-05, "loss": 0.9383, "step": 481 }, { "epoch": 0.08, "grad_norm": 2.0547164555715067, "learning_rate": 1.987374275188534e-05, "loss": 0.9353, "step": 482 }, { "epoch": 0.08, "grad_norm": 1.8020432082501745, "learning_rate": 1.9872898606104175e-05, "loss": 0.9365, "step": 483 }, { "epoch": 0.08, "grad_norm": 2.2257434505869265, "learning_rate": 1.9872051665819828e-05, "loss": 0.934, "step": 484 }, { "epoch": 0.08, "grad_norm": 4.354327684955781, "learning_rate": 1.9871201931272027e-05, "loss": 0.946, "step": 485 }, { "epoch": 0.08, "grad_norm": 2.116492894700899, "learning_rate": 1.987034940270129e-05, "loss": 0.9343, "step": 486 }, { "epoch": 0.08, "grad_norm": 2.0117218890802793, "learning_rate": 1.9869494080348916e-05, "loss": 0.9907, "step": 487 }, { "epoch": 0.08, "grad_norm": 1.8722471545977493, "learning_rate": 1.9868635964457007e-05, "loss": 0.9645, "step": 488 }, { "epoch": 0.08, "grad_norm": 1.7658668863905869, "learning_rate": 1.986777505526845e-05, "loss": 0.9947, "step": 489 }, { "epoch": 0.08, "grad_norm": 1.7458295273125426, "learning_rate": 1.986691135302692e-05, "loss": 0.8855, "step": 490 }, { "epoch": 0.08, "grad_norm": 2.4908325060402454, "learning_rate": 1.9866044857976897e-05, "loss": 0.9647, "step": 491 }, { "epoch": 0.08, "grad_norm": 2.3738801072598554, "learning_rate": 1.986517557036363e-05, "loss": 0.9399, "step": 492 }, { "epoch": 0.08, "grad_norm": 2.9783768463739744, "learning_rate": 1.986430349043317e-05, "loss": 0.9513, "step": 493 }, { "epoch": 0.08, "grad_norm": 2.5863209051637903, "learning_rate": 1.9863428618432365e-05, "loss": 0.9394, "step": 494 }, { "epoch": 0.08, "grad_norm": 2.5609937341089064, "learning_rate": 1.9862550954608837e-05, "loss": 0.9682, "step": 495 }, { "epoch": 0.08, "grad_norm": 3.87152272140782, "learning_rate": 1.986167049921101e-05, "loss": 0.9174, "step": 496 }, { "epoch": 0.08, "grad_norm": 2.6286581136960567, "learning_rate": 1.9860787252488096e-05, "loss": 0.9388, "step": 497 }, { "epoch": 0.08, "grad_norm": 2.2177218401155603, "learning_rate": 1.9859901214690094e-05, "loss": 0.9082, "step": 498 }, { "epoch": 0.08, "grad_norm": 1.7069930131931756, "learning_rate": 1.98590123860678e-05, "loss": 1.0419, "step": 499 }, { "epoch": 0.08, "grad_norm": 3.6593633309281395, "learning_rate": 1.9858120766872785e-05, "loss": 0.8512, "step": 500 }, { "epoch": 0.08, "grad_norm": 3.0182676468730025, "learning_rate": 1.9857226357357432e-05, "loss": 0.9373, "step": 501 }, { "epoch": 0.08, "grad_norm": 2.6236047886003915, "learning_rate": 1.9856329157774897e-05, "loss": 0.9482, "step": 502 }, { "epoch": 0.08, "grad_norm": 2.4232471579799655, "learning_rate": 1.9855429168379127e-05, "loss": 0.9941, "step": 503 }, { "epoch": 0.08, "grad_norm": 1.9675696816260873, "learning_rate": 1.9854526389424867e-05, "loss": 0.9004, "step": 504 }, { "epoch": 0.08, "grad_norm": 3.104861012175659, "learning_rate": 1.985362082116764e-05, "loss": 0.8383, "step": 505 }, { "epoch": 0.08, "grad_norm": 2.4365469295513953, "learning_rate": 1.9852712463863777e-05, "loss": 0.8557, "step": 506 }, { "epoch": 0.08, "grad_norm": 2.9587903296853937, "learning_rate": 1.9851801317770375e-05, "loss": 0.8347, "step": 507 }, { "epoch": 0.08, "grad_norm": 2.7218700373271965, "learning_rate": 1.985088738314534e-05, "loss": 0.9422, "step": 508 }, { "epoch": 0.08, "grad_norm": 2.8062914133163006, "learning_rate": 1.9849970660247352e-05, "loss": 0.9513, "step": 509 }, { "epoch": 0.08, "grad_norm": 2.9990583192448517, "learning_rate": 1.984905114933589e-05, "loss": 0.9185, "step": 510 }, { "epoch": 0.08, "grad_norm": 1.9426478678006756, "learning_rate": 1.9848128850671222e-05, "loss": 0.9708, "step": 511 }, { "epoch": 0.08, "grad_norm": 2.389038894494837, "learning_rate": 1.98472037645144e-05, "loss": 0.8469, "step": 512 }, { "epoch": 0.08, "grad_norm": 2.475912577833772, "learning_rate": 1.9846275891127275e-05, "loss": 0.9185, "step": 513 }, { "epoch": 0.08, "grad_norm": 3.019612924569655, "learning_rate": 1.9845345230772467e-05, "loss": 0.9597, "step": 514 }, { "epoch": 0.08, "grad_norm": 3.7825504744819494, "learning_rate": 1.984441178371341e-05, "loss": 0.9452, "step": 515 }, { "epoch": 0.08, "grad_norm": 1.9513116038543883, "learning_rate": 1.9843475550214306e-05, "loss": 0.8936, "step": 516 }, { "epoch": 0.08, "grad_norm": 1.9175968268823067, "learning_rate": 1.9842536530540154e-05, "loss": 0.904, "step": 517 }, { "epoch": 0.09, "grad_norm": 2.334333411267245, "learning_rate": 1.9841594724956746e-05, "loss": 0.9217, "step": 518 }, { "epoch": 0.09, "grad_norm": 2.231931219124349, "learning_rate": 1.9840650133730657e-05, "loss": 0.9163, "step": 519 }, { "epoch": 0.09, "grad_norm": 2.2573231111608396, "learning_rate": 1.9839702757129248e-05, "loss": 0.9086, "step": 520 }, { "epoch": 0.09, "grad_norm": 2.3144708437930634, "learning_rate": 1.9838752595420674e-05, "loss": 0.9386, "step": 521 }, { "epoch": 0.09, "grad_norm": 2.7007001038167484, "learning_rate": 1.983779964887388e-05, "loss": 0.9379, "step": 522 }, { "epoch": 0.09, "grad_norm": 2.4838350509331897, "learning_rate": 1.9836843917758593e-05, "loss": 0.8388, "step": 523 }, { "epoch": 0.09, "grad_norm": 3.060591090991937, "learning_rate": 1.9835885402345325e-05, "loss": 0.9646, "step": 524 }, { "epoch": 0.09, "grad_norm": 1.7865274395787, "learning_rate": 1.983492410290539e-05, "loss": 0.9157, "step": 525 }, { "epoch": 0.09, "grad_norm": 2.761677272227999, "learning_rate": 1.9833960019710878e-05, "loss": 0.8642, "step": 526 }, { "epoch": 0.09, "grad_norm": 1.0654401969449843, "learning_rate": 1.983299315303467e-05, "loss": 0.3886, "step": 527 }, { "epoch": 0.09, "grad_norm": 2.0737458034640817, "learning_rate": 1.983202350315044e-05, "loss": 0.9739, "step": 528 }, { "epoch": 0.09, "grad_norm": 2.105771564573157, "learning_rate": 1.9831051070332642e-05, "loss": 0.9006, "step": 529 }, { "epoch": 0.09, "grad_norm": 0.7445486908039802, "learning_rate": 1.983007585485652e-05, "loss": 0.3683, "step": 530 }, { "epoch": 0.09, "grad_norm": 2.0129078366910944, "learning_rate": 1.9829097856998105e-05, "loss": 0.7745, "step": 531 }, { "epoch": 0.09, "grad_norm": 2.425792964820387, "learning_rate": 1.9828117077034225e-05, "loss": 0.8457, "step": 532 }, { "epoch": 0.09, "grad_norm": 2.0591529066600156, "learning_rate": 1.982713351524248e-05, "loss": 0.9963, "step": 533 }, { "epoch": 0.09, "grad_norm": 3.869436340016203, "learning_rate": 1.982614717190127e-05, "loss": 0.8854, "step": 534 }, { "epoch": 0.09, "grad_norm": 3.2591827762030667, "learning_rate": 1.982515804728977e-05, "loss": 0.9395, "step": 535 }, { "epoch": 0.09, "grad_norm": 2.3106797309930602, "learning_rate": 1.982416614168796e-05, "loss": 0.8943, "step": 536 }, { "epoch": 0.09, "grad_norm": 2.087381164666418, "learning_rate": 1.9823171455376592e-05, "loss": 0.8901, "step": 537 }, { "epoch": 0.09, "grad_norm": 2.51409209390378, "learning_rate": 1.982217398863721e-05, "loss": 0.905, "step": 538 }, { "epoch": 0.09, "grad_norm": 2.5292000162020902, "learning_rate": 1.982117374175214e-05, "loss": 0.9068, "step": 539 }, { "epoch": 0.09, "grad_norm": 2.128390022545363, "learning_rate": 1.982017071500451e-05, "loss": 0.9504, "step": 540 }, { "epoch": 0.09, "grad_norm": 2.4491189247800587, "learning_rate": 1.9819164908678216e-05, "loss": 0.8431, "step": 541 }, { "epoch": 0.09, "grad_norm": 1.900329566376489, "learning_rate": 1.9818156323057952e-05, "loss": 0.9288, "step": 542 }, { "epoch": 0.09, "grad_norm": 2.5770940262682833, "learning_rate": 1.98171449584292e-05, "loss": 0.9382, "step": 543 }, { "epoch": 0.09, "grad_norm": 3.032568057050696, "learning_rate": 1.9816130815078216e-05, "loss": 0.9993, "step": 544 }, { "epoch": 0.09, "grad_norm": 2.119445436056581, "learning_rate": 1.9815113893292058e-05, "loss": 0.8908, "step": 545 }, { "epoch": 0.09, "grad_norm": 2.1480846368114, "learning_rate": 1.981409419335856e-05, "loss": 0.87, "step": 546 }, { "epoch": 0.09, "grad_norm": 3.116627625060351, "learning_rate": 1.981307171556635e-05, "loss": 0.9459, "step": 547 }, { "epoch": 0.09, "grad_norm": 2.0728795582653516, "learning_rate": 1.9812046460204837e-05, "loss": 0.9145, "step": 548 }, { "epoch": 0.09, "grad_norm": 2.319032434639845, "learning_rate": 1.9811018427564213e-05, "loss": 0.8619, "step": 549 }, { "epoch": 0.09, "grad_norm": 2.4905615551290965, "learning_rate": 1.9809987617935468e-05, "loss": 0.9785, "step": 550 }, { "epoch": 0.09, "grad_norm": 2.0898750726556896, "learning_rate": 1.9808954031610362e-05, "loss": 0.9378, "step": 551 }, { "epoch": 0.09, "grad_norm": 3.4915787726405156, "learning_rate": 1.9807917668881455e-05, "loss": 0.9291, "step": 552 }, { "epoch": 0.09, "grad_norm": 2.4582494816830947, "learning_rate": 1.9806878530042083e-05, "loss": 0.9036, "step": 553 }, { "epoch": 0.09, "grad_norm": 3.5843484998026134, "learning_rate": 1.9805836615386378e-05, "loss": 0.8907, "step": 554 }, { "epoch": 0.09, "grad_norm": 2.6684560538448676, "learning_rate": 1.980479192520925e-05, "loss": 0.9036, "step": 555 }, { "epoch": 0.09, "grad_norm": 2.3011265844533373, "learning_rate": 1.9803744459806393e-05, "loss": 0.8968, "step": 556 }, { "epoch": 0.09, "grad_norm": 1.9511241923931653, "learning_rate": 1.980269421947429e-05, "loss": 0.9275, "step": 557 }, { "epoch": 0.09, "grad_norm": 1.7767735536312694, "learning_rate": 1.9801641204510216e-05, "loss": 0.8613, "step": 558 }, { "epoch": 0.09, "grad_norm": 2.1433981246724194, "learning_rate": 1.9800585415212214e-05, "loss": 0.9545, "step": 559 }, { "epoch": 0.09, "grad_norm": 2.488573291576342, "learning_rate": 1.979952685187913e-05, "loss": 0.8848, "step": 560 }, { "epoch": 0.09, "grad_norm": 6.1003887553123555, "learning_rate": 1.979846551481059e-05, "loss": 0.9209, "step": 561 }, { "epoch": 0.09, "grad_norm": 2.425818494547925, "learning_rate": 1.9797401404307e-05, "loss": 0.9542, "step": 562 }, { "epoch": 0.09, "grad_norm": 2.2205749655117426, "learning_rate": 1.9796334520669555e-05, "loss": 0.9321, "step": 563 }, { "epoch": 0.09, "grad_norm": 2.3348736217117265, "learning_rate": 1.9795264864200233e-05, "loss": 0.9517, "step": 564 }, { "epoch": 0.09, "grad_norm": 2.7521200141572546, "learning_rate": 1.9794192435201797e-05, "loss": 0.9165, "step": 565 }, { "epoch": 0.09, "grad_norm": 2.2067447235877173, "learning_rate": 1.97931172339778e-05, "loss": 0.4111, "step": 566 }, { "epoch": 0.09, "grad_norm": 2.0425649034529694, "learning_rate": 1.979203926083257e-05, "loss": 0.9473, "step": 567 }, { "epoch": 0.09, "grad_norm": 2.7434301074348726, "learning_rate": 1.9790958516071228e-05, "loss": 0.962, "step": 568 }, { "epoch": 0.09, "grad_norm": 1.8614093934592268, "learning_rate": 1.9789874999999678e-05, "loss": 0.8948, "step": 569 }, { "epoch": 0.09, "grad_norm": 1.9001293910142814, "learning_rate": 1.9788788712924606e-05, "loss": 0.946, "step": 570 }, { "epoch": 0.09, "grad_norm": 2.5928239964095448, "learning_rate": 1.978769965515348e-05, "loss": 0.8787, "step": 571 }, { "epoch": 0.09, "grad_norm": 2.289056525403706, "learning_rate": 1.9786607826994557e-05, "loss": 0.8654, "step": 572 }, { "epoch": 0.09, "grad_norm": 2.0385250560189587, "learning_rate": 1.978551322875688e-05, "loss": 0.8966, "step": 573 }, { "epoch": 0.09, "grad_norm": 1.8395239848170157, "learning_rate": 1.978441586075027e-05, "loss": 0.9068, "step": 574 }, { "epoch": 0.09, "grad_norm": 2.0587544525213968, "learning_rate": 1.978331572328534e-05, "loss": 0.8974, "step": 575 }, { "epoch": 0.09, "grad_norm": 3.197705748151783, "learning_rate": 1.9782212816673468e-05, "loss": 0.8849, "step": 576 }, { "epoch": 0.09, "grad_norm": 2.0404773743819407, "learning_rate": 1.9781107141226845e-05, "loss": 0.9411, "step": 577 }, { "epoch": 0.09, "grad_norm": 2.39705502148234, "learning_rate": 1.977999869725842e-05, "loss": 0.87, "step": 578 }, { "epoch": 0.1, "grad_norm": 9.213598210128348, "learning_rate": 1.977888748508194e-05, "loss": 0.8811, "step": 579 }, { "epoch": 0.1, "grad_norm": 1.1213034287947192, "learning_rate": 1.977777350501193e-05, "loss": 0.3919, "step": 580 }, { "epoch": 0.1, "grad_norm": 1.9409868714787295, "learning_rate": 1.97766567573637e-05, "loss": 0.9164, "step": 581 }, { "epoch": 0.1, "grad_norm": 2.9196022163022457, "learning_rate": 1.9775537242453347e-05, "loss": 1.0117, "step": 582 }, { "epoch": 0.1, "grad_norm": 2.6633733128058945, "learning_rate": 1.977441496059774e-05, "loss": 0.9146, "step": 583 }, { "epoch": 0.1, "grad_norm": 2.421247133556177, "learning_rate": 1.9773289912114543e-05, "loss": 0.884, "step": 584 }, { "epoch": 0.1, "grad_norm": 2.359234981667909, "learning_rate": 1.9772162097322195e-05, "loss": 0.8808, "step": 585 }, { "epoch": 0.1, "grad_norm": 2.9327225749583046, "learning_rate": 1.9771031516539928e-05, "loss": 0.8675, "step": 586 }, { "epoch": 0.1, "grad_norm": 1.6974729252351604, "learning_rate": 1.9769898170087743e-05, "loss": 0.8988, "step": 587 }, { "epoch": 0.1, "grad_norm": 2.468425318769097, "learning_rate": 1.9768762058286433e-05, "loss": 0.9698, "step": 588 }, { "epoch": 0.1, "grad_norm": 2.721712035196418, "learning_rate": 1.976762318145758e-05, "loss": 0.9014, "step": 589 }, { "epoch": 0.1, "grad_norm": 3.2288275366274832, "learning_rate": 1.9766481539923533e-05, "loss": 0.8868, "step": 590 }, { "epoch": 0.1, "grad_norm": 5.249532418580692, "learning_rate": 1.9765337134007432e-05, "loss": 0.8699, "step": 591 }, { "epoch": 0.1, "grad_norm": 5.50653589249449, "learning_rate": 1.9764189964033198e-05, "loss": 0.9228, "step": 592 }, { "epoch": 0.1, "grad_norm": 2.2482159933102874, "learning_rate": 1.976304003032554e-05, "loss": 0.9305, "step": 593 }, { "epoch": 0.1, "grad_norm": 2.6028803750978655, "learning_rate": 1.976188733320994e-05, "loss": 0.9475, "step": 594 }, { "epoch": 0.1, "grad_norm": 2.138687386769, "learning_rate": 1.9760731873012668e-05, "loss": 0.8923, "step": 595 }, { "epoch": 0.1, "grad_norm": 2.157075078871423, "learning_rate": 1.9759573650060774e-05, "loss": 0.9624, "step": 596 }, { "epoch": 0.1, "grad_norm": 2.3371054558419253, "learning_rate": 1.9758412664682088e-05, "loss": 0.8432, "step": 597 }, { "epoch": 0.1, "grad_norm": 2.5426205747772985, "learning_rate": 1.9757248917205228e-05, "loss": 0.9145, "step": 598 }, { "epoch": 0.1, "grad_norm": 2.2302753493066834, "learning_rate": 1.975608240795959e-05, "loss": 0.9531, "step": 599 }, { "epoch": 0.1, "grad_norm": 2.1515455079991987, "learning_rate": 1.9754913137275355e-05, "loss": 0.8657, "step": 600 }, { "epoch": 0.1, "grad_norm": 2.1867727396370067, "learning_rate": 1.9753741105483475e-05, "loss": 0.9103, "step": 601 }, { "epoch": 0.1, "grad_norm": 1.758215577012027, "learning_rate": 1.9752566312915697e-05, "loss": 0.9203, "step": 602 }, { "epoch": 0.1, "grad_norm": 1.8377408496521992, "learning_rate": 1.975138875990454e-05, "loss": 0.9075, "step": 603 }, { "epoch": 0.1, "grad_norm": 3.0512551592242887, "learning_rate": 1.975020844678331e-05, "loss": 0.8956, "step": 604 }, { "epoch": 0.1, "grad_norm": 2.3839850523938275, "learning_rate": 1.974902537388609e-05, "loss": 0.9195, "step": 605 }, { "epoch": 0.1, "grad_norm": 2.3613971480174087, "learning_rate": 1.9747839541547754e-05, "loss": 0.9811, "step": 606 }, { "epoch": 0.1, "grad_norm": 2.5375073151446377, "learning_rate": 1.974665095010394e-05, "loss": 0.8834, "step": 607 }, { "epoch": 0.1, "grad_norm": 2.440833223337978, "learning_rate": 1.974545959989108e-05, "loss": 0.9057, "step": 608 }, { "epoch": 0.1, "grad_norm": 1.8033322577881725, "learning_rate": 1.974426549124638e-05, "loss": 0.9374, "step": 609 }, { "epoch": 0.1, "grad_norm": 3.3276708811017945, "learning_rate": 1.9743068624507837e-05, "loss": 0.9674, "step": 610 }, { "epoch": 0.1, "grad_norm": 2.028930085662282, "learning_rate": 1.9741869000014217e-05, "loss": 0.9173, "step": 611 }, { "epoch": 0.1, "grad_norm": 2.4883050824151627, "learning_rate": 1.9740666618105067e-05, "loss": 0.842, "step": 612 }, { "epoch": 0.1, "grad_norm": 2.354208411257574, "learning_rate": 1.9739461479120727e-05, "loss": 0.9245, "step": 613 }, { "epoch": 0.1, "grad_norm": 2.4149327921245485, "learning_rate": 1.9738253583402306e-05, "loss": 0.8614, "step": 614 }, { "epoch": 0.1, "grad_norm": 2.936121149133854, "learning_rate": 1.973704293129169e-05, "loss": 0.9148, "step": 615 }, { "epoch": 0.1, "grad_norm": 2.9429629535628417, "learning_rate": 1.9735829523131564e-05, "loss": 0.8897, "step": 616 }, { "epoch": 0.1, "grad_norm": 2.070036404645153, "learning_rate": 1.9734613359265373e-05, "loss": 0.9198, "step": 617 }, { "epoch": 0.1, "grad_norm": 2.331403945607918, "learning_rate": 1.973339444003735e-05, "loss": 0.9745, "step": 618 }, { "epoch": 0.1, "grad_norm": 2.6558165007227545, "learning_rate": 1.9732172765792507e-05, "loss": 0.8474, "step": 619 }, { "epoch": 0.1, "grad_norm": 4.041681649383188, "learning_rate": 1.9730948336876637e-05, "loss": 0.9079, "step": 620 }, { "epoch": 0.1, "grad_norm": 1.9417610582489746, "learning_rate": 1.9729721153636312e-05, "loss": 0.9471, "step": 621 }, { "epoch": 0.1, "grad_norm": 2.1086586063984303, "learning_rate": 1.9728491216418884e-05, "loss": 0.8673, "step": 622 }, { "epoch": 0.1, "grad_norm": 1.9487713720041044, "learning_rate": 1.9727258525572487e-05, "loss": 0.9089, "step": 623 }, { "epoch": 0.1, "grad_norm": 3.9641540068343315, "learning_rate": 1.9726023081446026e-05, "loss": 0.9166, "step": 624 }, { "epoch": 0.1, "grad_norm": 2.117149121234961, "learning_rate": 1.9724784884389195e-05, "loss": 0.9678, "step": 625 }, { "epoch": 0.1, "grad_norm": 3.0646450879099785, "learning_rate": 1.972354393475246e-05, "loss": 0.9252, "step": 626 }, { "epoch": 0.1, "grad_norm": 3.2562926644324652, "learning_rate": 1.9722300232887073e-05, "loss": 0.9296, "step": 627 }, { "epoch": 0.1, "grad_norm": 2.2111094654134975, "learning_rate": 1.9721053779145057e-05, "loss": 1.0356, "step": 628 }, { "epoch": 0.1, "grad_norm": 2.6772742322689047, "learning_rate": 1.9719804573879223e-05, "loss": 0.8672, "step": 629 }, { "epoch": 0.1, "grad_norm": 2.516163010602797, "learning_rate": 1.971855261744315e-05, "loss": 0.9497, "step": 630 }, { "epoch": 0.1, "grad_norm": 1.6516240147465797, "learning_rate": 1.9717297910191204e-05, "loss": 0.8535, "step": 631 }, { "epoch": 0.1, "grad_norm": 1.925688542429231, "learning_rate": 1.9716040452478527e-05, "loss": 0.8701, "step": 632 }, { "epoch": 0.1, "grad_norm": 2.1124742887862036, "learning_rate": 1.9714780244661044e-05, "loss": 0.9295, "step": 633 }, { "epoch": 0.1, "grad_norm": 2.569949193766264, "learning_rate": 1.971351728709545e-05, "loss": 0.8706, "step": 634 }, { "epoch": 0.1, "grad_norm": 2.1025486074681923, "learning_rate": 1.9712251580139225e-05, "loss": 0.9914, "step": 635 }, { "epoch": 0.1, "grad_norm": 3.286191038153738, "learning_rate": 1.971098312415062e-05, "loss": 0.8948, "step": 636 }, { "epoch": 0.1, "grad_norm": 2.2509914156996724, "learning_rate": 1.9709711919488673e-05, "loss": 0.9532, "step": 637 }, { "epoch": 0.1, "grad_norm": 2.163039883613121, "learning_rate": 1.9708437966513196e-05, "loss": 0.965, "step": 638 }, { "epoch": 0.1, "grad_norm": 2.0283542364049776, "learning_rate": 1.9707161265584775e-05, "loss": 0.9059, "step": 639 }, { "epoch": 0.11, "grad_norm": 1.8455854107943133, "learning_rate": 1.970588181706478e-05, "loss": 0.9533, "step": 640 }, { "epoch": 0.11, "grad_norm": 2.044163645224987, "learning_rate": 1.970459962131536e-05, "loss": 0.8217, "step": 641 }, { "epoch": 0.11, "grad_norm": 2.6549750200906916, "learning_rate": 1.9703314678699426e-05, "loss": 0.8511, "step": 642 }, { "epoch": 0.11, "grad_norm": 2.450759703854966, "learning_rate": 1.9702026989580694e-05, "loss": 0.9442, "step": 643 }, { "epoch": 0.11, "grad_norm": 2.412384975694147, "learning_rate": 1.970073655432363e-05, "loss": 0.9301, "step": 644 }, { "epoch": 0.11, "grad_norm": 3.2345835047444593, "learning_rate": 1.9699443373293496e-05, "loss": 0.8456, "step": 645 }, { "epoch": 0.11, "grad_norm": 2.226710415540647, "learning_rate": 1.9698147446856316e-05, "loss": 0.8351, "step": 646 }, { "epoch": 0.11, "grad_norm": 1.7956765074010244, "learning_rate": 1.969684877537891e-05, "loss": 0.8633, "step": 647 }, { "epoch": 0.11, "grad_norm": 1.7217722202564567, "learning_rate": 1.969554735922885e-05, "loss": 0.8444, "step": 648 }, { "epoch": 0.11, "grad_norm": 2.3369045555482373, "learning_rate": 1.9694243198774516e-05, "loss": 0.7888, "step": 649 }, { "epoch": 0.11, "grad_norm": 2.458059754582472, "learning_rate": 1.9692936294385038e-05, "loss": 0.9043, "step": 650 }, { "epoch": 0.11, "grad_norm": 2.062661460432535, "learning_rate": 1.969162664643033e-05, "loss": 0.9145, "step": 651 }, { "epoch": 0.11, "grad_norm": 2.2663185341833776, "learning_rate": 1.9690314255281092e-05, "loss": 0.9372, "step": 652 }, { "epoch": 0.11, "grad_norm": 2.018166728761063, "learning_rate": 1.968899912130879e-05, "loss": 0.9138, "step": 653 }, { "epoch": 0.11, "grad_norm": 3.302329420721431, "learning_rate": 1.968768124488567e-05, "loss": 0.8486, "step": 654 }, { "epoch": 0.11, "grad_norm": 3.4142723349190125, "learning_rate": 1.9686360626384756e-05, "loss": 0.9224, "step": 655 }, { "epoch": 0.11, "grad_norm": 1.9294316093727935, "learning_rate": 1.9685037266179846e-05, "loss": 0.8738, "step": 656 }, { "epoch": 0.11, "grad_norm": 2.1196567628407745, "learning_rate": 1.9683711164645508e-05, "loss": 0.8681, "step": 657 }, { "epoch": 0.11, "grad_norm": 3.0290219715651046, "learning_rate": 1.9682382322157103e-05, "loss": 0.8279, "step": 658 }, { "epoch": 0.11, "grad_norm": 3.290603599856386, "learning_rate": 1.968105073909075e-05, "loss": 0.7853, "step": 659 }, { "epoch": 0.11, "grad_norm": 2.0969022862052897, "learning_rate": 1.9679716415823352e-05, "loss": 0.9773, "step": 660 }, { "epoch": 0.11, "grad_norm": 1.9721804751665086, "learning_rate": 1.9678379352732587e-05, "loss": 0.8722, "step": 661 }, { "epoch": 0.11, "grad_norm": 2.1242218787023712, "learning_rate": 1.967703955019691e-05, "loss": 0.8844, "step": 662 }, { "epoch": 0.11, "grad_norm": 2.0613522548535084, "learning_rate": 1.9675697008595545e-05, "loss": 0.8371, "step": 663 }, { "epoch": 0.11, "grad_norm": 3.654093168014004, "learning_rate": 1.9674351728308498e-05, "loss": 0.9007, "step": 664 }, { "epoch": 0.11, "grad_norm": 1.9787658272489446, "learning_rate": 1.9673003709716548e-05, "loss": 0.9286, "step": 665 }, { "epoch": 0.11, "grad_norm": 3.6290451340124545, "learning_rate": 1.9671652953201245e-05, "loss": 0.9311, "step": 666 }, { "epoch": 0.11, "grad_norm": 2.2831307324694277, "learning_rate": 1.9670299459144923e-05, "loss": 0.9489, "step": 667 }, { "epoch": 0.11, "grad_norm": 2.318851562593205, "learning_rate": 1.9668943227930686e-05, "loss": 1.0142, "step": 668 }, { "epoch": 0.11, "grad_norm": 1.783081173297949, "learning_rate": 1.9667584259942408e-05, "loss": 0.8197, "step": 669 }, { "epoch": 0.11, "grad_norm": 2.8669709936717567, "learning_rate": 1.9666222555564744e-05, "loss": 0.835, "step": 670 }, { "epoch": 0.11, "grad_norm": 1.2938282956658924, "learning_rate": 1.9664858115183122e-05, "loss": 0.3929, "step": 671 }, { "epoch": 0.11, "grad_norm": 2.1367618912550093, "learning_rate": 1.9663490939183744e-05, "loss": 0.8895, "step": 672 }, { "epoch": 0.11, "grad_norm": 2.534542373425531, "learning_rate": 1.966212102795358e-05, "loss": 0.8858, "step": 673 }, { "epoch": 0.11, "grad_norm": 2.313155434381525, "learning_rate": 1.9660748381880394e-05, "loss": 0.9315, "step": 674 }, { "epoch": 0.11, "grad_norm": 1.868367474318277, "learning_rate": 1.96593730013527e-05, "loss": 0.8553, "step": 675 }, { "epoch": 0.11, "grad_norm": 1.7964057133950868, "learning_rate": 1.96579948867598e-05, "loss": 0.858, "step": 676 }, { "epoch": 0.11, "grad_norm": 1.671429824552823, "learning_rate": 1.9656614038491765e-05, "loss": 0.8659, "step": 677 }, { "epoch": 0.11, "grad_norm": 1.758624871892802, "learning_rate": 1.965523045693944e-05, "loss": 0.9324, "step": 678 }, { "epoch": 0.11, "grad_norm": 2.8536575049301076, "learning_rate": 1.965384414249445e-05, "loss": 0.939, "step": 679 }, { "epoch": 0.11, "grad_norm": 1.7059680247023923, "learning_rate": 1.9652455095549188e-05, "loss": 0.8528, "step": 680 }, { "epoch": 0.11, "grad_norm": 1.1312751322376062, "learning_rate": 1.9651063316496813e-05, "loss": 0.4328, "step": 681 }, { "epoch": 0.11, "grad_norm": 3.382268663004055, "learning_rate": 1.9649668805731274e-05, "loss": 0.9496, "step": 682 }, { "epoch": 0.11, "grad_norm": 2.5798419854418952, "learning_rate": 1.964827156364728e-05, "loss": 0.9615, "step": 683 }, { "epoch": 0.11, "grad_norm": 1.881665478168072, "learning_rate": 1.9646871590640317e-05, "loss": 0.956, "step": 684 }, { "epoch": 0.11, "grad_norm": 2.075721583881333, "learning_rate": 1.9645468887106645e-05, "loss": 0.9612, "step": 685 }, { "epoch": 0.11, "grad_norm": 2.32979408831434, "learning_rate": 1.9644063453443296e-05, "loss": 0.951, "step": 686 }, { "epoch": 0.11, "grad_norm": 2.41054009040156, "learning_rate": 1.9642655290048077e-05, "loss": 0.9398, "step": 687 }, { "epoch": 0.11, "grad_norm": 1.881865392917621, "learning_rate": 1.964124439731957e-05, "loss": 0.8226, "step": 688 }, { "epoch": 0.11, "grad_norm": 2.166039771655383, "learning_rate": 1.9639830775657113e-05, "loss": 0.8328, "step": 689 }, { "epoch": 0.11, "grad_norm": 2.2591282192207283, "learning_rate": 1.9638414425460834e-05, "loss": 0.9216, "step": 690 }, { "epoch": 0.11, "grad_norm": 0.7865271581156172, "learning_rate": 1.9636995347131634e-05, "loss": 0.3684, "step": 691 }, { "epoch": 0.11, "grad_norm": 1.9544308343309975, "learning_rate": 1.9635573541071174e-05, "loss": 0.9277, "step": 692 }, { "epoch": 0.11, "grad_norm": 0.6997488206751402, "learning_rate": 1.9634149007681894e-05, "loss": 0.3872, "step": 693 }, { "epoch": 0.11, "grad_norm": 2.955457494729503, "learning_rate": 1.9632721747367005e-05, "loss": 0.9366, "step": 694 }, { "epoch": 0.11, "grad_norm": 2.603495534623339, "learning_rate": 1.9631291760530492e-05, "loss": 0.9173, "step": 695 }, { "epoch": 0.11, "grad_norm": 3.172564053715286, "learning_rate": 1.962985904757711e-05, "loss": 0.9742, "step": 696 }, { "epoch": 0.11, "grad_norm": 1.860340932998585, "learning_rate": 1.962842360891238e-05, "loss": 0.8272, "step": 697 }, { "epoch": 0.11, "grad_norm": 1.6275829411199878, "learning_rate": 1.962698544494261e-05, "loss": 0.8622, "step": 698 }, { "epoch": 0.11, "grad_norm": 1.8389741991531874, "learning_rate": 1.9625544556074857e-05, "loss": 0.9012, "step": 699 }, { "epoch": 0.11, "grad_norm": 2.193385477104711, "learning_rate": 1.962410094271697e-05, "loss": 0.9175, "step": 700 }, { "epoch": 0.12, "grad_norm": 2.518299492445112, "learning_rate": 1.962265460527756e-05, "loss": 0.7603, "step": 701 }, { "epoch": 0.12, "grad_norm": 2.6792771721561364, "learning_rate": 1.962120554416601e-05, "loss": 0.8783, "step": 702 }, { "epoch": 0.12, "grad_norm": 1.844961834121754, "learning_rate": 1.9619753759792466e-05, "loss": 0.9186, "step": 703 }, { "epoch": 0.12, "grad_norm": 2.2536557490574247, "learning_rate": 1.9618299252567863e-05, "loss": 0.8795, "step": 704 }, { "epoch": 0.12, "grad_norm": 2.2178189260603642, "learning_rate": 1.961684202290389e-05, "loss": 0.922, "step": 705 }, { "epoch": 0.12, "grad_norm": 1.5978001365902958, "learning_rate": 1.9615382071213017e-05, "loss": 0.8947, "step": 706 }, { "epoch": 0.12, "grad_norm": 1.5889808799456622, "learning_rate": 1.9613919397908473e-05, "loss": 0.8862, "step": 707 }, { "epoch": 0.12, "grad_norm": 2.5425883397396, "learning_rate": 1.961245400340427e-05, "loss": 0.9272, "step": 708 }, { "epoch": 0.12, "grad_norm": 2.2749993908991946, "learning_rate": 1.961098588811519e-05, "loss": 0.9182, "step": 709 }, { "epoch": 0.12, "grad_norm": 1.974440852304244, "learning_rate": 1.9609515052456768e-05, "loss": 0.9149, "step": 710 }, { "epoch": 0.12, "grad_norm": 2.3002862112994387, "learning_rate": 1.960804149684533e-05, "loss": 0.9277, "step": 711 }, { "epoch": 0.12, "grad_norm": 2.186419306467957, "learning_rate": 1.960656522169796e-05, "loss": 0.9135, "step": 712 }, { "epoch": 0.12, "grad_norm": 1.9693821687604047, "learning_rate": 1.9605086227432512e-05, "loss": 0.8544, "step": 713 }, { "epoch": 0.12, "grad_norm": 1.7287449707931417, "learning_rate": 1.9603604514467616e-05, "loss": 0.9482, "step": 714 }, { "epoch": 0.12, "grad_norm": 1.6355628938430604, "learning_rate": 1.9602120083222665e-05, "loss": 0.8941, "step": 715 }, { "epoch": 0.12, "grad_norm": 1.8499296096702724, "learning_rate": 1.960063293411783e-05, "loss": 0.8766, "step": 716 }, { "epoch": 0.12, "grad_norm": 2.360866648043748, "learning_rate": 1.9599143067574037e-05, "loss": 0.8684, "step": 717 }, { "epoch": 0.12, "grad_norm": 1.8525726540387695, "learning_rate": 1.9597650484012997e-05, "loss": 0.8593, "step": 718 }, { "epoch": 0.12, "grad_norm": 2.2124752854611747, "learning_rate": 1.9596155183857176e-05, "loss": 0.9075, "step": 719 }, { "epoch": 0.12, "grad_norm": 2.1596119166887644, "learning_rate": 1.9594657167529818e-05, "loss": 0.9512, "step": 720 }, { "epoch": 0.12, "grad_norm": 3.8175845093900724, "learning_rate": 1.9593156435454936e-05, "loss": 0.9041, "step": 721 }, { "epoch": 0.12, "grad_norm": 2.6291201071255883, "learning_rate": 1.9591652988057305e-05, "loss": 0.9421, "step": 722 }, { "epoch": 0.12, "grad_norm": 1.9702782384918651, "learning_rate": 1.9590146825762476e-05, "loss": 0.9429, "step": 723 }, { "epoch": 0.12, "grad_norm": 1.5908183422626168, "learning_rate": 1.9588637948996766e-05, "loss": 0.9946, "step": 724 }, { "epoch": 0.12, "grad_norm": 1.8648940785462693, "learning_rate": 1.9587126358187257e-05, "loss": 0.9113, "step": 725 }, { "epoch": 0.12, "grad_norm": 1.8074313516361102, "learning_rate": 1.9585612053761798e-05, "loss": 0.8957, "step": 726 }, { "epoch": 0.12, "grad_norm": 2.525314341006438, "learning_rate": 1.9584095036149016e-05, "loss": 0.864, "step": 727 }, { "epoch": 0.12, "grad_norm": 2.215172590984689, "learning_rate": 1.9582575305778297e-05, "loss": 0.8656, "step": 728 }, { "epoch": 0.12, "grad_norm": 7.389015815763534, "learning_rate": 1.95810528630798e-05, "loss": 0.9106, "step": 729 }, { "epoch": 0.12, "grad_norm": 2.078236690989936, "learning_rate": 1.9579527708484444e-05, "loss": 0.9368, "step": 730 }, { "epoch": 0.12, "grad_norm": 2.067235066484881, "learning_rate": 1.957799984242392e-05, "loss": 0.9569, "step": 731 }, { "epoch": 0.12, "grad_norm": 4.129499489056413, "learning_rate": 1.95764692653307e-05, "loss": 0.8538, "step": 732 }, { "epoch": 0.12, "grad_norm": 1.9834606383672428, "learning_rate": 1.9574935977637994e-05, "loss": 0.9311, "step": 733 }, { "epoch": 0.12, "grad_norm": 2.8298582669102244, "learning_rate": 1.9573399979779807e-05, "loss": 0.8386, "step": 734 }, { "epoch": 0.12, "grad_norm": 2.238618069892073, "learning_rate": 1.9571861272190898e-05, "loss": 0.9714, "step": 735 }, { "epoch": 0.12, "grad_norm": 2.4698978533757368, "learning_rate": 1.957031985530679e-05, "loss": 0.883, "step": 736 }, { "epoch": 0.12, "grad_norm": 1.6836892269095178, "learning_rate": 1.9568775729563782e-05, "loss": 0.9558, "step": 737 }, { "epoch": 0.12, "grad_norm": 2.270566942344756, "learning_rate": 1.9567228895398936e-05, "loss": 0.9104, "step": 738 }, { "epoch": 0.12, "grad_norm": 2.134458816723942, "learning_rate": 1.9565679353250077e-05, "loss": 0.8673, "step": 739 }, { "epoch": 0.12, "grad_norm": 2.0583365489604715, "learning_rate": 1.95641271035558e-05, "loss": 0.9136, "step": 740 }, { "epoch": 0.12, "grad_norm": 2.3126016812980197, "learning_rate": 1.9562572146755473e-05, "loss": 0.9017, "step": 741 }, { "epoch": 0.12, "grad_norm": 2.5558561054131292, "learning_rate": 1.956101448328921e-05, "loss": 0.8512, "step": 742 }, { "epoch": 0.12, "grad_norm": 3.913194639774604, "learning_rate": 1.955945411359792e-05, "loss": 0.9356, "step": 743 }, { "epoch": 0.12, "grad_norm": 1.9918285118137116, "learning_rate": 1.955789103812325e-05, "loss": 0.8832, "step": 744 }, { "epoch": 0.12, "grad_norm": 1.9409696280508062, "learning_rate": 1.955632525730763e-05, "loss": 0.8919, "step": 745 }, { "epoch": 0.12, "grad_norm": 2.2457851220502394, "learning_rate": 1.955475677159425e-05, "loss": 0.8394, "step": 746 }, { "epoch": 0.12, "grad_norm": 1.866508764400187, "learning_rate": 1.955318558142707e-05, "loss": 0.8854, "step": 747 }, { "epoch": 0.12, "grad_norm": 2.1210778611600882, "learning_rate": 1.9551611687250808e-05, "loss": 0.9287, "step": 748 }, { "epoch": 0.12, "grad_norm": 2.4373701720330034, "learning_rate": 1.9550035089510952e-05, "loss": 0.8943, "step": 749 }, { "epoch": 0.12, "grad_norm": 2.054376625781828, "learning_rate": 1.9548455788653754e-05, "loss": 0.9371, "step": 750 }, { "epoch": 0.12, "grad_norm": 2.2268985650696145, "learning_rate": 1.9546873785126237e-05, "loss": 0.896, "step": 751 }, { "epoch": 0.12, "grad_norm": 2.312651082815519, "learning_rate": 1.954528907937618e-05, "loss": 0.9049, "step": 752 }, { "epoch": 0.12, "grad_norm": 2.207148088266119, "learning_rate": 1.9543701671852127e-05, "loss": 0.8637, "step": 753 }, { "epoch": 0.12, "grad_norm": 2.1957885063236886, "learning_rate": 1.9542111563003393e-05, "loss": 0.8687, "step": 754 }, { "epoch": 0.12, "grad_norm": 1.8475657667310899, "learning_rate": 1.954051875328006e-05, "loss": 0.9232, "step": 755 }, { "epoch": 0.12, "grad_norm": 1.663010389721836, "learning_rate": 1.9538923243132967e-05, "loss": 0.8018, "step": 756 }, { "epoch": 0.12, "grad_norm": 3.7296875195928427, "learning_rate": 1.9537325033013714e-05, "loss": 0.7868, "step": 757 }, { "epoch": 0.12, "grad_norm": 2.6428764604439166, "learning_rate": 1.9535724123374674e-05, "loss": 0.8654, "step": 758 }, { "epoch": 0.12, "grad_norm": 2.4037569379888315, "learning_rate": 1.9534120514668987e-05, "loss": 0.8412, "step": 759 }, { "epoch": 0.12, "grad_norm": 3.031637906022457, "learning_rate": 1.9532514207350543e-05, "loss": 0.8547, "step": 760 }, { "epoch": 0.12, "grad_norm": 2.104655066548966, "learning_rate": 1.953090520187401e-05, "loss": 0.8831, "step": 761 }, { "epoch": 0.13, "grad_norm": 2.3730791699039018, "learning_rate": 1.952929349869481e-05, "loss": 0.9285, "step": 762 }, { "epoch": 0.13, "grad_norm": 2.33872142301465, "learning_rate": 1.952767909826913e-05, "loss": 0.9402, "step": 763 }, { "epoch": 0.13, "grad_norm": 1.765247082110891, "learning_rate": 1.9526062001053928e-05, "loss": 0.9059, "step": 764 }, { "epoch": 0.13, "grad_norm": 1.5740543221240688, "learning_rate": 1.9524442207506915e-05, "loss": 0.9301, "step": 765 }, { "epoch": 0.13, "grad_norm": 1.1847445836481367, "learning_rate": 1.9522819718086578e-05, "loss": 0.3869, "step": 766 }, { "epoch": 0.13, "grad_norm": 3.738288855110579, "learning_rate": 1.952119453325215e-05, "loss": 0.8492, "step": 767 }, { "epoch": 0.13, "grad_norm": 2.4283173449896536, "learning_rate": 1.951956665346364e-05, "loss": 0.8544, "step": 768 }, { "epoch": 0.13, "grad_norm": 1.5761610072835959, "learning_rate": 1.951793607918182e-05, "loss": 0.9556, "step": 769 }, { "epoch": 0.13, "grad_norm": 1.6941835641186795, "learning_rate": 1.9516302810868212e-05, "loss": 0.9114, "step": 770 }, { "epoch": 0.13, "grad_norm": 2.9033546098185985, "learning_rate": 1.9514666848985116e-05, "loss": 0.9315, "step": 771 }, { "epoch": 0.13, "grad_norm": 1.8343526640818522, "learning_rate": 1.9513028193995588e-05, "loss": 0.9232, "step": 772 }, { "epoch": 0.13, "grad_norm": 2.4130152552457362, "learning_rate": 1.951138684636344e-05, "loss": 0.9271, "step": 773 }, { "epoch": 0.13, "grad_norm": 2.096379042987898, "learning_rate": 1.950974280655326e-05, "loss": 0.8634, "step": 774 }, { "epoch": 0.13, "grad_norm": 1.9747021986028506, "learning_rate": 1.9508096075030378e-05, "loss": 0.9633, "step": 775 }, { "epoch": 0.13, "grad_norm": 0.9882129717034299, "learning_rate": 1.950644665226091e-05, "loss": 0.3992, "step": 776 }, { "epoch": 0.13, "grad_norm": 2.7504414420678422, "learning_rate": 1.9504794538711715e-05, "loss": 0.9101, "step": 777 }, { "epoch": 0.13, "grad_norm": 1.8473440186955363, "learning_rate": 1.9503139734850426e-05, "loss": 0.9089, "step": 778 }, { "epoch": 0.13, "grad_norm": 1.892004176303119, "learning_rate": 1.9501482241145422e-05, "loss": 0.8548, "step": 779 }, { "epoch": 0.13, "grad_norm": 2.1404428758662153, "learning_rate": 1.9499822058065863e-05, "loss": 0.8518, "step": 780 }, { "epoch": 0.13, "grad_norm": 1.7714096938723054, "learning_rate": 1.9498159186081656e-05, "loss": 0.8902, "step": 781 }, { "epoch": 0.13, "grad_norm": 2.037598805888141, "learning_rate": 1.949649362566347e-05, "loss": 0.9237, "step": 782 }, { "epoch": 0.13, "grad_norm": 1.9866409541042478, "learning_rate": 1.9494825377282746e-05, "loss": 0.9221, "step": 783 }, { "epoch": 0.13, "grad_norm": 2.0666837076448736, "learning_rate": 1.9493154441411673e-05, "loss": 0.8308, "step": 784 }, { "epoch": 0.13, "grad_norm": 2.073301501085506, "learning_rate": 1.949148081852321e-05, "loss": 0.9241, "step": 785 }, { "epoch": 0.13, "grad_norm": 1.5234207344353203, "learning_rate": 1.9489804509091066e-05, "loss": 0.8753, "step": 786 }, { "epoch": 0.13, "grad_norm": 1.9433771438335035, "learning_rate": 1.9488125513589722e-05, "loss": 1.0306, "step": 787 }, { "epoch": 0.13, "grad_norm": 1.7011552480169307, "learning_rate": 1.9486443832494414e-05, "loss": 0.9252, "step": 788 }, { "epoch": 0.13, "grad_norm": 1.7387435355952061, "learning_rate": 1.9484759466281132e-05, "loss": 0.9327, "step": 789 }, { "epoch": 0.13, "grad_norm": 2.3823456946546697, "learning_rate": 1.948307241542664e-05, "loss": 0.8316, "step": 790 }, { "epoch": 0.13, "grad_norm": 2.5399894330015065, "learning_rate": 1.9481382680408455e-05, "loss": 0.8023, "step": 791 }, { "epoch": 0.13, "grad_norm": 1.864810045044411, "learning_rate": 1.9479690261704848e-05, "loss": 0.8246, "step": 792 }, { "epoch": 0.13, "grad_norm": 2.220482352873975, "learning_rate": 1.9477995159794854e-05, "loss": 0.8492, "step": 793 }, { "epoch": 0.13, "grad_norm": 2.550602446586626, "learning_rate": 1.947629737515827e-05, "loss": 0.8886, "step": 794 }, { "epoch": 0.13, "grad_norm": 2.2062685868344536, "learning_rate": 1.947459690827565e-05, "loss": 0.9485, "step": 795 }, { "epoch": 0.13, "grad_norm": 2.081861708509109, "learning_rate": 1.9472893759628307e-05, "loss": 0.9334, "step": 796 }, { "epoch": 0.13, "grad_norm": 3.081819303409859, "learning_rate": 1.9471187929698317e-05, "loss": 0.9125, "step": 797 }, { "epoch": 0.13, "grad_norm": 1.9370653337178267, "learning_rate": 1.9469479418968506e-05, "loss": 0.9548, "step": 798 }, { "epoch": 0.13, "grad_norm": 2.908984614244022, "learning_rate": 1.946776822792247e-05, "loss": 0.867, "step": 799 }, { "epoch": 0.13, "grad_norm": 1.5852259903285426, "learning_rate": 1.9466054357044558e-05, "loss": 0.9125, "step": 800 }, { "epoch": 0.13, "grad_norm": 1.828926838676148, "learning_rate": 1.9464337806819872e-05, "loss": 0.9879, "step": 801 }, { "epoch": 0.13, "grad_norm": 5.206987245126495, "learning_rate": 1.946261857773428e-05, "loss": 0.8608, "step": 802 }, { "epoch": 0.13, "grad_norm": 2.449319227422848, "learning_rate": 1.9460896670274408e-05, "loss": 0.8929, "step": 803 }, { "epoch": 0.13, "grad_norm": 2.0515072218498642, "learning_rate": 1.9459172084927638e-05, "loss": 0.9044, "step": 804 }, { "epoch": 0.13, "grad_norm": 2.066430073550455, "learning_rate": 1.945744482218211e-05, "loss": 0.9423, "step": 805 }, { "epoch": 0.13, "grad_norm": 1.7537606228065161, "learning_rate": 1.945571488252672e-05, "loss": 0.7901, "step": 806 }, { "epoch": 0.13, "grad_norm": 2.6384886633540265, "learning_rate": 1.945398226645113e-05, "loss": 0.8969, "step": 807 }, { "epoch": 0.13, "grad_norm": 2.4014889646478492, "learning_rate": 1.9452246974445743e-05, "loss": 0.9085, "step": 808 }, { "epoch": 0.13, "grad_norm": 2.9276989867808063, "learning_rate": 1.9450509007001738e-05, "loss": 0.9243, "step": 809 }, { "epoch": 0.13, "grad_norm": 2.1483653288581057, "learning_rate": 1.9448768364611043e-05, "loss": 0.8863, "step": 810 }, { "epoch": 0.13, "grad_norm": 2.9010061020715705, "learning_rate": 1.944702504776634e-05, "loss": 0.8572, "step": 811 }, { "epoch": 0.13, "grad_norm": 0.8459741927234053, "learning_rate": 1.944527905696107e-05, "loss": 0.3855, "step": 812 }, { "epoch": 0.13, "grad_norm": 2.4216126748889244, "learning_rate": 1.9443530392689434e-05, "loss": 0.8921, "step": 813 }, { "epoch": 0.13, "grad_norm": 2.0187369239937323, "learning_rate": 1.9441779055446387e-05, "loss": 0.9205, "step": 814 }, { "epoch": 0.13, "grad_norm": 3.7583228649219134, "learning_rate": 1.9440025045727645e-05, "loss": 0.8404, "step": 815 }, { "epoch": 0.13, "grad_norm": 1.7969152979767677, "learning_rate": 1.9438268364029674e-05, "loss": 0.8334, "step": 816 }, { "epoch": 0.13, "grad_norm": 1.7464412701579242, "learning_rate": 1.9436509010849696e-05, "loss": 0.8957, "step": 817 }, { "epoch": 0.13, "grad_norm": 3.146026890826467, "learning_rate": 1.94347469866857e-05, "loss": 0.8409, "step": 818 }, { "epoch": 0.13, "grad_norm": 1.6954978711439128, "learning_rate": 1.9432982292036414e-05, "loss": 0.861, "step": 819 }, { "epoch": 0.13, "grad_norm": 2.327331252944611, "learning_rate": 1.9431214927401337e-05, "loss": 0.8751, "step": 820 }, { "epoch": 0.13, "grad_norm": 2.4027831905049646, "learning_rate": 1.9429444893280717e-05, "loss": 0.874, "step": 821 }, { "epoch": 0.14, "grad_norm": 1.8235074390350137, "learning_rate": 1.9427672190175557e-05, "loss": 0.8941, "step": 822 }, { "epoch": 0.14, "grad_norm": 2.0804457143720483, "learning_rate": 1.9425896818587615e-05, "loss": 0.8929, "step": 823 }, { "epoch": 0.14, "grad_norm": 2.1972700242402836, "learning_rate": 1.9424118779019415e-05, "loss": 0.8608, "step": 824 }, { "epoch": 0.14, "grad_norm": 1.9596056691689017, "learning_rate": 1.9422338071974215e-05, "loss": 0.9666, "step": 825 }, { "epoch": 0.14, "grad_norm": 1.97018019851573, "learning_rate": 1.9420554697956052e-05, "loss": 0.8779, "step": 826 }, { "epoch": 0.14, "grad_norm": 2.1751279406170387, "learning_rate": 1.9418768657469695e-05, "loss": 0.8142, "step": 827 }, { "epoch": 0.14, "grad_norm": 1.84997050170713, "learning_rate": 1.941697995102069e-05, "loss": 0.919, "step": 828 }, { "epoch": 0.14, "grad_norm": 1.5780440532498876, "learning_rate": 1.9415188579115315e-05, "loss": 0.8764, "step": 829 }, { "epoch": 0.14, "grad_norm": 13.439917952705096, "learning_rate": 1.941339454226063e-05, "loss": 0.9898, "step": 830 }, { "epoch": 0.14, "grad_norm": 1.8513284965153127, "learning_rate": 1.9411597840964414e-05, "loss": 0.8202, "step": 831 }, { "epoch": 0.14, "grad_norm": 2.5321752318762267, "learning_rate": 1.9409798475735234e-05, "loss": 0.8709, "step": 832 }, { "epoch": 0.14, "grad_norm": 2.4484089585981774, "learning_rate": 1.9407996447082394e-05, "loss": 0.8679, "step": 833 }, { "epoch": 0.14, "grad_norm": 1.7593399563535654, "learning_rate": 1.940619175551595e-05, "loss": 0.8655, "step": 834 }, { "epoch": 0.14, "grad_norm": 2.0325329993851136, "learning_rate": 1.940438440154672e-05, "loss": 0.8539, "step": 835 }, { "epoch": 0.14, "grad_norm": 0.9751849123895525, "learning_rate": 1.940257438568627e-05, "loss": 0.412, "step": 836 }, { "epoch": 0.14, "grad_norm": 1.7883645253957254, "learning_rate": 1.9400761708446918e-05, "loss": 0.9557, "step": 837 }, { "epoch": 0.14, "grad_norm": 2.1546960847946086, "learning_rate": 1.939894637034174e-05, "loss": 0.9297, "step": 838 }, { "epoch": 0.14, "grad_norm": 2.347538265431711, "learning_rate": 1.9397128371884575e-05, "loss": 0.9837, "step": 839 }, { "epoch": 0.14, "grad_norm": 1.9370264142754319, "learning_rate": 1.9395307713589987e-05, "loss": 0.8409, "step": 840 }, { "epoch": 0.14, "grad_norm": 2.15509566643683, "learning_rate": 1.939348439597332e-05, "loss": 0.9395, "step": 841 }, { "epoch": 0.14, "grad_norm": 5.232854328822752, "learning_rate": 1.9391658419550653e-05, "loss": 0.8793, "step": 842 }, { "epoch": 0.14, "grad_norm": 2.4272040069739576, "learning_rate": 1.9389829784838833e-05, "loss": 0.8396, "step": 843 }, { "epoch": 0.14, "grad_norm": 2.5685582818457338, "learning_rate": 1.9387998492355444e-05, "loss": 0.8503, "step": 844 }, { "epoch": 0.14, "grad_norm": 1.7700795585619826, "learning_rate": 1.9386164542618836e-05, "loss": 0.9048, "step": 845 }, { "epoch": 0.14, "grad_norm": 2.113237933361659, "learning_rate": 1.9384327936148095e-05, "loss": 0.9133, "step": 846 }, { "epoch": 0.14, "grad_norm": 2.2033591657236973, "learning_rate": 1.938248867346308e-05, "loss": 0.8387, "step": 847 }, { "epoch": 0.14, "grad_norm": 1.6384735576836562, "learning_rate": 1.938064675508438e-05, "loss": 0.971, "step": 848 }, { "epoch": 0.14, "grad_norm": 1.1771791199961001, "learning_rate": 1.9378802181533354e-05, "loss": 0.4332, "step": 849 }, { "epoch": 0.14, "grad_norm": 2.161908743970838, "learning_rate": 1.9376954953332104e-05, "loss": 0.9247, "step": 850 }, { "epoch": 0.14, "grad_norm": 2.0781437752468213, "learning_rate": 1.9375105071003476e-05, "loss": 0.9562, "step": 851 }, { "epoch": 0.14, "grad_norm": 1.5817900730246248, "learning_rate": 1.9373252535071087e-05, "loss": 0.8838, "step": 852 }, { "epoch": 0.14, "grad_norm": 2.534569866189063, "learning_rate": 1.9371397346059286e-05, "loss": 0.9561, "step": 853 }, { "epoch": 0.14, "grad_norm": 1.589096984066708, "learning_rate": 1.936953950449318e-05, "loss": 0.8981, "step": 854 }, { "epoch": 0.14, "grad_norm": 2.440033327990473, "learning_rate": 1.936767901089863e-05, "loss": 0.8256, "step": 855 }, { "epoch": 0.14, "grad_norm": 1.9923854756401254, "learning_rate": 1.9365815865802243e-05, "loss": 0.8778, "step": 856 }, { "epoch": 0.14, "grad_norm": 1.695201143193545, "learning_rate": 1.936395006973138e-05, "loss": 0.8909, "step": 857 }, { "epoch": 0.14, "grad_norm": 0.9244170086960798, "learning_rate": 1.936208162321415e-05, "loss": 0.3493, "step": 858 }, { "epoch": 0.14, "grad_norm": 2.202645078930874, "learning_rate": 1.9360210526779414e-05, "loss": 0.9176, "step": 859 }, { "epoch": 0.14, "grad_norm": 2.7795532778960377, "learning_rate": 1.9358336780956777e-05, "loss": 0.8364, "step": 860 }, { "epoch": 0.14, "grad_norm": 2.1276066833225684, "learning_rate": 1.9356460386276606e-05, "loss": 0.9975, "step": 861 }, { "epoch": 0.14, "grad_norm": 2.070007474134392, "learning_rate": 1.9354581343270006e-05, "loss": 0.9249, "step": 862 }, { "epoch": 0.14, "grad_norm": 2.6264222311617464, "learning_rate": 1.9352699652468835e-05, "loss": 0.9459, "step": 863 }, { "epoch": 0.14, "grad_norm": 2.276929716705796, "learning_rate": 1.9350815314405703e-05, "loss": 0.8648, "step": 864 }, { "epoch": 0.14, "grad_norm": 2.8309115908071365, "learning_rate": 1.934892832961397e-05, "loss": 0.8987, "step": 865 }, { "epoch": 0.14, "grad_norm": 1.858327135995301, "learning_rate": 1.9347038698627744e-05, "loss": 0.8999, "step": 866 }, { "epoch": 0.14, "grad_norm": 2.0891325673816663, "learning_rate": 1.9345146421981878e-05, "loss": 0.8461, "step": 867 }, { "epoch": 0.14, "grad_norm": 2.039752566981669, "learning_rate": 1.9343251500211977e-05, "loss": 0.8583, "step": 868 }, { "epoch": 0.14, "grad_norm": 1.9853410776832248, "learning_rate": 1.9341353933854396e-05, "loss": 0.8535, "step": 869 }, { "epoch": 0.14, "grad_norm": 1.8061988511392573, "learning_rate": 1.9339453723446234e-05, "loss": 0.9228, "step": 870 }, { "epoch": 0.14, "grad_norm": 2.0253686145981176, "learning_rate": 1.9337550869525344e-05, "loss": 0.8638, "step": 871 }, { "epoch": 0.14, "grad_norm": 1.673694033575065, "learning_rate": 1.933564537263033e-05, "loss": 0.8242, "step": 872 }, { "epoch": 0.14, "grad_norm": 1.716095174707711, "learning_rate": 1.933373723330053e-05, "loss": 0.8898, "step": 873 }, { "epoch": 0.14, "grad_norm": 1.7917662386625481, "learning_rate": 1.9331826452076044e-05, "loss": 0.7813, "step": 874 }, { "epoch": 0.14, "grad_norm": 1.8477688699552401, "learning_rate": 1.932991302949771e-05, "loss": 0.9748, "step": 875 }, { "epoch": 0.14, "grad_norm": 2.0353219046552207, "learning_rate": 1.9327996966107122e-05, "loss": 0.9386, "step": 876 }, { "epoch": 0.14, "grad_norm": 3.7031649788550305, "learning_rate": 1.932607826244662e-05, "loss": 0.9375, "step": 877 }, { "epoch": 0.14, "grad_norm": 2.3284143656550333, "learning_rate": 1.9324156919059286e-05, "loss": 0.8239, "step": 878 }, { "epoch": 0.14, "grad_norm": 1.9696825477764133, "learning_rate": 1.932223293648895e-05, "loss": 0.8313, "step": 879 }, { "epoch": 0.14, "grad_norm": 1.747508057742957, "learning_rate": 1.9320306315280196e-05, "loss": 0.9111, "step": 880 }, { "epoch": 0.14, "grad_norm": 2.1053631810175717, "learning_rate": 1.9318377055978342e-05, "loss": 0.9553, "step": 881 }, { "epoch": 0.14, "grad_norm": 23.51918707784314, "learning_rate": 1.9316445159129474e-05, "loss": 0.8794, "step": 882 }, { "epoch": 0.15, "grad_norm": 1.659019012203868, "learning_rate": 1.93145106252804e-05, "loss": 0.8799, "step": 883 }, { "epoch": 0.15, "grad_norm": 2.1188711714956394, "learning_rate": 1.931257345497869e-05, "loss": 0.894, "step": 884 }, { "epoch": 0.15, "grad_norm": 2.4204367069113846, "learning_rate": 1.9310633648772656e-05, "loss": 0.8545, "step": 885 }, { "epoch": 0.15, "grad_norm": 3.36550384981361, "learning_rate": 1.9308691207211353e-05, "loss": 0.9296, "step": 886 }, { "epoch": 0.15, "grad_norm": 1.8919814488514959, "learning_rate": 1.9306746130844593e-05, "loss": 0.9814, "step": 887 }, { "epoch": 0.15, "grad_norm": 1.8224400023049994, "learning_rate": 1.9304798420222918e-05, "loss": 0.9256, "step": 888 }, { "epoch": 0.15, "grad_norm": 1.8048294352558885, "learning_rate": 1.9302848075897624e-05, "loss": 0.9075, "step": 889 }, { "epoch": 0.15, "grad_norm": 2.4946664754505226, "learning_rate": 1.9300895098420753e-05, "loss": 0.8897, "step": 890 }, { "epoch": 0.15, "grad_norm": 2.7867251923368572, "learning_rate": 1.929893948834509e-05, "loss": 0.9583, "step": 891 }, { "epoch": 0.15, "grad_norm": 2.25476256914025, "learning_rate": 1.9296981246224173e-05, "loss": 0.8572, "step": 892 }, { "epoch": 0.15, "grad_norm": 1.847343659447101, "learning_rate": 1.9295020372612276e-05, "loss": 0.8552, "step": 893 }, { "epoch": 0.15, "grad_norm": 2.231923215303367, "learning_rate": 1.929305686806441e-05, "loss": 0.9076, "step": 894 }, { "epoch": 0.15, "grad_norm": 1.9614576454515877, "learning_rate": 1.9291090733136352e-05, "loss": 0.8372, "step": 895 }, { "epoch": 0.15, "grad_norm": 1.725637308972349, "learning_rate": 1.928912196838461e-05, "loss": 0.9081, "step": 896 }, { "epoch": 0.15, "grad_norm": 2.2146009998416, "learning_rate": 1.9287150574366432e-05, "loss": 0.937, "step": 897 }, { "epoch": 0.15, "grad_norm": 3.7083479428599015, "learning_rate": 1.9285176551639826e-05, "loss": 0.9361, "step": 898 }, { "epoch": 0.15, "grad_norm": 0.898462960587074, "learning_rate": 1.9283199900763533e-05, "loss": 0.4007, "step": 899 }, { "epoch": 0.15, "grad_norm": 2.011108962023324, "learning_rate": 1.9281220622297033e-05, "loss": 0.9077, "step": 900 }, { "epoch": 0.15, "grad_norm": 1.9968836738452556, "learning_rate": 1.927923871680057e-05, "loss": 0.8095, "step": 901 }, { "epoch": 0.15, "grad_norm": 0.7000406420594357, "learning_rate": 1.9277254184835105e-05, "loss": 0.3462, "step": 902 }, { "epoch": 0.15, "grad_norm": 2.240428150064432, "learning_rate": 1.9275267026962358e-05, "loss": 0.899, "step": 903 }, { "epoch": 0.15, "grad_norm": 2.4545377719044885, "learning_rate": 1.9273277243744797e-05, "loss": 0.8989, "step": 904 }, { "epoch": 0.15, "grad_norm": 2.035738015383646, "learning_rate": 1.927128483574562e-05, "loss": 0.8476, "step": 905 }, { "epoch": 0.15, "grad_norm": 2.1203900068050867, "learning_rate": 1.9269289803528775e-05, "loss": 0.9596, "step": 906 }, { "epoch": 0.15, "grad_norm": 2.0644681000408935, "learning_rate": 1.926729214765895e-05, "loss": 0.8308, "step": 907 }, { "epoch": 0.15, "grad_norm": 2.472803809925416, "learning_rate": 1.9265291868701584e-05, "loss": 0.8676, "step": 908 }, { "epoch": 0.15, "grad_norm": 2.331327216336826, "learning_rate": 1.9263288967222843e-05, "loss": 0.9453, "step": 909 }, { "epoch": 0.15, "grad_norm": 2.1368437716631554, "learning_rate": 1.926128344378965e-05, "loss": 0.8476, "step": 910 }, { "epoch": 0.15, "grad_norm": 3.4382940809735496, "learning_rate": 1.9259275298969663e-05, "loss": 0.8546, "step": 911 }, { "epoch": 0.15, "grad_norm": 1.69082724282921, "learning_rate": 1.925726453333128e-05, "loss": 0.921, "step": 912 }, { "epoch": 0.15, "grad_norm": 2.262919733090188, "learning_rate": 1.9255251147443646e-05, "loss": 0.8411, "step": 913 }, { "epoch": 0.15, "grad_norm": 1.6796406194634277, "learning_rate": 1.9253235141876646e-05, "loss": 0.9435, "step": 914 }, { "epoch": 0.15, "grad_norm": 1.6627803666720613, "learning_rate": 1.925121651720091e-05, "loss": 0.8445, "step": 915 }, { "epoch": 0.15, "grad_norm": 2.4741097867072086, "learning_rate": 1.92491952739878e-05, "loss": 0.9949, "step": 916 }, { "epoch": 0.15, "grad_norm": 1.9918221533807383, "learning_rate": 1.9247171412809423e-05, "loss": 0.8893, "step": 917 }, { "epoch": 0.15, "grad_norm": 2.3968018133336644, "learning_rate": 1.924514493423864e-05, "loss": 0.8547, "step": 918 }, { "epoch": 0.15, "grad_norm": 1.1441376011158784, "learning_rate": 1.9243115838849023e-05, "loss": 0.419, "step": 919 }, { "epoch": 0.15, "grad_norm": 3.1647612431303425, "learning_rate": 1.924108412721492e-05, "loss": 0.8915, "step": 920 }, { "epoch": 0.15, "grad_norm": 1.8193858655468738, "learning_rate": 1.9239049799911397e-05, "loss": 0.9047, "step": 921 }, { "epoch": 0.15, "grad_norm": 2.6354068408472444, "learning_rate": 1.923701285751426e-05, "loss": 0.8506, "step": 922 }, { "epoch": 0.15, "grad_norm": 1.9808456073077754, "learning_rate": 1.9234973300600074e-05, "loss": 0.9375, "step": 923 }, { "epoch": 0.15, "grad_norm": 1.7240383060125188, "learning_rate": 1.9232931129746116e-05, "loss": 0.8489, "step": 924 }, { "epoch": 0.15, "grad_norm": 1.653817798495122, "learning_rate": 1.9230886345530432e-05, "loss": 0.8627, "step": 925 }, { "epoch": 0.15, "grad_norm": 2.7395725242427518, "learning_rate": 1.9228838948531786e-05, "loss": 0.8957, "step": 926 }, { "epoch": 0.15, "grad_norm": 2.100392499067318, "learning_rate": 1.9226788939329693e-05, "loss": 0.8726, "step": 927 }, { "epoch": 0.15, "grad_norm": 2.154433887218647, "learning_rate": 1.92247363185044e-05, "loss": 0.8197, "step": 928 }, { "epoch": 0.15, "grad_norm": 2.291348442285421, "learning_rate": 1.92226810866369e-05, "loss": 0.828, "step": 929 }, { "epoch": 0.15, "grad_norm": 2.1662598740908763, "learning_rate": 1.922062324430892e-05, "loss": 0.9408, "step": 930 }, { "epoch": 0.15, "grad_norm": 2.0097692630940145, "learning_rate": 1.921856279210293e-05, "loss": 0.9061, "step": 931 }, { "epoch": 0.15, "grad_norm": 2.054284509625072, "learning_rate": 1.9216499730602135e-05, "loss": 0.9547, "step": 932 }, { "epoch": 0.15, "grad_norm": 1.844370236154679, "learning_rate": 1.9214434060390484e-05, "loss": 0.8606, "step": 933 }, { "epoch": 0.15, "grad_norm": 2.4017320572911594, "learning_rate": 1.9212365782052656e-05, "loss": 0.8843, "step": 934 }, { "epoch": 0.15, "grad_norm": 2.3309231686543903, "learning_rate": 1.9210294896174074e-05, "loss": 0.8395, "step": 935 }, { "epoch": 0.15, "grad_norm": 2.455454532952448, "learning_rate": 1.9208221403340895e-05, "loss": 0.8342, "step": 936 }, { "epoch": 0.15, "grad_norm": 1.9124143519699255, "learning_rate": 1.9206145304140026e-05, "loss": 0.8335, "step": 937 }, { "epoch": 0.15, "grad_norm": 4.57424595885976, "learning_rate": 1.9204066599159094e-05, "loss": 0.8683, "step": 938 }, { "epoch": 0.15, "grad_norm": 2.5273101777589893, "learning_rate": 1.920198528898648e-05, "loss": 0.8161, "step": 939 }, { "epoch": 0.15, "grad_norm": 2.2499186287144273, "learning_rate": 1.919990137421128e-05, "loss": 0.875, "step": 940 }, { "epoch": 0.15, "grad_norm": 2.0739875833047607, "learning_rate": 1.9197814855423357e-05, "loss": 0.9189, "step": 941 }, { "epoch": 0.15, "grad_norm": 0.9577880809085759, "learning_rate": 1.919572573321329e-05, "loss": 0.3917, "step": 942 }, { "epoch": 0.15, "grad_norm": 2.001153345685498, "learning_rate": 1.9193634008172396e-05, "loss": 0.8093, "step": 943 }, { "epoch": 0.16, "grad_norm": 2.3070491660778902, "learning_rate": 1.9191539680892738e-05, "loss": 0.8336, "step": 944 }, { "epoch": 0.16, "grad_norm": 1.7397400573065689, "learning_rate": 1.9189442751967117e-05, "loss": 0.8516, "step": 945 }, { "epoch": 0.16, "grad_norm": 3.3699740452980613, "learning_rate": 1.9187343221989052e-05, "loss": 0.869, "step": 946 }, { "epoch": 0.16, "grad_norm": 2.834141177569355, "learning_rate": 1.918524109155282e-05, "loss": 0.9556, "step": 947 }, { "epoch": 0.16, "grad_norm": 3.8501577591855614, "learning_rate": 1.9183136361253417e-05, "loss": 0.8253, "step": 948 }, { "epoch": 0.16, "grad_norm": 2.2818295432303293, "learning_rate": 1.918102903168659e-05, "loss": 0.8905, "step": 949 }, { "epoch": 0.16, "grad_norm": 1.9483162672499932, "learning_rate": 1.9178919103448807e-05, "loss": 0.8739, "step": 950 }, { "epoch": 0.16, "grad_norm": 1.5548575263098836, "learning_rate": 1.9176806577137285e-05, "loss": 0.9035, "step": 951 }, { "epoch": 0.16, "grad_norm": 2.0208503150045307, "learning_rate": 1.9174691453349967e-05, "loss": 0.879, "step": 952 }, { "epoch": 0.16, "grad_norm": 1.7197971149466047, "learning_rate": 1.917257373268554e-05, "loss": 0.8644, "step": 953 }, { "epoch": 0.16, "grad_norm": 3.7624410617264843, "learning_rate": 1.917045341574341e-05, "loss": 0.9143, "step": 954 }, { "epoch": 0.16, "grad_norm": 4.403139183991129, "learning_rate": 1.916833050312373e-05, "loss": 0.8521, "step": 955 }, { "epoch": 0.16, "grad_norm": 1.9473650467658634, "learning_rate": 1.9166204995427398e-05, "loss": 0.925, "step": 956 }, { "epoch": 0.16, "grad_norm": 1.9314120607515113, "learning_rate": 1.916407689325602e-05, "loss": 0.8896, "step": 957 }, { "epoch": 0.16, "grad_norm": 1.9724912551348392, "learning_rate": 1.916194619721196e-05, "loss": 0.8288, "step": 958 }, { "epoch": 0.16, "grad_norm": 1.9218198514032971, "learning_rate": 1.9159812907898304e-05, "loss": 0.8229, "step": 959 }, { "epoch": 0.16, "grad_norm": 1.8644387906965605, "learning_rate": 1.915767702591887e-05, "loss": 0.9399, "step": 960 }, { "epoch": 0.16, "grad_norm": 1.9117556027835623, "learning_rate": 1.9155538551878225e-05, "loss": 0.8577, "step": 961 }, { "epoch": 0.16, "grad_norm": 2.284983749692537, "learning_rate": 1.9153397486381657e-05, "loss": 0.985, "step": 962 }, { "epoch": 0.16, "grad_norm": 2.0762744442243584, "learning_rate": 1.915125383003518e-05, "loss": 0.9245, "step": 963 }, { "epoch": 0.16, "grad_norm": 2.120980156212696, "learning_rate": 1.9149107583445566e-05, "loss": 0.9169, "step": 964 }, { "epoch": 0.16, "grad_norm": 3.948422845628346, "learning_rate": 1.9146958747220292e-05, "loss": 0.8622, "step": 965 }, { "epoch": 0.16, "grad_norm": 2.7308206075566916, "learning_rate": 1.9144807321967594e-05, "loss": 0.9675, "step": 966 }, { "epoch": 0.16, "grad_norm": 2.502649832397417, "learning_rate": 1.914265330829642e-05, "loss": 0.8314, "step": 967 }, { "epoch": 0.16, "grad_norm": 1.9722775910809471, "learning_rate": 1.914049670681646e-05, "loss": 0.9057, "step": 968 }, { "epoch": 0.16, "grad_norm": 2.110522880369061, "learning_rate": 1.913833751813814e-05, "loss": 0.8877, "step": 969 }, { "epoch": 0.16, "grad_norm": 1.8334391517750348, "learning_rate": 1.9136175742872608e-05, "loss": 0.8747, "step": 970 }, { "epoch": 0.16, "grad_norm": 1.8052268562532643, "learning_rate": 1.9134011381631755e-05, "loss": 0.8293, "step": 971 }, { "epoch": 0.16, "grad_norm": 3.7105280841238275, "learning_rate": 1.9131844435028196e-05, "loss": 0.8872, "step": 972 }, { "epoch": 0.16, "grad_norm": 2.1494087015391554, "learning_rate": 1.912967490367528e-05, "loss": 0.8575, "step": 973 }, { "epoch": 0.16, "grad_norm": 1.8411038247059515, "learning_rate": 1.912750278818709e-05, "loss": 0.897, "step": 974 }, { "epoch": 0.16, "grad_norm": 2.025379429201976, "learning_rate": 1.9125328089178442e-05, "loss": 0.9097, "step": 975 }, { "epoch": 0.16, "grad_norm": 2.5096332895185136, "learning_rate": 1.9123150807264872e-05, "loss": 0.7976, "step": 976 }, { "epoch": 0.16, "grad_norm": 1.074711326949544, "learning_rate": 1.912097094306266e-05, "loss": 0.3885, "step": 977 }, { "epoch": 0.16, "grad_norm": 0.8453226248584228, "learning_rate": 1.9118788497188815e-05, "loss": 0.3782, "step": 978 }, { "epoch": 0.16, "grad_norm": 1.7152455737552983, "learning_rate": 1.9116603470261065e-05, "loss": 0.8354, "step": 979 }, { "epoch": 0.16, "grad_norm": 1.732838555297766, "learning_rate": 1.9114415862897883e-05, "loss": 0.9058, "step": 980 }, { "epoch": 0.16, "grad_norm": 2.5719934508104254, "learning_rate": 1.911222567571847e-05, "loss": 0.885, "step": 981 }, { "epoch": 0.16, "grad_norm": 0.9179984322028716, "learning_rate": 1.911003290934275e-05, "loss": 0.3776, "step": 982 }, { "epoch": 0.16, "grad_norm": 1.6684398504022415, "learning_rate": 1.9107837564391376e-05, "loss": 0.8777, "step": 983 }, { "epoch": 0.16, "grad_norm": 6.539373331497875, "learning_rate": 1.910563964148574e-05, "loss": 0.9114, "step": 984 }, { "epoch": 0.16, "grad_norm": 1.7189226565068794, "learning_rate": 1.9103439141247966e-05, "loss": 0.9144, "step": 985 }, { "epoch": 0.16, "grad_norm": 1.8873147211504855, "learning_rate": 1.9101236064300895e-05, "loss": 0.8832, "step": 986 }, { "epoch": 0.16, "grad_norm": 1.7927394262628322, "learning_rate": 1.90990304112681e-05, "loss": 0.9503, "step": 987 }, { "epoch": 0.16, "grad_norm": 0.8377106389729747, "learning_rate": 1.9096822182773887e-05, "loss": 0.3781, "step": 988 }, { "epoch": 0.16, "grad_norm": 2.108951999568073, "learning_rate": 1.9094611379443298e-05, "loss": 0.8965, "step": 989 }, { "epoch": 0.16, "grad_norm": 2.479724506447352, "learning_rate": 1.9092398001902092e-05, "loss": 0.8323, "step": 990 }, { "epoch": 0.16, "grad_norm": 1.677086800794358, "learning_rate": 1.909018205077676e-05, "loss": 0.9119, "step": 991 }, { "epoch": 0.16, "grad_norm": 7.197256519599932, "learning_rate": 1.908796352669452e-05, "loss": 0.8225, "step": 992 }, { "epoch": 0.16, "grad_norm": 1.8060768865754666, "learning_rate": 1.9085742430283322e-05, "loss": 0.896, "step": 993 }, { "epoch": 0.16, "grad_norm": 2.135031201880813, "learning_rate": 1.9083518762171847e-05, "loss": 0.8178, "step": 994 }, { "epoch": 0.16, "grad_norm": 2.656082140264609, "learning_rate": 1.9081292522989493e-05, "loss": 0.8496, "step": 995 }, { "epoch": 0.16, "grad_norm": 1.572577400312524, "learning_rate": 1.90790637133664e-05, "loss": 0.9217, "step": 996 }, { "epoch": 0.16, "grad_norm": 4.14790512841212, "learning_rate": 1.9076832333933423e-05, "loss": 0.8598, "step": 997 }, { "epoch": 0.16, "grad_norm": 3.0809407751985005, "learning_rate": 1.907459838532215e-05, "loss": 0.9028, "step": 998 }, { "epoch": 0.16, "grad_norm": 2.195171184971954, "learning_rate": 1.9072361868164892e-05, "loss": 0.8469, "step": 999 }, { "epoch": 0.16, "grad_norm": 3.372450402501369, "learning_rate": 1.9070122783094695e-05, "loss": 0.9391, "step": 1000 }, { "epoch": 0.16, "grad_norm": 2.70499731913344, "learning_rate": 1.9067881130745325e-05, "loss": 0.7962, "step": 1001 }, { "epoch": 0.16, "grad_norm": 3.2970762764938066, "learning_rate": 1.906563691175128e-05, "loss": 0.9454, "step": 1002 }, { "epoch": 0.16, "grad_norm": 2.1081235812142936, "learning_rate": 1.9063390126747778e-05, "loss": 0.9123, "step": 1003 }, { "epoch": 0.16, "grad_norm": 0.8859783380170746, "learning_rate": 1.906114077637077e-05, "loss": 0.3485, "step": 1004 }, { "epoch": 0.17, "grad_norm": 2.009110376700302, "learning_rate": 1.905888886125693e-05, "loss": 0.9963, "step": 1005 }, { "epoch": 0.17, "grad_norm": 1.720874117608156, "learning_rate": 1.9056634382043653e-05, "loss": 0.8766, "step": 1006 }, { "epoch": 0.17, "grad_norm": 0.666984981046444, "learning_rate": 1.905437733936907e-05, "loss": 0.3565, "step": 1007 }, { "epoch": 0.17, "grad_norm": 2.061317556228055, "learning_rate": 1.9052117733872025e-05, "loss": 0.8862, "step": 1008 }, { "epoch": 0.17, "grad_norm": 2.255986075303197, "learning_rate": 1.9049855566192105e-05, "loss": 0.8311, "step": 1009 }, { "epoch": 0.17, "grad_norm": 3.4608424527065065, "learning_rate": 1.9047590836969603e-05, "loss": 0.8555, "step": 1010 }, { "epoch": 0.17, "grad_norm": 1.7629412217267202, "learning_rate": 1.904532354684555e-05, "loss": 0.9078, "step": 1011 }, { "epoch": 0.17, "grad_norm": 3.2029477637012063, "learning_rate": 1.9043053696461696e-05, "loss": 0.8842, "step": 1012 }, { "epoch": 0.17, "grad_norm": 2.1786041404411605, "learning_rate": 1.904078128646052e-05, "loss": 0.8131, "step": 1013 }, { "epoch": 0.17, "grad_norm": 1.9548895054384827, "learning_rate": 1.903850631748522e-05, "loss": 0.8496, "step": 1014 }, { "epoch": 0.17, "grad_norm": 2.268461490850569, "learning_rate": 1.9036228790179722e-05, "loss": 0.8512, "step": 1015 }, { "epoch": 0.17, "grad_norm": 2.369477712045443, "learning_rate": 1.9033948705188673e-05, "loss": 0.9185, "step": 1016 }, { "epoch": 0.17, "grad_norm": 2.05782575529175, "learning_rate": 1.9031666063157453e-05, "loss": 0.8256, "step": 1017 }, { "epoch": 0.17, "grad_norm": 3.0885475721961684, "learning_rate": 1.902938086473215e-05, "loss": 0.8667, "step": 1018 }, { "epoch": 0.17, "grad_norm": 2.189385933439014, "learning_rate": 1.9027093110559596e-05, "loss": 0.8708, "step": 1019 }, { "epoch": 0.17, "grad_norm": 2.4138074480995817, "learning_rate": 1.9024802801287327e-05, "loss": 0.8783, "step": 1020 }, { "epoch": 0.17, "grad_norm": 2.472547003493697, "learning_rate": 1.9022509937563606e-05, "loss": 0.8618, "step": 1021 }, { "epoch": 0.17, "grad_norm": 1.8845548199855409, "learning_rate": 1.9020214520037433e-05, "loss": 0.8594, "step": 1022 }, { "epoch": 0.17, "grad_norm": 3.5274816761824024, "learning_rate": 1.901791654935852e-05, "loss": 0.8474, "step": 1023 }, { "epoch": 0.17, "grad_norm": 2.4745779110585033, "learning_rate": 1.9015616026177294e-05, "loss": 0.8741, "step": 1024 }, { "epoch": 0.17, "grad_norm": 1.9830883947803761, "learning_rate": 1.901331295114492e-05, "loss": 0.8117, "step": 1025 }, { "epoch": 0.17, "grad_norm": 2.374612093192304, "learning_rate": 1.9011007324913277e-05, "loss": 0.9213, "step": 1026 }, { "epoch": 0.17, "grad_norm": 2.4487527346780276, "learning_rate": 1.9008699148134967e-05, "loss": 0.8529, "step": 1027 }, { "epoch": 0.17, "grad_norm": 1.7964231321979225, "learning_rate": 1.9006388421463322e-05, "loss": 0.8329, "step": 1028 }, { "epoch": 0.17, "grad_norm": 1.8028743226889397, "learning_rate": 1.9004075145552378e-05, "loss": 0.8893, "step": 1029 }, { "epoch": 0.17, "grad_norm": 1.9830371422712623, "learning_rate": 1.9001759321056905e-05, "loss": 0.9005, "step": 1030 }, { "epoch": 0.17, "grad_norm": 2.7970489616171403, "learning_rate": 1.8999440948632397e-05, "loss": 0.8958, "step": 1031 }, { "epoch": 0.17, "grad_norm": 1.74223953316285, "learning_rate": 1.899712002893506e-05, "loss": 0.8852, "step": 1032 }, { "epoch": 0.17, "grad_norm": 2.271048791501519, "learning_rate": 1.899479656262183e-05, "loss": 0.8762, "step": 1033 }, { "epoch": 0.17, "grad_norm": 4.506163339773634, "learning_rate": 1.8992470550350356e-05, "loss": 0.842, "step": 1034 }, { "epoch": 0.17, "grad_norm": 1.868474128399253, "learning_rate": 1.8990141992779008e-05, "loss": 0.8509, "step": 1035 }, { "epoch": 0.17, "grad_norm": 2.084264777037623, "learning_rate": 1.8987810890566885e-05, "loss": 0.8771, "step": 1036 }, { "epoch": 0.17, "grad_norm": 1.6764378501286448, "learning_rate": 1.8985477244373796e-05, "loss": 0.8891, "step": 1037 }, { "epoch": 0.17, "grad_norm": 3.1586177981141095, "learning_rate": 1.898314105486028e-05, "loss": 0.8341, "step": 1038 }, { "epoch": 0.17, "grad_norm": 2.0244740802736776, "learning_rate": 1.8980802322687584e-05, "loss": 0.7512, "step": 1039 }, { "epoch": 0.17, "grad_norm": 1.792577859760056, "learning_rate": 1.8978461048517686e-05, "loss": 0.8158, "step": 1040 }, { "epoch": 0.17, "grad_norm": 2.515394716391508, "learning_rate": 1.8976117233013278e-05, "loss": 0.9006, "step": 1041 }, { "epoch": 0.17, "grad_norm": 2.0019063104266266, "learning_rate": 1.8973770876837772e-05, "loss": 0.8767, "step": 1042 }, { "epoch": 0.17, "grad_norm": 1.7878047056357762, "learning_rate": 1.8971421980655295e-05, "loss": 0.8988, "step": 1043 }, { "epoch": 0.17, "grad_norm": 1.516721224230333, "learning_rate": 1.8969070545130702e-05, "loss": 0.9457, "step": 1044 }, { "epoch": 0.17, "grad_norm": 2.5177250621979606, "learning_rate": 1.896671657092956e-05, "loss": 0.8795, "step": 1045 }, { "epoch": 0.17, "grad_norm": 1.8992884290200975, "learning_rate": 1.8964360058718162e-05, "loss": 0.9246, "step": 1046 }, { "epoch": 0.17, "grad_norm": 2.4531994472631586, "learning_rate": 1.8962001009163506e-05, "loss": 0.6996, "step": 1047 }, { "epoch": 0.17, "grad_norm": 2.0868039286677096, "learning_rate": 1.8959639422933316e-05, "loss": 0.8111, "step": 1048 }, { "epoch": 0.17, "grad_norm": 3.9376979040032243, "learning_rate": 1.8957275300696036e-05, "loss": 0.8192, "step": 1049 }, { "epoch": 0.17, "grad_norm": 2.2253739795291696, "learning_rate": 1.895490864312083e-05, "loss": 0.9208, "step": 1050 }, { "epoch": 0.17, "grad_norm": 3.1849635584803133, "learning_rate": 1.895253945087757e-05, "loss": 0.8657, "step": 1051 }, { "epoch": 0.17, "grad_norm": 2.9292118702285714, "learning_rate": 1.8950167724636856e-05, "loss": 0.9133, "step": 1052 }, { "epoch": 0.17, "grad_norm": 2.6695014759248825, "learning_rate": 1.894779346506999e-05, "loss": 0.862, "step": 1053 }, { "epoch": 0.17, "grad_norm": 1.7583923959442476, "learning_rate": 1.8945416672849014e-05, "loss": 0.8428, "step": 1054 }, { "epoch": 0.17, "grad_norm": 1.6764590543741544, "learning_rate": 1.8943037348646668e-05, "loss": 0.8248, "step": 1055 }, { "epoch": 0.17, "grad_norm": 2.046854794936482, "learning_rate": 1.8940655493136415e-05, "loss": 0.8283, "step": 1056 }, { "epoch": 0.17, "grad_norm": 1.757734043265519, "learning_rate": 1.8938271106992433e-05, "loss": 0.8617, "step": 1057 }, { "epoch": 0.17, "grad_norm": 2.692525441831166, "learning_rate": 1.893588419088962e-05, "loss": 0.9655, "step": 1058 }, { "epoch": 0.17, "grad_norm": 1.7681934402634598, "learning_rate": 1.8933494745503585e-05, "loss": 0.8131, "step": 1059 }, { "epoch": 0.17, "grad_norm": 2.633482486264521, "learning_rate": 1.8931102771510657e-05, "loss": 0.8716, "step": 1060 }, { "epoch": 0.17, "grad_norm": 1.7663387703505304, "learning_rate": 1.8928708269587876e-05, "loss": 0.8787, "step": 1061 }, { "epoch": 0.17, "grad_norm": 2.088974195226392, "learning_rate": 1.8926311240413008e-05, "loss": 0.8246, "step": 1062 }, { "epoch": 0.17, "grad_norm": 2.2436169857883708, "learning_rate": 1.892391168466452e-05, "loss": 0.8113, "step": 1063 }, { "epoch": 0.17, "grad_norm": 1.720664009629777, "learning_rate": 1.8921509603021606e-05, "loss": 0.9201, "step": 1064 }, { "epoch": 0.17, "grad_norm": 3.19949099600418, "learning_rate": 1.8919104996164167e-05, "loss": 0.8592, "step": 1065 }, { "epoch": 0.18, "grad_norm": 2.0569482818166525, "learning_rate": 1.8916697864772822e-05, "loss": 0.9124, "step": 1066 }, { "epoch": 0.18, "grad_norm": 2.8627167447474324, "learning_rate": 1.8914288209528907e-05, "loss": 0.9094, "step": 1067 }, { "epoch": 0.18, "grad_norm": 1.8983289333090332, "learning_rate": 1.891187603111447e-05, "loss": 0.8488, "step": 1068 }, { "epoch": 0.18, "grad_norm": 1.8625346588910066, "learning_rate": 1.8909461330212267e-05, "loss": 0.8434, "step": 1069 }, { "epoch": 0.18, "grad_norm": 2.0546458885469407, "learning_rate": 1.890704410750578e-05, "loss": 0.7736, "step": 1070 }, { "epoch": 0.18, "grad_norm": 1.8939488322354419, "learning_rate": 1.89046243636792e-05, "loss": 0.7921, "step": 1071 }, { "epoch": 0.18, "grad_norm": 1.7300163968205902, "learning_rate": 1.8902202099417425e-05, "loss": 0.897, "step": 1072 }, { "epoch": 0.18, "grad_norm": 1.869516342917936, "learning_rate": 1.8899777315406073e-05, "loss": 0.9175, "step": 1073 }, { "epoch": 0.18, "grad_norm": 1.8744606748784276, "learning_rate": 1.8897350012331478e-05, "loss": 0.89, "step": 1074 }, { "epoch": 0.18, "grad_norm": 1.733498552708239, "learning_rate": 1.889492019088068e-05, "loss": 0.9076, "step": 1075 }, { "epoch": 0.18, "grad_norm": 1.9623522817793, "learning_rate": 1.889248785174143e-05, "loss": 0.8846, "step": 1076 }, { "epoch": 0.18, "grad_norm": 1.972549835520931, "learning_rate": 1.8890052995602207e-05, "loss": 0.9321, "step": 1077 }, { "epoch": 0.18, "grad_norm": 1.976636433549586, "learning_rate": 1.8887615623152188e-05, "loss": 0.9142, "step": 1078 }, { "epoch": 0.18, "grad_norm": 1.692243903578713, "learning_rate": 1.888517573508126e-05, "loss": 0.8355, "step": 1079 }, { "epoch": 0.18, "grad_norm": 2.587812616035528, "learning_rate": 1.8882733332080038e-05, "loss": 0.8865, "step": 1080 }, { "epoch": 0.18, "grad_norm": 52.003231971729086, "learning_rate": 1.888028841483983e-05, "loss": 0.8207, "step": 1081 }, { "epoch": 0.18, "grad_norm": 1.8753248337950787, "learning_rate": 1.887784098405267e-05, "loss": 0.775, "step": 1082 }, { "epoch": 0.18, "grad_norm": 1.881774632160008, "learning_rate": 1.88753910404113e-05, "loss": 0.9611, "step": 1083 }, { "epoch": 0.18, "grad_norm": 1.815570761094261, "learning_rate": 1.8872938584609164e-05, "loss": 0.8465, "step": 1084 }, { "epoch": 0.18, "grad_norm": 3.1326451207594777, "learning_rate": 1.887048361734043e-05, "loss": 0.8124, "step": 1085 }, { "epoch": 0.18, "grad_norm": 2.314147898721125, "learning_rate": 1.886802613929997e-05, "loss": 0.8069, "step": 1086 }, { "epoch": 0.18, "grad_norm": 1.5655505672938368, "learning_rate": 1.8865566151183365e-05, "loss": 0.7969, "step": 1087 }, { "epoch": 0.18, "grad_norm": 0.8736301275848827, "learning_rate": 1.8863103653686917e-05, "loss": 0.4132, "step": 1088 }, { "epoch": 0.18, "grad_norm": 2.320932225443898, "learning_rate": 1.8860638647507622e-05, "loss": 0.9258, "step": 1089 }, { "epoch": 0.18, "grad_norm": 3.099695681273146, "learning_rate": 1.8858171133343202e-05, "loss": 0.9029, "step": 1090 }, { "epoch": 0.18, "grad_norm": 2.2624274265090416, "learning_rate": 1.885570111189208e-05, "loss": 0.8574, "step": 1091 }, { "epoch": 0.18, "grad_norm": 2.016208493850794, "learning_rate": 1.8853228583853384e-05, "loss": 0.9238, "step": 1092 }, { "epoch": 0.18, "grad_norm": 2.070905231289937, "learning_rate": 1.8850753549926967e-05, "loss": 0.9307, "step": 1093 }, { "epoch": 0.18, "grad_norm": 1.9079440622080799, "learning_rate": 1.8848276010813377e-05, "loss": 0.9876, "step": 1094 }, { "epoch": 0.18, "grad_norm": 2.405877955543917, "learning_rate": 1.8845795967213876e-05, "loss": 0.8061, "step": 1095 }, { "epoch": 0.18, "grad_norm": 2.1545669124631486, "learning_rate": 1.884331341983044e-05, "loss": 0.815, "step": 1096 }, { "epoch": 0.18, "grad_norm": 2.3475258258952105, "learning_rate": 1.8840828369365743e-05, "loss": 0.8917, "step": 1097 }, { "epoch": 0.18, "grad_norm": 1.6165949989420212, "learning_rate": 1.8838340816523175e-05, "loss": 0.9563, "step": 1098 }, { "epoch": 0.18, "grad_norm": 1.7559636114785748, "learning_rate": 1.883585076200683e-05, "loss": 0.8581, "step": 1099 }, { "epoch": 0.18, "grad_norm": 6.958300716001125, "learning_rate": 1.883335820652152e-05, "loss": 0.8873, "step": 1100 }, { "epoch": 0.18, "grad_norm": 1.7120904535886037, "learning_rate": 1.8830863150772754e-05, "loss": 0.9175, "step": 1101 }, { "epoch": 0.18, "grad_norm": 2.093014161173658, "learning_rate": 1.882836559546675e-05, "loss": 0.8154, "step": 1102 }, { "epoch": 0.18, "grad_norm": 0.7700867706842547, "learning_rate": 1.8825865541310438e-05, "loss": 0.3859, "step": 1103 }, { "epoch": 0.18, "grad_norm": 0.7186111650500127, "learning_rate": 1.882336298901145e-05, "loss": 0.4017, "step": 1104 }, { "epoch": 0.18, "grad_norm": 2.5098636421941634, "learning_rate": 1.8820857939278136e-05, "loss": 0.7341, "step": 1105 }, { "epoch": 0.18, "grad_norm": 1.945556281612496, "learning_rate": 1.8818350392819535e-05, "loss": 0.857, "step": 1106 }, { "epoch": 0.18, "grad_norm": 1.8618565717406341, "learning_rate": 1.881584035034541e-05, "loss": 0.89, "step": 1107 }, { "epoch": 0.18, "grad_norm": 2.111545092837553, "learning_rate": 1.8813327812566217e-05, "loss": 0.8062, "step": 1108 }, { "epoch": 0.18, "grad_norm": 1.6669343698753774, "learning_rate": 1.881081278019313e-05, "loss": 0.8896, "step": 1109 }, { "epoch": 0.18, "grad_norm": 1.4723426172947363, "learning_rate": 1.8808295253938025e-05, "loss": 0.8472, "step": 1110 }, { "epoch": 0.18, "grad_norm": 2.128525364372375, "learning_rate": 1.8805775234513476e-05, "loss": 0.9396, "step": 1111 }, { "epoch": 0.18, "grad_norm": 0.8165170089869843, "learning_rate": 1.8803252722632775e-05, "loss": 0.3841, "step": 1112 }, { "epoch": 0.18, "grad_norm": 1.930741135716546, "learning_rate": 1.880072771900991e-05, "loss": 0.8775, "step": 1113 }, { "epoch": 0.18, "grad_norm": 1.8803933858612647, "learning_rate": 1.879820022435958e-05, "loss": 0.8159, "step": 1114 }, { "epoch": 0.18, "grad_norm": 1.8753890730458596, "learning_rate": 1.8795670239397184e-05, "loss": 0.8424, "step": 1115 }, { "epoch": 0.18, "grad_norm": 1.773827228148669, "learning_rate": 1.8793137764838834e-05, "loss": 0.9, "step": 1116 }, { "epoch": 0.18, "grad_norm": 2.4482293511225457, "learning_rate": 1.879060280140134e-05, "loss": 0.8552, "step": 1117 }, { "epoch": 0.18, "grad_norm": 6.040603274996109, "learning_rate": 1.878806534980221e-05, "loss": 0.922, "step": 1118 }, { "epoch": 0.18, "grad_norm": 0.7049644736856956, "learning_rate": 1.8785525410759676e-05, "loss": 0.3511, "step": 1119 }, { "epoch": 0.18, "grad_norm": 2.342538935593085, "learning_rate": 1.878298298499266e-05, "loss": 0.8813, "step": 1120 }, { "epoch": 0.18, "grad_norm": 2.5841832202184185, "learning_rate": 1.8780438073220785e-05, "loss": 0.9591, "step": 1121 }, { "epoch": 0.18, "grad_norm": 2.7766978532417594, "learning_rate": 1.8777890676164387e-05, "loss": 0.8283, "step": 1122 }, { "epoch": 0.18, "grad_norm": 30.742142351913053, "learning_rate": 1.8775340794544497e-05, "loss": 0.852, "step": 1123 }, { "epoch": 0.18, "grad_norm": 1.8472717281758173, "learning_rate": 1.877278842908286e-05, "loss": 0.9977, "step": 1124 }, { "epoch": 0.18, "grad_norm": 2.330312082776347, "learning_rate": 1.8770233580501913e-05, "loss": 0.8306, "step": 1125 }, { "epoch": 0.18, "grad_norm": 2.835146168742063, "learning_rate": 1.87676762495248e-05, "loss": 0.779, "step": 1126 }, { "epoch": 0.19, "grad_norm": 2.5152881440619956, "learning_rate": 1.8765116436875374e-05, "loss": 0.7772, "step": 1127 }, { "epoch": 0.19, "grad_norm": 2.1937842527615996, "learning_rate": 1.876255414327818e-05, "loss": 0.8782, "step": 1128 }, { "epoch": 0.19, "grad_norm": 1.8046241984510678, "learning_rate": 1.8759989369458468e-05, "loss": 0.8763, "step": 1129 }, { "epoch": 0.19, "grad_norm": 1.646740577792337, "learning_rate": 1.8757422116142198e-05, "loss": 0.8109, "step": 1130 }, { "epoch": 0.19, "grad_norm": 2.072040027236746, "learning_rate": 1.875485238405602e-05, "loss": 0.8298, "step": 1131 }, { "epoch": 0.19, "grad_norm": 2.0221820622567006, "learning_rate": 1.875228017392729e-05, "loss": 0.7938, "step": 1132 }, { "epoch": 0.19, "grad_norm": 3.3172471264048697, "learning_rate": 1.8749705486484074e-05, "loss": 0.8438, "step": 1133 }, { "epoch": 0.19, "grad_norm": 2.2037229882976956, "learning_rate": 1.8747128322455128e-05, "loss": 0.8821, "step": 1134 }, { "epoch": 0.19, "grad_norm": 2.237795566058098, "learning_rate": 1.8744548682569914e-05, "loss": 0.8295, "step": 1135 }, { "epoch": 0.19, "grad_norm": 2.276555463414345, "learning_rate": 1.874196656755859e-05, "loss": 0.867, "step": 1136 }, { "epoch": 0.19, "grad_norm": 2.10736040587436, "learning_rate": 1.873938197815202e-05, "loss": 0.854, "step": 1137 }, { "epoch": 0.19, "grad_norm": 1.7992029692802138, "learning_rate": 1.8736794915081765e-05, "loss": 0.9078, "step": 1138 }, { "epoch": 0.19, "grad_norm": 3.4456416208387837, "learning_rate": 1.8734205379080093e-05, "loss": 0.8202, "step": 1139 }, { "epoch": 0.19, "grad_norm": 1.4564727242873468, "learning_rate": 1.8731613370879963e-05, "loss": 0.8591, "step": 1140 }, { "epoch": 0.19, "grad_norm": 2.291410640760212, "learning_rate": 1.8729018891215042e-05, "loss": 0.8956, "step": 1141 }, { "epoch": 0.19, "grad_norm": 2.3059622376696565, "learning_rate": 1.8726421940819683e-05, "loss": 0.863, "step": 1142 }, { "epoch": 0.19, "grad_norm": 1.8427074676548034, "learning_rate": 1.8723822520428954e-05, "loss": 0.7592, "step": 1143 }, { "epoch": 0.19, "grad_norm": 1.6654967439546067, "learning_rate": 1.8721220630778613e-05, "loss": 0.878, "step": 1144 }, { "epoch": 0.19, "grad_norm": 2.1657298792187585, "learning_rate": 1.871861627260512e-05, "loss": 0.9289, "step": 1145 }, { "epoch": 0.19, "grad_norm": 2.365096098472171, "learning_rate": 1.8716009446645636e-05, "loss": 0.7472, "step": 1146 }, { "epoch": 0.19, "grad_norm": 1.8418589190680597, "learning_rate": 1.8713400153638013e-05, "loss": 0.9585, "step": 1147 }, { "epoch": 0.19, "grad_norm": 1.749067362079497, "learning_rate": 1.8710788394320807e-05, "loss": 0.8235, "step": 1148 }, { "epoch": 0.19, "grad_norm": 1.8674193814571438, "learning_rate": 1.870817416943327e-05, "loss": 0.8957, "step": 1149 }, { "epoch": 0.19, "grad_norm": 2.0839370203334524, "learning_rate": 1.8705557479715363e-05, "loss": 0.9042, "step": 1150 }, { "epoch": 0.19, "grad_norm": 2.2126748015360973, "learning_rate": 1.870293832590772e-05, "loss": 0.8853, "step": 1151 }, { "epoch": 0.19, "grad_norm": 1.7411691692068685, "learning_rate": 1.8700316708751693e-05, "loss": 0.9266, "step": 1152 }, { "epoch": 0.19, "grad_norm": 1.8770808908890502, "learning_rate": 1.8697692628989327e-05, "loss": 0.86, "step": 1153 }, { "epoch": 0.19, "grad_norm": 2.086033368188509, "learning_rate": 1.869506608736336e-05, "loss": 0.8369, "step": 1154 }, { "epoch": 0.19, "grad_norm": 2.770945933731591, "learning_rate": 1.869243708461723e-05, "loss": 0.8663, "step": 1155 }, { "epoch": 0.19, "grad_norm": 2.0992680810959623, "learning_rate": 1.8689805621495072e-05, "loss": 0.8379, "step": 1156 }, { "epoch": 0.19, "grad_norm": 2.224324637635985, "learning_rate": 1.8687171698741714e-05, "loss": 0.8179, "step": 1157 }, { "epoch": 0.19, "grad_norm": 1.7399983524691691, "learning_rate": 1.868453531710268e-05, "loss": 0.962, "step": 1158 }, { "epoch": 0.19, "grad_norm": 1.8638847050895686, "learning_rate": 1.8681896477324198e-05, "loss": 0.9126, "step": 1159 }, { "epoch": 0.19, "grad_norm": 2.178641436379747, "learning_rate": 1.8679255180153184e-05, "loss": 0.9766, "step": 1160 }, { "epoch": 0.19, "grad_norm": 2.476699460562117, "learning_rate": 1.867661142633725e-05, "loss": 0.9214, "step": 1161 }, { "epoch": 0.19, "grad_norm": 1.721671380092223, "learning_rate": 1.8673965216624704e-05, "loss": 0.8927, "step": 1162 }, { "epoch": 0.19, "grad_norm": 1.6704359020191857, "learning_rate": 1.8671316551764552e-05, "loss": 0.8431, "step": 1163 }, { "epoch": 0.19, "grad_norm": 2.324633022436763, "learning_rate": 1.8668665432506496e-05, "loss": 0.9213, "step": 1164 }, { "epoch": 0.19, "grad_norm": 2.372891861947508, "learning_rate": 1.8666011859600925e-05, "loss": 0.9192, "step": 1165 }, { "epoch": 0.19, "grad_norm": 1.9527713998560505, "learning_rate": 1.8663355833798927e-05, "loss": 0.871, "step": 1166 }, { "epoch": 0.19, "grad_norm": 1.7794486501699032, "learning_rate": 1.8660697355852288e-05, "loss": 0.8885, "step": 1167 }, { "epoch": 0.19, "grad_norm": 2.301513040960532, "learning_rate": 1.865803642651348e-05, "loss": 0.8284, "step": 1168 }, { "epoch": 0.19, "grad_norm": 2.503401847298005, "learning_rate": 1.8655373046535682e-05, "loss": 0.9387, "step": 1169 }, { "epoch": 0.19, "grad_norm": 1.6408571011833417, "learning_rate": 1.8652707216672747e-05, "loss": 0.794, "step": 1170 }, { "epoch": 0.19, "grad_norm": 1.765241988067244, "learning_rate": 1.865003893767924e-05, "loss": 0.8086, "step": 1171 }, { "epoch": 0.19, "grad_norm": 2.719065347838753, "learning_rate": 1.864736821031041e-05, "loss": 0.9139, "step": 1172 }, { "epoch": 0.19, "grad_norm": 1.7190884669635957, "learning_rate": 1.8644695035322203e-05, "loss": 0.7934, "step": 1173 }, { "epoch": 0.19, "grad_norm": 2.0848476372306237, "learning_rate": 1.864201941347125e-05, "loss": 0.8214, "step": 1174 }, { "epoch": 0.19, "grad_norm": 4.6508631550208195, "learning_rate": 1.863934134551488e-05, "loss": 0.9359, "step": 1175 }, { "epoch": 0.19, "grad_norm": 1.751181247939186, "learning_rate": 1.8636660832211126e-05, "loss": 0.8764, "step": 1176 }, { "epoch": 0.19, "grad_norm": 1.9141575679583378, "learning_rate": 1.8633977874318686e-05, "loss": 0.8875, "step": 1177 }, { "epoch": 0.19, "grad_norm": 1.7274819031348967, "learning_rate": 1.8631292472596978e-05, "loss": 0.8748, "step": 1178 }, { "epoch": 0.19, "grad_norm": 3.3211476498147645, "learning_rate": 1.862860462780609e-05, "loss": 0.8742, "step": 1179 }, { "epoch": 0.19, "grad_norm": 1.8963715115560187, "learning_rate": 1.8625914340706818e-05, "loss": 0.8945, "step": 1180 }, { "epoch": 0.19, "grad_norm": 1.6634245161832295, "learning_rate": 1.862322161206064e-05, "loss": 0.7895, "step": 1181 }, { "epoch": 0.19, "grad_norm": 1.7653272007234528, "learning_rate": 1.862052644262972e-05, "loss": 0.8641, "step": 1182 }, { "epoch": 0.19, "grad_norm": 2.0989170322449926, "learning_rate": 1.8617828833176935e-05, "loss": 0.7898, "step": 1183 }, { "epoch": 0.19, "grad_norm": 1.7415291019717354, "learning_rate": 1.8615128784465826e-05, "loss": 0.9008, "step": 1184 }, { "epoch": 0.19, "grad_norm": 2.183786965531343, "learning_rate": 1.861242629726064e-05, "loss": 0.8548, "step": 1185 }, { "epoch": 0.19, "grad_norm": 2.122745922128027, "learning_rate": 1.8609721372326305e-05, "loss": 0.8208, "step": 1186 }, { "epoch": 0.19, "grad_norm": 1.513713107094212, "learning_rate": 1.8607014010428454e-05, "loss": 0.8319, "step": 1187 }, { "epoch": 0.2, "grad_norm": 1.8033344434152423, "learning_rate": 1.860430421233339e-05, "loss": 0.9298, "step": 1188 }, { "epoch": 0.2, "grad_norm": 1.839872755180427, "learning_rate": 1.8601591978808126e-05, "loss": 0.8135, "step": 1189 }, { "epoch": 0.2, "grad_norm": 2.5535597035789706, "learning_rate": 1.8598877310620347e-05, "loss": 0.781, "step": 1190 }, { "epoch": 0.2, "grad_norm": 2.2909567581932726, "learning_rate": 1.859616020853843e-05, "loss": 0.7969, "step": 1191 }, { "epoch": 0.2, "grad_norm": 2.4261948640966504, "learning_rate": 1.859344067333146e-05, "loss": 0.9213, "step": 1192 }, { "epoch": 0.2, "grad_norm": 1.7612775056774554, "learning_rate": 1.859071870576918e-05, "loss": 0.8995, "step": 1193 }, { "epoch": 0.2, "grad_norm": 2.395737189838693, "learning_rate": 1.8587994306622047e-05, "loss": 0.9096, "step": 1194 }, { "epoch": 0.2, "grad_norm": 2.338322608612075, "learning_rate": 1.8585267476661187e-05, "loss": 0.9284, "step": 1195 }, { "epoch": 0.2, "grad_norm": 1.6959867686092385, "learning_rate": 1.8582538216658435e-05, "loss": 0.9039, "step": 1196 }, { "epoch": 0.2, "grad_norm": 2.3677378858966165, "learning_rate": 1.85798065273863e-05, "loss": 0.8525, "step": 1197 }, { "epoch": 0.2, "grad_norm": 1.7624674246327612, "learning_rate": 1.857707240961797e-05, "loss": 0.8779, "step": 1198 }, { "epoch": 0.2, "grad_norm": 11.433431244120595, "learning_rate": 1.8574335864127344e-05, "loss": 0.8894, "step": 1199 }, { "epoch": 0.2, "grad_norm": 2.994095390349235, "learning_rate": 1.8571596891688988e-05, "loss": 0.8884, "step": 1200 }, { "epoch": 0.2, "grad_norm": 1.7660201268803142, "learning_rate": 1.8568855493078165e-05, "loss": 0.9132, "step": 1201 }, { "epoch": 0.2, "grad_norm": 2.2830629231553696, "learning_rate": 1.8566111669070822e-05, "loss": 0.8597, "step": 1202 }, { "epoch": 0.2, "grad_norm": 1.8367101013284686, "learning_rate": 1.8563365420443594e-05, "loss": 0.8912, "step": 1203 }, { "epoch": 0.2, "grad_norm": 2.661554691444122, "learning_rate": 1.85606167479738e-05, "loss": 0.864, "step": 1204 }, { "epoch": 0.2, "grad_norm": 3.8085965490088696, "learning_rate": 1.8557865652439445e-05, "loss": 0.8593, "step": 1205 }, { "epoch": 0.2, "grad_norm": 1.6784029834888075, "learning_rate": 1.8555112134619218e-05, "loss": 0.8371, "step": 1206 }, { "epoch": 0.2, "grad_norm": 3.0605526872738382, "learning_rate": 1.85523561952925e-05, "loss": 0.8892, "step": 1207 }, { "epoch": 0.2, "grad_norm": 1.6511071365122296, "learning_rate": 1.854959783523936e-05, "loss": 0.8224, "step": 1208 }, { "epoch": 0.2, "grad_norm": 1.6221065519689766, "learning_rate": 1.8546837055240536e-05, "loss": 0.811, "step": 1209 }, { "epoch": 0.2, "grad_norm": 2.3368743918176937, "learning_rate": 1.854407385607746e-05, "loss": 0.8885, "step": 1210 }, { "epoch": 0.2, "grad_norm": 1.9649692942493187, "learning_rate": 1.8541308238532257e-05, "loss": 0.8866, "step": 1211 }, { "epoch": 0.2, "grad_norm": 1.9753221088480115, "learning_rate": 1.853854020338773e-05, "loss": 0.9268, "step": 1212 }, { "epoch": 0.2, "grad_norm": 3.3977222218293877, "learning_rate": 1.853576975142736e-05, "loss": 0.8446, "step": 1213 }, { "epoch": 0.2, "grad_norm": 1.8687542618923665, "learning_rate": 1.853299688343532e-05, "loss": 0.8174, "step": 1214 }, { "epoch": 0.2, "grad_norm": 3.182172454574677, "learning_rate": 1.8530221600196462e-05, "loss": 0.8644, "step": 1215 }, { "epoch": 0.2, "grad_norm": 2.1431384761462917, "learning_rate": 1.8527443902496325e-05, "loss": 0.9522, "step": 1216 }, { "epoch": 0.2, "grad_norm": 2.0443023067939348, "learning_rate": 1.8524663791121134e-05, "loss": 0.8861, "step": 1217 }, { "epoch": 0.2, "grad_norm": 5.118618831062628, "learning_rate": 1.852188126685779e-05, "loss": 0.8205, "step": 1218 }, { "epoch": 0.2, "grad_norm": 2.0195124179155033, "learning_rate": 1.851909633049388e-05, "loss": 0.7926, "step": 1219 }, { "epoch": 0.2, "grad_norm": 2.0654804501610027, "learning_rate": 1.8516308982817685e-05, "loss": 0.7498, "step": 1220 }, { "epoch": 0.2, "grad_norm": 1.8148970492222387, "learning_rate": 1.851351922461814e-05, "loss": 0.8453, "step": 1221 }, { "epoch": 0.2, "grad_norm": 1.7337448495164063, "learning_rate": 1.851072705668489e-05, "loss": 0.8889, "step": 1222 }, { "epoch": 0.2, "grad_norm": 2.540509500784232, "learning_rate": 1.8507932479808254e-05, "loss": 0.8283, "step": 1223 }, { "epoch": 0.2, "grad_norm": 1.8684589845902162, "learning_rate": 1.8505135494779228e-05, "loss": 0.7897, "step": 1224 }, { "epoch": 0.2, "grad_norm": 1.666886181017365, "learning_rate": 1.8502336102389494e-05, "loss": 0.9082, "step": 1225 }, { "epoch": 0.2, "grad_norm": 0.9819919018954649, "learning_rate": 1.8499534303431414e-05, "loss": 0.413, "step": 1226 }, { "epoch": 0.2, "grad_norm": 2.0669005502940836, "learning_rate": 1.849673009869803e-05, "loss": 0.845, "step": 1227 }, { "epoch": 0.2, "grad_norm": 3.720838540290338, "learning_rate": 1.8493923488983066e-05, "loss": 0.8296, "step": 1228 }, { "epoch": 0.2, "grad_norm": 1.7040931009501825, "learning_rate": 1.849111447508093e-05, "loss": 0.8755, "step": 1229 }, { "epoch": 0.2, "grad_norm": 2.4500997868317542, "learning_rate": 1.8488303057786707e-05, "loss": 0.8889, "step": 1230 }, { "epoch": 0.2, "grad_norm": 1.7345908813463315, "learning_rate": 1.848548923789616e-05, "loss": 0.7651, "step": 1231 }, { "epoch": 0.2, "grad_norm": 3.8256505668282337, "learning_rate": 1.8482673016205734e-05, "loss": 0.8519, "step": 1232 }, { "epoch": 0.2, "grad_norm": 1.5998635984664715, "learning_rate": 1.847985439351256e-05, "loss": 0.943, "step": 1233 }, { "epoch": 0.2, "grad_norm": 0.9656389799591294, "learning_rate": 1.8477033370614438e-05, "loss": 0.4013, "step": 1234 }, { "epoch": 0.2, "grad_norm": 1.9689252616975657, "learning_rate": 1.8474209948309852e-05, "loss": 0.8452, "step": 1235 }, { "epoch": 0.2, "grad_norm": 2.003346256759847, "learning_rate": 1.8471384127397974e-05, "loss": 0.8288, "step": 1236 }, { "epoch": 0.2, "grad_norm": 2.0257861188268866, "learning_rate": 1.8468555908678638e-05, "loss": 0.9545, "step": 1237 }, { "epoch": 0.2, "grad_norm": 0.6834890794872964, "learning_rate": 1.846572529295237e-05, "loss": 0.3902, "step": 1238 }, { "epoch": 0.2, "grad_norm": 2.330659110903956, "learning_rate": 1.8462892281020365e-05, "loss": 0.8281, "step": 1239 }, { "epoch": 0.2, "grad_norm": 2.938676064388432, "learning_rate": 1.8460056873684503e-05, "loss": 0.7766, "step": 1240 }, { "epoch": 0.2, "grad_norm": 2.169716721683024, "learning_rate": 1.8457219071747345e-05, "loss": 0.8213, "step": 1241 }, { "epoch": 0.2, "grad_norm": 1.8205020600948945, "learning_rate": 1.845437887601212e-05, "loss": 0.8552, "step": 1242 }, { "epoch": 0.2, "grad_norm": 1.7389377898389242, "learning_rate": 1.845153628728274e-05, "loss": 0.9227, "step": 1243 }, { "epoch": 0.2, "grad_norm": 1.5630471822645184, "learning_rate": 1.8448691306363798e-05, "loss": 0.9198, "step": 1244 }, { "epoch": 0.2, "grad_norm": 4.559406187860835, "learning_rate": 1.844584393406055e-05, "loss": 0.8097, "step": 1245 }, { "epoch": 0.2, "grad_norm": 1.8529218890014032, "learning_rate": 1.8442994171178948e-05, "loss": 0.8823, "step": 1246 }, { "epoch": 0.2, "grad_norm": 1.8248742072862512, "learning_rate": 1.844014201852561e-05, "loss": 0.881, "step": 1247 }, { "epoch": 0.2, "grad_norm": 1.6866746877169219, "learning_rate": 1.8437287476907828e-05, "loss": 0.8658, "step": 1248 }, { "epoch": 0.21, "grad_norm": 1.6324777474380825, "learning_rate": 1.8434430547133576e-05, "loss": 0.8165, "step": 1249 }, { "epoch": 0.21, "grad_norm": 1.5191354455203832, "learning_rate": 1.8431571230011504e-05, "loss": 0.9028, "step": 1250 }, { "epoch": 0.21, "grad_norm": 1.4382888888865775, "learning_rate": 1.8428709526350932e-05, "loss": 0.8885, "step": 1251 }, { "epoch": 0.21, "grad_norm": 1.902811669130774, "learning_rate": 1.8425845436961863e-05, "loss": 0.9069, "step": 1252 }, { "epoch": 0.21, "grad_norm": 1.738541955697879, "learning_rate": 1.842297896265497e-05, "loss": 0.9548, "step": 1253 }, { "epoch": 0.21, "grad_norm": 2.68614814890453, "learning_rate": 1.8420110104241598e-05, "loss": 0.7003, "step": 1254 }, { "epoch": 0.21, "grad_norm": 1.9089769707821418, "learning_rate": 1.841723886253378e-05, "loss": 0.8352, "step": 1255 }, { "epoch": 0.21, "grad_norm": 3.0303353056796207, "learning_rate": 1.8414365238344208e-05, "loss": 0.9147, "step": 1256 }, { "epoch": 0.21, "grad_norm": 1.8027972388166424, "learning_rate": 1.8411489232486256e-05, "loss": 0.8375, "step": 1257 }, { "epoch": 0.21, "grad_norm": 1.9113435796851959, "learning_rate": 1.8408610845773974e-05, "loss": 0.8338, "step": 1258 }, { "epoch": 0.21, "grad_norm": 1.9709772838626143, "learning_rate": 1.8405730079022083e-05, "loss": 0.8392, "step": 1259 }, { "epoch": 0.21, "grad_norm": 1.5768194721879218, "learning_rate": 1.8402846933045974e-05, "loss": 0.8621, "step": 1260 }, { "epoch": 0.21, "grad_norm": 2.443405216343786, "learning_rate": 1.8399961408661725e-05, "loss": 0.8228, "step": 1261 }, { "epoch": 0.21, "grad_norm": 2.2739160887685754, "learning_rate": 1.8397073506686066e-05, "loss": 0.8356, "step": 1262 }, { "epoch": 0.21, "grad_norm": 2.4505910303167537, "learning_rate": 1.8394183227936418e-05, "loss": 0.8099, "step": 1263 }, { "epoch": 0.21, "grad_norm": 1.6450273644451137, "learning_rate": 1.839129057323087e-05, "loss": 0.8575, "step": 1264 }, { "epoch": 0.21, "grad_norm": 2.0413984856999288, "learning_rate": 1.8388395543388174e-05, "loss": 0.8876, "step": 1265 }, { "epoch": 0.21, "grad_norm": 1.8684341901100798, "learning_rate": 1.838549813922777e-05, "loss": 0.9469, "step": 1266 }, { "epoch": 0.21, "grad_norm": 1.8015909192271637, "learning_rate": 1.838259836156976e-05, "loss": 0.909, "step": 1267 }, { "epoch": 0.21, "grad_norm": 4.067874102689454, "learning_rate": 1.8379696211234918e-05, "loss": 0.9106, "step": 1268 }, { "epoch": 0.21, "grad_norm": 2.0648525767084474, "learning_rate": 1.8376791689044693e-05, "loss": 0.8026, "step": 1269 }, { "epoch": 0.21, "grad_norm": 1.7425813988241328, "learning_rate": 1.8373884795821203e-05, "loss": 0.8649, "step": 1270 }, { "epoch": 0.21, "grad_norm": 3.764229213406879, "learning_rate": 1.8370975532387237e-05, "loss": 0.8429, "step": 1271 }, { "epoch": 0.21, "grad_norm": 2.6544964580493904, "learning_rate": 1.8368063899566263e-05, "loss": 0.8175, "step": 1272 }, { "epoch": 0.21, "grad_norm": 1.8281335737956539, "learning_rate": 1.8365149898182403e-05, "loss": 0.9235, "step": 1273 }, { "epoch": 0.21, "grad_norm": 1.607092088915376, "learning_rate": 1.8362233529060464e-05, "loss": 0.8558, "step": 1274 }, { "epoch": 0.21, "grad_norm": 1.5597192345593678, "learning_rate": 1.8359314793025914e-05, "loss": 0.957, "step": 1275 }, { "epoch": 0.21, "grad_norm": 2.00939484823588, "learning_rate": 1.8356393690904904e-05, "loss": 0.878, "step": 1276 }, { "epoch": 0.21, "grad_norm": 2.6118507655192085, "learning_rate": 1.8353470223524237e-05, "loss": 0.8714, "step": 1277 }, { "epoch": 0.21, "grad_norm": 1.96496408958676, "learning_rate": 1.8350544391711396e-05, "loss": 0.857, "step": 1278 }, { "epoch": 0.21, "grad_norm": 2.016848824186858, "learning_rate": 1.8347616196294536e-05, "loss": 0.8686, "step": 1279 }, { "epoch": 0.21, "grad_norm": 2.119236171972037, "learning_rate": 1.8344685638102472e-05, "loss": 0.8638, "step": 1280 }, { "epoch": 0.21, "grad_norm": 1.8193600633574019, "learning_rate": 1.8341752717964696e-05, "loss": 0.8418, "step": 1281 }, { "epoch": 0.21, "grad_norm": 0.956202001266429, "learning_rate": 1.8338817436711358e-05, "loss": 0.395, "step": 1282 }, { "epoch": 0.21, "grad_norm": 3.2255986462207775, "learning_rate": 1.833587979517329e-05, "loss": 0.9181, "step": 1283 }, { "epoch": 0.21, "grad_norm": 2.0094592374878086, "learning_rate": 1.8332939794181986e-05, "loss": 0.8784, "step": 1284 }, { "epoch": 0.21, "grad_norm": 2.639968373786259, "learning_rate": 1.83299974345696e-05, "loss": 0.8245, "step": 1285 }, { "epoch": 0.21, "grad_norm": 1.9084639111888055, "learning_rate": 1.832705271716897e-05, "loss": 0.8459, "step": 1286 }, { "epoch": 0.21, "grad_norm": 2.3534199875493895, "learning_rate": 1.832410564281358e-05, "loss": 0.7391, "step": 1287 }, { "epoch": 0.21, "grad_norm": 1.939290409995376, "learning_rate": 1.8321156212337604e-05, "loss": 0.9487, "step": 1288 }, { "epoch": 0.21, "grad_norm": 2.2575885310079076, "learning_rate": 1.8318204426575873e-05, "loss": 0.8101, "step": 1289 }, { "epoch": 0.21, "grad_norm": 2.095954900122592, "learning_rate": 1.831525028636387e-05, "loss": 0.8869, "step": 1290 }, { "epoch": 0.21, "grad_norm": 1.5924016140664659, "learning_rate": 1.8312293792537773e-05, "loss": 0.8561, "step": 1291 }, { "epoch": 0.21, "grad_norm": 2.231588136022878, "learning_rate": 1.8309334945934402e-05, "loss": 0.8576, "step": 1292 }, { "epoch": 0.21, "grad_norm": 2.4469042074193994, "learning_rate": 1.830637374739126e-05, "loss": 0.8018, "step": 1293 }, { "epoch": 0.21, "grad_norm": 2.2447455107938863, "learning_rate": 1.83034101977465e-05, "loss": 0.9203, "step": 1294 }, { "epoch": 0.21, "grad_norm": 2.116243535209396, "learning_rate": 1.8300444297838955e-05, "loss": 0.8609, "step": 1295 }, { "epoch": 0.21, "grad_norm": 2.000120960454385, "learning_rate": 1.8297476048508113e-05, "loss": 0.7984, "step": 1296 }, { "epoch": 0.21, "grad_norm": 2.284068609294489, "learning_rate": 1.829450545059413e-05, "loss": 0.8957, "step": 1297 }, { "epoch": 0.21, "grad_norm": 1.8766020675843569, "learning_rate": 1.829153250493783e-05, "loss": 0.8193, "step": 1298 }, { "epoch": 0.21, "grad_norm": 1.8836366476625146, "learning_rate": 1.8288557212380703e-05, "loss": 0.808, "step": 1299 }, { "epoch": 0.21, "grad_norm": 2.7146484354217604, "learning_rate": 1.828557957376489e-05, "loss": 0.8605, "step": 1300 }, { "epoch": 0.21, "grad_norm": 2.4202417141283163, "learning_rate": 1.8282599589933214e-05, "loss": 0.8158, "step": 1301 }, { "epoch": 0.21, "grad_norm": 5.125353731973796, "learning_rate": 1.8279617261729142e-05, "loss": 0.892, "step": 1302 }, { "epoch": 0.21, "grad_norm": 1.973015680559492, "learning_rate": 1.827663258999683e-05, "loss": 0.9449, "step": 1303 }, { "epoch": 0.21, "grad_norm": 2.1850149781072266, "learning_rate": 1.827364557558107e-05, "loss": 0.8618, "step": 1304 }, { "epoch": 0.21, "grad_norm": 2.7769330100050773, "learning_rate": 1.827065621932734e-05, "loss": 0.8146, "step": 1305 }, { "epoch": 0.21, "grad_norm": 1.9796102469476196, "learning_rate": 1.8267664522081767e-05, "loss": 0.8605, "step": 1306 }, { "epoch": 0.21, "grad_norm": 1.7766616430140332, "learning_rate": 1.8264670484691144e-05, "loss": 0.8608, "step": 1307 }, { "epoch": 0.21, "grad_norm": 2.146383701201318, "learning_rate": 1.8261674108002925e-05, "loss": 0.8489, "step": 1308 }, { "epoch": 0.21, "grad_norm": 1.7765335813875198, "learning_rate": 1.8258675392865235e-05, "loss": 0.8467, "step": 1309 }, { "epoch": 0.22, "grad_norm": 0.9739090408746683, "learning_rate": 1.8255674340126847e-05, "loss": 0.3962, "step": 1310 }, { "epoch": 0.22, "grad_norm": 1.8608470195142885, "learning_rate": 1.8252670950637206e-05, "loss": 0.8809, "step": 1311 }, { "epoch": 0.22, "grad_norm": 2.4386391116446235, "learning_rate": 1.8249665225246417e-05, "loss": 0.8797, "step": 1312 }, { "epoch": 0.22, "grad_norm": 2.097424847645738, "learning_rate": 1.824665716480524e-05, "loss": 0.9759, "step": 1313 }, { "epoch": 0.22, "grad_norm": 2.6260076502286163, "learning_rate": 1.82436467701651e-05, "loss": 0.8308, "step": 1314 }, { "epoch": 0.22, "grad_norm": 2.246976525099728, "learning_rate": 1.8240634042178086e-05, "loss": 0.887, "step": 1315 }, { "epoch": 0.22, "grad_norm": 2.015290380050189, "learning_rate": 1.8237618981696944e-05, "loss": 0.8655, "step": 1316 }, { "epoch": 0.22, "grad_norm": 1.8294872147538126, "learning_rate": 1.823460158957508e-05, "loss": 0.8613, "step": 1317 }, { "epoch": 0.22, "grad_norm": 1.5865467200935528, "learning_rate": 1.823158186666656e-05, "loss": 0.8479, "step": 1318 }, { "epoch": 0.22, "grad_norm": 2.5574925153444092, "learning_rate": 1.8228559813826106e-05, "loss": 0.8768, "step": 1319 }, { "epoch": 0.22, "grad_norm": 2.266496376138212, "learning_rate": 1.8225535431909113e-05, "loss": 0.9065, "step": 1320 }, { "epoch": 0.22, "grad_norm": 1.5736023780122188, "learning_rate": 1.822250872177162e-05, "loss": 0.889, "step": 1321 }, { "epoch": 0.22, "grad_norm": 1.9949884659885608, "learning_rate": 1.821947968427033e-05, "loss": 0.9245, "step": 1322 }, { "epoch": 0.22, "grad_norm": 2.0367629857157312, "learning_rate": 1.821644832026261e-05, "loss": 0.8846, "step": 1323 }, { "epoch": 0.22, "grad_norm": 1.5341516982376553, "learning_rate": 1.821341463060648e-05, "loss": 0.8741, "step": 1324 }, { "epoch": 0.22, "grad_norm": 1.5643535265466053, "learning_rate": 1.8210378616160617e-05, "loss": 0.7997, "step": 1325 }, { "epoch": 0.22, "grad_norm": 1.926542974312375, "learning_rate": 1.8207340277784357e-05, "loss": 0.8809, "step": 1326 }, { "epoch": 0.22, "grad_norm": 1.703907486301483, "learning_rate": 1.82042996163377e-05, "loss": 0.8514, "step": 1327 }, { "epoch": 0.22, "grad_norm": 1.3098512768451922, "learning_rate": 1.82012566326813e-05, "loss": 0.4232, "step": 1328 }, { "epoch": 0.22, "grad_norm": 2.063627537003168, "learning_rate": 1.819821132767646e-05, "loss": 0.8473, "step": 1329 }, { "epoch": 0.22, "grad_norm": 2.6861752070855767, "learning_rate": 1.8195163702185153e-05, "loss": 0.9094, "step": 1330 }, { "epoch": 0.22, "grad_norm": 1.9514940457477628, "learning_rate": 1.819211375707e-05, "loss": 0.9117, "step": 1331 }, { "epoch": 0.22, "grad_norm": 2.6059310385313124, "learning_rate": 1.8189061493194283e-05, "loss": 0.8681, "step": 1332 }, { "epoch": 0.22, "grad_norm": 1.6031040957571416, "learning_rate": 1.8186006911421937e-05, "loss": 0.8613, "step": 1333 }, { "epoch": 0.22, "grad_norm": 1.4536114263255937, "learning_rate": 1.818295001261756e-05, "loss": 0.86, "step": 1334 }, { "epoch": 0.22, "grad_norm": 2.1615606869997714, "learning_rate": 1.8179890797646398e-05, "loss": 0.848, "step": 1335 }, { "epoch": 0.22, "grad_norm": 2.1357858963225214, "learning_rate": 1.817682926737435e-05, "loss": 0.7087, "step": 1336 }, { "epoch": 0.22, "grad_norm": 2.3003438955468507, "learning_rate": 1.8173765422667987e-05, "loss": 0.8145, "step": 1337 }, { "epoch": 0.22, "grad_norm": 1.542661033047219, "learning_rate": 1.817069926439451e-05, "loss": 0.7903, "step": 1338 }, { "epoch": 0.22, "grad_norm": 2.100673504704627, "learning_rate": 1.81676307934218e-05, "loss": 0.8797, "step": 1339 }, { "epoch": 0.22, "grad_norm": 2.431741568253031, "learning_rate": 1.8164560010618377e-05, "loss": 0.8374, "step": 1340 }, { "epoch": 0.22, "grad_norm": 2.0038795328090235, "learning_rate": 1.816148691685342e-05, "loss": 0.9177, "step": 1341 }, { "epoch": 0.22, "grad_norm": 2.210769304177981, "learning_rate": 1.815841151299676e-05, "loss": 0.8749, "step": 1342 }, { "epoch": 0.22, "grad_norm": 0.9758995409965102, "learning_rate": 1.8155333799918883e-05, "loss": 0.4247, "step": 1343 }, { "epoch": 0.22, "grad_norm": 1.8957421580882086, "learning_rate": 1.8152253778490933e-05, "loss": 0.9219, "step": 1344 }, { "epoch": 0.22, "grad_norm": 1.8080610229103915, "learning_rate": 1.8149171449584705e-05, "loss": 0.8648, "step": 1345 }, { "epoch": 0.22, "grad_norm": 1.9693666103213294, "learning_rate": 1.814608681407264e-05, "loss": 0.9149, "step": 1346 }, { "epoch": 0.22, "grad_norm": 2.622108175414741, "learning_rate": 1.814299987282784e-05, "loss": 0.8355, "step": 1347 }, { "epoch": 0.22, "grad_norm": 1.814602035892894, "learning_rate": 1.8139910626724058e-05, "loss": 0.834, "step": 1348 }, { "epoch": 0.22, "grad_norm": 1.9760535766869913, "learning_rate": 1.8136819076635696e-05, "loss": 0.9303, "step": 1349 }, { "epoch": 0.22, "grad_norm": 2.2024158750338074, "learning_rate": 1.8133725223437815e-05, "loss": 0.8197, "step": 1350 }, { "epoch": 0.22, "grad_norm": 2.1906105946900842, "learning_rate": 1.813062906800612e-05, "loss": 0.9043, "step": 1351 }, { "epoch": 0.22, "grad_norm": 1.4578023367745296, "learning_rate": 1.8127530611216973e-05, "loss": 0.7661, "step": 1352 }, { "epoch": 0.22, "grad_norm": 1.637594302878851, "learning_rate": 1.8124429853947387e-05, "loss": 0.811, "step": 1353 }, { "epoch": 0.22, "grad_norm": 2.9338108223886974, "learning_rate": 1.8121326797075022e-05, "loss": 0.8426, "step": 1354 }, { "epoch": 0.22, "grad_norm": 2.146470257547313, "learning_rate": 1.811822144147819e-05, "loss": 0.8794, "step": 1355 }, { "epoch": 0.22, "grad_norm": 2.215150104975052, "learning_rate": 1.8115113788035863e-05, "loss": 0.9081, "step": 1356 }, { "epoch": 0.22, "grad_norm": 1.758657433192692, "learning_rate": 1.8112003837627646e-05, "loss": 0.8096, "step": 1357 }, { "epoch": 0.22, "grad_norm": 1.6333286214386236, "learning_rate": 1.8108891591133812e-05, "loss": 0.8532, "step": 1358 }, { "epoch": 0.22, "grad_norm": 6.233285101712303, "learning_rate": 1.810577704943527e-05, "loss": 0.9155, "step": 1359 }, { "epoch": 0.22, "grad_norm": 1.9387618681198173, "learning_rate": 1.8102660213413593e-05, "loss": 0.8076, "step": 1360 }, { "epoch": 0.22, "grad_norm": 1.9343915330189905, "learning_rate": 1.8099541083950988e-05, "loss": 0.9083, "step": 1361 }, { "epoch": 0.22, "grad_norm": 2.6995304150942028, "learning_rate": 1.8096419661930313e-05, "loss": 0.8239, "step": 1362 }, { "epoch": 0.22, "grad_norm": 1.658181237216645, "learning_rate": 1.809329594823509e-05, "loss": 0.9003, "step": 1363 }, { "epoch": 0.22, "grad_norm": 2.2374828270627467, "learning_rate": 1.8090169943749477e-05, "loss": 0.9454, "step": 1364 }, { "epoch": 0.22, "grad_norm": 2.5553533297256408, "learning_rate": 1.808704164935828e-05, "loss": 0.9295, "step": 1365 }, { "epoch": 0.22, "grad_norm": 2.319211124097961, "learning_rate": 1.8083911065946958e-05, "loss": 0.9303, "step": 1366 }, { "epoch": 0.22, "grad_norm": 1.7967188604720294, "learning_rate": 1.8080778194401615e-05, "loss": 0.9659, "step": 1367 }, { "epoch": 0.22, "grad_norm": 3.22361506206818, "learning_rate": 1.8077643035609006e-05, "loss": 0.9144, "step": 1368 }, { "epoch": 0.22, "grad_norm": 1.900621842011984, "learning_rate": 1.807450559045653e-05, "loss": 0.8149, "step": 1369 }, { "epoch": 0.23, "grad_norm": 2.369759865102931, "learning_rate": 1.807136585983223e-05, "loss": 0.8132, "step": 1370 }, { "epoch": 0.23, "grad_norm": 1.3847980464406267, "learning_rate": 1.8068223844624806e-05, "loss": 0.8882, "step": 1371 }, { "epoch": 0.23, "grad_norm": 2.0692694407426386, "learning_rate": 1.80650795457236e-05, "loss": 0.8784, "step": 1372 }, { "epoch": 0.23, "grad_norm": 1.7792138870555, "learning_rate": 1.806193296401859e-05, "loss": 0.8681, "step": 1373 }, { "epoch": 0.23, "grad_norm": 1.899258119730882, "learning_rate": 1.8058784100400418e-05, "loss": 0.8753, "step": 1374 }, { "epoch": 0.23, "grad_norm": 0.7562343052600731, "learning_rate": 1.8055632955760364e-05, "loss": 0.3885, "step": 1375 }, { "epoch": 0.23, "grad_norm": 2.7532758002332542, "learning_rate": 1.8052479530990348e-05, "loss": 0.9292, "step": 1376 }, { "epoch": 0.23, "grad_norm": 2.0404789616493297, "learning_rate": 1.8049323826982942e-05, "loss": 0.8798, "step": 1377 }, { "epoch": 0.23, "grad_norm": 2.356383901311427, "learning_rate": 1.804616584463136e-05, "loss": 0.8894, "step": 1378 }, { "epoch": 0.23, "grad_norm": 2.046027111405783, "learning_rate": 1.804300558482946e-05, "loss": 0.7914, "step": 1379 }, { "epoch": 0.23, "grad_norm": 1.7576302549991776, "learning_rate": 1.8039843048471756e-05, "loss": 0.8216, "step": 1380 }, { "epoch": 0.23, "grad_norm": 1.4298801508849552, "learning_rate": 1.8036678236453387e-05, "loss": 0.8448, "step": 1381 }, { "epoch": 0.23, "grad_norm": 2.1907755392841675, "learning_rate": 1.8033511149670152e-05, "loss": 0.8431, "step": 1382 }, { "epoch": 0.23, "grad_norm": 2.3880112851868764, "learning_rate": 1.803034178901849e-05, "loss": 0.9119, "step": 1383 }, { "epoch": 0.23, "grad_norm": 2.2651397584416046, "learning_rate": 1.8027170155395476e-05, "loss": 0.8627, "step": 1384 }, { "epoch": 0.23, "grad_norm": 1.578444894369287, "learning_rate": 1.8023996249698836e-05, "loss": 0.8563, "step": 1385 }, { "epoch": 0.23, "grad_norm": 1.9067084126431681, "learning_rate": 1.802082007282694e-05, "loss": 0.9189, "step": 1386 }, { "epoch": 0.23, "grad_norm": 1.737555866978334, "learning_rate": 1.8017641625678797e-05, "loss": 0.9098, "step": 1387 }, { "epoch": 0.23, "grad_norm": 2.5738567984300618, "learning_rate": 1.8014460909154058e-05, "loss": 0.859, "step": 1388 }, { "epoch": 0.23, "grad_norm": 2.501219312874196, "learning_rate": 1.8011277924153022e-05, "loss": 0.8503, "step": 1389 }, { "epoch": 0.23, "grad_norm": 1.5985796001700217, "learning_rate": 1.8008092671576624e-05, "loss": 0.8693, "step": 1390 }, { "epoch": 0.23, "grad_norm": 1.6390666173740704, "learning_rate": 1.8004905152326443e-05, "loss": 0.87, "step": 1391 }, { "epoch": 0.23, "grad_norm": 2.454039862998943, "learning_rate": 1.8001715367304696e-05, "loss": 0.8406, "step": 1392 }, { "epoch": 0.23, "grad_norm": 2.0881739923391414, "learning_rate": 1.799852331741425e-05, "loss": 0.8213, "step": 1393 }, { "epoch": 0.23, "grad_norm": 1.5908331733022332, "learning_rate": 1.799532900355861e-05, "loss": 0.8101, "step": 1394 }, { "epoch": 0.23, "grad_norm": 1.8645868680230182, "learning_rate": 1.799213242664191e-05, "loss": 0.8148, "step": 1395 }, { "epoch": 0.23, "grad_norm": 1.9658727536536684, "learning_rate": 1.7988933587568948e-05, "loss": 0.8166, "step": 1396 }, { "epoch": 0.23, "grad_norm": 2.290085560024927, "learning_rate": 1.7985732487245132e-05, "loss": 0.8442, "step": 1397 }, { "epoch": 0.23, "grad_norm": 5.470914715474981, "learning_rate": 1.7982529126576543e-05, "loss": 0.8522, "step": 1398 }, { "epoch": 0.23, "grad_norm": 1.8705599379726363, "learning_rate": 1.7979323506469878e-05, "loss": 0.8455, "step": 1399 }, { "epoch": 0.23, "grad_norm": 2.2100831755931414, "learning_rate": 1.797611562783248e-05, "loss": 0.824, "step": 1400 }, { "epoch": 0.23, "grad_norm": 2.566492095716504, "learning_rate": 1.7972905491572334e-05, "loss": 0.8654, "step": 1401 }, { "epoch": 0.23, "grad_norm": 4.1126563270414005, "learning_rate": 1.7969693098598063e-05, "loss": 0.8949, "step": 1402 }, { "epoch": 0.23, "grad_norm": 0.8260155762597016, "learning_rate": 1.7966478449818925e-05, "loss": 0.3929, "step": 1403 }, { "epoch": 0.23, "grad_norm": 1.9555232483234357, "learning_rate": 1.7963261546144823e-05, "loss": 0.8683, "step": 1404 }, { "epoch": 0.23, "grad_norm": 2.21329342421257, "learning_rate": 1.7960042388486293e-05, "loss": 0.866, "step": 1405 }, { "epoch": 0.23, "grad_norm": 7.088593605020256, "learning_rate": 1.795682097775451e-05, "loss": 0.7844, "step": 1406 }, { "epoch": 0.23, "grad_norm": 1.64591292739628, "learning_rate": 1.795359731486129e-05, "loss": 0.8688, "step": 1407 }, { "epoch": 0.23, "grad_norm": 1.8764679456020739, "learning_rate": 1.7950371400719087e-05, "loss": 0.8224, "step": 1408 }, { "epoch": 0.23, "grad_norm": 1.7723376161057702, "learning_rate": 1.7947143236240983e-05, "loss": 0.8457, "step": 1409 }, { "epoch": 0.23, "grad_norm": 2.422365097365565, "learning_rate": 1.7943912822340702e-05, "loss": 0.8064, "step": 1410 }, { "epoch": 0.23, "grad_norm": 1.4537297142462597, "learning_rate": 1.7940680159932612e-05, "loss": 0.8421, "step": 1411 }, { "epoch": 0.23, "grad_norm": 1.7024383466271598, "learning_rate": 1.7937445249931706e-05, "loss": 0.8983, "step": 1412 }, { "epoch": 0.23, "grad_norm": 2.6509793279911764, "learning_rate": 1.7934208093253625e-05, "loss": 0.8521, "step": 1413 }, { "epoch": 0.23, "grad_norm": 1.8003432016196297, "learning_rate": 1.7930968690814634e-05, "loss": 0.8025, "step": 1414 }, { "epoch": 0.23, "grad_norm": 1.9734178661595194, "learning_rate": 1.7927727043531637e-05, "loss": 0.914, "step": 1415 }, { "epoch": 0.23, "grad_norm": 1.6401335841829958, "learning_rate": 1.7924483152322183e-05, "loss": 0.8581, "step": 1416 }, { "epoch": 0.23, "grad_norm": 2.0620560092634252, "learning_rate": 1.7921237018104443e-05, "loss": 0.7577, "step": 1417 }, { "epoch": 0.23, "grad_norm": 1.6181625224516065, "learning_rate": 1.7917988641797227e-05, "loss": 0.8876, "step": 1418 }, { "epoch": 0.23, "grad_norm": 2.146663405701542, "learning_rate": 1.791473802431999e-05, "loss": 0.7917, "step": 1419 }, { "epoch": 0.23, "grad_norm": 3.6748358518658284, "learning_rate": 1.7911485166592802e-05, "loss": 0.8165, "step": 1420 }, { "epoch": 0.23, "grad_norm": 4.791297913514388, "learning_rate": 1.790823006953638e-05, "loss": 0.8733, "step": 1421 }, { "epoch": 0.23, "grad_norm": 2.6205156820563, "learning_rate": 1.790497273407208e-05, "loss": 0.8816, "step": 1422 }, { "epoch": 0.23, "grad_norm": 2.50347281967651, "learning_rate": 1.7901713161121873e-05, "loss": 0.8414, "step": 1423 }, { "epoch": 0.23, "grad_norm": 1.6956751552496818, "learning_rate": 1.7898451351608385e-05, "loss": 0.9599, "step": 1424 }, { "epoch": 0.23, "grad_norm": 3.1008016356691637, "learning_rate": 1.7895187306454852e-05, "loss": 0.8878, "step": 1425 }, { "epoch": 0.23, "grad_norm": 1.9309924733400627, "learning_rate": 1.7891921026585167e-05, "loss": 0.8628, "step": 1426 }, { "epoch": 0.23, "grad_norm": 2.1001292649134875, "learning_rate": 1.7888652512923836e-05, "loss": 0.8474, "step": 1427 }, { "epoch": 0.23, "grad_norm": 4.915390797703064, "learning_rate": 1.7885381766396008e-05, "loss": 0.7826, "step": 1428 }, { "epoch": 0.23, "grad_norm": 1.7772339798347776, "learning_rate": 1.788210878792746e-05, "loss": 0.7941, "step": 1429 }, { "epoch": 0.23, "grad_norm": 1.9084851702488963, "learning_rate": 1.7878833578444603e-05, "loss": 0.7566, "step": 1430 }, { "epoch": 0.24, "grad_norm": 1.976173704240056, "learning_rate": 1.787555613887448e-05, "loss": 0.8678, "step": 1431 }, { "epoch": 0.24, "grad_norm": 2.164729190420277, "learning_rate": 1.7872276470144756e-05, "loss": 0.8518, "step": 1432 }, { "epoch": 0.24, "grad_norm": 1.9106556046370184, "learning_rate": 1.786899457318374e-05, "loss": 0.8402, "step": 1433 }, { "epoch": 0.24, "grad_norm": 1.9075466782713315, "learning_rate": 1.7865710448920365e-05, "loss": 0.897, "step": 1434 }, { "epoch": 0.24, "grad_norm": 1.7507566319645065, "learning_rate": 1.7862424098284197e-05, "loss": 0.8888, "step": 1435 }, { "epoch": 0.24, "grad_norm": 2.4792010427642936, "learning_rate": 1.7859135522205426e-05, "loss": 0.885, "step": 1436 }, { "epoch": 0.24, "grad_norm": 1.7943802487562306, "learning_rate": 1.7855844721614883e-05, "loss": 0.8864, "step": 1437 }, { "epoch": 0.24, "grad_norm": 3.0400653903432384, "learning_rate": 1.7852551697444017e-05, "loss": 0.8159, "step": 1438 }, { "epoch": 0.24, "grad_norm": 2.0954509270507153, "learning_rate": 1.7849256450624914e-05, "loss": 0.8623, "step": 1439 }, { "epoch": 0.24, "grad_norm": 0.8552670242327954, "learning_rate": 1.7845958982090287e-05, "loss": 0.4166, "step": 1440 }, { "epoch": 0.24, "grad_norm": 1.8815987670429122, "learning_rate": 1.7842659292773474e-05, "loss": 0.9008, "step": 1441 }, { "epoch": 0.24, "grad_norm": 2.1443682603663756, "learning_rate": 1.783935738360845e-05, "loss": 0.8302, "step": 1442 }, { "epoch": 0.24, "grad_norm": 1.5789539367328318, "learning_rate": 1.783605325552981e-05, "loss": 0.8664, "step": 1443 }, { "epoch": 0.24, "grad_norm": 2.0345269585948036, "learning_rate": 1.7832746909472783e-05, "loss": 0.763, "step": 1444 }, { "epoch": 0.24, "grad_norm": 1.7181721006072268, "learning_rate": 1.782943834637322e-05, "loss": 0.8774, "step": 1445 }, { "epoch": 0.24, "grad_norm": 1.9754039571736153, "learning_rate": 1.7826127567167602e-05, "loss": 0.7776, "step": 1446 }, { "epoch": 0.24, "grad_norm": 1.6294026978487683, "learning_rate": 1.7822814572793047e-05, "loss": 0.8546, "step": 1447 }, { "epoch": 0.24, "grad_norm": 2.5108321727141365, "learning_rate": 1.7819499364187282e-05, "loss": 0.7969, "step": 1448 }, { "epoch": 0.24, "grad_norm": 1.8869021977841631, "learning_rate": 1.7816181942288672e-05, "loss": 0.8143, "step": 1449 }, { "epoch": 0.24, "grad_norm": 2.8815603556045417, "learning_rate": 1.781286230803621e-05, "loss": 0.8926, "step": 1450 }, { "epoch": 0.24, "grad_norm": 1.8276184083846538, "learning_rate": 1.7809540462369505e-05, "loss": 0.7721, "step": 1451 }, { "epoch": 0.24, "grad_norm": 3.2093937563088324, "learning_rate": 1.7806216406228805e-05, "loss": 0.9033, "step": 1452 }, { "epoch": 0.24, "grad_norm": 1.7151918347517514, "learning_rate": 1.780289014055497e-05, "loss": 0.8941, "step": 1453 }, { "epoch": 0.24, "grad_norm": 2.195266642294602, "learning_rate": 1.77995616662895e-05, "loss": 0.9339, "step": 1454 }, { "epoch": 0.24, "grad_norm": 2.201668370532033, "learning_rate": 1.779623098437451e-05, "loss": 0.8939, "step": 1455 }, { "epoch": 0.24, "grad_norm": 1.4696971320191372, "learning_rate": 1.779289809575274e-05, "loss": 0.7871, "step": 1456 }, { "epoch": 0.24, "grad_norm": 2.532976449486127, "learning_rate": 1.7789563001367557e-05, "loss": 0.8736, "step": 1457 }, { "epoch": 0.24, "grad_norm": 1.8468877935194539, "learning_rate": 1.7786225702162955e-05, "loss": 0.8663, "step": 1458 }, { "epoch": 0.24, "grad_norm": 2.12920196228978, "learning_rate": 1.778288619908355e-05, "loss": 0.8131, "step": 1459 }, { "epoch": 0.24, "grad_norm": 3.754889535813794, "learning_rate": 1.777954449307458e-05, "loss": 0.7619, "step": 1460 }, { "epoch": 0.24, "grad_norm": 4.995049239539955, "learning_rate": 1.777620058508191e-05, "loss": 0.8779, "step": 1461 }, { "epoch": 0.24, "grad_norm": 3.2916566725524263, "learning_rate": 1.7772854476052023e-05, "loss": 0.9047, "step": 1462 }, { "epoch": 0.24, "grad_norm": 1.77548163879583, "learning_rate": 1.7769506166932026e-05, "loss": 0.8806, "step": 1463 }, { "epoch": 0.24, "grad_norm": 1.9708514034487061, "learning_rate": 1.7766155658669655e-05, "loss": 0.9138, "step": 1464 }, { "epoch": 0.24, "grad_norm": 2.2650214289407375, "learning_rate": 1.7762802952213264e-05, "loss": 0.8403, "step": 1465 }, { "epoch": 0.24, "grad_norm": 1.5919922049559938, "learning_rate": 1.7759448048511833e-05, "loss": 0.8396, "step": 1466 }, { "epoch": 0.24, "grad_norm": 2.0060782841030362, "learning_rate": 1.775609094851495e-05, "loss": 0.8651, "step": 1467 }, { "epoch": 0.24, "grad_norm": 2.2240568257873155, "learning_rate": 1.7752731653172847e-05, "loss": 0.8804, "step": 1468 }, { "epoch": 0.24, "grad_norm": 1.570976627549425, "learning_rate": 1.7749370163436353e-05, "loss": 0.8122, "step": 1469 }, { "epoch": 0.24, "grad_norm": 1.6944678498122536, "learning_rate": 1.7746006480256943e-05, "loss": 0.9243, "step": 1470 }, { "epoch": 0.24, "grad_norm": 1.9185203490403604, "learning_rate": 1.7742640604586694e-05, "loss": 0.8021, "step": 1471 }, { "epoch": 0.24, "grad_norm": 1.856769481579421, "learning_rate": 1.773927253737831e-05, "loss": 0.8962, "step": 1472 }, { "epoch": 0.24, "grad_norm": 2.195070614270247, "learning_rate": 1.7735902279585118e-05, "loss": 0.9206, "step": 1473 }, { "epoch": 0.24, "grad_norm": 0.8773339641054502, "learning_rate": 1.7732529832161057e-05, "loss": 0.4086, "step": 1474 }, { "epoch": 0.24, "grad_norm": 3.0526632812432877, "learning_rate": 1.7729155196060697e-05, "loss": 0.8447, "step": 1475 }, { "epoch": 0.24, "grad_norm": 2.8964018173484263, "learning_rate": 1.772577837223922e-05, "loss": 0.8473, "step": 1476 }, { "epoch": 0.24, "grad_norm": 1.6470954376052578, "learning_rate": 1.772239936165243e-05, "loss": 0.8394, "step": 1477 }, { "epoch": 0.24, "grad_norm": 1.6903988199481699, "learning_rate": 1.7719018165256745e-05, "loss": 0.8752, "step": 1478 }, { "epoch": 0.24, "grad_norm": 1.9062354339553225, "learning_rate": 1.7715634784009207e-05, "loss": 0.8937, "step": 1479 }, { "epoch": 0.24, "grad_norm": 2.7961962312303874, "learning_rate": 1.7712249218867476e-05, "loss": 0.8066, "step": 1480 }, { "epoch": 0.24, "grad_norm": 2.690211393963912, "learning_rate": 1.770886147078983e-05, "loss": 0.8724, "step": 1481 }, { "epoch": 0.24, "grad_norm": 4.3943156652578335, "learning_rate": 1.770547154073516e-05, "loss": 0.84, "step": 1482 }, { "epoch": 0.24, "grad_norm": 2.061406350541569, "learning_rate": 1.7702079429662986e-05, "loss": 0.8164, "step": 1483 }, { "epoch": 0.24, "grad_norm": 0.7257077822837118, "learning_rate": 1.769868513853343e-05, "loss": 0.4012, "step": 1484 }, { "epoch": 0.24, "grad_norm": 1.6697975649248038, "learning_rate": 1.769528866830724e-05, "loss": 0.8715, "step": 1485 }, { "epoch": 0.24, "grad_norm": 1.8912244249190426, "learning_rate": 1.7691890019945785e-05, "loss": 0.775, "step": 1486 }, { "epoch": 0.24, "grad_norm": 2.1050459875111205, "learning_rate": 1.768848919441104e-05, "loss": 0.9135, "step": 1487 }, { "epoch": 0.24, "grad_norm": 2.6615799647208753, "learning_rate": 1.7685086192665605e-05, "loss": 0.8025, "step": 1488 }, { "epoch": 0.24, "grad_norm": 2.085447552037292, "learning_rate": 1.7681681015672693e-05, "loss": 0.7308, "step": 1489 }, { "epoch": 0.24, "grad_norm": 2.3568346832141365, "learning_rate": 1.767827366439613e-05, "loss": 0.8708, "step": 1490 }, { "epoch": 0.24, "grad_norm": 2.347562269709098, "learning_rate": 1.7674864139800356e-05, "loss": 0.8644, "step": 1491 }, { "epoch": 0.25, "grad_norm": 0.7025898740669551, "learning_rate": 1.7671452442850438e-05, "loss": 0.3739, "step": 1492 }, { "epoch": 0.25, "grad_norm": 2.207095556063425, "learning_rate": 1.7668038574512045e-05, "loss": 0.8682, "step": 1493 }, { "epoch": 0.25, "grad_norm": 2.943143750305397, "learning_rate": 1.7664622535751463e-05, "loss": 0.7865, "step": 1494 }, { "epoch": 0.25, "grad_norm": 2.427091437919141, "learning_rate": 1.76612043275356e-05, "loss": 0.7725, "step": 1495 }, { "epoch": 0.25, "grad_norm": 1.5562471076991666, "learning_rate": 1.7657783950831965e-05, "loss": 0.8792, "step": 1496 }, { "epoch": 0.25, "grad_norm": 2.9125371131332796, "learning_rate": 1.76543614066087e-05, "loss": 0.8721, "step": 1497 }, { "epoch": 0.25, "grad_norm": 2.2865258506585295, "learning_rate": 1.7650936695834536e-05, "loss": 0.9748, "step": 1498 }, { "epoch": 0.25, "grad_norm": 0.7198178202571384, "learning_rate": 1.764750981947884e-05, "loss": 0.4137, "step": 1499 }, { "epoch": 0.25, "grad_norm": 2.1123713587358726, "learning_rate": 1.764408077851158e-05, "loss": 0.9264, "step": 1500 }, { "epoch": 0.25, "grad_norm": 1.7980913215591154, "learning_rate": 1.7640649573903334e-05, "loss": 0.8556, "step": 1501 }, { "epoch": 0.25, "grad_norm": 2.021418186778152, "learning_rate": 1.7637216206625303e-05, "loss": 0.8504, "step": 1502 }, { "epoch": 0.25, "grad_norm": 2.1045896369143717, "learning_rate": 1.763378067764929e-05, "loss": 0.9466, "step": 1503 }, { "epoch": 0.25, "grad_norm": 2.3372126435397362, "learning_rate": 1.7630342987947718e-05, "loss": 0.8824, "step": 1504 }, { "epoch": 0.25, "grad_norm": 1.7999011677976056, "learning_rate": 1.7626903138493614e-05, "loss": 0.8698, "step": 1505 }, { "epoch": 0.25, "grad_norm": 2.4921070660670326, "learning_rate": 1.7623461130260625e-05, "loss": 0.8528, "step": 1506 }, { "epoch": 0.25, "grad_norm": 1.8046977965477684, "learning_rate": 1.7620016964223e-05, "loss": 0.8734, "step": 1507 }, { "epoch": 0.25, "grad_norm": 2.390687203562049, "learning_rate": 1.7616570641355602e-05, "loss": 0.8763, "step": 1508 }, { "epoch": 0.25, "grad_norm": 2.4235576224526567, "learning_rate": 1.761312216263391e-05, "loss": 0.8422, "step": 1509 }, { "epoch": 0.25, "grad_norm": 2.62869855930674, "learning_rate": 1.7609671529034006e-05, "loss": 0.8466, "step": 1510 }, { "epoch": 0.25, "grad_norm": 2.355530550814703, "learning_rate": 1.7606218741532588e-05, "loss": 0.8879, "step": 1511 }, { "epoch": 0.25, "grad_norm": 2.421193936110241, "learning_rate": 1.7602763801106952e-05, "loss": 0.8879, "step": 1512 }, { "epoch": 0.25, "grad_norm": 1.8664353996016392, "learning_rate": 1.759930670873502e-05, "loss": 0.8715, "step": 1513 }, { "epoch": 0.25, "grad_norm": 2.0369857127696838, "learning_rate": 1.759584746539531e-05, "loss": 0.8714, "step": 1514 }, { "epoch": 0.25, "grad_norm": 2.3500129011721036, "learning_rate": 1.7592386072066962e-05, "loss": 0.8401, "step": 1515 }, { "epoch": 0.25, "grad_norm": 1.8288693975793788, "learning_rate": 1.7588922529729703e-05, "loss": 0.8428, "step": 1516 }, { "epoch": 0.25, "grad_norm": 2.4266534485767854, "learning_rate": 1.7585456839363886e-05, "loss": 0.8533, "step": 1517 }, { "epoch": 0.25, "grad_norm": 2.374497670983956, "learning_rate": 1.758198900195047e-05, "loss": 0.7833, "step": 1518 }, { "epoch": 0.25, "grad_norm": 3.0818185953761117, "learning_rate": 1.757851901847102e-05, "loss": 0.7402, "step": 1519 }, { "epoch": 0.25, "grad_norm": 2.336223239582647, "learning_rate": 1.7575046889907708e-05, "loss": 0.8376, "step": 1520 }, { "epoch": 0.25, "grad_norm": 1.8637194872895306, "learning_rate": 1.7571572617243307e-05, "loss": 0.8272, "step": 1521 }, { "epoch": 0.25, "grad_norm": 1.5548786165695039, "learning_rate": 1.7568096201461204e-05, "loss": 0.8627, "step": 1522 }, { "epoch": 0.25, "grad_norm": 1.83304784938162, "learning_rate": 1.7564617643545395e-05, "loss": 0.9355, "step": 1523 }, { "epoch": 0.25, "grad_norm": 2.191741418895367, "learning_rate": 1.7561136944480478e-05, "loss": 0.8303, "step": 1524 }, { "epoch": 0.25, "grad_norm": 2.3574229307814987, "learning_rate": 1.7557654105251657e-05, "loss": 0.846, "step": 1525 }, { "epoch": 0.25, "grad_norm": 1.9917945793531746, "learning_rate": 1.7554169126844736e-05, "loss": 0.8978, "step": 1526 }, { "epoch": 0.25, "grad_norm": 0.7308962610852756, "learning_rate": 1.755068201024614e-05, "loss": 0.3783, "step": 1527 }, { "epoch": 0.25, "grad_norm": 1.6604755309393837, "learning_rate": 1.7547192756442887e-05, "loss": 0.8121, "step": 1528 }, { "epoch": 0.25, "grad_norm": 2.0204521140506473, "learning_rate": 1.75437013664226e-05, "loss": 0.8308, "step": 1529 }, { "epoch": 0.25, "grad_norm": 2.0402627338339974, "learning_rate": 1.7540207841173513e-05, "loss": 0.8264, "step": 1530 }, { "epoch": 0.25, "grad_norm": 2.2499515995879835, "learning_rate": 1.753671218168446e-05, "loss": 0.8547, "step": 1531 }, { "epoch": 0.25, "grad_norm": 0.643803786418505, "learning_rate": 1.753321438894488e-05, "loss": 0.3703, "step": 1532 }, { "epoch": 0.25, "grad_norm": 2.0656925183774413, "learning_rate": 1.7529714463944815e-05, "loss": 0.7836, "step": 1533 }, { "epoch": 0.25, "grad_norm": 2.5194448925587127, "learning_rate": 1.7526212407674916e-05, "loss": 0.79, "step": 1534 }, { "epoch": 0.25, "grad_norm": 3.2136213657432156, "learning_rate": 1.7522708221126424e-05, "loss": 0.8531, "step": 1535 }, { "epoch": 0.25, "grad_norm": 1.9365891167438327, "learning_rate": 1.7519201905291195e-05, "loss": 0.7948, "step": 1536 }, { "epoch": 0.25, "grad_norm": 2.2058359513416295, "learning_rate": 1.7515693461161687e-05, "loss": 0.8953, "step": 1537 }, { "epoch": 0.25, "grad_norm": 2.314195478178499, "learning_rate": 1.751218288973096e-05, "loss": 0.8374, "step": 1538 }, { "epoch": 0.25, "grad_norm": 1.960872651836964, "learning_rate": 1.7508670191992667e-05, "loss": 0.8673, "step": 1539 }, { "epoch": 0.25, "grad_norm": 1.6170236453920799, "learning_rate": 1.7505155368941074e-05, "loss": 0.8699, "step": 1540 }, { "epoch": 0.25, "grad_norm": 1.6955364656817054, "learning_rate": 1.7501638421571045e-05, "loss": 0.8296, "step": 1541 }, { "epoch": 0.25, "grad_norm": 2.411892834676122, "learning_rate": 1.749811935087804e-05, "loss": 0.8487, "step": 1542 }, { "epoch": 0.25, "grad_norm": 2.159824436189334, "learning_rate": 1.7494598157858127e-05, "loss": 0.8548, "step": 1543 }, { "epoch": 0.25, "grad_norm": 1.680639505031137, "learning_rate": 1.7491074843507974e-05, "loss": 0.8587, "step": 1544 }, { "epoch": 0.25, "grad_norm": 1.8799091863897268, "learning_rate": 1.7487549408824845e-05, "loss": 0.7865, "step": 1545 }, { "epoch": 0.25, "grad_norm": 4.091951620712424, "learning_rate": 1.748402185480661e-05, "loss": 0.8562, "step": 1546 }, { "epoch": 0.25, "grad_norm": 1.7777047806629072, "learning_rate": 1.7480492182451735e-05, "loss": 0.8704, "step": 1547 }, { "epoch": 0.25, "grad_norm": 1.8465707968872345, "learning_rate": 1.7476960392759284e-05, "loss": 0.8915, "step": 1548 }, { "epoch": 0.25, "grad_norm": 2.5896802776039753, "learning_rate": 1.7473426486728925e-05, "loss": 0.8199, "step": 1549 }, { "epoch": 0.25, "grad_norm": 1.8513926776554988, "learning_rate": 1.746989046536092e-05, "loss": 0.8048, "step": 1550 }, { "epoch": 0.25, "grad_norm": 2.2228787690828673, "learning_rate": 1.7466352329656134e-05, "loss": 0.8386, "step": 1551 }, { "epoch": 0.25, "grad_norm": 1.3745159503822557, "learning_rate": 1.746281208061603e-05, "loss": 0.7701, "step": 1552 }, { "epoch": 0.26, "grad_norm": 3.432865251060606, "learning_rate": 1.7459269719242665e-05, "loss": 0.9215, "step": 1553 }, { "epoch": 0.26, "grad_norm": 2.9997214032780817, "learning_rate": 1.74557252465387e-05, "loss": 0.8391, "step": 1554 }, { "epoch": 0.26, "grad_norm": 1.8447782729674316, "learning_rate": 1.745217866350739e-05, "loss": 0.8525, "step": 1555 }, { "epoch": 0.26, "grad_norm": 2.0849594917078584, "learning_rate": 1.744862997115259e-05, "loss": 0.7822, "step": 1556 }, { "epoch": 0.26, "grad_norm": 7.602281233104669, "learning_rate": 1.7445079170478743e-05, "loss": 0.8259, "step": 1557 }, { "epoch": 0.26, "grad_norm": 1.7893942808767564, "learning_rate": 1.74415262624909e-05, "loss": 0.8744, "step": 1558 }, { "epoch": 0.26, "grad_norm": 2.0252869690526922, "learning_rate": 1.7437971248194706e-05, "loss": 0.7913, "step": 1559 }, { "epoch": 0.26, "grad_norm": 1.888118680831311, "learning_rate": 1.74344141285964e-05, "loss": 0.8411, "step": 1560 }, { "epoch": 0.26, "grad_norm": 1.7248208645912726, "learning_rate": 1.7430854904702806e-05, "loss": 0.9253, "step": 1561 }, { "epoch": 0.26, "grad_norm": 1.7652885476213989, "learning_rate": 1.7427293577521377e-05, "loss": 0.8104, "step": 1562 }, { "epoch": 0.26, "grad_norm": 2.7621462057252075, "learning_rate": 1.742373014806012e-05, "loss": 0.7821, "step": 1563 }, { "epoch": 0.26, "grad_norm": 2.7979634377357345, "learning_rate": 1.7420164617327662e-05, "loss": 0.8084, "step": 1564 }, { "epoch": 0.26, "grad_norm": 1.903040941406334, "learning_rate": 1.7416596986333222e-05, "loss": 0.8865, "step": 1565 }, { "epoch": 0.26, "grad_norm": 1.8550663847731397, "learning_rate": 1.7413027256086606e-05, "loss": 0.854, "step": 1566 }, { "epoch": 0.26, "grad_norm": 1.8852893944624673, "learning_rate": 1.7409455427598224e-05, "loss": 0.8298, "step": 1567 }, { "epoch": 0.26, "grad_norm": 2.386935570360993, "learning_rate": 1.740588150187907e-05, "loss": 0.8651, "step": 1568 }, { "epoch": 0.26, "grad_norm": 2.3359192350351354, "learning_rate": 1.7402305479940735e-05, "loss": 0.8679, "step": 1569 }, { "epoch": 0.26, "grad_norm": 1.9226396462694744, "learning_rate": 1.739872736279541e-05, "loss": 0.8879, "step": 1570 }, { "epoch": 0.26, "grad_norm": 1.7917741778968175, "learning_rate": 1.7395147151455868e-05, "loss": 0.8865, "step": 1571 }, { "epoch": 0.26, "grad_norm": 1.8761719711884033, "learning_rate": 1.7391564846935484e-05, "loss": 0.8717, "step": 1572 }, { "epoch": 0.26, "grad_norm": 2.174092536644497, "learning_rate": 1.7387980450248222e-05, "loss": 0.8819, "step": 1573 }, { "epoch": 0.26, "grad_norm": 2.0893025746728244, "learning_rate": 1.7384393962408633e-05, "loss": 0.8881, "step": 1574 }, { "epoch": 0.26, "grad_norm": 2.489898890878209, "learning_rate": 1.7380805384431875e-05, "loss": 0.7872, "step": 1575 }, { "epoch": 0.26, "grad_norm": 2.3444836721772377, "learning_rate": 1.7377214717333675e-05, "loss": 0.8427, "step": 1576 }, { "epoch": 0.26, "grad_norm": 2.143166224460345, "learning_rate": 1.7373621962130373e-05, "loss": 0.9715, "step": 1577 }, { "epoch": 0.26, "grad_norm": 2.291531583665644, "learning_rate": 1.7370027119838884e-05, "loss": 0.8537, "step": 1578 }, { "epoch": 0.26, "grad_norm": 2.336882724062957, "learning_rate": 1.7366430191476723e-05, "loss": 0.8146, "step": 1579 }, { "epoch": 0.26, "grad_norm": 1.7712469043470276, "learning_rate": 1.7362831178062e-05, "loss": 0.9614, "step": 1580 }, { "epoch": 0.26, "grad_norm": 2.106953381706513, "learning_rate": 1.73592300806134e-05, "loss": 0.817, "step": 1581 }, { "epoch": 0.26, "grad_norm": 0.7465184995880652, "learning_rate": 1.7355626900150202e-05, "loss": 0.4151, "step": 1582 }, { "epoch": 0.26, "grad_norm": 2.247021254901819, "learning_rate": 1.735202163769229e-05, "loss": 0.894, "step": 1583 }, { "epoch": 0.26, "grad_norm": 2.486704513136622, "learning_rate": 1.734841429426012e-05, "loss": 0.8887, "step": 1584 }, { "epoch": 0.26, "grad_norm": 1.7774456736435411, "learning_rate": 1.7344804870874744e-05, "loss": 0.9003, "step": 1585 }, { "epoch": 0.26, "grad_norm": 2.075987504503235, "learning_rate": 1.73411933685578e-05, "loss": 0.8538, "step": 1586 }, { "epoch": 0.26, "grad_norm": 2.3371312310216066, "learning_rate": 1.733757978833152e-05, "loss": 0.869, "step": 1587 }, { "epoch": 0.26, "grad_norm": 1.9207443892398042, "learning_rate": 1.7333964131218714e-05, "loss": 0.8881, "step": 1588 }, { "epoch": 0.26, "grad_norm": 2.692461213406648, "learning_rate": 1.7330346398242794e-05, "loss": 0.8292, "step": 1589 }, { "epoch": 0.26, "grad_norm": 2.4453962453536082, "learning_rate": 1.7326726590427747e-05, "loss": 0.8498, "step": 1590 }, { "epoch": 0.26, "grad_norm": 2.0040051510096974, "learning_rate": 1.732310470879815e-05, "loss": 0.7817, "step": 1591 }, { "epoch": 0.26, "grad_norm": 1.6437982043411161, "learning_rate": 1.7319480754379175e-05, "loss": 0.9389, "step": 1592 }, { "epoch": 0.26, "grad_norm": 1.8005558994364852, "learning_rate": 1.7315854728196568e-05, "loss": 0.8009, "step": 1593 }, { "epoch": 0.26, "grad_norm": 2.6380315225496243, "learning_rate": 1.7312226631276675e-05, "loss": 0.8217, "step": 1594 }, { "epoch": 0.26, "grad_norm": 2.614952810401372, "learning_rate": 1.7308596464646413e-05, "loss": 0.8698, "step": 1595 }, { "epoch": 0.26, "grad_norm": 5.384733163357381, "learning_rate": 1.7304964229333302e-05, "loss": 0.7424, "step": 1596 }, { "epoch": 0.26, "grad_norm": 2.21042978455716, "learning_rate": 1.7301329926365432e-05, "loss": 0.8188, "step": 1597 }, { "epoch": 0.26, "grad_norm": 2.2443071518591133, "learning_rate": 1.729769355677149e-05, "loss": 0.821, "step": 1598 }, { "epoch": 0.26, "grad_norm": 2.056925309535678, "learning_rate": 1.7294055121580735e-05, "loss": 0.8493, "step": 1599 }, { "epoch": 0.26, "grad_norm": 2.093816402055853, "learning_rate": 1.7290414621823026e-05, "loss": 0.8856, "step": 1600 }, { "epoch": 0.26, "grad_norm": 1.7205418873693514, "learning_rate": 1.7286772058528796e-05, "loss": 0.902, "step": 1601 }, { "epoch": 0.26, "grad_norm": 3.2365192653584356, "learning_rate": 1.7283127432729063e-05, "loss": 0.9019, "step": 1602 }, { "epoch": 0.26, "grad_norm": 1.9439309393198247, "learning_rate": 1.7279480745455433e-05, "loss": 0.8649, "step": 1603 }, { "epoch": 0.26, "grad_norm": 2.1767884202802184, "learning_rate": 1.7275831997740095e-05, "loss": 0.751, "step": 1604 }, { "epoch": 0.26, "grad_norm": 1.771044403857889, "learning_rate": 1.7272181190615812e-05, "loss": 0.8664, "step": 1605 }, { "epoch": 0.26, "grad_norm": 1.971027585047137, "learning_rate": 1.7268528325115947e-05, "loss": 0.8784, "step": 1606 }, { "epoch": 0.26, "grad_norm": 2.267193218889106, "learning_rate": 1.726487340227443e-05, "loss": 0.7294, "step": 1607 }, { "epoch": 0.26, "grad_norm": 2.135017247473641, "learning_rate": 1.7261216423125782e-05, "loss": 0.8062, "step": 1608 }, { "epoch": 0.26, "grad_norm": 1.8902053340500191, "learning_rate": 1.7257557388705098e-05, "loss": 0.8881, "step": 1609 }, { "epoch": 0.26, "grad_norm": 1.7851120919139976, "learning_rate": 1.725389630004807e-05, "loss": 0.8679, "step": 1610 }, { "epoch": 0.26, "grad_norm": 2.637550314716432, "learning_rate": 1.7250233158190948e-05, "loss": 0.8093, "step": 1611 }, { "epoch": 0.26, "grad_norm": 1.7934019071043734, "learning_rate": 1.7246567964170585e-05, "loss": 0.8589, "step": 1612 }, { "epoch": 0.26, "grad_norm": 4.320218605133056, "learning_rate": 1.724290071902441e-05, "loss": 0.8895, "step": 1613 }, { "epoch": 0.27, "grad_norm": 2.5159034094218824, "learning_rate": 1.723923142379042e-05, "loss": 0.8777, "step": 1614 }, { "epoch": 0.27, "grad_norm": 2.0226702205102853, "learning_rate": 1.7235560079507204e-05, "loss": 0.8913, "step": 1615 }, { "epoch": 0.27, "grad_norm": 1.9262965673975807, "learning_rate": 1.7231886687213936e-05, "loss": 0.765, "step": 1616 }, { "epoch": 0.27, "grad_norm": 2.13872353073941, "learning_rate": 1.722821124795035e-05, "loss": 0.8132, "step": 1617 }, { "epoch": 0.27, "grad_norm": 2.661339753595609, "learning_rate": 1.7224533762756775e-05, "loss": 0.8419, "step": 1618 }, { "epoch": 0.27, "grad_norm": 1.97815667458322, "learning_rate": 1.7220854232674127e-05, "loss": 0.8783, "step": 1619 }, { "epoch": 0.27, "grad_norm": 2.3118866270707503, "learning_rate": 1.721717265874387e-05, "loss": 0.8501, "step": 1620 }, { "epoch": 0.27, "grad_norm": 2.5822839370989192, "learning_rate": 1.721348904200808e-05, "loss": 0.8784, "step": 1621 }, { "epoch": 0.27, "grad_norm": 1.800387716079454, "learning_rate": 1.7209803383509394e-05, "loss": 0.8659, "step": 1622 }, { "epoch": 0.27, "grad_norm": 2.036686818686, "learning_rate": 1.720611568429103e-05, "loss": 0.9006, "step": 1623 }, { "epoch": 0.27, "grad_norm": 1.7058267700852414, "learning_rate": 1.7202425945396774e-05, "loss": 0.8069, "step": 1624 }, { "epoch": 0.27, "grad_norm": 2.4706698046191944, "learning_rate": 1.7198734167871015e-05, "loss": 0.8256, "step": 1625 }, { "epoch": 0.27, "grad_norm": 2.1128595646544546, "learning_rate": 1.719504035275869e-05, "loss": 0.805, "step": 1626 }, { "epoch": 0.27, "grad_norm": 1.689625093613573, "learning_rate": 1.7191344501105328e-05, "loss": 0.8881, "step": 1627 }, { "epoch": 0.27, "grad_norm": 1.6544196125605417, "learning_rate": 1.718764661395704e-05, "loss": 0.8448, "step": 1628 }, { "epoch": 0.27, "grad_norm": 3.4010658087473122, "learning_rate": 1.7183946692360495e-05, "loss": 0.88, "step": 1629 }, { "epoch": 0.27, "grad_norm": 2.416542817773903, "learning_rate": 1.7180244737362956e-05, "loss": 0.8507, "step": 1630 }, { "epoch": 0.27, "grad_norm": 1.8574656934044877, "learning_rate": 1.717654075001225e-05, "loss": 0.8082, "step": 1631 }, { "epoch": 0.27, "grad_norm": 2.3276159394698186, "learning_rate": 1.717283473135678e-05, "loss": 0.8028, "step": 1632 }, { "epoch": 0.27, "grad_norm": 1.6193120823622948, "learning_rate": 1.716912668244553e-05, "loss": 0.864, "step": 1633 }, { "epoch": 0.27, "grad_norm": 0.8502931224101196, "learning_rate": 1.7165416604328054e-05, "loss": 0.3928, "step": 1634 }, { "epoch": 0.27, "grad_norm": 1.7669980407026065, "learning_rate": 1.7161704498054485e-05, "loss": 0.8148, "step": 1635 }, { "epoch": 0.27, "grad_norm": 1.8382384928549933, "learning_rate": 1.7157990364675524e-05, "loss": 0.8199, "step": 1636 }, { "epoch": 0.27, "grad_norm": 2.052651222860288, "learning_rate": 1.7154274205242448e-05, "loss": 0.875, "step": 1637 }, { "epoch": 0.27, "grad_norm": 1.6723485360814607, "learning_rate": 1.715055602080711e-05, "loss": 0.807, "step": 1638 }, { "epoch": 0.27, "grad_norm": 1.7688466828269982, "learning_rate": 1.7146835812421937e-05, "loss": 0.8997, "step": 1639 }, { "epoch": 0.27, "grad_norm": 2.3608648178098672, "learning_rate": 1.714311358113992e-05, "loss": 0.7626, "step": 1640 }, { "epoch": 0.27, "grad_norm": 3.268006505834673, "learning_rate": 1.7139389328014634e-05, "loss": 0.8935, "step": 1641 }, { "epoch": 0.27, "grad_norm": 1.7919090697313698, "learning_rate": 1.7135663054100216e-05, "loss": 0.9092, "step": 1642 }, { "epoch": 0.27, "grad_norm": 2.277684138045734, "learning_rate": 1.7131934760451385e-05, "loss": 0.8182, "step": 1643 }, { "epoch": 0.27, "grad_norm": 2.5967773763637423, "learning_rate": 1.7128204448123422e-05, "loss": 0.8357, "step": 1644 }, { "epoch": 0.27, "grad_norm": 2.104736313412258, "learning_rate": 1.7124472118172187e-05, "loss": 0.8077, "step": 1645 }, { "epoch": 0.27, "grad_norm": 2.05767420238449, "learning_rate": 1.712073777165411e-05, "loss": 0.8365, "step": 1646 }, { "epoch": 0.27, "grad_norm": 1.7967162417038667, "learning_rate": 1.7117001409626185e-05, "loss": 0.835, "step": 1647 }, { "epoch": 0.27, "grad_norm": 1.7712715143987754, "learning_rate": 1.7113263033145985e-05, "loss": 0.8346, "step": 1648 }, { "epoch": 0.27, "grad_norm": 2.2704114906072377, "learning_rate": 1.7109522643271646e-05, "loss": 0.8325, "step": 1649 }, { "epoch": 0.27, "grad_norm": 1.788047339164699, "learning_rate": 1.7105780241061884e-05, "loss": 0.8821, "step": 1650 }, { "epoch": 0.27, "grad_norm": 5.93511567619482, "learning_rate": 1.710203582757597e-05, "loss": 0.8966, "step": 1651 }, { "epoch": 0.27, "grad_norm": 2.8631591566968435, "learning_rate": 1.7098289403873754e-05, "loss": 0.9341, "step": 1652 }, { "epoch": 0.27, "grad_norm": 2.3684374273305897, "learning_rate": 1.7094540971015663e-05, "loss": 0.8325, "step": 1653 }, { "epoch": 0.27, "grad_norm": 2.5369895718079145, "learning_rate": 1.709079053006267e-05, "loss": 0.8934, "step": 1654 }, { "epoch": 0.27, "grad_norm": 1.028483717181543, "learning_rate": 1.708703808207634e-05, "loss": 0.4075, "step": 1655 }, { "epoch": 0.27, "grad_norm": 0.7825124071577692, "learning_rate": 1.7083283628118786e-05, "loss": 0.3865, "step": 1656 }, { "epoch": 0.27, "grad_norm": 2.6936156061667496, "learning_rate": 1.7079527169252706e-05, "loss": 0.835, "step": 1657 }, { "epoch": 0.27, "grad_norm": 2.107602700767853, "learning_rate": 1.7075768706541355e-05, "loss": 0.8259, "step": 1658 }, { "epoch": 0.27, "grad_norm": 2.563042281808659, "learning_rate": 1.7072008241048555e-05, "loss": 0.8506, "step": 1659 }, { "epoch": 0.27, "grad_norm": 3.130787735441017, "learning_rate": 1.7068245773838703e-05, "loss": 0.9176, "step": 1660 }, { "epoch": 0.27, "grad_norm": 1.933698912390833, "learning_rate": 1.7064481305976754e-05, "loss": 0.8729, "step": 1661 }, { "epoch": 0.27, "grad_norm": 2.64769605857411, "learning_rate": 1.7060714838528234e-05, "loss": 0.875, "step": 1662 }, { "epoch": 0.27, "grad_norm": 1.770292856191582, "learning_rate": 1.7056946372559234e-05, "loss": 0.8776, "step": 1663 }, { "epoch": 0.27, "grad_norm": 3.0812722423835357, "learning_rate": 1.7053175909136406e-05, "loss": 0.8126, "step": 1664 }, { "epoch": 0.27, "grad_norm": 1.97963519751787, "learning_rate": 1.7049403449326982e-05, "loss": 0.8721, "step": 1665 }, { "epoch": 0.27, "grad_norm": 1.893926363524919, "learning_rate": 1.704562899419874e-05, "loss": 0.8595, "step": 1666 }, { "epoch": 0.27, "grad_norm": 2.122696150585661, "learning_rate": 1.704185254482003e-05, "loss": 0.8654, "step": 1667 }, { "epoch": 0.27, "grad_norm": 2.6068785368176015, "learning_rate": 1.7038074102259775e-05, "loss": 0.8583, "step": 1668 }, { "epoch": 0.27, "grad_norm": 1.8985450174349026, "learning_rate": 1.703429366758745e-05, "loss": 0.8501, "step": 1669 }, { "epoch": 0.27, "grad_norm": 1.7774224050694194, "learning_rate": 1.7030511241873107e-05, "loss": 0.837, "step": 1670 }, { "epoch": 0.27, "grad_norm": 2.2026845149161964, "learning_rate": 1.7026726826187343e-05, "loss": 0.8929, "step": 1671 }, { "epoch": 0.27, "grad_norm": 1.9352204246299622, "learning_rate": 1.7022940421601334e-05, "loss": 0.8439, "step": 1672 }, { "epoch": 0.27, "grad_norm": 1.8137405430006357, "learning_rate": 1.7019152029186817e-05, "loss": 0.7689, "step": 1673 }, { "epoch": 0.27, "grad_norm": 2.127130331328958, "learning_rate": 1.701536165001608e-05, "loss": 0.8075, "step": 1674 }, { "epoch": 0.28, "grad_norm": 1.7077415740094022, "learning_rate": 1.701156928516199e-05, "loss": 0.7813, "step": 1675 }, { "epoch": 0.28, "grad_norm": 1.8558281302627104, "learning_rate": 1.7007774935697966e-05, "loss": 0.8699, "step": 1676 }, { "epoch": 0.28, "grad_norm": 6.4808207417682135, "learning_rate": 1.7003978602697988e-05, "loss": 0.7883, "step": 1677 }, { "epoch": 0.28, "grad_norm": 1.742083552710024, "learning_rate": 1.70001802872366e-05, "loss": 0.8696, "step": 1678 }, { "epoch": 0.28, "grad_norm": 2.061643046743216, "learning_rate": 1.6996379990388908e-05, "loss": 0.8327, "step": 1679 }, { "epoch": 0.28, "grad_norm": 1.6441272860524925, "learning_rate": 1.6992577713230582e-05, "loss": 0.907, "step": 1680 }, { "epoch": 0.28, "grad_norm": 4.492908536805875, "learning_rate": 1.6988773456837847e-05, "loss": 0.8978, "step": 1681 }, { "epoch": 0.28, "grad_norm": 1.724660877178183, "learning_rate": 1.6984967222287484e-05, "loss": 0.8905, "step": 1682 }, { "epoch": 0.28, "grad_norm": 2.127257867582903, "learning_rate": 1.6981159010656847e-05, "loss": 0.7915, "step": 1683 }, { "epoch": 0.28, "grad_norm": 2.0102111596059875, "learning_rate": 1.6977348823023838e-05, "loss": 0.7433, "step": 1684 }, { "epoch": 0.28, "grad_norm": 2.016565946156358, "learning_rate": 1.697353666046692e-05, "loss": 0.8399, "step": 1685 }, { "epoch": 0.28, "grad_norm": 2.0124615176675333, "learning_rate": 1.6969722524065124e-05, "loss": 0.7816, "step": 1686 }, { "epoch": 0.28, "grad_norm": 2.3235966176170058, "learning_rate": 1.696590641489803e-05, "loss": 0.8195, "step": 1687 }, { "epoch": 0.28, "grad_norm": 2.196424462059843, "learning_rate": 1.6962088334045785e-05, "loss": 0.8179, "step": 1688 }, { "epoch": 0.28, "grad_norm": 1.52717840066287, "learning_rate": 1.695826828258908e-05, "loss": 0.7416, "step": 1689 }, { "epoch": 0.28, "grad_norm": 2.7208879600593514, "learning_rate": 1.6954446261609176e-05, "loss": 0.8057, "step": 1690 }, { "epoch": 0.28, "grad_norm": 6.056972888984837, "learning_rate": 1.6950622272187888e-05, "loss": 0.9123, "step": 1691 }, { "epoch": 0.28, "grad_norm": 1.81175350188784, "learning_rate": 1.6946796315407593e-05, "loss": 0.8189, "step": 1692 }, { "epoch": 0.28, "grad_norm": 2.3634685911545383, "learning_rate": 1.694296839235121e-05, "loss": 0.8556, "step": 1693 }, { "epoch": 0.28, "grad_norm": 2.567719413722125, "learning_rate": 1.6939138504102235e-05, "loss": 0.8713, "step": 1694 }, { "epoch": 0.28, "grad_norm": 2.8912497204546446, "learning_rate": 1.6935306651744704e-05, "loss": 0.8009, "step": 1695 }, { "epoch": 0.28, "grad_norm": 1.7738099746837444, "learning_rate": 1.6931472836363214e-05, "loss": 0.8113, "step": 1696 }, { "epoch": 0.28, "grad_norm": 3.1148643294522964, "learning_rate": 1.6927637059042918e-05, "loss": 0.8682, "step": 1697 }, { "epoch": 0.28, "grad_norm": 1.7650010583779516, "learning_rate": 1.692379932086953e-05, "loss": 0.4858, "step": 1698 }, { "epoch": 0.28, "grad_norm": 2.46444449534132, "learning_rate": 1.6919959622929312e-05, "loss": 0.8812, "step": 1699 }, { "epoch": 0.28, "grad_norm": 1.6959167966471136, "learning_rate": 1.691611796630908e-05, "loss": 0.814, "step": 1700 }, { "epoch": 0.28, "grad_norm": 1.8682118565649841, "learning_rate": 1.6912274352096207e-05, "loss": 0.7494, "step": 1701 }, { "epoch": 0.28, "grad_norm": 3.4994620293269416, "learning_rate": 1.690842878137862e-05, "loss": 0.7006, "step": 1702 }, { "epoch": 0.28, "grad_norm": 1.661044344118778, "learning_rate": 1.6904581255244802e-05, "loss": 0.8503, "step": 1703 }, { "epoch": 0.28, "grad_norm": 2.3041632567177324, "learning_rate": 1.690073177478379e-05, "loss": 0.8219, "step": 1704 }, { "epoch": 0.28, "grad_norm": 3.929078343168623, "learning_rate": 1.6896880341085158e-05, "loss": 0.847, "step": 1705 }, { "epoch": 0.28, "grad_norm": 0.7980723944859173, "learning_rate": 1.6893026955239062e-05, "loss": 0.3805, "step": 1706 }, { "epoch": 0.28, "grad_norm": 2.3384864530107583, "learning_rate": 1.688917161833618e-05, "loss": 0.8362, "step": 1707 }, { "epoch": 0.28, "grad_norm": 2.4565543614928993, "learning_rate": 1.688531433146777e-05, "loss": 0.8627, "step": 1708 }, { "epoch": 0.28, "grad_norm": 2.140359501933479, "learning_rate": 1.6881455095725627e-05, "loss": 0.8794, "step": 1709 }, { "epoch": 0.28, "grad_norm": 2.171042752884373, "learning_rate": 1.6877593912202094e-05, "loss": 0.9277, "step": 1710 }, { "epoch": 0.28, "grad_norm": 5.0498795420271945, "learning_rate": 1.6873730781990073e-05, "loss": 0.8499, "step": 1711 }, { "epoch": 0.28, "grad_norm": 2.1484958322166805, "learning_rate": 1.6869865706183017e-05, "loss": 0.8318, "step": 1712 }, { "epoch": 0.28, "grad_norm": 2.2859546829159667, "learning_rate": 1.6865998685874923e-05, "loss": 0.835, "step": 1713 }, { "epoch": 0.28, "grad_norm": 2.277821903836518, "learning_rate": 1.6862129722160347e-05, "loss": 0.842, "step": 1714 }, { "epoch": 0.28, "grad_norm": 1.879807742942365, "learning_rate": 1.685825881613439e-05, "loss": 0.8096, "step": 1715 }, { "epoch": 0.28, "grad_norm": 0.8299728431940573, "learning_rate": 1.6854385968892702e-05, "loss": 0.4046, "step": 1716 }, { "epoch": 0.28, "grad_norm": 2.464873426025545, "learning_rate": 1.6850511181531487e-05, "loss": 0.8263, "step": 1717 }, { "epoch": 0.28, "grad_norm": 2.0045409429089265, "learning_rate": 1.6846634455147498e-05, "loss": 0.9417, "step": 1718 }, { "epoch": 0.28, "grad_norm": 2.028981736677282, "learning_rate": 1.6842755790838025e-05, "loss": 0.8752, "step": 1719 }, { "epoch": 0.28, "grad_norm": 1.5138310680617462, "learning_rate": 1.6838875189700924e-05, "loss": 0.791, "step": 1720 }, { "epoch": 0.28, "grad_norm": 1.5831786122253744, "learning_rate": 1.6834992652834586e-05, "loss": 0.8369, "step": 1721 }, { "epoch": 0.28, "grad_norm": 1.926641517261588, "learning_rate": 1.6831108181337957e-05, "loss": 0.8003, "step": 1722 }, { "epoch": 0.28, "grad_norm": 1.8768935063225662, "learning_rate": 1.6827221776310532e-05, "loss": 0.8087, "step": 1723 }, { "epoch": 0.28, "grad_norm": 2.3267730625081704, "learning_rate": 1.6823333438852346e-05, "loss": 0.8818, "step": 1724 }, { "epoch": 0.28, "grad_norm": 2.3366450610964375, "learning_rate": 1.6819443170063983e-05, "loss": 0.837, "step": 1725 }, { "epoch": 0.28, "grad_norm": 1.9570042957493932, "learning_rate": 1.681555097104658e-05, "loss": 0.8603, "step": 1726 }, { "epoch": 0.28, "grad_norm": 2.2991936082250763, "learning_rate": 1.681165684290181e-05, "loss": 0.7935, "step": 1727 }, { "epoch": 0.28, "grad_norm": 1.9415802970722669, "learning_rate": 1.6807760786731905e-05, "loss": 0.8951, "step": 1728 }, { "epoch": 0.28, "grad_norm": 1.858766190170712, "learning_rate": 1.680386280363963e-05, "loss": 0.8891, "step": 1729 }, { "epoch": 0.28, "grad_norm": 2.3783851561412415, "learning_rate": 1.67999628947283e-05, "loss": 0.778, "step": 1730 }, { "epoch": 0.28, "grad_norm": 2.348840371328827, "learning_rate": 1.6796061061101782e-05, "loss": 0.8437, "step": 1731 }, { "epoch": 0.28, "grad_norm": 2.4352313409724378, "learning_rate": 1.6792157303864475e-05, "loss": 0.9114, "step": 1732 }, { "epoch": 0.28, "grad_norm": 2.315694584262233, "learning_rate": 1.6788251624121335e-05, "loss": 0.8438, "step": 1733 }, { "epoch": 0.28, "grad_norm": 2.232539653976177, "learning_rate": 1.678434402297785e-05, "loss": 0.9042, "step": 1734 }, { "epoch": 0.28, "grad_norm": 0.7460478072617135, "learning_rate": 1.678043450154007e-05, "loss": 0.3842, "step": 1735 }, { "epoch": 0.29, "grad_norm": 2.030861035385278, "learning_rate": 1.6776523060914565e-05, "loss": 0.9327, "step": 1736 }, { "epoch": 0.29, "grad_norm": 2.2327940819215435, "learning_rate": 1.677260970220846e-05, "loss": 0.7186, "step": 1737 }, { "epoch": 0.29, "grad_norm": 2.0860590930286955, "learning_rate": 1.6768694426529432e-05, "loss": 0.8068, "step": 1738 }, { "epoch": 0.29, "grad_norm": 2.0223060501948753, "learning_rate": 1.676477723498569e-05, "loss": 0.8159, "step": 1739 }, { "epoch": 0.29, "grad_norm": 2.710806518379919, "learning_rate": 1.6760858128685974e-05, "loss": 0.8223, "step": 1740 }, { "epoch": 0.29, "grad_norm": 2.342421962753445, "learning_rate": 1.6756937108739596e-05, "loss": 0.8304, "step": 1741 }, { "epoch": 0.29, "grad_norm": 2.558556709539942, "learning_rate": 1.675301417625638e-05, "loss": 0.802, "step": 1742 }, { "epoch": 0.29, "grad_norm": 2.5880312128828584, "learning_rate": 1.6749089332346714e-05, "loss": 0.797, "step": 1743 }, { "epoch": 0.29, "grad_norm": 2.522443701907335, "learning_rate": 1.6745162578121504e-05, "loss": 0.8367, "step": 1744 }, { "epoch": 0.29, "grad_norm": 1.6185168002522807, "learning_rate": 1.6741233914692223e-05, "loss": 0.8172, "step": 1745 }, { "epoch": 0.29, "grad_norm": 1.9329454361168303, "learning_rate": 1.6737303343170863e-05, "loss": 0.8491, "step": 1746 }, { "epoch": 0.29, "grad_norm": 2.5636291962143503, "learning_rate": 1.6733370864669965e-05, "loss": 0.7973, "step": 1747 }, { "epoch": 0.29, "grad_norm": 3.936111641933048, "learning_rate": 1.672943648030261e-05, "loss": 0.8309, "step": 1748 }, { "epoch": 0.29, "grad_norm": 1.8618805186223824, "learning_rate": 1.6725500191182415e-05, "loss": 0.8525, "step": 1749 }, { "epoch": 0.29, "grad_norm": 1.9020978012392047, "learning_rate": 1.672156199842354e-05, "loss": 0.8621, "step": 1750 }, { "epoch": 0.29, "grad_norm": 1.883990659936123, "learning_rate": 1.6717621903140686e-05, "loss": 0.8515, "step": 1751 }, { "epoch": 0.29, "grad_norm": 2.1290137331404257, "learning_rate": 1.6713679906449084e-05, "loss": 0.9379, "step": 1752 }, { "epoch": 0.29, "grad_norm": 2.684134810492437, "learning_rate": 1.6709736009464504e-05, "loss": 0.807, "step": 1753 }, { "epoch": 0.29, "grad_norm": 2.191473936948274, "learning_rate": 1.670579021330327e-05, "loss": 0.76, "step": 1754 }, { "epoch": 0.29, "grad_norm": 1.8061849879199248, "learning_rate": 1.670184251908222e-05, "loss": 0.8711, "step": 1755 }, { "epoch": 0.29, "grad_norm": 2.065693219499726, "learning_rate": 1.6697892927918742e-05, "loss": 0.8895, "step": 1756 }, { "epoch": 0.29, "grad_norm": 2.002056622876878, "learning_rate": 1.6693941440930768e-05, "loss": 0.8554, "step": 1757 }, { "epoch": 0.29, "grad_norm": 1.8827924926596937, "learning_rate": 1.668998805923675e-05, "loss": 0.8359, "step": 1758 }, { "epoch": 0.29, "grad_norm": 1.7536522643775474, "learning_rate": 1.668603278395568e-05, "loss": 0.8469, "step": 1759 }, { "epoch": 0.29, "grad_norm": 1.7405620810347324, "learning_rate": 1.6682075616207103e-05, "loss": 0.7141, "step": 1760 }, { "epoch": 0.29, "grad_norm": 2.472645458285529, "learning_rate": 1.667811655711108e-05, "loss": 0.7659, "step": 1761 }, { "epoch": 0.29, "grad_norm": 2.300304742425497, "learning_rate": 1.6674155607788214e-05, "loss": 0.8414, "step": 1762 }, { "epoch": 0.29, "grad_norm": 2.2281543280375846, "learning_rate": 1.6670192769359643e-05, "loss": 0.8587, "step": 1763 }, { "epoch": 0.29, "grad_norm": 1.7234075445912633, "learning_rate": 1.666622804294704e-05, "loss": 0.7951, "step": 1764 }, { "epoch": 0.29, "grad_norm": 0.7095758505325385, "learning_rate": 1.666226142967262e-05, "loss": 0.3709, "step": 1765 }, { "epoch": 0.29, "grad_norm": 1.9633134325005301, "learning_rate": 1.6658292930659115e-05, "loss": 0.9026, "step": 1766 }, { "epoch": 0.29, "grad_norm": 2.347680840208777, "learning_rate": 1.6654322547029803e-05, "loss": 0.8537, "step": 1767 }, { "epoch": 0.29, "grad_norm": 2.776300339635437, "learning_rate": 1.6650350279908497e-05, "loss": 0.8644, "step": 1768 }, { "epoch": 0.29, "grad_norm": 2.123079327142083, "learning_rate": 1.664637613041953e-05, "loss": 0.8563, "step": 1769 }, { "epoch": 0.29, "grad_norm": 1.7387084637606078, "learning_rate": 1.6642400099687787e-05, "loss": 0.8129, "step": 1770 }, { "epoch": 0.29, "grad_norm": 3.2129923113551486, "learning_rate": 1.6638422188838667e-05, "loss": 0.8949, "step": 1771 }, { "epoch": 0.29, "grad_norm": 2.0673787161585357, "learning_rate": 1.6634442398998115e-05, "loss": 0.8942, "step": 1772 }, { "epoch": 0.29, "grad_norm": 1.769913900522417, "learning_rate": 1.6630460731292597e-05, "loss": 0.942, "step": 1773 }, { "epoch": 0.29, "grad_norm": 0.6947512018307106, "learning_rate": 1.662647718684912e-05, "loss": 0.3788, "step": 1774 }, { "epoch": 0.29, "grad_norm": 2.2759831148892893, "learning_rate": 1.6622491766795215e-05, "loss": 0.8693, "step": 1775 }, { "epoch": 0.29, "grad_norm": 2.0953503514433605, "learning_rate": 1.661850447225895e-05, "loss": 0.9096, "step": 1776 }, { "epoch": 0.29, "grad_norm": 2.1420053374994197, "learning_rate": 1.6614515304368915e-05, "loss": 0.8642, "step": 1777 }, { "epoch": 0.29, "grad_norm": 1.7827798168602789, "learning_rate": 1.661052426425424e-05, "loss": 0.8482, "step": 1778 }, { "epoch": 0.29, "grad_norm": 0.677001588244224, "learning_rate": 1.6606531353044585e-05, "loss": 0.4021, "step": 1779 }, { "epoch": 0.29, "grad_norm": 0.6879501587029718, "learning_rate": 1.660253657187012e-05, "loss": 0.3818, "step": 1780 }, { "epoch": 0.29, "grad_norm": 2.2777943324220717, "learning_rate": 1.6598539921861573e-05, "loss": 0.7687, "step": 1781 }, { "epoch": 0.29, "grad_norm": 1.4486628960093522, "learning_rate": 1.6594541404150187e-05, "loss": 0.8333, "step": 1782 }, { "epoch": 0.29, "grad_norm": 1.7430118172318025, "learning_rate": 1.6590541019867722e-05, "loss": 0.7345, "step": 1783 }, { "epoch": 0.29, "grad_norm": 2.082919184110719, "learning_rate": 1.6586538770146495e-05, "loss": 0.7836, "step": 1784 }, { "epoch": 0.29, "grad_norm": 1.8229986694570588, "learning_rate": 1.658253465611932e-05, "loss": 0.8379, "step": 1785 }, { "epoch": 0.29, "grad_norm": 1.9814249986108528, "learning_rate": 1.6578528678919564e-05, "loss": 0.8677, "step": 1786 }, { "epoch": 0.29, "grad_norm": 0.6605203530267764, "learning_rate": 1.65745208396811e-05, "loss": 0.3657, "step": 1787 }, { "epoch": 0.29, "grad_norm": 1.9274387282843888, "learning_rate": 1.6570511139538348e-05, "loss": 0.8072, "step": 1788 }, { "epoch": 0.29, "grad_norm": 2.3151270483301674, "learning_rate": 1.6566499579626237e-05, "loss": 0.8856, "step": 1789 }, { "epoch": 0.29, "grad_norm": 2.561029953346571, "learning_rate": 1.656248616108024e-05, "loss": 0.8319, "step": 1790 }, { "epoch": 0.29, "grad_norm": 3.0382714804726167, "learning_rate": 1.655847088503634e-05, "loss": 0.9424, "step": 1791 }, { "epoch": 0.29, "grad_norm": 3.1599586621691844, "learning_rate": 1.655445375263105e-05, "loss": 0.8861, "step": 1792 }, { "epoch": 0.29, "grad_norm": 1.982165101020931, "learning_rate": 1.655043476500142e-05, "loss": 0.8428, "step": 1793 }, { "epoch": 0.29, "grad_norm": 1.8697219430540801, "learning_rate": 1.6546413923285008e-05, "loss": 0.8853, "step": 1794 }, { "epoch": 0.29, "grad_norm": 2.475999588833116, "learning_rate": 1.6542391228619906e-05, "loss": 0.7947, "step": 1795 }, { "epoch": 0.29, "grad_norm": 0.7501117656167103, "learning_rate": 1.6538366682144734e-05, "loss": 0.3961, "step": 1796 }, { "epoch": 0.3, "grad_norm": 2.6418494463413977, "learning_rate": 1.6534340284998626e-05, "loss": 0.9019, "step": 1797 }, { "epoch": 0.3, "grad_norm": 1.8841947246939421, "learning_rate": 1.6530312038321247e-05, "loss": 0.8799, "step": 1798 }, { "epoch": 0.3, "grad_norm": 1.774345334564937, "learning_rate": 1.6526281943252782e-05, "loss": 0.8423, "step": 1799 }, { "epoch": 0.3, "grad_norm": 7.3867301441338595, "learning_rate": 1.6522250000933948e-05, "loss": 0.8587, "step": 1800 }, { "epoch": 0.3, "grad_norm": 2.6374351050975045, "learning_rate": 1.6518216212505968e-05, "loss": 0.8104, "step": 1801 }, { "epoch": 0.3, "grad_norm": 1.8751848366355912, "learning_rate": 1.6514180579110606e-05, "loss": 0.7798, "step": 1802 }, { "epoch": 0.3, "grad_norm": 1.722601087668142, "learning_rate": 1.6510143101890136e-05, "loss": 0.8016, "step": 1803 }, { "epoch": 0.3, "grad_norm": 1.9008511719460381, "learning_rate": 1.6506103781987355e-05, "loss": 0.8748, "step": 1804 }, { "epoch": 0.3, "grad_norm": 1.8257349880004174, "learning_rate": 1.650206262054559e-05, "loss": 0.7641, "step": 1805 }, { "epoch": 0.3, "grad_norm": 2.063329031842509, "learning_rate": 1.6498019618708673e-05, "loss": 0.8594, "step": 1806 }, { "epoch": 0.3, "grad_norm": 1.7075785273963717, "learning_rate": 1.6493974777620976e-05, "loss": 0.8827, "step": 1807 }, { "epoch": 0.3, "grad_norm": 1.9240208477734735, "learning_rate": 1.6489928098427383e-05, "loss": 0.8435, "step": 1808 }, { "epoch": 0.3, "grad_norm": 2.11059197223388, "learning_rate": 1.648587958227329e-05, "loss": 0.847, "step": 1809 }, { "epoch": 0.3, "grad_norm": 2.1308250064252356, "learning_rate": 1.648182923030463e-05, "loss": 0.8568, "step": 1810 }, { "epoch": 0.3, "grad_norm": 1.8389794174130103, "learning_rate": 1.6477777043667846e-05, "loss": 0.8256, "step": 1811 }, { "epoch": 0.3, "grad_norm": 2.027881790457021, "learning_rate": 1.647372302350989e-05, "loss": 0.8241, "step": 1812 }, { "epoch": 0.3, "grad_norm": 1.7484712266817053, "learning_rate": 1.6469667170978258e-05, "loss": 0.788, "step": 1813 }, { "epoch": 0.3, "grad_norm": 2.1186159085085823, "learning_rate": 1.6465609487220942e-05, "loss": 0.877, "step": 1814 }, { "epoch": 0.3, "grad_norm": 2.0205771272550925, "learning_rate": 1.6461549973386464e-05, "loss": 0.8512, "step": 1815 }, { "epoch": 0.3, "grad_norm": 2.2710357061780204, "learning_rate": 1.645748863062386e-05, "loss": 0.7824, "step": 1816 }, { "epoch": 0.3, "grad_norm": 1.7568218714274244, "learning_rate": 1.6453425460082685e-05, "loss": 0.8753, "step": 1817 }, { "epoch": 0.3, "grad_norm": 1.969607189546747, "learning_rate": 1.6449360462913005e-05, "loss": 0.8318, "step": 1818 }, { "epoch": 0.3, "grad_norm": 2.3734315822374814, "learning_rate": 1.644529364026542e-05, "loss": 0.9235, "step": 1819 }, { "epoch": 0.3, "grad_norm": 1.9054904666176657, "learning_rate": 1.644122499329103e-05, "loss": 0.8699, "step": 1820 }, { "epoch": 0.3, "grad_norm": 0.878308274268544, "learning_rate": 1.6437154523141453e-05, "loss": 0.379, "step": 1821 }, { "epoch": 0.3, "grad_norm": 2.0707008759764403, "learning_rate": 1.6433082230968833e-05, "loss": 0.908, "step": 1822 }, { "epoch": 0.3, "grad_norm": 1.9223899178261543, "learning_rate": 1.642900811792582e-05, "loss": 0.9062, "step": 1823 }, { "epoch": 0.3, "grad_norm": 1.8349011613392703, "learning_rate": 1.6424932185165587e-05, "loss": 0.9252, "step": 1824 }, { "epoch": 0.3, "grad_norm": 2.02754830356114, "learning_rate": 1.6420854433841817e-05, "loss": 0.7679, "step": 1825 }, { "epoch": 0.3, "grad_norm": 0.6684209237688383, "learning_rate": 1.6416774865108706e-05, "loss": 0.38, "step": 1826 }, { "epoch": 0.3, "grad_norm": 24.26699544939561, "learning_rate": 1.641269348012097e-05, "loss": 0.856, "step": 1827 }, { "epoch": 0.3, "grad_norm": 1.718989975901777, "learning_rate": 1.640861028003383e-05, "loss": 0.8574, "step": 1828 }, { "epoch": 0.3, "grad_norm": 1.962517501739867, "learning_rate": 1.6404525266003037e-05, "loss": 0.7622, "step": 1829 }, { "epoch": 0.3, "grad_norm": 1.6431755480014938, "learning_rate": 1.6400438439184842e-05, "loss": 0.8029, "step": 1830 }, { "epoch": 0.3, "grad_norm": 1.5093249726800724, "learning_rate": 1.6396349800736012e-05, "loss": 0.8841, "step": 1831 }, { "epoch": 0.3, "grad_norm": 1.9441279741316617, "learning_rate": 1.6392259351813827e-05, "loss": 0.8362, "step": 1832 }, { "epoch": 0.3, "grad_norm": 1.865859811040008, "learning_rate": 1.6388167093576083e-05, "loss": 0.8587, "step": 1833 }, { "epoch": 0.3, "grad_norm": 2.538388219337807, "learning_rate": 1.638407302718108e-05, "loss": 0.7943, "step": 1834 }, { "epoch": 0.3, "grad_norm": 2.1916839334571616, "learning_rate": 1.6379977153787637e-05, "loss": 0.8368, "step": 1835 }, { "epoch": 0.3, "grad_norm": 2.6166204187579822, "learning_rate": 1.6375879474555084e-05, "loss": 0.8972, "step": 1836 }, { "epoch": 0.3, "grad_norm": 0.6963509306147047, "learning_rate": 1.637177999064326e-05, "loss": 0.3736, "step": 1837 }, { "epoch": 0.3, "grad_norm": 8.676417690761987, "learning_rate": 1.6367678703212515e-05, "loss": 0.917, "step": 1838 }, { "epoch": 0.3, "grad_norm": 2.288920096777723, "learning_rate": 1.636357561342371e-05, "loss": 0.8517, "step": 1839 }, { "epoch": 0.3, "grad_norm": 2.268591987752105, "learning_rate": 1.6359470722438212e-05, "loss": 0.9074, "step": 1840 }, { "epoch": 0.3, "grad_norm": 1.8037923564821556, "learning_rate": 1.6355364031417903e-05, "loss": 0.874, "step": 1841 }, { "epoch": 0.3, "grad_norm": 1.7425367714212532, "learning_rate": 1.6351255541525182e-05, "loss": 0.8762, "step": 1842 }, { "epoch": 0.3, "grad_norm": 2.2476249017299477, "learning_rate": 1.6347145253922942e-05, "loss": 0.9122, "step": 1843 }, { "epoch": 0.3, "grad_norm": 1.9271146367143592, "learning_rate": 1.6343033169774587e-05, "loss": 0.862, "step": 1844 }, { "epoch": 0.3, "grad_norm": 1.6804456539506647, "learning_rate": 1.633891929024404e-05, "loss": 0.8625, "step": 1845 }, { "epoch": 0.3, "grad_norm": 1.8331469813158643, "learning_rate": 1.6334803616495722e-05, "loss": 0.8923, "step": 1846 }, { "epoch": 0.3, "grad_norm": 1.4400053270921827, "learning_rate": 1.633068614969457e-05, "loss": 0.8312, "step": 1847 }, { "epoch": 0.3, "grad_norm": 1.9296638449184413, "learning_rate": 1.632656689100602e-05, "loss": 0.8683, "step": 1848 }, { "epoch": 0.3, "grad_norm": 1.8620193413999835, "learning_rate": 1.632244584159602e-05, "loss": 0.8936, "step": 1849 }, { "epoch": 0.3, "grad_norm": 2.029752594495288, "learning_rate": 1.631832300263103e-05, "loss": 0.8631, "step": 1850 }, { "epoch": 0.3, "grad_norm": 1.5349913418781707, "learning_rate": 1.6314198375278003e-05, "loss": 0.7816, "step": 1851 }, { "epoch": 0.3, "grad_norm": 1.800423060529115, "learning_rate": 1.6310071960704412e-05, "loss": 0.8145, "step": 1852 }, { "epoch": 0.3, "grad_norm": 1.7290919043395507, "learning_rate": 1.6305943760078226e-05, "loss": 0.8256, "step": 1853 }, { "epoch": 0.3, "grad_norm": 1.7006638289585816, "learning_rate": 1.630181377456793e-05, "loss": 0.8823, "step": 1854 }, { "epoch": 0.3, "grad_norm": 2.213090520697884, "learning_rate": 1.6297682005342497e-05, "loss": 0.8826, "step": 1855 }, { "epoch": 0.3, "grad_norm": 1.773502882074424, "learning_rate": 1.6293548453571422e-05, "loss": 0.8042, "step": 1856 }, { "epoch": 0.31, "grad_norm": 1.9418290657918793, "learning_rate": 1.62894131204247e-05, "loss": 0.7881, "step": 1857 }, { "epoch": 0.31, "grad_norm": 2.110264222845555, "learning_rate": 1.628527600707283e-05, "loss": 0.7938, "step": 1858 }, { "epoch": 0.31, "grad_norm": 1.9092256116628028, "learning_rate": 1.62811371146868e-05, "loss": 0.8432, "step": 1859 }, { "epoch": 0.31, "grad_norm": 2.1313368670302433, "learning_rate": 1.627699644443813e-05, "loss": 0.7597, "step": 1860 }, { "epoch": 0.31, "grad_norm": 1.7983763408452715, "learning_rate": 1.6272853997498822e-05, "loss": 0.8567, "step": 1861 }, { "epoch": 0.31, "grad_norm": 2.3659641396248503, "learning_rate": 1.6268709775041385e-05, "loss": 0.8895, "step": 1862 }, { "epoch": 0.31, "grad_norm": 0.7867813188639491, "learning_rate": 1.6264563778238834e-05, "loss": 0.3722, "step": 1863 }, { "epoch": 0.31, "grad_norm": 0.7273903292422771, "learning_rate": 1.6260416008264685e-05, "loss": 0.3512, "step": 1864 }, { "epoch": 0.31, "grad_norm": 0.7294837316950481, "learning_rate": 1.625626646629296e-05, "loss": 0.3867, "step": 1865 }, { "epoch": 0.31, "grad_norm": 1.5933194905045704, "learning_rate": 1.625211515349817e-05, "loss": 0.7951, "step": 1866 }, { "epoch": 0.31, "grad_norm": 1.8542832128641988, "learning_rate": 1.624796207105534e-05, "loss": 0.8803, "step": 1867 }, { "epoch": 0.31, "grad_norm": 1.847631327495523, "learning_rate": 1.6243807220139988e-05, "loss": 0.9022, "step": 1868 }, { "epoch": 0.31, "grad_norm": 1.8175713790806387, "learning_rate": 1.623965060192814e-05, "loss": 0.9249, "step": 1869 }, { "epoch": 0.31, "grad_norm": 0.9241490770906636, "learning_rate": 1.623549221759632e-05, "loss": 0.3748, "step": 1870 }, { "epoch": 0.31, "grad_norm": 1.8724963203115277, "learning_rate": 1.6231332068321538e-05, "loss": 0.8923, "step": 1871 }, { "epoch": 0.31, "grad_norm": 1.9043213543859765, "learning_rate": 1.622717015528133e-05, "loss": 0.8387, "step": 1872 }, { "epoch": 0.31, "grad_norm": 6.432503765520766, "learning_rate": 1.6223006479653708e-05, "loss": 0.856, "step": 1873 }, { "epoch": 0.31, "grad_norm": 2.0838787830695713, "learning_rate": 1.6218841042617196e-05, "loss": 0.7808, "step": 1874 }, { "epoch": 0.31, "grad_norm": 1.7003841469680725, "learning_rate": 1.621467384535081e-05, "loss": 0.7816, "step": 1875 }, { "epoch": 0.31, "grad_norm": 2.5693685954216057, "learning_rate": 1.6210504889034063e-05, "loss": 0.7906, "step": 1876 }, { "epoch": 0.31, "grad_norm": 1.8113053392906795, "learning_rate": 1.6206334174846974e-05, "loss": 0.898, "step": 1877 }, { "epoch": 0.31, "grad_norm": 1.7737939910087583, "learning_rate": 1.6202161703970057e-05, "loss": 0.8342, "step": 1878 }, { "epoch": 0.31, "grad_norm": 2.0730517554977026, "learning_rate": 1.6197987477584315e-05, "loss": 0.8564, "step": 1879 }, { "epoch": 0.31, "grad_norm": 1.5268181433097237, "learning_rate": 1.6193811496871256e-05, "loss": 0.8586, "step": 1880 }, { "epoch": 0.31, "grad_norm": 2.442071401887862, "learning_rate": 1.6189633763012885e-05, "loss": 0.8146, "step": 1881 }, { "epoch": 0.31, "grad_norm": 2.8199454507452675, "learning_rate": 1.61854542771917e-05, "loss": 0.8879, "step": 1882 }, { "epoch": 0.31, "grad_norm": 1.7388722171670763, "learning_rate": 1.6181273040590696e-05, "loss": 0.8069, "step": 1883 }, { "epoch": 0.31, "grad_norm": 2.222555525562842, "learning_rate": 1.617709005439336e-05, "loss": 0.8453, "step": 1884 }, { "epoch": 0.31, "grad_norm": 1.549653058187197, "learning_rate": 1.617290531978368e-05, "loss": 0.8744, "step": 1885 }, { "epoch": 0.31, "grad_norm": 2.057200727098878, "learning_rate": 1.6168718837946133e-05, "loss": 0.8354, "step": 1886 }, { "epoch": 0.31, "grad_norm": 3.138710148980037, "learning_rate": 1.61645306100657e-05, "loss": 0.8684, "step": 1887 }, { "epoch": 0.31, "grad_norm": 2.0317301226792144, "learning_rate": 1.616034063732785e-05, "loss": 0.8339, "step": 1888 }, { "epoch": 0.31, "grad_norm": 2.825753890429003, "learning_rate": 1.6156148920918538e-05, "loss": 0.8312, "step": 1889 }, { "epoch": 0.31, "grad_norm": 2.0886976871038736, "learning_rate": 1.6151955462024225e-05, "loss": 0.8058, "step": 1890 }, { "epoch": 0.31, "grad_norm": 1.8887585090247265, "learning_rate": 1.6147760261831866e-05, "loss": 0.8454, "step": 1891 }, { "epoch": 0.31, "grad_norm": 1.812274707803601, "learning_rate": 1.6143563321528893e-05, "loss": 0.9013, "step": 1892 }, { "epoch": 0.31, "grad_norm": 2.849900364490088, "learning_rate": 1.613936464230325e-05, "loss": 0.8655, "step": 1893 }, { "epoch": 0.31, "grad_norm": 2.1432863013846255, "learning_rate": 1.6135164225343357e-05, "loss": 0.8383, "step": 1894 }, { "epoch": 0.31, "grad_norm": 1.5738342395777822, "learning_rate": 1.613096207183814e-05, "loss": 0.9116, "step": 1895 }, { "epoch": 0.31, "grad_norm": 2.4431379837514444, "learning_rate": 1.6126758182977007e-05, "loss": 0.7744, "step": 1896 }, { "epoch": 0.31, "grad_norm": 1.508107779918409, "learning_rate": 1.612255255994986e-05, "loss": 0.8299, "step": 1897 }, { "epoch": 0.31, "grad_norm": 2.2266074916268215, "learning_rate": 1.6118345203947093e-05, "loss": 0.8855, "step": 1898 }, { "epoch": 0.31, "grad_norm": 1.6885604528105695, "learning_rate": 1.6114136116159585e-05, "loss": 0.8756, "step": 1899 }, { "epoch": 0.31, "grad_norm": 2.266327554470209, "learning_rate": 1.6109925297778717e-05, "loss": 0.8562, "step": 1900 }, { "epoch": 0.31, "grad_norm": 2.5522871315813713, "learning_rate": 1.6105712749996345e-05, "loss": 0.8434, "step": 1901 }, { "epoch": 0.31, "grad_norm": 1.7584247502422572, "learning_rate": 1.610149847400482e-05, "loss": 0.8496, "step": 1902 }, { "epoch": 0.31, "grad_norm": 1.6942587557255795, "learning_rate": 1.6097282470996997e-05, "loss": 0.858, "step": 1903 }, { "epoch": 0.31, "grad_norm": 0.8982051925590396, "learning_rate": 1.609306474216619e-05, "loss": 0.3956, "step": 1904 }, { "epoch": 0.31, "grad_norm": 0.8446240194201485, "learning_rate": 1.608884528870623e-05, "loss": 0.379, "step": 1905 }, { "epoch": 0.31, "grad_norm": 1.8599734859500767, "learning_rate": 1.608462411181142e-05, "loss": 0.8564, "step": 1906 }, { "epoch": 0.31, "grad_norm": 2.0246154402014302, "learning_rate": 1.6080401212676558e-05, "loss": 0.7852, "step": 1907 }, { "epoch": 0.31, "grad_norm": 1.7823285233912374, "learning_rate": 1.6076176592496926e-05, "loss": 0.8003, "step": 1908 }, { "epoch": 0.31, "grad_norm": 1.8718145469624783, "learning_rate": 1.6071950252468288e-05, "loss": 0.7969, "step": 1909 }, { "epoch": 0.31, "grad_norm": 2.0871652607993973, "learning_rate": 1.6067722193786907e-05, "loss": 0.9402, "step": 1910 }, { "epoch": 0.31, "grad_norm": 8.150785555821892, "learning_rate": 1.6063492417649528e-05, "loss": 0.8752, "step": 1911 }, { "epoch": 0.31, "grad_norm": 18.38293705750402, "learning_rate": 1.605926092525337e-05, "loss": 0.7821, "step": 1912 }, { "epoch": 0.31, "grad_norm": 1.9325617817379823, "learning_rate": 1.605502771779616e-05, "loss": 0.799, "step": 1913 }, { "epoch": 0.31, "grad_norm": 5.238806170783551, "learning_rate": 1.6050792796476092e-05, "loss": 0.8604, "step": 1914 }, { "epoch": 0.31, "grad_norm": 3.074271976455325, "learning_rate": 1.6046556162491852e-05, "loss": 0.9053, "step": 1915 }, { "epoch": 0.31, "grad_norm": 2.1729969399423155, "learning_rate": 1.604231781704261e-05, "loss": 0.8479, "step": 1916 }, { "epoch": 0.31, "grad_norm": 1.0193259978911349, "learning_rate": 1.6038077761328024e-05, "loss": 0.3955, "step": 1917 }, { "epoch": 0.32, "grad_norm": 1.5490538355702874, "learning_rate": 1.603383599654823e-05, "loss": 0.8784, "step": 1918 }, { "epoch": 0.32, "grad_norm": 0.8003268199303074, "learning_rate": 1.602959252390385e-05, "loss": 0.3793, "step": 1919 }, { "epoch": 0.32, "grad_norm": 1.9450510548822892, "learning_rate": 1.602534734459599e-05, "loss": 0.8323, "step": 1920 }, { "epoch": 0.32, "grad_norm": 1.6719190662510912, "learning_rate": 1.6021100459826243e-05, "loss": 0.8927, "step": 1921 }, { "epoch": 0.32, "grad_norm": 1.9171401611530068, "learning_rate": 1.601685187079668e-05, "loss": 0.8667, "step": 1922 }, { "epoch": 0.32, "grad_norm": 1.9291019829568892, "learning_rate": 1.601260157870985e-05, "loss": 0.8347, "step": 1923 }, { "epoch": 0.32, "grad_norm": 2.0503481989794197, "learning_rate": 1.6008349584768793e-05, "loss": 0.8934, "step": 1924 }, { "epoch": 0.32, "grad_norm": 2.0844984523069745, "learning_rate": 1.6004095890177026e-05, "loss": 0.805, "step": 1925 }, { "epoch": 0.32, "grad_norm": 1.8084412311659663, "learning_rate": 1.599984049613855e-05, "loss": 0.8603, "step": 1926 }, { "epoch": 0.32, "grad_norm": 2.383995504663974, "learning_rate": 1.5995583403857845e-05, "loss": 0.7509, "step": 1927 }, { "epoch": 0.32, "grad_norm": 2.5097534148653717, "learning_rate": 1.599132461453987e-05, "loss": 0.8503, "step": 1928 }, { "epoch": 0.32, "grad_norm": 3.3372921828349527, "learning_rate": 1.5987064129390066e-05, "loss": 0.7749, "step": 1929 }, { "epoch": 0.32, "grad_norm": 1.9414692701726683, "learning_rate": 1.5982801949614358e-05, "loss": 0.8267, "step": 1930 }, { "epoch": 0.32, "grad_norm": 1.639252846718301, "learning_rate": 1.5978538076419143e-05, "loss": 0.8097, "step": 1931 }, { "epoch": 0.32, "grad_norm": 2.192971275063852, "learning_rate": 1.5974272511011305e-05, "loss": 0.8333, "step": 1932 }, { "epoch": 0.32, "grad_norm": 2.0661787305500257, "learning_rate": 1.5970005254598204e-05, "loss": 0.7789, "step": 1933 }, { "epoch": 0.32, "grad_norm": 1.6620125449018874, "learning_rate": 1.5965736308387668e-05, "loss": 0.7784, "step": 1934 }, { "epoch": 0.32, "grad_norm": 1.5801504577471557, "learning_rate": 1.5961465673588027e-05, "loss": 0.8691, "step": 1935 }, { "epoch": 0.32, "grad_norm": 1.9900709982003904, "learning_rate": 1.5957193351408065e-05, "loss": 0.8002, "step": 1936 }, { "epoch": 0.32, "grad_norm": 1.740100346493199, "learning_rate": 1.595291934305706e-05, "loss": 0.7887, "step": 1937 }, { "epoch": 0.32, "grad_norm": 1.68166938487256, "learning_rate": 1.594864364974476e-05, "loss": 0.8861, "step": 1938 }, { "epoch": 0.32, "grad_norm": 2.4695173981155496, "learning_rate": 1.5944366272681386e-05, "loss": 0.8083, "step": 1939 }, { "epoch": 0.32, "grad_norm": 1.7293289562512104, "learning_rate": 1.5940087213077648e-05, "loss": 0.8581, "step": 1940 }, { "epoch": 0.32, "grad_norm": 1.827244889599664, "learning_rate": 1.593580647214472e-05, "loss": 0.8362, "step": 1941 }, { "epoch": 0.32, "grad_norm": 2.105194384485213, "learning_rate": 1.5931524051094254e-05, "loss": 0.8619, "step": 1942 }, { "epoch": 0.32, "grad_norm": 2.2437477153548766, "learning_rate": 1.592723995113839e-05, "loss": 0.7957, "step": 1943 }, { "epoch": 0.32, "grad_norm": 2.1264196393846215, "learning_rate": 1.5922954173489726e-05, "loss": 0.7876, "step": 1944 }, { "epoch": 0.32, "grad_norm": 2.4305773776181243, "learning_rate": 1.5918666719361346e-05, "loss": 0.752, "step": 1945 }, { "epoch": 0.32, "grad_norm": 2.5540198416753195, "learning_rate": 1.5914377589966798e-05, "loss": 0.8746, "step": 1946 }, { "epoch": 0.32, "grad_norm": 1.8698839826164242, "learning_rate": 1.5910086786520118e-05, "loss": 0.7805, "step": 1947 }, { "epoch": 0.32, "grad_norm": 1.3219460432881578, "learning_rate": 1.5905794310235808e-05, "loss": 0.8005, "step": 1948 }, { "epoch": 0.32, "grad_norm": 2.385244757306504, "learning_rate": 1.590150016232884e-05, "loss": 0.8522, "step": 1949 }, { "epoch": 0.32, "grad_norm": 1.7707005228648025, "learning_rate": 1.589720434401467e-05, "loss": 0.8087, "step": 1950 }, { "epoch": 0.32, "grad_norm": 1.799740869723133, "learning_rate": 1.5892906856509214e-05, "loss": 0.8438, "step": 1951 }, { "epoch": 0.32, "grad_norm": 1.569354214653461, "learning_rate": 1.5888607701028877e-05, "loss": 0.8185, "step": 1952 }, { "epoch": 0.32, "grad_norm": 1.6271044274686535, "learning_rate": 1.5884306878790512e-05, "loss": 0.8336, "step": 1953 }, { "epoch": 0.32, "grad_norm": 1.922995759167272, "learning_rate": 1.5880004391011464e-05, "loss": 0.9082, "step": 1954 }, { "epoch": 0.32, "grad_norm": 1.9504736676001495, "learning_rate": 1.5875700238909547e-05, "loss": 0.8995, "step": 1955 }, { "epoch": 0.32, "grad_norm": 1.7189187316093941, "learning_rate": 1.5871394423703036e-05, "loss": 0.8641, "step": 1956 }, { "epoch": 0.32, "grad_norm": 1.1842428712967321, "learning_rate": 1.5867086946610687e-05, "loss": 0.4414, "step": 1957 }, { "epoch": 0.32, "grad_norm": 1.5290878010254911, "learning_rate": 1.586277780885172e-05, "loss": 0.8351, "step": 1958 }, { "epoch": 0.32, "grad_norm": 1.9876864738151636, "learning_rate": 1.585846701164583e-05, "loss": 0.8333, "step": 1959 }, { "epoch": 0.32, "grad_norm": 2.103271505950095, "learning_rate": 1.585415455621318e-05, "loss": 0.8907, "step": 1960 }, { "epoch": 0.32, "grad_norm": 1.7426408763925145, "learning_rate": 1.5849840443774393e-05, "loss": 0.8273, "step": 1961 }, { "epoch": 0.32, "grad_norm": 1.607045049230483, "learning_rate": 1.584552467555058e-05, "loss": 0.9232, "step": 1962 }, { "epoch": 0.32, "grad_norm": 2.8783442915323962, "learning_rate": 1.58412072527633e-05, "loss": 0.8747, "step": 1963 }, { "epoch": 0.32, "grad_norm": 2.368396439299864, "learning_rate": 1.58368881766346e-05, "loss": 0.755, "step": 1964 }, { "epoch": 0.32, "grad_norm": 1.697699678977599, "learning_rate": 1.5832567448386985e-05, "loss": 0.8009, "step": 1965 }, { "epoch": 0.32, "grad_norm": 1.9448737218614007, "learning_rate": 1.5828245069243417e-05, "loss": 0.8325, "step": 1966 }, { "epoch": 0.32, "grad_norm": 1.8082781304003843, "learning_rate": 1.5823921040427348e-05, "loss": 0.7007, "step": 1967 }, { "epoch": 0.32, "grad_norm": 1.8288530315493772, "learning_rate": 1.5819595363162682e-05, "loss": 0.7215, "step": 1968 }, { "epoch": 0.32, "grad_norm": 2.4941338094002345, "learning_rate": 1.5815268038673786e-05, "loss": 0.8, "step": 1969 }, { "epoch": 0.32, "grad_norm": 1.9306608552764752, "learning_rate": 1.581093906818551e-05, "loss": 0.7957, "step": 1970 }, { "epoch": 0.32, "grad_norm": 1.8001591013098135, "learning_rate": 1.5806608452923158e-05, "loss": 0.9054, "step": 1971 }, { "epoch": 0.32, "grad_norm": 2.411879354430356, "learning_rate": 1.5802276194112498e-05, "loss": 0.8172, "step": 1972 }, { "epoch": 0.32, "grad_norm": 1.7865283300617805, "learning_rate": 1.5797942292979767e-05, "loss": 0.7809, "step": 1973 }, { "epoch": 0.32, "grad_norm": 2.0142342594225076, "learning_rate": 1.5793606750751668e-05, "loss": 0.7684, "step": 1974 }, { "epoch": 0.32, "grad_norm": 1.8985034780585057, "learning_rate": 1.578926956865537e-05, "loss": 0.7963, "step": 1975 }, { "epoch": 0.32, "grad_norm": 3.868818914766965, "learning_rate": 1.5784930747918492e-05, "loss": 0.8273, "step": 1976 }, { "epoch": 0.32, "grad_norm": 3.0817024924402854, "learning_rate": 1.578059028976914e-05, "loss": 0.9324, "step": 1977 }, { "epoch": 0.32, "grad_norm": 2.173166569219747, "learning_rate": 1.577624819543587e-05, "loss": 0.9264, "step": 1978 }, { "epoch": 0.33, "grad_norm": 2.2986160865875442, "learning_rate": 1.57719044661477e-05, "loss": 0.8108, "step": 1979 }, { "epoch": 0.33, "grad_norm": 2.0075026651585843, "learning_rate": 1.5767559103134114e-05, "loss": 0.8233, "step": 1980 }, { "epoch": 0.33, "grad_norm": 1.8446660125542664, "learning_rate": 1.5763212107625055e-05, "loss": 0.9411, "step": 1981 }, { "epoch": 0.33, "grad_norm": 1.666455914494619, "learning_rate": 1.5758863480850936e-05, "loss": 0.8118, "step": 1982 }, { "epoch": 0.33, "grad_norm": 1.4791015606957325, "learning_rate": 1.5754513224042625e-05, "loss": 0.838, "step": 1983 }, { "epoch": 0.33, "grad_norm": 3.0784895967029557, "learning_rate": 1.5750161338431452e-05, "loss": 0.769, "step": 1984 }, { "epoch": 0.33, "grad_norm": 2.331004946477376, "learning_rate": 1.5745807825249208e-05, "loss": 0.7738, "step": 1985 }, { "epoch": 0.33, "grad_norm": 1.4287516133527118, "learning_rate": 1.574145268572815e-05, "loss": 0.8452, "step": 1986 }, { "epoch": 0.33, "grad_norm": 2.162208531526389, "learning_rate": 1.5737095921100983e-05, "loss": 0.755, "step": 1987 }, { "epoch": 0.33, "grad_norm": 2.7918150579831886, "learning_rate": 1.573273753260089e-05, "loss": 0.8501, "step": 1988 }, { "epoch": 0.33, "grad_norm": 1.9292138869960516, "learning_rate": 1.5728377521461496e-05, "loss": 0.7757, "step": 1989 }, { "epoch": 0.33, "grad_norm": 2.3522513757035446, "learning_rate": 1.57240158889169e-05, "loss": 0.8272, "step": 1990 }, { "epoch": 0.33, "grad_norm": 2.2219544730555976, "learning_rate": 1.5719652636201646e-05, "loss": 0.8522, "step": 1991 }, { "epoch": 0.33, "grad_norm": 2.009497308841587, "learning_rate": 1.5715287764550745e-05, "loss": 0.8336, "step": 1992 }, { "epoch": 0.33, "grad_norm": 1.7174244665395644, "learning_rate": 1.571092127519967e-05, "loss": 0.789, "step": 1993 }, { "epoch": 0.33, "grad_norm": 2.1452815128389706, "learning_rate": 1.570655316938434e-05, "loss": 0.8169, "step": 1994 }, { "epoch": 0.33, "grad_norm": 1.8607011694986866, "learning_rate": 1.5702183448341143e-05, "loss": 0.8454, "step": 1995 }, { "epoch": 0.33, "grad_norm": 2.1194716935863545, "learning_rate": 1.5697812113306917e-05, "loss": 0.8622, "step": 1996 }, { "epoch": 0.33, "grad_norm": 2.1130060490291127, "learning_rate": 1.5693439165518957e-05, "loss": 0.8693, "step": 1997 }, { "epoch": 0.33, "grad_norm": 1.7844126513396419, "learning_rate": 1.568906460621502e-05, "loss": 0.7447, "step": 1998 }, { "epoch": 0.33, "grad_norm": 1.9155204848818537, "learning_rate": 1.5684688436633314e-05, "loss": 0.8372, "step": 1999 }, { "epoch": 0.33, "grad_norm": 2.704321936700019, "learning_rate": 1.5680310658012507e-05, "loss": 0.8055, "step": 2000 }, { "epoch": 0.33, "grad_norm": 1.9329491588889531, "learning_rate": 1.5675931271591717e-05, "loss": 0.8593, "step": 2001 }, { "epoch": 0.33, "grad_norm": 0.8018678315983114, "learning_rate": 1.5671550278610526e-05, "loss": 0.4042, "step": 2002 }, { "epoch": 0.33, "grad_norm": 0.7225849445115815, "learning_rate": 1.566716768030896e-05, "loss": 0.401, "step": 2003 }, { "epoch": 0.33, "grad_norm": 1.899224323267829, "learning_rate": 1.56627834779275e-05, "loss": 0.8279, "step": 2004 }, { "epoch": 0.33, "grad_norm": 4.010240433089867, "learning_rate": 1.5658397672707093e-05, "loss": 0.8231, "step": 2005 }, { "epoch": 0.33, "grad_norm": 1.9691580239821684, "learning_rate": 1.565401026588913e-05, "loss": 0.7478, "step": 2006 }, { "epoch": 0.33, "grad_norm": 2.9409719771461265, "learning_rate": 1.5649621258715454e-05, "loss": 0.7842, "step": 2007 }, { "epoch": 0.33, "grad_norm": 2.7702489416026403, "learning_rate": 1.5645230652428367e-05, "loss": 0.9464, "step": 2008 }, { "epoch": 0.33, "grad_norm": 2.1393957223904416, "learning_rate": 1.564083844827062e-05, "loss": 0.7542, "step": 2009 }, { "epoch": 0.33, "grad_norm": 2.4582433499286904, "learning_rate": 1.563644464748542e-05, "loss": 0.8137, "step": 2010 }, { "epoch": 0.33, "grad_norm": 1.9038404458470117, "learning_rate": 1.563204925131642e-05, "loss": 0.8749, "step": 2011 }, { "epoch": 0.33, "grad_norm": 2.4698906589740517, "learning_rate": 1.5627652261007726e-05, "loss": 0.9362, "step": 2012 }, { "epoch": 0.33, "grad_norm": 1.9339665476604802, "learning_rate": 1.5623253677803897e-05, "loss": 0.8287, "step": 2013 }, { "epoch": 0.33, "grad_norm": 2.1842875679403884, "learning_rate": 1.5618853502949948e-05, "loss": 0.9004, "step": 2014 }, { "epoch": 0.33, "grad_norm": 2.2705212204339302, "learning_rate": 1.5614451737691335e-05, "loss": 0.7695, "step": 2015 }, { "epoch": 0.33, "grad_norm": 2.0173085398507626, "learning_rate": 1.561004838327397e-05, "loss": 0.788, "step": 2016 }, { "epoch": 0.33, "grad_norm": 2.9225676500446514, "learning_rate": 1.5605643440944213e-05, "loss": 0.8454, "step": 2017 }, { "epoch": 0.33, "grad_norm": 1.9566739231395134, "learning_rate": 1.5601236911948876e-05, "loss": 0.8162, "step": 2018 }, { "epoch": 0.33, "grad_norm": 1.7675377125439615, "learning_rate": 1.559682879753521e-05, "loss": 0.8687, "step": 2019 }, { "epoch": 0.33, "grad_norm": 1.8128227323069843, "learning_rate": 1.559241909895093e-05, "loss": 0.8099, "step": 2020 }, { "epoch": 0.33, "grad_norm": 2.0142896557811616, "learning_rate": 1.558800781744419e-05, "loss": 0.9201, "step": 2021 }, { "epoch": 0.33, "grad_norm": 1.6709371241850444, "learning_rate": 1.5583594954263593e-05, "loss": 0.7716, "step": 2022 }, { "epoch": 0.33, "grad_norm": 2.0090043862391687, "learning_rate": 1.5579180510658187e-05, "loss": 0.8127, "step": 2023 }, { "epoch": 0.33, "grad_norm": 1.7945178826404526, "learning_rate": 1.557476448787748e-05, "loss": 0.7848, "step": 2024 }, { "epoch": 0.33, "grad_norm": 2.391024904011188, "learning_rate": 1.557034688717141e-05, "loss": 0.8679, "step": 2025 }, { "epoch": 0.33, "grad_norm": 2.0213357574483077, "learning_rate": 1.5565927709790377e-05, "loss": 0.7937, "step": 2026 }, { "epoch": 0.33, "grad_norm": 3.343602924467702, "learning_rate": 1.5561506956985213e-05, "loss": 0.9083, "step": 2027 }, { "epoch": 0.33, "grad_norm": 1.9489026096933884, "learning_rate": 1.5557084630007206e-05, "loss": 0.8617, "step": 2028 }, { "epoch": 0.33, "grad_norm": 2.670362182634909, "learning_rate": 1.5552660730108084e-05, "loss": 0.8917, "step": 2029 }, { "epoch": 0.33, "grad_norm": 2.159915085854689, "learning_rate": 1.5548235258540023e-05, "loss": 0.8303, "step": 2030 }, { "epoch": 0.33, "grad_norm": 1.9364389493638021, "learning_rate": 1.5543808216555645e-05, "loss": 0.7477, "step": 2031 }, { "epoch": 0.33, "grad_norm": 3.3037311798370057, "learning_rate": 1.5539379605408015e-05, "loss": 0.7746, "step": 2032 }, { "epoch": 0.33, "grad_norm": 1.6562332175833787, "learning_rate": 1.5534949426350642e-05, "loss": 0.7994, "step": 2033 }, { "epoch": 0.33, "grad_norm": 2.8646291920757863, "learning_rate": 1.5530517680637478e-05, "loss": 0.8372, "step": 2034 }, { "epoch": 0.33, "grad_norm": 2.725597100123372, "learning_rate": 1.552608436952292e-05, "loss": 0.8318, "step": 2035 }, { "epoch": 0.33, "grad_norm": 2.2311109962301487, "learning_rate": 1.552164949426181e-05, "loss": 0.8109, "step": 2036 }, { "epoch": 0.33, "grad_norm": 1.9362273338229359, "learning_rate": 1.551721305610942e-05, "loss": 0.8476, "step": 2037 }, { "epoch": 0.33, "grad_norm": 2.4513060689218245, "learning_rate": 1.551277505632149e-05, "loss": 0.7836, "step": 2038 }, { "epoch": 0.33, "grad_norm": 2.0688662671852582, "learning_rate": 1.550833549615417e-05, "loss": 0.7618, "step": 2039 }, { "epoch": 0.34, "grad_norm": 2.312172701681307, "learning_rate": 1.550389437686408e-05, "loss": 0.7615, "step": 2040 }, { "epoch": 0.34, "grad_norm": 2.766740407754993, "learning_rate": 1.549945169970827e-05, "loss": 0.8221, "step": 2041 }, { "epoch": 0.34, "grad_norm": 2.3167093101946437, "learning_rate": 1.549500746594422e-05, "loss": 0.9167, "step": 2042 }, { "epoch": 0.34, "grad_norm": 3.0832353620612185, "learning_rate": 1.549056167682987e-05, "loss": 0.7755, "step": 2043 }, { "epoch": 0.34, "grad_norm": 1.8633724504881548, "learning_rate": 1.5486114333623587e-05, "loss": 0.8508, "step": 2044 }, { "epoch": 0.34, "grad_norm": 1.901545764396517, "learning_rate": 1.5481665437584185e-05, "loss": 0.8757, "step": 2045 }, { "epoch": 0.34, "grad_norm": 1.8776506481336073, "learning_rate": 1.5477214989970916e-05, "loss": 0.7822, "step": 2046 }, { "epoch": 0.34, "grad_norm": 1.5496020566060742, "learning_rate": 1.547276299204346e-05, "loss": 0.4249, "step": 2047 }, { "epoch": 0.34, "grad_norm": 1.6841344353542673, "learning_rate": 1.546830944506196e-05, "loss": 0.793, "step": 2048 }, { "epoch": 0.34, "grad_norm": 1.8638666196283922, "learning_rate": 1.5463854350286972e-05, "loss": 0.7759, "step": 2049 }, { "epoch": 0.34, "grad_norm": 1.552290649898896, "learning_rate": 1.5459397708979508e-05, "loss": 0.7646, "step": 2050 }, { "epoch": 0.34, "grad_norm": 2.1539652366754027, "learning_rate": 1.5454939522401e-05, "loss": 0.8296, "step": 2051 }, { "epoch": 0.34, "grad_norm": 3.086565184014801, "learning_rate": 1.5450479791813348e-05, "loss": 0.7795, "step": 2052 }, { "epoch": 0.34, "grad_norm": 1.8399635795866065, "learning_rate": 1.544601851847885e-05, "loss": 0.7693, "step": 2053 }, { "epoch": 0.34, "grad_norm": 2.036868439305586, "learning_rate": 1.544155570366027e-05, "loss": 0.8503, "step": 2054 }, { "epoch": 0.34, "grad_norm": 2.498364659346975, "learning_rate": 1.5437091348620798e-05, "loss": 0.7589, "step": 2055 }, { "epoch": 0.34, "grad_norm": 2.466448831606738, "learning_rate": 1.5432625454624054e-05, "loss": 0.8443, "step": 2056 }, { "epoch": 0.34, "grad_norm": 2.2443267100855, "learning_rate": 1.5428158022934106e-05, "loss": 0.8499, "step": 2057 }, { "epoch": 0.34, "grad_norm": 2.008572677157006, "learning_rate": 1.542368905481545e-05, "loss": 0.8159, "step": 2058 }, { "epoch": 0.34, "grad_norm": 2.307675045827634, "learning_rate": 1.5419218551533017e-05, "loss": 0.785, "step": 2059 }, { "epoch": 0.34, "grad_norm": 2.557483503749395, "learning_rate": 1.5414746514352178e-05, "loss": 0.8065, "step": 2060 }, { "epoch": 0.34, "grad_norm": 2.123745132109018, "learning_rate": 1.5410272944538725e-05, "loss": 0.7984, "step": 2061 }, { "epoch": 0.34, "grad_norm": 2.188076678769291, "learning_rate": 1.5405797843358897e-05, "loss": 0.8091, "step": 2062 }, { "epoch": 0.34, "grad_norm": 1.9308448719620857, "learning_rate": 1.5401321212079366e-05, "loss": 0.7681, "step": 2063 }, { "epoch": 0.34, "grad_norm": 1.4215646095088788, "learning_rate": 1.5396843051967225e-05, "loss": 0.874, "step": 2064 }, { "epoch": 0.34, "grad_norm": 1.8789094657873677, "learning_rate": 1.5392363364290016e-05, "loss": 0.8642, "step": 2065 }, { "epoch": 0.34, "grad_norm": 1.7965074040171742, "learning_rate": 1.53878821503157e-05, "loss": 0.8687, "step": 2066 }, { "epoch": 0.34, "grad_norm": 2.48486581694748, "learning_rate": 1.5383399411312673e-05, "loss": 0.8916, "step": 2067 }, { "epoch": 0.34, "grad_norm": 1.9928887886826323, "learning_rate": 1.5378915148549772e-05, "loss": 0.7514, "step": 2068 }, { "epoch": 0.34, "grad_norm": 12.180249807064655, "learning_rate": 1.5374429363296252e-05, "loss": 0.8306, "step": 2069 }, { "epoch": 0.34, "grad_norm": 2.0601929842962377, "learning_rate": 1.536994205682181e-05, "loss": 0.8719, "step": 2070 }, { "epoch": 0.34, "grad_norm": 1.6315377032959848, "learning_rate": 1.536545323039657e-05, "loss": 0.7848, "step": 2071 }, { "epoch": 0.34, "grad_norm": 1.661613628511177, "learning_rate": 1.5360962885291074e-05, "loss": 0.7825, "step": 2072 }, { "epoch": 0.34, "grad_norm": 5.258092728805641, "learning_rate": 1.5356471022776315e-05, "loss": 0.8423, "step": 2073 }, { "epoch": 0.34, "grad_norm": 1.7070247260533982, "learning_rate": 1.5351977644123703e-05, "loss": 0.8233, "step": 2074 }, { "epoch": 0.34, "grad_norm": 1.0530777962727282, "learning_rate": 1.5347482750605072e-05, "loss": 0.4016, "step": 2075 }, { "epoch": 0.34, "grad_norm": 1.8400135500551364, "learning_rate": 1.5342986343492704e-05, "loss": 0.8143, "step": 2076 }, { "epoch": 0.34, "grad_norm": 2.0627282748536016, "learning_rate": 1.533848842405929e-05, "loss": 0.8658, "step": 2077 }, { "epoch": 0.34, "grad_norm": 2.6005808079768844, "learning_rate": 1.5333988993577958e-05, "loss": 0.7626, "step": 2078 }, { "epoch": 0.34, "grad_norm": 1.900174653486816, "learning_rate": 1.5329488053322266e-05, "loss": 0.8152, "step": 2079 }, { "epoch": 0.34, "grad_norm": 1.9982952426350695, "learning_rate": 1.5324985604566194e-05, "loss": 0.8083, "step": 2080 }, { "epoch": 0.34, "grad_norm": 1.6635710606796617, "learning_rate": 1.5320481648584147e-05, "loss": 0.8211, "step": 2081 }, { "epoch": 0.34, "grad_norm": 1.8482769038085674, "learning_rate": 1.5315976186650962e-05, "loss": 0.8275, "step": 2082 }, { "epoch": 0.34, "grad_norm": 2.8325878652923095, "learning_rate": 1.5311469220041903e-05, "loss": 0.8861, "step": 2083 }, { "epoch": 0.34, "grad_norm": 2.55442318436699, "learning_rate": 1.5306960750032657e-05, "loss": 0.7901, "step": 2084 }, { "epoch": 0.34, "grad_norm": 1.873628952730846, "learning_rate": 1.5302450777899332e-05, "loss": 0.7736, "step": 2085 }, { "epoch": 0.34, "grad_norm": 2.0231258382703445, "learning_rate": 1.529793930491847e-05, "loss": 0.7218, "step": 2086 }, { "epoch": 0.34, "grad_norm": 3.9710927936551244, "learning_rate": 1.5293426332367034e-05, "loss": 0.7732, "step": 2087 }, { "epoch": 0.34, "grad_norm": 1.7894079264991671, "learning_rate": 1.5288911861522413e-05, "loss": 0.7924, "step": 2088 }, { "epoch": 0.34, "grad_norm": 0.741311090646609, "learning_rate": 1.5284395893662414e-05, "loss": 0.3723, "step": 2089 }, { "epoch": 0.34, "grad_norm": 1.9780834045776048, "learning_rate": 1.5279878430065277e-05, "loss": 0.7435, "step": 2090 }, { "epoch": 0.34, "grad_norm": 0.7037791550655168, "learning_rate": 1.5275359472009656e-05, "loss": 0.4462, "step": 2091 }, { "epoch": 0.34, "grad_norm": 1.9096098536627608, "learning_rate": 1.5270839020774638e-05, "loss": 0.8825, "step": 2092 }, { "epoch": 0.34, "grad_norm": 1.887632022468453, "learning_rate": 1.526631707763972e-05, "loss": 0.8898, "step": 2093 }, { "epoch": 0.34, "grad_norm": 2.0906263969845282, "learning_rate": 1.5261793643884835e-05, "loss": 0.8483, "step": 2094 }, { "epoch": 0.34, "grad_norm": 2.2919214583153504, "learning_rate": 1.5257268720790328e-05, "loss": 0.7619, "step": 2095 }, { "epoch": 0.34, "grad_norm": 1.8602485340617558, "learning_rate": 1.5252742309636972e-05, "loss": 0.8892, "step": 2096 }, { "epoch": 0.34, "grad_norm": 2.1146271931882206, "learning_rate": 1.5248214411705955e-05, "loss": 0.7906, "step": 2097 }, { "epoch": 0.34, "grad_norm": 1.6223574039237667, "learning_rate": 1.5243685028278888e-05, "loss": 0.8496, "step": 2098 }, { "epoch": 0.34, "grad_norm": 1.8865571089483988, "learning_rate": 1.5239154160637805e-05, "loss": 0.7181, "step": 2099 }, { "epoch": 0.34, "grad_norm": 6.201091733949905, "learning_rate": 1.5234621810065164e-05, "loss": 0.7768, "step": 2100 }, { "epoch": 0.35, "grad_norm": 3.574262226828748, "learning_rate": 1.5230087977843826e-05, "loss": 0.8156, "step": 2101 }, { "epoch": 0.35, "grad_norm": 2.2055231474310184, "learning_rate": 1.5225552665257092e-05, "loss": 0.8023, "step": 2102 }, { "epoch": 0.35, "grad_norm": 2.1630856791584376, "learning_rate": 1.5221015873588672e-05, "loss": 0.7792, "step": 2103 }, { "epoch": 0.35, "grad_norm": 1.8363916116487995, "learning_rate": 1.521647760412269e-05, "loss": 0.7599, "step": 2104 }, { "epoch": 0.35, "grad_norm": 1.7720287063766407, "learning_rate": 1.5211937858143699e-05, "loss": 0.8089, "step": 2105 }, { "epoch": 0.35, "grad_norm": 1.8588681993587728, "learning_rate": 1.5207396636936662e-05, "loss": 0.8799, "step": 2106 }, { "epoch": 0.35, "grad_norm": 1.9189763353190386, "learning_rate": 1.520285394178696e-05, "loss": 0.7439, "step": 2107 }, { "epoch": 0.35, "grad_norm": 2.8403245565530786, "learning_rate": 1.5198309773980397e-05, "loss": 0.8566, "step": 2108 }, { "epoch": 0.35, "grad_norm": 1.6123978964908137, "learning_rate": 1.5193764134803188e-05, "loss": 0.8078, "step": 2109 }, { "epoch": 0.35, "grad_norm": 1.625930631123514, "learning_rate": 1.5189217025541969e-05, "loss": 0.8889, "step": 2110 }, { "epoch": 0.35, "grad_norm": 1.5962790411979082, "learning_rate": 1.5184668447483785e-05, "loss": 0.8294, "step": 2111 }, { "epoch": 0.35, "grad_norm": 2.402907894411056, "learning_rate": 1.5180118401916107e-05, "loss": 0.9068, "step": 2112 }, { "epoch": 0.35, "grad_norm": 2.030828687730218, "learning_rate": 1.5175566890126812e-05, "loss": 0.8498, "step": 2113 }, { "epoch": 0.35, "grad_norm": 1.4676135081006048, "learning_rate": 1.51710139134042e-05, "loss": 0.8383, "step": 2114 }, { "epoch": 0.35, "grad_norm": 2.2954831390273767, "learning_rate": 1.5166459473036977e-05, "loss": 0.7758, "step": 2115 }, { "epoch": 0.35, "grad_norm": 2.390275848820666, "learning_rate": 1.5161903570314268e-05, "loss": 0.8118, "step": 2116 }, { "epoch": 0.35, "grad_norm": 1.956632770981684, "learning_rate": 1.5157346206525613e-05, "loss": 0.781, "step": 2117 }, { "epoch": 0.35, "grad_norm": 1.4556983788269524, "learning_rate": 1.5152787382960968e-05, "loss": 0.8079, "step": 2118 }, { "epoch": 0.35, "grad_norm": 1.8482903174466425, "learning_rate": 1.5148227100910691e-05, "loss": 0.7594, "step": 2119 }, { "epoch": 0.35, "grad_norm": 2.2100264303902906, "learning_rate": 1.5143665361665565e-05, "loss": 0.8371, "step": 2120 }, { "epoch": 0.35, "grad_norm": 1.7609749399418613, "learning_rate": 1.5139102166516782e-05, "loss": 0.8599, "step": 2121 }, { "epoch": 0.35, "grad_norm": 1.6498390388955777, "learning_rate": 1.5134537516755938e-05, "loss": 0.8332, "step": 2122 }, { "epoch": 0.35, "grad_norm": 2.49683639346385, "learning_rate": 1.5129971413675055e-05, "loss": 0.8249, "step": 2123 }, { "epoch": 0.35, "grad_norm": 5.04194460202835, "learning_rate": 1.5125403858566552e-05, "loss": 0.8345, "step": 2124 }, { "epoch": 0.35, "grad_norm": 2.3060683747636177, "learning_rate": 1.5120834852723276e-05, "loss": 0.6801, "step": 2125 }, { "epoch": 0.35, "grad_norm": 2.7649999785966957, "learning_rate": 1.5116264397438465e-05, "loss": 0.8701, "step": 2126 }, { "epoch": 0.35, "grad_norm": 1.5915650150520038, "learning_rate": 1.511169249400578e-05, "loss": 0.8632, "step": 2127 }, { "epoch": 0.35, "grad_norm": 1.8038427619386133, "learning_rate": 1.510711914371929e-05, "loss": 0.8213, "step": 2128 }, { "epoch": 0.35, "grad_norm": 1.4931556011427864, "learning_rate": 1.5102544347873469e-05, "loss": 0.9101, "step": 2129 }, { "epoch": 0.35, "grad_norm": 1.6889487792776727, "learning_rate": 1.5097968107763205e-05, "loss": 0.8036, "step": 2130 }, { "epoch": 0.35, "grad_norm": 3.6581511595142606, "learning_rate": 1.5093390424683796e-05, "loss": 0.7681, "step": 2131 }, { "epoch": 0.35, "grad_norm": 1.9999144710629735, "learning_rate": 1.5088811299930942e-05, "loss": 0.8647, "step": 2132 }, { "epoch": 0.35, "grad_norm": 1.7043924228960288, "learning_rate": 1.5084230734800754e-05, "loss": 0.8344, "step": 2133 }, { "epoch": 0.35, "grad_norm": 2.4801649027965227, "learning_rate": 1.5079648730589753e-05, "loss": 0.8501, "step": 2134 }, { "epoch": 0.35, "grad_norm": 2.63977498207557, "learning_rate": 1.5075065288594864e-05, "loss": 0.778, "step": 2135 }, { "epoch": 0.35, "grad_norm": 0.7947944983921029, "learning_rate": 1.5070480410113427e-05, "loss": 0.3769, "step": 2136 }, { "epoch": 0.35, "grad_norm": 2.7566514493148433, "learning_rate": 1.5065894096443173e-05, "loss": 0.9088, "step": 2137 }, { "epoch": 0.35, "grad_norm": 2.9303606913341387, "learning_rate": 1.5061306348882252e-05, "loss": 0.8173, "step": 2138 }, { "epoch": 0.35, "grad_norm": 2.1617196037647353, "learning_rate": 1.505671716872922e-05, "loss": 0.8439, "step": 2139 }, { "epoch": 0.35, "grad_norm": 1.6895128727609416, "learning_rate": 1.5052126557283031e-05, "loss": 0.8829, "step": 2140 }, { "epoch": 0.35, "grad_norm": 2.0020832826141888, "learning_rate": 1.5047534515843047e-05, "loss": 0.9113, "step": 2141 }, { "epoch": 0.35, "grad_norm": 1.811140493857968, "learning_rate": 1.5042941045709039e-05, "loss": 0.8558, "step": 2142 }, { "epoch": 0.35, "grad_norm": 2.79608801567173, "learning_rate": 1.5038346148181178e-05, "loss": 0.8108, "step": 2143 }, { "epoch": 0.35, "grad_norm": 2.063130634228424, "learning_rate": 1.5033749824560037e-05, "loss": 0.833, "step": 2144 }, { "epoch": 0.35, "grad_norm": 2.19595958179818, "learning_rate": 1.50291520761466e-05, "loss": 0.8434, "step": 2145 }, { "epoch": 0.35, "grad_norm": 1.989150035964972, "learning_rate": 1.5024552904242246e-05, "loss": 0.7597, "step": 2146 }, { "epoch": 0.35, "grad_norm": 0.6714006760377983, "learning_rate": 1.5019952310148766e-05, "loss": 0.368, "step": 2147 }, { "epoch": 0.35, "grad_norm": 1.636184465093161, "learning_rate": 1.5015350295168344e-05, "loss": 0.8799, "step": 2148 }, { "epoch": 0.35, "grad_norm": 1.6143103059702, "learning_rate": 1.5010746860603575e-05, "loss": 0.8446, "step": 2149 }, { "epoch": 0.35, "grad_norm": 2.563279662732151, "learning_rate": 1.5006142007757446e-05, "loss": 0.906, "step": 2150 }, { "epoch": 0.35, "grad_norm": 2.3765040596402707, "learning_rate": 1.5001535737933355e-05, "loss": 0.7681, "step": 2151 }, { "epoch": 0.35, "grad_norm": 3.179453383099974, "learning_rate": 1.4996928052435095e-05, "loss": 0.8005, "step": 2152 }, { "epoch": 0.35, "grad_norm": 1.7028003173212223, "learning_rate": 1.4992318952566862e-05, "loss": 0.7611, "step": 2153 }, { "epoch": 0.35, "grad_norm": 2.1098247001431014, "learning_rate": 1.4987708439633255e-05, "loss": 0.9299, "step": 2154 }, { "epoch": 0.35, "grad_norm": 1.7868048415043474, "learning_rate": 1.4983096514939263e-05, "loss": 0.8463, "step": 2155 }, { "epoch": 0.35, "grad_norm": 1.655471541420493, "learning_rate": 1.497848317979029e-05, "loss": 0.8219, "step": 2156 }, { "epoch": 0.35, "grad_norm": 1.7706247194845468, "learning_rate": 1.4973868435492125e-05, "loss": 0.8665, "step": 2157 }, { "epoch": 0.35, "grad_norm": 1.887048432819098, "learning_rate": 1.4969252283350964e-05, "loss": 0.8212, "step": 2158 }, { "epoch": 0.35, "grad_norm": 1.8237347719279287, "learning_rate": 1.4964634724673397e-05, "loss": 0.812, "step": 2159 }, { "epoch": 0.35, "grad_norm": 2.42102874134759, "learning_rate": 1.4960015760766418e-05, "loss": 0.7847, "step": 2160 }, { "epoch": 0.35, "grad_norm": 0.7083975679141024, "learning_rate": 1.495539539293741e-05, "loss": 0.3748, "step": 2161 }, { "epoch": 0.36, "grad_norm": 2.9183466559804345, "learning_rate": 1.4950773622494166e-05, "loss": 0.856, "step": 2162 }, { "epoch": 0.36, "grad_norm": 1.9785254988799565, "learning_rate": 1.4946150450744859e-05, "loss": 0.8471, "step": 2163 }, { "epoch": 0.36, "grad_norm": 0.6422420011463903, "learning_rate": 1.4941525878998073e-05, "loss": 0.3588, "step": 2164 }, { "epoch": 0.36, "grad_norm": 3.5233151087074157, "learning_rate": 1.4936899908562788e-05, "loss": 0.9305, "step": 2165 }, { "epoch": 0.36, "grad_norm": 2.002527983953435, "learning_rate": 1.4932272540748366e-05, "loss": 0.8386, "step": 2166 }, { "epoch": 0.36, "grad_norm": 1.8363861248046243, "learning_rate": 1.4927643776864577e-05, "loss": 0.8559, "step": 2167 }, { "epoch": 0.36, "grad_norm": 1.785150085941153, "learning_rate": 1.4923013618221584e-05, "loss": 0.8561, "step": 2168 }, { "epoch": 0.36, "grad_norm": 10.546148335353934, "learning_rate": 1.4918382066129946e-05, "loss": 0.8276, "step": 2169 }, { "epoch": 0.36, "grad_norm": 2.174740035639297, "learning_rate": 1.4913749121900611e-05, "loss": 0.7888, "step": 2170 }, { "epoch": 0.36, "grad_norm": 1.9105249373524416, "learning_rate": 1.4909114786844925e-05, "loss": 0.9107, "step": 2171 }, { "epoch": 0.36, "grad_norm": 1.8273743515446144, "learning_rate": 1.4904479062274627e-05, "loss": 0.8055, "step": 2172 }, { "epoch": 0.36, "grad_norm": 1.7700134606901676, "learning_rate": 1.4899841949501845e-05, "loss": 0.8628, "step": 2173 }, { "epoch": 0.36, "grad_norm": 1.8622993492516662, "learning_rate": 1.4895203449839111e-05, "loss": 0.9475, "step": 2174 }, { "epoch": 0.36, "grad_norm": 1.9091406890238236, "learning_rate": 1.4890563564599337e-05, "loss": 0.812, "step": 2175 }, { "epoch": 0.36, "grad_norm": 0.6854049669191032, "learning_rate": 1.4885922295095836e-05, "loss": 0.3439, "step": 2176 }, { "epoch": 0.36, "grad_norm": 2.3720218198673892, "learning_rate": 1.4881279642642308e-05, "loss": 0.9166, "step": 2177 }, { "epoch": 0.36, "grad_norm": 1.9675405200727534, "learning_rate": 1.4876635608552845e-05, "loss": 0.8641, "step": 2178 }, { "epoch": 0.36, "grad_norm": 2.2776068055876655, "learning_rate": 1.4871990194141934e-05, "loss": 0.8164, "step": 2179 }, { "epoch": 0.36, "grad_norm": 2.750151884249917, "learning_rate": 1.486734340072445e-05, "loss": 0.8197, "step": 2180 }, { "epoch": 0.36, "grad_norm": 1.9706393383920575, "learning_rate": 1.4862695229615654e-05, "loss": 0.854, "step": 2181 }, { "epoch": 0.36, "grad_norm": 2.253569387271165, "learning_rate": 1.4858045682131203e-05, "loss": 0.8287, "step": 2182 }, { "epoch": 0.36, "grad_norm": 1.5387214264107478, "learning_rate": 1.4853394759587146e-05, "loss": 0.8283, "step": 2183 }, { "epoch": 0.36, "grad_norm": 5.230183509253853, "learning_rate": 1.4848742463299907e-05, "loss": 0.7834, "step": 2184 }, { "epoch": 0.36, "grad_norm": 1.6355972173306603, "learning_rate": 1.484408879458632e-05, "loss": 0.7547, "step": 2185 }, { "epoch": 0.36, "grad_norm": 2.295429275832243, "learning_rate": 1.4839433754763588e-05, "loss": 0.8036, "step": 2186 }, { "epoch": 0.36, "grad_norm": 1.6339329296570317, "learning_rate": 1.4834777345149313e-05, "loss": 0.9292, "step": 2187 }, { "epoch": 0.36, "grad_norm": 2.2936590354728215, "learning_rate": 1.4830119567061484e-05, "loss": 0.81, "step": 2188 }, { "epoch": 0.36, "grad_norm": 1.9078368210308763, "learning_rate": 1.4825460421818472e-05, "loss": 0.82, "step": 2189 }, { "epoch": 0.36, "grad_norm": 1.4264823583483768, "learning_rate": 1.4820799910739042e-05, "loss": 0.8461, "step": 2190 }, { "epoch": 0.36, "grad_norm": 2.426519408496212, "learning_rate": 1.4816138035142334e-05, "loss": 0.648, "step": 2191 }, { "epoch": 0.36, "grad_norm": 2.7499894788396464, "learning_rate": 1.481147479634789e-05, "loss": 0.8514, "step": 2192 }, { "epoch": 0.36, "grad_norm": 6.094882597854587, "learning_rate": 1.4806810195675627e-05, "loss": 0.9074, "step": 2193 }, { "epoch": 0.36, "grad_norm": 2.9706178025374155, "learning_rate": 1.480214423444585e-05, "loss": 0.8932, "step": 2194 }, { "epoch": 0.36, "grad_norm": 1.6598105393389202, "learning_rate": 1.4797476913979251e-05, "loss": 0.8888, "step": 2195 }, { "epoch": 0.36, "grad_norm": 2.108575954895274, "learning_rate": 1.4792808235596907e-05, "loss": 0.8021, "step": 2196 }, { "epoch": 0.36, "grad_norm": 2.0732482785080766, "learning_rate": 1.4788138200620272e-05, "loss": 0.8588, "step": 2197 }, { "epoch": 0.36, "grad_norm": 4.459215922537247, "learning_rate": 1.4783466810371195e-05, "loss": 0.8632, "step": 2198 }, { "epoch": 0.36, "grad_norm": 1.540126199598972, "learning_rate": 1.47787940661719e-05, "loss": 0.8424, "step": 2199 }, { "epoch": 0.36, "grad_norm": 2.675711180782332, "learning_rate": 1.4774119969344996e-05, "loss": 0.826, "step": 2200 }, { "epoch": 0.36, "grad_norm": 1.9359020810245071, "learning_rate": 1.4769444521213482e-05, "loss": 0.8649, "step": 2201 }, { "epoch": 0.36, "grad_norm": 1.5744409003793651, "learning_rate": 1.4764767723100729e-05, "loss": 0.8072, "step": 2202 }, { "epoch": 0.36, "grad_norm": 2.346470622917, "learning_rate": 1.4760089576330493e-05, "loss": 0.7564, "step": 2203 }, { "epoch": 0.36, "grad_norm": 2.9282245958530493, "learning_rate": 1.475541008222692e-05, "loss": 0.843, "step": 2204 }, { "epoch": 0.36, "grad_norm": 2.5233729997520395, "learning_rate": 1.4750729242114527e-05, "loss": 0.7829, "step": 2205 }, { "epoch": 0.36, "grad_norm": 2.6121402629415544, "learning_rate": 1.4746047057318217e-05, "loss": 0.8342, "step": 2206 }, { "epoch": 0.36, "grad_norm": 2.51204711807536, "learning_rate": 1.4741363529163273e-05, "loss": 0.8162, "step": 2207 }, { "epoch": 0.36, "grad_norm": 2.0013806747395595, "learning_rate": 1.4736678658975357e-05, "loss": 0.803, "step": 2208 }, { "epoch": 0.36, "grad_norm": 1.8018476570091682, "learning_rate": 1.4731992448080509e-05, "loss": 0.8252, "step": 2209 }, { "epoch": 0.36, "grad_norm": 2.2057243597129905, "learning_rate": 1.4727304897805157e-05, "loss": 0.8153, "step": 2210 }, { "epoch": 0.36, "grad_norm": 1.9161023638209593, "learning_rate": 1.47226160094761e-05, "loss": 0.7752, "step": 2211 }, { "epoch": 0.36, "grad_norm": 3.080024642308768, "learning_rate": 1.4717925784420514e-05, "loss": 0.8224, "step": 2212 }, { "epoch": 0.36, "grad_norm": 2.027568390523702, "learning_rate": 1.471323422396596e-05, "loss": 0.8787, "step": 2213 }, { "epoch": 0.36, "grad_norm": 2.152136376263146, "learning_rate": 1.4708541329440375e-05, "loss": 0.8811, "step": 2214 }, { "epoch": 0.36, "grad_norm": 1.8459128137439622, "learning_rate": 1.4703847102172074e-05, "loss": 0.7616, "step": 2215 }, { "epoch": 0.36, "grad_norm": 3.5031221027193773, "learning_rate": 1.4699151543489745e-05, "loss": 0.7956, "step": 2216 }, { "epoch": 0.36, "grad_norm": 22.97093678012669, "learning_rate": 1.4694454654722459e-05, "loss": 0.7622, "step": 2217 }, { "epoch": 0.36, "grad_norm": 4.194816040523114, "learning_rate": 1.4689756437199658e-05, "loss": 0.8287, "step": 2218 }, { "epoch": 0.36, "grad_norm": 2.0621175097052595, "learning_rate": 1.4685056892251167e-05, "loss": 0.9222, "step": 2219 }, { "epoch": 0.36, "grad_norm": 2.1285508812915466, "learning_rate": 1.4680356021207176e-05, "loss": 0.849, "step": 2220 }, { "epoch": 0.36, "grad_norm": 1.7846122273146852, "learning_rate": 1.4675653825398261e-05, "loss": 0.8495, "step": 2221 }, { "epoch": 0.36, "grad_norm": 3.1393573177590524, "learning_rate": 1.4670950306155368e-05, "loss": 0.7771, "step": 2222 }, { "epoch": 0.37, "grad_norm": 2.171576434660035, "learning_rate": 1.4666245464809818e-05, "loss": 0.7797, "step": 2223 }, { "epoch": 0.37, "grad_norm": 1.7698896855928694, "learning_rate": 1.4661539302693306e-05, "loss": 0.7995, "step": 2224 }, { "epoch": 0.37, "grad_norm": 2.360275155716215, "learning_rate": 1.46568318211379e-05, "loss": 0.8282, "step": 2225 }, { "epoch": 0.37, "grad_norm": 1.82790048873513, "learning_rate": 1.4652123021476044e-05, "loss": 0.8874, "step": 2226 }, { "epoch": 0.37, "grad_norm": 1.6802753340405483, "learning_rate": 1.4647412905040553e-05, "loss": 0.7905, "step": 2227 }, { "epoch": 0.37, "grad_norm": 1.6339847974374806, "learning_rate": 1.4642701473164618e-05, "loss": 0.7273, "step": 2228 }, { "epoch": 0.37, "grad_norm": 1.9896588285262578, "learning_rate": 1.4637988727181798e-05, "loss": 0.8714, "step": 2229 }, { "epoch": 0.37, "grad_norm": 1.9850917611423662, "learning_rate": 1.4633274668426028e-05, "loss": 0.8281, "step": 2230 }, { "epoch": 0.37, "grad_norm": 1.796633429152202, "learning_rate": 1.462855929823161e-05, "loss": 0.9171, "step": 2231 }, { "epoch": 0.37, "grad_norm": 1.9730710146998347, "learning_rate": 1.4623842617933219e-05, "loss": 0.8229, "step": 2232 }, { "epoch": 0.37, "grad_norm": 1.8217875657578972, "learning_rate": 1.4619124628865904e-05, "loss": 0.8941, "step": 2233 }, { "epoch": 0.37, "grad_norm": 2.0999643885160504, "learning_rate": 1.461440533236508e-05, "loss": 0.7839, "step": 2234 }, { "epoch": 0.37, "grad_norm": 1.6702232739182776, "learning_rate": 1.4609684729766536e-05, "loss": 0.8044, "step": 2235 }, { "epoch": 0.37, "grad_norm": 3.2189797202604886, "learning_rate": 1.4604962822406426e-05, "loss": 0.8274, "step": 2236 }, { "epoch": 0.37, "grad_norm": 2.0427812799226834, "learning_rate": 1.4600239611621274e-05, "loss": 0.8853, "step": 2237 }, { "epoch": 0.37, "grad_norm": 1.9562127243520582, "learning_rate": 1.459551509874798e-05, "loss": 0.7932, "step": 2238 }, { "epoch": 0.37, "grad_norm": 1.684797975274365, "learning_rate": 1.4590789285123808e-05, "loss": 0.7848, "step": 2239 }, { "epoch": 0.37, "grad_norm": 0.7328565027285715, "learning_rate": 1.4586062172086383e-05, "loss": 0.3673, "step": 2240 }, { "epoch": 0.37, "grad_norm": 1.6633703656181116, "learning_rate": 1.4581333760973713e-05, "loss": 0.8826, "step": 2241 }, { "epoch": 0.37, "grad_norm": 1.919826407226316, "learning_rate": 1.4576604053124154e-05, "loss": 0.7439, "step": 2242 }, { "epoch": 0.37, "grad_norm": 1.953945474864905, "learning_rate": 1.4571873049876452e-05, "loss": 0.8592, "step": 2243 }, { "epoch": 0.37, "grad_norm": 2.0146378322884546, "learning_rate": 1.4567140752569701e-05, "loss": 0.9097, "step": 2244 }, { "epoch": 0.37, "grad_norm": 1.8041975532584993, "learning_rate": 1.4562407162543367e-05, "loss": 0.789, "step": 2245 }, { "epoch": 0.37, "grad_norm": 2.1814224545293164, "learning_rate": 1.4557672281137286e-05, "loss": 0.7909, "step": 2246 }, { "epoch": 0.37, "grad_norm": 1.6888358469637015, "learning_rate": 1.455293610969165e-05, "loss": 0.8796, "step": 2247 }, { "epoch": 0.37, "grad_norm": 1.6054848117196423, "learning_rate": 1.454819864954703e-05, "loss": 0.7332, "step": 2248 }, { "epoch": 0.37, "grad_norm": 1.9808077330840037, "learning_rate": 1.4543459902044347e-05, "loss": 0.6897, "step": 2249 }, { "epoch": 0.37, "grad_norm": 2.509494039071811, "learning_rate": 1.45387198685249e-05, "loss": 0.9034, "step": 2250 }, { "epoch": 0.37, "grad_norm": 0.6540429986295979, "learning_rate": 1.4533978550330343e-05, "loss": 0.3666, "step": 2251 }, { "epoch": 0.37, "grad_norm": 1.6402829696604626, "learning_rate": 1.4529235948802696e-05, "loss": 0.7943, "step": 2252 }, { "epoch": 0.37, "grad_norm": 1.7519359945723454, "learning_rate": 1.4524492065284344e-05, "loss": 0.8035, "step": 2253 }, { "epoch": 0.37, "grad_norm": 2.108872760559317, "learning_rate": 1.4519746901118029e-05, "loss": 0.9056, "step": 2254 }, { "epoch": 0.37, "grad_norm": 2.026509102220706, "learning_rate": 1.4515000457646866e-05, "loss": 0.7814, "step": 2255 }, { "epoch": 0.37, "grad_norm": 1.6094360126814775, "learning_rate": 1.4510252736214318e-05, "loss": 0.7906, "step": 2256 }, { "epoch": 0.37, "grad_norm": 1.8201427212397945, "learning_rate": 1.4505503738164225e-05, "loss": 0.784, "step": 2257 }, { "epoch": 0.37, "grad_norm": 1.7422978629282324, "learning_rate": 1.4500753464840775e-05, "loss": 0.8171, "step": 2258 }, { "epoch": 0.37, "grad_norm": 1.6892019392212163, "learning_rate": 1.4496001917588528e-05, "loss": 0.8068, "step": 2259 }, { "epoch": 0.37, "grad_norm": 2.729500411767145, "learning_rate": 1.4491249097752393e-05, "loss": 0.8315, "step": 2260 }, { "epoch": 0.37, "grad_norm": 3.0835829419079217, "learning_rate": 1.448649500667765e-05, "loss": 0.854, "step": 2261 }, { "epoch": 0.37, "grad_norm": 2.6731101241079656, "learning_rate": 1.4481739645709935e-05, "loss": 0.7442, "step": 2262 }, { "epoch": 0.37, "grad_norm": 2.3997982376712925, "learning_rate": 1.4476983016195245e-05, "loss": 0.8669, "step": 2263 }, { "epoch": 0.37, "grad_norm": 1.8313358378161733, "learning_rate": 1.4472225119479928e-05, "loss": 0.8736, "step": 2264 }, { "epoch": 0.37, "grad_norm": 1.641689830069236, "learning_rate": 1.4467465956910704e-05, "loss": 0.8394, "step": 2265 }, { "epoch": 0.37, "grad_norm": 0.693854659307265, "learning_rate": 1.4462705529834635e-05, "loss": 0.3634, "step": 2266 }, { "epoch": 0.37, "grad_norm": 1.9858645369063024, "learning_rate": 1.4457943839599158e-05, "loss": 0.8225, "step": 2267 }, { "epoch": 0.37, "grad_norm": 2.5468604073453194, "learning_rate": 1.4453180887552052e-05, "loss": 0.7412, "step": 2268 }, { "epoch": 0.37, "grad_norm": 1.7812041470777449, "learning_rate": 1.4448416675041465e-05, "loss": 0.837, "step": 2269 }, { "epoch": 0.37, "grad_norm": 2.3066674767686752, "learning_rate": 1.44436512034159e-05, "loss": 0.8329, "step": 2270 }, { "epoch": 0.37, "grad_norm": 1.8967098143354546, "learning_rate": 1.443888447402421e-05, "loss": 0.7418, "step": 2271 }, { "epoch": 0.37, "grad_norm": 2.4463078735188177, "learning_rate": 1.4434116488215603e-05, "loss": 0.8788, "step": 2272 }, { "epoch": 0.37, "grad_norm": 3.3886679412693748, "learning_rate": 1.4429347247339656e-05, "loss": 0.802, "step": 2273 }, { "epoch": 0.37, "grad_norm": 9.165103782076205, "learning_rate": 1.4424576752746288e-05, "loss": 0.8107, "step": 2274 }, { "epoch": 0.37, "grad_norm": 2.024556721687842, "learning_rate": 1.4419805005785783e-05, "loss": 0.8574, "step": 2275 }, { "epoch": 0.37, "grad_norm": 2.3379222119534524, "learning_rate": 1.4415032007808767e-05, "loss": 0.7909, "step": 2276 }, { "epoch": 0.37, "grad_norm": 0.7530440511219118, "learning_rate": 1.441025776016623e-05, "loss": 0.4132, "step": 2277 }, { "epoch": 0.37, "grad_norm": 2.149722599822267, "learning_rate": 1.4405482264209512e-05, "loss": 0.8778, "step": 2278 }, { "epoch": 0.37, "grad_norm": 2.3674230767708795, "learning_rate": 1.4400705521290306e-05, "loss": 0.8346, "step": 2279 }, { "epoch": 0.37, "grad_norm": 2.0030696310939513, "learning_rate": 1.4395927532760664e-05, "loss": 0.8666, "step": 2280 }, { "epoch": 0.37, "grad_norm": 1.704968163525502, "learning_rate": 1.4391148299972978e-05, "loss": 0.7574, "step": 2281 }, { "epoch": 0.37, "grad_norm": 1.9467365959450882, "learning_rate": 1.4386367824280006e-05, "loss": 0.7205, "step": 2282 }, { "epoch": 0.37, "grad_norm": 1.8558894466907179, "learning_rate": 1.4381586107034849e-05, "loss": 0.7976, "step": 2283 }, { "epoch": 0.38, "grad_norm": 1.5431164548996337, "learning_rate": 1.437680314959096e-05, "loss": 0.8253, "step": 2284 }, { "epoch": 0.38, "grad_norm": 2.2841782162924957, "learning_rate": 1.437201895330215e-05, "loss": 0.8058, "step": 2285 }, { "epoch": 0.38, "grad_norm": 2.706865525051572, "learning_rate": 1.4367233519522571e-05, "loss": 0.761, "step": 2286 }, { "epoch": 0.38, "grad_norm": 2.003799656192294, "learning_rate": 1.4362446849606737e-05, "loss": 0.8276, "step": 2287 }, { "epoch": 0.38, "grad_norm": 1.7362896933307976, "learning_rate": 1.4357658944909496e-05, "loss": 0.8856, "step": 2288 }, { "epoch": 0.38, "grad_norm": 2.2042311539329305, "learning_rate": 1.4352869806786061e-05, "loss": 0.8091, "step": 2289 }, { "epoch": 0.38, "grad_norm": 0.6732077623604918, "learning_rate": 1.4348079436591982e-05, "loss": 0.3838, "step": 2290 }, { "epoch": 0.38, "grad_norm": 2.1505991094719406, "learning_rate": 1.4343287835683168e-05, "loss": 0.7253, "step": 2291 }, { "epoch": 0.38, "grad_norm": 1.5079645802457395, "learning_rate": 1.4338495005415869e-05, "loss": 0.7875, "step": 2292 }, { "epoch": 0.38, "grad_norm": 2.091766821149372, "learning_rate": 1.4333700947146686e-05, "loss": 0.8333, "step": 2293 }, { "epoch": 0.38, "grad_norm": 2.784063619762263, "learning_rate": 1.4328905662232567e-05, "loss": 0.8682, "step": 2294 }, { "epoch": 0.38, "grad_norm": 1.9706599565254341, "learning_rate": 1.4324109152030807e-05, "loss": 0.8457, "step": 2295 }, { "epoch": 0.38, "grad_norm": 1.7691637548350088, "learning_rate": 1.4319311417899048e-05, "loss": 0.8372, "step": 2296 }, { "epoch": 0.38, "grad_norm": 1.9647512021034677, "learning_rate": 1.431451246119528e-05, "loss": 0.8579, "step": 2297 }, { "epoch": 0.38, "grad_norm": 2.1854088032480132, "learning_rate": 1.4309712283277839e-05, "loss": 0.6898, "step": 2298 }, { "epoch": 0.38, "grad_norm": 2.1597299179174656, "learning_rate": 1.4304910885505404e-05, "loss": 0.8032, "step": 2299 }, { "epoch": 0.38, "grad_norm": 1.7503078580746103, "learning_rate": 1.4300108269236997e-05, "loss": 0.9081, "step": 2300 }, { "epoch": 0.38, "grad_norm": 1.707462679807849, "learning_rate": 1.429530443583199e-05, "loss": 0.7888, "step": 2301 }, { "epoch": 0.38, "grad_norm": 1.9682549958878497, "learning_rate": 1.4290499386650099e-05, "loss": 0.7636, "step": 2302 }, { "epoch": 0.38, "grad_norm": 0.6693774209919787, "learning_rate": 1.4285693123051385e-05, "loss": 0.3711, "step": 2303 }, { "epoch": 0.38, "grad_norm": 1.9567669428906813, "learning_rate": 1.4280885646396248e-05, "loss": 0.7975, "step": 2304 }, { "epoch": 0.38, "grad_norm": 2.0113362430290227, "learning_rate": 1.4276076958045436e-05, "loss": 0.7501, "step": 2305 }, { "epoch": 0.38, "grad_norm": 2.5116969037659933, "learning_rate": 1.4271267059360035e-05, "loss": 0.8518, "step": 2306 }, { "epoch": 0.38, "grad_norm": 1.6548625780316124, "learning_rate": 1.4266455951701476e-05, "loss": 0.8857, "step": 2307 }, { "epoch": 0.38, "grad_norm": 6.141601334972292, "learning_rate": 1.4261643636431539e-05, "loss": 0.8194, "step": 2308 }, { "epoch": 0.38, "grad_norm": 0.6444347174349033, "learning_rate": 1.4256830114912341e-05, "loss": 0.3856, "step": 2309 }, { "epoch": 0.38, "grad_norm": 1.7067794112935784, "learning_rate": 1.4252015388506328e-05, "loss": 0.841, "step": 2310 }, { "epoch": 0.38, "grad_norm": 6.906242381776612, "learning_rate": 1.4247199458576308e-05, "loss": 0.8842, "step": 2311 }, { "epoch": 0.38, "grad_norm": 2.029977387510528, "learning_rate": 1.4242382326485416e-05, "loss": 0.7937, "step": 2312 }, { "epoch": 0.38, "grad_norm": 1.690114740257253, "learning_rate": 1.4237563993597133e-05, "loss": 0.836, "step": 2313 }, { "epoch": 0.38, "grad_norm": 1.8502603234055852, "learning_rate": 1.4232744461275273e-05, "loss": 0.7987, "step": 2314 }, { "epoch": 0.38, "grad_norm": 1.6740034512702422, "learning_rate": 1.4227923730884001e-05, "loss": 0.8592, "step": 2315 }, { "epoch": 0.38, "grad_norm": 1.7416094866886265, "learning_rate": 1.4223101803787811e-05, "loss": 0.9098, "step": 2316 }, { "epoch": 0.38, "grad_norm": 2.5953458197199972, "learning_rate": 1.421827868135154e-05, "loss": 0.7989, "step": 2317 }, { "epoch": 0.38, "grad_norm": 1.7868004120726877, "learning_rate": 1.4213454364940362e-05, "loss": 0.8608, "step": 2318 }, { "epoch": 0.38, "grad_norm": 1.6145838805554376, "learning_rate": 1.420862885591979e-05, "loss": 0.8451, "step": 2319 }, { "epoch": 0.38, "grad_norm": 2.006055214926149, "learning_rate": 1.4203802155655677e-05, "loss": 0.7505, "step": 2320 }, { "epoch": 0.38, "grad_norm": 1.9824885197512132, "learning_rate": 1.4198974265514207e-05, "loss": 0.8592, "step": 2321 }, { "epoch": 0.38, "grad_norm": 2.0831353777489743, "learning_rate": 1.4194145186861902e-05, "loss": 0.7554, "step": 2322 }, { "epoch": 0.38, "grad_norm": 2.6041249108621756, "learning_rate": 1.4189314921065629e-05, "loss": 0.8235, "step": 2323 }, { "epoch": 0.38, "grad_norm": 1.7826433335438339, "learning_rate": 1.418448346949258e-05, "loss": 0.8107, "step": 2324 }, { "epoch": 0.38, "grad_norm": 1.8006696525339403, "learning_rate": 1.417965083351029e-05, "loss": 0.8427, "step": 2325 }, { "epoch": 0.38, "grad_norm": 2.8963327373084193, "learning_rate": 1.4174817014486622e-05, "loss": 0.7673, "step": 2326 }, { "epoch": 0.38, "grad_norm": 1.8528599813864766, "learning_rate": 1.4169982013789782e-05, "loss": 0.8539, "step": 2327 }, { "epoch": 0.38, "grad_norm": 1.6211218740794466, "learning_rate": 1.4165145832788305e-05, "loss": 0.817, "step": 2328 }, { "epoch": 0.38, "grad_norm": 1.9596264078133725, "learning_rate": 1.4160308472851065e-05, "loss": 0.7878, "step": 2329 }, { "epoch": 0.38, "grad_norm": 1.9204535342767024, "learning_rate": 1.4155469935347264e-05, "loss": 0.7622, "step": 2330 }, { "epoch": 0.38, "grad_norm": 2.4591004283229267, "learning_rate": 1.415063022164644e-05, "loss": 0.7937, "step": 2331 }, { "epoch": 0.38, "grad_norm": 1.5115714976279084, "learning_rate": 1.4145789333118462e-05, "loss": 0.7913, "step": 2332 }, { "epoch": 0.38, "grad_norm": 2.3968187083133223, "learning_rate": 1.4140947271133536e-05, "loss": 0.7554, "step": 2333 }, { "epoch": 0.38, "grad_norm": 1.8531223584293393, "learning_rate": 1.4136104037062197e-05, "loss": 0.8605, "step": 2334 }, { "epoch": 0.38, "grad_norm": 2.0354433208319476, "learning_rate": 1.4131259632275312e-05, "loss": 0.7203, "step": 2335 }, { "epoch": 0.38, "grad_norm": 2.6283986630642473, "learning_rate": 1.412641405814408e-05, "loss": 0.8581, "step": 2336 }, { "epoch": 0.38, "grad_norm": 1.5664853849794385, "learning_rate": 1.412156731604003e-05, "loss": 0.8472, "step": 2337 }, { "epoch": 0.38, "grad_norm": 1.5775573040698434, "learning_rate": 1.4116719407335022e-05, "loss": 0.84, "step": 2338 }, { "epoch": 0.38, "grad_norm": 2.8340928547052897, "learning_rate": 1.4111870333401246e-05, "loss": 0.8331, "step": 2339 }, { "epoch": 0.38, "grad_norm": 2.018547188288458, "learning_rate": 1.4107020095611223e-05, "loss": 0.8207, "step": 2340 }, { "epoch": 0.38, "grad_norm": 1.470980240560168, "learning_rate": 1.4102168695337804e-05, "loss": 0.8732, "step": 2341 }, { "epoch": 0.38, "grad_norm": 1.5758286703936852, "learning_rate": 1.4097316133954163e-05, "loss": 0.8541, "step": 2342 }, { "epoch": 0.38, "grad_norm": 3.2411747095279497, "learning_rate": 1.4092462412833811e-05, "loss": 0.922, "step": 2343 }, { "epoch": 0.38, "grad_norm": 1.794140957142626, "learning_rate": 1.4087607533350585e-05, "loss": 0.8059, "step": 2344 }, { "epoch": 0.39, "grad_norm": 1.531636575269395, "learning_rate": 1.4082751496878644e-05, "loss": 0.7305, "step": 2345 }, { "epoch": 0.39, "grad_norm": 2.051722057523368, "learning_rate": 1.4077894304792481e-05, "loss": 0.7941, "step": 2346 }, { "epoch": 0.39, "grad_norm": 1.7554410845839075, "learning_rate": 1.4073035958466916e-05, "loss": 0.8505, "step": 2347 }, { "epoch": 0.39, "grad_norm": 2.0413003856577485, "learning_rate": 1.406817645927709e-05, "loss": 0.8085, "step": 2348 }, { "epoch": 0.39, "grad_norm": 1.7004971728732823, "learning_rate": 1.4063315808598477e-05, "loss": 0.8094, "step": 2349 }, { "epoch": 0.39, "grad_norm": 3.726604051525071, "learning_rate": 1.4058454007806874e-05, "loss": 0.872, "step": 2350 }, { "epoch": 0.39, "grad_norm": 0.6505821395999621, "learning_rate": 1.4053591058278402e-05, "loss": 0.4015, "step": 2351 }, { "epoch": 0.39, "grad_norm": 1.7969597702835616, "learning_rate": 1.4048726961389508e-05, "loss": 0.8183, "step": 2352 }, { "epoch": 0.39, "grad_norm": 2.580971360695946, "learning_rate": 1.4043861718516964e-05, "loss": 0.9261, "step": 2353 }, { "epoch": 0.39, "grad_norm": 2.6819973632767486, "learning_rate": 1.403899533103787e-05, "loss": 0.7826, "step": 2354 }, { "epoch": 0.39, "grad_norm": 2.054718560645472, "learning_rate": 1.4034127800329645e-05, "loss": 0.8214, "step": 2355 }, { "epoch": 0.39, "grad_norm": 1.5666088710895194, "learning_rate": 1.4029259127770032e-05, "loss": 0.791, "step": 2356 }, { "epoch": 0.39, "grad_norm": 2.124686376146947, "learning_rate": 1.40243893147371e-05, "loss": 0.7202, "step": 2357 }, { "epoch": 0.39, "grad_norm": 1.877708026701892, "learning_rate": 1.4019518362609239e-05, "loss": 0.8335, "step": 2358 }, { "epoch": 0.39, "grad_norm": 2.4618092460168777, "learning_rate": 1.4014646272765162e-05, "loss": 0.8084, "step": 2359 }, { "epoch": 0.39, "grad_norm": 1.695160544860022, "learning_rate": 1.4009773046583904e-05, "loss": 0.7837, "step": 2360 }, { "epoch": 0.39, "grad_norm": 0.6547846186018278, "learning_rate": 1.4004898685444819e-05, "loss": 0.3993, "step": 2361 }, { "epoch": 0.39, "grad_norm": 7.268575453836989, "learning_rate": 1.4000023190727587e-05, "loss": 0.8176, "step": 2362 }, { "epoch": 0.39, "grad_norm": 1.955565879740572, "learning_rate": 1.399514656381221e-05, "loss": 0.8348, "step": 2363 }, { "epoch": 0.39, "grad_norm": 2.527273267792657, "learning_rate": 1.3990268806078999e-05, "loss": 0.8805, "step": 2364 }, { "epoch": 0.39, "grad_norm": 2.4504135627324297, "learning_rate": 1.39853899189086e-05, "loss": 0.8408, "step": 2365 }, { "epoch": 0.39, "grad_norm": 2.5760659165212036, "learning_rate": 1.3980509903681968e-05, "loss": 0.8041, "step": 2366 }, { "epoch": 0.39, "grad_norm": 2.7359319428080204, "learning_rate": 1.397562876178038e-05, "loss": 0.8736, "step": 2367 }, { "epoch": 0.39, "grad_norm": 2.069525696077259, "learning_rate": 1.3970746494585439e-05, "loss": 0.7821, "step": 2368 }, { "epoch": 0.39, "grad_norm": 2.1712102198342187, "learning_rate": 1.3965863103479054e-05, "loss": 0.8562, "step": 2369 }, { "epoch": 0.39, "grad_norm": 2.3455385735755296, "learning_rate": 1.3960978589843458e-05, "loss": 0.7544, "step": 2370 }, { "epoch": 0.39, "grad_norm": 2.300401781223307, "learning_rate": 1.3956092955061208e-05, "loss": 0.7929, "step": 2371 }, { "epoch": 0.39, "grad_norm": 2.437780497018699, "learning_rate": 1.3951206200515173e-05, "loss": 0.8286, "step": 2372 }, { "epoch": 0.39, "grad_norm": 1.9528928128509697, "learning_rate": 1.3946318327588534e-05, "loss": 0.7788, "step": 2373 }, { "epoch": 0.39, "grad_norm": 0.6530158891882688, "learning_rate": 1.3941429337664791e-05, "loss": 0.3499, "step": 2374 }, { "epoch": 0.39, "grad_norm": 0.650692670928044, "learning_rate": 1.3936539232127771e-05, "loss": 0.4107, "step": 2375 }, { "epoch": 0.39, "grad_norm": 1.9977531260249968, "learning_rate": 1.3931648012361599e-05, "loss": 0.8416, "step": 2376 }, { "epoch": 0.39, "grad_norm": 2.0214168664779764, "learning_rate": 1.392675567975073e-05, "loss": 0.7215, "step": 2377 }, { "epoch": 0.39, "grad_norm": 2.1918981929001404, "learning_rate": 1.3921862235679929e-05, "loss": 0.7804, "step": 2378 }, { "epoch": 0.39, "grad_norm": 1.6040938889581946, "learning_rate": 1.391696768153427e-05, "loss": 0.7617, "step": 2379 }, { "epoch": 0.39, "grad_norm": 1.7188616699960202, "learning_rate": 1.3912072018699152e-05, "loss": 0.751, "step": 2380 }, { "epoch": 0.39, "grad_norm": 2.6877630622507334, "learning_rate": 1.3907175248560276e-05, "loss": 0.8696, "step": 2381 }, { "epoch": 0.39, "grad_norm": 0.7195379606629012, "learning_rate": 1.390227737250367e-05, "loss": 0.3947, "step": 2382 }, { "epoch": 0.39, "grad_norm": 2.345693111345452, "learning_rate": 1.389737839191566e-05, "loss": 0.7769, "step": 2383 }, { "epoch": 0.39, "grad_norm": 1.8837730360348925, "learning_rate": 1.38924783081829e-05, "loss": 0.8188, "step": 2384 }, { "epoch": 0.39, "grad_norm": 2.074707705830639, "learning_rate": 1.3887577122692337e-05, "loss": 0.8682, "step": 2385 }, { "epoch": 0.39, "grad_norm": 2.221170744713218, "learning_rate": 1.3882674836831251e-05, "loss": 0.8587, "step": 2386 }, { "epoch": 0.39, "grad_norm": 7.156918460932519, "learning_rate": 1.3877771451987223e-05, "loss": 0.8561, "step": 2387 }, { "epoch": 0.39, "grad_norm": 2.0242455385274485, "learning_rate": 1.3872866969548143e-05, "loss": 0.8059, "step": 2388 }, { "epoch": 0.39, "grad_norm": 2.6760028574337267, "learning_rate": 1.3867961390902214e-05, "loss": 0.8321, "step": 2389 }, { "epoch": 0.39, "grad_norm": 2.9755806083148055, "learning_rate": 1.3863054717437952e-05, "loss": 0.713, "step": 2390 }, { "epoch": 0.39, "grad_norm": 2.0984761956527183, "learning_rate": 1.3858146950544178e-05, "loss": 0.8239, "step": 2391 }, { "epoch": 0.39, "grad_norm": 1.889662341684993, "learning_rate": 1.3853238091610029e-05, "loss": 0.8151, "step": 2392 }, { "epoch": 0.39, "grad_norm": 9.940314063027166, "learning_rate": 1.384832814202494e-05, "loss": 0.7882, "step": 2393 }, { "epoch": 0.39, "grad_norm": 2.064154894596113, "learning_rate": 1.3843417103178669e-05, "loss": 0.8458, "step": 2394 }, { "epoch": 0.39, "grad_norm": 1.7842082455293222, "learning_rate": 1.3838504976461278e-05, "loss": 0.8702, "step": 2395 }, { "epoch": 0.39, "grad_norm": 1.9743166404959303, "learning_rate": 1.3833591763263123e-05, "loss": 0.8981, "step": 2396 }, { "epoch": 0.39, "grad_norm": 0.6203856644718198, "learning_rate": 1.3828677464974885e-05, "loss": 0.3749, "step": 2397 }, { "epoch": 0.39, "grad_norm": 2.164187537216072, "learning_rate": 1.3823762082987544e-05, "loss": 0.7832, "step": 2398 }, { "epoch": 0.39, "grad_norm": 2.011588587422474, "learning_rate": 1.381884561869239e-05, "loss": 0.8748, "step": 2399 }, { "epoch": 0.39, "grad_norm": 2.214958143692335, "learning_rate": 1.3813928073481023e-05, "loss": 0.8334, "step": 2400 }, { "epoch": 0.39, "grad_norm": 1.9482890046014805, "learning_rate": 1.3809009448745334e-05, "loss": 0.8414, "step": 2401 }, { "epoch": 0.39, "grad_norm": 0.644543094876859, "learning_rate": 1.3804089745877536e-05, "loss": 0.3856, "step": 2402 }, { "epoch": 0.39, "grad_norm": 0.6890000391915759, "learning_rate": 1.3799168966270139e-05, "loss": 0.3928, "step": 2403 }, { "epoch": 0.39, "grad_norm": 0.6670814962991114, "learning_rate": 1.3794247111315955e-05, "loss": 0.3672, "step": 2404 }, { "epoch": 0.4, "grad_norm": 2.146214780479214, "learning_rate": 1.3789324182408112e-05, "loss": 0.8552, "step": 2405 }, { "epoch": 0.4, "grad_norm": 1.7548148726927997, "learning_rate": 1.3784400180940032e-05, "loss": 0.7714, "step": 2406 }, { "epoch": 0.4, "grad_norm": 1.7790735313443309, "learning_rate": 1.377947510830544e-05, "loss": 0.7623, "step": 2407 }, { "epoch": 0.4, "grad_norm": 1.6237052589411778, "learning_rate": 1.3774548965898371e-05, "loss": 0.8303, "step": 2408 }, { "epoch": 0.4, "grad_norm": 1.9892672385020624, "learning_rate": 1.3769621755113156e-05, "loss": 0.7941, "step": 2409 }, { "epoch": 0.4, "grad_norm": 0.7073133155764632, "learning_rate": 1.3764693477344435e-05, "loss": 0.3516, "step": 2410 }, { "epoch": 0.4, "grad_norm": 2.0343983918863517, "learning_rate": 1.3759764133987146e-05, "loss": 0.8427, "step": 2411 }, { "epoch": 0.4, "grad_norm": 2.065997589721717, "learning_rate": 1.375483372643653e-05, "loss": 0.8214, "step": 2412 }, { "epoch": 0.4, "grad_norm": 1.685059469500545, "learning_rate": 1.3749902256088125e-05, "loss": 0.8138, "step": 2413 }, { "epoch": 0.4, "grad_norm": 1.9898074397671683, "learning_rate": 1.3744969724337779e-05, "loss": 0.9087, "step": 2414 }, { "epoch": 0.4, "grad_norm": 2.223171871459857, "learning_rate": 1.3740036132581626e-05, "loss": 0.8322, "step": 2415 }, { "epoch": 0.4, "grad_norm": 2.077890198218692, "learning_rate": 1.3735101482216117e-05, "loss": 0.8184, "step": 2416 }, { "epoch": 0.4, "grad_norm": 1.8123650569606533, "learning_rate": 1.3730165774637994e-05, "loss": 0.7833, "step": 2417 }, { "epoch": 0.4, "grad_norm": 2.5483211136542736, "learning_rate": 1.3725229011244294e-05, "loss": 0.8138, "step": 2418 }, { "epoch": 0.4, "grad_norm": 2.219035902345559, "learning_rate": 1.3720291193432357e-05, "loss": 0.7757, "step": 2419 }, { "epoch": 0.4, "grad_norm": 1.708425565877656, "learning_rate": 1.3715352322599826e-05, "loss": 0.8311, "step": 2420 }, { "epoch": 0.4, "grad_norm": 2.0225742604035815, "learning_rate": 1.3710412400144637e-05, "loss": 0.8283, "step": 2421 }, { "epoch": 0.4, "grad_norm": 0.748071696656295, "learning_rate": 1.3705471427465025e-05, "loss": 0.4123, "step": 2422 }, { "epoch": 0.4, "grad_norm": 2.243840522624688, "learning_rate": 1.3700529405959517e-05, "loss": 0.7977, "step": 2423 }, { "epoch": 0.4, "grad_norm": 2.7093358206927487, "learning_rate": 1.3695586337026949e-05, "loss": 0.8195, "step": 2424 }, { "epoch": 0.4, "grad_norm": 2.1701377174784002, "learning_rate": 1.3690642222066445e-05, "loss": 0.6879, "step": 2425 }, { "epoch": 0.4, "grad_norm": 1.9780152836070173, "learning_rate": 1.3685697062477421e-05, "loss": 0.8445, "step": 2426 }, { "epoch": 0.4, "grad_norm": 2.1054191077994915, "learning_rate": 1.3680750859659599e-05, "loss": 0.6929, "step": 2427 }, { "epoch": 0.4, "grad_norm": 1.894807459656065, "learning_rate": 1.3675803615012993e-05, "loss": 0.7772, "step": 2428 }, { "epoch": 0.4, "grad_norm": 0.7071447275515502, "learning_rate": 1.3670855329937905e-05, "loss": 0.3867, "step": 2429 }, { "epoch": 0.4, "grad_norm": 2.335541581778697, "learning_rate": 1.3665906005834938e-05, "loss": 0.8268, "step": 2430 }, { "epoch": 0.4, "grad_norm": 1.9223980576917548, "learning_rate": 1.3660955644104985e-05, "loss": 0.7804, "step": 2431 }, { "epoch": 0.4, "grad_norm": 2.7468765219476037, "learning_rate": 1.3656004246149242e-05, "loss": 0.675, "step": 2432 }, { "epoch": 0.4, "grad_norm": 1.8627600204274986, "learning_rate": 1.3651051813369188e-05, "loss": 0.7548, "step": 2433 }, { "epoch": 0.4, "grad_norm": 1.7255562897416306, "learning_rate": 1.3646098347166598e-05, "loss": 0.7849, "step": 2434 }, { "epoch": 0.4, "grad_norm": 0.6739285793074007, "learning_rate": 1.3641143848943545e-05, "loss": 0.3852, "step": 2435 }, { "epoch": 0.4, "grad_norm": 2.642974821070624, "learning_rate": 1.3636188320102384e-05, "loss": 0.8978, "step": 2436 }, { "epoch": 0.4, "grad_norm": 1.8532557333256463, "learning_rate": 1.3631231762045768e-05, "loss": 0.8365, "step": 2437 }, { "epoch": 0.4, "grad_norm": 2.052142183201847, "learning_rate": 1.3626274176176645e-05, "loss": 0.7886, "step": 2438 }, { "epoch": 0.4, "grad_norm": 2.1790882524103243, "learning_rate": 1.3621315563898243e-05, "loss": 0.7765, "step": 2439 }, { "epoch": 0.4, "grad_norm": 2.3197802435668575, "learning_rate": 1.3616355926614089e-05, "loss": 0.9013, "step": 2440 }, { "epoch": 0.4, "grad_norm": 2.056881976884404, "learning_rate": 1.3611395265727998e-05, "loss": 0.8397, "step": 2441 }, { "epoch": 0.4, "grad_norm": 1.8683568336724232, "learning_rate": 1.3606433582644077e-05, "loss": 0.8253, "step": 2442 }, { "epoch": 0.4, "grad_norm": 2.311843575422822, "learning_rate": 1.3601470878766714e-05, "loss": 0.837, "step": 2443 }, { "epoch": 0.4, "grad_norm": 1.4876028611044891, "learning_rate": 1.3596507155500596e-05, "loss": 0.7862, "step": 2444 }, { "epoch": 0.4, "grad_norm": 0.651140806668553, "learning_rate": 1.3591542414250694e-05, "loss": 0.3607, "step": 2445 }, { "epoch": 0.4, "grad_norm": 1.8870549082963535, "learning_rate": 1.3586576656422268e-05, "loss": 0.7962, "step": 2446 }, { "epoch": 0.4, "grad_norm": 2.2207370899560788, "learning_rate": 1.3581609883420866e-05, "loss": 0.7878, "step": 2447 }, { "epoch": 0.4, "grad_norm": 0.6214111139677352, "learning_rate": 1.3576642096652322e-05, "loss": 0.3801, "step": 2448 }, { "epoch": 0.4, "grad_norm": 4.673399001480683, "learning_rate": 1.3571673297522759e-05, "loss": 0.8532, "step": 2449 }, { "epoch": 0.4, "grad_norm": 2.7184194005530062, "learning_rate": 1.3566703487438579e-05, "loss": 0.7299, "step": 2450 }, { "epoch": 0.4, "grad_norm": 2.2111567716493927, "learning_rate": 1.3561732667806481e-05, "loss": 0.7837, "step": 2451 }, { "epoch": 0.4, "grad_norm": 2.4552938518182463, "learning_rate": 1.3556760840033447e-05, "loss": 0.8144, "step": 2452 }, { "epoch": 0.4, "grad_norm": 2.151731219601962, "learning_rate": 1.3551788005526738e-05, "loss": 0.8737, "step": 2453 }, { "epoch": 0.4, "grad_norm": 1.8608391401612723, "learning_rate": 1.3546814165693909e-05, "loss": 0.7975, "step": 2454 }, { "epoch": 0.4, "grad_norm": 1.8344489593351816, "learning_rate": 1.3541839321942786e-05, "loss": 0.8067, "step": 2455 }, { "epoch": 0.4, "grad_norm": 2.1137279960147013, "learning_rate": 1.35368634756815e-05, "loss": 0.8172, "step": 2456 }, { "epoch": 0.4, "grad_norm": 1.974235498668537, "learning_rate": 1.3531886628318447e-05, "loss": 0.788, "step": 2457 }, { "epoch": 0.4, "grad_norm": 1.947810257377928, "learning_rate": 1.3526908781262314e-05, "loss": 0.8276, "step": 2458 }, { "epoch": 0.4, "grad_norm": 1.8703503093231788, "learning_rate": 1.352192993592207e-05, "loss": 0.8157, "step": 2459 }, { "epoch": 0.4, "grad_norm": 1.9566375811323642, "learning_rate": 1.3516950093706968e-05, "loss": 0.8323, "step": 2460 }, { "epoch": 0.4, "grad_norm": 2.0816276734275405, "learning_rate": 1.3511969256026542e-05, "loss": 0.8198, "step": 2461 }, { "epoch": 0.4, "grad_norm": 1.8126923149278626, "learning_rate": 1.3506987424290605e-05, "loss": 0.8518, "step": 2462 }, { "epoch": 0.4, "grad_norm": 2.107522238010668, "learning_rate": 1.3502004599909255e-05, "loss": 0.8075, "step": 2463 }, { "epoch": 0.4, "grad_norm": 8.337276075564908, "learning_rate": 1.349702078429287e-05, "loss": 0.7966, "step": 2464 }, { "epoch": 0.4, "grad_norm": 3.41430675838625, "learning_rate": 1.3492035978852114e-05, "loss": 0.7902, "step": 2465 }, { "epoch": 0.41, "grad_norm": 2.0454181854459925, "learning_rate": 1.3487050184997916e-05, "loss": 0.8015, "step": 2466 }, { "epoch": 0.41, "grad_norm": 2.123593201373334, "learning_rate": 1.3482063404141496e-05, "loss": 0.8228, "step": 2467 }, { "epoch": 0.41, "grad_norm": 2.2047106046068987, "learning_rate": 1.3477075637694362e-05, "loss": 0.8392, "step": 2468 }, { "epoch": 0.41, "grad_norm": 1.7416522287688199, "learning_rate": 1.347208688706828e-05, "loss": 0.8536, "step": 2469 }, { "epoch": 0.41, "grad_norm": 1.9766127021279778, "learning_rate": 1.3467097153675313e-05, "loss": 0.8014, "step": 2470 }, { "epoch": 0.41, "grad_norm": 1.7551178704556099, "learning_rate": 1.3462106438927788e-05, "loss": 0.8424, "step": 2471 }, { "epoch": 0.41, "grad_norm": 2.155150196805583, "learning_rate": 1.345711474423832e-05, "loss": 0.8093, "step": 2472 }, { "epoch": 0.41, "grad_norm": 2.0149067062198895, "learning_rate": 1.3452122071019797e-05, "loss": 0.8061, "step": 2473 }, { "epoch": 0.41, "grad_norm": 0.8566078970718504, "learning_rate": 1.3447128420685385e-05, "loss": 0.4039, "step": 2474 }, { "epoch": 0.41, "grad_norm": 1.6405414228171833, "learning_rate": 1.3442133794648521e-05, "loss": 0.8835, "step": 2475 }, { "epoch": 0.41, "grad_norm": 1.8909576210604748, "learning_rate": 1.3437138194322934e-05, "loss": 0.914, "step": 2476 }, { "epoch": 0.41, "grad_norm": 2.4817628968495633, "learning_rate": 1.3432141621122608e-05, "loss": 0.7677, "step": 2477 }, { "epoch": 0.41, "grad_norm": 2.932138553526105, "learning_rate": 1.3427144076461818e-05, "loss": 0.8306, "step": 2478 }, { "epoch": 0.41, "grad_norm": 3.381125621225941, "learning_rate": 1.3422145561755106e-05, "loss": 0.896, "step": 2479 }, { "epoch": 0.41, "grad_norm": 1.5305634391682703, "learning_rate": 1.3417146078417294e-05, "loss": 0.7876, "step": 2480 }, { "epoch": 0.41, "grad_norm": 1.6936821144691678, "learning_rate": 1.3412145627863473e-05, "loss": 0.8106, "step": 2481 }, { "epoch": 0.41, "grad_norm": 1.962162863172683, "learning_rate": 1.3407144211509014e-05, "loss": 0.797, "step": 2482 }, { "epoch": 0.41, "grad_norm": 1.812948617016314, "learning_rate": 1.3402141830769551e-05, "loss": 0.9433, "step": 2483 }, { "epoch": 0.41, "grad_norm": 2.263837062923174, "learning_rate": 1.3397138487060999e-05, "loss": 0.8307, "step": 2484 }, { "epoch": 0.41, "grad_norm": 1.8889998581457428, "learning_rate": 1.3392134181799547e-05, "loss": 0.8685, "step": 2485 }, { "epoch": 0.41, "grad_norm": 2.072670074927846, "learning_rate": 1.338712891640165e-05, "loss": 0.781, "step": 2486 }, { "epoch": 0.41, "grad_norm": 5.36534513481421, "learning_rate": 1.3382122692284041e-05, "loss": 0.7779, "step": 2487 }, { "epoch": 0.41, "grad_norm": 1.8678246702368186, "learning_rate": 1.3377115510863716e-05, "loss": 0.8053, "step": 2488 }, { "epoch": 0.41, "grad_norm": 1.6088584982546645, "learning_rate": 1.3372107373557955e-05, "loss": 0.7561, "step": 2489 }, { "epoch": 0.41, "grad_norm": 2.137217191107886, "learning_rate": 1.336709828178429e-05, "loss": 0.8363, "step": 2490 }, { "epoch": 0.41, "grad_norm": 1.5953999845064135, "learning_rate": 1.3362088236960544e-05, "loss": 0.778, "step": 2491 }, { "epoch": 0.41, "grad_norm": 2.128157101467935, "learning_rate": 1.3357077240504795e-05, "loss": 0.7774, "step": 2492 }, { "epoch": 0.41, "grad_norm": 1.9322235401452776, "learning_rate": 1.3352065293835399e-05, "loss": 0.8669, "step": 2493 }, { "epoch": 0.41, "grad_norm": 1.6593194667339717, "learning_rate": 1.3347052398370969e-05, "loss": 0.8737, "step": 2494 }, { "epoch": 0.41, "grad_norm": 2.1871253843254617, "learning_rate": 1.3342038555530403e-05, "loss": 0.7586, "step": 2495 }, { "epoch": 0.41, "grad_norm": 2.1116811284653463, "learning_rate": 1.3337023766732852e-05, "loss": 0.7025, "step": 2496 }, { "epoch": 0.41, "grad_norm": 2.2216545045741873, "learning_rate": 1.3332008033397746e-05, "loss": 0.8166, "step": 2497 }, { "epoch": 0.41, "grad_norm": 2.128668921640424, "learning_rate": 1.3326991356944776e-05, "loss": 0.8243, "step": 2498 }, { "epoch": 0.41, "grad_norm": 1.9296646278317358, "learning_rate": 1.33219737387939e-05, "loss": 0.8152, "step": 2499 }, { "epoch": 0.41, "grad_norm": 2.172930059271583, "learning_rate": 1.3316955180365348e-05, "loss": 0.8224, "step": 2500 }, { "epoch": 0.41, "grad_norm": 2.163234229139713, "learning_rate": 1.3311935683079611e-05, "loss": 0.7752, "step": 2501 }, { "epoch": 0.41, "grad_norm": 5.08231173817608, "learning_rate": 1.3306915248357442e-05, "loss": 0.8393, "step": 2502 }, { "epoch": 0.41, "grad_norm": 3.277996313658932, "learning_rate": 1.3301893877619874e-05, "loss": 0.7268, "step": 2503 }, { "epoch": 0.41, "grad_norm": 1.7812433637323803, "learning_rate": 1.3296871572288187e-05, "loss": 0.8202, "step": 2504 }, { "epoch": 0.41, "grad_norm": 1.89945276792083, "learning_rate": 1.3291848333783941e-05, "loss": 0.7779, "step": 2505 }, { "epoch": 0.41, "grad_norm": 2.303303185184824, "learning_rate": 1.3286824163528948e-05, "loss": 0.6776, "step": 2506 }, { "epoch": 0.41, "grad_norm": 2.0817156025114745, "learning_rate": 1.3281799062945291e-05, "loss": 0.7822, "step": 2507 }, { "epoch": 0.41, "grad_norm": 1.6880890852758779, "learning_rate": 1.3276773033455312e-05, "loss": 0.7815, "step": 2508 }, { "epoch": 0.41, "grad_norm": 2.048211543495535, "learning_rate": 1.3271746076481621e-05, "loss": 0.8132, "step": 2509 }, { "epoch": 0.41, "grad_norm": 2.046022346590687, "learning_rate": 1.3266718193447086e-05, "loss": 0.865, "step": 2510 }, { "epoch": 0.41, "grad_norm": 1.931050900354285, "learning_rate": 1.3261689385774839e-05, "loss": 0.8135, "step": 2511 }, { "epoch": 0.41, "grad_norm": 3.0626194330155223, "learning_rate": 1.3256659654888272e-05, "loss": 0.7736, "step": 2512 }, { "epoch": 0.41, "grad_norm": 1.8042992410585896, "learning_rate": 1.3251629002211042e-05, "loss": 0.8661, "step": 2513 }, { "epoch": 0.41, "grad_norm": 1.82512793678899, "learning_rate": 1.3246597429167066e-05, "loss": 0.8441, "step": 2514 }, { "epoch": 0.41, "grad_norm": 2.1262097095870547, "learning_rate": 1.3241564937180513e-05, "loss": 0.8313, "step": 2515 }, { "epoch": 0.41, "grad_norm": 1.7711684079542405, "learning_rate": 1.3236531527675828e-05, "loss": 0.848, "step": 2516 }, { "epoch": 0.41, "grad_norm": 1.6903436510676233, "learning_rate": 1.3231497202077701e-05, "loss": 0.7958, "step": 2517 }, { "epoch": 0.41, "grad_norm": 2.1265988702454326, "learning_rate": 1.322646196181109e-05, "loss": 0.8758, "step": 2518 }, { "epoch": 0.41, "grad_norm": 2.6835373274690224, "learning_rate": 1.3221425808301209e-05, "loss": 0.9112, "step": 2519 }, { "epoch": 0.41, "grad_norm": 10.537186098141769, "learning_rate": 1.3216388742973532e-05, "loss": 0.8511, "step": 2520 }, { "epoch": 0.41, "grad_norm": 1.5834552625386067, "learning_rate": 1.3211350767253786e-05, "loss": 0.8642, "step": 2521 }, { "epoch": 0.41, "grad_norm": 1.8465111984711624, "learning_rate": 1.320631188256796e-05, "loss": 0.8147, "step": 2522 }, { "epoch": 0.41, "grad_norm": 1.8931428905847938, "learning_rate": 1.3201272090342303e-05, "loss": 0.8042, "step": 2523 }, { "epoch": 0.41, "grad_norm": 0.8188749976576907, "learning_rate": 1.3196231392003316e-05, "loss": 0.4051, "step": 2524 }, { "epoch": 0.41, "grad_norm": 2.249661697930106, "learning_rate": 1.3191189788977758e-05, "loss": 0.7194, "step": 2525 }, { "epoch": 0.41, "grad_norm": 1.7050058767418437, "learning_rate": 1.3186147282692643e-05, "loss": 0.8448, "step": 2526 }, { "epoch": 0.42, "grad_norm": 1.5327631412633682, "learning_rate": 1.3181103874575243e-05, "loss": 0.7336, "step": 2527 }, { "epoch": 0.42, "grad_norm": 2.0604665557854713, "learning_rate": 1.3176059566053083e-05, "loss": 0.834, "step": 2528 }, { "epoch": 0.42, "grad_norm": 1.803615244711922, "learning_rate": 1.3171014358553946e-05, "loss": 0.8335, "step": 2529 }, { "epoch": 0.42, "grad_norm": 1.683616604218104, "learning_rate": 1.3165968253505865e-05, "loss": 0.9068, "step": 2530 }, { "epoch": 0.42, "grad_norm": 0.7291743695110957, "learning_rate": 1.3160921252337131e-05, "loss": 0.376, "step": 2531 }, { "epoch": 0.42, "grad_norm": 1.9312915137742797, "learning_rate": 1.3155873356476287e-05, "loss": 0.848, "step": 2532 }, { "epoch": 0.42, "grad_norm": 2.698556115654, "learning_rate": 1.3150824567352128e-05, "loss": 0.7388, "step": 2533 }, { "epoch": 0.42, "grad_norm": 2.3482760388301025, "learning_rate": 1.3145774886393704e-05, "loss": 0.8214, "step": 2534 }, { "epoch": 0.42, "grad_norm": 1.8234025600100159, "learning_rate": 1.3140724315030315e-05, "loss": 0.8354, "step": 2535 }, { "epoch": 0.42, "grad_norm": 3.8121039895076807, "learning_rate": 1.313567285469152e-05, "loss": 0.8468, "step": 2536 }, { "epoch": 0.42, "grad_norm": 1.804430312147642, "learning_rate": 1.3130620506807116e-05, "loss": 0.8434, "step": 2537 }, { "epoch": 0.42, "grad_norm": 2.1062033466880874, "learning_rate": 1.3125567272807167e-05, "loss": 0.8785, "step": 2538 }, { "epoch": 0.42, "grad_norm": 2.106870579616244, "learning_rate": 1.3120513154121976e-05, "loss": 0.9061, "step": 2539 }, { "epoch": 0.42, "grad_norm": 2.022862358391041, "learning_rate": 1.3115458152182102e-05, "loss": 0.7878, "step": 2540 }, { "epoch": 0.42, "grad_norm": 2.3582716655627394, "learning_rate": 1.3110402268418352e-05, "loss": 0.7058, "step": 2541 }, { "epoch": 0.42, "grad_norm": 1.6007976151338814, "learning_rate": 1.3105345504261781e-05, "loss": 0.8018, "step": 2542 }, { "epoch": 0.42, "grad_norm": 2.0208511141410774, "learning_rate": 1.3100287861143703e-05, "loss": 0.8525, "step": 2543 }, { "epoch": 0.42, "grad_norm": 2.067090096648786, "learning_rate": 1.3095229340495665e-05, "loss": 0.8612, "step": 2544 }, { "epoch": 0.42, "grad_norm": 2.7818657510316487, "learning_rate": 1.3090169943749475e-05, "loss": 0.7642, "step": 2545 }, { "epoch": 0.42, "grad_norm": 2.6607367168790885, "learning_rate": 1.3085109672337183e-05, "loss": 0.9012, "step": 2546 }, { "epoch": 0.42, "grad_norm": 1.4292833640477434, "learning_rate": 1.3080048527691092e-05, "loss": 0.7801, "step": 2547 }, { "epoch": 0.42, "grad_norm": 1.6580315382453505, "learning_rate": 1.3074986511243741e-05, "loss": 0.8464, "step": 2548 }, { "epoch": 0.42, "grad_norm": 2.6218615526202456, "learning_rate": 1.306992362442793e-05, "loss": 0.8343, "step": 2549 }, { "epoch": 0.42, "grad_norm": 81.55542518964353, "learning_rate": 1.3064859868676694e-05, "loss": 0.8182, "step": 2550 }, { "epoch": 0.42, "grad_norm": 3.014348121939768, "learning_rate": 1.3059795245423319e-05, "loss": 0.8099, "step": 2551 }, { "epoch": 0.42, "grad_norm": 1.8440653108831397, "learning_rate": 1.3054729756101338e-05, "loss": 0.7382, "step": 2552 }, { "epoch": 0.42, "grad_norm": 2.334689036273113, "learning_rate": 1.3049663402144528e-05, "loss": 0.8414, "step": 2553 }, { "epoch": 0.42, "grad_norm": 1.755676019374253, "learning_rate": 1.3044596184986906e-05, "loss": 0.7638, "step": 2554 }, { "epoch": 0.42, "grad_norm": 4.401019350205106, "learning_rate": 1.303952810606274e-05, "loss": 0.8672, "step": 2555 }, { "epoch": 0.42, "grad_norm": 0.7577605169408859, "learning_rate": 1.3034459166806537e-05, "loss": 0.3508, "step": 2556 }, { "epoch": 0.42, "grad_norm": 1.7657202713781401, "learning_rate": 1.3029389368653051e-05, "loss": 0.761, "step": 2557 }, { "epoch": 0.42, "grad_norm": 1.9734354028122503, "learning_rate": 1.302431871303728e-05, "loss": 0.8201, "step": 2558 }, { "epoch": 0.42, "grad_norm": 1.8890925064151776, "learning_rate": 1.3019247201394456e-05, "loss": 0.7649, "step": 2559 }, { "epoch": 0.42, "grad_norm": 0.6423320953265536, "learning_rate": 1.3014174835160065e-05, "loss": 0.402, "step": 2560 }, { "epoch": 0.42, "grad_norm": 1.6278416092594037, "learning_rate": 1.300910161576983e-05, "loss": 0.8666, "step": 2561 }, { "epoch": 0.42, "grad_norm": 2.070264233697305, "learning_rate": 1.3004027544659712e-05, "loss": 0.7425, "step": 2562 }, { "epoch": 0.42, "grad_norm": 2.7325036875103037, "learning_rate": 1.2998952623265917e-05, "loss": 0.8089, "step": 2563 }, { "epoch": 0.42, "grad_norm": 3.3480680680743653, "learning_rate": 1.2993876853024891e-05, "loss": 0.8676, "step": 2564 }, { "epoch": 0.42, "grad_norm": 2.3420285215380496, "learning_rate": 1.298880023537332e-05, "loss": 0.7624, "step": 2565 }, { "epoch": 0.42, "grad_norm": 2.479383550712505, "learning_rate": 1.2983722771748131e-05, "loss": 0.7652, "step": 2566 }, { "epoch": 0.42, "grad_norm": 1.9602245572927883, "learning_rate": 1.2978644463586489e-05, "loss": 0.8245, "step": 2567 }, { "epoch": 0.42, "grad_norm": 2.196256398334193, "learning_rate": 1.2973565312325798e-05, "loss": 0.7555, "step": 2568 }, { "epoch": 0.42, "grad_norm": 2.383212448067578, "learning_rate": 1.29684853194037e-05, "loss": 0.8692, "step": 2569 }, { "epoch": 0.42, "grad_norm": 1.4946890834358486, "learning_rate": 1.296340448625808e-05, "loss": 0.7637, "step": 2570 }, { "epoch": 0.42, "grad_norm": 1.7010773217943684, "learning_rate": 1.2958322814327053e-05, "loss": 0.8355, "step": 2571 }, { "epoch": 0.42, "grad_norm": 1.584874490105291, "learning_rate": 1.2953240305048978e-05, "loss": 0.8097, "step": 2572 }, { "epoch": 0.42, "grad_norm": 2.0997524633958826, "learning_rate": 1.2948156959862446e-05, "loss": 0.8552, "step": 2573 }, { "epoch": 0.42, "grad_norm": 1.7478678073187786, "learning_rate": 1.294307278020629e-05, "loss": 0.8194, "step": 2574 }, { "epoch": 0.42, "grad_norm": 2.1931482135677838, "learning_rate": 1.2937987767519576e-05, "loss": 0.7457, "step": 2575 }, { "epoch": 0.42, "grad_norm": 1.809013538504975, "learning_rate": 1.2932901923241603e-05, "loss": 0.8364, "step": 2576 }, { "epoch": 0.42, "grad_norm": 5.751147068052486, "learning_rate": 1.2927815248811913e-05, "loss": 0.81, "step": 2577 }, { "epoch": 0.42, "grad_norm": 2.1024956382711686, "learning_rate": 1.2922727745670276e-05, "loss": 0.8172, "step": 2578 }, { "epoch": 0.42, "grad_norm": 1.676657690848598, "learning_rate": 1.29176394152567e-05, "loss": 0.8673, "step": 2579 }, { "epoch": 0.42, "grad_norm": 2.0599350539355057, "learning_rate": 1.2912550259011422e-05, "loss": 0.8951, "step": 2580 }, { "epoch": 0.42, "grad_norm": 1.9165692344192646, "learning_rate": 1.2907460278374925e-05, "loss": 0.7994, "step": 2581 }, { "epoch": 0.42, "grad_norm": 1.9930272200953583, "learning_rate": 1.2902369474787912e-05, "loss": 0.7897, "step": 2582 }, { "epoch": 0.42, "grad_norm": 1.5194567265297532, "learning_rate": 1.2897277849691326e-05, "loss": 0.801, "step": 2583 }, { "epoch": 0.42, "grad_norm": 1.735087268822263, "learning_rate": 1.2892185404526338e-05, "loss": 0.7686, "step": 2584 }, { "epoch": 0.42, "grad_norm": 1.7731185879085, "learning_rate": 1.2887092140734357e-05, "loss": 0.8408, "step": 2585 }, { "epoch": 0.42, "grad_norm": 1.6740331426067898, "learning_rate": 1.288199805975702e-05, "loss": 0.8083, "step": 2586 }, { "epoch": 0.42, "grad_norm": 1.554685460010336, "learning_rate": 1.2876903163036194e-05, "loss": 0.7568, "step": 2587 }, { "epoch": 0.43, "grad_norm": 1.6985712294379902, "learning_rate": 1.2871807452013977e-05, "loss": 0.7635, "step": 2588 }, { "epoch": 0.43, "grad_norm": 1.4270725933620236, "learning_rate": 1.2866710928132709e-05, "loss": 0.8204, "step": 2589 }, { "epoch": 0.43, "grad_norm": 2.1135158193039025, "learning_rate": 1.2861613592834942e-05, "loss": 0.794, "step": 2590 }, { "epoch": 0.43, "grad_norm": 1.8625397476655827, "learning_rate": 1.2856515447563467e-05, "loss": 0.7861, "step": 2591 }, { "epoch": 0.43, "grad_norm": 0.7154830533064659, "learning_rate": 1.2851416493761301e-05, "loss": 0.3627, "step": 2592 }, { "epoch": 0.43, "grad_norm": 2.2957092736480793, "learning_rate": 1.28463167328717e-05, "loss": 0.8603, "step": 2593 }, { "epoch": 0.43, "grad_norm": 1.9775732985344212, "learning_rate": 1.2841216166338133e-05, "loss": 0.8232, "step": 2594 }, { "epoch": 0.43, "grad_norm": 2.568048580984328, "learning_rate": 1.2836114795604309e-05, "loss": 0.7504, "step": 2595 }, { "epoch": 0.43, "grad_norm": 2.174491357824313, "learning_rate": 1.2831012622114159e-05, "loss": 0.8214, "step": 2596 }, { "epoch": 0.43, "grad_norm": 1.8035489113989338, "learning_rate": 1.282590964731184e-05, "loss": 0.8334, "step": 2597 }, { "epoch": 0.43, "grad_norm": 2.0538513996372374, "learning_rate": 1.2820805872641745e-05, "loss": 0.845, "step": 2598 }, { "epoch": 0.43, "grad_norm": 3.3951782417121925, "learning_rate": 1.2815701299548478e-05, "loss": 0.8198, "step": 2599 }, { "epoch": 0.43, "grad_norm": 0.6467715036668125, "learning_rate": 1.2810595929476884e-05, "loss": 0.3563, "step": 2600 }, { "epoch": 0.43, "grad_norm": 1.6628118334599835, "learning_rate": 1.2805489763872026e-05, "loss": 0.7362, "step": 2601 }, { "epoch": 0.43, "grad_norm": 2.7883279178550255, "learning_rate": 1.280038280417919e-05, "loss": 0.7534, "step": 2602 }, { "epoch": 0.43, "grad_norm": 2.2770885324398273, "learning_rate": 1.2795275051843893e-05, "loss": 0.8557, "step": 2603 }, { "epoch": 0.43, "grad_norm": 2.353165389883061, "learning_rate": 1.2790166508311872e-05, "loss": 0.748, "step": 2604 }, { "epoch": 0.43, "grad_norm": 2.695995486575766, "learning_rate": 1.2785057175029092e-05, "loss": 0.8401, "step": 2605 }, { "epoch": 0.43, "grad_norm": 1.7931409487518988, "learning_rate": 1.277994705344174e-05, "loss": 0.8665, "step": 2606 }, { "epoch": 0.43, "grad_norm": 1.8112409649268004, "learning_rate": 1.2774836144996222e-05, "loss": 0.8015, "step": 2607 }, { "epoch": 0.43, "grad_norm": 2.0762896042523202, "learning_rate": 1.276972445113917e-05, "loss": 0.8275, "step": 2608 }, { "epoch": 0.43, "grad_norm": 2.1931398057025002, "learning_rate": 1.276461197331744e-05, "loss": 0.826, "step": 2609 }, { "epoch": 0.43, "grad_norm": 2.1320680730315607, "learning_rate": 1.2759498712978106e-05, "loss": 0.8993, "step": 2610 }, { "epoch": 0.43, "grad_norm": 0.650497919262475, "learning_rate": 1.2754384671568469e-05, "loss": 0.3399, "step": 2611 }, { "epoch": 0.43, "grad_norm": 1.7410607182832958, "learning_rate": 1.2749269850536045e-05, "loss": 0.7907, "step": 2612 }, { "epoch": 0.43, "grad_norm": 2.0136335132278735, "learning_rate": 1.2744154251328573e-05, "loss": 0.7838, "step": 2613 }, { "epoch": 0.43, "grad_norm": 2.1281897642374994, "learning_rate": 1.2739037875394013e-05, "loss": 0.8348, "step": 2614 }, { "epoch": 0.43, "grad_norm": 1.8088399462630396, "learning_rate": 1.2733920724180542e-05, "loss": 0.7456, "step": 2615 }, { "epoch": 0.43, "grad_norm": 1.8938023685495025, "learning_rate": 1.2728802799136566e-05, "loss": 0.6909, "step": 2616 }, { "epoch": 0.43, "grad_norm": 2.102965318013433, "learning_rate": 1.2723684101710696e-05, "loss": 0.83, "step": 2617 }, { "epoch": 0.43, "grad_norm": 18.10252137973754, "learning_rate": 1.2718564633351773e-05, "loss": 0.8513, "step": 2618 }, { "epoch": 0.43, "grad_norm": 1.6517549904949136, "learning_rate": 1.271344439550885e-05, "loss": 0.7935, "step": 2619 }, { "epoch": 0.43, "grad_norm": 2.1879218139983383, "learning_rate": 1.2708323389631198e-05, "loss": 0.8259, "step": 2620 }, { "epoch": 0.43, "grad_norm": 3.419257369792626, "learning_rate": 1.270320161716831e-05, "loss": 0.8358, "step": 2621 }, { "epoch": 0.43, "grad_norm": 1.7961441672642453, "learning_rate": 1.2698079079569891e-05, "loss": 0.7951, "step": 2622 }, { "epoch": 0.43, "grad_norm": 1.8913107191425071, "learning_rate": 1.2692955778285865e-05, "loss": 0.8549, "step": 2623 }, { "epoch": 0.43, "grad_norm": 2.5661990117356734, "learning_rate": 1.268783171476637e-05, "loss": 0.8515, "step": 2624 }, { "epoch": 0.43, "grad_norm": 2.2368628889406508, "learning_rate": 1.2682706890461764e-05, "loss": 0.9199, "step": 2625 }, { "epoch": 0.43, "grad_norm": 1.619640552271171, "learning_rate": 1.2677581306822613e-05, "loss": 0.798, "step": 2626 }, { "epoch": 0.43, "grad_norm": 1.657759012947803, "learning_rate": 1.267245496529971e-05, "loss": 0.8648, "step": 2627 }, { "epoch": 0.43, "grad_norm": 2.231767633321871, "learning_rate": 1.266732786734405e-05, "loss": 0.7561, "step": 2628 }, { "epoch": 0.43, "grad_norm": 1.6755381857776148, "learning_rate": 1.2662200014406848e-05, "loss": 0.8007, "step": 2629 }, { "epoch": 0.43, "grad_norm": 2.233245333657748, "learning_rate": 1.2657071407939536e-05, "loss": 0.7801, "step": 2630 }, { "epoch": 0.43, "grad_norm": 1.642579630813457, "learning_rate": 1.265194204939375e-05, "loss": 0.8665, "step": 2631 }, { "epoch": 0.43, "grad_norm": 3.178986395458267, "learning_rate": 1.2646811940221346e-05, "loss": 0.7202, "step": 2632 }, { "epoch": 0.43, "grad_norm": 1.4554669944505578, "learning_rate": 1.2641681081874394e-05, "loss": 0.7859, "step": 2633 }, { "epoch": 0.43, "grad_norm": 2.069533378745209, "learning_rate": 1.2636549475805165e-05, "loss": 0.8502, "step": 2634 }, { "epoch": 0.43, "grad_norm": 1.7121512384235649, "learning_rate": 1.2631417123466154e-05, "loss": 0.8373, "step": 2635 }, { "epoch": 0.43, "grad_norm": 2.025694580783905, "learning_rate": 1.2626284026310062e-05, "loss": 0.8337, "step": 2636 }, { "epoch": 0.43, "grad_norm": 1.6566175556413758, "learning_rate": 1.2621150185789803e-05, "loss": 0.7926, "step": 2637 }, { "epoch": 0.43, "grad_norm": 1.5363989768680915, "learning_rate": 1.2616015603358497e-05, "loss": 0.8913, "step": 2638 }, { "epoch": 0.43, "grad_norm": 2.2622582206993984, "learning_rate": 1.261088028046948e-05, "loss": 0.8315, "step": 2639 }, { "epoch": 0.43, "grad_norm": 2.119340971485186, "learning_rate": 1.260574421857629e-05, "loss": 0.8367, "step": 2640 }, { "epoch": 0.43, "grad_norm": 1.773636677775901, "learning_rate": 1.2600607419132685e-05, "loss": 0.8106, "step": 2641 }, { "epoch": 0.43, "grad_norm": 2.077762414381734, "learning_rate": 1.2595469883592617e-05, "loss": 0.8675, "step": 2642 }, { "epoch": 0.43, "grad_norm": 1.517219636013851, "learning_rate": 1.2590331613410261e-05, "loss": 0.8453, "step": 2643 }, { "epoch": 0.43, "grad_norm": 2.3988249882226635, "learning_rate": 1.258519261003999e-05, "loss": 0.7415, "step": 2644 }, { "epoch": 0.43, "grad_norm": 2.1446342374621943, "learning_rate": 1.2580052874936393e-05, "loss": 0.822, "step": 2645 }, { "epoch": 0.43, "grad_norm": 1.8577114544406699, "learning_rate": 1.2574912409554254e-05, "loss": 0.7671, "step": 2646 }, { "epoch": 0.43, "grad_norm": 1.6555393513911436, "learning_rate": 1.2569771215348576e-05, "loss": 0.8314, "step": 2647 }, { "epoch": 0.43, "grad_norm": 1.8886852939624381, "learning_rate": 1.2564629293774561e-05, "loss": 0.8059, "step": 2648 }, { "epoch": 0.44, "grad_norm": 2.3016341437452006, "learning_rate": 1.2559486646287622e-05, "loss": 0.8412, "step": 2649 }, { "epoch": 0.44, "grad_norm": 2.792760080905311, "learning_rate": 1.2554343274343367e-05, "loss": 0.8353, "step": 2650 }, { "epoch": 0.44, "grad_norm": 2.5898258103298226, "learning_rate": 1.2549199179397627e-05, "loss": 0.7951, "step": 2651 }, { "epoch": 0.44, "grad_norm": 2.381021696362323, "learning_rate": 1.2544054362906421e-05, "loss": 0.7617, "step": 2652 }, { "epoch": 0.44, "grad_norm": 1.8498908168251413, "learning_rate": 1.253890882632598e-05, "loss": 0.7479, "step": 2653 }, { "epoch": 0.44, "grad_norm": 1.7608937297900413, "learning_rate": 1.2533762571112737e-05, "loss": 0.851, "step": 2654 }, { "epoch": 0.44, "grad_norm": 1.9774847941144134, "learning_rate": 1.2528615598723333e-05, "loss": 0.7545, "step": 2655 }, { "epoch": 0.44, "grad_norm": 1.9557832103480424, "learning_rate": 1.2523467910614597e-05, "loss": 0.8556, "step": 2656 }, { "epoch": 0.44, "grad_norm": 2.700440624577718, "learning_rate": 1.2518319508243582e-05, "loss": 0.7986, "step": 2657 }, { "epoch": 0.44, "grad_norm": 1.7533626021080115, "learning_rate": 1.2513170393067527e-05, "loss": 0.8059, "step": 2658 }, { "epoch": 0.44, "grad_norm": 2.0042343385745967, "learning_rate": 1.2508020566543876e-05, "loss": 0.8443, "step": 2659 }, { "epoch": 0.44, "grad_norm": 1.470622665013301, "learning_rate": 1.2502870030130285e-05, "loss": 0.8684, "step": 2660 }, { "epoch": 0.44, "grad_norm": 1.5848749361581156, "learning_rate": 1.2497718785284594e-05, "loss": 0.8376, "step": 2661 }, { "epoch": 0.44, "grad_norm": 1.7353655254911606, "learning_rate": 1.2492566833464857e-05, "loss": 0.8349, "step": 2662 }, { "epoch": 0.44, "grad_norm": 3.0887697320251712, "learning_rate": 1.2487414176129322e-05, "loss": 0.835, "step": 2663 }, { "epoch": 0.44, "grad_norm": 1.7147536629429234, "learning_rate": 1.2482260814736438e-05, "loss": 0.8307, "step": 2664 }, { "epoch": 0.44, "grad_norm": 2.0169076058164763, "learning_rate": 1.2477106750744852e-05, "loss": 0.7823, "step": 2665 }, { "epoch": 0.44, "grad_norm": 15.887437002062697, "learning_rate": 1.2471951985613414e-05, "loss": 0.768, "step": 2666 }, { "epoch": 0.44, "grad_norm": 2.104653955262569, "learning_rate": 1.2466796520801163e-05, "loss": 0.8622, "step": 2667 }, { "epoch": 0.44, "grad_norm": 2.090845568642119, "learning_rate": 1.246164035776735e-05, "loss": 0.8597, "step": 2668 }, { "epoch": 0.44, "grad_norm": 0.7067860395589485, "learning_rate": 1.245648349797141e-05, "loss": 0.404, "step": 2669 }, { "epoch": 0.44, "grad_norm": 6.180468904450706, "learning_rate": 1.2451325942872984e-05, "loss": 0.8351, "step": 2670 }, { "epoch": 0.44, "grad_norm": 2.2061790050344947, "learning_rate": 1.2446167693931907e-05, "loss": 0.8824, "step": 2671 }, { "epoch": 0.44, "grad_norm": 2.0599122436585695, "learning_rate": 1.2441008752608212e-05, "loss": 0.7243, "step": 2672 }, { "epoch": 0.44, "grad_norm": 6.772510913647971, "learning_rate": 1.2435849120362123e-05, "loss": 0.7667, "step": 2673 }, { "epoch": 0.44, "grad_norm": 1.9186014351728722, "learning_rate": 1.2430688798654064e-05, "loss": 0.7216, "step": 2674 }, { "epoch": 0.44, "grad_norm": 2.005969295670048, "learning_rate": 1.2425527788944656e-05, "loss": 0.8349, "step": 2675 }, { "epoch": 0.44, "grad_norm": 1.451855251675784, "learning_rate": 1.2420366092694713e-05, "loss": 0.7736, "step": 2676 }, { "epoch": 0.44, "grad_norm": 1.791645723570941, "learning_rate": 1.2415203711365238e-05, "loss": 0.9081, "step": 2677 }, { "epoch": 0.44, "grad_norm": 1.6030146273374413, "learning_rate": 1.2410040646417431e-05, "loss": 0.7059, "step": 2678 }, { "epoch": 0.44, "grad_norm": 2.748797985698082, "learning_rate": 1.2404876899312693e-05, "loss": 0.7573, "step": 2679 }, { "epoch": 0.44, "grad_norm": 2.3619026407543777, "learning_rate": 1.2399712471512607e-05, "loss": 0.869, "step": 2680 }, { "epoch": 0.44, "grad_norm": 3.2597938822394212, "learning_rate": 1.239454736447895e-05, "loss": 0.8165, "step": 2681 }, { "epoch": 0.44, "grad_norm": 1.7120769470534838, "learning_rate": 1.2389381579673704e-05, "loss": 0.8434, "step": 2682 }, { "epoch": 0.44, "grad_norm": 1.8398211875125232, "learning_rate": 1.2384215118559027e-05, "loss": 0.8005, "step": 2683 }, { "epoch": 0.44, "grad_norm": 1.4322882415652154, "learning_rate": 1.2379047982597277e-05, "loss": 0.8256, "step": 2684 }, { "epoch": 0.44, "grad_norm": 2.106787816818749, "learning_rate": 1.2373880173250998e-05, "loss": 0.7375, "step": 2685 }, { "epoch": 0.44, "grad_norm": 1.984370226725397, "learning_rate": 1.2368711691982933e-05, "loss": 0.8376, "step": 2686 }, { "epoch": 0.44, "grad_norm": 1.7675989319790264, "learning_rate": 1.236354254025601e-05, "loss": 0.9189, "step": 2687 }, { "epoch": 0.44, "grad_norm": 1.932971824706353, "learning_rate": 1.235837271953334e-05, "loss": 0.9018, "step": 2688 }, { "epoch": 0.44, "grad_norm": 2.555827200879613, "learning_rate": 1.2353202231278232e-05, "loss": 0.8076, "step": 2689 }, { "epoch": 0.44, "grad_norm": 2.0633041747947583, "learning_rate": 1.2348031076954186e-05, "loss": 0.7618, "step": 2690 }, { "epoch": 0.44, "grad_norm": 1.8839196342794828, "learning_rate": 1.2342859258024882e-05, "loss": 0.8149, "step": 2691 }, { "epoch": 0.44, "grad_norm": 1.6166427124110945, "learning_rate": 1.2337686775954193e-05, "loss": 0.7972, "step": 2692 }, { "epoch": 0.44, "grad_norm": 2.24402147688355, "learning_rate": 1.2332513632206183e-05, "loss": 0.7753, "step": 2693 }, { "epoch": 0.44, "grad_norm": 1.8099457716510048, "learning_rate": 1.2327339828245092e-05, "loss": 0.8313, "step": 2694 }, { "epoch": 0.44, "grad_norm": 2.125179922827506, "learning_rate": 1.2322165365535364e-05, "loss": 0.8108, "step": 2695 }, { "epoch": 0.44, "grad_norm": 1.8290385992681588, "learning_rate": 1.2316990245541609e-05, "loss": 0.8401, "step": 2696 }, { "epoch": 0.44, "grad_norm": 2.3036402539087617, "learning_rate": 1.2311814469728643e-05, "loss": 0.8303, "step": 2697 }, { "epoch": 0.44, "grad_norm": 1.8300201905614202, "learning_rate": 1.2306638039561455e-05, "loss": 0.8342, "step": 2698 }, { "epoch": 0.44, "grad_norm": 2.0479328761048348, "learning_rate": 1.2301460956505225e-05, "loss": 0.7796, "step": 2699 }, { "epoch": 0.44, "grad_norm": 3.284686742080315, "learning_rate": 1.229628322202531e-05, "loss": 0.8326, "step": 2700 }, { "epoch": 0.44, "grad_norm": 1.755104549740373, "learning_rate": 1.229110483758726e-05, "loss": 0.8017, "step": 2701 }, { "epoch": 0.44, "grad_norm": 0.6987809740317876, "learning_rate": 1.2285925804656806e-05, "loss": 0.3731, "step": 2702 }, { "epoch": 0.44, "grad_norm": 2.438002100804544, "learning_rate": 1.2280746124699864e-05, "loss": 0.7619, "step": 2703 }, { "epoch": 0.44, "grad_norm": 2.1252303783292334, "learning_rate": 1.2275565799182527e-05, "loss": 0.8039, "step": 2704 }, { "epoch": 0.44, "grad_norm": 1.873748379530997, "learning_rate": 1.227038482957108e-05, "loss": 0.8108, "step": 2705 }, { "epoch": 0.44, "grad_norm": 2.384219513647643, "learning_rate": 1.2265203217331982e-05, "loss": 0.7929, "step": 2706 }, { "epoch": 0.44, "grad_norm": 1.9905763594225578, "learning_rate": 1.226002096393188e-05, "loss": 0.7661, "step": 2707 }, { "epoch": 0.44, "grad_norm": 2.0017028686601264, "learning_rate": 1.2254838070837596e-05, "loss": 0.8065, "step": 2708 }, { "epoch": 0.44, "grad_norm": 2.0519831340062464, "learning_rate": 1.2249654539516143e-05, "loss": 0.8502, "step": 2709 }, { "epoch": 0.45, "grad_norm": 2.414614355656643, "learning_rate": 1.2244470371434705e-05, "loss": 0.8192, "step": 2710 }, { "epoch": 0.45, "grad_norm": 1.5026744683329494, "learning_rate": 1.2239285568060651e-05, "loss": 0.9098, "step": 2711 }, { "epoch": 0.45, "grad_norm": 2.8220172102924743, "learning_rate": 1.2234100130861525e-05, "loss": 0.8019, "step": 2712 }, { "epoch": 0.45, "grad_norm": 2.6407655565663837, "learning_rate": 1.2228914061305059e-05, "loss": 0.8058, "step": 2713 }, { "epoch": 0.45, "grad_norm": 1.9633049203232824, "learning_rate": 1.2223727360859156e-05, "loss": 0.7654, "step": 2714 }, { "epoch": 0.45, "grad_norm": 2.301967871227839, "learning_rate": 1.2218540030991903e-05, "loss": 0.8692, "step": 2715 }, { "epoch": 0.45, "grad_norm": 1.7827485860130832, "learning_rate": 1.2213352073171562e-05, "loss": 0.7326, "step": 2716 }, { "epoch": 0.45, "grad_norm": 1.85259538638445, "learning_rate": 1.2208163488866573e-05, "loss": 0.8134, "step": 2717 }, { "epoch": 0.45, "grad_norm": 1.9688672075730518, "learning_rate": 1.2202974279545554e-05, "loss": 0.8101, "step": 2718 }, { "epoch": 0.45, "grad_norm": 1.6358036309414916, "learning_rate": 1.2197784446677299e-05, "loss": 0.8151, "step": 2719 }, { "epoch": 0.45, "grad_norm": 2.2653192412059235, "learning_rate": 1.2192593991730781e-05, "loss": 0.7974, "step": 2720 }, { "epoch": 0.45, "grad_norm": 2.0736366955204772, "learning_rate": 1.2187402916175146e-05, "loss": 0.7399, "step": 2721 }, { "epoch": 0.45, "grad_norm": 0.7176886697249744, "learning_rate": 1.2182211221479719e-05, "loss": 0.3585, "step": 2722 }, { "epoch": 0.45, "grad_norm": 1.8103005919437556, "learning_rate": 1.2177018909113994e-05, "loss": 0.8744, "step": 2723 }, { "epoch": 0.45, "grad_norm": 2.270649933515447, "learning_rate": 1.2171825980547646e-05, "loss": 0.803, "step": 2724 }, { "epoch": 0.45, "grad_norm": 1.9437569615355352, "learning_rate": 1.2166632437250527e-05, "loss": 0.7503, "step": 2725 }, { "epoch": 0.45, "grad_norm": 4.024257469140582, "learning_rate": 1.2161438280692655e-05, "loss": 0.6753, "step": 2726 }, { "epoch": 0.45, "grad_norm": 2.8619098578993856, "learning_rate": 1.215624351234422e-05, "loss": 0.7801, "step": 2727 }, { "epoch": 0.45, "grad_norm": 1.7909367168689079, "learning_rate": 1.21510481336756e-05, "loss": 0.7439, "step": 2728 }, { "epoch": 0.45, "grad_norm": 1.7639543912355127, "learning_rate": 1.214585214615733e-05, "loss": 0.8053, "step": 2729 }, { "epoch": 0.45, "grad_norm": 2.1265409927821075, "learning_rate": 1.2140655551260124e-05, "loss": 0.6602, "step": 2730 }, { "epoch": 0.45, "grad_norm": 6.446021505197956, "learning_rate": 1.2135458350454867e-05, "loss": 0.731, "step": 2731 }, { "epoch": 0.45, "grad_norm": 1.6671190448136215, "learning_rate": 1.2130260545212618e-05, "loss": 0.8952, "step": 2732 }, { "epoch": 0.45, "grad_norm": 1.8089807944576013, "learning_rate": 1.2125062137004602e-05, "loss": 0.7014, "step": 2733 }, { "epoch": 0.45, "grad_norm": 1.786643879062114, "learning_rate": 1.2119863127302221e-05, "loss": 0.8481, "step": 2734 }, { "epoch": 0.45, "grad_norm": 1.709512664516858, "learning_rate": 1.211466351757704e-05, "loss": 0.7829, "step": 2735 }, { "epoch": 0.45, "grad_norm": 1.9150804753455335, "learning_rate": 1.2109463309300798e-05, "loss": 0.8725, "step": 2736 }, { "epoch": 0.45, "grad_norm": 1.8304437555407216, "learning_rate": 1.2104262503945406e-05, "loss": 0.7925, "step": 2737 }, { "epoch": 0.45, "grad_norm": 1.6466508211196755, "learning_rate": 1.2099061102982939e-05, "loss": 0.7833, "step": 2738 }, { "epoch": 0.45, "grad_norm": 2.4463690062754013, "learning_rate": 1.2093859107885642e-05, "loss": 0.7985, "step": 2739 }, { "epoch": 0.45, "grad_norm": 1.859587821113339, "learning_rate": 1.2088656520125929e-05, "loss": 0.8094, "step": 2740 }, { "epoch": 0.45, "grad_norm": 1.9038032179239142, "learning_rate": 1.2083453341176386e-05, "loss": 0.756, "step": 2741 }, { "epoch": 0.45, "grad_norm": 2.0175918135285107, "learning_rate": 1.2078249572509755e-05, "loss": 0.8262, "step": 2742 }, { "epoch": 0.45, "grad_norm": 2.3217931709521924, "learning_rate": 1.2073045215598953e-05, "loss": 0.8117, "step": 2743 }, { "epoch": 0.45, "grad_norm": 2.3622291591612563, "learning_rate": 1.2067840271917066e-05, "loss": 0.8554, "step": 2744 }, { "epoch": 0.45, "grad_norm": 2.6768728649484954, "learning_rate": 1.206263474293734e-05, "loss": 0.8316, "step": 2745 }, { "epoch": 0.45, "grad_norm": 2.2650512556984523, "learning_rate": 1.205742863013319e-05, "loss": 0.7998, "step": 2746 }, { "epoch": 0.45, "grad_norm": 3.7325346326129645, "learning_rate": 1.2052221934978197e-05, "loss": 0.7508, "step": 2747 }, { "epoch": 0.45, "grad_norm": 2.7493505200923867, "learning_rate": 1.20470146589461e-05, "loss": 0.8464, "step": 2748 }, { "epoch": 0.45, "grad_norm": 1.781076057796516, "learning_rate": 1.2041806803510809e-05, "loss": 0.8217, "step": 2749 }, { "epoch": 0.45, "grad_norm": 2.0760218474110883, "learning_rate": 1.20365983701464e-05, "loss": 0.8682, "step": 2750 }, { "epoch": 0.45, "grad_norm": 2.0500836326355887, "learning_rate": 1.2031389360327106e-05, "loss": 0.8358, "step": 2751 }, { "epoch": 0.45, "grad_norm": 1.9913858445966808, "learning_rate": 1.202617977552733e-05, "loss": 0.7673, "step": 2752 }, { "epoch": 0.45, "grad_norm": 1.8632112586472342, "learning_rate": 1.2020969617221627e-05, "loss": 0.7868, "step": 2753 }, { "epoch": 0.45, "grad_norm": 1.7379613904563533, "learning_rate": 1.2015758886884727e-05, "loss": 0.7944, "step": 2754 }, { "epoch": 0.45, "grad_norm": 1.9770657142559167, "learning_rate": 1.2010547585991516e-05, "loss": 0.8278, "step": 2755 }, { "epoch": 0.45, "grad_norm": 1.8437229762358047, "learning_rate": 1.200533571601704e-05, "loss": 0.6848, "step": 2756 }, { "epoch": 0.45, "grad_norm": 1.9575674575206599, "learning_rate": 1.2000123278436508e-05, "loss": 0.864, "step": 2757 }, { "epoch": 0.45, "grad_norm": 1.8023743508032692, "learning_rate": 1.199491027472529e-05, "loss": 0.7751, "step": 2758 }, { "epoch": 0.45, "grad_norm": 2.0598713022999844, "learning_rate": 1.1989696706358917e-05, "loss": 0.7293, "step": 2759 }, { "epoch": 0.45, "grad_norm": 1.8787385928086295, "learning_rate": 1.1984482574813076e-05, "loss": 0.7507, "step": 2760 }, { "epoch": 0.45, "grad_norm": 1.9083579729906341, "learning_rate": 1.1979267881563618e-05, "loss": 0.7816, "step": 2761 }, { "epoch": 0.45, "grad_norm": 2.1853401867754503, "learning_rate": 1.197405262808655e-05, "loss": 0.8347, "step": 2762 }, { "epoch": 0.45, "grad_norm": 1.397727284286447, "learning_rate": 1.1968836815858038e-05, "loss": 0.858, "step": 2763 }, { "epoch": 0.45, "grad_norm": 1.8855744411054745, "learning_rate": 1.1963620446354406e-05, "loss": 0.8199, "step": 2764 }, { "epoch": 0.45, "grad_norm": 1.909937112655037, "learning_rate": 1.195840352105214e-05, "loss": 0.8363, "step": 2765 }, { "epoch": 0.45, "grad_norm": 1.9409344872829446, "learning_rate": 1.1953186041427878e-05, "loss": 0.8592, "step": 2766 }, { "epoch": 0.45, "grad_norm": 1.7401462436082638, "learning_rate": 1.1947968008958414e-05, "loss": 0.646, "step": 2767 }, { "epoch": 0.45, "grad_norm": 1.6580147457041323, "learning_rate": 1.1942749425120704e-05, "loss": 0.7643, "step": 2768 }, { "epoch": 0.45, "grad_norm": 1.8402920096119062, "learning_rate": 1.1937530291391857e-05, "loss": 0.7565, "step": 2769 }, { "epoch": 0.45, "grad_norm": 2.2886086444868043, "learning_rate": 1.1932310609249135e-05, "loss": 0.7271, "step": 2770 }, { "epoch": 0.46, "grad_norm": 1.5906788302338382, "learning_rate": 1.1927090380169963e-05, "loss": 0.8488, "step": 2771 }, { "epoch": 0.46, "grad_norm": 1.9195295127119014, "learning_rate": 1.1921869605631914e-05, "loss": 0.857, "step": 2772 }, { "epoch": 0.46, "grad_norm": 0.7023723371244763, "learning_rate": 1.1916648287112714e-05, "loss": 0.3502, "step": 2773 }, { "epoch": 0.46, "grad_norm": 2.2927960212387726, "learning_rate": 1.191142642609025e-05, "loss": 0.8305, "step": 2774 }, { "epoch": 0.46, "grad_norm": 1.782348604346053, "learning_rate": 1.1906204024042556e-05, "loss": 0.8127, "step": 2775 }, { "epoch": 0.46, "grad_norm": 1.8818976444810294, "learning_rate": 1.1900981082447822e-05, "loss": 0.8228, "step": 2776 }, { "epoch": 0.46, "grad_norm": 2.1095275406290086, "learning_rate": 1.1895757602784395e-05, "loss": 0.8982, "step": 2777 }, { "epoch": 0.46, "grad_norm": 1.864393110417774, "learning_rate": 1.1890533586530766e-05, "loss": 0.7428, "step": 2778 }, { "epoch": 0.46, "grad_norm": 1.7777461873708664, "learning_rate": 1.1885309035165582e-05, "loss": 0.7006, "step": 2779 }, { "epoch": 0.46, "grad_norm": 3.363035193367732, "learning_rate": 1.1880083950167642e-05, "loss": 0.7278, "step": 2780 }, { "epoch": 0.46, "grad_norm": 2.3974075264817563, "learning_rate": 1.1874858333015895e-05, "loss": 0.7719, "step": 2781 }, { "epoch": 0.46, "grad_norm": 1.7977334557936264, "learning_rate": 1.186963218518944e-05, "loss": 0.8356, "step": 2782 }, { "epoch": 0.46, "grad_norm": 1.8625576430404747, "learning_rate": 1.1864405508167532e-05, "loss": 0.7749, "step": 2783 }, { "epoch": 0.46, "grad_norm": 0.6639998812827311, "learning_rate": 1.1859178303429566e-05, "loss": 0.3868, "step": 2784 }, { "epoch": 0.46, "grad_norm": 2.1817603904232157, "learning_rate": 1.1853950572455093e-05, "loss": 0.8151, "step": 2785 }, { "epoch": 0.46, "grad_norm": 1.859489697942417, "learning_rate": 1.1848722316723809e-05, "loss": 0.8331, "step": 2786 }, { "epoch": 0.46, "grad_norm": 1.966968603431854, "learning_rate": 1.1843493537715563e-05, "loss": 0.6862, "step": 2787 }, { "epoch": 0.46, "grad_norm": 1.6025245358615223, "learning_rate": 1.1838264236910348e-05, "loss": 0.8409, "step": 2788 }, { "epoch": 0.46, "grad_norm": 1.934194196995202, "learning_rate": 1.183303441578831e-05, "loss": 0.7711, "step": 2789 }, { "epoch": 0.46, "grad_norm": 1.7322828705320978, "learning_rate": 1.1827804075829738e-05, "loss": 0.7664, "step": 2790 }, { "epoch": 0.46, "grad_norm": 2.1610873549074276, "learning_rate": 1.1822573218515068e-05, "loss": 0.7991, "step": 2791 }, { "epoch": 0.46, "grad_norm": 2.042907308282973, "learning_rate": 1.1817341845324882e-05, "loss": 0.789, "step": 2792 }, { "epoch": 0.46, "grad_norm": 2.1921502371229056, "learning_rate": 1.1812109957739907e-05, "loss": 0.8048, "step": 2793 }, { "epoch": 0.46, "grad_norm": 2.698443943669748, "learning_rate": 1.1806877557241023e-05, "loss": 0.7242, "step": 2794 }, { "epoch": 0.46, "grad_norm": 2.3312353406931523, "learning_rate": 1.1801644645309252e-05, "loss": 0.8422, "step": 2795 }, { "epoch": 0.46, "grad_norm": 1.8857638077294194, "learning_rate": 1.179641122342575e-05, "loss": 0.8177, "step": 2796 }, { "epoch": 0.46, "grad_norm": 2.2028005425831845, "learning_rate": 1.1791177293071831e-05, "loss": 0.7723, "step": 2797 }, { "epoch": 0.46, "grad_norm": 0.603722506904039, "learning_rate": 1.1785942855728945e-05, "loss": 0.3739, "step": 2798 }, { "epoch": 0.46, "grad_norm": 1.8137137299165869, "learning_rate": 1.1780707912878693e-05, "loss": 0.8668, "step": 2799 }, { "epoch": 0.46, "grad_norm": 2.3000001344095753, "learning_rate": 1.1775472466002812e-05, "loss": 0.8162, "step": 2800 }, { "epoch": 0.46, "grad_norm": 2.5443416614912118, "learning_rate": 1.1770236516583187e-05, "loss": 0.8566, "step": 2801 }, { "epoch": 0.46, "grad_norm": 1.5635616849534355, "learning_rate": 1.176500006610184e-05, "loss": 0.8419, "step": 2802 }, { "epoch": 0.46, "grad_norm": 2.4137605354032323, "learning_rate": 1.1759763116040936e-05, "loss": 0.7443, "step": 2803 }, { "epoch": 0.46, "grad_norm": 0.6742169872697725, "learning_rate": 1.1754525667882786e-05, "loss": 0.3638, "step": 2804 }, { "epoch": 0.46, "grad_norm": 1.7734637950342598, "learning_rate": 1.1749287723109834e-05, "loss": 0.8948, "step": 2805 }, { "epoch": 0.46, "grad_norm": 1.726784641726507, "learning_rate": 1.1744049283204677e-05, "loss": 0.8249, "step": 2806 }, { "epoch": 0.46, "grad_norm": 1.9237246706506215, "learning_rate": 1.1738810349650036e-05, "loss": 0.8311, "step": 2807 }, { "epoch": 0.46, "grad_norm": 2.5249686708319383, "learning_rate": 1.1733570923928785e-05, "loss": 0.782, "step": 2808 }, { "epoch": 0.46, "grad_norm": 0.6725353089043178, "learning_rate": 1.1728331007523928e-05, "loss": 0.3438, "step": 2809 }, { "epoch": 0.46, "grad_norm": 0.6151255263797533, "learning_rate": 1.1723090601918616e-05, "loss": 0.3674, "step": 2810 }, { "epoch": 0.46, "grad_norm": 2.4148948509426753, "learning_rate": 1.1717849708596136e-05, "loss": 0.7785, "step": 2811 }, { "epoch": 0.46, "grad_norm": 1.7773709010141387, "learning_rate": 1.171260832903991e-05, "loss": 0.8088, "step": 2812 }, { "epoch": 0.46, "grad_norm": 3.1424152624604087, "learning_rate": 1.1707366464733501e-05, "loss": 0.8267, "step": 2813 }, { "epoch": 0.46, "grad_norm": 0.6562125164863066, "learning_rate": 1.1702124117160603e-05, "loss": 0.4049, "step": 2814 }, { "epoch": 0.46, "grad_norm": 1.8317172024735255, "learning_rate": 1.1696881287805056e-05, "loss": 0.8391, "step": 2815 }, { "epoch": 0.46, "grad_norm": 14.224748552627517, "learning_rate": 1.1691637978150831e-05, "loss": 0.784, "step": 2816 }, { "epoch": 0.46, "grad_norm": 2.994821000679811, "learning_rate": 1.1686394189682035e-05, "loss": 0.8424, "step": 2817 }, { "epoch": 0.46, "grad_norm": 1.7627057552878191, "learning_rate": 1.1681149923882913e-05, "loss": 0.8215, "step": 2818 }, { "epoch": 0.46, "grad_norm": 1.8620609356572162, "learning_rate": 1.1675905182237839e-05, "loss": 0.7865, "step": 2819 }, { "epoch": 0.46, "grad_norm": 0.6960551071156126, "learning_rate": 1.167065996623133e-05, "loss": 0.3837, "step": 2820 }, { "epoch": 0.46, "grad_norm": 1.5962717767778152, "learning_rate": 1.166541427734803e-05, "loss": 0.747, "step": 2821 }, { "epoch": 0.46, "grad_norm": 3.8612227857922163, "learning_rate": 1.1660168117072725e-05, "loss": 0.7808, "step": 2822 }, { "epoch": 0.46, "grad_norm": 1.6144997113232018, "learning_rate": 1.1654921486890327e-05, "loss": 0.7801, "step": 2823 }, { "epoch": 0.46, "grad_norm": 1.899536525157087, "learning_rate": 1.1649674388285883e-05, "loss": 0.801, "step": 2824 }, { "epoch": 0.46, "grad_norm": 0.6371802618143, "learning_rate": 1.1644426822744575e-05, "loss": 0.3415, "step": 2825 }, { "epoch": 0.46, "grad_norm": 1.9010587984705654, "learning_rate": 1.1639178791751715e-05, "loss": 0.8361, "step": 2826 }, { "epoch": 0.46, "grad_norm": 1.704646535039375, "learning_rate": 1.1633930296792744e-05, "loss": 0.7921, "step": 2827 }, { "epoch": 0.46, "grad_norm": 2.3723185319276343, "learning_rate": 1.1628681339353244e-05, "loss": 0.7736, "step": 2828 }, { "epoch": 0.46, "grad_norm": 2.1859468727425493, "learning_rate": 1.1623431920918916e-05, "loss": 0.8498, "step": 2829 }, { "epoch": 0.46, "grad_norm": 1.800209194640924, "learning_rate": 1.1618182042975596e-05, "loss": 0.8496, "step": 2830 }, { "epoch": 0.46, "grad_norm": 1.6775060311469783, "learning_rate": 1.1612931707009253e-05, "loss": 0.7709, "step": 2831 }, { "epoch": 0.47, "grad_norm": 1.5969235139416489, "learning_rate": 1.1607680914505985e-05, "loss": 0.8487, "step": 2832 }, { "epoch": 0.47, "grad_norm": 0.6038230884951858, "learning_rate": 1.1602429666952015e-05, "loss": 0.3471, "step": 2833 }, { "epoch": 0.47, "grad_norm": 1.7946790654024463, "learning_rate": 1.15971779658337e-05, "loss": 0.8704, "step": 2834 }, { "epoch": 0.47, "grad_norm": 1.6936428102756533, "learning_rate": 1.1591925812637523e-05, "loss": 0.8632, "step": 2835 }, { "epoch": 0.47, "grad_norm": 0.6865231678278363, "learning_rate": 1.1586673208850091e-05, "loss": 0.38, "step": 2836 }, { "epoch": 0.47, "grad_norm": 1.9975478851918105, "learning_rate": 1.158142015595815e-05, "loss": 0.755, "step": 2837 }, { "epoch": 0.47, "grad_norm": 1.749542232948429, "learning_rate": 1.1576166655448558e-05, "loss": 0.8285, "step": 2838 }, { "epoch": 0.47, "grad_norm": 1.9252950892618508, "learning_rate": 1.1570912708808311e-05, "loss": 0.7575, "step": 2839 }, { "epoch": 0.47, "grad_norm": 3.143349506848307, "learning_rate": 1.1565658317524526e-05, "loss": 0.8733, "step": 2840 }, { "epoch": 0.47, "grad_norm": 1.477508985232883, "learning_rate": 1.1560403483084449e-05, "loss": 0.7696, "step": 2841 }, { "epoch": 0.47, "grad_norm": 2.004600415195024, "learning_rate": 1.1555148206975449e-05, "loss": 0.7988, "step": 2842 }, { "epoch": 0.47, "grad_norm": 2.123448008732707, "learning_rate": 1.1549892490685018e-05, "loss": 0.8893, "step": 2843 }, { "epoch": 0.47, "grad_norm": 1.9557241851642664, "learning_rate": 1.1544636335700778e-05, "loss": 0.7435, "step": 2844 }, { "epoch": 0.47, "grad_norm": 1.9006581251238814, "learning_rate": 1.1539379743510475e-05, "loss": 0.8376, "step": 2845 }, { "epoch": 0.47, "grad_norm": 1.8336984894384647, "learning_rate": 1.1534122715601974e-05, "loss": 0.8534, "step": 2846 }, { "epoch": 0.47, "grad_norm": 2.226283054018393, "learning_rate": 1.1528865253463266e-05, "loss": 0.7406, "step": 2847 }, { "epoch": 0.47, "grad_norm": 2.0147126924276777, "learning_rate": 1.1523607358582462e-05, "loss": 0.7943, "step": 2848 }, { "epoch": 0.47, "grad_norm": 2.241107377655835, "learning_rate": 1.1518349032447806e-05, "loss": 0.8098, "step": 2849 }, { "epoch": 0.47, "grad_norm": 1.78034907304151, "learning_rate": 1.1513090276547647e-05, "loss": 0.7774, "step": 2850 }, { "epoch": 0.47, "grad_norm": 1.9942533571843386, "learning_rate": 1.150783109237047e-05, "loss": 0.8105, "step": 2851 }, { "epoch": 0.47, "grad_norm": 1.6033345571132973, "learning_rate": 1.1502571481404873e-05, "loss": 0.7298, "step": 2852 }, { "epoch": 0.47, "grad_norm": 1.812406284066017, "learning_rate": 1.149731144513958e-05, "loss": 0.7466, "step": 2853 }, { "epoch": 0.47, "grad_norm": 1.9678084150144282, "learning_rate": 1.1492050985063432e-05, "loss": 0.8166, "step": 2854 }, { "epoch": 0.47, "grad_norm": 2.3165410101404706, "learning_rate": 1.1486790102665393e-05, "loss": 0.8567, "step": 2855 }, { "epoch": 0.47, "grad_norm": 1.8270487746830435, "learning_rate": 1.148152879943454e-05, "loss": 0.809, "step": 2856 }, { "epoch": 0.47, "grad_norm": 2.092313193469043, "learning_rate": 1.147626707686008e-05, "loss": 0.8022, "step": 2857 }, { "epoch": 0.47, "grad_norm": 1.57099344180747, "learning_rate": 1.1471004936431327e-05, "loss": 0.8361, "step": 2858 }, { "epoch": 0.47, "grad_norm": 2.1466008226474464, "learning_rate": 1.1465742379637725e-05, "loss": 0.7802, "step": 2859 }, { "epoch": 0.47, "grad_norm": 2.0317211691395305, "learning_rate": 1.1460479407968827e-05, "loss": 0.8897, "step": 2860 }, { "epoch": 0.47, "grad_norm": 3.045758253014785, "learning_rate": 1.1455216022914302e-05, "loss": 0.8101, "step": 2861 }, { "epoch": 0.47, "grad_norm": 1.6885910434475717, "learning_rate": 1.1449952225963946e-05, "loss": 0.7698, "step": 2862 }, { "epoch": 0.47, "grad_norm": 2.9816086962845656, "learning_rate": 1.144468801860766e-05, "loss": 0.7517, "step": 2863 }, { "epoch": 0.47, "grad_norm": 0.7161333698708825, "learning_rate": 1.143942340233547e-05, "loss": 0.3804, "step": 2864 }, { "epoch": 0.47, "grad_norm": 3.374650143272454, "learning_rate": 1.1434158378637514e-05, "loss": 0.789, "step": 2865 }, { "epoch": 0.47, "grad_norm": 1.5884214325417811, "learning_rate": 1.1428892949004049e-05, "loss": 0.7743, "step": 2866 }, { "epoch": 0.47, "grad_norm": 1.731824140068214, "learning_rate": 1.1423627114925434e-05, "loss": 0.8518, "step": 2867 }, { "epoch": 0.47, "grad_norm": 2.5659370574516642, "learning_rate": 1.1418360877892165e-05, "loss": 0.7874, "step": 2868 }, { "epoch": 0.47, "grad_norm": 1.5469767388245712, "learning_rate": 1.1413094239394833e-05, "loss": 0.7526, "step": 2869 }, { "epoch": 0.47, "grad_norm": 1.7059898622794223, "learning_rate": 1.140782720092415e-05, "loss": 0.854, "step": 2870 }, { "epoch": 0.47, "grad_norm": 2.4472499722423895, "learning_rate": 1.1402559763970943e-05, "loss": 0.7811, "step": 2871 }, { "epoch": 0.47, "grad_norm": 1.669132675956861, "learning_rate": 1.139729193002614e-05, "loss": 0.8537, "step": 2872 }, { "epoch": 0.47, "grad_norm": 2.3437541609012755, "learning_rate": 1.1392023700580796e-05, "loss": 0.7709, "step": 2873 }, { "epoch": 0.47, "grad_norm": 1.700730637760982, "learning_rate": 1.1386755077126073e-05, "loss": 0.7659, "step": 2874 }, { "epoch": 0.47, "grad_norm": 2.845196004600727, "learning_rate": 1.1381486061153244e-05, "loss": 0.7886, "step": 2875 }, { "epoch": 0.47, "grad_norm": 2.4489997730575106, "learning_rate": 1.1376216654153689e-05, "loss": 0.8424, "step": 2876 }, { "epoch": 0.47, "grad_norm": 1.547969017291147, "learning_rate": 1.1370946857618908e-05, "loss": 0.8319, "step": 2877 }, { "epoch": 0.47, "grad_norm": 1.9611968108401752, "learning_rate": 1.1365676673040502e-05, "loss": 0.8419, "step": 2878 }, { "epoch": 0.47, "grad_norm": 0.6688071345803641, "learning_rate": 1.1360406101910187e-05, "loss": 0.3838, "step": 2879 }, { "epoch": 0.47, "grad_norm": 1.8447887368139886, "learning_rate": 1.1355135145719784e-05, "loss": 0.8519, "step": 2880 }, { "epoch": 0.47, "grad_norm": 1.7536510099982612, "learning_rate": 1.1349863805961233e-05, "loss": 0.7131, "step": 2881 }, { "epoch": 0.47, "grad_norm": 2.076752348698794, "learning_rate": 1.1344592084126573e-05, "loss": 0.7883, "step": 2882 }, { "epoch": 0.47, "grad_norm": 1.642592049717221, "learning_rate": 1.133931998170795e-05, "loss": 0.7977, "step": 2883 }, { "epoch": 0.47, "grad_norm": 1.4568818162122217, "learning_rate": 1.1334047500197625e-05, "loss": 0.8336, "step": 2884 }, { "epoch": 0.47, "grad_norm": 1.7873604200345, "learning_rate": 1.1328774641087958e-05, "loss": 0.768, "step": 2885 }, { "epoch": 0.47, "grad_norm": 1.868603488776779, "learning_rate": 1.132350140587143e-05, "loss": 0.7354, "step": 2886 }, { "epoch": 0.47, "grad_norm": 1.442825117949762, "learning_rate": 1.1318227796040608e-05, "loss": 0.8396, "step": 2887 }, { "epoch": 0.47, "grad_norm": 2.3086208459722006, "learning_rate": 1.1312953813088183e-05, "loss": 0.7144, "step": 2888 }, { "epoch": 0.47, "grad_norm": 1.7252068967905847, "learning_rate": 1.1307679458506947e-05, "loss": 0.8703, "step": 2889 }, { "epoch": 0.47, "grad_norm": 1.9912492500836139, "learning_rate": 1.1302404733789787e-05, "loss": 0.7721, "step": 2890 }, { "epoch": 0.47, "grad_norm": 2.6500455558180342, "learning_rate": 1.1297129640429707e-05, "loss": 0.7952, "step": 2891 }, { "epoch": 0.47, "grad_norm": 1.7860493873941006, "learning_rate": 1.1291854179919812e-05, "loss": 0.7739, "step": 2892 }, { "epoch": 0.48, "grad_norm": 2.0187844575553195, "learning_rate": 1.1286578353753313e-05, "loss": 0.7229, "step": 2893 }, { "epoch": 0.48, "grad_norm": 1.7536414724298524, "learning_rate": 1.1281302163423515e-05, "loss": 0.8787, "step": 2894 }, { "epoch": 0.48, "grad_norm": 1.8294019874952534, "learning_rate": 1.1276025610423835e-05, "loss": 0.784, "step": 2895 }, { "epoch": 0.48, "grad_norm": 1.608555794392794, "learning_rate": 1.1270748696247791e-05, "loss": 0.8472, "step": 2896 }, { "epoch": 0.48, "grad_norm": 2.1213669270103943, "learning_rate": 1.1265471422389003e-05, "loss": 0.8366, "step": 2897 }, { "epoch": 0.48, "grad_norm": 1.818642824095989, "learning_rate": 1.1260193790341186e-05, "loss": 0.7959, "step": 2898 }, { "epoch": 0.48, "grad_norm": 2.3901156647860473, "learning_rate": 1.1254915801598173e-05, "loss": 0.8799, "step": 2899 }, { "epoch": 0.48, "grad_norm": 1.8579012151386014, "learning_rate": 1.1249637457653881e-05, "loss": 0.8579, "step": 2900 }, { "epoch": 0.48, "grad_norm": 1.6586390205026678, "learning_rate": 1.1244358760002337e-05, "loss": 0.7975, "step": 2901 }, { "epoch": 0.48, "grad_norm": 1.913039246464597, "learning_rate": 1.1239079710137659e-05, "loss": 0.789, "step": 2902 }, { "epoch": 0.48, "grad_norm": 0.62311119858615, "learning_rate": 1.1233800309554083e-05, "loss": 0.381, "step": 2903 }, { "epoch": 0.48, "grad_norm": 2.2638056320711986, "learning_rate": 1.1228520559745922e-05, "loss": 0.8229, "step": 2904 }, { "epoch": 0.48, "grad_norm": 1.578118029366939, "learning_rate": 1.1223240462207601e-05, "loss": 0.7544, "step": 2905 }, { "epoch": 0.48, "grad_norm": 2.2337996116155847, "learning_rate": 1.121796001843364e-05, "loss": 0.8553, "step": 2906 }, { "epoch": 0.48, "grad_norm": 2.6403899983070955, "learning_rate": 1.1212679229918657e-05, "loss": 0.8138, "step": 2907 }, { "epoch": 0.48, "grad_norm": 2.2861747536194614, "learning_rate": 1.1207398098157371e-05, "loss": 0.7152, "step": 2908 }, { "epoch": 0.48, "grad_norm": 1.74910922793143, "learning_rate": 1.1202116624644594e-05, "loss": 0.8469, "step": 2909 }, { "epoch": 0.48, "grad_norm": 2.1371669414054626, "learning_rate": 1.1196834810875234e-05, "loss": 0.7672, "step": 2910 }, { "epoch": 0.48, "grad_norm": 7.6679487600761185, "learning_rate": 1.11915526583443e-05, "loss": 0.774, "step": 2911 }, { "epoch": 0.48, "grad_norm": 2.3709413657972593, "learning_rate": 1.1186270168546891e-05, "loss": 0.8809, "step": 2912 }, { "epoch": 0.48, "grad_norm": 2.15823794527503, "learning_rate": 1.1180987342978209e-05, "loss": 0.8075, "step": 2913 }, { "epoch": 0.48, "grad_norm": 2.2835785512804136, "learning_rate": 1.1175704183133542e-05, "loss": 0.8183, "step": 2914 }, { "epoch": 0.48, "grad_norm": 1.9482885177030038, "learning_rate": 1.1170420690508281e-05, "loss": 0.7857, "step": 2915 }, { "epoch": 0.48, "grad_norm": 2.7086745370955208, "learning_rate": 1.1165136866597905e-05, "loss": 0.6482, "step": 2916 }, { "epoch": 0.48, "grad_norm": 0.6832630238193294, "learning_rate": 1.1159852712897989e-05, "loss": 0.3979, "step": 2917 }, { "epoch": 0.48, "grad_norm": 3.7099341018899272, "learning_rate": 1.1154568230904204e-05, "loss": 0.7046, "step": 2918 }, { "epoch": 0.48, "grad_norm": 2.276395526863815, "learning_rate": 1.1149283422112312e-05, "loss": 0.7867, "step": 2919 }, { "epoch": 0.48, "grad_norm": 0.6420529200824527, "learning_rate": 1.1143998288018163e-05, "loss": 0.3794, "step": 2920 }, { "epoch": 0.48, "grad_norm": 2.3392663958576425, "learning_rate": 1.1138712830117706e-05, "loss": 0.8148, "step": 2921 }, { "epoch": 0.48, "grad_norm": 1.7206469008451677, "learning_rate": 1.1133427049906978e-05, "loss": 0.8318, "step": 2922 }, { "epoch": 0.48, "grad_norm": 2.2269274717107472, "learning_rate": 1.1128140948882107e-05, "loss": 0.8483, "step": 2923 }, { "epoch": 0.48, "grad_norm": 2.489392491967421, "learning_rate": 1.1122854528539315e-05, "loss": 0.8915, "step": 2924 }, { "epoch": 0.48, "grad_norm": 2.2130554989239273, "learning_rate": 1.111756779037491e-05, "loss": 0.7745, "step": 2925 }, { "epoch": 0.48, "grad_norm": 1.640287721365421, "learning_rate": 1.1112280735885295e-05, "loss": 0.8104, "step": 2926 }, { "epoch": 0.48, "grad_norm": 2.0717590615058636, "learning_rate": 1.1106993366566957e-05, "loss": 0.8194, "step": 2927 }, { "epoch": 0.48, "grad_norm": 2.018685497271576, "learning_rate": 1.1101705683916473e-05, "loss": 0.7323, "step": 2928 }, { "epoch": 0.48, "grad_norm": 1.7865581730604858, "learning_rate": 1.1096417689430517e-05, "loss": 0.7956, "step": 2929 }, { "epoch": 0.48, "grad_norm": 2.1425587666653585, "learning_rate": 1.1091129384605837e-05, "loss": 0.7978, "step": 2930 }, { "epoch": 0.48, "grad_norm": 1.8724401234902552, "learning_rate": 1.1085840770939283e-05, "loss": 0.75, "step": 2931 }, { "epoch": 0.48, "grad_norm": 1.7701894375915628, "learning_rate": 1.108055184992778e-05, "loss": 0.8, "step": 2932 }, { "epoch": 0.48, "grad_norm": 1.513049850691372, "learning_rate": 1.1075262623068352e-05, "loss": 0.7772, "step": 2933 }, { "epoch": 0.48, "grad_norm": 2.2984781143061377, "learning_rate": 1.10699730918581e-05, "loss": 0.8447, "step": 2934 }, { "epoch": 0.48, "grad_norm": 1.935086339089957, "learning_rate": 1.1064683257794216e-05, "loss": 0.8153, "step": 2935 }, { "epoch": 0.48, "grad_norm": 1.7032925042920875, "learning_rate": 1.1059393122373976e-05, "loss": 0.806, "step": 2936 }, { "epoch": 0.48, "grad_norm": 2.647930776097991, "learning_rate": 1.1054102687094738e-05, "loss": 0.7906, "step": 2937 }, { "epoch": 0.48, "grad_norm": 1.5412207859474936, "learning_rate": 1.1048811953453955e-05, "loss": 0.8235, "step": 2938 }, { "epoch": 0.48, "grad_norm": 0.7416522451751737, "learning_rate": 1.1043520922949156e-05, "loss": 0.3576, "step": 2939 }, { "epoch": 0.48, "grad_norm": 2.5803184902317393, "learning_rate": 1.1038229597077954e-05, "loss": 0.8157, "step": 2940 }, { "epoch": 0.48, "grad_norm": 2.393230548811411, "learning_rate": 1.1032937977338048e-05, "loss": 0.8199, "step": 2941 }, { "epoch": 0.48, "grad_norm": 1.6722693853511506, "learning_rate": 1.1027646065227222e-05, "loss": 0.8923, "step": 2942 }, { "epoch": 0.48, "grad_norm": 1.7367225886547326, "learning_rate": 1.1022353862243338e-05, "loss": 0.791, "step": 2943 }, { "epoch": 0.48, "grad_norm": 2.8130346274864406, "learning_rate": 1.1017061369884345e-05, "loss": 0.8409, "step": 2944 }, { "epoch": 0.48, "grad_norm": 1.9221875565051714, "learning_rate": 1.101176858964827e-05, "loss": 0.7985, "step": 2945 }, { "epoch": 0.48, "grad_norm": 1.7055321466501285, "learning_rate": 1.1006475523033225e-05, "loss": 0.8293, "step": 2946 }, { "epoch": 0.48, "grad_norm": 0.6690010051090827, "learning_rate": 1.10011821715374e-05, "loss": 0.3766, "step": 2947 }, { "epoch": 0.48, "grad_norm": 1.5133543228769208, "learning_rate": 1.0995888536659067e-05, "loss": 0.792, "step": 2948 }, { "epoch": 0.48, "grad_norm": 2.0053180575162615, "learning_rate": 1.0990594619896581e-05, "loss": 0.8263, "step": 2949 }, { "epoch": 0.48, "grad_norm": 2.1391815875040154, "learning_rate": 1.098530042274837e-05, "loss": 0.7884, "step": 2950 }, { "epoch": 0.48, "grad_norm": 2.227927132516693, "learning_rate": 1.0980005946712949e-05, "loss": 0.7512, "step": 2951 }, { "epoch": 0.48, "grad_norm": 1.660033533009031, "learning_rate": 1.0974711193288906e-05, "loss": 0.8544, "step": 2952 }, { "epoch": 0.49, "grad_norm": 1.547701743026725, "learning_rate": 1.096941616397491e-05, "loss": 0.8717, "step": 2953 }, { "epoch": 0.49, "grad_norm": 11.278634561577208, "learning_rate": 1.0964120860269708e-05, "loss": 0.8191, "step": 2954 }, { "epoch": 0.49, "grad_norm": 2.0240500836812876, "learning_rate": 1.0958825283672126e-05, "loss": 0.8413, "step": 2955 }, { "epoch": 0.49, "grad_norm": 2.670878154411938, "learning_rate": 1.0953529435681063e-05, "loss": 0.7591, "step": 2956 }, { "epoch": 0.49, "grad_norm": 2.2650121319808, "learning_rate": 1.09482333177955e-05, "loss": 0.7831, "step": 2957 }, { "epoch": 0.49, "grad_norm": 2.912241275367314, "learning_rate": 1.0942936931514492e-05, "loss": 0.7927, "step": 2958 }, { "epoch": 0.49, "grad_norm": 2.149877965127423, "learning_rate": 1.0937640278337167e-05, "loss": 0.6942, "step": 2959 }, { "epoch": 0.49, "grad_norm": 1.4785910437341243, "learning_rate": 1.0932343359762736e-05, "loss": 0.7707, "step": 2960 }, { "epoch": 0.49, "grad_norm": 2.079290424886108, "learning_rate": 1.0927046177290477e-05, "loss": 0.8095, "step": 2961 }, { "epoch": 0.49, "grad_norm": 2.045383804074842, "learning_rate": 1.092174873241975e-05, "loss": 0.7864, "step": 2962 }, { "epoch": 0.49, "grad_norm": 1.9468165554812493, "learning_rate": 1.0916451026649981e-05, "loss": 0.8114, "step": 2963 }, { "epoch": 0.49, "grad_norm": 2.2679526523298392, "learning_rate": 1.091115306148068e-05, "loss": 0.8278, "step": 2964 }, { "epoch": 0.49, "grad_norm": 1.560042184219441, "learning_rate": 1.0905854838411418e-05, "loss": 0.7879, "step": 2965 }, { "epoch": 0.49, "grad_norm": 1.7185795453202275, "learning_rate": 1.0900556358941855e-05, "loss": 0.7827, "step": 2966 }, { "epoch": 0.49, "grad_norm": 1.8293317267315512, "learning_rate": 1.0895257624571705e-05, "loss": 0.7906, "step": 2967 }, { "epoch": 0.49, "grad_norm": 2.623168398643354, "learning_rate": 1.088995863680077e-05, "loss": 0.7998, "step": 2968 }, { "epoch": 0.49, "grad_norm": 2.4193242061386298, "learning_rate": 1.0884659397128911e-05, "loss": 0.8049, "step": 2969 }, { "epoch": 0.49, "grad_norm": 3.3212163488777695, "learning_rate": 1.0879359907056074e-05, "loss": 0.8266, "step": 2970 }, { "epoch": 0.49, "grad_norm": 1.766742078467462, "learning_rate": 1.0874060168082266e-05, "loss": 0.8307, "step": 2971 }, { "epoch": 0.49, "grad_norm": 1.9004704052626313, "learning_rate": 1.0868760181707565e-05, "loss": 0.8535, "step": 2972 }, { "epoch": 0.49, "grad_norm": 2.013628578463802, "learning_rate": 1.0863459949432122e-05, "loss": 0.798, "step": 2973 }, { "epoch": 0.49, "grad_norm": 1.9277934135117385, "learning_rate": 1.0858159472756157e-05, "loss": 0.7816, "step": 2974 }, { "epoch": 0.49, "grad_norm": 2.036086792764665, "learning_rate": 1.085285875317996e-05, "loss": 0.8163, "step": 2975 }, { "epoch": 0.49, "grad_norm": 1.6724527778430809, "learning_rate": 1.0847557792203886e-05, "loss": 0.7372, "step": 2976 }, { "epoch": 0.49, "grad_norm": 2.2541365900725303, "learning_rate": 1.0842256591328362e-05, "loss": 0.799, "step": 2977 }, { "epoch": 0.49, "grad_norm": 2.0305543917666955, "learning_rate": 1.0836955152053883e-05, "loss": 0.7436, "step": 2978 }, { "epoch": 0.49, "grad_norm": 1.69463211367848, "learning_rate": 1.083165347588101e-05, "loss": 0.736, "step": 2979 }, { "epoch": 0.49, "grad_norm": 2.1057598479266204, "learning_rate": 1.082635156431037e-05, "loss": 0.8325, "step": 2980 }, { "epoch": 0.49, "grad_norm": 2.358745312564923, "learning_rate": 1.0821049418842654e-05, "loss": 0.7759, "step": 2981 }, { "epoch": 0.49, "grad_norm": 1.8883072885826102, "learning_rate": 1.0815747040978628e-05, "loss": 0.7937, "step": 2982 }, { "epoch": 0.49, "grad_norm": 1.8693360034724038, "learning_rate": 1.081044443221912e-05, "loss": 0.7296, "step": 2983 }, { "epoch": 0.49, "grad_norm": 2.522094474780933, "learning_rate": 1.0805141594065022e-05, "loss": 0.76, "step": 2984 }, { "epoch": 0.49, "grad_norm": 2.0730152905104013, "learning_rate": 1.0799838528017288e-05, "loss": 0.8362, "step": 2985 }, { "epoch": 0.49, "grad_norm": 1.6723770188775566, "learning_rate": 1.0794535235576941e-05, "loss": 0.7727, "step": 2986 }, { "epoch": 0.49, "grad_norm": 1.9992612605225226, "learning_rate": 1.0789231718245069e-05, "loss": 0.844, "step": 2987 }, { "epoch": 0.49, "grad_norm": 2.9168115725794856, "learning_rate": 1.0783927977522819e-05, "loss": 0.7596, "step": 2988 }, { "epoch": 0.49, "grad_norm": 1.5741031530090488, "learning_rate": 1.0778624014911403e-05, "loss": 0.7504, "step": 2989 }, { "epoch": 0.49, "grad_norm": 2.2199741954272096, "learning_rate": 1.0773319831912099e-05, "loss": 0.8397, "step": 2990 }, { "epoch": 0.49, "grad_norm": 2.0420070326641837, "learning_rate": 1.0768015430026244e-05, "loss": 0.8724, "step": 2991 }, { "epoch": 0.49, "grad_norm": 1.7047938041253092, "learning_rate": 1.0762710810755234e-05, "loss": 0.7871, "step": 2992 }, { "epoch": 0.49, "grad_norm": 1.9666232890184727, "learning_rate": 1.0757405975600534e-05, "loss": 0.7524, "step": 2993 }, { "epoch": 0.49, "grad_norm": 1.8188627608509345, "learning_rate": 1.0752100926063669e-05, "loss": 0.8769, "step": 2994 }, { "epoch": 0.49, "grad_norm": 3.100021741806301, "learning_rate": 1.074679566364622e-05, "loss": 0.8415, "step": 2995 }, { "epoch": 0.49, "grad_norm": 1.6957987009343325, "learning_rate": 1.0741490189849826e-05, "loss": 0.822, "step": 2996 }, { "epoch": 0.49, "grad_norm": 1.6964523349805334, "learning_rate": 1.0736184506176195e-05, "loss": 0.8006, "step": 2997 }, { "epoch": 0.49, "grad_norm": 1.7656806593541154, "learning_rate": 1.0730878614127087e-05, "loss": 0.8757, "step": 2998 }, { "epoch": 0.49, "grad_norm": 1.495108619115121, "learning_rate": 1.0725572515204327e-05, "loss": 0.7852, "step": 2999 }, { "epoch": 0.49, "grad_norm": 2.1375996150222947, "learning_rate": 1.0720266210909793e-05, "loss": 0.8352, "step": 3000 }, { "epoch": 0.49, "grad_norm": 2.882822399775859, "learning_rate": 1.0714959702745424e-05, "loss": 0.7811, "step": 3001 }, { "epoch": 0.49, "grad_norm": 1.7269864035641767, "learning_rate": 1.0709652992213216e-05, "loss": 0.799, "step": 3002 }, { "epoch": 0.49, "grad_norm": 2.036439058086878, "learning_rate": 1.0704346080815218e-05, "loss": 0.8446, "step": 3003 }, { "epoch": 0.49, "grad_norm": 1.9934877180008346, "learning_rate": 1.0699038970053544e-05, "loss": 0.8033, "step": 3004 }, { "epoch": 0.49, "grad_norm": 6.562237904682171, "learning_rate": 1.069373166143036e-05, "loss": 0.7847, "step": 3005 }, { "epoch": 0.49, "grad_norm": 1.600075495010211, "learning_rate": 1.068842415644789e-05, "loss": 0.7565, "step": 3006 }, { "epoch": 0.49, "grad_norm": 1.9758774528245808, "learning_rate": 1.0683116456608411e-05, "loss": 0.7567, "step": 3007 }, { "epoch": 0.49, "grad_norm": 1.896230062420855, "learning_rate": 1.0677808563414256e-05, "loss": 0.744, "step": 3008 }, { "epoch": 0.49, "grad_norm": 2.1774742560363554, "learning_rate": 1.0672500478367813e-05, "loss": 0.7543, "step": 3009 }, { "epoch": 0.49, "grad_norm": 1.829308600561015, "learning_rate": 1.0667192202971525e-05, "loss": 0.821, "step": 3010 }, { "epoch": 0.49, "grad_norm": 1.943150362537447, "learning_rate": 1.0661883738727888e-05, "loss": 0.8087, "step": 3011 }, { "epoch": 0.49, "grad_norm": 1.545847657346782, "learning_rate": 1.0656575087139452e-05, "loss": 0.784, "step": 3012 }, { "epoch": 0.49, "grad_norm": 2.726305400612422, "learning_rate": 1.0651266249708816e-05, "loss": 0.8221, "step": 3013 }, { "epoch": 0.5, "grad_norm": 2.0038642715869166, "learning_rate": 1.064595722793864e-05, "loss": 0.7637, "step": 3014 }, { "epoch": 0.5, "grad_norm": 2.871231011069208, "learning_rate": 1.0640648023331625e-05, "loss": 0.778, "step": 3015 }, { "epoch": 0.5, "grad_norm": 2.0786367731279607, "learning_rate": 1.063533863739054e-05, "loss": 0.7688, "step": 3016 }, { "epoch": 0.5, "grad_norm": 1.7208606345252024, "learning_rate": 1.0630029071618188e-05, "loss": 0.7369, "step": 3017 }, { "epoch": 0.5, "grad_norm": 6.544539361228477, "learning_rate": 1.0624719327517434e-05, "loss": 0.8407, "step": 3018 }, { "epoch": 0.5, "grad_norm": 2.300120858224187, "learning_rate": 1.061940940659119e-05, "loss": 0.7727, "step": 3019 }, { "epoch": 0.5, "grad_norm": 1.5733664768770563, "learning_rate": 1.0614099310342414e-05, "loss": 0.733, "step": 3020 }, { "epoch": 0.5, "grad_norm": 0.6670015750694934, "learning_rate": 1.0608789040274122e-05, "loss": 0.386, "step": 3021 }, { "epoch": 0.5, "grad_norm": 3.3566063418030074, "learning_rate": 1.0603478597889374e-05, "loss": 0.7411, "step": 3022 }, { "epoch": 0.5, "grad_norm": 1.6300134387356293, "learning_rate": 1.0598167984691276e-05, "loss": 0.8141, "step": 3023 }, { "epoch": 0.5, "grad_norm": 2.3335915111215972, "learning_rate": 1.059285720218299e-05, "loss": 0.7244, "step": 3024 }, { "epoch": 0.5, "grad_norm": 2.0350653150095335, "learning_rate": 1.058754625186772e-05, "loss": 0.8186, "step": 3025 }, { "epoch": 0.5, "grad_norm": 2.0106339677029363, "learning_rate": 1.0582235135248718e-05, "loss": 0.7291, "step": 3026 }, { "epoch": 0.5, "grad_norm": 1.9186199025483108, "learning_rate": 1.0576923853829284e-05, "loss": 0.796, "step": 3027 }, { "epoch": 0.5, "grad_norm": 2.94972798950535, "learning_rate": 1.057161240911277e-05, "loss": 0.7158, "step": 3028 }, { "epoch": 0.5, "grad_norm": 1.7510968985785837, "learning_rate": 1.0566300802602565e-05, "loss": 0.7182, "step": 3029 }, { "epoch": 0.5, "grad_norm": 1.973878551417186, "learning_rate": 1.056098903580211e-05, "loss": 0.6955, "step": 3030 }, { "epoch": 0.5, "grad_norm": 1.9543287857608667, "learning_rate": 1.0555677110214889e-05, "loss": 0.8196, "step": 3031 }, { "epoch": 0.5, "grad_norm": 1.890044394934055, "learning_rate": 1.0550365027344432e-05, "loss": 0.7127, "step": 3032 }, { "epoch": 0.5, "grad_norm": 1.934282550268113, "learning_rate": 1.0545052788694312e-05, "loss": 0.8555, "step": 3033 }, { "epoch": 0.5, "grad_norm": 1.9926639533310717, "learning_rate": 1.0539740395768143e-05, "loss": 0.7379, "step": 3034 }, { "epoch": 0.5, "grad_norm": 2.0645493888973983, "learning_rate": 1.0534427850069595e-05, "loss": 0.7805, "step": 3035 }, { "epoch": 0.5, "grad_norm": 1.6444660197203427, "learning_rate": 1.0529115153102366e-05, "loss": 0.7608, "step": 3036 }, { "epoch": 0.5, "grad_norm": 2.0991387200918465, "learning_rate": 1.0523802306370206e-05, "loss": 0.8175, "step": 3037 }, { "epoch": 0.5, "grad_norm": 1.5116845995707284, "learning_rate": 1.0518489311376905e-05, "loss": 0.8524, "step": 3038 }, { "epoch": 0.5, "grad_norm": 2.968633641341017, "learning_rate": 1.0513176169626293e-05, "loss": 0.8111, "step": 3039 }, { "epoch": 0.5, "grad_norm": 2.0166969171592974, "learning_rate": 1.0507862882622249e-05, "loss": 0.8155, "step": 3040 }, { "epoch": 0.5, "grad_norm": 1.978248862436365, "learning_rate": 1.0502549451868683e-05, "loss": 0.8334, "step": 3041 }, { "epoch": 0.5, "grad_norm": 2.057134725483621, "learning_rate": 1.0497235878869554e-05, "loss": 0.7434, "step": 3042 }, { "epoch": 0.5, "grad_norm": 2.1145499985131537, "learning_rate": 1.0491922165128853e-05, "loss": 0.7811, "step": 3043 }, { "epoch": 0.5, "grad_norm": 1.7205740147252444, "learning_rate": 1.048660831215062e-05, "loss": 0.8172, "step": 3044 }, { "epoch": 0.5, "grad_norm": 1.697497960420134, "learning_rate": 1.0481294321438928e-05, "loss": 0.7782, "step": 3045 }, { "epoch": 0.5, "grad_norm": 3.3876503212437163, "learning_rate": 1.0475980194497892e-05, "loss": 0.7971, "step": 3046 }, { "epoch": 0.5, "grad_norm": 2.0632405297728433, "learning_rate": 1.0470665932831661e-05, "loss": 0.8086, "step": 3047 }, { "epoch": 0.5, "grad_norm": 1.778330103119036, "learning_rate": 1.0465351537944429e-05, "loss": 0.8031, "step": 3048 }, { "epoch": 0.5, "grad_norm": 1.7880293208924811, "learning_rate": 1.0460037011340422e-05, "loss": 0.8181, "step": 3049 }, { "epoch": 0.5, "grad_norm": 2.1391243471688153, "learning_rate": 1.0454722354523906e-05, "loss": 0.864, "step": 3050 }, { "epoch": 0.5, "grad_norm": 2.4705248640571518, "learning_rate": 1.0449407568999186e-05, "loss": 0.7357, "step": 3051 }, { "epoch": 0.5, "grad_norm": 3.3293503318884965, "learning_rate": 1.04440926562706e-05, "loss": 0.8114, "step": 3052 }, { "epoch": 0.5, "grad_norm": 2.440329148483958, "learning_rate": 1.043877761784252e-05, "loss": 0.7895, "step": 3053 }, { "epoch": 0.5, "grad_norm": 2.41265934706599, "learning_rate": 1.0433462455219359e-05, "loss": 0.7894, "step": 3054 }, { "epoch": 0.5, "grad_norm": 2.7483159141647127, "learning_rate": 1.0428147169905563e-05, "loss": 0.7265, "step": 3055 }, { "epoch": 0.5, "grad_norm": 2.268217870332376, "learning_rate": 1.0422831763405612e-05, "loss": 0.7834, "step": 3056 }, { "epoch": 0.5, "grad_norm": 2.031237612382003, "learning_rate": 1.0417516237224017e-05, "loss": 0.7905, "step": 3057 }, { "epoch": 0.5, "grad_norm": 2.1810075381526715, "learning_rate": 1.0412200592865331e-05, "loss": 0.7485, "step": 3058 }, { "epoch": 0.5, "grad_norm": 2.02791234100495, "learning_rate": 1.0406884831834133e-05, "loss": 0.7616, "step": 3059 }, { "epoch": 0.5, "grad_norm": 1.5082623421081043, "learning_rate": 1.0401568955635042e-05, "loss": 0.7847, "step": 3060 }, { "epoch": 0.5, "grad_norm": 2.1853447774490844, "learning_rate": 1.0396252965772702e-05, "loss": 0.7829, "step": 3061 }, { "epoch": 0.5, "grad_norm": 1.573495821217272, "learning_rate": 1.0390936863751791e-05, "loss": 0.849, "step": 3062 }, { "epoch": 0.5, "grad_norm": 2.3362182046867925, "learning_rate": 1.0385620651077024e-05, "loss": 0.8148, "step": 3063 }, { "epoch": 0.5, "grad_norm": 2.261182921630019, "learning_rate": 1.0380304329253144e-05, "loss": 0.7934, "step": 3064 }, { "epoch": 0.5, "grad_norm": 2.7602913104378257, "learning_rate": 1.0374987899784925e-05, "loss": 0.8062, "step": 3065 }, { "epoch": 0.5, "grad_norm": 1.4612369179670401, "learning_rate": 1.036967136417717e-05, "loss": 0.7908, "step": 3066 }, { "epoch": 0.5, "grad_norm": 1.8367466698313628, "learning_rate": 1.036435472393471e-05, "loss": 0.8304, "step": 3067 }, { "epoch": 0.5, "grad_norm": 3.3954604535167863, "learning_rate": 1.0359037980562416e-05, "loss": 0.7872, "step": 3068 }, { "epoch": 0.5, "grad_norm": 2.7837786718910875, "learning_rate": 1.0353721135565173e-05, "loss": 0.7634, "step": 3069 }, { "epoch": 0.5, "grad_norm": 2.1828144269947716, "learning_rate": 1.034840419044791e-05, "loss": 0.8614, "step": 3070 }, { "epoch": 0.5, "grad_norm": 0.7113692481091689, "learning_rate": 1.0343087146715573e-05, "loss": 0.3655, "step": 3071 }, { "epoch": 0.5, "grad_norm": 1.7487985912604131, "learning_rate": 1.033777000587314e-05, "loss": 0.8222, "step": 3072 }, { "epoch": 0.5, "grad_norm": 2.692648912379708, "learning_rate": 1.0332452769425619e-05, "loss": 0.9103, "step": 3073 }, { "epoch": 0.5, "grad_norm": 1.9097203682460668, "learning_rate": 1.0327135438878035e-05, "loss": 0.7426, "step": 3074 }, { "epoch": 0.51, "grad_norm": 1.7142099168517888, "learning_rate": 1.0321818015735459e-05, "loss": 0.7637, "step": 3075 }, { "epoch": 0.51, "grad_norm": 2.109688634353327, "learning_rate": 1.031650050150297e-05, "loss": 0.7949, "step": 3076 }, { "epoch": 0.51, "grad_norm": 1.8633069007111764, "learning_rate": 1.0311182897685681e-05, "loss": 0.78, "step": 3077 }, { "epoch": 0.51, "grad_norm": 1.9220341581576508, "learning_rate": 1.0305865205788728e-05, "loss": 0.809, "step": 3078 }, { "epoch": 0.51, "grad_norm": 1.9107192854173252, "learning_rate": 1.0300547427317269e-05, "loss": 0.871, "step": 3079 }, { "epoch": 0.51, "grad_norm": 1.745886854443446, "learning_rate": 1.0295229563776494e-05, "loss": 0.8214, "step": 3080 }, { "epoch": 0.51, "grad_norm": 1.7366836276177842, "learning_rate": 1.0289911616671613e-05, "loss": 0.8013, "step": 3081 }, { "epoch": 0.51, "grad_norm": 1.6322068234535865, "learning_rate": 1.0284593587507857e-05, "loss": 0.807, "step": 3082 }, { "epoch": 0.51, "grad_norm": 1.550352864368298, "learning_rate": 1.0279275477790487e-05, "loss": 0.7889, "step": 3083 }, { "epoch": 0.51, "grad_norm": 1.556565834218794, "learning_rate": 1.0273957289024778e-05, "loss": 0.7887, "step": 3084 }, { "epoch": 0.51, "grad_norm": 2.149683738967723, "learning_rate": 1.0268639022716033e-05, "loss": 0.7814, "step": 3085 }, { "epoch": 0.51, "grad_norm": 2.417179940410822, "learning_rate": 1.0263320680369581e-05, "loss": 0.7958, "step": 3086 }, { "epoch": 0.51, "grad_norm": 1.6466434093857272, "learning_rate": 1.0258002263490767e-05, "loss": 0.7095, "step": 3087 }, { "epoch": 0.51, "grad_norm": 1.7393650881423008, "learning_rate": 1.0252683773584953e-05, "loss": 0.8466, "step": 3088 }, { "epoch": 0.51, "grad_norm": 1.587978518743524, "learning_rate": 1.0247365212157527e-05, "loss": 0.7918, "step": 3089 }, { "epoch": 0.51, "grad_norm": 1.8445061093633663, "learning_rate": 1.02420465807139e-05, "loss": 0.8097, "step": 3090 }, { "epoch": 0.51, "grad_norm": 2.705290439086644, "learning_rate": 1.0236727880759496e-05, "loss": 0.7888, "step": 3091 }, { "epoch": 0.51, "grad_norm": 1.913429141207387, "learning_rate": 1.0231409113799764e-05, "loss": 0.8086, "step": 3092 }, { "epoch": 0.51, "grad_norm": 1.6998485922572157, "learning_rate": 1.0226090281340168e-05, "loss": 0.812, "step": 3093 }, { "epoch": 0.51, "grad_norm": 1.7674029308111825, "learning_rate": 1.0220771384886194e-05, "loss": 0.7461, "step": 3094 }, { "epoch": 0.51, "grad_norm": 1.8637602378119387, "learning_rate": 1.0215452425943346e-05, "loss": 0.796, "step": 3095 }, { "epoch": 0.51, "grad_norm": 1.806594773223512, "learning_rate": 1.021013340601714e-05, "loss": 0.8, "step": 3096 }, { "epoch": 0.51, "grad_norm": 2.337956732231649, "learning_rate": 1.0204814326613115e-05, "loss": 0.8279, "step": 3097 }, { "epoch": 0.51, "grad_norm": 2.540708141883109, "learning_rate": 1.0199495189236828e-05, "loss": 0.7954, "step": 3098 }, { "epoch": 0.51, "grad_norm": 1.7245382296454166, "learning_rate": 1.0194175995393847e-05, "loss": 0.8313, "step": 3099 }, { "epoch": 0.51, "grad_norm": 1.9189610399984727, "learning_rate": 1.0188856746589757e-05, "loss": 0.8159, "step": 3100 }, { "epoch": 0.51, "grad_norm": 1.8652045498183192, "learning_rate": 1.0183537444330165e-05, "loss": 0.7384, "step": 3101 }, { "epoch": 0.51, "grad_norm": 2.8240318777183995, "learning_rate": 1.0178218090120683e-05, "loss": 0.7941, "step": 3102 }, { "epoch": 0.51, "grad_norm": 0.7042817298866724, "learning_rate": 1.0172898685466947e-05, "loss": 0.3453, "step": 3103 }, { "epoch": 0.51, "grad_norm": 2.1398404531022734, "learning_rate": 1.01675792318746e-05, "loss": 0.8283, "step": 3104 }, { "epoch": 0.51, "grad_norm": 0.6616992795130131, "learning_rate": 1.0162259730849306e-05, "loss": 0.3573, "step": 3105 }, { "epoch": 0.51, "grad_norm": 2.2135331645698426, "learning_rate": 1.0156940183896737e-05, "loss": 0.824, "step": 3106 }, { "epoch": 0.51, "grad_norm": 1.560017658593601, "learning_rate": 1.0151620592522577e-05, "loss": 0.7575, "step": 3107 }, { "epoch": 0.51, "grad_norm": 1.6341559810788306, "learning_rate": 1.0146300958232528e-05, "loss": 0.8411, "step": 3108 }, { "epoch": 0.51, "grad_norm": 2.1065117926089956, "learning_rate": 1.0140981282532301e-05, "loss": 0.8622, "step": 3109 }, { "epoch": 0.51, "grad_norm": 1.9307290182596208, "learning_rate": 1.0135661566927619e-05, "loss": 0.8333, "step": 3110 }, { "epoch": 0.51, "grad_norm": 1.7308160176614782, "learning_rate": 1.0130341812924215e-05, "loss": 0.7564, "step": 3111 }, { "epoch": 0.51, "grad_norm": 1.8883953503661965, "learning_rate": 1.0125022022027834e-05, "loss": 0.8399, "step": 3112 }, { "epoch": 0.51, "grad_norm": 1.9143816152486846, "learning_rate": 1.0119702195744236e-05, "loss": 0.899, "step": 3113 }, { "epoch": 0.51, "grad_norm": 3.3861259790059903, "learning_rate": 1.011438233557918e-05, "loss": 0.8081, "step": 3114 }, { "epoch": 0.51, "grad_norm": 1.8105460033011342, "learning_rate": 1.0109062443038446e-05, "loss": 0.7373, "step": 3115 }, { "epoch": 0.51, "grad_norm": 2.2080675276413353, "learning_rate": 1.0103742519627818e-05, "loss": 0.7776, "step": 3116 }, { "epoch": 0.51, "grad_norm": 2.636624049673885, "learning_rate": 1.0098422566853086e-05, "loss": 0.7974, "step": 3117 }, { "epoch": 0.51, "grad_norm": 2.223165767585192, "learning_rate": 1.0093102586220056e-05, "loss": 0.7919, "step": 3118 }, { "epoch": 0.51, "grad_norm": 1.8482998499895589, "learning_rate": 1.0087782579234532e-05, "loss": 0.8026, "step": 3119 }, { "epoch": 0.51, "grad_norm": 1.9909917524897507, "learning_rate": 1.0082462547402337e-05, "loss": 0.7172, "step": 3120 }, { "epoch": 0.51, "grad_norm": 1.9870792491001286, "learning_rate": 1.0077142492229288e-05, "loss": 0.7533, "step": 3121 }, { "epoch": 0.51, "grad_norm": 2.5312831563238856, "learning_rate": 1.007182241522122e-05, "loss": 0.7208, "step": 3122 }, { "epoch": 0.51, "grad_norm": 1.6122531723084599, "learning_rate": 1.0066502317883969e-05, "loss": 0.8512, "step": 3123 }, { "epoch": 0.51, "grad_norm": 1.915345848017032, "learning_rate": 1.0061182201723377e-05, "loss": 0.7874, "step": 3124 }, { "epoch": 0.51, "grad_norm": 2.2597800426839316, "learning_rate": 1.005586206824529e-05, "loss": 0.704, "step": 3125 }, { "epoch": 0.51, "grad_norm": 1.96464059820775, "learning_rate": 1.0050541918955564e-05, "loss": 0.8066, "step": 3126 }, { "epoch": 0.51, "grad_norm": 2.8300524162802314, "learning_rate": 1.0045221755360053e-05, "loss": 0.7509, "step": 3127 }, { "epoch": 0.51, "grad_norm": 2.883291566277754, "learning_rate": 1.0039901578964619e-05, "loss": 0.8569, "step": 3128 }, { "epoch": 0.51, "grad_norm": 2.2649003926335216, "learning_rate": 1.0034581391275129e-05, "loss": 0.7975, "step": 3129 }, { "epoch": 0.51, "grad_norm": 1.8840497621947918, "learning_rate": 1.0029261193797446e-05, "loss": 0.8484, "step": 3130 }, { "epoch": 0.51, "grad_norm": 1.8223799395539055, "learning_rate": 1.0023940988037446e-05, "loss": 0.8159, "step": 3131 }, { "epoch": 0.51, "grad_norm": 1.5879291508780322, "learning_rate": 1.0018620775500999e-05, "loss": 0.7592, "step": 3132 }, { "epoch": 0.51, "grad_norm": 1.820225534540755, "learning_rate": 1.0013300557693981e-05, "loss": 0.8349, "step": 3133 }, { "epoch": 0.51, "grad_norm": 2.249086028745976, "learning_rate": 1.0007980336122267e-05, "loss": 0.75, "step": 3134 }, { "epoch": 0.51, "grad_norm": 1.8530551895522458, "learning_rate": 1.0002660112291736e-05, "loss": 0.719, "step": 3135 }, { "epoch": 0.52, "grad_norm": 2.199406845246707, "learning_rate": 9.997339887708269e-06, "loss": 0.8579, "step": 3136 }, { "epoch": 0.52, "grad_norm": 2.068423219422181, "learning_rate": 9.992019663877738e-06, "loss": 0.7609, "step": 3137 }, { "epoch": 0.52, "grad_norm": 2.089442275413046, "learning_rate": 9.986699442306025e-06, "loss": 0.7855, "step": 3138 }, { "epoch": 0.52, "grad_norm": 2.086794272552133, "learning_rate": 9.981379224499006e-06, "loss": 0.8663, "step": 3139 }, { "epoch": 0.52, "grad_norm": 2.008239014478185, "learning_rate": 9.976059011962557e-06, "loss": 0.8207, "step": 3140 }, { "epoch": 0.52, "grad_norm": 1.8696648291758604, "learning_rate": 9.970738806202557e-06, "loss": 0.7877, "step": 3141 }, { "epoch": 0.52, "grad_norm": 2.0836455067575645, "learning_rate": 9.965418608724875e-06, "loss": 0.8299, "step": 3142 }, { "epoch": 0.52, "grad_norm": 2.118405540768941, "learning_rate": 9.960098421035383e-06, "loss": 0.7497, "step": 3143 }, { "epoch": 0.52, "grad_norm": 1.7363696248493732, "learning_rate": 9.95477824463995e-06, "loss": 0.797, "step": 3144 }, { "epoch": 0.52, "grad_norm": 2.0209815503368906, "learning_rate": 9.94945808104444e-06, "loss": 0.7119, "step": 3145 }, { "epoch": 0.52, "grad_norm": 2.2097710637618406, "learning_rate": 9.944137931754712e-06, "loss": 0.7701, "step": 3146 }, { "epoch": 0.52, "grad_norm": 2.382137691040404, "learning_rate": 9.938817798276627e-06, "loss": 0.7685, "step": 3147 }, { "epoch": 0.52, "grad_norm": 2.267027265096861, "learning_rate": 9.933497682116035e-06, "loss": 0.7467, "step": 3148 }, { "epoch": 0.52, "grad_norm": 2.3017585438013106, "learning_rate": 9.928177584778783e-06, "loss": 0.8079, "step": 3149 }, { "epoch": 0.52, "grad_norm": 1.5905686779314532, "learning_rate": 9.922857507770716e-06, "loss": 0.7542, "step": 3150 }, { "epoch": 0.52, "grad_norm": 1.7186971827478945, "learning_rate": 9.917537452597667e-06, "loss": 0.8423, "step": 3151 }, { "epoch": 0.52, "grad_norm": 2.1611547137294185, "learning_rate": 9.912217420765471e-06, "loss": 0.752, "step": 3152 }, { "epoch": 0.52, "grad_norm": 2.0950287108758014, "learning_rate": 9.906897413779949e-06, "loss": 0.8464, "step": 3153 }, { "epoch": 0.52, "grad_norm": 1.9223667415484658, "learning_rate": 9.901577433146915e-06, "loss": 0.8107, "step": 3154 }, { "epoch": 0.52, "grad_norm": 2.494939247867733, "learning_rate": 9.896257480372184e-06, "loss": 0.8049, "step": 3155 }, { "epoch": 0.52, "grad_norm": 2.349698989394084, "learning_rate": 9.890937556961554e-06, "loss": 0.8705, "step": 3156 }, { "epoch": 0.52, "grad_norm": 2.9113651653188524, "learning_rate": 9.88561766442082e-06, "loss": 0.7323, "step": 3157 }, { "epoch": 0.52, "grad_norm": 1.8576178920304427, "learning_rate": 9.88029780425577e-06, "loss": 0.7578, "step": 3158 }, { "epoch": 0.52, "grad_norm": 1.9655271925894233, "learning_rate": 9.87497797797217e-06, "loss": 0.7791, "step": 3159 }, { "epoch": 0.52, "grad_norm": 1.6986356642502802, "learning_rate": 9.86965818707579e-06, "loss": 0.8508, "step": 3160 }, { "epoch": 0.52, "grad_norm": 2.0851353704677735, "learning_rate": 9.864338433072386e-06, "loss": 0.7067, "step": 3161 }, { "epoch": 0.52, "grad_norm": 3.9871137838730775, "learning_rate": 9.859018717467704e-06, "loss": 0.7879, "step": 3162 }, { "epoch": 0.52, "grad_norm": 1.5491063850781204, "learning_rate": 9.853699041767473e-06, "loss": 0.7699, "step": 3163 }, { "epoch": 0.52, "grad_norm": 2.0846588401168487, "learning_rate": 9.848379407477425e-06, "loss": 0.8039, "step": 3164 }, { "epoch": 0.52, "grad_norm": 1.853340331797549, "learning_rate": 9.843059816103267e-06, "loss": 0.8492, "step": 3165 }, { "epoch": 0.52, "grad_norm": 1.5032959472375464, "learning_rate": 9.837740269150696e-06, "loss": 0.8597, "step": 3166 }, { "epoch": 0.52, "grad_norm": 2.358278933170749, "learning_rate": 9.832420768125402e-06, "loss": 0.7714, "step": 3167 }, { "epoch": 0.52, "grad_norm": 1.8129233639347386, "learning_rate": 9.827101314533056e-06, "loss": 0.7582, "step": 3168 }, { "epoch": 0.52, "grad_norm": 2.1754205774220994, "learning_rate": 9.82178190987932e-06, "loss": 0.7239, "step": 3169 }, { "epoch": 0.52, "grad_norm": 1.7404747155170273, "learning_rate": 9.816462555669838e-06, "loss": 0.8275, "step": 3170 }, { "epoch": 0.52, "grad_norm": 1.5793832218008297, "learning_rate": 9.811143253410244e-06, "loss": 0.7897, "step": 3171 }, { "epoch": 0.52, "grad_norm": 2.042458409023525, "learning_rate": 9.805824004606156e-06, "loss": 0.7153, "step": 3172 }, { "epoch": 0.52, "grad_norm": 1.5382131738294422, "learning_rate": 9.800504810763176e-06, "loss": 0.7852, "step": 3173 }, { "epoch": 0.52, "grad_norm": 1.5983860216234518, "learning_rate": 9.795185673386886e-06, "loss": 0.8097, "step": 3174 }, { "epoch": 0.52, "grad_norm": 2.0373570204665565, "learning_rate": 9.789866593982863e-06, "loss": 0.8187, "step": 3175 }, { "epoch": 0.52, "grad_norm": 2.1979985663121218, "learning_rate": 9.784547574056657e-06, "loss": 0.8426, "step": 3176 }, { "epoch": 0.52, "grad_norm": 1.798674707698713, "learning_rate": 9.779228615113808e-06, "loss": 0.7794, "step": 3177 }, { "epoch": 0.52, "grad_norm": 1.9409646416716952, "learning_rate": 9.773909718659831e-06, "loss": 0.7964, "step": 3178 }, { "epoch": 0.52, "grad_norm": 1.6716459727166215, "learning_rate": 9.768590886200241e-06, "loss": 0.8117, "step": 3179 }, { "epoch": 0.52, "grad_norm": 2.274372521927738, "learning_rate": 9.76327211924051e-06, "loss": 0.8316, "step": 3180 }, { "epoch": 0.52, "grad_norm": 2.4932422733360755, "learning_rate": 9.757953419286107e-06, "loss": 0.7714, "step": 3181 }, { "epoch": 0.52, "grad_norm": 1.7717927206670907, "learning_rate": 9.752634787842478e-06, "loss": 0.7463, "step": 3182 }, { "epoch": 0.52, "grad_norm": 3.0530308684578156, "learning_rate": 9.747316226415052e-06, "loss": 0.7878, "step": 3183 }, { "epoch": 0.52, "grad_norm": 2.3607220558945214, "learning_rate": 9.741997736509238e-06, "loss": 0.6689, "step": 3184 }, { "epoch": 0.52, "grad_norm": 0.7047804423786189, "learning_rate": 9.73667931963042e-06, "loss": 0.4037, "step": 3185 }, { "epoch": 0.52, "grad_norm": 1.764054489335034, "learning_rate": 9.731360977283969e-06, "loss": 0.7972, "step": 3186 }, { "epoch": 0.52, "grad_norm": 1.7162328607642379, "learning_rate": 9.726042710975224e-06, "loss": 0.7463, "step": 3187 }, { "epoch": 0.52, "grad_norm": 1.8870147471571475, "learning_rate": 9.720724522209518e-06, "loss": 0.8496, "step": 3188 }, { "epoch": 0.52, "grad_norm": 1.9266169370119093, "learning_rate": 9.715406412492145e-06, "loss": 0.792, "step": 3189 }, { "epoch": 0.52, "grad_norm": 2.0723006919140823, "learning_rate": 9.710088383328392e-06, "loss": 0.7081, "step": 3190 }, { "epoch": 0.52, "grad_norm": 1.934182846670032, "learning_rate": 9.704770436223508e-06, "loss": 0.8182, "step": 3191 }, { "epoch": 0.52, "grad_norm": 1.8920890774830423, "learning_rate": 9.699452572682734e-06, "loss": 0.7603, "step": 3192 }, { "epoch": 0.52, "grad_norm": 2.3544371112677416, "learning_rate": 9.694134794211277e-06, "loss": 0.7765, "step": 3193 }, { "epoch": 0.52, "grad_norm": 2.1426657250509824, "learning_rate": 9.68881710231432e-06, "loss": 0.8112, "step": 3194 }, { "epoch": 0.52, "grad_norm": 2.2064188631445285, "learning_rate": 9.683499498497032e-06, "loss": 0.8096, "step": 3195 }, { "epoch": 0.52, "grad_norm": 1.8131098004041577, "learning_rate": 9.678181984264543e-06, "loss": 0.8396, "step": 3196 }, { "epoch": 0.53, "grad_norm": 1.8929586661619433, "learning_rate": 9.672864561121963e-06, "loss": 0.7633, "step": 3197 }, { "epoch": 0.53, "grad_norm": 2.099055998412845, "learning_rate": 9.667547230574386e-06, "loss": 0.8195, "step": 3198 }, { "epoch": 0.53, "grad_norm": 1.8575052739390518, "learning_rate": 9.662229994126862e-06, "loss": 0.7163, "step": 3199 }, { "epoch": 0.53, "grad_norm": 1.8544747401391104, "learning_rate": 9.65691285328443e-06, "loss": 0.8428, "step": 3200 }, { "epoch": 0.53, "grad_norm": 0.5857876584185482, "learning_rate": 9.651595809552094e-06, "loss": 0.3722, "step": 3201 }, { "epoch": 0.53, "grad_norm": 2.3399305694368646, "learning_rate": 9.64627886443483e-06, "loss": 0.7505, "step": 3202 }, { "epoch": 0.53, "grad_norm": 0.6393883317435544, "learning_rate": 9.64096201943759e-06, "loss": 0.3869, "step": 3203 }, { "epoch": 0.53, "grad_norm": 1.8941367156899436, "learning_rate": 9.635645276065293e-06, "loss": 0.7998, "step": 3204 }, { "epoch": 0.53, "grad_norm": 1.7578761278238686, "learning_rate": 9.630328635822835e-06, "loss": 0.7652, "step": 3205 }, { "epoch": 0.53, "grad_norm": 2.1457169491494636, "learning_rate": 9.625012100215078e-06, "loss": 0.8437, "step": 3206 }, { "epoch": 0.53, "grad_norm": 1.5141196393069436, "learning_rate": 9.61969567074686e-06, "loss": 0.8243, "step": 3207 }, { "epoch": 0.53, "grad_norm": 2.442249338035923, "learning_rate": 9.61437934892298e-06, "loss": 0.8409, "step": 3208 }, { "epoch": 0.53, "grad_norm": 1.7801000656098165, "learning_rate": 9.609063136248214e-06, "loss": 0.7974, "step": 3209 }, { "epoch": 0.53, "grad_norm": 1.8602339514607396, "learning_rate": 9.603747034227301e-06, "loss": 0.7547, "step": 3210 }, { "epoch": 0.53, "grad_norm": 3.109491001873453, "learning_rate": 9.598431044364963e-06, "loss": 0.7863, "step": 3211 }, { "epoch": 0.53, "grad_norm": 3.6992204449452637, "learning_rate": 9.593115168165868e-06, "loss": 0.7685, "step": 3212 }, { "epoch": 0.53, "grad_norm": 1.9624273434117885, "learning_rate": 9.587799407134672e-06, "loss": 0.6789, "step": 3213 }, { "epoch": 0.53, "grad_norm": 1.866067038611792, "learning_rate": 9.582483762775987e-06, "loss": 0.8375, "step": 3214 }, { "epoch": 0.53, "grad_norm": 1.980938480276971, "learning_rate": 9.577168236594393e-06, "loss": 0.9126, "step": 3215 }, { "epoch": 0.53, "grad_norm": 1.9529927967937595, "learning_rate": 9.571852830094439e-06, "loss": 0.8005, "step": 3216 }, { "epoch": 0.53, "grad_norm": 4.22607181371801, "learning_rate": 9.566537544780641e-06, "loss": 0.8496, "step": 3217 }, { "epoch": 0.53, "grad_norm": 1.9852959074779009, "learning_rate": 9.56122238215748e-06, "loss": 0.7765, "step": 3218 }, { "epoch": 0.53, "grad_norm": 1.6008192694697945, "learning_rate": 9.555907343729402e-06, "loss": 0.7641, "step": 3219 }, { "epoch": 0.53, "grad_norm": 1.6120213565890293, "learning_rate": 9.550592431000814e-06, "loss": 0.8413, "step": 3220 }, { "epoch": 0.53, "grad_norm": 2.2674248936981938, "learning_rate": 9.545277645476094e-06, "loss": 0.8229, "step": 3221 }, { "epoch": 0.53, "grad_norm": 1.8899498393105578, "learning_rate": 9.53996298865958e-06, "loss": 0.8426, "step": 3222 }, { "epoch": 0.53, "grad_norm": 2.568845212882856, "learning_rate": 9.534648462055576e-06, "loss": 0.7811, "step": 3223 }, { "epoch": 0.53, "grad_norm": 4.163831675362972, "learning_rate": 9.529334067168344e-06, "loss": 0.8039, "step": 3224 }, { "epoch": 0.53, "grad_norm": 2.3377288874775792, "learning_rate": 9.524019805502113e-06, "loss": 0.8208, "step": 3225 }, { "epoch": 0.53, "grad_norm": 2.1898161951164585, "learning_rate": 9.518705678561075e-06, "loss": 0.8741, "step": 3226 }, { "epoch": 0.53, "grad_norm": 2.1898959791454065, "learning_rate": 9.513391687849383e-06, "loss": 0.7806, "step": 3227 }, { "epoch": 0.53, "grad_norm": 1.7833262501195175, "learning_rate": 9.50807783487115e-06, "loss": 0.8002, "step": 3228 }, { "epoch": 0.53, "grad_norm": 1.8454777895128585, "learning_rate": 9.50276412113045e-06, "loss": 0.8553, "step": 3229 }, { "epoch": 0.53, "grad_norm": 1.8048115759573748, "learning_rate": 9.497450548131319e-06, "loss": 0.7507, "step": 3230 }, { "epoch": 0.53, "grad_norm": 2.2004954630525826, "learning_rate": 9.492137117377755e-06, "loss": 0.7668, "step": 3231 }, { "epoch": 0.53, "grad_norm": 1.6846772586741923, "learning_rate": 9.48682383037371e-06, "loss": 0.7579, "step": 3232 }, { "epoch": 0.53, "grad_norm": 2.220249676013056, "learning_rate": 9.481510688623098e-06, "loss": 0.8044, "step": 3233 }, { "epoch": 0.53, "grad_norm": 1.8038539337339041, "learning_rate": 9.476197693629798e-06, "loss": 0.8286, "step": 3234 }, { "epoch": 0.53, "grad_norm": 2.0638972517756415, "learning_rate": 9.470884846897638e-06, "loss": 0.8176, "step": 3235 }, { "epoch": 0.53, "grad_norm": 1.8439816089893157, "learning_rate": 9.465572149930408e-06, "loss": 0.7435, "step": 3236 }, { "epoch": 0.53, "grad_norm": 3.265394261878643, "learning_rate": 9.460259604231859e-06, "loss": 0.8622, "step": 3237 }, { "epoch": 0.53, "grad_norm": 1.5365655162087584, "learning_rate": 9.454947211305691e-06, "loss": 0.7623, "step": 3238 }, { "epoch": 0.53, "grad_norm": 2.003661816358806, "learning_rate": 9.44963497265557e-06, "loss": 0.7563, "step": 3239 }, { "epoch": 0.53, "grad_norm": 4.3190084119393966, "learning_rate": 9.444322889785111e-06, "loss": 0.7768, "step": 3240 }, { "epoch": 0.53, "grad_norm": 2.27366003569455, "learning_rate": 9.43901096419789e-06, "loss": 0.7925, "step": 3241 }, { "epoch": 0.53, "grad_norm": 1.729790482892869, "learning_rate": 9.433699197397435e-06, "loss": 0.7675, "step": 3242 }, { "epoch": 0.53, "grad_norm": 1.7578460726135636, "learning_rate": 9.42838759088723e-06, "loss": 0.9237, "step": 3243 }, { "epoch": 0.53, "grad_norm": 0.8204127766043651, "learning_rate": 9.423076146170718e-06, "loss": 0.3453, "step": 3244 }, { "epoch": 0.53, "grad_norm": 2.1285321942010365, "learning_rate": 9.417764864751287e-06, "loss": 0.7647, "step": 3245 }, { "epoch": 0.53, "grad_norm": 1.4681516143141236, "learning_rate": 9.412453748132286e-06, "loss": 0.8048, "step": 3246 }, { "epoch": 0.53, "grad_norm": 1.7568795030824385, "learning_rate": 9.407142797817014e-06, "loss": 0.7908, "step": 3247 }, { "epoch": 0.53, "grad_norm": 1.84215138502193, "learning_rate": 9.401832015308728e-06, "loss": 0.8315, "step": 3248 }, { "epoch": 0.53, "grad_norm": 1.5457576368393557, "learning_rate": 9.39652140211063e-06, "loss": 0.7301, "step": 3249 }, { "epoch": 0.53, "grad_norm": 2.7787345861594543, "learning_rate": 9.39121095972588e-06, "loss": 0.8474, "step": 3250 }, { "epoch": 0.53, "grad_norm": 0.6610850990142648, "learning_rate": 9.385900689657588e-06, "loss": 0.3735, "step": 3251 }, { "epoch": 0.53, "grad_norm": 1.7385469496052917, "learning_rate": 9.380590593408813e-06, "loss": 0.8386, "step": 3252 }, { "epoch": 0.53, "grad_norm": 2.579091772969421, "learning_rate": 9.375280672482567e-06, "loss": 0.7523, "step": 3253 }, { "epoch": 0.53, "grad_norm": 2.1513550740934817, "learning_rate": 9.369970928381813e-06, "loss": 0.7857, "step": 3254 }, { "epoch": 0.53, "grad_norm": 1.4115988669885953, "learning_rate": 9.364661362609464e-06, "loss": 0.7426, "step": 3255 }, { "epoch": 0.53, "grad_norm": 2.658866583171093, "learning_rate": 9.359351976668377e-06, "loss": 0.7742, "step": 3256 }, { "epoch": 0.53, "grad_norm": 3.1986684753074357, "learning_rate": 9.354042772061362e-06, "loss": 0.7534, "step": 3257 }, { "epoch": 0.54, "grad_norm": 2.1772115184944294, "learning_rate": 9.348733750291186e-06, "loss": 0.7692, "step": 3258 }, { "epoch": 0.54, "grad_norm": 1.7147388922792213, "learning_rate": 9.343424912860552e-06, "loss": 0.8572, "step": 3259 }, { "epoch": 0.54, "grad_norm": 1.817640458343489, "learning_rate": 9.338116261272114e-06, "loss": 0.7807, "step": 3260 }, { "epoch": 0.54, "grad_norm": 2.8999230960978304, "learning_rate": 9.332807797028476e-06, "loss": 0.7925, "step": 3261 }, { "epoch": 0.54, "grad_norm": 1.6971283280997043, "learning_rate": 9.327499521632187e-06, "loss": 0.7731, "step": 3262 }, { "epoch": 0.54, "grad_norm": 1.6630478236194421, "learning_rate": 9.322191436585745e-06, "loss": 0.7659, "step": 3263 }, { "epoch": 0.54, "grad_norm": 2.0215479704883212, "learning_rate": 9.316883543391589e-06, "loss": 0.7829, "step": 3264 }, { "epoch": 0.54, "grad_norm": 2.3925273408454792, "learning_rate": 9.31157584355211e-06, "loss": 0.8248, "step": 3265 }, { "epoch": 0.54, "grad_norm": 1.9346040329214214, "learning_rate": 9.306268338569643e-06, "loss": 0.8019, "step": 3266 }, { "epoch": 0.54, "grad_norm": 1.619575653245569, "learning_rate": 9.30096102994646e-06, "loss": 0.7397, "step": 3267 }, { "epoch": 0.54, "grad_norm": 1.7911148269724713, "learning_rate": 9.295653919184787e-06, "loss": 0.8391, "step": 3268 }, { "epoch": 0.54, "grad_norm": 1.7889938129582033, "learning_rate": 9.290347007786791e-06, "loss": 0.7, "step": 3269 }, { "epoch": 0.54, "grad_norm": 1.7114110185363696, "learning_rate": 9.28504029725458e-06, "loss": 0.7138, "step": 3270 }, { "epoch": 0.54, "grad_norm": 10.308216784536835, "learning_rate": 9.27973378909021e-06, "loss": 0.7051, "step": 3271 }, { "epoch": 0.54, "grad_norm": 1.9080836223431588, "learning_rate": 9.274427484795676e-06, "loss": 0.8251, "step": 3272 }, { "epoch": 0.54, "grad_norm": 2.278632111595444, "learning_rate": 9.269121385872915e-06, "loss": 0.7997, "step": 3273 }, { "epoch": 0.54, "grad_norm": 2.056814649785414, "learning_rate": 9.263815493823808e-06, "loss": 0.7742, "step": 3274 }, { "epoch": 0.54, "grad_norm": 6.076551800078371, "learning_rate": 9.258509810150177e-06, "loss": 0.7648, "step": 3275 }, { "epoch": 0.54, "grad_norm": 1.6272645875189222, "learning_rate": 9.253204336353786e-06, "loss": 0.7727, "step": 3276 }, { "epoch": 0.54, "grad_norm": 2.8472198361278367, "learning_rate": 9.247899073936334e-06, "loss": 0.8607, "step": 3277 }, { "epoch": 0.54, "grad_norm": 1.7456896366798382, "learning_rate": 9.242594024399467e-06, "loss": 0.7708, "step": 3278 }, { "epoch": 0.54, "grad_norm": 1.7201385142473804, "learning_rate": 9.237289189244769e-06, "loss": 0.8211, "step": 3279 }, { "epoch": 0.54, "grad_norm": 2.333696856196905, "learning_rate": 9.23198456997376e-06, "loss": 0.7926, "step": 3280 }, { "epoch": 0.54, "grad_norm": 2.276972115732793, "learning_rate": 9.226680168087903e-06, "loss": 0.8218, "step": 3281 }, { "epoch": 0.54, "grad_norm": 2.2274131983634606, "learning_rate": 9.221375985088597e-06, "loss": 0.8183, "step": 3282 }, { "epoch": 0.54, "grad_norm": 1.872718807164225, "learning_rate": 9.216072022477183e-06, "loss": 0.7786, "step": 3283 }, { "epoch": 0.54, "grad_norm": 1.6955496190246762, "learning_rate": 9.210768281754931e-06, "loss": 0.7407, "step": 3284 }, { "epoch": 0.54, "grad_norm": 1.8918998116412329, "learning_rate": 9.205464764423059e-06, "loss": 0.8193, "step": 3285 }, { "epoch": 0.54, "grad_norm": 2.0709188731801462, "learning_rate": 9.200161471982713e-06, "loss": 0.7961, "step": 3286 }, { "epoch": 0.54, "grad_norm": 1.8896845149619883, "learning_rate": 9.194858405934983e-06, "loss": 0.8096, "step": 3287 }, { "epoch": 0.54, "grad_norm": 0.6509333335816603, "learning_rate": 9.189555567780882e-06, "loss": 0.3683, "step": 3288 }, { "epoch": 0.54, "grad_norm": 2.2334271900572036, "learning_rate": 9.184252959021374e-06, "loss": 0.7359, "step": 3289 }, { "epoch": 0.54, "grad_norm": 2.501422231910201, "learning_rate": 9.17895058115735e-06, "loss": 0.8656, "step": 3290 }, { "epoch": 0.54, "grad_norm": 3.1105887739862044, "learning_rate": 9.173648435689637e-06, "loss": 0.8139, "step": 3291 }, { "epoch": 0.54, "grad_norm": 1.8645888207880474, "learning_rate": 9.168346524118994e-06, "loss": 0.855, "step": 3292 }, { "epoch": 0.54, "grad_norm": 2.640258027543885, "learning_rate": 9.16304484794612e-06, "loss": 0.7667, "step": 3293 }, { "epoch": 0.54, "grad_norm": 1.910203761612283, "learning_rate": 9.15774340867164e-06, "loss": 0.8743, "step": 3294 }, { "epoch": 0.54, "grad_norm": 1.8468150964535686, "learning_rate": 9.152442207796115e-06, "loss": 0.7823, "step": 3295 }, { "epoch": 0.54, "grad_norm": 2.807595842830293, "learning_rate": 9.147141246820042e-06, "loss": 0.7467, "step": 3296 }, { "epoch": 0.54, "grad_norm": 2.57872881207294, "learning_rate": 9.141840527243844e-06, "loss": 0.7609, "step": 3297 }, { "epoch": 0.54, "grad_norm": 1.887808064756005, "learning_rate": 9.13654005056788e-06, "loss": 0.7956, "step": 3298 }, { "epoch": 0.54, "grad_norm": 1.8170075215796018, "learning_rate": 9.131239818292438e-06, "loss": 0.8106, "step": 3299 }, { "epoch": 0.54, "grad_norm": 1.6945721223581416, "learning_rate": 9.125939831917738e-06, "loss": 0.8575, "step": 3300 }, { "epoch": 0.54, "grad_norm": 1.4883114502878585, "learning_rate": 9.120640092943929e-06, "loss": 0.7605, "step": 3301 }, { "epoch": 0.54, "grad_norm": 1.5876055738226669, "learning_rate": 9.11534060287109e-06, "loss": 0.8513, "step": 3302 }, { "epoch": 0.54, "grad_norm": 2.688138051664433, "learning_rate": 9.110041363199233e-06, "loss": 0.7341, "step": 3303 }, { "epoch": 0.54, "grad_norm": 2.009689118447587, "learning_rate": 9.104742375428297e-06, "loss": 0.7361, "step": 3304 }, { "epoch": 0.54, "grad_norm": 1.7857754030282245, "learning_rate": 9.099443641058147e-06, "loss": 0.8802, "step": 3305 }, { "epoch": 0.54, "grad_norm": 1.8973769867384749, "learning_rate": 9.094145161588582e-06, "loss": 0.8066, "step": 3306 }, { "epoch": 0.54, "grad_norm": 1.5677749762470328, "learning_rate": 9.088846938519322e-06, "loss": 0.8358, "step": 3307 }, { "epoch": 0.54, "grad_norm": 1.641318436895791, "learning_rate": 9.083548973350019e-06, "loss": 0.7493, "step": 3308 }, { "epoch": 0.54, "grad_norm": 1.8067013737169337, "learning_rate": 9.078251267580256e-06, "loss": 0.798, "step": 3309 }, { "epoch": 0.54, "grad_norm": 2.252369770362503, "learning_rate": 9.072953822709526e-06, "loss": 0.8387, "step": 3310 }, { "epoch": 0.54, "grad_norm": 1.9024473834790252, "learning_rate": 9.067656640237267e-06, "loss": 0.8549, "step": 3311 }, { "epoch": 0.54, "grad_norm": 1.9278132275486792, "learning_rate": 9.062359721662836e-06, "loss": 0.6887, "step": 3312 }, { "epoch": 0.54, "grad_norm": 1.4299108757132566, "learning_rate": 9.057063068485513e-06, "loss": 0.7818, "step": 3313 }, { "epoch": 0.54, "grad_norm": 2.7838195935077983, "learning_rate": 9.051766682204504e-06, "loss": 0.7261, "step": 3314 }, { "epoch": 0.54, "grad_norm": 1.624427296224291, "learning_rate": 9.04647056431894e-06, "loss": 0.833, "step": 3315 }, { "epoch": 0.54, "grad_norm": 1.9876904592787892, "learning_rate": 9.041174716327879e-06, "loss": 0.809, "step": 3316 }, { "epoch": 0.54, "grad_norm": 2.000480553957399, "learning_rate": 9.035879139730294e-06, "loss": 0.7687, "step": 3317 }, { "epoch": 0.54, "grad_norm": 1.4386798350674503, "learning_rate": 9.030583836025093e-06, "loss": 0.8419, "step": 3318 }, { "epoch": 0.55, "grad_norm": 2.1694086969442483, "learning_rate": 9.025288806711096e-06, "loss": 0.7621, "step": 3319 }, { "epoch": 0.55, "grad_norm": 2.3295689359295872, "learning_rate": 9.019994053287053e-06, "loss": 0.7731, "step": 3320 }, { "epoch": 0.55, "grad_norm": 1.609907567076277, "learning_rate": 9.014699577251631e-06, "loss": 0.8147, "step": 3321 }, { "epoch": 0.55, "grad_norm": 2.4770032985818258, "learning_rate": 9.009405380103422e-06, "loss": 0.8137, "step": 3322 }, { "epoch": 0.55, "grad_norm": 2.081359215501305, "learning_rate": 9.004111463340935e-06, "loss": 0.7205, "step": 3323 }, { "epoch": 0.55, "grad_norm": 2.352771534613105, "learning_rate": 8.998817828462603e-06, "loss": 0.6668, "step": 3324 }, { "epoch": 0.55, "grad_norm": 1.8558226256294126, "learning_rate": 8.993524476966779e-06, "loss": 0.8702, "step": 3325 }, { "epoch": 0.55, "grad_norm": 2.2629311372210306, "learning_rate": 8.988231410351731e-06, "loss": 0.7809, "step": 3326 }, { "epoch": 0.55, "grad_norm": 2.13615596505851, "learning_rate": 8.982938630115657e-06, "loss": 0.7844, "step": 3327 }, { "epoch": 0.55, "grad_norm": 2.403476179335205, "learning_rate": 8.977646137756662e-06, "loss": 0.8556, "step": 3328 }, { "epoch": 0.55, "grad_norm": 2.0586946929995453, "learning_rate": 8.97235393477278e-06, "loss": 0.8049, "step": 3329 }, { "epoch": 0.55, "grad_norm": 2.0450088325960065, "learning_rate": 8.967062022661952e-06, "loss": 0.8383, "step": 3330 }, { "epoch": 0.55, "grad_norm": 3.420093541157189, "learning_rate": 8.961770402922052e-06, "loss": 0.7815, "step": 3331 }, { "epoch": 0.55, "grad_norm": 1.8023469836601527, "learning_rate": 8.956479077050849e-06, "loss": 0.8874, "step": 3332 }, { "epoch": 0.55, "grad_norm": 1.6829664429864448, "learning_rate": 8.951188046546048e-06, "loss": 0.7299, "step": 3333 }, { "epoch": 0.55, "grad_norm": 1.638453883583959, "learning_rate": 8.945897312905265e-06, "loss": 0.7943, "step": 3334 }, { "epoch": 0.55, "grad_norm": 1.5556701487575768, "learning_rate": 8.940606877626028e-06, "loss": 0.8086, "step": 3335 }, { "epoch": 0.55, "grad_norm": 1.7284739458547258, "learning_rate": 8.935316742205787e-06, "loss": 0.8088, "step": 3336 }, { "epoch": 0.55, "grad_norm": 1.4354109053721626, "learning_rate": 8.930026908141902e-06, "loss": 0.7591, "step": 3337 }, { "epoch": 0.55, "grad_norm": 1.9372312254102304, "learning_rate": 8.924737376931651e-06, "loss": 0.7254, "step": 3338 }, { "epoch": 0.55, "grad_norm": 1.9165040097249317, "learning_rate": 8.919448150072221e-06, "loss": 0.8084, "step": 3339 }, { "epoch": 0.55, "grad_norm": 2.9526874536702707, "learning_rate": 8.91415922906072e-06, "loss": 0.7892, "step": 3340 }, { "epoch": 0.55, "grad_norm": 2.9583877812960013, "learning_rate": 8.908870615394164e-06, "loss": 0.819, "step": 3341 }, { "epoch": 0.55, "grad_norm": 2.0872879610756674, "learning_rate": 8.903582310569487e-06, "loss": 0.8282, "step": 3342 }, { "epoch": 0.55, "grad_norm": 0.677517848547068, "learning_rate": 8.898294316083529e-06, "loss": 0.3385, "step": 3343 }, { "epoch": 0.55, "grad_norm": 2.4498753754045866, "learning_rate": 8.893006633433048e-06, "loss": 0.8058, "step": 3344 }, { "epoch": 0.55, "grad_norm": 3.4786696157302646, "learning_rate": 8.887719264114709e-06, "loss": 0.8221, "step": 3345 }, { "epoch": 0.55, "grad_norm": 1.7828400222697784, "learning_rate": 8.882432209625092e-06, "loss": 0.804, "step": 3346 }, { "epoch": 0.55, "grad_norm": 0.6223539501295118, "learning_rate": 8.877145471460688e-06, "loss": 0.3616, "step": 3347 }, { "epoch": 0.55, "grad_norm": 2.5344873908734806, "learning_rate": 8.871859051117896e-06, "loss": 0.8488, "step": 3348 }, { "epoch": 0.55, "grad_norm": 3.093039664937443, "learning_rate": 8.866572950093026e-06, "loss": 0.7982, "step": 3349 }, { "epoch": 0.55, "grad_norm": 1.883499813354625, "learning_rate": 8.861287169882295e-06, "loss": 0.777, "step": 3350 }, { "epoch": 0.55, "grad_norm": 2.0480139100883608, "learning_rate": 8.856001711981839e-06, "loss": 0.7993, "step": 3351 }, { "epoch": 0.55, "grad_norm": 2.2115061694050784, "learning_rate": 8.850716577887695e-06, "loss": 0.8327, "step": 3352 }, { "epoch": 0.55, "grad_norm": 2.816214027691435, "learning_rate": 8.8454317690958e-06, "loss": 0.7196, "step": 3353 }, { "epoch": 0.55, "grad_norm": 2.1903040978186357, "learning_rate": 8.840147287102016e-06, "loss": 0.7545, "step": 3354 }, { "epoch": 0.55, "grad_norm": 2.0711231015193423, "learning_rate": 8.8348631334021e-06, "loss": 0.7996, "step": 3355 }, { "epoch": 0.55, "grad_norm": 1.8819272252155628, "learning_rate": 8.829579309491724e-06, "loss": 0.7132, "step": 3356 }, { "epoch": 0.55, "grad_norm": 2.322471132275514, "learning_rate": 8.824295816866463e-06, "loss": 0.7484, "step": 3357 }, { "epoch": 0.55, "grad_norm": 1.6378718942162593, "learning_rate": 8.819012657021794e-06, "loss": 0.8099, "step": 3358 }, { "epoch": 0.55, "grad_norm": 1.635616050140948, "learning_rate": 8.81372983145311e-06, "loss": 0.83, "step": 3359 }, { "epoch": 0.55, "grad_norm": 2.0622871688127016, "learning_rate": 8.808447341655703e-06, "loss": 0.812, "step": 3360 }, { "epoch": 0.55, "grad_norm": 1.692806061099114, "learning_rate": 8.803165189124768e-06, "loss": 0.7835, "step": 3361 }, { "epoch": 0.55, "grad_norm": 2.4834156714917657, "learning_rate": 8.79788337535541e-06, "loss": 0.7375, "step": 3362 }, { "epoch": 0.55, "grad_norm": 1.6329525636020283, "learning_rate": 8.79260190184263e-06, "loss": 0.8212, "step": 3363 }, { "epoch": 0.55, "grad_norm": 1.728905583060459, "learning_rate": 8.787320770081345e-06, "loss": 0.7293, "step": 3364 }, { "epoch": 0.55, "grad_norm": 2.1781888748907337, "learning_rate": 8.782039981566364e-06, "loss": 0.816, "step": 3365 }, { "epoch": 0.55, "grad_norm": 1.5209304969718838, "learning_rate": 8.776759537792402e-06, "loss": 0.7216, "step": 3366 }, { "epoch": 0.55, "grad_norm": 3.2167673606704756, "learning_rate": 8.771479440254082e-06, "loss": 0.7364, "step": 3367 }, { "epoch": 0.55, "grad_norm": 3.1663510677703695, "learning_rate": 8.76619969044592e-06, "loss": 0.8198, "step": 3368 }, { "epoch": 0.55, "grad_norm": 1.7504068467372014, "learning_rate": 8.760920289862341e-06, "loss": 0.7584, "step": 3369 }, { "epoch": 0.55, "grad_norm": 1.6259982698963866, "learning_rate": 8.755641239997667e-06, "loss": 0.7704, "step": 3370 }, { "epoch": 0.55, "grad_norm": 1.9528177162988243, "learning_rate": 8.75036254234612e-06, "loss": 0.7761, "step": 3371 }, { "epoch": 0.55, "grad_norm": 1.6996912369483554, "learning_rate": 8.745084198401828e-06, "loss": 0.8624, "step": 3372 }, { "epoch": 0.55, "grad_norm": 2.034770920860483, "learning_rate": 8.739806209658812e-06, "loss": 0.8042, "step": 3373 }, { "epoch": 0.55, "grad_norm": 2.8350104312163102, "learning_rate": 8.734528577611004e-06, "loss": 0.6899, "step": 3374 }, { "epoch": 0.55, "grad_norm": 1.6018218264896498, "learning_rate": 8.729251303752214e-06, "loss": 0.8794, "step": 3375 }, { "epoch": 0.55, "grad_norm": 1.7315183684855728, "learning_rate": 8.72397438957617e-06, "loss": 0.7489, "step": 3376 }, { "epoch": 0.55, "grad_norm": 1.889468307060134, "learning_rate": 8.71869783657649e-06, "loss": 0.89, "step": 3377 }, { "epoch": 0.55, "grad_norm": 2.200583515962664, "learning_rate": 8.713421646246692e-06, "loss": 0.823, "step": 3378 }, { "epoch": 0.55, "grad_norm": 1.7212203202254985, "learning_rate": 8.70814582008019e-06, "loss": 0.8417, "step": 3379 }, { "epoch": 0.56, "grad_norm": 1.7600132831785027, "learning_rate": 8.702870359570296e-06, "loss": 0.6954, "step": 3380 }, { "epoch": 0.56, "grad_norm": 1.8293945336974675, "learning_rate": 8.697595266210217e-06, "loss": 0.7609, "step": 3381 }, { "epoch": 0.56, "grad_norm": 2.0859740302160152, "learning_rate": 8.692320541493058e-06, "loss": 0.7805, "step": 3382 }, { "epoch": 0.56, "grad_norm": 2.556508283395247, "learning_rate": 8.687046186911819e-06, "loss": 0.7794, "step": 3383 }, { "epoch": 0.56, "grad_norm": 1.8890530058579214, "learning_rate": 8.681772203959395e-06, "loss": 0.7551, "step": 3384 }, { "epoch": 0.56, "grad_norm": 1.7911825196266973, "learning_rate": 8.676498594128576e-06, "loss": 0.7183, "step": 3385 }, { "epoch": 0.56, "grad_norm": 1.8102988293853182, "learning_rate": 8.671225358912044e-06, "loss": 0.7988, "step": 3386 }, { "epoch": 0.56, "grad_norm": 2.8750420970256623, "learning_rate": 8.665952499802379e-06, "loss": 0.8384, "step": 3387 }, { "epoch": 0.56, "grad_norm": 2.1904941911026254, "learning_rate": 8.660680018292053e-06, "loss": 0.8384, "step": 3388 }, { "epoch": 0.56, "grad_norm": 1.8599375796578341, "learning_rate": 8.65540791587343e-06, "loss": 0.7832, "step": 3389 }, { "epoch": 0.56, "grad_norm": 1.8830062088411523, "learning_rate": 8.650136194038767e-06, "loss": 0.8025, "step": 3390 }, { "epoch": 0.56, "grad_norm": 1.4954370190042925, "learning_rate": 8.644864854280214e-06, "loss": 0.8378, "step": 3391 }, { "epoch": 0.56, "grad_norm": 2.4563248456093523, "learning_rate": 8.639593898089815e-06, "loss": 0.8321, "step": 3392 }, { "epoch": 0.56, "grad_norm": 2.0698322280042647, "learning_rate": 8.634323326959501e-06, "loss": 0.7461, "step": 3393 }, { "epoch": 0.56, "grad_norm": 3.1291839124549785, "learning_rate": 8.629053142381093e-06, "loss": 0.8293, "step": 3394 }, { "epoch": 0.56, "grad_norm": 1.6036403955769822, "learning_rate": 8.623783345846313e-06, "loss": 0.8138, "step": 3395 }, { "epoch": 0.56, "grad_norm": 2.231450378761929, "learning_rate": 8.618513938846763e-06, "loss": 0.7634, "step": 3396 }, { "epoch": 0.56, "grad_norm": 1.4929171729432567, "learning_rate": 8.613244922873932e-06, "loss": 0.82, "step": 3397 }, { "epoch": 0.56, "grad_norm": 2.105146279909585, "learning_rate": 8.60797629941921e-06, "loss": 0.686, "step": 3398 }, { "epoch": 0.56, "grad_norm": 1.9561367352771095, "learning_rate": 8.602708069973866e-06, "loss": 0.8246, "step": 3399 }, { "epoch": 0.56, "grad_norm": 1.6635267034903571, "learning_rate": 8.597440236029064e-06, "loss": 0.7569, "step": 3400 }, { "epoch": 0.56, "grad_norm": 1.9821437868020506, "learning_rate": 8.592172799075853e-06, "loss": 0.7014, "step": 3401 }, { "epoch": 0.56, "grad_norm": 1.9309636790770235, "learning_rate": 8.586905760605169e-06, "loss": 0.8034, "step": 3402 }, { "epoch": 0.56, "grad_norm": 2.256864593493703, "learning_rate": 8.581639122107837e-06, "loss": 0.8215, "step": 3403 }, { "epoch": 0.56, "grad_norm": 1.4944324791005628, "learning_rate": 8.576372885074567e-06, "loss": 0.7395, "step": 3404 }, { "epoch": 0.56, "grad_norm": 0.6735255860572696, "learning_rate": 8.571107050995955e-06, "loss": 0.3579, "step": 3405 }, { "epoch": 0.56, "grad_norm": 1.7831129079061527, "learning_rate": 8.565841621362488e-06, "loss": 0.7994, "step": 3406 }, { "epoch": 0.56, "grad_norm": 2.9607114891740567, "learning_rate": 8.560576597664533e-06, "loss": 0.7924, "step": 3407 }, { "epoch": 0.56, "grad_norm": 1.7644173588932637, "learning_rate": 8.555311981392342e-06, "loss": 0.7557, "step": 3408 }, { "epoch": 0.56, "grad_norm": 1.5839125583312315, "learning_rate": 8.550047774036058e-06, "loss": 0.8533, "step": 3409 }, { "epoch": 0.56, "grad_norm": 2.131514780379521, "learning_rate": 8.5447839770857e-06, "loss": 0.8485, "step": 3410 }, { "epoch": 0.56, "grad_norm": 2.6947264557399593, "learning_rate": 8.539520592031176e-06, "loss": 0.7235, "step": 3411 }, { "epoch": 0.56, "grad_norm": 1.9268905276410064, "learning_rate": 8.534257620362277e-06, "loss": 0.8249, "step": 3412 }, { "epoch": 0.56, "grad_norm": 2.1140896322447515, "learning_rate": 8.528995063568673e-06, "loss": 0.726, "step": 3413 }, { "epoch": 0.56, "grad_norm": 2.492201059230434, "learning_rate": 8.523732923139922e-06, "loss": 0.7406, "step": 3414 }, { "epoch": 0.56, "grad_norm": 1.7814283543071805, "learning_rate": 8.518471200565461e-06, "loss": 0.7414, "step": 3415 }, { "epoch": 0.56, "grad_norm": 2.148774672732041, "learning_rate": 8.513209897334612e-06, "loss": 0.8361, "step": 3416 }, { "epoch": 0.56, "grad_norm": 0.6663143928142505, "learning_rate": 8.507949014936573e-06, "loss": 0.379, "step": 3417 }, { "epoch": 0.56, "grad_norm": 2.430615583947924, "learning_rate": 8.502688554860426e-06, "loss": 0.8403, "step": 3418 }, { "epoch": 0.56, "grad_norm": 1.541374289483077, "learning_rate": 8.497428518595132e-06, "loss": 0.7404, "step": 3419 }, { "epoch": 0.56, "grad_norm": 1.9215089713824909, "learning_rate": 8.492168907629534e-06, "loss": 0.6479, "step": 3420 }, { "epoch": 0.56, "grad_norm": 2.1715288983276926, "learning_rate": 8.486909723452356e-06, "loss": 0.8169, "step": 3421 }, { "epoch": 0.56, "grad_norm": 2.133654637883036, "learning_rate": 8.481650967552199e-06, "loss": 0.8331, "step": 3422 }, { "epoch": 0.56, "grad_norm": 2.1120136453870444, "learning_rate": 8.47639264141754e-06, "loss": 0.8361, "step": 3423 }, { "epoch": 0.56, "grad_norm": 1.6371549395913942, "learning_rate": 8.471134746536737e-06, "loss": 0.7536, "step": 3424 }, { "epoch": 0.56, "grad_norm": 8.35225227052984, "learning_rate": 8.465877284398029e-06, "loss": 0.8046, "step": 3425 }, { "epoch": 0.56, "grad_norm": 4.665118538828204, "learning_rate": 8.460620256489528e-06, "loss": 0.7772, "step": 3426 }, { "epoch": 0.56, "grad_norm": 1.7387256119569698, "learning_rate": 8.455363664299225e-06, "loss": 0.8445, "step": 3427 }, { "epoch": 0.56, "grad_norm": 1.7995678101977275, "learning_rate": 8.450107509314983e-06, "loss": 0.8354, "step": 3428 }, { "epoch": 0.56, "grad_norm": 1.9759871223249919, "learning_rate": 8.444851793024555e-06, "loss": 0.789, "step": 3429 }, { "epoch": 0.56, "grad_norm": 2.3937608493710005, "learning_rate": 8.439596516915553e-06, "loss": 0.8295, "step": 3430 }, { "epoch": 0.56, "grad_norm": 4.776294123308938, "learning_rate": 8.434341682475476e-06, "loss": 0.789, "step": 3431 }, { "epoch": 0.56, "grad_norm": 1.6381463948829729, "learning_rate": 8.42908729119169e-06, "loss": 0.7034, "step": 3432 }, { "epoch": 0.56, "grad_norm": 1.8870922564083226, "learning_rate": 8.423833344551443e-06, "loss": 0.7629, "step": 3433 }, { "epoch": 0.56, "grad_norm": 1.8159069201339997, "learning_rate": 8.418579844041852e-06, "loss": 0.8848, "step": 3434 }, { "epoch": 0.56, "grad_norm": 2.2340640653934076, "learning_rate": 8.413326791149909e-06, "loss": 0.8294, "step": 3435 }, { "epoch": 0.56, "grad_norm": 1.4419211184706389, "learning_rate": 8.408074187362479e-06, "loss": 0.7469, "step": 3436 }, { "epoch": 0.56, "grad_norm": 2.6727497990460023, "learning_rate": 8.402822034166301e-06, "loss": 0.8003, "step": 3437 }, { "epoch": 0.56, "grad_norm": 1.9609629504442505, "learning_rate": 8.397570333047985e-06, "loss": 0.7621, "step": 3438 }, { "epoch": 0.56, "grad_norm": 2.0189129567428568, "learning_rate": 8.392319085494018e-06, "loss": 0.7956, "step": 3439 }, { "epoch": 0.56, "grad_norm": 1.6335466420362161, "learning_rate": 8.38706829299075e-06, "loss": 0.7964, "step": 3440 }, { "epoch": 0.57, "grad_norm": 2.209316465056109, "learning_rate": 8.381817957024409e-06, "loss": 0.8336, "step": 3441 }, { "epoch": 0.57, "grad_norm": 1.6535016124060216, "learning_rate": 8.37656807908109e-06, "loss": 0.8037, "step": 3442 }, { "epoch": 0.57, "grad_norm": 1.9323321084408518, "learning_rate": 8.37131866064676e-06, "loss": 0.7671, "step": 3443 }, { "epoch": 0.57, "grad_norm": 2.1633846063068414, "learning_rate": 8.366069703207257e-06, "loss": 0.7056, "step": 3444 }, { "epoch": 0.57, "grad_norm": 2.384631070683646, "learning_rate": 8.360821208248289e-06, "loss": 0.8018, "step": 3445 }, { "epoch": 0.57, "grad_norm": 0.6398955443593287, "learning_rate": 8.355573177255428e-06, "loss": 0.3687, "step": 3446 }, { "epoch": 0.57, "grad_norm": 5.518176054264198, "learning_rate": 8.35032561171412e-06, "loss": 0.808, "step": 3447 }, { "epoch": 0.57, "grad_norm": 1.2923428229793406, "learning_rate": 8.345078513109677e-06, "loss": 0.7858, "step": 3448 }, { "epoch": 0.57, "grad_norm": 1.8778979777494509, "learning_rate": 8.339831882927279e-06, "loss": 0.7551, "step": 3449 }, { "epoch": 0.57, "grad_norm": 2.2317246123943986, "learning_rate": 8.334585722651973e-06, "loss": 0.8023, "step": 3450 }, { "epoch": 0.57, "grad_norm": 0.6194195367300762, "learning_rate": 8.329340033768672e-06, "loss": 0.3716, "step": 3451 }, { "epoch": 0.57, "grad_norm": 1.8210552978087, "learning_rate": 8.324094817762164e-06, "loss": 0.7831, "step": 3452 }, { "epoch": 0.57, "grad_norm": 2.067568976874779, "learning_rate": 8.31885007611709e-06, "loss": 0.7148, "step": 3453 }, { "epoch": 0.57, "grad_norm": 2.200266647695095, "learning_rate": 8.313605810317967e-06, "loss": 0.7939, "step": 3454 }, { "epoch": 0.57, "grad_norm": 4.268924325882358, "learning_rate": 8.30836202184917e-06, "loss": 0.7367, "step": 3455 }, { "epoch": 0.57, "grad_norm": 1.7970242211800425, "learning_rate": 8.303118712194944e-06, "loss": 0.7737, "step": 3456 }, { "epoch": 0.57, "grad_norm": 1.9330856016854676, "learning_rate": 8.297875882839397e-06, "loss": 0.8558, "step": 3457 }, { "epoch": 0.57, "grad_norm": 1.9000227850003033, "learning_rate": 8.2926335352665e-06, "loss": 0.7462, "step": 3458 }, { "epoch": 0.57, "grad_norm": 2.1718405784026324, "learning_rate": 8.28739167096009e-06, "loss": 0.7841, "step": 3459 }, { "epoch": 0.57, "grad_norm": 1.8243919373091628, "learning_rate": 8.282150291403867e-06, "loss": 0.7839, "step": 3460 }, { "epoch": 0.57, "grad_norm": 1.8256979283679926, "learning_rate": 8.276909398081387e-06, "loss": 0.7859, "step": 3461 }, { "epoch": 0.57, "grad_norm": 0.6734769952974113, "learning_rate": 8.271668992476077e-06, "loss": 0.344, "step": 3462 }, { "epoch": 0.57, "grad_norm": 1.6128885354117712, "learning_rate": 8.266429076071221e-06, "loss": 0.736, "step": 3463 }, { "epoch": 0.57, "grad_norm": 5.806109479933567, "learning_rate": 8.261189650349969e-06, "loss": 0.7231, "step": 3464 }, { "epoch": 0.57, "grad_norm": 1.6721735709746264, "learning_rate": 8.255950716795328e-06, "loss": 0.8301, "step": 3465 }, { "epoch": 0.57, "grad_norm": 2.000652919875578, "learning_rate": 8.250712276890168e-06, "loss": 0.8305, "step": 3466 }, { "epoch": 0.57, "grad_norm": 1.94010122190498, "learning_rate": 8.245474332117219e-06, "loss": 0.749, "step": 3467 }, { "epoch": 0.57, "grad_norm": 2.1051593494721637, "learning_rate": 8.240236883959067e-06, "loss": 0.7729, "step": 3468 }, { "epoch": 0.57, "grad_norm": 1.7611018716026237, "learning_rate": 8.234999933898164e-06, "loss": 0.8465, "step": 3469 }, { "epoch": 0.57, "grad_norm": 21.955763098198116, "learning_rate": 8.229763483416815e-06, "loss": 0.7736, "step": 3470 }, { "epoch": 0.57, "grad_norm": 2.5606719051067284, "learning_rate": 8.22452753399719e-06, "loss": 0.8151, "step": 3471 }, { "epoch": 0.57, "grad_norm": 2.5110473840831142, "learning_rate": 8.219292087121309e-06, "loss": 0.7352, "step": 3472 }, { "epoch": 0.57, "grad_norm": 1.6295556926440165, "learning_rate": 8.214057144271058e-06, "loss": 0.7497, "step": 3473 }, { "epoch": 0.57, "grad_norm": 1.6646298261245007, "learning_rate": 8.208822706928172e-06, "loss": 0.8182, "step": 3474 }, { "epoch": 0.57, "grad_norm": 1.8748505223751994, "learning_rate": 8.203588776574254e-06, "loss": 0.7857, "step": 3475 }, { "epoch": 0.57, "grad_norm": 1.9556102891625824, "learning_rate": 8.198355354690752e-06, "loss": 0.855, "step": 3476 }, { "epoch": 0.57, "grad_norm": 2.3139727080333277, "learning_rate": 8.193122442758977e-06, "loss": 0.8187, "step": 3477 }, { "epoch": 0.57, "grad_norm": 1.7025606478567499, "learning_rate": 8.187890042260094e-06, "loss": 0.8325, "step": 3478 }, { "epoch": 0.57, "grad_norm": 2.1032422653192278, "learning_rate": 8.18265815467512e-06, "loss": 0.8427, "step": 3479 }, { "epoch": 0.57, "grad_norm": 2.971100234359609, "learning_rate": 8.177426781484933e-06, "loss": 0.8156, "step": 3480 }, { "epoch": 0.57, "grad_norm": 2.7637470213611413, "learning_rate": 8.172195924170263e-06, "loss": 0.8076, "step": 3481 }, { "epoch": 0.57, "grad_norm": 1.7648338337170502, "learning_rate": 8.166965584211694e-06, "loss": 0.848, "step": 3482 }, { "epoch": 0.57, "grad_norm": 1.5638242277647505, "learning_rate": 8.161735763089654e-06, "loss": 0.8028, "step": 3483 }, { "epoch": 0.57, "grad_norm": 2.118231001664223, "learning_rate": 8.15650646228444e-06, "loss": 0.7256, "step": 3484 }, { "epoch": 0.57, "grad_norm": 2.175705722141854, "learning_rate": 8.151277683276196e-06, "loss": 0.699, "step": 3485 }, { "epoch": 0.57, "grad_norm": 2.29121815469541, "learning_rate": 8.146049427544912e-06, "loss": 0.7456, "step": 3486 }, { "epoch": 0.57, "grad_norm": 1.6584257596512113, "learning_rate": 8.140821696570439e-06, "loss": 0.7591, "step": 3487 }, { "epoch": 0.57, "grad_norm": 1.767531864666561, "learning_rate": 8.13559449183247e-06, "loss": 0.8151, "step": 3488 }, { "epoch": 0.57, "grad_norm": 2.449443766263658, "learning_rate": 8.130367814810561e-06, "loss": 0.7888, "step": 3489 }, { "epoch": 0.57, "grad_norm": 1.7236821170349503, "learning_rate": 8.125141666984107e-06, "loss": 0.7948, "step": 3490 }, { "epoch": 0.57, "grad_norm": 0.6433793344178996, "learning_rate": 8.119916049832362e-06, "loss": 0.3521, "step": 3491 }, { "epoch": 0.57, "grad_norm": 2.4468539866505457, "learning_rate": 8.114690964834422e-06, "loss": 0.7239, "step": 3492 }, { "epoch": 0.57, "grad_norm": 2.076642352633913, "learning_rate": 8.109466413469238e-06, "loss": 0.8218, "step": 3493 }, { "epoch": 0.57, "grad_norm": 1.5374666550347276, "learning_rate": 8.104242397215609e-06, "loss": 0.7766, "step": 3494 }, { "epoch": 0.57, "grad_norm": 2.293836585072057, "learning_rate": 8.09901891755218e-06, "loss": 0.7827, "step": 3495 }, { "epoch": 0.57, "grad_norm": 1.9368531240580906, "learning_rate": 8.093795975957449e-06, "loss": 0.7442, "step": 3496 }, { "epoch": 0.57, "grad_norm": 1.8398291762655663, "learning_rate": 8.088573573909755e-06, "loss": 0.7878, "step": 3497 }, { "epoch": 0.57, "grad_norm": 1.7097870444950378, "learning_rate": 8.083351712887288e-06, "loss": 0.7702, "step": 3498 }, { "epoch": 0.57, "grad_norm": 0.6241059031611371, "learning_rate": 8.078130394368088e-06, "loss": 0.3564, "step": 3499 }, { "epoch": 0.57, "grad_norm": 1.9392652346215238, "learning_rate": 8.072909619830037e-06, "loss": 0.8706, "step": 3500 }, { "epoch": 0.58, "grad_norm": 2.7767446893432783, "learning_rate": 8.067689390750863e-06, "loss": 0.7539, "step": 3501 }, { "epoch": 0.58, "grad_norm": 3.5327745006810494, "learning_rate": 8.062469708608144e-06, "loss": 0.8238, "step": 3502 }, { "epoch": 0.58, "grad_norm": 2.2523484853543416, "learning_rate": 8.057250574879296e-06, "loss": 0.8781, "step": 3503 }, { "epoch": 0.58, "grad_norm": 2.9530754187088495, "learning_rate": 8.052031991041591e-06, "loss": 0.8696, "step": 3504 }, { "epoch": 0.58, "grad_norm": 1.6254979346985865, "learning_rate": 8.046813958572129e-06, "loss": 0.8512, "step": 3505 }, { "epoch": 0.58, "grad_norm": 1.942895821552224, "learning_rate": 8.041596478947862e-06, "loss": 0.6915, "step": 3506 }, { "epoch": 0.58, "grad_norm": 1.5287306793666193, "learning_rate": 8.036379553645595e-06, "loss": 0.8418, "step": 3507 }, { "epoch": 0.58, "grad_norm": 2.0141796765325424, "learning_rate": 8.031163184141965e-06, "loss": 0.7955, "step": 3508 }, { "epoch": 0.58, "grad_norm": 1.7430676751332415, "learning_rate": 8.025947371913454e-06, "loss": 0.6775, "step": 3509 }, { "epoch": 0.58, "grad_norm": 1.927659581406483, "learning_rate": 8.020732118436385e-06, "loss": 0.7989, "step": 3510 }, { "epoch": 0.58, "grad_norm": 1.8737329829348337, "learning_rate": 8.015517425186926e-06, "loss": 0.7635, "step": 3511 }, { "epoch": 0.58, "grad_norm": 1.8687045052903024, "learning_rate": 8.010303293641086e-06, "loss": 0.6993, "step": 3512 }, { "epoch": 0.58, "grad_norm": 1.639946992779458, "learning_rate": 8.005089725274711e-06, "loss": 0.7796, "step": 3513 }, { "epoch": 0.58, "grad_norm": 1.5565145133195089, "learning_rate": 7.999876721563494e-06, "loss": 0.7659, "step": 3514 }, { "epoch": 0.58, "grad_norm": 2.007742302421909, "learning_rate": 7.994664283982962e-06, "loss": 0.7611, "step": 3515 }, { "epoch": 0.58, "grad_norm": 1.796603225432334, "learning_rate": 7.989452414008485e-06, "loss": 0.7785, "step": 3516 }, { "epoch": 0.58, "grad_norm": 1.77692111025282, "learning_rate": 7.984241113115275e-06, "loss": 0.784, "step": 3517 }, { "epoch": 0.58, "grad_norm": 0.7072119096801448, "learning_rate": 7.979030382778376e-06, "loss": 0.3458, "step": 3518 }, { "epoch": 0.58, "grad_norm": 2.4469835323999156, "learning_rate": 7.973820224472675e-06, "loss": 0.8561, "step": 3519 }, { "epoch": 0.58, "grad_norm": 0.6677186705506888, "learning_rate": 7.968610639672896e-06, "loss": 0.3813, "step": 3520 }, { "epoch": 0.58, "grad_norm": 2.326072670065839, "learning_rate": 7.9634016298536e-06, "loss": 0.8403, "step": 3521 }, { "epoch": 0.58, "grad_norm": 1.8313262250853477, "learning_rate": 7.958193196489191e-06, "loss": 0.7014, "step": 3522 }, { "epoch": 0.58, "grad_norm": 2.0498814807230428, "learning_rate": 7.952985341053902e-06, "loss": 0.8588, "step": 3523 }, { "epoch": 0.58, "grad_norm": 2.179914440751146, "learning_rate": 7.947778065021805e-06, "loss": 0.7407, "step": 3524 }, { "epoch": 0.58, "grad_norm": 5.880558863569268, "learning_rate": 7.942571369866814e-06, "loss": 0.7511, "step": 3525 }, { "epoch": 0.58, "grad_norm": 1.7699947873964996, "learning_rate": 7.937365257062664e-06, "loss": 0.7621, "step": 3526 }, { "epoch": 0.58, "grad_norm": 1.9622659348920684, "learning_rate": 7.932159728082938e-06, "loss": 0.8433, "step": 3527 }, { "epoch": 0.58, "grad_norm": 1.5063218840013435, "learning_rate": 7.92695478440105e-06, "loss": 0.7357, "step": 3528 }, { "epoch": 0.58, "grad_norm": 1.7462846688909974, "learning_rate": 7.921750427490248e-06, "loss": 0.8614, "step": 3529 }, { "epoch": 0.58, "grad_norm": 3.9461376925408223, "learning_rate": 7.916546658823618e-06, "loss": 0.7105, "step": 3530 }, { "epoch": 0.58, "grad_norm": 2.2267512807835868, "learning_rate": 7.911343479874073e-06, "loss": 0.7997, "step": 3531 }, { "epoch": 0.58, "grad_norm": 1.5541647589665175, "learning_rate": 7.906140892114361e-06, "loss": 0.7245, "step": 3532 }, { "epoch": 0.58, "grad_norm": 2.2420651705077055, "learning_rate": 7.900938897017064e-06, "loss": 0.8643, "step": 3533 }, { "epoch": 0.58, "grad_norm": 1.58878715770601, "learning_rate": 7.895737496054597e-06, "loss": 0.8462, "step": 3534 }, { "epoch": 0.58, "grad_norm": 1.8879261751381107, "learning_rate": 7.890536690699204e-06, "loss": 0.8443, "step": 3535 }, { "epoch": 0.58, "grad_norm": 2.6437226223877124, "learning_rate": 7.885336482422964e-06, "loss": 0.8142, "step": 3536 }, { "epoch": 0.58, "grad_norm": 1.8518009873298344, "learning_rate": 7.880136872697784e-06, "loss": 0.7173, "step": 3537 }, { "epoch": 0.58, "grad_norm": 1.8039066180061814, "learning_rate": 7.874937862995401e-06, "loss": 0.7937, "step": 3538 }, { "epoch": 0.58, "grad_norm": 2.372308374081903, "learning_rate": 7.869739454787385e-06, "loss": 0.7634, "step": 3539 }, { "epoch": 0.58, "grad_norm": 1.9450918696408848, "learning_rate": 7.864541649545135e-06, "loss": 0.8345, "step": 3540 }, { "epoch": 0.58, "grad_norm": 2.988873653901225, "learning_rate": 7.85934444873988e-06, "loss": 0.7844, "step": 3541 }, { "epoch": 0.58, "grad_norm": 2.0842908632240764, "learning_rate": 7.854147853842672e-06, "loss": 0.8233, "step": 3542 }, { "epoch": 0.58, "grad_norm": 0.7014898020830371, "learning_rate": 7.848951866324402e-06, "loss": 0.3348, "step": 3543 }, { "epoch": 0.58, "grad_norm": 1.8516679178727835, "learning_rate": 7.84375648765578e-06, "loss": 0.8557, "step": 3544 }, { "epoch": 0.58, "grad_norm": 2.0602625680912365, "learning_rate": 7.838561719307346e-06, "loss": 0.8208, "step": 3545 }, { "epoch": 0.58, "grad_norm": 1.8339691756069931, "learning_rate": 7.833367562749473e-06, "loss": 0.9226, "step": 3546 }, { "epoch": 0.58, "grad_norm": 1.6502594288631167, "learning_rate": 7.828174019452357e-06, "loss": 0.8395, "step": 3547 }, { "epoch": 0.58, "grad_norm": 1.617774234022904, "learning_rate": 7.822981090886011e-06, "loss": 0.8327, "step": 3548 }, { "epoch": 0.58, "grad_norm": 1.9182651842375154, "learning_rate": 7.817788778520288e-06, "loss": 0.804, "step": 3549 }, { "epoch": 0.58, "grad_norm": 2.2745876114699533, "learning_rate": 7.81259708382486e-06, "loss": 0.7933, "step": 3550 }, { "epoch": 0.58, "grad_norm": 2.5211697409448224, "learning_rate": 7.807406008269224e-06, "loss": 0.8141, "step": 3551 }, { "epoch": 0.58, "grad_norm": 2.1495582692600808, "learning_rate": 7.802215553322703e-06, "loss": 0.7599, "step": 3552 }, { "epoch": 0.58, "grad_norm": 2.219914305844641, "learning_rate": 7.79702572045445e-06, "loss": 0.8308, "step": 3553 }, { "epoch": 0.58, "grad_norm": 1.8227348812592694, "learning_rate": 7.791836511133429e-06, "loss": 0.7033, "step": 3554 }, { "epoch": 0.58, "grad_norm": 0.6984132629274821, "learning_rate": 7.78664792682844e-06, "loss": 0.371, "step": 3555 }, { "epoch": 0.58, "grad_norm": 2.8766284131078135, "learning_rate": 7.781459969008098e-06, "loss": 0.7656, "step": 3556 }, { "epoch": 0.58, "grad_norm": 1.5503555554855712, "learning_rate": 7.776272639140845e-06, "loss": 0.8898, "step": 3557 }, { "epoch": 0.58, "grad_norm": 3.1544455743218736, "learning_rate": 7.771085938694943e-06, "loss": 0.7645, "step": 3558 }, { "epoch": 0.58, "grad_norm": 1.894277245859462, "learning_rate": 7.765899869138478e-06, "loss": 0.7753, "step": 3559 }, { "epoch": 0.58, "grad_norm": 1.9026313913940975, "learning_rate": 7.760714431939354e-06, "loss": 0.7568, "step": 3560 }, { "epoch": 0.58, "grad_norm": 2.1922950962576677, "learning_rate": 7.755529628565298e-06, "loss": 0.7243, "step": 3561 }, { "epoch": 0.59, "grad_norm": 2.587306833920044, "learning_rate": 7.750345460483859e-06, "loss": 0.7059, "step": 3562 }, { "epoch": 0.59, "grad_norm": 2.621821934003996, "learning_rate": 7.745161929162405e-06, "loss": 0.8482, "step": 3563 }, { "epoch": 0.59, "grad_norm": 0.5929274394770929, "learning_rate": 7.739979036068125e-06, "loss": 0.332, "step": 3564 }, { "epoch": 0.59, "grad_norm": 2.246419107230401, "learning_rate": 7.734796782668021e-06, "loss": 0.7299, "step": 3565 }, { "epoch": 0.59, "grad_norm": 2.1118776546194318, "learning_rate": 7.729615170428923e-06, "loss": 0.8077, "step": 3566 }, { "epoch": 0.59, "grad_norm": 1.9438297342121686, "learning_rate": 7.724434200817473e-06, "loss": 0.7691, "step": 3567 }, { "epoch": 0.59, "grad_norm": 1.6237165587606368, "learning_rate": 7.719253875300138e-06, "loss": 0.8446, "step": 3568 }, { "epoch": 0.59, "grad_norm": 1.7657970448437603, "learning_rate": 7.7140741953432e-06, "loss": 0.8428, "step": 3569 }, { "epoch": 0.59, "grad_norm": 0.5997680613093782, "learning_rate": 7.708895162412745e-06, "loss": 0.3248, "step": 3570 }, { "epoch": 0.59, "grad_norm": 2.971104338522317, "learning_rate": 7.703716777974694e-06, "loss": 0.807, "step": 3571 }, { "epoch": 0.59, "grad_norm": 1.9576577731905676, "learning_rate": 7.69853904349478e-06, "loss": 0.7627, "step": 3572 }, { "epoch": 0.59, "grad_norm": 2.9971324087216598, "learning_rate": 7.693361960438548e-06, "loss": 0.7942, "step": 3573 }, { "epoch": 0.59, "grad_norm": 0.6402131926799342, "learning_rate": 7.688185530271359e-06, "loss": 0.3433, "step": 3574 }, { "epoch": 0.59, "grad_norm": 0.6248679228588497, "learning_rate": 7.683009754458394e-06, "loss": 0.3673, "step": 3575 }, { "epoch": 0.59, "grad_norm": 2.453432576235342, "learning_rate": 7.67783463446464e-06, "loss": 0.7457, "step": 3576 }, { "epoch": 0.59, "grad_norm": 2.0120536744259367, "learning_rate": 7.67266017175491e-06, "loss": 0.7889, "step": 3577 }, { "epoch": 0.59, "grad_norm": 2.2721957539996214, "learning_rate": 7.667486367793822e-06, "loss": 0.7899, "step": 3578 }, { "epoch": 0.59, "grad_norm": 2.2237606527512805, "learning_rate": 7.66231322404581e-06, "loss": 0.8193, "step": 3579 }, { "epoch": 0.59, "grad_norm": 2.510649980746846, "learning_rate": 7.657140741975121e-06, "loss": 0.7317, "step": 3580 }, { "epoch": 0.59, "grad_norm": 2.14397395884924, "learning_rate": 7.651968923045817e-06, "loss": 0.7543, "step": 3581 }, { "epoch": 0.59, "grad_norm": 0.5902400144418352, "learning_rate": 7.64679776872177e-06, "loss": 0.3555, "step": 3582 }, { "epoch": 0.59, "grad_norm": 1.7519351163690624, "learning_rate": 7.641627280466663e-06, "loss": 0.8086, "step": 3583 }, { "epoch": 0.59, "grad_norm": 1.712345384464395, "learning_rate": 7.636457459743993e-06, "loss": 0.7575, "step": 3584 }, { "epoch": 0.59, "grad_norm": 2.1054599970653127, "learning_rate": 7.631288308017068e-06, "loss": 0.7228, "step": 3585 }, { "epoch": 0.59, "grad_norm": 2.189394901718488, "learning_rate": 7.626119826749002e-06, "loss": 0.7128, "step": 3586 }, { "epoch": 0.59, "grad_norm": 2.1085071559138138, "learning_rate": 7.6209520174027255e-06, "loss": 0.8439, "step": 3587 }, { "epoch": 0.59, "grad_norm": 2.3401896100335797, "learning_rate": 7.615784881440975e-06, "loss": 0.7742, "step": 3588 }, { "epoch": 0.59, "grad_norm": 2.6794289791171733, "learning_rate": 7.610618420326299e-06, "loss": 0.8116, "step": 3589 }, { "epoch": 0.59, "grad_norm": 1.6930917730473092, "learning_rate": 7.605452635521054e-06, "loss": 0.7838, "step": 3590 }, { "epoch": 0.59, "grad_norm": 1.9930419376405961, "learning_rate": 7.6002875284874e-06, "loss": 0.7913, "step": 3591 }, { "epoch": 0.59, "grad_norm": 2.409151334010497, "learning_rate": 7.595123100687313e-06, "loss": 0.8031, "step": 3592 }, { "epoch": 0.59, "grad_norm": 1.7750448204438636, "learning_rate": 7.589959353582574e-06, "loss": 0.7832, "step": 3593 }, { "epoch": 0.59, "grad_norm": 1.7000490859256443, "learning_rate": 7.584796288634768e-06, "loss": 0.7484, "step": 3594 }, { "epoch": 0.59, "grad_norm": 1.7643475378953082, "learning_rate": 7.5796339073052915e-06, "loss": 0.8533, "step": 3595 }, { "epoch": 0.59, "grad_norm": 1.9862653437519315, "learning_rate": 7.574472211055346e-06, "loss": 0.8395, "step": 3596 }, { "epoch": 0.59, "grad_norm": 1.6676591150142093, "learning_rate": 7.569311201345939e-06, "loss": 0.8172, "step": 3597 }, { "epoch": 0.59, "grad_norm": 3.299616349403807, "learning_rate": 7.564150879637882e-06, "loss": 0.8717, "step": 3598 }, { "epoch": 0.59, "grad_norm": 1.9211993366192626, "learning_rate": 7.558991247391792e-06, "loss": 0.8203, "step": 3599 }, { "epoch": 0.59, "grad_norm": 1.9823968002326973, "learning_rate": 7.553832306068095e-06, "loss": 0.7913, "step": 3600 }, { "epoch": 0.59, "grad_norm": 1.7228982862466515, "learning_rate": 7.548674057127019e-06, "loss": 0.7946, "step": 3601 }, { "epoch": 0.59, "grad_norm": 1.8432657668550345, "learning_rate": 7.543516502028594e-06, "loss": 0.8053, "step": 3602 }, { "epoch": 0.59, "grad_norm": 2.5085011221658484, "learning_rate": 7.538359642232654e-06, "loss": 0.7946, "step": 3603 }, { "epoch": 0.59, "grad_norm": 2.0945984922828047, "learning_rate": 7.53320347919884e-06, "loss": 0.7411, "step": 3604 }, { "epoch": 0.59, "grad_norm": 2.1773176923543587, "learning_rate": 7.52804801438659e-06, "loss": 0.795, "step": 3605 }, { "epoch": 0.59, "grad_norm": 1.9383412462650385, "learning_rate": 7.52289324925515e-06, "loss": 0.7667, "step": 3606 }, { "epoch": 0.59, "grad_norm": 6.541965404865172, "learning_rate": 7.517739185263564e-06, "loss": 0.7138, "step": 3607 }, { "epoch": 0.59, "grad_norm": 2.5256503625914943, "learning_rate": 7.5125858238706785e-06, "loss": 0.787, "step": 3608 }, { "epoch": 0.59, "grad_norm": 1.5128942886385106, "learning_rate": 7.507433166535143e-06, "loss": 0.8477, "step": 3609 }, { "epoch": 0.59, "grad_norm": 1.785609485542979, "learning_rate": 7.5022812147154065e-06, "loss": 0.7536, "step": 3610 }, { "epoch": 0.59, "grad_norm": 2.481831545720269, "learning_rate": 7.497129969869718e-06, "loss": 0.747, "step": 3611 }, { "epoch": 0.59, "grad_norm": 2.7741544424769944, "learning_rate": 7.491979433456127e-06, "loss": 0.7515, "step": 3612 }, { "epoch": 0.59, "grad_norm": 2.034867369601893, "learning_rate": 7.486829606932478e-06, "loss": 0.7071, "step": 3613 }, { "epoch": 0.59, "grad_norm": 2.145135101752443, "learning_rate": 7.481680491756424e-06, "loss": 0.7882, "step": 3614 }, { "epoch": 0.59, "grad_norm": 1.710841727228741, "learning_rate": 7.476532089385407e-06, "loss": 0.7938, "step": 3615 }, { "epoch": 0.59, "grad_norm": 0.6630527937806614, "learning_rate": 7.471384401276674e-06, "loss": 0.368, "step": 3616 }, { "epoch": 0.59, "grad_norm": 2.633767290647655, "learning_rate": 7.466237428887265e-06, "loss": 0.7262, "step": 3617 }, { "epoch": 0.59, "grad_norm": 1.4612479855605165, "learning_rate": 7.461091173674022e-06, "loss": 0.8112, "step": 3618 }, { "epoch": 0.59, "grad_norm": 2.5004994611032556, "learning_rate": 7.455945637093581e-06, "loss": 0.7622, "step": 3619 }, { "epoch": 0.59, "grad_norm": 2.2603208937685824, "learning_rate": 7.450800820602375e-06, "loss": 0.8365, "step": 3620 }, { "epoch": 0.59, "grad_norm": 1.9658610180564189, "learning_rate": 7.445656725656634e-06, "loss": 0.7915, "step": 3621 }, { "epoch": 0.59, "grad_norm": 1.8066007161104511, "learning_rate": 7.440513353712381e-06, "loss": 0.817, "step": 3622 }, { "epoch": 0.6, "grad_norm": 1.7524688873596457, "learning_rate": 7.43537070622544e-06, "loss": 0.752, "step": 3623 }, { "epoch": 0.6, "grad_norm": 2.124107312885052, "learning_rate": 7.430228784651426e-06, "loss": 0.8037, "step": 3624 }, { "epoch": 0.6, "grad_norm": 2.1579649413356496, "learning_rate": 7.425087590445747e-06, "loss": 0.8271, "step": 3625 }, { "epoch": 0.6, "grad_norm": 1.771033123037695, "learning_rate": 7.419947125063609e-06, "loss": 0.8064, "step": 3626 }, { "epoch": 0.6, "grad_norm": 3.0613949975370405, "learning_rate": 7.41480738996001e-06, "loss": 0.7533, "step": 3627 }, { "epoch": 0.6, "grad_norm": 2.1797372378709214, "learning_rate": 7.40966838658974e-06, "loss": 0.8238, "step": 3628 }, { "epoch": 0.6, "grad_norm": 1.6706085407669629, "learning_rate": 7.4045301164073834e-06, "loss": 0.7895, "step": 3629 }, { "epoch": 0.6, "grad_norm": 1.7404323335467233, "learning_rate": 7.399392580867317e-06, "loss": 0.7753, "step": 3630 }, { "epoch": 0.6, "grad_norm": 2.40448436879378, "learning_rate": 7.394255781423709e-06, "loss": 0.6907, "step": 3631 }, { "epoch": 0.6, "grad_norm": 1.8965917389920295, "learning_rate": 7.389119719530522e-06, "loss": 0.8343, "step": 3632 }, { "epoch": 0.6, "grad_norm": 2.8206895392444786, "learning_rate": 7.383984396641506e-06, "loss": 0.8154, "step": 3633 }, { "epoch": 0.6, "grad_norm": 2.440009442015456, "learning_rate": 7.378849814210201e-06, "loss": 0.7108, "step": 3634 }, { "epoch": 0.6, "grad_norm": 2.4087512516169083, "learning_rate": 7.373715973689941e-06, "loss": 0.774, "step": 3635 }, { "epoch": 0.6, "grad_norm": 2.11158690726921, "learning_rate": 7.3685828765338495e-06, "loss": 0.6783, "step": 3636 }, { "epoch": 0.6, "grad_norm": 0.7697710136104695, "learning_rate": 7.363450524194839e-06, "loss": 0.3547, "step": 3637 }, { "epoch": 0.6, "grad_norm": 1.998656302887428, "learning_rate": 7.358318918125613e-06, "loss": 0.7893, "step": 3638 }, { "epoch": 0.6, "grad_norm": 2.1428956271369057, "learning_rate": 7.353188059778657e-06, "loss": 0.7969, "step": 3639 }, { "epoch": 0.6, "grad_norm": 0.6330318489912429, "learning_rate": 7.348057950606253e-06, "loss": 0.3711, "step": 3640 }, { "epoch": 0.6, "grad_norm": 1.5323609703548444, "learning_rate": 7.342928592060468e-06, "loss": 0.806, "step": 3641 }, { "epoch": 0.6, "grad_norm": 2.4682497908512917, "learning_rate": 7.337799985593152e-06, "loss": 0.8334, "step": 3642 }, { "epoch": 0.6, "grad_norm": 2.084612471805963, "learning_rate": 7.332672132655953e-06, "loss": 0.8393, "step": 3643 }, { "epoch": 0.6, "grad_norm": 1.9780034009032423, "learning_rate": 7.327545034700294e-06, "loss": 0.7791, "step": 3644 }, { "epoch": 0.6, "grad_norm": 2.2479127905670544, "learning_rate": 7.3224186931773885e-06, "loss": 0.7551, "step": 3645 }, { "epoch": 0.6, "grad_norm": 1.8506140886536793, "learning_rate": 7.317293109538239e-06, "loss": 0.7106, "step": 3646 }, { "epoch": 0.6, "grad_norm": 1.9204651715053376, "learning_rate": 7.312168285233633e-06, "loss": 0.7579, "step": 3647 }, { "epoch": 0.6, "grad_norm": 2.6737172613525133, "learning_rate": 7.307044221714139e-06, "loss": 0.8245, "step": 3648 }, { "epoch": 0.6, "grad_norm": 2.9056246690882674, "learning_rate": 7.3019209204301115e-06, "loss": 0.7131, "step": 3649 }, { "epoch": 0.6, "grad_norm": 1.9418693366409037, "learning_rate": 7.296798382831691e-06, "loss": 0.759, "step": 3650 }, { "epoch": 0.6, "grad_norm": 2.6602393145180896, "learning_rate": 7.291676610368803e-06, "loss": 0.7894, "step": 3651 }, { "epoch": 0.6, "grad_norm": 2.154715544107327, "learning_rate": 7.286555604491151e-06, "loss": 0.7574, "step": 3652 }, { "epoch": 0.6, "grad_norm": 7.493778113479824, "learning_rate": 7.2814353666482276e-06, "loss": 0.8383, "step": 3653 }, { "epoch": 0.6, "grad_norm": 0.6364693723742018, "learning_rate": 7.276315898289303e-06, "loss": 0.3797, "step": 3654 }, { "epoch": 0.6, "grad_norm": 1.500042164081984, "learning_rate": 7.271197200863438e-06, "loss": 0.8026, "step": 3655 }, { "epoch": 0.6, "grad_norm": 3.2624243905167747, "learning_rate": 7.2660792758194596e-06, "loss": 0.6638, "step": 3656 }, { "epoch": 0.6, "grad_norm": 1.979271103564838, "learning_rate": 7.260962124605993e-06, "loss": 0.7428, "step": 3657 }, { "epoch": 0.6, "grad_norm": 2.027385255319654, "learning_rate": 7.2558457486714316e-06, "loss": 0.7437, "step": 3658 }, { "epoch": 0.6, "grad_norm": 1.850488520919181, "learning_rate": 7.2507301494639605e-06, "loss": 0.7077, "step": 3659 }, { "epoch": 0.6, "grad_norm": 2.240711324892186, "learning_rate": 7.245615328431535e-06, "loss": 0.7345, "step": 3660 }, { "epoch": 0.6, "grad_norm": 2.638273363094175, "learning_rate": 7.240501287021897e-06, "loss": 0.7983, "step": 3661 }, { "epoch": 0.6, "grad_norm": 2.1378385366885717, "learning_rate": 7.2353880266825635e-06, "loss": 0.8427, "step": 3662 }, { "epoch": 0.6, "grad_norm": 1.613509078109845, "learning_rate": 7.230275548860833e-06, "loss": 0.7586, "step": 3663 }, { "epoch": 0.6, "grad_norm": 2.1010581638655355, "learning_rate": 7.225163855003781e-06, "loss": 0.7154, "step": 3664 }, { "epoch": 0.6, "grad_norm": 2.8506886035684835, "learning_rate": 7.220052946558262e-06, "loss": 0.805, "step": 3665 }, { "epoch": 0.6, "grad_norm": 2.1299153601999685, "learning_rate": 7.2149428249709095e-06, "loss": 0.7099, "step": 3666 }, { "epoch": 0.6, "grad_norm": 2.511283636936101, "learning_rate": 7.209833491688131e-06, "loss": 0.771, "step": 3667 }, { "epoch": 0.6, "grad_norm": 1.959090910425313, "learning_rate": 7.2047249481561125e-06, "loss": 0.6988, "step": 3668 }, { "epoch": 0.6, "grad_norm": 1.733326238608321, "learning_rate": 7.1996171958208125e-06, "loss": 0.7768, "step": 3669 }, { "epoch": 0.6, "grad_norm": 0.6265085486725387, "learning_rate": 7.194510236127978e-06, "loss": 0.354, "step": 3670 }, { "epoch": 0.6, "grad_norm": 2.455447829175959, "learning_rate": 7.189404070523118e-06, "loss": 0.8756, "step": 3671 }, { "epoch": 0.6, "grad_norm": 1.7866871371283688, "learning_rate": 7.184298700451524e-06, "loss": 0.817, "step": 3672 }, { "epoch": 0.6, "grad_norm": 1.7296398284318133, "learning_rate": 7.179194127358258e-06, "loss": 0.8307, "step": 3673 }, { "epoch": 0.6, "grad_norm": 1.8622162140513394, "learning_rate": 7.17409035268816e-06, "loss": 0.782, "step": 3674 }, { "epoch": 0.6, "grad_norm": 1.519602856733056, "learning_rate": 7.168987377885843e-06, "loss": 0.8823, "step": 3675 }, { "epoch": 0.6, "grad_norm": 2.3665504108842152, "learning_rate": 7.163885204395692e-06, "loss": 0.7544, "step": 3676 }, { "epoch": 0.6, "grad_norm": 2.2533974655069495, "learning_rate": 7.158783833661869e-06, "loss": 0.7237, "step": 3677 }, { "epoch": 0.6, "grad_norm": 2.130182331758718, "learning_rate": 7.153683267128304e-06, "loss": 0.7753, "step": 3678 }, { "epoch": 0.6, "grad_norm": 5.44726047124599, "learning_rate": 7.148583506238701e-06, "loss": 0.8062, "step": 3679 }, { "epoch": 0.6, "grad_norm": 2.0165125195024505, "learning_rate": 7.143484552436537e-06, "loss": 0.8297, "step": 3680 }, { "epoch": 0.6, "grad_norm": 2.186331350898845, "learning_rate": 7.1383864071650635e-06, "loss": 0.807, "step": 3681 }, { "epoch": 0.6, "grad_norm": 1.859333340416139, "learning_rate": 7.133289071867295e-06, "loss": 0.8387, "step": 3682 }, { "epoch": 0.6, "grad_norm": 1.8134818436434563, "learning_rate": 7.128192547986023e-06, "loss": 0.7999, "step": 3683 }, { "epoch": 0.61, "grad_norm": 2.140936387121875, "learning_rate": 7.1230968369638096e-06, "loss": 0.7516, "step": 3684 }, { "epoch": 0.61, "grad_norm": 3.918659405047983, "learning_rate": 7.118001940242984e-06, "loss": 0.7509, "step": 3685 }, { "epoch": 0.61, "grad_norm": 1.7637949923917946, "learning_rate": 7.112907859265646e-06, "loss": 0.7679, "step": 3686 }, { "epoch": 0.61, "grad_norm": 0.6996741526813356, "learning_rate": 7.1078145954736655e-06, "loss": 0.3414, "step": 3687 }, { "epoch": 0.61, "grad_norm": 6.540507488276241, "learning_rate": 7.102722150308678e-06, "loss": 0.7533, "step": 3688 }, { "epoch": 0.61, "grad_norm": 5.070481866860792, "learning_rate": 7.097630525212091e-06, "loss": 0.7736, "step": 3689 }, { "epoch": 0.61, "grad_norm": 2.21737149596881, "learning_rate": 7.092539721625078e-06, "loss": 0.8992, "step": 3690 }, { "epoch": 0.61, "grad_norm": 1.7297215409710587, "learning_rate": 7.087449740988579e-06, "loss": 0.8091, "step": 3691 }, { "epoch": 0.61, "grad_norm": 2.083159252302312, "learning_rate": 7.082360584743302e-06, "loss": 0.8214, "step": 3692 }, { "epoch": 0.61, "grad_norm": 2.000842155801136, "learning_rate": 7.077272254329726e-06, "loss": 0.7719, "step": 3693 }, { "epoch": 0.61, "grad_norm": 1.7080976624394617, "learning_rate": 7.072184751188088e-06, "loss": 0.7359, "step": 3694 }, { "epoch": 0.61, "grad_norm": 3.467994417610887, "learning_rate": 7.067098076758398e-06, "loss": 0.748, "step": 3695 }, { "epoch": 0.61, "grad_norm": 1.6798609755386842, "learning_rate": 7.062012232480427e-06, "loss": 0.7654, "step": 3696 }, { "epoch": 0.61, "grad_norm": 2.4059968208045124, "learning_rate": 7.056927219793711e-06, "loss": 0.6817, "step": 3697 }, { "epoch": 0.61, "grad_norm": 1.9051921618421552, "learning_rate": 7.051843040137558e-06, "loss": 0.8123, "step": 3698 }, { "epoch": 0.61, "grad_norm": 2.1869847315685487, "learning_rate": 7.046759694951029e-06, "loss": 0.7397, "step": 3699 }, { "epoch": 0.61, "grad_norm": 1.7484051190449603, "learning_rate": 7.04167718567295e-06, "loss": 0.7842, "step": 3700 }, { "epoch": 0.61, "grad_norm": 1.8282873789447929, "learning_rate": 7.036595513741924e-06, "loss": 0.8106, "step": 3701 }, { "epoch": 0.61, "grad_norm": 2.357851062356127, "learning_rate": 7.0315146805963004e-06, "loss": 0.783, "step": 3702 }, { "epoch": 0.61, "grad_norm": 2.0262199505952867, "learning_rate": 7.026434687674204e-06, "loss": 0.8238, "step": 3703 }, { "epoch": 0.61, "grad_norm": 0.5867367069644557, "learning_rate": 7.021355536413513e-06, "loss": 0.3614, "step": 3704 }, { "epoch": 0.61, "grad_norm": 2.07468725528853, "learning_rate": 7.016277228251871e-06, "loss": 0.799, "step": 3705 }, { "epoch": 0.61, "grad_norm": 1.954202718469666, "learning_rate": 7.011199764626682e-06, "loss": 0.8237, "step": 3706 }, { "epoch": 0.61, "grad_norm": 1.7825404584035576, "learning_rate": 7.006123146975112e-06, "loss": 0.7798, "step": 3707 }, { "epoch": 0.61, "grad_norm": 2.219191636942724, "learning_rate": 7.001047376734087e-06, "loss": 0.8345, "step": 3708 }, { "epoch": 0.61, "grad_norm": 1.777923310002072, "learning_rate": 6.995972455340292e-06, "loss": 0.7993, "step": 3709 }, { "epoch": 0.61, "grad_norm": 2.3325140326663396, "learning_rate": 6.990898384230174e-06, "loss": 0.7618, "step": 3710 }, { "epoch": 0.61, "grad_norm": 0.6349818819985403, "learning_rate": 6.985825164839937e-06, "loss": 0.3492, "step": 3711 }, { "epoch": 0.61, "grad_norm": 1.854269933227085, "learning_rate": 6.980752798605547e-06, "loss": 0.7792, "step": 3712 }, { "epoch": 0.61, "grad_norm": 2.2749147047192304, "learning_rate": 6.975681286962724e-06, "loss": 0.7873, "step": 3713 }, { "epoch": 0.61, "grad_norm": 1.808456991937728, "learning_rate": 6.970610631346951e-06, "loss": 0.8505, "step": 3714 }, { "epoch": 0.61, "grad_norm": 1.8285858112836426, "learning_rate": 6.965540833193464e-06, "loss": 0.7773, "step": 3715 }, { "epoch": 0.61, "grad_norm": 2.1313352246643844, "learning_rate": 6.9604718939372615e-06, "loss": 0.7516, "step": 3716 }, { "epoch": 0.61, "grad_norm": 2.350573758358006, "learning_rate": 6.9554038150130955e-06, "loss": 0.7746, "step": 3717 }, { "epoch": 0.61, "grad_norm": 1.3991596714038987, "learning_rate": 6.9503365978554735e-06, "loss": 0.7987, "step": 3718 }, { "epoch": 0.61, "grad_norm": 2.0934453647860827, "learning_rate": 6.945270243898662e-06, "loss": 0.792, "step": 3719 }, { "epoch": 0.61, "grad_norm": 1.916261523361249, "learning_rate": 6.940204754576685e-06, "loss": 0.8093, "step": 3720 }, { "epoch": 0.61, "grad_norm": 4.1753521545230825, "learning_rate": 6.935140131323312e-06, "loss": 0.7268, "step": 3721 }, { "epoch": 0.61, "grad_norm": 1.6853478974988259, "learning_rate": 6.930076375572077e-06, "loss": 0.7865, "step": 3722 }, { "epoch": 0.61, "grad_norm": 1.996081939671129, "learning_rate": 6.925013488756264e-06, "loss": 0.8772, "step": 3723 }, { "epoch": 0.61, "grad_norm": 2.047264772441053, "learning_rate": 6.919951472308912e-06, "loss": 0.8331, "step": 3724 }, { "epoch": 0.61, "grad_norm": 1.841597524118834, "learning_rate": 6.9148903276628175e-06, "loss": 0.7945, "step": 3725 }, { "epoch": 0.61, "grad_norm": 2.2291137626188737, "learning_rate": 6.909830056250527e-06, "loss": 0.7313, "step": 3726 }, { "epoch": 0.61, "grad_norm": 2.6197078900294706, "learning_rate": 6.904770659504336e-06, "loss": 0.8553, "step": 3727 }, { "epoch": 0.61, "grad_norm": 1.9343886948809796, "learning_rate": 6.8997121388563e-06, "loss": 0.7357, "step": 3728 }, { "epoch": 0.61, "grad_norm": 2.2956104596515354, "learning_rate": 6.89465449573822e-06, "loss": 0.8026, "step": 3729 }, { "epoch": 0.61, "grad_norm": 1.8978630361918274, "learning_rate": 6.889597731581652e-06, "loss": 0.775, "step": 3730 }, { "epoch": 0.61, "grad_norm": 4.514478774726146, "learning_rate": 6.8845418478179016e-06, "loss": 0.6917, "step": 3731 }, { "epoch": 0.61, "grad_norm": 2.0263211560613668, "learning_rate": 6.879486845878027e-06, "loss": 0.7834, "step": 3732 }, { "epoch": 0.61, "grad_norm": 1.7975869052799451, "learning_rate": 6.874432727192837e-06, "loss": 0.7021, "step": 3733 }, { "epoch": 0.61, "grad_norm": 2.0427686031291774, "learning_rate": 6.869379493192886e-06, "loss": 0.7396, "step": 3734 }, { "epoch": 0.61, "grad_norm": 1.601075084930034, "learning_rate": 6.8643271453084845e-06, "loss": 0.7907, "step": 3735 }, { "epoch": 0.61, "grad_norm": 1.7964203712463342, "learning_rate": 6.859275684969686e-06, "loss": 0.7779, "step": 3736 }, { "epoch": 0.61, "grad_norm": 2.200105687889724, "learning_rate": 6.854225113606299e-06, "loss": 0.7645, "step": 3737 }, { "epoch": 0.61, "grad_norm": 1.9455027344662656, "learning_rate": 6.849175432647875e-06, "loss": 0.7101, "step": 3738 }, { "epoch": 0.61, "grad_norm": 1.5245951555269845, "learning_rate": 6.844126643523714e-06, "loss": 0.8514, "step": 3739 }, { "epoch": 0.61, "grad_norm": 0.6385092399877019, "learning_rate": 6.839078747662871e-06, "loss": 0.344, "step": 3740 }, { "epoch": 0.61, "grad_norm": 2.2584126002747236, "learning_rate": 6.834031746494136e-06, "loss": 0.8225, "step": 3741 }, { "epoch": 0.61, "grad_norm": 1.567166158527675, "learning_rate": 6.8289856414460595e-06, "loss": 0.6937, "step": 3742 }, { "epoch": 0.61, "grad_norm": 1.8390461601628358, "learning_rate": 6.823940433946921e-06, "loss": 0.7608, "step": 3743 }, { "epoch": 0.61, "grad_norm": 2.3022341997076765, "learning_rate": 6.818896125424762e-06, "loss": 0.7375, "step": 3744 }, { "epoch": 0.62, "grad_norm": 1.9467566897181734, "learning_rate": 6.813852717307362e-06, "loss": 0.7023, "step": 3745 }, { "epoch": 0.62, "grad_norm": 2.280663456317717, "learning_rate": 6.808810211022248e-06, "loss": 0.8119, "step": 3746 }, { "epoch": 0.62, "grad_norm": 1.6461209491546192, "learning_rate": 6.803768607996686e-06, "loss": 0.8138, "step": 3747 }, { "epoch": 0.62, "grad_norm": 1.7441781185860126, "learning_rate": 6.798727909657698e-06, "loss": 0.7597, "step": 3748 }, { "epoch": 0.62, "grad_norm": 2.068127174991007, "learning_rate": 6.793688117432041e-06, "loss": 0.8101, "step": 3749 }, { "epoch": 0.62, "grad_norm": 2.015454506624647, "learning_rate": 6.788649232746217e-06, "loss": 0.7857, "step": 3750 }, { "epoch": 0.62, "grad_norm": 2.256317843754289, "learning_rate": 6.783611257026471e-06, "loss": 0.7304, "step": 3751 }, { "epoch": 0.62, "grad_norm": 2.005015027456095, "learning_rate": 6.778574191698793e-06, "loss": 0.7508, "step": 3752 }, { "epoch": 0.62, "grad_norm": 1.9654266606656556, "learning_rate": 6.773538038188912e-06, "loss": 0.8053, "step": 3753 }, { "epoch": 0.62, "grad_norm": 1.8131737358345175, "learning_rate": 6.768502797922301e-06, "loss": 0.8048, "step": 3754 }, { "epoch": 0.62, "grad_norm": 2.6055489754735746, "learning_rate": 6.763468472324175e-06, "loss": 0.845, "step": 3755 }, { "epoch": 0.62, "grad_norm": 2.0817009563627704, "learning_rate": 6.758435062819488e-06, "loss": 0.7259, "step": 3756 }, { "epoch": 0.62, "grad_norm": 1.6906687395221371, "learning_rate": 6.7534025708329385e-06, "loss": 0.7218, "step": 3757 }, { "epoch": 0.62, "grad_norm": 1.7537296339503752, "learning_rate": 6.74837099778896e-06, "loss": 0.8141, "step": 3758 }, { "epoch": 0.62, "grad_norm": 2.6855220809343407, "learning_rate": 6.743340345111731e-06, "loss": 0.8049, "step": 3759 }, { "epoch": 0.62, "grad_norm": 1.6329656275148663, "learning_rate": 6.738310614225164e-06, "loss": 0.797, "step": 3760 }, { "epoch": 0.62, "grad_norm": 1.9535362243857555, "learning_rate": 6.733281806552917e-06, "loss": 0.7771, "step": 3761 }, { "epoch": 0.62, "grad_norm": 2.8363311894902963, "learning_rate": 6.728253923518379e-06, "loss": 0.7953, "step": 3762 }, { "epoch": 0.62, "grad_norm": 1.7876249570686853, "learning_rate": 6.723226966544691e-06, "loss": 0.7372, "step": 3763 }, { "epoch": 0.62, "grad_norm": 2.252039992820522, "learning_rate": 6.718200937054714e-06, "loss": 0.7703, "step": 3764 }, { "epoch": 0.62, "grad_norm": 1.610742952072645, "learning_rate": 6.713175836471057e-06, "loss": 0.7937, "step": 3765 }, { "epoch": 0.62, "grad_norm": 2.0878942918387233, "learning_rate": 6.708151666216063e-06, "loss": 0.6888, "step": 3766 }, { "epoch": 0.62, "grad_norm": 1.7013531459790912, "learning_rate": 6.703128427711816e-06, "loss": 0.8557, "step": 3767 }, { "epoch": 0.62, "grad_norm": 1.4552892332981036, "learning_rate": 6.69810612238013e-06, "loss": 0.8305, "step": 3768 }, { "epoch": 0.62, "grad_norm": 1.961308514567969, "learning_rate": 6.6930847516425615e-06, "loss": 0.7372, "step": 3769 }, { "epoch": 0.62, "grad_norm": 1.5250586391367411, "learning_rate": 6.688064316920393e-06, "loss": 0.7939, "step": 3770 }, { "epoch": 0.62, "grad_norm": 2.3108961480350705, "learning_rate": 6.683044819634654e-06, "loss": 0.7389, "step": 3771 }, { "epoch": 0.62, "grad_norm": 1.5867351374349052, "learning_rate": 6.678026261206102e-06, "loss": 0.7131, "step": 3772 }, { "epoch": 0.62, "grad_norm": 2.609943302592258, "learning_rate": 6.673008643055228e-06, "loss": 0.7647, "step": 3773 }, { "epoch": 0.62, "grad_norm": 1.8832206013255732, "learning_rate": 6.667991966602257e-06, "loss": 0.7882, "step": 3774 }, { "epoch": 0.62, "grad_norm": 1.5835664408275425, "learning_rate": 6.66297623326715e-06, "loss": 0.6999, "step": 3775 }, { "epoch": 0.62, "grad_norm": 1.7714328853918224, "learning_rate": 6.657961444469601e-06, "loss": 0.8295, "step": 3776 }, { "epoch": 0.62, "grad_norm": 1.724638536624079, "learning_rate": 6.652947601629032e-06, "loss": 0.8297, "step": 3777 }, { "epoch": 0.62, "grad_norm": 3.9371086814669307, "learning_rate": 6.6479347061646046e-06, "loss": 0.8548, "step": 3778 }, { "epoch": 0.62, "grad_norm": 3.569400390301635, "learning_rate": 6.642922759495205e-06, "loss": 0.7987, "step": 3779 }, { "epoch": 0.62, "grad_norm": 2.120441248811595, "learning_rate": 6.637911763039457e-06, "loss": 0.8117, "step": 3780 }, { "epoch": 0.62, "grad_norm": 2.2345503791949284, "learning_rate": 6.632901718215711e-06, "loss": 0.8439, "step": 3781 }, { "epoch": 0.62, "grad_norm": 1.6660417843073267, "learning_rate": 6.627892626442049e-06, "loss": 0.8461, "step": 3782 }, { "epoch": 0.62, "grad_norm": 1.6508328779527923, "learning_rate": 6.622884489136286e-06, "loss": 0.7544, "step": 3783 }, { "epoch": 0.62, "grad_norm": 1.705495052397929, "learning_rate": 6.617877307715963e-06, "loss": 0.8339, "step": 3784 }, { "epoch": 0.62, "grad_norm": 0.6355286909921354, "learning_rate": 6.612871083598354e-06, "loss": 0.3817, "step": 3785 }, { "epoch": 0.62, "grad_norm": 2.1653692174073087, "learning_rate": 6.607865818200458e-06, "loss": 0.8324, "step": 3786 }, { "epoch": 0.62, "grad_norm": 2.3819344499258293, "learning_rate": 6.602861512939005e-06, "loss": 0.7257, "step": 3787 }, { "epoch": 0.62, "grad_norm": 1.865596684807114, "learning_rate": 6.597858169230454e-06, "loss": 0.7883, "step": 3788 }, { "epoch": 0.62, "grad_norm": 1.934298895559284, "learning_rate": 6.592855788490991e-06, "loss": 0.8105, "step": 3789 }, { "epoch": 0.62, "grad_norm": 2.0810939438279052, "learning_rate": 6.587854372136529e-06, "loss": 0.827, "step": 3790 }, { "epoch": 0.62, "grad_norm": 1.954831016468956, "learning_rate": 6.582853921582708e-06, "loss": 0.8282, "step": 3791 }, { "epoch": 0.62, "grad_norm": 2.3923508140169396, "learning_rate": 6.577854438244897e-06, "loss": 0.789, "step": 3792 }, { "epoch": 0.62, "grad_norm": 2.081749228675837, "learning_rate": 6.572855923538186e-06, "loss": 0.7872, "step": 3793 }, { "epoch": 0.62, "grad_norm": 2.417852458488274, "learning_rate": 6.567858378877394e-06, "loss": 0.7403, "step": 3794 }, { "epoch": 0.62, "grad_norm": 0.6019923492762541, "learning_rate": 6.5628618056770696e-06, "loss": 0.336, "step": 3795 }, { "epoch": 0.62, "grad_norm": 1.8138653136432086, "learning_rate": 6.557866205351479e-06, "loss": 0.78, "step": 3796 }, { "epoch": 0.62, "grad_norm": 1.885787576281024, "learning_rate": 6.552871579314619e-06, "loss": 0.7953, "step": 3797 }, { "epoch": 0.62, "grad_norm": 2.1250826747834592, "learning_rate": 6.547877928980206e-06, "loss": 0.8018, "step": 3798 }, { "epoch": 0.62, "grad_norm": 1.8230973294545507, "learning_rate": 6.542885255761682e-06, "loss": 0.7642, "step": 3799 }, { "epoch": 0.62, "grad_norm": 6.450120944371559, "learning_rate": 6.537893561072214e-06, "loss": 0.7879, "step": 3800 }, { "epoch": 0.62, "grad_norm": 1.8885503145318798, "learning_rate": 6.532902846324689e-06, "loss": 0.7293, "step": 3801 }, { "epoch": 0.62, "grad_norm": 2.1619351126096196, "learning_rate": 6.52791311293172e-06, "loss": 0.737, "step": 3802 }, { "epoch": 0.62, "grad_norm": 0.642108079131575, "learning_rate": 6.522924362305639e-06, "loss": 0.3669, "step": 3803 }, { "epoch": 0.62, "grad_norm": 2.3535413153297373, "learning_rate": 6.517936595858503e-06, "loss": 0.7927, "step": 3804 }, { "epoch": 0.62, "grad_norm": 0.5886150948328788, "learning_rate": 6.512949815002088e-06, "loss": 0.3602, "step": 3805 }, { "epoch": 0.63, "grad_norm": 1.7635269060628223, "learning_rate": 6.50796402114789e-06, "loss": 0.7997, "step": 3806 }, { "epoch": 0.63, "grad_norm": 1.7551294023811692, "learning_rate": 6.502979215707133e-06, "loss": 0.7673, "step": 3807 }, { "epoch": 0.63, "grad_norm": 1.7558182623041145, "learning_rate": 6.497995400090748e-06, "loss": 0.8084, "step": 3808 }, { "epoch": 0.63, "grad_norm": 1.941900826808954, "learning_rate": 6.4930125757094e-06, "loss": 0.7084, "step": 3809 }, { "epoch": 0.63, "grad_norm": 1.8172444658981595, "learning_rate": 6.488030743973463e-06, "loss": 0.82, "step": 3810 }, { "epoch": 0.63, "grad_norm": 1.929059195365352, "learning_rate": 6.483049906293035e-06, "loss": 0.7762, "step": 3811 }, { "epoch": 0.63, "grad_norm": 2.866930283123111, "learning_rate": 6.478070064077933e-06, "loss": 0.8524, "step": 3812 }, { "epoch": 0.63, "grad_norm": 1.8080426714716078, "learning_rate": 6.4730912187376895e-06, "loss": 0.7961, "step": 3813 }, { "epoch": 0.63, "grad_norm": 1.434529201782498, "learning_rate": 6.468113371681557e-06, "loss": 0.7192, "step": 3814 }, { "epoch": 0.63, "grad_norm": 2.121611289883668, "learning_rate": 6.463136524318503e-06, "loss": 0.7744, "step": 3815 }, { "epoch": 0.63, "grad_norm": 1.8813376921793272, "learning_rate": 6.4581606780572155e-06, "loss": 0.7872, "step": 3816 }, { "epoch": 0.63, "grad_norm": 1.7120239051814323, "learning_rate": 6.453185834306095e-06, "loss": 0.7076, "step": 3817 }, { "epoch": 0.63, "grad_norm": 1.7404325232449787, "learning_rate": 6.448211994473263e-06, "loss": 0.7165, "step": 3818 }, { "epoch": 0.63, "grad_norm": 1.7684081290385623, "learning_rate": 6.443239159966556e-06, "loss": 0.7949, "step": 3819 }, { "epoch": 0.63, "grad_norm": 1.6332538326183244, "learning_rate": 6.438267332193519e-06, "loss": 0.6996, "step": 3820 }, { "epoch": 0.63, "grad_norm": 1.960539514840596, "learning_rate": 6.4332965125614235e-06, "loss": 0.8517, "step": 3821 }, { "epoch": 0.63, "grad_norm": 1.9106563247465271, "learning_rate": 6.428326702477246e-06, "loss": 0.724, "step": 3822 }, { "epoch": 0.63, "grad_norm": 5.393585006322098, "learning_rate": 6.42335790334768e-06, "loss": 0.844, "step": 3823 }, { "epoch": 0.63, "grad_norm": 1.8669763827605759, "learning_rate": 6.418390116579134e-06, "loss": 0.7817, "step": 3824 }, { "epoch": 0.63, "grad_norm": 2.4013037046621024, "learning_rate": 6.4134233435777315e-06, "loss": 0.8362, "step": 3825 }, { "epoch": 0.63, "grad_norm": 2.6516696872942402, "learning_rate": 6.408457585749307e-06, "loss": 0.7959, "step": 3826 }, { "epoch": 0.63, "grad_norm": 0.6428423188881098, "learning_rate": 6.403492844499406e-06, "loss": 0.3659, "step": 3827 }, { "epoch": 0.63, "grad_norm": 1.9444532895654725, "learning_rate": 6.398529121233291e-06, "loss": 0.838, "step": 3828 }, { "epoch": 0.63, "grad_norm": 1.7830297277056435, "learning_rate": 6.39356641735593e-06, "loss": 0.7389, "step": 3829 }, { "epoch": 0.63, "grad_norm": 2.4564314478530935, "learning_rate": 6.388604734272006e-06, "loss": 0.8426, "step": 3830 }, { "epoch": 0.63, "grad_norm": 2.1208665710346724, "learning_rate": 6.383644073385915e-06, "loss": 0.8122, "step": 3831 }, { "epoch": 0.63, "grad_norm": 2.1298343539588527, "learning_rate": 6.378684436101761e-06, "loss": 0.8269, "step": 3832 }, { "epoch": 0.63, "grad_norm": 2.448648808781087, "learning_rate": 6.373725823823359e-06, "loss": 0.7631, "step": 3833 }, { "epoch": 0.63, "grad_norm": 2.362206176546178, "learning_rate": 6.368768237954234e-06, "loss": 0.8223, "step": 3834 }, { "epoch": 0.63, "grad_norm": 1.6758468893901257, "learning_rate": 6.363811679897618e-06, "loss": 0.7611, "step": 3835 }, { "epoch": 0.63, "grad_norm": 2.6012368981196086, "learning_rate": 6.358856151056458e-06, "loss": 0.7522, "step": 3836 }, { "epoch": 0.63, "grad_norm": 1.8140166087369602, "learning_rate": 6.353901652833403e-06, "loss": 0.7857, "step": 3837 }, { "epoch": 0.63, "grad_norm": 2.0007069177170758, "learning_rate": 6.348948186630815e-06, "loss": 0.7479, "step": 3838 }, { "epoch": 0.63, "grad_norm": 2.268203937374718, "learning_rate": 6.343995753850762e-06, "loss": 0.8121, "step": 3839 }, { "epoch": 0.63, "grad_norm": 2.264754626425907, "learning_rate": 6.339044355895016e-06, "loss": 0.7978, "step": 3840 }, { "epoch": 0.63, "grad_norm": 2.5760933059914715, "learning_rate": 6.334093994165067e-06, "loss": 0.8383, "step": 3841 }, { "epoch": 0.63, "grad_norm": 3.0634001117669665, "learning_rate": 6.3291446700621e-06, "loss": 0.7966, "step": 3842 }, { "epoch": 0.63, "grad_norm": 2.4061855374752734, "learning_rate": 6.324196384987009e-06, "loss": 0.7743, "step": 3843 }, { "epoch": 0.63, "grad_norm": 1.8541331557887712, "learning_rate": 6.3192491403404e-06, "loss": 0.7788, "step": 3844 }, { "epoch": 0.63, "grad_norm": 1.6182131922064755, "learning_rate": 6.3143029375225785e-06, "loss": 0.7745, "step": 3845 }, { "epoch": 0.63, "grad_norm": 1.807355976073444, "learning_rate": 6.309357777933555e-06, "loss": 0.8259, "step": 3846 }, { "epoch": 0.63, "grad_norm": 2.2159246240130455, "learning_rate": 6.30441366297305e-06, "loss": 0.7569, "step": 3847 }, { "epoch": 0.63, "grad_norm": 0.589500985965397, "learning_rate": 6.2994705940404825e-06, "loss": 0.3663, "step": 3848 }, { "epoch": 0.63, "grad_norm": 1.6051243715716226, "learning_rate": 6.294528572534977e-06, "loss": 0.7958, "step": 3849 }, { "epoch": 0.63, "grad_norm": 2.0370330643283054, "learning_rate": 6.289587599855367e-06, "loss": 0.7361, "step": 3850 }, { "epoch": 0.63, "grad_norm": 2.1518649618582897, "learning_rate": 6.284647677400177e-06, "loss": 0.7294, "step": 3851 }, { "epoch": 0.63, "grad_norm": 2.0240648539299637, "learning_rate": 6.279708806567646e-06, "loss": 0.6373, "step": 3852 }, { "epoch": 0.63, "grad_norm": 3.897112001212918, "learning_rate": 6.274770988755712e-06, "loss": 0.7479, "step": 3853 }, { "epoch": 0.63, "grad_norm": 5.395396531688008, "learning_rate": 6.2698342253620105e-06, "loss": 0.7868, "step": 3854 }, { "epoch": 0.63, "grad_norm": 2.011575058353737, "learning_rate": 6.264898517783885e-06, "loss": 0.7629, "step": 3855 }, { "epoch": 0.63, "grad_norm": 2.2606844597456184, "learning_rate": 6.259963867418375e-06, "loss": 0.7188, "step": 3856 }, { "epoch": 0.63, "grad_norm": 1.588064804703417, "learning_rate": 6.255030275662226e-06, "loss": 0.749, "step": 3857 }, { "epoch": 0.63, "grad_norm": 1.87098004486344, "learning_rate": 6.250097743911877e-06, "loss": 0.7361, "step": 3858 }, { "epoch": 0.63, "grad_norm": 1.7647089225481096, "learning_rate": 6.245166273563473e-06, "loss": 0.8021, "step": 3859 }, { "epoch": 0.63, "grad_norm": 0.6130294416366084, "learning_rate": 6.240235866012856e-06, "loss": 0.364, "step": 3860 }, { "epoch": 0.63, "grad_norm": 1.9543349700347736, "learning_rate": 6.235306522655566e-06, "loss": 0.7392, "step": 3861 }, { "epoch": 0.63, "grad_norm": 1.829077500747619, "learning_rate": 6.230378244886847e-06, "loss": 0.7552, "step": 3862 }, { "epoch": 0.63, "grad_norm": 1.7143976222160924, "learning_rate": 6.225451034101631e-06, "loss": 0.717, "step": 3863 }, { "epoch": 0.63, "grad_norm": 1.801741538597847, "learning_rate": 6.220524891694562e-06, "loss": 0.7525, "step": 3864 }, { "epoch": 0.63, "grad_norm": 1.6815579640571507, "learning_rate": 6.2155998190599705e-06, "loss": 0.7787, "step": 3865 }, { "epoch": 0.63, "grad_norm": 5.501386406503948, "learning_rate": 6.210675817591889e-06, "loss": 0.8168, "step": 3866 }, { "epoch": 0.64, "grad_norm": 25.942406375289423, "learning_rate": 6.2057528886840445e-06, "loss": 0.7007, "step": 3867 }, { "epoch": 0.64, "grad_norm": 2.1364031423331213, "learning_rate": 6.200831033729864e-06, "loss": 0.7516, "step": 3868 }, { "epoch": 0.64, "grad_norm": 1.9014441223049008, "learning_rate": 6.195910254122466e-06, "loss": 0.7878, "step": 3869 }, { "epoch": 0.64, "grad_norm": 1.9128112059191567, "learning_rate": 6.190990551254668e-06, "loss": 0.8423, "step": 3870 }, { "epoch": 0.64, "grad_norm": 2.025478447154917, "learning_rate": 6.186071926518984e-06, "loss": 0.7636, "step": 3871 }, { "epoch": 0.64, "grad_norm": 2.2763011538484257, "learning_rate": 6.18115438130761e-06, "loss": 0.73, "step": 3872 }, { "epoch": 0.64, "grad_norm": 2.1589530233445218, "learning_rate": 6.176237917012459e-06, "loss": 0.7956, "step": 3873 }, { "epoch": 0.64, "grad_norm": 2.2148089309983123, "learning_rate": 6.171322535025119e-06, "loss": 0.7457, "step": 3874 }, { "epoch": 0.64, "grad_norm": 2.0372176300014826, "learning_rate": 6.166408236736883e-06, "loss": 0.8034, "step": 3875 }, { "epoch": 0.64, "grad_norm": 3.6737021718219, "learning_rate": 6.161495023538729e-06, "loss": 0.818, "step": 3876 }, { "epoch": 0.64, "grad_norm": 1.7061620363887846, "learning_rate": 6.1565828968213325e-06, "loss": 0.833, "step": 3877 }, { "epoch": 0.64, "grad_norm": 2.0615410656053013, "learning_rate": 6.151671857975061e-06, "loss": 0.7257, "step": 3878 }, { "epoch": 0.64, "grad_norm": 2.042972205965157, "learning_rate": 6.146761908389975e-06, "loss": 0.7142, "step": 3879 }, { "epoch": 0.64, "grad_norm": 2.692062054651249, "learning_rate": 6.141853049455824e-06, "loss": 0.7802, "step": 3880 }, { "epoch": 0.64, "grad_norm": 1.962904655306594, "learning_rate": 6.1369452825620515e-06, "loss": 0.8534, "step": 3881 }, { "epoch": 0.64, "grad_norm": 1.7810734324533395, "learning_rate": 6.132038609097788e-06, "loss": 0.7828, "step": 3882 }, { "epoch": 0.64, "grad_norm": 1.8495565173134223, "learning_rate": 6.12713303045186e-06, "loss": 0.7758, "step": 3883 }, { "epoch": 0.64, "grad_norm": 2.14285584543365, "learning_rate": 6.1222285480127786e-06, "loss": 0.7497, "step": 3884 }, { "epoch": 0.64, "grad_norm": 2.7695613551567546, "learning_rate": 6.11732516316875e-06, "loss": 0.81, "step": 3885 }, { "epoch": 0.64, "grad_norm": 2.3296067187351226, "learning_rate": 6.112422877307664e-06, "loss": 0.8514, "step": 3886 }, { "epoch": 0.64, "grad_norm": 4.548085883781577, "learning_rate": 6.107521691817104e-06, "loss": 0.8176, "step": 3887 }, { "epoch": 0.64, "grad_norm": 1.9647608747713905, "learning_rate": 6.10262160808434e-06, "loss": 0.7579, "step": 3888 }, { "epoch": 0.64, "grad_norm": 0.6644061076935446, "learning_rate": 6.097722627496332e-06, "loss": 0.32, "step": 3889 }, { "epoch": 0.64, "grad_norm": 1.649051945051587, "learning_rate": 6.092824751439723e-06, "loss": 0.7595, "step": 3890 }, { "epoch": 0.64, "grad_norm": 1.657354642475329, "learning_rate": 6.0879279813008495e-06, "loss": 0.8154, "step": 3891 }, { "epoch": 0.64, "grad_norm": 2.1395643169108522, "learning_rate": 6.083032318465731e-06, "loss": 0.7866, "step": 3892 }, { "epoch": 0.64, "grad_norm": 3.0247461214472526, "learning_rate": 6.0781377643200765e-06, "loss": 0.8359, "step": 3893 }, { "epoch": 0.64, "grad_norm": 2.318382880467663, "learning_rate": 6.073244320249274e-06, "loss": 0.7625, "step": 3894 }, { "epoch": 0.64, "grad_norm": 2.024375558224969, "learning_rate": 6.0683519876384034e-06, "loss": 0.7674, "step": 3895 }, { "epoch": 0.64, "grad_norm": 1.9564621706628096, "learning_rate": 6.063460767872233e-06, "loss": 0.7152, "step": 3896 }, { "epoch": 0.64, "grad_norm": 3.0669001363846427, "learning_rate": 6.05857066233521e-06, "loss": 0.736, "step": 3897 }, { "epoch": 0.64, "grad_norm": 2.016517686993467, "learning_rate": 6.053681672411471e-06, "loss": 0.6896, "step": 3898 }, { "epoch": 0.64, "grad_norm": 2.2983428556699197, "learning_rate": 6.048793799484831e-06, "loss": 0.7442, "step": 3899 }, { "epoch": 0.64, "grad_norm": 2.3034941527244537, "learning_rate": 6.0439070449387924e-06, "loss": 0.7225, "step": 3900 }, { "epoch": 0.64, "grad_norm": 2.6725485337755743, "learning_rate": 6.039021410156542e-06, "loss": 0.7833, "step": 3901 }, { "epoch": 0.64, "grad_norm": 3.676940015513544, "learning_rate": 6.03413689652095e-06, "loss": 0.7374, "step": 3902 }, { "epoch": 0.64, "grad_norm": 2.81167894619309, "learning_rate": 6.029253505414565e-06, "loss": 0.7099, "step": 3903 }, { "epoch": 0.64, "grad_norm": 2.4811387222585832, "learning_rate": 6.024371238219622e-06, "loss": 0.7985, "step": 3904 }, { "epoch": 0.64, "grad_norm": 2.0989936783402876, "learning_rate": 6.019490096318036e-06, "loss": 0.7143, "step": 3905 }, { "epoch": 0.64, "grad_norm": 1.8962092617159472, "learning_rate": 6.014610081091403e-06, "loss": 0.809, "step": 3906 }, { "epoch": 0.64, "grad_norm": 2.1801842991348104, "learning_rate": 6.009731193921002e-06, "loss": 0.7866, "step": 3907 }, { "epoch": 0.64, "grad_norm": 1.9497899710423585, "learning_rate": 6.004853436187794e-06, "loss": 0.7751, "step": 3908 }, { "epoch": 0.64, "grad_norm": 2.5742515187888166, "learning_rate": 5.9999768092724145e-06, "loss": 0.6723, "step": 3909 }, { "epoch": 0.64, "grad_norm": 1.8898718227490032, "learning_rate": 5.995101314555181e-06, "loss": 0.712, "step": 3910 }, { "epoch": 0.64, "grad_norm": 2.8905278681811954, "learning_rate": 5.990226953416099e-06, "loss": 0.8254, "step": 3911 }, { "epoch": 0.64, "grad_norm": 1.9282696342541346, "learning_rate": 5.98535372723484e-06, "loss": 0.7939, "step": 3912 }, { "epoch": 0.64, "grad_norm": 1.7531397072165162, "learning_rate": 5.9804816373907625e-06, "loss": 0.7638, "step": 3913 }, { "epoch": 0.64, "grad_norm": 2.4149022318652813, "learning_rate": 5.975610685262902e-06, "loss": 0.7678, "step": 3914 }, { "epoch": 0.64, "grad_norm": 2.0645773221408237, "learning_rate": 5.970740872229974e-06, "loss": 0.7928, "step": 3915 }, { "epoch": 0.64, "grad_norm": 2.0391938080299337, "learning_rate": 5.965872199670362e-06, "loss": 0.7809, "step": 3916 }, { "epoch": 0.64, "grad_norm": 2.1866611427226843, "learning_rate": 5.961004668962136e-06, "loss": 0.8358, "step": 3917 }, { "epoch": 0.64, "grad_norm": 2.215212153873034, "learning_rate": 5.956138281483039e-06, "loss": 0.7923, "step": 3918 }, { "epoch": 0.64, "grad_norm": 2.2321828237982375, "learning_rate": 5.951273038610496e-06, "loss": 0.76, "step": 3919 }, { "epoch": 0.64, "grad_norm": 0.6821930462913366, "learning_rate": 5.946408941721602e-06, "loss": 0.37, "step": 3920 }, { "epoch": 0.64, "grad_norm": 0.635979421111533, "learning_rate": 5.941545992193129e-06, "loss": 0.3811, "step": 3921 }, { "epoch": 0.64, "grad_norm": 1.991842333563402, "learning_rate": 5.936684191401525e-06, "loss": 0.7702, "step": 3922 }, { "epoch": 0.64, "grad_norm": 1.6285573150432477, "learning_rate": 5.931823540722912e-06, "loss": 0.7665, "step": 3923 }, { "epoch": 0.64, "grad_norm": 1.8699169931910524, "learning_rate": 5.9269640415330875e-06, "loss": 0.768, "step": 3924 }, { "epoch": 0.64, "grad_norm": 2.0149135476002225, "learning_rate": 5.922105695207521e-06, "loss": 0.8199, "step": 3925 }, { "epoch": 0.64, "grad_norm": 1.9052389739876368, "learning_rate": 5.917248503121359e-06, "loss": 0.7483, "step": 3926 }, { "epoch": 0.64, "grad_norm": 1.9428158334748276, "learning_rate": 5.912392466649419e-06, "loss": 0.8387, "step": 3927 }, { "epoch": 0.65, "grad_norm": 2.115311407583, "learning_rate": 5.907537587166191e-06, "loss": 0.8229, "step": 3928 }, { "epoch": 0.65, "grad_norm": 2.483537866776351, "learning_rate": 5.90268386604584e-06, "loss": 0.7771, "step": 3929 }, { "epoch": 0.65, "grad_norm": 2.1045171432695646, "learning_rate": 5.897831304662201e-06, "loss": 0.7191, "step": 3930 }, { "epoch": 0.65, "grad_norm": 1.6833661867149812, "learning_rate": 5.892979904388781e-06, "loss": 0.7698, "step": 3931 }, { "epoch": 0.65, "grad_norm": 3.377613774725867, "learning_rate": 5.888129666598756e-06, "loss": 0.7543, "step": 3932 }, { "epoch": 0.65, "grad_norm": 1.736252459149677, "learning_rate": 5.883280592664979e-06, "loss": 0.7567, "step": 3933 }, { "epoch": 0.65, "grad_norm": 2.102920527236537, "learning_rate": 5.878432683959972e-06, "loss": 0.8588, "step": 3934 }, { "epoch": 0.65, "grad_norm": 2.1050432024852994, "learning_rate": 5.8735859418559206e-06, "loss": 0.7637, "step": 3935 }, { "epoch": 0.65, "grad_norm": 2.3148073717564777, "learning_rate": 5.868740367724692e-06, "loss": 0.7779, "step": 3936 }, { "epoch": 0.65, "grad_norm": 2.807128879508809, "learning_rate": 5.863895962937806e-06, "loss": 0.7503, "step": 3937 }, { "epoch": 0.65, "grad_norm": 1.6412438473422946, "learning_rate": 5.859052728866468e-06, "loss": 0.7566, "step": 3938 }, { "epoch": 0.65, "grad_norm": 2.1491387067389534, "learning_rate": 5.854210666881544e-06, "loss": 0.7477, "step": 3939 }, { "epoch": 0.65, "grad_norm": 1.9448612391134101, "learning_rate": 5.8493697783535665e-06, "loss": 0.7839, "step": 3940 }, { "epoch": 0.65, "grad_norm": 2.4819664876657628, "learning_rate": 5.844530064652742e-06, "loss": 0.7871, "step": 3941 }, { "epoch": 0.65, "grad_norm": 2.247181226574284, "learning_rate": 5.839691527148938e-06, "loss": 0.791, "step": 3942 }, { "epoch": 0.65, "grad_norm": 1.8798269096144682, "learning_rate": 5.834854167211699e-06, "loss": 0.7586, "step": 3943 }, { "epoch": 0.65, "grad_norm": 2.0600249185754143, "learning_rate": 5.8300179862102225e-06, "loss": 0.7286, "step": 3944 }, { "epoch": 0.65, "grad_norm": 1.699946278141287, "learning_rate": 5.825182985513383e-06, "loss": 0.7675, "step": 3945 }, { "epoch": 0.65, "grad_norm": 2.0320817055992513, "learning_rate": 5.820349166489716e-06, "loss": 0.8356, "step": 3946 }, { "epoch": 0.65, "grad_norm": 0.6339212114368681, "learning_rate": 5.8155165305074245e-06, "loss": 0.3494, "step": 3947 }, { "epoch": 0.65, "grad_norm": 2.0940686054576787, "learning_rate": 5.810685078934375e-06, "loss": 0.7692, "step": 3948 }, { "epoch": 0.65, "grad_norm": 2.185343983551419, "learning_rate": 5.805854813138098e-06, "loss": 0.7989, "step": 3949 }, { "epoch": 0.65, "grad_norm": 2.4132982823439466, "learning_rate": 5.801025734485794e-06, "loss": 0.7751, "step": 3950 }, { "epoch": 0.65, "grad_norm": 0.5883001963566382, "learning_rate": 5.796197844344325e-06, "loss": 0.3533, "step": 3951 }, { "epoch": 0.65, "grad_norm": 1.8678918111627567, "learning_rate": 5.791371144080209e-06, "loss": 0.7852, "step": 3952 }, { "epoch": 0.65, "grad_norm": 2.3362408905151897, "learning_rate": 5.78654563505964e-06, "loss": 0.7407, "step": 3953 }, { "epoch": 0.65, "grad_norm": 1.7525776321041289, "learning_rate": 5.781721318648461e-06, "loss": 0.8248, "step": 3954 }, { "epoch": 0.65, "grad_norm": 2.089286070407108, "learning_rate": 5.7768981962121906e-06, "loss": 0.7936, "step": 3955 }, { "epoch": 0.65, "grad_norm": 2.0928230033554573, "learning_rate": 5.772076269116001e-06, "loss": 0.8125, "step": 3956 }, { "epoch": 0.65, "grad_norm": 1.7965703976554268, "learning_rate": 5.7672555387247274e-06, "loss": 0.7931, "step": 3957 }, { "epoch": 0.65, "grad_norm": 2.271741097224809, "learning_rate": 5.762436006402874e-06, "loss": 0.7565, "step": 3958 }, { "epoch": 0.65, "grad_norm": 2.628247588812598, "learning_rate": 5.757617673514588e-06, "loss": 0.7743, "step": 3959 }, { "epoch": 0.65, "grad_norm": 1.7425098435257114, "learning_rate": 5.752800541423696e-06, "loss": 0.7459, "step": 3960 }, { "epoch": 0.65, "grad_norm": 1.5718942373036875, "learning_rate": 5.747984611493675e-06, "loss": 0.7579, "step": 3961 }, { "epoch": 0.65, "grad_norm": 1.9326660898249668, "learning_rate": 5.743169885087665e-06, "loss": 0.8185, "step": 3962 }, { "epoch": 0.65, "grad_norm": 1.6716801890739303, "learning_rate": 5.738356363568463e-06, "loss": 0.797, "step": 3963 }, { "epoch": 0.65, "grad_norm": 2.7150824793879864, "learning_rate": 5.733544048298526e-06, "loss": 0.6877, "step": 3964 }, { "epoch": 0.65, "grad_norm": 0.6403581632135402, "learning_rate": 5.728732940639972e-06, "loss": 0.3546, "step": 3965 }, { "epoch": 0.65, "grad_norm": 1.688235598671194, "learning_rate": 5.723923041954571e-06, "loss": 0.7785, "step": 3966 }, { "epoch": 0.65, "grad_norm": 2.865152636303603, "learning_rate": 5.719114353603757e-06, "loss": 0.7741, "step": 3967 }, { "epoch": 0.65, "grad_norm": 2.006878437198805, "learning_rate": 5.714306876948621e-06, "loss": 0.7915, "step": 3968 }, { "epoch": 0.65, "grad_norm": 1.7822851407790057, "learning_rate": 5.709500613349906e-06, "loss": 0.7223, "step": 3969 }, { "epoch": 0.65, "grad_norm": 2.0173287256426753, "learning_rate": 5.704695564168014e-06, "loss": 0.7486, "step": 3970 }, { "epoch": 0.65, "grad_norm": 2.251536070140807, "learning_rate": 5.6998917307630095e-06, "loss": 0.8126, "step": 3971 }, { "epoch": 0.65, "grad_norm": 2.515571869274979, "learning_rate": 5.695089114494599e-06, "loss": 0.8373, "step": 3972 }, { "epoch": 0.65, "grad_norm": 1.9112691049556727, "learning_rate": 5.69028771672216e-06, "loss": 0.8302, "step": 3973 }, { "epoch": 0.65, "grad_norm": 2.5844743447505993, "learning_rate": 5.685487538804718e-06, "loss": 0.705, "step": 3974 }, { "epoch": 0.65, "grad_norm": 1.6591020879918805, "learning_rate": 5.68068858210095e-06, "loss": 0.8258, "step": 3975 }, { "epoch": 0.65, "grad_norm": 2.1608647108301353, "learning_rate": 5.675890847969193e-06, "loss": 0.7207, "step": 3976 }, { "epoch": 0.65, "grad_norm": 1.7765754535675353, "learning_rate": 5.671094337767433e-06, "loss": 0.8152, "step": 3977 }, { "epoch": 0.65, "grad_norm": 1.6914286230396467, "learning_rate": 5.666299052853314e-06, "loss": 0.7946, "step": 3978 }, { "epoch": 0.65, "grad_norm": 1.540856335407112, "learning_rate": 5.661504994584133e-06, "loss": 0.7133, "step": 3979 }, { "epoch": 0.65, "grad_norm": 2.8597561213872003, "learning_rate": 5.656712164316838e-06, "loss": 0.7097, "step": 3980 }, { "epoch": 0.65, "grad_norm": 2.0144092855212206, "learning_rate": 5.651920563408022e-06, "loss": 0.7731, "step": 3981 }, { "epoch": 0.65, "grad_norm": 1.9371374854289973, "learning_rate": 5.647130193213945e-06, "loss": 0.7982, "step": 3982 }, { "epoch": 0.65, "grad_norm": 1.8784307659554131, "learning_rate": 5.642341055090508e-06, "loss": 0.8068, "step": 3983 }, { "epoch": 0.65, "grad_norm": 1.8002798646127305, "learning_rate": 5.637553150393268e-06, "loss": 0.7582, "step": 3984 }, { "epoch": 0.65, "grad_norm": 2.4264453639525434, "learning_rate": 5.632766480477432e-06, "loss": 0.7404, "step": 3985 }, { "epoch": 0.65, "grad_norm": 1.8779931666569387, "learning_rate": 5.6279810466978546e-06, "loss": 0.7968, "step": 3986 }, { "epoch": 0.65, "grad_norm": 3.673189009218264, "learning_rate": 5.623196850409044e-06, "loss": 0.8459, "step": 3987 }, { "epoch": 0.66, "grad_norm": 1.8934230044443927, "learning_rate": 5.618413892965158e-06, "loss": 0.7894, "step": 3988 }, { "epoch": 0.66, "grad_norm": 1.8090553491623371, "learning_rate": 5.613632175720001e-06, "loss": 0.8497, "step": 3989 }, { "epoch": 0.66, "grad_norm": 1.8681448486304744, "learning_rate": 5.6088517000270275e-06, "loss": 0.8435, "step": 3990 }, { "epoch": 0.66, "grad_norm": 10.136534710611825, "learning_rate": 5.604072467239343e-06, "loss": 0.7484, "step": 3991 }, { "epoch": 0.66, "grad_norm": 1.8975532360774021, "learning_rate": 5.599294478709698e-06, "loss": 0.8308, "step": 3992 }, { "epoch": 0.66, "grad_norm": 2.1444109834511345, "learning_rate": 5.5945177357904935e-06, "loss": 0.8538, "step": 3993 }, { "epoch": 0.66, "grad_norm": 1.5953287320080556, "learning_rate": 5.589742239833776e-06, "loss": 0.8, "step": 3994 }, { "epoch": 0.66, "grad_norm": 1.8326251334824621, "learning_rate": 5.584967992191234e-06, "loss": 0.7723, "step": 3995 }, { "epoch": 0.66, "grad_norm": 2.3163294639601957, "learning_rate": 5.580194994214216e-06, "loss": 0.711, "step": 3996 }, { "epoch": 0.66, "grad_norm": 1.6515671010227275, "learning_rate": 5.5754232472537086e-06, "loss": 0.8082, "step": 3997 }, { "epoch": 0.66, "grad_norm": 2.192825820715093, "learning_rate": 5.570652752660343e-06, "loss": 0.7793, "step": 3998 }, { "epoch": 0.66, "grad_norm": 1.8332789880879579, "learning_rate": 5.565883511784396e-06, "loss": 0.762, "step": 3999 }, { "epoch": 0.66, "grad_norm": 2.1928431310349015, "learning_rate": 5.561115525975793e-06, "loss": 0.7107, "step": 4000 }, { "epoch": 0.66, "grad_norm": 1.7303999350658084, "learning_rate": 5.5563487965841055e-06, "loss": 0.797, "step": 4001 }, { "epoch": 0.66, "grad_norm": 0.5902409961292221, "learning_rate": 5.5515833249585385e-06, "loss": 0.3765, "step": 4002 }, { "epoch": 0.66, "grad_norm": 1.9728586545456317, "learning_rate": 5.546819112447952e-06, "loss": 0.7891, "step": 4003 }, { "epoch": 0.66, "grad_norm": 2.155828254702798, "learning_rate": 5.542056160400848e-06, "loss": 0.7805, "step": 4004 }, { "epoch": 0.66, "grad_norm": 2.095819183431656, "learning_rate": 5.537294470165369e-06, "loss": 0.757, "step": 4005 }, { "epoch": 0.66, "grad_norm": 2.445313092617787, "learning_rate": 5.532534043089302e-06, "loss": 0.7833, "step": 4006 }, { "epoch": 0.66, "grad_norm": 2.4760340046299247, "learning_rate": 5.527774880520073e-06, "loss": 0.7463, "step": 4007 }, { "epoch": 0.66, "grad_norm": 1.7398048757320435, "learning_rate": 5.523016983804759e-06, "loss": 0.8087, "step": 4008 }, { "epoch": 0.66, "grad_norm": 1.484918047238164, "learning_rate": 5.518260354290066e-06, "loss": 0.73, "step": 4009 }, { "epoch": 0.66, "grad_norm": 2.1222914476739483, "learning_rate": 5.513504993322352e-06, "loss": 0.7704, "step": 4010 }, { "epoch": 0.66, "grad_norm": 2.5204468016167803, "learning_rate": 5.508750902247612e-06, "loss": 0.8297, "step": 4011 }, { "epoch": 0.66, "grad_norm": 2.3766563925709896, "learning_rate": 5.503998082411479e-06, "loss": 0.6897, "step": 4012 }, { "epoch": 0.66, "grad_norm": 1.816161033093716, "learning_rate": 5.499246535159231e-06, "loss": 0.8165, "step": 4013 }, { "epoch": 0.66, "grad_norm": 1.6629022648712595, "learning_rate": 5.494496261835781e-06, "loss": 0.8666, "step": 4014 }, { "epoch": 0.66, "grad_norm": 2.3456312759504296, "learning_rate": 5.489747263785687e-06, "loss": 0.7816, "step": 4015 }, { "epoch": 0.66, "grad_norm": 2.3953853058978183, "learning_rate": 5.48499954235314e-06, "loss": 0.8051, "step": 4016 }, { "epoch": 0.66, "grad_norm": 2.320209562659045, "learning_rate": 5.480253098881974e-06, "loss": 0.6762, "step": 4017 }, { "epoch": 0.66, "grad_norm": 1.992756291656641, "learning_rate": 5.47550793471566e-06, "loss": 0.7513, "step": 4018 }, { "epoch": 0.66, "grad_norm": 1.8999795808140572, "learning_rate": 5.470764051197302e-06, "loss": 0.7835, "step": 4019 }, { "epoch": 0.66, "grad_norm": 0.6269661541246372, "learning_rate": 5.466021449669655e-06, "loss": 0.3898, "step": 4020 }, { "epoch": 0.66, "grad_norm": 3.6500518462344744, "learning_rate": 5.461280131475099e-06, "loss": 0.8067, "step": 4021 }, { "epoch": 0.66, "grad_norm": 2.1503443304380565, "learning_rate": 5.456540097955652e-06, "loss": 0.7894, "step": 4022 }, { "epoch": 0.66, "grad_norm": 1.6858960426538714, "learning_rate": 5.451801350452975e-06, "loss": 0.7761, "step": 4023 }, { "epoch": 0.66, "grad_norm": 1.995486083599484, "learning_rate": 5.447063890308354e-06, "loss": 0.7838, "step": 4024 }, { "epoch": 0.66, "grad_norm": 0.5871129431362981, "learning_rate": 5.442327718862721e-06, "loss": 0.3706, "step": 4025 }, { "epoch": 0.66, "grad_norm": 0.6090526022182405, "learning_rate": 5.4375928374566376e-06, "loss": 0.3872, "step": 4026 }, { "epoch": 0.66, "grad_norm": 1.8255518685622998, "learning_rate": 5.432859247430303e-06, "loss": 0.7895, "step": 4027 }, { "epoch": 0.66, "grad_norm": 2.131468315022179, "learning_rate": 5.428126950123551e-06, "loss": 0.7949, "step": 4028 }, { "epoch": 0.66, "grad_norm": 1.9122645517681371, "learning_rate": 5.423395946875846e-06, "loss": 0.8847, "step": 4029 }, { "epoch": 0.66, "grad_norm": 2.141398862022088, "learning_rate": 5.418666239026291e-06, "loss": 0.6845, "step": 4030 }, { "epoch": 0.66, "grad_norm": 2.9322887887875724, "learning_rate": 5.413937827913619e-06, "loss": 0.8447, "step": 4031 }, { "epoch": 0.66, "grad_norm": 2.979150695800199, "learning_rate": 5.409210714876197e-06, "loss": 0.7649, "step": 4032 }, { "epoch": 0.66, "grad_norm": 2.053178314756457, "learning_rate": 5.404484901252023e-06, "loss": 0.7433, "step": 4033 }, { "epoch": 0.66, "grad_norm": 2.7390626780501917, "learning_rate": 5.399760388378729e-06, "loss": 0.7938, "step": 4034 }, { "epoch": 0.66, "grad_norm": 0.5784443943693932, "learning_rate": 5.395037177593579e-06, "loss": 0.3478, "step": 4035 }, { "epoch": 0.66, "grad_norm": 2.873589295022844, "learning_rate": 5.390315270233469e-06, "loss": 0.7315, "step": 4036 }, { "epoch": 0.66, "grad_norm": 1.9047766488014088, "learning_rate": 5.385594667634923e-06, "loss": 0.7896, "step": 4037 }, { "epoch": 0.66, "grad_norm": 2.003883556602434, "learning_rate": 5.3808753711341e-06, "loss": 0.7623, "step": 4038 }, { "epoch": 0.66, "grad_norm": 2.2486978058406897, "learning_rate": 5.376157382066784e-06, "loss": 0.8162, "step": 4039 }, { "epoch": 0.66, "grad_norm": 2.604952069728139, "learning_rate": 5.371440701768394e-06, "loss": 0.7891, "step": 4040 }, { "epoch": 0.66, "grad_norm": 1.929794023942342, "learning_rate": 5.366725331573974e-06, "loss": 0.7388, "step": 4041 }, { "epoch": 0.66, "grad_norm": 2.1654175037644494, "learning_rate": 5.3620112728182e-06, "loss": 0.6512, "step": 4042 }, { "epoch": 0.66, "grad_norm": 1.6079485914014873, "learning_rate": 5.357298526835381e-06, "loss": 0.723, "step": 4043 }, { "epoch": 0.66, "grad_norm": 0.6016156203732144, "learning_rate": 5.35258709495945e-06, "loss": 0.3452, "step": 4044 }, { "epoch": 0.66, "grad_norm": 2.1329115994975183, "learning_rate": 5.34787697852396e-06, "loss": 0.7375, "step": 4045 }, { "epoch": 0.66, "grad_norm": 1.7961236254359079, "learning_rate": 5.343168178862104e-06, "loss": 0.6944, "step": 4046 }, { "epoch": 0.66, "grad_norm": 2.118891344822424, "learning_rate": 5.338460697306699e-06, "loss": 0.7322, "step": 4047 }, { "epoch": 0.66, "grad_norm": 1.8965422050758443, "learning_rate": 5.333754535190186e-06, "loss": 0.8357, "step": 4048 }, { "epoch": 0.67, "grad_norm": 1.9816923733482736, "learning_rate": 5.329049693844635e-06, "loss": 0.7942, "step": 4049 }, { "epoch": 0.67, "grad_norm": 2.037012214221762, "learning_rate": 5.324346174601741e-06, "loss": 0.8194, "step": 4050 }, { "epoch": 0.67, "grad_norm": 1.9622127522258395, "learning_rate": 5.319643978792825e-06, "loss": 0.8193, "step": 4051 }, { "epoch": 0.67, "grad_norm": 2.263328640973967, "learning_rate": 5.314943107748836e-06, "loss": 0.6998, "step": 4052 }, { "epoch": 0.67, "grad_norm": 1.5078364743857366, "learning_rate": 5.3102435628003435e-06, "loss": 0.7283, "step": 4053 }, { "epoch": 0.67, "grad_norm": 1.8598718059236092, "learning_rate": 5.305545345277543e-06, "loss": 0.7386, "step": 4054 }, { "epoch": 0.67, "grad_norm": 1.78810850789584, "learning_rate": 5.300848456510257e-06, "loss": 0.7125, "step": 4055 }, { "epoch": 0.67, "grad_norm": 1.708927124938428, "learning_rate": 5.296152897827929e-06, "loss": 0.7257, "step": 4056 }, { "epoch": 0.67, "grad_norm": 1.7957154204850285, "learning_rate": 5.291458670559628e-06, "loss": 0.6887, "step": 4057 }, { "epoch": 0.67, "grad_norm": 2.727254504341152, "learning_rate": 5.286765776034044e-06, "loss": 0.7585, "step": 4058 }, { "epoch": 0.67, "grad_norm": 1.9890844634786815, "learning_rate": 5.282074215579492e-06, "loss": 0.8092, "step": 4059 }, { "epoch": 0.67, "grad_norm": 1.9413648114940092, "learning_rate": 5.277383990523905e-06, "loss": 0.749, "step": 4060 }, { "epoch": 0.67, "grad_norm": 3.3077685472128064, "learning_rate": 5.272695102194846e-06, "loss": 0.7594, "step": 4061 }, { "epoch": 0.67, "grad_norm": 1.5476199090490697, "learning_rate": 5.2680075519194926e-06, "loss": 0.8008, "step": 4062 }, { "epoch": 0.67, "grad_norm": 1.884384910536493, "learning_rate": 5.263321341024646e-06, "loss": 0.7519, "step": 4063 }, { "epoch": 0.67, "grad_norm": 1.8895682218744834, "learning_rate": 5.25863647083673e-06, "loss": 0.7905, "step": 4064 }, { "epoch": 0.67, "grad_norm": 1.984098287467309, "learning_rate": 5.253952942681782e-06, "loss": 0.7925, "step": 4065 }, { "epoch": 0.67, "grad_norm": 1.553365444351596, "learning_rate": 5.249270757885475e-06, "loss": 0.8241, "step": 4066 }, { "epoch": 0.67, "grad_norm": 0.6164592151464872, "learning_rate": 5.244589917773082e-06, "loss": 0.3783, "step": 4067 }, { "epoch": 0.67, "grad_norm": 2.271727986204832, "learning_rate": 5.239910423669509e-06, "loss": 0.7871, "step": 4068 }, { "epoch": 0.67, "grad_norm": 1.737336447293442, "learning_rate": 5.2352322768992755e-06, "loss": 0.8205, "step": 4069 }, { "epoch": 0.67, "grad_norm": 2.0049117360651767, "learning_rate": 5.230555478786522e-06, "loss": 0.7583, "step": 4070 }, { "epoch": 0.67, "grad_norm": 2.3030588488294508, "learning_rate": 5.225880030655006e-06, "loss": 0.7448, "step": 4071 }, { "epoch": 0.67, "grad_norm": 2.326850386796775, "learning_rate": 5.221205933828104e-06, "loss": 0.7923, "step": 4072 }, { "epoch": 0.67, "grad_norm": 1.7672361856502234, "learning_rate": 5.216533189628808e-06, "loss": 0.766, "step": 4073 }, { "epoch": 0.67, "grad_norm": 1.8365325026079586, "learning_rate": 5.211861799379731e-06, "loss": 0.8022, "step": 4074 }, { "epoch": 0.67, "grad_norm": 2.072016216827152, "learning_rate": 5.207191764403097e-06, "loss": 0.7597, "step": 4075 }, { "epoch": 0.67, "grad_norm": 2.135673338523845, "learning_rate": 5.20252308602075e-06, "loss": 0.7115, "step": 4076 }, { "epoch": 0.67, "grad_norm": 5.8931841402881515, "learning_rate": 5.197855765554152e-06, "loss": 0.7184, "step": 4077 }, { "epoch": 0.67, "grad_norm": 2.1394548575596226, "learning_rate": 5.193189804324376e-06, "loss": 0.7842, "step": 4078 }, { "epoch": 0.67, "grad_norm": 1.852192516847454, "learning_rate": 5.1885252036521125e-06, "loss": 0.7853, "step": 4079 }, { "epoch": 0.67, "grad_norm": 3.1051374283758504, "learning_rate": 5.183861964857669e-06, "loss": 0.7774, "step": 4080 }, { "epoch": 0.67, "grad_norm": 2.1340368416359987, "learning_rate": 5.179200089260964e-06, "loss": 0.8068, "step": 4081 }, { "epoch": 0.67, "grad_norm": 3.382213770738176, "learning_rate": 5.174539578181531e-06, "loss": 0.7414, "step": 4082 }, { "epoch": 0.67, "grad_norm": 1.7902958456960787, "learning_rate": 5.169880432938519e-06, "loss": 0.7629, "step": 4083 }, { "epoch": 0.67, "grad_norm": 2.0588394201007234, "learning_rate": 5.165222654850688e-06, "loss": 0.7469, "step": 4084 }, { "epoch": 0.67, "grad_norm": 1.8051252289882078, "learning_rate": 5.160566245236413e-06, "loss": 0.7592, "step": 4085 }, { "epoch": 0.67, "grad_norm": 1.9248095272153911, "learning_rate": 5.155911205413683e-06, "loss": 0.6935, "step": 4086 }, { "epoch": 0.67, "grad_norm": 2.2594720094888205, "learning_rate": 5.151257536700094e-06, "loss": 0.7488, "step": 4087 }, { "epoch": 0.67, "grad_norm": 2.414680482817305, "learning_rate": 5.146605240412859e-06, "loss": 0.8458, "step": 4088 }, { "epoch": 0.67, "grad_norm": 2.2588710075313285, "learning_rate": 5.141954317868798e-06, "loss": 0.7821, "step": 4089 }, { "epoch": 0.67, "grad_norm": 3.039545080928651, "learning_rate": 5.137304770384348e-06, "loss": 0.73, "step": 4090 }, { "epoch": 0.67, "grad_norm": 2.062201927903069, "learning_rate": 5.132656599275554e-06, "loss": 0.7857, "step": 4091 }, { "epoch": 0.67, "grad_norm": 2.2360992331438947, "learning_rate": 5.128009805858067e-06, "loss": 0.741, "step": 4092 }, { "epoch": 0.67, "grad_norm": 2.005861193763698, "learning_rate": 5.123364391447156e-06, "loss": 0.7758, "step": 4093 }, { "epoch": 0.67, "grad_norm": 1.7669145332688, "learning_rate": 5.118720357357696e-06, "loss": 0.7396, "step": 4094 }, { "epoch": 0.67, "grad_norm": 0.6159836996555562, "learning_rate": 5.114077704904168e-06, "loss": 0.3618, "step": 4095 }, { "epoch": 0.67, "grad_norm": 1.5158090037256313, "learning_rate": 5.109436435400667e-06, "loss": 0.782, "step": 4096 }, { "epoch": 0.67, "grad_norm": 1.7541952859059728, "learning_rate": 5.104796550160893e-06, "loss": 0.7976, "step": 4097 }, { "epoch": 0.67, "grad_norm": 2.1763034929713787, "learning_rate": 5.100158050498159e-06, "loss": 0.7313, "step": 4098 }, { "epoch": 0.67, "grad_norm": 0.6437696072145273, "learning_rate": 5.095520937725378e-06, "loss": 0.3736, "step": 4099 }, { "epoch": 0.67, "grad_norm": 2.298162961571771, "learning_rate": 5.090885213155079e-06, "loss": 0.7738, "step": 4100 }, { "epoch": 0.67, "grad_norm": 1.9634840176660255, "learning_rate": 5.0862508780993915e-06, "loss": 0.8067, "step": 4101 }, { "epoch": 0.67, "grad_norm": 2.1435620869531955, "learning_rate": 5.081617933870056e-06, "loss": 0.7084, "step": 4102 }, { "epoch": 0.67, "grad_norm": 2.48146240115696, "learning_rate": 5.076986381778417e-06, "loss": 0.7582, "step": 4103 }, { "epoch": 0.67, "grad_norm": 1.5298576741630117, "learning_rate": 5.072356223135425e-06, "loss": 0.7729, "step": 4104 }, { "epoch": 0.67, "grad_norm": 1.7982175876866324, "learning_rate": 5.067727459251638e-06, "loss": 0.7977, "step": 4105 }, { "epoch": 0.67, "grad_norm": 1.849654906975234, "learning_rate": 5.063100091437217e-06, "loss": 0.7027, "step": 4106 }, { "epoch": 0.67, "grad_norm": 1.9087976225616838, "learning_rate": 5.058474121001928e-06, "loss": 0.7474, "step": 4107 }, { "epoch": 0.67, "grad_norm": 2.5930844201144962, "learning_rate": 5.053849549255143e-06, "loss": 0.7819, "step": 4108 }, { "epoch": 0.67, "grad_norm": 2.8039978349802412, "learning_rate": 5.049226377505838e-06, "loss": 0.8226, "step": 4109 }, { "epoch": 0.68, "grad_norm": 1.6762157495943903, "learning_rate": 5.044604607062591e-06, "loss": 0.7982, "step": 4110 }, { "epoch": 0.68, "grad_norm": 1.9007538200127538, "learning_rate": 5.0399842392335856e-06, "loss": 0.7805, "step": 4111 }, { "epoch": 0.68, "grad_norm": 2.568287157105627, "learning_rate": 5.0353652753266045e-06, "loss": 0.7101, "step": 4112 }, { "epoch": 0.68, "grad_norm": 2.150715632580297, "learning_rate": 5.03074771664904e-06, "loss": 0.79, "step": 4113 }, { "epoch": 0.68, "grad_norm": 1.9948643791204024, "learning_rate": 5.026131564507878e-06, "loss": 0.8152, "step": 4114 }, { "epoch": 0.68, "grad_norm": 1.784537632596585, "learning_rate": 5.021516820209713e-06, "loss": 0.8199, "step": 4115 }, { "epoch": 0.68, "grad_norm": 2.73553274380513, "learning_rate": 5.016903485060738e-06, "loss": 0.6084, "step": 4116 }, { "epoch": 0.68, "grad_norm": 2.6699085579071724, "learning_rate": 5.0122915603667485e-06, "loss": 0.8195, "step": 4117 }, { "epoch": 0.68, "grad_norm": 1.8390134453557745, "learning_rate": 5.0076810474331395e-06, "loss": 0.7887, "step": 4118 }, { "epoch": 0.68, "grad_norm": 1.5893129816796796, "learning_rate": 5.003071947564908e-06, "loss": 0.7864, "step": 4119 }, { "epoch": 0.68, "grad_norm": 1.790985621152057, "learning_rate": 4.998464262066648e-06, "loss": 0.7283, "step": 4120 }, { "epoch": 0.68, "grad_norm": 2.748899217391012, "learning_rate": 4.993857992242557e-06, "loss": 0.6767, "step": 4121 }, { "epoch": 0.68, "grad_norm": 2.0455650262674454, "learning_rate": 4.9892531393964285e-06, "loss": 0.7497, "step": 4122 }, { "epoch": 0.68, "grad_norm": 2.1199920544704285, "learning_rate": 4.984649704831658e-06, "loss": 0.6665, "step": 4123 }, { "epoch": 0.68, "grad_norm": 2.4240061454682835, "learning_rate": 4.980047689851236e-06, "loss": 0.7453, "step": 4124 }, { "epoch": 0.68, "grad_norm": 0.6271129807044389, "learning_rate": 4.975447095757755e-06, "loss": 0.3795, "step": 4125 }, { "epoch": 0.68, "grad_norm": 2.8477071835056376, "learning_rate": 4.970847923853404e-06, "loss": 0.7859, "step": 4126 }, { "epoch": 0.68, "grad_norm": 2.1570562146706167, "learning_rate": 4.966250175439966e-06, "loss": 0.7818, "step": 4127 }, { "epoch": 0.68, "grad_norm": 1.52240053286142, "learning_rate": 4.961653851818827e-06, "loss": 0.7892, "step": 4128 }, { "epoch": 0.68, "grad_norm": 1.8294701892198204, "learning_rate": 4.957058954290964e-06, "loss": 0.8211, "step": 4129 }, { "epoch": 0.68, "grad_norm": 1.8059803155163505, "learning_rate": 4.952465484156956e-06, "loss": 0.8125, "step": 4130 }, { "epoch": 0.68, "grad_norm": 2.339573641457508, "learning_rate": 4.947873442716972e-06, "loss": 0.7969, "step": 4131 }, { "epoch": 0.68, "grad_norm": 2.343043595574645, "learning_rate": 4.9432828312707836e-06, "loss": 0.7364, "step": 4132 }, { "epoch": 0.68, "grad_norm": 2.510321840769168, "learning_rate": 4.938693651117751e-06, "loss": 0.8136, "step": 4133 }, { "epoch": 0.68, "grad_norm": 1.808277993804594, "learning_rate": 4.934105903556831e-06, "loss": 0.7938, "step": 4134 }, { "epoch": 0.68, "grad_norm": 1.878636186371232, "learning_rate": 4.929519589886578e-06, "loss": 0.7631, "step": 4135 }, { "epoch": 0.68, "grad_norm": 1.7923695118863447, "learning_rate": 4.924934711405138e-06, "loss": 0.7923, "step": 4136 }, { "epoch": 0.68, "grad_norm": 2.4268408294833264, "learning_rate": 4.920351269410251e-06, "loss": 0.7463, "step": 4137 }, { "epoch": 0.68, "grad_norm": 2.686350358632079, "learning_rate": 4.9157692651992495e-06, "loss": 0.8087, "step": 4138 }, { "epoch": 0.68, "grad_norm": 1.795770759115503, "learning_rate": 4.911188700069062e-06, "loss": 0.7217, "step": 4139 }, { "epoch": 0.68, "grad_norm": 2.7011525144728683, "learning_rate": 4.906609575316207e-06, "loss": 0.7812, "step": 4140 }, { "epoch": 0.68, "grad_norm": 3.3753776670949183, "learning_rate": 4.9020318922367956e-06, "loss": 0.7633, "step": 4141 }, { "epoch": 0.68, "grad_norm": 1.74872167240039, "learning_rate": 4.897455652126533e-06, "loss": 0.7571, "step": 4142 }, { "epoch": 0.68, "grad_norm": 1.6882655665796802, "learning_rate": 4.892880856280713e-06, "loss": 0.723, "step": 4143 }, { "epoch": 0.68, "grad_norm": 1.8706763830713975, "learning_rate": 4.888307505994222e-06, "loss": 0.8014, "step": 4144 }, { "epoch": 0.68, "grad_norm": 2.1025262992343077, "learning_rate": 4.883735602561537e-06, "loss": 0.8338, "step": 4145 }, { "epoch": 0.68, "grad_norm": 1.7127520993049359, "learning_rate": 4.879165147276726e-06, "loss": 0.6779, "step": 4146 }, { "epoch": 0.68, "grad_norm": 2.018151243738182, "learning_rate": 4.874596141433447e-06, "loss": 0.7072, "step": 4147 }, { "epoch": 0.68, "grad_norm": 1.6480167429867911, "learning_rate": 4.870028586324947e-06, "loss": 0.7475, "step": 4148 }, { "epoch": 0.68, "grad_norm": 1.7016351432611652, "learning_rate": 4.865462483244065e-06, "loss": 0.7043, "step": 4149 }, { "epoch": 0.68, "grad_norm": 0.5831945568589917, "learning_rate": 4.8608978334832225e-06, "loss": 0.36, "step": 4150 }, { "epoch": 0.68, "grad_norm": 2.1305297762308313, "learning_rate": 4.8563346383344375e-06, "loss": 0.7621, "step": 4151 }, { "epoch": 0.68, "grad_norm": 2.1151971118705823, "learning_rate": 4.851772899089312e-06, "loss": 0.7618, "step": 4152 }, { "epoch": 0.68, "grad_norm": 0.6013220410296867, "learning_rate": 4.847212617039037e-06, "loss": 0.3293, "step": 4153 }, { "epoch": 0.68, "grad_norm": 2.6002847826914106, "learning_rate": 4.842653793474389e-06, "loss": 0.7008, "step": 4154 }, { "epoch": 0.68, "grad_norm": 2.737772555637926, "learning_rate": 4.838096429685735e-06, "loss": 0.7454, "step": 4155 }, { "epoch": 0.68, "grad_norm": 1.8905869692819273, "learning_rate": 4.833540526963027e-06, "loss": 0.779, "step": 4156 }, { "epoch": 0.68, "grad_norm": 1.622676389939264, "learning_rate": 4.828986086595804e-06, "loss": 0.7929, "step": 4157 }, { "epoch": 0.68, "grad_norm": 3.4942209426742674, "learning_rate": 4.82443310987319e-06, "loss": 0.7361, "step": 4158 }, { "epoch": 0.68, "grad_norm": 2.590842607132965, "learning_rate": 4.819881598083895e-06, "loss": 0.7816, "step": 4159 }, { "epoch": 0.68, "grad_norm": 2.4201827721747433, "learning_rate": 4.815331552516217e-06, "loss": 0.7634, "step": 4160 }, { "epoch": 0.68, "grad_norm": 2.2205721747391483, "learning_rate": 4.810782974458035e-06, "loss": 0.7859, "step": 4161 }, { "epoch": 0.68, "grad_norm": 1.8785971309085883, "learning_rate": 4.806235865196815e-06, "loss": 0.7277, "step": 4162 }, { "epoch": 0.68, "grad_norm": 2.2211734543965913, "learning_rate": 4.801690226019606e-06, "loss": 0.7729, "step": 4163 }, { "epoch": 0.68, "grad_norm": 2.6285251102679834, "learning_rate": 4.7971460582130425e-06, "loss": 0.79, "step": 4164 }, { "epoch": 0.68, "grad_norm": 1.9775273434938003, "learning_rate": 4.792603363063342e-06, "loss": 0.6249, "step": 4165 }, { "epoch": 0.68, "grad_norm": 2.262253534360244, "learning_rate": 4.7880621418563035e-06, "loss": 0.6984, "step": 4166 }, { "epoch": 0.68, "grad_norm": 3.039504435098485, "learning_rate": 4.783522395877311e-06, "loss": 0.6699, "step": 4167 }, { "epoch": 0.68, "grad_norm": 2.4150184332677127, "learning_rate": 4.77898412641133e-06, "loss": 0.7824, "step": 4168 }, { "epoch": 0.68, "grad_norm": 1.626979225722871, "learning_rate": 4.774447334742908e-06, "loss": 0.8354, "step": 4169 }, { "epoch": 0.68, "grad_norm": 1.8765126575871154, "learning_rate": 4.769912022156175e-06, "loss": 0.766, "step": 4170 }, { "epoch": 0.69, "grad_norm": 2.629676641612955, "learning_rate": 4.7653781899348395e-06, "loss": 0.8495, "step": 4171 }, { "epoch": 0.69, "grad_norm": 0.5935835078858631, "learning_rate": 4.760845839362196e-06, "loss": 0.3792, "step": 4172 }, { "epoch": 0.69, "grad_norm": 0.6593429352271792, "learning_rate": 4.756314971721115e-06, "loss": 0.3477, "step": 4173 }, { "epoch": 0.69, "grad_norm": 2.0802129037033965, "learning_rate": 4.751785588294053e-06, "loss": 0.7514, "step": 4174 }, { "epoch": 0.69, "grad_norm": 0.6528764797833931, "learning_rate": 4.7472576903630314e-06, "loss": 0.3676, "step": 4175 }, { "epoch": 0.69, "grad_norm": 3.3229745448725403, "learning_rate": 4.742731279209674e-06, "loss": 0.7894, "step": 4176 }, { "epoch": 0.69, "grad_norm": 2.639466926632789, "learning_rate": 4.738206356115167e-06, "loss": 0.7386, "step": 4177 }, { "epoch": 0.69, "grad_norm": 3.2688406374899333, "learning_rate": 4.733682922360282e-06, "loss": 0.8316, "step": 4178 }, { "epoch": 0.69, "grad_norm": 0.688571279129453, "learning_rate": 4.729160979225365e-06, "loss": 0.3302, "step": 4179 }, { "epoch": 0.69, "grad_norm": 1.8549267988611187, "learning_rate": 4.724640527990345e-06, "loss": 0.7798, "step": 4180 }, { "epoch": 0.69, "grad_norm": 1.7598814463352677, "learning_rate": 4.720121569934726e-06, "loss": 0.7334, "step": 4181 }, { "epoch": 0.69, "grad_norm": 1.5570223430738326, "learning_rate": 4.715604106337587e-06, "loss": 0.7885, "step": 4182 }, { "epoch": 0.69, "grad_norm": 1.5784337257643573, "learning_rate": 4.71108813847759e-06, "loss": 0.7066, "step": 4183 }, { "epoch": 0.69, "grad_norm": 2.4382007092647147, "learning_rate": 4.706573667632967e-06, "loss": 0.7712, "step": 4184 }, { "epoch": 0.69, "grad_norm": 1.8316205579428713, "learning_rate": 4.702060695081532e-06, "loss": 0.8048, "step": 4185 }, { "epoch": 0.69, "grad_norm": 0.6555984097539165, "learning_rate": 4.69754922210067e-06, "loss": 0.3722, "step": 4186 }, { "epoch": 0.69, "grad_norm": 3.671828033550345, "learning_rate": 4.693039249967347e-06, "loss": 0.8055, "step": 4187 }, { "epoch": 0.69, "grad_norm": 2.419639840935903, "learning_rate": 4.688530779958099e-06, "loss": 0.7548, "step": 4188 }, { "epoch": 0.69, "grad_norm": 2.312277259808699, "learning_rate": 4.684023813349039e-06, "loss": 0.6964, "step": 4189 }, { "epoch": 0.69, "grad_norm": 1.9198763170421946, "learning_rate": 4.679518351415855e-06, "loss": 0.7916, "step": 4190 }, { "epoch": 0.69, "grad_norm": 0.6584638421382915, "learning_rate": 4.675014395433808e-06, "loss": 0.357, "step": 4191 }, { "epoch": 0.69, "grad_norm": 1.893643615453616, "learning_rate": 4.6705119466777334e-06, "loss": 0.762, "step": 4192 }, { "epoch": 0.69, "grad_norm": 2.504573467338602, "learning_rate": 4.666011006422041e-06, "loss": 0.813, "step": 4193 }, { "epoch": 0.69, "grad_norm": 1.829564829448877, "learning_rate": 4.661511575940712e-06, "loss": 0.8071, "step": 4194 }, { "epoch": 0.69, "grad_norm": 3.012406065792022, "learning_rate": 4.657013656507299e-06, "loss": 0.8355, "step": 4195 }, { "epoch": 0.69, "grad_norm": 1.9189451223129743, "learning_rate": 4.6525172493949335e-06, "loss": 0.8606, "step": 4196 }, { "epoch": 0.69, "grad_norm": 1.7865154118986653, "learning_rate": 4.648022355876307e-06, "loss": 0.7896, "step": 4197 }, { "epoch": 0.69, "grad_norm": 1.6555833524594918, "learning_rate": 4.643528977223689e-06, "loss": 0.7275, "step": 4198 }, { "epoch": 0.69, "grad_norm": 2.214916615834812, "learning_rate": 4.63903711470893e-06, "loss": 0.7869, "step": 4199 }, { "epoch": 0.69, "grad_norm": 2.210549811634632, "learning_rate": 4.634546769603436e-06, "loss": 0.7901, "step": 4200 }, { "epoch": 0.69, "grad_norm": 1.9671572370988868, "learning_rate": 4.6300579431781915e-06, "loss": 0.8193, "step": 4201 }, { "epoch": 0.69, "grad_norm": 0.6238190410869373, "learning_rate": 4.625570636703748e-06, "loss": 0.3446, "step": 4202 }, { "epoch": 0.69, "grad_norm": 2.143770289364746, "learning_rate": 4.621084851450229e-06, "loss": 0.8954, "step": 4203 }, { "epoch": 0.69, "grad_norm": 1.9766953993065022, "learning_rate": 4.616600588687327e-06, "loss": 0.8239, "step": 4204 }, { "epoch": 0.69, "grad_norm": 2.0777475048009255, "learning_rate": 4.6121178496843045e-06, "loss": 0.7701, "step": 4205 }, { "epoch": 0.69, "grad_norm": 1.9024272962807096, "learning_rate": 4.607636635709988e-06, "loss": 0.7989, "step": 4206 }, { "epoch": 0.69, "grad_norm": 1.971741626685406, "learning_rate": 4.603156948032776e-06, "loss": 0.677, "step": 4207 }, { "epoch": 0.69, "grad_norm": 1.996192304413556, "learning_rate": 4.5986787879206375e-06, "loss": 0.7997, "step": 4208 }, { "epoch": 0.69, "grad_norm": 2.0503631141124914, "learning_rate": 4.594202156641105e-06, "loss": 0.7234, "step": 4209 }, { "epoch": 0.69, "grad_norm": 1.6937390980792446, "learning_rate": 4.589727055461278e-06, "loss": 0.8494, "step": 4210 }, { "epoch": 0.69, "grad_norm": 1.9479774383464472, "learning_rate": 4.585253485647826e-06, "loss": 0.7418, "step": 4211 }, { "epoch": 0.69, "grad_norm": 1.9352917567218586, "learning_rate": 4.5807814484669835e-06, "loss": 0.8239, "step": 4212 }, { "epoch": 0.69, "grad_norm": 2.039430717173202, "learning_rate": 4.5763109451845515e-06, "loss": 0.834, "step": 4213 }, { "epoch": 0.69, "grad_norm": 1.8398802972430823, "learning_rate": 4.571841977065895e-06, "loss": 0.7638, "step": 4214 }, { "epoch": 0.69, "grad_norm": 1.7535345717623168, "learning_rate": 4.567374545375948e-06, "loss": 0.8184, "step": 4215 }, { "epoch": 0.69, "grad_norm": 1.7865397331553772, "learning_rate": 4.562908651379206e-06, "loss": 0.708, "step": 4216 }, { "epoch": 0.69, "grad_norm": 0.6045407586598848, "learning_rate": 4.558444296339731e-06, "loss": 0.3518, "step": 4217 }, { "epoch": 0.69, "grad_norm": 1.6464862367313315, "learning_rate": 4.553981481521156e-06, "loss": 0.6912, "step": 4218 }, { "epoch": 0.69, "grad_norm": 1.692841443288182, "learning_rate": 4.54952020818666e-06, "loss": 0.7285, "step": 4219 }, { "epoch": 0.69, "grad_norm": 1.978129463452241, "learning_rate": 4.545060477599002e-06, "loss": 0.769, "step": 4220 }, { "epoch": 0.69, "grad_norm": 2.164827268861112, "learning_rate": 4.540602291020499e-06, "loss": 0.7759, "step": 4221 }, { "epoch": 0.69, "grad_norm": 1.5752298991610814, "learning_rate": 4.536145649713029e-06, "loss": 0.7643, "step": 4222 }, { "epoch": 0.69, "grad_norm": 1.8023436306236, "learning_rate": 4.531690554938043e-06, "loss": 0.8175, "step": 4223 }, { "epoch": 0.69, "grad_norm": 1.77262120513245, "learning_rate": 4.52723700795654e-06, "loss": 0.7842, "step": 4224 }, { "epoch": 0.69, "grad_norm": 1.8017804038727947, "learning_rate": 4.522785010029087e-06, "loss": 0.6637, "step": 4225 }, { "epoch": 0.69, "grad_norm": 2.7204964182077225, "learning_rate": 4.518334562415816e-06, "loss": 0.6972, "step": 4226 }, { "epoch": 0.69, "grad_norm": 0.5888384865849882, "learning_rate": 4.513885666376413e-06, "loss": 0.3391, "step": 4227 }, { "epoch": 0.69, "grad_norm": 2.157179710576994, "learning_rate": 4.509438323170131e-06, "loss": 0.7123, "step": 4228 }, { "epoch": 0.69, "grad_norm": 1.890352392312811, "learning_rate": 4.504992534055781e-06, "loss": 0.7963, "step": 4229 }, { "epoch": 0.69, "grad_norm": 1.9394814364648512, "learning_rate": 4.500548300291732e-06, "loss": 0.8457, "step": 4230 }, { "epoch": 0.69, "grad_norm": 2.1959178138243365, "learning_rate": 4.496105623135919e-06, "loss": 0.7432, "step": 4231 }, { "epoch": 0.7, "grad_norm": 2.023940505875608, "learning_rate": 4.4916645038458295e-06, "loss": 0.6807, "step": 4232 }, { "epoch": 0.7, "grad_norm": 1.665145357996867, "learning_rate": 4.487224943678513e-06, "loss": 0.749, "step": 4233 }, { "epoch": 0.7, "grad_norm": 3.8315796841931635, "learning_rate": 4.482786943890579e-06, "loss": 0.8112, "step": 4234 }, { "epoch": 0.7, "grad_norm": 2.0104078773101697, "learning_rate": 4.478350505738194e-06, "loss": 0.7749, "step": 4235 }, { "epoch": 0.7, "grad_norm": 1.8979463906037302, "learning_rate": 4.47391563047708e-06, "loss": 0.7648, "step": 4236 }, { "epoch": 0.7, "grad_norm": 0.644311291379342, "learning_rate": 4.4694823193625225e-06, "loss": 0.3373, "step": 4237 }, { "epoch": 0.7, "grad_norm": 2.076859207935869, "learning_rate": 4.465050573649359e-06, "loss": 0.8317, "step": 4238 }, { "epoch": 0.7, "grad_norm": 0.598135431997505, "learning_rate": 4.460620394591989e-06, "loss": 0.3555, "step": 4239 }, { "epoch": 0.7, "grad_norm": 2.324178109124122, "learning_rate": 4.45619178344436e-06, "loss": 0.7594, "step": 4240 }, { "epoch": 0.7, "grad_norm": 1.8513362131018, "learning_rate": 4.451764741459983e-06, "loss": 0.6756, "step": 4241 }, { "epoch": 0.7, "grad_norm": 2.9190686227912725, "learning_rate": 4.447339269891923e-06, "loss": 0.7532, "step": 4242 }, { "epoch": 0.7, "grad_norm": 1.594526228843496, "learning_rate": 4.442915369992802e-06, "loss": 0.83, "step": 4243 }, { "epoch": 0.7, "grad_norm": 1.9922492373480627, "learning_rate": 4.438493043014793e-06, "loss": 0.706, "step": 4244 }, { "epoch": 0.7, "grad_norm": 5.252044925635171, "learning_rate": 4.434072290209624e-06, "loss": 0.7989, "step": 4245 }, { "epoch": 0.7, "grad_norm": 1.8731188716406595, "learning_rate": 4.429653112828589e-06, "loss": 0.7323, "step": 4246 }, { "epoch": 0.7, "grad_norm": 1.9448372559606102, "learning_rate": 4.4252355121225196e-06, "loss": 0.7214, "step": 4247 }, { "epoch": 0.7, "grad_norm": 1.5972607149496099, "learning_rate": 4.4208194893418125e-06, "loss": 0.772, "step": 4248 }, { "epoch": 0.7, "grad_norm": 2.677741893417064, "learning_rate": 4.41640504573641e-06, "loss": 0.7529, "step": 4249 }, { "epoch": 0.7, "grad_norm": 1.8373550231050453, "learning_rate": 4.411992182555812e-06, "loss": 0.7918, "step": 4250 }, { "epoch": 0.7, "grad_norm": 1.674257976346728, "learning_rate": 4.407580901049071e-06, "loss": 0.7697, "step": 4251 }, { "epoch": 0.7, "grad_norm": 1.8472984067341573, "learning_rate": 4.403171202464791e-06, "loss": 0.7904, "step": 4252 }, { "epoch": 0.7, "grad_norm": 2.820014611344712, "learning_rate": 4.398763088051127e-06, "loss": 0.7751, "step": 4253 }, { "epoch": 0.7, "grad_norm": 0.6377902421785413, "learning_rate": 4.394356559055787e-06, "loss": 0.3364, "step": 4254 }, { "epoch": 0.7, "grad_norm": 2.3894735308245685, "learning_rate": 4.389951616726029e-06, "loss": 0.765, "step": 4255 }, { "epoch": 0.7, "grad_norm": 1.951715867937986, "learning_rate": 4.3855482623086645e-06, "loss": 0.7622, "step": 4256 }, { "epoch": 0.7, "grad_norm": 2.240167927807416, "learning_rate": 4.381146497050053e-06, "loss": 0.7408, "step": 4257 }, { "epoch": 0.7, "grad_norm": 2.1290506553988915, "learning_rate": 4.3767463221961034e-06, "loss": 0.8124, "step": 4258 }, { "epoch": 0.7, "grad_norm": 2.0329600352557304, "learning_rate": 4.372347738992278e-06, "loss": 0.8093, "step": 4259 }, { "epoch": 0.7, "grad_norm": 1.77882281102657, "learning_rate": 4.3679507486835835e-06, "loss": 0.7474, "step": 4260 }, { "epoch": 0.7, "grad_norm": 2.022809562771337, "learning_rate": 4.363555352514587e-06, "loss": 0.7524, "step": 4261 }, { "epoch": 0.7, "grad_norm": 1.9030138165271415, "learning_rate": 4.359161551729385e-06, "loss": 0.8441, "step": 4262 }, { "epoch": 0.7, "grad_norm": 2.069162434859499, "learning_rate": 4.354769347571638e-06, "loss": 0.7376, "step": 4263 }, { "epoch": 0.7, "grad_norm": 1.7838683170717693, "learning_rate": 4.350378741284551e-06, "loss": 0.7908, "step": 4264 }, { "epoch": 0.7, "grad_norm": 2.1027468106207303, "learning_rate": 4.3459897341108756e-06, "loss": 0.7164, "step": 4265 }, { "epoch": 0.7, "grad_norm": 2.128075100898379, "learning_rate": 4.341602327292912e-06, "loss": 0.8403, "step": 4266 }, { "epoch": 0.7, "grad_norm": 2.5394100782979323, "learning_rate": 4.3372165220725045e-06, "loss": 0.769, "step": 4267 }, { "epoch": 0.7, "grad_norm": 3.3578795551971434, "learning_rate": 4.332832319691044e-06, "loss": 0.7191, "step": 4268 }, { "epoch": 0.7, "grad_norm": 2.2096190553827695, "learning_rate": 4.328449721389475e-06, "loss": 0.7576, "step": 4269 }, { "epoch": 0.7, "grad_norm": 2.0341148918319205, "learning_rate": 4.324068728408282e-06, "loss": 0.82, "step": 4270 }, { "epoch": 0.7, "grad_norm": 3.2851182511509296, "learning_rate": 4.319689341987493e-06, "loss": 0.834, "step": 4271 }, { "epoch": 0.7, "grad_norm": 2.4317444824335084, "learning_rate": 4.315311563366686e-06, "loss": 0.7747, "step": 4272 }, { "epoch": 0.7, "grad_norm": 2.640154196171209, "learning_rate": 4.3109353937849815e-06, "loss": 0.8051, "step": 4273 }, { "epoch": 0.7, "grad_norm": 2.2263148273353583, "learning_rate": 4.306560834481045e-06, "loss": 0.8099, "step": 4274 }, { "epoch": 0.7, "grad_norm": 3.5287118448206347, "learning_rate": 4.302187886693087e-06, "loss": 0.7574, "step": 4275 }, { "epoch": 0.7, "grad_norm": 2.41005568983322, "learning_rate": 4.29781655165886e-06, "loss": 0.7281, "step": 4276 }, { "epoch": 0.7, "grad_norm": 1.8154473383176322, "learning_rate": 4.293446830615662e-06, "loss": 0.7081, "step": 4277 }, { "epoch": 0.7, "grad_norm": 1.9183421428561216, "learning_rate": 4.289078724800331e-06, "loss": 0.7785, "step": 4278 }, { "epoch": 0.7, "grad_norm": 1.7533845644843646, "learning_rate": 4.2847122354492555e-06, "loss": 0.8548, "step": 4279 }, { "epoch": 0.7, "grad_norm": 3.1280636117361094, "learning_rate": 4.280347363798356e-06, "loss": 0.8521, "step": 4280 }, { "epoch": 0.7, "grad_norm": 2.771720296634124, "learning_rate": 4.275984111083102e-06, "loss": 0.7989, "step": 4281 }, { "epoch": 0.7, "grad_norm": 2.2223553352724057, "learning_rate": 4.2716224785385075e-06, "loss": 0.8007, "step": 4282 }, { "epoch": 0.7, "grad_norm": 2.553533135485, "learning_rate": 4.267262467399114e-06, "loss": 0.8256, "step": 4283 }, { "epoch": 0.7, "grad_norm": 1.9999578859957756, "learning_rate": 4.2629040788990205e-06, "loss": 0.7531, "step": 4284 }, { "epoch": 0.7, "grad_norm": 0.6348478755505605, "learning_rate": 4.258547314271857e-06, "loss": 0.3449, "step": 4285 }, { "epoch": 0.7, "grad_norm": 2.122354849138043, "learning_rate": 4.254192174750796e-06, "loss": 0.8124, "step": 4286 }, { "epoch": 0.7, "grad_norm": 2.0833877424180054, "learning_rate": 4.249838661568554e-06, "loss": 0.849, "step": 4287 }, { "epoch": 0.7, "grad_norm": 2.2195801468729766, "learning_rate": 4.24548677595738e-06, "loss": 0.7353, "step": 4288 }, { "epoch": 0.7, "grad_norm": 1.8745018243240965, "learning_rate": 4.2411365191490684e-06, "loss": 0.8158, "step": 4289 }, { "epoch": 0.7, "grad_norm": 0.5781071947570332, "learning_rate": 4.236787892374948e-06, "loss": 0.34, "step": 4290 }, { "epoch": 0.7, "grad_norm": 1.8230718515290059, "learning_rate": 4.232440896865888e-06, "loss": 0.7883, "step": 4291 }, { "epoch": 0.7, "grad_norm": 1.5679600795134865, "learning_rate": 4.2280955338523015e-06, "loss": 0.689, "step": 4292 }, { "epoch": 0.71, "grad_norm": 1.7207301595464408, "learning_rate": 4.22375180456413e-06, "loss": 0.745, "step": 4293 }, { "epoch": 0.71, "grad_norm": 2.0666709295421564, "learning_rate": 4.219409710230859e-06, "loss": 0.8008, "step": 4294 }, { "epoch": 0.71, "grad_norm": 0.6960183077608357, "learning_rate": 4.215069252081509e-06, "loss": 0.3501, "step": 4295 }, { "epoch": 0.71, "grad_norm": 3.094141549120192, "learning_rate": 4.210730431344635e-06, "loss": 0.7961, "step": 4296 }, { "epoch": 0.71, "grad_norm": 1.6897013961444465, "learning_rate": 4.206393249248334e-06, "loss": 0.7386, "step": 4297 }, { "epoch": 0.71, "grad_norm": 2.434415405724381, "learning_rate": 4.202057707020235e-06, "loss": 0.7918, "step": 4298 }, { "epoch": 0.71, "grad_norm": 1.758997807977992, "learning_rate": 4.1977238058875045e-06, "loss": 0.8013, "step": 4299 }, { "epoch": 0.71, "grad_norm": 2.0631562787668143, "learning_rate": 4.193391547076844e-06, "loss": 0.8151, "step": 4300 }, { "epoch": 0.71, "grad_norm": 2.0523704079929486, "learning_rate": 4.189060931814489e-06, "loss": 0.7941, "step": 4301 }, { "epoch": 0.71, "grad_norm": 1.7459588494362472, "learning_rate": 4.184731961326213e-06, "loss": 0.757, "step": 4302 }, { "epoch": 0.71, "grad_norm": 2.7934769223131264, "learning_rate": 4.180404636837321e-06, "loss": 0.8208, "step": 4303 }, { "epoch": 0.71, "grad_norm": 5.121186250129963, "learning_rate": 4.176078959572656e-06, "loss": 0.7902, "step": 4304 }, { "epoch": 0.71, "grad_norm": 3.831999878952559, "learning_rate": 4.171754930756586e-06, "loss": 0.7804, "step": 4305 }, { "epoch": 0.71, "grad_norm": 2.3204931151121806, "learning_rate": 4.167432551613021e-06, "loss": 0.7683, "step": 4306 }, { "epoch": 0.71, "grad_norm": 1.8162750584798761, "learning_rate": 4.163111823365403e-06, "loss": 0.7768, "step": 4307 }, { "epoch": 0.71, "grad_norm": 2.2981010873087118, "learning_rate": 4.158792747236702e-06, "loss": 0.7103, "step": 4308 }, { "epoch": 0.71, "grad_norm": 2.596519115204602, "learning_rate": 4.154475324449425e-06, "loss": 0.778, "step": 4309 }, { "epoch": 0.71, "grad_norm": 1.929820183405858, "learning_rate": 4.1501595562256105e-06, "loss": 0.7627, "step": 4310 }, { "epoch": 0.71, "grad_norm": 1.8517294276754432, "learning_rate": 4.145845443786827e-06, "loss": 0.7753, "step": 4311 }, { "epoch": 0.71, "grad_norm": 1.822897140452897, "learning_rate": 4.141532988354173e-06, "loss": 0.7558, "step": 4312 }, { "epoch": 0.71, "grad_norm": 2.0515938989533726, "learning_rate": 4.137222191148282e-06, "loss": 0.7677, "step": 4313 }, { "epoch": 0.71, "grad_norm": 1.785322863297669, "learning_rate": 4.132913053389317e-06, "loss": 0.7313, "step": 4314 }, { "epoch": 0.71, "grad_norm": 1.863987921724444, "learning_rate": 4.128605576296964e-06, "loss": 0.7229, "step": 4315 }, { "epoch": 0.71, "grad_norm": 1.844583120214524, "learning_rate": 4.1242997610904546e-06, "loss": 0.7231, "step": 4316 }, { "epoch": 0.71, "grad_norm": 1.6504815622194662, "learning_rate": 4.119995608988536e-06, "loss": 0.7897, "step": 4317 }, { "epoch": 0.71, "grad_norm": 2.1259399555938234, "learning_rate": 4.11569312120949e-06, "loss": 0.8098, "step": 4318 }, { "epoch": 0.71, "grad_norm": 2.1405242728269736, "learning_rate": 4.111392298971127e-06, "loss": 0.7788, "step": 4319 }, { "epoch": 0.71, "grad_norm": 1.7840442708860171, "learning_rate": 4.107093143490785e-06, "loss": 0.7652, "step": 4320 }, { "epoch": 0.71, "grad_norm": 2.142921333498261, "learning_rate": 4.102795655985331e-06, "loss": 0.8054, "step": 4321 }, { "epoch": 0.71, "grad_norm": 2.0090721924167, "learning_rate": 4.098499837671159e-06, "loss": 0.8679, "step": 4322 }, { "epoch": 0.71, "grad_norm": 2.4305223898756836, "learning_rate": 4.0942056897641934e-06, "loss": 0.7422, "step": 4323 }, { "epoch": 0.71, "grad_norm": 2.4377083199049845, "learning_rate": 4.089913213479882e-06, "loss": 0.7932, "step": 4324 }, { "epoch": 0.71, "grad_norm": 1.9088436073860227, "learning_rate": 4.085622410033203e-06, "loss": 0.767, "step": 4325 }, { "epoch": 0.71, "grad_norm": 2.1168625677178383, "learning_rate": 4.081333280638661e-06, "loss": 0.8476, "step": 4326 }, { "epoch": 0.71, "grad_norm": 1.5311533279843164, "learning_rate": 4.077045826510277e-06, "loss": 0.8064, "step": 4327 }, { "epoch": 0.71, "grad_norm": 2.761482837717328, "learning_rate": 4.072760048861614e-06, "loss": 0.7679, "step": 4328 }, { "epoch": 0.71, "grad_norm": 2.195877081200378, "learning_rate": 4.068475948905746e-06, "loss": 0.8555, "step": 4329 }, { "epoch": 0.71, "grad_norm": 1.6402662317431138, "learning_rate": 4.064193527855285e-06, "loss": 0.8046, "step": 4330 }, { "epoch": 0.71, "grad_norm": 2.1594384136574853, "learning_rate": 4.0599127869223565e-06, "loss": 0.7582, "step": 4331 }, { "epoch": 0.71, "grad_norm": 1.6369059180946999, "learning_rate": 4.055633727318617e-06, "loss": 0.7802, "step": 4332 }, { "epoch": 0.71, "grad_norm": 2.2397995370081825, "learning_rate": 4.051356350255246e-06, "loss": 0.7692, "step": 4333 }, { "epoch": 0.71, "grad_norm": 1.825800712612525, "learning_rate": 4.047080656942943e-06, "loss": 0.8111, "step": 4334 }, { "epoch": 0.71, "grad_norm": 1.7206359789227756, "learning_rate": 4.042806648591938e-06, "loss": 0.7667, "step": 4335 }, { "epoch": 0.71, "grad_norm": 1.877206093331365, "learning_rate": 4.038534326411978e-06, "loss": 0.8422, "step": 4336 }, { "epoch": 0.71, "grad_norm": 1.6199793414109431, "learning_rate": 4.0342636916123355e-06, "loss": 0.7838, "step": 4337 }, { "epoch": 0.71, "grad_norm": 1.8273913461022764, "learning_rate": 4.0299947454018e-06, "loss": 0.8393, "step": 4338 }, { "epoch": 0.71, "grad_norm": 1.5961260789028173, "learning_rate": 4.025727488988696e-06, "loss": 0.7048, "step": 4339 }, { "epoch": 0.71, "grad_norm": 2.4813090363617434, "learning_rate": 4.0214619235808575e-06, "loss": 0.769, "step": 4340 }, { "epoch": 0.71, "grad_norm": 0.602999967310016, "learning_rate": 4.017198050385644e-06, "loss": 0.3479, "step": 4341 }, { "epoch": 0.71, "grad_norm": 2.296378868345467, "learning_rate": 4.012935870609934e-06, "loss": 0.7366, "step": 4342 }, { "epoch": 0.71, "grad_norm": 4.559550360533098, "learning_rate": 4.008675385460131e-06, "loss": 0.7452, "step": 4343 }, { "epoch": 0.71, "grad_norm": 1.9491343597162833, "learning_rate": 4.0044165961421565e-06, "loss": 0.7301, "step": 4344 }, { "epoch": 0.71, "grad_norm": 2.6584751388650742, "learning_rate": 4.000159503861451e-06, "loss": 0.8299, "step": 4345 }, { "epoch": 0.71, "grad_norm": 1.8718992781425448, "learning_rate": 3.9959041098229735e-06, "loss": 0.7654, "step": 4346 }, { "epoch": 0.71, "grad_norm": 2.035592477869978, "learning_rate": 3.991650415231211e-06, "loss": 0.7987, "step": 4347 }, { "epoch": 0.71, "grad_norm": 1.9447149862310746, "learning_rate": 3.987398421290155e-06, "loss": 0.7713, "step": 4348 }, { "epoch": 0.71, "grad_norm": 2.048777870326657, "learning_rate": 3.983148129203326e-06, "loss": 0.8361, "step": 4349 }, { "epoch": 0.71, "grad_norm": 2.3731782855447516, "learning_rate": 3.978899540173759e-06, "loss": 0.8425, "step": 4350 }, { "epoch": 0.71, "grad_norm": 2.3771693518942514, "learning_rate": 3.974652655404012e-06, "loss": 0.806, "step": 4351 }, { "epoch": 0.71, "grad_norm": 1.786614217859133, "learning_rate": 3.970407476096154e-06, "loss": 0.6969, "step": 4352 }, { "epoch": 0.71, "grad_norm": 2.6845180716111403, "learning_rate": 3.966164003451775e-06, "loss": 0.7523, "step": 4353 }, { "epoch": 0.72, "grad_norm": 3.1745028902377515, "learning_rate": 3.961922238671981e-06, "loss": 0.7905, "step": 4354 }, { "epoch": 0.72, "grad_norm": 1.8986273443934982, "learning_rate": 3.957682182957394e-06, "loss": 0.7603, "step": 4355 }, { "epoch": 0.72, "grad_norm": 1.9376570791424805, "learning_rate": 3.953443837508153e-06, "loss": 0.7646, "step": 4356 }, { "epoch": 0.72, "grad_norm": 1.8853074919902904, "learning_rate": 3.949207203523913e-06, "loss": 0.7483, "step": 4357 }, { "epoch": 0.72, "grad_norm": 2.3681132751493617, "learning_rate": 3.944972282203844e-06, "loss": 0.7383, "step": 4358 }, { "epoch": 0.72, "grad_norm": 1.931107609132881, "learning_rate": 3.940739074746632e-06, "loss": 0.7017, "step": 4359 }, { "epoch": 0.72, "grad_norm": 1.7691774281619919, "learning_rate": 3.936507582350479e-06, "loss": 0.7374, "step": 4360 }, { "epoch": 0.72, "grad_norm": 1.8858783092246834, "learning_rate": 3.932277806213093e-06, "loss": 0.8161, "step": 4361 }, { "epoch": 0.72, "grad_norm": 0.6232705711182706, "learning_rate": 3.928049747531711e-06, "loss": 0.3522, "step": 4362 }, { "epoch": 0.72, "grad_norm": 2.4223835421907647, "learning_rate": 3.923823407503076e-06, "loss": 0.8045, "step": 4363 }, { "epoch": 0.72, "grad_norm": 3.3609325169880364, "learning_rate": 3.919598787323442e-06, "loss": 0.7414, "step": 4364 }, { "epoch": 0.72, "grad_norm": 1.8561093469459737, "learning_rate": 3.915375888188579e-06, "loss": 0.8771, "step": 4365 }, { "epoch": 0.72, "grad_norm": 2.013475359005332, "learning_rate": 3.9111547112937685e-06, "loss": 0.7807, "step": 4366 }, { "epoch": 0.72, "grad_norm": 2.513868496027119, "learning_rate": 3.906935257833809e-06, "loss": 0.8286, "step": 4367 }, { "epoch": 0.72, "grad_norm": 1.6431552184402254, "learning_rate": 3.902717529003005e-06, "loss": 0.8657, "step": 4368 }, { "epoch": 0.72, "grad_norm": 1.677293540226831, "learning_rate": 3.898501525995181e-06, "loss": 0.8002, "step": 4369 }, { "epoch": 0.72, "grad_norm": 2.493905329380971, "learning_rate": 3.89428725000366e-06, "loss": 0.7397, "step": 4370 }, { "epoch": 0.72, "grad_norm": 2.2422916417455534, "learning_rate": 3.890074702221288e-06, "loss": 0.7181, "step": 4371 }, { "epoch": 0.72, "grad_norm": 2.6325983235317643, "learning_rate": 3.8858638838404175e-06, "loss": 0.7295, "step": 4372 }, { "epoch": 0.72, "grad_norm": 1.7366025378159629, "learning_rate": 3.88165479605291e-06, "loss": 0.8003, "step": 4373 }, { "epoch": 0.72, "grad_norm": 2.0142453270559053, "learning_rate": 3.8774474400501415e-06, "loss": 0.7085, "step": 4374 }, { "epoch": 0.72, "grad_norm": 4.797844161274133, "learning_rate": 3.873241817022996e-06, "loss": 0.8347, "step": 4375 }, { "epoch": 0.72, "grad_norm": 2.082354446740899, "learning_rate": 3.869037928161863e-06, "loss": 0.7724, "step": 4376 }, { "epoch": 0.72, "grad_norm": 0.6139306459657721, "learning_rate": 3.8648357746566456e-06, "loss": 0.3128, "step": 4377 }, { "epoch": 0.72, "grad_norm": 2.409405678902009, "learning_rate": 3.860635357696756e-06, "loss": 0.8243, "step": 4378 }, { "epoch": 0.72, "grad_norm": 3.1224173448143966, "learning_rate": 3.8564366784711116e-06, "loss": 0.7916, "step": 4379 }, { "epoch": 0.72, "grad_norm": 2.004221815665962, "learning_rate": 3.852239738168141e-06, "loss": 0.8209, "step": 4380 }, { "epoch": 0.72, "grad_norm": 1.4582120199261308, "learning_rate": 3.848044537975778e-06, "loss": 0.7979, "step": 4381 }, { "epoch": 0.72, "grad_norm": 3.942190582260422, "learning_rate": 3.843851079081467e-06, "loss": 0.7159, "step": 4382 }, { "epoch": 0.72, "grad_norm": 1.5709994801783667, "learning_rate": 3.839659362672156e-06, "loss": 0.8155, "step": 4383 }, { "epoch": 0.72, "grad_norm": 1.6124165074758419, "learning_rate": 3.835469389934299e-06, "loss": 0.8076, "step": 4384 }, { "epoch": 0.72, "grad_norm": 2.5517711331193955, "learning_rate": 3.8312811620538655e-06, "loss": 0.7453, "step": 4385 }, { "epoch": 0.72, "grad_norm": 1.854555102074345, "learning_rate": 3.8270946802163216e-06, "loss": 0.7549, "step": 4386 }, { "epoch": 0.72, "grad_norm": 1.740652600701026, "learning_rate": 3.822909945606641e-06, "loss": 0.8039, "step": 4387 }, { "epoch": 0.72, "grad_norm": 1.9589573841645798, "learning_rate": 3.818726959409305e-06, "loss": 0.7569, "step": 4388 }, { "epoch": 0.72, "grad_norm": 2.0419962248855, "learning_rate": 3.8145457228082995e-06, "loss": 0.7171, "step": 4389 }, { "epoch": 0.72, "grad_norm": 2.9604700425428523, "learning_rate": 3.8103662369871143e-06, "loss": 0.7102, "step": 4390 }, { "epoch": 0.72, "grad_norm": 3.012122724646457, "learning_rate": 3.806188503128746e-06, "loss": 0.7929, "step": 4391 }, { "epoch": 0.72, "grad_norm": 1.8278624621653092, "learning_rate": 3.802012522415689e-06, "loss": 0.8126, "step": 4392 }, { "epoch": 0.72, "grad_norm": 2.2149727790123217, "learning_rate": 3.7978382960299476e-06, "loss": 0.7481, "step": 4393 }, { "epoch": 0.72, "grad_norm": 1.9847158003460745, "learning_rate": 3.793665825153029e-06, "loss": 0.7192, "step": 4394 }, { "epoch": 0.72, "grad_norm": 1.5346898477706326, "learning_rate": 3.7894951109659404e-06, "loss": 0.744, "step": 4395 }, { "epoch": 0.72, "grad_norm": 1.579562322161786, "learning_rate": 3.785326154649196e-06, "loss": 0.7721, "step": 4396 }, { "epoch": 0.72, "grad_norm": 2.1559469271196923, "learning_rate": 3.781158957382809e-06, "loss": 0.7717, "step": 4397 }, { "epoch": 0.72, "grad_norm": 2.2951703439546804, "learning_rate": 3.776993520346295e-06, "loss": 0.7525, "step": 4398 }, { "epoch": 0.72, "grad_norm": 0.6472009630263904, "learning_rate": 3.772829844718674e-06, "loss": 0.3655, "step": 4399 }, { "epoch": 0.72, "grad_norm": 1.5603694558735957, "learning_rate": 3.7686679316784635e-06, "loss": 0.8153, "step": 4400 }, { "epoch": 0.72, "grad_norm": 1.870368718746695, "learning_rate": 3.764507782403686e-06, "loss": 0.74, "step": 4401 }, { "epoch": 0.72, "grad_norm": 1.9224831306116141, "learning_rate": 3.760349398071862e-06, "loss": 0.7212, "step": 4402 }, { "epoch": 0.72, "grad_norm": 2.951116976150131, "learning_rate": 3.756192779860014e-06, "loss": 0.7173, "step": 4403 }, { "epoch": 0.72, "grad_norm": 2.6822122213902753, "learning_rate": 3.752037928944664e-06, "loss": 0.7871, "step": 4404 }, { "epoch": 0.72, "grad_norm": 2.5778913043006444, "learning_rate": 3.7478848465018336e-06, "loss": 0.7276, "step": 4405 }, { "epoch": 0.72, "grad_norm": 2.348437812592447, "learning_rate": 3.7437335337070445e-06, "loss": 0.6985, "step": 4406 }, { "epoch": 0.72, "grad_norm": 5.431194908761027, "learning_rate": 3.739583991735316e-06, "loss": 0.7633, "step": 4407 }, { "epoch": 0.72, "grad_norm": 2.771944984533322, "learning_rate": 3.7354362217611652e-06, "loss": 0.8418, "step": 4408 }, { "epoch": 0.72, "grad_norm": 0.615196136407771, "learning_rate": 3.7312902249586146e-06, "loss": 0.3584, "step": 4409 }, { "epoch": 0.72, "grad_norm": 1.9705797390896858, "learning_rate": 3.7271460025011785e-06, "loss": 0.7885, "step": 4410 }, { "epoch": 0.72, "grad_norm": 2.3133931667531153, "learning_rate": 3.723003555561869e-06, "loss": 0.693, "step": 4411 }, { "epoch": 0.72, "grad_norm": 2.193046254273674, "learning_rate": 3.7188628853132023e-06, "loss": 0.8238, "step": 4412 }, { "epoch": 0.72, "grad_norm": 3.4353289917112972, "learning_rate": 3.714723992927177e-06, "loss": 0.8365, "step": 4413 }, { "epoch": 0.72, "grad_norm": 1.5634941479572506, "learning_rate": 3.710586879575302e-06, "loss": 0.7293, "step": 4414 }, { "epoch": 0.73, "grad_norm": 1.7244200160528558, "learning_rate": 3.70645154642858e-06, "loss": 0.757, "step": 4415 }, { "epoch": 0.73, "grad_norm": 1.7799326412107985, "learning_rate": 3.702317994657506e-06, "loss": 0.7928, "step": 4416 }, { "epoch": 0.73, "grad_norm": 3.4979120006411413, "learning_rate": 3.6981862254320757e-06, "loss": 0.7626, "step": 4417 }, { "epoch": 0.73, "grad_norm": 2.422408268514942, "learning_rate": 3.694056239921776e-06, "loss": 0.7884, "step": 4418 }, { "epoch": 0.73, "grad_norm": 2.2347661903377696, "learning_rate": 3.689928039295592e-06, "loss": 0.7724, "step": 4419 }, { "epoch": 0.73, "grad_norm": 2.1004089033745585, "learning_rate": 3.6858016247219998e-06, "loss": 0.7638, "step": 4420 }, { "epoch": 0.73, "grad_norm": 1.7756648470104976, "learning_rate": 3.6816769973689736e-06, "loss": 0.7472, "step": 4421 }, { "epoch": 0.73, "grad_norm": 1.6940125690369525, "learning_rate": 3.677554158403982e-06, "loss": 0.7913, "step": 4422 }, { "epoch": 0.73, "grad_norm": 1.5102102595409035, "learning_rate": 3.6734331089939835e-06, "loss": 0.771, "step": 4423 }, { "epoch": 0.73, "grad_norm": 1.961881999468584, "learning_rate": 3.669313850305435e-06, "loss": 0.8157, "step": 4424 }, { "epoch": 0.73, "grad_norm": 1.8250078223270716, "learning_rate": 3.6651963835042813e-06, "loss": 0.7513, "step": 4425 }, { "epoch": 0.73, "grad_norm": 0.5838244369182107, "learning_rate": 3.6610807097559644e-06, "loss": 0.3295, "step": 4426 }, { "epoch": 0.73, "grad_norm": 1.8995347887417557, "learning_rate": 3.6569668302254167e-06, "loss": 0.7207, "step": 4427 }, { "epoch": 0.73, "grad_norm": 2.3428732277817717, "learning_rate": 3.6528547460770636e-06, "loss": 0.7883, "step": 4428 }, { "epoch": 0.73, "grad_norm": 1.4755401969235513, "learning_rate": 3.648744458474821e-06, "loss": 0.7374, "step": 4429 }, { "epoch": 0.73, "grad_norm": 2.0425563920582563, "learning_rate": 3.6446359685820974e-06, "loss": 0.7521, "step": 4430 }, { "epoch": 0.73, "grad_norm": 4.036079551204399, "learning_rate": 3.6405292775617886e-06, "loss": 0.7473, "step": 4431 }, { "epoch": 0.73, "grad_norm": 2.013911618859014, "learning_rate": 3.6364243865762926e-06, "loss": 0.7601, "step": 4432 }, { "epoch": 0.73, "grad_norm": 2.6298086230714715, "learning_rate": 3.6323212967874866e-06, "loss": 0.7191, "step": 4433 }, { "epoch": 0.73, "grad_norm": 2.475808865549832, "learning_rate": 3.628220009356743e-06, "loss": 0.6969, "step": 4434 }, { "epoch": 0.73, "grad_norm": 1.9629327559141447, "learning_rate": 3.6241205254449197e-06, "loss": 0.7416, "step": 4435 }, { "epoch": 0.73, "grad_norm": 1.9910095857516652, "learning_rate": 3.6200228462123666e-06, "loss": 0.828, "step": 4436 }, { "epoch": 0.73, "grad_norm": 2.470516746823948, "learning_rate": 3.6159269728189237e-06, "loss": 0.7528, "step": 4437 }, { "epoch": 0.73, "grad_norm": 1.3159858388614554, "learning_rate": 3.6118329064239222e-06, "loss": 0.7543, "step": 4438 }, { "epoch": 0.73, "grad_norm": 9.896354463990937, "learning_rate": 3.6077406481861756e-06, "loss": 0.8331, "step": 4439 }, { "epoch": 0.73, "grad_norm": 1.8035855777275271, "learning_rate": 3.6036501992639907e-06, "loss": 0.7216, "step": 4440 }, { "epoch": 0.73, "grad_norm": 1.7133042827066303, "learning_rate": 3.59956156081516e-06, "loss": 0.7776, "step": 4441 }, { "epoch": 0.73, "grad_norm": 2.2567105042377293, "learning_rate": 3.5954747339969653e-06, "loss": 0.6741, "step": 4442 }, { "epoch": 0.73, "grad_norm": 1.848500544439125, "learning_rate": 3.5913897199661716e-06, "loss": 0.7356, "step": 4443 }, { "epoch": 0.73, "grad_norm": 2.4523072405270256, "learning_rate": 3.587306519879037e-06, "loss": 0.7512, "step": 4444 }, { "epoch": 0.73, "grad_norm": 2.3828350276889445, "learning_rate": 3.5832251348912995e-06, "loss": 0.7121, "step": 4445 }, { "epoch": 0.73, "grad_norm": 3.425251541011518, "learning_rate": 3.5791455661581877e-06, "loss": 0.7886, "step": 4446 }, { "epoch": 0.73, "grad_norm": 1.5304737833617943, "learning_rate": 3.5750678148344153e-06, "loss": 0.7606, "step": 4447 }, { "epoch": 0.73, "grad_norm": 1.7463796114773054, "learning_rate": 3.5709918820741816e-06, "loss": 0.8096, "step": 4448 }, { "epoch": 0.73, "grad_norm": 1.4954886879390084, "learning_rate": 3.5669177690311696e-06, "loss": 0.7667, "step": 4449 }, { "epoch": 0.73, "grad_norm": 7.000424198027147, "learning_rate": 3.56284547685855e-06, "loss": 0.7469, "step": 4450 }, { "epoch": 0.73, "grad_norm": 2.497187903939387, "learning_rate": 3.5587750067089745e-06, "loss": 0.7833, "step": 4451 }, { "epoch": 0.73, "grad_norm": 1.9042143703168792, "learning_rate": 3.5547063597345833e-06, "loss": 0.8014, "step": 4452 }, { "epoch": 0.73, "grad_norm": 1.5311134745900872, "learning_rate": 3.5506395370869963e-06, "loss": 0.7883, "step": 4453 }, { "epoch": 0.73, "grad_norm": 1.617049388240314, "learning_rate": 3.546574539917317e-06, "loss": 0.7481, "step": 4454 }, { "epoch": 0.73, "grad_norm": 0.6140126195208828, "learning_rate": 3.5425113693761436e-06, "loss": 0.3482, "step": 4455 }, { "epoch": 0.73, "grad_norm": 2.8033802162643964, "learning_rate": 3.5384500266135393e-06, "loss": 0.7717, "step": 4456 }, { "epoch": 0.73, "grad_norm": 1.6374714967063198, "learning_rate": 3.5343905127790614e-06, "loss": 0.7581, "step": 4457 }, { "epoch": 0.73, "grad_norm": 2.206857955293823, "learning_rate": 3.5303328290217453e-06, "loss": 0.7112, "step": 4458 }, { "epoch": 0.73, "grad_norm": 2.484453918069031, "learning_rate": 3.526276976490112e-06, "loss": 0.7843, "step": 4459 }, { "epoch": 0.73, "grad_norm": 2.277847945034391, "learning_rate": 3.52222295633216e-06, "loss": 0.7127, "step": 4460 }, { "epoch": 0.73, "grad_norm": 1.6938879916493512, "learning_rate": 3.5181707696953728e-06, "loss": 0.7662, "step": 4461 }, { "epoch": 0.73, "grad_norm": 1.792608450575419, "learning_rate": 3.5141204177267117e-06, "loss": 0.6352, "step": 4462 }, { "epoch": 0.73, "grad_norm": 2.108027461904393, "learning_rate": 3.5100719015726228e-06, "loss": 0.7619, "step": 4463 }, { "epoch": 0.73, "grad_norm": 2.9068993802671073, "learning_rate": 3.506025222379027e-06, "loss": 0.7259, "step": 4464 }, { "epoch": 0.73, "grad_norm": 2.1172540106690416, "learning_rate": 3.501980381291331e-06, "loss": 0.7418, "step": 4465 }, { "epoch": 0.73, "grad_norm": 2.227235622928159, "learning_rate": 3.497937379454417e-06, "loss": 0.7149, "step": 4466 }, { "epoch": 0.73, "grad_norm": 2.534790873892219, "learning_rate": 3.493896218012649e-06, "loss": 0.8222, "step": 4467 }, { "epoch": 0.73, "grad_norm": 4.023879476087566, "learning_rate": 3.4898568981098678e-06, "loss": 0.7408, "step": 4468 }, { "epoch": 0.73, "grad_norm": 1.5940346396944622, "learning_rate": 3.4858194208893967e-06, "loss": 0.7453, "step": 4469 }, { "epoch": 0.73, "grad_norm": 0.6336001199637511, "learning_rate": 3.481783787494033e-06, "loss": 0.3319, "step": 4470 }, { "epoch": 0.73, "grad_norm": 1.8479770520932735, "learning_rate": 3.477749999066056e-06, "loss": 0.7476, "step": 4471 }, { "epoch": 0.73, "grad_norm": 2.3816747410801815, "learning_rate": 3.4737180567472196e-06, "loss": 0.7963, "step": 4472 }, { "epoch": 0.73, "grad_norm": 1.8641208802323208, "learning_rate": 3.469687961678757e-06, "loss": 0.7457, "step": 4473 }, { "epoch": 0.73, "grad_norm": 1.6911588663174209, "learning_rate": 3.465659715001379e-06, "loss": 0.8035, "step": 4474 }, { "epoch": 0.73, "grad_norm": 1.997220383873713, "learning_rate": 3.461633317855271e-06, "loss": 0.7689, "step": 4475 }, { "epoch": 0.74, "grad_norm": 1.919429997702525, "learning_rate": 3.4576087713800966e-06, "loss": 0.7331, "step": 4476 }, { "epoch": 0.74, "grad_norm": 1.754038955164557, "learning_rate": 3.4535860767149963e-06, "loss": 0.6866, "step": 4477 }, { "epoch": 0.74, "grad_norm": 2.2312292840098307, "learning_rate": 3.4495652349985844e-06, "loss": 0.7339, "step": 4478 }, { "epoch": 0.74, "grad_norm": 1.966829225820792, "learning_rate": 3.4455462473689515e-06, "loss": 0.7569, "step": 4479 }, { "epoch": 0.74, "grad_norm": 2.616669183278121, "learning_rate": 3.4415291149636642e-06, "loss": 0.7992, "step": 4480 }, { "epoch": 0.74, "grad_norm": 2.3405998627514015, "learning_rate": 3.4375138389197627e-06, "loss": 0.7601, "step": 4481 }, { "epoch": 0.74, "grad_norm": 1.9372491613089027, "learning_rate": 3.433500420373763e-06, "loss": 0.7511, "step": 4482 }, { "epoch": 0.74, "grad_norm": 2.1472590336104056, "learning_rate": 3.429488860461655e-06, "loss": 0.8047, "step": 4483 }, { "epoch": 0.74, "grad_norm": 1.8695772831572448, "learning_rate": 3.425479160318902e-06, "loss": 0.8117, "step": 4484 }, { "epoch": 0.74, "grad_norm": 1.8361277664021822, "learning_rate": 3.421471321080441e-06, "loss": 0.7602, "step": 4485 }, { "epoch": 0.74, "grad_norm": 1.5783998594234203, "learning_rate": 3.4174653438806814e-06, "loss": 0.746, "step": 4486 }, { "epoch": 0.74, "grad_norm": 1.688990873836799, "learning_rate": 3.4134612298535084e-06, "loss": 0.7227, "step": 4487 }, { "epoch": 0.74, "grad_norm": 1.7846430688826076, "learning_rate": 3.4094589801322773e-06, "loss": 0.7838, "step": 4488 }, { "epoch": 0.74, "grad_norm": 2.3295706485436574, "learning_rate": 3.4054585958498177e-06, "loss": 0.7185, "step": 4489 }, { "epoch": 0.74, "grad_norm": 2.4243493999396026, "learning_rate": 3.401460078138428e-06, "loss": 0.7655, "step": 4490 }, { "epoch": 0.74, "grad_norm": 1.9409785850447798, "learning_rate": 3.3974634281298815e-06, "loss": 0.7411, "step": 4491 }, { "epoch": 0.74, "grad_norm": 1.7313530102779338, "learning_rate": 3.3934686469554203e-06, "loss": 0.8361, "step": 4492 }, { "epoch": 0.74, "grad_norm": 1.907187550194426, "learning_rate": 3.389475735745761e-06, "loss": 0.6908, "step": 4493 }, { "epoch": 0.74, "grad_norm": 1.8897711959142827, "learning_rate": 3.3854846956310862e-06, "loss": 0.6765, "step": 4494 }, { "epoch": 0.74, "grad_norm": 2.138502938136328, "learning_rate": 3.381495527741053e-06, "loss": 0.7525, "step": 4495 }, { "epoch": 0.74, "grad_norm": 2.275651975932812, "learning_rate": 3.377508233204787e-06, "loss": 0.76, "step": 4496 }, { "epoch": 0.74, "grad_norm": 1.6221102155769578, "learning_rate": 3.3735228131508824e-06, "loss": 0.8119, "step": 4497 }, { "epoch": 0.74, "grad_norm": 2.347165069276739, "learning_rate": 3.3695392687074045e-06, "loss": 0.824, "step": 4498 }, { "epoch": 0.74, "grad_norm": 2.621220962871099, "learning_rate": 3.3655576010018875e-06, "loss": 0.7258, "step": 4499 }, { "epoch": 0.74, "grad_norm": 1.9558280737946068, "learning_rate": 3.361577811161335e-06, "loss": 0.7252, "step": 4500 }, { "epoch": 0.74, "grad_norm": 1.8536423045632804, "learning_rate": 3.3575999003122162e-06, "loss": 0.7618, "step": 4501 }, { "epoch": 0.74, "grad_norm": 1.577121693760776, "learning_rate": 3.3536238695804713e-06, "loss": 0.8013, "step": 4502 }, { "epoch": 0.74, "grad_norm": 1.9240683006408528, "learning_rate": 3.3496497200915067e-06, "loss": 0.6462, "step": 4503 }, { "epoch": 0.74, "grad_norm": 1.8288221116290873, "learning_rate": 3.3456774529701987e-06, "loss": 0.8831, "step": 4504 }, { "epoch": 0.74, "grad_norm": 2.007341564615966, "learning_rate": 3.3417070693408882e-06, "loss": 0.751, "step": 4505 }, { "epoch": 0.74, "grad_norm": 0.5962494921335754, "learning_rate": 3.3377385703273835e-06, "loss": 0.3134, "step": 4506 }, { "epoch": 0.74, "grad_norm": 1.7858795309314903, "learning_rate": 3.3337719570529603e-06, "loss": 0.8024, "step": 4507 }, { "epoch": 0.74, "grad_norm": 1.968839683908792, "learning_rate": 3.3298072306403595e-06, "loss": 0.8009, "step": 4508 }, { "epoch": 0.74, "grad_norm": 2.383802063489637, "learning_rate": 3.32584439221179e-06, "loss": 0.7761, "step": 4509 }, { "epoch": 0.74, "grad_norm": 2.6630674250800537, "learning_rate": 3.3218834428889244e-06, "loss": 0.7891, "step": 4510 }, { "epoch": 0.74, "grad_norm": 1.964822595508021, "learning_rate": 3.3179243837929e-06, "loss": 0.742, "step": 4511 }, { "epoch": 0.74, "grad_norm": 1.6965344412500443, "learning_rate": 3.3139672160443215e-06, "loss": 0.7965, "step": 4512 }, { "epoch": 0.74, "grad_norm": 1.6695316747699158, "learning_rate": 3.3100119407632556e-06, "loss": 0.7919, "step": 4513 }, { "epoch": 0.74, "grad_norm": 1.5891297385412035, "learning_rate": 3.306058559069236e-06, "loss": 0.7839, "step": 4514 }, { "epoch": 0.74, "grad_norm": 2.0882367976435234, "learning_rate": 3.3021070720812588e-06, "loss": 0.715, "step": 4515 }, { "epoch": 0.74, "grad_norm": 2.078623819617859, "learning_rate": 3.298157480917783e-06, "loss": 0.7414, "step": 4516 }, { "epoch": 0.74, "grad_norm": 1.8407652936589511, "learning_rate": 3.2942097866967336e-06, "loss": 0.7076, "step": 4517 }, { "epoch": 0.74, "grad_norm": 2.1015710359241995, "learning_rate": 3.2902639905354948e-06, "loss": 0.7956, "step": 4518 }, { "epoch": 0.74, "grad_norm": 1.7148502203836604, "learning_rate": 3.286320093550919e-06, "loss": 0.7891, "step": 4519 }, { "epoch": 0.74, "grad_norm": 2.0568856840789387, "learning_rate": 3.2823780968593156e-06, "loss": 0.7173, "step": 4520 }, { "epoch": 0.74, "grad_norm": 1.7230607521666248, "learning_rate": 3.2784380015764596e-06, "loss": 0.7522, "step": 4521 }, { "epoch": 0.74, "grad_norm": 2.2787566442769944, "learning_rate": 3.274499808817586e-06, "loss": 0.8283, "step": 4522 }, { "epoch": 0.74, "grad_norm": 2.387154204517503, "learning_rate": 3.2705635196973927e-06, "loss": 0.7757, "step": 4523 }, { "epoch": 0.74, "grad_norm": 1.953155928094149, "learning_rate": 3.266629135330037e-06, "loss": 0.7564, "step": 4524 }, { "epoch": 0.74, "grad_norm": 1.9049972724490019, "learning_rate": 3.2626966568291396e-06, "loss": 0.7261, "step": 4525 }, { "epoch": 0.74, "grad_norm": 2.755340655130259, "learning_rate": 3.2587660853077797e-06, "loss": 0.7576, "step": 4526 }, { "epoch": 0.74, "grad_norm": 1.7486243673378608, "learning_rate": 3.2548374218784963e-06, "loss": 0.8095, "step": 4527 }, { "epoch": 0.74, "grad_norm": 8.7689687521134, "learning_rate": 3.2509106676532897e-06, "loss": 0.7537, "step": 4528 }, { "epoch": 0.74, "grad_norm": 2.3861694777544007, "learning_rate": 3.2469858237436203e-06, "loss": 0.7425, "step": 4529 }, { "epoch": 0.74, "grad_norm": 2.7180935471156236, "learning_rate": 3.243062891260407e-06, "loss": 0.8118, "step": 4530 }, { "epoch": 0.74, "grad_norm": 2.065500944367292, "learning_rate": 3.2391418713140264e-06, "loss": 0.7418, "step": 4531 }, { "epoch": 0.74, "grad_norm": 1.965931424277871, "learning_rate": 3.235222765014315e-06, "loss": 0.7324, "step": 4532 }, { "epoch": 0.74, "grad_norm": 1.9741559310807204, "learning_rate": 3.231305573470569e-06, "loss": 0.8273, "step": 4533 }, { "epoch": 0.74, "grad_norm": 2.150617723373371, "learning_rate": 3.2273902977915405e-06, "loss": 0.7659, "step": 4534 }, { "epoch": 0.74, "grad_norm": 1.987459217782345, "learning_rate": 3.2234769390854394e-06, "loss": 0.7859, "step": 4535 }, { "epoch": 0.75, "grad_norm": 3.09702653417158, "learning_rate": 3.2195654984599334e-06, "loss": 0.7989, "step": 4536 }, { "epoch": 0.75, "grad_norm": 1.9585928621647508, "learning_rate": 3.2156559770221498e-06, "loss": 0.7559, "step": 4537 }, { "epoch": 0.75, "grad_norm": 1.99204359589185, "learning_rate": 3.2117483758786683e-06, "loss": 0.7767, "step": 4538 }, { "epoch": 0.75, "grad_norm": 2.0463298107799135, "learning_rate": 3.207842696135527e-06, "loss": 0.7079, "step": 4539 }, { "epoch": 0.75, "grad_norm": 2.486697065012398, "learning_rate": 3.2039389388982225e-06, "loss": 0.7445, "step": 4540 }, { "epoch": 0.75, "grad_norm": 0.6480782245483474, "learning_rate": 3.200037105271703e-06, "loss": 0.3609, "step": 4541 }, { "epoch": 0.75, "grad_norm": 2.194630334763538, "learning_rate": 3.1961371963603736e-06, "loss": 0.702, "step": 4542 }, { "epoch": 0.75, "grad_norm": 2.247148312681215, "learning_rate": 3.192239213268099e-06, "loss": 0.7468, "step": 4543 }, { "epoch": 0.75, "grad_norm": 1.7259159311826442, "learning_rate": 3.1883431570981917e-06, "loss": 0.7819, "step": 4544 }, { "epoch": 0.75, "grad_norm": 1.916735632061953, "learning_rate": 3.1844490289534236e-06, "loss": 0.7651, "step": 4545 }, { "epoch": 0.75, "grad_norm": 2.046639491885564, "learning_rate": 3.180556829936019e-06, "loss": 0.7484, "step": 4546 }, { "epoch": 0.75, "grad_norm": 2.0294398674077576, "learning_rate": 3.1766665611476566e-06, "loss": 0.7547, "step": 4547 }, { "epoch": 0.75, "grad_norm": 1.790364837498254, "learning_rate": 3.17277822368947e-06, "loss": 0.7579, "step": 4548 }, { "epoch": 0.75, "grad_norm": 9.784373188474056, "learning_rate": 3.168891818662043e-06, "loss": 0.774, "step": 4549 }, { "epoch": 0.75, "grad_norm": 1.4918424424205383, "learning_rate": 3.1650073471654152e-06, "loss": 0.8776, "step": 4550 }, { "epoch": 0.75, "grad_norm": 2.0091498961458796, "learning_rate": 3.161124810299079e-06, "loss": 0.6934, "step": 4551 }, { "epoch": 0.75, "grad_norm": 1.821598556388361, "learning_rate": 3.157244209161977e-06, "loss": 0.8236, "step": 4552 }, { "epoch": 0.75, "grad_norm": 1.6795092581785955, "learning_rate": 3.1533655448525057e-06, "loss": 0.7116, "step": 4553 }, { "epoch": 0.75, "grad_norm": 2.254840261697356, "learning_rate": 3.1494888184685134e-06, "loss": 0.7968, "step": 4554 }, { "epoch": 0.75, "grad_norm": 1.7814601235369536, "learning_rate": 3.145614031107299e-06, "loss": 0.7964, "step": 4555 }, { "epoch": 0.75, "grad_norm": 2.4248072014533766, "learning_rate": 3.141741183865612e-06, "loss": 0.7848, "step": 4556 }, { "epoch": 0.75, "grad_norm": 2.043256021382472, "learning_rate": 3.1378702778396554e-06, "loss": 0.7607, "step": 4557 }, { "epoch": 0.75, "grad_norm": 2.2515226610434014, "learning_rate": 3.134001314125079e-06, "loss": 0.8392, "step": 4558 }, { "epoch": 0.75, "grad_norm": 2.0756479802040695, "learning_rate": 3.1301342938169854e-06, "loss": 0.7718, "step": 4559 }, { "epoch": 0.75, "grad_norm": 2.7292895880710644, "learning_rate": 3.1262692180099285e-06, "loss": 0.6874, "step": 4560 }, { "epoch": 0.75, "grad_norm": 1.7966623517829305, "learning_rate": 3.1224060877979077e-06, "loss": 0.8002, "step": 4561 }, { "epoch": 0.75, "grad_norm": 1.9748499956584207, "learning_rate": 3.1185449042743744e-06, "loss": 0.7117, "step": 4562 }, { "epoch": 0.75, "grad_norm": 2.3279274968868546, "learning_rate": 3.114685668532229e-06, "loss": 0.7732, "step": 4563 }, { "epoch": 0.75, "grad_norm": 1.6305873075772932, "learning_rate": 3.1108283816638196e-06, "loss": 0.6877, "step": 4564 }, { "epoch": 0.75, "grad_norm": 2.3993922490644906, "learning_rate": 3.1069730447609423e-06, "loss": 0.7085, "step": 4565 }, { "epoch": 0.75, "grad_norm": 2.020774657131397, "learning_rate": 3.103119658914844e-06, "loss": 0.7902, "step": 4566 }, { "epoch": 0.75, "grad_norm": 2.2843924840081495, "learning_rate": 3.0992682252162165e-06, "loss": 0.7311, "step": 4567 }, { "epoch": 0.75, "grad_norm": 1.7991167124353191, "learning_rate": 3.0954187447551996e-06, "loss": 0.7361, "step": 4568 }, { "epoch": 0.75, "grad_norm": 2.602641564929759, "learning_rate": 3.091571218621382e-06, "loss": 0.7179, "step": 4569 }, { "epoch": 0.75, "grad_norm": 1.6733962206707897, "learning_rate": 3.0877256479037952e-06, "loss": 0.8679, "step": 4570 }, { "epoch": 0.75, "grad_norm": 3.3138172477456442, "learning_rate": 3.0838820336909224e-06, "loss": 0.7254, "step": 4571 }, { "epoch": 0.75, "grad_norm": 1.8325844410728986, "learning_rate": 3.0800403770706912e-06, "loss": 0.8448, "step": 4572 }, { "epoch": 0.75, "grad_norm": 5.765140849448767, "learning_rate": 3.076200679130471e-06, "loss": 0.7839, "step": 4573 }, { "epoch": 0.75, "grad_norm": 2.6506122873776423, "learning_rate": 3.072362940957083e-06, "loss": 0.8087, "step": 4574 }, { "epoch": 0.75, "grad_norm": 2.3308365455345395, "learning_rate": 3.0685271636367895e-06, "loss": 0.742, "step": 4575 }, { "epoch": 0.75, "grad_norm": 3.2054833793723057, "learning_rate": 3.064693348255301e-06, "loss": 0.7139, "step": 4576 }, { "epoch": 0.75, "grad_norm": 2.1188720496890263, "learning_rate": 3.060861495897769e-06, "loss": 0.7929, "step": 4577 }, { "epoch": 0.75, "grad_norm": 0.628578817259799, "learning_rate": 3.0570316076487918e-06, "loss": 0.3553, "step": 4578 }, { "epoch": 0.75, "grad_norm": 2.8314849796470405, "learning_rate": 3.0532036845924107e-06, "loss": 0.7129, "step": 4579 }, { "epoch": 0.75, "grad_norm": 2.10002787849663, "learning_rate": 3.049377727812113e-06, "loss": 0.8162, "step": 4580 }, { "epoch": 0.75, "grad_norm": 1.8540761316511338, "learning_rate": 3.0455537383908263e-06, "loss": 0.777, "step": 4581 }, { "epoch": 0.75, "grad_norm": 1.9914208986559752, "learning_rate": 3.041731717410923e-06, "loss": 0.7732, "step": 4582 }, { "epoch": 0.75, "grad_norm": 0.6295058141965171, "learning_rate": 3.0379116659542186e-06, "loss": 0.3262, "step": 4583 }, { "epoch": 0.75, "grad_norm": 6.080923850442664, "learning_rate": 3.0340935851019694e-06, "loss": 0.8053, "step": 4584 }, { "epoch": 0.75, "grad_norm": 1.9613085880577625, "learning_rate": 3.0302774759348797e-06, "loss": 0.7767, "step": 4585 }, { "epoch": 0.75, "grad_norm": 2.236717847288838, "learning_rate": 3.0264633395330834e-06, "loss": 0.7205, "step": 4586 }, { "epoch": 0.75, "grad_norm": 2.3702615395211026, "learning_rate": 3.022651176976166e-06, "loss": 0.8278, "step": 4587 }, { "epoch": 0.75, "grad_norm": 2.8384349010496397, "learning_rate": 3.0188409893431556e-06, "loss": 0.7384, "step": 4588 }, { "epoch": 0.75, "grad_norm": 2.526321401755744, "learning_rate": 3.0150327777125175e-06, "loss": 0.7307, "step": 4589 }, { "epoch": 0.75, "grad_norm": 1.8122615209849673, "learning_rate": 3.011226543162156e-06, "loss": 0.7809, "step": 4590 }, { "epoch": 0.75, "grad_norm": 2.2204908932650316, "learning_rate": 3.007422286769418e-06, "loss": 0.7833, "step": 4591 }, { "epoch": 0.75, "grad_norm": 1.9137591820152737, "learning_rate": 3.003620009611091e-06, "loss": 0.7088, "step": 4592 }, { "epoch": 0.75, "grad_norm": 1.2455431145259401, "learning_rate": 2.999819712763402e-06, "loss": 0.7256, "step": 4593 }, { "epoch": 0.75, "grad_norm": 0.6204045119467998, "learning_rate": 2.996021397302015e-06, "loss": 0.3483, "step": 4594 }, { "epoch": 0.75, "grad_norm": 1.66531402887562, "learning_rate": 2.992225064302037e-06, "loss": 0.7588, "step": 4595 }, { "epoch": 0.75, "grad_norm": 2.3205840433912117, "learning_rate": 2.988430714838011e-06, "loss": 0.745, "step": 4596 }, { "epoch": 0.76, "grad_norm": 1.9851229864196656, "learning_rate": 2.9846383499839205e-06, "loss": 0.7572, "step": 4597 }, { "epoch": 0.76, "grad_norm": 2.014629526205925, "learning_rate": 2.9808479708131864e-06, "loss": 0.7254, "step": 4598 }, { "epoch": 0.76, "grad_norm": 2.0806075277876253, "learning_rate": 2.9770595783986666e-06, "loss": 0.6946, "step": 4599 }, { "epoch": 0.76, "grad_norm": 2.722919805464069, "learning_rate": 2.9732731738126586e-06, "loss": 0.7426, "step": 4600 }, { "epoch": 0.76, "grad_norm": 2.1717149465448773, "learning_rate": 2.969488758126896e-06, "loss": 0.7519, "step": 4601 }, { "epoch": 0.76, "grad_norm": 3.8410346371846162, "learning_rate": 2.965706332412549e-06, "loss": 0.7191, "step": 4602 }, { "epoch": 0.76, "grad_norm": 0.6109975841586625, "learning_rate": 2.9619258977402253e-06, "loss": 0.3828, "step": 4603 }, { "epoch": 0.76, "grad_norm": 1.6052729441033884, "learning_rate": 2.9581474551799703e-06, "loss": 0.7412, "step": 4604 }, { "epoch": 0.76, "grad_norm": 1.61115474528062, "learning_rate": 2.9543710058012633e-06, "loss": 0.8027, "step": 4605 }, { "epoch": 0.76, "grad_norm": 1.726438810614486, "learning_rate": 2.9505965506730195e-06, "loss": 0.7868, "step": 4606 }, { "epoch": 0.76, "grad_norm": 1.654144840988626, "learning_rate": 2.946824090863596e-06, "loss": 0.7628, "step": 4607 }, { "epoch": 0.76, "grad_norm": 1.8967817242850946, "learning_rate": 2.943053627440771e-06, "loss": 0.7068, "step": 4608 }, { "epoch": 0.76, "grad_norm": 1.6824451600260213, "learning_rate": 2.93928516147177e-06, "loss": 0.7972, "step": 4609 }, { "epoch": 0.76, "grad_norm": 0.5995893183227775, "learning_rate": 2.9355186940232493e-06, "loss": 0.3077, "step": 4610 }, { "epoch": 0.76, "grad_norm": 1.955162662967856, "learning_rate": 2.9317542261612986e-06, "loss": 0.8013, "step": 4611 }, { "epoch": 0.76, "grad_norm": 2.234145325835091, "learning_rate": 2.927991758951445e-06, "loss": 0.8202, "step": 4612 }, { "epoch": 0.76, "grad_norm": 2.5655215589757785, "learning_rate": 2.924231293458647e-06, "loss": 0.8114, "step": 4613 }, { "epoch": 0.76, "grad_norm": 2.0146714095812994, "learning_rate": 2.920472830747295e-06, "loss": 0.76, "step": 4614 }, { "epoch": 0.76, "grad_norm": 2.3411813670203068, "learning_rate": 2.9167163718812143e-06, "loss": 0.8326, "step": 4615 }, { "epoch": 0.76, "grad_norm": 3.8210039415936854, "learning_rate": 2.9129619179236625e-06, "loss": 0.7751, "step": 4616 }, { "epoch": 0.76, "grad_norm": 2.6329373451089735, "learning_rate": 2.9092094699373296e-06, "loss": 0.7214, "step": 4617 }, { "epoch": 0.76, "grad_norm": 1.9270782874367336, "learning_rate": 2.90545902898434e-06, "loss": 0.6984, "step": 4618 }, { "epoch": 0.76, "grad_norm": 1.8082733077601747, "learning_rate": 2.9017105961262448e-06, "loss": 0.7917, "step": 4619 }, { "epoch": 0.76, "grad_norm": 1.6908420769928891, "learning_rate": 2.8979641724240324e-06, "loss": 0.7727, "step": 4620 }, { "epoch": 0.76, "grad_norm": 1.682199811509871, "learning_rate": 2.8942197589381204e-06, "loss": 0.808, "step": 4621 }, { "epoch": 0.76, "grad_norm": 1.8361403361792916, "learning_rate": 2.890477356728356e-06, "loss": 0.6948, "step": 4622 }, { "epoch": 0.76, "grad_norm": 1.837749108176603, "learning_rate": 2.886736966854019e-06, "loss": 0.8065, "step": 4623 }, { "epoch": 0.76, "grad_norm": 2.121443709623355, "learning_rate": 2.8829985903738176e-06, "loss": 0.7974, "step": 4624 }, { "epoch": 0.76, "grad_norm": 2.874986964682604, "learning_rate": 2.8792622283458926e-06, "loss": 0.7697, "step": 4625 }, { "epoch": 0.76, "grad_norm": 2.2783418472215775, "learning_rate": 2.8755278818278143e-06, "loss": 0.7981, "step": 4626 }, { "epoch": 0.76, "grad_norm": 1.7939761540003583, "learning_rate": 2.8717955518765794e-06, "loss": 0.7275, "step": 4627 }, { "epoch": 0.76, "grad_norm": 13.093664338385198, "learning_rate": 2.8680652395486198e-06, "loss": 0.8561, "step": 4628 }, { "epoch": 0.76, "grad_norm": 2.09044476844198, "learning_rate": 2.864336945899788e-06, "loss": 0.8376, "step": 4629 }, { "epoch": 0.76, "grad_norm": 2.111216441129074, "learning_rate": 2.860610671985371e-06, "loss": 0.8256, "step": 4630 }, { "epoch": 0.76, "grad_norm": 2.0457050832641186, "learning_rate": 2.856886418860083e-06, "loss": 0.7165, "step": 4631 }, { "epoch": 0.76, "grad_norm": 2.386447460214213, "learning_rate": 2.853164187578067e-06, "loss": 0.7658, "step": 4632 }, { "epoch": 0.76, "grad_norm": 2.1243419351630934, "learning_rate": 2.849443979192892e-06, "loss": 0.7145, "step": 4633 }, { "epoch": 0.76, "grad_norm": 1.5502936084006569, "learning_rate": 2.845725794757551e-06, "loss": 0.6764, "step": 4634 }, { "epoch": 0.76, "grad_norm": 2.0535005472991537, "learning_rate": 2.8420096353244763e-06, "loss": 0.6726, "step": 4635 }, { "epoch": 0.76, "grad_norm": 2.0500735217073007, "learning_rate": 2.838295501945516e-06, "loss": 0.7487, "step": 4636 }, { "epoch": 0.76, "grad_norm": 1.7884425804077688, "learning_rate": 2.834583395671947e-06, "loss": 0.762, "step": 4637 }, { "epoch": 0.76, "grad_norm": 2.0305168437055707, "learning_rate": 2.8308733175544724e-06, "loss": 0.7744, "step": 4638 }, { "epoch": 0.76, "grad_norm": 1.5026739112227951, "learning_rate": 2.827165268643223e-06, "loss": 0.7309, "step": 4639 }, { "epoch": 0.76, "grad_norm": 1.9643402483956898, "learning_rate": 2.8234592499877535e-06, "loss": 0.7897, "step": 4640 }, { "epoch": 0.76, "grad_norm": 2.8580838882613584, "learning_rate": 2.819755262637046e-06, "loss": 0.6819, "step": 4641 }, { "epoch": 0.76, "grad_norm": 3.1707472129503054, "learning_rate": 2.8160533076395045e-06, "loss": 0.8011, "step": 4642 }, { "epoch": 0.76, "grad_norm": 1.8852865297809238, "learning_rate": 2.812353386042962e-06, "loss": 0.743, "step": 4643 }, { "epoch": 0.76, "grad_norm": 1.8238696280941127, "learning_rate": 2.8086554988946714e-06, "loss": 0.777, "step": 4644 }, { "epoch": 0.76, "grad_norm": 0.6553136259740716, "learning_rate": 2.804959647241312e-06, "loss": 0.3597, "step": 4645 }, { "epoch": 0.76, "grad_norm": 2.118540422477829, "learning_rate": 2.8012658321289878e-06, "loss": 0.7333, "step": 4646 }, { "epoch": 0.76, "grad_norm": 1.9606227212089922, "learning_rate": 2.797574054603225e-06, "loss": 0.7077, "step": 4647 }, { "epoch": 0.76, "grad_norm": 1.9992074603511478, "learning_rate": 2.7938843157089734e-06, "loss": 0.7863, "step": 4648 }, { "epoch": 0.76, "grad_norm": 0.6503838413227397, "learning_rate": 2.790196616490607e-06, "loss": 0.347, "step": 4649 }, { "epoch": 0.76, "grad_norm": 1.7373272771991257, "learning_rate": 2.7865109579919223e-06, "loss": 0.8222, "step": 4650 }, { "epoch": 0.76, "grad_norm": 2.1024960926568164, "learning_rate": 2.7828273412561324e-06, "loss": 0.7655, "step": 4651 }, { "epoch": 0.76, "grad_norm": 1.5193388230247682, "learning_rate": 2.7791457673258793e-06, "loss": 0.8246, "step": 4652 }, { "epoch": 0.76, "grad_norm": 2.1007667761653517, "learning_rate": 2.775466237243226e-06, "loss": 0.7567, "step": 4653 }, { "epoch": 0.76, "grad_norm": 1.6094167350484674, "learning_rate": 2.7717887520496545e-06, "loss": 0.7321, "step": 4654 }, { "epoch": 0.76, "grad_norm": 3.9196351858406, "learning_rate": 2.7681133127860705e-06, "loss": 0.6905, "step": 4655 }, { "epoch": 0.76, "grad_norm": 1.8713292990588537, "learning_rate": 2.7644399204927984e-06, "loss": 0.7546, "step": 4656 }, { "epoch": 0.76, "grad_norm": 2.8849461992482612, "learning_rate": 2.7607685762095825e-06, "loss": 0.8116, "step": 4657 }, { "epoch": 0.77, "grad_norm": 1.7219252812182917, "learning_rate": 2.7570992809755937e-06, "loss": 0.6847, "step": 4658 }, { "epoch": 0.77, "grad_norm": 2.5321013062127737, "learning_rate": 2.753432035829415e-06, "loss": 0.7164, "step": 4659 }, { "epoch": 0.77, "grad_norm": 1.7826146369524103, "learning_rate": 2.749766841809054e-06, "loss": 0.7581, "step": 4660 }, { "epoch": 0.77, "grad_norm": 2.5456607010263834, "learning_rate": 2.746103699951934e-06, "loss": 0.6961, "step": 4661 }, { "epoch": 0.77, "grad_norm": 1.7434795745397964, "learning_rate": 2.742442611294902e-06, "loss": 0.7576, "step": 4662 }, { "epoch": 0.77, "grad_norm": 2.1305265611487862, "learning_rate": 2.73878357687422e-06, "loss": 0.7834, "step": 4663 }, { "epoch": 0.77, "grad_norm": 0.6235677645182569, "learning_rate": 2.7351265977255702e-06, "loss": 0.3506, "step": 4664 }, { "epoch": 0.77, "grad_norm": 1.873732067010852, "learning_rate": 2.731471674884053e-06, "loss": 0.75, "step": 4665 }, { "epoch": 0.77, "grad_norm": 1.7430683687502155, "learning_rate": 2.7278188093841874e-06, "loss": 0.7494, "step": 4666 }, { "epoch": 0.77, "grad_norm": 1.6933576967316781, "learning_rate": 2.7241680022599073e-06, "loss": 0.7161, "step": 4667 }, { "epoch": 0.77, "grad_norm": 2.2028848330767135, "learning_rate": 2.720519254544568e-06, "loss": 0.6838, "step": 4668 }, { "epoch": 0.77, "grad_norm": 2.357279221001197, "learning_rate": 2.716872567270938e-06, "loss": 0.7721, "step": 4669 }, { "epoch": 0.77, "grad_norm": 1.7419215303457003, "learning_rate": 2.713227941471206e-06, "loss": 0.7573, "step": 4670 }, { "epoch": 0.77, "grad_norm": 1.8514528205487888, "learning_rate": 2.7095853781769752e-06, "loss": 0.7156, "step": 4671 }, { "epoch": 0.77, "grad_norm": 1.6090874523820873, "learning_rate": 2.7059448784192688e-06, "loss": 0.6883, "step": 4672 }, { "epoch": 0.77, "grad_norm": 1.7937913642506331, "learning_rate": 2.702306443228516e-06, "loss": 0.7823, "step": 4673 }, { "epoch": 0.77, "grad_norm": 2.381440353121855, "learning_rate": 2.6986700736345715e-06, "loss": 0.7493, "step": 4674 }, { "epoch": 0.77, "grad_norm": 4.965606936588777, "learning_rate": 2.6950357706667017e-06, "loss": 0.7634, "step": 4675 }, { "epoch": 0.77, "grad_norm": 1.682470237585335, "learning_rate": 2.6914035353535897e-06, "loss": 0.7545, "step": 4676 }, { "epoch": 0.77, "grad_norm": 2.404386812250604, "learning_rate": 2.68777336872333e-06, "loss": 0.7698, "step": 4677 }, { "epoch": 0.77, "grad_norm": 2.003450056832997, "learning_rate": 2.6841452718034343e-06, "loss": 0.7587, "step": 4678 }, { "epoch": 0.77, "grad_norm": 2.4181377317683586, "learning_rate": 2.6805192456208297e-06, "loss": 0.7831, "step": 4679 }, { "epoch": 0.77, "grad_norm": 1.8660608329418942, "learning_rate": 2.6768952912018498e-06, "loss": 0.7809, "step": 4680 }, { "epoch": 0.77, "grad_norm": 2.041272061093075, "learning_rate": 2.6732734095722545e-06, "loss": 0.6892, "step": 4681 }, { "epoch": 0.77, "grad_norm": 2.412496901477215, "learning_rate": 2.6696536017572074e-06, "loss": 0.7375, "step": 4682 }, { "epoch": 0.77, "grad_norm": 1.6057975034944951, "learning_rate": 2.666035868781285e-06, "loss": 0.7853, "step": 4683 }, { "epoch": 0.77, "grad_norm": 1.9162624569525137, "learning_rate": 2.6624202116684816e-06, "loss": 0.8227, "step": 4684 }, { "epoch": 0.77, "grad_norm": 1.82941362652974, "learning_rate": 2.6588066314422e-06, "loss": 0.7167, "step": 4685 }, { "epoch": 0.77, "grad_norm": 1.8305677750949312, "learning_rate": 2.6551951291252576e-06, "loss": 0.7519, "step": 4686 }, { "epoch": 0.77, "grad_norm": 2.1510433272122103, "learning_rate": 2.651585705739881e-06, "loss": 0.7363, "step": 4687 }, { "epoch": 0.77, "grad_norm": 4.379082970078408, "learning_rate": 2.6479783623077105e-06, "loss": 0.7983, "step": 4688 }, { "epoch": 0.77, "grad_norm": 1.6983978299475258, "learning_rate": 2.6443730998497985e-06, "loss": 0.7967, "step": 4689 }, { "epoch": 0.77, "grad_norm": 1.9012503872132713, "learning_rate": 2.6407699193866045e-06, "loss": 0.7406, "step": 4690 }, { "epoch": 0.77, "grad_norm": 0.5890440147943559, "learning_rate": 2.6371688219380032e-06, "loss": 0.33, "step": 4691 }, { "epoch": 0.77, "grad_norm": 1.8467499769829967, "learning_rate": 2.6335698085232764e-06, "loss": 0.7798, "step": 4692 }, { "epoch": 0.77, "grad_norm": 1.9677414126090156, "learning_rate": 2.62997288016112e-06, "loss": 0.7361, "step": 4693 }, { "epoch": 0.77, "grad_norm": 2.3335404702391256, "learning_rate": 2.6263780378696324e-06, "loss": 0.779, "step": 4694 }, { "epoch": 0.77, "grad_norm": 13.571763251656622, "learning_rate": 2.6227852826663294e-06, "loss": 0.8398, "step": 4695 }, { "epoch": 0.77, "grad_norm": 1.8938795029921656, "learning_rate": 2.6191946155681303e-06, "loss": 0.7237, "step": 4696 }, { "epoch": 0.77, "grad_norm": 1.9508035028796828, "learning_rate": 2.6156060375913685e-06, "loss": 0.7673, "step": 4697 }, { "epoch": 0.77, "grad_norm": 2.462765929830689, "learning_rate": 2.6120195497517818e-06, "loss": 0.8091, "step": 4698 }, { "epoch": 0.77, "grad_norm": 2.0725674856636394, "learning_rate": 2.608435153064519e-06, "loss": 0.7034, "step": 4699 }, { "epoch": 0.77, "grad_norm": 2.037733185888748, "learning_rate": 2.6048528485441347e-06, "loss": 0.6807, "step": 4700 }, { "epoch": 0.77, "grad_norm": 1.7133719681505617, "learning_rate": 2.601272637204595e-06, "loss": 0.7797, "step": 4701 }, { "epoch": 0.77, "grad_norm": 1.9727647024338288, "learning_rate": 2.5976945200592683e-06, "loss": 0.7635, "step": 4702 }, { "epoch": 0.77, "grad_norm": 1.7482228504640676, "learning_rate": 2.5941184981209354e-06, "loss": 0.8072, "step": 4703 }, { "epoch": 0.77, "grad_norm": 1.5000368419168875, "learning_rate": 2.5905445724017786e-06, "loss": 0.8176, "step": 4704 }, { "epoch": 0.77, "grad_norm": 1.5902213708769029, "learning_rate": 2.586972743913394e-06, "loss": 0.7924, "step": 4705 }, { "epoch": 0.77, "grad_norm": 0.5904099154835225, "learning_rate": 2.5834030136667796e-06, "loss": 0.3676, "step": 4706 }, { "epoch": 0.77, "grad_norm": 2.6978944955990563, "learning_rate": 2.579835382672339e-06, "loss": 0.8038, "step": 4707 }, { "epoch": 0.77, "grad_norm": 1.6983951959412937, "learning_rate": 2.5762698519398832e-06, "loss": 0.7502, "step": 4708 }, { "epoch": 0.77, "grad_norm": 4.72579334004029, "learning_rate": 2.5727064224786267e-06, "loss": 0.7671, "step": 4709 }, { "epoch": 0.77, "grad_norm": 1.913251939157364, "learning_rate": 2.569145095297192e-06, "loss": 0.763, "step": 4710 }, { "epoch": 0.77, "grad_norm": 2.0230528147169777, "learning_rate": 2.5655858714036054e-06, "loss": 0.7978, "step": 4711 }, { "epoch": 0.77, "grad_norm": 1.9155262603102952, "learning_rate": 2.5620287518052967e-06, "loss": 0.7611, "step": 4712 }, { "epoch": 0.77, "grad_norm": 1.7660824205149148, "learning_rate": 2.5584737375091016e-06, "loss": 0.7335, "step": 4713 }, { "epoch": 0.77, "grad_norm": 1.8355355954491612, "learning_rate": 2.554920829521259e-06, "loss": 0.7067, "step": 4714 }, { "epoch": 0.77, "grad_norm": 1.637875443642708, "learning_rate": 2.551370028847416e-06, "loss": 0.7759, "step": 4715 }, { "epoch": 0.77, "grad_norm": 1.8635908574415523, "learning_rate": 2.547821336492614e-06, "loss": 0.7906, "step": 4716 }, { "epoch": 0.77, "grad_norm": 1.8653528791305514, "learning_rate": 2.544274753461303e-06, "loss": 0.7688, "step": 4717 }, { "epoch": 0.77, "grad_norm": 2.0349217016041545, "learning_rate": 2.5407302807573387e-06, "loss": 0.7849, "step": 4718 }, { "epoch": 0.78, "grad_norm": 1.7990801819755753, "learning_rate": 2.5371879193839756e-06, "loss": 0.7583, "step": 4719 }, { "epoch": 0.78, "grad_norm": 2.65398046815226, "learning_rate": 2.5336476703438705e-06, "loss": 0.7917, "step": 4720 }, { "epoch": 0.78, "grad_norm": 4.648402229039534, "learning_rate": 2.530109534639085e-06, "loss": 0.6969, "step": 4721 }, { "epoch": 0.78, "grad_norm": 2.6927878444933575, "learning_rate": 2.5265735132710802e-06, "loss": 0.7488, "step": 4722 }, { "epoch": 0.78, "grad_norm": 1.9015512310846625, "learning_rate": 2.5230396072407204e-06, "loss": 0.7625, "step": 4723 }, { "epoch": 0.78, "grad_norm": 1.8982615628350974, "learning_rate": 2.519507817548269e-06, "loss": 0.7083, "step": 4724 }, { "epoch": 0.78, "grad_norm": 1.915658912427572, "learning_rate": 2.515978145193393e-06, "loss": 0.7101, "step": 4725 }, { "epoch": 0.78, "grad_norm": 2.333145500381029, "learning_rate": 2.512450591175157e-06, "loss": 0.7235, "step": 4726 }, { "epoch": 0.78, "grad_norm": 1.9861274355781815, "learning_rate": 2.508925156492027e-06, "loss": 0.7115, "step": 4727 }, { "epoch": 0.78, "grad_norm": 2.3509416386718596, "learning_rate": 2.5054018421418737e-06, "loss": 0.8254, "step": 4728 }, { "epoch": 0.78, "grad_norm": 2.0991520969264217, "learning_rate": 2.5018806491219627e-06, "loss": 0.7768, "step": 4729 }, { "epoch": 0.78, "grad_norm": 2.0915759299705763, "learning_rate": 2.4983615784289585e-06, "loss": 0.7697, "step": 4730 }, { "epoch": 0.78, "grad_norm": 2.018826651486894, "learning_rate": 2.494844631058927e-06, "loss": 0.7596, "step": 4731 }, { "epoch": 0.78, "grad_norm": 2.400663075694023, "learning_rate": 2.4913298080073344e-06, "loss": 0.7802, "step": 4732 }, { "epoch": 0.78, "grad_norm": 2.0535988223745227, "learning_rate": 2.487817110269042e-06, "loss": 0.8152, "step": 4733 }, { "epoch": 0.78, "grad_norm": 1.9250528491736438, "learning_rate": 2.4843065388383126e-06, "loss": 0.7769, "step": 4734 }, { "epoch": 0.78, "grad_norm": 0.5681175976806918, "learning_rate": 2.480798094708805e-06, "loss": 0.3265, "step": 4735 }, { "epoch": 0.78, "grad_norm": 2.5653910176837993, "learning_rate": 2.4772917788735786e-06, "loss": 0.7602, "step": 4736 }, { "epoch": 0.78, "grad_norm": 2.2007614846444348, "learning_rate": 2.473787592325091e-06, "loss": 0.7695, "step": 4737 }, { "epoch": 0.78, "grad_norm": 1.6994945610940158, "learning_rate": 2.470285536055188e-06, "loss": 0.7575, "step": 4738 }, { "epoch": 0.78, "grad_norm": 2.5795885202917552, "learning_rate": 2.4667856110551235e-06, "loss": 0.8179, "step": 4739 }, { "epoch": 0.78, "grad_norm": 2.046978641612059, "learning_rate": 2.463287818315543e-06, "loss": 0.7687, "step": 4740 }, { "epoch": 0.78, "grad_norm": 2.9989491594250097, "learning_rate": 2.4597921588264893e-06, "loss": 0.7417, "step": 4741 }, { "epoch": 0.78, "grad_norm": 2.0437212498763424, "learning_rate": 2.456298633577402e-06, "loss": 0.7544, "step": 4742 }, { "epoch": 0.78, "grad_norm": 2.469880276793193, "learning_rate": 2.4528072435571158e-06, "loss": 0.6794, "step": 4743 }, { "epoch": 0.78, "grad_norm": 2.161415978848464, "learning_rate": 2.449317989753862e-06, "loss": 0.7416, "step": 4744 }, { "epoch": 0.78, "grad_norm": 2.0961304659939564, "learning_rate": 2.445830873155266e-06, "loss": 0.7962, "step": 4745 }, { "epoch": 0.78, "grad_norm": 1.5106573194249973, "learning_rate": 2.4423458947483482e-06, "loss": 0.7686, "step": 4746 }, { "epoch": 0.78, "grad_norm": 1.9641955050175077, "learning_rate": 2.4388630555195247e-06, "loss": 0.6985, "step": 4747 }, { "epoch": 0.78, "grad_norm": 1.8479574277313455, "learning_rate": 2.4353823564546064e-06, "loss": 0.7416, "step": 4748 }, { "epoch": 0.78, "grad_norm": 2.3002896050800166, "learning_rate": 2.4319037985387985e-06, "loss": 0.7724, "step": 4749 }, { "epoch": 0.78, "grad_norm": 1.850411637184314, "learning_rate": 2.428427382756695e-06, "loss": 0.7636, "step": 4750 }, { "epoch": 0.78, "grad_norm": 2.1594271908975737, "learning_rate": 2.424953110092294e-06, "loss": 0.7974, "step": 4751 }, { "epoch": 0.78, "grad_norm": 2.1121758121112677, "learning_rate": 2.4214809815289797e-06, "loss": 0.8194, "step": 4752 }, { "epoch": 0.78, "grad_norm": 2.3217964191174145, "learning_rate": 2.4180109980495293e-06, "loss": 0.7216, "step": 4753 }, { "epoch": 0.78, "grad_norm": 0.5951342700856727, "learning_rate": 2.4145431606361148e-06, "loss": 0.3291, "step": 4754 }, { "epoch": 0.78, "grad_norm": 1.9047037656660646, "learning_rate": 2.4110774702703e-06, "loss": 0.74, "step": 4755 }, { "epoch": 0.78, "grad_norm": 2.3260011204613695, "learning_rate": 2.4076139279330414e-06, "loss": 0.8104, "step": 4756 }, { "epoch": 0.78, "grad_norm": 0.6216893248753094, "learning_rate": 2.4041525346046877e-06, "loss": 0.3641, "step": 4757 }, { "epoch": 0.78, "grad_norm": 1.8812531552278606, "learning_rate": 2.4006932912649816e-06, "loss": 0.7076, "step": 4758 }, { "epoch": 0.78, "grad_norm": 2.2466717544458623, "learning_rate": 2.3972361988930505e-06, "loss": 0.8058, "step": 4759 }, { "epoch": 0.78, "grad_norm": 1.9566426785448332, "learning_rate": 2.3937812584674168e-06, "loss": 0.7787, "step": 4760 }, { "epoch": 0.78, "grad_norm": 3.8522635768133595, "learning_rate": 2.3903284709659957e-06, "loss": 0.7911, "step": 4761 }, { "epoch": 0.78, "grad_norm": 2.305788390848154, "learning_rate": 2.3868778373660927e-06, "loss": 0.6902, "step": 4762 }, { "epoch": 0.78, "grad_norm": 1.8171160157335187, "learning_rate": 2.3834293586444e-06, "loss": 0.7192, "step": 4763 }, { "epoch": 0.78, "grad_norm": 0.6107338220520819, "learning_rate": 2.379983035777005e-06, "loss": 0.3139, "step": 4764 }, { "epoch": 0.78, "grad_norm": 2.0177528089269483, "learning_rate": 2.37653886973938e-06, "loss": 0.7727, "step": 4765 }, { "epoch": 0.78, "grad_norm": 2.5514215539648264, "learning_rate": 2.3730968615063886e-06, "loss": 0.7543, "step": 4766 }, { "epoch": 0.78, "grad_norm": 2.0542623477476343, "learning_rate": 2.3696570120522868e-06, "loss": 0.7959, "step": 4767 }, { "epoch": 0.78, "grad_norm": 1.9546905917414232, "learning_rate": 2.3662193223507135e-06, "loss": 0.5948, "step": 4768 }, { "epoch": 0.78, "grad_norm": 1.7122389485861056, "learning_rate": 2.362783793374701e-06, "loss": 0.7497, "step": 4769 }, { "epoch": 0.78, "grad_norm": 2.5590933340669086, "learning_rate": 2.3593504260966695e-06, "loss": 0.7315, "step": 4770 }, { "epoch": 0.78, "grad_norm": 1.754491960326002, "learning_rate": 2.355919221488424e-06, "loss": 0.7471, "step": 4771 }, { "epoch": 0.78, "grad_norm": 1.6957353371297665, "learning_rate": 2.352490180521162e-06, "loss": 0.7915, "step": 4772 }, { "epoch": 0.78, "grad_norm": 1.911107707791911, "learning_rate": 2.349063304165462e-06, "loss": 0.8106, "step": 4773 }, { "epoch": 0.78, "grad_norm": 14.43446376653241, "learning_rate": 2.345638593391302e-06, "loss": 0.7392, "step": 4774 }, { "epoch": 0.78, "grad_norm": 2.292828309982143, "learning_rate": 2.3422160491680334e-06, "loss": 0.7611, "step": 4775 }, { "epoch": 0.78, "grad_norm": 0.6083648578039649, "learning_rate": 2.3387956724644014e-06, "loss": 0.3436, "step": 4776 }, { "epoch": 0.78, "grad_norm": 3.007292283464697, "learning_rate": 2.3353774642485374e-06, "loss": 0.7697, "step": 4777 }, { "epoch": 0.78, "grad_norm": 0.5348097131483242, "learning_rate": 2.331961425487956e-06, "loss": 0.3522, "step": 4778 }, { "epoch": 0.78, "grad_norm": 1.9316140821882641, "learning_rate": 2.3285475571495617e-06, "loss": 0.652, "step": 4779 }, { "epoch": 0.79, "grad_norm": 1.8143998619199921, "learning_rate": 2.3251358601996453e-06, "loss": 0.7642, "step": 4780 }, { "epoch": 0.79, "grad_norm": 3.2166666092049807, "learning_rate": 2.3217263356038744e-06, "loss": 0.7459, "step": 4781 }, { "epoch": 0.79, "grad_norm": 1.768744481047633, "learning_rate": 2.31831898432731e-06, "loss": 0.775, "step": 4782 }, { "epoch": 0.79, "grad_norm": 1.8363369017982334, "learning_rate": 2.3149138073343958e-06, "loss": 0.7543, "step": 4783 }, { "epoch": 0.79, "grad_norm": 1.9103278764378, "learning_rate": 2.3115108055889614e-06, "loss": 0.7612, "step": 4784 }, { "epoch": 0.79, "grad_norm": 2.3853356359762703, "learning_rate": 2.3081099800542183e-06, "loss": 0.7679, "step": 4785 }, { "epoch": 0.79, "grad_norm": 3.454483283002957, "learning_rate": 2.3047113316927627e-06, "loss": 0.7128, "step": 4786 }, { "epoch": 0.79, "grad_norm": 1.9311484013480202, "learning_rate": 2.301314861466575e-06, "loss": 0.775, "step": 4787 }, { "epoch": 0.79, "grad_norm": 1.9254501917640838, "learning_rate": 2.297920570337019e-06, "loss": 0.7574, "step": 4788 }, { "epoch": 0.79, "grad_norm": 1.9553090425426622, "learning_rate": 2.294528459264842e-06, "loss": 0.7967, "step": 4789 }, { "epoch": 0.79, "grad_norm": 3.205283727865285, "learning_rate": 2.291138529210174e-06, "loss": 0.8509, "step": 4790 }, { "epoch": 0.79, "grad_norm": 2.182726515580972, "learning_rate": 2.287750781132527e-06, "loss": 0.7387, "step": 4791 }, { "epoch": 0.79, "grad_norm": 1.6606629749249977, "learning_rate": 2.284365215990797e-06, "loss": 0.7775, "step": 4792 }, { "epoch": 0.79, "grad_norm": 2.153563064019179, "learning_rate": 2.2809818347432598e-06, "loss": 0.8385, "step": 4793 }, { "epoch": 0.79, "grad_norm": 2.1695547873182095, "learning_rate": 2.2776006383475745e-06, "loss": 0.7745, "step": 4794 }, { "epoch": 0.79, "grad_norm": 2.0718016057456725, "learning_rate": 2.274221627760782e-06, "loss": 0.7932, "step": 4795 }, { "epoch": 0.79, "grad_norm": 2.3088402638548198, "learning_rate": 2.270844803939305e-06, "loss": 0.7588, "step": 4796 }, { "epoch": 0.79, "grad_norm": 1.7205737521867641, "learning_rate": 2.2674701678389423e-06, "loss": 0.684, "step": 4797 }, { "epoch": 0.79, "grad_norm": 0.6131690911051106, "learning_rate": 2.2640977204148838e-06, "loss": 0.3224, "step": 4798 }, { "epoch": 0.79, "grad_norm": 2.1423368875811377, "learning_rate": 2.26072746262169e-06, "loss": 0.7076, "step": 4799 }, { "epoch": 0.79, "grad_norm": 1.8170862925456048, "learning_rate": 2.2573593954133067e-06, "loss": 0.8193, "step": 4800 }, { "epoch": 0.79, "grad_norm": 2.117065438961735, "learning_rate": 2.2539935197430574e-06, "loss": 0.7559, "step": 4801 }, { "epoch": 0.79, "grad_norm": 1.6885292723582497, "learning_rate": 2.2506298365636482e-06, "loss": 0.7732, "step": 4802 }, { "epoch": 0.79, "grad_norm": 2.455294346279633, "learning_rate": 2.2472683468271584e-06, "loss": 0.7915, "step": 4803 }, { "epoch": 0.79, "grad_norm": 0.5947594361080165, "learning_rate": 2.2439090514850527e-06, "loss": 0.3341, "step": 4804 }, { "epoch": 0.79, "grad_norm": 2.124209847623615, "learning_rate": 2.2405519514881723e-06, "loss": 0.7664, "step": 4805 }, { "epoch": 0.79, "grad_norm": 2.688858155948713, "learning_rate": 2.2371970477867377e-06, "loss": 0.7769, "step": 4806 }, { "epoch": 0.79, "grad_norm": 5.042814185869373, "learning_rate": 2.2338443413303466e-06, "loss": 0.6922, "step": 4807 }, { "epoch": 0.79, "grad_norm": 2.7648600286356966, "learning_rate": 2.230493833067977e-06, "loss": 0.7864, "step": 4808 }, { "epoch": 0.79, "grad_norm": 1.7027472555050513, "learning_rate": 2.2271455239479822e-06, "loss": 0.7093, "step": 4809 }, { "epoch": 0.79, "grad_norm": 1.9835224213113125, "learning_rate": 2.2237994149180943e-06, "loss": 0.774, "step": 4810 }, { "epoch": 0.79, "grad_norm": 1.9351132542083513, "learning_rate": 2.220455506925422e-06, "loss": 0.8547, "step": 4811 }, { "epoch": 0.79, "grad_norm": 1.8460679638377682, "learning_rate": 2.2171138009164515e-06, "loss": 0.754, "step": 4812 }, { "epoch": 0.79, "grad_norm": 1.836875770812531, "learning_rate": 2.213774297837047e-06, "loss": 0.7656, "step": 4813 }, { "epoch": 0.79, "grad_norm": 12.201331148624472, "learning_rate": 2.210436998632446e-06, "loss": 0.8152, "step": 4814 }, { "epoch": 0.79, "grad_norm": 1.7914869858613058, "learning_rate": 2.2071019042472643e-06, "loss": 0.8257, "step": 4815 }, { "epoch": 0.79, "grad_norm": 10.46809278653403, "learning_rate": 2.2037690156254944e-06, "loss": 0.7119, "step": 4816 }, { "epoch": 0.79, "grad_norm": 1.8230767252231463, "learning_rate": 2.2004383337105016e-06, "loss": 0.7295, "step": 4817 }, { "epoch": 0.79, "grad_norm": 1.8302866010219356, "learning_rate": 2.1971098594450315e-06, "loss": 0.8062, "step": 4818 }, { "epoch": 0.79, "grad_norm": 0.5885417224252375, "learning_rate": 2.1937835937711995e-06, "loss": 0.3032, "step": 4819 }, { "epoch": 0.79, "grad_norm": 0.6071179784053746, "learning_rate": 2.190459537630495e-06, "loss": 0.3388, "step": 4820 }, { "epoch": 0.79, "grad_norm": 1.5154114926238127, "learning_rate": 2.187137691963791e-06, "loss": 0.75, "step": 4821 }, { "epoch": 0.79, "grad_norm": 1.6665722967019854, "learning_rate": 2.1838180577113268e-06, "loss": 0.8014, "step": 4822 }, { "epoch": 0.79, "grad_norm": 1.9952421718726705, "learning_rate": 2.1805006358127213e-06, "loss": 0.7049, "step": 4823 }, { "epoch": 0.79, "grad_norm": 1.7448524499972797, "learning_rate": 2.177185427206956e-06, "loss": 0.7851, "step": 4824 }, { "epoch": 0.79, "grad_norm": 1.8761920672871983, "learning_rate": 2.173872432832398e-06, "loss": 0.7564, "step": 4825 }, { "epoch": 0.79, "grad_norm": 1.811991262128728, "learning_rate": 2.1705616536267838e-06, "loss": 0.7347, "step": 4826 }, { "epoch": 0.79, "grad_norm": 2.8517273387555035, "learning_rate": 2.1672530905272215e-06, "loss": 0.7656, "step": 4827 }, { "epoch": 0.79, "grad_norm": 2.2729430535236177, "learning_rate": 2.1639467444701934e-06, "loss": 0.7735, "step": 4828 }, { "epoch": 0.79, "grad_norm": 2.066481387555926, "learning_rate": 2.160642616391553e-06, "loss": 0.7539, "step": 4829 }, { "epoch": 0.79, "grad_norm": 1.771043275922715, "learning_rate": 2.1573407072265284e-06, "loss": 0.7766, "step": 4830 }, { "epoch": 0.79, "grad_norm": 1.997437188749581, "learning_rate": 2.1540410179097173e-06, "loss": 0.7713, "step": 4831 }, { "epoch": 0.79, "grad_norm": 2.0256374257667176, "learning_rate": 2.1507435493750885e-06, "loss": 0.7777, "step": 4832 }, { "epoch": 0.79, "grad_norm": 1.7228833973295778, "learning_rate": 2.1474483025559857e-06, "loss": 0.7578, "step": 4833 }, { "epoch": 0.79, "grad_norm": 1.7474131395687806, "learning_rate": 2.1441552783851195e-06, "loss": 0.6584, "step": 4834 }, { "epoch": 0.79, "grad_norm": 1.8918458750971503, "learning_rate": 2.1408644777945753e-06, "loss": 0.7656, "step": 4835 }, { "epoch": 0.79, "grad_norm": 2.443965338127711, "learning_rate": 2.137575901715806e-06, "loss": 0.7702, "step": 4836 }, { "epoch": 0.79, "grad_norm": 2.749619726860219, "learning_rate": 2.1342895510796367e-06, "loss": 0.8131, "step": 4837 }, { "epoch": 0.79, "grad_norm": 2.0071504635133635, "learning_rate": 2.1310054268162628e-06, "loss": 0.8008, "step": 4838 }, { "epoch": 0.79, "grad_norm": 2.0995457352891482, "learning_rate": 2.127723529855248e-06, "loss": 0.711, "step": 4839 }, { "epoch": 0.79, "grad_norm": 2.138501435093979, "learning_rate": 2.124443861125525e-06, "loss": 0.7393, "step": 4840 }, { "epoch": 0.8, "grad_norm": 2.092328332183162, "learning_rate": 2.1211664215553997e-06, "loss": 0.8142, "step": 4841 }, { "epoch": 0.8, "grad_norm": 2.1029672973398594, "learning_rate": 2.1178912120725416e-06, "loss": 0.7099, "step": 4842 }, { "epoch": 0.8, "grad_norm": 2.346531459481881, "learning_rate": 2.114618233603992e-06, "loss": 0.7534, "step": 4843 }, { "epoch": 0.8, "grad_norm": 2.6036671107108367, "learning_rate": 2.111347487076164e-06, "loss": 0.7103, "step": 4844 }, { "epoch": 0.8, "grad_norm": 2.682622661211311, "learning_rate": 2.1080789734148366e-06, "loss": 0.6967, "step": 4845 }, { "epoch": 0.8, "grad_norm": 2.081153228609139, "learning_rate": 2.1048126935451495e-06, "loss": 0.7736, "step": 4846 }, { "epoch": 0.8, "grad_norm": 1.9181825779489727, "learning_rate": 2.10154864839162e-06, "loss": 0.7914, "step": 4847 }, { "epoch": 0.8, "grad_norm": 1.607477340493648, "learning_rate": 2.0982868388781286e-06, "loss": 0.7746, "step": 4848 }, { "epoch": 0.8, "grad_norm": 1.8439611329359131, "learning_rate": 2.0950272659279246e-06, "loss": 0.7938, "step": 4849 }, { "epoch": 0.8, "grad_norm": 1.5986796342097853, "learning_rate": 2.091769930463621e-06, "loss": 0.8358, "step": 4850 }, { "epoch": 0.8, "grad_norm": 1.9163114510165284, "learning_rate": 2.0885148334072013e-06, "loss": 0.7558, "step": 4851 }, { "epoch": 0.8, "grad_norm": 1.5547186461335394, "learning_rate": 2.085261975680014e-06, "loss": 0.7426, "step": 4852 }, { "epoch": 0.8, "grad_norm": 1.896501837298518, "learning_rate": 2.0820113582027734e-06, "loss": 0.8357, "step": 4853 }, { "epoch": 0.8, "grad_norm": 2.4111578267012237, "learning_rate": 2.07876298189556e-06, "loss": 0.7551, "step": 4854 }, { "epoch": 0.8, "grad_norm": 2.228742291049511, "learning_rate": 2.07551684767782e-06, "loss": 0.8564, "step": 4855 }, { "epoch": 0.8, "grad_norm": 1.633548220790239, "learning_rate": 2.072272956468364e-06, "loss": 0.7362, "step": 4856 }, { "epoch": 0.8, "grad_norm": 1.8144674700932897, "learning_rate": 2.0690313091853697e-06, "loss": 0.7695, "step": 4857 }, { "epoch": 0.8, "grad_norm": 2.2755903042844157, "learning_rate": 2.0657919067463773e-06, "loss": 0.7607, "step": 4858 }, { "epoch": 0.8, "grad_norm": 2.0428430325429754, "learning_rate": 2.062554750068294e-06, "loss": 0.7843, "step": 4859 }, { "epoch": 0.8, "grad_norm": 2.2587607288182605, "learning_rate": 2.05931984006739e-06, "loss": 0.8163, "step": 4860 }, { "epoch": 0.8, "grad_norm": 2.0293939744281597, "learning_rate": 2.0560871776592996e-06, "loss": 0.7465, "step": 4861 }, { "epoch": 0.8, "grad_norm": 19.736652939243108, "learning_rate": 2.0528567637590214e-06, "loss": 0.7762, "step": 4862 }, { "epoch": 0.8, "grad_norm": 1.7147268966449618, "learning_rate": 2.0496285992809163e-06, "loss": 0.8397, "step": 4863 }, { "epoch": 0.8, "grad_norm": 2.209467300696268, "learning_rate": 2.0464026851387096e-06, "loss": 0.7668, "step": 4864 }, { "epoch": 0.8, "grad_norm": 2.0453210801365547, "learning_rate": 2.0431790222454906e-06, "loss": 0.7814, "step": 4865 }, { "epoch": 0.8, "grad_norm": 3.0258339162264782, "learning_rate": 2.03995761151371e-06, "loss": 0.7309, "step": 4866 }, { "epoch": 0.8, "grad_norm": 1.8117475021722445, "learning_rate": 2.0367384538551805e-06, "loss": 0.7832, "step": 4867 }, { "epoch": 0.8, "grad_norm": 2.595240277245247, "learning_rate": 2.033521550181078e-06, "loss": 0.8019, "step": 4868 }, { "epoch": 0.8, "grad_norm": 1.959825606999622, "learning_rate": 2.0303069014019415e-06, "loss": 0.7947, "step": 4869 }, { "epoch": 0.8, "grad_norm": 3.318322465441555, "learning_rate": 2.0270945084276695e-06, "loss": 0.6797, "step": 4870 }, { "epoch": 0.8, "grad_norm": 3.0631648062061227, "learning_rate": 2.0238843721675226e-06, "loss": 0.7222, "step": 4871 }, { "epoch": 0.8, "grad_norm": 2.0076018658102397, "learning_rate": 2.020676493530126e-06, "loss": 0.7853, "step": 4872 }, { "epoch": 0.8, "grad_norm": 1.7996647448981247, "learning_rate": 2.0174708734234596e-06, "loss": 0.7972, "step": 4873 }, { "epoch": 0.8, "grad_norm": 1.7980760245521654, "learning_rate": 2.0142675127548684e-06, "loss": 0.7405, "step": 4874 }, { "epoch": 0.8, "grad_norm": 2.0650482977830245, "learning_rate": 2.0110664124310574e-06, "loss": 0.7675, "step": 4875 }, { "epoch": 0.8, "grad_norm": 1.787766803356601, "learning_rate": 2.007867573358091e-06, "loss": 0.8135, "step": 4876 }, { "epoch": 0.8, "grad_norm": 2.036514894324938, "learning_rate": 2.0046709964413947e-06, "loss": 0.6407, "step": 4877 }, { "epoch": 0.8, "grad_norm": 0.6102945724517325, "learning_rate": 2.0014766825857514e-06, "loss": 0.313, "step": 4878 }, { "epoch": 0.8, "grad_norm": 1.9716278414957564, "learning_rate": 1.9982846326953066e-06, "loss": 0.7812, "step": 4879 }, { "epoch": 0.8, "grad_norm": 2.409415864299375, "learning_rate": 1.995094847673561e-06, "loss": 0.7613, "step": 4880 }, { "epoch": 0.8, "grad_norm": 1.9103578726910406, "learning_rate": 1.991907328423379e-06, "loss": 0.7119, "step": 4881 }, { "epoch": 0.8, "grad_norm": 1.7973395648581711, "learning_rate": 1.9887220758469794e-06, "loss": 0.7752, "step": 4882 }, { "epoch": 0.8, "grad_norm": 2.386246668170741, "learning_rate": 1.985539090845943e-06, "loss": 0.7331, "step": 4883 }, { "epoch": 0.8, "grad_norm": 1.9003306302674612, "learning_rate": 1.982358374321205e-06, "loss": 0.8391, "step": 4884 }, { "epoch": 0.8, "grad_norm": 3.154280879743576, "learning_rate": 1.9791799271730626e-06, "loss": 0.8072, "step": 4885 }, { "epoch": 0.8, "grad_norm": 2.073950844926521, "learning_rate": 1.9760037503011664e-06, "loss": 0.7707, "step": 4886 }, { "epoch": 0.8, "grad_norm": 2.004833117373423, "learning_rate": 1.972829844604528e-06, "loss": 0.7249, "step": 4887 }, { "epoch": 0.8, "grad_norm": 1.675259880072528, "learning_rate": 1.9696582109815145e-06, "loss": 0.7031, "step": 4888 }, { "epoch": 0.8, "grad_norm": 2.137843111028765, "learning_rate": 1.966488850329851e-06, "loss": 0.7404, "step": 4889 }, { "epoch": 0.8, "grad_norm": 2.062317950503516, "learning_rate": 1.9633217635466164e-06, "loss": 0.8192, "step": 4890 }, { "epoch": 0.8, "grad_norm": 1.601424217950457, "learning_rate": 1.960156951528248e-06, "loss": 0.7804, "step": 4891 }, { "epoch": 0.8, "grad_norm": 2.084300197291579, "learning_rate": 1.9569944151705423e-06, "loss": 0.7584, "step": 4892 }, { "epoch": 0.8, "grad_norm": 2.272125709606073, "learning_rate": 1.9538341553686446e-06, "loss": 0.7966, "step": 4893 }, { "epoch": 0.8, "grad_norm": 2.231888796779771, "learning_rate": 1.950676173017062e-06, "loss": 0.7804, "step": 4894 }, { "epoch": 0.8, "grad_norm": 1.8932733716416141, "learning_rate": 1.947520469009655e-06, "loss": 0.6857, "step": 4895 }, { "epoch": 0.8, "grad_norm": 2.466813118158042, "learning_rate": 1.9443670442396378e-06, "loss": 0.7523, "step": 4896 }, { "epoch": 0.8, "grad_norm": 2.950072772453807, "learning_rate": 1.941215899599581e-06, "loss": 0.7523, "step": 4897 }, { "epoch": 0.8, "grad_norm": 3.164654662719005, "learning_rate": 1.93806703598141e-06, "loss": 0.7881, "step": 4898 }, { "epoch": 0.8, "grad_norm": 2.480400722043341, "learning_rate": 1.9349204542764044e-06, "loss": 0.7676, "step": 4899 }, { "epoch": 0.8, "grad_norm": 1.5660697399508428, "learning_rate": 1.9317761553751957e-06, "loss": 0.7565, "step": 4900 }, { "epoch": 0.8, "grad_norm": 2.1079201997095605, "learning_rate": 1.928634140167772e-06, "loss": 0.7086, "step": 4901 }, { "epoch": 0.81, "grad_norm": 2.3920866354645196, "learning_rate": 1.9254944095434745e-06, "loss": 0.7878, "step": 4902 }, { "epoch": 0.81, "grad_norm": 2.0598009386319984, "learning_rate": 1.9223569643909978e-06, "loss": 0.7512, "step": 4903 }, { "epoch": 0.81, "grad_norm": 1.7852088814433789, "learning_rate": 1.919221805598388e-06, "loss": 0.7021, "step": 4904 }, { "epoch": 0.81, "grad_norm": 1.6862702462150916, "learning_rate": 1.9160889340530455e-06, "loss": 0.7877, "step": 4905 }, { "epoch": 0.81, "grad_norm": 2.213836127682183, "learning_rate": 1.9129583506417236e-06, "loss": 0.7188, "step": 4906 }, { "epoch": 0.81, "grad_norm": 1.9249829774701117, "learning_rate": 1.9098300562505266e-06, "loss": 0.8003, "step": 4907 }, { "epoch": 0.81, "grad_norm": 1.7821640628215498, "learning_rate": 1.9067040517649115e-06, "loss": 0.7512, "step": 4908 }, { "epoch": 0.81, "grad_norm": 2.4222399627170654, "learning_rate": 1.9035803380696883e-06, "loss": 0.7505, "step": 4909 }, { "epoch": 0.81, "grad_norm": 4.7143283205421085, "learning_rate": 1.9004589160490173e-06, "loss": 0.7285, "step": 4910 }, { "epoch": 0.81, "grad_norm": 2.3535996558735035, "learning_rate": 1.8973397865864095e-06, "loss": 0.8046, "step": 4911 }, { "epoch": 0.81, "grad_norm": 2.1242381932601515, "learning_rate": 1.8942229505647292e-06, "loss": 0.7244, "step": 4912 }, { "epoch": 0.81, "grad_norm": 2.1843814405017508, "learning_rate": 1.8911084088661903e-06, "loss": 0.7276, "step": 4913 }, { "epoch": 0.81, "grad_norm": 1.9185311404702952, "learning_rate": 1.8879961623723553e-06, "loss": 0.7162, "step": 4914 }, { "epoch": 0.81, "grad_norm": 1.8306884939274948, "learning_rate": 1.884886211964141e-06, "loss": 0.7472, "step": 4915 }, { "epoch": 0.81, "grad_norm": 1.9222973775773637, "learning_rate": 1.8817785585218118e-06, "loss": 0.723, "step": 4916 }, { "epoch": 0.81, "grad_norm": 0.6241644768360278, "learning_rate": 1.878673202924982e-06, "loss": 0.3382, "step": 4917 }, { "epoch": 0.81, "grad_norm": 2.411886672131599, "learning_rate": 1.8755701460526166e-06, "loss": 0.7793, "step": 4918 }, { "epoch": 0.81, "grad_norm": 1.79663588999589, "learning_rate": 1.8724693887830292e-06, "loss": 0.8043, "step": 4919 }, { "epoch": 0.81, "grad_norm": 1.5384700632961978, "learning_rate": 1.8693709319938824e-06, "loss": 0.7311, "step": 4920 }, { "epoch": 0.81, "grad_norm": 2.2931098649900004, "learning_rate": 1.866274776562188e-06, "loss": 0.8008, "step": 4921 }, { "epoch": 0.81, "grad_norm": 2.2482755861163364, "learning_rate": 1.863180923364306e-06, "loss": 0.8375, "step": 4922 }, { "epoch": 0.81, "grad_norm": 1.9582793500369766, "learning_rate": 1.860089373275945e-06, "loss": 0.7131, "step": 4923 }, { "epoch": 0.81, "grad_norm": 2.2957416652279146, "learning_rate": 1.8570001271721627e-06, "loss": 0.7474, "step": 4924 }, { "epoch": 0.81, "grad_norm": 2.8983361532396694, "learning_rate": 1.8539131859273628e-06, "loss": 0.7312, "step": 4925 }, { "epoch": 0.81, "grad_norm": 2.6907322746992577, "learning_rate": 1.8508285504152979e-06, "loss": 0.6987, "step": 4926 }, { "epoch": 0.81, "grad_norm": 1.9187463673807583, "learning_rate": 1.847746221509067e-06, "loss": 0.7979, "step": 4927 }, { "epoch": 0.81, "grad_norm": 1.7443486573451024, "learning_rate": 1.8446662000811177e-06, "loss": 0.7815, "step": 4928 }, { "epoch": 0.81, "grad_norm": 2.7848463714876437, "learning_rate": 1.841588487003243e-06, "loss": 0.7385, "step": 4929 }, { "epoch": 0.81, "grad_norm": 1.8253783760139872, "learning_rate": 1.8385130831465837e-06, "loss": 0.7952, "step": 4930 }, { "epoch": 0.81, "grad_norm": 1.9674723875262436, "learning_rate": 1.8354399893816255e-06, "loss": 0.7355, "step": 4931 }, { "epoch": 0.81, "grad_norm": 1.7182371709425106, "learning_rate": 1.8323692065782018e-06, "loss": 0.8403, "step": 4932 }, { "epoch": 0.81, "grad_norm": 2.118598429284993, "learning_rate": 1.8293007356054903e-06, "loss": 0.7439, "step": 4933 }, { "epoch": 0.81, "grad_norm": 3.4084415337273257, "learning_rate": 1.8262345773320167e-06, "loss": 0.7058, "step": 4934 }, { "epoch": 0.81, "grad_norm": 1.887597508439431, "learning_rate": 1.8231707326256498e-06, "loss": 0.7992, "step": 4935 }, { "epoch": 0.81, "grad_norm": 4.544820758025274, "learning_rate": 1.8201092023536048e-06, "loss": 0.8059, "step": 4936 }, { "epoch": 0.81, "grad_norm": 2.210456093959715, "learning_rate": 1.817049987382441e-06, "loss": 0.7611, "step": 4937 }, { "epoch": 0.81, "grad_norm": 2.235171529344506, "learning_rate": 1.8139930885780621e-06, "loss": 0.6751, "step": 4938 }, { "epoch": 0.81, "grad_norm": 2.726769137114403, "learning_rate": 1.8109385068057183e-06, "loss": 0.756, "step": 4939 }, { "epoch": 0.81, "grad_norm": 2.3248906488791548, "learning_rate": 1.8078862429300015e-06, "loss": 0.7216, "step": 4940 }, { "epoch": 0.81, "grad_norm": 1.767870032392336, "learning_rate": 1.8048362978148492e-06, "loss": 0.7497, "step": 4941 }, { "epoch": 0.81, "grad_norm": 1.6402625570832943, "learning_rate": 1.8017886723235423e-06, "loss": 0.7015, "step": 4942 }, { "epoch": 0.81, "grad_norm": 1.9992929083466209, "learning_rate": 1.7987433673187026e-06, "loss": 0.8196, "step": 4943 }, { "epoch": 0.81, "grad_norm": 1.7295259090837911, "learning_rate": 1.7957003836623e-06, "loss": 0.7652, "step": 4944 }, { "epoch": 0.81, "grad_norm": 1.8113911935373603, "learning_rate": 1.7926597222156438e-06, "loss": 0.7896, "step": 4945 }, { "epoch": 0.81, "grad_norm": 2.044982168984008, "learning_rate": 1.789621383839386e-06, "loss": 0.7894, "step": 4946 }, { "epoch": 0.81, "grad_norm": 1.7261127305938795, "learning_rate": 1.786585369393522e-06, "loss": 0.7769, "step": 4947 }, { "epoch": 0.81, "grad_norm": 1.5632156817273262, "learning_rate": 1.7835516797373908e-06, "loss": 0.7632, "step": 4948 }, { "epoch": 0.81, "grad_norm": 2.420487053983332, "learning_rate": 1.7805203157296692e-06, "loss": 0.6896, "step": 4949 }, { "epoch": 0.81, "grad_norm": 2.8521548058234854, "learning_rate": 1.7774912782283815e-06, "loss": 0.792, "step": 4950 }, { "epoch": 0.81, "grad_norm": 4.523571474491748, "learning_rate": 1.7744645680908878e-06, "loss": 0.7652, "step": 4951 }, { "epoch": 0.81, "grad_norm": 0.6273230977538957, "learning_rate": 1.771440186173894e-06, "loss": 0.3355, "step": 4952 }, { "epoch": 0.81, "grad_norm": 2.977974987255861, "learning_rate": 1.7684181333334437e-06, "loss": 0.8182, "step": 4953 }, { "epoch": 0.81, "grad_norm": 1.6412187683178845, "learning_rate": 1.7653984104249221e-06, "loss": 0.7026, "step": 4954 }, { "epoch": 0.81, "grad_norm": 1.9746445923492286, "learning_rate": 1.7623810183030576e-06, "loss": 0.7897, "step": 4955 }, { "epoch": 0.81, "grad_norm": 1.9632691745479514, "learning_rate": 1.7593659578219147e-06, "loss": 0.7077, "step": 4956 }, { "epoch": 0.81, "grad_norm": 1.9441378776473908, "learning_rate": 1.7563532298349018e-06, "loss": 0.7716, "step": 4957 }, { "epoch": 0.81, "grad_norm": 1.9096984084451882, "learning_rate": 1.7533428351947634e-06, "loss": 0.742, "step": 4958 }, { "epoch": 0.81, "grad_norm": 2.637885399446534, "learning_rate": 1.7503347747535859e-06, "loss": 0.8055, "step": 4959 }, { "epoch": 0.81, "grad_norm": 1.8752735515396488, "learning_rate": 1.7473290493627948e-06, "loss": 0.782, "step": 4960 }, { "epoch": 0.81, "grad_norm": 2.194421599014929, "learning_rate": 1.744325659873154e-06, "loss": 0.6985, "step": 4961 }, { "epoch": 0.81, "grad_norm": 2.597704816968669, "learning_rate": 1.7413246071347667e-06, "loss": 0.7278, "step": 4962 }, { "epoch": 0.82, "grad_norm": 2.2500928148154387, "learning_rate": 1.7383258919970746e-06, "loss": 0.7244, "step": 4963 }, { "epoch": 0.82, "grad_norm": 2.34234287670994, "learning_rate": 1.7353295153088578e-06, "loss": 0.7094, "step": 4964 }, { "epoch": 0.82, "grad_norm": 1.5849794883308612, "learning_rate": 1.7323354779182345e-06, "loss": 0.785, "step": 4965 }, { "epoch": 0.82, "grad_norm": 2.0766115549527533, "learning_rate": 1.729343780672661e-06, "loss": 0.7182, "step": 4966 }, { "epoch": 0.82, "grad_norm": 2.04541960744969, "learning_rate": 1.7263544244189302e-06, "loss": 0.7794, "step": 4967 }, { "epoch": 0.82, "grad_norm": 1.8548581307414098, "learning_rate": 1.7233674100031728e-06, "loss": 0.754, "step": 4968 }, { "epoch": 0.82, "grad_norm": 1.6533000729427618, "learning_rate": 1.7203827382708582e-06, "loss": 0.7375, "step": 4969 }, { "epoch": 0.82, "grad_norm": 1.8975286223580417, "learning_rate": 1.7174004100667907e-06, "loss": 0.8025, "step": 4970 }, { "epoch": 0.82, "grad_norm": 1.7682573876833652, "learning_rate": 1.7144204262351116e-06, "loss": 0.7557, "step": 4971 }, { "epoch": 0.82, "grad_norm": 2.1371046746796, "learning_rate": 1.7114427876192996e-06, "loss": 0.7888, "step": 4972 }, { "epoch": 0.82, "grad_norm": 1.8645766414196696, "learning_rate": 1.7084674950621694e-06, "loss": 0.7259, "step": 4973 }, { "epoch": 0.82, "grad_norm": 0.5920842834890454, "learning_rate": 1.7054945494058705e-06, "loss": 0.367, "step": 4974 }, { "epoch": 0.82, "grad_norm": 1.8768025165144668, "learning_rate": 1.7025239514918913e-06, "loss": 0.6866, "step": 4975 }, { "epoch": 0.82, "grad_norm": 2.34019238673976, "learning_rate": 1.6995557021610477e-06, "loss": 0.7284, "step": 4976 }, { "epoch": 0.82, "grad_norm": 2.086482657909838, "learning_rate": 1.696589802253501e-06, "loss": 0.7856, "step": 4977 }, { "epoch": 0.82, "grad_norm": 2.1737882687059997, "learning_rate": 1.6936262526087432e-06, "loss": 0.7438, "step": 4978 }, { "epoch": 0.82, "grad_norm": 1.7448282315297756, "learning_rate": 1.690665054065599e-06, "loss": 0.8076, "step": 4979 }, { "epoch": 0.82, "grad_norm": 3.16704661443459, "learning_rate": 1.6877062074622296e-06, "loss": 0.7581, "step": 4980 }, { "epoch": 0.82, "grad_norm": 1.7068061985304794, "learning_rate": 1.6847497136361312e-06, "loss": 0.7884, "step": 4981 }, { "epoch": 0.82, "grad_norm": 2.577608815417296, "learning_rate": 1.6817955734241321e-06, "loss": 0.6747, "step": 4982 }, { "epoch": 0.82, "grad_norm": 2.1789023215062584, "learning_rate": 1.6788437876623963e-06, "loss": 0.652, "step": 4983 }, { "epoch": 0.82, "grad_norm": 2.1882475869542657, "learning_rate": 1.6758943571864206e-06, "loss": 0.7193, "step": 4984 }, { "epoch": 0.82, "grad_norm": 1.7465110758741424, "learning_rate": 1.6729472828310334e-06, "loss": 0.8186, "step": 4985 }, { "epoch": 0.82, "grad_norm": 1.8950716910087055, "learning_rate": 1.670002565430401e-06, "loss": 0.7288, "step": 4986 }, { "epoch": 0.82, "grad_norm": 0.6120100534277121, "learning_rate": 1.6670602058180164e-06, "loss": 0.3407, "step": 4987 }, { "epoch": 0.82, "grad_norm": 1.711211692820243, "learning_rate": 1.6641202048267102e-06, "loss": 0.7281, "step": 4988 }, { "epoch": 0.82, "grad_norm": 13.048720172738255, "learning_rate": 1.6611825632886424e-06, "loss": 0.7665, "step": 4989 }, { "epoch": 0.82, "grad_norm": 0.6279933857764183, "learning_rate": 1.658247282035307e-06, "loss": 0.3483, "step": 4990 }, { "epoch": 0.82, "grad_norm": 2.6165840511630303, "learning_rate": 1.6553143618975288e-06, "loss": 0.7863, "step": 4991 }, { "epoch": 0.82, "grad_norm": 2.1887799806347, "learning_rate": 1.6523838037054652e-06, "loss": 0.7298, "step": 4992 }, { "epoch": 0.82, "grad_norm": 2.1382072771317198, "learning_rate": 1.6494556082886038e-06, "loss": 0.7314, "step": 4993 }, { "epoch": 0.82, "grad_norm": 1.9491521467515243, "learning_rate": 1.646529776475765e-06, "loss": 0.832, "step": 4994 }, { "epoch": 0.82, "grad_norm": 1.9429220185236444, "learning_rate": 1.6436063090950982e-06, "loss": 0.7167, "step": 4995 }, { "epoch": 0.82, "grad_norm": 2.2391414885851577, "learning_rate": 1.6406852069740876e-06, "loss": 0.8789, "step": 4996 }, { "epoch": 0.82, "grad_norm": 2.08307257009979, "learning_rate": 1.6377664709395403e-06, "loss": 0.761, "step": 4997 }, { "epoch": 0.82, "grad_norm": 1.903689941768313, "learning_rate": 1.634850101817601e-06, "loss": 0.7286, "step": 4998 }, { "epoch": 0.82, "grad_norm": 2.290484130003856, "learning_rate": 1.631936100433742e-06, "loss": 0.7608, "step": 4999 }, { "epoch": 0.82, "grad_norm": 2.9333589811537433, "learning_rate": 1.629024467612762e-06, "loss": 0.8666, "step": 5000 } ], "logging_steps": 1.0, "max_steps": 6088, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 2829058636021760.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }