diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -11,17655 +11,17655 @@ "log_history": [ { "epoch": 0.001203659123736158, - "grad_norm": 4.21875, - "learning_rate": 2.5919093035841547e-05, - "loss": 1.9379, + "grad_norm": 5.4375, + "learning_rate": 1.0334365714285713e-05, + "loss": 1.9508, "step": 5 }, { "epoch": 0.002407318247472316, - "grad_norm": 4.09375, - "learning_rate": 5.831795933064347e-05, - "loss": 1.8071, + "grad_norm": 3.546875, + "learning_rate": 2.3252322857142852e-05, + "loss": 1.902, "step": 10 }, { "epoch": 0.0036109773712084737, - "grad_norm": 3.015625, - "learning_rate": 9.071682562544542e-05, - "loss": 1.6819, + "grad_norm": 3.09375, + "learning_rate": 3.617028e-05, + "loss": 1.8219, "step": 15 }, { "epoch": 0.004814636494944632, - "grad_norm": 3.015625, - "learning_rate": 0.00012311569192024734, - "loss": 1.5538, + "grad_norm": 3.125, + "learning_rate": 4.908823714285714e-05, + "loss": 1.7196, "step": 20 }, { "epoch": 0.00601829561868079, - "grad_norm": 3.078125, - "learning_rate": 0.0001555145582150493, - "loss": 1.4964, + "grad_norm": 2.984375, + "learning_rate": 6.200619428571428e-05, + "loss": 1.6708, "step": 25 }, { "epoch": 0.007221954742416947, - "grad_norm": 3.03125, - "learning_rate": 0.00018791342450985124, - "loss": 1.4156, + "grad_norm": 2.828125, + "learning_rate": 7.492415142857143e-05, + "loss": 1.5652, "step": 30 }, { "epoch": 0.008425613866153106, - "grad_norm": 2.96875, - "learning_rate": 0.00022031229080465316, - "loss": 1.374, + "grad_norm": 2.65625, + "learning_rate": 8.784210857142857e-05, + "loss": 1.5083, "step": 35 }, { "epoch": 0.009629272989889264, - "grad_norm": 3.328125, - "learning_rate": 0.0002267920205809419, - "loss": 1.3175, + "grad_norm": 2.640625, + "learning_rate": 9.042568266274864e-05, + "loss": 1.4327, "step": 40 }, { "epoch": 0.010832932113625422, "grad_norm": 2.859375, - "learning_rate": 0.0002267918439326645, - "loss": 1.3111, + "learning_rate": 9.042561223019535e-05, + "loss": 1.4151, "step": 45 }, { "epoch": 0.01203659123736158, - "grad_norm": 3.0, - "learning_rate": 0.00022679153140139644, - "loss": 1.262, + "grad_norm": 2.9375, + "learning_rate": 9.04254876188744e-05, + "loss": 1.3557, "step": 50 }, { "epoch": 0.013240250361097737, - "grad_norm": 3.0625, - "learning_rate": 0.00022679108298763706, - "loss": 1.2531, + "grad_norm": 2.71875, + "learning_rate": 9.042530882898486e-05, + "loss": 1.3445, "step": 55 }, { "epoch": 0.014443909484833895, - "grad_norm": 2.953125, - "learning_rate": 0.00022679049869210282, - "loss": 1.2081, + "grad_norm": 2.546875, + "learning_rate": 9.04250758608124e-05, + "loss": 1.3013, "step": 60 }, { "epoch": 0.015647568608570053, - "grad_norm": 2.71875, - "learning_rate": 0.00022678977851572723, - "loss": 1.2061, + "grad_norm": 2.75, + "learning_rate": 9.042478871472926e-05, + "loss": 1.3059, "step": 65 }, { "epoch": 0.016851227732306212, - "grad_norm": 2.953125, - "learning_rate": 0.00022678892245966094, - "loss": 1.1786, + "grad_norm": 3.09375, + "learning_rate": 9.042444739119417e-05, + "loss": 1.276, "step": 70 }, { "epoch": 0.018054886856042368, - "grad_norm": 2.453125, - "learning_rate": 0.00022678793052527177, - "loss": 1.1732, + "grad_norm": 2.84375, + "learning_rate": 9.042405189075255e-05, + "loss": 1.2679, "step": 75 }, { "epoch": 0.019258545979778528, - "grad_norm": 2.84375, - "learning_rate": 0.00022678680271414454, - "loss": 1.1527, + "grad_norm": 2.578125, + "learning_rate": 9.042360221403626e-05, + "loss": 1.2525, "step": 80 }, { "epoch": 0.020462205103514684, - "grad_norm": 2.46875, - "learning_rate": 0.00022678553902808117, - "loss": 1.1535, + "grad_norm": 2.625, + "learning_rate": 9.042309836176377e-05, + "loss": 1.2534, "step": 85 }, { "epoch": 0.021665864227250843, - "grad_norm": 2.546875, - "learning_rate": 0.0002267841394691008, - "loss": 1.1462, + "grad_norm": 2.640625, + "learning_rate": 9.042254033474015e-05, + "loss": 1.2386, "step": 90 }, { "epoch": 0.022869523350987, - "grad_norm": 2.25, - "learning_rate": 0.00022678260403943945, - "loss": 1.125, + "grad_norm": 2.609375, + "learning_rate": 9.042192813385693e-05, + "loss": 1.211, "step": 95 }, { "epoch": 0.02407318247472316, - "grad_norm": 2.34375, - "learning_rate": 0.00022678093274155044, - "loss": 1.1135, + "grad_norm": 2.53125, + "learning_rate": 9.042126176009227e-05, + "loss": 1.2098, "step": 100 }, { "epoch": 0.025276841598459315, "grad_norm": 2.625, - "learning_rate": 0.00022677912557810405, - "loss": 1.0841, + "learning_rate": 9.042054121451089e-05, + "loss": 1.1801, "step": 105 }, { "epoch": 0.026480500722195474, - "grad_norm": 2.234375, - "learning_rate": 0.0002267771825519876, - "loss": 1.1195, + "grad_norm": 2.53125, + "learning_rate": 9.041976649826399e-05, + "loss": 1.2064, "step": 110 }, { "epoch": 0.027684159845931634, - "grad_norm": 2.5625, - "learning_rate": 0.00022677510366630565, - "loss": 1.1159, + "grad_norm": 2.40625, + "learning_rate": 9.041893761258942e-05, + "loss": 1.2028, "step": 115 }, { "epoch": 0.02888781896966779, - "grad_norm": 3.09375, - "learning_rate": 0.00022677288892437964, - "loss": 1.0941, + "grad_norm": 2.59375, + "learning_rate": 9.041805455881148e-05, + "loss": 1.1858, "step": 120 }, { "epoch": 0.03009147809340395, - "grad_norm": 2.359375, - "learning_rate": 0.0002267705383297482, - "loss": 1.0454, + "grad_norm": 2.734375, + "learning_rate": 9.041711733834108e-05, + "loss": 1.1385, "step": 125 }, { "epoch": 0.031295137217140105, - "grad_norm": 2.40625, - "learning_rate": 0.000226768051886167, - "loss": 1.1214, + "grad_norm": 2.4375, + "learning_rate": 9.041612595267565e-05, + "loss": 1.2035, "step": 130 }, { "epoch": 0.03249879634087626, - "grad_norm": 2.515625, - "learning_rate": 0.00022676542959760868, - "loss": 1.0516, + "grad_norm": 2.484375, + "learning_rate": 9.041508040339921e-05, + "loss": 1.1282, "step": 135 }, { "epoch": 0.033702455464612424, - "grad_norm": 2.46875, - "learning_rate": 0.00022676267146826303, - "loss": 1.0544, + "grad_norm": 2.640625, + "learning_rate": 9.041398069218224e-05, + "loss": 1.1379, "step": 140 }, { "epoch": 0.03490611458834858, - "grad_norm": 3.125, - "learning_rate": 0.00022675977750253681, - "loss": 1.0484, + "grad_norm": 3.15625, + "learning_rate": 9.04128268207818e-05, + "loss": 1.1257, "step": 145 }, { "epoch": 0.036109773712084736, - "grad_norm": 4.1875, - "learning_rate": 0.00022675674770505387, - "loss": 1.0536, + "grad_norm": 2.359375, + "learning_rate": 9.04116187910415e-05, + "loss": 1.1285, "step": 150 }, { "epoch": 0.03731343283582089, - "grad_norm": 2.453125, - "learning_rate": 0.000226753582080655, - "loss": 1.054, + "grad_norm": 2.828125, + "learning_rate": 9.041035660489145e-05, + "loss": 1.1318, "step": 155 }, { "epoch": 0.038517091959557055, - "grad_norm": 2.15625, - "learning_rate": 0.0002267502806343981, - "loss": 1.0733, + "grad_norm": 2.65625, + "learning_rate": 9.040904026434829e-05, + "loss": 1.1602, "step": 160 }, { "epoch": 0.03972075108329321, - "grad_norm": 2.125, - "learning_rate": 0.000226746843371558, - "loss": 1.0351, + "grad_norm": 2.390625, + "learning_rate": 9.04076697715152e-05, + "loss": 1.1164, "step": 165 }, { "epoch": 0.04092441020702937, - "grad_norm": 2.359375, - "learning_rate": 0.00022674327029762663, - "loss": 1.0426, + "grad_norm": 2.59375, + "learning_rate": 9.04062451285819e-05, + "loss": 1.1217, "step": 170 }, { "epoch": 0.04212806933076553, - "grad_norm": 2.09375, - "learning_rate": 0.00022673956141831277, - "loss": 1.0345, + "grad_norm": 2.359375, + "learning_rate": 9.040476633782458e-05, + "loss": 1.1071, "step": 175 }, { "epoch": 0.043331728454501686, - "grad_norm": 2.296875, - "learning_rate": 0.00022673571673954227, - "loss": 1.0316, + "grad_norm": 2.359375, + "learning_rate": 9.040323340160597e-05, + "loss": 1.1138, "step": 180 }, { "epoch": 0.04453538757823784, - "grad_norm": 2.03125, - "learning_rate": 0.00022673173626745798, - "loss": 0.9783, + "grad_norm": 2.34375, + "learning_rate": 9.040164632237532e-05, + "loss": 1.0674, "step": 185 }, { "epoch": 0.045739046701974, - "grad_norm": 2.296875, - "learning_rate": 0.00022672762000841963, - "loss": 1.0416, + "grad_norm": 2.734375, + "learning_rate": 9.040000510266836e-05, + "loss": 1.1186, "step": 190 }, { "epoch": 0.04694270582571016, - "grad_norm": 2.375, - "learning_rate": 0.000226723367969004, - "loss": 0.9912, + "grad_norm": 2.640625, + "learning_rate": 9.039830974510734e-05, + "loss": 1.0882, "step": 195 }, { "epoch": 0.04814636494944632, - "grad_norm": 2.390625, - "learning_rate": 0.0002267189801560047, - "loss": 0.9928, + "grad_norm": 2.578125, + "learning_rate": 9.039656025240102e-05, + "loss": 1.0725, "step": 200 }, { "epoch": 0.04935002407318247, - "grad_norm": 1.9375, - "learning_rate": 0.00022671445657643235, - "loss": 0.9891, + "grad_norm": 2.296875, + "learning_rate": 9.039475662734464e-05, + "loss": 1.0741, "step": 205 }, { "epoch": 0.05055368319691863, - "grad_norm": 1.859375, - "learning_rate": 0.0002267097972375145, - "loss": 0.9898, + "grad_norm": 2.453125, + "learning_rate": 9.039289887281991e-05, + "loss": 1.0708, "step": 210 }, { "epoch": 0.05175734232065479, - "grad_norm": 2.234375, - "learning_rate": 0.00022670500214669556, - "loss": 0.9922, + "grad_norm": 3.0625, + "learning_rate": 9.039098699179508e-05, + "loss": 1.0737, "step": 215 }, { "epoch": 0.05296100144439095, - "grad_norm": 2.0625, - "learning_rate": 0.00022670007131163683, - "loss": 0.9936, + "grad_norm": 2.609375, + "learning_rate": 9.038902098732483e-05, + "loss": 1.0697, "step": 220 }, { "epoch": 0.054164660568127104, - "grad_norm": 2.234375, - "learning_rate": 0.00022669500474021656, - "loss": 0.9848, + "grad_norm": 2.5625, + "learning_rate": 9.038700086255029e-05, + "loss": 1.0658, "step": 225 }, { "epoch": 0.05536831969186327, - "grad_norm": 2.171875, - "learning_rate": 0.00022668980244052982, - "loss": 0.9805, + "grad_norm": 2.546875, + "learning_rate": 9.038492662069918e-05, + "loss": 1.0652, "step": 230 }, { "epoch": 0.05657197881559942, - "grad_norm": 2.1875, - "learning_rate": 0.00022668446442088852, - "loss": 0.957, + "grad_norm": 2.546875, + "learning_rate": 9.038279826508553e-05, + "loss": 1.0497, "step": 235 }, { "epoch": 0.05777563793933558, - "grad_norm": 2.21875, - "learning_rate": 0.0002266789906898215, - "loss": 0.957, + "grad_norm": 2.328125, + "learning_rate": 9.038061579910999e-05, + "loss": 1.0288, "step": 240 }, { "epoch": 0.058979297063071735, - "grad_norm": 2.09375, - "learning_rate": 0.00022667338125607434, - "loss": 0.9754, + "grad_norm": 2.515625, + "learning_rate": 9.037837922625949e-05, + "loss": 1.0503, "step": 245 }, { "epoch": 0.0601829561868079, - "grad_norm": 2.0, - "learning_rate": 0.00022666763612860948, - "loss": 0.9609, + "grad_norm": 2.453125, + "learning_rate": 9.037608855010756e-05, + "loss": 1.0362, "step": 250 }, { "epoch": 0.061386615310544054, - "grad_norm": 1.8671875, - "learning_rate": 0.0002266617553166062, - "loss": 0.988, + "grad_norm": 2.3125, + "learning_rate": 9.03737437743141e-05, + "loss": 1.0657, "step": 255 }, { "epoch": 0.06259027443428021, - "grad_norm": 1.9140625, - "learning_rate": 0.00022665573882946045, - "loss": 0.9235, + "grad_norm": 2.65625, + "learning_rate": 9.037134490262544e-05, + "loss": 1.0126, "step": 260 }, { "epoch": 0.06379393355801637, - "grad_norm": 2.140625, - "learning_rate": 0.00022664958667678516, - "loss": 0.9618, + "grad_norm": 2.515625, + "learning_rate": 9.036889193887439e-05, + "loss": 1.0415, "step": 265 }, { "epoch": 0.06499759268175252, - "grad_norm": 2.046875, - "learning_rate": 0.0002266432988684098, - "loss": 0.9416, + "grad_norm": 2.53125, + "learning_rate": 9.036638488698015e-05, + "loss": 1.0286, "step": 270 }, { "epoch": 0.06620125180548869, - "grad_norm": 1.9453125, - "learning_rate": 0.00022663687541438066, - "loss": 0.9466, + "grad_norm": 2.5625, + "learning_rate": 9.036382375094836e-05, + "loss": 1.0283, "step": 275 }, { "epoch": 0.06740491092922485, - "grad_norm": 2.140625, - "learning_rate": 0.00022663031632496082, - "loss": 0.9454, + "grad_norm": 2.46875, + "learning_rate": 9.036120853487101e-05, + "loss": 1.0147, "step": 280 }, { "epoch": 0.068608570052961, - "grad_norm": 2.171875, - "learning_rate": 0.00022662362161063, - "loss": 0.9294, + "grad_norm": 2.71875, + "learning_rate": 9.035853924292659e-05, + "loss": 1.0081, "step": 285 }, { "epoch": 0.06981222917669716, - "grad_norm": 2.046875, - "learning_rate": 0.00022661679128208466, - "loss": 0.9302, + "grad_norm": 2.46875, + "learning_rate": 9.035581587937994e-05, + "loss": 1.0067, "step": 290 }, { "epoch": 0.07101588830043332, - "grad_norm": 1.9765625, - "learning_rate": 0.0002266098253502379, - "loss": 0.9221, + "grad_norm": 2.40625, + "learning_rate": 9.035303844858227e-05, + "loss": 0.9987, "step": 295 }, { "epoch": 0.07221954742416947, - "grad_norm": 2.453125, - "learning_rate": 0.0002266027238262195, - "loss": 0.9036, + "grad_norm": 2.484375, + "learning_rate": 9.035020695497124e-05, + "loss": 0.9811, "step": 300 }, { "epoch": 0.07342320654790563, - "grad_norm": 2.1875, - "learning_rate": 0.0002265954867213759, - "loss": 0.9155, + "grad_norm": 2.734375, + "learning_rate": 9.034732140307082e-05, + "loss": 0.9992, "step": 305 }, { "epoch": 0.07462686567164178, - "grad_norm": 1.84375, - "learning_rate": 0.00022658811404727006, - "loss": 0.9637, + "grad_norm": 2.453125, + "learning_rate": 9.03443817974914e-05, + "loss": 1.0393, "step": 310 }, { "epoch": 0.07583052479537795, - "grad_norm": 2.0625, - "learning_rate": 0.00022658060581568168, - "loss": 0.8965, + "grad_norm": 2.234375, + "learning_rate": 9.03413881429297e-05, + "loss": 0.9789, "step": 315 }, { "epoch": 0.07703418391911411, - "grad_norm": 2.078125, - "learning_rate": 0.00022657296203860703, - "loss": 0.9047, + "grad_norm": 2.5625, + "learning_rate": 9.033834044416883e-05, + "loss": 0.9784, "step": 320 }, { "epoch": 0.07823784304285027, - "grad_norm": 2.015625, - "learning_rate": 0.0002265651827282588, - "loss": 0.9086, + "grad_norm": 2.296875, + "learning_rate": 9.033523870607821e-05, + "loss": 0.9919, "step": 325 }, { "epoch": 0.07944150216658642, - "grad_norm": 1.9296875, - "learning_rate": 0.00022655726789706638, - "loss": 0.9326, + "grad_norm": 2.4375, + "learning_rate": 9.033208293361363e-05, + "loss": 1.0113, "step": 330 }, { "epoch": 0.08064516129032258, - "grad_norm": 2.046875, - "learning_rate": 0.0002265492175576757, - "loss": 0.9124, + "grad_norm": 2.546875, + "learning_rate": 9.032887313181723e-05, + "loss": 0.9894, "step": 335 }, { "epoch": 0.08184882041405873, - "grad_norm": 2.046875, - "learning_rate": 0.00022654103172294905, - "loss": 0.912, + "grad_norm": 2.46875, + "learning_rate": 9.032560930581743e-05, + "loss": 0.9843, "step": 340 }, { "epoch": 0.08305247953779489, - "grad_norm": 2.109375, - "learning_rate": 0.00022653271040596538, - "loss": 0.8867, + "grad_norm": 2.265625, + "learning_rate": 9.032229146082899e-05, + "loss": 0.9649, "step": 345 }, { "epoch": 0.08425613866153106, - "grad_norm": 1.7734375, - "learning_rate": 0.00022652425362001993, - "loss": 0.9291, + "grad_norm": 2.28125, + "learning_rate": 9.031891960215297e-05, + "loss": 1.0048, "step": 350 }, { "epoch": 0.08545979778526722, - "grad_norm": 1.9140625, - "learning_rate": 0.00022651566137862455, - "loss": 0.9007, + "grad_norm": 2.3125, + "learning_rate": 9.031549373517673e-05, + "loss": 0.9789, "step": 355 }, { "epoch": 0.08666345690900337, - "grad_norm": 1.7109375, - "learning_rate": 0.0002265069336955074, - "loss": 0.9096, + "grad_norm": 2.3125, + "learning_rate": 9.031201386537395e-05, + "loss": 0.9783, "step": 360 }, { "epoch": 0.08786711603273953, - "grad_norm": 1.9765625, - "learning_rate": 0.0002264980705846131, - "loss": 0.9083, + "grad_norm": 2.25, + "learning_rate": 9.030847999830456e-05, + "loss": 0.9757, "step": 365 }, { "epoch": 0.08907077515647568, - "grad_norm": 1.8203125, - "learning_rate": 0.00022648907206010264, - "loss": 0.9076, + "grad_norm": 2.53125, + "learning_rate": 9.030489213961477e-05, + "loss": 0.9843, "step": 370 }, { "epoch": 0.09027443428021184, - "grad_norm": 1.953125, - "learning_rate": 0.00022647993813635332, - "loss": 0.9176, + "grad_norm": 2.234375, + "learning_rate": 9.030125029503704e-05, + "loss": 0.9934, "step": 375 }, { "epoch": 0.091478093403948, - "grad_norm": 1.734375, - "learning_rate": 0.00022647066882795883, - "loss": 0.8837, + "grad_norm": 2.171875, + "learning_rate": 9.029755447039015e-05, + "loss": 0.9648, "step": 380 }, { "epoch": 0.09268175252768417, - "grad_norm": 1.84375, - "learning_rate": 0.00022646126414972915, - "loss": 0.9043, + "grad_norm": 2.4375, + "learning_rate": 9.029380467157904e-05, + "loss": 0.9665, "step": 385 }, { "epoch": 0.09388541165142032, - "grad_norm": 1.8125, - "learning_rate": 0.00022645172411669054, - "loss": 0.8879, + "grad_norm": 2.328125, + "learning_rate": 9.029000090459495e-05, + "loss": 0.9573, "step": 390 }, { "epoch": 0.09508907077515648, - "grad_norm": 1.8515625, - "learning_rate": 0.0002264420487440855, - "loss": 0.886, + "grad_norm": 2.328125, + "learning_rate": 9.02861431755153e-05, + "loss": 0.9575, "step": 395 }, { "epoch": 0.09629272989889263, - "grad_norm": 2.046875, - "learning_rate": 0.00022643223804737285, - "loss": 0.8831, + "grad_norm": 2.390625, + "learning_rate": 9.028223149050378e-05, + "loss": 0.9628, "step": 400 }, { "epoch": 0.09749638902262879, - "grad_norm": 1.78125, - "learning_rate": 0.00022642229204222753, - "loss": 0.8904, + "grad_norm": 2.375, + "learning_rate": 9.027826585581026e-05, + "loss": 0.9622, "step": 405 }, { "epoch": 0.09870004814636495, - "grad_norm": 1.921875, - "learning_rate": 0.00022641221074454071, - "loss": 0.8953, + "grad_norm": 2.265625, + "learning_rate": 9.027424627777077e-05, + "loss": 0.967, "step": 410 }, { "epoch": 0.0999037072701011, - "grad_norm": 1.8203125, - "learning_rate": 0.00022640199417041975, - "loss": 0.8996, + "grad_norm": 2.328125, + "learning_rate": 9.02701727628076e-05, + "loss": 0.9696, "step": 415 }, { "epoch": 0.10110736639383726, - "grad_norm": 2.015625, - "learning_rate": 0.00022639164233618806, - "loss": 0.8947, + "grad_norm": 2.4375, + "learning_rate": 9.026604531742918e-05, + "loss": 0.9679, "step": 420 }, { "epoch": 0.10231102551757343, - "grad_norm": 2.109375, - "learning_rate": 0.00022638115525838528, - "loss": 0.8925, + "grad_norm": 2.5, + "learning_rate": 9.026186394823009e-05, + "loss": 0.963, "step": 425 }, { "epoch": 0.10351468464130958, - "grad_norm": 2.09375, - "learning_rate": 0.00022637053295376702, - "loss": 0.8401, + "grad_norm": 2.625, + "learning_rate": 9.025762866189111e-05, + "loss": 0.9161, "step": 430 }, { "epoch": 0.10471834376504574, - "grad_norm": 1.6796875, - "learning_rate": 0.00022635977543930503, - "loss": 0.8833, + "grad_norm": 2.203125, + "learning_rate": 9.025333946517913e-05, + "loss": 0.9469, "step": 435 }, { "epoch": 0.1059220028887819, - "grad_norm": 1.9609375, - "learning_rate": 0.00022634888273218704, - "loss": 0.8507, + "grad_norm": 2.3125, + "learning_rate": 9.024899636494718e-05, + "loss": 0.9326, "step": 440 }, { "epoch": 0.10712566201251805, - "grad_norm": 1.890625, - "learning_rate": 0.0002263378548498168, - "loss": 0.8893, + "grad_norm": 2.5625, + "learning_rate": 9.024459936813441e-05, + "loss": 0.9632, "step": 445 }, { "epoch": 0.10832932113625421, - "grad_norm": 1.6796875, - "learning_rate": 0.00022632669180981408, - "loss": 0.8929, + "grad_norm": 2.421875, + "learning_rate": 9.024014848176614e-05, + "loss": 0.9652, "step": 450 }, { "epoch": 0.10953298025999036, - "grad_norm": 1.609375, - "learning_rate": 0.00022631539363001456, - "loss": 0.895, + "grad_norm": 2.3125, + "learning_rate": 9.023564371295371e-05, + "loss": 0.9651, "step": 455 }, { "epoch": 0.11073663938372653, - "grad_norm": 1.6484375, - "learning_rate": 0.00022630396032846976, - "loss": 0.862, + "grad_norm": 2.15625, + "learning_rate": 9.023108506889461e-05, + "loss": 0.9315, "step": 460 }, { "epoch": 0.11194029850746269, - "grad_norm": 1.90625, - "learning_rate": 0.00022629239192344726, - "loss": 0.8537, + "grad_norm": 2.21875, + "learning_rate": 9.022647255687235e-05, + "loss": 0.9254, "step": 465 }, { "epoch": 0.11314395763119885, - "grad_norm": 1.546875, - "learning_rate": 0.0002262806884334303, - "loss": 0.8656, + "grad_norm": 2.3125, + "learning_rate": 9.022180618425656e-05, + "loss": 0.9447, "step": 470 }, { "epoch": 0.114347616754935, - "grad_norm": 1.8671875, - "learning_rate": 0.00022626884987711815, - "loss": 0.9012, + "grad_norm": 2.234375, + "learning_rate": 9.021708595850291e-05, + "loss": 0.9736, "step": 475 }, { "epoch": 0.11555127587867116, - "grad_norm": 1.6171875, - "learning_rate": 0.0002262568762734257, - "loss": 0.8476, + "grad_norm": 2.375, + "learning_rate": 9.02123118871531e-05, + "loss": 0.9191, "step": 480 }, { "epoch": 0.11675493500240731, - "grad_norm": 1.7265625, - "learning_rate": 0.00022624476764148384, - "loss": 0.8656, + "grad_norm": 2.28125, + "learning_rate": 9.02074839778349e-05, + "loss": 0.9291, "step": 485 }, { "epoch": 0.11795859412614347, - "grad_norm": 1.7265625, - "learning_rate": 0.00022623252400063893, - "loss": 0.8029, + "grad_norm": 2.546875, + "learning_rate": 9.020260223826204e-05, + "loss": 0.8817, "step": 490 }, { "epoch": 0.11916225324987964, - "grad_norm": 1.703125, - "learning_rate": 0.00022622014537045318, - "loss": 0.845, + "grad_norm": 2.234375, + "learning_rate": 9.01976666762343e-05, + "loss": 0.9112, "step": 495 }, { "epoch": 0.1203659123736158, - "grad_norm": 1.703125, - "learning_rate": 0.00022620763177070452, - "loss": 0.8572, + "grad_norm": 2.359375, + "learning_rate": 9.019267729963743e-05, + "loss": 0.9293, "step": 500 }, { "epoch": 0.1203659123736158, - "eval_loss": 0.7564012408256531, - "eval_runtime": 2.3287, - "eval_samples_per_second": 85.887, - "eval_steps_per_second": 85.887, + "eval_loss": 0.8218328356742859, + "eval_runtime": 2.3852, + "eval_samples_per_second": 83.851, + "eval_steps_per_second": 83.851, "step": 500 }, { "epoch": 0.12156957149735195, - "grad_norm": 1.8515625, - "learning_rate": 0.00022619498322138643, - "loss": 0.8083, + "grad_norm": 2.28125, + "learning_rate": 9.018763411644319e-05, + "loss": 0.8772, "step": 505 }, { "epoch": 0.12277323062108811, - "grad_norm": 1.8984375, - "learning_rate": 0.0002261821997427081, - "loss": 0.8492, + "grad_norm": 2.359375, + "learning_rate": 9.01825371347093e-05, + "loss": 0.911, "step": 510 }, { "epoch": 0.12397688974482426, - "grad_norm": 1.6953125, - "learning_rate": 0.0002261692813550942, - "loss": 0.8865, + "grad_norm": 2.296875, + "learning_rate": 9.017738636257942e-05, + "loss": 0.9427, "step": 515 }, { "epoch": 0.12518054886856042, - "grad_norm": 1.8984375, - "learning_rate": 0.00022615622807918505, - "loss": 0.9027, + "grad_norm": 2.421875, + "learning_rate": 9.017218180828317e-05, + "loss": 0.9655, "step": 520 }, { "epoch": 0.12638420799229658, - "grad_norm": 1.734375, - "learning_rate": 0.0002261430399358364, - "loss": 0.8837, + "grad_norm": 2.375, + "learning_rate": 9.016692348013607e-05, + "loss": 0.9441, "step": 525 }, { "epoch": 0.12758786711603273, - "grad_norm": 1.8984375, - "learning_rate": 0.00022612971694611954, - "loss": 0.8441, + "grad_norm": 2.5, + "learning_rate": 9.016161138653961e-05, + "loss": 0.9123, "step": 530 }, { "epoch": 0.1287915262397689, - "grad_norm": 1.7734375, - "learning_rate": 0.0002261162591313212, - "loss": 0.8344, + "grad_norm": 2.375, + "learning_rate": 9.015624553598115e-05, + "loss": 0.9097, "step": 535 }, { "epoch": 0.12999518536350504, - "grad_norm": 1.765625, - "learning_rate": 0.00022610266651294347, - "loss": 0.8385, + "grad_norm": 2.390625, + "learning_rate": 9.015082593703393e-05, + "loss": 0.9038, "step": 540 }, { "epoch": 0.1311988444872412, - "grad_norm": 1.7734375, - "learning_rate": 0.00022608893911270394, - "loss": 0.8546, + "grad_norm": 2.296875, + "learning_rate": 9.014535259835709e-05, + "loss": 0.9204, "step": 545 }, { "epoch": 0.13240250361097738, - "grad_norm": 1.6953125, - "learning_rate": 0.00022607507695253541, - "loss": 0.8098, + "grad_norm": 2.5, + "learning_rate": 9.013982552869561e-05, + "loss": 0.8791, "step": 550 }, { "epoch": 0.13360616273471354, - "grad_norm": 1.8046875, - "learning_rate": 0.00022606108005458612, - "loss": 0.8538, + "grad_norm": 2.296875, + "learning_rate": 9.013424473688034e-05, + "loss": 0.9163, "step": 555 }, { "epoch": 0.1348098218584497, - "grad_norm": 1.59375, - "learning_rate": 0.00022604694844121948, - "loss": 0.8376, + "grad_norm": 2.171875, + "learning_rate": 9.012861023182796e-05, + "loss": 0.908, "step": 560 }, { "epoch": 0.13601348098218585, - "grad_norm": 1.6953125, - "learning_rate": 0.00022603268213501425, - "loss": 0.8142, + "grad_norm": 2.3125, + "learning_rate": 9.012292202254096e-05, + "loss": 0.9004, "step": 565 }, { "epoch": 0.137217140105922, - "grad_norm": 1.6484375, - "learning_rate": 0.00022601828115876422, - "loss": 0.8149, + "grad_norm": 2.3125, + "learning_rate": 9.011718011810762e-05, + "loss": 0.8959, "step": 570 }, { "epoch": 0.13842079922965816, - "grad_norm": 1.515625, - "learning_rate": 0.00022600374553547852, - "loss": 0.7743, + "grad_norm": 2.234375, + "learning_rate": 9.011138452770206e-05, + "loss": 0.8498, "step": 575 }, { "epoch": 0.13962445835339432, - "grad_norm": 1.71875, - "learning_rate": 0.00022598907528838139, - "loss": 0.8268, + "grad_norm": 2.125, + "learning_rate": 9.010553526058414e-05, + "loss": 0.9045, "step": 580 }, { "epoch": 0.14082811747713048, - "grad_norm": 1.7734375, - "learning_rate": 0.00022597427044091206, - "loss": 0.8209, + "grad_norm": 2.4375, + "learning_rate": 9.009963232609949e-05, + "loss": 0.9024, "step": 585 }, { "epoch": 0.14203177660086663, - "grad_norm": 1.7578125, - "learning_rate": 0.00022595933101672488, - "loss": 0.8314, + "grad_norm": 2.34375, + "learning_rate": 9.009367573367947e-05, + "loss": 0.9076, "step": 590 }, { "epoch": 0.1432354357246028, - "grad_norm": 1.8125, - "learning_rate": 0.00022594425703968926, - "loss": 0.8349, + "grad_norm": 2.328125, + "learning_rate": 9.00876654928412e-05, + "loss": 0.8974, "step": 595 }, { "epoch": 0.14443909484833894, - "grad_norm": 1.5625, - "learning_rate": 0.00022592904853388957, - "loss": 0.8047, + "grad_norm": 2.125, + "learning_rate": 9.008160161318752e-05, + "loss": 0.8766, "step": 600 }, { "epoch": 0.1456427539720751, - "grad_norm": 1.640625, - "learning_rate": 0.00022591370552362504, - "loss": 0.8227, + "grad_norm": 2.140625, + "learning_rate": 9.007548410440693e-05, + "loss": 0.9035, "step": 605 }, { "epoch": 0.14684641309581126, - "grad_norm": 1.5703125, - "learning_rate": 0.0002258982280334099, - "loss": 0.8137, + "grad_norm": 2.1875, + "learning_rate": 9.006931297627366e-05, + "loss": 0.8876, "step": 610 }, { "epoch": 0.1480500722195474, - "grad_norm": 1.5859375, - "learning_rate": 0.0002258826160879732, - "loss": 0.8336, + "grad_norm": 2.140625, + "learning_rate": 9.006308823864757e-05, + "loss": 0.9029, "step": 615 }, { "epoch": 0.14925373134328357, - "grad_norm": 1.6328125, - "learning_rate": 0.00022586686971225886, - "loss": 0.8551, + "grad_norm": 2.15625, + "learning_rate": 9.005680990147421e-05, + "loss": 0.9197, "step": 620 }, { "epoch": 0.15045739046701975, - "grad_norm": 1.796875, - "learning_rate": 0.0002258509889314255, - "loss": 0.822, + "grad_norm": 2.203125, + "learning_rate": 9.005047797478474e-05, + "loss": 0.8957, "step": 625 }, { "epoch": 0.1516610495907559, - "grad_norm": 1.7265625, - "learning_rate": 0.00022583497377084654, - "loss": 0.8523, + "grad_norm": 2.234375, + "learning_rate": 9.004409246869597e-05, + "loss": 0.9216, "step": 630 }, { "epoch": 0.15286470871449206, - "grad_norm": 1.546875, - "learning_rate": 0.00022581882425611017, - "loss": 0.8357, + "grad_norm": 2.421875, + "learning_rate": 9.00376533934103e-05, + "loss": 0.8964, "step": 635 }, { "epoch": 0.15406836783822822, - "grad_norm": 1.5234375, - "learning_rate": 0.00022580254041301912, - "loss": 0.8109, + "grad_norm": 2.109375, + "learning_rate": 9.003116075921573e-05, + "loss": 0.8794, "step": 640 }, { "epoch": 0.15527202696196438, - "grad_norm": 1.6953125, - "learning_rate": 0.0002257861222675908, - "loss": 0.8282, + "grad_norm": 2.15625, + "learning_rate": 9.00246145764858e-05, + "loss": 0.9003, "step": 645 }, { "epoch": 0.15647568608570053, - "grad_norm": 1.6796875, - "learning_rate": 0.0002257695698460572, - "loss": 0.835, + "grad_norm": 2.109375, + "learning_rate": 9.001801485567965e-05, + "loss": 0.8987, "step": 650 }, { "epoch": 0.1576793452094367, - "grad_norm": 1.7265625, - "learning_rate": 0.00022575288317486488, - "loss": 0.8625, + "grad_norm": 2.3125, + "learning_rate": 9.001136160734195e-05, + "loss": 0.9281, "step": 655 }, { "epoch": 0.15888300433317284, - "grad_norm": 1.7109375, - "learning_rate": 0.00022573606228067477, - "loss": 0.8219, + "grad_norm": 2.15625, + "learning_rate": 9.000465484210284e-05, + "loss": 0.8848, "step": 660 }, { "epoch": 0.160086663456909, - "grad_norm": 1.6640625, - "learning_rate": 0.00022571910719036245, - "loss": 0.842, + "grad_norm": 2.09375, + "learning_rate": 8.999789457067806e-05, + "loss": 0.9069, "step": 665 }, { "epoch": 0.16129032258064516, - "grad_norm": 1.75, - "learning_rate": 0.00022570201793101777, - "loss": 0.8186, + "grad_norm": 2.390625, + "learning_rate": 8.999108080386879e-05, + "loss": 0.8869, "step": 670 }, { "epoch": 0.1624939817043813, - "grad_norm": 1.6640625, - "learning_rate": 0.00022568479452994496, - "loss": 0.7838, + "grad_norm": 2.03125, + "learning_rate": 8.998421355256165e-05, + "loss": 0.8587, "step": 675 }, { "epoch": 0.16369764082811747, - "grad_norm": 1.7421875, - "learning_rate": 0.00022566743701466264, - "loss": 0.8122, + "grad_norm": 2.390625, + "learning_rate": 8.997729282772878e-05, + "loss": 0.8698, "step": 680 }, { "epoch": 0.16490129995185362, - "grad_norm": 1.640625, - "learning_rate": 0.00022564994541290366, - "loss": 0.804, + "grad_norm": 2.234375, + "learning_rate": 8.997031864042769e-05, + "loss": 0.8718, "step": 685 }, { "epoch": 0.16610495907558978, - "grad_norm": 1.515625, - "learning_rate": 0.00022563231975261506, - "loss": 0.8007, + "grad_norm": 2.28125, + "learning_rate": 8.996329100180137e-05, + "loss": 0.874, "step": 690 }, { "epoch": 0.16730861819932596, - "grad_norm": 1.671875, - "learning_rate": 0.00022561456006195825, - "loss": 0.7914, + "grad_norm": 2.1875, + "learning_rate": 8.995620992307819e-05, + "loss": 0.864, "step": 695 }, { "epoch": 0.16851227732306212, - "grad_norm": 1.625, - "learning_rate": 0.00022559666636930853, - "loss": 0.7936, + "grad_norm": 2.515625, + "learning_rate": 8.994907541557187e-05, + "loss": 0.8563, "step": 700 }, { "epoch": 0.16971593644679828, - "grad_norm": 1.7109375, - "learning_rate": 0.0002255786387032555, - "loss": 0.8352, + "grad_norm": 2.375, + "learning_rate": 8.994188749068154e-05, + "loss": 0.8976, "step": 705 }, { "epoch": 0.17091959557053443, - "grad_norm": 1.8046875, - "learning_rate": 0.00022556047709260273, - "loss": 0.8178, + "grad_norm": 2.375, + "learning_rate": 8.993464615989168e-05, + "loss": 0.8883, "step": 710 }, { "epoch": 0.1721232546942706, - "grad_norm": 1.671875, - "learning_rate": 0.00022554218156636783, - "loss": 0.8147, + "grad_norm": 2.234375, + "learning_rate": 8.992735143477204e-05, + "loss": 0.8737, "step": 715 }, { "epoch": 0.17332691381800674, - "grad_norm": 1.5390625, - "learning_rate": 0.00022552375215378242, - "loss": 0.7963, + "grad_norm": 2.125, + "learning_rate": 8.992000332697775e-05, + "loss": 0.8644, "step": 720 }, { "epoch": 0.1745305729417429, - "grad_norm": 1.703125, - "learning_rate": 0.00022550518888429184, - "loss": 0.8563, + "grad_norm": 2.265625, + "learning_rate": 8.991260184824919e-05, + "loss": 0.9081, "step": 725 }, { "epoch": 0.17573423206547906, - "grad_norm": 1.671875, - "learning_rate": 0.00022548649178755556, - "loss": 0.805, + "grad_norm": 2.4375, + "learning_rate": 8.990514701041205e-05, + "loss": 0.8672, "step": 730 }, { "epoch": 0.1769378911892152, - "grad_norm": 1.6796875, - "learning_rate": 0.00022546766089344666, - "loss": 0.8126, + "grad_norm": 2.265625, + "learning_rate": 8.989763882537721e-05, + "loss": 0.8754, "step": 735 }, { "epoch": 0.17814155031295137, - "grad_norm": 1.578125, - "learning_rate": 0.00022544869623205215, - "loss": 0.7807, + "grad_norm": 2.359375, + "learning_rate": 8.989007730514085e-05, + "loss": 0.8516, "step": 740 }, { "epoch": 0.17934520943668752, - "grad_norm": 1.5234375, - "learning_rate": 0.00022542959783367265, - "loss": 0.7762, + "grad_norm": 2.234375, + "learning_rate": 8.988246246178434e-05, + "loss": 0.8398, "step": 745 }, { "epoch": 0.18054886856042368, - "grad_norm": 1.4375, - "learning_rate": 0.00022541036572882255, - "loss": 0.758, + "grad_norm": 2.390625, + "learning_rate": 8.987479430747425e-05, + "loss": 0.8273, "step": 750 }, { "epoch": 0.18175252768415984, - "grad_norm": 1.5625, - "learning_rate": 0.00022539099994822978, - "loss": 0.7838, + "grad_norm": 2.140625, + "learning_rate": 8.98670728544623e-05, + "loss": 0.8529, "step": 755 }, { "epoch": 0.182956186807896, - "grad_norm": 1.5078125, - "learning_rate": 0.00022537150052283589, - "loss": 0.7533, + "grad_norm": 2.125, + "learning_rate": 8.985929811508543e-05, + "loss": 0.8219, "step": 760 }, { "epoch": 0.18415984593163215, - "grad_norm": 1.6640625, - "learning_rate": 0.000225351867483796, - "loss": 0.7954, + "grad_norm": 2.453125, + "learning_rate": 8.985147010176565e-05, + "loss": 0.8564, "step": 765 }, { "epoch": 0.18536350505536833, - "grad_norm": 1.5, - "learning_rate": 0.00022533210086247865, - "loss": 0.7542, + "grad_norm": 2.15625, + "learning_rate": 8.984358882701013e-05, + "loss": 0.8156, "step": 770 }, { "epoch": 0.1865671641791045, - "grad_norm": 1.5, - "learning_rate": 0.00022531220069046585, - "loss": 0.8368, + "grad_norm": 2.09375, + "learning_rate": 8.983565430341113e-05, + "loss": 0.8976, "step": 775 }, { "epoch": 0.18777082330284064, - "grad_norm": 1.609375, - "learning_rate": 0.00022529216699955295, - "loss": 0.7912, + "grad_norm": 2.171875, + "learning_rate": 8.9827666543646e-05, + "loss": 0.8503, "step": 780 }, { "epoch": 0.1889744824265768, - "grad_norm": 1.59375, - "learning_rate": 0.00022527199982174865, - "loss": 0.7973, + "grad_norm": 2.15625, + "learning_rate": 8.981962556047708e-05, + "loss": 0.8649, "step": 785 }, { "epoch": 0.19017814155031296, - "grad_norm": 1.6875, - "learning_rate": 0.000225251699189275, - "loss": 0.7791, + "grad_norm": 2.34375, + "learning_rate": 8.981153136675185e-05, + "loss": 0.8456, "step": 790 }, { "epoch": 0.1913818006740491, - "grad_norm": 1.4453125, - "learning_rate": 0.0002252312651345671, - "loss": 0.7466, + "grad_norm": 2.15625, + "learning_rate": 8.980338397540274e-05, + "loss": 0.8125, "step": 795 }, { "epoch": 0.19258545979778527, - "grad_norm": 1.5390625, - "learning_rate": 0.0002252106976902734, - "loss": 0.7886, + "grad_norm": 2.171875, + "learning_rate": 8.979518339944719e-05, + "loss": 0.8483, "step": 800 }, { "epoch": 0.19378911892152142, - "grad_norm": 1.5859375, - "learning_rate": 0.00022518999688925538, - "loss": 0.7954, + "grad_norm": 2.359375, + "learning_rate": 8.978692965198763e-05, + "loss": 0.8548, "step": 805 }, { "epoch": 0.19499277804525758, - "grad_norm": 1.5234375, - "learning_rate": 0.00022516916276458764, - "loss": 0.7861, + "grad_norm": 2.1875, + "learning_rate": 8.977862274621142e-05, + "loss": 0.8518, "step": 810 }, { "epoch": 0.19619643716899374, - "grad_norm": 1.578125, - "learning_rate": 0.00022514819534955773, - "loss": 0.7677, + "grad_norm": 2.375, + "learning_rate": 8.977026269539086e-05, + "loss": 0.8337, "step": 815 }, { "epoch": 0.1974000962927299, - "grad_norm": 1.6171875, - "learning_rate": 0.00022512709467766622, - "loss": 0.7923, + "grad_norm": 2.296875, + "learning_rate": 8.976184951288319e-05, + "loss": 0.8501, "step": 820 }, { "epoch": 0.19860375541646605, - "grad_norm": 1.59375, - "learning_rate": 0.0002251058607826266, - "loss": 0.7832, + "grad_norm": 2.40625, + "learning_rate": 8.97533832121305e-05, + "loss": 0.839, "step": 825 }, { "epoch": 0.1998074145402022, - "grad_norm": 1.7890625, - "learning_rate": 0.00022508449369836514, - "loss": 0.7892, + "grad_norm": 2.21875, + "learning_rate": 8.974486380665977e-05, + "loss": 0.8487, "step": 830 }, { "epoch": 0.20101107366393836, - "grad_norm": 1.6171875, - "learning_rate": 0.00022506299345902102, - "loss": 0.7908, + "grad_norm": 2.171875, + "learning_rate": 8.973629131008285e-05, + "loss": 0.8568, "step": 835 }, { "epoch": 0.20221473278767452, - "grad_norm": 1.6015625, - "learning_rate": 0.00022504136009894607, - "loss": 0.7649, + "grad_norm": 2.359375, + "learning_rate": 8.972766573609635e-05, + "loss": 0.8316, "step": 840 }, { "epoch": 0.2034183919114107, - "grad_norm": 1.4140625, - "learning_rate": 0.00022501959365270487, - "loss": 0.7859, + "grad_norm": 2.125, + "learning_rate": 8.971898709848177e-05, + "loss": 0.8496, "step": 845 }, { "epoch": 0.20462205103514686, - "grad_norm": 1.5546875, - "learning_rate": 0.00022499769415507462, - "loss": 0.796, + "grad_norm": 2.28125, + "learning_rate": 8.971025541110532e-05, + "loss": 0.8577, "step": 850 }, { "epoch": 0.205825710158883, - "grad_norm": 1.40625, - "learning_rate": 0.00022497566164104507, - "loss": 0.8122, + "grad_norm": 2.09375, + "learning_rate": 8.970147068791799e-05, + "loss": 0.8719, "step": 855 }, { "epoch": 0.20702936928261917, - "grad_norm": 1.5078125, - "learning_rate": 0.00022495349614581862, - "loss": 0.7908, + "grad_norm": 2.234375, + "learning_rate": 8.969263294295556e-05, + "loss": 0.8524, "step": 860 }, { "epoch": 0.20823302840635532, - "grad_norm": 1.4765625, - "learning_rate": 0.00022493119770480995, - "loss": 0.8267, + "grad_norm": 2.125, + "learning_rate": 8.968374219033842e-05, + "loss": 0.8869, "step": 865 }, { "epoch": 0.20943668753009148, - "grad_norm": 1.4765625, - "learning_rate": 0.00022490876635364627, - "loss": 0.7968, + "grad_norm": 2.140625, + "learning_rate": 8.967479844427175e-05, + "loss": 0.8667, "step": 870 }, { "epoch": 0.21064034665382764, - "grad_norm": 1.7578125, - "learning_rate": 0.00022488620212816722, - "loss": 0.7767, + "grad_norm": 2.40625, + "learning_rate": 8.966580171904539e-05, + "loss": 0.8348, "step": 875 }, { "epoch": 0.2118440057775638, - "grad_norm": 1.53125, - "learning_rate": 0.00022486350506442453, - "loss": 0.8038, + "grad_norm": 2.109375, + "learning_rate": 8.965675202903374e-05, + "loss": 0.8619, "step": 880 }, { "epoch": 0.21304766490129995, - "grad_norm": 1.5625, - "learning_rate": 0.00022484067519868236, - "loss": 0.7819, + "grad_norm": 2.09375, + "learning_rate": 8.964764938869592e-05, + "loss": 0.8385, "step": 885 }, { "epoch": 0.2142513240250361, - "grad_norm": 1.390625, - "learning_rate": 0.00022481771256741695, - "loss": 0.7665, + "grad_norm": 1.9453125, + "learning_rate": 8.963849381257561e-05, + "loss": 0.8248, "step": 890 }, { "epoch": 0.21545498314877226, - "grad_norm": 1.6015625, - "learning_rate": 0.0002247946172073167, - "loss": 0.7706, + "grad_norm": 2.25, + "learning_rate": 8.962928531530108e-05, + "loss": 0.8327, "step": 895 }, { "epoch": 0.21665864227250842, - "grad_norm": 1.375, - "learning_rate": 0.0002247713891552821, - "loss": 0.7578, + "grad_norm": 2.0, + "learning_rate": 8.962002391158514e-05, + "loss": 0.8131, "step": 900 }, { "epoch": 0.21786230139624457, - "grad_norm": 1.6640625, - "learning_rate": 0.00022474802844842562, - "loss": 0.7915, + "grad_norm": 2.359375, + "learning_rate": 8.961070961622513e-05, + "loss": 0.8587, "step": 905 }, { "epoch": 0.21906596051998073, - "grad_norm": 1.6328125, - "learning_rate": 0.00022472453512407164, - "loss": 0.7835, + "grad_norm": 2.25, + "learning_rate": 8.960134244410293e-05, + "loss": 0.8407, "step": 910 }, { "epoch": 0.2202696196437169, - "grad_norm": 1.4921875, - "learning_rate": 0.00022470090921975652, - "loss": 0.758, + "grad_norm": 2.125, + "learning_rate": 8.959192241018484e-05, + "loss": 0.8187, "step": 915 }, { "epoch": 0.22147327876745307, - "grad_norm": 1.515625, - "learning_rate": 0.00022467715077322835, - "loss": 0.7823, + "grad_norm": 2.28125, + "learning_rate": 8.958244952952172e-05, + "loss": 0.8333, "step": 920 }, { "epoch": 0.22267693789118922, - "grad_norm": 5.375, - "learning_rate": 0.0002246532598224471, - "loss": 0.7762, + "grad_norm": 2.34375, + "learning_rate": 8.957292381724875e-05, + "loss": 0.8425, "step": 925 }, { "epoch": 0.22388059701492538, - "grad_norm": 1.5859375, - "learning_rate": 0.00022462923640558435, - "loss": 0.7705, + "grad_norm": 2.671875, + "learning_rate": 8.956334528858562e-05, + "loss": 0.8347, "step": 930 }, { "epoch": 0.22508425613866154, - "grad_norm": 1.515625, - "learning_rate": 0.0002246050805610233, - "loss": 0.7756, + "grad_norm": 2.46875, + "learning_rate": 8.955371395883631e-05, + "loss": 0.8376, "step": 935 }, { "epoch": 0.2262879152623977, - "grad_norm": 1.59375, - "learning_rate": 0.0002245807923273589, - "loss": 0.7863, + "grad_norm": 2.34375, + "learning_rate": 8.954402984338926e-05, + "loss": 0.8411, "step": 940 }, { "epoch": 0.22749157438613385, - "grad_norm": 1.5859375, - "learning_rate": 0.00022455637174339748, - "loss": 0.7772, + "grad_norm": 2.171875, + "learning_rate": 8.953429295771718e-05, + "loss": 0.8334, "step": 945 }, { "epoch": 0.22869523350987, - "grad_norm": 1.5234375, - "learning_rate": 0.0002245318188481569, - "loss": 0.8, + "grad_norm": 2.09375, + "learning_rate": 8.952450331737715e-05, + "loss": 0.8596, "step": 950 }, { "epoch": 0.22989889263360616, - "grad_norm": 1.5390625, - "learning_rate": 0.0002245071336808663, - "loss": 0.7471, + "grad_norm": 2.140625, + "learning_rate": 8.951466093801045e-05, + "loss": 0.8167, "step": 955 }, { "epoch": 0.23110255175734232, - "grad_norm": 1.578125, - "learning_rate": 0.00022448231628096634, - "loss": 0.7658, + "grad_norm": 2.078125, + "learning_rate": 8.950476583534274e-05, + "loss": 0.824, "step": 960 }, { "epoch": 0.23230621088107847, - "grad_norm": 1.578125, - "learning_rate": 0.00022445736668810887, - "loss": 0.7779, + "grad_norm": 1.984375, + "learning_rate": 8.949481802518383e-05, + "loss": 0.8369, "step": 965 }, { "epoch": 0.23350987000481463, - "grad_norm": 1.3828125, - "learning_rate": 0.00022443228494215686, - "loss": 0.7688, + "grad_norm": 2.375, + "learning_rate": 8.948481752342775e-05, + "loss": 0.8276, "step": 970 }, { "epoch": 0.23471352912855079, - "grad_norm": 1.5625, - "learning_rate": 0.0002244070710831846, - "loss": 0.7415, + "grad_norm": 1.96875, + "learning_rate": 8.947476434605276e-05, + "loss": 0.7982, "step": 975 }, { "epoch": 0.23591718825228694, - "grad_norm": 1.4375, - "learning_rate": 0.0002243817251514773, - "loss": 0.7458, + "grad_norm": 2.09375, + "learning_rate": 8.946465850912128e-05, + "loss": 0.8048, "step": 980 }, { "epoch": 0.2371208473760231, - "grad_norm": 1.421875, - "learning_rate": 0.0002243562471875313, - "loss": 0.7442, + "grad_norm": 1.953125, + "learning_rate": 8.945450002877982e-05, + "loss": 0.8068, "step": 985 }, { "epoch": 0.23832450649975928, - "grad_norm": 1.4921875, - "learning_rate": 0.00022433063723205387, - "loss": 0.7452, + "grad_norm": 2.09375, + "learning_rate": 8.944428892125902e-05, + "loss": 0.8032, "step": 990 }, { "epoch": 0.23952816562349544, - "grad_norm": 1.4765625, - "learning_rate": 0.00022430489532596312, - "loss": 0.762, + "grad_norm": 1.96875, + "learning_rate": 8.943402520287363e-05, + "loss": 0.8195, "step": 995 }, { "epoch": 0.2407318247472316, - "grad_norm": 1.484375, - "learning_rate": 0.000224279021510388, - "loss": 0.7488, + "grad_norm": 2.15625, + "learning_rate": 8.942370889002242e-05, + "loss": 0.805, "step": 1000 }, { "epoch": 0.2407318247472316, - "eval_loss": 0.6652006506919861, - "eval_runtime": 2.3217, - "eval_samples_per_second": 86.142, - "eval_steps_per_second": 86.142, + "eval_loss": 0.71897292137146, + "eval_runtime": 2.3756, + "eval_samples_per_second": 84.188, + "eval_steps_per_second": 84.188, "step": 1000 }, { "epoch": 0.24193548387096775, - "grad_norm": 1.5625, - "learning_rate": 0.00022425301582666831, - "loss": 0.7763, + "grad_norm": 1.984375, + "learning_rate": 8.941333999918825e-05, + "loss": 0.8449, "step": 1005 }, { "epoch": 0.2431391429947039, - "grad_norm": 1.4609375, - "learning_rate": 0.0002242268783163544, - "loss": 0.7596, + "grad_norm": 2.109375, + "learning_rate": 8.940291854693792e-05, + "loss": 0.8147, "step": 1010 }, { "epoch": 0.24434280211844006, - "grad_norm": 1.40625, - "learning_rate": 0.00022420060902120735, - "loss": 0.7576, + "grad_norm": 2.125, + "learning_rate": 8.939244454992225e-05, + "loss": 0.8259, "step": 1015 }, { "epoch": 0.24554646124217622, - "grad_norm": 1.53125, - "learning_rate": 0.00022417420798319872, - "loss": 0.7307, + "grad_norm": 2.1875, + "learning_rate": 8.938191802487601e-05, + "loss": 0.7954, "step": 1020 }, { "epoch": 0.24675012036591237, - "grad_norm": 1.3515625, - "learning_rate": 0.00022414767524451065, - "loss": 0.7232, + "grad_norm": 1.9765625, + "learning_rate": 8.937133898861787e-05, + "loss": 0.7835, "step": 1025 }, { "epoch": 0.24795377948964853, - "grad_norm": 1.4296875, - "learning_rate": 0.00022412101084753557, - "loss": 0.722, + "grad_norm": 2.296875, + "learning_rate": 8.936070745805042e-05, + "loss": 0.7812, "step": 1030 }, { "epoch": 0.24915743861338469, - "grad_norm": 1.578125, - "learning_rate": 0.00022409421483487644, - "loss": 0.7703, + "grad_norm": 2.609375, + "learning_rate": 8.935002345016012e-05, + "loss": 0.8295, "step": 1035 }, { "epoch": 0.25036109773712084, - "grad_norm": 1.53125, - "learning_rate": 0.0002240672872493464, - "loss": 0.7678, + "grad_norm": 2.0, + "learning_rate": 8.93392869820173e-05, + "loss": 0.8214, "step": 1040 }, { "epoch": 0.251564756860857, - "grad_norm": 1.53125, - "learning_rate": 0.0002240402281339688, - "loss": 0.7079, + "grad_norm": 2.5, + "learning_rate": 8.932849807077607e-05, + "loss": 0.7638, "step": 1045 }, { "epoch": 0.25276841598459315, - "grad_norm": 1.4453125, - "learning_rate": 0.00022401303753197716, - "loss": 0.7451, + "grad_norm": 2.140625, + "learning_rate": 8.93176567336743e-05, + "loss": 0.806, "step": 1050 }, { "epoch": 0.2539720751083293, - "grad_norm": 1.546875, - "learning_rate": 0.00022398571548681517, - "loss": 0.7027, + "grad_norm": 2.21875, + "learning_rate": 8.930676298803376e-05, + "loss": 0.7615, "step": 1055 }, { "epoch": 0.25517573423206547, - "grad_norm": 1.6484375, - "learning_rate": 0.00022395826204213635, - "loss": 0.7702, + "grad_norm": 2.15625, + "learning_rate": 8.929581685125977e-05, + "loss": 0.8387, "step": 1060 }, { "epoch": 0.2563793933558016, - "grad_norm": 1.3203125, - "learning_rate": 0.00022393067724180436, - "loss": 0.7592, + "grad_norm": 2.03125, + "learning_rate": 8.928481834084152e-05, + "loss": 0.8256, "step": 1065 }, { "epoch": 0.2575830524795378, - "grad_norm": 1.6015625, - "learning_rate": 0.00022390296112989258, - "loss": 0.7439, + "grad_norm": 2.0625, + "learning_rate": 8.927376747435178e-05, + "loss": 0.8051, "step": 1070 }, { "epoch": 0.25878671160327393, - "grad_norm": 1.484375, - "learning_rate": 0.00022387511375068425, - "loss": 0.7797, + "grad_norm": 2.15625, + "learning_rate": 8.926266426944698e-05, + "loss": 0.8416, "step": 1075 }, { "epoch": 0.2599903707270101, - "grad_norm": 1.59375, - "learning_rate": 0.0002238471351486724, - "loss": 0.8081, + "grad_norm": 2.234375, + "learning_rate": 8.925150874386724e-05, + "loss": 0.8645, "step": 1080 }, { "epoch": 0.26119402985074625, - "grad_norm": 1.6640625, - "learning_rate": 0.00022381902536855957, - "loss": 0.7138, + "grad_norm": 2.28125, + "learning_rate": 8.92403009154362e-05, + "loss": 0.777, "step": 1085 }, { "epoch": 0.2623976889744824, - "grad_norm": 1.5, - "learning_rate": 0.00022379078445525807, - "loss": 0.7482, + "grad_norm": 2.28125, + "learning_rate": 8.922904080206111e-05, + "loss": 0.8047, "step": 1090 }, { "epoch": 0.26360134809821856, - "grad_norm": 1.5859375, - "learning_rate": 0.0002237624124538896, - "loss": 0.7365, + "grad_norm": 2.09375, + "learning_rate": 8.921772842173273e-05, + "loss": 0.7942, "step": 1095 }, { "epoch": 0.26480500722195477, - "grad_norm": 1.40625, - "learning_rate": 0.00022373390940978537, - "loss": 0.7442, + "grad_norm": 2.171875, + "learning_rate": 8.920636379252537e-05, + "loss": 0.8058, "step": 1100 }, { "epoch": 0.2660086663456909, - "grad_norm": 1.421875, - "learning_rate": 0.00022370527536848592, - "loss": 0.7241, + "grad_norm": 2.015625, + "learning_rate": 8.919494693259677e-05, + "loss": 0.7837, "step": 1105 }, { "epoch": 0.2672123254694271, - "grad_norm": 1.4296875, - "learning_rate": 0.00022367651037574106, - "loss": 0.6941, + "grad_norm": 2.109375, + "learning_rate": 8.918347786018815e-05, + "loss": 0.7573, "step": 1110 }, { "epoch": 0.26841598459316324, - "grad_norm": 1.5078125, - "learning_rate": 0.00022364761447751002, - "loss": 0.7523, + "grad_norm": 2.28125, + "learning_rate": 8.917195659362415e-05, + "loss": 0.8122, "step": 1115 }, { "epoch": 0.2696196437168994, - "grad_norm": 1.65625, - "learning_rate": 0.00022361858771996086, - "loss": 0.7132, + "grad_norm": 2.1875, + "learning_rate": 8.916038315131281e-05, + "loss": 0.7647, "step": 1120 }, { "epoch": 0.27082330284063555, - "grad_norm": 1.484375, - "learning_rate": 0.00022358943014947098, - "loss": 0.7609, + "grad_norm": 2.109375, + "learning_rate": 8.91487575517455e-05, + "loss": 0.8169, "step": 1125 }, { "epoch": 0.2720269619643717, - "grad_norm": 1.546875, - "learning_rate": 0.00022356014181262673, - "loss": 0.7449, + "grad_norm": 2.21875, + "learning_rate": 8.913707981349698e-05, + "loss": 0.8063, "step": 1130 }, { "epoch": 0.27323062108810786, - "grad_norm": 1.4453125, - "learning_rate": 0.00022353072275622333, - "loss": 0.7734, + "grad_norm": 2.15625, + "learning_rate": 8.912534995522525e-05, + "loss": 0.8276, "step": 1135 }, { "epoch": 0.274434280211844, - "grad_norm": 1.484375, - "learning_rate": 0.00022350117302726488, - "loss": 0.7446, + "grad_norm": 2.09375, + "learning_rate": 8.911356799567165e-05, + "loss": 0.8053, "step": 1140 }, { "epoch": 0.2756379393355802, - "grad_norm": 1.453125, - "learning_rate": 0.00022347149267296432, - "loss": 0.7568, + "grad_norm": 2.21875, + "learning_rate": 8.91017339536607e-05, + "loss": 0.8152, "step": 1145 }, { "epoch": 0.27684159845931633, - "grad_norm": 1.453125, - "learning_rate": 0.00022344168174074318, - "loss": 0.7767, + "grad_norm": 2.09375, + "learning_rate": 8.908984784810019e-05, + "loss": 0.8254, "step": 1150 }, { "epoch": 0.2780452575830525, - "grad_norm": 1.5078125, - "learning_rate": 0.00022341174027823172, - "loss": 0.7546, + "grad_norm": 2.109375, + "learning_rate": 8.907790969798105e-05, + "loss": 0.799, "step": 1155 }, { "epoch": 0.27924891670678864, - "grad_norm": 1.484375, - "learning_rate": 0.00022338166833326875, - "loss": 0.7682, + "grad_norm": 2.015625, + "learning_rate": 8.906591952237739e-05, + "loss": 0.8217, "step": 1160 }, { "epoch": 0.2804525758305248, - "grad_norm": 1.40625, - "learning_rate": 0.0002233514659539015, - "loss": 0.7089, + "grad_norm": 2.125, + "learning_rate": 8.905387734044643e-05, + "loss": 0.7634, "step": 1165 }, { "epoch": 0.28165623495426095, - "grad_norm": 1.609375, - "learning_rate": 0.00022332113318838563, - "loss": 0.7438, + "grad_norm": 2.25, + "learning_rate": 8.904178317142851e-05, + "loss": 0.7971, "step": 1170 }, { "epoch": 0.2828598940779971, - "grad_norm": 1.3046875, - "learning_rate": 0.0002232906700851851, - "loss": 0.738, + "grad_norm": 2.0, + "learning_rate": 8.9029637034647e-05, + "loss": 0.7895, "step": 1175 }, { "epoch": 0.28406355320173327, - "grad_norm": 1.4765625, - "learning_rate": 0.0002232600766929722, - "loss": 0.7533, + "grad_norm": 2.234375, + "learning_rate": 8.901743894950831e-05, + "loss": 0.8219, "step": 1180 }, { "epoch": 0.2852672123254694, - "grad_norm": 1.4375, - "learning_rate": 0.00022322935306062726, - "loss": 0.7373, + "grad_norm": 2.09375, + "learning_rate": 8.900518893550185e-05, + "loss": 0.7912, "step": 1185 }, { "epoch": 0.2864708714492056, - "grad_norm": 1.4765625, - "learning_rate": 0.00022319849923723884, - "loss": 0.7656, + "grad_norm": 2.0, + "learning_rate": 8.899288701220002e-05, + "loss": 0.8243, "step": 1190 }, { "epoch": 0.28767453057294173, - "grad_norm": 1.4140625, - "learning_rate": 0.0002231675152721034, - "loss": 0.7396, + "grad_norm": 1.9296875, + "learning_rate": 8.898053319925812e-05, + "loss": 0.7997, "step": 1195 }, { "epoch": 0.2888781896966779, - "grad_norm": 1.3984375, - "learning_rate": 0.00022313640121472532, - "loss": 0.747, + "grad_norm": 2.09375, + "learning_rate": 8.896812751641437e-05, + "loss": 0.8069, "step": 1200 }, { "epoch": 0.29008184882041405, - "grad_norm": 1.265625, - "learning_rate": 0.00022310515711481698, - "loss": 0.7081, + "grad_norm": 1.90625, + "learning_rate": 8.89556699834899e-05, + "loss": 0.7625, "step": 1205 }, { "epoch": 0.2912855079441502, - "grad_norm": 1.515625, - "learning_rate": 0.0002230737830222984, - "loss": 0.7457, + "grad_norm": 2.140625, + "learning_rate": 8.894316062038863e-05, + "loss": 0.8064, "step": 1210 }, { "epoch": 0.29248916706788636, - "grad_norm": 1.4296875, - "learning_rate": 0.00022304227898729739, - "loss": 0.7428, + "grad_norm": 2.234375, + "learning_rate": 8.893059944709735e-05, + "loss": 0.8068, "step": 1215 }, { "epoch": 0.2936928261916225, - "grad_norm": 1.4453125, - "learning_rate": 0.00022301064506014922, - "loss": 0.7479, + "grad_norm": 2.0625, + "learning_rate": 8.891798648368554e-05, + "loss": 0.8003, "step": 1220 }, { "epoch": 0.29489648531535867, - "grad_norm": 1.3515625, - "learning_rate": 0.00022297888129139685, - "loss": 0.7351, + "grad_norm": 2.1875, + "learning_rate": 8.890532175030552e-05, + "loss": 0.7926, "step": 1225 }, { "epoch": 0.2961001444390948, - "grad_norm": 1.453125, - "learning_rate": 0.00022294698773179066, - "loss": 0.7447, + "grad_norm": 2.03125, + "learning_rate": 8.889260526719228e-05, + "loss": 0.8027, "step": 1230 }, { "epoch": 0.297303803562831, - "grad_norm": 1.46875, - "learning_rate": 0.00022291496443228834, - "loss": 0.7168, + "grad_norm": 2.40625, + "learning_rate": 8.88798370546635e-05, + "loss": 0.7732, "step": 1235 }, { "epoch": 0.29850746268656714, - "grad_norm": 1.421875, - "learning_rate": 0.000222882811444055, - "loss": 0.7135, + "grad_norm": 2.109375, + "learning_rate": 8.886701713311951e-05, + "loss": 0.7772, "step": 1240 }, { "epoch": 0.29971112181030335, - "grad_norm": 1.4140625, - "learning_rate": 0.00022285052881846276, - "loss": 0.731, + "grad_norm": 2.0625, + "learning_rate": 8.885414552304327e-05, + "loss": 0.7921, "step": 1245 }, { "epoch": 0.3009147809340395, - "grad_norm": 1.484375, - "learning_rate": 0.00022281811660709101, - "loss": 0.7351, + "grad_norm": 2.28125, + "learning_rate": 8.884122224500027e-05, + "loss": 0.7951, "step": 1250 }, { "epoch": 0.30211844005777566, - "grad_norm": 1.5078125, - "learning_rate": 0.0002227855748617262, - "loss": 0.7266, + "grad_norm": 2.109375, + "learning_rate": 8.882824731963864e-05, + "loss": 0.7868, "step": 1255 }, { "epoch": 0.3033220991815118, - "grad_norm": 1.3671875, - "learning_rate": 0.00022275290363436167, - "loss": 0.7304, + "grad_norm": 1.9765625, + "learning_rate": 8.881522076768897e-05, + "loss": 0.7935, "step": 1260 }, { "epoch": 0.304525758305248, - "grad_norm": 1.4140625, - "learning_rate": 0.00022272010297719766, - "loss": 0.7046, + "grad_norm": 2.046875, + "learning_rate": 8.880214260996436e-05, + "loss": 0.7652, "step": 1265 }, { "epoch": 0.30572941742898413, - "grad_norm": 1.3515625, - "learning_rate": 0.00022268717294264122, - "loss": 0.7296, + "grad_norm": 2.125, + "learning_rate": 8.878901286736033e-05, + "loss": 0.7921, "step": 1270 }, { "epoch": 0.3069330765527203, - "grad_norm": 1.28125, - "learning_rate": 0.0002226541135833061, - "loss": 0.7276, + "grad_norm": 1.9921875, + "learning_rate": 8.877583156085487e-05, + "loss": 0.7832, "step": 1275 }, { "epoch": 0.30813673567645644, - "grad_norm": 1.34375, - "learning_rate": 0.0002226209249520127, - "loss": 0.7066, + "grad_norm": 1.953125, + "learning_rate": 8.876259871150831e-05, + "loss": 0.7663, "step": 1280 }, { "epoch": 0.3093403948001926, - "grad_norm": 1.3828125, - "learning_rate": 0.0002225876071017879, - "loss": 0.7438, + "grad_norm": 2.234375, + "learning_rate": 8.874931434046337e-05, + "loss": 0.8072, "step": 1285 }, { "epoch": 0.31054405392392875, - "grad_norm": 1.59375, - "learning_rate": 0.00022255416008586513, - "loss": 0.7323, + "grad_norm": 2.265625, + "learning_rate": 8.873597846894504e-05, + "loss": 0.7885, "step": 1290 }, { "epoch": 0.3117477130476649, - "grad_norm": 1.296875, - "learning_rate": 0.00022252058395768413, - "loss": 0.7475, + "grad_norm": 2.09375, + "learning_rate": 8.872259111826064e-05, + "loss": 0.8007, "step": 1295 }, { "epoch": 0.31295137217140107, - "grad_norm": 1.453125, - "learning_rate": 0.00022248687877089092, - "loss": 0.7016, + "grad_norm": 2.234375, + "learning_rate": 8.870915230979974e-05, + "loss": 0.7605, "step": 1300 }, { "epoch": 0.3141550312951372, - "grad_norm": 1.6484375, - "learning_rate": 0.0002224530445793378, - "loss": 0.7138, + "grad_norm": 2.234375, + "learning_rate": 8.869566206503408e-05, + "loss": 0.7647, "step": 1305 }, { "epoch": 0.3153586904188734, - "grad_norm": 1.6015625, - "learning_rate": 0.00022241908143708308, - "loss": 0.7283, + "grad_norm": 2.234375, + "learning_rate": 8.868212040551763e-05, + "loss": 0.779, "step": 1310 }, { "epoch": 0.31656234954260953, - "grad_norm": 1.5078125, - "learning_rate": 0.0002223849893983912, - "loss": 0.753, + "grad_norm": 2.203125, + "learning_rate": 8.86685273528865e-05, + "loss": 0.8074, "step": 1315 }, { "epoch": 0.3177660086663457, - "grad_norm": 1.46875, - "learning_rate": 0.00022235076851773248, - "loss": 0.7342, + "grad_norm": 2.09375, + "learning_rate": 8.865488292885889e-05, + "loss": 0.7954, "step": 1320 }, { "epoch": 0.31896966779008185, - "grad_norm": 1.5546875, - "learning_rate": 0.00022231641884978314, - "loss": 0.7422, + "grad_norm": 2.0625, + "learning_rate": 8.86411871552351e-05, + "loss": 0.7933, "step": 1325 }, { "epoch": 0.320173326913818, - "grad_norm": 1.3984375, - "learning_rate": 0.00022228194044942505, - "loss": 0.7195, + "grad_norm": 2.15625, + "learning_rate": 8.862744005389743e-05, + "loss": 0.7763, "step": 1330 }, { "epoch": 0.32137698603755416, - "grad_norm": 1.515625, - "learning_rate": 0.00022224733337174597, - "loss": 0.7221, + "grad_norm": 2.078125, + "learning_rate": 8.861364164681028e-05, + "loss": 0.7825, "step": 1335 }, { "epoch": 0.3225806451612903, - "grad_norm": 1.328125, - "learning_rate": 0.0002222125976720391, - "loss": 0.6976, + "grad_norm": 1.953125, + "learning_rate": 8.859979195601994e-05, + "loss": 0.7588, "step": 1340 }, { "epoch": 0.32378430428502647, - "grad_norm": 1.3203125, - "learning_rate": 0.00022217773340580315, - "loss": 0.6882, + "grad_norm": 2.140625, + "learning_rate": 8.858589100365467e-05, + "loss": 0.7538, "step": 1345 }, { "epoch": 0.3249879634087626, - "grad_norm": 1.453125, - "learning_rate": 0.00022214274062874232, - "loss": 0.6822, + "grad_norm": 2.21875, + "learning_rate": 8.85719388119246e-05, + "loss": 0.7476, "step": 1350 }, { "epoch": 0.3261916225324988, - "grad_norm": 1.4375, - "learning_rate": 0.00022210761939676606, - "loss": 0.7242, + "grad_norm": 2.28125, + "learning_rate": 8.855793540312181e-05, + "loss": 0.7811, "step": 1355 }, { "epoch": 0.32739528165623494, - "grad_norm": 1.4296875, - "learning_rate": 0.00022207236976598917, - "loss": 0.7118, + "grad_norm": 2.0625, + "learning_rate": 8.854388079962011e-05, + "loss": 0.7617, "step": 1360 }, { "epoch": 0.3285989407799711, - "grad_norm": 1.328125, - "learning_rate": 0.00022203699179273144, - "loss": 0.7575, + "grad_norm": 2.359375, + "learning_rate": 8.852977502387519e-05, + "loss": 0.815, "step": 1365 }, { "epoch": 0.32980259990370725, - "grad_norm": 1.3046875, - "learning_rate": 0.00022200148553351781, - "loss": 0.6682, + "grad_norm": 1.9296875, + "learning_rate": 8.851561809842441e-05, + "loss": 0.7263, "step": 1370 }, { "epoch": 0.3310062590274434, - "grad_norm": 1.4375, - "learning_rate": 0.00022196585104507823, - "loss": 0.7372, + "grad_norm": 2.0625, + "learning_rate": 8.850141004588697e-05, + "loss": 0.7966, "step": 1375 }, { "epoch": 0.33220991815117956, - "grad_norm": 1.4296875, - "learning_rate": 0.00022193008838434746, - "loss": 0.6879, + "grad_norm": 2.25, + "learning_rate": 8.848715088896367e-05, + "loss": 0.7457, "step": 1380 }, { "epoch": 0.3334135772749157, - "grad_norm": 1.1875, - "learning_rate": 0.00022189419760846503, - "loss": 0.6823, + "grad_norm": 1.9453125, + "learning_rate": 8.847284065043699e-05, + "loss": 0.7447, "step": 1385 }, { "epoch": 0.33461723639865193, - "grad_norm": 1.3984375, - "learning_rate": 0.00022185817877477525, - "loss": 0.7183, + "grad_norm": 2.1875, + "learning_rate": 8.845847935317101e-05, + "loss": 0.7775, "step": 1390 }, { "epoch": 0.3358208955223881, - "grad_norm": 1.34375, - "learning_rate": 0.00022182203194082693, - "loss": 0.7239, + "grad_norm": 2.09375, + "learning_rate": 8.84440670201114e-05, + "loss": 0.7815, "step": 1395 }, { "epoch": 0.33702455464612424, - "grad_norm": 1.234375, - "learning_rate": 0.0002217857571643735, - "loss": 0.7383, + "grad_norm": 2.15625, + "learning_rate": 8.84296036742854e-05, + "loss": 0.7952, "step": 1400 }, { "epoch": 0.3382282137698604, - "grad_norm": 1.3984375, - "learning_rate": 0.0002217493545033727, - "loss": 0.6791, + "grad_norm": 2.109375, + "learning_rate": 8.84150893388017e-05, + "loss": 0.7406, "step": 1405 }, { "epoch": 0.33943187289359655, - "grad_norm": 1.3515625, - "learning_rate": 0.0002217128240159867, - "loss": 0.7147, + "grad_norm": 1.875, + "learning_rate": 8.84005240368505e-05, + "loss": 0.7686, "step": 1410 }, { "epoch": 0.3406355320173327, - "grad_norm": 1.4296875, - "learning_rate": 0.00022167616576058183, - "loss": 0.7212, + "grad_norm": 2.171875, + "learning_rate": 8.838590779170342e-05, + "loss": 0.7738, "step": 1415 }, { "epoch": 0.34183919114106887, - "grad_norm": 1.609375, - "learning_rate": 0.00022163937979572857, - "loss": 0.7317, + "grad_norm": 2.0625, + "learning_rate": 8.837124062671349e-05, + "loss": 0.7792, "step": 1420 }, { "epoch": 0.343042850264805, - "grad_norm": 1.375, - "learning_rate": 0.00022160246618020145, - "loss": 0.7324, + "grad_norm": 2.078125, + "learning_rate": 8.835652256531503e-05, + "loss": 0.7946, "step": 1425 }, { "epoch": 0.3442465093885412, - "grad_norm": 1.53125, - "learning_rate": 0.000221565424972979, - "loss": 0.7112, + "grad_norm": 2.25, + "learning_rate": 8.834175363102377e-05, + "loss": 0.7637, "step": 1430 }, { "epoch": 0.34545016851227733, - "grad_norm": 1.3359375, - "learning_rate": 0.0002215282562332436, - "loss": 0.7241, + "grad_norm": 1.921875, + "learning_rate": 8.832693384743668e-05, + "loss": 0.7822, "step": 1435 }, { "epoch": 0.3466538276360135, - "grad_norm": 1.3515625, - "learning_rate": 0.00022149096002038133, - "loss": 0.7053, + "grad_norm": 2.0, + "learning_rate": 8.831206323823197e-05, + "loss": 0.7579, "step": 1440 }, { "epoch": 0.34785748675974965, - "grad_norm": 1.3125, - "learning_rate": 0.00022145353639398197, - "loss": 0.6724, + "grad_norm": 1.96875, + "learning_rate": 8.829714182716905e-05, + "loss": 0.7308, "step": 1445 }, { "epoch": 0.3490611458834858, - "grad_norm": 1.3515625, - "learning_rate": 0.00022141598541383889, - "loss": 0.6842, + "grad_norm": 2.046875, + "learning_rate": 8.828216963808852e-05, + "loss": 0.7483, "step": 1450 }, { "epoch": 0.35026480500722196, - "grad_norm": 1.3125, - "learning_rate": 0.0002213783071399489, - "loss": 0.6795, + "grad_norm": 2.015625, + "learning_rate": 8.826714669491209e-05, + "loss": 0.7353, "step": 1455 }, { "epoch": 0.3514684641309581, - "grad_norm": 1.3828125, - "learning_rate": 0.0002213405016325123, - "loss": 0.6918, + "grad_norm": 2.28125, + "learning_rate": 8.825207302164259e-05, + "loss": 0.7453, "step": 1460 }, { "epoch": 0.35267212325469427, - "grad_norm": 1.34375, - "learning_rate": 0.00022130256895193254, - "loss": 0.7117, + "grad_norm": 2.09375, + "learning_rate": 8.823694864236388e-05, + "loss": 0.7669, "step": 1465 }, { "epoch": 0.3538757823784304, - "grad_norm": 1.3671875, - "learning_rate": 0.0002212645091588163, - "loss": 0.7424, + "grad_norm": 2.09375, + "learning_rate": 8.822177358124082e-05, + "loss": 0.7977, "step": 1470 }, { "epoch": 0.3550794415021666, - "grad_norm": 1.3046875, - "learning_rate": 0.00022122632231397346, - "loss": 0.7206, + "grad_norm": 2.046875, + "learning_rate": 8.82065478625193e-05, + "loss": 0.7784, "step": 1475 }, { "epoch": 0.35628310062590274, - "grad_norm": 1.3359375, - "learning_rate": 0.0002211880084784167, - "loss": 0.713, + "grad_norm": 2.0, + "learning_rate": 8.819127151052608e-05, + "loss": 0.7662, "step": 1480 }, { "epoch": 0.3574867597496389, - "grad_norm": 1.359375, - "learning_rate": 0.00022114956771336177, - "loss": 0.694, + "grad_norm": 2.203125, + "learning_rate": 8.817594454966887e-05, + "loss": 0.7519, "step": 1485 }, { "epoch": 0.35869041887337505, - "grad_norm": 1.3515625, - "learning_rate": 0.0002211110000802272, - "loss": 0.6787, + "grad_norm": 2.03125, + "learning_rate": 8.816056700443625e-05, + "loss": 0.7348, "step": 1490 }, { "epoch": 0.3598940779971112, - "grad_norm": 1.3515625, - "learning_rate": 0.00022107230564063409, - "loss": 0.6888, + "grad_norm": 1.9140625, + "learning_rate": 8.814513889939756e-05, + "loss": 0.7456, "step": 1495 }, { "epoch": 0.36109773712084736, - "grad_norm": 1.609375, - "learning_rate": 0.00022103348445640626, - "loss": 0.6819, + "grad_norm": 2.140625, + "learning_rate": 8.812966025920297e-05, + "loss": 0.7408, "step": 1500 }, { "epoch": 0.36109773712084736, - "eval_loss": 0.610694944858551, - "eval_runtime": 2.3925, - "eval_samples_per_second": 83.596, - "eval_steps_per_second": 83.596, + "eval_loss": 0.656117856502533, + "eval_runtime": 2.3795, + "eval_samples_per_second": 84.052, + "eval_steps_per_second": 84.052, "step": 1500 }, { "epoch": 0.3623013962445835, - "grad_norm": 1.4375, - "learning_rate": 0.00022099453658957005, - "loss": 0.7186, + "grad_norm": 2.125, + "learning_rate": 8.811413110858338e-05, + "loss": 0.7699, "step": 1505 }, { "epoch": 0.3635050553683197, - "grad_norm": 1.3671875, - "learning_rate": 0.00022095546210235416, - "loss": 0.7024, + "grad_norm": 2.015625, + "learning_rate": 8.809855147235039e-05, + "loss": 0.7545, "step": 1510 }, { "epoch": 0.36470871449205583, - "grad_norm": 1.421875, - "learning_rate": 0.00022091626105718955, - "loss": 0.7064, + "grad_norm": 1.9609375, + "learning_rate": 8.808292137539627e-05, + "loss": 0.7709, "step": 1515 }, { "epoch": 0.365912373615792, - "grad_norm": 1.3671875, - "learning_rate": 0.0002208769335167095, - "loss": 0.676, + "grad_norm": 2.0625, + "learning_rate": 8.80672408426939e-05, + "loss": 0.7391, "step": 1520 }, { "epoch": 0.36711603273952814, - "grad_norm": 1.2890625, - "learning_rate": 0.0002208374795437493, - "loss": 0.6713, + "grad_norm": 2.125, + "learning_rate": 8.805150989929675e-05, + "loss": 0.7249, "step": 1525 }, { "epoch": 0.3683196918632643, - "grad_norm": 1.4140625, - "learning_rate": 0.0002207978992013463, - "loss": 0.734, + "grad_norm": 1.96875, + "learning_rate": 8.803572857033885e-05, + "loss": 0.7786, "step": 1530 }, { "epoch": 0.36952335098700045, - "grad_norm": 1.6328125, - "learning_rate": 0.00022075819255273977, - "loss": 0.7283, + "grad_norm": 2.203125, + "learning_rate": 8.80198968810347e-05, + "loss": 0.7806, "step": 1535 }, { "epoch": 0.37072701011073667, - "grad_norm": 1.5, - "learning_rate": 0.00022071835966137068, - "loss": 0.7184, + "grad_norm": 2.25, + "learning_rate": 8.800401485667929e-05, + "loss": 0.7754, "step": 1540 }, { "epoch": 0.3719306692344728, - "grad_norm": 1.3828125, - "learning_rate": 0.00022067840059088187, - "loss": 0.6937, + "grad_norm": 2.203125, + "learning_rate": 8.798808252264803e-05, + "loss": 0.7467, "step": 1545 }, { "epoch": 0.373134328358209, - "grad_norm": 1.3125, - "learning_rate": 0.0002206383154051176, - "loss": 0.6888, + "grad_norm": 2.03125, + "learning_rate": 8.797209990439668e-05, + "loss": 0.7485, "step": 1550 }, { "epoch": 0.37433798748194513, - "grad_norm": 1.359375, - "learning_rate": 0.00022059810416812377, - "loss": 0.7083, + "grad_norm": 2.03125, + "learning_rate": 8.795606702746139e-05, + "loss": 0.7639, "step": 1555 }, { "epoch": 0.3755416466056813, - "grad_norm": 1.4921875, - "learning_rate": 0.00022055776694414767, - "loss": 0.7063, + "grad_norm": 2.0, + "learning_rate": 8.79399839174586e-05, + "loss": 0.7583, "step": 1560 }, { "epoch": 0.37674530572941745, - "grad_norm": 1.3984375, - "learning_rate": 0.00022051730379763778, - "loss": 0.6938, + "grad_norm": 2.296875, + "learning_rate": 8.792385060008496e-05, + "loss": 0.7499, "step": 1565 }, { "epoch": 0.3779489648531536, - "grad_norm": 1.3359375, - "learning_rate": 0.00022047671479324385, - "loss": 0.7266, + "grad_norm": 2.140625, + "learning_rate": 8.79076671011174e-05, + "loss": 0.7848, "step": 1570 }, { "epoch": 0.37915262397688976, - "grad_norm": 1.4296875, - "learning_rate": 0.00022043599999581673, - "loss": 0.7155, + "grad_norm": 2.140625, + "learning_rate": 8.789143344641301e-05, + "loss": 0.7678, "step": 1575 }, { "epoch": 0.3803562831006259, - "grad_norm": 1.3046875, - "learning_rate": 0.00022039515947040817, - "loss": 0.6873, + "grad_norm": 1.9375, + "learning_rate": 8.787514966190897e-05, + "loss": 0.7457, "step": 1580 }, { "epoch": 0.38155994222436207, - "grad_norm": 1.2734375, - "learning_rate": 0.0002203541932822709, - "loss": 0.6876, + "grad_norm": 2.015625, + "learning_rate": 8.785881577362264e-05, + "loss": 0.7564, "step": 1585 }, { "epoch": 0.3827636013480982, - "grad_norm": 1.3515625, - "learning_rate": 0.00022031310149685842, - "loss": 0.7067, + "grad_norm": 2.125, + "learning_rate": 8.784243180765135e-05, + "loss": 0.7683, "step": 1590 }, { "epoch": 0.3839672604718344, - "grad_norm": 1.390625, - "learning_rate": 0.0002202718841798248, - "loss": 0.6957, + "grad_norm": 2.234375, + "learning_rate": 8.78259977901725e-05, + "loss": 0.7525, "step": 1595 }, { "epoch": 0.38517091959557054, - "grad_norm": 1.359375, - "learning_rate": 0.0002202305413970248, - "loss": 0.7069, + "grad_norm": 2.0625, + "learning_rate": 8.780951374744343e-05, + "loss": 0.7599, "step": 1600 }, { "epoch": 0.3863745787193067, - "grad_norm": 1.2109375, - "learning_rate": 0.00022018907321451356, - "loss": 0.695, + "grad_norm": 1.7890625, + "learning_rate": 8.779297970580142e-05, + "loss": 0.7453, "step": 1605 }, { "epoch": 0.38757823784304285, - "grad_norm": 1.296875, - "learning_rate": 0.0002201474796985466, - "loss": 0.695, + "grad_norm": 2.09375, + "learning_rate": 8.777639569166362e-05, + "loss": 0.7525, "step": 1610 }, { "epoch": 0.388781896966779, - "grad_norm": 1.328125, - "learning_rate": 0.00022010576091557974, - "loss": 0.7094, + "grad_norm": 1.953125, + "learning_rate": 8.775976173152703e-05, + "loss": 0.7646, "step": 1615 }, { "epoch": 0.38998555609051516, - "grad_norm": 1.34375, - "learning_rate": 0.00022006391693226885, - "loss": 0.6834, + "grad_norm": 1.9609375, + "learning_rate": 8.774307785196847e-05, + "loss": 0.7339, "step": 1620 }, { "epoch": 0.3911892152142513, - "grad_norm": 1.4140625, - "learning_rate": 0.0002200219478154699, - "loss": 0.7536, + "grad_norm": 2.046875, + "learning_rate": 8.772634407964449e-05, + "loss": 0.7998, "step": 1625 }, { "epoch": 0.3923928743379875, - "grad_norm": 1.234375, - "learning_rate": 0.00021997985363223882, - "loss": 0.6902, + "grad_norm": 1.9453125, + "learning_rate": 8.770956044129138e-05, + "loss": 0.7457, "step": 1630 }, { "epoch": 0.39359653346172363, - "grad_norm": 1.328125, - "learning_rate": 0.00021993763444983126, - "loss": 0.703, + "grad_norm": 2.578125, + "learning_rate": 8.769272696372507e-05, + "loss": 0.7607, "step": 1635 }, { "epoch": 0.3948001925854598, - "grad_norm": 1.3828125, - "learning_rate": 0.0002198952903357027, - "loss": 0.7073, + "grad_norm": 2.09375, + "learning_rate": 8.767584367384116e-05, + "loss": 0.7693, "step": 1640 }, { "epoch": 0.39600385170919594, - "grad_norm": 1.4296875, - "learning_rate": 0.00021985282135750817, - "loss": 0.6777, + "grad_norm": 2.09375, + "learning_rate": 8.765891059861483e-05, + "loss": 0.735, "step": 1645 }, { "epoch": 0.3972075108329321, - "grad_norm": 1.421875, - "learning_rate": 0.00021981022758310216, - "loss": 0.688, + "grad_norm": 2.21875, + "learning_rate": 8.764192776510076e-05, + "loss": 0.7425, "step": 1650 }, { "epoch": 0.39841116995666825, - "grad_norm": 1.3125, - "learning_rate": 0.00021976750908053868, - "loss": 0.705, + "grad_norm": 1.9296875, + "learning_rate": 8.762489520043319e-05, + "loss": 0.7595, "step": 1655 }, { "epoch": 0.3996148290804044, - "grad_norm": 1.2265625, - "learning_rate": 0.0002197246659180709, - "loss": 0.701, + "grad_norm": 2.0, + "learning_rate": 8.760781293182578e-05, + "loss": 0.7605, "step": 1660 }, { "epoch": 0.40081848820414057, - "grad_norm": 1.34375, - "learning_rate": 0.00021968169816415125, - "loss": 0.6735, + "grad_norm": 2.0625, + "learning_rate": 8.759068098657164e-05, + "loss": 0.7385, "step": 1665 }, { "epoch": 0.4020221473278767, - "grad_norm": 1.4296875, - "learning_rate": 0.00021963860588743113, - "loss": 0.6742, + "grad_norm": 2.328125, + "learning_rate": 8.757349939204319e-05, + "loss": 0.7233, "step": 1670 }, { "epoch": 0.4032258064516129, - "grad_norm": 1.4453125, - "learning_rate": 0.000219595389156761, - "loss": 0.6889, + "grad_norm": 2.140625, + "learning_rate": 8.755626817569224e-05, + "loss": 0.7446, "step": 1675 }, { "epoch": 0.40442946557534903, - "grad_norm": 1.375, - "learning_rate": 0.00021955204804119003, - "loss": 0.718, + "grad_norm": 2.09375, + "learning_rate": 8.753898736504983e-05, + "loss": 0.773, "step": 1680 }, { "epoch": 0.40563312469908525, - "grad_norm": 1.40625, - "learning_rate": 0.00021950858260996633, - "loss": 0.6717, + "grad_norm": 2.09375, + "learning_rate": 8.75216569877263e-05, + "loss": 0.7316, "step": 1685 }, { "epoch": 0.4068367838228214, - "grad_norm": 1.4453125, - "learning_rate": 0.00021946499293253646, - "loss": 0.7024, + "grad_norm": 1.921875, + "learning_rate": 8.750427707141112e-05, + "loss": 0.7527, "step": 1690 }, { "epoch": 0.40804044294655756, - "grad_norm": 1.265625, - "learning_rate": 0.00021942127907854556, - "loss": 0.6815, + "grad_norm": 1.8515625, + "learning_rate": 8.748684764387297e-05, + "loss": 0.7369, "step": 1695 }, { "epoch": 0.4092441020702937, - "grad_norm": 1.3671875, - "learning_rate": 0.00021937744111783717, - "loss": 0.6788, + "grad_norm": 2.046875, + "learning_rate": 8.74693687329596e-05, + "loss": 0.7303, "step": 1700 }, { "epoch": 0.41044776119402987, - "grad_norm": 1.46875, - "learning_rate": 0.00021933347912045305, - "loss": 0.7082, + "grad_norm": 2.125, + "learning_rate": 8.74518403665978e-05, + "loss": 0.7594, "step": 1705 }, { "epoch": 0.411651420317766, - "grad_norm": 1.2265625, - "learning_rate": 0.00021928939315663331, - "loss": 0.6666, + "grad_norm": 1.9921875, + "learning_rate": 8.743426257279343e-05, + "loss": 0.725, "step": 1710 }, { "epoch": 0.4128550794415022, - "grad_norm": 1.25, - "learning_rate": 0.00021924518329681592, - "loss": 0.6757, + "grad_norm": 2.046875, + "learning_rate": 8.741663537963129e-05, + "loss": 0.7353, "step": 1715 }, { "epoch": 0.41405873856523834, - "grad_norm": 1.328125, - "learning_rate": 0.00021920084961163697, - "loss": 0.7003, + "grad_norm": 2.140625, + "learning_rate": 8.739895881527513e-05, + "loss": 0.759, "step": 1720 }, { "epoch": 0.4152623976889745, - "grad_norm": 1.3046875, - "learning_rate": 0.00021915639217193027, - "loss": 0.6869, + "grad_norm": 2.109375, + "learning_rate": 8.738123290796757e-05, + "loss": 0.742, "step": 1725 }, { "epoch": 0.41646605681271065, - "grad_norm": 1.234375, - "learning_rate": 0.00021911181104872747, - "loss": 0.6542, + "grad_norm": 1.8828125, + "learning_rate": 8.736345768603004e-05, + "loss": 0.7174, "step": 1730 }, { "epoch": 0.4176697159364468, - "grad_norm": 1.4296875, - "learning_rate": 0.00021906710631325774, - "loss": 0.6937, + "grad_norm": 2.28125, + "learning_rate": 8.73456331778628e-05, + "loss": 0.7458, "step": 1735 }, { "epoch": 0.41887337506018296, - "grad_norm": 1.53125, - "learning_rate": 0.00021902227803694774, - "loss": 0.6844, + "grad_norm": 2.15625, + "learning_rate": 8.732775941194484e-05, + "loss": 0.7416, "step": 1740 }, { "epoch": 0.4200770341839191, - "grad_norm": 1.234375, - "learning_rate": 0.00021897732629142167, - "loss": 0.6723, + "grad_norm": 2.046875, + "learning_rate": 8.730983641683389e-05, + "loss": 0.735, "step": 1745 }, { "epoch": 0.4212806933076553, - "grad_norm": 1.25, - "learning_rate": 0.00021893225114850086, - "loss": 0.6876, + "grad_norm": 2.0625, + "learning_rate": 8.729186422116627e-05, + "loss": 0.7393, "step": 1750 }, { "epoch": 0.42248435243139143, - "grad_norm": 1.234375, - "learning_rate": 0.00021888705268020378, - "loss": 0.6935, + "grad_norm": 2.0625, + "learning_rate": 8.727384285365692e-05, + "loss": 0.7432, "step": 1755 }, { "epoch": 0.4236880115551276, - "grad_norm": 1.3203125, - "learning_rate": 0.00021884173095874603, - "loss": 0.6683, + "grad_norm": 1.9765625, + "learning_rate": 8.725577234309942e-05, + "loss": 0.7256, "step": 1760 }, { "epoch": 0.42489167067886374, - "grad_norm": 1.265625, - "learning_rate": 0.0002187962860565401, - "loss": 0.6904, + "grad_norm": 1.890625, + "learning_rate": 8.723765271836577e-05, + "loss": 0.7443, "step": 1765 }, { "epoch": 0.4260953298025999, - "grad_norm": 1.390625, - "learning_rate": 0.00021875071804619534, - "loss": 0.6723, + "grad_norm": 2.046875, + "learning_rate": 8.721948400840651e-05, + "loss": 0.7272, "step": 1770 }, { "epoch": 0.42729898892633605, - "grad_norm": 1.4140625, - "learning_rate": 0.00021870502700051765, - "loss": 0.6452, + "grad_norm": 1.953125, + "learning_rate": 8.720126624225056e-05, + "loss": 0.707, "step": 1775 }, { "epoch": 0.4285026480500722, - "grad_norm": 1.3046875, - "learning_rate": 0.0002186592129925097, - "loss": 0.6832, + "grad_norm": 2.203125, + "learning_rate": 8.718299944900527e-05, + "loss": 0.7281, "step": 1780 }, { "epoch": 0.42970630717380837, - "grad_norm": 1.21875, - "learning_rate": 0.0002186132760953705, - "loss": 0.7008, + "grad_norm": 2.046875, + "learning_rate": 8.716468365785626e-05, + "loss": 0.7535, "step": 1785 }, { "epoch": 0.4309099662975445, - "grad_norm": 1.2890625, - "learning_rate": 0.00021856721638249541, - "loss": 0.7177, + "grad_norm": 2.265625, + "learning_rate": 8.714631889806748e-05, + "loss": 0.7714, "step": 1790 }, { "epoch": 0.4321136254212807, - "grad_norm": 1.3203125, - "learning_rate": 0.0002185210339274761, - "loss": 0.6976, + "grad_norm": 1.9296875, + "learning_rate": 8.712790519898113e-05, + "loss": 0.7478, "step": 1795 }, { "epoch": 0.43331728454501683, - "grad_norm": 1.265625, - "learning_rate": 0.0002184747288041002, - "loss": 0.6864, + "grad_norm": 1.9296875, + "learning_rate": 8.710944259001754e-05, + "loss": 0.7414, "step": 1800 }, { "epoch": 0.434520943668753, - "grad_norm": 1.390625, - "learning_rate": 0.00021842830108635155, - "loss": 0.7258, + "grad_norm": 2.015625, + "learning_rate": 8.709093110067527e-05, + "loss": 0.7847, "step": 1805 }, { "epoch": 0.43572460279248915, - "grad_norm": 1.1875, - "learning_rate": 0.00021838175084840962, - "loss": 0.6828, + "grad_norm": 1.84375, + "learning_rate": 8.70723707605309e-05, + "loss": 0.7362, "step": 1810 }, { "epoch": 0.4369282619162253, - "grad_norm": 1.3671875, - "learning_rate": 0.00021833507816464986, - "loss": 0.6631, + "grad_norm": 2.046875, + "learning_rate": 8.705376159923911e-05, + "loss": 0.719, "step": 1815 }, { "epoch": 0.43813192103996146, - "grad_norm": 1.5625, - "learning_rate": 0.00021828828310964317, - "loss": 0.6705, + "grad_norm": 2.171875, + "learning_rate": 8.703510364653258e-05, + "loss": 0.7195, "step": 1820 }, { "epoch": 0.4393355801636976, - "grad_norm": 1.296875, - "learning_rate": 0.00021824136575815612, - "loss": 0.6763, + "grad_norm": 2.015625, + "learning_rate": 8.701639693222192e-05, + "loss": 0.7323, "step": 1825 }, { "epoch": 0.4405392392874338, - "grad_norm": 1.2578125, - "learning_rate": 0.00021819432618515054, - "loss": 0.6638, + "grad_norm": 1.9375, + "learning_rate": 8.699764148619566e-05, + "loss": 0.7182, "step": 1830 }, { "epoch": 0.44174289841117, - "grad_norm": 1.2734375, - "learning_rate": 0.00021814716446578368, - "loss": 0.6926, + "grad_norm": 2.015625, + "learning_rate": 8.69788373384202e-05, + "loss": 0.7431, "step": 1835 }, { "epoch": 0.44294655753490614, - "grad_norm": 1.3203125, - "learning_rate": 0.00021809988067540787, - "loss": 0.7047, + "grad_norm": 2.015625, + "learning_rate": 8.695998451893977e-05, + "loss": 0.7553, "step": 1840 }, { "epoch": 0.4441502166586423, - "grad_norm": 1.2265625, - "learning_rate": 0.00021805247488957042, - "loss": 0.6623, + "grad_norm": 1.8359375, + "learning_rate": 8.69410830578763e-05, + "loss": 0.7163, "step": 1845 }, { "epoch": 0.44535387578237845, - "grad_norm": 1.3359375, - "learning_rate": 0.00021800494718401367, - "loss": 0.6962, + "grad_norm": 1.9296875, + "learning_rate": 8.692213298542952e-05, + "loss": 0.756, "step": 1850 }, { "epoch": 0.4465575349061146, - "grad_norm": 1.25, - "learning_rate": 0.00021795729763467473, - "loss": 0.6792, + "grad_norm": 1.9296875, + "learning_rate": 8.690313433187675e-05, + "loss": 0.7362, "step": 1855 }, { "epoch": 0.44776119402985076, - "grad_norm": 1.2578125, - "learning_rate": 0.0002179095263176853, - "loss": 0.6887, + "grad_norm": 1.921875, + "learning_rate": 8.6884087127573e-05, + "loss": 0.7401, "step": 1860 }, { "epoch": 0.4489648531535869, - "grad_norm": 1.2734375, - "learning_rate": 0.00021786163330937176, - "loss": 0.6322, + "grad_norm": 1.9921875, + "learning_rate": 8.68649914029508e-05, + "loss": 0.6896, "step": 1865 }, { "epoch": 0.4501685122773231, - "grad_norm": 1.421875, - "learning_rate": 0.00021781361868625484, - "loss": 0.6918, + "grad_norm": 2.125, + "learning_rate": 8.684584718852024e-05, + "loss": 0.7406, "step": 1870 }, { "epoch": 0.45137217140105923, - "grad_norm": 1.390625, - "learning_rate": 0.00021776548252504957, - "loss": 0.6844, + "grad_norm": 2.015625, + "learning_rate": 8.682665451486884e-05, + "loss": 0.7314, "step": 1875 }, { "epoch": 0.4525758305247954, - "grad_norm": 1.1875, - "learning_rate": 0.00021771722490266526, - "loss": 0.6716, + "grad_norm": 1.796875, + "learning_rate": 8.68074134126616e-05, + "loss": 0.7332, "step": 1880 }, { "epoch": 0.45377948964853154, - "grad_norm": 1.2890625, - "learning_rate": 0.00021766884589620518, - "loss": 0.6844, + "grad_norm": 2.0, + "learning_rate": 8.678812391264087e-05, + "loss": 0.7426, "step": 1885 }, { "epoch": 0.4549831487722677, - "grad_norm": 1.3671875, - "learning_rate": 0.00021762034558296656, - "loss": 0.6393, + "grad_norm": 2.140625, + "learning_rate": 8.676878604562631e-05, + "loss": 0.6921, "step": 1890 }, { "epoch": 0.45618680789600385, - "grad_norm": 1.2578125, - "learning_rate": 0.00021757172404044049, - "loss": 0.6478, + "grad_norm": 1.9140625, + "learning_rate": 8.674939984251487e-05, + "loss": 0.6969, "step": 1895 }, { "epoch": 0.45739046701974, - "grad_norm": 1.3515625, - "learning_rate": 0.00021752298134631174, - "loss": 0.7216, + "grad_norm": 2.078125, + "learning_rate": 8.672996533428075e-05, + "loss": 0.7683, "step": 1900 }, { "epoch": 0.45859412614347617, - "grad_norm": 1.25, - "learning_rate": 0.0002174741175784586, - "loss": 0.6891, + "grad_norm": 1.984375, + "learning_rate": 8.67104825519753e-05, + "loss": 0.7412, "step": 1905 }, { "epoch": 0.4597977852672123, - "grad_norm": 1.328125, - "learning_rate": 0.00021742513281495292, - "loss": 0.685, + "grad_norm": 2.125, + "learning_rate": 8.669095152672701e-05, + "loss": 0.7409, "step": 1910 }, { "epoch": 0.4610014443909485, - "grad_norm": 1.2265625, - "learning_rate": 0.00021737602713405976, - "loss": 0.6793, + "grad_norm": 1.9609375, + "learning_rate": 8.667137228974147e-05, + "loss": 0.7291, "step": 1915 }, { "epoch": 0.46220510351468463, - "grad_norm": 1.4296875, - "learning_rate": 0.00021732680061423734, - "loss": 0.6675, + "grad_norm": 2.015625, + "learning_rate": 8.665174487230126e-05, + "loss": 0.7291, "step": 1920 }, { "epoch": 0.4634087626384208, - "grad_norm": 1.234375, - "learning_rate": 0.00021727745333413712, - "loss": 0.7095, + "grad_norm": 2.078125, + "learning_rate": 8.663206930576597e-05, + "loss": 0.7628, "step": 1925 }, { "epoch": 0.46461242176215695, - "grad_norm": 1.3515625, - "learning_rate": 0.00021722798537260335, - "loss": 0.6744, + "grad_norm": 2.234375, + "learning_rate": 8.66123456215721e-05, + "loss": 0.7243, "step": 1930 }, { "epoch": 0.4658160808858931, - "grad_norm": 1.265625, - "learning_rate": 0.00021717839680867316, - "loss": 0.6693, + "grad_norm": 1.9140625, + "learning_rate": 8.659257385123307e-05, + "loss": 0.7211, "step": 1935 }, { "epoch": 0.46701974000962926, - "grad_norm": 1.25, - "learning_rate": 0.00021712868772157638, - "loss": 0.6678, + "grad_norm": 1.9296875, + "learning_rate": 8.657275402633909e-05, + "loss": 0.7224, "step": 1940 }, { "epoch": 0.4682233991333654, - "grad_norm": 1.3046875, - "learning_rate": 0.00021707885819073535, - "loss": 0.6671, + "grad_norm": 2.125, + "learning_rate": 8.655288617855712e-05, + "loss": 0.7251, "step": 1945 }, { "epoch": 0.46942705825710157, - "grad_norm": 1.21875, - "learning_rate": 0.00021702890829576493, - "loss": 0.6409, + "grad_norm": 2.046875, + "learning_rate": 8.653297033963094e-05, + "loss": 0.7055, "step": 1950 }, { "epoch": 0.4706307173808377, - "grad_norm": 1.1796875, - "learning_rate": 0.00021697883811647224, - "loss": 0.6518, + "grad_norm": 1.828125, + "learning_rate": 8.651300654138094e-05, + "loss": 0.708, "step": 1955 }, { "epoch": 0.4718343765045739, - "grad_norm": 1.21875, - "learning_rate": 0.00021692864773285655, - "loss": 0.6351, + "grad_norm": 1.921875, + "learning_rate": 8.649299481570414e-05, + "loss": 0.6952, "step": 1960 }, { "epoch": 0.47303803562831004, - "grad_norm": 1.4296875, - "learning_rate": 0.0002168783372251093, - "loss": 0.6873, + "grad_norm": 2.140625, + "learning_rate": 8.647293519457416e-05, + "loss": 0.737, "step": 1965 }, { "epoch": 0.4742416947520462, - "grad_norm": 1.1953125, - "learning_rate": 0.0002168279066736137, - "loss": 0.6952, + "grad_norm": 1.953125, + "learning_rate": 8.645282771004112e-05, + "loss": 0.7449, "step": 1970 }, { "epoch": 0.4754453538757824, - "grad_norm": 1.3203125, - "learning_rate": 0.00021677735615894487, - "loss": 0.6817, + "grad_norm": 1.9921875, + "learning_rate": 8.643267239423163e-05, + "loss": 0.7326, "step": 1975 }, { "epoch": 0.47664901299951856, - "grad_norm": 1.3515625, - "learning_rate": 0.0002167266857618696, - "loss": 0.6743, + "grad_norm": 2.09375, + "learning_rate": 8.641246927934872e-05, + "loss": 0.7305, "step": 1980 }, { "epoch": 0.4778526721232547, - "grad_norm": 1.3046875, - "learning_rate": 0.00021667589556334621, - "loss": 0.6451, + "grad_norm": 2.03125, + "learning_rate": 8.639221839767181e-05, + "loss": 0.7009, "step": 1985 }, { "epoch": 0.4790563312469909, - "grad_norm": 1.2578125, - "learning_rate": 0.00021662498564452436, - "loss": 0.6614, + "grad_norm": 2.0, + "learning_rate": 8.637191978155657e-05, + "loss": 0.7208, "step": 1990 }, { "epoch": 0.48025999037072703, - "grad_norm": 1.25, - "learning_rate": 0.0002165739560867451, - "loss": 0.6457, + "grad_norm": 2.03125, + "learning_rate": 8.635157346343502e-05, + "loss": 0.7027, "step": 1995 }, { "epoch": 0.4814636494944632, - "grad_norm": 1.1796875, - "learning_rate": 0.00021652280697154056, - "loss": 0.6372, + "grad_norm": 1.8671875, + "learning_rate": 8.633117947581536e-05, + "loss": 0.6931, "step": 2000 }, { "epoch": 0.4814636494944632, - "eval_loss": 0.5738186836242676, - "eval_runtime": 2.3214, - "eval_samples_per_second": 86.155, - "eval_steps_per_second": 86.155, + "eval_loss": 0.6240468621253967, + "eval_runtime": 2.3782, + "eval_samples_per_second": 84.098, + "eval_steps_per_second": 84.098, "step": 2000 }, { "epoch": 0.48266730861819934, - "grad_norm": 1.4375, - "learning_rate": 0.00021647153838063392, - "loss": 0.6663, + "grad_norm": 2.140625, + "learning_rate": 8.631073785128194e-05, + "loss": 0.7186, "step": 2005 }, { "epoch": 0.4838709677419355, - "grad_norm": 1.25, - "learning_rate": 0.0002164201503959392, - "loss": 0.6432, + "grad_norm": 1.953125, + "learning_rate": 8.629024862249523e-05, + "loss": 0.7014, "step": 2010 }, { "epoch": 0.48507462686567165, - "grad_norm": 1.265625, - "learning_rate": 0.0002163686430995613, - "loss": 0.6726, + "grad_norm": 2.0, + "learning_rate": 8.626971182219179e-05, + "loss": 0.7217, "step": 2015 }, { "epoch": 0.4862782859894078, - "grad_norm": 1.2734375, - "learning_rate": 0.00021631701657379564, - "loss": 0.6498, + "grad_norm": 1.890625, + "learning_rate": 8.624912748318415e-05, + "loss": 0.7086, "step": 2020 }, { "epoch": 0.48748194511314397, - "grad_norm": 1.3203125, - "learning_rate": 0.00021626527090112815, - "loss": 0.6675, + "grad_norm": 1.9921875, + "learning_rate": 8.62284956383608e-05, + "loss": 0.7187, "step": 2025 }, { "epoch": 0.4886856042368801, - "grad_norm": 1.3203125, - "learning_rate": 0.0002162134061642352, - "loss": 0.6318, + "grad_norm": 2.0625, + "learning_rate": 8.620781632068615e-05, + "loss": 0.6839, "step": 2030 }, { "epoch": 0.4898892633606163, - "grad_norm": 1.4609375, - "learning_rate": 0.00021616142244598328, - "loss": 0.6933, + "grad_norm": 2.234375, + "learning_rate": 8.618708956320042e-05, + "loss": 0.7398, "step": 2035 }, { "epoch": 0.49109292248435243, - "grad_norm": 1.453125, - "learning_rate": 0.0002161093198294291, - "loss": 0.6642, + "grad_norm": 2.1875, + "learning_rate": 8.616631539901971e-05, + "loss": 0.7115, "step": 2040 }, { "epoch": 0.4922965816080886, - "grad_norm": 1.3125, - "learning_rate": 0.00021605709839781932, - "loss": 0.6409, + "grad_norm": 2.0, + "learning_rate": 8.614549386133577e-05, + "loss": 0.6967, "step": 2045 }, { "epoch": 0.49350024073182475, - "grad_norm": 1.328125, - "learning_rate": 0.0002160047582345903, - "loss": 0.6746, + "grad_norm": 2.0, + "learning_rate": 8.612462498341608e-05, + "loss": 0.7306, "step": 2050 }, { "epoch": 0.4947038998555609, - "grad_norm": 1.1953125, - "learning_rate": 0.00021595229942336826, - "loss": 0.6467, + "grad_norm": 1.953125, + "learning_rate": 8.610370879860377e-05, + "loss": 0.6988, "step": 2055 }, { "epoch": 0.49590755897929706, - "grad_norm": 1.3125, - "learning_rate": 0.00021589972204796891, - "loss": 0.6357, + "grad_norm": 1.9921875, + "learning_rate": 8.608274534031753e-05, + "loss": 0.6879, "step": 2060 }, { "epoch": 0.4971112181030332, - "grad_norm": 1.265625, - "learning_rate": 0.00021584702619239748, - "loss": 0.662, + "grad_norm": 2.03125, + "learning_rate": 8.60617346420516e-05, + "loss": 0.7124, "step": 2065 }, { "epoch": 0.49831487722676937, - "grad_norm": 1.265625, - "learning_rate": 0.00021579421194084836, - "loss": 0.6703, + "grad_norm": 2.09375, + "learning_rate": 8.604067673737569e-05, + "loss": 0.7249, "step": 2070 }, { "epoch": 0.4995185363505055, - "grad_norm": 1.21875, - "learning_rate": 0.00021574127937770522, - "loss": 0.6312, + "grad_norm": 2.015625, + "learning_rate": 8.601957165993494e-05, + "loss": 0.6882, "step": 2075 }, { "epoch": 0.5007221954742417, - "grad_norm": 1.6015625, - "learning_rate": 0.00021568822858754073, - "loss": 0.6505, + "grad_norm": 2.0625, + "learning_rate": 8.599841944344982e-05, + "loss": 0.7039, "step": 2080 }, { "epoch": 0.5019258545979779, - "grad_norm": 1.3125, - "learning_rate": 0.00021563505965511642, - "loss": 0.6901, + "grad_norm": 1.8984375, + "learning_rate": 8.597722012171619e-05, + "loss": 0.7367, "step": 2085 }, { "epoch": 0.503129513721714, - "grad_norm": 1.203125, - "learning_rate": 0.00021558177266538267, - "loss": 0.6871, + "grad_norm": 1.9375, + "learning_rate": 8.595597372860511e-05, + "loss": 0.7361, "step": 2090 }, { "epoch": 0.5043331728454502, - "grad_norm": 1.125, - "learning_rate": 0.00021552836770347836, - "loss": 0.6279, + "grad_norm": 1.71875, + "learning_rate": 8.593468029806287e-05, + "loss": 0.6793, "step": 2095 }, { "epoch": 0.5055368319691863, - "grad_norm": 1.1875, - "learning_rate": 0.00021547484485473102, - "loss": 0.6878, + "grad_norm": 1.9140625, + "learning_rate": 8.591333986411094e-05, + "loss": 0.7443, "step": 2100 }, { "epoch": 0.5067404910929225, - "grad_norm": 1.234375, - "learning_rate": 0.00021542120420465637, - "loss": 0.6344, + "grad_norm": 1.9765625, + "learning_rate": 8.589195246084582e-05, + "loss": 0.695, "step": 2105 }, { "epoch": 0.5079441502166586, - "grad_norm": 1.25, - "learning_rate": 0.00021536744583895842, - "loss": 0.6647, + "grad_norm": 2.078125, + "learning_rate": 8.587051812243913e-05, + "loss": 0.7116, "step": 2110 }, { "epoch": 0.5091478093403948, - "grad_norm": 1.3203125, - "learning_rate": 0.0002153135698435293, - "loss": 0.6747, + "grad_norm": 2.09375, + "learning_rate": 8.584903688313742e-05, + "loss": 0.7339, "step": 2115 }, { "epoch": 0.5103514684641309, - "grad_norm": 1.3359375, - "learning_rate": 0.00021525957630444902, - "loss": 0.6583, + "grad_norm": 2.09375, + "learning_rate": 8.582750877726225e-05, + "loss": 0.7091, "step": 2120 }, { "epoch": 0.5115551275878671, - "grad_norm": 1.28125, - "learning_rate": 0.00021520546530798536, - "loss": 0.6879, + "grad_norm": 1.9453125, + "learning_rate": 8.580593383920996e-05, + "loss": 0.7445, "step": 2125 }, { "epoch": 0.5127587867116032, - "grad_norm": 1.1796875, - "learning_rate": 0.0002151512369405939, - "loss": 0.599, + "grad_norm": 2.046875, + "learning_rate": 8.578431210345181e-05, + "loss": 0.6616, "step": 2130 }, { "epoch": 0.5139624458353395, - "grad_norm": 1.2734375, - "learning_rate": 0.00021509689128891763, - "loss": 0.6248, + "grad_norm": 2.0625, + "learning_rate": 8.576264360453377e-05, + "loss": 0.6855, "step": 2135 }, { "epoch": 0.5151661049590756, - "grad_norm": 1.2578125, - "learning_rate": 0.00021504242843978696, - "loss": 0.6863, + "grad_norm": 1.9296875, + "learning_rate": 8.574092837707655e-05, + "loss": 0.7408, "step": 2140 }, { "epoch": 0.5163697640828118, - "grad_norm": 1.2734375, - "learning_rate": 0.00021498784848021963, - "loss": 0.6544, + "grad_norm": 1.921875, + "learning_rate": 8.571916645577552e-05, + "loss": 0.7034, "step": 2145 }, { "epoch": 0.5175734232065479, - "grad_norm": 1.2109375, - "learning_rate": 0.00021493315149742035, - "loss": 0.6409, + "grad_norm": 1.9921875, + "learning_rate": 8.569735787540066e-05, + "loss": 0.6968, "step": 2150 }, { "epoch": 0.5187770823302841, - "grad_norm": 1.3359375, - "learning_rate": 0.0002148783375787809, - "loss": 0.6642, + "grad_norm": 2.0625, + "learning_rate": 8.567550267079648e-05, + "loss": 0.7186, "step": 2155 }, { "epoch": 0.5199807414540202, - "grad_norm": 1.140625, - "learning_rate": 0.00021482340681187984, - "loss": 0.6533, + "grad_norm": 1.875, + "learning_rate": 8.565360087688197e-05, + "loss": 0.6983, "step": 2160 }, { "epoch": 0.5211844005777564, - "grad_norm": 1.296875, - "learning_rate": 0.00021476835928448254, - "loss": 0.6707, + "grad_norm": 1.8828125, + "learning_rate": 8.563165252865064e-05, + "loss": 0.7224, "step": 2165 }, { "epoch": 0.5223880597014925, - "grad_norm": 1.28125, - "learning_rate": 0.00021471319508454073, - "loss": 0.6804, + "grad_norm": 1.9375, + "learning_rate": 8.560965766117027e-05, + "loss": 0.7271, "step": 2170 }, { "epoch": 0.5235917188252287, - "grad_norm": 1.203125, - "learning_rate": 0.00021465791430019273, - "loss": 0.6482, + "grad_norm": 1.875, + "learning_rate": 8.558761630958306e-05, + "loss": 0.7038, "step": 2175 }, { "epoch": 0.5247953779489648, - "grad_norm": 1.15625, - "learning_rate": 0.00021460251701976306, - "loss": 0.6578, + "grad_norm": 1.90625, + "learning_rate": 8.556552850910543e-05, + "loss": 0.7094, "step": 2180 }, { "epoch": 0.525999037072701, - "grad_norm": 1.25, - "learning_rate": 0.00021454700333176232, - "loss": 0.6716, + "grad_norm": 1.8671875, + "learning_rate": 8.554339429502799e-05, + "loss": 0.7284, "step": 2185 }, { "epoch": 0.5272026961964371, - "grad_norm": 1.2265625, - "learning_rate": 0.00021449137332488723, - "loss": 0.6544, + "grad_norm": 2.0, + "learning_rate": 8.552121370271558e-05, + "loss": 0.7069, "step": 2190 }, { "epoch": 0.5284063553201733, - "grad_norm": 1.203125, - "learning_rate": 0.00021443562708802023, - "loss": 0.6569, + "grad_norm": 1.953125, + "learning_rate": 8.549898676760707e-05, + "loss": 0.7067, "step": 2195 }, { "epoch": 0.5296100144439095, - "grad_norm": 1.203125, - "learning_rate": 0.00021437976471022952, - "loss": 0.6566, + "grad_norm": 1.8046875, + "learning_rate": 8.547671352521543e-05, + "loss": 0.712, "step": 2200 }, { "epoch": 0.5308136735676456, - "grad_norm": 1.2734375, - "learning_rate": 0.00021432378628076883, - "loss": 0.6706, + "grad_norm": 2.140625, + "learning_rate": 8.545439401112757e-05, + "loss": 0.7208, "step": 2205 }, { "epoch": 0.5320173326913819, - "grad_norm": 1.265625, - "learning_rate": 0.00021426769188907742, - "loss": 0.6519, + "grad_norm": 1.9140625, + "learning_rate": 8.543202826100438e-05, + "loss": 0.7008, "step": 2210 }, { "epoch": 0.533220991815118, - "grad_norm": 1.125, - "learning_rate": 0.00021421148162477965, - "loss": 0.6371, + "grad_norm": 1.9140625, + "learning_rate": 8.540961631058055e-05, + "loss": 0.6951, "step": 2215 }, { "epoch": 0.5344246509388542, - "grad_norm": 1.3671875, - "learning_rate": 0.0002141551555776852, - "loss": 0.6695, + "grad_norm": 1.9921875, + "learning_rate": 8.538715819566469e-05, + "loss": 0.7211, "step": 2220 }, { "epoch": 0.5356283100625903, - "grad_norm": 1.2421875, - "learning_rate": 0.00021409871383778865, - "loss": 0.6647, + "grad_norm": 1.8671875, + "learning_rate": 8.536465395213907e-05, + "loss": 0.7158, "step": 2225 }, { "epoch": 0.5368319691863265, - "grad_norm": 1.296875, - "learning_rate": 0.00021404215649526936, - "loss": 0.6499, + "grad_norm": 2.25, + "learning_rate": 8.534210361595971e-05, + "loss": 0.7094, "step": 2230 }, { "epoch": 0.5380356283100626, - "grad_norm": 1.1953125, - "learning_rate": 0.0002139854836404915, - "loss": 0.6778, + "grad_norm": 1.84375, + "learning_rate": 8.531950722315626e-05, + "loss": 0.7322, "step": 2235 }, { "epoch": 0.5392392874337988, - "grad_norm": 1.296875, - "learning_rate": 0.0002139286953640038, - "loss": 0.6359, + "grad_norm": 2.03125, + "learning_rate": 8.529686480983197e-05, + "loss": 0.6897, "step": 2240 }, { "epoch": 0.5404429465575349, - "grad_norm": 1.1640625, - "learning_rate": 0.00021387179175653932, - "loss": 0.6276, + "grad_norm": 1.9375, + "learning_rate": 8.527417641216363e-05, + "loss": 0.681, "step": 2245 }, { "epoch": 0.5416466056812711, - "grad_norm": 1.3515625, - "learning_rate": 0.00021381477290901546, - "loss": 0.621, + "grad_norm": 1.984375, + "learning_rate": 8.525144206640146e-05, + "loss": 0.6816, "step": 2250 }, { "epoch": 0.5428502648050072, - "grad_norm": 1.2421875, - "learning_rate": 0.00021375763891253369, - "loss": 0.6506, + "grad_norm": 1.8515625, + "learning_rate": 8.522866180886912e-05, + "loss": 0.7046, "step": 2255 }, { "epoch": 0.5440539239287434, - "grad_norm": 1.3125, - "learning_rate": 0.0002137003898583795, - "loss": 0.6655, + "grad_norm": 1.96875, + "learning_rate": 8.520583567596362e-05, + "loss": 0.7192, "step": 2260 }, { "epoch": 0.5452575830524795, - "grad_norm": 1.3515625, - "learning_rate": 0.00021364302583802227, - "loss": 0.6868, + "grad_norm": 1.96875, + "learning_rate": 8.518296370415528e-05, + "loss": 0.7359, "step": 2265 }, { "epoch": 0.5464612421762157, - "grad_norm": 1.171875, - "learning_rate": 0.00021358554694311493, - "loss": 0.6588, + "grad_norm": 1.8515625, + "learning_rate": 8.516004592998764e-05, + "loss": 0.7143, "step": 2270 }, { "epoch": 0.5476649012999518, - "grad_norm": 1.25, - "learning_rate": 0.00021352795326549405, - "loss": 0.6437, + "grad_norm": 2.0, + "learning_rate": 8.513708239007743e-05, + "loss": 0.6977, "step": 2275 }, { "epoch": 0.548868560423688, - "grad_norm": 1.3203125, - "learning_rate": 0.00021347024489717952, - "loss": 0.6792, + "grad_norm": 2.015625, + "learning_rate": 8.51140731211145e-05, + "loss": 0.7373, "step": 2280 }, { "epoch": 0.5500722195474241, - "grad_norm": 1.2578125, - "learning_rate": 0.00021341242193037455, - "loss": 0.6552, + "grad_norm": 1.90625, + "learning_rate": 8.509101815986176e-05, + "loss": 0.7074, "step": 2285 }, { "epoch": 0.5512758786711603, - "grad_norm": 1.3046875, - "learning_rate": 0.00021335448445746543, - "loss": 0.6497, + "grad_norm": 2.015625, + "learning_rate": 8.506791754315512e-05, + "loss": 0.6997, "step": 2290 }, { "epoch": 0.5524795377948964, - "grad_norm": 1.1015625, - "learning_rate": 0.00021329643257102137, - "loss": 0.6367, + "grad_norm": 1.9140625, + "learning_rate": 8.504477130790347e-05, + "loss": 0.6911, "step": 2295 }, { "epoch": 0.5536831969186327, - "grad_norm": 1.2421875, - "learning_rate": 0.00021323826636379445, - "loss": 0.6491, + "grad_norm": 2.046875, + "learning_rate": 8.502157949108855e-05, + "loss": 0.7084, "step": 2300 }, { "epoch": 0.5548868560423688, - "grad_norm": 1.1640625, - "learning_rate": 0.00021317998592871925, - "loss": 0.6606, + "grad_norm": 2.171875, + "learning_rate": 8.499834212976492e-05, + "loss": 0.7209, "step": 2305 }, { "epoch": 0.556090515166105, - "grad_norm": 1.34375, - "learning_rate": 0.00021312159135891305, - "loss": 0.633, + "grad_norm": 2.03125, + "learning_rate": 8.497505926105995e-05, + "loss": 0.7058, "step": 2310 }, { "epoch": 0.5572941742898411, - "grad_norm": 1.265625, - "learning_rate": 0.00021306308274767537, - "loss": 0.6315, + "grad_norm": 1.84375, + "learning_rate": 8.495173092217366e-05, + "loss": 0.6855, "step": 2315 }, { "epoch": 0.5584978334135773, - "grad_norm": 1.234375, - "learning_rate": 0.00021300446018848802, - "loss": 0.6749, + "grad_norm": 1.859375, + "learning_rate": 8.49283571503788e-05, + "loss": 0.7268, "step": 2320 }, { "epoch": 0.5597014925373134, - "grad_norm": 1.203125, - "learning_rate": 0.00021294572377501478, - "loss": 0.6274, + "grad_norm": 1.984375, + "learning_rate": 8.490493798302064e-05, + "loss": 0.6848, "step": 2325 }, { "epoch": 0.5609051516610496, - "grad_norm": 1.2265625, - "learning_rate": 0.00021288687360110137, - "loss": 0.6416, + "grad_norm": 1.9609375, + "learning_rate": 8.488147345751701e-05, + "loss": 0.6916, "step": 2330 }, { "epoch": 0.5621088107847857, - "grad_norm": 1.1015625, - "learning_rate": 0.0002128279097607753, - "loss": 0.6117, + "grad_norm": 1.8359375, + "learning_rate": 8.48579636113582e-05, + "loss": 0.6673, "step": 2335 }, { "epoch": 0.5633124699085219, - "grad_norm": 1.265625, - "learning_rate": 0.0002127688323482457, - "loss": 0.666, + "grad_norm": 2.0625, + "learning_rate": 8.483440848210695e-05, + "loss": 0.7186, "step": 2340 }, { "epoch": 0.5645161290322581, - "grad_norm": 1.2421875, - "learning_rate": 0.00021270964145790307, - "loss": 0.6466, + "grad_norm": 2.03125, + "learning_rate": 8.481080810739828e-05, + "loss": 0.6997, "step": 2345 }, { "epoch": 0.5657197881559942, - "grad_norm": 1.3515625, - "learning_rate": 0.00021265033718431933, - "loss": 0.6512, + "grad_norm": 2.03125, + "learning_rate": 8.478716252493955e-05, + "loss": 0.7005, "step": 2350 }, { "epoch": 0.5669234472797304, - "grad_norm": 1.2421875, - "learning_rate": 0.0002125909196222475, - "loss": 0.6206, + "grad_norm": 3.515625, + "learning_rate": 8.476347177251034e-05, + "loss": 0.6912, "step": 2355 }, { "epoch": 0.5681271064034665, - "grad_norm": 1.1328125, - "learning_rate": 0.00021253138886662156, - "loss": 0.6251, + "grad_norm": 2.0, + "learning_rate": 8.473973588796241e-05, + "loss": 0.6777, "step": 2360 }, { "epoch": 0.5693307655272027, - "grad_norm": 1.1875, - "learning_rate": 0.00021247174501255647, - "loss": 0.615, + "grad_norm": 1.9140625, + "learning_rate": 8.47159549092196e-05, + "loss": 0.6676, "step": 2365 }, { "epoch": 0.5705344246509388, - "grad_norm": 1.21875, - "learning_rate": 0.00021241198815534777, - "loss": 0.6135, + "grad_norm": 2.015625, + "learning_rate": 8.469212887427782e-05, + "loss": 0.668, "step": 2370 }, { "epoch": 0.571738083774675, - "grad_norm": 1.296875, - "learning_rate": 0.00021235211839047162, - "loss": 0.6334, + "grad_norm": 2.140625, + "learning_rate": 8.466825782120499e-05, + "loss": 0.686, "step": 2375 }, { "epoch": 0.5729417428984112, - "grad_norm": 1.25, - "learning_rate": 0.00021229213581358455, - "loss": 0.6264, + "grad_norm": 1.8359375, + "learning_rate": 8.46443417881409e-05, + "loss": 0.6801, "step": 2380 }, { "epoch": 0.5741454020221474, - "grad_norm": 1.2734375, - "learning_rate": 0.00021223204052052332, - "loss": 0.6381, + "grad_norm": 2.125, + "learning_rate": 8.462038081329726e-05, + "loss": 0.6964, "step": 2385 }, { "epoch": 0.5753490611458835, - "grad_norm": 1.2890625, - "learning_rate": 0.00021217183260730486, - "loss": 0.686, + "grad_norm": 1.96875, + "learning_rate": 8.459637493495757e-05, + "loss": 0.7331, "step": 2390 }, { "epoch": 0.5765527202696197, - "grad_norm": 1.203125, - "learning_rate": 0.00021211151217012593, - "loss": 0.6399, + "grad_norm": 2.0, + "learning_rate": 8.457232419147705e-05, + "loss": 0.6916, "step": 2395 }, { "epoch": 0.5777563793933558, - "grad_norm": 1.3203125, - "learning_rate": 0.00021205107930536316, - "loss": 0.6504, + "grad_norm": 2.15625, + "learning_rate": 8.454822862128265e-05, + "loss": 0.7108, "step": 2400 }, { "epoch": 0.578960038517092, - "grad_norm": 1.15625, - "learning_rate": 0.00021199053410957274, - "loss": 0.6541, + "grad_norm": 1.9296875, + "learning_rate": 8.452408826287289e-05, + "loss": 0.7062, "step": 2405 }, { "epoch": 0.5801636976408281, - "grad_norm": 1.2109375, - "learning_rate": 0.0002119298766794904, - "loss": 0.6074, + "grad_norm": 1.9765625, + "learning_rate": 8.449990315481787e-05, + "loss": 0.6639, "step": 2410 }, { "epoch": 0.5813673567645643, - "grad_norm": 1.2109375, - "learning_rate": 0.00021186910711203116, - "loss": 0.6128, + "grad_norm": 1.984375, + "learning_rate": 8.447567333575921e-05, + "loss": 0.6632, "step": 2415 }, { "epoch": 0.5825710158883004, - "grad_norm": 1.3515625, - "learning_rate": 0.00021180822550428917, - "loss": 0.6706, + "grad_norm": 2.0625, + "learning_rate": 8.445139884440995e-05, + "loss": 0.7215, "step": 2420 }, { "epoch": 0.5837746750120366, - "grad_norm": 1.078125, - "learning_rate": 0.00021174723195353768, - "loss": 0.6283, + "grad_norm": 1.8203125, + "learning_rate": 8.44270797195545e-05, + "loss": 0.6915, "step": 2425 }, { "epoch": 0.5849783341357727, - "grad_norm": 1.921875, - "learning_rate": 0.00021168612655722872, - "loss": 0.6635, + "grad_norm": 1.9765625, + "learning_rate": 8.440271600004858e-05, + "loss": 0.7024, "step": 2430 }, { "epoch": 0.5861819932595089, - "grad_norm": 1.3671875, - "learning_rate": 0.0002116249094129931, - "loss": 0.6292, + "grad_norm": 2.15625, + "learning_rate": 8.437830772481918e-05, + "loss": 0.6825, "step": 2435 }, { "epoch": 0.587385652383245, - "grad_norm": 1.171875, - "learning_rate": 0.00021156358061864006, - "loss": 0.6267, + "grad_norm": 1.953125, + "learning_rate": 8.435385493286446e-05, + "loss": 0.6769, "step": 2440 }, { "epoch": 0.5885893115069812, - "grad_norm": 1.3203125, - "learning_rate": 0.0002115021402721573, - "loss": 0.6445, + "grad_norm": 1.9921875, + "learning_rate": 8.432935766325371e-05, + "loss": 0.6996, "step": 2445 }, { "epoch": 0.5897929706307173, - "grad_norm": 1.1640625, - "learning_rate": 0.00021144058847171078, - "loss": 0.6511, + "grad_norm": 1.9765625, + "learning_rate": 8.430481595512729e-05, + "loss": 0.6987, "step": 2450 }, { "epoch": 0.5909966297544536, - "grad_norm": 1.1953125, - "learning_rate": 0.0002113789253156445, - "loss": 0.6339, + "grad_norm": 2.140625, + "learning_rate": 8.428022984769657e-05, + "loss": 0.6808, "step": 2455 }, { "epoch": 0.5922002888781897, - "grad_norm": 1.1640625, - "learning_rate": 0.00021131715090248033, - "loss": 0.6244, + "grad_norm": 1.71875, + "learning_rate": 8.425559938024383e-05, + "loss": 0.6724, "step": 2460 }, { "epoch": 0.5934039480019259, - "grad_norm": 1.3203125, - "learning_rate": 0.00021125526533091797, - "loss": 0.6402, + "grad_norm": 2.15625, + "learning_rate": 8.423092459212224e-05, + "loss": 0.6991, "step": 2465 }, { "epoch": 0.594607607125662, - "grad_norm": 1.3359375, - "learning_rate": 0.0002111932686998347, - "loss": 0.6461, + "grad_norm": 1.9921875, + "learning_rate": 8.42062055227558e-05, + "loss": 0.6948, "step": 2470 }, { "epoch": 0.5958112662493982, - "grad_norm": 1.25, - "learning_rate": 0.00021113116110828528, - "loss": 0.6322, + "grad_norm": 1.984375, + "learning_rate": 8.418144221163925e-05, + "loss": 0.6856, "step": 2475 }, { "epoch": 0.5970149253731343, - "grad_norm": 1.28125, - "learning_rate": 0.0002110689426555016, - "loss": 0.6586, + "grad_norm": 1.9453125, + "learning_rate": 8.4156634698338e-05, + "loss": 0.7034, "step": 2480 }, { "epoch": 0.5982185844968705, - "grad_norm": 1.21875, - "learning_rate": 0.00021100661344089296, - "loss": 0.7098, + "grad_norm": 1.8515625, + "learning_rate": 8.413178302248809e-05, + "loss": 0.7589, "step": 2485 }, { "epoch": 0.5994222436206067, - "grad_norm": 1.171875, - "learning_rate": 0.00021094417356404534, - "loss": 0.6151, + "grad_norm": 1.8203125, + "learning_rate": 8.410688722379615e-05, + "loss": 0.6722, "step": 2490 }, { "epoch": 0.6006259027443428, - "grad_norm": 1.4375, - "learning_rate": 0.00021088162312472172, - "loss": 0.6223, + "grad_norm": 2.265625, + "learning_rate": 8.408194734203926e-05, + "loss": 0.677, "step": 2495 }, { "epoch": 0.601829561868079, - "grad_norm": 1.2265625, - "learning_rate": 0.00021081896222286168, - "loss": 0.635, + "grad_norm": 1.859375, + "learning_rate": 8.405696341706498e-05, + "loss": 0.6897, "step": 2500 }, { "epoch": 0.601829561868079, - "eval_loss": 0.5357303619384766, - "eval_runtime": 2.329, - "eval_samples_per_second": 85.873, - "eval_steps_per_second": 85.873, + "eval_loss": 0.5873296856880188, + "eval_runtime": 2.3774, + "eval_samples_per_second": 84.126, + "eval_steps_per_second": 84.126, "step": 2500 }, { "epoch": 0.6030332209918151, - "grad_norm": 1.203125, - "learning_rate": 0.0002107561909585812, - "loss": 0.6146, + "grad_norm": 2.21875, + "learning_rate": 8.403193548879122e-05, + "loss": 0.6698, "step": 2505 }, { "epoch": 0.6042368801155513, - "grad_norm": 1.0859375, - "learning_rate": 0.00021069330943217275, - "loss": 0.6374, + "grad_norm": 1.78125, + "learning_rate": 8.400686359720615e-05, + "loss": 0.6876, "step": 2510 }, { "epoch": 0.6054405392392874, - "grad_norm": 1.2734375, - "learning_rate": 0.00021063031774410483, - "loss": 0.6167, + "grad_norm": 2.109375, + "learning_rate": 8.398174778236827e-05, + "loss": 0.6744, "step": 2515 }, { "epoch": 0.6066441983630236, - "grad_norm": 1.1484375, - "learning_rate": 0.00021056721599502207, - "loss": 0.6452, + "grad_norm": 1.9921875, + "learning_rate": 8.395658808440621e-05, + "loss": 0.6946, "step": 2520 }, { "epoch": 0.6078478574867597, - "grad_norm": 1.1953125, - "learning_rate": 0.00021050400428574483, - "loss": 0.646, + "grad_norm": 1.9453125, + "learning_rate": 8.39313845435187e-05, + "loss": 0.6982, "step": 2525 }, { "epoch": 0.609051516610496, - "grad_norm": 1.234375, - "learning_rate": 0.00021044068271726924, - "loss": 0.6255, + "grad_norm": 1.9296875, + "learning_rate": 8.390613719997455e-05, + "loss": 0.6789, "step": 2530 }, { "epoch": 0.610255175734232, - "grad_norm": 1.171875, - "learning_rate": 0.00021037725139076694, - "loss": 0.6137, + "grad_norm": 1.84375, + "learning_rate": 8.388084609411253e-05, + "loss": 0.6695, "step": 2535 }, { "epoch": 0.6114588348579683, - "grad_norm": 1.2578125, - "learning_rate": 0.00021031371040758498, - "loss": 0.6115, + "grad_norm": 1.953125, + "learning_rate": 8.385551126634136e-05, + "loss": 0.6683, "step": 2540 }, { "epoch": 0.6126624939817044, - "grad_norm": 1.15625, - "learning_rate": 0.0002102500598692454, - "loss": 0.6333, + "grad_norm": 1.9921875, + "learning_rate": 8.383013275713957e-05, + "loss": 0.6808, "step": 2545 }, { "epoch": 0.6138661531054406, - "grad_norm": 1.125, - "learning_rate": 0.00021018629987744564, - "loss": 0.6361, + "grad_norm": 1.8515625, + "learning_rate": 8.380471060705555e-05, + "loss": 0.685, "step": 2550 }, { "epoch": 0.6150698122291767, - "grad_norm": 1.109375, - "learning_rate": 0.00021012243053405768, - "loss": 0.6137, + "grad_norm": 2.0625, + "learning_rate": 8.377924485670735e-05, + "loss": 0.6711, "step": 2555 }, { "epoch": 0.6162734713529129, - "grad_norm": 1.140625, - "learning_rate": 0.00021005845194112846, - "loss": 0.6192, + "grad_norm": 2.0625, + "learning_rate": 8.375373554678274e-05, + "loss": 0.6687, "step": 2560 }, { "epoch": 0.617477130476649, - "grad_norm": 1.1796875, - "learning_rate": 0.00020999436420087928, - "loss": 0.641, + "grad_norm": 1.9296875, + "learning_rate": 8.372818271803903e-05, + "loss": 0.694, "step": 2565 }, { "epoch": 0.6186807896003852, - "grad_norm": 1.265625, - "learning_rate": 0.000209930167415706, - "loss": 0.6261, + "grad_norm": 2.0, + "learning_rate": 8.370258641130311e-05, + "loss": 0.6833, "step": 2570 }, { "epoch": 0.6198844487241213, - "grad_norm": 1.265625, - "learning_rate": 0.00020986586168817852, - "loss": 0.5986, + "grad_norm": 1.9375, + "learning_rate": 8.36769466674713e-05, + "loss": 0.6531, "step": 2575 }, { "epoch": 0.6210881078478575, - "grad_norm": 1.2734375, - "learning_rate": 0.00020980144712104103, - "loss": 0.5911, + "grad_norm": 2.0, + "learning_rate": 8.365126352750935e-05, + "loss": 0.6508, "step": 2580 }, { "epoch": 0.6222917669715936, - "grad_norm": 1.15625, - "learning_rate": 0.0002097369238172114, - "loss": 0.6609, + "grad_norm": 1.8515625, + "learning_rate": 8.362553703245232e-05, + "loss": 0.7133, "step": 2585 }, { "epoch": 0.6234954260953298, - "grad_norm": 1.234375, - "learning_rate": 0.0002096722918797814, - "loss": 0.6253, + "grad_norm": 2.0, + "learning_rate": 8.359976722340458e-05, + "loss": 0.6703, "step": 2590 }, { "epoch": 0.6246990852190659, - "grad_norm": 1.234375, - "learning_rate": 0.00020960755141201625, - "loss": 0.6306, + "grad_norm": 2.0, + "learning_rate": 8.357395414153964e-05, + "loss": 0.6837, "step": 2595 }, { "epoch": 0.6259027443428021, - "grad_norm": 1.171875, - "learning_rate": 0.00020954270251735465, - "loss": 0.6236, + "grad_norm": 1.8203125, + "learning_rate": 8.354809782810021e-05, + "loss": 0.672, "step": 2600 }, { "epoch": 0.6271064034665382, - "grad_norm": 1.3359375, - "learning_rate": 0.0002094777452994085, - "loss": 0.6331, + "grad_norm": 2.125, + "learning_rate": 8.352219832439807e-05, + "loss": 0.6889, "step": 2605 }, { "epoch": 0.6283100625902744, - "grad_norm": 1.3359375, - "learning_rate": 0.00020941267986196275, - "loss": 0.6534, + "grad_norm": 1.9765625, + "learning_rate": 8.349625567181395e-05, + "loss": 0.71, "step": 2610 }, { "epoch": 0.6295137217140105, - "grad_norm": 1.1953125, - "learning_rate": 0.00020934750630897535, - "loss": 0.6403, + "grad_norm": 1.875, + "learning_rate": 8.34702699117976e-05, + "loss": 0.6913, "step": 2615 }, { "epoch": 0.6307173808377468, - "grad_norm": 1.1015625, - "learning_rate": 0.00020928222474457688, - "loss": 0.6232, + "grad_norm": 1.859375, + "learning_rate": 8.344424108586755e-05, + "loss": 0.6695, "step": 2620 }, { "epoch": 0.6319210399614829, - "grad_norm": 1.1953125, - "learning_rate": 0.00020921683527307054, - "loss": 0.6653, + "grad_norm": 1.8828125, + "learning_rate": 8.341816923561122e-05, + "loss": 0.7204, "step": 2625 }, { "epoch": 0.6331246990852191, - "grad_norm": 1.140625, - "learning_rate": 0.00020915133799893202, - "loss": 0.6327, + "grad_norm": 1.796875, + "learning_rate": 8.339205440268475e-05, + "loss": 0.695, "step": 2630 }, { "epoch": 0.6343283582089553, - "grad_norm": 1.125, - "learning_rate": 0.0002090857330268091, - "loss": 0.6198, + "grad_norm": 1.9921875, + "learning_rate": 8.336589662881294e-05, + "loss": 0.6763, "step": 2635 }, { "epoch": 0.6355320173326914, - "grad_norm": 1.1640625, - "learning_rate": 0.0002090200204615217, - "loss": 0.6601, + "grad_norm": 2.015625, + "learning_rate": 8.333969595578922e-05, + "loss": 0.7225, "step": 2640 }, { "epoch": 0.6367356764564276, - "grad_norm": 1.2421875, - "learning_rate": 0.0002089542004080617, - "loss": 0.636, + "grad_norm": 2.03125, + "learning_rate": 8.331345242547552e-05, + "loss": 0.6886, "step": 2645 }, { "epoch": 0.6379393355801637, - "grad_norm": 1.1953125, - "learning_rate": 0.00020888827297159266, - "loss": 0.621, + "grad_norm": 1.8671875, + "learning_rate": 8.328716607980231e-05, + "loss": 0.6745, "step": 2650 }, { "epoch": 0.6391429947038999, - "grad_norm": 1.3203125, - "learning_rate": 0.0002088222382574497, - "loss": 0.6454, + "grad_norm": 2.09375, + "learning_rate": 8.32608369607684e-05, + "loss": 0.6954, "step": 2655 }, { "epoch": 0.640346653827636, - "grad_norm": 1.2265625, - "learning_rate": 0.0002087560963711394, - "loss": 0.621, + "grad_norm": 1.7890625, + "learning_rate": 8.323446511044099e-05, + "loss": 0.6837, "step": 2660 }, { "epoch": 0.6415503129513722, - "grad_norm": 1.1796875, - "learning_rate": 0.0002086898474183395, - "loss": 0.6225, + "grad_norm": 1.828125, + "learning_rate": 8.320805057095552e-05, + "loss": 0.6815, "step": 2665 }, { "epoch": 0.6427539720751083, - "grad_norm": 1.0859375, - "learning_rate": 0.00020862349150489886, - "loss": 0.598, + "grad_norm": 1.796875, + "learning_rate": 8.318159338451568e-05, + "loss": 0.6518, "step": 2670 }, { "epoch": 0.6439576311988445, - "grad_norm": 1.0390625, - "learning_rate": 0.00020855702873683724, - "loss": 0.6181, + "grad_norm": 1.6875, + "learning_rate": 8.315509359339325e-05, + "loss": 0.6725, "step": 2675 }, { "epoch": 0.6451612903225806, - "grad_norm": 1.3515625, - "learning_rate": 0.00020849045922034508, - "loss": 0.6285, + "grad_norm": 2.28125, + "learning_rate": 8.312855123992811e-05, + "loss": 0.6893, "step": 2680 }, { "epoch": 0.6463649494463168, - "grad_norm": 1.171875, - "learning_rate": 0.00020842378306178342, - "loss": 0.6267, + "grad_norm": 1.9453125, + "learning_rate": 8.310196636652814e-05, + "loss": 0.6899, "step": 2685 }, { "epoch": 0.6475686085700529, - "grad_norm": 1.234375, - "learning_rate": 0.00020835700036768364, - "loss": 0.6217, + "grad_norm": 1.9609375, + "learning_rate": 8.307533901566915e-05, + "loss": 0.6764, "step": 2690 }, { "epoch": 0.6487722676937892, - "grad_norm": 1.1484375, - "learning_rate": 0.00020829011124474738, - "loss": 0.6169, + "grad_norm": 1.9140625, + "learning_rate": 8.30486692298948e-05, + "loss": 0.6736, "step": 2695 }, { "epoch": 0.6499759268175253, - "grad_norm": 1.25, - "learning_rate": 0.00020822311579984636, - "loss": 0.6251, + "grad_norm": 1.9296875, + "learning_rate": 8.302195705181661e-05, + "loss": 0.6778, "step": 2700 }, { "epoch": 0.6511795859412615, - "grad_norm": 1.140625, - "learning_rate": 0.00020815601414002203, - "loss": 0.679, + "grad_norm": 1.953125, + "learning_rate": 8.299520252411376e-05, + "loss": 0.7351, "step": 2705 }, { "epoch": 0.6523832450649976, - "grad_norm": 1.109375, - "learning_rate": 0.00020808880637248573, - "loss": 0.606, + "grad_norm": 1.9765625, + "learning_rate": 8.296840568953316e-05, + "loss": 0.6587, "step": 2710 }, { "epoch": 0.6535869041887338, - "grad_norm": 1.078125, - "learning_rate": 0.0002080214926046182, - "loss": 0.6391, + "grad_norm": 1.796875, + "learning_rate": 8.294156659088925e-05, + "loss": 0.6894, "step": 2715 }, { "epoch": 0.6547905633124699, - "grad_norm": 1.15625, - "learning_rate": 0.00020795407294396954, - "loss": 0.6187, + "grad_norm": 1.8359375, + "learning_rate": 8.291468527106403e-05, + "loss": 0.6801, "step": 2720 }, { "epoch": 0.6559942224362061, - "grad_norm": 1.2421875, - "learning_rate": 0.0002078865474982592, - "loss": 0.6171, + "grad_norm": 2.03125, + "learning_rate": 8.288776177300697e-05, + "loss": 0.6697, "step": 2725 }, { "epoch": 0.6571978815599422, - "grad_norm": 1.2734375, - "learning_rate": 0.00020781891637537542, - "loss": 0.6509, + "grad_norm": 2.078125, + "learning_rate": 8.286079613973492e-05, + "loss": 0.7093, "step": 2730 }, { "epoch": 0.6584015406836784, - "grad_norm": 1.2265625, - "learning_rate": 0.0002077511796833755, - "loss": 0.6642, + "grad_norm": 1.8828125, + "learning_rate": 8.283378841433207e-05, + "loss": 0.7219, "step": 2735 }, { "epoch": 0.6596051998074145, - "grad_norm": 1.125, - "learning_rate": 0.0002076833375304852, - "loss": 0.6154, + "grad_norm": 1.8359375, + "learning_rate": 8.280673863994979e-05, + "loss": 0.665, "step": 2740 }, { "epoch": 0.6608088589311507, - "grad_norm": 1.1484375, - "learning_rate": 0.00020761539002509897, - "loss": 0.5939, + "grad_norm": 1.8828125, + "learning_rate": 8.277964685980671e-05, + "loss": 0.6458, "step": 2745 }, { "epoch": 0.6620125180548868, - "grad_norm": 1.234375, - "learning_rate": 0.00020754733727577945, - "loss": 0.6353, + "grad_norm": 1.9140625, + "learning_rate": 8.275251311718857e-05, + "loss": 0.6887, "step": 2750 }, { "epoch": 0.663216177178623, - "grad_norm": 1.1640625, - "learning_rate": 0.00020747917939125757, - "loss": 0.6425, + "grad_norm": 1.859375, + "learning_rate": 8.272533745544813e-05, + "loss": 0.6971, "step": 2755 }, { "epoch": 0.6644198363023591, - "grad_norm": 1.125, - "learning_rate": 0.00020741091648043204, - "loss": 0.6184, + "grad_norm": 1.96875, + "learning_rate": 8.269811991800508e-05, + "loss": 0.6764, "step": 2760 }, { "epoch": 0.6656234954260953, - "grad_norm": 1.1328125, - "learning_rate": 0.0002073425486523696, - "loss": 0.6352, + "grad_norm": 1.9609375, + "learning_rate": 8.267086054834613e-05, + "loss": 0.6856, "step": 2765 }, { "epoch": 0.6668271545498314, - "grad_norm": 1.1875, - "learning_rate": 0.00020727407601630447, - "loss": 0.6352, + "grad_norm": 1.8828125, + "learning_rate": 8.264355939002474e-05, + "loss": 0.6799, "step": 2770 }, { "epoch": 0.6680308136735676, - "grad_norm": 1.3515625, - "learning_rate": 0.00020720549868163835, - "loss": 0.6225, + "grad_norm": 1.9453125, + "learning_rate": 8.261621648666117e-05, + "loss": 0.6726, "step": 2775 }, { "epoch": 0.6692344727973039, - "grad_norm": 1.1328125, - "learning_rate": 0.00020713681675794027, - "loss": 0.6428, + "grad_norm": 1.8671875, + "learning_rate": 8.258883188194235e-05, + "loss": 0.6958, "step": 2780 }, { "epoch": 0.67043813192104, - "grad_norm": 1.15625, - "learning_rate": 0.0002070680303549463, - "loss": 0.6306, + "grad_norm": 1.921875, + "learning_rate": 8.256140561962187e-05, + "loss": 0.6789, "step": 2785 }, { "epoch": 0.6716417910447762, - "grad_norm": 1.2578125, - "learning_rate": 0.00020699913958255951, - "loss": 0.5984, + "grad_norm": 2.21875, + "learning_rate": 8.253393774351987e-05, + "loss": 0.6606, "step": 2790 }, { "epoch": 0.6728454501685123, - "grad_norm": 1.109375, - "learning_rate": 0.0002069301445508497, - "loss": 0.6286, + "grad_norm": 2.015625, + "learning_rate": 8.250642829752299e-05, + "loss": 0.6853, "step": 2795 }, { "epoch": 0.6740491092922485, - "grad_norm": 1.203125, - "learning_rate": 0.00020686104537005322, - "loss": 0.6289, + "grad_norm": 1.9921875, + "learning_rate": 8.247887732558424e-05, + "loss": 0.6833, "step": 2800 }, { "epoch": 0.6752527684159846, - "grad_norm": 1.234375, - "learning_rate": 0.00020679184215057286, - "loss": 0.6127, + "grad_norm": 1.7890625, + "learning_rate": 8.245128487172301e-05, + "loss": 0.6691, "step": 2805 }, { "epoch": 0.6764564275397208, - "grad_norm": 1.2890625, - "learning_rate": 0.00020672253500297766, - "loss": 0.6326, + "grad_norm": 2.171875, + "learning_rate": 8.242365098002502e-05, + "loss": 0.6835, "step": 2810 }, { "epoch": 0.6776600866634569, - "grad_norm": 1.296875, - "learning_rate": 0.00020665312403800258, - "loss": 0.653, + "grad_norm": 2.0, + "learning_rate": 8.239597569464208e-05, + "loss": 0.7022, "step": 2815 }, { "epoch": 0.6788637457871931, - "grad_norm": 1.21875, - "learning_rate": 0.00020658360936654866, - "loss": 0.634, + "grad_norm": 1.984375, + "learning_rate": 8.236825905979225e-05, + "loss": 0.6848, "step": 2820 }, { "epoch": 0.6800674049109292, - "grad_norm": 1.3671875, - "learning_rate": 0.00020651399109968243, - "loss": 0.6147, + "grad_norm": 1.9375, + "learning_rate": 8.234050111975957e-05, + "loss": 0.6668, "step": 2825 }, { "epoch": 0.6812710640346654, - "grad_norm": 1.15625, - "learning_rate": 0.0002064442693486361, - "loss": 0.5986, + "grad_norm": 1.9453125, + "learning_rate": 8.231270191889412e-05, + "loss": 0.6541, "step": 2830 }, { "epoch": 0.6824747231584015, - "grad_norm": 1.09375, - "learning_rate": 0.0002063744442248071, - "loss": 0.6218, + "grad_norm": 1.7265625, + "learning_rate": 8.228486150161192e-05, + "loss": 0.6727, "step": 2835 }, { "epoch": 0.6836783822821377, - "grad_norm": 1.1015625, - "learning_rate": 0.00020630451583975812, - "loss": 0.6377, + "grad_norm": 1.8828125, + "learning_rate": 8.22569799123948e-05, + "loss": 0.6896, "step": 2840 }, { "epoch": 0.6848820414058738, - "grad_norm": 1.1796875, - "learning_rate": 0.0002062344843052168, - "loss": 0.6162, + "grad_norm": 1.9765625, + "learning_rate": 8.222905719579042e-05, + "loss": 0.6767, "step": 2845 }, { "epoch": 0.68608570052961, - "grad_norm": 1.15625, - "learning_rate": 0.00020616434973307553, - "loss": 0.614, + "grad_norm": 1.875, + "learning_rate": 8.220109339641209e-05, + "loss": 0.6652, "step": 2850 }, { "epoch": 0.6872893596533461, - "grad_norm": 1.234375, - "learning_rate": 0.00020609411223539143, - "loss": 0.5993, + "grad_norm": 2.203125, + "learning_rate": 8.217308855893879e-05, + "loss": 0.6578, "step": 2855 }, { "epoch": 0.6884930187770824, - "grad_norm": 1.1796875, - "learning_rate": 0.00020602377192438601, - "loss": 0.6165, + "grad_norm": 1.8984375, + "learning_rate": 8.21450427281151e-05, + "loss": 0.6724, "step": 2860 }, { "epoch": 0.6896966779008185, - "grad_norm": 1.21875, - "learning_rate": 0.00020595332891244503, - "loss": 0.6207, + "grad_norm": 1.859375, + "learning_rate": 8.211695594875105e-05, + "loss": 0.6726, "step": 2865 }, { "epoch": 0.6909003370245547, - "grad_norm": 1.1640625, - "learning_rate": 0.00020588278331211833, - "loss": 0.6156, + "grad_norm": 1.8984375, + "learning_rate": 8.208882826572211e-05, + "loss": 0.6672, "step": 2870 }, { "epoch": 0.6921039961482908, - "grad_norm": 1.1484375, - "learning_rate": 0.00020581213523611976, - "loss": 0.6058, + "grad_norm": 1.796875, + "learning_rate": 8.206065972396911e-05, + "loss": 0.6602, "step": 2875 }, { "epoch": 0.693307655272027, - "grad_norm": 1.0859375, - "learning_rate": 0.00020574138479732682, - "loss": 0.6111, + "grad_norm": 1.859375, + "learning_rate": 8.203245036849817e-05, + "loss": 0.6736, "step": 2880 }, { "epoch": 0.6945113143957631, - "grad_norm": 1.125, - "learning_rate": 0.00020567053210878057, - "loss": 0.6366, + "grad_norm": 1.8359375, + "learning_rate": 8.20042002443806e-05, + "loss": 0.6935, "step": 2885 }, { "epoch": 0.6957149735194993, - "grad_norm": 1.140625, - "learning_rate": 0.00020559957728368545, - "loss": 0.6231, + "grad_norm": 1.859375, + "learning_rate": 8.197590939675286e-05, + "loss": 0.6757, "step": 2890 }, { "epoch": 0.6969186326432354, - "grad_norm": 1.1953125, - "learning_rate": 0.00020552852043540903, - "loss": 0.6342, + "grad_norm": 1.9921875, + "learning_rate": 8.194757787081647e-05, + "loss": 0.6979, "step": 2895 }, { "epoch": 0.6981222917669716, - "grad_norm": 1.265625, - "learning_rate": 0.000205457361677482, - "loss": 0.5919, + "grad_norm": 2.03125, + "learning_rate": 8.191920571183793e-05, + "loss": 0.6502, "step": 2900 }, { "epoch": 0.6993259508907077, - "grad_norm": 1.21875, - "learning_rate": 0.00020538610112359783, - "loss": 0.6279, + "grad_norm": 1.7890625, + "learning_rate": 8.189079296514871e-05, + "loss": 0.6837, "step": 2905 }, { "epoch": 0.7005296100144439, - "grad_norm": 1.0703125, - "learning_rate": 0.0002053147388876125, - "loss": 0.5846, + "grad_norm": 1.8359375, + "learning_rate": 8.186233967614505e-05, + "loss": 0.636, "step": 2910 }, { "epoch": 0.70173326913818, - "grad_norm": 1.2890625, - "learning_rate": 0.0002052432750835447, - "loss": 0.6311, + "grad_norm": 1.9453125, + "learning_rate": 8.183384589028806e-05, + "loss": 0.6806, "step": 2915 }, { "epoch": 0.7029369282619162, - "grad_norm": 1.109375, - "learning_rate": 0.00020517170982557522, - "loss": 0.5531, + "grad_norm": 1.8671875, + "learning_rate": 8.180531165310348e-05, + "loss": 0.6107, "step": 2920 }, { "epoch": 0.7041405873856523, - "grad_norm": 1.234375, - "learning_rate": 0.000205100043228047, - "loss": 0.6293, + "grad_norm": 1.96875, + "learning_rate": 8.177673701018172e-05, + "loss": 0.6876, "step": 2925 }, { "epoch": 0.7053442465093885, - "grad_norm": 1.140625, - "learning_rate": 0.00020502827540546485, - "loss": 0.5835, + "grad_norm": 2.03125, + "learning_rate": 8.174812200717771e-05, + "loss": 0.6419, "step": 2930 }, { "epoch": 0.7065479056331248, - "grad_norm": 1.1796875, - "learning_rate": 0.00020495640647249537, - "loss": 0.5965, + "grad_norm": 2.0625, + "learning_rate": 8.17194666898109e-05, + "loss": 0.6529, "step": 2935 }, { "epoch": 0.7077515647568609, - "grad_norm": 1.1328125, - "learning_rate": 0.00020488443654396676, - "loss": 0.5794, + "grad_norm": 1.8203125, + "learning_rate": 8.169077110386515e-05, + "loss": 0.6357, "step": 2940 }, { "epoch": 0.7089552238805971, - "grad_norm": 1.15625, - "learning_rate": 0.00020481236573486846, - "loss": 0.6297, + "grad_norm": 1.859375, + "learning_rate": 8.166203529518865e-05, + "loss": 0.6823, "step": 2945 }, { "epoch": 0.7101588830043332, - "grad_norm": 1.328125, - "learning_rate": 0.00020474019416035115, - "loss": 0.6167, + "grad_norm": 1.953125, + "learning_rate": 8.163325930969384e-05, + "loss": 0.6726, "step": 2950 }, { "epoch": 0.7113625421280694, - "grad_norm": 1.15625, - "learning_rate": 0.0002046679219357265, - "loss": 0.6045, + "grad_norm": 1.8828125, + "learning_rate": 8.160444319335739e-05, + "loss": 0.6617, "step": 2955 }, { "epoch": 0.7125662012518055, - "grad_norm": 1.140625, - "learning_rate": 0.00020459554917646699, - "loss": 0.5762, + "grad_norm": 1.84375, + "learning_rate": 8.157558699222005e-05, + "loss": 0.6209, "step": 2960 }, { "epoch": 0.7137698603755417, - "grad_norm": 1.0390625, - "learning_rate": 0.00020452307599820577, - "loss": 0.5958, + "grad_norm": 2.046875, + "learning_rate": 8.154669075238665e-05, + "loss": 0.6494, "step": 2965 }, { "epoch": 0.7149735194992778, - "grad_norm": 1.2109375, - "learning_rate": 0.00020445050251673635, - "loss": 0.6138, + "grad_norm": 1.8671875, + "learning_rate": 8.151775452002595e-05, + "loss": 0.6646, "step": 2970 }, { "epoch": 0.716177178623014, - "grad_norm": 1.2890625, - "learning_rate": 0.0002043778288480126, - "loss": 0.5892, + "grad_norm": 1.875, + "learning_rate": 8.148877834137063e-05, + "loss": 0.6488, "step": 2975 }, { "epoch": 0.7173808377467501, - "grad_norm": 1.1875, - "learning_rate": 0.0002043050551081484, - "loss": 0.6052, + "grad_norm": 1.9375, + "learning_rate": 8.145976226271723e-05, + "loss": 0.6553, "step": 2980 }, { "epoch": 0.7185844968704863, - "grad_norm": 1.0703125, - "learning_rate": 0.00020423218141341754, - "loss": 0.6163, + "grad_norm": 1.8203125, + "learning_rate": 8.143070633042598e-05, + "loss": 0.675, "step": 2985 }, { "epoch": 0.7197881559942224, - "grad_norm": 1.21875, - "learning_rate": 0.00020415920788025344, - "loss": 0.6141, + "grad_norm": 1.9765625, + "learning_rate": 8.14016105909208e-05, + "loss": 0.6698, "step": 2990 }, { "epoch": 0.7209918151179586, - "grad_norm": 1.078125, - "learning_rate": 0.00020408613462524918, - "loss": 0.5943, + "grad_norm": 1.9921875, + "learning_rate": 8.137247509068925e-05, + "loss": 0.6541, "step": 2995 }, { "epoch": 0.7221954742416947, - "grad_norm": 1.203125, - "learning_rate": 0.00020401296176515704, - "loss": 0.5897, + "grad_norm": 1.8671875, + "learning_rate": 8.134329987628239e-05, + "loss": 0.6526, "step": 3000 }, { "epoch": 0.7221954742416947, - "eval_loss": 0.5175977349281311, - "eval_runtime": 2.3616, - "eval_samples_per_second": 84.689, - "eval_steps_per_second": 84.689, + "eval_loss": 0.5636051297187805, + "eval_runtime": 2.3734, + "eval_samples_per_second": 84.267, + "eval_steps_per_second": 84.267, "step": 3000 }, { "epoch": 0.7233991333654309, - "grad_norm": 1.1640625, - "learning_rate": 0.00020393968941688853, - "loss": 0.6327, + "grad_norm": 1.9453125, + "learning_rate": 8.13140849943147e-05, + "loss": 0.6909, "step": 3005 }, { "epoch": 0.724602792489167, - "grad_norm": 1.15625, - "learning_rate": 0.00020386631769751402, - "loss": 0.675, + "grad_norm": 1.9453125, + "learning_rate": 8.12848304914641e-05, + "loss": 0.7247, "step": 3010 }, { "epoch": 0.7258064516129032, - "grad_norm": 1.171875, - "learning_rate": 0.00020379284672426278, - "loss": 0.6061, + "grad_norm": 1.921875, + "learning_rate": 8.125553641447178e-05, + "loss": 0.6622, "step": 3015 }, { "epoch": 0.7270101107366393, - "grad_norm": 1.15625, - "learning_rate": 0.0002037192766145225, - "loss": 0.5972, + "grad_norm": 1.8671875, + "learning_rate": 8.122620281014217e-05, + "loss": 0.6514, "step": 3020 }, { "epoch": 0.7282137698603756, - "grad_norm": 1.109375, - "learning_rate": 0.00020364560748583946, - "loss": 0.592, + "grad_norm": 1.8984375, + "learning_rate": 8.119682972534283e-05, + "loss": 0.6487, "step": 3025 }, { "epoch": 0.7294174289841117, - "grad_norm": 1.1796875, - "learning_rate": 0.00020357183945591797, - "loss": 0.6227, + "grad_norm": 2.03125, + "learning_rate": 8.116741720700445e-05, + "loss": 0.6745, "step": 3030 }, { "epoch": 0.7306210881078479, - "grad_norm": 1.1171875, - "learning_rate": 0.00020349797264262046, - "loss": 0.5965, + "grad_norm": 1.75, + "learning_rate": 8.113796530212067e-05, + "loss": 0.6514, "step": 3035 }, { "epoch": 0.731824747231584, - "grad_norm": 1.1484375, - "learning_rate": 0.00020342400716396718, - "loss": 0.5808, + "grad_norm": 1.96875, + "learning_rate": 8.11084740577481e-05, + "loss": 0.6335, "step": 3040 }, { "epoch": 0.7330284063553202, - "grad_norm": 1.1484375, - "learning_rate": 0.00020334994313813597, - "loss": 0.5812, + "grad_norm": 1.8828125, + "learning_rate": 8.10789435210062e-05, + "loss": 0.6396, "step": 3045 }, { "epoch": 0.7342320654790563, - "grad_norm": 1.1171875, - "learning_rate": 0.00020327578068346212, - "loss": 0.6354, + "grad_norm": 1.8515625, + "learning_rate": 8.104937373907715e-05, + "loss": 0.6843, "step": 3050 }, { "epoch": 0.7354357246027925, - "grad_norm": 1.296875, - "learning_rate": 0.00020320151991843832, - "loss": 0.5793, + "grad_norm": 2.25, + "learning_rate": 8.101976475920592e-05, + "loss": 0.6315, "step": 3055 }, { "epoch": 0.7366393837265286, - "grad_norm": 1.21875, - "learning_rate": 0.00020312716096171417, - "loss": 0.6238, + "grad_norm": 1.9140625, + "learning_rate": 8.099011662870006e-05, + "loss": 0.6807, "step": 3060 }, { "epoch": 0.7378430428502648, - "grad_norm": 1.2109375, - "learning_rate": 0.0002030527039320962, - "loss": 0.6126, + "grad_norm": 1.9375, + "learning_rate": 8.096042939492967e-05, + "loss": 0.667, "step": 3065 }, { "epoch": 0.7390467019740009, - "grad_norm": 1.125, - "learning_rate": 0.00020297814894854773, - "loss": 0.6398, + "grad_norm": 1.8125, + "learning_rate": 8.093070310532736e-05, + "loss": 0.6976, "step": 3070 }, { "epoch": 0.7402503610977371, - "grad_norm": 1.1015625, - "learning_rate": 0.00020290349613018846, - "loss": 0.6368, + "grad_norm": 1.8515625, + "learning_rate": 8.090093780738813e-05, + "loss": 0.6889, "step": 3075 }, { "epoch": 0.7414540202214733, - "grad_norm": 1.09375, - "learning_rate": 0.00020282874559629445, - "loss": 0.6148, + "grad_norm": 1.828125, + "learning_rate": 8.087113354866925e-05, + "loss": 0.6632, "step": 3080 }, { "epoch": 0.7426576793452094, - "grad_norm": 1.1484375, - "learning_rate": 0.00020275389746629793, - "loss": 0.5384, + "grad_norm": 1.75, + "learning_rate": 8.084129037679033e-05, + "loss": 0.5964, "step": 3085 }, { "epoch": 0.7438613384689456, - "grad_norm": 1.1484375, - "learning_rate": 0.00020267895185978704, - "loss": 0.6293, + "grad_norm": 2.15625, + "learning_rate": 8.081140833943309e-05, + "loss": 0.6913, "step": 3090 }, { "epoch": 0.7450649975926817, - "grad_norm": 1.078125, - "learning_rate": 0.00020260390889650554, - "loss": 0.5933, + "grad_norm": 1.875, + "learning_rate": 8.078148748434136e-05, + "loss": 0.6537, "step": 3095 }, { "epoch": 0.746268656716418, - "grad_norm": 1.171875, - "learning_rate": 0.00020252876869635293, - "loss": 0.6124, + "grad_norm": 1.8828125, + "learning_rate": 8.0751527859321e-05, + "loss": 0.6671, "step": 3100 }, { "epoch": 0.7474723158401541, - "grad_norm": 1.2421875, - "learning_rate": 0.00020245353137938397, - "loss": 0.6026, + "grad_norm": 2.0, + "learning_rate": 8.07215295122398e-05, + "loss": 0.6567, "step": 3105 }, { "epoch": 0.7486759749638903, - "grad_norm": 1.15625, - "learning_rate": 0.00020237819706580865, - "loss": 0.5674, + "grad_norm": 1.8359375, + "learning_rate": 8.069149249102747e-05, + "loss": 0.6213, "step": 3110 }, { "epoch": 0.7498796340876264, - "grad_norm": 0.9765625, - "learning_rate": 0.00020230276587599182, - "loss": 0.607, + "grad_norm": 1.6875, + "learning_rate": 8.066141684367543e-05, + "loss": 0.666, "step": 3115 }, { "epoch": 0.7510832932113626, - "grad_norm": 1.0078125, - "learning_rate": 0.00020222723793045323, - "loss": 0.5869, + "grad_norm": 1.7890625, + "learning_rate": 8.063130261823686e-05, + "loss": 0.6447, "step": 3120 }, { "epoch": 0.7522869523350987, - "grad_norm": 1.1640625, - "learning_rate": 0.00020215161334986715, - "loss": 0.5974, + "grad_norm": 1.7890625, + "learning_rate": 8.060114986282659e-05, + "loss": 0.6485, "step": 3125 }, { "epoch": 0.7534906114588349, - "grad_norm": 1.171875, - "learning_rate": 0.00020207589225506228, - "loss": 0.6184, + "grad_norm": 1.921875, + "learning_rate": 8.057095862562095e-05, + "loss": 0.6681, "step": 3130 }, { "epoch": 0.754694270582571, - "grad_norm": 1.171875, - "learning_rate": 0.0002020000747670215, - "loss": 0.5738, + "grad_norm": 1.9765625, + "learning_rate": 8.054072895485785e-05, + "loss": 0.6333, "step": 3135 }, { "epoch": 0.7558979297063072, - "grad_norm": 1.0625, - "learning_rate": 0.00020192416100688176, - "loss": 0.6298, + "grad_norm": 1.921875, + "learning_rate": 8.051046089883653e-05, + "loss": 0.6946, "step": 3140 }, { "epoch": 0.7571015888300433, - "grad_norm": 1.15625, - "learning_rate": 0.00020184815109593377, - "loss": 0.5855, + "grad_norm": 1.796875, + "learning_rate": 8.04801545059176e-05, + "loss": 0.6473, "step": 3145 }, { "epoch": 0.7583052479537795, - "grad_norm": 1.1328125, - "learning_rate": 0.00020177204515562188, - "loss": 0.6123, + "grad_norm": 1.9296875, + "learning_rate": 8.044980982452287e-05, + "loss": 0.6631, "step": 3150 }, { "epoch": 0.7595089070775156, - "grad_norm": 1.1015625, - "learning_rate": 0.00020169584330754389, - "loss": 0.5754, + "grad_norm": 2.0, + "learning_rate": 8.04194269031354e-05, + "loss": 0.6357, "step": 3155 }, { "epoch": 0.7607125662012518, - "grad_norm": 1.140625, - "learning_rate": 0.00020161954567345078, - "loss": 0.6184, + "grad_norm": 1.90625, + "learning_rate": 8.03890057902993e-05, + "loss": 0.6665, "step": 3160 }, { "epoch": 0.7619162253249879, - "grad_norm": 1.1171875, - "learning_rate": 0.00020154315237524666, - "loss": 0.6027, + "grad_norm": 1.9296875, + "learning_rate": 8.03585465346197e-05, + "loss": 0.656, "step": 3165 }, { "epoch": 0.7631198844487241, - "grad_norm": 1.0859375, - "learning_rate": 0.00020146666353498843, - "loss": 0.5804, + "grad_norm": 1.8046875, + "learning_rate": 8.03280491847627e-05, + "loss": 0.6366, "step": 3170 }, { "epoch": 0.7643235435724602, - "grad_norm": 1.09375, - "learning_rate": 0.0002013900792748856, - "loss": 0.6019, + "grad_norm": 2.03125, + "learning_rate": 8.029751378945523e-05, + "loss": 0.6621, "step": 3175 }, { "epoch": 0.7655272026961965, - "grad_norm": 1.1171875, - "learning_rate": 0.0002013133997173002, - "loss": 0.5933, + "grad_norm": 1.8515625, + "learning_rate": 8.026694039748501e-05, + "loss": 0.6474, "step": 3180 }, { "epoch": 0.7667308618199326, - "grad_norm": 1.1796875, - "learning_rate": 0.00020123662498474653, - "loss": 0.6004, + "grad_norm": 1.8828125, + "learning_rate": 8.023632905770052e-05, + "loss": 0.656, "step": 3185 }, { "epoch": 0.7679345209436688, - "grad_norm": 1.0703125, - "learning_rate": 0.00020115975519989092, - "loss": 0.6069, + "grad_norm": 1.8828125, + "learning_rate": 8.020567981901081e-05, + "loss": 0.6605, "step": 3190 }, { "epoch": 0.7691381800674049, - "grad_norm": 1.140625, - "learning_rate": 0.00020108279048555158, - "loss": 0.616, + "grad_norm": 1.828125, + "learning_rate": 8.017499273038551e-05, + "loss": 0.6676, "step": 3195 }, { "epoch": 0.7703418391911411, - "grad_norm": 1.15625, - "learning_rate": 0.0002010057309646984, - "loss": 0.6362, + "grad_norm": 1.8828125, + "learning_rate": 8.014426784085473e-05, + "loss": 0.6876, "step": 3200 }, { "epoch": 0.7715454983148772, - "grad_norm": 1.171875, - "learning_rate": 0.00020092857676045272, - "loss": 0.5916, + "grad_norm": 1.9453125, + "learning_rate": 8.011350519950895e-05, + "loss": 0.6471, "step": 3205 }, { "epoch": 0.7727491574386134, - "grad_norm": 1.0703125, - "learning_rate": 0.0002008513279960872, - "loss": 0.6225, + "grad_norm": 1.8515625, + "learning_rate": 8.0082704855499e-05, + "loss": 0.676, "step": 3210 }, { "epoch": 0.7739528165623495, - "grad_norm": 1.0859375, - "learning_rate": 0.0002007739847950256, - "loss": 0.606, + "grad_norm": 1.7578125, + "learning_rate": 8.005186685803592e-05, + "loss": 0.6637, "step": 3215 }, { "epoch": 0.7751564756860857, - "grad_norm": 1.1875, - "learning_rate": 0.00020069654728084243, - "loss": 0.5747, + "grad_norm": 1.96875, + "learning_rate": 8.002099125639094e-05, + "loss": 0.6359, "step": 3220 }, { "epoch": 0.7763601348098219, - "grad_norm": 1.0703125, - "learning_rate": 0.00020061901557726308, - "loss": 0.6203, + "grad_norm": 1.8125, + "learning_rate": 7.999007809989535e-05, + "loss": 0.6745, "step": 3225 }, { "epoch": 0.777563793933558, - "grad_norm": 1.109375, - "learning_rate": 0.0002005413898081633, - "loss": 0.6539, + "grad_norm": 1.828125, + "learning_rate": 7.995912743794046e-05, + "loss": 0.7047, "step": 3230 }, { "epoch": 0.7787674530572942, - "grad_norm": 1.1875, - "learning_rate": 0.0002004636700975691, - "loss": 0.6167, + "grad_norm": 1.875, + "learning_rate": 7.992813931997745e-05, + "loss": 0.6711, "step": 3235 }, { "epoch": 0.7799711121810303, - "grad_norm": 1.1640625, - "learning_rate": 0.00020038585656965684, - "loss": 0.585, + "grad_norm": 1.9765625, + "learning_rate": 7.989711379551745e-05, + "loss": 0.6448, "step": 3240 }, { "epoch": 0.7811747713047665, - "grad_norm": 1.1875, - "learning_rate": 0.00020030794934875238, - "loss": 0.5556, + "grad_norm": 1.9765625, + "learning_rate": 7.986605091413127e-05, + "loss": 0.6056, "step": 3245 }, { "epoch": 0.7823784304285026, - "grad_norm": 1.046875, - "learning_rate": 0.0002002299485593316, - "loss": 0.6019, + "grad_norm": 1.78125, + "learning_rate": 7.98349507254494e-05, + "loss": 0.6616, "step": 3250 }, { "epoch": 0.7835820895522388, - "grad_norm": 1.28125, - "learning_rate": 0.00020015185432601976, - "loss": 0.571, + "grad_norm": 2.109375, + "learning_rate": 7.980381327916201e-05, + "loss": 0.6327, "step": 3255 }, { "epoch": 0.784785748675975, - "grad_norm": 1.09375, - "learning_rate": 0.00020007366677359138, - "loss": 0.6165, + "grad_norm": 1.90625, + "learning_rate": 7.977263862501875e-05, + "loss": 0.6675, "step": 3260 }, { "epoch": 0.7859894077997112, - "grad_norm": 0.98828125, - "learning_rate": 0.0001999953860269702, - "loss": 0.5768, + "grad_norm": 1.78125, + "learning_rate": 7.97414268128287e-05, + "loss": 0.6372, "step": 3265 }, { "epoch": 0.7871930669234473, - "grad_norm": 1.109375, - "learning_rate": 0.00019991701221122872, - "loss": 0.5612, + "grad_norm": 1.875, + "learning_rate": 7.971017789246037e-05, + "loss": 0.6288, "step": 3270 }, { "epoch": 0.7883967260471835, - "grad_norm": 1.1640625, - "learning_rate": 0.00019983854545158823, - "loss": 0.6152, + "grad_norm": 1.9921875, + "learning_rate": 7.967889191384151e-05, + "loss": 0.6759, "step": 3275 }, { "epoch": 0.7896003851709196, - "grad_norm": 1.0625, - "learning_rate": 0.0001997599858734185, - "loss": 0.5604, + "grad_norm": 1.78125, + "learning_rate": 7.964756892695908e-05, + "loss": 0.6199, "step": 3280 }, { "epoch": 0.7908040442946558, - "grad_norm": 1.078125, - "learning_rate": 0.00019968133360223758, - "loss": 0.618, + "grad_norm": 1.7578125, + "learning_rate": 7.96162089818592e-05, + "loss": 0.6693, "step": 3285 }, { "epoch": 0.7920077034183919, - "grad_norm": 1.1328125, - "learning_rate": 0.00019960258876371164, - "loss": 0.5797, + "grad_norm": 1.953125, + "learning_rate": 7.958481212864701e-05, + "loss": 0.6442, "step": 3290 }, { "epoch": 0.7932113625421281, - "grad_norm": 1.234375, - "learning_rate": 0.00019952375148365477, - "loss": 0.5979, + "grad_norm": 1.890625, + "learning_rate": 7.955337841748664e-05, + "loss": 0.6529, "step": 3295 }, { "epoch": 0.7944150216658642, - "grad_norm": 1.1875, - "learning_rate": 0.00019944482188802873, - "loss": 0.6005, + "grad_norm": 2.078125, + "learning_rate": 7.952190789860111e-05, + "loss": 0.6617, "step": 3300 }, { "epoch": 0.7956186807896004, - "grad_norm": 1.1328125, - "learning_rate": 0.00019936580010294273, - "loss": 0.5972, + "grad_norm": 1.8125, + "learning_rate": 7.949040062227223e-05, + "loss": 0.6523, "step": 3305 }, { "epoch": 0.7968223399133365, - "grad_norm": 1.203125, - "learning_rate": 0.0001992866862546534, - "loss": 0.581, + "grad_norm": 2.03125, + "learning_rate": 7.945885663884056e-05, + "loss": 0.6367, "step": 3310 }, { "epoch": 0.7980259990370727, - "grad_norm": 1.1640625, - "learning_rate": 0.00019920748046956433, - "loss": 0.5994, + "grad_norm": 2.03125, + "learning_rate": 7.942727599870528e-05, + "loss": 0.6551, "step": 3315 }, { "epoch": 0.7992296581608088, - "grad_norm": 1.0625, - "learning_rate": 0.0001991281828742261, - "loss": 0.6082, + "grad_norm": 1.765625, + "learning_rate": 7.939565875232418e-05, + "loss": 0.6625, "step": 3320 }, { "epoch": 0.800433317284545, - "grad_norm": 1.0703125, - "learning_rate": 0.0001990487935953359, - "loss": 0.6041, + "grad_norm": 1.8359375, + "learning_rate": 7.936400495021352e-05, + "loss": 0.6613, "step": 3325 }, { "epoch": 0.8016369764082811, - "grad_norm": 1.09375, - "learning_rate": 0.00019896931275973747, - "loss": 0.588, + "grad_norm": 1.859375, + "learning_rate": 7.933231464294792e-05, + "loss": 0.6526, "step": 3330 }, { "epoch": 0.8028406355320173, - "grad_norm": 1.203125, - "learning_rate": 0.00019888974049442077, - "loss": 0.5845, + "grad_norm": 2.046875, + "learning_rate": 7.93005878811604e-05, + "loss": 0.6475, "step": 3335 }, { "epoch": 0.8040442946557534, - "grad_norm": 1.2109375, - "learning_rate": 0.00019881007692652199, - "loss": 0.6132, + "grad_norm": 2.171875, + "learning_rate": 7.92688247155422e-05, + "loss": 0.6639, "step": 3340 }, { "epoch": 0.8052479537794897, - "grad_norm": 1.0703125, - "learning_rate": 0.00019873032218332298, - "loss": 0.5982, + "grad_norm": 1.828125, + "learning_rate": 7.923702519684269e-05, + "loss": 0.6487, "step": 3345 }, { "epoch": 0.8064516129032258, - "grad_norm": 1.125, - "learning_rate": 0.00019865047639225142, - "loss": 0.5542, + "grad_norm": 1.7734375, + "learning_rate": 7.920518937586937e-05, + "loss": 0.6095, "step": 3350 }, { "epoch": 0.807655272026962, - "grad_norm": 1.140625, - "learning_rate": 0.00019857053968088038, - "loss": 0.5909, + "grad_norm": 1.8203125, + "learning_rate": 7.91733173034877e-05, + "loss": 0.6489, "step": 3355 }, { "epoch": 0.8088589311506981, - "grad_norm": 1.09375, - "learning_rate": 0.0001984905121769283, - "loss": 0.5296, + "grad_norm": 1.7109375, + "learning_rate": 7.914140903062111e-05, + "loss": 0.5872, "step": 3360 }, { "epoch": 0.8100625902744343, - "grad_norm": 1.1328125, - "learning_rate": 0.00019841039400825852, - "loss": 0.573, + "grad_norm": 1.8203125, + "learning_rate": 7.91094646082508e-05, + "loss": 0.6228, "step": 3365 }, { "epoch": 0.8112662493981705, - "grad_norm": 1.1015625, - "learning_rate": 0.00019833018530287944, - "loss": 0.5903, + "grad_norm": 1.875, + "learning_rate": 7.90774840874158e-05, + "loss": 0.6478, "step": 3370 }, { "epoch": 0.8124699085219066, - "grad_norm": 1.0625, - "learning_rate": 0.00019824988618894398, - "loss": 0.5946, + "grad_norm": 1.75, + "learning_rate": 7.90454675192128e-05, + "loss": 0.6454, "step": 3375 }, { "epoch": 0.8136735676456428, - "grad_norm": 1.1953125, - "learning_rate": 0.00019816949679474948, - "loss": 0.6007, + "grad_norm": 2.046875, + "learning_rate": 7.901341495479601e-05, + "loss": 0.6554, "step": 3380 }, { "epoch": 0.8148772267693789, - "grad_norm": 0.9453125, - "learning_rate": 0.00019808901724873763, - "loss": 0.546, + "grad_norm": 1.7578125, + "learning_rate": 7.898132644537726e-05, + "loss": 0.6047, "step": 3385 }, { "epoch": 0.8160808858931151, - "grad_norm": 1.34375, - "learning_rate": 0.0001980084476794941, - "loss": 0.6417, + "grad_norm": 2.015625, + "learning_rate": 7.894920204222574e-05, + "loss": 0.6909, "step": 3390 }, { "epoch": 0.8172845450168512, - "grad_norm": 1.265625, - "learning_rate": 0.00019792778821574843, - "loss": 0.5938, + "grad_norm": 1.9140625, + "learning_rate": 7.891704179666802e-05, + "loss": 0.6497, "step": 3395 }, { "epoch": 0.8184882041405874, - "grad_norm": 1.078125, - "learning_rate": 0.00019784703898637372, - "loss": 0.5912, + "grad_norm": 1.796875, + "learning_rate": 7.888484576008793e-05, + "loss": 0.6461, "step": 3400 }, { "epoch": 0.8196918632643235, - "grad_norm": 1.3359375, - "learning_rate": 0.00019776620012038661, - "loss": 0.6322, + "grad_norm": 1.9453125, + "learning_rate": 7.885261398392648e-05, + "loss": 0.6881, "step": 3405 }, { "epoch": 0.8208955223880597, - "grad_norm": 1.0546875, - "learning_rate": 0.00019768527174694682, - "loss": 0.5943, + "grad_norm": 1.9453125, + "learning_rate": 7.882034651968177e-05, + "loss": 0.6539, "step": 3410 }, { "epoch": 0.8220991815117958, - "grad_norm": 1.1953125, - "learning_rate": 0.00019760425399535718, - "loss": 0.5985, + "grad_norm": 2.03125, + "learning_rate": 7.878804341890898e-05, + "loss": 0.6525, "step": 3415 }, { "epoch": 0.823302840635532, - "grad_norm": 1.1875, - "learning_rate": 0.00019752314699506327, - "loss": 0.5655, + "grad_norm": 1.9921875, + "learning_rate": 7.875570473322013e-05, + "loss": 0.6226, "step": 3420 }, { "epoch": 0.8245064997592682, - "grad_norm": 1.0859375, - "learning_rate": 0.00019744195087565328, - "loss": 0.6188, + "grad_norm": 1.84375, + "learning_rate": 7.872333051428418e-05, + "loss": 0.6703, "step": 3425 }, { "epoch": 0.8257101588830044, - "grad_norm": 1.1953125, - "learning_rate": 0.00019736066576685784, - "loss": 0.6342, + "grad_norm": 1.8203125, + "learning_rate": 7.869092081382683e-05, + "loss": 0.6874, "step": 3430 }, { "epoch": 0.8269138180067405, - "grad_norm": 1.1328125, - "learning_rate": 0.00019727929179854962, - "loss": 0.5502, + "grad_norm": 1.8203125, + "learning_rate": 7.865847568363046e-05, + "loss": 0.6103, "step": 3435 }, { "epoch": 0.8281174771304767, - "grad_norm": 1.109375, - "learning_rate": 0.00019719782910074347, - "loss": 0.558, + "grad_norm": 1.8203125, + "learning_rate": 7.862599517553409e-05, + "loss": 0.6162, "step": 3440 }, { "epoch": 0.8293211362542128, - "grad_norm": 1.0859375, - "learning_rate": 0.0001971162778035958, - "loss": 0.5651, + "grad_norm": 1.7109375, + "learning_rate": 7.859347934143323e-05, + "loss": 0.6293, "step": 3445 }, { "epoch": 0.830524795377949, - "grad_norm": 1.0625, - "learning_rate": 0.0001970346380374048, - "loss": 0.5724, + "grad_norm": 1.90625, + "learning_rate": 7.856092823327988e-05, + "loss": 0.6337, "step": 3450 }, { "epoch": 0.8317284545016851, - "grad_norm": 1.140625, - "learning_rate": 0.00019695290993260978, - "loss": 0.5978, + "grad_norm": 1.9296875, + "learning_rate": 7.852834190308231e-05, + "loss": 0.6557, "step": 3455 }, { "epoch": 0.8329321136254213, - "grad_norm": 1.2734375, - "learning_rate": 0.00019687109361979133, - "loss": 0.6081, + "grad_norm": 1.9453125, + "learning_rate": 7.849572040290516e-05, + "loss": 0.669, "step": 3460 }, { "epoch": 0.8341357727491574, - "grad_norm": 1.078125, - "learning_rate": 0.00019678918922967094, - "loss": 0.5728, + "grad_norm": 2.0, + "learning_rate": 7.846306378486922e-05, + "loss": 0.6297, "step": 3465 }, { "epoch": 0.8353394318728936, - "grad_norm": 1.0859375, - "learning_rate": 0.00019670719689311085, - "loss": 0.5811, + "grad_norm": 1.796875, + "learning_rate": 7.84303721011514e-05, + "loss": 0.6417, "step": 3470 }, { "epoch": 0.8365430909966297, - "grad_norm": 1.1484375, - "learning_rate": 0.0001966251167411138, - "loss": 0.5643, + "grad_norm": 1.8515625, + "learning_rate": 7.83976454039846e-05, + "loss": 0.6279, "step": 3475 }, { "epoch": 0.8377467501203659, - "grad_norm": 1.140625, - "learning_rate": 0.0001965429489048228, - "loss": 0.5901, + "grad_norm": 1.9296875, + "learning_rate": 7.836488374565772e-05, + "loss": 0.6407, "step": 3480 }, { "epoch": 0.838950409244102, - "grad_norm": 1.0703125, - "learning_rate": 0.00019646069351552097, - "loss": 0.603, + "grad_norm": 1.71875, + "learning_rate": 7.833208717851549e-05, + "loss": 0.654, "step": 3485 }, { "epoch": 0.8401540683678382, - "grad_norm": 1.0625, - "learning_rate": 0.00019637835070463141, - "loss": 0.5392, + "grad_norm": 1.859375, + "learning_rate": 7.829925575495841e-05, + "loss": 0.5987, "step": 3490 }, { "epoch": 0.8413577274915743, - "grad_norm": 1.265625, - "learning_rate": 0.00019629592060371674, - "loss": 0.6147, + "grad_norm": 1.9296875, + "learning_rate": 7.826638952744266e-05, + "loss": 0.6754, "step": 3495 }, { "epoch": 0.8425613866153105, - "grad_norm": 1.1015625, - "learning_rate": 0.00019621340334447922, - "loss": 0.6238, + "grad_norm": 1.8203125, + "learning_rate": 7.82334885484801e-05, + "loss": 0.6744, "step": 3500 }, { "epoch": 0.8425613866153105, - "eval_loss": 0.4944419860839844, - "eval_runtime": 2.3247, - "eval_samples_per_second": 86.033, - "eval_steps_per_second": 86.033, + "eval_loss": 0.5451033115386963, + "eval_runtime": 2.3757, + "eval_samples_per_second": 84.185, + "eval_steps_per_second": 84.185, "step": 3500 }, { "epoch": 0.8437650457390466, - "grad_norm": 1.0703125, - "learning_rate": 0.0001961307990587602, - "loss": 0.6024, + "grad_norm": 1.7890625, + "learning_rate": 7.8200552870638e-05, + "loss": 0.6557, "step": 3505 }, { "epoch": 0.8449687048627829, - "grad_norm": 1.0703125, - "learning_rate": 0.0001960481078785402, - "loss": 0.5933, + "grad_norm": 1.9453125, + "learning_rate": 7.816758254653918e-05, + "loss": 0.6694, "step": 3510 }, { "epoch": 0.8461723639865191, - "grad_norm": 1.1953125, - "learning_rate": 0.0001959653299359385, - "loss": 0.5914, + "grad_norm": 2.015625, + "learning_rate": 7.813457762886171e-05, + "loss": 0.6502, "step": 3515 }, { "epoch": 0.8473760231102552, - "grad_norm": 1.1171875, - "learning_rate": 0.00019588246536321303, - "loss": 0.5737, + "grad_norm": 1.9140625, + "learning_rate": 7.810153817033905e-05, + "loss": 0.6346, "step": 3520 }, { "epoch": 0.8485796822339914, - "grad_norm": 1.1171875, - "learning_rate": 0.00019579951429276013, - "loss": 0.6031, + "grad_norm": 1.8828125, + "learning_rate": 7.806846422375973e-05, + "loss": 0.6592, "step": 3525 }, { "epoch": 0.8497833413577275, - "grad_norm": 1.125, - "learning_rate": 0.0001957164768571144, - "loss": 0.5895, + "grad_norm": 1.96875, + "learning_rate": 7.803535584196749e-05, + "loss": 0.651, "step": 3530 }, { "epoch": 0.8509870004814637, - "grad_norm": 1.3203125, - "learning_rate": 0.00019563335318894832, - "loss": 0.6464, + "grad_norm": 1.8984375, + "learning_rate": 7.800221307786098e-05, + "loss": 0.6975, "step": 3535 }, { "epoch": 0.8521906596051998, - "grad_norm": 1.0234375, - "learning_rate": 0.00019555014342107223, - "loss": 0.6019, + "grad_norm": 1.7421875, + "learning_rate": 7.79690359843939e-05, + "loss": 0.6522, "step": 3540 }, { "epoch": 0.853394318728936, - "grad_norm": 1.1953125, - "learning_rate": 0.00019546684768643397, - "loss": 0.5916, + "grad_norm": 2.125, + "learning_rate": 7.79358246145747e-05, + "loss": 0.643, "step": 3545 }, { "epoch": 0.8545979778526721, - "grad_norm": 1.0859375, - "learning_rate": 0.00019538346611811883, - "loss": 0.5606, + "grad_norm": 1.71875, + "learning_rate": 7.790257902146664e-05, + "loss": 0.6158, "step": 3550 }, { "epoch": 0.8558016369764083, - "grad_norm": 1.1015625, - "learning_rate": 0.0001952999988493491, - "loss": 0.6111, + "grad_norm": 1.8203125, + "learning_rate": 7.786929925818764e-05, + "loss": 0.6586, "step": 3555 }, { "epoch": 0.8570052961001444, - "grad_norm": 1.0703125, - "learning_rate": 0.00019521644601348418, - "loss": 0.5894, + "grad_norm": 1.78125, + "learning_rate": 7.783598537791028e-05, + "loss": 0.6511, "step": 3560 }, { "epoch": 0.8582089552238806, - "grad_norm": 1.0703125, - "learning_rate": 0.00019513280774402004, - "loss": 0.6145, + "grad_norm": 1.71875, + "learning_rate": 7.780263743386158e-05, + "loss": 0.675, "step": 3565 }, { "epoch": 0.8594126143476167, - "grad_norm": 1.1796875, - "learning_rate": 0.00019504908417458916, - "loss": 0.5932, + "grad_norm": 2.0, + "learning_rate": 7.776925547932298e-05, + "loss": 0.6483, "step": 3570 }, { "epoch": 0.8606162734713529, - "grad_norm": 1.109375, - "learning_rate": 0.00019496527543896034, - "loss": 0.5976, + "grad_norm": 1.8984375, + "learning_rate": 7.77358395676303e-05, + "loss": 0.6581, "step": 3575 }, { "epoch": 0.861819932595089, - "grad_norm": 1.1875, - "learning_rate": 0.00019488138167103852, - "loss": 0.5759, + "grad_norm": 1.9921875, + "learning_rate": 7.770238975217362e-05, + "loss": 0.6332, "step": 3580 }, { "epoch": 0.8630235917188253, - "grad_norm": 1.0703125, - "learning_rate": 0.0001947974030048644, - "loss": 0.5615, + "grad_norm": 1.7890625, + "learning_rate": 7.766890608639714e-05, + "loss": 0.619, "step": 3585 }, { "epoch": 0.8642272508425614, - "grad_norm": 1.0390625, - "learning_rate": 0.0001947133395746143, - "loss": 0.5799, + "grad_norm": 1.9765625, + "learning_rate": 7.76353886237992e-05, + "loss": 0.6358, "step": 3590 }, { "epoch": 0.8654309099662976, - "grad_norm": 1.140625, - "learning_rate": 0.00019462919151460014, - "loss": 0.5977, + "grad_norm": 2.03125, + "learning_rate": 7.76018374179321e-05, + "loss": 0.6597, "step": 3595 }, { "epoch": 0.8666345690900337, - "grad_norm": 1.1953125, - "learning_rate": 0.00019454495895926887, - "loss": 0.5752, + "grad_norm": 1.9375, + "learning_rate": 7.756825252240205e-05, + "loss": 0.6377, "step": 3600 }, { "epoch": 0.8678382282137699, - "grad_norm": 1.0625, - "learning_rate": 0.00019446064204320257, - "loss": 0.5857, + "grad_norm": 1.7578125, + "learning_rate": 7.753463399086914e-05, + "loss": 0.6374, "step": 3605 }, { "epoch": 0.869041887337506, - "grad_norm": 1.109375, - "learning_rate": 0.00019437624090111802, - "loss": 0.5683, + "grad_norm": 1.859375, + "learning_rate": 7.750098187704714e-05, + "loss": 0.6228, "step": 3610 }, { "epoch": 0.8702455464612422, - "grad_norm": 1.046875, - "learning_rate": 0.0001942917556678666, - "loss": 0.612, + "grad_norm": 1.8671875, + "learning_rate": 7.746729623470351e-05, + "loss": 0.6647, "step": 3615 }, { "epoch": 0.8714492055849783, - "grad_norm": 1.171875, - "learning_rate": 0.00019420718647843413, - "loss": 0.5705, + "grad_norm": 2.0, + "learning_rate": 7.743357711765927e-05, + "loss": 0.6302, "step": 3620 }, { "epoch": 0.8726528647087145, - "grad_norm": 1.140625, - "learning_rate": 0.00019412253346794042, - "loss": 0.5963, + "grad_norm": 1.9296875, + "learning_rate": 7.739982457978893e-05, + "loss": 0.6551, "step": 3625 }, { "epoch": 0.8738565238324506, - "grad_norm": 1.0625, - "learning_rate": 0.00019403779677163927, - "loss": 0.6115, + "grad_norm": 1.8984375, + "learning_rate": 7.736603867502036e-05, + "loss": 0.664, "step": 3630 }, { "epoch": 0.8750601829561868, - "grad_norm": 1.03125, - "learning_rate": 0.00019395297652491825, - "loss": 0.5673, + "grad_norm": 1.765625, + "learning_rate": 7.733221945733482e-05, + "loss": 0.6133, "step": 3635 }, { "epoch": 0.8762638420799229, - "grad_norm": 1.0, - "learning_rate": 0.00019386807286329836, - "loss": 0.6173, + "grad_norm": 1.703125, + "learning_rate": 7.729836698076673e-05, + "loss": 0.6739, "step": 3640 }, { "epoch": 0.8774675012036591, - "grad_norm": 1.1171875, - "learning_rate": 0.00019378308592243388, - "loss": 0.5901, + "grad_norm": 1.9921875, + "learning_rate": 7.726448129940368e-05, + "loss": 0.6483, "step": 3645 }, { "epoch": 0.8786711603273952, - "grad_norm": 1.1875, - "learning_rate": 0.00019369801583811214, - "loss": 0.584, + "grad_norm": 1.8515625, + "learning_rate": 7.723056246738628e-05, + "loss": 0.6408, "step": 3650 }, { "epoch": 0.8798748194511314, - "grad_norm": 1.046875, - "learning_rate": 0.00019361286274625333, - "loss": 0.5723, + "grad_norm": 1.875, + "learning_rate": 7.719661053890815e-05, + "loss": 0.6317, "step": 3655 }, { "epoch": 0.8810784785748677, - "grad_norm": 1.078125, - "learning_rate": 0.0001935276267829103, - "loss": 0.5513, + "grad_norm": 1.859375, + "learning_rate": 7.716262556821576e-05, + "loss": 0.6088, "step": 3660 }, { "epoch": 0.8822821376986038, - "grad_norm": 1.375, - "learning_rate": 0.00019344230808426822, - "loss": 0.5449, + "grad_norm": 1.828125, + "learning_rate": 7.712860760960836e-05, + "loss": 0.6008, "step": 3665 }, { "epoch": 0.88348579682234, - "grad_norm": 1.0390625, - "learning_rate": 0.00019335690678664452, - "loss": 0.6053, + "grad_norm": 1.7734375, + "learning_rate": 7.709455671743798e-05, + "loss": 0.6684, "step": 3670 }, { "epoch": 0.8846894559460761, - "grad_norm": 1.1328125, - "learning_rate": 0.00019327142302648855, - "loss": 0.5903, + "grad_norm": 1.8515625, + "learning_rate": 7.706047294610915e-05, + "loss": 0.654, "step": 3675 }, { "epoch": 0.8858931150698123, - "grad_norm": 1.1875, - "learning_rate": 0.0001931858569403815, - "loss": 0.5992, + "grad_norm": 1.859375, + "learning_rate": 7.702635635007905e-05, + "loss": 0.6494, "step": 3680 }, { "epoch": 0.8870967741935484, - "grad_norm": 1.109375, - "learning_rate": 0.000193100208665036, - "loss": 0.5534, + "grad_norm": 1.890625, + "learning_rate": 7.699220698385724e-05, + "loss": 0.6126, "step": 3685 }, { "epoch": 0.8883004333172846, - "grad_norm": 1.15625, - "learning_rate": 0.00019301447833729607, - "loss": 0.5766, + "grad_norm": 1.8515625, + "learning_rate": 7.695802490200566e-05, + "loss": 0.6369, "step": 3690 }, { "epoch": 0.8895040924410207, - "grad_norm": 1.2109375, - "learning_rate": 0.00019292866609413675, - "loss": 0.5827, + "grad_norm": 1.953125, + "learning_rate": 7.692381015913849e-05, + "loss": 0.6444, "step": 3695 }, { "epoch": 0.8907077515647569, - "grad_norm": 1.140625, - "learning_rate": 0.00019284277207266408, - "loss": 0.5991, + "grad_norm": 1.953125, + "learning_rate": 7.688956280992215e-05, + "loss": 0.6562, "step": 3700 }, { "epoch": 0.891911410688493, - "grad_norm": 1.0, - "learning_rate": 0.0001927567964101146, - "loss": 0.5538, + "grad_norm": 1.7265625, + "learning_rate": 7.685528290907508e-05, + "loss": 0.6131, "step": 3705 }, { "epoch": 0.8931150698122292, - "grad_norm": 1.0703125, - "learning_rate": 0.00019267073924385546, - "loss": 0.5797, + "grad_norm": 1.8671875, + "learning_rate": 7.682097051136783e-05, + "loss": 0.6328, "step": 3710 }, { "epoch": 0.8943187289359653, - "grad_norm": 1.1640625, - "learning_rate": 0.00019258460071138389, - "loss": 0.5836, + "grad_norm": 1.7265625, + "learning_rate": 7.678662567162278e-05, + "loss": 0.6386, "step": 3715 }, { "epoch": 0.8955223880597015, - "grad_norm": 0.97265625, - "learning_rate": 0.00019249838095032718, - "loss": 0.5667, + "grad_norm": 1.625, + "learning_rate": 7.675224844471417e-05, + "loss": 0.6282, "step": 3720 }, { "epoch": 0.8967260471834376, - "grad_norm": 1.0078125, - "learning_rate": 0.00019241208009844246, - "loss": 0.5829, + "grad_norm": 1.8671875, + "learning_rate": 7.671783888556803e-05, + "loss": 0.6408, "step": 3725 }, { "epoch": 0.8979297063071738, - "grad_norm": 0.9375, - "learning_rate": 0.00019232569829361632, - "loss": 0.5941, + "grad_norm": 1.671875, + "learning_rate": 7.668339704916199e-05, + "loss": 0.6514, "step": 3730 }, { "epoch": 0.8991333654309099, - "grad_norm": 1.0234375, - "learning_rate": 0.00019223923567386478, - "loss": 0.5397, + "grad_norm": 1.703125, + "learning_rate": 7.664892299052529e-05, + "loss": 0.5961, "step": 3735 }, { "epoch": 0.9003370245546461, - "grad_norm": 0.99609375, - "learning_rate": 0.0001921526923773329, - "loss": 0.57, + "grad_norm": 1.75, + "learning_rate": 7.661441676473863e-05, + "loss": 0.6297, "step": 3740 }, { "epoch": 0.9015406836783822, - "grad_norm": 1.15625, - "learning_rate": 0.00019206606854229468, - "loss": 0.5475, + "grad_norm": 1.9296875, + "learning_rate": 7.657987842693411e-05, + "loss": 0.6064, "step": 3745 }, { "epoch": 0.9027443428021185, - "grad_norm": 1.109375, - "learning_rate": 0.00019197936430715286, - "loss": 0.5723, + "grad_norm": 1.8671875, + "learning_rate": 7.654530803229514e-05, + "loss": 0.6348, "step": 3750 }, { "epoch": 0.9039480019258546, - "grad_norm": 1.0, - "learning_rate": 0.00019189257981043852, - "loss": 0.584, + "grad_norm": 1.6796875, + "learning_rate": 7.651070563605635e-05, + "loss": 0.643, "step": 3755 }, { "epoch": 0.9051516610495908, - "grad_norm": 1.109375, - "learning_rate": 0.00019180571519081108, - "loss": 0.5894, + "grad_norm": 1.9296875, + "learning_rate": 7.647607129350351e-05, + "loss": 0.6449, "step": 3760 }, { "epoch": 0.9063553201733269, - "grad_norm": 1.1484375, - "learning_rate": 0.0001917187705870579, - "loss": 0.6174, + "grad_norm": 2.015625, + "learning_rate": 7.64414050599734e-05, + "loss": 0.6765, "step": 3765 }, { "epoch": 0.9075589792970631, - "grad_norm": 1.09375, - "learning_rate": 0.00019163174613809423, - "loss": 0.57, + "grad_norm": 1.7734375, + "learning_rate": 7.640670699085381e-05, + "loss": 0.6329, "step": 3770 }, { "epoch": 0.9087626384207992, - "grad_norm": 1.125, - "learning_rate": 0.00019154464198296273, - "loss": 0.5829, + "grad_norm": 1.796875, + "learning_rate": 7.637197714158331e-05, + "loss": 0.6431, "step": 3775 }, { "epoch": 0.9099662975445354, - "grad_norm": 1.09375, - "learning_rate": 0.0001914574582608336, - "loss": 0.5865, + "grad_norm": 1.828125, + "learning_rate": 7.633721556765135e-05, + "loss": 0.6499, "step": 3780 }, { "epoch": 0.9111699566682715, - "grad_norm": 1.0625, - "learning_rate": 0.00019137019511100402, - "loss": 0.5783, + "grad_norm": 1.7890625, + "learning_rate": 7.630242232459796e-05, + "loss": 0.6441, "step": 3785 }, { "epoch": 0.9123736157920077, - "grad_norm": 1.1484375, - "learning_rate": 0.0001912828526728982, - "loss": 0.5777, + "grad_norm": 1.9296875, + "learning_rate": 7.626759746801386e-05, + "loss": 0.6292, "step": 3790 }, { "epoch": 0.9135772749157438, - "grad_norm": 1.1640625, - "learning_rate": 0.00019119543108606687, - "loss": 0.5602, + "grad_norm": 1.84375, + "learning_rate": 7.623274105354023e-05, + "loss": 0.6206, "step": 3795 }, { "epoch": 0.91478093403948, - "grad_norm": 1.0859375, - "learning_rate": 0.0001911079304901874, - "loss": 0.5957, + "grad_norm": 1.78125, + "learning_rate": 7.619785313686869e-05, + "loss": 0.6495, "step": 3800 }, { "epoch": 0.9159845931632162, - "grad_norm": 1.0546875, - "learning_rate": 0.00019102035102506326, - "loss": 0.5826, + "grad_norm": 1.7421875, + "learning_rate": 7.616293377374117e-05, + "loss": 0.639, "step": 3805 }, { "epoch": 0.9171882522869523, - "grad_norm": 1.109375, - "learning_rate": 0.00019093269283062403, - "loss": 0.6071, + "grad_norm": 1.7890625, + "learning_rate": 7.612798301994989e-05, + "loss": 0.6683, "step": 3810 }, { "epoch": 0.9183919114106885, - "grad_norm": 0.99609375, - "learning_rate": 0.000190844956046925, - "loss": 0.5595, + "grad_norm": 1.765625, + "learning_rate": 7.609300093133718e-05, + "loss": 0.6174, "step": 3815 }, { "epoch": 0.9195955705344246, - "grad_norm": 1.0859375, - "learning_rate": 0.00019075714081414705, - "loss": 0.6065, + "grad_norm": 1.796875, + "learning_rate": 7.605798756379544e-05, + "loss": 0.6562, "step": 3820 }, { "epoch": 0.9207992296581609, - "grad_norm": 1.171875, - "learning_rate": 0.00019066924727259644, - "loss": 0.5667, + "grad_norm": 1.7421875, + "learning_rate": 7.602294297326707e-05, + "loss": 0.6241, "step": 3825 }, { "epoch": 0.922002888781897, - "grad_norm": 1.1171875, - "learning_rate": 0.00019058127556270451, - "loss": 0.5874, + "grad_norm": 1.96875, + "learning_rate": 7.598786721574434e-05, + "loss": 0.6487, "step": 3830 }, { "epoch": 0.9232065479056332, - "grad_norm": 1.15625, - "learning_rate": 0.00019049322582502748, - "loss": 0.5925, + "grad_norm": 1.71875, + "learning_rate": 7.595276034726931e-05, + "loss": 0.6446, "step": 3835 }, { "epoch": 0.9244102070293693, - "grad_norm": 1.078125, - "learning_rate": 0.00019040509820024626, - "loss": 0.5357, + "grad_norm": 1.7890625, + "learning_rate": 7.591762242393376e-05, + "loss": 0.5968, "step": 3840 }, { "epoch": 0.9256138661531055, - "grad_norm": 1.125, - "learning_rate": 0.00019031689282916623, - "loss": 0.5651, + "grad_norm": 1.96875, + "learning_rate": 7.588245350187908e-05, + "loss": 0.6245, "step": 3845 }, { "epoch": 0.9268175252768416, - "grad_norm": 1.109375, - "learning_rate": 0.000190228609852717, - "loss": 0.5789, + "grad_norm": 1.7421875, + "learning_rate": 7.584725363729622e-05, + "loss": 0.6346, "step": 3850 }, { "epoch": 0.9280211844005778, - "grad_norm": 1.21875, - "learning_rate": 0.00019014024941195202, - "loss": 0.6223, + "grad_norm": 1.8359375, + "learning_rate": 7.58120228864255e-05, + "loss": 0.6781, "step": 3855 }, { "epoch": 0.9292248435243139, - "grad_norm": 1.125, - "learning_rate": 0.00019005181164804874, - "loss": 0.5748, + "grad_norm": 1.8203125, + "learning_rate": 7.577676130555667e-05, + "loss": 0.6297, "step": 3860 }, { "epoch": 0.9304285026480501, - "grad_norm": 1.1171875, - "learning_rate": 0.0001899632967023079, - "loss": 0.5933, + "grad_norm": 1.828125, + "learning_rate": 7.574146895102864e-05, + "loss": 0.6462, "step": 3865 }, { "epoch": 0.9316321617717862, - "grad_norm": 1.1953125, - "learning_rate": 0.00018987470471615382, - "loss": 0.6052, + "grad_norm": 1.9140625, + "learning_rate": 7.570614587922959e-05, + "loss": 0.6597, "step": 3870 }, { "epoch": 0.9328358208955224, - "grad_norm": 1.09375, - "learning_rate": 0.00018978603583113374, - "loss": 0.5672, + "grad_norm": 1.8203125, + "learning_rate": 7.567079214659673e-05, + "loss": 0.6347, "step": 3875 }, { "epoch": 0.9340394800192585, - "grad_norm": 1.1015625, - "learning_rate": 0.0001896972901889178, - "loss": 0.5861, + "grad_norm": 1.7890625, + "learning_rate": 7.563540780961625e-05, + "loss": 0.6446, "step": 3880 }, { "epoch": 0.9352431391429947, - "grad_norm": 1.0625, - "learning_rate": 0.00018960846793129876, - "loss": 0.54, + "grad_norm": 1.7265625, + "learning_rate": 7.559999292482325e-05, + "loss": 0.6035, "step": 3885 }, { "epoch": 0.9364467982667308, - "grad_norm": 1.15625, - "learning_rate": 0.0001895195692001919, - "loss": 0.5657, + "grad_norm": 1.8125, + "learning_rate": 7.556454754880164e-05, + "loss": 0.6296, "step": 3890 }, { "epoch": 0.937650457390467, - "grad_norm": 1.1171875, - "learning_rate": 0.00018943059413763452, - "loss": 0.5652, + "grad_norm": 1.8203125, + "learning_rate": 7.552907173818404e-05, + "loss": 0.6286, "step": 3895 }, { "epoch": 0.9388541165142031, - "grad_norm": 1.1875, - "learning_rate": 0.00018934154288578598, - "loss": 0.5915, + "grad_norm": 1.9609375, + "learning_rate": 7.54935655496517e-05, + "loss": 0.6594, "step": 3900 }, { "epoch": 0.9400577756379394, - "grad_norm": 1.0234375, - "learning_rate": 0.00018925241558692742, - "loss": 0.5638, + "grad_norm": 1.8046875, + "learning_rate": 7.545802903993443e-05, + "loss": 0.6177, "step": 3905 }, { "epoch": 0.9412614347616755, - "grad_norm": 1.09375, - "learning_rate": 0.0001891632123834613, - "loss": 0.5953, + "grad_norm": 1.8359375, + "learning_rate": 7.542246226581044e-05, + "loss": 0.6585, "step": 3910 }, { "epoch": 0.9424650938854117, - "grad_norm": 1.0390625, - "learning_rate": 0.00018907393341791154, - "loss": 0.5912, + "grad_norm": 1.8515625, + "learning_rate": 7.538686528410632e-05, + "loss": 0.6521, "step": 3915 }, { "epoch": 0.9436687530091478, - "grad_norm": 1.0546875, - "learning_rate": 0.00018898457883292306, - "loss": 0.5958, + "grad_norm": 1.7890625, + "learning_rate": 7.535123815169692e-05, + "loss": 0.65, "step": 3920 }, { "epoch": 0.944872412132884, - "grad_norm": 1.0390625, - "learning_rate": 0.00018889514877126155, - "loss": 0.5443, + "grad_norm": 1.84375, + "learning_rate": 7.531558092550528e-05, + "loss": 0.6083, "step": 3925 }, { "epoch": 0.9460760712566201, - "grad_norm": 1.1796875, - "learning_rate": 0.00018880564337581332, - "loss": 0.565, + "grad_norm": 1.8984375, + "learning_rate": 7.52798936625025e-05, + "loss": 0.6246, "step": 3930 }, { "epoch": 0.9472797303803563, - "grad_norm": 1.09375, - "learning_rate": 0.00018871606278958501, - "loss": 0.5618, + "grad_norm": 1.9375, + "learning_rate": 7.524417641970765e-05, + "loss": 0.6239, "step": 3935 }, { "epoch": 0.9484833895040924, - "grad_norm": 0.99609375, - "learning_rate": 0.0001886264071557035, - "loss": 0.548, + "grad_norm": 1.796875, + "learning_rate": 7.520842925418777e-05, + "loss": 0.613, "step": 3940 }, { "epoch": 0.9496870486278286, - "grad_norm": 1.1953125, - "learning_rate": 0.0001885366766174155, - "loss": 0.55, + "grad_norm": 1.859375, + "learning_rate": 7.517265222305764e-05, + "loss": 0.5982, "step": 3945 }, { "epoch": 0.9508907077515648, - "grad_norm": 1.109375, - "learning_rate": 0.00018844687131808741, - "loss": 0.5458, + "grad_norm": 1.8359375, + "learning_rate": 7.51368453834798e-05, + "loss": 0.6073, "step": 3950 }, { "epoch": 0.9520943668753009, - "grad_norm": 1.0546875, - "learning_rate": 0.00018835699140120504, - "loss": 0.5367, + "grad_norm": 1.75, + "learning_rate": 7.510100879266439e-05, + "loss": 0.6008, "step": 3955 }, { "epoch": 0.9532980259990371, - "grad_norm": 1.0625, - "learning_rate": 0.00018826703701037344, - "loss": 0.5375, + "grad_norm": 1.8359375, + "learning_rate": 7.506514250786909e-05, + "loss": 0.5953, "step": 3960 }, { "epoch": 0.9545016851227732, - "grad_norm": 1.046875, - "learning_rate": 0.00018817700828931675, - "loss": 0.5864, + "grad_norm": 1.8125, + "learning_rate": 7.502924658639905e-05, + "loss": 0.6404, "step": 3965 }, { "epoch": 0.9557053442465094, - "grad_norm": 1.109375, - "learning_rate": 0.0001880869053818777, - "loss": 0.561, + "grad_norm": 1.859375, + "learning_rate": 7.499332108560672e-05, + "loss": 0.6154, "step": 3970 }, { "epoch": 0.9569090033702455, - "grad_norm": 1.0703125, - "learning_rate": 0.0001879967284320177, - "loss": 0.5635, + "grad_norm": 1.8203125, + "learning_rate": 7.495736606289187e-05, + "loss": 0.6237, "step": 3975 }, { "epoch": 0.9581126624939817, - "grad_norm": 1.2421875, - "learning_rate": 0.00018790647758381638, - "loss": 0.5788, + "grad_norm": 1.8359375, + "learning_rate": 7.492138157570138e-05, + "loss": 0.6389, "step": 3980 }, { "epoch": 0.9593163216177178, - "grad_norm": 1.296875, - "learning_rate": 0.00018781615298147142, - "loss": 0.5925, + "grad_norm": 2.0625, + "learning_rate": 7.488536768152926e-05, + "loss": 0.6503, "step": 3985 }, { "epoch": 0.9605199807414541, - "grad_norm": 1.0546875, - "learning_rate": 0.00018772575476929846, - "loss": 0.5371, + "grad_norm": 1.9453125, + "learning_rate": 7.484932443791649e-05, + "loss": 0.6008, "step": 3990 }, { "epoch": 0.9617236398651902, - "grad_norm": 1.0859375, - "learning_rate": 0.0001876352830917306, - "loss": 0.5876, + "grad_norm": 1.8515625, + "learning_rate": 7.48132519024509e-05, + "loss": 0.6517, "step": 3995 }, { "epoch": 0.9629272989889264, - "grad_norm": 1.0703125, - "learning_rate": 0.00018754473809331842, - "loss": 0.548, + "grad_norm": 2.1875, + "learning_rate": 7.477715013276717e-05, + "loss": 0.6073, "step": 4000 }, { "epoch": 0.9629272989889264, - "eval_loss": 0.4761577546596527, - "eval_runtime": 2.3265, - "eval_samples_per_second": 85.965, - "eval_steps_per_second": 85.965, + "eval_loss": 0.5287883281707764, + "eval_runtime": 2.3734, + "eval_samples_per_second": 84.268, + "eval_steps_per_second": 84.268, "step": 4000 }, { "epoch": 0.9641309581126625, - "grad_norm": 1.1015625, - "learning_rate": 0.00018745411991872958, - "loss": 0.6145, + "grad_norm": 1.6796875, + "learning_rate": 7.474101918654669e-05, + "loss": 0.6722, "step": 4005 }, { "epoch": 0.9653346172363987, - "grad_norm": 1.125, - "learning_rate": 0.00018736342871274872, - "loss": 0.5699, + "grad_norm": 1.8828125, + "learning_rate": 7.470485912151741e-05, + "loss": 0.6282, "step": 4010 }, { "epoch": 0.9665382763601348, - "grad_norm": 1.1796875, - "learning_rate": 0.00018727266462027715, - "loss": 0.566, + "grad_norm": 1.8359375, + "learning_rate": 7.466866999545389e-05, + "loss": 0.6271, "step": 4015 }, { "epoch": 0.967741935483871, - "grad_norm": 1.1796875, - "learning_rate": 0.0001871818277863326, - "loss": 0.5739, + "grad_norm": 1.78125, + "learning_rate": 7.463245186617704e-05, + "loss": 0.6315, "step": 4020 }, { "epoch": 0.9689455946076071, - "grad_norm": 1.3515625, - "learning_rate": 0.0001870909183560491, - "loss": 0.5514, + "grad_norm": 2.0, + "learning_rate": 7.459620479155417e-05, + "loss": 0.6088, "step": 4025 }, { "epoch": 0.9701492537313433, - "grad_norm": 1.0859375, - "learning_rate": 0.00018699993647467656, - "loss": 0.5715, + "grad_norm": 1.9453125, + "learning_rate": 7.455992882949882e-05, + "loss": 0.6318, "step": 4030 }, { "epoch": 0.9713529128550794, - "grad_norm": 1.015625, - "learning_rate": 0.00018690888228758068, - "loss": 0.5733, + "grad_norm": 1.7734375, + "learning_rate": 7.452362403797062e-05, + "loss": 0.6308, "step": 4035 }, { "epoch": 0.9725565719788156, - "grad_norm": 1.078125, - "learning_rate": 0.00018681775594024276, - "loss": 0.5686, + "grad_norm": 1.8203125, + "learning_rate": 7.448729047497539e-05, + "loss": 0.6294, "step": 4040 }, { "epoch": 0.9737602311025517, - "grad_norm": 1.046875, - "learning_rate": 0.0001867265575782593, - "loss": 0.5603, + "grad_norm": 1.78125, + "learning_rate": 7.445092819856481e-05, + "loss": 0.6201, "step": 4045 }, { "epoch": 0.9749638902262879, - "grad_norm": 1.078125, - "learning_rate": 0.000186635287347342, - "loss": 0.5842, + "grad_norm": 1.7890625, + "learning_rate": 7.441453726683651e-05, + "loss": 0.6393, "step": 4050 }, { "epoch": 0.976167549350024, - "grad_norm": 1.203125, - "learning_rate": 0.00018654394539331719, - "loss": 0.5532, + "grad_norm": 1.984375, + "learning_rate": 7.437811773793383e-05, + "loss": 0.616, "step": 4055 }, { "epoch": 0.9773712084737602, - "grad_norm": 1.1640625, - "learning_rate": 0.00018645253186212586, - "loss": 0.5622, + "grad_norm": 1.8203125, + "learning_rate": 7.434166967004584e-05, + "loss": 0.6214, "step": 4060 }, { "epoch": 0.9785748675974963, - "grad_norm": 1.1015625, - "learning_rate": 0.00018636104689982353, - "loss": 0.5465, + "grad_norm": 1.8984375, + "learning_rate": 7.430519312140727e-05, + "loss": 0.6053, "step": 4065 }, { "epoch": 0.9797785267212326, - "grad_norm": 1.046875, - "learning_rate": 0.0001862694906525796, - "loss": 0.5558, + "grad_norm": 1.7890625, + "learning_rate": 7.426868815029821e-05, + "loss": 0.6156, "step": 4070 }, { "epoch": 0.9809821858449687, - "grad_norm": 1.1953125, - "learning_rate": 0.0001861778632666776, - "loss": 0.6136, + "grad_norm": 1.9609375, + "learning_rate": 7.42321548150443e-05, + "loss": 0.6719, "step": 4075 }, { "epoch": 0.9821858449687049, - "grad_norm": 1.015625, - "learning_rate": 0.00018608616488851454, - "loss": 0.5822, + "grad_norm": 1.8125, + "learning_rate": 7.419559317401646e-05, + "loss": 0.6398, "step": 4080 }, { "epoch": 0.983389504092441, - "grad_norm": 1.03125, - "learning_rate": 0.00018599439566460084, - "loss": 0.5606, + "grad_norm": 1.8125, + "learning_rate": 7.415900328563075e-05, + "loss": 0.6271, "step": 4085 }, { "epoch": 0.9845931632161772, - "grad_norm": 0.98046875, - "learning_rate": 0.00018590255574156032, - "loss": 0.5736, + "grad_norm": 1.6953125, + "learning_rate": 7.412238520834849e-05, + "loss": 0.6345, "step": 4090 }, { "epoch": 0.9857968223399133, - "grad_norm": 1.109375, - "learning_rate": 0.00018581064526612963, - "loss": 0.5646, + "grad_norm": 1.828125, + "learning_rate": 7.408573900067597e-05, + "loss": 0.6243, "step": 4095 }, { "epoch": 0.9870004814636495, - "grad_norm": 1.2578125, - "learning_rate": 0.00018571866438515805, - "loss": 0.579, + "grad_norm": 1.9140625, + "learning_rate": 7.404906472116442e-05, + "loss": 0.6419, "step": 4100 }, { "epoch": 0.9882041405873857, - "grad_norm": 1.0859375, - "learning_rate": 0.00018562661324560752, - "loss": 0.5724, + "grad_norm": 1.8515625, + "learning_rate": 7.401236242840994e-05, + "loss": 0.6275, "step": 4105 }, { "epoch": 0.9894077997111218, - "grad_norm": 1.15625, - "learning_rate": 0.00018553449199455214, - "loss": 0.5655, + "grad_norm": 1.96875, + "learning_rate": 7.39756321810534e-05, + "loss": 0.6225, "step": 4110 }, { "epoch": 0.990611458834858, - "grad_norm": 1.015625, - "learning_rate": 0.0001854423007791781, - "loss": 0.5577, + "grad_norm": 1.734375, + "learning_rate": 7.393887403778031e-05, + "loss": 0.6182, "step": 4115 }, { "epoch": 0.9918151179585941, - "grad_norm": 1.03125, - "learning_rate": 0.00018535003974678324, - "loss": 0.5735, + "grad_norm": 1.734375, + "learning_rate": 7.390208805732075e-05, + "loss": 0.6354, "step": 4120 }, { "epoch": 0.9930187770823303, - "grad_norm": 1.046875, - "learning_rate": 0.0001852577090447771, - "loss": 0.5591, + "grad_norm": 1.8203125, + "learning_rate": 7.386527429844931e-05, + "loss": 0.6233, "step": 4125 }, { "epoch": 0.9942224362060664, - "grad_norm": 1.046875, - "learning_rate": 0.00018516530882068053, - "loss": 0.5517, + "grad_norm": 1.859375, + "learning_rate": 7.382843281998493e-05, + "loss": 0.6186, "step": 4130 }, { "epoch": 0.9954260953298026, - "grad_norm": 0.98046875, - "learning_rate": 0.00018507283922212524, - "loss": 0.5602, + "grad_norm": 1.6640625, + "learning_rate": 7.379156368079083e-05, + "loss": 0.6241, "step": 4135 }, { "epoch": 0.9966297544535387, - "grad_norm": 1.1796875, - "learning_rate": 0.0001849803003968541, - "loss": 0.5502, + "grad_norm": 1.8046875, + "learning_rate": 7.375466693977446e-05, + "loss": 0.6113, "step": 4140 }, { "epoch": 0.997833413577275, - "grad_norm": 1.1640625, - "learning_rate": 0.00018488769249272035, - "loss": 0.5798, + "grad_norm": 1.953125, + "learning_rate": 7.371774265588736e-05, + "loss": 0.6421, "step": 4145 }, { "epoch": 0.999037072701011, - "grad_norm": 1.2109375, - "learning_rate": 0.00018479501565768768, - "loss": 0.5664, + "grad_norm": 1.859375, + "learning_rate": 7.368079088812505e-05, + "loss": 0.6272, "step": 4150 }, { "epoch": 0.9997592681752527, - "eval_loss": 0.47299399971961975, - "eval_runtime": 2.3193, - "eval_samples_per_second": 86.234, - "eval_steps_per_second": 86.234, + "eval_loss": 0.5253685116767883, + "eval_runtime": 2.3588, + "eval_samples_per_second": 84.79, + "eval_steps_per_second": 84.79, "step": 4153 }, { "epoch": 1.0002407318247473, - "grad_norm": 1.046875, - "learning_rate": 0.00018470227003982996, - "loss": 0.5757, + "grad_norm": 1.78125, + "learning_rate": 7.3643811695527e-05, + "loss": 0.6347, "step": 4155 }, { "epoch": 1.0014443909484834, - "grad_norm": 1.0625, - "learning_rate": 0.00018460945578733083, - "loss": 0.5188, + "grad_norm": 1.875, + "learning_rate": 7.360680513717645e-05, + "loss": 0.593, "step": 4160 }, { "epoch": 1.0026480500722195, - "grad_norm": 1.03125, - "learning_rate": 0.00018451657304848377, - "loss": 0.5036, + "grad_norm": 1.875, + "learning_rate": 7.356977127220044e-05, + "loss": 0.5812, "step": 4165 }, { "epoch": 1.0038517091959558, - "grad_norm": 0.98046875, - "learning_rate": 0.00018442362197169154, - "loss": 0.5098, + "grad_norm": 1.671875, + "learning_rate": 7.353271015976957e-05, + "loss": 0.5873, "step": 4170 }, { "epoch": 1.0050553683196919, - "grad_norm": 0.9765625, - "learning_rate": 0.00018433060270546612, - "loss": 0.5011, + "grad_norm": 1.671875, + "learning_rate": 7.349562185909799e-05, + "loss": 0.5787, "step": 4175 }, { "epoch": 1.006259027443428, - "grad_norm": 1.03125, - "learning_rate": 0.00018423751539842846, - "loss": 0.5173, + "grad_norm": 1.7734375, + "learning_rate": 7.34585064294433e-05, + "loss": 0.5924, "step": 4180 }, { "epoch": 1.007462686567164, - "grad_norm": 1.0703125, - "learning_rate": 0.00018414436019930825, - "loss": 0.5428, + "grad_norm": 1.828125, + "learning_rate": 7.342136393010646e-05, + "loss": 0.6223, "step": 4185 }, { "epoch": 1.0086663456909004, - "grad_norm": 1.078125, - "learning_rate": 0.00018405113725694357, - "loss": 0.5075, + "grad_norm": 1.90625, + "learning_rate": 7.338419442043163e-05, + "loss": 0.5894, "step": 4190 }, { "epoch": 1.0098700048146365, - "grad_norm": 0.984375, - "learning_rate": 0.00018395784672028083, - "loss": 0.5271, + "grad_norm": 1.8125, + "learning_rate": 7.33469979598062e-05, + "loss": 0.6056, "step": 4195 }, { "epoch": 1.0110736639383726, - "grad_norm": 1.0859375, - "learning_rate": 0.00018386448873837434, - "loss": 0.5341, + "grad_norm": 1.9453125, + "learning_rate": 7.330977460766052e-05, + "loss": 0.6117, "step": 4200 }, { "epoch": 1.0122773230621087, - "grad_norm": 1.1171875, - "learning_rate": 0.00018377106346038625, - "loss": 0.5488, + "grad_norm": 1.921875, + "learning_rate": 7.327252442346803e-05, + "loss": 0.6322, "step": 4205 }, { "epoch": 1.013480982185845, - "grad_norm": 0.96875, - "learning_rate": 0.00018367757103558618, - "loss": 0.5355, + "grad_norm": 1.7578125, + "learning_rate": 7.323524746674492e-05, + "loss": 0.6172, "step": 4210 }, { "epoch": 1.0146846413095811, - "grad_norm": 1.015625, - "learning_rate": 0.0001835840116133511, - "loss": 0.4969, + "grad_norm": 1.765625, + "learning_rate": 7.319794379705025e-05, + "loss": 0.5709, "step": 4215 }, { "epoch": 1.0158883004333172, - "grad_norm": 1.109375, - "learning_rate": 0.00018349038534316495, - "loss": 0.5187, + "grad_norm": 1.796875, + "learning_rate": 7.316061347398569e-05, + "loss": 0.596, "step": 4220 }, { "epoch": 1.0170919595570533, - "grad_norm": 1.203125, - "learning_rate": 0.00018339669237461853, - "loss": 0.5127, + "grad_norm": 2.03125, + "learning_rate": 7.312325655719554e-05, + "loss": 0.5906, "step": 4225 }, { "epoch": 1.0182956186807897, - "grad_norm": 1.1015625, - "learning_rate": 0.00018330293285740915, - "loss": 0.5283, + "grad_norm": 1.9609375, + "learning_rate": 7.308587310636659e-05, + "loss": 0.6094, "step": 4230 }, { "epoch": 1.0194992778045258, - "grad_norm": 1.109375, - "learning_rate": 0.00018320910694134054, - "loss": 0.5152, + "grad_norm": 1.90625, + "learning_rate": 7.3048463181228e-05, + "loss": 0.5855, "step": 4235 }, { "epoch": 1.0207029369282619, - "grad_norm": 1.1484375, - "learning_rate": 0.0001831152147763224, - "loss": 0.5188, + "grad_norm": 1.9296875, + "learning_rate": 7.301102684155125e-05, + "loss": 0.5976, "step": 4240 }, { "epoch": 1.021906596051998, - "grad_norm": 0.98828125, - "learning_rate": 0.0001830212565123704, - "loss": 0.5386, + "grad_norm": 1.7265625, + "learning_rate": 7.297356414714998e-05, + "loss": 0.6164, "step": 4245 }, { "epoch": 1.0231102551757343, - "grad_norm": 1.0390625, - "learning_rate": 0.0001829272322996057, - "loss": 0.5123, + "grad_norm": 1.8125, + "learning_rate": 7.293607515788001e-05, + "loss": 0.5867, "step": 4250 }, { "epoch": 1.0243139142994704, - "grad_norm": 1.0234375, - "learning_rate": 0.0001828331422882549, - "loss": 0.5383, + "grad_norm": 1.71875, + "learning_rate": 7.28985599336391e-05, + "loss": 0.6059, "step": 4255 }, { "epoch": 1.0255175734232065, - "grad_norm": 1.015625, - "learning_rate": 0.00018273898662864982, - "loss": 0.5371, + "grad_norm": 1.921875, + "learning_rate": 7.286101853436703e-05, + "loss": 0.6158, "step": 4260 }, { "epoch": 1.0267212325469428, - "grad_norm": 1.09375, - "learning_rate": 0.0001826447654712269, - "loss": 0.5366, + "grad_norm": 1.875, + "learning_rate": 7.282345102004525e-05, + "loss": 0.6171, "step": 4265 }, { "epoch": 1.027924891670679, - "grad_norm": 1.0546875, - "learning_rate": 0.00018255047896652754, - "loss": 0.5156, + "grad_norm": 1.6484375, + "learning_rate": 7.278585745069705e-05, + "loss": 0.5931, "step": 4270 }, { "epoch": 1.029128550794415, - "grad_norm": 1.1484375, - "learning_rate": 0.00018245612726519733, - "loss": 0.5332, + "grad_norm": 1.7734375, + "learning_rate": 7.274823788638734e-05, + "loss": 0.6167, "step": 4275 }, { "epoch": 1.030332209918151, - "grad_norm": 0.9765625, - "learning_rate": 0.00018236171051798608, - "loss": 0.5126, + "grad_norm": 1.734375, + "learning_rate": 7.271059238722249e-05, + "loss": 0.59, "step": 4280 }, { "epoch": 1.0315358690418874, - "grad_norm": 1.078125, - "learning_rate": 0.00018226722887574757, - "loss": 0.5155, + "grad_norm": 1.8828125, + "learning_rate": 7.267292101335038e-05, + "loss": 0.5904, "step": 4285 }, { "epoch": 1.0327395281656235, - "grad_norm": 1.109375, - "learning_rate": 0.00018217268248943932, - "loss": 0.4939, + "grad_norm": 1.8984375, + "learning_rate": 7.263522382496025e-05, + "loss": 0.5618, "step": 4290 }, { "epoch": 1.0339431872893596, - "grad_norm": 1.0546875, - "learning_rate": 0.0001820780715101221, - "loss": 0.4743, + "grad_norm": 1.7734375, + "learning_rate": 7.259750088228248e-05, + "loss": 0.5486, "step": 4295 }, { "epoch": 1.0351468464130957, - "grad_norm": 0.97265625, - "learning_rate": 0.0001819833960889601, - "loss": 0.4784, + "grad_norm": 1.6796875, + "learning_rate": 7.255975224558869e-05, + "loss": 0.5483, "step": 4300 }, { "epoch": 1.036350505536832, - "grad_norm": 1.1328125, - "learning_rate": 0.00018188865637722027, - "loss": 0.525, + "grad_norm": 2.03125, + "learning_rate": 7.252197797519152e-05, + "loss": 0.6077, "step": 4305 }, { "epoch": 1.0375541646605682, - "grad_norm": 1.15625, - "learning_rate": 0.00018179385252627245, - "loss": 0.5214, + "grad_norm": 1.890625, + "learning_rate": 7.248417813144458e-05, + "loss": 0.5974, "step": 4310 }, { "epoch": 1.0387578237843043, - "grad_norm": 1.140625, - "learning_rate": 0.00018169898468758892, - "loss": 0.5136, + "grad_norm": 2.1875, + "learning_rate": 7.244635277474234e-05, + "loss": 0.5916, "step": 4315 }, { "epoch": 1.0399614829080404, - "grad_norm": 1.0234375, - "learning_rate": 0.00018160405301274413, - "loss": 0.5118, + "grad_norm": 2.078125, + "learning_rate": 7.240850196552e-05, + "loss": 0.5828, "step": 4320 }, { "epoch": 1.0411651420317767, - "grad_norm": 1.03125, - "learning_rate": 0.00018150905765341454, - "loss": 0.5377, + "grad_norm": 1.765625, + "learning_rate": 7.237062576425343e-05, + "loss": 0.6159, "step": 4325 }, { "epoch": 1.0423688011555128, - "grad_norm": 1.125, - "learning_rate": 0.0001814139987613784, - "loss": 0.5084, + "grad_norm": 1.8515625, + "learning_rate": 7.233272423145914e-05, + "loss": 0.5824, "step": 4330 }, { "epoch": 1.0435724602792489, - "grad_norm": 1.125, - "learning_rate": 0.0001813188764885154, - "loss": 0.4996, + "grad_norm": 1.96875, + "learning_rate": 7.229479742769402e-05, + "loss": 0.5696, "step": 4335 }, { "epoch": 1.044776119402985, - "grad_norm": 1.015625, - "learning_rate": 0.00018122369098680667, - "loss": 0.4996, + "grad_norm": 1.90625, + "learning_rate": 7.225684541355544e-05, + "loss": 0.5774, "step": 4340 }, { "epoch": 1.0459797785267213, - "grad_norm": 1.0625, - "learning_rate": 0.00018112844240833413, - "loss": 0.4993, + "grad_norm": 1.921875, + "learning_rate": 7.221886824968091e-05, + "loss": 0.5745, "step": 4345 }, { "epoch": 1.0471834376504574, - "grad_norm": 1.0390625, - "learning_rate": 0.00018103313090528066, - "loss": 0.5305, + "grad_norm": 1.875, + "learning_rate": 7.218086599674827e-05, + "loss": 0.598, "step": 4350 }, { "epoch": 1.0483870967741935, - "grad_norm": 1.21875, - "learning_rate": 0.00018093775662992957, - "loss": 0.4925, + "grad_norm": 2.03125, + "learning_rate": 7.214283871547534e-05, + "loss": 0.5683, "step": 4355 }, { "epoch": 1.0495907558979296, - "grad_norm": 1.09375, - "learning_rate": 0.00018084231973466449, - "loss": 0.4962, + "grad_norm": 2.0625, + "learning_rate": 7.210478646661996e-05, + "loss": 0.5702, "step": 4360 }, { "epoch": 1.050794415021666, - "grad_norm": 1.1171875, - "learning_rate": 0.00018074682037196914, - "loss": 0.5043, + "grad_norm": 2.03125, + "learning_rate": 7.20667093109799e-05, + "loss": 0.5808, "step": 4365 }, { "epoch": 1.051998074145402, - "grad_norm": 0.98828125, - "learning_rate": 0.00018065125869442703, - "loss": 0.4967, + "grad_norm": 1.7421875, + "learning_rate": 7.20286073093927e-05, + "loss": 0.5783, "step": 4370 }, { "epoch": 1.0532017332691381, - "grad_norm": 1.0, - "learning_rate": 0.00018055563485472122, - "loss": 0.5168, + "grad_norm": 1.75, + "learning_rate": 7.199048052273555e-05, + "loss": 0.5939, "step": 4375 }, { "epoch": 1.0544053923928742, - "grad_norm": 1.1015625, - "learning_rate": 0.0001804599490056341, - "loss": 0.4848, + "grad_norm": 1.90625, + "learning_rate": 7.195232901192531e-05, + "loss": 0.5577, "step": 4380 }, { "epoch": 1.0556090515166106, - "grad_norm": 1.109375, - "learning_rate": 0.00018036420130004702, - "loss": 0.528, + "grad_norm": 1.9140625, + "learning_rate": 7.19141528379183e-05, + "loss": 0.6019, "step": 4385 }, { "epoch": 1.0568127106403467, - "grad_norm": 1.125, - "learning_rate": 0.0001802683918909404, - "loss": 0.5396, + "grad_norm": 1.8828125, + "learning_rate": 7.187595206171027e-05, + "loss": 0.613, "step": 4390 }, { "epoch": 1.0580163697640828, - "grad_norm": 1.09375, - "learning_rate": 0.00018017252093139308, - "loss": 0.5284, + "grad_norm": 1.796875, + "learning_rate": 7.183772674433626e-05, + "loss": 0.6052, "step": 4395 }, { "epoch": 1.059220028887819, - "grad_norm": 1.0, - "learning_rate": 0.0001800765885745823, - "loss": 0.5153, + "grad_norm": 1.71875, + "learning_rate": 7.179947694687053e-05, + "loss": 0.5839, "step": 4400 }, { "epoch": 1.0604236880115552, - "grad_norm": 1.046875, - "learning_rate": 0.0001799805949737833, - "loss": 0.5189, + "grad_norm": 1.953125, + "learning_rate": 7.176120273042646e-05, + "loss": 0.589, "step": 4405 }, { "epoch": 1.0616273471352913, - "grad_norm": 1.03125, - "learning_rate": 0.00017988454028236927, - "loss": 0.5141, + "grad_norm": 1.8203125, + "learning_rate": 7.172290415615641e-05, + "loss": 0.5846, "step": 4410 }, { "epoch": 1.0628310062590274, - "grad_norm": 1.0625, - "learning_rate": 0.000179788424653811, - "loss": 0.5143, + "grad_norm": 1.859375, + "learning_rate": 7.16845812852517e-05, + "loss": 0.5936, "step": 4415 }, { "epoch": 1.0640346653827637, - "grad_norm": 1.109375, - "learning_rate": 0.00017969224824167666, - "loss": 0.5211, + "grad_norm": 1.9140625, + "learning_rate": 7.164623417894248e-05, + "loss": 0.5959, "step": 4420 }, { "epoch": 1.0652383245064998, - "grad_norm": 1.078125, - "learning_rate": 0.0001795960111996315, - "loss": 0.5341, + "grad_norm": 1.953125, + "learning_rate": 7.160786289849756e-05, + "loss": 0.6061, "step": 4425 }, { "epoch": 1.066441983630236, - "grad_norm": 1.0234375, - "learning_rate": 0.00017949971368143755, - "loss": 0.516, + "grad_norm": 1.9375, + "learning_rate": 7.156946750522443e-05, + "loss": 0.5924, "step": 4430 }, { "epoch": 1.067645642753972, - "grad_norm": 1.0390625, - "learning_rate": 0.00017940335584095362, - "loss": 0.5551, + "grad_norm": 1.734375, + "learning_rate": 7.153104806046906e-05, + "loss": 0.6298, "step": 4435 }, { "epoch": 1.0688493018777083, - "grad_norm": 1.09375, - "learning_rate": 0.00017930693783213485, - "loss": 0.5108, + "grad_norm": 2.078125, + "learning_rate": 7.14926046256159e-05, + "loss": 0.5842, "step": 4440 }, { "epoch": 1.0700529610014444, - "grad_norm": 1.03125, - "learning_rate": 0.00017921045980903244, - "loss": 0.5098, + "grad_norm": 1.734375, + "learning_rate": 7.145413726208768e-05, + "loss": 0.5907, "step": 4445 }, { "epoch": 1.0712566201251805, - "grad_norm": 0.99609375, - "learning_rate": 0.00017911392192579369, - "loss": 0.508, + "grad_norm": 1.7734375, + "learning_rate": 7.141564603134542e-05, + "loss": 0.5904, "step": 4450 }, { "epoch": 1.0724602792489166, - "grad_norm": 1.0, - "learning_rate": 0.00017901732433666122, - "loss": 0.516, + "grad_norm": 1.7265625, + "learning_rate": 7.137713099488821e-05, + "loss": 0.5957, "step": 4455 }, { "epoch": 1.073663938372653, - "grad_norm": 0.97265625, - "learning_rate": 0.00017892066719597325, - "loss": 0.4903, + "grad_norm": 1.7109375, + "learning_rate": 7.133859221425321e-05, + "loss": 0.5596, "step": 4460 }, { "epoch": 1.074867597496389, - "grad_norm": 1.015625, - "learning_rate": 0.00017882395065816316, - "loss": 0.5097, + "grad_norm": 1.7734375, + "learning_rate": 7.130002975101552e-05, + "loss": 0.5793, "step": 4465 }, { "epoch": 1.0760712566201251, - "grad_norm": 0.9296875, - "learning_rate": 0.00017872717487775916, - "loss": 0.5157, + "grad_norm": 1.7421875, + "learning_rate": 7.126144366678808e-05, + "loss": 0.5941, "step": 4470 }, { "epoch": 1.0772749157438612, - "grad_norm": 1.0234375, - "learning_rate": 0.00017863034000938416, - "loss": 0.5023, + "grad_norm": 1.8515625, + "learning_rate": 7.122283402322153e-05, + "loss": 0.5812, "step": 4475 }, { "epoch": 1.0784785748675976, - "grad_norm": 1.0859375, - "learning_rate": 0.00017853344620775546, - "loss": 0.4771, + "grad_norm": 1.8828125, + "learning_rate": 7.11842008820042e-05, + "loss": 0.55, "step": 4480 }, { "epoch": 1.0796822339913337, - "grad_norm": 1.078125, - "learning_rate": 0.00017843649362768446, - "loss": 0.5046, + "grad_norm": 1.71875, + "learning_rate": 7.114554430486195e-05, + "loss": 0.5836, "step": 4485 }, { "epoch": 1.0808858931150698, - "grad_norm": 1.015625, - "learning_rate": 0.00017833948242407656, - "loss": 0.5215, + "grad_norm": 1.765625, + "learning_rate": 7.110686435355807e-05, + "loss": 0.5931, "step": 4490 }, { "epoch": 1.0820895522388059, - "grad_norm": 0.98828125, - "learning_rate": 0.00017824241275193084, - "loss": 0.4988, + "grad_norm": 1.78125, + "learning_rate": 7.106816108989323e-05, + "loss": 0.5786, "step": 4495 }, { "epoch": 1.0832932113625422, - "grad_norm": 0.91015625, - "learning_rate": 0.00017814528476633966, - "loss": 0.5191, + "grad_norm": 1.6171875, + "learning_rate": 7.102943457570528e-05, + "loss": 0.5964, "step": 4500 }, { "epoch": 1.0832932113625422, - "eval_loss": 0.4610966742038727, - "eval_runtime": 2.3462, - "eval_samples_per_second": 85.245, - "eval_steps_per_second": 85.245, + "eval_loss": 0.5141281485557556, + "eval_runtime": 2.3886, + "eval_samples_per_second": 83.731, + "eval_steps_per_second": 83.731, "step": 4500 }, { "epoch": 1.0844968704862783, - "grad_norm": 1.0546875, - "learning_rate": 0.00017804809862248877, - "loss": 0.5037, + "grad_norm": 1.6875, + "learning_rate": 7.099068487286934e-05, + "loss": 0.5783, "step": 4505 }, { "epoch": 1.0857005296100144, - "grad_norm": 1.0234375, - "learning_rate": 0.0001779508544756566, - "loss": 0.5468, + "grad_norm": 1.7421875, + "learning_rate": 7.09519120432974e-05, + "loss": 0.6244, "step": 4510 }, { "epoch": 1.0869041887337505, - "grad_norm": 1.0390625, - "learning_rate": 0.00017785355248121438, - "loss": 0.5076, + "grad_norm": 1.8203125, + "learning_rate": 7.091311614893857e-05, + "loss": 0.58, "step": 4515 }, { "epoch": 1.0881078478574868, - "grad_norm": 1.0078125, - "learning_rate": 0.00017775619279462575, - "loss": 0.5718, + "grad_norm": 1.75, + "learning_rate": 7.087429725177873e-05, + "loss": 0.6542, "step": 4520 }, { "epoch": 1.089311506981223, - "grad_norm": 1.125, - "learning_rate": 0.0001776587755714466, - "loss": 0.5055, + "grad_norm": 1.8671875, + "learning_rate": 7.083545541384052e-05, + "loss": 0.5792, "step": 4525 }, { "epoch": 1.090515166104959, - "grad_norm": 1.0546875, - "learning_rate": 0.00017756130096732465, - "loss": 0.5195, + "grad_norm": 1.9921875, + "learning_rate": 7.079659069718324e-05, + "loss": 0.5915, "step": 4530 }, { "epoch": 1.0917188252286953, - "grad_norm": 1.109375, - "learning_rate": 0.0001774637691379993, - "loss": 0.5541, + "grad_norm": 2.03125, + "learning_rate": 7.075770316390276e-05, + "loss": 0.6299, "step": 4535 }, { "epoch": 1.0929224843524314, - "grad_norm": 1.125, - "learning_rate": 0.0001773661802393014, - "loss": 0.497, + "grad_norm": 2.046875, + "learning_rate": 7.071879287613135e-05, + "loss": 0.5702, "step": 4540 }, { "epoch": 1.0941261434761675, - "grad_norm": 1.015625, - "learning_rate": 0.00017726853442715307, - "loss": 0.5273, + "grad_norm": 1.859375, + "learning_rate": 7.067985989603772e-05, + "loss": 0.6024, "step": 4545 }, { "epoch": 1.0953298025999036, - "grad_norm": 1.15625, - "learning_rate": 0.00017717083185756724, - "loss": 0.4967, + "grad_norm": 1.8515625, + "learning_rate": 7.064090428582678e-05, + "loss": 0.5642, "step": 4550 }, { "epoch": 1.09653346172364, - "grad_norm": 0.99609375, - "learning_rate": 0.00017707307268664753, - "loss": 0.5305, + "grad_norm": 1.8984375, + "learning_rate": 7.060192610773958e-05, + "loss": 0.6057, "step": 4555 }, { "epoch": 1.097737120847376, - "grad_norm": 1.234375, - "learning_rate": 0.00017697525707058813, - "loss": 0.54, + "grad_norm": 2.078125, + "learning_rate": 7.05629254240533e-05, + "loss": 0.617, "step": 4560 }, { "epoch": 1.0989407799711122, - "grad_norm": 1.0546875, - "learning_rate": 0.00017687738516567323, - "loss": 0.5148, + "grad_norm": 1.8125, + "learning_rate": 7.052390229708101e-05, + "loss": 0.5896, "step": 4565 }, { "epoch": 1.1001444390948483, - "grad_norm": 1.0, - "learning_rate": 0.00017677945712827705, - "loss": 0.5216, + "grad_norm": 1.765625, + "learning_rate": 7.048485678917165e-05, + "loss": 0.5928, "step": 4570 }, { "epoch": 1.1013480982185846, - "grad_norm": 1.0625, - "learning_rate": 0.00017668147311486354, - "loss": 0.4958, + "grad_norm": 1.7578125, + "learning_rate": 7.044578896270995e-05, + "loss": 0.5681, "step": 4575 }, { "epoch": 1.1025517573423207, - "grad_norm": 1.09375, - "learning_rate": 0.000176583433281986, - "loss": 0.4883, + "grad_norm": 2.046875, + "learning_rate": 7.040669888011628e-05, + "loss": 0.5644, "step": 4580 }, { "epoch": 1.1037554164660568, - "grad_norm": 1.0859375, - "learning_rate": 0.00017648533778628697, - "loss": 0.5328, + "grad_norm": 1.96875, + "learning_rate": 7.036758660384658e-05, + "loss": 0.6015, "step": 4585 }, { "epoch": 1.104959075589793, - "grad_norm": 1.171875, - "learning_rate": 0.0001763871867844979, - "loss": 0.5174, + "grad_norm": 1.90625, + "learning_rate": 7.032845219639224e-05, + "loss": 0.5951, "step": 4590 }, { "epoch": 1.1061627347135292, - "grad_norm": 1.0546875, - "learning_rate": 0.0001762889804334389, - "loss": 0.484, + "grad_norm": 1.8515625, + "learning_rate": 7.028929572027999e-05, + "loss": 0.5609, "step": 4595 }, { "epoch": 1.1073663938372653, - "grad_norm": 1.1328125, - "learning_rate": 0.0001761907188900186, - "loss": 0.5134, + "grad_norm": 1.796875, + "learning_rate": 7.025011723807186e-05, + "loss": 0.5884, "step": 4600 }, { "epoch": 1.1085700529610014, - "grad_norm": 1.0078125, - "learning_rate": 0.00017609240231123368, - "loss": 0.4753, + "grad_norm": 1.859375, + "learning_rate": 7.021091681236499e-05, + "loss": 0.5529, "step": 4605 }, { "epoch": 1.1097737120847375, - "grad_norm": 1.015625, - "learning_rate": 0.0001759940308541689, - "loss": 0.5269, + "grad_norm": 1.84375, + "learning_rate": 7.017169450579165e-05, + "loss": 0.6031, "step": 4610 }, { "epoch": 1.1109773712084738, - "grad_norm": 1.078125, - "learning_rate": 0.00017589560467599663, - "loss": 0.5214, + "grad_norm": 1.8125, + "learning_rate": 7.013245038101904e-05, + "loss": 0.5963, "step": 4615 }, { "epoch": 1.11218103033221, - "grad_norm": 1.1015625, - "learning_rate": 0.0001757971239339766, - "loss": 0.4905, + "grad_norm": 1.9296875, + "learning_rate": 7.009318450074916e-05, + "loss": 0.5657, "step": 4620 }, { "epoch": 1.113384689455946, - "grad_norm": 1.0625, - "learning_rate": 0.0001756985887854559, - "loss": 0.5319, + "grad_norm": 1.8359375, + "learning_rate": 7.005389692771887e-05, + "loss": 0.6062, "step": 4625 }, { "epoch": 1.1145883485796821, - "grad_norm": 1.03125, - "learning_rate": 0.0001755999993878683, - "loss": 0.544, + "grad_norm": 1.890625, + "learning_rate": 7.001458772469961e-05, + "loss": 0.6236, "step": 4630 }, { "epoch": 1.1157920077034185, - "grad_norm": 1.171875, - "learning_rate": 0.0001755013558987345, - "loss": 0.5251, + "grad_norm": 2.015625, + "learning_rate": 6.997525695449741e-05, + "loss": 0.6032, "step": 4635 }, { "epoch": 1.1169956668271546, - "grad_norm": 1.0703125, - "learning_rate": 0.00017540265847566146, - "loss": 0.5092, + "grad_norm": 1.8515625, + "learning_rate": 6.993590467995278e-05, + "loss": 0.5849, "step": 4640 }, { "epoch": 1.1181993259508907, - "grad_norm": 1.0546875, - "learning_rate": 0.00017530390727634238, - "loss": 0.5232, + "grad_norm": 1.8671875, + "learning_rate": 6.989653096394055e-05, + "loss": 0.6049, "step": 4645 }, { "epoch": 1.1194029850746268, - "grad_norm": 1.0390625, - "learning_rate": 0.00017520510245855632, - "loss": 0.4937, + "grad_norm": 1.7734375, + "learning_rate": 6.985713586936982e-05, + "loss": 0.5667, "step": 4650 }, { "epoch": 1.120606644198363, - "grad_norm": 1.0546875, - "learning_rate": 0.0001751062441801681, - "loss": 0.4988, + "grad_norm": 1.8828125, + "learning_rate": 6.981771945918386e-05, + "loss": 0.5837, "step": 4655 }, { "epoch": 1.1218103033220992, - "grad_norm": 1.015625, - "learning_rate": 0.00017500733259912787, - "loss": 0.5025, + "grad_norm": 1.7734375, + "learning_rate": 6.977828179636e-05, + "loss": 0.5758, "step": 4660 }, { "epoch": 1.1230139624458353, - "grad_norm": 1.046875, - "learning_rate": 0.00017490836787347104, - "loss": 0.5357, + "grad_norm": 1.796875, + "learning_rate": 6.973882294390953e-05, + "loss": 0.6107, "step": 4665 }, { "epoch": 1.1242176215695716, - "grad_norm": 1.0390625, - "learning_rate": 0.00017480935016131777, - "loss": 0.4758, + "grad_norm": 1.828125, + "learning_rate": 6.969934296487751e-05, + "loss": 0.5513, "step": 4670 }, { "epoch": 1.1254212806933077, - "grad_norm": 0.92578125, - "learning_rate": 0.00017471027962087302, - "loss": 0.533, + "grad_norm": 1.6953125, + "learning_rate": 6.965984192234288e-05, + "loss": 0.609, "step": 4675 }, { "epoch": 1.1266249398170438, - "grad_norm": 1.03125, - "learning_rate": 0.0001746111564104262, - "loss": 0.5033, + "grad_norm": 1.71875, + "learning_rate": 6.962031987941819e-05, + "loss": 0.5756, "step": 4680 }, { "epoch": 1.12782859894078, - "grad_norm": 1.0078125, - "learning_rate": 0.00017451198068835067, - "loss": 0.5, + "grad_norm": 1.8671875, + "learning_rate": 6.958077689924949e-05, + "loss": 0.5786, "step": 4685 }, { "epoch": 1.129032258064516, - "grad_norm": 1.046875, - "learning_rate": 0.0001744127526131039, - "loss": 0.5192, + "grad_norm": 1.78125, + "learning_rate": 6.954121304501637e-05, + "loss": 0.5892, "step": 4690 }, { "epoch": 1.1302359171882523, - "grad_norm": 0.97265625, - "learning_rate": 0.0001743134723432269, - "loss": 0.5026, + "grad_norm": 1.765625, + "learning_rate": 6.95016283799317e-05, + "loss": 0.5793, "step": 4695 }, { "epoch": 1.1314395763119884, - "grad_norm": 0.94921875, - "learning_rate": 0.000174214140037344, - "loss": 0.5086, + "grad_norm": 1.65625, + "learning_rate": 6.946202296724163e-05, + "loss": 0.5834, "step": 4700 }, { "epoch": 1.1326432354357245, - "grad_norm": 1.1171875, - "learning_rate": 0.0001741147558541629, - "loss": 0.488, + "grad_norm": 1.96875, + "learning_rate": 6.942239687022546e-05, + "loss": 0.5662, "step": 4705 }, { "epoch": 1.1338468945594609, - "grad_norm": 1.0625, - "learning_rate": 0.00017401531995247393, - "loss": 0.5062, + "grad_norm": 1.90625, + "learning_rate": 6.938275015219554e-05, + "loss": 0.5866, "step": 4710 }, { "epoch": 1.135050553683197, - "grad_norm": 1.140625, - "learning_rate": 0.00017391583249115025, - "loss": 0.5186, + "grad_norm": 2.015625, + "learning_rate": 6.934308287649715e-05, + "loss": 0.5925, "step": 4715 }, { "epoch": 1.136254212806933, - "grad_norm": 1.0234375, - "learning_rate": 0.0001738162936291473, - "loss": 0.5381, + "grad_norm": 1.796875, + "learning_rate": 6.930339510650845e-05, + "loss": 0.6116, "step": 4720 }, { "epoch": 1.1374578719306692, - "grad_norm": 1.09375, - "learning_rate": 0.0001737167035255026, - "loss": 0.5178, + "grad_norm": 1.7890625, + "learning_rate": 6.926368690564028e-05, + "loss": 0.5894, "step": 4725 }, { "epoch": 1.1386615310544055, - "grad_norm": 1.21875, - "learning_rate": 0.0001736170623393357, - "loss": 0.5159, + "grad_norm": 1.953125, + "learning_rate": 6.92239583373362e-05, + "loss": 0.5818, "step": 4730 }, { "epoch": 1.1398651901781416, - "grad_norm": 0.92578125, - "learning_rate": 0.0001735173702298476, - "loss": 0.531, + "grad_norm": 1.7421875, + "learning_rate": 6.918420946507227e-05, + "loss": 0.6084, "step": 4735 }, { "epoch": 1.1410688493018777, - "grad_norm": 0.97265625, - "learning_rate": 0.0001734176273563208, - "loss": 0.4937, + "grad_norm": 1.71875, + "learning_rate": 6.914444035235701e-05, + "loss": 0.5666, "step": 4740 }, { "epoch": 1.1422725084256138, - "grad_norm": 0.9921875, - "learning_rate": 0.0001733178338781188, - "loss": 0.5248, + "grad_norm": 1.7265625, + "learning_rate": 6.910465106273126e-05, + "loss": 0.6012, "step": 4745 }, { "epoch": 1.14347616754935, - "grad_norm": 1.03125, - "learning_rate": 0.000173217989954686, - "loss": 0.5205, + "grad_norm": 1.734375, + "learning_rate": 6.906484165976809e-05, + "loss": 0.5949, "step": 4750 }, { "epoch": 1.1446798266730862, - "grad_norm": 1.078125, - "learning_rate": 0.0001731180957455474, - "loss": 0.4966, + "grad_norm": 1.859375, + "learning_rate": 6.902501220707273e-05, + "loss": 0.5736, "step": 4755 }, { "epoch": 1.1458834857968223, - "grad_norm": 1.0234375, - "learning_rate": 0.00017301815141030833, - "loss": 0.5217, + "grad_norm": 1.8671875, + "learning_rate": 6.898516276828243e-05, + "loss": 0.5972, "step": 4760 }, { "epoch": 1.1470871449205584, - "grad_norm": 1.1015625, - "learning_rate": 0.0001729181571086542, - "loss": 0.4961, + "grad_norm": 1.875, + "learning_rate": 6.894529340706638e-05, + "loss": 0.572, "step": 4765 }, { "epoch": 1.1482908040442947, - "grad_norm": 1.1015625, - "learning_rate": 0.00017281811300035033, - "loss": 0.5075, + "grad_norm": 1.8984375, + "learning_rate": 6.890540418712563e-05, + "loss": 0.5827, "step": 4770 }, { "epoch": 1.1494944631680308, - "grad_norm": 1.0390625, - "learning_rate": 0.00017271801924524153, - "loss": 0.4828, + "grad_norm": 1.7734375, + "learning_rate": 6.886549517219288e-05, + "loss": 0.5556, "step": 4775 }, { "epoch": 1.150698122291767, - "grad_norm": 1.1796875, - "learning_rate": 0.00017261787600325192, - "loss": 0.5235, + "grad_norm": 1.96875, + "learning_rate": 6.882556642603252e-05, + "loss": 0.5949, "step": 4780 }, { "epoch": 1.151901781415503, - "grad_norm": 1.1796875, - "learning_rate": 0.00017251768343438478, - "loss": 0.5286, + "grad_norm": 2.0, + "learning_rate": 6.878561801244046e-05, + "loss": 0.6044, "step": 4785 }, { "epoch": 1.1531054405392394, - "grad_norm": 0.95703125, - "learning_rate": 0.00017241744169872213, - "loss": 0.5124, + "grad_norm": 1.7890625, + "learning_rate": 6.874564999524402e-05, + "loss": 0.5891, "step": 4790 }, { "epoch": 1.1543090996629755, - "grad_norm": 0.98828125, - "learning_rate": 0.00017231715095642456, - "loss": 0.5079, + "grad_norm": 1.7890625, + "learning_rate": 6.870566243830186e-05, + "loss": 0.5785, "step": 4795 }, { "epoch": 1.1555127587867116, - "grad_norm": 1.03125, - "learning_rate": 0.00017221681136773102, - "loss": 0.4946, + "grad_norm": 1.8203125, + "learning_rate": 6.866565540550383e-05, + "loss": 0.571, "step": 4800 }, { "epoch": 1.1567164179104479, - "grad_norm": 1.0234375, - "learning_rate": 0.00017211642309295836, - "loss": 0.5115, + "grad_norm": 1.65625, + "learning_rate": 6.862562896077089e-05, + "loss": 0.5865, "step": 4805 }, { "epoch": 1.157920077034184, - "grad_norm": 1.0703125, - "learning_rate": 0.00017201598629250136, - "loss": 0.5167, + "grad_norm": 1.8203125, + "learning_rate": 6.858558316805507e-05, + "loss": 0.596, "step": 4810 }, { "epoch": 1.15912373615792, - "grad_norm": 1.0859375, - "learning_rate": 0.00017191550112683227, - "loss": 0.528, + "grad_norm": 1.8671875, + "learning_rate": 6.854551809133927e-05, + "loss": 0.5989, "step": 4815 }, { "epoch": 1.1603273952816562, - "grad_norm": 1.0625, - "learning_rate": 0.00017181496775650065, - "loss": 0.5209, + "grad_norm": 1.90625, + "learning_rate": 6.850543379463721e-05, + "loss": 0.5925, "step": 4820 }, { "epoch": 1.1615310544053923, - "grad_norm": 1.0078125, - "learning_rate": 0.000171714386342133, - "loss": 0.5259, + "grad_norm": 1.8671875, + "learning_rate": 6.846533034199333e-05, + "loss": 0.6034, "step": 4825 }, { "epoch": 1.1627347135291286, - "grad_norm": 0.96875, - "learning_rate": 0.00017161375704443271, - "loss": 0.4659, + "grad_norm": 1.84375, + "learning_rate": 6.842520779748267e-05, + "loss": 0.546, "step": 4830 }, { "epoch": 1.1639383726528647, - "grad_norm": 1.1015625, - "learning_rate": 0.0001715130800241795, - "loss": 0.5014, + "grad_norm": 1.859375, + "learning_rate": 6.838506622521074e-05, + "loss": 0.5778, "step": 4835 }, { "epoch": 1.1651420317766008, - "grad_norm": 1.0546875, - "learning_rate": 0.00017141235544222957, - "loss": 0.5175, + "grad_norm": 1.734375, + "learning_rate": 6.834490568931353e-05, + "loss": 0.5936, "step": 4840 }, { "epoch": 1.1663456909003371, - "grad_norm": 1.015625, - "learning_rate": 0.0001713115834595149, - "loss": 0.5066, + "grad_norm": 1.7734375, + "learning_rate": 6.830472625395724e-05, + "loss": 0.5766, "step": 4845 }, { "epoch": 1.1675493500240732, - "grad_norm": 1.0703125, - "learning_rate": 0.00017121076423704326, - "loss": 0.53, + "grad_norm": 1.78125, + "learning_rate": 6.826452798333832e-05, + "loss": 0.6055, "step": 4850 }, { "epoch": 1.1687530091478093, - "grad_norm": 0.99609375, - "learning_rate": 0.000171109897935898, - "loss": 0.4582, + "grad_norm": 1.8515625, + "learning_rate": 6.82243109416833e-05, + "loss": 0.5406, "step": 4855 }, { "epoch": 1.1699566682715454, - "grad_norm": 1.0703125, - "learning_rate": 0.00017100898471723755, - "loss": 0.5133, + "grad_norm": 1.921875, + "learning_rate": 6.818407519324872e-05, + "loss": 0.5927, "step": 4860 }, { "epoch": 1.1711603273952818, - "grad_norm": 1.03125, - "learning_rate": 0.00017090802474229537, - "loss": 0.493, + "grad_norm": 1.84375, + "learning_rate": 6.814382080232097e-05, + "loss": 0.5674, "step": 4865 }, { "epoch": 1.1723639865190179, - "grad_norm": 1.078125, - "learning_rate": 0.00017080701817237962, - "loss": 0.4868, + "grad_norm": 1.90625, + "learning_rate": 6.810354783321625e-05, + "loss": 0.568, "step": 4870 }, { "epoch": 1.173567645642754, - "grad_norm": 1.0390625, - "learning_rate": 0.00017070596516887296, - "loss": 0.5291, + "grad_norm": 1.84375, + "learning_rate": 6.806325635028046e-05, + "loss": 0.6112, "step": 4875 }, { "epoch": 1.17477130476649, - "grad_norm": 1.1015625, - "learning_rate": 0.00017060486589323212, - "loss": 0.5192, + "grad_norm": 1.875, + "learning_rate": 6.802294641788903e-05, + "loss": 0.5944, "step": 4880 }, { "epoch": 1.1759749638902264, - "grad_norm": 1.0234375, - "learning_rate": 0.00017050372050698786, - "loss": 0.5252, + "grad_norm": 1.7109375, + "learning_rate": 6.798261810044692e-05, + "loss": 0.5964, "step": 4885 }, { "epoch": 1.1771786230139625, - "grad_norm": 1.0234375, - "learning_rate": 0.00017040252917174456, - "loss": 0.4888, + "grad_norm": 1.7734375, + "learning_rate": 6.794227146238844e-05, + "loss": 0.5649, "step": 4890 }, { "epoch": 1.1783822821376986, - "grad_norm": 0.98046875, - "learning_rate": 0.00017030129204918004, - "loss": 0.4926, + "grad_norm": 1.703125, + "learning_rate": 6.790190656817718e-05, + "loss": 0.5644, "step": 4895 }, { "epoch": 1.1795859412614347, - "grad_norm": 1.0390625, - "learning_rate": 0.00017020000930104528, - "loss": 0.4779, + "grad_norm": 1.8359375, + "learning_rate": 6.786152348230588e-05, + "loss": 0.5487, "step": 4900 }, { "epoch": 1.180789600385171, - "grad_norm": 0.98046875, - "learning_rate": 0.00017009868108916408, - "loss": 0.495, + "grad_norm": 1.6640625, + "learning_rate": 6.782112226929635e-05, + "loss": 0.5746, "step": 4905 }, { "epoch": 1.181993259508907, - "grad_norm": 1.1328125, - "learning_rate": 0.00016999730757543308, - "loss": 0.558, + "grad_norm": 1.8203125, + "learning_rate": 6.77807029936994e-05, + "loss": 0.6301, "step": 4910 }, { "epoch": 1.1831969186326432, - "grad_norm": 1.1015625, - "learning_rate": 0.00016989588892182107, - "loss": 0.4822, + "grad_norm": 1.90625, + "learning_rate": 6.774026572009468e-05, + "loss": 0.5591, "step": 4915 }, { "epoch": 1.1844005777563793, - "grad_norm": 1.0078125, - "learning_rate": 0.00016979442529036905, - "loss": 0.5072, + "grad_norm": 1.703125, + "learning_rate": 6.769981051309053e-05, + "loss": 0.5829, "step": 4920 }, { "epoch": 1.1856042368801156, - "grad_norm": 1.078125, - "learning_rate": 0.00016969291684318995, - "loss": 0.496, + "grad_norm": 1.7734375, + "learning_rate": 6.765933743732404e-05, + "loss": 0.5762, "step": 4925 }, { "epoch": 1.1868078960038517, - "grad_norm": 1.03125, - "learning_rate": 0.00016959136374246822, - "loss": 0.5283, + "grad_norm": 1.8984375, + "learning_rate": 6.761884655746083e-05, + "loss": 0.6076, "step": 4930 }, { "epoch": 1.1880115551275878, - "grad_norm": 1.0625, - "learning_rate": 0.00016948976615045966, - "loss": 0.5206, + "grad_norm": 1.796875, + "learning_rate": 6.757833793819489e-05, + "loss": 0.5906, "step": 4935 }, { "epoch": 1.1892152142513241, - "grad_norm": 0.96875, - "learning_rate": 0.00016938812422949122, - "loss": 0.4826, + "grad_norm": 1.640625, + "learning_rate": 6.753781164424865e-05, + "loss": 0.5542, "step": 4940 }, { "epoch": 1.1904188733750602, - "grad_norm": 1.0546875, - "learning_rate": 0.0001692864381419606, - "loss": 0.5193, + "grad_norm": 1.9296875, + "learning_rate": 6.749726774037273e-05, + "loss": 0.5974, "step": 4945 }, { "epoch": 1.1916225324987963, - "grad_norm": 0.99609375, - "learning_rate": 0.00016918470805033615, - "loss": 0.4983, + "grad_norm": 1.8125, + "learning_rate": 6.74567062913459e-05, + "loss": 0.5803, "step": 4950 }, { "epoch": 1.1928261916225325, - "grad_norm": 0.984375, - "learning_rate": 0.0001690829341171564, - "loss": 0.4781, + "grad_norm": 1.7578125, + "learning_rate": 6.741612736197494e-05, + "loss": 0.5514, "step": 4955 }, { "epoch": 1.1940298507462686, - "grad_norm": 1.0546875, - "learning_rate": 0.00016898111650503006, - "loss": 0.5292, + "grad_norm": 1.7578125, + "learning_rate": 6.737553101709459e-05, + "loss": 0.6052, "step": 4960 }, { "epoch": 1.1952335098700049, - "grad_norm": 1.09375, - "learning_rate": 0.00016887925537663556, - "loss": 0.4959, + "grad_norm": 1.9921875, + "learning_rate": 6.73349173215674e-05, + "loss": 0.5725, "step": 4965 }, { "epoch": 1.196437168993741, - "grad_norm": 1.0078125, - "learning_rate": 0.00016877735089472089, - "loss": 0.4755, + "grad_norm": 1.8046875, + "learning_rate": 6.729428634028365e-05, + "loss": 0.55, "step": 4970 }, { "epoch": 1.197640828117477, - "grad_norm": 1.1171875, - "learning_rate": 0.00016867540322210322, - "loss": 0.5072, + "grad_norm": 2.078125, + "learning_rate": 6.725363813816121e-05, + "loss": 0.5828, "step": 4975 }, { "epoch": 1.1988444872412134, - "grad_norm": 1.1171875, - "learning_rate": 0.00016857341252166886, - "loss": 0.4966, + "grad_norm": 2.109375, + "learning_rate": 6.72129727801455e-05, + "loss": 0.5685, "step": 4980 }, { "epoch": 1.2000481463649495, - "grad_norm": 1.0859375, - "learning_rate": 0.0001684713789563728, - "loss": 0.5135, + "grad_norm": 2.046875, + "learning_rate": 6.717229033120935e-05, + "loss": 0.5884, "step": 4985 }, { "epoch": 1.2012518054886856, - "grad_norm": 1.0234375, - "learning_rate": 0.0001683693026892385, - "loss": 0.4978, + "grad_norm": 1.796875, + "learning_rate": 6.713159085635288e-05, + "loss": 0.5687, "step": 4990 }, { "epoch": 1.2024554646124217, - "grad_norm": 1.1640625, - "learning_rate": 0.00016826718388335767, - "loss": 0.5139, + "grad_norm": 1.78125, + "learning_rate": 6.70908744206034e-05, + "loss": 0.5961, "step": 4995 }, { "epoch": 1.2036591237361578, - "grad_norm": 1.1171875, - "learning_rate": 0.00016816502270189002, - "loss": 0.5329, + "grad_norm": 1.90625, + "learning_rate": 6.705014108901535e-05, + "loss": 0.608, "step": 5000 }, { "epoch": 1.2036591237361578, - "eval_loss": 0.4486047625541687, - "eval_runtime": 2.327, - "eval_samples_per_second": 85.948, - "eval_steps_per_second": 85.948, + "eval_loss": 0.504835307598114, + "eval_runtime": 2.3814, + "eval_samples_per_second": 83.984, + "eval_steps_per_second": 83.984, "step": 5000 }, { "epoch": 1.2048627828598941, - "grad_norm": 1.0546875, - "learning_rate": 0.00016806281930806287, - "loss": 0.4805, + "grad_norm": 1.828125, + "learning_rate": 6.700939092667015e-05, + "loss": 0.5535, "step": 5005 }, { "epoch": 1.2060664419836302, - "grad_norm": 0.98828125, - "learning_rate": 0.0001679605738651711, - "loss": 0.4785, + "grad_norm": 1.8515625, + "learning_rate": 6.696862399867612e-05, + "loss": 0.5502, "step": 5010 }, { "epoch": 1.2072701011073663, - "grad_norm": 1.0234375, - "learning_rate": 0.00016785828653657667, - "loss": 0.5004, + "grad_norm": 1.8359375, + "learning_rate": 6.692784037016834e-05, + "loss": 0.5779, "step": 5015 }, { "epoch": 1.2084737602311026, - "grad_norm": 1.0078125, - "learning_rate": 0.00016775595748570854, - "loss": 0.5022, + "grad_norm": 1.71875, + "learning_rate": 6.688704010630863e-05, + "loss": 0.5702, "step": 5020 }, { "epoch": 1.2096774193548387, - "grad_norm": 0.97265625, - "learning_rate": 0.0001676535868760623, - "loss": 0.5276, + "grad_norm": 1.6953125, + "learning_rate": 6.684622327228531e-05, + "loss": 0.6041, "step": 5025 }, { "epoch": 1.2108810784785748, - "grad_norm": 1.09375, - "learning_rate": 0.00016755117487119987, - "loss": 0.5087, + "grad_norm": 1.8046875, + "learning_rate": 6.680538993331324e-05, + "loss": 0.5853, "step": 5030 }, { "epoch": 1.212084737602311, - "grad_norm": 1.0234375, - "learning_rate": 0.0001674487216347495, - "loss": 0.5072, + "grad_norm": 1.7578125, + "learning_rate": 6.676454015463363e-05, + "loss": 0.583, "step": 5035 }, { "epoch": 1.2132883967260473, - "grad_norm": 1.078125, - "learning_rate": 0.00016734622733040514, - "loss": 0.5236, + "grad_norm": 1.8984375, + "learning_rate": 6.672367400151395e-05, + "loss": 0.6005, "step": 5040 }, { "epoch": 1.2144920558497834, - "grad_norm": 1.03125, - "learning_rate": 0.00016724369212192637, - "loss": 0.516, + "grad_norm": 1.7109375, + "learning_rate": 6.668279153924781e-05, + "loss": 0.5903, "step": 5045 }, { "epoch": 1.2156957149735195, - "grad_norm": 1.015625, - "learning_rate": 0.0001671411161731382, - "loss": 0.5012, + "grad_norm": 1.84375, + "learning_rate": 6.664189283315494e-05, + "loss": 0.5783, "step": 5050 }, { "epoch": 1.2168993740972556, - "grad_norm": 1.0234375, - "learning_rate": 0.00016703849964793077, - "loss": 0.5148, + "grad_norm": 1.875, + "learning_rate": 6.6600977948581e-05, + "loss": 0.5852, "step": 5055 }, { "epoch": 1.218103033220992, - "grad_norm": 0.90625, - "learning_rate": 0.00016693584271025892, - "loss": 0.4953, + "grad_norm": 1.6640625, + "learning_rate": 6.656004695089745e-05, + "loss": 0.5719, "step": 5060 }, { "epoch": 1.219306692344728, - "grad_norm": 0.9609375, - "learning_rate": 0.00016683314552414207, - "loss": 0.4757, + "grad_norm": 1.6171875, + "learning_rate": 6.651909990550153e-05, + "loss": 0.5579, "step": 5065 }, { "epoch": 1.220510351468464, - "grad_norm": 1.09375, - "learning_rate": 0.0001667304082536641, - "loss": 0.5038, + "grad_norm": 1.9140625, + "learning_rate": 6.647813687781617e-05, + "loss": 0.574, "step": 5070 }, { "epoch": 1.2217140105922002, - "grad_norm": 0.94140625, - "learning_rate": 0.00016662763106297274, - "loss": 0.4967, + "grad_norm": 1.59375, + "learning_rate": 6.643715793328972e-05, + "loss": 0.5757, "step": 5075 }, { "epoch": 1.2229176697159365, - "grad_norm": 1.0859375, - "learning_rate": 0.00016652481411627966, - "loss": 0.5337, + "grad_norm": 1.7578125, + "learning_rate": 6.63961631373961e-05, + "loss": 0.6137, "step": 5080 }, { "epoch": 1.2241213288396726, - "grad_norm": 0.91796875, - "learning_rate": 0.00016642195757785995, - "loss": 0.4646, + "grad_norm": 1.640625, + "learning_rate": 6.635515255563442e-05, + "loss": 0.5425, "step": 5085 }, { "epoch": 1.2253249879634087, - "grad_norm": 1.1171875, - "learning_rate": 0.000166319061612052, - "loss": 0.5122, + "grad_norm": 1.8046875, + "learning_rate": 6.63141262535291e-05, + "loss": 0.5983, "step": 5090 }, { "epoch": 1.2265286470871448, - "grad_norm": 1.0, - "learning_rate": 0.00016621612638325717, - "loss": 0.4961, + "grad_norm": 1.8125, + "learning_rate": 6.627308429662967e-05, + "loss": 0.5729, "step": 5095 }, { "epoch": 1.2277323062108811, - "grad_norm": 1.0078125, - "learning_rate": 0.00016611315205593958, - "loss": 0.5155, + "grad_norm": 1.7265625, + "learning_rate": 6.623202675051065e-05, + "loss": 0.5927, "step": 5100 }, { "epoch": 1.2289359653346172, - "grad_norm": 1.125, - "learning_rate": 0.00016601013879462585, - "loss": 0.4868, + "grad_norm": 1.9140625, + "learning_rate": 6.619095368077145e-05, + "loss": 0.5628, "step": 5105 }, { "epoch": 1.2301396244583533, - "grad_norm": 1.046875, - "learning_rate": 0.0001659070867639047, - "loss": 0.4981, + "grad_norm": 1.75, + "learning_rate": 6.614986515303635e-05, + "loss": 0.576, "step": 5110 }, { "epoch": 1.2313432835820897, - "grad_norm": 1.015625, - "learning_rate": 0.00016580399612842685, - "loss": 0.5182, + "grad_norm": 1.765625, + "learning_rate": 6.610876123295423e-05, + "loss": 0.5994, "step": 5115 }, { "epoch": 1.2325469427058258, - "grad_norm": 0.97265625, - "learning_rate": 0.00016570086705290478, - "loss": 0.4996, + "grad_norm": 1.796875, + "learning_rate": 6.606764198619866e-05, + "loss": 0.5784, "step": 5120 }, { "epoch": 1.2337506018295619, - "grad_norm": 1.078125, - "learning_rate": 0.00016559769970211224, - "loss": 0.4892, + "grad_norm": 1.984375, + "learning_rate": 6.602650747846764e-05, + "loss": 0.5643, "step": 5125 }, { "epoch": 1.234954260953298, - "grad_norm": 1.15625, - "learning_rate": 0.00016549449424088425, - "loss": 0.4844, + "grad_norm": 1.9609375, + "learning_rate": 6.598535777548355e-05, + "loss": 0.5572, "step": 5130 }, { "epoch": 1.236157920077034, - "grad_norm": 0.93359375, - "learning_rate": 0.00016539125083411672, - "loss": 0.4493, + "grad_norm": 1.7109375, + "learning_rate": 6.594419294299312e-05, + "loss": 0.5226, "step": 5135 }, { "epoch": 1.2373615792007704, - "grad_norm": 1.0546875, - "learning_rate": 0.00016528796964676606, - "loss": 0.5282, + "grad_norm": 1.75, + "learning_rate": 6.590301304676714e-05, + "loss": 0.6042, "step": 5140 }, { "epoch": 1.2385652383245065, - "grad_norm": 0.94140625, - "learning_rate": 0.00016518465084384916, - "loss": 0.5125, + "grad_norm": 1.75, + "learning_rate": 6.586181815260056e-05, + "loss": 0.5893, "step": 5145 }, { "epoch": 1.2397688974482426, - "grad_norm": 1.015625, - "learning_rate": 0.00016508129459044302, - "loss": 0.4925, + "grad_norm": 1.7890625, + "learning_rate": 6.582060832631225e-05, + "loss": 0.5681, "step": 5150 }, { "epoch": 1.240972556571979, - "grad_norm": 1.03125, - "learning_rate": 0.0001649779010516844, - "loss": 0.4672, + "grad_norm": 1.921875, + "learning_rate": 6.577938363374495e-05, + "loss": 0.5464, "step": 5155 }, { "epoch": 1.242176215695715, - "grad_norm": 1.0546875, - "learning_rate": 0.00016487447039276968, - "loss": 0.5154, + "grad_norm": 1.796875, + "learning_rate": 6.573814414076516e-05, + "loss": 0.5935, "step": 5160 }, { "epoch": 1.2433798748194511, - "grad_norm": 1.140625, - "learning_rate": 0.00016477100277895456, - "loss": 0.4897, + "grad_norm": 1.953125, + "learning_rate": 6.569688991326301e-05, + "loss": 0.5728, "step": 5165 }, { "epoch": 1.2445835339431872, - "grad_norm": 0.984375, - "learning_rate": 0.0001646674983755537, - "loss": 0.5114, + "grad_norm": 1.8046875, + "learning_rate": 6.565562101715217e-05, + "loss": 0.5898, "step": 5170 }, { "epoch": 1.2457871930669235, - "grad_norm": 1.1953125, - "learning_rate": 0.00016456395734794064, - "loss": 0.492, + "grad_norm": 1.9765625, + "learning_rate": 6.561433751836976e-05, + "loss": 0.5699, "step": 5175 }, { "epoch": 1.2469908521906596, - "grad_norm": 1.0703125, - "learning_rate": 0.00016446037986154744, - "loss": 0.5115, + "grad_norm": 1.8203125, + "learning_rate": 6.557303948287626e-05, + "loss": 0.588, "step": 5180 }, { "epoch": 1.2481945113143957, - "grad_norm": 1.0, - "learning_rate": 0.00016435676608186434, - "loss": 0.4874, + "grad_norm": 1.90625, + "learning_rate": 6.55317269766553e-05, + "loss": 0.5642, "step": 5185 }, { "epoch": 1.2493981704381318, - "grad_norm": 1.0234375, - "learning_rate": 0.0001642531161744396, - "loss": 0.4899, + "grad_norm": 1.7890625, + "learning_rate": 6.54904000657137e-05, + "loss": 0.5558, "step": 5190 }, { "epoch": 1.2506018295618682, - "grad_norm": 0.9921875, - "learning_rate": 0.00016414943030487915, - "loss": 0.4998, + "grad_norm": 1.9765625, + "learning_rate": 6.544905881608125e-05, + "loss": 0.5734, "step": 5195 }, { "epoch": 1.2518054886856043, - "grad_norm": 0.984375, - "learning_rate": 0.00016404570863884647, - "loss": 0.5105, + "grad_norm": 1.71875, + "learning_rate": 6.540770329381069e-05, + "loss": 0.5851, "step": 5200 }, { "epoch": 1.2530091478093404, - "grad_norm": 1.0234375, - "learning_rate": 0.0001639419513420622, - "loss": 0.4967, + "grad_norm": 1.8515625, + "learning_rate": 6.536633356497752e-05, + "loss": 0.5713, "step": 5205 }, { "epoch": 1.2542128069330767, - "grad_norm": 1.0625, - "learning_rate": 0.00016383815858030392, - "loss": 0.4966, + "grad_norm": 1.75, + "learning_rate": 6.532494969568e-05, + "loss": 0.5687, "step": 5210 }, { "epoch": 1.2554164660568128, - "grad_norm": 1.046875, - "learning_rate": 0.00016373433051940583, - "loss": 0.4736, + "grad_norm": 1.8828125, + "learning_rate": 6.528355175203889e-05, + "loss": 0.5576, "step": 5215 }, { "epoch": 1.2566201251805489, - "grad_norm": 1.15625, - "learning_rate": 0.00016363046732525855, - "loss": 0.5043, + "grad_norm": 2.140625, + "learning_rate": 6.524213980019754e-05, + "loss": 0.5835, "step": 5220 }, { "epoch": 1.257823784304285, - "grad_norm": 0.99609375, - "learning_rate": 0.00016352656916380885, - "loss": 0.4954, + "grad_norm": 1.765625, + "learning_rate": 6.52007139063216e-05, + "loss": 0.5701, "step": 5225 }, { "epoch": 1.259027443428021, - "grad_norm": 1.0859375, - "learning_rate": 0.00016342263620105937, - "loss": 0.4906, + "grad_norm": 1.7734375, + "learning_rate": 6.515927413659907e-05, + "loss": 0.5708, "step": 5230 }, { "epoch": 1.2602311025517574, - "grad_norm": 0.97265625, - "learning_rate": 0.00016331866860306828, - "loss": 0.5127, + "grad_norm": 1.71875, + "learning_rate": 6.511782055724003e-05, + "loss": 0.5876, "step": 5235 }, { "epoch": 1.2614347616754935, - "grad_norm": 1.0625, - "learning_rate": 0.00016321466653594925, - "loss": 0.5064, + "grad_norm": 1.921875, + "learning_rate": 6.50763532344767e-05, + "loss": 0.5921, "step": 5240 }, { "epoch": 1.2626384207992296, - "grad_norm": 1.0234375, - "learning_rate": 0.00016311063016587082, - "loss": 0.4934, + "grad_norm": 1.84375, + "learning_rate": 6.503487223456324e-05, + "loss": 0.5684, "step": 5245 }, { "epoch": 1.263842079922966, - "grad_norm": 1.0, - "learning_rate": 0.0001630065596590565, - "loss": 0.5157, + "grad_norm": 1.859375, + "learning_rate": 6.499337762377561e-05, + "loss": 0.5894, "step": 5250 }, { "epoch": 1.265045739046702, - "grad_norm": 1.0, - "learning_rate": 0.00016290245518178428, - "loss": 0.4909, + "grad_norm": 1.8828125, + "learning_rate": 6.495186946841161e-05, + "loss": 0.5706, "step": 5255 }, { "epoch": 1.2662493981704381, - "grad_norm": 1.1484375, - "learning_rate": 0.00016279831690038643, - "loss": 0.5066, + "grad_norm": 1.8984375, + "learning_rate": 6.491034783479062e-05, + "loss": 0.576, "step": 5260 }, { "epoch": 1.2674530572941742, - "grad_norm": 0.99609375, - "learning_rate": 0.0001626941449812492, - "loss": 0.5386, + "grad_norm": 1.78125, + "learning_rate": 6.486881278925356e-05, + "loss": 0.6153, "step": 5265 }, { "epoch": 1.2686567164179103, - "grad_norm": 1.046875, - "learning_rate": 0.0001625899395908126, - "loss": 0.4992, + "grad_norm": 1.953125, + "learning_rate": 6.482726439816277e-05, + "loss": 0.5782, "step": 5270 }, { "epoch": 1.2698603755416467, - "grad_norm": 0.91796875, - "learning_rate": 0.00016248570089557018, - "loss": 0.481, + "grad_norm": 1.71875, + "learning_rate": 6.478570272790195e-05, + "loss": 0.564, "step": 5275 }, { "epoch": 1.2710640346653828, - "grad_norm": 1.0859375, - "learning_rate": 0.0001623814290620686, - "loss": 0.5125, + "grad_norm": 1.8046875, + "learning_rate": 6.474412784487598e-05, + "loss": 0.5923, "step": 5280 }, { "epoch": 1.2722676937891189, - "grad_norm": 1.0078125, - "learning_rate": 0.00016227712425690758, - "loss": 0.5138, + "grad_norm": 1.8671875, + "learning_rate": 6.470253981551087e-05, + "loss": 0.5988, "step": 5285 }, { "epoch": 1.2734713529128552, - "grad_norm": 1.0390625, - "learning_rate": 0.00016217278664673945, - "loss": 0.4842, + "grad_norm": 1.8046875, + "learning_rate": 6.466093870625365e-05, + "loss": 0.5651, "step": 5290 }, { "epoch": 1.2746750120365913, - "grad_norm": 0.93359375, - "learning_rate": 0.0001620684163982689, - "loss": 0.4963, + "grad_norm": 1.8359375, + "learning_rate": 6.461932458357219e-05, + "loss": 0.5763, "step": 5295 }, { "epoch": 1.2758786711603274, - "grad_norm": 0.953125, - "learning_rate": 0.00016196401367825285, - "loss": 0.5083, + "grad_norm": 1.765625, + "learning_rate": 6.457769751395522e-05, + "loss": 0.5833, "step": 5300 }, { "epoch": 1.2770823302840635, - "grad_norm": 0.9921875, - "learning_rate": 0.00016185957865350015, - "loss": 0.4933, + "grad_norm": 1.796875, + "learning_rate": 6.453605756391211e-05, + "loss": 0.5698, "step": 5305 }, { "epoch": 1.2782859894077996, - "grad_norm": 1.0859375, - "learning_rate": 0.00016175511149087114, - "loss": 0.5504, + "grad_norm": 1.8515625, + "learning_rate": 6.449440479997285e-05, + "loss": 0.6319, "step": 5310 }, { "epoch": 1.279489648531536, - "grad_norm": 0.97265625, - "learning_rate": 0.00016165061235727757, - "loss": 0.4871, + "grad_norm": 1.8828125, + "learning_rate": 6.445273928868784e-05, + "loss": 0.5665, "step": 5315 }, { "epoch": 1.280693307655272, - "grad_norm": 1.0078125, - "learning_rate": 0.0001615460814196823, - "loss": 0.4984, + "grad_norm": 1.75, + "learning_rate": 6.441106109662791e-05, + "loss": 0.5775, "step": 5320 }, { "epoch": 1.281896966779008, - "grad_norm": 1.0234375, - "learning_rate": 0.0001614415188450989, - "loss": 0.4956, + "grad_norm": 1.84375, + "learning_rate": 6.436937029038412e-05, + "loss": 0.5772, "step": 5325 }, { "epoch": 1.2831006259027444, - "grad_norm": 1.0078125, - "learning_rate": 0.00016133692480059167, - "loss": 0.5095, + "grad_norm": 1.8203125, + "learning_rate": 6.432766693656772e-05, + "loss": 0.587, "step": 5330 }, { "epoch": 1.2843042850264805, - "grad_norm": 1.0546875, - "learning_rate": 0.00016123229945327498, - "loss": 0.5178, + "grad_norm": 1.8671875, + "learning_rate": 6.428595110180994e-05, + "loss": 0.5947, "step": 5335 }, { "epoch": 1.2855079441502166, - "grad_norm": 1.203125, - "learning_rate": 0.00016112764297031333, - "loss": 0.5206, + "grad_norm": 1.8046875, + "learning_rate": 6.424422285276199e-05, + "loss": 0.5932, "step": 5340 }, { "epoch": 1.286711603273953, - "grad_norm": 1.0546875, - "learning_rate": 0.00016102295551892102, - "loss": 0.4835, + "grad_norm": 1.7890625, + "learning_rate": 6.420248225609494e-05, + "loss": 0.5611, "step": 5345 }, { "epoch": 1.287915262397689, - "grad_norm": 1.015625, - "learning_rate": 0.0001609182372663617, - "loss": 0.4915, + "grad_norm": 1.859375, + "learning_rate": 6.416072937849956e-05, + "loss": 0.5718, "step": 5350 }, { "epoch": 1.2891189215214252, - "grad_norm": 1.0859375, - "learning_rate": 0.00016081348837994827, - "loss": 0.5257, + "grad_norm": 1.8203125, + "learning_rate": 6.411896428668621e-05, + "loss": 0.6024, "step": 5355 }, { "epoch": 1.2903225806451613, - "grad_norm": 1.1015625, - "learning_rate": 0.00016070870902704266, - "loss": 0.525, + "grad_norm": 1.890625, + "learning_rate": 6.407718704738484e-05, + "loss": 0.5984, "step": 5360 }, { "epoch": 1.2915262397688974, - "grad_norm": 0.9609375, - "learning_rate": 0.00016060389937505538, - "loss": 0.4968, + "grad_norm": 1.8515625, + "learning_rate": 6.403539772734476e-05, + "loss": 0.5748, "step": 5365 }, { "epoch": 1.2927298988926337, - "grad_norm": 0.83984375, - "learning_rate": 0.0001604990595914454, - "loss": 0.466, + "grad_norm": 1.609375, + "learning_rate": 6.399359639333457e-05, + "loss": 0.5438, "step": 5370 }, { "epoch": 1.2939335580163698, - "grad_norm": 1.0234375, - "learning_rate": 0.0001603941898437198, - "loss": 0.4868, + "grad_norm": 1.8046875, + "learning_rate": 6.395178311214212e-05, + "loss": 0.563, "step": 5375 }, { "epoch": 1.2951372171401059, - "grad_norm": 0.94921875, - "learning_rate": 0.00016028929029943356, - "loss": 0.5042, + "grad_norm": 1.796875, + "learning_rate": 6.390995795057427e-05, + "loss": 0.5893, "step": 5380 }, { "epoch": 1.2963408762638422, - "grad_norm": 0.984375, - "learning_rate": 0.0001601843611261893, - "loss": 0.5249, + "grad_norm": 1.6953125, + "learning_rate": 6.386812097545697e-05, + "loss": 0.6014, "step": 5385 }, { "epoch": 1.2975445353875783, - "grad_norm": 1.0078125, - "learning_rate": 0.00016007940249163687, - "loss": 0.5073, + "grad_norm": 1.875, + "learning_rate": 6.38262722536349e-05, + "loss": 0.5909, "step": 5390 }, { "epoch": 1.2987481945113144, - "grad_norm": 1.09375, - "learning_rate": 0.00015997441456347332, - "loss": 0.504, + "grad_norm": 1.9921875, + "learning_rate": 6.378441185197166e-05, + "loss": 0.584, "step": 5395 }, { "epoch": 1.2999518536350505, - "grad_norm": 0.9140625, - "learning_rate": 0.0001598693975094424, - "loss": 0.4509, + "grad_norm": 1.7890625, + "learning_rate": 6.374253983734941e-05, + "loss": 0.5316, "step": 5400 }, { "epoch": 1.3011555127587866, - "grad_norm": 0.984375, - "learning_rate": 0.00015976435149733445, - "loss": 0.5044, + "grad_norm": 1.7890625, + "learning_rate": 6.370065627666889e-05, + "loss": 0.5844, "step": 5405 }, { "epoch": 1.302359171882523, - "grad_norm": 0.98046875, - "learning_rate": 0.00015965927669498616, - "loss": 0.5191, + "grad_norm": 1.90625, + "learning_rate": 6.365876123684933e-05, + "loss": 0.5983, "step": 5410 }, { "epoch": 1.303562831006259, - "grad_norm": 0.9765625, - "learning_rate": 0.00015955417327027997, - "loss": 0.4769, + "grad_norm": 1.78125, + "learning_rate": 6.36168547848282e-05, + "loss": 0.5529, "step": 5415 }, { "epoch": 1.3047664901299951, - "grad_norm": 0.99609375, - "learning_rate": 0.00015944904139114435, - "loss": 0.4883, + "grad_norm": 1.921875, + "learning_rate": 6.357493698756132e-05, + "loss": 0.5704, "step": 5420 }, { "epoch": 1.3059701492537314, - "grad_norm": 1.015625, - "learning_rate": 0.000159343881225553, - "loss": 0.4698, + "grad_norm": 1.796875, + "learning_rate": 6.353300791202258e-05, + "loss": 0.5494, "step": 5425 }, { "epoch": 1.3071738083774675, - "grad_norm": 1.0625, - "learning_rate": 0.00015923869294152493, - "loss": 0.4906, + "grad_norm": 1.7890625, + "learning_rate": 6.349106762520385e-05, + "loss": 0.5676, "step": 5430 }, { "epoch": 1.3083774675012037, - "grad_norm": 0.97265625, - "learning_rate": 0.0001591334767071241, - "loss": 0.4966, + "grad_norm": 1.9375, + "learning_rate": 6.344911619411501e-05, + "loss": 0.5792, "step": 5435 }, { "epoch": 1.3095811266249398, - "grad_norm": 1.0078125, - "learning_rate": 0.000159028232690459, - "loss": 0.5051, + "grad_norm": 1.7578125, + "learning_rate": 6.340715368578366e-05, + "loss": 0.5892, "step": 5440 }, { "epoch": 1.3107847857486759, - "grad_norm": 1.0546875, - "learning_rate": 0.00015892296105968266, - "loss": 0.5209, + "grad_norm": 1.9375, + "learning_rate": 6.336518016725515e-05, + "loss": 0.6019, "step": 5445 }, { "epoch": 1.3119884448724122, - "grad_norm": 1.0703125, - "learning_rate": 0.00015881766198299209, - "loss": 0.4825, + "grad_norm": 1.9140625, + "learning_rate": 6.332319570559239e-05, + "loss": 0.5629, "step": 5450 }, { "epoch": 1.3131921039961483, - "grad_norm": 0.97265625, - "learning_rate": 0.00015871233562862828, - "loss": 0.4875, + "grad_norm": 1.7734375, + "learning_rate": 6.32812003678758e-05, + "loss": 0.5702, "step": 5455 }, { "epoch": 1.3143957631198844, - "grad_norm": 1.03125, - "learning_rate": 0.0001586069821648757, - "loss": 0.5002, + "grad_norm": 1.75, + "learning_rate": 6.323919422120313e-05, + "loss": 0.5739, "step": 5460 }, { "epoch": 1.3155994222436207, - "grad_norm": 1.078125, - "learning_rate": 0.00015850160176006226, - "loss": 0.4955, + "grad_norm": 1.84375, + "learning_rate": 6.31971773326895e-05, + "loss": 0.5745, "step": 5465 }, { "epoch": 1.3168030813673568, - "grad_norm": 1.046875, - "learning_rate": 0.00015839619458255874, - "loss": 0.4618, + "grad_norm": 1.8203125, + "learning_rate": 6.315514976946707e-05, + "loss": 0.5406, "step": 5470 }, { "epoch": 1.318006740491093, - "grad_norm": 0.96875, - "learning_rate": 0.00015829076080077883, - "loss": 0.4873, + "grad_norm": 1.6953125, + "learning_rate": 6.311311159868512e-05, + "loss": 0.5648, "step": 5475 }, { "epoch": 1.319210399614829, - "grad_norm": 0.9296875, - "learning_rate": 0.00015818530058317869, - "loss": 0.4883, + "grad_norm": 1.71875, + "learning_rate": 6.307106288750988e-05, + "loss": 0.5692, "step": 5480 }, { "epoch": 1.320414058738565, - "grad_norm": 1.09375, - "learning_rate": 0.00015807981409825665, - "loss": 0.4733, + "grad_norm": 1.8515625, + "learning_rate": 6.30290037031244e-05, + "loss": 0.5491, "step": 5485 }, { "epoch": 1.3216177178623014, - "grad_norm": 0.98046875, - "learning_rate": 0.00015797430151455317, - "loss": 0.5089, + "grad_norm": 1.8984375, + "learning_rate": 6.298693411272852e-05, + "loss": 0.5926, "step": 5490 }, { "epoch": 1.3228213769860375, - "grad_norm": 0.91796875, - "learning_rate": 0.00015786876300065024, - "loss": 0.4834, + "grad_norm": 1.640625, + "learning_rate": 6.29448541835386e-05, + "loss": 0.5574, "step": 5495 }, { "epoch": 1.3240250361097736, - "grad_norm": 0.91796875, - "learning_rate": 0.00015776319872517136, - "loss": 0.4989, + "grad_norm": 1.65625, + "learning_rate": 6.29027639827876e-05, + "loss": 0.5765, "step": 5500 }, { "epoch": 1.3240250361097736, - "eval_loss": 0.4303210377693176, - "eval_runtime": 2.3345, - "eval_samples_per_second": 85.673, - "eval_steps_per_second": 85.673, + "eval_loss": 0.495535284280777, + "eval_runtime": 2.3946, + "eval_samples_per_second": 83.521, + "eval_steps_per_second": 83.521, "step": 5500 }, { "epoch": 1.32522869523351, - "grad_norm": 1.0703125, - "learning_rate": 0.00015765760885678122, - "loss": 0.5083, + "grad_norm": 1.8984375, + "learning_rate": 6.286066357772489e-05, + "loss": 0.5907, "step": 5505 }, { "epoch": 1.326432354357246, - "grad_norm": 1.03125, - "learning_rate": 0.0001575519935641853, - "loss": 0.515, + "grad_norm": 1.8515625, + "learning_rate": 6.28185530356161e-05, + "loss": 0.5972, "step": 5510 }, { "epoch": 1.3276360134809821, - "grad_norm": 1.0546875, - "learning_rate": 0.00015744635301612983, - "loss": 0.4648, + "grad_norm": 1.7734375, + "learning_rate": 6.277643242374309e-05, + "loss": 0.5421, "step": 5515 }, { "epoch": 1.3288396726047185, - "grad_norm": 1.0546875, - "learning_rate": 0.0001573406873814013, - "loss": 0.488, + "grad_norm": 2.015625, + "learning_rate": 6.273430180940383e-05, + "loss": 0.562, "step": 5520 }, { "epoch": 1.3300433317284546, - "grad_norm": 1.2109375, - "learning_rate": 0.00015723499682882626, - "loss": 0.5022, + "grad_norm": 1.8828125, + "learning_rate": 6.269216125991218e-05, + "loss": 0.5846, "step": 5525 }, { "epoch": 1.3312469908521907, - "grad_norm": 0.9765625, - "learning_rate": 0.0001571292815272712, - "loss": 0.4769, + "grad_norm": 1.75, + "learning_rate": 6.2650010842598e-05, + "loss": 0.5568, "step": 5530 }, { "epoch": 1.3324506499759268, - "grad_norm": 1.0703125, - "learning_rate": 0.00015702354164564197, - "loss": 0.4903, + "grad_norm": 1.8828125, + "learning_rate": 6.260785062480678e-05, + "loss": 0.5694, "step": 5535 }, { "epoch": 1.3336543090996629, - "grad_norm": 1.046875, - "learning_rate": 0.00015691777735288387, - "loss": 0.4775, + "grad_norm": 1.7734375, + "learning_rate": 6.25656806738998e-05, + "loss": 0.5584, "step": 5540 }, { "epoch": 1.3348579682233992, - "grad_norm": 0.95703125, - "learning_rate": 0.00015681198881798116, - "loss": 0.4991, + "grad_norm": 1.6328125, + "learning_rate": 6.252350105725383e-05, + "loss": 0.5781, "step": 5545 }, { "epoch": 1.3360616273471353, - "grad_norm": 1.09375, - "learning_rate": 0.0001567061762099567, - "loss": 0.5037, + "grad_norm": 1.8515625, + "learning_rate": 6.248131184226104e-05, + "loss": 0.5785, "step": 5550 }, { "epoch": 1.3372652864708714, - "grad_norm": 1.1171875, - "learning_rate": 0.00015660033969787198, - "loss": 0.4895, + "grad_norm": 1.9609375, + "learning_rate": 6.243911309632902e-05, + "loss": 0.5688, "step": 5555 }, { "epoch": 1.3384689455946077, - "grad_norm": 1.125, - "learning_rate": 0.00015649447945082656, - "loss": 0.51, + "grad_norm": 1.9609375, + "learning_rate": 6.239690488688052e-05, + "loss": 0.5884, "step": 5560 }, { "epoch": 1.3396726047183438, - "grad_norm": 1.109375, - "learning_rate": 0.00015638859563795804, - "loss": 0.519, + "grad_norm": 1.9921875, + "learning_rate": 6.235468728135346e-05, + "loss": 0.5986, "step": 5565 }, { "epoch": 1.34087626384208, - "grad_norm": 0.984375, - "learning_rate": 0.0001562826884284416, - "loss": 0.4689, + "grad_norm": 1.7578125, + "learning_rate": 6.231246034720076e-05, + "loss": 0.5553, "step": 5570 }, { "epoch": 1.342079922965816, - "grad_norm": 0.96484375, - "learning_rate": 0.0001561767579914898, - "loss": 0.513, + "grad_norm": 1.859375, + "learning_rate": 6.227022415189022e-05, + "loss": 0.5957, "step": 5575 }, { "epoch": 1.3432835820895521, - "grad_norm": 0.96875, - "learning_rate": 0.0001560708044963523, - "loss": 0.4905, + "grad_norm": 1.6953125, + "learning_rate": 6.222797876290444e-05, + "loss": 0.5722, "step": 5580 }, { "epoch": 1.3444872412132884, - "grad_norm": 0.99609375, - "learning_rate": 0.00015596482811231565, - "loss": 0.4709, + "grad_norm": 1.8046875, + "learning_rate": 6.218572424774072e-05, + "loss": 0.5465, "step": 5585 }, { "epoch": 1.3456909003370245, - "grad_norm": 1.15625, - "learning_rate": 0.0001558588290087029, - "loss": 0.4787, + "grad_norm": 1.8359375, + "learning_rate": 6.214346067391097e-05, + "loss": 0.556, "step": 5590 }, { "epoch": 1.3468945594607606, - "grad_norm": 1.015625, - "learning_rate": 0.0001557528073548735, - "loss": 0.5334, + "grad_norm": 1.8046875, + "learning_rate": 6.210118810894153e-05, + "loss": 0.6122, "step": 5595 }, { "epoch": 1.348098218584497, - "grad_norm": 1.078125, - "learning_rate": 0.00015564676332022287, - "loss": 0.5084, + "grad_norm": 2.125, + "learning_rate": 6.205890662037314e-05, + "loss": 0.588, "step": 5600 }, { "epoch": 1.349301877708233, - "grad_norm": 1.015625, - "learning_rate": 0.00015554069707418217, - "loss": 0.4736, + "grad_norm": 1.8515625, + "learning_rate": 6.201661627576077e-05, + "loss": 0.5601, "step": 5605 }, { "epoch": 1.3505055368319692, - "grad_norm": 1.109375, - "learning_rate": 0.00015543460878621805, - "loss": 0.49, + "grad_norm": 1.9453125, + "learning_rate": 6.197431714267353e-05, + "loss": 0.5656, "step": 5610 }, { "epoch": 1.3517091959557053, - "grad_norm": 0.9375, - "learning_rate": 0.00015532849862583245, - "loss": 0.4873, + "grad_norm": 1.7421875, + "learning_rate": 6.193200928869461e-05, + "loss": 0.5655, "step": 5615 }, { "epoch": 1.3529128550794414, - "grad_norm": 1.0625, - "learning_rate": 0.00015522236676256216, - "loss": 0.4829, + "grad_norm": 1.8515625, + "learning_rate": 6.188969278142111e-05, + "loss": 0.5607, "step": 5620 }, { "epoch": 1.3541165142031777, - "grad_norm": 1.0390625, - "learning_rate": 0.00015511621336597876, - "loss": 0.5117, + "grad_norm": 1.9453125, + "learning_rate": 6.184736768846398e-05, + "loss": 0.5934, "step": 5625 }, { "epoch": 1.3553201733269138, - "grad_norm": 1.0078125, - "learning_rate": 0.00015501003860568809, - "loss": 0.4813, + "grad_norm": 1.765625, + "learning_rate": 6.180503407744784e-05, + "loss": 0.5679, "step": 5630 }, { "epoch": 1.35652383245065, - "grad_norm": 1.0390625, - "learning_rate": 0.00015490384265133021, - "loss": 0.4823, + "grad_norm": 1.7890625, + "learning_rate": 6.176269201601095e-05, + "loss": 0.5609, "step": 5635 }, { "epoch": 1.3577274915743862, - "grad_norm": 1.0, - "learning_rate": 0.00015479762567257904, - "loss": 0.5006, + "grad_norm": 1.7421875, + "learning_rate": 6.172034157180509e-05, + "loss": 0.5771, "step": 5640 }, { "epoch": 1.3589311506981223, - "grad_norm": 1.09375, - "learning_rate": 0.00015469138783914208, - "loss": 0.5248, + "grad_norm": 1.7890625, + "learning_rate": 6.16779828124954e-05, + "loss": 0.6023, "step": 5645 }, { "epoch": 1.3601348098218584, - "grad_norm": 1.0234375, - "learning_rate": 0.0001545851293207602, - "loss": 0.5195, + "grad_norm": 1.8515625, + "learning_rate": 6.163561580576032e-05, + "loss": 0.6001, "step": 5650 }, { "epoch": 1.3613384689455947, - "grad_norm": 1.0546875, - "learning_rate": 0.0001544788502872072, - "loss": 0.4864, + "grad_norm": 1.8203125, + "learning_rate": 6.159324061929146e-05, + "loss": 0.5612, "step": 5655 }, { "epoch": 1.3625421280693308, - "grad_norm": 0.97265625, - "learning_rate": 0.00015437255090828983, - "loss": 0.4865, + "grad_norm": 1.7890625, + "learning_rate": 6.155085732079353e-05, + "loss": 0.5683, "step": 5660 }, { "epoch": 1.363745787193067, - "grad_norm": 1.03125, - "learning_rate": 0.0001542662313538471, - "loss": 0.4892, + "grad_norm": 1.8515625, + "learning_rate": 6.150846597798414e-05, + "loss": 0.5708, "step": 5665 }, { "epoch": 1.364949446316803, - "grad_norm": 1.0390625, - "learning_rate": 0.0001541598917937505, - "loss": 0.4824, + "grad_norm": 1.7421875, + "learning_rate": 6.14660666585938e-05, + "loss": 0.561, "step": 5670 }, { "epoch": 1.3661531054405391, - "grad_norm": 1.0390625, - "learning_rate": 0.00015405353239790343, - "loss": 0.4895, + "grad_norm": 1.734375, + "learning_rate": 6.142365943036578e-05, + "loss": 0.568, "step": 5675 }, { "epoch": 1.3673567645642755, - "grad_norm": 0.8828125, - "learning_rate": 0.00015394715333624088, - "loss": 0.4622, + "grad_norm": 1.6015625, + "learning_rate": 6.138124436105594e-05, + "loss": 0.5471, "step": 5680 }, { "epoch": 1.3685604236880116, - "grad_norm": 0.94921875, - "learning_rate": 0.00015384075477872927, - "loss": 0.5052, + "grad_norm": 1.7578125, + "learning_rate": 6.133882151843267e-05, + "loss": 0.5879, "step": 5685 }, { "epoch": 1.3697640828117477, - "grad_norm": 0.95703125, - "learning_rate": 0.00015373433689536627, - "loss": 0.454, + "grad_norm": 1.859375, + "learning_rate": 6.129639097027681e-05, + "loss": 0.5363, "step": 5690 }, { "epoch": 1.370967741935484, - "grad_norm": 1.0625, - "learning_rate": 0.0001536278998561804, - "loss": 0.5243, + "grad_norm": 1.8046875, + "learning_rate": 6.125395278438152e-05, + "loss": 0.6045, "step": 5695 }, { "epoch": 1.37217140105922, - "grad_norm": 1.046875, - "learning_rate": 0.00015352144383123074, - "loss": 0.4948, + "grad_norm": 1.9375, + "learning_rate": 6.121150702855211e-05, + "loss": 0.5747, "step": 5700 }, { "epoch": 1.3733750601829562, - "grad_norm": 1.0234375, - "learning_rate": 0.00015341496899060677, - "loss": 0.4879, + "grad_norm": 1.8515625, + "learning_rate": 6.116905377060605e-05, + "loss": 0.5699, "step": 5705 }, { "epoch": 1.3745787193066923, - "grad_norm": 0.98828125, - "learning_rate": 0.00015330847550442788, - "loss": 0.4649, + "grad_norm": 1.7734375, + "learning_rate": 6.112659307837273e-05, + "loss": 0.5496, "step": 5710 }, { "epoch": 1.3757823784304284, - "grad_norm": 1.0703125, - "learning_rate": 0.00015320196354284347, - "loss": 0.509, + "grad_norm": 1.9296875, + "learning_rate": 6.108412501969346e-05, + "loss": 0.5902, "step": 5715 }, { "epoch": 1.3769860375541647, - "grad_norm": 0.94140625, - "learning_rate": 0.00015309543327603228, - "loss": 0.4778, + "grad_norm": 1.734375, + "learning_rate": 6.104164966242133e-05, + "loss": 0.559, "step": 5720 }, { "epoch": 1.3781896966779008, - "grad_norm": 1.0, - "learning_rate": 0.00015298888487420243, - "loss": 0.4932, + "grad_norm": 1.8046875, + "learning_rate": 6.0999167074421053e-05, + "loss": 0.5762, "step": 5725 }, { "epoch": 1.379393355801637, - "grad_norm": 1.0234375, - "learning_rate": 0.00015288231850759093, - "loss": 0.4759, + "grad_norm": 1.734375, + "learning_rate": 6.095667732356893e-05, + "loss": 0.5563, "step": 5730 }, { "epoch": 1.3805970149253732, - "grad_norm": 1.0234375, - "learning_rate": 0.00015277573434646348, - "loss": 0.5073, + "grad_norm": 1.859375, + "learning_rate": 6.0914180477752677e-05, + "loss": 0.5903, "step": 5735 }, { "epoch": 1.3818006740491093, - "grad_norm": 0.94921875, - "learning_rate": 0.00015266913256111426, - "loss": 0.4987, + "grad_norm": 1.6953125, + "learning_rate": 6.0871676604871354e-05, + "loss": 0.5749, "step": 5740 }, { "epoch": 1.3830043331728454, - "grad_norm": 0.90625, - "learning_rate": 0.0001525625133218656, - "loss": 0.457, + "grad_norm": 1.65625, + "learning_rate": 6.082916577283527e-05, + "loss": 0.5362, "step": 5745 }, { "epoch": 1.3842079922965815, - "grad_norm": 0.96875, - "learning_rate": 0.00015245587679906775, - "loss": 0.4982, + "grad_norm": 1.9296875, + "learning_rate": 6.078664804956582e-05, + "loss": 0.5791, "step": 5750 }, { "epoch": 1.3854116514203176, - "grad_norm": 0.9296875, - "learning_rate": 0.0001523492231630985, - "loss": 0.4992, + "grad_norm": 1.59375, + "learning_rate": 6.074412350299544e-05, + "loss": 0.5773, "step": 5755 }, { "epoch": 1.386615310544054, - "grad_norm": 1.0390625, - "learning_rate": 0.00015224255258436306, - "loss": 0.5156, + "grad_norm": 1.9765625, + "learning_rate": 6.070159220106747e-05, + "loss": 0.5955, "step": 5760 }, { "epoch": 1.38781896966779, - "grad_norm": 0.9453125, - "learning_rate": 0.0001521358652332936, - "loss": 0.4694, + "grad_norm": 1.6796875, + "learning_rate": 6.065905421173599e-05, + "loss": 0.5525, "step": 5765 }, { "epoch": 1.3890226287915262, - "grad_norm": 1.0390625, - "learning_rate": 0.00015202916128034916, - "loss": 0.4726, + "grad_norm": 1.75, + "learning_rate": 6.0616509602965814e-05, + "loss": 0.5571, "step": 5770 }, { "epoch": 1.3902262879152625, - "grad_norm": 1.015625, - "learning_rate": 0.00015192244089601536, - "loss": 0.5051, + "grad_norm": 1.7734375, + "learning_rate": 6.0573958442732336e-05, + "loss": 0.5879, "step": 5775 }, { "epoch": 1.3914299470389986, - "grad_norm": 1.0546875, - "learning_rate": 0.000151815704250804, - "loss": 0.4599, + "grad_norm": 1.7890625, + "learning_rate": 6.0531400799021386e-05, + "loss": 0.5356, "step": 5780 }, { "epoch": 1.3926336061627347, - "grad_norm": 0.99609375, - "learning_rate": 0.00015170895151525287, - "loss": 0.5049, + "grad_norm": 1.875, + "learning_rate": 6.0488836739829186e-05, + "loss": 0.593, "step": 5785 }, { "epoch": 1.393837265286471, - "grad_norm": 1.0625, - "learning_rate": 0.00015160218285992547, - "loss": 0.4865, + "grad_norm": 1.8046875, + "learning_rate": 6.044626633316218e-05, + "loss": 0.5648, "step": 5790 }, { "epoch": 1.395040924410207, - "grad_norm": 0.98828125, - "learning_rate": 0.00015149539845541073, - "loss": 0.4782, + "grad_norm": 1.75, + "learning_rate": 6.040368964703695e-05, + "loss": 0.5559, "step": 5795 }, { "epoch": 1.3962445835339432, - "grad_norm": 0.96484375, - "learning_rate": 0.00015138859847232277, - "loss": 0.4875, + "grad_norm": 1.7734375, + "learning_rate": 6.036110674948015e-05, + "loss": 0.5578, "step": 5800 }, { "epoch": 1.3974482426576793, - "grad_norm": 0.93359375, - "learning_rate": 0.0001512817830813006, - "loss": 0.5089, + "grad_norm": 1.65625, + "learning_rate": 6.031851770852833e-05, + "loss": 0.5853, "step": 5805 }, { "epoch": 1.3986519017814154, - "grad_norm": 1.0078125, - "learning_rate": 0.00015117495245300783, - "loss": 0.5131, + "grad_norm": 1.84375, + "learning_rate": 6.027592259222785e-05, + "loss": 0.5863, "step": 5810 }, { "epoch": 1.3998555609051517, - "grad_norm": 1.046875, - "learning_rate": 0.0001510681067581324, - "loss": 0.4927, + "grad_norm": 1.7265625, + "learning_rate": 6.0233321468634804e-05, + "loss": 0.5745, "step": 5815 }, { "epoch": 1.4010592200288878, - "grad_norm": 1.03125, - "learning_rate": 0.0001509612461673863, - "loss": 0.488, + "grad_norm": 1.8203125, + "learning_rate": 6.019071440581483e-05, + "loss": 0.5631, "step": 5820 }, { "epoch": 1.402262879152624, - "grad_norm": 1.0, - "learning_rate": 0.00015085437085150545, - "loss": 0.4898, + "grad_norm": 1.8203125, + "learning_rate": 6.014810147184313e-05, + "loss": 0.573, "step": 5825 }, { "epoch": 1.4034665382763603, - "grad_norm": 0.99609375, - "learning_rate": 0.00015074748098124912, - "loss": 0.4959, + "grad_norm": 1.6171875, + "learning_rate": 6.0105482734804226e-05, + "loss": 0.5808, "step": 5830 }, { "epoch": 1.4046701974000964, - "grad_norm": 1.0234375, - "learning_rate": 0.00015064057672739995, - "loss": 0.5057, + "grad_norm": 1.8359375, + "learning_rate": 6.006285826279195e-05, + "loss": 0.583, "step": 5835 }, { "epoch": 1.4058738565238325, - "grad_norm": 1.0078125, - "learning_rate": 0.00015053365826076364, - "loss": 0.4903, + "grad_norm": 1.7734375, + "learning_rate": 6.002022812390929e-05, + "loss": 0.5752, "step": 5840 }, { "epoch": 1.4070775156475686, - "grad_norm": 1.09375, - "learning_rate": 0.00015042672575216832, - "loss": 0.4555, + "grad_norm": 1.9453125, + "learning_rate": 5.9977592386268245e-05, + "loss": 0.5391, "step": 5845 }, { "epoch": 1.4082811747713047, - "grad_norm": 0.8984375, - "learning_rate": 0.00015031977937246478, - "loss": 0.457, + "grad_norm": 1.75, + "learning_rate": 5.99349511179898e-05, + "loss": 0.5429, "step": 5850 }, { "epoch": 1.409484833895041, - "grad_norm": 0.99609375, - "learning_rate": 0.00015021281929252598, - "loss": 0.4664, + "grad_norm": 1.84375, + "learning_rate": 5.98923043872038e-05, + "loss": 0.5496, "step": 5855 }, { "epoch": 1.410688493018777, - "grad_norm": 0.94921875, - "learning_rate": 0.00015010584568324667, - "loss": 0.46, + "grad_norm": 1.78125, + "learning_rate": 5.984965226204877e-05, + "loss": 0.5354, "step": 5860 }, { "epoch": 1.4118921521425132, - "grad_norm": 1.09375, - "learning_rate": 0.00014999885871554326, - "loss": 0.4646, + "grad_norm": 1.7265625, + "learning_rate": 5.9806994810671885e-05, + "loss": 0.5414, "step": 5865 }, { "epoch": 1.4130958112662495, - "grad_norm": 1.0859375, - "learning_rate": 0.0001498918585603535, - "loss": 0.4671, + "grad_norm": 1.8671875, + "learning_rate": 5.976433210122879e-05, + "loss": 0.5518, "step": 5870 }, { "epoch": 1.4142994703899856, - "grad_norm": 0.97265625, - "learning_rate": 0.00014978484538863613, - "loss": 0.4614, + "grad_norm": 1.796875, + "learning_rate": 5.972166420188357e-05, + "loss": 0.5437, "step": 5875 }, { "epoch": 1.4155031295137217, - "grad_norm": 1.0625, - "learning_rate": 0.00014967781937137088, - "loss": 0.4714, + "grad_norm": 1.9921875, + "learning_rate": 5.96789911808086e-05, + "loss": 0.5527, "step": 5880 }, { "epoch": 1.4167067886374578, - "grad_norm": 1.046875, - "learning_rate": 0.00014957078067955786, - "loss": 0.4589, + "grad_norm": 1.84375, + "learning_rate": 5.963631310618443e-05, + "loss": 0.5396, "step": 5885 }, { "epoch": 1.417910447761194, - "grad_norm": 0.9765625, - "learning_rate": 0.0001494637294842174, - "loss": 0.4809, + "grad_norm": 1.8359375, + "learning_rate": 5.9593630046199664e-05, + "loss": 0.5587, "step": 5890 }, { "epoch": 1.4191141068849302, - "grad_norm": 1.0078125, - "learning_rate": 0.00014935666595639, - "loss": 0.4796, + "grad_norm": 1.7578125, + "learning_rate": 5.955094206905092e-05, + "loss": 0.5626, "step": 5895 }, { "epoch": 1.4203177660086663, - "grad_norm": 0.9609375, - "learning_rate": 0.00014924959026713555, - "loss": 0.4601, + "grad_norm": 1.65625, + "learning_rate": 5.950824924294259e-05, + "loss": 0.5302, "step": 5900 }, { "epoch": 1.4215214251324024, - "grad_norm": 0.98046875, - "learning_rate": 0.0001491425025875337, - "loss": 0.4703, + "grad_norm": 1.8828125, + "learning_rate": 5.946555163608692e-05, + "loss": 0.5493, "step": 5905 }, { "epoch": 1.4227250842561388, - "grad_norm": 0.9921875, - "learning_rate": 0.000149035403088683, - "loss": 0.4881, + "grad_norm": 1.71875, + "learning_rate": 5.942284931670372e-05, + "loss": 0.5716, "step": 5910 }, { "epoch": 1.4239287433798749, - "grad_norm": 0.89453125, - "learning_rate": 0.00014892829194170111, - "loss": 0.5071, + "grad_norm": 1.703125, + "learning_rate": 5.9380142353020346e-05, + "loss": 0.5909, "step": 5915 }, { "epoch": 1.425132402503611, - "grad_norm": 1.0390625, - "learning_rate": 0.00014882116931772408, - "loss": 0.4847, + "grad_norm": 1.8828125, + "learning_rate": 5.933743081327158e-05, + "loss": 0.5614, "step": 5920 }, { "epoch": 1.4263360616273473, - "grad_norm": 1.0078125, - "learning_rate": 0.00014871403538790649, - "loss": 0.4635, + "grad_norm": 1.8125, + "learning_rate": 5.9294714765699514e-05, + "loss": 0.5447, "step": 5925 }, { "epoch": 1.4275397207510834, - "grad_norm": 0.984375, - "learning_rate": 0.0001486068903234208, - "loss": 0.4724, + "grad_norm": 1.765625, + "learning_rate": 5.925199427855343e-05, + "loss": 0.5522, "step": 5930 }, { "epoch": 1.4287433798748195, - "grad_norm": 0.984375, - "learning_rate": 0.00014849973429545744, - "loss": 0.4836, + "grad_norm": 1.75, + "learning_rate": 5.9209269420089735e-05, + "loss": 0.566, "step": 5935 }, { "epoch": 1.4299470389985556, - "grad_norm": 0.94921875, - "learning_rate": 0.0001483925674752242, - "loss": 0.4912, + "grad_norm": 1.6640625, + "learning_rate": 5.916654025857179e-05, + "loss": 0.5696, "step": 5940 }, { "epoch": 1.4311506981222917, - "grad_norm": 1.0, - "learning_rate": 0.0001482853900339463, - "loss": 0.4868, + "grad_norm": 1.9140625, + "learning_rate": 5.912380686226985e-05, + "loss": 0.5718, "step": 5945 }, { "epoch": 1.432354357246028, - "grad_norm": 1.0390625, - "learning_rate": 0.00014817820214286568, - "loss": 0.4723, + "grad_norm": 1.8359375, + "learning_rate": 5.908106929946091e-05, + "loss": 0.5587, "step": 5950 }, { "epoch": 1.433558016369764, - "grad_norm": 0.9453125, - "learning_rate": 0.00014807100397324122, - "loss": 0.5071, + "grad_norm": 1.625, + "learning_rate": 5.903832763842866e-05, + "loss": 0.591, "step": 5955 }, { "epoch": 1.4347616754935002, - "grad_norm": 1.046875, - "learning_rate": 0.00014796379569634804, - "loss": 0.4888, + "grad_norm": 1.859375, + "learning_rate": 5.8995581947463295e-05, + "loss": 0.5719, "step": 5960 }, { "epoch": 1.4359653346172365, - "grad_norm": 0.953125, - "learning_rate": 0.00014785657748347752, - "loss": 0.4806, + "grad_norm": 1.828125, + "learning_rate": 5.8952832294861496e-05, + "loss": 0.5658, "step": 5965 }, { "epoch": 1.4371689937409726, - "grad_norm": 1.0078125, - "learning_rate": 0.00014774934950593686, - "loss": 0.4639, + "grad_norm": 1.8828125, + "learning_rate": 5.891007874892622e-05, + "loss": 0.5402, "step": 5970 }, { "epoch": 1.4383726528647087, - "grad_norm": 1.078125, - "learning_rate": 0.00014764211193504895, - "loss": 0.4824, + "grad_norm": 1.953125, + "learning_rate": 5.8867321377966717e-05, + "loss": 0.574, "step": 5975 }, { "epoch": 1.4395763119884448, - "grad_norm": 1.0546875, - "learning_rate": 0.0001475348649421518, - "loss": 0.5016, + "grad_norm": 1.8203125, + "learning_rate": 5.882456025029825e-05, + "loss": 0.588, "step": 5980 }, { "epoch": 1.440779971112181, - "grad_norm": 1.0546875, - "learning_rate": 0.0001474276086985987, - "loss": 0.4922, + "grad_norm": 1.8203125, + "learning_rate": 5.878179543424218e-05, + "loss": 0.5719, "step": 5985 }, { "epoch": 1.4419836302359172, - "grad_norm": 0.92578125, - "learning_rate": 0.00014732034337575767, - "loss": 0.4747, + "grad_norm": 1.6796875, + "learning_rate": 5.873902699812571e-05, + "loss": 0.556, "step": 5990 }, { "epoch": 1.4431872893596533, - "grad_norm": 1.0703125, - "learning_rate": 0.00014721306914501113, - "loss": 0.5108, + "grad_norm": 1.859375, + "learning_rate": 5.869625501028182e-05, + "loss": 0.5931, "step": 5995 }, { "epoch": 1.4443909484833894, - "grad_norm": 0.9921875, - "learning_rate": 0.00014710578617775584, - "loss": 0.4907, + "grad_norm": 1.8203125, + "learning_rate": 5.865347953904921e-05, + "loss": 0.5726, "step": 6000 }, { "epoch": 1.4443909484833894, - "eval_loss": 0.4240965247154236, - "eval_runtime": 2.3273, - "eval_samples_per_second": 85.935, - "eval_steps_per_second": 85.935, + "eval_loss": 0.4834999740123749, + "eval_runtime": 2.3838, + "eval_samples_per_second": 83.899, + "eval_steps_per_second": 83.899, "step": 6000 }, { "epoch": 1.4455946076071258, - "grad_norm": 1.0078125, - "learning_rate": 0.0001469984946454024, - "loss": 0.4903, + "grad_norm": 1.8203125, + "learning_rate": 5.861070065277209e-05, + "loss": 0.5762, "step": 6005 }, { "epoch": 1.4467982667308619, - "grad_norm": 1.015625, - "learning_rate": 0.0001468911947193753, - "loss": 0.468, + "grad_norm": 1.6484375, + "learning_rate": 5.8567918419800176e-05, + "loss": 0.5525, "step": 6010 }, { "epoch": 1.448001925854598, - "grad_norm": 1.0859375, - "learning_rate": 0.00014678388657111223, - "loss": 0.4715, + "grad_norm": 1.890625, + "learning_rate": 5.852513290848851e-05, + "loss": 0.5484, "step": 6015 }, { "epoch": 1.449205584978334, - "grad_norm": 1.0390625, - "learning_rate": 0.0001466765703720641, - "loss": 0.4654, + "grad_norm": 1.765625, + "learning_rate": 5.848234418719734e-05, + "loss": 0.5445, "step": 6020 }, { "epoch": 1.4504092441020702, - "grad_norm": 0.953125, - "learning_rate": 0.00014656924629369473, - "loss": 0.5008, + "grad_norm": 1.7109375, + "learning_rate": 5.843955232429211e-05, + "loss": 0.5799, "step": 6025 }, { "epoch": 1.4516129032258065, - "grad_norm": 0.8984375, - "learning_rate": 0.00014646191450748045, - "loss": 0.456, + "grad_norm": 1.6953125, + "learning_rate": 5.8396757388143224e-05, + "loss": 0.5382, "step": 6030 }, { "epoch": 1.4528165623495426, - "grad_norm": 0.94140625, - "learning_rate": 0.00014635457518490994, - "loss": 0.4842, + "grad_norm": 1.7109375, + "learning_rate": 5.8353959447126025e-05, + "loss": 0.5697, "step": 6035 }, { "epoch": 1.4540202214732787, - "grad_norm": 0.95703125, - "learning_rate": 0.00014624722849748397, - "loss": 0.447, + "grad_norm": 1.90625, + "learning_rate": 5.831115856962066e-05, + "loss": 0.5324, "step": 6040 }, { "epoch": 1.455223880597015, - "grad_norm": 0.94140625, - "learning_rate": 0.00014613987461671498, - "loss": 0.4667, + "grad_norm": 1.7890625, + "learning_rate": 5.826835482401195e-05, + "loss": 0.5493, "step": 6045 }, { "epoch": 1.4564275397207511, - "grad_norm": 1.0234375, - "learning_rate": 0.00014603251371412697, - "loss": 0.4852, + "grad_norm": 1.7421875, + "learning_rate": 5.8225548278689315e-05, + "loss": 0.5675, "step": 6050 }, { "epoch": 1.4576311988444872, - "grad_norm": 0.95703125, - "learning_rate": 0.0001459251459612551, - "loss": 0.5156, + "grad_norm": 1.6640625, + "learning_rate": 5.8182739002046636e-05, + "loss": 0.5948, "step": 6055 }, { "epoch": 1.4588348579682233, - "grad_norm": 1.046875, - "learning_rate": 0.00014581777152964555, - "loss": 0.4844, + "grad_norm": 1.8046875, + "learning_rate": 5.813992706248215e-05, + "loss": 0.5652, "step": 6060 }, { "epoch": 1.4600385170919594, - "grad_norm": 0.9921875, - "learning_rate": 0.00014571039059085516, - "loss": 0.4777, + "grad_norm": 1.8671875, + "learning_rate": 5.8097112528398386e-05, + "loss": 0.5612, "step": 6065 }, { "epoch": 1.4612421762156957, - "grad_norm": 0.9921875, - "learning_rate": 0.0001456030033164511, - "loss": 0.4674, + "grad_norm": 1.8046875, + "learning_rate": 5.805429546820198e-05, + "loss": 0.5453, "step": 6070 }, { "epoch": 1.4624458353394318, - "grad_norm": 1.0703125, - "learning_rate": 0.00014549560987801074, - "loss": 0.4813, + "grad_norm": 1.7734375, + "learning_rate": 5.801147595030362e-05, + "loss": 0.564, "step": 6075 }, { "epoch": 1.463649494463168, - "grad_norm": 1.0390625, - "learning_rate": 0.00014538821044712128, - "loss": 0.5192, + "grad_norm": 1.765625, + "learning_rate": 5.7968654043117935e-05, + "loss": 0.6056, "step": 6080 }, { "epoch": 1.4648531535869043, - "grad_norm": 0.98046875, - "learning_rate": 0.00014528080519537933, - "loss": 0.4662, + "grad_norm": 1.734375, + "learning_rate": 5.792582981506331e-05, + "loss": 0.555, "step": 6085 }, { "epoch": 1.4660568127106404, - "grad_norm": 0.89453125, - "learning_rate": 0.00014517339429439115, - "loss": 0.4644, + "grad_norm": 1.6796875, + "learning_rate": 5.788300333456193e-05, + "loss": 0.546, "step": 6090 }, { "epoch": 1.4672604718343765, - "grad_norm": 1.0234375, - "learning_rate": 0.0001450659779157717, - "loss": 0.489, + "grad_norm": 1.8203125, + "learning_rate": 5.784017467003951e-05, + "loss": 0.5716, "step": 6095 }, { "epoch": 1.4684641309581128, - "grad_norm": 0.97265625, - "learning_rate": 0.00014495855623114485, - "loss": 0.4889, + "grad_norm": 1.7578125, + "learning_rate": 5.779734388992527e-05, + "loss": 0.5661, "step": 6100 }, { "epoch": 1.469667790081849, - "grad_norm": 1.046875, - "learning_rate": 0.000144851129412143, - "loss": 0.4869, + "grad_norm": 1.8828125, + "learning_rate": 5.7754511062651863e-05, + "loss": 0.57, "step": 6105 }, { "epoch": 1.470871449205585, - "grad_norm": 0.96484375, - "learning_rate": 0.00014474369763040648, - "loss": 0.4616, + "grad_norm": 1.703125, + "learning_rate": 5.7711676256655096e-05, + "loss": 0.5395, "step": 6110 }, { "epoch": 1.472075108329321, - "grad_norm": 0.95703125, - "learning_rate": 0.00014463626105758388, - "loss": 0.4959, + "grad_norm": 1.6953125, + "learning_rate": 5.766883954037406e-05, + "loss": 0.577, "step": 6115 }, { "epoch": 1.4732787674530572, - "grad_norm": 1.0390625, - "learning_rate": 0.00014452881986533127, - "loss": 0.4974, + "grad_norm": 1.7578125, + "learning_rate": 5.762600098225082e-05, + "loss": 0.573, "step": 6120 }, { "epoch": 1.4744824265767935, - "grad_norm": 1.0078125, - "learning_rate": 0.00014442137422531216, - "loss": 0.4752, + "grad_norm": 1.78125, + "learning_rate": 5.7583160650730445e-05, + "loss": 0.5609, "step": 6125 }, { "epoch": 1.4756860857005296, - "grad_norm": 1.0546875, - "learning_rate": 0.00014431392430919705, - "loss": 0.483, + "grad_norm": 1.7890625, + "learning_rate": 5.7540318614260756e-05, + "loss": 0.568, "step": 6130 }, { "epoch": 1.4768897448242657, - "grad_norm": 1.0859375, - "learning_rate": 0.00014420647028866342, - "loss": 0.5015, + "grad_norm": 2.03125, + "learning_rate": 5.7497474941292365e-05, + "loss": 0.5868, "step": 6135 }, { "epoch": 1.478093403948002, - "grad_norm": 1.0625, - "learning_rate": 0.0001440990123353953, - "loss": 0.5044, + "grad_norm": 1.8359375, + "learning_rate": 5.74546297002785e-05, + "loss": 0.5808, "step": 6140 }, { "epoch": 1.4792970630717381, - "grad_norm": 0.97265625, - "learning_rate": 0.00014399155062108285, - "loss": 0.5025, + "grad_norm": 1.765625, + "learning_rate": 5.741178295967483e-05, + "loss": 0.5857, "step": 6145 }, { "epoch": 1.4805007221954742, - "grad_norm": 1.1640625, - "learning_rate": 0.0001438840853174224, - "loss": 0.5176, + "grad_norm": 1.9609375, + "learning_rate": 5.7368934787939496e-05, + "loss": 0.6007, "step": 6150 }, { "epoch": 1.4817043813192103, - "grad_norm": 0.92578125, - "learning_rate": 0.00014377661659611596, - "loss": 0.4469, + "grad_norm": 1.75, + "learning_rate": 5.732608525353289e-05, + "loss": 0.5295, "step": 6155 }, { "epoch": 1.4829080404429464, - "grad_norm": 0.90234375, - "learning_rate": 0.00014366914462887102, - "loss": 0.4655, + "grad_norm": 1.6484375, + "learning_rate": 5.728323442491758e-05, + "loss": 0.5456, "step": 6160 }, { "epoch": 1.4841116995666828, - "grad_norm": 1.1015625, - "learning_rate": 0.0001435616695874001, - "loss": 0.492, + "grad_norm": 1.8046875, + "learning_rate": 5.724038237055818e-05, + "loss": 0.5767, "step": 6165 }, { "epoch": 1.4853153586904189, - "grad_norm": 1.078125, - "learning_rate": 0.0001434541916434209, - "loss": 0.5039, + "grad_norm": 1.7890625, + "learning_rate": 5.7197529158921306e-05, + "loss": 0.5897, "step": 6170 }, { "epoch": 1.486519017814155, - "grad_norm": 1.046875, - "learning_rate": 0.0001433467109686556, - "loss": 0.4815, + "grad_norm": 1.8203125, + "learning_rate": 5.7154674858475426e-05, + "loss": 0.5648, "step": 6175 }, { "epoch": 1.4877226769378913, - "grad_norm": 1.0078125, - "learning_rate": 0.00014323922773483075, - "loss": 0.4836, + "grad_norm": 1.8046875, + "learning_rate": 5.7111819537690704e-05, + "loss": 0.568, "step": 6180 }, { "epoch": 1.4889263360616274, - "grad_norm": 0.98046875, - "learning_rate": 0.00014313174211367697, - "loss": 0.4795, + "grad_norm": 1.7421875, + "learning_rate": 5.7068963265038944e-05, + "loss": 0.5616, "step": 6185 }, { "epoch": 1.4901299951853635, - "grad_norm": 0.98828125, - "learning_rate": 0.00014302425427692878, - "loss": 0.476, + "grad_norm": 1.7421875, + "learning_rate": 5.702610610899351e-05, + "loss": 0.5613, "step": 6190 }, { "epoch": 1.4913336543090996, - "grad_norm": 1.140625, - "learning_rate": 0.00014291676439632414, - "loss": 0.4833, + "grad_norm": 1.8359375, + "learning_rate": 5.6983248138029126e-05, + "loss": 0.5591, "step": 6195 }, { "epoch": 1.4925373134328357, - "grad_norm": 1.1328125, - "learning_rate": 0.00014280927264360442, - "loss": 0.5057, + "grad_norm": 1.796875, + "learning_rate": 5.6940389420621875e-05, + "loss": 0.5878, "step": 6200 }, { "epoch": 1.493740972556572, - "grad_norm": 1.09375, - "learning_rate": 0.00014270177919051375, - "loss": 0.4653, + "grad_norm": 1.8671875, + "learning_rate": 5.689753002524897e-05, + "loss": 0.5478, "step": 6205 }, { "epoch": 1.4949446316803081, - "grad_norm": 1.0078125, - "learning_rate": 0.00014259428420879922, - "loss": 0.5059, + "grad_norm": 1.9453125, + "learning_rate": 5.6854670020388755e-05, + "loss": 0.5943, "step": 6210 }, { "epoch": 1.4961482908040442, - "grad_norm": 1.015625, - "learning_rate": 0.0001424867878702102, - "loss": 0.4602, + "grad_norm": 1.796875, + "learning_rate": 5.681180947452053e-05, + "loss": 0.54, "step": 6215 }, { "epoch": 1.4973519499277805, - "grad_norm": 0.984375, - "learning_rate": 0.0001423792903464983, - "loss": 0.4804, + "grad_norm": 1.8046875, + "learning_rate": 5.6768948456124446e-05, + "loss": 0.5636, "step": 6220 }, { "epoch": 1.4985556090515166, - "grad_norm": 1.09375, - "learning_rate": 0.000142271791809417, - "loss": 0.4603, + "grad_norm": 1.84375, + "learning_rate": 5.6726087033681435e-05, + "loss": 0.5474, "step": 6225 }, { "epoch": 1.4997592681752527, - "grad_norm": 0.921875, - "learning_rate": 0.0001421642924307214, - "loss": 0.4718, + "grad_norm": 1.734375, + "learning_rate": 5.668322527567306e-05, + "loss": 0.5508, "step": 6230 }, { "epoch": 1.500962927298989, - "grad_norm": 0.96484375, - "learning_rate": 0.00014205679238216796, - "loss": 0.4655, + "grad_norm": 1.6875, + "learning_rate": 5.664036325058142e-05, + "loss": 0.5451, "step": 6235 }, { "epoch": 1.502166586422725, - "grad_norm": 1.109375, - "learning_rate": 0.0001419492918355142, - "loss": 0.4922, + "grad_norm": 1.7578125, + "learning_rate": 5.659750102688905e-05, + "loss": 0.5744, "step": 6240 }, { "epoch": 1.5033702455464613, - "grad_norm": 1.078125, - "learning_rate": 0.00014184179096251844, - "loss": 0.4521, + "grad_norm": 1.7734375, + "learning_rate": 5.65546386730788e-05, + "loss": 0.5363, "step": 6245 }, { "epoch": 1.5045739046701974, - "grad_norm": 0.98046875, - "learning_rate": 0.00014173428993493947, - "loss": 0.4273, + "grad_norm": 1.9296875, + "learning_rate": 5.6511776257633695e-05, + "loss": 0.5122, "step": 6250 }, { "epoch": 1.5057775637939335, - "grad_norm": 1.0078125, - "learning_rate": 0.00014162678892453643, - "loss": 0.4752, + "grad_norm": 1.8203125, + "learning_rate": 5.646891384903691e-05, + "loss": 0.561, "step": 6255 }, { "epoch": 1.5069812229176698, - "grad_norm": 0.99609375, - "learning_rate": 0.00014151928810306836, - "loss": 0.4789, + "grad_norm": 1.7890625, + "learning_rate": 5.642605151577159e-05, + "loss": 0.5591, "step": 6260 }, { "epoch": 1.5081848820414059, - "grad_norm": 1.015625, - "learning_rate": 0.000141411787642294, - "loss": 0.4892, + "grad_norm": 1.8515625, + "learning_rate": 5.6383189326320746e-05, + "loss": 0.5762, "step": 6265 }, { "epoch": 1.509388541165142, - "grad_norm": 1.0, - "learning_rate": 0.00014130428771397157, - "loss": 0.4965, + "grad_norm": 1.78125, + "learning_rate": 5.634032734916717e-05, + "loss": 0.5803, "step": 6270 }, { "epoch": 1.5105922002888783, - "grad_norm": 0.953125, - "learning_rate": 0.00014119678848985837, - "loss": 0.4868, + "grad_norm": 1.765625, + "learning_rate": 5.629746565279332e-05, + "loss": 0.571, "step": 6275 }, { "epoch": 1.5117958594126142, - "grad_norm": 1.046875, - "learning_rate": 0.00014108929014171055, - "loss": 0.4735, + "grad_norm": 1.8203125, + "learning_rate": 5.625460430568118e-05, + "loss": 0.5594, "step": 6280 }, { "epoch": 1.5129995185363505, - "grad_norm": 1.015625, - "learning_rate": 0.000140981792841283, - "loss": 0.5023, + "grad_norm": 1.8203125, + "learning_rate": 5.6211743376312215e-05, + "loss": 0.5824, "step": 6285 }, { "epoch": 1.5142031776600868, - "grad_norm": 1.0, - "learning_rate": 0.00014087429676032883, - "loss": 0.4862, + "grad_norm": 1.875, + "learning_rate": 5.616888293316721e-05, + "loss": 0.567, "step": 6290 }, { "epoch": 1.5154068367838227, - "grad_norm": 0.9765625, - "learning_rate": 0.0001407668020705992, - "loss": 0.4565, + "grad_norm": 1.796875, + "learning_rate": 5.6126023044726146e-05, + "loss": 0.5406, "step": 6295 }, { "epoch": 1.516610495907559, - "grad_norm": 1.0234375, - "learning_rate": 0.00014065930894384307, - "loss": 0.4691, + "grad_norm": 1.7109375, + "learning_rate": 5.608316377946814e-05, + "loss": 0.552, "step": 6300 }, { "epoch": 1.5178141550312951, - "grad_norm": 1.109375, - "learning_rate": 0.00014055181755180687, - "loss": 0.4625, + "grad_norm": 1.9296875, + "learning_rate": 5.604030520587131e-05, + "loss": 0.5447, "step": 6305 }, { "epoch": 1.5190178141550312, - "grad_norm": 0.9921875, - "learning_rate": 0.00014044432806623432, - "loss": 0.488, + "grad_norm": 1.7421875, + "learning_rate": 5.5997447392412674e-05, + "loss": 0.573, "step": 6310 }, { "epoch": 1.5202214732787676, - "grad_norm": 0.94140625, - "learning_rate": 0.0001403368406588661, - "loss": 0.5025, + "grad_norm": 1.7578125, + "learning_rate": 5.595459040756804e-05, + "loss": 0.5796, "step": 6315 }, { "epoch": 1.5214251324025037, - "grad_norm": 1.125, - "learning_rate": 0.00014022935550143947, - "loss": 0.5103, + "grad_norm": 1.8515625, + "learning_rate": 5.5911734319811873e-05, + "loss": 0.5932, "step": 6320 }, { "epoch": 1.5226287915262398, - "grad_norm": 0.921875, - "learning_rate": 0.00014012187276568822, - "loss": 0.4723, + "grad_norm": 1.75, + "learning_rate": 5.586887919761723e-05, + "loss": 0.558, "step": 6325 }, { "epoch": 1.523832450649976, - "grad_norm": 1.0, - "learning_rate": 0.00014001439262334211, - "loss": 0.4889, + "grad_norm": 1.75, + "learning_rate": 5.5826025109455575e-05, + "loss": 0.5722, "step": 6330 }, { "epoch": 1.525036109773712, - "grad_norm": 1.0703125, - "learning_rate": 0.00013990691524612696, - "loss": 0.4893, + "grad_norm": 1.875, + "learning_rate": 5.578317212379678e-05, + "loss": 0.5709, "step": 6335 }, { "epoch": 1.5262397688974483, - "grad_norm": 0.96484375, - "learning_rate": 0.000139799440805764, - "loss": 0.4815, + "grad_norm": 1.7421875, + "learning_rate": 5.574032030910894e-05, + "loss": 0.5628, "step": 6340 }, { "epoch": 1.5274434280211844, - "grad_norm": 0.9765625, - "learning_rate": 0.00013969196947396988, - "loss": 0.4445, + "grad_norm": 1.75, + "learning_rate": 5.569746973385826e-05, + "loss": 0.5304, "step": 6345 }, { "epoch": 1.5286470871449205, - "grad_norm": 0.97265625, - "learning_rate": 0.0001395845014224562, - "loss": 0.4789, + "grad_norm": 1.609375, + "learning_rate": 5.565462046650896e-05, + "loss": 0.5613, "step": 6350 }, { "epoch": 1.5298507462686568, - "grad_norm": 1.015625, - "learning_rate": 0.00013947703682292936, - "loss": 0.4415, + "grad_norm": 1.859375, + "learning_rate": 5.5611772575523196e-05, + "loss": 0.5292, "step": 6355 }, { "epoch": 1.531054405392393, - "grad_norm": 0.95703125, - "learning_rate": 0.00013936957584709028, - "loss": 0.4717, + "grad_norm": 1.734375, + "learning_rate": 5.556892612936092e-05, + "loss": 0.5569, "step": 6360 }, { "epoch": 1.532258064516129, - "grad_norm": 1.0859375, - "learning_rate": 0.00013926211866663402, - "loss": 0.4453, + "grad_norm": 1.7421875, + "learning_rate": 5.552608119647977e-05, + "loss": 0.524, "step": 6365 }, { "epoch": 1.5334617236398653, - "grad_norm": 1.0390625, - "learning_rate": 0.0001391546654532496, - "loss": 0.4251, + "grad_norm": 1.8125, + "learning_rate": 5.548323784533496e-05, + "loss": 0.5069, "step": 6370 }, { "epoch": 1.5346653827636012, - "grad_norm": 0.94921875, - "learning_rate": 0.00013904721637861975, - "loss": 0.4904, + "grad_norm": 1.7578125, + "learning_rate": 5.544039614437918e-05, + "loss": 0.5754, "step": 6375 }, { "epoch": 1.5358690418873375, - "grad_norm": 0.8828125, - "learning_rate": 0.00013893977161442045, - "loss": 0.4357, + "grad_norm": 1.75, + "learning_rate": 5.539755616206247e-05, + "loss": 0.5214, "step": 6380 }, { "epoch": 1.5370727010110736, - "grad_norm": 1.0390625, - "learning_rate": 0.00013883233133232098, - "loss": 0.4781, + "grad_norm": 1.9140625, + "learning_rate": 5.535471796683216e-05, + "loss": 0.5615, "step": 6385 }, { "epoch": 1.5382763601348097, - "grad_norm": 0.9609375, - "learning_rate": 0.0001387248957039834, - "loss": 0.4717, + "grad_norm": 1.703125, + "learning_rate": 5.531188162713272e-05, + "loss": 0.5571, "step": 6390 }, { "epoch": 1.539480019258546, - "grad_norm": 1.03125, - "learning_rate": 0.0001386174649010622, - "loss": 0.5017, + "grad_norm": 1.8046875, + "learning_rate": 5.5269047211405604e-05, + "loss": 0.5801, "step": 6395 }, { "epoch": 1.5406836783822822, - "grad_norm": 0.96484375, - "learning_rate": 0.00013851003909520434, - "loss": 0.4545, + "grad_norm": 1.75, + "learning_rate": 5.5226214788089254e-05, + "loss": 0.5368, "step": 6400 }, { "epoch": 1.5418873375060183, - "grad_norm": 0.90234375, - "learning_rate": 0.00013840261845804867, - "loss": 0.4755, + "grad_norm": 1.78125, + "learning_rate": 5.518338442561888e-05, + "loss": 0.563, "step": 6405 }, { "epoch": 1.5430909966297546, - "grad_norm": 0.9375, - "learning_rate": 0.00013829520316122583, - "loss": 0.4928, + "grad_norm": 1.703125, + "learning_rate": 5.5140556192426436e-05, + "loss": 0.5753, "step": 6410 }, { "epoch": 1.5442946557534905, - "grad_norm": 1.046875, - "learning_rate": 0.00013818779337635797, - "loss": 0.4556, + "grad_norm": 1.890625, + "learning_rate": 5.509773015694046e-05, + "loss": 0.5442, "step": 6415 }, { "epoch": 1.5454983148772268, - "grad_norm": 0.91015625, - "learning_rate": 0.0001380803892750584, - "loss": 0.498, + "grad_norm": 1.6796875, + "learning_rate": 5.505490638758598e-05, + "loss": 0.5839, "step": 6420 }, { "epoch": 1.5467019740009629, - "grad_norm": 1.046875, - "learning_rate": 0.00013797299102893124, - "loss": 0.461, + "grad_norm": 1.734375, + "learning_rate": 5.501208495278439e-05, + "loss": 0.5473, "step": 6425 }, { "epoch": 1.547905633124699, - "grad_norm": 1.0546875, - "learning_rate": 0.0001378655988095715, - "loss": 0.5067, + "grad_norm": 2.015625, + "learning_rate": 5.4969265920953384e-05, + "loss": 0.5913, "step": 6430 }, { "epoch": 1.5491092922484353, - "grad_norm": 0.9609375, - "learning_rate": 0.0001377582127885643, - "loss": 0.4751, + "grad_norm": 1.71875, + "learning_rate": 5.492644936050678e-05, + "loss": 0.5544, "step": 6435 }, { "epoch": 1.5503129513721714, - "grad_norm": 0.9921875, - "learning_rate": 0.000137650833137485, - "loss": 0.4702, + "grad_norm": 1.7890625, + "learning_rate": 5.488363533985446e-05, + "loss": 0.5547, "step": 6440 }, { "epoch": 1.5515166104959075, - "grad_norm": 0.91015625, - "learning_rate": 0.0001375434600278988, - "loss": 0.4701, + "grad_norm": 1.7265625, + "learning_rate": 5.484082392740228e-05, + "loss": 0.55, "step": 6445 }, { "epoch": 1.5527202696196438, - "grad_norm": 1.0546875, - "learning_rate": 0.00013743609363136037, - "loss": 0.4813, + "grad_norm": 1.7734375, + "learning_rate": 5.479801519155188e-05, + "loss": 0.5659, "step": 6450 }, { "epoch": 1.55392392874338, - "grad_norm": 0.97265625, - "learning_rate": 0.00013732873411941368, - "loss": 0.476, + "grad_norm": 1.7109375, + "learning_rate": 5.475520920070066e-05, + "loss": 0.5658, "step": 6455 }, { "epoch": 1.555127587867116, - "grad_norm": 1.0234375, - "learning_rate": 0.00013722138166359177, - "loss": 0.4452, + "grad_norm": 1.8125, + "learning_rate": 5.4712406023241627e-05, + "loss": 0.5296, "step": 6460 }, { "epoch": 1.5563312469908523, - "grad_norm": 1.0859375, - "learning_rate": 0.00013711403643541624, - "loss": 0.4756, + "grad_norm": 1.96875, + "learning_rate": 5.466960572756325e-05, + "loss": 0.5646, "step": 6465 }, { "epoch": 1.5575349061145882, - "grad_norm": 1.0234375, - "learning_rate": 0.0001370066986063973, - "loss": 0.4644, + "grad_norm": 1.7421875, + "learning_rate": 5.4626808382049446e-05, + "loss": 0.5429, "step": 6470 }, { "epoch": 1.5587385652383245, - "grad_norm": 0.94140625, - "learning_rate": 0.00013689936834803331, - "loss": 0.4883, + "grad_norm": 1.734375, + "learning_rate": 5.4584014055079425e-05, + "loss": 0.5754, "step": 6475 }, { "epoch": 1.5599422243620606, - "grad_norm": 1.015625, - "learning_rate": 0.0001367920458318105, - "loss": 0.4638, + "grad_norm": 1.859375, + "learning_rate": 5.454122281502754e-05, + "loss": 0.5458, "step": 6480 }, { "epoch": 1.5611458834857967, - "grad_norm": 0.875, - "learning_rate": 0.0001366847312292027, - "loss": 0.479, + "grad_norm": 1.6953125, + "learning_rate": 5.449843473026321e-05, + "loss": 0.5646, "step": 6485 }, { "epoch": 1.562349542609533, - "grad_norm": 0.9296875, - "learning_rate": 0.00013657742471167114, - "loss": 0.5064, + "grad_norm": 1.6875, + "learning_rate": 5.4455649869150815e-05, + "loss": 0.5942, "step": 6490 }, { "epoch": 1.5635532017332692, - "grad_norm": 0.96875, - "learning_rate": 0.00013647012645066412, - "loss": 0.4636, + "grad_norm": 1.7421875, + "learning_rate": 5.441286830004962e-05, + "loss": 0.5503, "step": 6495 }, { "epoch": 1.5647568608570053, - "grad_norm": 1.0, - "learning_rate": 0.00013636283661761685, - "loss": 0.4949, + "grad_norm": 1.8984375, + "learning_rate": 5.437009009131361e-05, + "loss": 0.5812, "step": 6500 }, { "epoch": 1.5647568608570053, - "eval_loss": 0.40641745924949646, - "eval_runtime": 2.3286, - "eval_samples_per_second": 85.887, - "eval_steps_per_second": 85.887, + "eval_loss": 0.474599689245224, + "eval_runtime": 2.3786, + "eval_samples_per_second": 84.082, + "eval_steps_per_second": 84.082, "step": 6500 }, { "epoch": 1.5659605199807416, - "grad_norm": 1.015625, - "learning_rate": 0.00013625555538395088, - "loss": 0.4845, + "grad_norm": 1.7890625, + "learning_rate": 5.4327315311291376e-05, + "loss": 0.5735, "step": 6505 }, { "epoch": 1.5671641791044775, - "grad_norm": 1.03125, - "learning_rate": 0.00013614828292107418, - "loss": 0.4763, + "grad_norm": 1.75, + "learning_rate": 5.4284544028326075e-05, + "loss": 0.5622, "step": 6510 }, { "epoch": 1.5683678382282138, - "grad_norm": 1.015625, - "learning_rate": 0.00013604101940038057, - "loss": 0.4717, + "grad_norm": 1.7890625, + "learning_rate": 5.424177631075522e-05, + "loss": 0.5599, "step": 6515 }, { "epoch": 1.56957149735195, - "grad_norm": 0.9375, - "learning_rate": 0.00013593376499324968, - "loss": 0.469, + "grad_norm": 1.671875, + "learning_rate": 5.4199012226910674e-05, + "loss": 0.5592, "step": 6520 }, { "epoch": 1.570775156475686, - "grad_norm": 0.91796875, - "learning_rate": 0.00013582651987104665, - "loss": 0.4392, + "grad_norm": 1.6875, + "learning_rate": 5.41562518451185e-05, + "loss": 0.5172, "step": 6525 }, { "epoch": 1.5719788155994223, - "grad_norm": 0.98828125, - "learning_rate": 0.0001357192842051216, - "loss": 0.5062, + "grad_norm": 1.78125, + "learning_rate": 5.41134952336988e-05, + "loss": 0.5955, "step": 6530 }, { "epoch": 1.5731824747231584, - "grad_norm": 0.96875, - "learning_rate": 0.00013561205816680965, - "loss": 0.4877, + "grad_norm": 1.6328125, + "learning_rate": 5.407074246096569e-05, + "loss": 0.5673, "step": 6535 }, { "epoch": 1.5743861338468945, - "grad_norm": 0.95703125, - "learning_rate": 0.0001355048419274305, - "loss": 0.4602, + "grad_norm": 1.859375, + "learning_rate": 5.402799359522712e-05, + "loss": 0.5444, "step": 6540 }, { "epoch": 1.5755897929706308, - "grad_norm": 1.0, - "learning_rate": 0.00013539763565828826, - "loss": 0.4776, + "grad_norm": 1.8671875, + "learning_rate": 5.3985248704784854e-05, + "loss": 0.5643, "step": 6545 }, { "epoch": 1.5767934520943667, - "grad_norm": 0.98046875, - "learning_rate": 0.00013529043953067107, - "loss": 0.4627, + "grad_norm": 1.71875, + "learning_rate": 5.394250785793426e-05, + "loss": 0.5508, "step": 6550 }, { "epoch": 1.577997111218103, - "grad_norm": 1.0390625, - "learning_rate": 0.00013518325371585083, - "loss": 0.4619, + "grad_norm": 1.9453125, + "learning_rate": 5.3899771122964236e-05, + "loss": 0.5542, "step": 6555 }, { "epoch": 1.5792007703418391, - "grad_norm": 1.015625, - "learning_rate": 0.00013507607838508302, - "loss": 0.4584, + "grad_norm": 1.828125, + "learning_rate": 5.385703856815715e-05, + "loss": 0.5481, "step": 6560 }, { "epoch": 1.5804044294655752, - "grad_norm": 1.015625, - "learning_rate": 0.0001349689137096063, - "loss": 0.4611, + "grad_norm": 1.796875, + "learning_rate": 5.3814310261788656e-05, + "loss": 0.548, "step": 6565 }, { "epoch": 1.5816080885893116, - "grad_norm": 0.93359375, - "learning_rate": 0.0001348617598606424, - "loss": 0.477, + "grad_norm": 1.7578125, + "learning_rate": 5.377158627212763e-05, + "loss": 0.5631, "step": 6570 }, { "epoch": 1.5828117477130477, - "grad_norm": 1.015625, - "learning_rate": 0.00013475461700939573, - "loss": 0.4954, + "grad_norm": 1.71875, + "learning_rate": 5.3728866667436084e-05, + "loss": 0.5859, "step": 6575 }, { "epoch": 1.5840154068367838, - "grad_norm": 0.83984375, - "learning_rate": 0.00013464748532705296, - "loss": 0.4791, + "grad_norm": 1.609375, + "learning_rate": 5.368615151596895e-05, + "loss": 0.5639, "step": 6580 }, { "epoch": 1.58521906596052, - "grad_norm": 1.0, - "learning_rate": 0.00013454036498478322, - "loss": 0.4583, + "grad_norm": 1.7890625, + "learning_rate": 5.364344088597413e-05, + "loss": 0.5455, "step": 6585 }, { "epoch": 1.5864227250842562, - "grad_norm": 1.015625, - "learning_rate": 0.00013443325615373724, - "loss": 0.4773, + "grad_norm": 1.765625, + "learning_rate": 5.360073484569224e-05, + "loss": 0.5638, "step": 6590 }, { "epoch": 1.5876263842079923, - "grad_norm": 1.0, - "learning_rate": 0.0001343261590050475, - "loss": 0.452, + "grad_norm": 1.8671875, + "learning_rate": 5.355803346335659e-05, + "loss": 0.5402, "step": 6595 }, { "epoch": 1.5888300433317286, - "grad_norm": 1.1015625, - "learning_rate": 0.00013421907370982786, - "loss": 0.4573, + "grad_norm": 1.984375, + "learning_rate": 5.351533680719304e-05, + "loss": 0.5433, "step": 6600 }, { "epoch": 1.5900337024554645, - "grad_norm": 0.94140625, - "learning_rate": 0.0001341120004391731, - "loss": 0.4828, + "grad_norm": 1.828125, + "learning_rate": 5.347264494541991e-05, + "loss": 0.572, "step": 6605 }, { "epoch": 1.5912373615792008, - "grad_norm": 1.0, - "learning_rate": 0.00013400493936415887, - "loss": 0.4652, + "grad_norm": 1.8671875, + "learning_rate": 5.342995794624785e-05, + "loss": 0.5504, "step": 6610 }, { "epoch": 1.592441020702937, - "grad_norm": 1.03125, - "learning_rate": 0.00013389789065584132, - "loss": 0.5017, + "grad_norm": 1.75, + "learning_rate": 5.338727587787974e-05, + "loss": 0.5862, "step": 6615 }, { "epoch": 1.593644679826673, - "grad_norm": 1.0078125, - "learning_rate": 0.00013379085448525683, - "loss": 0.4661, + "grad_norm": 1.8203125, + "learning_rate": 5.334459880851056e-05, + "loss": 0.5455, "step": 6620 }, { "epoch": 1.5948483389504093, - "grad_norm": 1.03125, - "learning_rate": 0.00013368383102342184, - "loss": 0.4196, + "grad_norm": 1.8203125, + "learning_rate": 5.330192680632737e-05, + "loss": 0.4963, "step": 6625 }, { "epoch": 1.5960519980741454, - "grad_norm": 0.953125, - "learning_rate": 0.0001335768204413323, - "loss": 0.4707, + "grad_norm": 1.640625, + "learning_rate": 5.3259259939509045e-05, + "loss": 0.5513, "step": 6630 }, { "epoch": 1.5972556571978815, - "grad_norm": 0.9453125, - "learning_rate": 0.00013346982290996377, - "loss": 0.4274, + "grad_norm": 1.8359375, + "learning_rate": 5.3216598276226344e-05, + "loss": 0.5138, "step": 6635 }, { "epoch": 1.5984593163216179, - "grad_norm": 0.9453125, - "learning_rate": 0.00013336283860027084, - "loss": 0.4546, + "grad_norm": 1.734375, + "learning_rate": 5.317394188464164e-05, + "loss": 0.5406, "step": 6640 }, { "epoch": 1.5996629754453537, - "grad_norm": 0.98828125, - "learning_rate": 0.00013325586768318695, - "loss": 0.4824, + "grad_norm": 1.8359375, + "learning_rate": 5.31312908329089e-05, + "loss": 0.5713, "step": 6645 }, { "epoch": 1.60086663456909, - "grad_norm": 0.96875, - "learning_rate": 0.00013314891032962438, - "loss": 0.4687, + "grad_norm": 1.6875, + "learning_rate": 5.308864518917363e-05, + "loss": 0.5581, "step": 6650 }, { "epoch": 1.6020702936928262, - "grad_norm": 0.93359375, - "learning_rate": 0.00013304196671047334, - "loss": 0.4817, + "grad_norm": 1.8046875, + "learning_rate": 5.3046005021572554e-05, + "loss": 0.5687, "step": 6655 }, { "epoch": 1.6032739528165623, - "grad_norm": 0.9765625, - "learning_rate": 0.00013293503699660252, - "loss": 0.4961, + "grad_norm": 1.75, + "learning_rate": 5.300337039823381e-05, + "loss": 0.5819, "step": 6660 }, { "epoch": 1.6044776119402986, - "grad_norm": 0.890625, - "learning_rate": 0.00013282812135885803, - "loss": 0.4988, + "grad_norm": 1.703125, + "learning_rate": 5.296074138727653e-05, + "loss": 0.5857, "step": 6665 }, { "epoch": 1.6056812710640347, - "grad_norm": 1.0703125, - "learning_rate": 0.00013272121996806376, - "loss": 0.499, + "grad_norm": 1.90625, + "learning_rate": 5.291811805681099e-05, + "loss": 0.5859, "step": 6670 }, { "epoch": 1.6068849301877708, - "grad_norm": 0.9453125, - "learning_rate": 0.00013261433299502066, - "loss": 0.4834, + "grad_norm": 1.7578125, + "learning_rate": 5.2875500474938325e-05, + "loss": 0.5633, "step": 6675 }, { "epoch": 1.608088589311507, - "grad_norm": 1.0390625, - "learning_rate": 0.00013250746061050674, - "loss": 0.5136, + "grad_norm": 1.8671875, + "learning_rate": 5.2832888709750496e-05, + "loss": 0.6047, "step": 6680 }, { "epoch": 1.609292248435243, - "grad_norm": 0.98828125, - "learning_rate": 0.0001324006029852767, - "loss": 0.4877, + "grad_norm": 1.7890625, + "learning_rate": 5.279028282933021e-05, + "loss": 0.5807, "step": 6685 }, { "epoch": 1.6104959075589793, - "grad_norm": 0.97265625, - "learning_rate": 0.00013229376029006158, - "loss": 0.4488, + "grad_norm": 1.7734375, + "learning_rate": 5.274768290175072e-05, + "loss": 0.5383, "step": 6690 }, { "epoch": 1.6116995666827154, - "grad_norm": 0.97265625, - "learning_rate": 0.00013218693269556868, - "loss": 0.4624, + "grad_norm": 1.734375, + "learning_rate": 5.27050889950758e-05, + "loss": 0.5466, "step": 6695 }, { "epoch": 1.6129032258064515, - "grad_norm": 1.0234375, - "learning_rate": 0.00013208012037248102, - "loss": 0.4708, + "grad_norm": 1.9375, + "learning_rate": 5.2662501177359574e-05, + "loss": 0.5619, "step": 6700 }, { "epoch": 1.6141068849301878, - "grad_norm": 0.92578125, - "learning_rate": 0.00013197332349145738, - "loss": 0.4808, + "grad_norm": 1.6875, + "learning_rate": 5.261991951664648e-05, + "loss": 0.5675, "step": 6705 }, { "epoch": 1.615310544053924, - "grad_norm": 0.97265625, - "learning_rate": 0.0001318665422231318, - "loss": 0.4547, + "grad_norm": 1.7578125, + "learning_rate": 5.25773440809711e-05, + "loss": 0.542, "step": 6710 }, { "epoch": 1.61651420317766, - "grad_norm": 0.890625, - "learning_rate": 0.00013175977673811335, - "loss": 0.4189, + "grad_norm": 1.703125, + "learning_rate": 5.253477493835804e-05, + "loss": 0.5005, "step": 6715 }, { "epoch": 1.6177178623013964, - "grad_norm": 0.98828125, - "learning_rate": 0.0001316530272069859, - "loss": 0.4899, + "grad_norm": 1.75, + "learning_rate": 5.2492212156821894e-05, + "loss": 0.5781, "step": 6720 }, { "epoch": 1.6189215214251322, - "grad_norm": 1.0078125, - "learning_rate": 0.00013154629380030786, - "loss": 0.4669, + "grad_norm": 1.65625, + "learning_rate": 5.244965580436708e-05, + "loss": 0.555, "step": 6725 }, { "epoch": 1.6201251805488686, - "grad_norm": 1.0546875, - "learning_rate": 0.0001314395766886118, - "loss": 0.4335, + "grad_norm": 1.84375, + "learning_rate": 5.240710594898772e-05, + "loss": 0.5242, "step": 6730 }, { "epoch": 1.6213288396726049, - "grad_norm": 0.94921875, - "learning_rate": 0.0001313328760424044, - "loss": 0.4794, + "grad_norm": 1.7578125, + "learning_rate": 5.2364562658667605e-05, + "loss": 0.5704, "step": 6735 }, { "epoch": 1.6225324987963408, - "grad_norm": 0.9609375, - "learning_rate": 0.00013122619203216585, - "loss": 0.4599, + "grad_norm": 1.734375, + "learning_rate": 5.232202600137997e-05, + "loss": 0.5543, "step": 6740 }, { "epoch": 1.623736157920077, - "grad_norm": 0.97265625, - "learning_rate": 0.0001311195248283499, - "loss": 0.473, + "grad_norm": 1.75, + "learning_rate": 5.227949604508752e-05, + "loss": 0.5542, "step": 6745 }, { "epoch": 1.6249398170438132, - "grad_norm": 0.9921875, - "learning_rate": 0.0001310128746013834, - "loss": 0.4743, + "grad_norm": 1.7734375, + "learning_rate": 5.223697285774222e-05, + "loss": 0.5577, "step": 6750 }, { "epoch": 1.6261434761675493, - "grad_norm": 0.921875, - "learning_rate": 0.00013090624152166603, - "loss": 0.4904, + "grad_norm": 1.6953125, + "learning_rate": 5.21944565072852e-05, + "loss": 0.5784, "step": 6755 }, { "epoch": 1.6273471352912856, - "grad_norm": 0.92578125, - "learning_rate": 0.00013079962575957016, - "loss": 0.4788, + "grad_norm": 1.6640625, + "learning_rate": 5.215194706164672e-05, + "loss": 0.5631, "step": 6760 }, { "epoch": 1.6285507944150217, - "grad_norm": 1.0703125, - "learning_rate": 0.00013069302748544041, - "loss": 0.4817, + "grad_norm": 1.8125, + "learning_rate": 5.2109444588745944e-05, + "loss": 0.5689, "step": 6765 }, { "epoch": 1.6297544535387578, - "grad_norm": 1.0703125, - "learning_rate": 0.00013058644686959352, - "loss": 0.4672, + "grad_norm": 1.8203125, + "learning_rate": 5.2066949156490945e-05, + "loss": 0.5607, "step": 6770 }, { "epoch": 1.6309581126624941, - "grad_norm": 0.95703125, - "learning_rate": 0.00013047988408231798, - "loss": 0.4422, + "grad_norm": 1.671875, + "learning_rate": 5.202446083277853e-05, + "loss": 0.5387, "step": 6775 }, { "epoch": 1.63216177178623, - "grad_norm": 1.0703125, - "learning_rate": 0.00013037333929387382, - "loss": 0.4462, + "grad_norm": 1.8828125, + "learning_rate": 5.198197968549415e-05, + "loss": 0.5354, "step": 6780 }, { "epoch": 1.6333654309099663, - "grad_norm": 0.953125, - "learning_rate": 0.00013026681267449232, - "loss": 0.4398, + "grad_norm": 1.703125, + "learning_rate": 5.1939505782511785e-05, + "loss": 0.5274, "step": 6785 }, { "epoch": 1.6345690900337024, - "grad_norm": 0.9765625, - "learning_rate": 0.00013016030439437563, - "loss": 0.4846, + "grad_norm": 1.75, + "learning_rate": 5.189703919169384e-05, + "loss": 0.572, "step": 6790 }, { "epoch": 1.6357727491574385, - "grad_norm": 1.015625, - "learning_rate": 0.00013005381462369677, - "loss": 0.4687, + "grad_norm": 1.8203125, + "learning_rate": 5.1854579980891054e-05, + "loss": 0.5596, "step": 6795 }, { "epoch": 1.6369764082811749, - "grad_norm": 1.0234375, - "learning_rate": 0.00012994734353259904, - "loss": 0.4857, + "grad_norm": 1.75, + "learning_rate": 5.1812128217942345e-05, + "loss": 0.5749, "step": 6800 }, { "epoch": 1.638180067404911, - "grad_norm": 1.0078125, - "learning_rate": 0.00012984089129119592, - "loss": 0.4741, + "grad_norm": 1.8671875, + "learning_rate": 5.1769683970674747e-05, + "loss": 0.561, "step": 6805 }, { "epoch": 1.639383726528647, - "grad_norm": 0.97265625, - "learning_rate": 0.00012973445806957088, - "loss": 0.454, + "grad_norm": 1.7578125, + "learning_rate": 5.17272473069033e-05, + "loss": 0.5404, "step": 6810 }, { "epoch": 1.6405873856523834, - "grad_norm": 1.0546875, - "learning_rate": 0.00012962804403777686, - "loss": 0.4533, + "grad_norm": 1.8984375, + "learning_rate": 5.168481829443091e-05, + "loss": 0.549, "step": 6815 }, { "epoch": 1.6417910447761193, - "grad_norm": 1.0234375, - "learning_rate": 0.00012952164936583626, - "loss": 0.4534, + "grad_norm": 1.8125, + "learning_rate": 5.164239700104825e-05, + "loss": 0.542, "step": 6820 }, { "epoch": 1.6429947038998556, - "grad_norm": 0.9609375, - "learning_rate": 0.00012941527422374047, - "loss": 0.4579, + "grad_norm": 1.8125, + "learning_rate": 5.1599983494533695e-05, + "loss": 0.543, "step": 6825 }, { "epoch": 1.6441983630235917, - "grad_norm": 1.0, - "learning_rate": 0.00012930891878144967, - "loss": 0.4757, + "grad_norm": 1.765625, + "learning_rate": 5.155757784265314e-05, + "loss": 0.5616, "step": 6830 }, { "epoch": 1.6454020221473278, - "grad_norm": 1.0234375, - "learning_rate": 0.00012920258320889264, - "loss": 0.4496, + "grad_norm": 1.8046875, + "learning_rate": 5.151518011315995e-05, + "loss": 0.5404, "step": 6835 }, { "epoch": 1.646605681271064, - "grad_norm": 0.8515625, - "learning_rate": 0.00012909626767596628, - "loss": 0.4754, + "grad_norm": 1.734375, + "learning_rate": 5.147279037379482e-05, + "loss": 0.5633, "step": 6840 }, { "epoch": 1.6478093403948002, - "grad_norm": 0.9375, - "learning_rate": 0.00012898997235253568, - "loss": 0.4474, + "grad_norm": 1.703125, + "learning_rate": 5.1430408692285696e-05, + "loss": 0.5326, "step": 6845 }, { "epoch": 1.6490129995185363, - "grad_norm": 1.0390625, - "learning_rate": 0.00012888369740843343, - "loss": 0.4898, + "grad_norm": 1.8203125, + "learning_rate": 5.138803513634765e-05, + "loss": 0.5834, "step": 6850 }, { "epoch": 1.6502166586422726, - "grad_norm": 1.0078125, - "learning_rate": 0.00012877744301345963, - "loss": 0.4379, + "grad_norm": 1.8359375, + "learning_rate": 5.134566977368272e-05, + "loss": 0.5237, "step": 6855 }, { "epoch": 1.6514203177660085, - "grad_norm": 0.9921875, - "learning_rate": 0.0001286712093373817, - "loss": 0.4473, + "grad_norm": 1.8046875, + "learning_rate": 5.130331267197996e-05, + "loss": 0.5359, "step": 6860 }, { "epoch": 1.6526239768897448, - "grad_norm": 1.015625, - "learning_rate": 0.00012856499654993362, - "loss": 0.4587, + "grad_norm": 1.75, + "learning_rate": 5.12609638989151e-05, + "loss": 0.5491, "step": 6865 }, { "epoch": 1.653827636013481, - "grad_norm": 0.90234375, - "learning_rate": 0.0001284588048208164, - "loss": 0.494, + "grad_norm": 1.7109375, + "learning_rate": 5.121862352215066e-05, + "loss": 0.5837, "step": 6870 }, { "epoch": 1.655031295137217, - "grad_norm": 0.94140625, - "learning_rate": 0.00012835263431969704, - "loss": 0.4159, + "grad_norm": 1.765625, + "learning_rate": 5.1176291609335684e-05, + "loss": 0.505, "step": 6875 }, { "epoch": 1.6562349542609534, - "grad_norm": 1.0234375, - "learning_rate": 0.00012824648521620884, - "loss": 0.4717, + "grad_norm": 1.875, + "learning_rate": 5.113396822810574e-05, + "loss": 0.5621, "step": 6880 }, { "epoch": 1.6574386133846895, - "grad_norm": 0.94140625, - "learning_rate": 0.00012814035767995093, - "loss": 0.4967, + "grad_norm": 1.796875, + "learning_rate": 5.1091653446082735e-05, + "loss": 0.5906, "step": 6885 }, { "epoch": 1.6586422725084256, - "grad_norm": 0.99609375, - "learning_rate": 0.00012803425188048775, - "loss": 0.4812, + "grad_norm": 1.859375, + "learning_rate": 5.104934733087482e-05, + "loss": 0.578, "step": 6890 }, { "epoch": 1.6598459316321619, - "grad_norm": 1.015625, - "learning_rate": 0.00012792816798734932, - "loss": 0.4541, + "grad_norm": 1.8046875, + "learning_rate": 5.1007049950076364e-05, + "loss": 0.542, "step": 6895 }, { "epoch": 1.661049590755898, - "grad_norm": 0.97265625, - "learning_rate": 0.0001278221061700304, - "loss": 0.4543, + "grad_norm": 1.7421875, + "learning_rate": 5.096476137126769e-05, + "loss": 0.5437, "step": 6900 }, { "epoch": 1.662253249879634, - "grad_norm": 0.9375, - "learning_rate": 0.0001277160665979907, - "loss": 0.4674, + "grad_norm": 1.6953125, + "learning_rate": 5.0922481662015155e-05, + "loss": 0.5545, "step": 6905 }, { "epoch": 1.6634569090033704, - "grad_norm": 0.97265625, - "learning_rate": 0.00012761004944065413, - "loss": 0.4515, + "grad_norm": 1.7421875, + "learning_rate": 5.0880210889870854e-05, + "loss": 0.537, "step": 6910 }, { "epoch": 1.6646605681271063, - "grad_norm": 0.96484375, - "learning_rate": 0.000127504054867409, - "loss": 0.4813, + "grad_norm": 1.921875, + "learning_rate": 5.0837949122372656e-05, + "loss": 0.5729, "step": 6915 }, { "epoch": 1.6658642272508426, - "grad_norm": 0.95703125, - "learning_rate": 0.00012739808304760753, - "loss": 0.4676, + "grad_norm": 1.796875, + "learning_rate": 5.079569642704406e-05, + "loss": 0.5586, "step": 6920 }, { "epoch": 1.6670678863745787, - "grad_norm": 0.94921875, - "learning_rate": 0.0001272921341505654, - "loss": 0.4614, + "grad_norm": 1.7890625, + "learning_rate": 5.0753452871394004e-05, + "loss": 0.5469, "step": 6925 }, { "epoch": 1.6682715454983148, - "grad_norm": 1.03125, - "learning_rate": 0.00012718620834556186, - "loss": 0.4998, + "grad_norm": 1.8828125, + "learning_rate": 5.0711218522916894e-05, + "loss": 0.5943, "step": 6930 }, { "epoch": 1.6694752046220511, - "grad_norm": 0.94921875, - "learning_rate": 0.00012708030580183918, - "loss": 0.4414, + "grad_norm": 1.7890625, + "learning_rate": 5.066899344909236e-05, + "loss": 0.5338, "step": 6935 }, { "epoch": 1.6706788637457872, - "grad_norm": 1.03125, - "learning_rate": 0.00012697442668860247, - "loss": 0.4531, + "grad_norm": 1.8984375, + "learning_rate": 5.062677771738526e-05, + "loss": 0.5326, "step": 6940 }, { "epoch": 1.6718825228695233, - "grad_norm": 1.109375, - "learning_rate": 0.00012686857117501945, - "loss": 0.4892, + "grad_norm": 1.84375, + "learning_rate": 5.058457139524553e-05, + "loss": 0.5777, "step": 6945 }, { "epoch": 1.6730861819932596, - "grad_norm": 1.0234375, - "learning_rate": 0.00012676273943022, - "loss": 0.426, + "grad_norm": 1.7890625, + "learning_rate": 5.0542374550108e-05, + "loss": 0.5162, "step": 6950 }, { "epoch": 1.6742898411169955, - "grad_norm": 1.0234375, - "learning_rate": 0.00012665693162329622, - "loss": 0.4756, + "grad_norm": 1.8203125, + "learning_rate": 5.0500187249392465e-05, + "loss": 0.5621, "step": 6955 }, { "epoch": 1.6754935002407318, - "grad_norm": 1.015625, - "learning_rate": 0.0001265511479233018, - "loss": 0.4395, + "grad_norm": 1.8828125, + "learning_rate": 5.04580095605034e-05, + "loss": 0.5239, "step": 6960 }, { "epoch": 1.676697159364468, - "grad_norm": 1.0390625, - "learning_rate": 0.000126445388499252, - "loss": 0.4848, + "grad_norm": 1.9375, + "learning_rate": 5.041584155082993e-05, + "loss": 0.5724, "step": 6965 }, { "epoch": 1.677900818488204, - "grad_norm": 0.94140625, - "learning_rate": 0.00012633965352012327, - "loss": 0.4429, + "grad_norm": 1.7890625, + "learning_rate": 5.0373683287745734e-05, + "loss": 0.5335, "step": 6970 }, { "epoch": 1.6791044776119404, - "grad_norm": 1.015625, - "learning_rate": 0.00012623394315485295, - "loss": 0.4576, + "grad_norm": 1.9375, + "learning_rate": 5.03315348386089e-05, + "loss": 0.5453, "step": 6975 }, { "epoch": 1.6803081367356765, - "grad_norm": 1.0, - "learning_rate": 0.0001261282575723392, - "loss": 0.431, + "grad_norm": 1.8515625, + "learning_rate": 5.028939627076186e-05, + "loss": 0.516, "step": 6980 }, { "epoch": 1.6815117958594126, - "grad_norm": 0.984375, - "learning_rate": 0.00012602259694144042, - "loss": 0.4581, + "grad_norm": 1.6875, + "learning_rate": 5.024726765153124e-05, + "loss": 0.5416, "step": 6985 }, { "epoch": 1.682715454983149, - "grad_norm": 1.1171875, - "learning_rate": 0.0001259169614309752, - "loss": 0.4918, + "grad_norm": 1.7734375, + "learning_rate": 5.020514904822775e-05, + "loss": 0.5831, "step": 6990 }, { "epoch": 1.6839191141068848, - "grad_norm": 0.9453125, - "learning_rate": 0.0001258113512097221, - "loss": 0.4466, + "grad_norm": 1.625, + "learning_rate": 5.016304052814616e-05, + "loss": 0.5323, "step": 6995 }, { "epoch": 1.685122773230621, - "grad_norm": 0.94921875, - "learning_rate": 0.00012570576644641902, - "loss": 0.4627, + "grad_norm": 1.7265625, + "learning_rate": 5.012094215856504e-05, + "loss": 0.5534, "step": 7000 }, { "epoch": 1.685122773230621, - "eval_loss": 0.39601707458496094, - "eval_runtime": 2.3295, - "eval_samples_per_second": 85.855, - "eval_steps_per_second": 85.855, + "eval_loss": 0.4666500389575958, + "eval_runtime": 2.397, + "eval_samples_per_second": 83.437, + "eval_steps_per_second": 83.437, "step": 7000 }, { "epoch": 1.6863264323543572, - "grad_norm": 1.0078125, - "learning_rate": 0.0001256002073097635, - "loss": 0.4661, + "grad_norm": 1.84375, + "learning_rate": 5.0078854006746855e-05, + "loss": 0.556, "step": 7005 }, { "epoch": 1.6875300914780933, - "grad_norm": 0.94140625, - "learning_rate": 0.0001254946739684119, - "loss": 0.4607, + "grad_norm": 1.7265625, + "learning_rate": 5.0036776139937625e-05, + "loss": 0.5457, "step": 7010 }, { "epoch": 1.6887337506018296, - "grad_norm": 1.015625, - "learning_rate": 0.00012538916659097946, - "loss": 0.4801, + "grad_norm": 1.953125, + "learning_rate": 4.999470862536702e-05, + "loss": 0.5743, "step": 7015 }, { "epoch": 1.6899374097255657, - "grad_norm": 0.99609375, - "learning_rate": 0.00012528368534603994, - "loss": 0.4188, + "grad_norm": 1.734375, + "learning_rate": 4.995265153024816e-05, + "loss": 0.5029, "step": 7020 }, { "epoch": 1.6911410688493018, - "grad_norm": 0.87109375, - "learning_rate": 0.0001251782304021253, - "loss": 0.4417, + "grad_norm": 1.6953125, + "learning_rate": 4.991060492177747e-05, + "loss": 0.5265, "step": 7025 }, { "epoch": 1.6923447279730381, - "grad_norm": 0.9296875, - "learning_rate": 0.00012507280192772553, - "loss": 0.4458, + "grad_norm": 1.859375, + "learning_rate": 4.986856886713468e-05, + "loss": 0.5292, "step": 7030 }, { "epoch": 1.6935483870967742, - "grad_norm": 1.0546875, - "learning_rate": 0.00012496740009128828, - "loss": 0.4766, + "grad_norm": 1.875, + "learning_rate": 4.982654343348259e-05, + "loss": 0.5662, "step": 7035 }, { "epoch": 1.6947520462205103, - "grad_norm": 0.93359375, - "learning_rate": 0.0001248620250612187, - "loss": 0.4545, + "grad_norm": 1.75, + "learning_rate": 4.978452868796711e-05, + "loss": 0.5449, "step": 7040 }, { "epoch": 1.6959557053442467, - "grad_norm": 1.0546875, - "learning_rate": 0.00012475667700587907, - "loss": 0.4706, + "grad_norm": 1.75, + "learning_rate": 4.974252469771702e-05, + "loss": 0.5611, "step": 7045 }, { "epoch": 1.6971593644679825, - "grad_norm": 0.984375, - "learning_rate": 0.00012465135609358852, - "loss": 0.4426, + "grad_norm": 1.765625, + "learning_rate": 4.9700531529843924e-05, + "loss": 0.5356, "step": 7050 }, { "epoch": 1.6983630235917189, - "grad_norm": 0.96484375, - "learning_rate": 0.00012454606249262298, - "loss": 0.4354, + "grad_norm": 1.828125, + "learning_rate": 4.965854925144216e-05, + "loss": 0.5268, "step": 7055 }, { "epoch": 1.699566682715455, - "grad_norm": 1.0703125, - "learning_rate": 0.0001244407963712145, - "loss": 0.4523, + "grad_norm": 1.78125, + "learning_rate": 4.961657792958859e-05, + "loss": 0.5484, "step": 7060 }, { "epoch": 1.700770341839191, - "grad_norm": 0.94140625, - "learning_rate": 0.00012433555789755142, - "loss": 0.4347, + "grad_norm": 1.6171875, + "learning_rate": 4.957461763134268e-05, + "loss": 0.5174, "step": 7065 }, { "epoch": 1.7019740009629274, - "grad_norm": 0.99609375, - "learning_rate": 0.0001242303472397779, - "loss": 0.4651, + "grad_norm": 1.8203125, + "learning_rate": 4.9532668423746236e-05, + "loss": 0.5592, "step": 7070 }, { "epoch": 1.7031776600866635, - "grad_norm": 0.98046875, - "learning_rate": 0.00012412516456599348, - "loss": 0.4843, + "grad_norm": 1.84375, + "learning_rate": 4.949073037382329e-05, + "loss": 0.5761, "step": 7075 }, { "epoch": 1.7043813192103996, - "grad_norm": 0.91796875, - "learning_rate": 0.00012402001004425318, - "loss": 0.4209, + "grad_norm": 1.6796875, + "learning_rate": 4.944880354858013e-05, + "loss": 0.5113, "step": 7080 }, { "epoch": 1.705584978334136, - "grad_norm": 0.8984375, - "learning_rate": 0.00012391488384256698, - "loss": 0.4455, + "grad_norm": 1.6640625, + "learning_rate": 4.940688801500507e-05, + "loss": 0.5383, "step": 7085 }, { "epoch": 1.7067886374578718, - "grad_norm": 1.046875, - "learning_rate": 0.00012380978612889956, - "loss": 0.4435, + "grad_norm": 1.859375, + "learning_rate": 4.936498384006837e-05, + "loss": 0.5348, "step": 7090 }, { "epoch": 1.7079922965816081, - "grad_norm": 0.95703125, - "learning_rate": 0.0001237047170711702, - "loss": 0.4335, + "grad_norm": 1.796875, + "learning_rate": 4.932309109072219e-05, + "loss": 0.517, "step": 7095 }, { "epoch": 1.7091959557053442, - "grad_norm": 0.8828125, - "learning_rate": 0.00012359967683725224, - "loss": 0.4618, + "grad_norm": 1.796875, + "learning_rate": 4.928120983390039e-05, + "loss": 0.5535, "step": 7100 }, { "epoch": 1.7103996148290803, - "grad_norm": 1.0546875, - "learning_rate": 0.00012349466559497305, - "loss": 0.4424, + "grad_norm": 1.90625, + "learning_rate": 4.923934013651847e-05, + "loss": 0.5321, "step": 7105 }, { "epoch": 1.7116032739528166, - "grad_norm": 0.984375, - "learning_rate": 0.0001233896835121137, - "loss": 0.4782, + "grad_norm": 1.859375, + "learning_rate": 4.919748206547348e-05, + "loss": 0.5678, "step": 7110 }, { "epoch": 1.7128069330765527, - "grad_norm": 0.92578125, - "learning_rate": 0.00012328473075640865, - "loss": 0.4612, + "grad_norm": 1.734375, + "learning_rate": 4.915563568764389e-05, + "loss": 0.5454, "step": 7115 }, { "epoch": 1.7140105922002888, - "grad_norm": 1.0234375, - "learning_rate": 0.0001231798074955455, - "loss": 0.4643, + "grad_norm": 1.75, + "learning_rate": 4.9113801069889495e-05, + "loss": 0.5501, "step": 7120 }, { "epoch": 1.7152142513240252, - "grad_norm": 0.94921875, - "learning_rate": 0.0001230749138971647, - "loss": 0.4281, + "grad_norm": 1.8203125, + "learning_rate": 4.9071978279051264e-05, + "loss": 0.5178, "step": 7125 }, { "epoch": 1.716417910447761, - "grad_norm": 1.140625, - "learning_rate": 0.0001229700501288594, - "loss": 0.4956, + "grad_norm": 1.9765625, + "learning_rate": 4.9030167381951334e-05, + "loss": 0.5888, "step": 7130 }, { "epoch": 1.7176215695714974, - "grad_norm": 1.0, - "learning_rate": 0.0001228652163581749, - "loss": 0.4719, + "grad_norm": 1.859375, + "learning_rate": 4.8988368445392745e-05, + "loss": 0.5564, "step": 7135 }, { "epoch": 1.7188252286952335, - "grad_norm": 0.96484375, - "learning_rate": 0.0001227604127526088, - "loss": 0.4662, + "grad_norm": 1.6875, + "learning_rate": 4.894658153615954e-05, + "loss": 0.5587, "step": 7140 }, { "epoch": 1.7200288878189696, - "grad_norm": 0.984375, - "learning_rate": 0.00012265563947961032, - "loss": 0.4303, + "grad_norm": 1.8828125, + "learning_rate": 4.890480672101644e-05, + "loss": 0.5199, "step": 7145 }, { "epoch": 1.7212325469427059, - "grad_norm": 0.86328125, - "learning_rate": 0.00012255089670658035, - "loss": 0.4591, + "grad_norm": 1.671875, + "learning_rate": 4.886304406670892e-05, + "loss": 0.547, "step": 7150 }, { "epoch": 1.722436206066442, - "grad_norm": 0.91796875, - "learning_rate": 0.00012244618460087095, - "loss": 0.4596, + "grad_norm": 1.765625, + "learning_rate": 4.8821293639963e-05, + "loss": 0.5442, "step": 7155 }, { "epoch": 1.723639865190178, - "grad_norm": 1.078125, - "learning_rate": 0.00012234150332978523, - "loss": 0.4732, + "grad_norm": 1.921875, + "learning_rate": 4.8779555507485126e-05, + "loss": 0.5671, "step": 7160 }, { "epoch": 1.7248435243139144, - "grad_norm": 0.91015625, - "learning_rate": 0.00012223685306057708, - "loss": 0.426, + "grad_norm": 1.65625, + "learning_rate": 4.8737829735962187e-05, + "loss": 0.5209, "step": 7165 }, { "epoch": 1.7260471834376505, - "grad_norm": 0.9375, - "learning_rate": 0.00012213223396045068, - "loss": 0.4488, + "grad_norm": 1.765625, + "learning_rate": 4.8696116392061194e-05, + "loss": 0.5422, "step": 7170 }, { "epoch": 1.7272508425613866, - "grad_norm": 1.0234375, - "learning_rate": 0.00012202764619656066, - "loss": 0.5006, + "grad_norm": 1.8515625, + "learning_rate": 4.865441554242945e-05, + "loss": 0.587, "step": 7175 }, { "epoch": 1.728454501685123, - "grad_norm": 1.0078125, - "learning_rate": 0.00012192308993601139, - "loss": 0.4377, + "grad_norm": 1.8984375, + "learning_rate": 4.861272725369419e-05, + "loss": 0.5308, "step": 7180 }, { "epoch": 1.7296581608088588, - "grad_norm": 1.0625, - "learning_rate": 0.00012181856534585694, - "loss": 0.4809, + "grad_norm": 1.90625, + "learning_rate": 4.8571051592462595e-05, + "loss": 0.5725, "step": 7185 }, { "epoch": 1.7308618199325951, - "grad_norm": 1.0546875, - "learning_rate": 0.00012171407259310094, - "loss": 0.4921, + "grad_norm": 1.921875, + "learning_rate": 4.8529388625321745e-05, + "loss": 0.5845, "step": 7190 }, { "epoch": 1.7320654790563312, - "grad_norm": 0.93359375, - "learning_rate": 0.00012160961184469586, - "loss": 0.4381, + "grad_norm": 1.734375, + "learning_rate": 4.8487738418838315e-05, + "loss": 0.528, "step": 7195 }, { "epoch": 1.7332691381800673, - "grad_norm": 0.9453125, - "learning_rate": 0.0001215051832675433, - "loss": 0.4606, + "grad_norm": 1.7890625, + "learning_rate": 4.8446101039558695e-05, + "loss": 0.5504, "step": 7200 }, { "epoch": 1.7344727973038037, - "grad_norm": 0.94140625, - "learning_rate": 0.00012140078702849334, - "loss": 0.4398, + "grad_norm": 1.703125, + "learning_rate": 4.8404476554008745e-05, + "loss": 0.529, "step": 7205 }, { "epoch": 1.7356764564275398, - "grad_norm": 0.93359375, - "learning_rate": 0.00012129642329434436, - "loss": 0.4644, + "grad_norm": 1.8046875, + "learning_rate": 4.83628650286937e-05, + "loss": 0.5613, "step": 7210 }, { "epoch": 1.7368801155512759, - "grad_norm": 1.0546875, - "learning_rate": 0.00012119209223184295, - "loss": 0.4207, + "grad_norm": 1.9765625, + "learning_rate": 4.832126653009813e-05, + "loss": 0.5065, "step": 7215 }, { "epoch": 1.7380837746750122, - "grad_norm": 0.99609375, - "learning_rate": 0.00012108779400768328, - "loss": 0.4623, + "grad_norm": 1.6953125, + "learning_rate": 4.8279681124685756e-05, + "loss": 0.5505, "step": 7220 }, { "epoch": 1.739287433798748, - "grad_norm": 0.96875, - "learning_rate": 0.00012098352878850726, - "loss": 0.479, + "grad_norm": 1.8359375, + "learning_rate": 4.8238108878899414e-05, + "loss": 0.5748, "step": 7225 }, { "epoch": 1.7404910929224844, - "grad_norm": 0.9921875, - "learning_rate": 0.00012087929674090398, - "loss": 0.4418, + "grad_norm": 1.8046875, + "learning_rate": 4.819654985916089e-05, + "loss": 0.529, "step": 7230 }, { "epoch": 1.7416947520462205, - "grad_norm": 1.0, - "learning_rate": 0.0001207750980314095, - "loss": 0.467, + "grad_norm": 1.828125, + "learning_rate": 4.815500413187084e-05, + "loss": 0.5588, "step": 7235 }, { "epoch": 1.7428984111699566, - "grad_norm": 0.96484375, - "learning_rate": 0.00012067093282650665, - "loss": 0.4649, + "grad_norm": 1.7421875, + "learning_rate": 4.811347176340868e-05, + "loss": 0.5538, "step": 7240 }, { "epoch": 1.744102070293693, - "grad_norm": 0.8828125, - "learning_rate": 0.00012056680129262471, - "loss": 0.4586, + "grad_norm": 1.578125, + "learning_rate": 4.807195282013249e-05, + "loss": 0.5497, "step": 7245 }, { "epoch": 1.745305729417429, - "grad_norm": 1.171875, - "learning_rate": 0.00012046270359613924, - "loss": 0.4582, + "grad_norm": 1.8828125, + "learning_rate": 4.803044736837891e-05, + "loss": 0.5447, "step": 7250 }, { "epoch": 1.746509388541165, - "grad_norm": 0.96484375, - "learning_rate": 0.00012035863990337164, - "loss": 0.441, + "grad_norm": 1.7265625, + "learning_rate": 4.7988955474463006e-05, + "loss": 0.5343, "step": 7255 }, { "epoch": 1.7477130476649014, - "grad_norm": 0.93359375, - "learning_rate": 0.00012025461038058895, - "loss": 0.4544, + "grad_norm": 1.7578125, + "learning_rate": 4.794747720467817e-05, + "loss": 0.5484, "step": 7260 }, { "epoch": 1.7489167067886373, - "grad_norm": 1.0078125, - "learning_rate": 0.00012015061519400376, - "loss": 0.4677, + "grad_norm": 1.734375, + "learning_rate": 4.790601262529607e-05, + "loss": 0.5603, "step": 7265 }, { "epoch": 1.7501203659123736, - "grad_norm": 0.953125, - "learning_rate": 0.00012004665450977369, - "loss": 0.4504, + "grad_norm": 1.6953125, + "learning_rate": 4.786456180256646e-05, + "loss": 0.5391, "step": 7270 }, { "epoch": 1.7513240250361097, - "grad_norm": 0.890625, - "learning_rate": 0.00011994272849400127, - "loss": 0.4343, + "grad_norm": 1.6484375, + "learning_rate": 4.782312480271714e-05, + "loss": 0.5282, "step": 7275 }, { "epoch": 1.7525276841598458, - "grad_norm": 0.953125, - "learning_rate": 0.00011983883731273365, - "loss": 0.4355, + "grad_norm": 1.71875, + "learning_rate": 4.7781701691953805e-05, + "loss": 0.5288, "step": 7280 }, { "epoch": 1.7537313432835822, - "grad_norm": 0.93359375, - "learning_rate": 0.00011973498113196224, - "loss": 0.4414, + "grad_norm": 1.703125, + "learning_rate": 4.7740292536459954e-05, + "loss": 0.5371, "step": 7285 }, { "epoch": 1.7549350024073183, - "grad_norm": 0.93359375, - "learning_rate": 0.00011963116011762266, - "loss": 0.4435, + "grad_norm": 1.6640625, + "learning_rate": 4.769889740239683e-05, + "loss": 0.5329, "step": 7290 }, { "epoch": 1.7561386615310544, - "grad_norm": 0.90234375, - "learning_rate": 0.00011952737443559425, - "loss": 0.4345, + "grad_norm": 1.578125, + "learning_rate": 4.765751635590323e-05, + "loss": 0.5258, "step": 7295 }, { "epoch": 1.7573423206547907, - "grad_norm": 0.9765625, - "learning_rate": 0.0001194236242516999, - "loss": 0.429, + "grad_norm": 1.671875, + "learning_rate": 4.761614946309545e-05, + "loss": 0.5218, "step": 7300 }, { "epoch": 1.7585459797785266, - "grad_norm": 0.9375, - "learning_rate": 0.00011931990973170589, - "loss": 0.4475, + "grad_norm": 1.640625, + "learning_rate": 4.75747967900672e-05, + "loss": 0.5422, "step": 7305 }, { "epoch": 1.7597496389022629, - "grad_norm": 1.0234375, - "learning_rate": 0.00011921623104132133, - "loss": 0.4813, + "grad_norm": 1.734375, + "learning_rate": 4.753345840288944e-05, + "loss": 0.5723, "step": 7310 }, { "epoch": 1.7609532980259992, - "grad_norm": 0.890625, - "learning_rate": 0.0001191125883461983, - "loss": 0.427, + "grad_norm": 1.8046875, + "learning_rate": 4.7492134367610326e-05, + "loss": 0.5182, "step": 7315 }, { "epoch": 1.762156957149735, - "grad_norm": 0.984375, - "learning_rate": 0.00011900898181193111, - "loss": 0.4404, + "grad_norm": 1.671875, + "learning_rate": 4.745082475025504e-05, + "loss": 0.5323, "step": 7320 }, { "epoch": 1.7633606162734714, - "grad_norm": 0.9609375, - "learning_rate": 0.00011890541160405657, - "loss": 0.475, + "grad_norm": 1.6875, + "learning_rate": 4.740952961682579e-05, + "loss": 0.5708, "step": 7325 }, { "epoch": 1.7645642753972075, - "grad_norm": 0.99609375, - "learning_rate": 0.0001188018778880533, - "loss": 0.438, + "grad_norm": 1.7109375, + "learning_rate": 4.736824903330162e-05, + "loss": 0.5257, "step": 7330 }, { "epoch": 1.7657679345209436, - "grad_norm": 0.921875, - "learning_rate": 0.0001186983808293416, - "loss": 0.4769, + "grad_norm": 1.703125, + "learning_rate": 4.7326983065638294e-05, + "loss": 0.5635, "step": 7335 }, { "epoch": 1.76697159364468, - "grad_norm": 0.9609375, - "learning_rate": 0.00011859492059328326, - "loss": 0.4548, + "grad_norm": 1.7109375, + "learning_rate": 4.728573177976827e-05, + "loss": 0.5469, "step": 7340 }, { "epoch": 1.768175252768416, - "grad_norm": 1.03125, - "learning_rate": 0.00011849149734518117, - "loss": 0.4788, + "grad_norm": 1.796875, + "learning_rate": 4.724449524160051e-05, + "loss": 0.5697, "step": 7345 }, { "epoch": 1.7693789118921521, - "grad_norm": 1.0078125, - "learning_rate": 0.00011838811125027922, - "loss": 0.4381, + "grad_norm": 1.8125, + "learning_rate": 4.720327351702044e-05, + "loss": 0.5328, "step": 7350 }, { "epoch": 1.7705825710158885, - "grad_norm": 0.88671875, - "learning_rate": 0.00011828476247376191, - "loss": 0.4601, + "grad_norm": 1.671875, + "learning_rate": 4.716206667188983e-05, + "loss": 0.5488, "step": 7355 }, { "epoch": 1.7717862301396243, - "grad_norm": 0.94921875, - "learning_rate": 0.00011818145118075404, - "loss": 0.4442, + "grad_norm": 1.71875, + "learning_rate": 4.712087477204663e-05, + "loss": 0.5374, "step": 7360 }, { "epoch": 1.7729898892633607, - "grad_norm": 1.0546875, - "learning_rate": 0.0001180781775363206, - "loss": 0.4801, + "grad_norm": 1.8359375, + "learning_rate": 4.7079697883304946e-05, + "loss": 0.5754, "step": 7365 }, { "epoch": 1.7741935483870968, - "grad_norm": 0.94921875, - "learning_rate": 0.00011797494170546634, - "loss": 0.4331, + "grad_norm": 1.7890625, + "learning_rate": 4.703853607145486e-05, + "loss": 0.5282, "step": 7370 }, { "epoch": 1.7753972075108329, - "grad_norm": 0.9921875, - "learning_rate": 0.00011787174385313575, - "loss": 0.4632, + "grad_norm": 1.734375, + "learning_rate": 4.699738940226245e-05, + "loss": 0.5548, "step": 7375 }, { "epoch": 1.7766008666345692, - "grad_norm": 1.0390625, - "learning_rate": 0.00011776858414421245, - "loss": 0.4645, + "grad_norm": 1.8046875, + "learning_rate": 4.6956257941469496e-05, + "loss": 0.5581, "step": 7380 }, { "epoch": 1.7778045257583053, - "grad_norm": 0.97265625, - "learning_rate": 0.00011766546274351928, - "loss": 0.4454, + "grad_norm": 1.6640625, + "learning_rate": 4.691514175479356e-05, + "loss": 0.5371, "step": 7385 }, { "epoch": 1.7790081848820414, - "grad_norm": 0.9140625, - "learning_rate": 0.00011756237981581779, - "loss": 0.4495, + "grad_norm": 1.6953125, + "learning_rate": 4.687404090792776e-05, + "loss": 0.5387, "step": 7390 }, { "epoch": 1.7802118440057777, - "grad_norm": 0.875, - "learning_rate": 0.000117459335525808, - "loss": 0.4446, + "grad_norm": 1.5859375, + "learning_rate": 4.683295546654069e-05, + "loss": 0.5363, "step": 7395 }, { "epoch": 1.7814155031295136, - "grad_norm": 0.90234375, - "learning_rate": 0.00011735633003812841, - "loss": 0.45, + "grad_norm": 1.6953125, + "learning_rate": 4.67918854962764e-05, + "loss": 0.5407, "step": 7400 }, { "epoch": 1.78261916225325, - "grad_norm": 1.0703125, - "learning_rate": 0.00011725336351735521, - "loss": 0.4647, + "grad_norm": 2.078125, + "learning_rate": 4.675083106275412e-05, + "loss": 0.5591, "step": 7405 }, { "epoch": 1.783822821376986, - "grad_norm": 0.984375, - "learning_rate": 0.00011715043612800264, - "loss": 0.4462, + "grad_norm": 1.609375, + "learning_rate": 4.670979223156835e-05, + "loss": 0.5392, "step": 7410 }, { "epoch": 1.785026480500722, - "grad_norm": 0.90625, - "learning_rate": 0.00011704754803452227, - "loss": 0.4486, + "grad_norm": 1.609375, + "learning_rate": 4.666876906828863e-05, + "loss": 0.5424, "step": 7415 }, { "epoch": 1.7862301396244584, - "grad_norm": 0.88671875, - "learning_rate": 0.00011694469940130282, - "loss": 0.4623, + "grad_norm": 1.7890625, + "learning_rate": 4.6627761638459406e-05, + "loss": 0.5594, "step": 7420 }, { "epoch": 1.7874337987481945, - "grad_norm": 0.88671875, - "learning_rate": 0.0001168418903926701, - "loss": 0.4344, + "grad_norm": 1.6796875, + "learning_rate": 4.658677000760009e-05, + "loss": 0.5231, "step": 7425 }, { "epoch": 1.7886374578719306, - "grad_norm": 0.91796875, - "learning_rate": 0.00011673912117288654, - "loss": 0.4514, + "grad_norm": 1.734375, + "learning_rate": 4.654579424120477e-05, + "loss": 0.5428, "step": 7430 }, { "epoch": 1.789841116995667, - "grad_norm": 1.03125, - "learning_rate": 0.00011663639190615098, - "loss": 0.4742, + "grad_norm": 1.859375, + "learning_rate": 4.650483440474222e-05, + "loss": 0.5652, "step": 7435 }, { "epoch": 1.7910447761194028, - "grad_norm": 0.94921875, - "learning_rate": 0.00011653370275659851, - "loss": 0.4461, + "grad_norm": 1.765625, + "learning_rate": 4.646389056365578e-05, + "loss": 0.5394, "step": 7440 }, { "epoch": 1.7922484352431391, - "grad_norm": 0.953125, - "learning_rate": 0.00011643105388830002, - "loss": 0.4476, + "grad_norm": 1.875, + "learning_rate": 4.6422962783363183e-05, + "loss": 0.549, "step": 7445 }, { "epoch": 1.7934520943668752, - "grad_norm": 1.03125, - "learning_rate": 0.00011632844546526213, - "loss": 0.4673, + "grad_norm": 1.84375, + "learning_rate": 4.638205112925657e-05, + "loss": 0.5581, "step": 7450 }, { "epoch": 1.7946557534906113, - "grad_norm": 0.9453125, - "learning_rate": 0.00011622587765142672, - "loss": 0.4476, + "grad_norm": 1.9765625, + "learning_rate": 4.6341155666702214e-05, + "loss": 0.535, "step": 7455 }, { "epoch": 1.7958594126143477, - "grad_norm": 1.0, - "learning_rate": 0.00011612335061067093, - "loss": 0.43, + "grad_norm": 1.828125, + "learning_rate": 4.630027646104064e-05, + "loss": 0.5253, "step": 7460 }, { "epoch": 1.7970630717380838, - "grad_norm": 0.9453125, - "learning_rate": 0.00011602086450680667, - "loss": 0.4537, + "grad_norm": 1.703125, + "learning_rate": 4.625941357758631e-05, + "loss": 0.5492, "step": 7465 }, { "epoch": 1.7982667308618199, - "grad_norm": 0.9765625, - "learning_rate": 0.00011591841950358047, - "loss": 0.4319, + "grad_norm": 1.7890625, + "learning_rate": 4.6218567081627645e-05, + "loss": 0.527, "step": 7470 }, { "epoch": 1.7994703899855562, - "grad_norm": 1.1328125, - "learning_rate": 0.00011581601576467318, - "loss": 0.4701, + "grad_norm": 1.9453125, + "learning_rate": 4.6177737038426874e-05, + "loss": 0.5611, "step": 7475 }, { "epoch": 1.8006740491092923, - "grad_norm": 0.93359375, - "learning_rate": 0.00011571365345369971, - "loss": 0.436, + "grad_norm": 1.7109375, + "learning_rate": 4.613692351321994e-05, + "loss": 0.5286, "step": 7480 }, { "epoch": 1.8018777082330284, - "grad_norm": 0.9296875, - "learning_rate": 0.00011561133273420877, - "loss": 0.4666, + "grad_norm": 1.8046875, + "learning_rate": 4.609612657121637e-05, + "loss": 0.5646, "step": 7485 }, { "epoch": 1.8030813673567647, - "grad_norm": 0.96484375, - "learning_rate": 0.00011550905376968271, - "loss": 0.4398, + "grad_norm": 1.8125, + "learning_rate": 4.6055346277599265e-05, + "loss": 0.5358, "step": 7490 }, { "epoch": 1.8042850264805006, - "grad_norm": 1.046875, - "learning_rate": 0.00011540681672353703, - "loss": 0.4286, + "grad_norm": 1.875, + "learning_rate": 4.601458269752504e-05, + "loss": 0.5247, "step": 7495 }, { "epoch": 1.805488685604237, - "grad_norm": 0.8828125, - "learning_rate": 0.00011530462175912039, - "loss": 0.4269, + "grad_norm": 1.5859375, + "learning_rate": 4.5973835896123486e-05, + "loss": 0.5176, "step": 7500 }, { "epoch": 1.805488685604237, - "eval_loss": 0.38533905148506165, - "eval_runtime": 2.3297, - "eval_samples_per_second": 85.846, - "eval_steps_per_second": 85.846, + "eval_loss": 0.4626540243625641, + "eval_runtime": 2.3908, + "eval_samples_per_second": 83.653, + "eval_steps_per_second": 83.653, "step": 7500 }, { "epoch": 1.806692344727973, - "grad_norm": 1.0, - "learning_rate": 0.00011520246903971402, - "loss": 0.477, + "grad_norm": 1.796875, + "learning_rate": 4.593310593849748e-05, + "loss": 0.5756, "step": 7505 }, { "epoch": 1.8078960038517091, - "grad_norm": 0.9609375, - "learning_rate": 0.00011510035872853193, - "loss": 0.4374, + "grad_norm": 1.765625, + "learning_rate": 4.589239288972312e-05, + "loss": 0.5284, "step": 7510 }, { "epoch": 1.8090996629754454, - "grad_norm": 1.0, - "learning_rate": 0.00011499829098872011, - "loss": 0.4507, + "grad_norm": 1.7890625, + "learning_rate": 4.585169681484939e-05, + "loss": 0.5443, "step": 7515 }, { "epoch": 1.8103033220991815, - "grad_norm": 0.93359375, - "learning_rate": 0.0001148962659833567, - "loss": 0.4374, + "grad_norm": 1.8515625, + "learning_rate": 4.581101777889819e-05, + "loss": 0.5249, "step": 7520 }, { "epoch": 1.8115069812229176, - "grad_norm": 0.9609375, - "learning_rate": 0.0001147942838754515, - "loss": 0.4278, + "grad_norm": 1.875, + "learning_rate": 4.5770355846864206e-05, + "loss": 0.5193, "step": 7525 }, { "epoch": 1.812710640346654, - "grad_norm": 0.9296875, - "learning_rate": 0.00011469234482794574, - "loss": 0.4627, + "grad_norm": 1.65625, + "learning_rate": 4.572971108371475e-05, + "loss": 0.5546, "step": 7530 }, { "epoch": 1.8139142994703898, - "grad_norm": 1.015625, - "learning_rate": 0.0001145904490037119, - "loss": 0.4855, + "grad_norm": 1.796875, + "learning_rate": 4.568908355438975e-05, + "loss": 0.5815, "step": 7535 }, { "epoch": 1.8151179585941262, - "grad_norm": 1.0703125, - "learning_rate": 0.00011448859656555343, - "loss": 0.4601, + "grad_norm": 1.9140625, + "learning_rate": 4.5648473323801595e-05, + "loss": 0.5476, "step": 7540 }, { "epoch": 1.8163216177178623, - "grad_norm": 0.96484375, - "learning_rate": 0.00011438678767620438, - "loss": 0.467, + "grad_norm": 1.796875, + "learning_rate": 4.5607880456835016e-05, + "loss": 0.5581, "step": 7545 }, { "epoch": 1.8175252768415984, - "grad_norm": 0.90625, - "learning_rate": 0.00011428502249832926, - "loss": 0.441, + "grad_norm": 1.71875, + "learning_rate": 4.5567305018347e-05, + "loss": 0.5326, "step": 7550 }, { "epoch": 1.8187289359653347, - "grad_norm": 1.0390625, - "learning_rate": 0.00011418330119452268, - "loss": 0.4881, + "grad_norm": 1.9609375, + "learning_rate": 4.5526747073166687e-05, + "loss": 0.5818, "step": 7555 }, { "epoch": 1.8199325950890708, - "grad_norm": 0.96484375, - "learning_rate": 0.00011408162392730925, - "loss": 0.4399, + "grad_norm": 1.8125, + "learning_rate": 4.5486206686095286e-05, + "loss": 0.5309, "step": 7560 }, { "epoch": 1.821136254212807, - "grad_norm": 0.98046875, - "learning_rate": 0.00011397999085914326, - "loss": 0.4557, + "grad_norm": 1.71875, + "learning_rate": 4.544568392190597e-05, + "loss": 0.5461, "step": 7565 }, { "epoch": 1.8223399133365432, - "grad_norm": 0.92578125, - "learning_rate": 0.0001138784021524082, - "loss": 0.4381, + "grad_norm": 1.7265625, + "learning_rate": 4.5405178845343696e-05, + "loss": 0.5276, "step": 7570 }, { "epoch": 1.823543572460279, - "grad_norm": 1.015625, - "learning_rate": 0.00011377685796941681, - "loss": 0.4533, + "grad_norm": 2.015625, + "learning_rate": 4.536469152112521e-05, + "loss": 0.5509, "step": 7575 }, { "epoch": 1.8247472315840154, - "grad_norm": 1.0078125, - "learning_rate": 0.00011367535847241065, - "loss": 0.4457, + "grad_norm": 1.734375, + "learning_rate": 4.532422201393886e-05, + "loss": 0.5464, "step": 7580 }, { "epoch": 1.8259508907077515, - "grad_norm": 0.92578125, - "learning_rate": 0.00011357390382355994, - "loss": 0.4434, + "grad_norm": 1.734375, + "learning_rate": 4.528377038844456e-05, + "loss": 0.5373, "step": 7585 }, { "epoch": 1.8271545498314876, - "grad_norm": 1.078125, - "learning_rate": 0.00011347249418496313, - "loss": 0.4519, + "grad_norm": 1.9453125, + "learning_rate": 4.5243336709273616e-05, + "loss": 0.5423, "step": 7590 }, { "epoch": 1.828358208955224, - "grad_norm": 0.87890625, - "learning_rate": 0.00011337112971864687, - "loss": 0.4711, + "grad_norm": 1.6015625, + "learning_rate": 4.520292104102871e-05, + "loss": 0.5627, "step": 7595 }, { "epoch": 1.82956186807896, - "grad_norm": 1.0078125, - "learning_rate": 0.00011326981058656562, - "loss": 0.4264, + "grad_norm": 1.8203125, + "learning_rate": 4.5162523448283706e-05, + "loss": 0.5206, "step": 7600 }, { "epoch": 1.8307655272026961, - "grad_norm": 1.1171875, - "learning_rate": 0.00011316853695060129, - "loss": 0.444, + "grad_norm": 1.765625, + "learning_rate": 4.5122143995583584e-05, + "loss": 0.5358, "step": 7605 }, { "epoch": 1.8319691863264325, - "grad_norm": 0.890625, - "learning_rate": 0.0001130673089725633, - "loss": 0.4593, + "grad_norm": 1.6953125, + "learning_rate": 4.5081782747444394e-05, + "loss": 0.56, "step": 7610 }, { "epoch": 1.8331728454501686, - "grad_norm": 0.86328125, - "learning_rate": 0.00011296612681418791, - "loss": 0.4368, + "grad_norm": 1.6953125, + "learning_rate": 4.504143976835303e-05, + "loss": 0.5386, "step": 7615 }, { "epoch": 1.8343765045739047, - "grad_norm": 1.0390625, - "learning_rate": 0.00011286499063713833, - "loss": 0.4577, + "grad_norm": 1.9296875, + "learning_rate": 4.5001115122767254e-05, + "loss": 0.5516, "step": 7620 }, { "epoch": 1.835580163697641, - "grad_norm": 1.0625, - "learning_rate": 0.00011276390060300422, - "loss": 0.4293, + "grad_norm": 1.8203125, + "learning_rate": 4.49608088751155e-05, + "loss": 0.5241, "step": 7625 }, { "epoch": 1.8367838228213769, - "grad_norm": 1.015625, - "learning_rate": 0.00011266285687330156, - "loss": 0.4642, + "grad_norm": 1.875, + "learning_rate": 4.4920521089796824e-05, + "loss": 0.5599, "step": 7630 }, { "epoch": 1.8379874819451132, - "grad_norm": 1.0390625, - "learning_rate": 0.00011256185960947234, - "loss": 0.4338, + "grad_norm": 1.875, + "learning_rate": 4.488025183118079e-05, + "loss": 0.5273, "step": 7635 }, { "epoch": 1.8391911410688493, - "grad_norm": 0.92578125, - "learning_rate": 0.00011246090897288423, - "loss": 0.4696, + "grad_norm": 1.734375, + "learning_rate": 4.484000116360732e-05, + "loss": 0.569, "step": 7640 }, { "epoch": 1.8403948001925854, - "grad_norm": 1.0, - "learning_rate": 0.00011236000512483051, - "loss": 0.4537, + "grad_norm": 1.71875, + "learning_rate": 4.479976915138668e-05, + "loss": 0.5478, "step": 7645 }, { "epoch": 1.8415984593163217, - "grad_norm": 0.9453125, - "learning_rate": 0.00011225914822652971, - "loss": 0.4719, + "grad_norm": 1.7109375, + "learning_rate": 4.4759555858799334e-05, + "loss": 0.5625, "step": 7650 }, { "epoch": 1.8428021184400578, - "grad_norm": 0.9765625, - "learning_rate": 0.00011215833843912521, - "loss": 0.4471, + "grad_norm": 1.75, + "learning_rate": 4.471936135009578e-05, + "loss": 0.5344, "step": 7655 }, { "epoch": 1.844005777563794, - "grad_norm": 1.0625, - "learning_rate": 0.00011205757592368529, - "loss": 0.4484, + "grad_norm": 1.890625, + "learning_rate": 4.467918568949656e-05, + "loss": 0.5465, "step": 7660 }, { "epoch": 1.8452094366875302, - "grad_norm": 1.0078125, - "learning_rate": 0.00011195686084120253, - "loss": 0.4721, + "grad_norm": 1.8359375, + "learning_rate": 4.463902894119206e-05, + "loss": 0.5683, "step": 7665 }, { "epoch": 1.8464130958112661, - "grad_norm": 1.0, - "learning_rate": 0.00011185619335259387, - "loss": 0.4611, + "grad_norm": 1.7890625, + "learning_rate": 4.459889116934248e-05, + "loss": 0.5549, "step": 7670 }, { "epoch": 1.8476167549350024, - "grad_norm": 0.90234375, - "learning_rate": 0.00011175557361870016, - "loss": 0.46, + "grad_norm": 1.6640625, + "learning_rate": 4.45587724380777e-05, + "loss": 0.5563, "step": 7675 }, { "epoch": 1.8488204140587385, - "grad_norm": 0.87890625, - "learning_rate": 0.00011165500180028593, - "loss": 0.4198, + "grad_norm": 1.6953125, + "learning_rate": 4.4518672811497165e-05, + "loss": 0.513, "step": 7680 }, { "epoch": 1.8500240731824746, - "grad_norm": 0.96875, - "learning_rate": 0.00011155447805803916, - "loss": 0.4589, + "grad_norm": 1.7578125, + "learning_rate": 4.4478592353669794e-05, + "loss": 0.5555, "step": 7685 }, { "epoch": 1.851227732306211, - "grad_norm": 1.0390625, - "learning_rate": 0.00011145400255257098, - "loss": 0.4717, + "grad_norm": 1.8984375, + "learning_rate": 4.4438531128633875e-05, + "loss": 0.5707, "step": 7690 }, { "epoch": 1.852431391429947, - "grad_norm": 0.8984375, - "learning_rate": 0.00011135357544441552, - "loss": 0.4211, + "grad_norm": 1.6875, + "learning_rate": 4.439848920039698e-05, + "loss": 0.5214, "step": 7695 }, { "epoch": 1.8536350505536832, - "grad_norm": 1.0234375, - "learning_rate": 0.00011125319689402963, - "loss": 0.4559, + "grad_norm": 1.953125, + "learning_rate": 4.435846663293587e-05, + "loss": 0.5544, "step": 7700 }, { "epoch": 1.8548387096774195, - "grad_norm": 0.98828125, - "learning_rate": 0.0001111528670617924, - "loss": 0.4388, + "grad_norm": 1.859375, + "learning_rate": 4.4318463490196315e-05, + "loss": 0.5307, "step": 7705 }, { "epoch": 1.8560423688011554, - "grad_norm": 0.89453125, - "learning_rate": 0.00011105258610800524, - "loss": 0.4366, + "grad_norm": 1.6171875, + "learning_rate": 4.4278479836093106e-05, + "loss": 0.5312, "step": 7710 }, { "epoch": 1.8572460279248917, - "grad_norm": 1.03125, - "learning_rate": 0.00011095235419289132, - "loss": 0.4626, + "grad_norm": 1.796875, + "learning_rate": 4.423851573450984e-05, + "loss": 0.5578, "step": 7715 }, { "epoch": 1.8584496870486278, - "grad_norm": 1.0234375, - "learning_rate": 0.00011085217147659563, - "loss": 0.46, + "grad_norm": 1.765625, + "learning_rate": 4.419857124929894e-05, + "loss": 0.5453, "step": 7720 }, { "epoch": 1.8596533461723639, - "grad_norm": 0.95703125, - "learning_rate": 0.00011075203811918447, - "loss": 0.4343, + "grad_norm": 1.78125, + "learning_rate": 4.415864644428145e-05, + "loss": 0.5286, "step": 7725 }, { "epoch": 1.8608570052961002, - "grad_norm": 1.046875, - "learning_rate": 0.00011065195428064525, - "loss": 0.4431, + "grad_norm": 1.8359375, + "learning_rate": 4.411874138324696e-05, + "loss": 0.5402, "step": 7730 }, { "epoch": 1.8620606644198363, - "grad_norm": 0.984375, - "learning_rate": 0.0001105519201208863, - "loss": 0.4651, + "grad_norm": 1.71875, + "learning_rate": 4.407885612995354e-05, + "loss": 0.5602, "step": 7735 }, { "epoch": 1.8632643235435724, - "grad_norm": 0.87890625, - "learning_rate": 0.00011045193579973652, - "loss": 0.4469, + "grad_norm": 1.59375, + "learning_rate": 4.4038990748127575e-05, + "loss": 0.5414, "step": 7740 }, { "epoch": 1.8644679826673087, - "grad_norm": 0.90234375, - "learning_rate": 0.00011035200147694524, - "loss": 0.4162, + "grad_norm": 1.671875, + "learning_rate": 4.399914530146374e-05, + "loss": 0.5071, "step": 7745 }, { "epoch": 1.8656716417910446, - "grad_norm": 0.96484375, - "learning_rate": 0.00011025211731218196, - "loss": 0.464, + "grad_norm": 1.796875, + "learning_rate": 4.395931985362487e-05, + "loss": 0.5557, "step": 7750 }, { "epoch": 1.866875300914781, - "grad_norm": 1.03125, - "learning_rate": 0.00011015228346503588, - "loss": 0.4367, + "grad_norm": 1.9375, + "learning_rate": 4.3919514468241795e-05, + "loss": 0.5293, "step": 7755 }, { "epoch": 1.8680789600385173, - "grad_norm": 1.1328125, - "learning_rate": 0.00011005250009501595, - "loss": 0.4468, + "grad_norm": 1.9453125, + "learning_rate": 4.3879729208913316e-05, + "loss": 0.545, "step": 7760 }, { "epoch": 1.8692826191622531, - "grad_norm": 0.95703125, - "learning_rate": 0.0001099527673615504, - "loss": 0.4421, + "grad_norm": 1.7734375, + "learning_rate": 4.3839964139206086e-05, + "loss": 0.5315, "step": 7765 }, { "epoch": 1.8704862782859895, - "grad_norm": 0.98828125, - "learning_rate": 0.00010985308542398652, - "loss": 0.4423, + "grad_norm": 1.828125, + "learning_rate": 4.380021932265448e-05, + "loss": 0.5332, "step": 7770 }, { "epoch": 1.8716899374097256, - "grad_norm": 0.9453125, - "learning_rate": 0.0001097534544415906, - "loss": 0.4434, + "grad_norm": 1.71875, + "learning_rate": 4.376049482276055e-05, + "loss": 0.5343, "step": 7775 }, { "epoch": 1.8728935965334617, - "grad_norm": 1.015625, - "learning_rate": 0.00010965387457354735, - "loss": 0.4315, + "grad_norm": 1.90625, + "learning_rate": 4.3720790702993845e-05, + "loss": 0.5271, "step": 7780 }, { "epoch": 1.874097255657198, - "grad_norm": 0.9609375, - "learning_rate": 0.00010955434597895985, - "loss": 0.4426, + "grad_norm": 1.7421875, + "learning_rate": 4.368110702679138e-05, + "loss": 0.5424, "step": 7785 }, { "epoch": 1.875300914780934, - "grad_norm": 1.03125, - "learning_rate": 0.0001094548688168493, - "loss": 0.4348, + "grad_norm": 1.71875, + "learning_rate": 4.364144385755748e-05, + "loss": 0.5271, "step": 7790 }, { "epoch": 1.8765045739046702, - "grad_norm": 0.9765625, - "learning_rate": 0.00010935544324615469, - "loss": 0.4881, + "grad_norm": 1.796875, + "learning_rate": 4.3601801258663726e-05, + "loss": 0.5775, "step": 7795 }, { "epoch": 1.8777082330284065, - "grad_norm": 1.046875, - "learning_rate": 0.00010925606942573264, - "loss": 0.4528, + "grad_norm": 1.9375, + "learning_rate": 4.3562179293448856e-05, + "loss": 0.5464, "step": 7800 }, { "epoch": 1.8789118921521424, - "grad_norm": 1.015625, - "learning_rate": 0.00010915674751435698, - "loss": 0.477, + "grad_norm": 1.71875, + "learning_rate": 4.352257802521857e-05, + "loss": 0.5693, "step": 7805 }, { "epoch": 1.8801155512758787, - "grad_norm": 0.97265625, - "learning_rate": 0.00010905747767071873, - "loss": 0.4532, + "grad_norm": 1.71875, + "learning_rate": 4.348299751724559e-05, + "loss": 0.5444, "step": 7810 }, { "epoch": 1.8813192103996148, - "grad_norm": 0.890625, - "learning_rate": 0.0001089582600534256, - "loss": 0.4256, + "grad_norm": 1.796875, + "learning_rate": 4.3443437832769386e-05, + "loss": 0.5224, "step": 7815 }, { "epoch": 1.882522869523351, - "grad_norm": 1.0625, - "learning_rate": 0.00010885909482100192, - "loss": 0.4766, + "grad_norm": 1.7578125, + "learning_rate": 4.34038990349962e-05, + "loss": 0.5714, "step": 7820 }, { "epoch": 1.8837265286470872, - "grad_norm": 0.95703125, - "learning_rate": 0.0001087599821318883, - "loss": 0.4574, + "grad_norm": 1.8046875, + "learning_rate": 4.33643811870989e-05, + "loss": 0.5562, "step": 7825 }, { "epoch": 1.8849301877708233, - "grad_norm": 0.99609375, - "learning_rate": 0.0001086609221444414, - "loss": 0.471, + "grad_norm": 1.6796875, + "learning_rate": 4.3324884352216864e-05, + "loss": 0.5635, "step": 7830 }, { "epoch": 1.8861338468945594, - "grad_norm": 1.0546875, - "learning_rate": 0.00010856191501693376, - "loss": 0.4534, + "grad_norm": 1.9453125, + "learning_rate": 4.3285408593455934e-05, + "loss": 0.5494, "step": 7835 }, { "epoch": 1.8873375060182958, - "grad_norm": 0.96875, - "learning_rate": 0.00010846296090755331, - "loss": 0.4612, + "grad_norm": 1.7421875, + "learning_rate": 4.324595397388824e-05, + "loss": 0.5608, "step": 7840 }, { "epoch": 1.8885411651420316, - "grad_norm": 0.984375, - "learning_rate": 0.00010836405997440341, - "loss": 0.4521, + "grad_norm": 1.796875, + "learning_rate": 4.320652055655215e-05, + "loss": 0.5439, "step": 7845 }, { "epoch": 1.889744824265768, - "grad_norm": 1.1015625, - "learning_rate": 0.00010826521237550231, - "loss": 0.4192, + "grad_norm": 1.9609375, + "learning_rate": 4.316710840445214e-05, + "loss": 0.5137, "step": 7850 }, { "epoch": 1.890948483389504, - "grad_norm": 1.0078125, - "learning_rate": 0.0001081664182687832, - "loss": 0.4427, + "grad_norm": 1.8203125, + "learning_rate": 4.312771758055874e-05, + "loss": 0.5378, "step": 7855 }, { "epoch": 1.8921521425132402, - "grad_norm": 1.0078125, - "learning_rate": 0.00010806767781209375, - "loss": 0.4234, + "grad_norm": 1.8359375, + "learning_rate": 4.308834814780839e-05, + "loss": 0.5135, "step": 7860 }, { "epoch": 1.8933558016369765, - "grad_norm": 0.93359375, - "learning_rate": 0.00010796899116319585, - "loss": 0.4496, + "grad_norm": 1.8046875, + "learning_rate": 4.304900016910336e-05, + "loss": 0.5414, "step": 7865 }, { "epoch": 1.8945594607607126, - "grad_norm": 0.96484375, - "learning_rate": 0.00010787035847976552, - "loss": 0.4245, + "grad_norm": 1.796875, + "learning_rate": 4.3009673707311626e-05, + "loss": 0.5139, "step": 7870 }, { "epoch": 1.8957631198844487, - "grad_norm": 0.9609375, - "learning_rate": 0.00010777177991939242, - "loss": 0.4627, + "grad_norm": 1.671875, + "learning_rate": 4.2970368825266766e-05, + "loss": 0.561, "step": 7875 }, { "epoch": 1.896966779008185, - "grad_norm": 0.8515625, - "learning_rate": 0.0001076732556395799, - "loss": 0.3973, + "grad_norm": 1.7265625, + "learning_rate": 4.2931085585767944e-05, + "loss": 0.4961, "step": 7880 }, { "epoch": 1.8981704381319209, - "grad_norm": 0.9765625, - "learning_rate": 0.00010757478579774447, - "loss": 0.4402, + "grad_norm": 1.8671875, + "learning_rate": 4.289182405157969e-05, + "loss": 0.538, "step": 7885 }, { "epoch": 1.8993740972556572, - "grad_norm": 0.90625, - "learning_rate": 0.00010747637055121569, - "loss": 0.4569, + "grad_norm": 1.65625, + "learning_rate": 4.2852584285431875e-05, + "loss": 0.5569, "step": 7890 }, { "epoch": 1.9005777563793933, - "grad_norm": 0.91796875, - "learning_rate": 0.00010737801005723593, - "loss": 0.4578, + "grad_norm": 1.65625, + "learning_rate": 4.281336635001959e-05, + "loss": 0.5541, "step": 7895 }, { "epoch": 1.9017814155031294, - "grad_norm": 0.8984375, - "learning_rate": 0.00010727970447295998, - "loss": 0.411, + "grad_norm": 1.6796875, + "learning_rate": 4.277417030800302e-05, + "loss": 0.5085, "step": 7900 }, { "epoch": 1.9029850746268657, - "grad_norm": 1.0234375, - "learning_rate": 0.00010718145395545498, - "loss": 0.4578, + "grad_norm": 1.8125, + "learning_rate": 4.27349962220074e-05, + "loss": 0.5528, "step": 7905 }, { "epoch": 1.9041887337506018, - "grad_norm": 1.0234375, - "learning_rate": 0.00010708325866170012, - "loss": 0.4324, + "grad_norm": 1.6875, + "learning_rate": 4.26958441546229e-05, + "loss": 0.5311, "step": 7910 }, { "epoch": 1.905392392874338, - "grad_norm": 0.9296875, - "learning_rate": 0.00010698511874858627, - "loss": 0.4317, + "grad_norm": 1.6875, + "learning_rate": 4.265671416840447e-05, + "loss": 0.5243, "step": 7915 }, { "epoch": 1.9065960519980742, - "grad_norm": 0.99609375, - "learning_rate": 0.00010688703437291589, - "loss": 0.4577, + "grad_norm": 1.75, + "learning_rate": 4.26176063258718e-05, + "loss": 0.5511, "step": 7920 }, { "epoch": 1.9077997111218103, - "grad_norm": 1.0234375, - "learning_rate": 0.0001067890056914026, - "loss": 0.4877, + "grad_norm": 1.78125, + "learning_rate": 4.2578520689509194e-05, + "loss": 0.5811, "step": 7925 }, { "epoch": 1.9090033702455464, - "grad_norm": 0.99609375, - "learning_rate": 0.00010669103286067112, - "loss": 0.449, + "grad_norm": 1.828125, + "learning_rate": 4.253945732176547e-05, + "loss": 0.5476, "step": 7930 }, { "epoch": 1.9102070293692828, - "grad_norm": 1.0078125, - "learning_rate": 0.00010659311603725699, - "loss": 0.443, + "grad_norm": 1.8828125, + "learning_rate": 4.25004162850539e-05, + "loss": 0.5432, "step": 7935 }, { "epoch": 1.9114106884930187, - "grad_norm": 0.89453125, - "learning_rate": 0.0001064952553776061, - "loss": 0.4254, + "grad_norm": 1.6796875, + "learning_rate": 4.246139764175203e-05, + "loss": 0.5195, "step": 7940 }, { "epoch": 1.912614347616755, - "grad_norm": 0.98046875, - "learning_rate": 0.00010639745103807476, - "loss": 0.457, + "grad_norm": 1.8515625, + "learning_rate": 4.242240145420167e-05, + "loss": 0.5533, "step": 7945 }, { "epoch": 1.913818006740491, - "grad_norm": 0.96875, - "learning_rate": 0.00010629970317492917, - "loss": 0.4245, + "grad_norm": 1.7109375, + "learning_rate": 4.2383427784708695e-05, + "loss": 0.5197, "step": 7950 }, { "epoch": 1.9150216658642272, - "grad_norm": 1.0546875, - "learning_rate": 0.00010620201194434547, - "loss": 0.4502, + "grad_norm": 1.9140625, + "learning_rate": 4.234447669554309e-05, + "loss": 0.5451, "step": 7955 }, { "epoch": 1.9162253249879635, - "grad_norm": 0.9609375, - "learning_rate": 0.00010610437750240909, - "loss": 0.4602, + "grad_norm": 1.8984375, + "learning_rate": 4.2305548248938675e-05, + "loss": 0.5578, "step": 7960 }, { "epoch": 1.9174289841116996, - "grad_norm": 0.91796875, - "learning_rate": 0.00010600680000511486, - "loss": 0.441, + "grad_norm": 1.7734375, + "learning_rate": 4.226664250709313e-05, + "loss": 0.5325, "step": 7965 }, { "epoch": 1.9186326432354357, - "grad_norm": 0.85546875, - "learning_rate": 0.00010590927960836667, - "loss": 0.4052, + "grad_norm": 1.5625, + "learning_rate": 4.222775953216787e-05, + "loss": 0.4984, "step": 7970 }, { "epoch": 1.919836302359172, - "grad_norm": 0.89453125, - "learning_rate": 0.00010581181646797702, - "loss": 0.4291, + "grad_norm": 1.546875, + "learning_rate": 4.218889938628789e-05, + "loss": 0.5191, "step": 7975 }, { "epoch": 1.921039961482908, - "grad_norm": 1.078125, - "learning_rate": 0.00010571441073966709, - "loss": 0.4371, + "grad_norm": 1.9375, + "learning_rate": 4.2150062131541775e-05, + "loss": 0.5357, "step": 7980 }, { "epoch": 1.9222436206066442, - "grad_norm": 0.9609375, - "learning_rate": 0.00010561706257906627, - "loss": 0.4757, + "grad_norm": 1.6875, + "learning_rate": 4.21112478299815e-05, + "loss": 0.5722, "step": 7985 }, { "epoch": 1.9234472797303803, - "grad_norm": 0.98046875, - "learning_rate": 0.00010551977214171191, - "loss": 0.4418, + "grad_norm": 1.7265625, + "learning_rate": 4.207245654362236e-05, + "loss": 0.5424, "step": 7990 }, { "epoch": 1.9246509388541164, - "grad_norm": 0.98046875, - "learning_rate": 0.00010542253958304926, - "loss": 0.4536, + "grad_norm": 1.7421875, + "learning_rate": 4.203368833444289e-05, + "loss": 0.5493, "step": 7995 }, { "epoch": 1.9258545979778527, - "grad_norm": 0.94921875, - "learning_rate": 0.00010532536505843094, - "loss": 0.4284, + "grad_norm": 1.734375, + "learning_rate": 4.199494326438473e-05, + "loss": 0.5227, "step": 8000 }, { "epoch": 1.9258545979778527, - "eval_loss": 0.3760577440261841, - "eval_runtime": 2.3365, - "eval_samples_per_second": 85.598, - "eval_steps_per_second": 85.598, + "eval_loss": 0.45808783173561096, + "eval_runtime": 2.3907, + "eval_samples_per_second": 83.659, + "eval_steps_per_second": 83.659, "step": 8000 }, { "epoch": 1.9270582571015888, - "grad_norm": 1.0078125, - "learning_rate": 0.00010522824872311702, - "loss": 0.4599, + "grad_norm": 1.7890625, + "learning_rate": 4.195622139535261e-05, + "loss": 0.5611, "step": 8005 }, { "epoch": 1.928261916225325, - "grad_norm": 0.9375, - "learning_rate": 0.00010513119073227441, - "loss": 0.4221, + "grad_norm": 1.671875, + "learning_rate": 4.1917522789214106e-05, + "loss": 0.5128, "step": 8010 }, { "epoch": 1.9294655753490613, - "grad_norm": 1.015625, - "learning_rate": 0.000105034191240977, - "loss": 0.4508, + "grad_norm": 1.859375, + "learning_rate": 4.1878847507799706e-05, + "loss": 0.5488, "step": 8015 }, { "epoch": 1.9306692344727971, - "grad_norm": 1.03125, - "learning_rate": 0.00010493725040420506, - "loss": 0.46, + "grad_norm": 1.9296875, + "learning_rate": 4.1840195612902584e-05, + "loss": 0.5555, "step": 8020 }, { "epoch": 1.9318728935965335, - "grad_norm": 0.91796875, - "learning_rate": 0.00010484036837684515, - "loss": 0.4756, + "grad_norm": 1.8203125, + "learning_rate": 4.180156716627854e-05, + "loss": 0.5714, "step": 8025 }, { "epoch": 1.9330765527202696, - "grad_norm": 0.98828125, - "learning_rate": 0.00010474354531368998, - "loss": 0.4805, + "grad_norm": 1.8984375, + "learning_rate": 4.176296222964594e-05, + "loss": 0.5726, "step": 8030 }, { "epoch": 1.9342802118440057, - "grad_norm": 0.97265625, - "learning_rate": 0.00010464678136943798, - "loss": 0.4063, + "grad_norm": 1.796875, + "learning_rate": 4.1724380864685604e-05, + "loss": 0.498, "step": 8035 }, { "epoch": 1.935483870967742, - "grad_norm": 0.99609375, - "learning_rate": 0.00010455007669869309, - "loss": 0.4752, + "grad_norm": 1.96875, + "learning_rate": 4.1685823133040615e-05, + "loss": 0.5735, "step": 8040 }, { "epoch": 1.936687530091478, - "grad_norm": 1.03125, - "learning_rate": 0.00010445343145596464, - "loss": 0.4321, + "grad_norm": 1.8671875, + "learning_rate": 4.1647289096316394e-05, + "loss": 0.5289, "step": 8045 }, { "epoch": 1.9378911892152142, - "grad_norm": 0.91796875, - "learning_rate": 0.00010435684579566686, - "loss": 0.4418, + "grad_norm": 1.8125, + "learning_rate": 4.1608778816080405e-05, + "loss": 0.5413, "step": 8050 }, { "epoch": 1.9390948483389505, - "grad_norm": 0.95703125, - "learning_rate": 0.000104260319872119, - "loss": 0.4341, + "grad_norm": 1.71875, + "learning_rate": 4.1570292353862246e-05, + "loss": 0.5341, "step": 8055 }, { "epoch": 1.9402985074626866, - "grad_norm": 0.93359375, - "learning_rate": 0.00010416385383954461, - "loss": 0.445, + "grad_norm": 1.703125, + "learning_rate": 4.153182977115338e-05, + "loss": 0.5431, "step": 8060 }, { "epoch": 1.9415021665864227, - "grad_norm": 0.9765625, - "learning_rate": 0.00010406744785207181, - "loss": 0.4388, + "grad_norm": 1.7421875, + "learning_rate": 4.1493391129407193e-05, + "loss": 0.5291, "step": 8065 }, { "epoch": 1.942705825710159, - "grad_norm": 0.93359375, - "learning_rate": 0.00010397110206373257, - "loss": 0.4201, + "grad_norm": 1.7890625, + "learning_rate": 4.145497649003876e-05, + "loss": 0.5176, "step": 8070 }, { "epoch": 1.943909484833895, - "grad_norm": 0.953125, - "learning_rate": 0.00010387481662846276, - "loss": 0.4331, + "grad_norm": 1.8515625, + "learning_rate": 4.1416585914424804e-05, + "loss": 0.5295, "step": 8075 }, { "epoch": 1.9451131439576312, - "grad_norm": 1.0390625, - "learning_rate": 0.00010377859170010186, - "loss": 0.4167, + "grad_norm": 1.9453125, + "learning_rate": 4.1378219463903664e-05, + "loss": 0.5114, "step": 8080 }, { "epoch": 1.9463168030813673, - "grad_norm": 0.9921875, - "learning_rate": 0.0001036824274323926, - "loss": 0.3948, + "grad_norm": 1.9375, + "learning_rate": 4.133987719977506e-05, + "loss": 0.4922, "step": 8085 }, { "epoch": 1.9475204622051034, - "grad_norm": 0.87109375, - "learning_rate": 0.00010358632397898084, - "loss": 0.4125, + "grad_norm": 1.6484375, + "learning_rate": 4.1301559183300124e-05, + "loss": 0.5135, "step": 8090 }, { "epoch": 1.9487241213288398, - "grad_norm": 0.8359375, - "learning_rate": 0.00010349028149341532, - "loss": 0.4195, + "grad_norm": 1.6484375, + "learning_rate": 4.1263265475701224e-05, + "loss": 0.5139, "step": 8095 }, { "epoch": 1.9499277804525759, - "grad_norm": 1.1484375, - "learning_rate": 0.00010339430012914721, - "loss": 0.4498, + "grad_norm": 1.8671875, + "learning_rate": 4.122499613816186e-05, + "loss": 0.5498, "step": 8100 }, { "epoch": 1.951131439576312, - "grad_norm": 0.92578125, - "learning_rate": 0.00010329838003953023, - "loss": 0.4814, + "grad_norm": 1.640625, + "learning_rate": 4.118675123182667e-05, + "loss": 0.5806, "step": 8105 }, { "epoch": 1.9523350987000483, - "grad_norm": 0.984375, - "learning_rate": 0.00010320252137781999, - "loss": 0.4273, + "grad_norm": 1.75, + "learning_rate": 4.114853081780116e-05, + "loss": 0.5269, "step": 8110 }, { "epoch": 1.9535387578237842, - "grad_norm": 0.91015625, - "learning_rate": 0.00010310672429717416, - "loss": 0.4408, + "grad_norm": 1.6640625, + "learning_rate": 4.1110334957151796e-05, + "loss": 0.5393, "step": 8115 }, { "epoch": 1.9547424169475205, - "grad_norm": 0.93359375, - "learning_rate": 0.00010301098895065184, - "loss": 0.4186, + "grad_norm": 1.640625, + "learning_rate": 4.107216371090574e-05, + "loss": 0.5191, "step": 8120 }, { "epoch": 1.9559460760712566, - "grad_norm": 1.0390625, - "learning_rate": 0.00010291531549121358, - "loss": 0.4438, + "grad_norm": 1.9140625, + "learning_rate": 4.103401714005087e-05, + "loss": 0.5444, "step": 8125 }, { "epoch": 1.9571497351949927, - "grad_norm": 1.1171875, - "learning_rate": 0.0001028197040717211, - "loss": 0.4384, + "grad_norm": 1.9609375, + "learning_rate": 4.099589530553563e-05, + "loss": 0.5345, "step": 8130 }, { "epoch": 1.958353394318729, - "grad_norm": 1.0390625, - "learning_rate": 0.00010272415484493687, - "loss": 0.4653, + "grad_norm": 1.9375, + "learning_rate": 4.095779826826893e-05, + "loss": 0.5644, "step": 8135 }, { "epoch": 1.9595570534424651, - "grad_norm": 0.91796875, - "learning_rate": 0.00010262866796352407, - "loss": 0.4359, + "grad_norm": 1.671875, + "learning_rate": 4.0919726089120075e-05, + "loss": 0.533, "step": 8140 }, { "epoch": 1.9607607125662012, - "grad_norm": 0.97265625, - "learning_rate": 0.00010253324358004631, - "loss": 0.4379, + "grad_norm": 1.8671875, + "learning_rate": 4.088167882891866e-05, + "loss": 0.5384, "step": 8145 }, { "epoch": 1.9619643716899375, - "grad_norm": 1.046875, - "learning_rate": 0.00010243788184696724, - "loss": 0.4111, + "grad_norm": 1.8203125, + "learning_rate": 4.0843656548454426e-05, + "loss": 0.4981, "step": 8150 }, { "epoch": 1.9631680308136734, - "grad_norm": 1.0390625, - "learning_rate": 0.00010234258291665051, - "loss": 0.4488, + "grad_norm": 1.734375, + "learning_rate": 4.0805659308477266e-05, + "loss": 0.5528, "step": 8155 }, { "epoch": 1.9643716899374097, - "grad_norm": 0.95703125, - "learning_rate": 0.00010224734694135932, - "loss": 0.429, + "grad_norm": 1.7265625, + "learning_rate": 4.076768716969699e-05, + "loss": 0.522, "step": 8160 }, { "epoch": 1.9655753490611458, - "grad_norm": 0.97265625, - "learning_rate": 0.0001021521740732564, - "loss": 0.4306, + "grad_norm": 1.671875, + "learning_rate": 4.072974019278337e-05, + "loss": 0.5249, "step": 8165 }, { "epoch": 1.966779008184882, - "grad_norm": 0.9140625, - "learning_rate": 0.00010205706446440356, - "loss": 0.4334, + "grad_norm": 1.7890625, + "learning_rate": 4.069181843836593e-05, + "loss": 0.529, "step": 8170 }, { "epoch": 1.9679826673086183, - "grad_norm": 0.96484375, - "learning_rate": 0.0001019620182667616, - "loss": 0.4464, + "grad_norm": 1.796875, + "learning_rate": 4.0653921967033926e-05, + "loss": 0.5466, "step": 8175 }, { "epoch": 1.9691863264323544, - "grad_norm": 0.87109375, - "learning_rate": 0.00010186703563218998, - "loss": 0.4644, + "grad_norm": 1.6328125, + "learning_rate": 4.0616050839336205e-05, + "loss": 0.5613, "step": 8180 }, { "epoch": 1.9703899855560905, - "grad_norm": 1.046875, - "learning_rate": 0.00010177211671244654, - "loss": 0.447, + "grad_norm": 1.84375, + "learning_rate": 4.0578205115781094e-05, + "loss": 0.5489, "step": 8185 }, { "epoch": 1.9715936446798268, - "grad_norm": 1.046875, - "learning_rate": 0.00010167726165918744, - "loss": 0.4393, + "grad_norm": 1.8671875, + "learning_rate": 4.054038485683638e-05, + "loss": 0.539, "step": 8190 }, { "epoch": 1.9727973038035629, - "grad_norm": 0.8984375, - "learning_rate": 0.00010158247062396676, - "loss": 0.4297, + "grad_norm": 1.6796875, + "learning_rate": 4.050259012292916e-05, + "loss": 0.5292, "step": 8195 }, { "epoch": 1.974000962927299, - "grad_norm": 0.97265625, - "learning_rate": 0.0001014877437582362, - "loss": 0.4323, + "grad_norm": 1.8125, + "learning_rate": 4.046482097444569e-05, + "loss": 0.5266, "step": 8200 }, { "epoch": 1.9752046220510353, - "grad_norm": 0.984375, - "learning_rate": 0.00010139308121334507, - "loss": 0.4682, + "grad_norm": 1.6953125, + "learning_rate": 4.042707747173141e-05, + "loss": 0.568, "step": 8205 }, { "epoch": 1.9764082811747712, - "grad_norm": 1.0546875, - "learning_rate": 0.00010129848314053981, - "loss": 0.4678, + "grad_norm": 1.921875, + "learning_rate": 4.038935967509075e-05, + "loss": 0.567, "step": 8210 }, { "epoch": 1.9776119402985075, - "grad_norm": 0.8984375, - "learning_rate": 0.0001012039496909639, - "loss": 0.4142, + "grad_norm": 1.65625, + "learning_rate": 4.0351667644787085e-05, + "loss": 0.5137, "step": 8215 }, { "epoch": 1.9788155994222436, - "grad_norm": 0.9765625, - "learning_rate": 0.00010110948101565761, - "loss": 0.4435, + "grad_norm": 1.734375, + "learning_rate": 4.031400144104264e-05, + "loss": 0.5382, "step": 8220 }, { "epoch": 1.9800192585459797, - "grad_norm": 0.87109375, - "learning_rate": 0.00010101507726555761, - "loss": 0.4143, + "grad_norm": 1.640625, + "learning_rate": 4.027636112403832e-05, + "loss": 0.5133, "step": 8225 }, { "epoch": 1.981222917669716, - "grad_norm": 0.953125, - "learning_rate": 0.00010092073859149691, - "loss": 0.4354, + "grad_norm": 1.7265625, + "learning_rate": 4.023874675391373e-05, + "loss": 0.5305, "step": 8230 }, { "epoch": 1.9824265767934521, - "grad_norm": 0.953125, - "learning_rate": 0.00010082646514420448, - "loss": 0.4269, + "grad_norm": 1.7734375, + "learning_rate": 4.0201158390766936e-05, + "loss": 0.5301, "step": 8235 }, { "epoch": 1.9836302359171882, - "grad_norm": 0.92578125, - "learning_rate": 0.00010073225707430519, - "loss": 0.4367, + "grad_norm": 1.765625, + "learning_rate": 4.0163596094654575e-05, + "loss": 0.532, "step": 8240 }, { "epoch": 1.9848338950409246, - "grad_norm": 0.9453125, - "learning_rate": 0.00010063811453231937, - "loss": 0.4275, + "grad_norm": 1.734375, + "learning_rate": 4.0126059925591536e-05, + "loss": 0.5295, "step": 8245 }, { "epoch": 1.9860375541646604, - "grad_norm": 0.984375, - "learning_rate": 0.00010054403766866263, - "loss": 0.4633, + "grad_norm": 1.796875, + "learning_rate": 4.0088549943550984e-05, + "loss": 0.5648, "step": 8250 }, { "epoch": 1.9872412132883968, - "grad_norm": 0.90234375, - "learning_rate": 0.00010045002663364573, - "loss": 0.4285, + "grad_norm": 1.65625, + "learning_rate": 4.005106620846428e-05, + "loss": 0.5304, "step": 8255 }, { "epoch": 1.9884448724121329, - "grad_norm": 0.94140625, - "learning_rate": 0.00010035608157747416, - "loss": 0.4353, + "grad_norm": 1.7109375, + "learning_rate": 4.00136087802208e-05, + "loss": 0.5317, "step": 8260 }, { "epoch": 1.989648531535869, - "grad_norm": 0.921875, - "learning_rate": 0.00010026220265024805, - "loss": 0.4334, + "grad_norm": 1.65625, + "learning_rate": 3.997617771866792e-05, + "loss": 0.5327, "step": 8265 }, { "epoch": 1.9908521906596053, - "grad_norm": 0.9609375, - "learning_rate": 0.0001001683900019619, - "loss": 0.4502, + "grad_norm": 1.6796875, + "learning_rate": 3.99387730836109e-05, + "loss": 0.5569, "step": 8270 }, { "epoch": 1.9920558497833414, - "grad_norm": 0.96484375, - "learning_rate": 0.00010007464378250427, - "loss": 0.4344, + "grad_norm": 1.7421875, + "learning_rate": 3.990139493481274e-05, + "loss": 0.5294, "step": 8275 }, { "epoch": 1.9932595089070775, - "grad_norm": 1.0, - "learning_rate": 9.99809641416575e-05, - "loss": 0.4446, + "grad_norm": 1.796875, + "learning_rate": 3.986404333199413e-05, + "loss": 0.5422, "step": 8280 }, { "epoch": 1.9944631680308138, - "grad_norm": 0.98046875, - "learning_rate": 9.988735122909773e-05, - "loss": 0.4562, + "grad_norm": 1.7421875, + "learning_rate": 3.982671833483337e-05, + "loss": 0.5508, "step": 8285 }, { "epoch": 1.9956668271545497, - "grad_norm": 0.984375, - "learning_rate": 9.979380519439437e-05, - "loss": 0.4453, + "grad_norm": 1.796875, + "learning_rate": 3.9789420002966245e-05, + "loss": 0.5432, "step": 8290 }, { "epoch": 1.996870486278286, - "grad_norm": 0.91015625, - "learning_rate": 9.970032618700996e-05, - "loss": 0.4357, + "grad_norm": 1.7421875, + "learning_rate": 3.97521483959859e-05, + "loss": 0.5359, "step": 8295 }, { "epoch": 1.998074145402022, - "grad_norm": 0.95703125, - "learning_rate": 9.960691435630003e-05, - "loss": 0.4065, + "grad_norm": 1.7421875, + "learning_rate": 3.9714903573442826e-05, + "loss": 0.5057, "step": 8300 }, { "epoch": 1.9992778045257582, - "grad_norm": 0.95703125, - "learning_rate": 9.951356985151279e-05, - "loss": 0.4182, + "grad_norm": 1.78125, + "learning_rate": 3.967768559484472e-05, + "loss": 0.5162, "step": 8305 }, { "epoch": 1.9995185363505055, - "eval_loss": 0.37132248282432556, - "eval_runtime": 2.3263, - "eval_samples_per_second": 85.972, - "eval_steps_per_second": 85.972, + "eval_loss": 0.45429831743240356, + "eval_runtime": 2.3729, + "eval_samples_per_second": 84.285, + "eval_steps_per_second": 84.285, "step": 8306 }, { "epoch": 2.0004814636494945, - "grad_norm": 0.9140625, - "learning_rate": 9.942029282178871e-05, - "loss": 0.4205, + "grad_norm": 1.7734375, + "learning_rate": 3.964049451965633e-05, + "loss": 0.5306, "step": 8310 }, { "epoch": 2.0016851227732304, - "grad_norm": 0.87890625, - "learning_rate": 9.932708341616069e-05, - "loss": 0.3759, + "grad_norm": 1.59375, + "learning_rate": 3.9603330407299495e-05, + "loss": 0.4956, "step": 8315 }, { "epoch": 2.0028887818969667, - "grad_norm": 0.875, - "learning_rate": 9.92339417835534e-05, - "loss": 0.4215, + "grad_norm": 1.671875, + "learning_rate": 3.956619331715292e-05, + "loss": 0.5456, "step": 8320 }, { "epoch": 2.004092441020703, - "grad_norm": 0.90625, - "learning_rate": 9.914086807278328e-05, - "loss": 0.393, + "grad_norm": 1.734375, + "learning_rate": 3.952908330855216e-05, + "loss": 0.5081, "step": 8325 }, { "epoch": 2.005296100144439, - "grad_norm": 0.90625, - "learning_rate": 9.904786243255833e-05, - "loss": 0.379, + "grad_norm": 1.7265625, + "learning_rate": 3.9492000440789516e-05, + "loss": 0.5046, "step": 8330 }, { "epoch": 2.0064997592681753, - "grad_norm": 0.94140625, - "learning_rate": 9.895492501147768e-05, - "loss": 0.4035, + "grad_norm": 1.671875, + "learning_rate": 3.945494477311388e-05, + "loss": 0.5213, "step": 8335 }, { "epoch": 2.0077034183919116, - "grad_norm": 0.9765625, - "learning_rate": 9.88620559580315e-05, - "loss": 0.4276, + "grad_norm": 1.8359375, + "learning_rate": 3.941791636473072e-05, + "loss": 0.5507, "step": 8340 }, { "epoch": 2.0089070775156475, - "grad_norm": 0.85546875, - "learning_rate": 9.876925542060069e-05, - "loss": 0.3867, + "grad_norm": 1.6015625, + "learning_rate": 3.9380915274801905e-05, + "loss": 0.5045, "step": 8345 }, { "epoch": 2.0101107366393838, - "grad_norm": 0.94921875, - "learning_rate": 9.867652354745677e-05, - "loss": 0.3835, + "grad_norm": 1.71875, + "learning_rate": 3.934394156244574e-05, + "loss": 0.501, "step": 8350 }, { "epoch": 2.01131439576312, - "grad_norm": 0.9765625, - "learning_rate": 9.858386048676152e-05, - "loss": 0.4029, + "grad_norm": 1.7578125, + "learning_rate": 3.930699528673673e-05, + "loss": 0.524, "step": 8355 }, { "epoch": 2.012518054886856, - "grad_norm": 0.97265625, - "learning_rate": 9.84912663865667e-05, - "loss": 0.3842, + "grad_norm": 1.796875, + "learning_rate": 3.927007650670552e-05, + "loss": 0.5031, "step": 8360 }, { "epoch": 2.0137217140105923, - "grad_norm": 0.87109375, - "learning_rate": 9.8398741394814e-05, - "loss": 0.3881, + "grad_norm": 1.71875, + "learning_rate": 3.9233185281338905e-05, + "loss": 0.5055, "step": 8365 }, { "epoch": 2.014925373134328, - "grad_norm": 0.921875, - "learning_rate": 9.830628565933458e-05, - "loss": 0.3783, + "grad_norm": 1.7578125, + "learning_rate": 3.919632166957956e-05, + "loss": 0.4957, "step": 8370 }, { "epoch": 2.0161290322580645, - "grad_norm": 0.9375, - "learning_rate": 9.821389932784905e-05, - "loss": 0.407, + "grad_norm": 1.828125, + "learning_rate": 3.9159485730326114e-05, + "loss": 0.525, "step": 8375 }, { "epoch": 2.017332691381801, - "grad_norm": 0.90234375, - "learning_rate": 9.81215825479671e-05, - "loss": 0.3535, + "grad_norm": 1.6171875, + "learning_rate": 3.9122677522432955e-05, + "loss": 0.4746, "step": 8380 }, { "epoch": 2.0185363505055367, - "grad_norm": 1.0078125, - "learning_rate": 9.802933546718724e-05, - "loss": 0.4089, + "grad_norm": 1.921875, + "learning_rate": 3.9085897104710155e-05, + "loss": 0.536, "step": 8385 }, { "epoch": 2.019740009629273, - "grad_norm": 0.953125, - "learning_rate": 9.793715823289667e-05, - "loss": 0.3861, + "grad_norm": 1.6328125, + "learning_rate": 3.9049144535923396e-05, + "loss": 0.5009, "step": 8390 }, { "epoch": 2.0209436687530093, - "grad_norm": 0.97265625, - "learning_rate": 9.784505099237094e-05, - "loss": 0.374, + "grad_norm": 1.7265625, + "learning_rate": 3.901241987479385e-05, + "loss": 0.4905, "step": 8395 }, { "epoch": 2.0221473278767452, - "grad_norm": 0.96484375, - "learning_rate": 9.775301389277384e-05, - "loss": 0.3924, + "grad_norm": 1.828125, + "learning_rate": 3.8975723179998116e-05, + "loss": 0.5143, "step": 8400 }, { "epoch": 2.0233509870004815, - "grad_norm": 0.9375, - "learning_rate": 9.766104708115711e-05, - "loss": 0.4105, + "grad_norm": 1.765625, + "learning_rate": 3.893905451016812e-05, + "loss": 0.5292, "step": 8405 }, { "epoch": 2.0245546461242174, - "grad_norm": 0.89453125, - "learning_rate": 9.756915070446007e-05, - "loss": 0.3865, + "grad_norm": 1.6875, + "learning_rate": 3.890241392389098e-05, + "loss": 0.506, "step": 8410 }, { "epoch": 2.0257583052479537, - "grad_norm": 0.93359375, - "learning_rate": 9.747732490950962e-05, - "loss": 0.3718, + "grad_norm": 1.7578125, + "learning_rate": 3.886580147970898e-05, + "loss": 0.4899, "step": 8415 }, { "epoch": 2.02696196437169, - "grad_norm": 0.93359375, - "learning_rate": 9.73855698430198e-05, - "loss": 0.3791, + "grad_norm": 1.734375, + "learning_rate": 3.882921723611939e-05, + "loss": 0.5042, "step": 8420 }, { "epoch": 2.028165623495426, - "grad_norm": 0.921875, - "learning_rate": 9.729388565159167e-05, - "loss": 0.4223, + "grad_norm": 1.5859375, + "learning_rate": 3.879266125157446e-05, + "loss": 0.5438, "step": 8425 }, { "epoch": 2.0293692826191623, - "grad_norm": 0.91796875, - "learning_rate": 9.720227248171316e-05, - "loss": 0.3838, + "grad_norm": 1.6640625, + "learning_rate": 3.875613358448131e-05, + "loss": 0.5023, "step": 8430 }, { "epoch": 2.0305729417428986, - "grad_norm": 0.875, - "learning_rate": 9.711073047975856e-05, - "loss": 0.3981, + "grad_norm": 1.6328125, + "learning_rate": 3.871963429320177e-05, + "loss": 0.5197, "step": 8435 }, { "epoch": 2.0317766008666345, - "grad_norm": 1.0625, - "learning_rate": 9.70192597919885e-05, - "loss": 0.4147, + "grad_norm": 1.859375, + "learning_rate": 3.868316343605233e-05, + "loss": 0.5368, "step": 8440 }, { "epoch": 2.032980259990371, - "grad_norm": 0.91015625, - "learning_rate": 9.692786056454974e-05, - "loss": 0.4111, + "grad_norm": 1.75, + "learning_rate": 3.8646721071304116e-05, + "loss": 0.5308, "step": 8445 }, { "epoch": 2.0341839191141067, - "grad_norm": 1.0546875, - "learning_rate": 9.683653294347478e-05, - "loss": 0.3994, + "grad_norm": 1.8984375, + "learning_rate": 3.861030725718264e-05, + "loss": 0.5176, "step": 8450 }, { "epoch": 2.035387578237843, - "grad_norm": 0.8671875, - "learning_rate": 9.674527707468178e-05, - "loss": 0.3855, + "grad_norm": 1.6328125, + "learning_rate": 3.8573922051867866e-05, + "loss": 0.5153, "step": 8455 }, { "epoch": 2.0365912373615793, - "grad_norm": 1.03125, - "learning_rate": 9.665409310397418e-05, - "loss": 0.3852, + "grad_norm": 1.8984375, + "learning_rate": 3.853756551349402e-05, + "loss": 0.5075, "step": 8460 }, { "epoch": 2.037794896485315, - "grad_norm": 0.94140625, - "learning_rate": 9.656298117704064e-05, - "loss": 0.3917, + "grad_norm": 1.6796875, + "learning_rate": 3.850123770014953e-05, + "loss": 0.5147, "step": 8465 }, { "epoch": 2.0389985556090515, - "grad_norm": 0.96875, - "learning_rate": 9.647194143945462e-05, - "loss": 0.3993, + "grad_norm": 1.734375, + "learning_rate": 3.846493866987692e-05, + "loss": 0.5197, "step": 8470 }, { "epoch": 2.040202214732788, - "grad_norm": 1.0, - "learning_rate": 9.638097403667431e-05, - "loss": 0.3825, + "grad_norm": 1.7578125, + "learning_rate": 3.8428668480672745e-05, + "loss": 0.5012, "step": 8475 }, { "epoch": 2.0414058738565237, - "grad_norm": 0.9609375, - "learning_rate": 9.629007911404229e-05, - "loss": 0.3784, + "grad_norm": 1.8671875, + "learning_rate": 3.839242719048747e-05, + "loss": 0.4997, "step": 8480 }, { "epoch": 2.04260953298026, - "grad_norm": 0.921875, - "learning_rate": 9.619925681678533e-05, - "loss": 0.3965, + "grad_norm": 1.640625, + "learning_rate": 3.8356214857225385e-05, + "loss": 0.5199, "step": 8485 }, { "epoch": 2.043813192103996, - "grad_norm": 0.8984375, - "learning_rate": 9.610850729001423e-05, - "loss": 0.4053, + "grad_norm": 1.703125, + "learning_rate": 3.832003153874453e-05, + "loss": 0.5296, "step": 8490 }, { "epoch": 2.0450168512277322, - "grad_norm": 0.98828125, - "learning_rate": 9.601783067872345e-05, - "loss": 0.3813, + "grad_norm": 1.828125, + "learning_rate": 3.828387729285656e-05, + "loss": 0.5007, "step": 8495 }, { "epoch": 2.0462205103514686, - "grad_norm": 1.1328125, - "learning_rate": 9.592722712779095e-05, - "loss": 0.4025, + "grad_norm": 1.8828125, + "learning_rate": 3.824775217732667e-05, + "loss": 0.524, "step": 8500 }, { "epoch": 2.0462205103514686, - "eval_loss": 0.3717775046825409, - "eval_runtime": 2.331, - "eval_samples_per_second": 85.801, - "eval_steps_per_second": 85.801, + "eval_loss": 0.4528008699417114, + "eval_runtime": 2.3806, + "eval_samples_per_second": 84.013, + "eval_steps_per_second": 84.013, "step": 8500 }, { "epoch": 2.0474241694752044, - "grad_norm": 0.921875, - "learning_rate": 9.583669678197794e-05, - "loss": 0.386, + "grad_norm": 1.7734375, + "learning_rate": 3.821165624987355e-05, + "loss": 0.5006, "step": 8505 }, { "epoch": 2.0486278285989408, - "grad_norm": 0.96484375, - "learning_rate": 9.574623978592874e-05, - "loss": 0.4059, + "grad_norm": 1.84375, + "learning_rate": 3.817558956816925e-05, + "loss": 0.5238, "step": 8510 }, { "epoch": 2.049831487722677, - "grad_norm": 0.92578125, - "learning_rate": 9.565585628417047e-05, - "loss": 0.3932, + "grad_norm": 1.7578125, + "learning_rate": 3.813955218983907e-05, + "loss": 0.5119, "step": 8515 }, { "epoch": 2.051035146846413, - "grad_norm": 0.828125, - "learning_rate": 9.556554642111277e-05, - "loss": 0.4074, + "grad_norm": 1.703125, + "learning_rate": 3.81035441724615e-05, + "loss": 0.5311, "step": 8520 }, { "epoch": 2.0522388059701493, - "grad_norm": 0.98828125, - "learning_rate": 9.547531034104769e-05, - "loss": 0.3699, + "grad_norm": 1.8671875, + "learning_rate": 3.806756557356814e-05, + "loss": 0.4924, "step": 8525 }, { "epoch": 2.0534424650938856, - "grad_norm": 0.8515625, - "learning_rate": 9.538514818814925e-05, - "loss": 0.3557, + "grad_norm": 1.546875, + "learning_rate": 3.803161645064354e-05, + "loss": 0.4726, "step": 8530 }, { "epoch": 2.0546461242176215, - "grad_norm": 0.8671875, - "learning_rate": 9.529506010647357e-05, - "loss": 0.3982, + "grad_norm": 1.703125, + "learning_rate": 3.79956968611252e-05, + "loss": 0.5186, "step": 8535 }, { "epoch": 2.055849783341358, - "grad_norm": 0.859375, - "learning_rate": 9.520504623995827e-05, - "loss": 0.3815, + "grad_norm": 1.578125, + "learning_rate": 3.795980686240342e-05, + "loss": 0.4995, "step": 8540 }, { "epoch": 2.0570534424650937, - "grad_norm": 1.0078125, - "learning_rate": 9.511510673242243e-05, - "loss": 0.3823, + "grad_norm": 1.8671875, + "learning_rate": 3.792394651182122e-05, + "loss": 0.4987, "step": 8545 }, { "epoch": 2.05825710158883, - "grad_norm": 0.96875, - "learning_rate": 9.502524172756631e-05, - "loss": 0.3592, + "grad_norm": 1.7265625, + "learning_rate": 3.788811586667423e-05, + "loss": 0.4794, "step": 8550 }, { "epoch": 2.0594607607125663, - "grad_norm": 0.9765625, - "learning_rate": 9.493545136897118e-05, - "loss": 0.4138, + "grad_norm": 1.765625, + "learning_rate": 3.785231498421063e-05, + "loss": 0.5372, "step": 8555 }, { "epoch": 2.060664419836302, - "grad_norm": 0.94921875, - "learning_rate": 9.484573580009895e-05, - "loss": 0.403, + "grad_norm": 1.828125, + "learning_rate": 3.781654392163107e-05, + "loss": 0.5273, "step": 8560 }, { "epoch": 2.0618680789600385, - "grad_norm": 0.96875, - "learning_rate": 9.475609516429222e-05, - "loss": 0.3886, + "grad_norm": 1.765625, + "learning_rate": 3.7780802736088546e-05, + "loss": 0.5085, "step": 8565 }, { "epoch": 2.063071738083775, - "grad_norm": 0.9609375, - "learning_rate": 9.466652960477364e-05, - "loss": 0.3885, + "grad_norm": 1.78125, + "learning_rate": 3.7745091484688286e-05, + "loss": 0.5062, "step": 8570 }, { "epoch": 2.0642753972075107, - "grad_norm": 0.96875, - "learning_rate": 9.457703926464607e-05, - "loss": 0.3635, + "grad_norm": 1.859375, + "learning_rate": 3.7709410224487735e-05, + "loss": 0.4787, "step": 8575 }, { "epoch": 2.065479056331247, - "grad_norm": 0.98828125, - "learning_rate": 9.448762428689208e-05, - "loss": 0.4009, + "grad_norm": 1.875, + "learning_rate": 3.7673759012496385e-05, + "loss": 0.5274, "step": 8580 }, { "epoch": 2.066682715454983, - "grad_norm": 0.91015625, - "learning_rate": 9.439828481437394e-05, - "loss": 0.385, + "grad_norm": 1.78125, + "learning_rate": 3.763813790567574e-05, + "loss": 0.5058, "step": 8585 }, { "epoch": 2.0678863745787193, - "grad_norm": 0.921875, - "learning_rate": 9.43090209898332e-05, - "loss": 0.3965, + "grad_norm": 1.7421875, + "learning_rate": 3.7602546960939204e-05, + "loss": 0.5229, "step": 8590 }, { "epoch": 2.0690900337024556, - "grad_norm": 0.91796875, - "learning_rate": 9.42198329558906e-05, - "loss": 0.3758, + "grad_norm": 1.625, + "learning_rate": 3.756698623515198e-05, + "loss": 0.4919, "step": 8595 }, { "epoch": 2.0702936928261915, - "grad_norm": 0.94140625, - "learning_rate": 9.413072085504567e-05, - "loss": 0.4014, + "grad_norm": 1.7421875, + "learning_rate": 3.7531455785130975e-05, + "loss": 0.5209, "step": 8600 }, { "epoch": 2.071497351949928, - "grad_norm": 0.90625, - "learning_rate": 9.40416848296768e-05, - "loss": 0.4018, + "grad_norm": 1.703125, + "learning_rate": 3.749595566764477e-05, + "loss": 0.5272, "step": 8605 }, { "epoch": 2.072701011073664, - "grad_norm": 0.95703125, - "learning_rate": 9.395272502204067e-05, - "loss": 0.396, + "grad_norm": 1.859375, + "learning_rate": 3.746048593941342e-05, + "loss": 0.5178, "step": 8610 }, { "epoch": 2.0739046701974, - "grad_norm": 0.9453125, - "learning_rate": 9.386384157427228e-05, - "loss": 0.3882, + "grad_norm": 1.703125, + "learning_rate": 3.742504665710848e-05, + "loss": 0.5046, "step": 8615 }, { "epoch": 2.0751083293211363, - "grad_norm": 0.97265625, - "learning_rate": 9.377503462838457e-05, - "loss": 0.3695, + "grad_norm": 1.7578125, + "learning_rate": 3.738963787735283e-05, + "loss": 0.4966, "step": 8620 }, { "epoch": 2.076311988444872, - "grad_norm": 1.1171875, - "learning_rate": 9.368630432626831e-05, - "loss": 0.4264, + "grad_norm": 1.8828125, + "learning_rate": 3.735425965672063e-05, + "loss": 0.5502, "step": 8625 }, { "epoch": 2.0775156475686085, - "grad_norm": 0.91015625, - "learning_rate": 9.359765080969173e-05, - "loss": 0.3886, + "grad_norm": 1.703125, + "learning_rate": 3.73189120517372e-05, + "loss": 0.5055, "step": 8630 }, { "epoch": 2.078719306692345, - "grad_norm": 0.99609375, - "learning_rate": 9.350907422030044e-05, - "loss": 0.4029, + "grad_norm": 1.9140625, + "learning_rate": 3.7283595118878936e-05, + "loss": 0.5237, "step": 8635 }, { "epoch": 2.0799229658160807, - "grad_norm": 0.8984375, - "learning_rate": 9.342057469961716e-05, - "loss": 0.3945, + "grad_norm": 1.7109375, + "learning_rate": 3.724830891457328e-05, + "loss": 0.5198, "step": 8640 }, { "epoch": 2.081126624939817, - "grad_norm": 0.8828125, - "learning_rate": 9.333215238904137e-05, - "loss": 0.3692, + "grad_norm": 1.6953125, + "learning_rate": 3.721305349519851e-05, + "loss": 0.4892, "step": 8645 }, { "epoch": 2.0823302840635534, - "grad_norm": 0.90625, - "learning_rate": 9.324380742984934e-05, - "loss": 0.3795, + "grad_norm": 1.71875, + "learning_rate": 3.717782891708377e-05, + "loss": 0.4935, "step": 8650 }, { "epoch": 2.0835339431872892, - "grad_norm": 0.91796875, - "learning_rate": 9.315553996319361e-05, - "loss": 0.3721, + "grad_norm": 1.859375, + "learning_rate": 3.71426352365089e-05, + "loss": 0.4897, "step": 8655 }, { "epoch": 2.0847376023110256, - "grad_norm": 0.90234375, - "learning_rate": 9.306735013010294e-05, - "loss": 0.3956, + "grad_norm": 1.6875, + "learning_rate": 3.710747250970436e-05, + "loss": 0.5104, "step": 8660 }, { "epoch": 2.085941261434762, - "grad_norm": 1.015625, - "learning_rate": 9.297923807148213e-05, - "loss": 0.3728, + "grad_norm": 1.859375, + "learning_rate": 3.7072340792851194e-05, + "loss": 0.4919, "step": 8665 }, { "epoch": 2.0871449205584978, - "grad_norm": 0.9609375, - "learning_rate": 9.289120392811164e-05, - "loss": 0.3886, + "grad_norm": 1.84375, + "learning_rate": 3.703724014208087e-05, + "loss": 0.5149, "step": 8670 }, { "epoch": 2.088348579682234, - "grad_norm": 0.98828125, - "learning_rate": 9.280324784064746e-05, - "loss": 0.387, + "grad_norm": 1.765625, + "learning_rate": 3.700217061347524e-05, + "loss": 0.5072, "step": 8675 }, { "epoch": 2.08955223880597, - "grad_norm": 0.8984375, - "learning_rate": 9.271536994962086e-05, - "loss": 0.3819, + "grad_norm": 1.7421875, + "learning_rate": 3.6967132263066405e-05, + "loss": 0.498, "step": 8680 }, { "epoch": 2.0907558979297063, - "grad_norm": 0.9296875, - "learning_rate": 9.262757039543819e-05, - "loss": 0.3791, + "grad_norm": 1.703125, + "learning_rate": 3.693212514683667e-05, + "loss": 0.4949, "step": 8685 }, { "epoch": 2.0919595570534426, - "grad_norm": 0.921875, - "learning_rate": 9.253984931838067e-05, - "loss": 0.4047, + "grad_norm": 1.71875, + "learning_rate": 3.689714932071846e-05, + "loss": 0.5322, "step": 8690 }, { "epoch": 2.0931632161771785, - "grad_norm": 0.98046875, - "learning_rate": 9.245220685860405e-05, - "loss": 0.3822, + "grad_norm": 1.75, + "learning_rate": 3.686220484059414e-05, + "loss": 0.5055, "step": 8695 }, { "epoch": 2.094366875300915, - "grad_norm": 0.96875, - "learning_rate": 9.236464315613853e-05, - "loss": 0.3854, + "grad_norm": 1.828125, + "learning_rate": 3.6827291762296064e-05, + "loss": 0.506, "step": 8700 }, { "epoch": 2.095570534424651, - "grad_norm": 0.90625, - "learning_rate": 9.227715835088844e-05, - "loss": 0.3547, + "grad_norm": 1.6484375, + "learning_rate": 3.6792410141606356e-05, + "loss": 0.4747, "step": 8705 }, { "epoch": 2.096774193548387, - "grad_norm": 0.91796875, - "learning_rate": 9.21897525826321e-05, - "loss": 0.369, + "grad_norm": 1.8046875, + "learning_rate": 3.675756003425692e-05, + "loss": 0.4865, "step": 8710 }, { "epoch": 2.0979778526721233, - "grad_norm": 1.0078125, - "learning_rate": 9.21024259910215e-05, - "loss": 0.4032, + "grad_norm": 1.8359375, + "learning_rate": 3.672274149592928e-05, + "loss": 0.5323, "step": 8715 }, { "epoch": 2.099181511795859, - "grad_norm": 0.88671875, - "learning_rate": 9.201517871558213e-05, - "loss": 0.4117, + "grad_norm": 1.671875, + "learning_rate": 3.668795458225454e-05, + "loss": 0.5322, "step": 8720 }, { "epoch": 2.1003851709195955, - "grad_norm": 0.94140625, - "learning_rate": 9.192801089571282e-05, - "loss": 0.4179, + "grad_norm": 1.703125, + "learning_rate": 3.665319934881328e-05, + "loss": 0.5429, "step": 8725 }, { "epoch": 2.101588830043332, - "grad_norm": 0.93359375, - "learning_rate": 9.184092267068535e-05, - "loss": 0.3882, + "grad_norm": 1.8046875, + "learning_rate": 3.661847585113543e-05, + "loss": 0.5088, "step": 8730 }, { "epoch": 2.1027924891670677, - "grad_norm": 1.015625, - "learning_rate": 9.175391417964443e-05, - "loss": 0.3888, + "grad_norm": 1.8046875, + "learning_rate": 3.658378414470027e-05, + "loss": 0.5134, "step": 8735 }, { "epoch": 2.103996148290804, - "grad_norm": 0.93359375, - "learning_rate": 9.166698556160725e-05, - "loss": 0.3855, + "grad_norm": 1.8671875, + "learning_rate": 3.6549124284936216e-05, + "loss": 0.5073, "step": 8740 }, { "epoch": 2.1051998074145404, - "grad_norm": 0.875, - "learning_rate": 9.158013695546353e-05, - "loss": 0.3764, + "grad_norm": 1.5703125, + "learning_rate": 3.6514496327220876e-05, + "loss": 0.4961, "step": 8745 }, { "epoch": 2.1064034665382763, - "grad_norm": 0.91015625, - "learning_rate": 9.149336849997505e-05, - "loss": 0.3793, + "grad_norm": 1.7109375, + "learning_rate": 3.647990032688083e-05, + "loss": 0.501, "step": 8750 }, { "epoch": 2.1076071256620126, - "grad_norm": 0.96875, - "learning_rate": 9.140668033377559e-05, - "loss": 0.362, + "grad_norm": 1.859375, + "learning_rate": 3.6445336339191626e-05, + "loss": 0.4872, "step": 8755 }, { "epoch": 2.1088107847857485, - "grad_norm": 0.98046875, - "learning_rate": 9.132007259537052e-05, - "loss": 0.3521, + "grad_norm": 1.71875, + "learning_rate": 3.641080441937764e-05, + "loss": 0.4671, "step": 8760 }, { "epoch": 2.110014443909485, - "grad_norm": 0.98046875, - "learning_rate": 9.123354542313694e-05, - "loss": 0.3831, + "grad_norm": 1.8125, + "learning_rate": 3.637630462261205e-05, + "loss": 0.5029, "step": 8765 }, { "epoch": 2.111218103033221, - "grad_norm": 0.953125, - "learning_rate": 9.114709895532298e-05, - "loss": 0.3881, + "grad_norm": 1.796875, + "learning_rate": 3.634183700401667e-05, + "loss": 0.5123, "step": 8770 }, { "epoch": 2.112421762156957, - "grad_norm": 0.98046875, - "learning_rate": 9.1060733330048e-05, - "loss": 0.3998, + "grad_norm": 1.9296875, + "learning_rate": 3.630740161866192e-05, + "loss": 0.528, "step": 8775 }, { "epoch": 2.1136254212806933, - "grad_norm": 1.015625, - "learning_rate": 9.097444868530207e-05, - "loss": 0.3955, + "grad_norm": 1.8671875, + "learning_rate": 3.627299852156673e-05, + "loss": 0.5213, "step": 8780 }, { "epoch": 2.1148290804044296, - "grad_norm": 0.94921875, - "learning_rate": 9.0888245158946e-05, - "loss": 0.4073, + "grad_norm": 1.71875, + "learning_rate": 3.6238627767698414e-05, + "loss": 0.533, "step": 8785 }, { "epoch": 2.1160327395281655, - "grad_norm": 0.984375, - "learning_rate": 9.080212288871087e-05, - "loss": 0.4189, + "grad_norm": 1.71875, + "learning_rate": 3.620428941197263e-05, + "loss": 0.542, "step": 8790 }, { "epoch": 2.117236398651902, - "grad_norm": 1.0, - "learning_rate": 9.071608201219801e-05, - "loss": 0.3936, + "grad_norm": 1.6953125, + "learning_rate": 3.616998350925327e-05, + "loss": 0.511, "step": 8795 }, { "epoch": 2.118440057775638, - "grad_norm": 0.8828125, - "learning_rate": 9.063012266687872e-05, - "loss": 0.3639, + "grad_norm": 1.7109375, + "learning_rate": 3.613571011435239e-05, + "loss": 0.4834, "step": 8800 }, { "epoch": 2.119643716899374, - "grad_norm": 0.953125, - "learning_rate": 9.054424499009393e-05, - "loss": 0.3845, + "grad_norm": 1.8046875, + "learning_rate": 3.610146928203006e-05, + "loss": 0.4998, "step": 8805 }, { "epoch": 2.1208473760231104, - "grad_norm": 0.90625, - "learning_rate": 9.045844911905422e-05, - "loss": 0.4083, + "grad_norm": 1.7421875, + "learning_rate": 3.606726106699437e-05, + "loss": 0.5277, "step": 8810 }, { "epoch": 2.1220510351468462, - "grad_norm": 1.0546875, - "learning_rate": 9.03727351908394e-05, - "loss": 0.4156, + "grad_norm": 1.9765625, + "learning_rate": 3.603308552390128e-05, + "loss": 0.5418, "step": 8815 }, { "epoch": 2.1232546942705826, - "grad_norm": 0.9765625, - "learning_rate": 9.028710334239825e-05, - "loss": 0.3921, + "grad_norm": 1.84375, + "learning_rate": 3.5998942707354524e-05, + "loss": 0.5144, "step": 8820 }, { "epoch": 2.124458353394319, - "grad_norm": 0.9375, - "learning_rate": 9.020155371054863e-05, - "loss": 0.3952, + "grad_norm": 1.734375, + "learning_rate": 3.5964832671905605e-05, + "loss": 0.5143, "step": 8825 }, { "epoch": 2.1256620125180548, - "grad_norm": 0.984375, - "learning_rate": 9.011608643197683e-05, - "loss": 0.4031, + "grad_norm": 1.921875, + "learning_rate": 3.593075547205358e-05, + "loss": 0.5207, "step": 8830 }, { "epoch": 2.126865671641791, - "grad_norm": 0.90625, - "learning_rate": 9.003070164323774e-05, - "loss": 0.348, + "grad_norm": 1.671875, + "learning_rate": 3.5896711162245104e-05, + "loss": 0.4682, "step": 8835 }, { "epoch": 2.1280693307655274, - "grad_norm": 1.0625, - "learning_rate": 8.994539948075428e-05, - "loss": 0.4003, + "grad_norm": 1.8515625, + "learning_rate": 3.586269979687423e-05, + "loss": 0.521, "step": 8840 }, { "epoch": 2.1292729898892633, - "grad_norm": 0.86328125, - "learning_rate": 8.986018008081748e-05, - "loss": 0.3906, + "grad_norm": 1.7109375, + "learning_rate": 3.582872143028243e-05, + "loss": 0.5111, "step": 8845 }, { "epoch": 2.1304766490129996, - "grad_norm": 1.0234375, - "learning_rate": 8.977504357958612e-05, - "loss": 0.3636, + "grad_norm": 1.8671875, + "learning_rate": 3.579477611675842e-05, + "loss": 0.4836, "step": 8850 }, { "epoch": 2.1316803081367355, - "grad_norm": 0.921875, - "learning_rate": 8.968999011308645e-05, - "loss": 0.3795, + "grad_norm": 1.7578125, + "learning_rate": 3.5760863910538094e-05, + "loss": 0.5013, "step": 8855 }, { "epoch": 2.132883967260472, - "grad_norm": 0.91015625, - "learning_rate": 8.960501981721215e-05, - "loss": 0.3742, + "grad_norm": 1.7734375, + "learning_rate": 3.572698486580448e-05, + "loss": 0.4955, "step": 8860 }, { "epoch": 2.134087626384208, - "grad_norm": 0.89453125, - "learning_rate": 8.952013282772397e-05, - "loss": 0.3984, + "grad_norm": 1.609375, + "learning_rate": 3.56931390366876e-05, + "loss": 0.5175, "step": 8865 }, { "epoch": 2.135291285507944, - "grad_norm": 0.90625, - "learning_rate": 8.943532928024951e-05, - "loss": 0.3797, + "grad_norm": 1.625, + "learning_rate": 3.56593264772644e-05, + "loss": 0.5003, "step": 8870 }, { "epoch": 2.1364949446316803, - "grad_norm": 0.890625, - "learning_rate": 8.935060931028317e-05, - "loss": 0.3728, + "grad_norm": 1.6875, + "learning_rate": 3.56255472415587e-05, + "loss": 0.4899, "step": 8875 }, { "epoch": 2.1376986037554166, - "grad_norm": 0.93359375, - "learning_rate": 8.926597305318563e-05, - "loss": 0.3746, + "grad_norm": 1.703125, + "learning_rate": 3.5591801383541035e-05, + "loss": 0.4961, "step": 8880 }, { "epoch": 2.1389022628791525, - "grad_norm": 0.89453125, - "learning_rate": 8.918142064418408e-05, - "loss": 0.388, + "grad_norm": 1.828125, + "learning_rate": 3.555808895712867e-05, + "loss": 0.5083, "step": 8885 }, { "epoch": 2.140105922002889, - "grad_norm": 0.90625, - "learning_rate": 8.909695221837147e-05, - "loss": 0.3717, + "grad_norm": 1.640625, + "learning_rate": 3.55244100161854e-05, + "loss": 0.4862, "step": 8890 }, { "epoch": 2.1413095811266247, - "grad_norm": 1.03125, - "learning_rate": 8.901256791070674e-05, - "loss": 0.383, + "grad_norm": 1.9609375, + "learning_rate": 3.5490764614521526e-05, + "loss": 0.5044, "step": 8895 }, { "epoch": 2.142513240250361, - "grad_norm": 0.91796875, - "learning_rate": 8.892826785601441e-05, - "loss": 0.4029, + "grad_norm": 1.84375, + "learning_rate": 3.54571528058938e-05, + "loss": 0.5277, "step": 8900 }, { "epoch": 2.1437168993740974, - "grad_norm": 1.0234375, - "learning_rate": 8.884405218898433e-05, - "loss": 0.3731, + "grad_norm": 1.8125, + "learning_rate": 3.5423574644005275e-05, + "loss": 0.4926, "step": 8905 }, { "epoch": 2.1449205584978333, - "grad_norm": 1.0703125, - "learning_rate": 8.875992104417155e-05, - "loss": 0.3995, + "grad_norm": 1.90625, + "learning_rate": 3.539003018250523e-05, + "loss": 0.5258, "step": 8910 }, { "epoch": 2.1461242176215696, - "grad_norm": 0.98828125, - "learning_rate": 8.867587455599604e-05, - "loss": 0.406, + "grad_norm": 1.7890625, + "learning_rate": 3.535651947498911e-05, + "loss": 0.5324, "step": 8915 }, { "epoch": 2.147327876745306, - "grad_norm": 0.97265625, - "learning_rate": 8.85919128587426e-05, - "loss": 0.3766, + "grad_norm": 1.765625, + "learning_rate": 3.5323042574998466e-05, + "loss": 0.4956, "step": 8920 }, { "epoch": 2.1485315358690418, - "grad_norm": 0.9296875, - "learning_rate": 8.850803608656048e-05, - "loss": 0.3842, + "grad_norm": 1.859375, + "learning_rate": 3.5289599536020785e-05, + "loss": 0.5051, "step": 8925 }, { "epoch": 2.149735194992778, - "grad_norm": 0.984375, - "learning_rate": 8.842424437346322e-05, - "loss": 0.3749, + "grad_norm": 1.859375, + "learning_rate": 3.525619041148945e-05, + "loss": 0.4986, "step": 8930 }, { "epoch": 2.1509388541165144, - "grad_norm": 0.94140625, - "learning_rate": 8.834053785332854e-05, - "loss": 0.3819, + "grad_norm": 1.6953125, + "learning_rate": 3.522281525478371e-05, + "loss": 0.5007, "step": 8935 }, { "epoch": 2.1521425132402503, - "grad_norm": 1.015625, - "learning_rate": 8.825691665989796e-05, - "loss": 0.3914, + "grad_norm": 1.9140625, + "learning_rate": 3.5189474119228474e-05, + "loss": 0.513, "step": 8940 }, { "epoch": 2.1533461723639866, - "grad_norm": 0.953125, - "learning_rate": 8.817338092677676e-05, - "loss": 0.3993, + "grad_norm": 1.71875, + "learning_rate": 3.515616705809437e-05, + "loss": 0.5226, "step": 8945 }, { "epoch": 2.1545498314877225, - "grad_norm": 0.90234375, - "learning_rate": 8.808993078743364e-05, - "loss": 0.3825, + "grad_norm": 1.7265625, + "learning_rate": 3.512289412459752e-05, + "loss": 0.5039, "step": 8950 }, { "epoch": 2.155753490611459, - "grad_norm": 0.98046875, - "learning_rate": 8.800656637520044e-05, - "loss": 0.3645, + "grad_norm": 1.8203125, + "learning_rate": 3.508965537189954e-05, + "loss": 0.4753, "step": 8955 }, { "epoch": 2.156957149735195, - "grad_norm": 1.0, - "learning_rate": 8.79232878232722e-05, - "loss": 0.3926, + "grad_norm": 1.7734375, + "learning_rate": 3.5056450853107456e-05, + "loss": 0.5191, "step": 8960 }, { "epoch": 2.158160808858931, - "grad_norm": 1.0703125, - "learning_rate": 8.784009526470667e-05, - "loss": 0.3576, + "grad_norm": 1.8515625, + "learning_rate": 3.502328062127355e-05, + "loss": 0.4785, "step": 8965 }, { "epoch": 2.1593644679826673, - "grad_norm": 0.90625, - "learning_rate": 8.775698883242425e-05, - "loss": 0.4071, + "grad_norm": 1.6171875, + "learning_rate": 3.4990144729395374e-05, + "loss": 0.5235, "step": 8970 }, { "epoch": 2.1605681271064037, - "grad_norm": 0.984375, - "learning_rate": 8.767396865920771e-05, - "loss": 0.3773, + "grad_norm": 1.84375, + "learning_rate": 3.4957043230415576e-05, + "loss": 0.5026, "step": 8975 }, { "epoch": 2.1617717862301395, - "grad_norm": 0.94921875, - "learning_rate": 8.759103487770195e-05, - "loss": 0.3665, + "grad_norm": 1.6875, + "learning_rate": 3.492397617722186e-05, + "loss": 0.4837, "step": 8980 }, { "epoch": 2.162975445353876, - "grad_norm": 0.9296875, - "learning_rate": 8.750818762041396e-05, - "loss": 0.3665, + "grad_norm": 1.8359375, + "learning_rate": 3.489094362264691e-05, + "loss": 0.4838, "step": 8985 }, { "epoch": 2.1641791044776117, - "grad_norm": 1.0625, - "learning_rate": 8.74254270197124e-05, - "loss": 0.4031, + "grad_norm": 1.921875, + "learning_rate": 3.4857945619468276e-05, + "loss": 0.5326, "step": 8990 }, { "epoch": 2.165382763601348, - "grad_norm": 0.97265625, - "learning_rate": 8.734275320782748e-05, - "loss": 0.3766, + "grad_norm": 1.734375, + "learning_rate": 3.482498222040832e-05, + "loss": 0.4967, "step": 8995 }, { "epoch": 2.1665864227250844, - "grad_norm": 0.87109375, - "learning_rate": 8.726016631685076e-05, - "loss": 0.392, + "grad_norm": 1.6796875, + "learning_rate": 3.479205347813407e-05, + "loss": 0.5133, "step": 9000 }, { "epoch": 2.1665864227250844, - "eval_loss": 0.36550548672676086, - "eval_runtime": 2.3307, - "eval_samples_per_second": 85.812, - "eval_steps_per_second": 85.812, + "eval_loss": 0.45048728585243225, + "eval_runtime": 2.381, + "eval_samples_per_second": 83.999, + "eval_steps_per_second": 83.999, "step": 9000 }, { "epoch": 2.1677900818488203, - "grad_norm": 0.8828125, - "learning_rate": 8.717766647873494e-05, - "loss": 0.374, + "grad_norm": 1.6796875, + "learning_rate": 3.4759159445257253e-05, + "loss": 0.4913, "step": 9005 }, { "epoch": 2.1689937409725566, - "grad_norm": 0.984375, - "learning_rate": 8.70952538252936e-05, - "loss": 0.4053, + "grad_norm": 1.84375, + "learning_rate": 3.472630017433409e-05, + "loss": 0.5298, "step": 9010 }, { "epoch": 2.170197400096293, - "grad_norm": 0.97265625, - "learning_rate": 8.701292848820101e-05, - "loss": 0.3802, + "grad_norm": 1.8515625, + "learning_rate": 3.4693475717865255e-05, + "loss": 0.5014, "step": 9015 }, { "epoch": 2.171401059220029, - "grad_norm": 0.98828125, - "learning_rate": 8.693069059899202e-05, - "loss": 0.3867, + "grad_norm": 1.859375, + "learning_rate": 3.4660686128295845e-05, + "loss": 0.5076, "step": 9020 }, { "epoch": 2.172604718343765, - "grad_norm": 0.96484375, - "learning_rate": 8.684854028906164e-05, - "loss": 0.3802, + "grad_norm": 1.8046875, + "learning_rate": 3.4627931458015196e-05, + "loss": 0.4989, "step": 9025 }, { "epoch": 2.173808377467501, - "grad_norm": 0.875, - "learning_rate": 8.6766477689665e-05, - "loss": 0.4037, + "grad_norm": 1.5625, + "learning_rate": 3.459521175935687e-05, + "loss": 0.5328, "step": 9030 }, { "epoch": 2.1750120365912373, - "grad_norm": 0.95703125, - "learning_rate": 8.668450293191714e-05, - "loss": 0.3905, + "grad_norm": 1.8046875, + "learning_rate": 3.456252708459858e-05, + "loss": 0.509, "step": 9035 }, { "epoch": 2.1762156957149736, - "grad_norm": 0.9765625, - "learning_rate": 8.660261614679265e-05, - "loss": 0.3937, + "grad_norm": 1.75, + "learning_rate": 3.452987748596203e-05, + "loss": 0.5184, "step": 9040 }, { "epoch": 2.1774193548387095, - "grad_norm": 0.91796875, - "learning_rate": 8.652081746512568e-05, - "loss": 0.3868, + "grad_norm": 1.7265625, + "learning_rate": 3.4497263015612936e-05, + "loss": 0.5017, "step": 9045 }, { "epoch": 2.178623013962446, - "grad_norm": 0.9921875, - "learning_rate": 8.643910701760951e-05, - "loss": 0.3928, + "grad_norm": 1.8203125, + "learning_rate": 3.446468372566084e-05, + "loss": 0.5134, "step": 9050 }, { "epoch": 2.179826673086182, - "grad_norm": 0.9765625, - "learning_rate": 8.635748493479652e-05, - "loss": 0.3723, + "grad_norm": 1.890625, + "learning_rate": 3.443213966815911e-05, + "loss": 0.49, "step": 9055 }, { "epoch": 2.181030332209918, - "grad_norm": 0.95703125, - "learning_rate": 8.627595134709787e-05, - "loss": 0.3675, + "grad_norm": 1.6796875, + "learning_rate": 3.43996308951048e-05, + "loss": 0.4884, "step": 9060 }, { "epoch": 2.1822339913336544, - "grad_norm": 0.9609375, - "learning_rate": 8.61945063847833e-05, - "loss": 0.3986, + "grad_norm": 1.9453125, + "learning_rate": 3.436715745843859e-05, + "loss": 0.5197, "step": 9065 }, { "epoch": 2.1834376504573907, - "grad_norm": 0.953125, - "learning_rate": 8.611315017798102e-05, - "loss": 0.3816, + "grad_norm": 1.796875, + "learning_rate": 3.433471941004473e-05, + "loss": 0.5005, "step": 9070 }, { "epoch": 2.1846413095811266, - "grad_norm": 0.92578125, - "learning_rate": 8.603188285667738e-05, - "loss": 0.3871, + "grad_norm": 1.6171875, + "learning_rate": 3.430231680175087e-05, + "loss": 0.5082, "step": 9075 }, { "epoch": 2.185844968704863, - "grad_norm": 0.96875, - "learning_rate": 8.595070455071673e-05, - "loss": 0.402, + "grad_norm": 1.6640625, + "learning_rate": 3.426994968532811e-05, + "loss": 0.5182, "step": 9080 }, { "epoch": 2.1870486278285988, - "grad_norm": 0.8671875, - "learning_rate": 8.58696153898012e-05, - "loss": 0.3764, + "grad_norm": 1.65625, + "learning_rate": 3.423761811249079e-05, + "loss": 0.4952, "step": 9085 }, { "epoch": 2.188252286952335, - "grad_norm": 0.953125, - "learning_rate": 8.578861550349042e-05, - "loss": 0.39, + "grad_norm": 1.703125, + "learning_rate": 3.420532213489645e-05, + "loss": 0.5092, "step": 9090 }, { "epoch": 2.1894559460760714, - "grad_norm": 0.9453125, - "learning_rate": 8.570770502120153e-05, - "loss": 0.3875, + "grad_norm": 1.6953125, + "learning_rate": 3.417306180414582e-05, + "loss": 0.509, "step": 9095 }, { "epoch": 2.1906596051998073, - "grad_norm": 0.98046875, - "learning_rate": 8.562688407220867e-05, - "loss": 0.4018, + "grad_norm": 1.859375, + "learning_rate": 3.41408371717826e-05, + "loss": 0.5224, "step": 9100 }, { "epoch": 2.1918632643235436, - "grad_norm": 1.0625, - "learning_rate": 8.554615278564303e-05, - "loss": 0.3918, + "grad_norm": 1.9453125, + "learning_rate": 3.410864828929354e-05, + "loss": 0.5157, "step": 9105 }, { "epoch": 2.19306692344728, - "grad_norm": 0.921875, - "learning_rate": 8.546551129049254e-05, - "loss": 0.3791, + "grad_norm": 1.75, + "learning_rate": 3.4076495208108187e-05, + "loss": 0.4962, "step": 9110 }, { "epoch": 2.194270582571016, - "grad_norm": 0.94140625, - "learning_rate": 8.53849597156016e-05, - "loss": 0.397, + "grad_norm": 1.671875, + "learning_rate": 3.404437797959893e-05, + "loss": 0.5158, "step": 9115 }, { "epoch": 2.195474241694752, - "grad_norm": 1.0390625, - "learning_rate": 8.530449818967098e-05, - "loss": 0.4181, + "grad_norm": 1.8671875, + "learning_rate": 3.401229665508088e-05, + "loss": 0.5449, "step": 9120 }, { "epoch": 2.196677900818488, - "grad_norm": 0.92578125, - "learning_rate": 8.522412684125755e-05, - "loss": 0.392, + "grad_norm": 1.7109375, + "learning_rate": 3.398025128581174e-05, + "loss": 0.5122, "step": 9125 }, { "epoch": 2.1978815599422243, - "grad_norm": 0.9453125, - "learning_rate": 8.514384579877418e-05, - "loss": 0.3903, + "grad_norm": 1.7890625, + "learning_rate": 3.394824192299183e-05, + "loss": 0.5131, "step": 9130 }, { "epoch": 2.1990852190659607, - "grad_norm": 0.92578125, - "learning_rate": 8.506365519048936e-05, - "loss": 0.4006, + "grad_norm": 1.6953125, + "learning_rate": 3.391626861776389e-05, + "loss": 0.522, "step": 9135 }, { "epoch": 2.2002888781896965, - "grad_norm": 0.96484375, - "learning_rate": 8.49835551445271e-05, - "loss": 0.3992, + "grad_norm": 1.7421875, + "learning_rate": 3.3884331421213054e-05, + "loss": 0.5202, "step": 9140 }, { "epoch": 2.201492537313433, - "grad_norm": 0.96484375, - "learning_rate": 8.490354578886679e-05, - "loss": 0.3961, + "grad_norm": 1.765625, + "learning_rate": 3.38524303843668e-05, + "loss": 0.52, "step": 9145 }, { "epoch": 2.202696196437169, - "grad_norm": 1.0, - "learning_rate": 8.482362725134282e-05, - "loss": 0.4112, + "grad_norm": 1.8125, + "learning_rate": 3.382056555819477e-05, + "loss": 0.535, "step": 9150 }, { "epoch": 2.203899855560905, - "grad_norm": 0.8828125, - "learning_rate": 8.474379965964456e-05, - "loss": 0.3912, + "grad_norm": 1.734375, + "learning_rate": 3.378873699360882e-05, + "loss": 0.5169, "step": 9155 }, { "epoch": 2.2051035146846414, - "grad_norm": 0.98828125, - "learning_rate": 8.466406314131606e-05, - "loss": 0.3901, + "grad_norm": 1.859375, + "learning_rate": 3.375694474146284e-05, + "loss": 0.5055, "step": 9160 }, { "epoch": 2.2063071738083773, - "grad_norm": 0.984375, - "learning_rate": 8.458441782375577e-05, - "loss": 0.3651, + "grad_norm": 1.796875, + "learning_rate": 3.3725188852552676e-05, + "loss": 0.4851, "step": 9165 }, { "epoch": 2.2075108329321136, - "grad_norm": 0.88671875, - "learning_rate": 8.450486383421655e-05, - "loss": 0.3746, + "grad_norm": 1.6328125, + "learning_rate": 3.369346937761612e-05, + "loss": 0.4962, "step": 9170 }, { "epoch": 2.20871449205585, - "grad_norm": 1.015625, - "learning_rate": 8.442540129980523e-05, - "loss": 0.3802, + "grad_norm": 1.7578125, + "learning_rate": 3.366178636733273e-05, + "loss": 0.507, "step": 9175 }, { "epoch": 2.209918151179586, - "grad_norm": 0.875, - "learning_rate": 8.434603034748262e-05, - "loss": 0.363, + "grad_norm": 1.6171875, + "learning_rate": 3.363013987232387e-05, + "loss": 0.4848, "step": 9180 }, { "epoch": 2.211121810303322, - "grad_norm": 1.046875, - "learning_rate": 8.426675110406314e-05, - "loss": 0.3906, + "grad_norm": 1.953125, + "learning_rate": 3.359852994315251e-05, + "loss": 0.5167, "step": 9185 }, { "epoch": 2.2123254694270584, - "grad_norm": 0.9609375, - "learning_rate": 8.418756369621465e-05, - "loss": 0.3765, + "grad_norm": 1.7265625, + "learning_rate": 3.3566956630323205e-05, + "loss": 0.4989, "step": 9190 }, { "epoch": 2.2135291285507943, - "grad_norm": 0.8984375, - "learning_rate": 8.41084682504584e-05, - "loss": 0.3716, + "grad_norm": 1.75, + "learning_rate": 3.353541998428203e-05, + "loss": 0.4883, "step": 9195 }, { "epoch": 2.2147327876745306, - "grad_norm": 1.0625, - "learning_rate": 8.402946489316858e-05, - "loss": 0.3999, + "grad_norm": 1.90625, + "learning_rate": 3.350392005541645e-05, + "loss": 0.5198, "step": 9200 }, { "epoch": 2.215936446798267, - "grad_norm": 1.015625, - "learning_rate": 8.395055375057235e-05, - "loss": 0.3734, + "grad_norm": 1.9296875, + "learning_rate": 3.347245689405529e-05, + "loss": 0.4946, "step": 9205 }, { "epoch": 2.217140105922003, - "grad_norm": 0.90234375, - "learning_rate": 8.387173494874944e-05, - "loss": 0.3657, + "grad_norm": 1.84375, + "learning_rate": 3.34410305504686e-05, + "loss": 0.4902, "step": 9210 }, { "epoch": 2.218343765045739, - "grad_norm": 1.0703125, - "learning_rate": 8.379300861363211e-05, - "loss": 0.4422, + "grad_norm": 1.9296875, + "learning_rate": 3.340964107486763e-05, + "loss": 0.5749, "step": 9215 }, { "epoch": 2.219547424169475, - "grad_norm": 0.89453125, - "learning_rate": 8.371437487100489e-05, - "loss": 0.3766, + "grad_norm": 1.609375, + "learning_rate": 3.3378288517404715e-05, + "loss": 0.4986, "step": 9220 }, { "epoch": 2.2207510832932114, - "grad_norm": 0.90625, - "learning_rate": 8.363583384650429e-05, - "loss": 0.3925, + "grad_norm": 1.75, + "learning_rate": 3.3346972928173196e-05, + "loss": 0.5188, "step": 9225 }, { "epoch": 2.2219547424169477, - "grad_norm": 1.03125, - "learning_rate": 8.355738566561877e-05, - "loss": 0.4125, + "grad_norm": 1.9453125, + "learning_rate": 3.3315694357207374e-05, + "loss": 0.5349, "step": 9230 }, { "epoch": 2.2231584015406836, - "grad_norm": 0.8828125, - "learning_rate": 8.347903045368839e-05, - "loss": 0.3951, + "grad_norm": 1.6796875, + "learning_rate": 3.328445285448237e-05, + "loss": 0.5194, "step": 9235 }, { "epoch": 2.22436206066442, - "grad_norm": 0.96875, - "learning_rate": 8.340076833590473e-05, - "loss": 0.3738, + "grad_norm": 1.7890625, + "learning_rate": 3.325324846991411e-05, + "loss": 0.4951, "step": 9240 }, { "epoch": 2.2255657197881558, - "grad_norm": 0.91015625, - "learning_rate": 8.332259943731055e-05, - "loss": 0.3783, + "grad_norm": 1.6875, + "learning_rate": 3.322208125335919e-05, + "loss": 0.4959, "step": 9245 }, { "epoch": 2.226769378911892, - "grad_norm": 0.9765625, - "learning_rate": 8.324452388279971e-05, - "loss": 0.3747, + "grad_norm": 1.84375, + "learning_rate": 3.319095125461484e-05, + "loss": 0.4973, "step": 9250 }, { "epoch": 2.2279730380356284, - "grad_norm": 0.8984375, - "learning_rate": 8.316654179711699e-05, - "loss": 0.3734, + "grad_norm": 1.6328125, + "learning_rate": 3.315985852341882e-05, + "loss": 0.4935, "step": 9255 }, { "epoch": 2.2291766971593643, - "grad_norm": 1.015625, - "learning_rate": 8.30886533048577e-05, - "loss": 0.3964, + "grad_norm": 1.8671875, + "learning_rate": 3.3128803109449324e-05, + "loss": 0.5178, "step": 9260 }, { "epoch": 2.2303803562831006, - "grad_norm": 0.98046875, - "learning_rate": 8.30108585304677e-05, - "loss": 0.37, + "grad_norm": 1.859375, + "learning_rate": 3.3097785062324954e-05, + "loss": 0.4916, "step": 9265 }, { "epoch": 2.231584015406837, - "grad_norm": 0.92578125, - "learning_rate": 8.293315759824314e-05, - "loss": 0.3757, + "grad_norm": 1.6640625, + "learning_rate": 3.306680443160462e-05, + "loss": 0.5, "step": 9270 }, { "epoch": 2.232787674530573, - "grad_norm": 0.890625, - "learning_rate": 8.285555063233014e-05, - "loss": 0.3636, + "grad_norm": 1.640625, + "learning_rate": 3.303586126678739e-05, + "loss": 0.4783, "step": 9275 }, { "epoch": 2.233991333654309, - "grad_norm": 0.890625, - "learning_rate": 8.277803775672479e-05, - "loss": 0.3989, + "grad_norm": 1.640625, + "learning_rate": 3.300495561731255e-05, + "loss": 0.5203, "step": 9280 }, { "epoch": 2.2351949927780455, - "grad_norm": 0.921875, - "learning_rate": 8.270061909527272e-05, - "loss": 0.3727, + "grad_norm": 1.7578125, + "learning_rate": 3.297408753255936e-05, + "loss": 0.4935, "step": 9285 }, { "epoch": 2.2363986519017813, - "grad_norm": 1.0703125, - "learning_rate": 8.262329477166919e-05, - "loss": 0.3919, + "grad_norm": 1.8359375, + "learning_rate": 3.294325706184714e-05, + "loss": 0.5134, "step": 9290 }, { "epoch": 2.2376023110255177, - "grad_norm": 0.953125, - "learning_rate": 8.254606490945859e-05, - "loss": 0.4213, + "grad_norm": 1.7578125, + "learning_rate": 3.291246425443507e-05, + "loss": 0.5512, "step": 9295 }, { "epoch": 2.2388059701492535, - "grad_norm": 0.8984375, - "learning_rate": 8.246892963203444e-05, - "loss": 0.4198, + "grad_norm": 1.71875, + "learning_rate": 3.2881709159522154e-05, + "loss": 0.5414, "step": 9300 }, { "epoch": 2.24000962927299, - "grad_norm": 0.89453125, - "learning_rate": 8.239188906263917e-05, - "loss": 0.3767, + "grad_norm": 1.6640625, + "learning_rate": 3.285099182624716e-05, + "loss": 0.4931, "step": 9305 }, { "epoch": 2.241213288396726, - "grad_norm": 0.921875, - "learning_rate": 8.231494332436382e-05, - "loss": 0.3804, + "grad_norm": 1.7109375, + "learning_rate": 3.28203123036885e-05, + "loss": 0.5026, "step": 9310 }, { "epoch": 2.242416947520462, - "grad_norm": 0.9375, - "learning_rate": 8.2238092540148e-05, - "loss": 0.3805, + "grad_norm": 1.78125, + "learning_rate": 3.27896706408642e-05, + "loss": 0.5056, "step": 9315 }, { "epoch": 2.2436206066441984, - "grad_norm": 1.03125, - "learning_rate": 8.216133683277955e-05, - "loss": 0.4068, + "grad_norm": 1.7890625, + "learning_rate": 3.275906688673178e-05, + "loss": 0.5245, "step": 9320 }, { "epoch": 2.2448242657679347, - "grad_norm": 0.94140625, - "learning_rate": 8.20846763248944e-05, - "loss": 0.3855, + "grad_norm": 1.828125, + "learning_rate": 3.272850109018818e-05, + "loss": 0.5042, "step": 9325 }, { "epoch": 2.2460279248916706, - "grad_norm": 1.0390625, - "learning_rate": 8.20081111389764e-05, - "loss": 0.4002, + "grad_norm": 1.8203125, + "learning_rate": 3.2697973300069726e-05, + "loss": 0.5254, "step": 9330 }, { "epoch": 2.247231584015407, - "grad_norm": 0.95703125, - "learning_rate": 8.193164139735707e-05, - "loss": 0.3653, + "grad_norm": 1.7265625, + "learning_rate": 3.266748356515198e-05, + "loss": 0.4846, "step": 9335 }, { "epoch": 2.2484352431391432, - "grad_norm": 0.92578125, - "learning_rate": 8.18552672222155e-05, - "loss": 0.4092, + "grad_norm": 1.78125, + "learning_rate": 3.2637031934149755e-05, + "loss": 0.5338, "step": 9340 }, { "epoch": 2.249638902262879, - "grad_norm": 0.9296875, - "learning_rate": 8.177898873557799e-05, - "loss": 0.3998, + "grad_norm": 1.8046875, + "learning_rate": 3.2606618455716915e-05, + "loss": 0.5228, "step": 9345 }, { "epoch": 2.2508425613866154, - "grad_norm": 1.0234375, - "learning_rate": 8.170280605931803e-05, - "loss": 0.4054, + "grad_norm": 1.828125, + "learning_rate": 3.257624317844642e-05, + "loss": 0.5344, "step": 9350 }, { "epoch": 2.2520462205103513, - "grad_norm": 0.92578125, - "learning_rate": 8.162671931515603e-05, - "loss": 0.371, + "grad_norm": 1.7109375, + "learning_rate": 3.254590615087019e-05, + "loss": 0.4905, "step": 9355 }, { "epoch": 2.2532498796340876, - "grad_norm": 0.99609375, - "learning_rate": 8.155072862465905e-05, - "loss": 0.4149, + "grad_norm": 1.6875, + "learning_rate": 3.251560742145897e-05, + "loss": 0.5454, "step": 9360 }, { "epoch": 2.254453538757824, - "grad_norm": 0.94140625, - "learning_rate": 8.147483410924076e-05, - "loss": 0.3771, + "grad_norm": 1.8515625, + "learning_rate": 3.2485347038622406e-05, + "loss": 0.4926, "step": 9365 }, { "epoch": 2.25565719788156, - "grad_norm": 0.93359375, - "learning_rate": 8.139903589016116e-05, - "loss": 0.3783, + "grad_norm": 1.7109375, + "learning_rate": 3.245512505070883e-05, + "loss": 0.4996, "step": 9370 }, { "epoch": 2.256860857005296, - "grad_norm": 0.9140625, - "learning_rate": 8.132333408852634e-05, - "loss": 0.3782, + "grad_norm": 1.6328125, + "learning_rate": 3.242494150600521e-05, + "loss": 0.4998, "step": 9375 }, { "epoch": 2.258064516129032, - "grad_norm": 0.98046875, - "learning_rate": 8.12477288252884e-05, - "loss": 0.3958, + "grad_norm": 1.8359375, + "learning_rate": 3.239479645273714e-05, + "loss": 0.521, "step": 9380 }, { "epoch": 2.2592681752527684, - "grad_norm": 0.91796875, - "learning_rate": 8.117222022124516e-05, - "loss": 0.3979, + "grad_norm": 1.65625, + "learning_rate": 3.2364689939068656e-05, + "loss": 0.5278, "step": 9385 }, { "epoch": 2.2604718343765047, - "grad_norm": 1.015625, - "learning_rate": 8.109680839703998e-05, - "loss": 0.3755, + "grad_norm": 1.7890625, + "learning_rate": 3.2334622013102265e-05, + "loss": 0.4987, "step": 9390 }, { "epoch": 2.2616754935002406, - "grad_norm": 1.03125, - "learning_rate": 8.10214934731617e-05, - "loss": 0.377, + "grad_norm": 1.8671875, + "learning_rate": 3.230459272287882e-05, + "loss": 0.4953, "step": 9395 }, { "epoch": 2.262879152623977, - "grad_norm": 0.89453125, - "learning_rate": 8.094627556994419e-05, - "loss": 0.3734, + "grad_norm": 1.75, + "learning_rate": 3.227460211637741e-05, + "loss": 0.4993, "step": 9400 }, { "epoch": 2.264082811747713, - "grad_norm": 0.98046875, - "learning_rate": 8.087115480756642e-05, - "loss": 0.3739, + "grad_norm": 1.8125, + "learning_rate": 3.2244650241515334e-05, + "loss": 0.495, "step": 9405 }, { "epoch": 2.265286470871449, - "grad_norm": 0.953125, - "learning_rate": 8.079613130605205e-05, - "loss": 0.3683, + "grad_norm": 1.7421875, + "learning_rate": 3.2214737146148016e-05, + "loss": 0.4875, "step": 9410 }, { "epoch": 2.2664901299951854, - "grad_norm": 1.0234375, - "learning_rate": 8.072120518526948e-05, - "loss": 0.3865, + "grad_norm": 1.875, + "learning_rate": 3.218486287806891e-05, + "loss": 0.505, "step": 9415 }, { "epoch": 2.2676937891189217, - "grad_norm": 0.96484375, - "learning_rate": 8.064637656493139e-05, - "loss": 0.3925, + "grad_norm": 1.703125, + "learning_rate": 3.2155027485009446e-05, + "loss": 0.5149, "step": 9420 }, { "epoch": 2.2688974482426576, - "grad_norm": 0.93359375, - "learning_rate": 8.057164556459475e-05, - "loss": 0.389, + "grad_norm": 1.6875, + "learning_rate": 3.2125231014638925e-05, + "loss": 0.5127, "step": 9425 }, { "epoch": 2.270101107366394, - "grad_norm": 0.93359375, - "learning_rate": 8.049701230366056e-05, - "loss": 0.3663, + "grad_norm": 1.671875, + "learning_rate": 3.2095473514564476e-05, + "loss": 0.4825, "step": 9430 }, { "epoch": 2.27130476649013, - "grad_norm": 1.015625, - "learning_rate": 8.042247690137359e-05, - "loss": 0.3901, + "grad_norm": 1.8046875, + "learning_rate": 3.206575503233094e-05, + "loss": 0.5154, "step": 9435 }, { "epoch": 2.272508425613866, - "grad_norm": 0.953125, - "learning_rate": 8.034803947682238e-05, - "loss": 0.384, + "grad_norm": 1.71875, + "learning_rate": 3.203607561542087e-05, + "loss": 0.5069, "step": 9440 }, { "epoch": 2.2737120847376024, - "grad_norm": 0.921875, - "learning_rate": 8.027370014893877e-05, - "loss": 0.3765, + "grad_norm": 1.6328125, + "learning_rate": 3.200643531125432e-05, + "loss": 0.4937, "step": 9445 }, { "epoch": 2.2749157438613383, - "grad_norm": 1.0078125, - "learning_rate": 8.019945903649802e-05, - "loss": 0.4083, + "grad_norm": 1.8203125, + "learning_rate": 3.197683416718893e-05, + "loss": 0.5267, "step": 9450 }, { "epoch": 2.2761194029850746, - "grad_norm": 0.9453125, - "learning_rate": 8.012531625811835e-05, - "loss": 0.3787, + "grad_norm": 1.7265625, + "learning_rate": 3.194727223051973e-05, + "loss": 0.4984, "step": 9455 }, { "epoch": 2.277323062108811, - "grad_norm": 0.89453125, - "learning_rate": 8.005127193226091e-05, - "loss": 0.3673, + "grad_norm": 1.6328125, + "learning_rate": 3.1917749548479104e-05, + "loss": 0.4873, "step": 9460 }, { "epoch": 2.278526721232547, - "grad_norm": 0.99609375, - "learning_rate": 7.997732617722959e-05, - "loss": 0.378, + "grad_norm": 1.96875, + "learning_rate": 3.188826616823675e-05, + "loss": 0.5014, "step": 9465 }, { "epoch": 2.279730380356283, - "grad_norm": 0.9921875, - "learning_rate": 7.99034791111707e-05, - "loss": 0.3914, + "grad_norm": 1.734375, + "learning_rate": 3.185882213689953e-05, + "loss": 0.5099, "step": 9470 }, { "epoch": 2.2809340394800195, - "grad_norm": 0.89453125, - "learning_rate": 7.982973085207295e-05, - "loss": 0.4008, + "grad_norm": 1.640625, + "learning_rate": 3.1829417501511474e-05, + "loss": 0.532, "step": 9475 }, { "epoch": 2.2821376986037554, - "grad_norm": 1.09375, - "learning_rate": 7.97560815177672e-05, - "loss": 0.4041, + "grad_norm": 1.6953125, + "learning_rate": 3.180005230905367e-05, + "loss": 0.5268, "step": 9480 }, { "epoch": 2.2833413577274917, - "grad_norm": 1.03125, - "learning_rate": 7.96825312259261e-05, - "loss": 0.3792, + "grad_norm": 1.8515625, + "learning_rate": 3.1770726606444116e-05, + "loss": 0.5, "step": 9485 }, { "epoch": 2.2845450168512276, - "grad_norm": 0.94140625, - "learning_rate": 7.960908009406425e-05, - "loss": 0.3898, + "grad_norm": 1.7109375, + "learning_rate": 3.1741440440537814e-05, + "loss": 0.5189, "step": 9490 }, { "epoch": 2.285748675974964, - "grad_norm": 1.0234375, - "learning_rate": 7.953572823953769e-05, - "loss": 0.4185, + "grad_norm": 1.828125, + "learning_rate": 3.171219385812652e-05, + "loss": 0.5433, "step": 9495 }, { "epoch": 2.2869523350987, - "grad_norm": 1.0546875, - "learning_rate": 7.946247577954389e-05, - "loss": 0.4054, + "grad_norm": 2.0, + "learning_rate": 3.16829869059388e-05, + "loss": 0.5278, "step": 9500 }, { "epoch": 2.2869523350987, - "eval_loss": 0.3615725636482239, - "eval_runtime": 2.3277, - "eval_samples_per_second": 85.92, - "eval_steps_per_second": 85.92, + "eval_loss": 0.4476446509361267, + "eval_runtime": 2.3917, + "eval_samples_per_second": 83.622, + "eval_steps_per_second": 83.622, "step": 9500 }, { "epoch": 2.288155994222436, - "grad_norm": 0.98046875, - "learning_rate": 7.938932283112149e-05, - "loss": 0.3746, + "grad_norm": 1.7734375, + "learning_rate": 3.165381963063986e-05, + "loss": 0.4982, "step": 9505 }, { "epoch": 2.2893596533461724, - "grad_norm": 0.875, - "learning_rate": 7.931626951115018e-05, - "loss": 0.3736, + "grad_norm": 1.59375, + "learning_rate": 3.1624692078831534e-05, + "loss": 0.4906, "step": 9510 }, { "epoch": 2.2905633124699083, - "grad_norm": 1.046875, - "learning_rate": 7.924331593635042e-05, - "loss": 0.3806, + "grad_norm": 1.921875, + "learning_rate": 3.159560429705218e-05, + "loss": 0.4999, "step": 9515 }, { "epoch": 2.2917669715936446, - "grad_norm": 0.9140625, - "learning_rate": 7.917046222328329e-05, - "loss": 0.4002, + "grad_norm": 1.6875, + "learning_rate": 3.1566556331776615e-05, + "loss": 0.5259, "step": 9520 }, { "epoch": 2.292970630717381, - "grad_norm": 0.8984375, - "learning_rate": 7.909770848835036e-05, - "loss": 0.378, + "grad_norm": 1.6015625, + "learning_rate": 3.153754822941603e-05, + "loss": 0.5007, "step": 9525 }, { "epoch": 2.294174289841117, - "grad_norm": 0.98828125, - "learning_rate": 7.902505484779351e-05, - "loss": 0.4065, + "grad_norm": 1.7265625, + "learning_rate": 3.150858003631798e-05, + "loss": 0.5291, "step": 9530 }, { "epoch": 2.295377948964853, - "grad_norm": 1.0078125, - "learning_rate": 7.895250141769457e-05, - "loss": 0.4029, + "grad_norm": 1.8359375, + "learning_rate": 3.1479651798766166e-05, + "loss": 0.5269, "step": 9535 }, { "epoch": 2.2965816080885895, - "grad_norm": 0.90625, - "learning_rate": 7.888004831397534e-05, - "loss": 0.391, + "grad_norm": 1.625, + "learning_rate": 3.1450763562980515e-05, + "loss": 0.5142, "step": 9540 }, { "epoch": 2.2977852672123253, - "grad_norm": 0.89453125, - "learning_rate": 7.880769565239728e-05, - "loss": 0.3853, + "grad_norm": 1.5078125, + "learning_rate": 3.142191537511701e-05, + "loss": 0.5078, "step": 9545 }, { "epoch": 2.2989889263360617, - "grad_norm": 1.03125, - "learning_rate": 7.873544354856142e-05, - "loss": 0.3837, + "grad_norm": 1.796875, + "learning_rate": 3.139310728126767e-05, + "loss": 0.5082, "step": 9550 }, { "epoch": 2.300192585459798, - "grad_norm": 0.87890625, - "learning_rate": 7.866329211790813e-05, - "loss": 0.3799, + "grad_norm": 1.6015625, + "learning_rate": 3.136433932746046e-05, + "loss": 0.4915, "step": 9555 }, { "epoch": 2.301396244583534, - "grad_norm": 0.94921875, - "learning_rate": 7.859124147571687e-05, - "loss": 0.3938, + "grad_norm": 1.7890625, + "learning_rate": 3.1335611559659167e-05, + "loss": 0.5176, "step": 9560 }, { "epoch": 2.30259990370727, - "grad_norm": 1.0078125, - "learning_rate": 7.85192917371061e-05, - "loss": 0.3614, + "grad_norm": 1.8828125, + "learning_rate": 3.130692402376342e-05, + "loss": 0.4884, "step": 9565 }, { "epoch": 2.303803562831006, - "grad_norm": 0.9765625, - "learning_rate": 7.844744301703301e-05, - "loss": 0.4075, + "grad_norm": 1.7578125, + "learning_rate": 3.127827676560852e-05, + "loss": 0.5307, "step": 9570 }, { "epoch": 2.3050072219547424, - "grad_norm": 0.95703125, - "learning_rate": 7.837569543029349e-05, - "loss": 0.3608, + "grad_norm": 1.7890625, + "learning_rate": 3.124966983096546e-05, + "loss": 0.4811, "step": 9575 }, { "epoch": 2.3062108810784787, - "grad_norm": 0.9609375, - "learning_rate": 7.830404909152181e-05, - "loss": 0.4086, + "grad_norm": 1.75, + "learning_rate": 3.1221103265540804e-05, + "loss": 0.5326, "step": 9580 }, { "epoch": 2.3074145402022146, - "grad_norm": 0.91796875, - "learning_rate": 7.82325041151904e-05, - "loss": 0.3755, + "grad_norm": 1.7109375, + "learning_rate": 3.119257711497657e-05, + "loss": 0.4992, "step": 9585 }, { "epoch": 2.308618199325951, - "grad_norm": 0.96875, - "learning_rate": 7.816106061560983e-05, - "loss": 0.3968, + "grad_norm": 1.671875, + "learning_rate": 3.116409142485026e-05, + "loss": 0.5165, "step": 9590 }, { "epoch": 2.3098218584496872, - "grad_norm": 0.92578125, - "learning_rate": 7.808971870692846e-05, - "loss": 0.4194, + "grad_norm": 1.7109375, + "learning_rate": 3.113564624067468e-05, + "loss": 0.5394, "step": 9595 }, { "epoch": 2.311025517573423, - "grad_norm": 0.95703125, - "learning_rate": 7.801847850313239e-05, - "loss": 0.3945, + "grad_norm": 1.765625, + "learning_rate": 3.110724160789796e-05, + "loss": 0.5177, "step": 9600 }, { "epoch": 2.3122291766971594, - "grad_norm": 0.99609375, - "learning_rate": 7.794734011804522e-05, - "loss": 0.3833, + "grad_norm": 1.703125, + "learning_rate": 3.1078877571903414e-05, + "loss": 0.5007, "step": 9605 }, { "epoch": 2.3134328358208958, - "grad_norm": 0.9140625, - "learning_rate": 7.787630366532785e-05, - "loss": 0.36, + "grad_norm": 1.7578125, + "learning_rate": 3.1050554178009515e-05, + "loss": 0.4802, "step": 9610 }, { "epoch": 2.3146364949446316, - "grad_norm": 0.9921875, - "learning_rate": 7.780536925847832e-05, - "loss": 0.38, + "grad_norm": 1.7265625, + "learning_rate": 3.102227147146977e-05, + "loss": 0.5072, "step": 9615 }, { "epoch": 2.315840154068368, - "grad_norm": 0.953125, - "learning_rate": 7.773453701083158e-05, - "loss": 0.4177, + "grad_norm": 1.7578125, + "learning_rate": 3.099402949747268e-05, + "loss": 0.5498, "step": 9620 }, { "epoch": 2.317043813192104, - "grad_norm": 0.94140625, - "learning_rate": 7.766380703555945e-05, - "loss": 0.3775, + "grad_norm": 1.6484375, + "learning_rate": 3.096582830114171e-05, + "loss": 0.4977, "step": 9625 }, { "epoch": 2.31824747231584, - "grad_norm": 0.94921875, - "learning_rate": 7.75931794456703e-05, - "loss": 0.3996, + "grad_norm": 1.703125, + "learning_rate": 3.093766792753513e-05, + "loss": 0.5215, "step": 9630 }, { "epoch": 2.3194511314395765, - "grad_norm": 0.88671875, - "learning_rate": 7.75226543540089e-05, - "loss": 0.3714, + "grad_norm": 1.8203125, + "learning_rate": 3.090954842164599e-05, + "loss": 0.4972, "step": 9635 }, { "epoch": 2.3206547905633124, - "grad_norm": 0.89453125, - "learning_rate": 7.745223187325628e-05, - "loss": 0.3737, + "grad_norm": 1.78125, + "learning_rate": 3.0881469828402065e-05, + "loss": 0.5002, "step": 9640 }, { "epoch": 2.3218584496870487, - "grad_norm": 0.85546875, - "learning_rate": 7.738191211592948e-05, - "loss": 0.3827, + "grad_norm": 1.703125, + "learning_rate": 3.085343219266574e-05, + "loss": 0.5016, "step": 9645 }, { "epoch": 2.3230621088107846, - "grad_norm": 1.0234375, - "learning_rate": 7.73116951943815e-05, - "loss": 0.387, + "grad_norm": 1.8125, + "learning_rate": 3.082543555923398e-05, + "loss": 0.5121, "step": 9650 }, { "epoch": 2.324265767934521, - "grad_norm": 0.94140625, - "learning_rate": 7.724158122080093e-05, - "loss": 0.3831, + "grad_norm": 1.8125, + "learning_rate": 3.079747997283821e-05, + "loss": 0.5035, "step": 9655 }, { "epoch": 2.325469427058257, - "grad_norm": 0.9921875, - "learning_rate": 7.717157030721195e-05, - "loss": 0.3759, + "grad_norm": 1.765625, + "learning_rate": 3.076956547814431e-05, + "loss": 0.499, "step": 9660 }, { "epoch": 2.326673086181993, - "grad_norm": 0.92578125, - "learning_rate": 7.710166256547402e-05, - "loss": 0.3671, + "grad_norm": 1.7421875, + "learning_rate": 3.074169211975246e-05, + "loss": 0.4914, "step": 9665 }, { "epoch": 2.3278767453057294, - "grad_norm": 1.0546875, - "learning_rate": 7.703185810728186e-05, - "loss": 0.407, + "grad_norm": 1.9609375, + "learning_rate": 3.0713859942197186e-05, + "loss": 0.5288, "step": 9670 }, { "epoch": 2.3290804044294657, - "grad_norm": 0.8984375, - "learning_rate": 7.696215704416505e-05, - "loss": 0.3874, + "grad_norm": 1.796875, + "learning_rate": 3.0686068989947135e-05, + "loss": 0.511, "step": 9675 }, { "epoch": 2.3302840635532016, - "grad_norm": 0.93359375, - "learning_rate": 7.689255948748799e-05, - "loss": 0.3858, + "grad_norm": 1.765625, + "learning_rate": 3.0658319307405126e-05, + "loss": 0.5106, "step": 9680 }, { "epoch": 2.331487722676938, - "grad_norm": 0.94921875, - "learning_rate": 7.682306554844979e-05, - "loss": 0.4008, + "grad_norm": 1.7734375, + "learning_rate": 3.063061093890805e-05, + "loss": 0.5312, "step": 9685 }, { "epoch": 2.3326913818006743, - "grad_norm": 0.8828125, - "learning_rate": 7.675367533808395e-05, - "loss": 0.3694, + "grad_norm": 1.78125, + "learning_rate": 3.060294392872678e-05, + "loss": 0.4921, "step": 9690 }, { "epoch": 2.33389504092441, - "grad_norm": 0.94921875, - "learning_rate": 7.668438896725818e-05, - "loss": 0.3768, + "grad_norm": 1.8046875, + "learning_rate": 3.057531832106609e-05, + "loss": 0.498, "step": 9695 }, { "epoch": 2.3350987000481465, - "grad_norm": 0.94140625, - "learning_rate": 7.661520654667441e-05, - "loss": 0.3921, + "grad_norm": 1.6796875, + "learning_rate": 3.054773416006464e-05, + "loss": 0.5172, "step": 9700 }, { "epoch": 2.3363023591718823, - "grad_norm": 0.90625, - "learning_rate": 7.654612818686837e-05, - "loss": 0.3726, + "grad_norm": 1.703125, + "learning_rate": 3.052019148979483e-05, + "loss": 0.492, "step": 9705 }, { "epoch": 2.3375060182956187, - "grad_norm": 0.99609375, - "learning_rate": 7.647715399820956e-05, - "loss": 0.3798, + "grad_norm": 1.8359375, + "learning_rate": 3.0492690354262795e-05, + "loss": 0.5039, "step": 9710 }, { "epoch": 2.338709677419355, - "grad_norm": 0.96484375, - "learning_rate": 7.64082840909011e-05, - "loss": 0.3735, + "grad_norm": 1.7421875, + "learning_rate": 3.0465230797408302e-05, + "loss": 0.5018, "step": 9715 }, { "epoch": 2.339913336543091, - "grad_norm": 0.8984375, - "learning_rate": 7.633951857497943e-05, - "loss": 0.3727, + "grad_norm": 1.6484375, + "learning_rate": 3.043781286310468e-05, + "loss": 0.492, "step": 9720 }, { "epoch": 2.341116995666827, - "grad_norm": 0.9765625, - "learning_rate": 7.627085756031421e-05, - "loss": 0.3841, + "grad_norm": 1.8125, + "learning_rate": 3.0410436595158767e-05, + "loss": 0.5097, "step": 9725 }, { "epoch": 2.3423206547905635, - "grad_norm": 0.96875, - "learning_rate": 7.620230115660809e-05, - "loss": 0.3906, + "grad_norm": 1.8203125, + "learning_rate": 3.0383102037310794e-05, + "loss": 0.5134, "step": 9730 }, { "epoch": 2.3435243139142994, - "grad_norm": 0.8671875, - "learning_rate": 7.61338494733967e-05, - "loss": 0.3936, + "grad_norm": 1.625, + "learning_rate": 3.0355809233234404e-05, + "loss": 0.52, "step": 9735 }, { "epoch": 2.3447279730380357, - "grad_norm": 0.9765625, - "learning_rate": 7.606550262004827e-05, - "loss": 0.3848, + "grad_norm": 1.8046875, + "learning_rate": 3.0328558226536497e-05, + "loss": 0.5067, "step": 9740 }, { "epoch": 2.345931632161772, - "grad_norm": 0.91015625, - "learning_rate": 7.599726070576351e-05, - "loss": 0.3691, + "grad_norm": 1.671875, + "learning_rate": 3.0301349060757183e-05, + "loss": 0.4941, "step": 9745 }, { "epoch": 2.347135291285508, - "grad_norm": 0.953125, - "learning_rate": 7.592912383957557e-05, - "loss": 0.4196, + "grad_norm": 1.7734375, + "learning_rate": 3.027418177936976e-05, + "loss": 0.5417, "step": 9750 }, { "epoch": 2.3483389504092442, - "grad_norm": 0.95703125, - "learning_rate": 7.586109213034963e-05, - "loss": 0.3569, + "grad_norm": 1.734375, + "learning_rate": 3.024705642578055e-05, + "loss": 0.474, "step": 9755 }, { "epoch": 2.34954260953298, - "grad_norm": 0.92578125, - "learning_rate": 7.579316568678294e-05, - "loss": 0.3628, + "grad_norm": 1.8203125, + "learning_rate": 3.0219973043328925e-05, + "loss": 0.482, "step": 9760 }, { "epoch": 2.3507462686567164, - "grad_norm": 1.0390625, - "learning_rate": 7.572534461740457e-05, - "loss": 0.3826, + "grad_norm": 2.015625, + "learning_rate": 3.0192931675287197e-05, + "loss": 0.5095, "step": 9765 }, { "epoch": 2.3519499277804528, - "grad_norm": 0.89453125, - "learning_rate": 7.565762903057518e-05, - "loss": 0.3648, + "grad_norm": 1.703125, + "learning_rate": 3.0165932364860533e-05, + "loss": 0.4793, "step": 9770 }, { "epoch": 2.3531535869041886, - "grad_norm": 1.046875, - "learning_rate": 7.559001903448696e-05, - "loss": 0.3823, + "grad_norm": 1.8828125, + "learning_rate": 3.013897515518691e-05, + "loss": 0.5014, "step": 9775 }, { "epoch": 2.354357246027925, - "grad_norm": 0.96484375, - "learning_rate": 7.552251473716325e-05, - "loss": 0.3746, + "grad_norm": 1.859375, + "learning_rate": 3.0112060089337026e-05, + "loss": 0.5042, "step": 9780 }, { "epoch": 2.355560905151661, - "grad_norm": 0.91796875, - "learning_rate": 7.545511624645872e-05, - "loss": 0.389, + "grad_norm": 1.6640625, + "learning_rate": 3.0085187210314275e-05, + "loss": 0.5105, "step": 9785 }, { "epoch": 2.356764564275397, - "grad_norm": 1.015625, - "learning_rate": 7.538782367005884e-05, - "loss": 0.4156, + "grad_norm": 1.7578125, + "learning_rate": 3.0058356561054624e-05, + "loss": 0.5373, "step": 9790 }, { "epoch": 2.3579682233991335, - "grad_norm": 0.9375, - "learning_rate": 7.532063711547986e-05, - "loss": 0.3787, + "grad_norm": 1.7578125, + "learning_rate": 3.0031568184426562e-05, + "loss": 0.5055, "step": 9795 }, { "epoch": 2.3591718825228694, - "grad_norm": 0.99609375, - "learning_rate": 7.525355669006875e-05, - "loss": 0.3757, + "grad_norm": 1.8828125, + "learning_rate": 3.000482212323107e-05, + "loss": 0.5001, "step": 9800 }, { "epoch": 2.3603755416466057, - "grad_norm": 0.8984375, - "learning_rate": 7.518658250100275e-05, - "loss": 0.3697, + "grad_norm": 1.7578125, + "learning_rate": 2.9978118420201467e-05, + "loss": 0.4958, "step": 9805 }, { "epoch": 2.361579200770342, - "grad_norm": 1.0546875, - "learning_rate": 7.511971465528949e-05, - "loss": 0.3766, + "grad_norm": 1.8359375, + "learning_rate": 2.9951457118003445e-05, + "loss": 0.4951, "step": 9810 }, { "epoch": 2.362782859894078, - "grad_norm": 0.90234375, - "learning_rate": 7.505295325976668e-05, - "loss": 0.4025, + "grad_norm": 1.6640625, + "learning_rate": 2.9924838259234937e-05, + "loss": 0.5289, "step": 9815 }, { "epoch": 2.363986519017814, - "grad_norm": 0.94140625, - "learning_rate": 7.498629842110183e-05, - "loss": 0.3863, + "grad_norm": 1.7109375, + "learning_rate": 2.9898261886426032e-05, + "loss": 0.5102, "step": 9820 }, { "epoch": 2.3651901781415505, - "grad_norm": 0.8984375, - "learning_rate": 7.491975024579236e-05, - "loss": 0.4002, + "grad_norm": 1.640625, + "learning_rate": 2.9871728042038984e-05, + "loss": 0.5246, "step": 9825 }, { "epoch": 2.3663938372652864, - "grad_norm": 0.90234375, - "learning_rate": 7.485330884016519e-05, - "loss": 0.3582, + "grad_norm": 1.6171875, + "learning_rate": 2.9845236768468072e-05, + "loss": 0.4786, "step": 9830 }, { "epoch": 2.3675974963890227, - "grad_norm": 0.9765625, - "learning_rate": 7.478697431037657e-05, - "loss": 0.4234, + "grad_norm": 1.75, + "learning_rate": 2.981878810803953e-05, + "loss": 0.5543, "step": 9835 }, { "epoch": 2.3688011555127586, - "grad_norm": 0.921875, - "learning_rate": 7.472074676241218e-05, - "loss": 0.385, + "grad_norm": 1.625, + "learning_rate": 2.9792382103011573e-05, + "loss": 0.5079, "step": 9840 }, { "epoch": 2.370004814636495, - "grad_norm": 1.0234375, - "learning_rate": 7.465462630208658e-05, - "loss": 0.3784, + "grad_norm": 1.8671875, + "learning_rate": 2.9766018795574203e-05, + "loss": 0.4993, "step": 9845 }, { "epoch": 2.3712084737602313, - "grad_norm": 1.0390625, - "learning_rate": 7.458861303504338e-05, - "loss": 0.4164, + "grad_norm": 1.984375, + "learning_rate": 2.973969822784925e-05, + "loss": 0.5406, "step": 9850 }, { "epoch": 2.372412132883967, - "grad_norm": 0.8515625, - "learning_rate": 7.45227070667548e-05, - "loss": 0.3661, + "grad_norm": 1.609375, + "learning_rate": 2.9713420441890215e-05, + "loss": 0.4864, "step": 9855 }, { "epoch": 2.3736157920077035, - "grad_norm": 0.88671875, - "learning_rate": 7.445690850252173e-05, - "loss": 0.364, + "grad_norm": 1.65625, + "learning_rate": 2.9687185479682268e-05, + "loss": 0.4883, "step": 9860 }, { "epoch": 2.3748194511314393, - "grad_norm": 0.953125, - "learning_rate": 7.439121744747338e-05, - "loss": 0.3812, + "grad_norm": 1.7578125, + "learning_rate": 2.966099338314216e-05, + "loss": 0.5011, "step": 9865 }, { "epoch": 2.3760231102551757, - "grad_norm": 0.98828125, - "learning_rate": 7.432563400656723e-05, - "loss": 0.3802, + "grad_norm": 1.875, + "learning_rate": 2.963484419411814e-05, + "loss": 0.5035, "step": 9870 }, { "epoch": 2.377226769378912, - "grad_norm": 1.0234375, - "learning_rate": 7.426015828458882e-05, - "loss": 0.3816, + "grad_norm": 1.9140625, + "learning_rate": 2.9608737954389912e-05, + "loss": 0.5068, "step": 9875 }, { "epoch": 2.3784304285026483, - "grad_norm": 0.9921875, - "learning_rate": 7.419479038615156e-05, - "loss": 0.424, + "grad_norm": 1.7890625, + "learning_rate": 2.9582674705668565e-05, + "loss": 0.5568, "step": 9880 }, { "epoch": 2.379634087626384, - "grad_norm": 1.015625, - "learning_rate": 7.412953041569658e-05, - "loss": 0.38, + "grad_norm": 1.8125, + "learning_rate": 2.955665448959647e-05, + "loss": 0.5055, "step": 9885 }, { "epoch": 2.3808377467501205, - "grad_norm": 0.98828125, - "learning_rate": 7.406437847749255e-05, - "loss": 0.369, + "grad_norm": 1.734375, + "learning_rate": 2.9530677347747264e-05, + "loss": 0.4906, "step": 9890 }, { "epoch": 2.3820414058738564, - "grad_norm": 0.9765625, - "learning_rate": 7.399933467563564e-05, - "loss": 0.4152, + "grad_norm": 1.8203125, + "learning_rate": 2.950474332162577e-05, + "loss": 0.5386, "step": 9895 }, { "epoch": 2.3832450649975927, - "grad_norm": 0.9296875, - "learning_rate": 7.393439911404913e-05, - "loss": 0.3682, + "grad_norm": 1.6484375, + "learning_rate": 2.947885245266791e-05, + "loss": 0.4928, "step": 9900 }, { "epoch": 2.384448724121329, - "grad_norm": 0.95703125, - "learning_rate": 7.38695718964834e-05, - "loss": 0.3761, + "grad_norm": 1.8984375, + "learning_rate": 2.9453004782240654e-05, + "loss": 0.5, "step": 9905 }, { "epoch": 2.385652383245065, - "grad_norm": 1.0546875, - "learning_rate": 7.380485312651573e-05, - "loss": 0.3772, + "grad_norm": 1.921875, + "learning_rate": 2.942720035164196e-05, + "loss": 0.4995, "step": 9910 }, { "epoch": 2.3868560423688012, - "grad_norm": 0.9375, - "learning_rate": 7.374024290755012e-05, - "loss": 0.3977, + "grad_norm": 1.71875, + "learning_rate": 2.9401439202100675e-05, + "loss": 0.5246, "step": 9915 }, { "epoch": 2.388059701492537, - "grad_norm": 0.90625, - "learning_rate": 7.367574134281715e-05, - "loss": 0.3786, + "grad_norm": 1.71875, + "learning_rate": 2.937572137477653e-05, + "loss": 0.4995, "step": 9920 }, { "epoch": 2.3892633606162734, - "grad_norm": 0.91015625, - "learning_rate": 7.361134853537379e-05, - "loss": 0.3779, + "grad_norm": 1.703125, + "learning_rate": 2.935004691076002e-05, + "loss": 0.5009, "step": 9925 }, { "epoch": 2.3904670197400097, - "grad_norm": 1.0078125, - "learning_rate": 7.354706458810322e-05, - "loss": 0.3668, + "grad_norm": 1.90625, + "learning_rate": 2.9324415851072354e-05, + "loss": 0.4791, "step": 9930 }, { "epoch": 2.3916706788637456, - "grad_norm": 0.88671875, - "learning_rate": 7.348288960371473e-05, - "loss": 0.3676, + "grad_norm": 1.5546875, + "learning_rate": 2.9298828236665394e-05, + "loss": 0.4884, "step": 9935 }, { "epoch": 2.392874337987482, - "grad_norm": 1.046875, - "learning_rate": 7.341882368474345e-05, - "loss": 0.3678, + "grad_norm": 2.0, + "learning_rate": 2.9273284108421575e-05, + "loss": 0.4949, "step": 9940 }, { "epoch": 2.3940779971112183, - "grad_norm": 0.9765625, - "learning_rate": 7.335486693355033e-05, - "loss": 0.3742, + "grad_norm": 1.90625, + "learning_rate": 2.9247783507153877e-05, + "loss": 0.4961, "step": 9945 }, { "epoch": 2.395281656234954, - "grad_norm": 0.9453125, - "learning_rate": 7.329101945232187e-05, - "loss": 0.3992, + "grad_norm": 1.703125, + "learning_rate": 2.9222326473605734e-05, + "loss": 0.5217, "step": 9950 }, { "epoch": 2.3964853153586905, - "grad_norm": 0.98046875, - "learning_rate": 7.322728134306994e-05, - "loss": 0.3668, + "grad_norm": 1.75, + "learning_rate": 2.919691304845094e-05, + "loss": 0.4973, "step": 9955 }, { "epoch": 2.397688974482427, - "grad_norm": 0.93359375, - "learning_rate": 7.316365270763175e-05, - "loss": 0.3985, + "grad_norm": 1.7265625, + "learning_rate": 2.917154327229365e-05, + "loss": 0.5235, "step": 9960 }, { "epoch": 2.3988926336061627, - "grad_norm": 0.98046875, - "learning_rate": 7.310013364766951e-05, - "loss": 0.3983, + "grad_norm": 1.7578125, + "learning_rate": 2.9146217185668255e-05, + "loss": 0.5185, "step": 9965 }, { "epoch": 2.400096292729899, - "grad_norm": 0.9453125, - "learning_rate": 7.30367242646704e-05, - "loss": 0.3807, + "grad_norm": 1.796875, + "learning_rate": 2.9120934829039346e-05, + "loss": 0.5057, "step": 9970 }, { "epoch": 2.401299951853635, - "grad_norm": 0.99609375, - "learning_rate": 7.297342465994638e-05, - "loss": 0.4173, + "grad_norm": 1.9140625, + "learning_rate": 2.9095696242801658e-05, + "loss": 0.5483, "step": 9975 }, { "epoch": 2.402503610977371, - "grad_norm": 0.94921875, - "learning_rate": 7.291023493463395e-05, - "loss": 0.3729, + "grad_norm": 1.84375, + "learning_rate": 2.9070501467279964e-05, + "loss": 0.4886, "step": 9980 }, { "epoch": 2.4037072701011075, - "grad_norm": 0.98046875, - "learning_rate": 7.284715518969416e-05, - "loss": 0.3915, + "grad_norm": 1.8203125, + "learning_rate": 2.9045350542729088e-05, + "loss": 0.5102, "step": 9985 }, { "epoch": 2.4049109292248434, - "grad_norm": 0.8671875, - "learning_rate": 7.278418552591222e-05, - "loss": 0.3508, + "grad_norm": 1.609375, + "learning_rate": 2.902024350933373e-05, + "loss": 0.4721, "step": 9990 }, { "epoch": 2.4061145883485797, - "grad_norm": 0.98046875, - "learning_rate": 7.27213260438975e-05, - "loss": 0.3481, + "grad_norm": 1.796875, + "learning_rate": 2.89951804072085e-05, + "loss": 0.4717, "step": 9995 }, { "epoch": 2.4073182474723156, - "grad_norm": 0.99609375, - "learning_rate": 7.265857684408339e-05, - "loss": 0.3894, + "grad_norm": 1.90625, + "learning_rate": 2.897016127639782e-05, + "loss": 0.5145, "step": 10000 }, { "epoch": 2.4073182474723156, - "eval_loss": 0.3551611304283142, - "eval_runtime": 2.3328, - "eval_samples_per_second": 85.735, - "eval_steps_per_second": 85.735, + "eval_loss": 0.44393065571784973, + "eval_runtime": 2.3787, + "eval_samples_per_second": 84.081, + "eval_steps_per_second": 84.081, "step": 10000 }, { "epoch": 2.408521906596052, - "grad_norm": 0.9375, - "learning_rate": 7.259593802672696e-05, - "loss": 0.3652, + "grad_norm": 1.7265625, + "learning_rate": 2.894518615687583e-05, + "loss": 0.4904, "step": 10005 }, { "epoch": 2.4097255657197882, - "grad_norm": 0.95703125, - "learning_rate": 7.253340969190904e-05, - "loss": 0.3659, + "grad_norm": 1.78125, + "learning_rate": 2.892025508854639e-05, + "loss": 0.488, "step": 10010 }, { "epoch": 2.410929224843524, - "grad_norm": 0.89453125, - "learning_rate": 7.247099193953384e-05, - "loss": 0.3746, + "grad_norm": 1.6953125, + "learning_rate": 2.8895368111242938e-05, + "loss": 0.5014, "step": 10015 }, { "epoch": 2.4121328839672604, - "grad_norm": 0.99609375, - "learning_rate": 7.240868486932893e-05, - "loss": 0.3767, + "grad_norm": 1.8203125, + "learning_rate": 2.8870525264728488e-05, + "loss": 0.4977, "step": 10020 }, { "epoch": 2.4133365430909968, - "grad_norm": 1.0546875, - "learning_rate": 7.234648858084507e-05, - "loss": 0.3703, + "grad_norm": 1.8828125, + "learning_rate": 2.884572658869555e-05, + "loss": 0.4956, "step": 10025 }, { "epoch": 2.4145402022147326, - "grad_norm": 0.90234375, - "learning_rate": 7.228440317345595e-05, - "loss": 0.3988, + "grad_norm": 1.640625, + "learning_rate": 2.8820972122766042e-05, + "loss": 0.5232, "step": 10030 }, { "epoch": 2.415743861338469, - "grad_norm": 0.87890625, - "learning_rate": 7.222242874635819e-05, - "loss": 0.3794, + "grad_norm": 1.7109375, + "learning_rate": 2.8796261906491266e-05, + "loss": 0.5102, "step": 10035 }, { "epoch": 2.4169475204622053, - "grad_norm": 0.9765625, - "learning_rate": 7.216056539857098e-05, - "loss": 0.3812, + "grad_norm": 1.8984375, + "learning_rate": 2.8771595979351803e-05, + "loss": 0.5025, "step": 10040 }, { "epoch": 2.418151179585941, - "grad_norm": 1.015625, - "learning_rate": 7.209881322893608e-05, - "loss": 0.4054, + "grad_norm": 1.8984375, + "learning_rate": 2.8746974380757468e-05, + "loss": 0.5364, "step": 10045 }, { "epoch": 2.4193548387096775, - "grad_norm": 1.109375, - "learning_rate": 7.20371723361177e-05, - "loss": 0.396, + "grad_norm": 1.921875, + "learning_rate": 2.8722397150047295e-05, + "loss": 0.5253, "step": 10050 }, { "epoch": 2.4205584978334134, - "grad_norm": 0.98046875, - "learning_rate": 7.197564281860209e-05, - "loss": 0.3732, + "grad_norm": 1.78125, + "learning_rate": 2.869786432648937e-05, + "loss": 0.4977, "step": 10055 }, { "epoch": 2.4217621569571497, - "grad_norm": 0.99609375, - "learning_rate": 7.191422477469773e-05, - "loss": 0.3983, + "grad_norm": 1.7890625, + "learning_rate": 2.8673375949280877e-05, + "loss": 0.521, "step": 10060 }, { "epoch": 2.422965816080886, - "grad_norm": 0.95703125, - "learning_rate": 7.185291830253486e-05, - "loss": 0.3695, + "grad_norm": 1.8046875, + "learning_rate": 2.8648932057547947e-05, + "loss": 0.493, "step": 10065 }, { "epoch": 2.424169475204622, - "grad_norm": 1.046875, - "learning_rate": 7.179172350006551e-05, - "loss": 0.3683, + "grad_norm": 1.8984375, + "learning_rate": 2.8624532690345663e-05, + "loss": 0.4942, "step": 10070 }, { "epoch": 2.425373134328358, - "grad_norm": 0.9609375, - "learning_rate": 7.173064046506333e-05, - "loss": 0.3745, + "grad_norm": 1.734375, + "learning_rate": 2.8600177886657963e-05, + "loss": 0.4983, "step": 10075 }, { "epoch": 2.4265767934520945, - "grad_norm": 0.99609375, - "learning_rate": 7.166966929512329e-05, - "loss": 0.3828, + "grad_norm": 1.703125, + "learning_rate": 2.8575867685397572e-05, + "loss": 0.5024, "step": 10080 }, { "epoch": 2.4277804525758304, - "grad_norm": 0.96484375, - "learning_rate": 7.160881008766172e-05, - "loss": 0.3847, + "grad_norm": 1.7890625, + "learning_rate": 2.8551602125405957e-05, + "loss": 0.5094, "step": 10085 }, { "epoch": 2.4289841116995667, - "grad_norm": 0.84375, - "learning_rate": 7.154806293991606e-05, - "loss": 0.3592, + "grad_norm": 1.6875, + "learning_rate": 2.852738124545328e-05, + "loss": 0.4861, "step": 10090 }, { "epoch": 2.430187770823303, - "grad_norm": 0.96875, - "learning_rate": 7.148742794894461e-05, - "loss": 0.3574, + "grad_norm": 1.7578125, + "learning_rate": 2.850320508423827e-05, + "loss": 0.4793, "step": 10095 }, { "epoch": 2.431391429947039, - "grad_norm": 0.94140625, - "learning_rate": 7.142690521162662e-05, - "loss": 0.3854, + "grad_norm": 1.8515625, + "learning_rate": 2.8479073680388264e-05, + "loss": 0.5085, "step": 10100 }, { "epoch": 2.4325950890707753, - "grad_norm": 1.0, - "learning_rate": 7.136649482466184e-05, - "loss": 0.4074, + "grad_norm": 1.8828125, + "learning_rate": 2.8454987072459036e-05, + "loss": 0.532, "step": 10105 }, { "epoch": 2.433798748194511, - "grad_norm": 0.84765625, - "learning_rate": 7.130619688457064e-05, - "loss": 0.3697, + "grad_norm": 1.53125, + "learning_rate": 2.843094529893483e-05, + "loss": 0.4858, "step": 10110 }, { "epoch": 2.4350024073182475, - "grad_norm": 1.03125, - "learning_rate": 7.124601148769362e-05, - "loss": 0.3931, + "grad_norm": 1.9375, + "learning_rate": 2.8406948398228216e-05, + "loss": 0.5159, "step": 10115 }, { "epoch": 2.436206066441984, - "grad_norm": 0.98828125, - "learning_rate": 7.118593873019167e-05, - "loss": 0.3823, + "grad_norm": 1.9375, + "learning_rate": 2.838299640868011e-05, + "loss": 0.5044, "step": 10120 }, { "epoch": 2.4374097255657197, - "grad_norm": 0.86328125, - "learning_rate": 7.11259787080456e-05, - "loss": 0.3761, + "grad_norm": 1.671875, + "learning_rate": 2.835908936855964e-05, + "loss": 0.5037, "step": 10125 }, { "epoch": 2.438613384689456, - "grad_norm": 0.9609375, - "learning_rate": 7.106613151705618e-05, - "loss": 0.3796, + "grad_norm": 1.8203125, + "learning_rate": 2.8335227316064124e-05, + "loss": 0.5048, "step": 10130 }, { "epoch": 2.439817043813192, - "grad_norm": 1.0390625, - "learning_rate": 7.100639725284389e-05, - "loss": 0.3816, + "grad_norm": 1.9765625, + "learning_rate": 2.8311410289319016e-05, + "loss": 0.5143, "step": 10135 }, { "epoch": 2.441020702936928, - "grad_norm": 1.03125, - "learning_rate": 7.094677601084875e-05, - "loss": 0.3752, + "grad_norm": 1.8515625, + "learning_rate": 2.828763832637781e-05, + "loss": 0.5074, "step": 10140 }, { "epoch": 2.4422243620606645, - "grad_norm": 1.0078125, - "learning_rate": 7.088726788633027e-05, - "loss": 0.3921, + "grad_norm": 1.7265625, + "learning_rate": 2.8263911465222024e-05, + "loss": 0.5161, "step": 10145 }, { "epoch": 2.4434280211844004, - "grad_norm": 0.91015625, - "learning_rate": 7.08278729743671e-05, - "loss": 0.3855, + "grad_norm": 1.7265625, + "learning_rate": 2.824022974376108e-05, + "loss": 0.5114, "step": 10150 }, { "epoch": 2.4446316803081367, - "grad_norm": 0.9609375, - "learning_rate": 7.076859136985713e-05, - "loss": 0.3638, + "grad_norm": 1.796875, + "learning_rate": 2.8216593199832305e-05, + "loss": 0.4899, "step": 10155 }, { "epoch": 2.445835339431873, - "grad_norm": 1.0390625, - "learning_rate": 7.070942316751717e-05, - "loss": 0.3997, + "grad_norm": 1.8984375, + "learning_rate": 2.8193001871200842e-05, + "loss": 0.5283, "step": 10160 }, { "epoch": 2.447038998555609, - "grad_norm": 1.0546875, - "learning_rate": 7.065036846188283e-05, - "loss": 0.3768, + "grad_norm": 1.859375, + "learning_rate": 2.8169455795559577e-05, + "loss": 0.5038, "step": 10165 }, { "epoch": 2.4482426576793452, - "grad_norm": 0.9375, - "learning_rate": 7.059142734730838e-05, - "loss": 0.3869, + "grad_norm": 1.6796875, + "learning_rate": 2.8145955010529112e-05, + "loss": 0.5161, "step": 10170 }, { "epoch": 2.4494463168030816, - "grad_norm": 1.0234375, - "learning_rate": 7.053259991796663e-05, - "loss": 0.3904, + "grad_norm": 1.828125, + "learning_rate": 2.8122499553657673e-05, + "loss": 0.5192, "step": 10175 }, { "epoch": 2.4506499759268174, - "grad_norm": 0.984375, - "learning_rate": 7.04738862678487e-05, - "loss": 0.3864, + "grad_norm": 1.8046875, + "learning_rate": 2.8099089462421063e-05, + "loss": 0.5176, "step": 10180 }, { "epoch": 2.4518536350505538, - "grad_norm": 0.9921875, - "learning_rate": 7.0415286490764e-05, - "loss": 0.3742, + "grad_norm": 1.8359375, + "learning_rate": 2.8075724774222617e-05, + "loss": 0.4964, "step": 10185 }, { "epoch": 2.4530572941742896, - "grad_norm": 0.9296875, - "learning_rate": 7.035680068033992e-05, - "loss": 0.3722, + "grad_norm": 1.6640625, + "learning_rate": 2.805240552639311e-05, + "loss": 0.4958, "step": 10190 }, { "epoch": 2.454260953298026, - "grad_norm": 0.92578125, - "learning_rate": 7.029842893002179e-05, - "loss": 0.3881, + "grad_norm": 1.6953125, + "learning_rate": 2.8029131756190724e-05, + "loss": 0.5101, "step": 10195 }, { "epoch": 2.4554646124217623, - "grad_norm": 0.921875, - "learning_rate": 7.024017133307273e-05, - "loss": 0.3726, + "grad_norm": 1.7421875, + "learning_rate": 2.800590350080098e-05, + "loss": 0.4954, "step": 10200 }, { "epoch": 2.456668271545498, - "grad_norm": 1.015625, - "learning_rate": 7.018202798257341e-05, - "loss": 0.3806, + "grad_norm": 1.8828125, + "learning_rate": 2.7982720797336664e-05, + "loss": 0.5019, "step": 10205 }, { "epoch": 2.4578719306692345, - "grad_norm": 1.0390625, - "learning_rate": 7.012399897142203e-05, - "loss": 0.3832, + "grad_norm": 1.796875, + "learning_rate": 2.7959583682837812e-05, + "loss": 0.5106, "step": 10210 }, { "epoch": 2.459075589792971, - "grad_norm": 1.0546875, - "learning_rate": 7.006608439233404e-05, - "loss": 0.4091, + "grad_norm": 1.8203125, + "learning_rate": 2.793649219427158e-05, + "loss": 0.5372, "step": 10215 }, { "epoch": 2.4602792489167067, - "grad_norm": 0.9296875, - "learning_rate": 7.000828433784213e-05, - "loss": 0.3993, + "grad_norm": 1.734375, + "learning_rate": 2.7913446368532267e-05, + "loss": 0.5266, "step": 10220 }, { "epoch": 2.461482908040443, - "grad_norm": 0.9609375, - "learning_rate": 6.995059890029594e-05, - "loss": 0.3766, + "grad_norm": 1.8515625, + "learning_rate": 2.789044624244118e-05, + "loss": 0.5055, "step": 10225 }, { "epoch": 2.4626865671641793, - "grad_norm": 0.984375, - "learning_rate": 6.989302817186201e-05, - "loss": 0.3802, + "grad_norm": 1.8046875, + "learning_rate": 2.786749185274663e-05, + "loss": 0.5008, "step": 10230 }, { "epoch": 2.463890226287915, - "grad_norm": 0.96875, - "learning_rate": 6.983557224452366e-05, - "loss": 0.3807, + "grad_norm": 1.796875, + "learning_rate": 2.7844583236123862e-05, + "loss": 0.5061, "step": 10235 }, { "epoch": 2.4650938854116515, - "grad_norm": 0.921875, - "learning_rate": 6.977823121008066e-05, - "loss": 0.3633, + "grad_norm": 1.640625, + "learning_rate": 2.7821720429174945e-05, + "loss": 0.4883, "step": 10240 }, { "epoch": 2.4662975445353874, - "grad_norm": 0.9609375, - "learning_rate": 6.972100516014932e-05, - "loss": 0.4012, + "grad_norm": 1.6484375, + "learning_rate": 2.7798903468428804e-05, + "loss": 0.5378, "step": 10245 }, { "epoch": 2.4675012036591237, - "grad_norm": 0.9921875, - "learning_rate": 6.96638941861622e-05, - "loss": 0.3709, + "grad_norm": 1.765625, + "learning_rate": 2.7776132390341087e-05, + "loss": 0.4965, "step": 10250 }, { "epoch": 2.46870486278286, - "grad_norm": 1.046875, - "learning_rate": 6.960689837936796e-05, - "loss": 0.388, + "grad_norm": 1.8515625, + "learning_rate": 2.7753407231294127e-05, + "loss": 0.5139, "step": 10255 }, { "epoch": 2.469908521906596, - "grad_norm": 0.9296875, - "learning_rate": 6.955001783083136e-05, - "loss": 0.386, + "grad_norm": 1.75, + "learning_rate": 2.7730728027596928e-05, + "loss": 0.5105, "step": 10260 }, { "epoch": 2.4711121810303323, - "grad_norm": 0.9921875, - "learning_rate": 6.949325263143284e-05, - "loss": 0.3856, + "grad_norm": 1.6796875, + "learning_rate": 2.770809481548502e-05, + "loss": 0.5123, "step": 10265 }, { "epoch": 2.472315840154068, - "grad_norm": 0.98828125, - "learning_rate": 6.943660287186872e-05, - "loss": 0.3852, + "grad_norm": 1.8359375, + "learning_rate": 2.7685507631120492e-05, + "loss": 0.5131, "step": 10270 }, { "epoch": 2.4735194992778045, - "grad_norm": 0.98046875, - "learning_rate": 6.938006864265074e-05, - "loss": 0.398, + "grad_norm": 1.8359375, + "learning_rate": 2.7662966510591853e-05, + "loss": 0.5229, "step": 10275 }, { "epoch": 2.474723158401541, - "grad_norm": 0.95703125, - "learning_rate": 6.932365003410615e-05, - "loss": 0.3785, + "grad_norm": 1.7734375, + "learning_rate": 2.7640471489914056e-05, + "loss": 0.4986, "step": 10280 }, { "epoch": 2.4759268175252767, - "grad_norm": 1.0078125, - "learning_rate": 6.92673471363774e-05, - "loss": 0.378, + "grad_norm": 1.890625, + "learning_rate": 2.761802260502837e-05, + "loss": 0.5047, "step": 10285 }, { "epoch": 2.477130476649013, - "grad_norm": 1.0234375, - "learning_rate": 6.921116003942208e-05, - "loss": 0.3702, + "grad_norm": 1.84375, + "learning_rate": 2.7595619891802358e-05, + "loss": 0.4911, "step": 10290 }, { "epoch": 2.4783341357727493, - "grad_norm": 1.03125, - "learning_rate": 6.915508883301278e-05, - "loss": 0.3715, + "grad_norm": 1.8203125, + "learning_rate": 2.7573263386029815e-05, + "loss": 0.4959, "step": 10295 }, { "epoch": 2.479537794896485, - "grad_norm": 0.89453125, - "learning_rate": 6.90991336067369e-05, - "loss": 0.3744, + "grad_norm": 1.6484375, + "learning_rate": 2.75509531234307e-05, + "loss": 0.4983, "step": 10300 }, { "epoch": 2.4807414540202215, - "grad_norm": 0.9765625, - "learning_rate": 6.904329444999657e-05, - "loss": 0.4, + "grad_norm": 1.84375, + "learning_rate": 2.7528689139651097e-05, + "loss": 0.5337, "step": 10305 }, { "epoch": 2.481945113143958, - "grad_norm": 0.92578125, - "learning_rate": 6.898757145200843e-05, - "loss": 0.357, + "grad_norm": 1.6640625, + "learning_rate": 2.7506471470263154e-05, + "loss": 0.4762, "step": 10310 }, { "epoch": 2.4831487722676937, - "grad_norm": 1.0234375, - "learning_rate": 6.893196470180354e-05, - "loss": 0.384, + "grad_norm": 1.921875, + "learning_rate": 2.7484300150764987e-05, + "loss": 0.5116, "step": 10315 }, { "epoch": 2.48435243139143, - "grad_norm": 0.98046875, - "learning_rate": 6.887647428822726e-05, - "loss": 0.373, + "grad_norm": 1.71875, + "learning_rate": 2.7462175216580705e-05, + "loss": 0.4945, "step": 10320 }, { "epoch": 2.485556090515166, - "grad_norm": 0.94921875, - "learning_rate": 6.882110029993899e-05, - "loss": 0.3975, + "grad_norm": 1.78125, + "learning_rate": 2.744009670306026e-05, + "loss": 0.5256, "step": 10325 }, { "epoch": 2.4867597496389022, - "grad_norm": 0.9375, - "learning_rate": 6.876584282541223e-05, - "loss": 0.3636, + "grad_norm": 1.8359375, + "learning_rate": 2.741806464547947e-05, + "loss": 0.4868, "step": 10330 }, { "epoch": 2.4879634087626386, - "grad_norm": 0.9765625, - "learning_rate": 6.871070195293424e-05, - "loss": 0.3846, + "grad_norm": 1.8125, + "learning_rate": 2.7396079079039927e-05, + "loss": 0.5114, "step": 10335 }, { "epoch": 2.4891670678863744, - "grad_norm": 1.078125, - "learning_rate": 6.865567777060598e-05, - "loss": 0.3798, + "grad_norm": 1.8984375, + "learning_rate": 2.7374140038868904e-05, + "loss": 0.4992, "step": 10340 }, { "epoch": 2.4903707270101108, - "grad_norm": 0.96875, - "learning_rate": 6.860077036634202e-05, - "loss": 0.4189, + "grad_norm": 1.7421875, + "learning_rate": 2.7352247560019385e-05, + "loss": 0.5399, "step": 10345 }, { "epoch": 2.491574386133847, - "grad_norm": 0.95703125, - "learning_rate": 6.854597982787028e-05, - "loss": 0.3581, + "grad_norm": 1.75, + "learning_rate": 2.7330401677469922e-05, + "loss": 0.4825, "step": 10350 }, { "epoch": 2.492778045257583, - "grad_norm": 1.015625, - "learning_rate": 6.849130624273203e-05, - "loss": 0.3756, + "grad_norm": 1.8203125, + "learning_rate": 2.7308602426124648e-05, + "loss": 0.4974, "step": 10355 }, { "epoch": 2.4939817043813193, - "grad_norm": 0.9765625, - "learning_rate": 6.843674969828162e-05, - "loss": 0.3878, + "grad_norm": 1.6953125, + "learning_rate": 2.7286849840813173e-05, + "loss": 0.5078, "step": 10360 }, { "epoch": 2.4951853635050556, - "grad_norm": 1.03125, - "learning_rate": 6.838231028168644e-05, - "loss": 0.4052, + "grad_norm": 1.875, + "learning_rate": 2.7265143956290553e-05, + "loss": 0.5313, "step": 10365 }, { "epoch": 2.4963890226287915, - "grad_norm": 1.015625, - "learning_rate": 6.83279880799267e-05, - "loss": 0.4026, + "grad_norm": 1.78125, + "learning_rate": 2.7243484807237226e-05, + "loss": 0.5245, "step": 10370 }, { "epoch": 2.497592681752528, - "grad_norm": 0.9296875, - "learning_rate": 6.827378317979534e-05, - "loss": 0.3926, + "grad_norm": 1.7109375, + "learning_rate": 2.7221872428258948e-05, + "loss": 0.5195, "step": 10375 }, { "epoch": 2.4987963408762637, - "grad_norm": 1.0390625, - "learning_rate": 6.821969566789795e-05, - "loss": 0.3737, + "grad_norm": 1.9296875, + "learning_rate": 2.720030685388679e-05, + "loss": 0.4962, "step": 10380 }, { "epoch": 2.5, - "grad_norm": 1.03125, - "learning_rate": 6.816572563065244e-05, - "loss": 0.4109, + "grad_norm": 1.828125, + "learning_rate": 2.717878811857698e-05, + "loss": 0.534, "step": 10385 }, { "epoch": 2.5012036591237363, - "grad_norm": 0.9609375, - "learning_rate": 6.811187315428915e-05, - "loss": 0.3564, + "grad_norm": 1.8359375, + "learning_rate": 2.7157316256710952e-05, + "loss": 0.4767, "step": 10390 }, { "epoch": 2.502407318247472, - "grad_norm": 1.015625, - "learning_rate": 6.805813832485053e-05, - "loss": 0.4221, + "grad_norm": 1.8046875, + "learning_rate": 2.7135891302595257e-05, + "loss": 0.5561, "step": 10395 }, { "epoch": 2.5036109773712085, - "grad_norm": 0.9296875, - "learning_rate": 6.800452122819103e-05, - "loss": 0.3687, + "grad_norm": 1.703125, + "learning_rate": 2.7114513290461447e-05, + "loss": 0.4915, "step": 10400 }, { "epoch": 2.5048146364949444, - "grad_norm": 0.9296875, - "learning_rate": 6.795102194997705e-05, - "loss": 0.3671, + "grad_norm": 1.7265625, + "learning_rate": 2.7093182254466127e-05, + "loss": 0.4886, "step": 10405 }, { "epoch": 2.5060182956186807, - "grad_norm": 0.8671875, - "learning_rate": 6.789764057568671e-05, - "loss": 0.3768, + "grad_norm": 1.703125, + "learning_rate": 2.7071898228690814e-05, + "loss": 0.4973, "step": 10410 }, { "epoch": 2.507221954742417, - "grad_norm": 0.984375, - "learning_rate": 6.784437719060974e-05, - "loss": 0.3771, + "grad_norm": 1.890625, + "learning_rate": 2.7050661247141925e-05, + "loss": 0.5028, "step": 10415 }, { "epoch": 2.5084256138661534, - "grad_norm": 0.92578125, - "learning_rate": 6.779123187984744e-05, - "loss": 0.3835, + "grad_norm": 1.71875, + "learning_rate": 2.7029471343750724e-05, + "loss": 0.5052, "step": 10420 }, { "epoch": 2.5096292729898892, - "grad_norm": 1.015625, - "learning_rate": 6.773820472831233e-05, - "loss": 0.3942, + "grad_norm": 1.8515625, + "learning_rate": 2.7008328552373228e-05, + "loss": 0.5204, "step": 10425 }, { "epoch": 2.5108329321136256, - "grad_norm": 0.953125, - "learning_rate": 6.768529582072822e-05, - "loss": 0.3932, + "grad_norm": 1.875, + "learning_rate": 2.698723290679021e-05, + "loss": 0.5232, "step": 10430 }, { "epoch": 2.5120365912373614, - "grad_norm": 0.92578125, - "learning_rate": 6.763250524162999e-05, - "loss": 0.3669, + "grad_norm": 1.765625, + "learning_rate": 2.6966184440707088e-05, + "loss": 0.4943, "step": 10435 }, { "epoch": 2.5132402503610978, - "grad_norm": 0.9921875, - "learning_rate": 6.757983307536345e-05, - "loss": 0.3671, + "grad_norm": 1.953125, + "learning_rate": 2.694518318775393e-05, + "loss": 0.4872, "step": 10440 }, { "epoch": 2.514443909484834, - "grad_norm": 1.0546875, - "learning_rate": 6.752727940608524e-05, - "loss": 0.3995, + "grad_norm": 1.921875, + "learning_rate": 2.6924229181485365e-05, + "loss": 0.5228, "step": 10445 }, { "epoch": 2.51564756860857, - "grad_norm": 0.95703125, - "learning_rate": 6.747484431776261e-05, - "loss": 0.3645, + "grad_norm": 1.71875, + "learning_rate": 2.6903322455380495e-05, + "loss": 0.4856, "step": 10450 }, { "epoch": 2.5168512277323063, - "grad_norm": 1.0, - "learning_rate": 6.742252789417342e-05, - "loss": 0.3829, + "grad_norm": 1.7890625, + "learning_rate": 2.688246304284293e-05, + "loss": 0.506, "step": 10455 }, { "epoch": 2.518054886856042, - "grad_norm": 0.96484375, - "learning_rate": 6.737033021890588e-05, - "loss": 0.3747, + "grad_norm": 1.8515625, + "learning_rate": 2.686165097720065e-05, + "loss": 0.501, "step": 10460 }, { "epoch": 2.5192585459797785, - "grad_norm": 1.0546875, - "learning_rate": 6.731825137535853e-05, - "loss": 0.3803, + "grad_norm": 1.8984375, + "learning_rate": 2.6840886291706015e-05, + "loss": 0.5123, "step": 10465 }, { "epoch": 2.520462205103515, - "grad_norm": 0.93359375, - "learning_rate": 6.726629144673997e-05, - "loss": 0.371, + "grad_norm": 1.59375, + "learning_rate": 2.6820169019535656e-05, + "loss": 0.4959, "step": 10470 }, { "epoch": 2.5216658642272507, - "grad_norm": 0.96875, - "learning_rate": 6.721445051606887e-05, - "loss": 0.3885, + "grad_norm": 1.828125, + "learning_rate": 2.6799499193790473e-05, + "loss": 0.5163, "step": 10475 }, { "epoch": 2.522869523350987, - "grad_norm": 0.97265625, - "learning_rate": 6.716272866617375e-05, - "loss": 0.3639, + "grad_norm": 1.7734375, + "learning_rate": 2.677887684749555e-05, + "loss": 0.4854, "step": 10480 }, { "epoch": 2.524073182474723, - "grad_norm": 0.9609375, - "learning_rate": 6.711112597969284e-05, - "loss": 0.3681, + "grad_norm": 1.71875, + "learning_rate": 2.6758302013600086e-05, + "loss": 0.4947, "step": 10485 }, { "epoch": 2.525276841598459, - "grad_norm": 1.0625, - "learning_rate": 6.705964253907401e-05, - "loss": 0.3629, + "grad_norm": 1.84375, + "learning_rate": 2.6737774724977416e-05, + "loss": 0.4811, "step": 10490 }, { "epoch": 2.5264805007221955, - "grad_norm": 0.91796875, - "learning_rate": 6.700827842657465e-05, - "loss": 0.3651, + "grad_norm": 1.625, + "learning_rate": 2.6717295014424885e-05, + "loss": 0.4926, "step": 10495 }, { "epoch": 2.527684159845932, - "grad_norm": 0.890625, - "learning_rate": 6.695703372426138e-05, - "loss": 0.3695, + "grad_norm": 1.625, + "learning_rate": 2.6696862914663806e-05, + "loss": 0.4918, "step": 10500 }, { "epoch": 2.527684159845932, - "eval_loss": 0.35426628589630127, - "eval_runtime": 2.3371, - "eval_samples_per_second": 85.576, - "eval_steps_per_second": 85.576, + "eval_loss": 0.44249993562698364, + "eval_runtime": 2.3724, + "eval_samples_per_second": 84.304, + "eval_steps_per_second": 84.304, "step": 10500 }, { "epoch": 2.5288878189696677, - "grad_norm": 1.03125, - "learning_rate": 6.690590851401017e-05, - "loss": 0.3796, + "grad_norm": 1.859375, + "learning_rate": 2.6676478458339456e-05, + "loss": 0.5077, "step": 10505 }, { "epoch": 2.530091478093404, - "grad_norm": 0.9453125, - "learning_rate": 6.685490287750592e-05, - "loss": 0.3864, + "grad_norm": 1.7890625, + "learning_rate": 2.6656141678020952e-05, + "loss": 0.5073, "step": 10510 }, { "epoch": 2.53129513721714, - "grad_norm": 1.0078125, - "learning_rate": 6.68040168962426e-05, - "loss": 0.3689, + "grad_norm": 1.8359375, + "learning_rate": 2.6635852606201266e-05, + "loss": 0.4951, "step": 10515 }, { "epoch": 2.5324987963408763, - "grad_norm": 1.0078125, - "learning_rate": 6.675325065152299e-05, - "loss": 0.3656, + "grad_norm": 1.90625, + "learning_rate": 2.6615611275297132e-05, + "loss": 0.4909, "step": 10520 }, { "epoch": 2.5337024554646126, - "grad_norm": 0.92578125, - "learning_rate": 6.670260422445847e-05, - "loss": 0.3775, + "grad_norm": 1.765625, + "learning_rate": 2.6595417717648996e-05, + "loss": 0.4989, "step": 10525 }, { "epoch": 2.5349061145883485, - "grad_norm": 0.96875, - "learning_rate": 6.665207769596911e-05, - "loss": 0.3524, + "grad_norm": 1.75, + "learning_rate": 2.657527196552101e-05, + "loss": 0.4746, "step": 10530 }, { "epoch": 2.536109773712085, - "grad_norm": 0.91015625, - "learning_rate": 6.66016711467833e-05, - "loss": 0.3779, + "grad_norm": 1.6875, + "learning_rate": 2.6555174051100893e-05, + "loss": 0.5065, "step": 10535 }, { "epoch": 2.5373134328358207, - "grad_norm": 0.953125, - "learning_rate": 6.655138465743781e-05, - "loss": 0.3791, + "grad_norm": 1.8203125, + "learning_rate": 2.6535124006499987e-05, + "loss": 0.5075, "step": 10540 }, { "epoch": 2.538517091959557, - "grad_norm": 0.9453125, - "learning_rate": 6.650121830827758e-05, - "loss": 0.3778, + "grad_norm": 1.7890625, + "learning_rate": 2.6515121863753107e-05, + "loss": 0.5003, "step": 10545 }, { "epoch": 2.5397207510832933, - "grad_norm": 0.953125, - "learning_rate": 6.645117217945553e-05, - "loss": 0.3765, + "grad_norm": 1.828125, + "learning_rate": 2.6495167654818555e-05, + "loss": 0.5073, "step": 10550 }, { "epoch": 2.5409244102070296, - "grad_norm": 1.046875, - "learning_rate": 6.640124635093258e-05, - "loss": 0.3838, + "grad_norm": 1.8203125, + "learning_rate": 2.6475261411578046e-05, + "loss": 0.511, "step": 10555 }, { "epoch": 2.5421280693307655, - "grad_norm": 1.078125, - "learning_rate": 6.635144090247737e-05, - "loss": 0.3703, + "grad_norm": 1.953125, + "learning_rate": 2.645540316583664e-05, + "loss": 0.4941, "step": 10560 }, { "epoch": 2.543331728454502, - "grad_norm": 0.96484375, - "learning_rate": 6.630175591366627e-05, - "loss": 0.3701, + "grad_norm": 1.8125, + "learning_rate": 2.6435592949322736e-05, + "loss": 0.4944, "step": 10565 }, { "epoch": 2.5445353875782377, - "grad_norm": 0.9921875, - "learning_rate": 6.625219146388313e-05, - "loss": 0.3728, + "grad_norm": 1.890625, + "learning_rate": 2.641583079368797e-05, + "loss": 0.5039, "step": 10570 }, { "epoch": 2.545739046701974, - "grad_norm": 0.94921875, - "learning_rate": 6.62027476323193e-05, - "loss": 0.3474, + "grad_norm": 1.7109375, + "learning_rate": 2.6396116730507218e-05, + "loss": 0.4688, "step": 10575 }, { "epoch": 2.5469427058257104, - "grad_norm": 0.8984375, - "learning_rate": 6.615342449797326e-05, - "loss": 0.39, + "grad_norm": 1.671875, + "learning_rate": 2.6376450791278488e-05, + "loss": 0.5159, "step": 10580 }, { "epoch": 2.5481463649494462, - "grad_norm": 0.94921875, - "learning_rate": 6.610422213965077e-05, - "loss": 0.4172, + "grad_norm": 1.7734375, + "learning_rate": 2.63568330074229e-05, + "loss": 0.5476, "step": 10585 }, { "epoch": 2.5493500240731826, - "grad_norm": 1.03125, - "learning_rate": 6.60551406359646e-05, - "loss": 0.3828, + "grad_norm": 1.7890625, + "learning_rate": 2.6337263410284665e-05, + "loss": 0.5076, "step": 10590 }, { "epoch": 2.5505536831969184, - "grad_norm": 0.87890625, - "learning_rate": 6.600618006533439e-05, - "loss": 0.4119, + "grad_norm": 1.75, + "learning_rate": 2.631774203113096e-05, + "loss": 0.5403, "step": 10595 }, { "epoch": 2.5517573423206548, - "grad_norm": 0.95703125, - "learning_rate": 6.59573405059866e-05, - "loss": 0.375, + "grad_norm": 1.71875, + "learning_rate": 2.6298268901151964e-05, + "loss": 0.4959, "step": 10600 }, { "epoch": 2.552961001444391, - "grad_norm": 1.1484375, - "learning_rate": 6.590862203595433e-05, - "loss": 0.4003, + "grad_norm": 1.9453125, + "learning_rate": 2.6278844051460745e-05, + "loss": 0.5305, "step": 10605 }, { "epoch": 2.554164660568127, - "grad_norm": 0.96875, - "learning_rate": 6.586002473307714e-05, - "loss": 0.3615, + "grad_norm": 1.8203125, + "learning_rate": 2.625946751309321e-05, + "loss": 0.4833, "step": 10610 }, { "epoch": 2.5553683196918633, - "grad_norm": 1.0859375, - "learning_rate": 6.581154867500117e-05, - "loss": 0.3735, + "grad_norm": 1.90625, + "learning_rate": 2.6240139317008135e-05, + "loss": 0.4973, "step": 10615 }, { "epoch": 2.556571978815599, - "grad_norm": 0.9609375, - "learning_rate": 6.576319393917863e-05, - "loss": 0.3613, + "grad_norm": 1.7890625, + "learning_rate": 2.6220859494086986e-05, + "loss": 0.4845, "step": 10620 }, { "epoch": 2.5577756379393355, - "grad_norm": 0.9765625, - "learning_rate": 6.571496060286808e-05, - "loss": 0.366, + "grad_norm": 1.7734375, + "learning_rate": 2.620162807513401e-05, + "loss": 0.4915, "step": 10625 }, { "epoch": 2.558979297063072, - "grad_norm": 0.9375, - "learning_rate": 6.566684874313397e-05, - "loss": 0.3972, + "grad_norm": 1.75, + "learning_rate": 2.618244509087607e-05, + "loss": 0.5256, "step": 10630 }, { "epoch": 2.560182956186808, - "grad_norm": 0.9140625, - "learning_rate": 6.561885843684673e-05, - "loss": 0.3866, + "grad_norm": 1.75, + "learning_rate": 2.616331057196265e-05, + "loss": 0.5161, "step": 10635 }, { "epoch": 2.561386615310544, - "grad_norm": 0.95703125, - "learning_rate": 6.557098976068259e-05, - "loss": 0.3796, + "grad_norm": 1.71875, + "learning_rate": 2.6144224548965826e-05, + "loss": 0.5071, "step": 10640 }, { "epoch": 2.5625902744342803, - "grad_norm": 0.8671875, - "learning_rate": 6.552324279112338e-05, - "loss": 0.3984, + "grad_norm": 1.5859375, + "learning_rate": 2.6125187052380144e-05, + "loss": 0.5356, "step": 10645 }, { "epoch": 2.563793933558016, - "grad_norm": 0.94140625, - "learning_rate": 6.547561760445653e-05, - "loss": 0.3706, + "grad_norm": 1.75, + "learning_rate": 2.6106198112622658e-05, + "loss": 0.4935, "step": 10650 }, { "epoch": 2.5649975926817525, - "grad_norm": 0.9296875, - "learning_rate": 6.542811427677492e-05, - "loss": 0.3766, + "grad_norm": 1.703125, + "learning_rate": 2.6087257760032834e-05, + "loss": 0.507, "step": 10655 }, { "epoch": 2.566201251805489, - "grad_norm": 0.9140625, - "learning_rate": 6.538073288397665e-05, - "loss": 0.3775, + "grad_norm": 1.6875, + "learning_rate": 2.6068366024872482e-05, + "loss": 0.5048, "step": 10660 }, { "epoch": 2.5674049109292247, - "grad_norm": 1.0546875, - "learning_rate": 6.533347350176507e-05, - "loss": 0.3992, + "grad_norm": 1.8515625, + "learning_rate": 2.604952293732577e-05, + "loss": 0.5287, "step": 10665 }, { "epoch": 2.568608570052961, - "grad_norm": 0.95703125, - "learning_rate": 6.52863362056485e-05, - "loss": 0.3846, + "grad_norm": 1.7421875, + "learning_rate": 2.6030728527499107e-05, + "loss": 0.5108, "step": 10670 }, { "epoch": 2.569812229176697, - "grad_norm": 0.9375, - "learning_rate": 6.523932107094033e-05, - "loss": 0.3707, + "grad_norm": 1.703125, + "learning_rate": 2.601198282542115e-05, + "loss": 0.4971, "step": 10675 }, { "epoch": 2.5710158883004333, - "grad_norm": 0.93359375, - "learning_rate": 6.519242817275864e-05, - "loss": 0.3462, + "grad_norm": 1.7265625, + "learning_rate": 2.599328586104272e-05, + "loss": 0.4708, "step": 10680 }, { "epoch": 2.5722195474241696, - "grad_norm": 1.0390625, - "learning_rate": 6.514565758602627e-05, - "loss": 0.3686, + "grad_norm": 1.8671875, + "learning_rate": 2.5974637664236774e-05, + "loss": 0.4978, "step": 10685 }, { "epoch": 2.573423206547906, - "grad_norm": 0.984375, - "learning_rate": 6.509900938547065e-05, - "loss": 0.3869, + "grad_norm": 1.859375, + "learning_rate": 2.5956038264798357e-05, + "loss": 0.512, "step": 10690 }, { "epoch": 2.574626865671642, - "grad_norm": 0.98046875, - "learning_rate": 6.505248364562362e-05, - "loss": 0.3946, + "grad_norm": 1.6875, + "learning_rate": 2.593748769244453e-05, + "loss": 0.5203, "step": 10695 }, { "epoch": 2.575830524795378, - "grad_norm": 1.015625, - "learning_rate": 6.50060804408214e-05, - "loss": 0.3678, + "grad_norm": 1.84375, + "learning_rate": 2.591898597681436e-05, + "loss": 0.4895, "step": 10700 }, { "epoch": 2.577034183919114, - "grad_norm": 0.96875, - "learning_rate": 6.495979984520442e-05, - "loss": 0.3817, + "grad_norm": 1.859375, + "learning_rate": 2.5900533147468847e-05, + "loss": 0.5136, "step": 10705 }, { "epoch": 2.5782378430428503, - "grad_norm": 1.0859375, - "learning_rate": 6.491364193271718e-05, - "loss": 0.3853, + "grad_norm": 2.3125, + "learning_rate": 2.5882129233890876e-05, + "loss": 0.5157, "step": 10710 }, { "epoch": 2.5794415021665866, - "grad_norm": 1.0078125, - "learning_rate": 6.486760677710823e-05, - "loss": 0.3823, + "grad_norm": 1.7890625, + "learning_rate": 2.5863774265485184e-05, + "loss": 0.5091, "step": 10715 }, { "epoch": 2.5806451612903225, - "grad_norm": 0.94921875, - "learning_rate": 6.48216944519299e-05, - "loss": 0.404, + "grad_norm": 1.7734375, + "learning_rate": 2.5845468271578303e-05, + "loss": 0.5329, "step": 10720 }, { "epoch": 2.581848820414059, - "grad_norm": 0.9375, - "learning_rate": 6.477590503053839e-05, - "loss": 0.3634, + "grad_norm": 1.71875, + "learning_rate": 2.5827211281418537e-05, + "loss": 0.4933, "step": 10725 }, { "epoch": 2.5830524795377947, - "grad_norm": 0.86328125, - "learning_rate": 6.473023858609337e-05, - "loss": 0.3883, + "grad_norm": 1.6953125, + "learning_rate": 2.5809003324175848e-05, + "loss": 0.5242, "step": 10730 }, { "epoch": 2.584256138661531, - "grad_norm": 1.0078125, - "learning_rate": 6.468469519155818e-05, - "loss": 0.3984, + "grad_norm": 1.8203125, + "learning_rate": 2.57908444289419e-05, + "loss": 0.5245, "step": 10735 }, { "epoch": 2.5854597977852674, - "grad_norm": 0.984375, - "learning_rate": 6.463927491969946e-05, - "loss": 0.3616, + "grad_norm": 1.8515625, + "learning_rate": 2.5772734624729958e-05, + "loss": 0.4852, "step": 10740 }, { "epoch": 2.5866634569090032, - "grad_norm": 0.9921875, - "learning_rate": 6.459397784308715e-05, - "loss": 0.3955, + "grad_norm": 1.75, + "learning_rate": 2.575467394047483e-05, + "loss": 0.5249, "step": 10745 }, { "epoch": 2.5878671160327396, - "grad_norm": 0.9140625, - "learning_rate": 6.454880403409437e-05, - "loss": 0.3592, + "grad_norm": 1.7578125, + "learning_rate": 2.5736662405032863e-05, + "loss": 0.4785, "step": 10750 }, { "epoch": 2.5890707751564754, - "grad_norm": 0.921875, - "learning_rate": 6.450375356489732e-05, - "loss": 0.3819, + "grad_norm": 1.6640625, + "learning_rate": 2.5718700047181888e-05, + "loss": 0.5083, "step": 10755 }, { "epoch": 2.5902744342802118, - "grad_norm": 1.0, - "learning_rate": 6.445882650747506e-05, - "loss": 0.3759, + "grad_norm": 1.8359375, + "learning_rate": 2.5700786895621134e-05, + "loss": 0.501, "step": 10760 }, { "epoch": 2.591478093403948, - "grad_norm": 1.0078125, - "learning_rate": 6.441402293360954e-05, - "loss": 0.3487, + "grad_norm": 1.8671875, + "learning_rate": 2.5682922978971232e-05, + "loss": 0.4724, "step": 10765 }, { "epoch": 2.5926817525276844, - "grad_norm": 0.97265625, - "learning_rate": 6.436934291488535e-05, - "loss": 0.3749, + "grad_norm": 1.8828125, + "learning_rate": 2.566510832577413e-05, + "loss": 0.5029, "step": 10770 }, { "epoch": 2.5938854116514203, - "grad_norm": 0.9453125, - "learning_rate": 6.432478652268974e-05, - "loss": 0.3469, + "grad_norm": 1.8359375, + "learning_rate": 2.5647342964493096e-05, + "loss": 0.4711, "step": 10775 }, { "epoch": 2.5950890707751566, - "grad_norm": 1.0390625, - "learning_rate": 6.428035382821242e-05, - "loss": 0.4054, + "grad_norm": 1.8828125, + "learning_rate": 2.5629626923512613e-05, + "loss": 0.5401, "step": 10780 }, { "epoch": 2.5962927298988925, - "grad_norm": 0.953125, - "learning_rate": 6.42360449024454e-05, - "loss": 0.3592, + "grad_norm": 1.671875, + "learning_rate": 2.5611960231138376e-05, + "loss": 0.4761, "step": 10785 }, { "epoch": 2.597496389022629, - "grad_norm": 0.98828125, - "learning_rate": 6.4191859816183e-05, - "loss": 0.3836, + "grad_norm": 1.765625, + "learning_rate": 2.5594342915597223e-05, + "loss": 0.5107, "step": 10790 }, { "epoch": 2.598700048146365, - "grad_norm": 0.90625, - "learning_rate": 6.414779864002165e-05, - "loss": 0.3786, + "grad_norm": 1.65625, + "learning_rate": 2.5576775005037112e-05, + "loss": 0.5045, "step": 10795 }, { "epoch": 2.599903707270101, - "grad_norm": 0.89453125, - "learning_rate": 6.410386144435985e-05, - "loss": 0.3782, + "grad_norm": 1.6640625, + "learning_rate": 2.5559256527527062e-05, + "loss": 0.507, "step": 10800 }, { "epoch": 2.6011073663938373, - "grad_norm": 0.94140625, - "learning_rate": 6.406004829939794e-05, - "loss": 0.3616, + "grad_norm": 1.6640625, + "learning_rate": 2.5541787511057106e-05, + "loss": 0.4856, "step": 10805 }, { "epoch": 2.602311025517573, - "grad_norm": 1.046875, - "learning_rate": 6.40163592751381e-05, - "loss": 0.3922, + "grad_norm": 1.875, + "learning_rate": 2.552436798353826e-05, + "loss": 0.518, "step": 10810 }, { "epoch": 2.6035146846413095, - "grad_norm": 0.9921875, - "learning_rate": 6.39727944413842e-05, - "loss": 0.3729, + "grad_norm": 1.9921875, + "learning_rate": 2.5506997972802453e-05, + "loss": 0.5011, "step": 10815 }, { "epoch": 2.604718343765046, - "grad_norm": 0.89453125, - "learning_rate": 6.392935386774167e-05, - "loss": 0.3611, + "grad_norm": 1.8125, + "learning_rate": 2.5489677506602523e-05, + "loss": 0.484, "step": 10820 }, { "epoch": 2.605922002888782, - "grad_norm": 0.90625, - "learning_rate": 6.388603762361743e-05, - "loss": 0.3967, + "grad_norm": 1.71875, + "learning_rate": 2.5472406612612124e-05, + "loss": 0.5287, "step": 10825 }, { "epoch": 2.607125662012518, - "grad_norm": 0.9453125, - "learning_rate": 6.38428457782197e-05, - "loss": 0.3939, + "grad_norm": 1.7578125, + "learning_rate": 2.5455185318425715e-05, + "loss": 0.5258, "step": 10830 }, { "epoch": 2.6083293211362544, - "grad_norm": 0.97265625, - "learning_rate": 6.3799778400558e-05, - "loss": 0.4049, + "grad_norm": 1.765625, + "learning_rate": 2.543801365155853e-05, + "loss": 0.5339, "step": 10835 }, { "epoch": 2.6095329802599903, - "grad_norm": 0.9296875, - "learning_rate": 6.375683555944297e-05, - "loss": 0.3576, + "grad_norm": 1.7421875, + "learning_rate": 2.542089163944647e-05, + "loss": 0.4856, "step": 10840 }, { "epoch": 2.6107366393837266, - "grad_norm": 1.0078125, - "learning_rate": 6.371401732348621e-05, - "loss": 0.39, + "grad_norm": 1.765625, + "learning_rate": 2.5403819309446124e-05, + "loss": 0.5191, "step": 10845 }, { "epoch": 2.611940298507463, - "grad_norm": 1.0, - "learning_rate": 6.367132376110036e-05, - "loss": 0.4013, + "grad_norm": 1.796875, + "learning_rate": 2.5386796688834706e-05, + "loss": 0.5324, "step": 10850 }, { "epoch": 2.6131439576311988, - "grad_norm": 0.97265625, - "learning_rate": 6.362875494049874e-05, - "loss": 0.3821, + "grad_norm": 1.78125, + "learning_rate": 2.5369823804809997e-05, + "loss": 0.5048, "step": 10855 }, { "epoch": 2.614347616754935, - "grad_norm": 0.953125, - "learning_rate": 6.358631092969539e-05, - "loss": 0.385, + "grad_norm": 1.75, + "learning_rate": 2.5352900684490302e-05, + "loss": 0.5112, "step": 10860 }, { "epoch": 2.615551275878671, - "grad_norm": 0.90234375, - "learning_rate": 6.354399179650503e-05, - "loss": 0.3817, + "grad_norm": 1.7109375, + "learning_rate": 2.5336027354914455e-05, + "loss": 0.5117, "step": 10865 }, { "epoch": 2.6167549350024073, - "grad_norm": 0.984375, - "learning_rate": 6.35017976085427e-05, - "loss": 0.3622, + "grad_norm": 1.7109375, + "learning_rate": 2.5319203843041685e-05, + "loss": 0.4914, "step": 10870 }, { "epoch": 2.6179585941261436, - "grad_norm": 0.9375, - "learning_rate": 6.345972843322398e-05, - "loss": 0.3959, + "grad_norm": 1.734375, + "learning_rate": 2.530243017575167e-05, + "loss": 0.5218, "step": 10875 }, { "epoch": 2.6191622532498795, - "grad_norm": 0.99609375, - "learning_rate": 6.341778433776457e-05, - "loss": 0.3943, + "grad_norm": 1.890625, + "learning_rate": 2.528570637984442e-05, + "loss": 0.5251, "step": 10880 }, { "epoch": 2.620365912373616, - "grad_norm": 1.078125, - "learning_rate": 6.33759653891804e-05, - "loss": 0.3872, + "grad_norm": 1.8828125, + "learning_rate": 2.526903248204028e-05, + "loss": 0.515, "step": 10885 }, { "epoch": 2.6215695714973517, - "grad_norm": 0.98828125, - "learning_rate": 6.333427165428746e-05, - "loss": 0.3675, + "grad_norm": 1.8203125, + "learning_rate": 2.5252408508979866e-05, + "loss": 0.4893, "step": 10890 }, { "epoch": 2.622773230621088, - "grad_norm": 1.0, - "learning_rate": 6.329270319970161e-05, - "loss": 0.3798, + "grad_norm": 1.7734375, + "learning_rate": 2.5235834487224017e-05, + "loss": 0.5025, "step": 10895 }, { "epoch": 2.6239768897448243, - "grad_norm": 0.890625, - "learning_rate": 6.325126009183858e-05, - "loss": 0.3471, + "grad_norm": 1.625, + "learning_rate": 2.5219310443253773e-05, + "loss": 0.4731, "step": 10900 }, { "epoch": 2.6251805488685607, - "grad_norm": 0.97265625, - "learning_rate": 6.320994239691385e-05, - "loss": 0.3822, + "grad_norm": 1.7578125, + "learning_rate": 2.520283640347032e-05, + "loss": 0.5076, "step": 10905 }, { "epoch": 2.6263842079922965, - "grad_norm": 0.93359375, - "learning_rate": 6.316875018094249e-05, - "loss": 0.389, + "grad_norm": 1.7421875, + "learning_rate": 2.518641239419495e-05, + "loss": 0.5212, "step": 10910 }, { "epoch": 2.627587867116033, - "grad_norm": 0.94921875, - "learning_rate": 6.312768350973913e-05, - "loss": 0.3667, + "grad_norm": 1.765625, + "learning_rate": 2.517003844166903e-05, + "loss": 0.4935, "step": 10915 }, { "epoch": 2.6287915262397687, - "grad_norm": 1.0234375, - "learning_rate": 6.308674244891776e-05, - "loss": 0.3713, + "grad_norm": 1.859375, + "learning_rate": 2.5153714572053924e-05, + "loss": 0.502, "step": 10920 }, { "epoch": 2.629995185363505, - "grad_norm": 0.98046875, - "learning_rate": 6.304592706389172e-05, - "loss": 0.3742, + "grad_norm": 1.734375, + "learning_rate": 2.5137440811431e-05, + "loss": 0.501, "step": 10925 }, { "epoch": 2.6311988444872414, - "grad_norm": 0.984375, - "learning_rate": 6.30052374198735e-05, - "loss": 0.3728, + "grad_norm": 1.7421875, + "learning_rate": 2.5121217185801555e-05, + "loss": 0.4993, "step": 10930 }, { "epoch": 2.6324025036109773, - "grad_norm": 0.94140625, - "learning_rate": 6.296467358187474e-05, - "loss": 0.3847, + "grad_norm": 1.8515625, + "learning_rate": 2.5105043721086773e-05, + "loss": 0.5084, "step": 10935 }, { "epoch": 2.6336061627347136, - "grad_norm": 0.90625, - "learning_rate": 6.292423561470606e-05, - "loss": 0.4019, + "grad_norm": 1.734375, + "learning_rate": 2.508892044312772e-05, + "loss": 0.531, "step": 10940 }, { "epoch": 2.6348098218584495, - "grad_norm": 0.91015625, - "learning_rate": 6.288392358297697e-05, - "loss": 0.38, + "grad_norm": 1.7265625, + "learning_rate": 2.507284737768526e-05, + "loss": 0.508, "step": 10945 }, { "epoch": 2.636013480982186, - "grad_norm": 0.9921875, - "learning_rate": 6.284373755109574e-05, - "loss": 0.3992, + "grad_norm": 1.71875, + "learning_rate": 2.5056824550440023e-05, + "loss": 0.5351, "step": 10950 }, { "epoch": 2.637217140105922, - "grad_norm": 0.9609375, - "learning_rate": 6.280367758326935e-05, - "loss": 0.3763, + "grad_norm": 1.8359375, + "learning_rate": 2.504085198699237e-05, + "loss": 0.5032, "step": 10955 }, { "epoch": 2.638420799229658, - "grad_norm": 1.0, - "learning_rate": 6.276374374350337e-05, - "loss": 0.371, + "grad_norm": 1.765625, + "learning_rate": 2.5024929712862387e-05, + "loss": 0.4991, "step": 10960 }, { "epoch": 2.6396244583533943, - "grad_norm": 1.03125, - "learning_rate": 6.272393609560185e-05, - "loss": 0.3975, + "grad_norm": 1.796875, + "learning_rate": 2.5009057753489776e-05, + "loss": 0.5196, "step": 10965 }, { "epoch": 2.64082811747713, - "grad_norm": 1.03125, - "learning_rate": 6.268425470316717e-05, - "loss": 0.4208, + "grad_norm": 1.8359375, + "learning_rate": 2.4993236134233846e-05, + "loss": 0.5479, "step": 10970 }, { "epoch": 2.6420317766008665, - "grad_norm": 0.9296875, - "learning_rate": 6.264469962960005e-05, - "loss": 0.3575, + "grad_norm": 1.7109375, + "learning_rate": 2.497746488037351e-05, + "loss": 0.4799, "step": 10975 }, { "epoch": 2.643235435724603, - "grad_norm": 0.9296875, - "learning_rate": 6.260527093809936e-05, - "loss": 0.38, + "grad_norm": 1.7265625, + "learning_rate": 2.496174401710717e-05, + "loss": 0.5066, "step": 10980 }, { "epoch": 2.644439094848339, - "grad_norm": 0.8984375, - "learning_rate": 6.256596869166204e-05, - "loss": 0.361, + "grad_norm": 1.7734375, + "learning_rate": 2.4946073569552747e-05, + "loss": 0.4878, "step": 10985 }, { "epoch": 2.645642753972075, - "grad_norm": 0.9921875, - "learning_rate": 6.2526792953083e-05, - "loss": 0.4116, + "grad_norm": 1.78125, + "learning_rate": 2.4930453562747602e-05, + "loss": 0.5471, "step": 10990 }, { "epoch": 2.6468464130958114, - "grad_norm": 0.9765625, - "learning_rate": 6.248774378495501e-05, - "loss": 0.3929, + "grad_norm": 1.8984375, + "learning_rate": 2.4914884021648496e-05, + "loss": 0.5275, "step": 10995 }, { "epoch": 2.6480500722195472, - "grad_norm": 1.0078125, - "learning_rate": 6.244882124966866e-05, - "loss": 0.3893, + "grad_norm": 1.859375, + "learning_rate": 2.489936497113156e-05, + "loss": 0.5126, "step": 11000 }, { "epoch": 2.6480500722195472, - "eval_loss": 0.34993645548820496, - "eval_runtime": 2.3328, - "eval_samples_per_second": 85.732, - "eval_steps_per_second": 85.732, + "eval_loss": 0.439637154340744, + "eval_runtime": 2.3871, + "eval_samples_per_second": 83.785, + "eval_steps_per_second": 83.785, "step": 11000 }, { "epoch": 2.6492537313432836, - "grad_norm": 1.078125, - "learning_rate": 6.241002540941217e-05, - "loss": 0.3723, + "grad_norm": 1.90625, + "learning_rate": 2.4883896435992266e-05, + "loss": 0.4977, "step": 11005 }, { "epoch": 2.65045739046702, - "grad_norm": 0.9140625, - "learning_rate": 6.237135632617133e-05, - "loss": 0.3593, + "grad_norm": 1.703125, + "learning_rate": 2.4868478440945356e-05, + "loss": 0.4818, "step": 11010 }, { "epoch": 2.6516610495907558, - "grad_norm": 0.9921875, - "learning_rate": 6.233281406172947e-05, - "loss": 0.3846, + "grad_norm": 1.796875, + "learning_rate": 2.485311101062485e-05, + "loss": 0.5104, "step": 11015 }, { "epoch": 2.652864708714492, - "grad_norm": 0.8984375, - "learning_rate": 6.22943986776672e-05, - "loss": 0.394, + "grad_norm": 1.6328125, + "learning_rate": 2.483779416958395e-05, + "loss": 0.5236, "step": 11020 }, { "epoch": 2.654068367838228, - "grad_norm": 1.015625, - "learning_rate": 6.225611023536247e-05, - "loss": 0.3977, + "grad_norm": 1.8828125, + "learning_rate": 2.482252794229505e-05, + "loss": 0.5293, "step": 11025 }, { "epoch": 2.6552720269619643, - "grad_norm": 0.98046875, - "learning_rate": 6.22179487959904e-05, - "loss": 0.3585, + "grad_norm": 1.7109375, + "learning_rate": 2.4807312353149657e-05, + "loss": 0.4944, "step": 11030 }, { "epoch": 2.6564756860857006, - "grad_norm": 0.88671875, - "learning_rate": 6.217991442052319e-05, - "loss": 0.3813, + "grad_norm": 1.65625, + "learning_rate": 2.4792147426458393e-05, + "loss": 0.5084, "step": 11035 }, { "epoch": 2.657679345209437, - "grad_norm": 1.046875, - "learning_rate": 6.214200716973001e-05, - "loss": 0.3798, + "grad_norm": 1.859375, + "learning_rate": 2.477703318645092e-05, + "loss": 0.5051, "step": 11040 }, { "epoch": 2.658883004333173, - "grad_norm": 0.88671875, - "learning_rate": 6.210422710417694e-05, - "loss": 0.3581, + "grad_norm": 1.6796875, + "learning_rate": 2.4761969657275906e-05, + "loss": 0.485, "step": 11045 }, { "epoch": 2.660086663456909, - "grad_norm": 0.9296875, - "learning_rate": 6.206657428422685e-05, - "loss": 0.3769, + "grad_norm": 1.75, + "learning_rate": 2.4746956863001035e-05, + "loss": 0.5069, "step": 11050 }, { "epoch": 2.661290322580645, - "grad_norm": 0.953125, - "learning_rate": 6.202904877003929e-05, - "loss": 0.3623, + "grad_norm": 1.7265625, + "learning_rate": 2.473199482761289e-05, + "loss": 0.4838, "step": 11055 }, { "epoch": 2.6624939817043813, - "grad_norm": 0.96875, - "learning_rate": 6.199165062157037e-05, - "loss": 0.4072, + "grad_norm": 1.8046875, + "learning_rate": 2.4717083575016957e-05, + "loss": 0.5423, "step": 11060 }, { "epoch": 2.6636976408281177, - "grad_norm": 0.88671875, - "learning_rate": 6.195437989857279e-05, - "loss": 0.3601, + "grad_norm": 1.734375, + "learning_rate": 2.4702223129037607e-05, + "loss": 0.4854, "step": 11065 }, { "epoch": 2.6649012999518535, - "grad_norm": 1.03125, - "learning_rate": 6.19172366605956e-05, - "loss": 0.4149, + "grad_norm": 1.828125, + "learning_rate": 2.4687413513418026e-05, + "loss": 0.5406, "step": 11070 }, { "epoch": 2.66610495907559, - "grad_norm": 0.91796875, - "learning_rate": 6.188022096698417e-05, - "loss": 0.3723, + "grad_norm": 1.6484375, + "learning_rate": 2.467265475182018e-05, + "loss": 0.5017, "step": 11075 }, { "epoch": 2.6673086181993257, - "grad_norm": 0.9296875, - "learning_rate": 6.184333287688008e-05, - "loss": 0.3898, + "grad_norm": 1.6796875, + "learning_rate": 2.4657946867824776e-05, + "loss": 0.5171, "step": 11080 }, { "epoch": 2.668512277323062, - "grad_norm": 1.0, - "learning_rate": 6.180657244922108e-05, - "loss": 0.4081, + "grad_norm": 1.8203125, + "learning_rate": 2.4643289884931263e-05, + "loss": 0.537, "step": 11085 }, { "epoch": 2.6697159364467984, - "grad_norm": 1.046875, - "learning_rate": 6.176993974274084e-05, - "loss": 0.429, + "grad_norm": 1.921875, + "learning_rate": 2.462868382655772e-05, + "loss": 0.5588, "step": 11090 }, { "epoch": 2.6709195955705343, - "grad_norm": 0.94921875, - "learning_rate": 6.17334348159691e-05, - "loss": 0.3665, + "grad_norm": 1.7734375, + "learning_rate": 2.4614128716040896e-05, + "loss": 0.4893, "step": 11095 }, { "epoch": 2.6721232546942706, - "grad_norm": 0.91796875, - "learning_rate": 6.169705772723136e-05, - "loss": 0.3565, + "grad_norm": 1.71875, + "learning_rate": 2.4599624576636134e-05, + "loss": 0.4836, "step": 11100 }, { "epoch": 2.6733269138180065, - "grad_norm": 0.87890625, - "learning_rate": 6.166080853464888e-05, - "loss": 0.3682, + "grad_norm": 1.65625, + "learning_rate": 2.4585171431517324e-05, + "loss": 0.4908, "step": 11105 }, { "epoch": 2.674530572941743, - "grad_norm": 1.0078125, - "learning_rate": 6.162468729613855e-05, - "loss": 0.3733, + "grad_norm": 1.859375, + "learning_rate": 2.4570769303776878e-05, + "loss": 0.5044, "step": 11110 }, { "epoch": 2.675734232065479, - "grad_norm": 1.0234375, - "learning_rate": 6.158869406941286e-05, - "loss": 0.394, + "grad_norm": 1.796875, + "learning_rate": 2.455641821642571e-05, + "loss": 0.5238, "step": 11115 }, { "epoch": 2.6769378911892154, - "grad_norm": 0.9140625, - "learning_rate": 6.155282891197976e-05, - "loss": 0.3906, + "grad_norm": 1.5703125, + "learning_rate": 2.4542118192393176e-05, + "loss": 0.5175, "step": 11120 }, { "epoch": 2.6781415503129513, - "grad_norm": 0.8984375, - "learning_rate": 6.151709188114261e-05, - "loss": 0.3708, + "grad_norm": 1.703125, + "learning_rate": 2.452786925452706e-05, + "loss": 0.4973, "step": 11125 }, { "epoch": 2.6793452094366876, - "grad_norm": 0.9296875, - "learning_rate": 6.1481483034e-05, - "loss": 0.3752, + "grad_norm": 1.6796875, + "learning_rate": 2.45136714255935e-05, + "loss": 0.5058, "step": 11130 }, { "epoch": 2.6805488685604235, - "grad_norm": 0.91796875, - "learning_rate": 6.144600242744574e-05, - "loss": 0.3463, + "grad_norm": 1.734375, + "learning_rate": 2.4499524728276994e-05, + "loss": 0.474, "step": 11135 }, { "epoch": 2.68175252768416, - "grad_norm": 0.8828125, - "learning_rate": 6.141065011816873e-05, - "loss": 0.3603, + "grad_norm": 1.6171875, + "learning_rate": 2.448542918518033e-05, + "loss": 0.4893, "step": 11140 }, { "epoch": 2.682956186807896, - "grad_norm": 0.97265625, - "learning_rate": 6.137542616265291e-05, - "loss": 0.3832, + "grad_norm": 1.7109375, + "learning_rate": 2.4471384818824575e-05, + "loss": 0.5101, "step": 11145 }, { "epoch": 2.684159845931632, - "grad_norm": 1.71875, - "learning_rate": 6.134033061717713e-05, - "loss": 0.3703, + "grad_norm": 1.84375, + "learning_rate": 2.4457391651649032e-05, + "loss": 0.4876, "step": 11150 }, { "epoch": 2.6853635050553684, - "grad_norm": 0.859375, - "learning_rate": 6.130536353781511e-05, - "loss": 0.3693, + "grad_norm": 1.6015625, + "learning_rate": 2.44434497060112e-05, + "loss": 0.4892, "step": 11155 }, { "epoch": 2.6865671641791042, - "grad_norm": 0.9375, - "learning_rate": 6.127052498043521e-05, - "loss": 0.3713, + "grad_norm": 1.7109375, + "learning_rate": 2.442955900418671e-05, + "loss": 0.4961, "step": 11160 }, { "epoch": 2.6877708233028406, - "grad_norm": 0.9375, - "learning_rate": 6.123581500070057e-05, - "loss": 0.3796, + "grad_norm": 1.84375, + "learning_rate": 2.441571956836937e-05, + "loss": 0.5106, "step": 11165 }, { "epoch": 2.688974482426577, - "grad_norm": 0.8671875, - "learning_rate": 6.12012336540688e-05, - "loss": 0.3947, + "grad_norm": 1.609375, + "learning_rate": 2.4401931420671035e-05, + "loss": 0.5176, "step": 11170 }, { "epoch": 2.690178141550313, - "grad_norm": 0.9296875, - "learning_rate": 6.116678099579206e-05, - "loss": 0.3832, + "grad_norm": 1.671875, + "learning_rate": 2.4388194583121633e-05, + "loss": 0.514, "step": 11175 }, { "epoch": 2.691381800674049, - "grad_norm": 1.0234375, - "learning_rate": 6.113245708091684e-05, - "loss": 0.3597, + "grad_norm": 1.8203125, + "learning_rate": 2.4374509077669106e-05, + "loss": 0.4828, "step": 11180 }, { "epoch": 2.6925854597977854, - "grad_norm": 0.87109375, - "learning_rate": 6.109826196428398e-05, - "loss": 0.3659, + "grad_norm": 1.671875, + "learning_rate": 2.4360874926179392e-05, + "loss": 0.4942, "step": 11185 }, { "epoch": 2.6937891189215213, - "grad_norm": 0.88671875, - "learning_rate": 6.106419570052849e-05, - "loss": 0.3707, + "grad_norm": 1.6328125, + "learning_rate": 2.4347292150436363e-05, + "loss": 0.4942, "step": 11190 }, { "epoch": 2.6949927780452576, - "grad_norm": 0.984375, - "learning_rate": 6.103025834407954e-05, - "loss": 0.4054, + "grad_norm": 1.734375, + "learning_rate": 2.4333760772141816e-05, + "loss": 0.5316, "step": 11195 }, { "epoch": 2.696196437168994, - "grad_norm": 0.97265625, - "learning_rate": 6.099644994916033e-05, - "loss": 0.3704, + "grad_norm": 1.8671875, + "learning_rate": 2.432028081291543e-05, + "loss": 0.4984, "step": 11200 }, { "epoch": 2.69740009629273, - "grad_norm": 0.90625, - "learning_rate": 6.096277056978799e-05, - "loss": 0.3605, + "grad_norm": 1.71875, + "learning_rate": 2.4306852294294707e-05, + "loss": 0.4856, "step": 11205 }, { "epoch": 2.698603755416466, - "grad_norm": 0.96875, - "learning_rate": 6.092922025977354e-05, - "loss": 0.3834, + "grad_norm": 1.7890625, + "learning_rate": 2.4293475237734994e-05, + "loss": 0.5097, "step": 11210 }, { "epoch": 2.699807414540202, - "grad_norm": 0.9453125, - "learning_rate": 6.0895799072721795e-05, - "loss": 0.3237, + "grad_norm": 1.796875, + "learning_rate": 2.4280149664609395e-05, + "loss": 0.4497, "step": 11215 }, { "epoch": 2.7010110736639383, - "grad_norm": 0.96484375, - "learning_rate": 6.0862507062031206e-05, - "loss": 0.3636, + "grad_norm": 1.78125, + "learning_rate": 2.4266875596208753e-05, + "loss": 0.4879, "step": 11220 }, { "epoch": 2.7022147327876747, - "grad_norm": 0.94921875, - "learning_rate": 6.082934428089391e-05, - "loss": 0.3412, + "grad_norm": 1.84375, + "learning_rate": 2.425365305374163e-05, + "loss": 0.4669, "step": 11225 }, { "epoch": 2.7034183919114105, - "grad_norm": 0.9375, - "learning_rate": 6.0796310782295507e-05, - "loss": 0.3668, + "grad_norm": 1.8515625, + "learning_rate": 2.424048205833426e-05, + "loss": 0.501, "step": 11230 }, { "epoch": 2.704622051035147, - "grad_norm": 0.93359375, - "learning_rate": 6.076340661901507e-05, - "loss": 0.3707, + "grad_norm": 1.7265625, + "learning_rate": 2.4227362631030524e-05, + "loss": 0.4956, "step": 11235 }, { "epoch": 2.7058257101588827, - "grad_norm": 0.95703125, - "learning_rate": 6.073063184362501e-05, - "loss": 0.393, + "grad_norm": 1.8125, + "learning_rate": 2.4214294792791888e-05, + "loss": 0.5205, "step": 11240 }, { "epoch": 2.707029369282619, - "grad_norm": 0.94921875, - "learning_rate": 6.0697986508491e-05, - "loss": 0.3714, + "grad_norm": 1.7578125, + "learning_rate": 2.420127856449741e-05, + "loss": 0.4974, "step": 11245 }, { "epoch": 2.7082330284063554, - "grad_norm": 0.984375, - "learning_rate": 6.066547066577197e-05, - "loss": 0.402, + "grad_norm": 1.7578125, + "learning_rate": 2.418831396694371e-05, + "loss": 0.5347, "step": 11250 }, { "epoch": 2.7094366875300917, - "grad_norm": 0.85546875, - "learning_rate": 6.063308436741984e-05, - "loss": 0.3671, + "grad_norm": 1.6640625, + "learning_rate": 2.4175401020844863e-05, + "loss": 0.4911, "step": 11255 }, { "epoch": 2.7106403466538276, - "grad_norm": 0.87890625, - "learning_rate": 6.060082766517967e-05, - "loss": 0.3621, + "grad_norm": 1.7265625, + "learning_rate": 2.4162539746832464e-05, + "loss": 0.4834, "step": 11260 }, { "epoch": 2.711844005777564, - "grad_norm": 0.921875, - "learning_rate": 6.0568700610589346e-05, - "loss": 0.3637, + "grad_norm": 1.625, + "learning_rate": 2.4149730165455522e-05, + "loss": 0.4858, "step": 11265 }, { "epoch": 2.7130476649013, - "grad_norm": 1.015625, - "learning_rate": 6.0536703254979707e-05, - "loss": 0.3633, + "grad_norm": 1.8125, + "learning_rate": 2.413697229718047e-05, + "loss": 0.4933, "step": 11270 }, { "epoch": 2.714251324025036, - "grad_norm": 0.9296875, - "learning_rate": 6.0504835649474296e-05, - "loss": 0.3947, + "grad_norm": 1.71875, + "learning_rate": 2.4124266162391106e-05, + "loss": 0.5216, "step": 11275 }, { "epoch": 2.7154549831487724, - "grad_norm": 0.99609375, - "learning_rate": 6.0473097844989376e-05, - "loss": 0.3646, + "grad_norm": 1.78125, + "learning_rate": 2.411161178138857e-05, + "loss": 0.4953, "step": 11280 }, { "epoch": 2.7166586422725083, - "grad_norm": 0.9140625, - "learning_rate": 6.0441489892233855e-05, - "loss": 0.3679, + "grad_norm": 1.7109375, + "learning_rate": 2.409900917439134e-05, + "loss": 0.4908, "step": 11285 }, { "epoch": 2.7178623013962446, - "grad_norm": 0.9140625, - "learning_rate": 6.041001184170911e-05, - "loss": 0.3607, + "grad_norm": 1.625, + "learning_rate": 2.408645836153513e-05, + "loss": 0.4851, "step": 11290 }, { "epoch": 2.7190659605199805, - "grad_norm": 0.99609375, - "learning_rate": 6.0378663743709026e-05, - "loss": 0.392, + "grad_norm": 1.8125, + "learning_rate": 2.407395936287294e-05, + "loss": 0.5157, "step": 11295 }, { "epoch": 2.720269619643717, - "grad_norm": 0.94921875, - "learning_rate": 6.034744564831977e-05, - "loss": 0.3655, + "grad_norm": 1.7421875, + "learning_rate": 2.4061512198374943e-05, + "loss": 0.4898, "step": 11300 }, { "epoch": 2.721473278767453, - "grad_norm": 1.0, - "learning_rate": 6.031635760541992e-05, - "loss": 0.3944, + "grad_norm": 1.78125, + "learning_rate": 2.4049116887928542e-05, + "loss": 0.5257, "step": 11305 }, { "epoch": 2.7226769378911895, - "grad_norm": 1.0078125, - "learning_rate": 6.028539966468016e-05, - "loss": 0.3805, + "grad_norm": 1.796875, + "learning_rate": 2.4036773451338246e-05, + "loss": 0.5092, "step": 11310 }, { "epoch": 2.7238805970149254, - "grad_norm": 0.96484375, - "learning_rate": 6.0254571875563366e-05, - "loss": 0.3543, + "grad_norm": 1.8203125, + "learning_rate": 2.4024481908325716e-05, + "loss": 0.4822, "step": 11315 }, { "epoch": 2.7250842561386617, - "grad_norm": 1.0625, - "learning_rate": 6.0223874287324425e-05, - "loss": 0.3775, + "grad_norm": 2.015625, + "learning_rate": 2.4012242278529676e-05, + "loss": 0.5104, "step": 11320 }, { "epoch": 2.7262879152623976, - "grad_norm": 0.98828125, - "learning_rate": 6.019330694901022e-05, - "loss": 0.3701, + "grad_norm": 1.90625, + "learning_rate": 2.4000054581505925e-05, + "loss": 0.4969, "step": 11325 }, { "epoch": 2.727491574386134, - "grad_norm": 0.97265625, - "learning_rate": 6.0162869909459514e-05, - "loss": 0.3582, + "grad_norm": 1.7890625, + "learning_rate": 2.398791883672727e-05, + "loss": 0.4788, "step": 11330 }, { "epoch": 2.72869523350987, - "grad_norm": 1.03125, - "learning_rate": 6.0132563217302914e-05, - "loss": 0.3865, + "grad_norm": 1.828125, + "learning_rate": 2.3975835063583532e-05, + "loss": 0.5095, "step": 11335 }, { "epoch": 2.729898892633606, - "grad_norm": 0.9609375, - "learning_rate": 6.010238692096272e-05, - "loss": 0.3671, + "grad_norm": 1.65625, + "learning_rate": 2.3963803281381466e-05, + "loss": 0.4962, "step": 11340 }, { "epoch": 2.7311025517573424, - "grad_norm": 0.9296875, - "learning_rate": 6.007234106865294e-05, - "loss": 0.3605, + "grad_norm": 1.7578125, + "learning_rate": 2.3951823509344797e-05, + "loss": 0.4857, "step": 11345 }, { "epoch": 2.7323062108810783, - "grad_norm": 0.90625, - "learning_rate": 6.0042425708379124e-05, - "loss": 0.3512, + "grad_norm": 1.7109375, + "learning_rate": 2.393989576661411e-05, + "loss": 0.4847, "step": 11350 }, { "epoch": 2.7335098700048146, - "grad_norm": 0.91015625, - "learning_rate": 6.001264088793834e-05, - "loss": 0.3551, + "grad_norm": 1.71875, + "learning_rate": 2.3928020072246886e-05, + "loss": 0.4819, "step": 11355 }, { "epoch": 2.734713529128551, - "grad_norm": 0.90234375, - "learning_rate": 5.998298665491915e-05, - "loss": 0.377, + "grad_norm": 1.6328125, + "learning_rate": 2.391619644521745e-05, + "loss": 0.5054, "step": 11360 }, { "epoch": 2.735917188252287, - "grad_norm": 0.95703125, - "learning_rate": 5.995346305670136e-05, - "loss": 0.3961, + "grad_norm": 1.765625, + "learning_rate": 2.3904424904416915e-05, + "loss": 0.5271, "step": 11365 }, { "epoch": 2.737120847376023, - "grad_norm": 0.95703125, - "learning_rate": 5.9924070140456144e-05, - "loss": 0.3803, + "grad_norm": 1.8125, + "learning_rate": 2.3892705468653195e-05, + "loss": 0.5125, "step": 11370 }, { "epoch": 2.738324506499759, - "grad_norm": 1.0234375, - "learning_rate": 5.989480795314583e-05, - "loss": 0.4038, + "grad_norm": 1.84375, + "learning_rate": 2.388103815665094e-05, + "loss": 0.5383, "step": 11375 }, { "epoch": 2.7395281656234953, - "grad_norm": 0.86328125, - "learning_rate": 5.98656765415239e-05, - "loss": 0.3586, + "grad_norm": 1.734375, + "learning_rate": 2.3869422987051518e-05, + "loss": 0.4873, "step": 11380 }, { "epoch": 2.7407318247472316, - "grad_norm": 0.9296875, - "learning_rate": 5.983667595213488e-05, - "loss": 0.3561, + "grad_norm": 1.6953125, + "learning_rate": 2.3857859978413e-05, + "loss": 0.4812, "step": 11385 }, { "epoch": 2.741935483870968, - "grad_norm": 0.9765625, - "learning_rate": 5.980780623131426e-05, - "loss": 0.3857, + "grad_norm": 1.8671875, + "learning_rate": 2.38463491492101e-05, + "loss": 0.5206, "step": 11390 }, { "epoch": 2.743139142994704, - "grad_norm": 1.0078125, - "learning_rate": 5.977906742518849e-05, - "loss": 0.3759, + "grad_norm": 1.7890625, + "learning_rate": 2.3834890517834192e-05, + "loss": 0.503, "step": 11395 }, { "epoch": 2.74434280211844, - "grad_norm": 0.90625, - "learning_rate": 5.975045957967477e-05, - "loss": 0.3744, + "grad_norm": 1.703125, + "learning_rate": 2.3823484102593203e-05, + "loss": 0.5046, "step": 11400 }, { "epoch": 2.745546461242176, - "grad_norm": 1.0390625, - "learning_rate": 5.972198274048113e-05, - "loss": 0.3895, + "grad_norm": 1.6796875, + "learning_rate": 2.381212992171168e-05, + "loss": 0.52, "step": 11405 }, { "epoch": 2.7467501203659124, - "grad_norm": 1.0546875, - "learning_rate": 5.969363695310628e-05, - "loss": 0.4043, + "grad_norm": 1.7578125, + "learning_rate": 2.3800827993330697e-05, + "loss": 0.5372, "step": 11410 }, { "epoch": 2.7479537794896487, - "grad_norm": 0.9453125, - "learning_rate": 5.9665422262839467e-05, - "loss": 0.3681, + "grad_norm": 1.6796875, + "learning_rate": 2.3789578335507822e-05, + "loss": 0.4974, "step": 11415 }, { "epoch": 2.7491574386133846, - "grad_norm": 0.921875, - "learning_rate": 5.96373387147606e-05, - "loss": 0.3858, + "grad_norm": 1.7265625, + "learning_rate": 2.3778380966217145e-05, + "loss": 0.5118, "step": 11420 }, { "epoch": 2.750361097737121, - "grad_norm": 0.9765625, - "learning_rate": 5.960938635373993e-05, - "loss": 0.362, + "grad_norm": 1.828125, + "learning_rate": 2.376723590334917e-05, + "loss": 0.4895, "step": 11425 }, { "epoch": 2.7515647568608568, - "grad_norm": 0.984375, - "learning_rate": 5.958156522443819e-05, - "loss": 0.3848, + "grad_norm": 1.8203125, + "learning_rate": 2.375614316471086e-05, + "loss": 0.5111, "step": 11430 }, { "epoch": 2.752768415984593, - "grad_norm": 1.046875, - "learning_rate": 5.955387537130642e-05, - "loss": 0.388, + "grad_norm": 1.8515625, + "learning_rate": 2.3745102768025573e-05, + "loss": 0.515, "step": 11435 }, { "epoch": 2.7539720751083294, - "grad_norm": 0.95703125, - "learning_rate": 5.952631683858589e-05, - "loss": 0.3666, + "grad_norm": 1.671875, + "learning_rate": 2.3734114730933023e-05, + "loss": 0.4891, "step": 11440 }, { "epoch": 2.7551757342320657, - "grad_norm": 0.9921875, - "learning_rate": 5.9498889670308085e-05, - "loss": 0.3751, + "grad_norm": 1.8203125, + "learning_rate": 2.3723179070989285e-05, + "loss": 0.5068, "step": 11445 }, { "epoch": 2.7563793933558016, - "grad_norm": 0.94921875, - "learning_rate": 5.947159391029458e-05, - "loss": 0.3824, + "grad_norm": 1.734375, + "learning_rate": 2.3712295805666734e-05, + "loss": 0.5122, "step": 11450 }, { "epoch": 2.757583052479538, - "grad_norm": 0.9296875, - "learning_rate": 5.944442960215698e-05, - "loss": 0.3736, + "grad_norm": 1.7421875, + "learning_rate": 2.370146495235403e-05, + "loss": 0.5051, "step": 11455 }, { "epoch": 2.758786711603274, - "grad_norm": 0.921875, - "learning_rate": 5.9417396789296946e-05, - "loss": 0.3514, + "grad_norm": 1.65625, + "learning_rate": 2.3690686528356125e-05, + "loss": 0.4773, "step": 11460 }, { "epoch": 2.75999037072701, - "grad_norm": 1.0078125, - "learning_rate": 5.939049551490592e-05, - "loss": 0.3593, + "grad_norm": 1.7578125, + "learning_rate": 2.3679960550894153e-05, + "loss": 0.4785, "step": 11465 }, { "epoch": 2.7611940298507465, - "grad_norm": 0.95703125, - "learning_rate": 5.936372582196529e-05, - "loss": 0.3791, + "grad_norm": 1.8125, + "learning_rate": 2.3669287037105493e-05, + "loss": 0.5043, "step": 11470 }, { "epoch": 2.7623976889744823, - "grad_norm": 0.96875, - "learning_rate": 5.933708775324613e-05, - "loss": 0.3788, + "grad_norm": 1.7265625, + "learning_rate": 2.3658666004043672e-05, + "loss": 0.5046, "step": 11475 }, { "epoch": 2.7636013480982187, - "grad_norm": 0.859375, - "learning_rate": 5.9310581351309275e-05, - "loss": 0.3677, + "grad_norm": 1.609375, + "learning_rate": 2.364809746867839e-05, + "loss": 0.4978, "step": 11480 }, { "epoch": 2.7648050072219545, - "grad_norm": 0.9765625, - "learning_rate": 5.928420665850513e-05, - "loss": 0.3761, + "grad_norm": 1.7890625, + "learning_rate": 2.3637581447895447e-05, + "loss": 0.4991, "step": 11485 }, { "epoch": 2.766008666345691, - "grad_norm": 0.99609375, - "learning_rate": 5.9257963716973694e-05, - "loss": 0.4057, + "grad_norm": 1.796875, + "learning_rate": 2.3627117958496746e-05, + "loss": 0.5357, "step": 11490 }, { "epoch": 2.767212325469427, - "grad_norm": 0.8984375, - "learning_rate": 5.923185256864449e-05, - "loss": 0.3589, + "grad_norm": 1.671875, + "learning_rate": 2.3616707017200274e-05, + "loss": 0.4869, "step": 11495 }, { "epoch": 2.768415984593163, - "grad_norm": 0.88671875, - "learning_rate": 5.920587325523642e-05, - "loss": 0.3619, + "grad_norm": 1.6796875, + "learning_rate": 2.3606348640640037e-05, + "loss": 0.4888, "step": 11500 }, { "epoch": 2.768415984593163, - "eval_loss": 0.34762200713157654, - "eval_runtime": 2.3256, - "eval_samples_per_second": 86.0, - "eval_steps_per_second": 86.0, + "eval_loss": 0.4396386742591858, + "eval_runtime": 2.4019, + "eval_samples_per_second": 83.268, + "eval_steps_per_second": 83.268, "step": 11500 }, { "epoch": 2.7696196437168994, - "grad_norm": 1.0234375, - "learning_rate": 5.9180025818257755e-05, - "loss": 0.349, + "grad_norm": 1.8125, + "learning_rate": 2.359604284536606e-05, + "loss": 0.4737, "step": 11505 }, { "epoch": 2.7708233028406353, - "grad_norm": 0.953125, - "learning_rate": 5.915431029900609e-05, - "loss": 0.3737, + "grad_norm": 1.875, + "learning_rate": 2.3585789647844378e-05, + "loss": 0.5035, "step": 11510 }, { "epoch": 2.7720269619643716, - "grad_norm": 1.0234375, - "learning_rate": 5.912872673856823e-05, - "loss": 0.3582, + "grad_norm": 1.859375, + "learning_rate": 2.3575589064456956e-05, + "loss": 0.4848, "step": 11515 }, { "epoch": 2.773230621088108, - "grad_norm": 0.96875, - "learning_rate": 5.910327517782015e-05, - "loss": 0.3706, + "grad_norm": 1.765625, + "learning_rate": 2.3565441111501725e-05, + "loss": 0.4989, "step": 11520 }, { "epoch": 2.7744342802118442, - "grad_norm": 0.90234375, - "learning_rate": 5.907795565742691e-05, - "loss": 0.3667, + "grad_norm": 1.625, + "learning_rate": 2.3555345805192502e-05, + "loss": 0.4948, "step": 11525 }, { "epoch": 2.77563793933558, - "grad_norm": 0.8671875, - "learning_rate": 5.9052768217842614e-05, - "loss": 0.3641, + "grad_norm": 1.609375, + "learning_rate": 2.3545303161659004e-05, + "loss": 0.4916, "step": 11530 }, { "epoch": 2.7768415984593164, - "grad_norm": 0.9375, - "learning_rate": 5.9027712899310354e-05, - "loss": 0.3808, + "grad_norm": 1.8515625, + "learning_rate": 2.3535313196946802e-05, + "loss": 0.5123, "step": 11535 }, { "epoch": 2.7780452575830523, - "grad_norm": 0.89453125, - "learning_rate": 5.900278974186208e-05, - "loss": 0.381, + "grad_norm": 1.6953125, + "learning_rate": 2.352537592701729e-05, + "loss": 0.5097, "step": 11540 }, { "epoch": 2.7792489167067886, - "grad_norm": 1.03125, - "learning_rate": 5.897799878531861e-05, - "loss": 0.3694, + "grad_norm": 1.8359375, + "learning_rate": 2.351549136774769e-05, + "loss": 0.4975, "step": 11545 }, { "epoch": 2.780452575830525, - "grad_norm": 0.91015625, - "learning_rate": 5.8953340069289544e-05, - "loss": 0.3891, + "grad_norm": 1.6875, + "learning_rate": 2.350565953493098e-05, + "loss": 0.5181, "step": 11550 }, { "epoch": 2.781656234954261, - "grad_norm": 1.1171875, - "learning_rate": 5.8928813633173194e-05, - "loss": 0.3756, + "grad_norm": 1.8515625, + "learning_rate": 2.349588044427592e-05, + "loss": 0.512, "step": 11555 }, { "epoch": 2.782859894077997, - "grad_norm": 0.9921875, - "learning_rate": 5.890441951615651e-05, - "loss": 0.3873, + "grad_norm": 1.703125, + "learning_rate": 2.3486154111406987e-05, + "loss": 0.5108, "step": 11560 }, { "epoch": 2.784063553201733, - "grad_norm": 0.86328125, - "learning_rate": 5.888015775721504e-05, - "loss": 0.3726, + "grad_norm": 1.5859375, + "learning_rate": 2.3476480551864364e-05, + "loss": 0.5023, "step": 11565 }, { "epoch": 2.7852672123254694, - "grad_norm": 0.921875, - "learning_rate": 5.8856028395112874e-05, - "loss": 0.3665, + "grad_norm": 1.6875, + "learning_rate": 2.346685978110393e-05, + "loss": 0.4878, "step": 11570 }, { "epoch": 2.7864708714492057, - "grad_norm": 1.0234375, - "learning_rate": 5.8832031468402505e-05, - "loss": 0.3815, + "grad_norm": 1.8125, + "learning_rate": 2.345729181449719e-05, + "loss": 0.5122, "step": 11575 }, { "epoch": 2.787674530572942, - "grad_norm": 0.99609375, - "learning_rate": 5.880816701542492e-05, - "loss": 0.3586, + "grad_norm": 1.8046875, + "learning_rate": 2.3447776667331323e-05, + "loss": 0.4843, "step": 11580 }, { "epoch": 2.788878189696678, - "grad_norm": 0.9609375, - "learning_rate": 5.878443507430935e-05, - "loss": 0.3847, + "grad_norm": 1.8203125, + "learning_rate": 2.3438314354809085e-05, + "loss": 0.5168, "step": 11585 }, { "epoch": 2.790081848820414, - "grad_norm": 0.94921875, - "learning_rate": 5.8760835682973376e-05, - "loss": 0.3734, + "grad_norm": 1.75, + "learning_rate": 2.3428904892048824e-05, + "loss": 0.5057, "step": 11590 }, { "epoch": 2.79128550794415, - "grad_norm": 1.0859375, - "learning_rate": 5.873736887912278e-05, - "loss": 0.3844, + "grad_norm": 1.8203125, + "learning_rate": 2.3419548294084452e-05, + "loss": 0.5103, "step": 11595 }, { "epoch": 2.7924891670678864, - "grad_norm": 0.8671875, - "learning_rate": 5.871403470025148e-05, - "loss": 0.3846, + "grad_norm": 1.59375, + "learning_rate": 2.3410244575865414e-05, + "loss": 0.5122, "step": 11600 }, { "epoch": 2.7936928261916227, - "grad_norm": 1.0625, - "learning_rate": 5.869083318364154e-05, - "loss": 0.3755, + "grad_norm": 1.859375, + "learning_rate": 2.340099375225667e-05, + "loss": 0.5009, "step": 11605 }, { "epoch": 2.7948964853153586, - "grad_norm": 0.98828125, - "learning_rate": 5.866776436636302e-05, - "loss": 0.3684, + "grad_norm": 1.8046875, + "learning_rate": 2.3391795838038662e-05, + "loss": 0.4974, "step": 11610 }, { "epoch": 2.796100144439095, - "grad_norm": 0.92578125, - "learning_rate": 5.864482828527397e-05, - "loss": 0.3619, + "grad_norm": 1.7421875, + "learning_rate": 2.3382650847907294e-05, + "loss": 0.4871, "step": 11615 }, { "epoch": 2.797303803562831, - "grad_norm": 0.890625, - "learning_rate": 5.862202497702039e-05, - "loss": 0.3825, + "grad_norm": 1.6484375, + "learning_rate": 2.3373558796473928e-05, + "loss": 0.5095, "step": 11620 }, { "epoch": 2.798507462686567, - "grad_norm": 1.0078125, - "learning_rate": 5.859935447803608e-05, - "loss": 0.3873, + "grad_norm": 1.7734375, + "learning_rate": 2.3364519698265313e-05, + "loss": 0.5158, "step": 11625 }, { "epoch": 2.7997111218103035, - "grad_norm": 1.0078125, - "learning_rate": 5.8576816824542733e-05, - "loss": 0.369, + "grad_norm": 1.8046875, + "learning_rate": 2.335553356772363e-05, + "loss": 0.4883, "step": 11630 }, { "epoch": 2.8009147809340393, - "grad_norm": 0.95703125, - "learning_rate": 5.8554412052549716e-05, - "loss": 0.3621, + "grad_norm": 1.796875, + "learning_rate": 2.3346600419206402e-05, + "loss": 0.4891, "step": 11635 }, { "epoch": 2.8021184400577757, - "grad_norm": 1.0390625, - "learning_rate": 5.8532140197854114e-05, - "loss": 0.3802, + "grad_norm": 1.796875, + "learning_rate": 2.333772026698651e-05, + "loss": 0.5075, "step": 11640 }, { "epoch": 2.8033220991815115, - "grad_norm": 0.93359375, - "learning_rate": 5.851000129604065e-05, - "loss": 0.3485, + "grad_norm": 1.7578125, + "learning_rate": 2.332889312525217e-05, + "loss": 0.4784, "step": 11645 }, { "epoch": 2.804525758305248, - "grad_norm": 0.9140625, - "learning_rate": 5.848799538248159e-05, - "loss": 0.3805, + "grad_norm": 1.78125, + "learning_rate": 2.332011900810687e-05, + "loss": 0.507, "step": 11650 }, { "epoch": 2.805729417428984, - "grad_norm": 0.96484375, - "learning_rate": 5.846612249233677e-05, - "loss": 0.3761, + "grad_norm": 1.765625, + "learning_rate": 2.3311397929569424e-05, + "loss": 0.5084, "step": 11655 }, { "epoch": 2.8069330765527205, - "grad_norm": 1.015625, - "learning_rate": 5.844438266055344e-05, - "loss": 0.3674, + "grad_norm": 1.8046875, + "learning_rate": 2.3302729903573866e-05, + "loss": 0.4966, "step": 11660 }, { "epoch": 2.8081367356764564, - "grad_norm": 0.953125, - "learning_rate": 5.84227759218663e-05, - "loss": 0.3672, + "grad_norm": 1.8046875, + "learning_rate": 2.3294114943969488e-05, + "loss": 0.4906, "step": 11665 }, { "epoch": 2.8093403948001927, - "grad_norm": 0.91015625, - "learning_rate": 5.8401302310797366e-05, - "loss": 0.394, + "grad_norm": 1.671875, + "learning_rate": 2.328555306452078e-05, + "loss": 0.5269, "step": 11670 }, { "epoch": 2.8105440539239286, - "grad_norm": 0.9453125, - "learning_rate": 5.837996186165596e-05, - "loss": 0.3866, + "grad_norm": 1.765625, + "learning_rate": 2.3277044278907432e-05, + "loss": 0.5122, "step": 11675 }, { "epoch": 2.811747713047665, - "grad_norm": 0.921875, - "learning_rate": 5.835875460853866e-05, - "loss": 0.3767, + "grad_norm": 1.7890625, + "learning_rate": 2.3268588600724304e-05, + "loss": 0.5064, "step": 11680 }, { "epoch": 2.8129513721714012, - "grad_norm": 0.8828125, - "learning_rate": 5.8337680585329203e-05, - "loss": 0.3844, + "grad_norm": 1.703125, + "learning_rate": 2.32601860434814e-05, + "loss": 0.5197, "step": 11685 }, { "epoch": 2.814155031295137, - "grad_norm": 1.015625, - "learning_rate": 5.8316739825698495e-05, - "loss": 0.404, + "grad_norm": 1.7734375, + "learning_rate": 2.3251836620603854e-05, + "loss": 0.5363, "step": 11690 }, { "epoch": 2.8153586904188734, - "grad_norm": 1.0078125, - "learning_rate": 5.829593236310451e-05, - "loss": 0.3543, + "grad_norm": 1.7890625, + "learning_rate": 2.324354034543191e-05, + "loss": 0.4831, "step": 11695 }, { "epoch": 2.8165623495426093, - "grad_norm": 0.98828125, - "learning_rate": 5.8275258230792205e-05, - "loss": 0.3892, + "grad_norm": 1.765625, + "learning_rate": 2.323529723122087e-05, + "loss": 0.515, "step": 11700 }, { "epoch": 2.8177660086663456, - "grad_norm": 0.94921875, - "learning_rate": 5.8254717461793563e-05, - "loss": 0.3715, + "grad_norm": 1.671875, + "learning_rate": 2.322710729114114e-05, + "loss": 0.4939, "step": 11705 }, { "epoch": 2.818969667790082, - "grad_norm": 0.9453125, - "learning_rate": 5.823431008892747e-05, - "loss": 0.3917, + "grad_norm": 1.75, + "learning_rate": 2.321897053827813e-05, + "loss": 0.5158, "step": 11710 }, { "epoch": 2.8201733269138183, - "grad_norm": 0.98828125, - "learning_rate": 5.8214036144799686e-05, - "loss": 0.3582, + "grad_norm": 1.7734375, + "learning_rate": 2.321088698563229e-05, + "loss": 0.4813, "step": 11715 }, { "epoch": 2.821376986037554, - "grad_norm": 0.91015625, - "learning_rate": 5.819389566180274e-05, - "loss": 0.363, + "grad_norm": 1.7421875, + "learning_rate": 2.320285664611906e-05, + "loss": 0.4869, "step": 11720 }, { "epoch": 2.8225806451612905, - "grad_norm": 0.96484375, - "learning_rate": 5.817388867211597e-05, - "loss": 0.3695, + "grad_norm": 1.734375, + "learning_rate": 2.319487953256887e-05, + "loss": 0.5005, "step": 11725 }, { "epoch": 2.8237843042850264, - "grad_norm": 0.921875, - "learning_rate": 5.815401520770547e-05, - "loss": 0.3727, + "grad_norm": 1.671875, + "learning_rate": 2.3186955657727108e-05, + "loss": 0.5005, "step": 11730 }, { "epoch": 2.8249879634087627, - "grad_norm": 0.91015625, - "learning_rate": 5.813427530032388e-05, - "loss": 0.3934, + "grad_norm": 1.703125, + "learning_rate": 2.317908503425408e-05, + "loss": 0.5251, "step": 11735 }, { "epoch": 2.826191622532499, - "grad_norm": 0.984375, - "learning_rate": 5.811466898151054e-05, - "loss": 0.3768, + "grad_norm": 1.71875, + "learning_rate": 2.3171267674725035e-05, + "loss": 0.5029, "step": 11740 }, { "epoch": 2.827395281656235, - "grad_norm": 0.9453125, - "learning_rate": 5.809519628259132e-05, - "loss": 0.3619, + "grad_norm": 1.8046875, + "learning_rate": 2.316350359163011e-05, + "loss": 0.493, "step": 11745 }, { "epoch": 2.828598940779971, - "grad_norm": 0.96484375, - "learning_rate": 5.807585723467857e-05, - "loss": 0.3673, + "grad_norm": 1.84375, + "learning_rate": 2.3155792797374303e-05, + "loss": 0.4947, "step": 11750 }, { "epoch": 2.829802599903707, - "grad_norm": 0.98046875, - "learning_rate": 5.8056651868671185e-05, - "loss": 0.3556, + "grad_norm": 1.7734375, + "learning_rate": 2.3148135304277513e-05, + "loss": 0.4837, "step": 11755 }, { "epoch": 2.8310062590274434, - "grad_norm": 0.9453125, - "learning_rate": 5.803758021525437e-05, - "loss": 0.3751, + "grad_norm": 1.765625, + "learning_rate": 2.314053112457443e-05, + "loss": 0.5029, "step": 11760 }, { "epoch": 2.8322099181511797, - "grad_norm": 0.984375, - "learning_rate": 5.801864230489977e-05, - "loss": 0.3888, + "grad_norm": 1.875, + "learning_rate": 2.3132980270414596e-05, + "loss": 0.5141, "step": 11765 }, { "epoch": 2.8334135772749156, - "grad_norm": 1.0078125, - "learning_rate": 5.7999838167865285e-05, - "loss": 0.3864, + "grad_norm": 1.7890625, + "learning_rate": 2.312548275386233e-05, + "loss": 0.5216, "step": 11770 }, { "epoch": 2.834617236398652, - "grad_norm": 0.97265625, - "learning_rate": 5.798116783419512e-05, - "loss": 0.3705, + "grad_norm": 1.8046875, + "learning_rate": 2.311803858689675e-05, + "loss": 0.4998, "step": 11775 }, { "epoch": 2.835820895522388, - "grad_norm": 0.8984375, - "learning_rate": 5.796263133371969e-05, - "loss": 0.3574, + "grad_norm": 1.6484375, + "learning_rate": 2.3110647781411726e-05, + "loss": 0.4851, "step": 11780 }, { "epoch": 2.837024554646124, - "grad_norm": 0.9140625, - "learning_rate": 5.794422869605555e-05, - "loss": 0.376, + "grad_norm": 1.6953125, + "learning_rate": 2.3103310349215865e-05, + "loss": 0.505, "step": 11785 }, { "epoch": 2.8382282137698605, - "grad_norm": 0.96875, - "learning_rate": 5.7925959950605414e-05, - "loss": 0.3794, + "grad_norm": 1.7890625, + "learning_rate": 2.3096026302032506e-05, + "loss": 0.5057, "step": 11790 }, { "epoch": 2.8394318728935968, - "grad_norm": 1.0546875, - "learning_rate": 5.790782512655804e-05, - "loss": 0.3919, + "grad_norm": 1.8828125, + "learning_rate": 2.3088795651499692e-05, + "loss": 0.5188, "step": 11795 }, { "epoch": 2.8406355320173327, - "grad_norm": 1.0234375, - "learning_rate": 5.788982425288825e-05, - "loss": 0.3693, + "grad_norm": 1.9296875, + "learning_rate": 2.3081618409170143e-05, + "loss": 0.4989, "step": 11800 }, { "epoch": 2.841839191141069, - "grad_norm": 0.875, - "learning_rate": 5.7871957358356804e-05, - "loss": 0.3673, + "grad_norm": 1.6171875, + "learning_rate": 2.307449458651126e-05, + "loss": 0.4922, "step": 11805 }, { "epoch": 2.843042850264805, - "grad_norm": 0.97265625, - "learning_rate": 5.7854224471510416e-05, - "loss": 0.3755, + "grad_norm": 1.7734375, + "learning_rate": 2.3067424194905067e-05, + "loss": 0.5021, "step": 11810 }, { "epoch": 2.844246509388541, - "grad_norm": 0.94921875, - "learning_rate": 5.783662562068172e-05, - "loss": 0.3665, + "grad_norm": 1.6953125, + "learning_rate": 2.3060407245648265e-05, + "loss": 0.4886, "step": 11815 }, { "epoch": 2.8454501685122775, - "grad_norm": 0.96875, - "learning_rate": 5.7819160833989156e-05, - "loss": 0.3884, + "grad_norm": 1.8125, + "learning_rate": 2.3053443749952122e-05, + "loss": 0.5172, "step": 11820 }, { "epoch": 2.8466538276360134, - "grad_norm": 0.984375, - "learning_rate": 5.7801830139336955e-05, - "loss": 0.4343, + "grad_norm": 1.8046875, + "learning_rate": 2.304653371894252e-05, + "loss": 0.5641, "step": 11825 }, { "epoch": 2.8478574867597497, - "grad_norm": 0.9765625, - "learning_rate": 5.778463356441515e-05, - "loss": 0.3738, + "grad_norm": 1.6640625, + "learning_rate": 2.3039677163659924e-05, + "loss": 0.5064, "step": 11830 }, { "epoch": 2.8490611458834856, - "grad_norm": 0.98828125, - "learning_rate": 5.7767571136699455e-05, - "loss": 0.3628, + "grad_norm": 1.78125, + "learning_rate": 2.3032874095059346e-05, + "loss": 0.4873, "step": 11835 }, { "epoch": 2.850264805007222, - "grad_norm": 1.0, - "learning_rate": 5.775064288345125e-05, - "loss": 0.3685, + "grad_norm": 1.9296875, + "learning_rate": 2.3026124524010348e-05, + "loss": 0.5024, "step": 11840 }, { "epoch": 2.8514684641309582, - "grad_norm": 0.90234375, - "learning_rate": 5.773384883171753e-05, - "loss": 0.3594, + "grad_norm": 1.71875, + "learning_rate": 2.3019428461297008e-05, + "loss": 0.4912, "step": 11845 }, { "epoch": 2.8526721232546945, - "grad_norm": 1.0859375, - "learning_rate": 5.771718900833093e-05, - "loss": 0.3751, + "grad_norm": 1.9375, + "learning_rate": 2.3012785917617934e-05, + "loss": 0.497, "step": 11850 }, { "epoch": 2.8538757823784304, - "grad_norm": 1.0078125, - "learning_rate": 5.770066343990953e-05, - "loss": 0.3942, + "grad_norm": 1.953125, + "learning_rate": 2.3006196903586192e-05, + "loss": 0.5237, "step": 11855 }, { "epoch": 2.8550794415021667, - "grad_norm": 0.94921875, - "learning_rate": 5.768427215285697e-05, - "loss": 0.3609, + "grad_norm": 1.734375, + "learning_rate": 2.299966142972934e-05, + "loss": 0.4852, "step": 11860 }, { "epoch": 2.8562831006259026, - "grad_norm": 0.890625, - "learning_rate": 5.766801517336232e-05, - "loss": 0.3724, + "grad_norm": 1.7265625, + "learning_rate": 2.2993179506489396e-05, + "loss": 0.4981, "step": 11865 }, { "epoch": 2.857486759749639, - "grad_norm": 0.98828125, - "learning_rate": 5.7651892527400065e-05, - "loss": 0.3822, + "grad_norm": 1.796875, + "learning_rate": 2.2986751144222803e-05, + "loss": 0.5157, "step": 11870 }, { "epoch": 2.8586904188733753, - "grad_norm": 0.921875, - "learning_rate": 5.763590424073006e-05, - "loss": 0.3814, + "grad_norm": 1.75, + "learning_rate": 2.2980376353200442e-05, + "loss": 0.5171, "step": 11875 }, { "epoch": 2.859894077997111, - "grad_norm": 0.921875, - "learning_rate": 5.7620050338897514e-05, - "loss": 0.3823, + "grad_norm": 1.6640625, + "learning_rate": 2.2974055143607597e-05, + "loss": 0.5111, "step": 11880 }, { "epoch": 2.8610977371208475, - "grad_norm": 1.046875, - "learning_rate": 5.760433084723286e-05, - "loss": 0.4181, + "grad_norm": 1.8828125, + "learning_rate": 2.296778752554394e-05, + "loss": 0.5504, "step": 11885 }, { "epoch": 2.8623013962445834, - "grad_norm": 0.90234375, - "learning_rate": 5.758874579085185e-05, - "loss": 0.3908, + "grad_norm": 1.6796875, + "learning_rate": 2.296157350902351e-05, + "loss": 0.5188, "step": 11890 }, { "epoch": 2.8635050553683197, - "grad_norm": 1.0078125, - "learning_rate": 5.757329519465538e-05, - "loss": 0.3772, + "grad_norm": 1.953125, + "learning_rate": 2.295541310397472e-05, + "loss": 0.5128, "step": 11895 }, { "epoch": 2.864708714492056, - "grad_norm": 0.9375, - "learning_rate": 5.755797908332955e-05, - "loss": 0.393, + "grad_norm": 1.703125, + "learning_rate": 2.294930632024032e-05, + "loss": 0.517, "step": 11900 }, { "epoch": 2.865912373615792, - "grad_norm": 0.921875, - "learning_rate": 5.754279748134561e-05, - "loss": 0.3668, + "grad_norm": 1.65625, + "learning_rate": 2.294325316757738e-05, + "loss": 0.4973, "step": 11905 }, { "epoch": 2.867116032739528, - "grad_norm": 0.85546875, - "learning_rate": 5.7527750412959805e-05, - "loss": 0.3616, + "grad_norm": 1.625, + "learning_rate": 2.293725365565728e-05, + "loss": 0.486, "step": 11910 }, { "epoch": 2.868319691863264, - "grad_norm": 0.95703125, - "learning_rate": 5.7512837902213556e-05, - "loss": 0.3485, + "grad_norm": 1.7578125, + "learning_rate": 2.2931307794065715e-05, + "loss": 0.4691, "step": 11915 }, { "epoch": 2.8695233509870004, - "grad_norm": 1.0546875, - "learning_rate": 5.749805997293318e-05, - "loss": 0.377, + "grad_norm": 1.78125, + "learning_rate": 2.2925415592302635e-05, + "loss": 0.5042, "step": 11920 }, { "epoch": 2.8707270101107367, - "grad_norm": 0.96484375, - "learning_rate": 5.7483416648730076e-05, - "loss": 0.3527, + "grad_norm": 1.8515625, + "learning_rate": 2.2919577059782285e-05, + "loss": 0.4772, "step": 11925 }, { "epoch": 2.871930669234473, - "grad_norm": 0.9765625, - "learning_rate": 5.7468907953000476e-05, - "loss": 0.3718, + "grad_norm": 1.8515625, + "learning_rate": 2.2913792205833125e-05, + "loss": 0.5008, "step": 11930 }, { "epoch": 2.873134328358209, - "grad_norm": 0.99609375, - "learning_rate": 5.745453390892555e-05, - "loss": 0.3802, + "grad_norm": 1.8203125, + "learning_rate": 2.290806103969787e-05, + "loss": 0.5062, "step": 11935 }, { "epoch": 2.8743379874819452, - "grad_norm": 0.84765625, - "learning_rate": 5.74402945394714e-05, - "loss": 0.3467, + "grad_norm": 1.6875, + "learning_rate": 2.2902383570533478e-05, + "loss": 0.4667, "step": 11940 }, { "epoch": 2.875541646605681, - "grad_norm": 0.95703125, - "learning_rate": 5.742618986738882e-05, - "loss": 0.3688, + "grad_norm": 1.703125, + "learning_rate": 2.2896759807411062e-05, + "loss": 0.4956, "step": 11945 }, { "epoch": 2.8767453057294174, - "grad_norm": 0.87109375, - "learning_rate": 5.741221991521349e-05, - "loss": 0.359, + "grad_norm": 1.6484375, + "learning_rate": 2.2891189759315965e-05, + "loss": 0.4859, "step": 11950 }, { "epoch": 2.8779489648531538, - "grad_norm": 1.0, - "learning_rate": 5.739838470526581e-05, - "loss": 0.3418, + "grad_norm": 1.828125, + "learning_rate": 2.2885673435147692e-05, + "loss": 0.4662, "step": 11955 }, { "epoch": 2.8791526239768896, - "grad_norm": 1.0234375, - "learning_rate": 5.7384684259650885e-05, - "loss": 0.3957, + "grad_norm": 1.890625, + "learning_rate": 2.288021084371992e-05, + "loss": 0.5353, "step": 11960 }, { "epoch": 2.880356283100626, - "grad_norm": 0.9765625, - "learning_rate": 5.737111860025856e-05, - "loss": 0.3609, + "grad_norm": 1.8046875, + "learning_rate": 2.2874801993760473e-05, + "loss": 0.4866, "step": 11965 }, { "epoch": 2.881559942224362, - "grad_norm": 1.046875, - "learning_rate": 5.73576877487632e-05, - "loss": 0.4005, + "grad_norm": 1.875, + "learning_rate": 2.286944689391129e-05, + "loss": 0.5384, "step": 11970 }, { "epoch": 2.882763601348098, - "grad_norm": 1.015625, - "learning_rate": 5.734439172662395e-05, - "loss": 0.3651, + "grad_norm": 1.7734375, + "learning_rate": 2.2864145552728466e-05, + "loss": 0.4915, "step": 11975 }, { "epoch": 2.8839672604718345, - "grad_norm": 1.0234375, - "learning_rate": 5.733123055508439e-05, - "loss": 0.4045, + "grad_norm": 1.8046875, + "learning_rate": 2.285889797868218e-05, + "loss": 0.5343, "step": 11980 }, { "epoch": 2.8851709195955704, - "grad_norm": 0.859375, - "learning_rate": 5.7318204255172714e-05, - "loss": 0.3411, + "grad_norm": 1.578125, + "learning_rate": 2.2853704180156703e-05, + "loss": 0.4632, "step": 11985 }, { "epoch": 2.8863745787193067, - "grad_norm": 1.015625, - "learning_rate": 5.7305312847701617e-05, - "loss": 0.3756, + "grad_norm": 1.9765625, + "learning_rate": 2.2848564165450402e-05, + "loss": 0.5071, "step": 11990 }, { "epoch": 2.887578237843043, - "grad_norm": 0.953125, - "learning_rate": 5.729255635326824e-05, - "loss": 0.3503, + "grad_norm": 1.828125, + "learning_rate": 2.2843477942775693e-05, + "loss": 0.4825, "step": 11995 }, { "epoch": 2.888781896966779, - "grad_norm": 1.015625, - "learning_rate": 5.727993479225422e-05, - "loss": 0.3551, + "grad_norm": 1.8125, + "learning_rate": 2.283844552025907e-05, + "loss": 0.4848, "step": 12000 }, { "epoch": 2.888781896966779, - "eval_loss": 0.34669944643974304, - "eval_runtime": 2.333, - "eval_samples_per_second": 85.726, - "eval_steps_per_second": 85.726, + "eval_loss": 0.43862488865852356, + "eval_runtime": 2.4009, + "eval_samples_per_second": 83.302, + "eval_steps_per_second": 83.302, "step": 12000 }, { "epoch": 2.889985556090515, - "grad_norm": 0.875, - "learning_rate": 5.726744818482557e-05, - "loss": 0.3486, + "grad_norm": 1.640625, + "learning_rate": 2.2833466905941047e-05, + "loss": 0.4724, "step": 12005 }, { "epoch": 2.8911892152142515, - "grad_norm": 1.015625, - "learning_rate": 5.7255096550932674e-05, - "loss": 0.3908, + "grad_norm": 1.875, + "learning_rate": 2.2828542107776172e-05, + "loss": 0.5213, "step": 12010 }, { "epoch": 2.8923928743379874, - "grad_norm": 0.98828125, - "learning_rate": 5.724287991031028e-05, - "loss": 0.3789, + "grad_norm": 1.765625, + "learning_rate": 2.2823671133633007e-05, + "loss": 0.5098, "step": 12015 }, { "epoch": 2.8935965334617237, - "grad_norm": 0.97265625, - "learning_rate": 5.723079828247745e-05, - "loss": 0.36, + "grad_norm": 1.7734375, + "learning_rate": 2.2818853991294124e-05, + "loss": 0.4849, "step": 12020 }, { "epoch": 2.8948001925854596, - "grad_norm": 0.9609375, - "learning_rate": 5.721885168673753e-05, - "loss": 0.3694, + "grad_norm": 1.78125, + "learning_rate": 2.2814090688456086e-05, + "loss": 0.4925, "step": 12025 }, { "epoch": 2.896003851709196, - "grad_norm": 0.99609375, - "learning_rate": 5.720704014217813e-05, - "loss": 0.3823, + "grad_norm": 1.65625, + "learning_rate": 2.2809381232729428e-05, + "loss": 0.51, "step": 12030 }, { "epoch": 2.8972075108329323, - "grad_norm": 0.87109375, - "learning_rate": 5.719536366767105e-05, - "loss": 0.3724, + "grad_norm": 1.6875, + "learning_rate": 2.2804725631638646e-05, + "loss": 0.5046, "step": 12035 }, { "epoch": 2.898411169956668, - "grad_norm": 0.90234375, - "learning_rate": 5.7183822281872304e-05, - "loss": 0.3755, + "grad_norm": 1.6640625, + "learning_rate": 2.28001238926222e-05, + "loss": 0.5075, "step": 12040 }, { "epoch": 2.8996148290804045, - "grad_norm": 1.046875, - "learning_rate": 5.717241600322208e-05, - "loss": 0.3659, + "grad_norm": 1.890625, + "learning_rate": 2.2795576023032496e-05, + "loss": 0.4982, "step": 12045 }, { "epoch": 2.9008184882041403, - "grad_norm": 0.95703125, - "learning_rate": 5.716114484994467e-05, - "loss": 0.3872, + "grad_norm": 1.765625, + "learning_rate": 2.2791082030135854e-05, + "loss": 0.5158, "step": 12050 }, { "epoch": 2.9020221473278767, - "grad_norm": 0.9375, - "learning_rate": 5.715000884004851e-05, - "loss": 0.3928, + "grad_norm": 1.7265625, + "learning_rate": 2.278664192111253e-05, + "loss": 0.5219, "step": 12055 }, { "epoch": 2.903225806451613, - "grad_norm": 0.95703125, - "learning_rate": 5.713900799132607e-05, - "loss": 0.3616, + "grad_norm": 1.765625, + "learning_rate": 2.2782255703056673e-05, + "loss": 0.4818, "step": 12060 }, { "epoch": 2.9044294655753493, - "grad_norm": 1.0390625, - "learning_rate": 5.71281423213539e-05, - "loss": 0.3701, + "grad_norm": 1.875, + "learning_rate": 2.2777923382976342e-05, + "loss": 0.495, "step": 12065 }, { "epoch": 2.905633124699085, - "grad_norm": 0.90234375, - "learning_rate": 5.7117411847492554e-05, - "loss": 0.3745, + "grad_norm": 1.8125, + "learning_rate": 2.277364496779347e-05, + "loss": 0.5079, "step": 12070 }, { "epoch": 2.9068367838228215, - "grad_norm": 1.015625, - "learning_rate": 5.7106816586886575e-05, - "loss": 0.3568, + "grad_norm": 1.734375, + "learning_rate": 2.2769420464343876e-05, + "loss": 0.4817, "step": 12075 }, { "epoch": 2.9080404429465574, - "grad_norm": 0.98046875, - "learning_rate": 5.709635655646446e-05, - "loss": 0.3684, + "grad_norm": 1.8046875, + "learning_rate": 2.2765249879377216e-05, + "loss": 0.4977, "step": 12080 }, { "epoch": 2.9092441020702937, - "grad_norm": 1.046875, - "learning_rate": 5.708603177293868e-05, - "loss": 0.3538, + "grad_norm": 1.8203125, + "learning_rate": 2.2761133219557033e-05, + "loss": 0.4795, "step": 12085 }, { "epoch": 2.91044776119403, - "grad_norm": 1.0390625, - "learning_rate": 5.707584225280558e-05, - "loss": 0.3952, + "grad_norm": 1.78125, + "learning_rate": 2.2757070491460683e-05, + "loss": 0.5268, "step": 12090 }, { "epoch": 2.911651420317766, - "grad_norm": 0.9765625, - "learning_rate": 5.706578801234538e-05, - "loss": 0.3579, + "grad_norm": 1.8359375, + "learning_rate": 2.275306170157937e-05, + "loss": 0.4813, "step": 12095 }, { "epoch": 2.9128550794415022, - "grad_norm": 1.0234375, - "learning_rate": 5.7055869067622224e-05, - "loss": 0.3684, + "grad_norm": 1.953125, + "learning_rate": 2.274910685631811e-05, + "loss": 0.4935, "step": 12100 }, { "epoch": 2.914058738565238, - "grad_norm": 0.94921875, - "learning_rate": 5.704608543448401e-05, - "loss": 0.3869, + "grad_norm": 1.7578125, + "learning_rate": 2.274520596199573e-05, + "loss": 0.5129, "step": 12105 }, { "epoch": 2.9152623976889744, - "grad_norm": 0.9140625, - "learning_rate": 5.703643712856248e-05, - "loss": 0.3674, + "grad_norm": 1.6640625, + "learning_rate": 2.2741359024844862e-05, + "loss": 0.4951, "step": 12110 }, { "epoch": 2.9164660568127108, - "grad_norm": 0.921875, - "learning_rate": 5.702692416527318e-05, - "loss": 0.3581, + "grad_norm": 1.6640625, + "learning_rate": 2.2737566051011933e-05, + "loss": 0.4857, "step": 12115 }, { "epoch": 2.9176697159364466, - "grad_norm": 0.98828125, - "learning_rate": 5.7017546559815344e-05, - "loss": 0.409, + "grad_norm": 1.78125, + "learning_rate": 2.2733827046557128e-05, + "loss": 0.5349, "step": 12120 }, { "epoch": 2.918873375060183, - "grad_norm": 1.0, - "learning_rate": 5.700830432717201e-05, - "loss": 0.3713, + "grad_norm": 1.703125, + "learning_rate": 2.2730142017454424e-05, + "loss": 0.4992, "step": 12125 }, { "epoch": 2.920077034183919, - "grad_norm": 1.0703125, - "learning_rate": 5.6999197482109896e-05, - "loss": 0.4083, + "grad_norm": 1.8359375, + "learning_rate": 2.2726510969591558e-05, + "loss": 0.5426, "step": 12130 }, { "epoch": 2.921280693307655, - "grad_norm": 0.953125, - "learning_rate": 5.699022603917939e-05, - "loss": 0.3336, + "grad_norm": 1.796875, + "learning_rate": 2.2722933908770007e-05, + "loss": 0.4659, "step": 12135 }, { "epoch": 2.9224843524313915, - "grad_norm": 1.0859375, - "learning_rate": 5.698139001271457e-05, - "loss": 0.4137, + "grad_norm": 1.859375, + "learning_rate": 2.2719410840704997e-05, + "loss": 0.5461, "step": 12140 }, { "epoch": 2.923688011555128, - "grad_norm": 0.87890625, - "learning_rate": 5.697268941683314e-05, - "loss": 0.3737, + "grad_norm": 1.65625, + "learning_rate": 2.2715941771025492e-05, + "loss": 0.4993, "step": 12145 }, { "epoch": 2.9248916706788637, - "grad_norm": 1.0703125, - "learning_rate": 5.696412426543641e-05, - "loss": 0.3959, + "grad_norm": 1.8984375, + "learning_rate": 2.2712526705274168e-05, + "loss": 0.5209, "step": 12150 }, { "epoch": 2.9260953298026, - "grad_norm": 0.9609375, - "learning_rate": 5.6955694572209304e-05, - "loss": 0.3728, + "grad_norm": 1.734375, + "learning_rate": 2.2709165648907435e-05, + "loss": 0.4981, "step": 12155 }, { "epoch": 2.927298988926336, - "grad_norm": 0.99609375, - "learning_rate": 5.6947400350620327e-05, - "loss": 0.3708, + "grad_norm": 1.9609375, + "learning_rate": 2.2705858607295397e-05, + "loss": 0.5001, "step": 12160 }, { "epoch": 2.928502648050072, - "grad_norm": 0.9921875, - "learning_rate": 5.6939241613921515e-05, - "loss": 0.3626, + "grad_norm": 1.8359375, + "learning_rate": 2.270260558572186e-05, + "loss": 0.4867, "step": 12165 }, { "epoch": 2.9297063071738085, - "grad_norm": 0.92578125, - "learning_rate": 5.69312183751484e-05, - "loss": 0.3736, + "grad_norm": 1.6875, + "learning_rate": 2.2699406589384306e-05, + "loss": 0.5011, "step": 12170 }, { "epoch": 2.9309099662975444, - "grad_norm": 0.98046875, - "learning_rate": 5.69233306471201e-05, - "loss": 0.3883, + "grad_norm": 1.7578125, + "learning_rate": 2.2696261623393923e-05, + "loss": 0.5139, "step": 12175 }, { "epoch": 2.9321136254212807, - "grad_norm": 0.984375, - "learning_rate": 5.691557844243918e-05, - "loss": 0.3946, + "grad_norm": 1.8125, + "learning_rate": 2.2693170692775562e-05, + "loss": 0.5215, "step": 12180 }, { "epoch": 2.9333172845450166, - "grad_norm": 0.96875, - "learning_rate": 5.690796177349167e-05, - "loss": 0.3773, + "grad_norm": 1.828125, + "learning_rate": 2.2690133802467734e-05, + "loss": 0.5063, "step": 12185 }, { "epoch": 2.934520943668753, - "grad_norm": 0.91015625, - "learning_rate": 5.690048065244703e-05, - "loss": 0.3891, + "grad_norm": 1.703125, + "learning_rate": 2.2687150957322602e-05, + "loss": 0.5152, "step": 12190 }, { "epoch": 2.9357246027924893, - "grad_norm": 0.89453125, - "learning_rate": 5.689313509125819e-05, - "loss": 0.3579, + "grad_norm": 1.6640625, + "learning_rate": 2.2684222162105998e-05, + "loss": 0.4834, "step": 12195 }, { "epoch": 2.9369282619162256, - "grad_norm": 0.9921875, - "learning_rate": 5.68859251016615e-05, - "loss": 0.3767, + "grad_norm": 1.8671875, + "learning_rate": 2.2681347421497392e-05, + "loss": 0.504, "step": 12200 }, { "epoch": 2.9381319210399615, - "grad_norm": 1.1015625, - "learning_rate": 5.687885069517665e-05, - "loss": 0.3768, + "grad_norm": 1.90625, + "learning_rate": 2.2678526740089872e-05, + "loss": 0.5047, "step": 12205 }, { "epoch": 2.939335580163698, - "grad_norm": 1.03125, - "learning_rate": 5.687191188310672e-05, - "loss": 0.3894, + "grad_norm": 1.796875, + "learning_rate": 2.2675760122390163e-05, + "loss": 0.5159, "step": 12210 }, { "epoch": 2.9405392392874337, - "grad_norm": 0.984375, - "learning_rate": 5.686510867653818e-05, - "loss": 0.3629, + "grad_norm": 1.8125, + "learning_rate": 2.2673047572818624e-05, + "loss": 0.4894, "step": 12215 }, { "epoch": 2.94174289841117, - "grad_norm": 0.875, - "learning_rate": 5.685844108634079e-05, - "loss": 0.3721, + "grad_norm": 1.6796875, + "learning_rate": 2.2670389095709196e-05, + "loss": 0.5036, "step": 12220 }, { "epoch": 2.9429465575349063, - "grad_norm": 0.9609375, - "learning_rate": 5.6851909123167645e-05, - "loss": 0.3852, + "grad_norm": 1.65625, + "learning_rate": 2.2667784695309454e-05, + "loss": 0.5154, "step": 12225 }, { "epoch": 2.944150216658642, - "grad_norm": 0.984375, - "learning_rate": 5.684551279745516e-05, - "loss": 0.3671, + "grad_norm": 1.71875, + "learning_rate": 2.2665234375780564e-05, + "loss": 0.4928, "step": 12230 }, { "epoch": 2.9453538757823785, - "grad_norm": 0.9140625, - "learning_rate": 5.6839252119423025e-05, - "loss": 0.4056, + "grad_norm": 1.65625, + "learning_rate": 2.2662738141197275e-05, + "loss": 0.5377, "step": 12235 }, { "epoch": 2.9465575349061144, - "grad_norm": 0.96875, - "learning_rate": 5.683312709907419e-05, - "loss": 0.3677, + "grad_norm": 1.8515625, + "learning_rate": 2.2660295995547933e-05, + "loss": 0.4914, "step": 12240 }, { "epoch": 2.9477611940298507, - "grad_norm": 0.98046875, - "learning_rate": 5.682713774619488e-05, - "loss": 0.3463, + "grad_norm": 1.8125, + "learning_rate": 2.265790794273447e-05, + "loss": 0.4645, "step": 12245 }, { "epoch": 2.948964853153587, - "grad_norm": 0.91015625, - "learning_rate": 5.682128407035453e-05, - "loss": 0.3698, + "grad_norm": 1.7265625, + "learning_rate": 2.2655573986572372e-05, + "loss": 0.4951, "step": 12250 }, { "epoch": 2.950168512277323, - "grad_norm": 0.89453125, - "learning_rate": 5.681556608090582e-05, - "loss": 0.3549, + "grad_norm": 1.7109375, + "learning_rate": 2.2653294130790702e-05, + "loss": 0.4818, "step": 12255 }, { "epoch": 2.9513721714010592, - "grad_norm": 1.0390625, - "learning_rate": 5.680998378698464e-05, - "loss": 0.3922, + "grad_norm": 1.8828125, + "learning_rate": 2.265106837903209e-05, + "loss": 0.5214, "step": 12260 }, { "epoch": 2.952575830524795, - "grad_norm": 0.94140625, - "learning_rate": 5.680453719751005e-05, - "loss": 0.3648, + "grad_norm": 1.6640625, + "learning_rate": 2.2648896734852712e-05, + "loss": 0.4918, "step": 12265 }, { "epoch": 2.9537794896485314, - "grad_norm": 0.94921875, - "learning_rate": 5.67992263211843e-05, - "loss": 0.3732, + "grad_norm": 1.7265625, + "learning_rate": 2.2646779201722303e-05, + "loss": 0.505, "step": 12270 }, { "epoch": 2.9549831487722678, - "grad_norm": 1.0546875, - "learning_rate": 5.679405116649284e-05, - "loss": 0.3641, + "grad_norm": 1.8203125, + "learning_rate": 2.264471578302414e-05, + "loss": 0.4941, "step": 12275 }, { "epoch": 2.956186807896004, - "grad_norm": 0.98046875, - "learning_rate": 5.67890117417042e-05, - "loss": 0.3931, + "grad_norm": 1.8828125, + "learning_rate": 2.2642706482055028e-05, + "loss": 0.521, "step": 12280 }, { "epoch": 2.95739046701974, - "grad_norm": 1.09375, - "learning_rate": 5.6784108054870116e-05, - "loss": 0.378, + "grad_norm": 1.921875, + "learning_rate": 2.2640751302025323e-05, + "loss": 0.5075, "step": 12285 }, { "epoch": 2.9585941261434763, - "grad_norm": 0.98046875, - "learning_rate": 5.677934011382542e-05, - "loss": 0.3494, + "grad_norm": 1.7109375, + "learning_rate": 2.26388502460589e-05, + "loss": 0.4744, "step": 12290 }, { "epoch": 2.959797785267212, - "grad_norm": 1.0625, - "learning_rate": 5.677470792618806e-05, - "loss": 0.3673, + "grad_norm": 1.984375, + "learning_rate": 2.263700331719316e-05, + "loss": 0.5044, "step": 12295 }, { "epoch": 2.9610014443909485, - "grad_norm": 0.9296875, - "learning_rate": 5.6770211499359076e-05, - "loss": 0.3648, + "grad_norm": 1.703125, + "learning_rate": 2.263521051837902e-05, + "loss": 0.4976, "step": 12300 }, { "epoch": 2.962205103514685, - "grad_norm": 1.015625, - "learning_rate": 5.67658508405226e-05, - "loss": 0.3476, + "grad_norm": 1.8984375, + "learning_rate": 2.2633471852480907e-05, + "loss": 0.4728, "step": 12305 }, { "epoch": 2.9634087626384207, - "grad_norm": 0.91796875, - "learning_rate": 5.676162595664586e-05, - "loss": 0.3689, + "grad_norm": 1.6171875, + "learning_rate": 2.2631787322276775e-05, + "loss": 0.4934, "step": 12310 }, { "epoch": 2.964612421762157, - "grad_norm": 0.94921875, - "learning_rate": 5.675753685447913e-05, - "loss": 0.3697, + "grad_norm": 1.7890625, + "learning_rate": 2.2630156930458064e-05, + "loss": 0.4986, "step": 12315 }, { "epoch": 2.965816080885893, - "grad_norm": 0.8828125, - "learning_rate": 5.6753583540555744e-05, - "loss": 0.385, + "grad_norm": 1.671875, + "learning_rate": 2.2628580679629724e-05, + "loss": 0.5195, "step": 12320 }, { "epoch": 2.967019740009629, - "grad_norm": 0.9765625, - "learning_rate": 5.6749766021192104e-05, - "loss": 0.3485, + "grad_norm": 1.734375, + "learning_rate": 2.26270585723102e-05, + "loss": 0.4787, "step": 12325 }, { "epoch": 2.9682233991333655, - "grad_norm": 0.98828125, - "learning_rate": 5.674608430248761e-05, - "loss": 0.3674, + "grad_norm": 1.796875, + "learning_rate": 2.262559061093143e-05, + "loss": 0.4912, "step": 12330 }, { "epoch": 2.969427058257102, - "grad_norm": 0.81640625, - "learning_rate": 5.67425383903247e-05, - "loss": 0.3817, + "grad_norm": 1.5859375, + "learning_rate": 2.2624176797838833e-05, + "loss": 0.5129, "step": 12335 }, { "epoch": 2.9706307173808377, - "grad_norm": 0.98828125, - "learning_rate": 5.673912829036885e-05, - "loss": 0.3935, + "grad_norm": 1.84375, + "learning_rate": 2.2622817135291334e-05, + "loss": 0.5237, "step": 12340 }, { "epoch": 2.971834376504574, - "grad_norm": 0.99609375, - "learning_rate": 5.673585400806851e-05, - "loss": 0.368, + "grad_norm": 1.828125, + "learning_rate": 2.2621511625461316e-05, + "loss": 0.5033, "step": 12345 }, { "epoch": 2.97303803562831, - "grad_norm": 0.91015625, - "learning_rate": 5.673271554865515e-05, - "loss": 0.3778, + "grad_norm": 1.640625, + "learning_rate": 2.2620260270434643e-05, + "loss": 0.5046, "step": 12350 }, { "epoch": 2.9742416947520463, - "grad_norm": 0.95703125, - "learning_rate": 5.6729712917143226e-05, - "loss": 0.3649, + "grad_norm": 1.78125, + "learning_rate": 2.2619063072210674e-05, + "loss": 0.4957, "step": 12355 }, { "epoch": 2.9754453538757826, - "grad_norm": 0.98828125, - "learning_rate": 5.672684611833017e-05, - "loss": 0.3862, + "grad_norm": 1.7890625, + "learning_rate": 2.261792003270221e-05, + "loss": 0.5236, "step": 12360 }, { "epoch": 2.9766490129995185, - "grad_norm": 0.96875, - "learning_rate": 5.672411515679639e-05, - "loss": 0.3711, + "grad_norm": 1.7890625, + "learning_rate": 2.261683115373555e-05, + "loss": 0.5006, "step": 12365 }, { "epoch": 2.9778526721232548, - "grad_norm": 0.8984375, - "learning_rate": 5.672152003690527e-05, - "loss": 0.3482, + "grad_norm": 1.703125, + "learning_rate": 2.2615796437050434e-05, + "loss": 0.4741, "step": 12370 }, { "epoch": 2.9790563312469907, - "grad_norm": 1.0859375, - "learning_rate": 5.671906076280313e-05, - "loss": 0.3625, + "grad_norm": 2.078125, + "learning_rate": 2.2614815884300073e-05, + "loss": 0.4957, "step": 12375 }, { "epoch": 2.980259990370727, - "grad_norm": 1.0234375, - "learning_rate": 5.67167373384193e-05, - "loss": 0.3688, + "grad_norm": 1.859375, + "learning_rate": 2.261388949705115e-05, + "loss": 0.4946, "step": 12380 }, { "epoch": 2.9814636494944633, - "grad_norm": 0.96484375, - "learning_rate": 5.671454976746596e-05, - "loss": 0.385, + "grad_norm": 1.8046875, + "learning_rate": 2.261301727678378e-05, + "loss": 0.5173, "step": 12385 }, { "epoch": 2.982667308618199, - "grad_norm": 0.83203125, - "learning_rate": 5.6712498053438323e-05, - "loss": 0.3662, + "grad_norm": 1.5546875, + "learning_rate": 2.2612199224891555e-05, + "loss": 0.4943, "step": 12390 }, { "epoch": 2.9838709677419355, - "grad_norm": 0.95703125, - "learning_rate": 5.671058219961449e-05, - "loss": 0.3962, + "grad_norm": 1.5703125, + "learning_rate": 2.2611435342681508e-05, + "loss": 0.5252, "step": 12395 }, { "epoch": 2.9850746268656714, - "grad_norm": 0.921875, - "learning_rate": 5.670880220905551e-05, - "loss": 0.3836, + "grad_norm": 1.703125, + "learning_rate": 2.2610725631374132e-05, + "loss": 0.512, "step": 12400 }, { "epoch": 2.9862782859894077, - "grad_norm": 0.9453125, - "learning_rate": 5.6707158084605354e-05, - "loss": 0.3612, + "grad_norm": 1.7578125, + "learning_rate": 2.261007009210336e-05, + "loss": 0.4868, "step": 12405 }, { "epoch": 2.987481945113144, - "grad_norm": 0.953125, - "learning_rate": 5.670564982889091e-05, - "loss": 0.3875, + "grad_norm": 1.7734375, + "learning_rate": 2.2609468725916584e-05, + "loss": 0.5164, "step": 12410 }, { "epoch": 2.9886856042368803, - "grad_norm": 0.93359375, - "learning_rate": 5.670427744432197e-05, - "loss": 0.3711, + "grad_norm": 1.75, + "learning_rate": 2.2608921533774617e-05, + "loss": 0.4966, "step": 12415 }, { "epoch": 2.9898892633606162, - "grad_norm": 0.953125, - "learning_rate": 5.670304093309127e-05, - "loss": 0.3813, + "grad_norm": 1.7265625, + "learning_rate": 2.260842851655175e-05, + "loss": 0.5107, "step": 12420 }, { "epoch": 2.9910929224843525, - "grad_norm": 0.9453125, - "learning_rate": 5.670194029717444e-05, - "loss": 0.384, + "grad_norm": 1.8359375, + "learning_rate": 2.2607989675035685e-05, + "loss": 0.5109, "step": 12425 }, { "epoch": 2.9922965816080884, - "grad_norm": 0.90625, - "learning_rate": 5.670097553832999e-05, - "loss": 0.3819, + "grad_norm": 1.6796875, + "learning_rate": 2.260760500992758e-05, + "loss": 0.5137, "step": 12430 }, { "epoch": 2.9935002407318247, - "grad_norm": 0.90234375, - "learning_rate": 5.670014665809939e-05, - "loss": 0.3667, + "grad_norm": 1.640625, + "learning_rate": 2.260727452184204e-05, + "loss": 0.4969, "step": 12435 }, { "epoch": 2.994703899855561, - "grad_norm": 0.98046875, - "learning_rate": 5.669945365780695e-05, - "loss": 0.3959, + "grad_norm": 1.8125, + "learning_rate": 2.2606998211307087e-05, + "loss": 0.5345, "step": 12440 }, { "epoch": 2.995907558979297, - "grad_norm": 0.9609375, - "learning_rate": 5.6698896538559915e-05, - "loss": 0.3735, + "grad_norm": 1.71875, + "learning_rate": 2.2606776078764204e-05, + "loss": 0.5075, "step": 12445 }, { "epoch": 2.9971112181030333, - "grad_norm": 1.0390625, - "learning_rate": 5.669847530124844e-05, - "loss": 0.3959, + "grad_norm": 1.8515625, + "learning_rate": 2.26066081245683e-05, + "loss": 0.5224, "step": 12450 }, { "epoch": 2.998314877226769, - "grad_norm": 0.9375, - "learning_rate": 5.6698189946545524e-05, - "loss": 0.3593, + "grad_norm": 1.7421875, + "learning_rate": 2.2606494348987723e-05, + "loss": 0.4898, "step": 12455 }, { "epoch": 2.999277804525758, - "eval_loss": 0.34470242261886597, - "eval_runtime": 2.3087, - "eval_samples_per_second": 86.628, - "eval_steps_per_second": 86.628, + "eval_loss": 0.4379862844944, + "eval_runtime": 2.3661, + "eval_samples_per_second": 84.528, + "eval_steps_per_second": 84.528, "step": 12459 } ],