diff --git "a/model/trainer_state.json" "b/model/trainer_state.json" new file mode 100644--- /dev/null +++ "b/model/trainer_state.json" @@ -0,0 +1,63106 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.748186255111463, + "eval_steps": 2000, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005276348766653475, + "grad_norm": NaN, + "learning_rate": 5.275652862041678e-09, + "loss": 3.2006, + "step": 2 + }, + { + "epoch": 0.001055269753330695, + "grad_norm": 3.9530160427093506, + "learning_rate": 1.0551305724083356e-08, + "loss": 3.2016, + "step": 4 + }, + { + "epoch": 0.0015829046299960427, + "grad_norm": 4.264596462249756, + "learning_rate": 2.1102611448166713e-08, + "loss": 3.1927, + "step": 6 + }, + { + "epoch": 0.00211053950666139, + "grad_norm": 4.244070529937744, + "learning_rate": 3.165391717225007e-08, + "loss": 3.2025, + "step": 8 + }, + { + "epoch": 0.002638174383326738, + "grad_norm": 3.9196221828460693, + "learning_rate": 4.2205222896333426e-08, + "loss": 3.1984, + "step": 10 + }, + { + "epoch": 0.0031658092599920855, + "grad_norm": 3.8282294273376465, + "learning_rate": 5.275652862041678e-08, + "loss": 3.1986, + "step": 12 + }, + { + "epoch": 0.003693444136657433, + "grad_norm": NaN, + "learning_rate": 6.330783434450014e-08, + "loss": 3.1978, + "step": 14 + }, + { + "epoch": 0.00422107901332278, + "grad_norm": 3.942249059677124, + "learning_rate": 6.858348720654182e-08, + "loss": 3.2025, + "step": 16 + }, + { + "epoch": 0.004748713889988128, + "grad_norm": 4.207074165344238, + "learning_rate": 7.913479293062517e-08, + "loss": 3.1949, + "step": 18 + }, + { + "epoch": 0.005276348766653476, + "grad_norm": 7.466555118560791, + "learning_rate": 8.968609865470853e-08, + "loss": 3.1973, + "step": 20 + }, + { + "epoch": 0.005803983643318823, + "grad_norm": 11.214427947998047, + "learning_rate": 1.0023740437879189e-07, + "loss": 3.1993, + "step": 22 + }, + { + "epoch": 0.006331618519984171, + "grad_norm": 5.270843029022217, + "learning_rate": 1.1078871010287524e-07, + "loss": 3.191, + "step": 24 + }, + { + "epoch": 0.006859253396649519, + "grad_norm": Infinity, + "learning_rate": 1.2134001582695859e-07, + "loss": 3.1879, + "step": 26 + }, + { + "epoch": 0.007386888273314866, + "grad_norm": 3.710148811340332, + "learning_rate": 1.2661566868900028e-07, + "loss": 3.1909, + "step": 28 + }, + { + "epoch": 0.007914523149980214, + "grad_norm": 5.143282890319824, + "learning_rate": 1.3716697441308363e-07, + "loss": 3.1956, + "step": 30 + }, + { + "epoch": 0.00844215802664556, + "grad_norm": 4.7810163497924805, + "learning_rate": 1.47718280137167e-07, + "loss": 3.1919, + "step": 32 + }, + { + "epoch": 0.00896979290331091, + "grad_norm": 3.9726290702819824, + "learning_rate": 1.5826958586125035e-07, + "loss": 3.1998, + "step": 34 + }, + { + "epoch": 0.009497427779976256, + "grad_norm": 4.133223533630371, + "learning_rate": 1.688208915853337e-07, + "loss": 3.1966, + "step": 36 + }, + { + "epoch": 0.010025062656641603, + "grad_norm": 4.447741508483887, + "learning_rate": 1.7937219730941706e-07, + "loss": 3.1945, + "step": 38 + }, + { + "epoch": 0.010552697533306952, + "grad_norm": 3.7874393463134766, + "learning_rate": 1.8992350303350042e-07, + "loss": 3.1906, + "step": 40 + }, + { + "epoch": 0.0110803324099723, + "grad_norm": 3.7980053424835205, + "learning_rate": 2.0047480875758377e-07, + "loss": 3.1934, + "step": 42 + }, + { + "epoch": 0.011607967286637646, + "grad_norm": 3.2823562622070312, + "learning_rate": 2.1102611448166713e-07, + "loss": 3.1861, + "step": 44 + }, + { + "epoch": 0.012135602163302995, + "grad_norm": 10.366581916809082, + "learning_rate": 2.2157742020575048e-07, + "loss": 3.1867, + "step": 46 + }, + { + "epoch": 0.012663237039968342, + "grad_norm": 3.9421546459198, + "learning_rate": 2.3212872592983381e-07, + "loss": 3.1816, + "step": 48 + }, + { + "epoch": 0.013190871916633689, + "grad_norm": 3.322040557861328, + "learning_rate": 2.4268003165391717e-07, + "loss": 3.1818, + "step": 50 + }, + { + "epoch": 0.013718506793299038, + "grad_norm": 5.72543478012085, + "learning_rate": 2.5323133737800055e-07, + "loss": 3.1862, + "step": 52 + }, + { + "epoch": 0.014246141669964385, + "grad_norm": 3.9358532428741455, + "learning_rate": 2.637826431020839e-07, + "loss": 3.1866, + "step": 54 + }, + { + "epoch": 0.014773776546629732, + "grad_norm": 4.022729873657227, + "learning_rate": 2.7433394882616727e-07, + "loss": 3.1781, + "step": 56 + }, + { + "epoch": 0.01530141142329508, + "grad_norm": 3.590331792831421, + "learning_rate": 2.848852545502506e-07, + "loss": 3.181, + "step": 58 + }, + { + "epoch": 0.015829046299960427, + "grad_norm": 3.8149402141571045, + "learning_rate": 2.95436560274334e-07, + "loss": 3.1795, + "step": 60 + }, + { + "epoch": 0.016356681176625774, + "grad_norm": 3.8646979331970215, + "learning_rate": 3.0598786599841736e-07, + "loss": 3.1703, + "step": 62 + }, + { + "epoch": 0.01688431605329112, + "grad_norm": 3.699587106704712, + "learning_rate": 3.165391717225007e-07, + "loss": 3.1728, + "step": 64 + }, + { + "epoch": 0.01741195092995647, + "grad_norm": 29.707883834838867, + "learning_rate": 3.270904774465841e-07, + "loss": 3.1773, + "step": 66 + }, + { + "epoch": 0.01793958580662182, + "grad_norm": 10.552943229675293, + "learning_rate": 3.376417831706674e-07, + "loss": 3.1732, + "step": 68 + }, + { + "epoch": 0.018467220683287166, + "grad_norm": 4.929538249969482, + "learning_rate": 3.4819308889475074e-07, + "loss": 3.1736, + "step": 70 + }, + { + "epoch": 0.018994855559952513, + "grad_norm": 4.374993324279785, + "learning_rate": 3.587443946188341e-07, + "loss": 3.174, + "step": 72 + }, + { + "epoch": 0.01952249043661786, + "grad_norm": 3.528315782546997, + "learning_rate": 3.692957003429175e-07, + "loss": 3.1652, + "step": 74 + }, + { + "epoch": 0.020050125313283207, + "grad_norm": 3.519756555557251, + "learning_rate": 3.7984700606700083e-07, + "loss": 3.1625, + "step": 76 + }, + { + "epoch": 0.020577760189948554, + "grad_norm": 4.270632743835449, + "learning_rate": 3.9039831179108416e-07, + "loss": 3.1622, + "step": 78 + }, + { + "epoch": 0.021105395066613904, + "grad_norm": 37.94541549682617, + "learning_rate": 4.0094961751516754e-07, + "loss": 3.1582, + "step": 80 + }, + { + "epoch": 0.02163302994327925, + "grad_norm": 4.269265174865723, + "learning_rate": 4.115009232392509e-07, + "loss": 3.1579, + "step": 82 + }, + { + "epoch": 0.0221606648199446, + "grad_norm": 3.752579689025879, + "learning_rate": 4.2205222896333426e-07, + "loss": 3.1577, + "step": 84 + }, + { + "epoch": 0.022688299696609945, + "grad_norm": 10.902413368225098, + "learning_rate": 4.326035346874176e-07, + "loss": 3.1558, + "step": 86 + }, + { + "epoch": 0.023215934573275292, + "grad_norm": 8.729467391967773, + "learning_rate": 4.4315484041150097e-07, + "loss": 3.1499, + "step": 88 + }, + { + "epoch": 0.02374356944994064, + "grad_norm": 5.433394432067871, + "learning_rate": 4.5370614613558435e-07, + "loss": 3.1526, + "step": 90 + }, + { + "epoch": 0.02427120432660599, + "grad_norm": 3.9260177612304688, + "learning_rate": 4.6425745185966763e-07, + "loss": 3.1502, + "step": 92 + }, + { + "epoch": 0.024798839203271337, + "grad_norm": 4.3736066818237305, + "learning_rate": 4.74808757583751e-07, + "loss": 3.1433, + "step": 94 + }, + { + "epoch": 0.025326474079936684, + "grad_norm": 7.156918048858643, + "learning_rate": 4.853600633078343e-07, + "loss": 3.1467, + "step": 96 + }, + { + "epoch": 0.02585410895660203, + "grad_norm": 3.7075700759887695, + "learning_rate": 4.959113690319177e-07, + "loss": 3.1379, + "step": 98 + }, + { + "epoch": 0.026381743833267378, + "grad_norm": 4.034582614898682, + "learning_rate": 5.064626747560011e-07, + "loss": 3.1393, + "step": 100 + }, + { + "epoch": 0.026909378709932725, + "grad_norm": 3.9551854133605957, + "learning_rate": 5.170139804800845e-07, + "loss": 3.1364, + "step": 102 + }, + { + "epoch": 0.027437013586598075, + "grad_norm": 109.4498062133789, + "learning_rate": 5.222896333421262e-07, + "loss": 3.1318, + "step": 104 + }, + { + "epoch": 0.027964648463263422, + "grad_norm": 4.101171016693115, + "learning_rate": 5.328409390662095e-07, + "loss": 3.1285, + "step": 106 + }, + { + "epoch": 0.02849228333992877, + "grad_norm": 6.840890884399414, + "learning_rate": 5.433922447902928e-07, + "loss": 3.1243, + "step": 108 + }, + { + "epoch": 0.029019918216594116, + "grad_norm": 3.6588923931121826, + "learning_rate": 5.539435505143762e-07, + "loss": 3.1227, + "step": 110 + }, + { + "epoch": 0.029547553093259463, + "grad_norm": NaN, + "learning_rate": 5.644948562384595e-07, + "loss": 3.1264, + "step": 112 + }, + { + "epoch": 0.03007518796992481, + "grad_norm": 3.855950355529785, + "learning_rate": 5.697705091005012e-07, + "loss": 3.12, + "step": 114 + }, + { + "epoch": 0.03060282284659016, + "grad_norm": 6.50042200088501, + "learning_rate": 5.803218148245846e-07, + "loss": 3.1167, + "step": 116 + }, + { + "epoch": 0.031130457723255508, + "grad_norm": 3.3859012126922607, + "learning_rate": 5.90873120548668e-07, + "loss": 3.1166, + "step": 118 + }, + { + "epoch": 0.031658092599920855, + "grad_norm": 6.513570308685303, + "learning_rate": 6.014244262727513e-07, + "loss": 3.1071, + "step": 120 + }, + { + "epoch": 0.032185727476586205, + "grad_norm": 6.194601058959961, + "learning_rate": 6.119757319968347e-07, + "loss": 3.1025, + "step": 122 + }, + { + "epoch": 0.03271336235325155, + "grad_norm": 3.9673354625701904, + "learning_rate": 6.22527037720918e-07, + "loss": 3.0892, + "step": 124 + }, + { + "epoch": 0.0332409972299169, + "grad_norm": 4.571136474609375, + "learning_rate": 6.330783434450014e-07, + "loss": 3.0891, + "step": 126 + }, + { + "epoch": 0.03376863210658224, + "grad_norm": 20.535213470458984, + "learning_rate": 6.436296491690847e-07, + "loss": 3.0964, + "step": 128 + }, + { + "epoch": 0.03429626698324759, + "grad_norm": 34.55583572387695, + "learning_rate": 6.541809548931682e-07, + "loss": 3.0872, + "step": 130 + }, + { + "epoch": 0.03482390185991294, + "grad_norm": 5.673486232757568, + "learning_rate": 6.647322606172514e-07, + "loss": 3.0773, + "step": 132 + }, + { + "epoch": 0.03535153673657829, + "grad_norm": 13.078265190124512, + "learning_rate": 6.752835663413348e-07, + "loss": 3.0729, + "step": 134 + }, + { + "epoch": 0.03587917161324364, + "grad_norm": 3.795184373855591, + "learning_rate": 6.858348720654182e-07, + "loss": 3.0707, + "step": 136 + }, + { + "epoch": 0.03640680648990898, + "grad_norm": 5.359780311584473, + "learning_rate": 6.963861777895015e-07, + "loss": 3.0662, + "step": 138 + }, + { + "epoch": 0.03693444136657433, + "grad_norm": 3.4898064136505127, + "learning_rate": 7.069374835135849e-07, + "loss": 3.0759, + "step": 140 + }, + { + "epoch": 0.037462076243239675, + "grad_norm": 5.100823402404785, + "learning_rate": 7.174887892376682e-07, + "loss": 3.0532, + "step": 142 + }, + { + "epoch": 0.037989711119905026, + "grad_norm": 4.224725723266602, + "learning_rate": 7.280400949617515e-07, + "loss": 3.0482, + "step": 144 + }, + { + "epoch": 0.038517345996570376, + "grad_norm": 3.647191047668457, + "learning_rate": 7.38591400685835e-07, + "loss": 3.0468, + "step": 146 + }, + { + "epoch": 0.03904498087323572, + "grad_norm": 3.6561899185180664, + "learning_rate": 7.491427064099183e-07, + "loss": 3.0309, + "step": 148 + }, + { + "epoch": 0.03957261574990107, + "grad_norm": 3.6110692024230957, + "learning_rate": 7.596940121340017e-07, + "loss": 3.0265, + "step": 150 + }, + { + "epoch": 0.040100250626566414, + "grad_norm": 12.14970874786377, + "learning_rate": 7.70245317858085e-07, + "loss": 3.0362, + "step": 152 + }, + { + "epoch": 0.040627885503231764, + "grad_norm": 4.078395366668701, + "learning_rate": 7.807966235821683e-07, + "loss": 2.997, + "step": 154 + }, + { + "epoch": 0.04115552037989711, + "grad_norm": 4.122748851776123, + "learning_rate": 7.913479293062518e-07, + "loss": 2.9984, + "step": 156 + }, + { + "epoch": 0.04168315525656246, + "grad_norm": 5.756165027618408, + "learning_rate": 8.018992350303351e-07, + "loss": 3.0043, + "step": 158 + }, + { + "epoch": 0.04221079013322781, + "grad_norm": 10.111802101135254, + "learning_rate": 8.124505407544184e-07, + "loss": 2.975, + "step": 160 + }, + { + "epoch": 0.04273842500989315, + "grad_norm": 4.273674488067627, + "learning_rate": 8.230018464785019e-07, + "loss": 2.9686, + "step": 162 + }, + { + "epoch": 0.0432660598865585, + "grad_norm": 4.078615665435791, + "learning_rate": 8.335531522025851e-07, + "loss": 2.9599, + "step": 164 + }, + { + "epoch": 0.043793694763223846, + "grad_norm": 3.9134085178375244, + "learning_rate": 8.441044579266685e-07, + "loss": 2.9696, + "step": 166 + }, + { + "epoch": 0.0443213296398892, + "grad_norm": 3.8212637901306152, + "learning_rate": 8.546557636507519e-07, + "loss": 2.9613, + "step": 168 + }, + { + "epoch": 0.04484896451655455, + "grad_norm": 8.7020902633667, + "learning_rate": 8.652070693748352e-07, + "loss": 2.918, + "step": 170 + }, + { + "epoch": 0.04537659939321989, + "grad_norm": 3.957463264465332, + "learning_rate": 8.757583750989187e-07, + "loss": 2.9207, + "step": 172 + }, + { + "epoch": 0.04590423426988524, + "grad_norm": 10.978931427001953, + "learning_rate": 8.863096808230019e-07, + "loss": 2.9051, + "step": 174 + }, + { + "epoch": 0.046431869146550585, + "grad_norm": 26.37824821472168, + "learning_rate": 8.968609865470852e-07, + "loss": 2.9003, + "step": 176 + }, + { + "epoch": 0.046959504023215935, + "grad_norm": 19.916807174682617, + "learning_rate": 9.074122922711687e-07, + "loss": 2.8755, + "step": 178 + }, + { + "epoch": 0.04748713889988128, + "grad_norm": 5.642795562744141, + "learning_rate": 9.17963597995252e-07, + "loss": 2.8431, + "step": 180 + }, + { + "epoch": 0.04801477377654663, + "grad_norm": 5.4845356941223145, + "learning_rate": 9.285149037193353e-07, + "loss": 2.8225, + "step": 182 + }, + { + "epoch": 0.04854240865321198, + "grad_norm": 7.507856845855713, + "learning_rate": 9.390662094434187e-07, + "loss": 2.8141, + "step": 184 + }, + { + "epoch": 0.04907004352987732, + "grad_norm": 33.270015716552734, + "learning_rate": 9.49617515167502e-07, + "loss": 2.7856, + "step": 186 + }, + { + "epoch": 0.049597678406542674, + "grad_norm": 7.942454814910889, + "learning_rate": 9.601688208915855e-07, + "loss": 2.7463, + "step": 188 + }, + { + "epoch": 0.05012531328320802, + "grad_norm": 16.594919204711914, + "learning_rate": 9.707201266156687e-07, + "loss": 2.7034, + "step": 190 + }, + { + "epoch": 0.05065294815987337, + "grad_norm": 270.3727111816406, + "learning_rate": 9.81271432339752e-07, + "loss": 2.6307, + "step": 192 + }, + { + "epoch": 0.05118058303653872, + "grad_norm": 12.470280647277832, + "learning_rate": 9.918227380638354e-07, + "loss": 2.571, + "step": 194 + }, + { + "epoch": 0.05170821791320406, + "grad_norm": 10.462482452392578, + "learning_rate": 1.0023740437879188e-06, + "loss": 2.5615, + "step": 196 + }, + { + "epoch": 0.05223585278986941, + "grad_norm": 12.797545433044434, + "learning_rate": 1.0129253495120022e-06, + "loss": 2.482, + "step": 198 + }, + { + "epoch": 0.052763487666534756, + "grad_norm": 15.189107894897461, + "learning_rate": 1.0234766552360856e-06, + "loss": 2.4752, + "step": 200 + }, + { + "epoch": 0.053291122543200106, + "grad_norm": 11.92821979522705, + "learning_rate": 1.034027960960169e-06, + "loss": 2.463, + "step": 202 + }, + { + "epoch": 0.05381875741986545, + "grad_norm": 18.217239379882812, + "learning_rate": 1.0445792666842524e-06, + "loss": 2.4031, + "step": 204 + }, + { + "epoch": 0.0543463922965308, + "grad_norm": 33.67808151245117, + "learning_rate": 1.0551305724083355e-06, + "loss": 2.3756, + "step": 206 + }, + { + "epoch": 0.05487402717319615, + "grad_norm": 10.945366859436035, + "learning_rate": 1.065681878132419e-06, + "loss": 2.3163, + "step": 208 + }, + { + "epoch": 0.055401662049861494, + "grad_norm": 11.510740280151367, + "learning_rate": 1.0762331838565023e-06, + "loss": 2.3483, + "step": 210 + }, + { + "epoch": 0.055929296926526845, + "grad_norm": 13.849950790405273, + "learning_rate": 1.0867844895805857e-06, + "loss": 2.3163, + "step": 212 + }, + { + "epoch": 0.05645693180319219, + "grad_norm": 12.38608455657959, + "learning_rate": 1.097335795304669e-06, + "loss": 2.3071, + "step": 214 + }, + { + "epoch": 0.05698456667985754, + "grad_norm": 9.628887176513672, + "learning_rate": 1.1078871010287524e-06, + "loss": 2.2446, + "step": 216 + }, + { + "epoch": 0.05751220155652289, + "grad_norm": 9.188224792480469, + "learning_rate": 1.1184384067528358e-06, + "loss": 2.2208, + "step": 218 + }, + { + "epoch": 0.05803983643318823, + "grad_norm": 9.43463134765625, + "learning_rate": 1.128989712476919e-06, + "loss": 2.2396, + "step": 220 + }, + { + "epoch": 0.05856747130985358, + "grad_norm": 49.1281852722168, + "learning_rate": 1.1395410182010024e-06, + "loss": 2.1283, + "step": 222 + }, + { + "epoch": 0.05909510618651893, + "grad_norm": 7.73472785949707, + "learning_rate": 1.1500923239250858e-06, + "loss": 2.1879, + "step": 224 + }, + { + "epoch": 0.05962274106318428, + "grad_norm": 24.536495208740234, + "learning_rate": 1.1606436296491692e-06, + "loss": 2.2139, + "step": 226 + }, + { + "epoch": 0.06015037593984962, + "grad_norm": 7.804764747619629, + "learning_rate": 1.1711949353732525e-06, + "loss": 2.1334, + "step": 228 + }, + { + "epoch": 0.06067801081651497, + "grad_norm": 18.661983489990234, + "learning_rate": 1.181746241097336e-06, + "loss": 2.034, + "step": 230 + }, + { + "epoch": 0.06120564569318032, + "grad_norm": 134.84848022460938, + "learning_rate": 1.1922975468214193e-06, + "loss": 2.098, + "step": 232 + }, + { + "epoch": 0.061733280569845665, + "grad_norm": 9.494499206542969, + "learning_rate": 1.2028488525455027e-06, + "loss": 2.053, + "step": 234 + }, + { + "epoch": 0.062260915446511016, + "grad_norm": 11.574651718139648, + "learning_rate": 1.2134001582695859e-06, + "loss": 2.0844, + "step": 236 + }, + { + "epoch": 0.06278855032317636, + "grad_norm": 14.940314292907715, + "learning_rate": 1.2239514639936694e-06, + "loss": 2.0156, + "step": 238 + }, + { + "epoch": 0.06331618519984171, + "grad_norm": 7.003679275512695, + "learning_rate": 1.2345027697177526e-06, + "loss": 2.0076, + "step": 240 + }, + { + "epoch": 0.06384382007650706, + "grad_norm": 20.857261657714844, + "learning_rate": 1.245054075441836e-06, + "loss": 1.9754, + "step": 242 + }, + { + "epoch": 0.06437145495317241, + "grad_norm": 6.828178405761719, + "learning_rate": 1.2556053811659194e-06, + "loss": 2.0036, + "step": 244 + }, + { + "epoch": 0.06489908982983775, + "grad_norm": 8.601447105407715, + "learning_rate": 1.2661566868900028e-06, + "loss": 1.9687, + "step": 246 + }, + { + "epoch": 0.0654267247065031, + "grad_norm": 4.443185806274414, + "learning_rate": 1.276707992614086e-06, + "loss": 1.9146, + "step": 248 + }, + { + "epoch": 0.06595435958316845, + "grad_norm": 11.137971878051758, + "learning_rate": 1.2872592983381693e-06, + "loss": 1.966, + "step": 250 + }, + { + "epoch": 0.0664819944598338, + "grad_norm": 10.319031715393066, + "learning_rate": 1.297810604062253e-06, + "loss": 1.9568, + "step": 252 + }, + { + "epoch": 0.06700962933649915, + "grad_norm": 9.606258392333984, + "learning_rate": 1.3083619097863363e-06, + "loss": 1.9113, + "step": 254 + }, + { + "epoch": 0.06753726421316449, + "grad_norm": 7.928279399871826, + "learning_rate": 1.3189132155104195e-06, + "loss": 1.8344, + "step": 256 + }, + { + "epoch": 0.06806489908982984, + "grad_norm": 355.3827819824219, + "learning_rate": 1.3294645212345029e-06, + "loss": 1.8391, + "step": 258 + }, + { + "epoch": 0.06859253396649519, + "grad_norm": 4.054043292999268, + "learning_rate": 1.340015826958586e-06, + "loss": 1.8305, + "step": 260 + }, + { + "epoch": 0.06912016884316054, + "grad_norm": 54.05241012573242, + "learning_rate": 1.3505671326826696e-06, + "loss": 1.7938, + "step": 262 + }, + { + "epoch": 0.06964780371982587, + "grad_norm": 5.801769733428955, + "learning_rate": 1.361118438406753e-06, + "loss": 1.8346, + "step": 264 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 9.447383880615234, + "learning_rate": 1.3716697441308364e-06, + "loss": 1.8782, + "step": 266 + }, + { + "epoch": 0.07070307347315657, + "grad_norm": 6.370567321777344, + "learning_rate": 1.3822210498549196e-06, + "loss": 1.78, + "step": 268 + }, + { + "epoch": 0.07123070834982193, + "grad_norm": 9.35560417175293, + "learning_rate": 1.392772355579003e-06, + "loss": 1.7657, + "step": 270 + }, + { + "epoch": 0.07175834322648728, + "grad_norm": 3.827622175216675, + "learning_rate": 1.4033236613030865e-06, + "loss": 1.7361, + "step": 272 + }, + { + "epoch": 0.07228597810315261, + "grad_norm": 37.69013977050781, + "learning_rate": 1.4138749670271697e-06, + "loss": 1.7581, + "step": 274 + }, + { + "epoch": 0.07281361297981796, + "grad_norm": 9.937688827514648, + "learning_rate": 1.424426272751253e-06, + "loss": 1.8304, + "step": 276 + }, + { + "epoch": 0.07334124785648331, + "grad_norm": 3.5308496952056885, + "learning_rate": 1.4349775784753365e-06, + "loss": 1.7376, + "step": 278 + }, + { + "epoch": 0.07386888273314866, + "grad_norm": 20.05885124206543, + "learning_rate": 1.4455288841994196e-06, + "loss": 1.8685, + "step": 280 + }, + { + "epoch": 0.07439651760981401, + "grad_norm": 4.19755220413208, + "learning_rate": 1.456080189923503e-06, + "loss": 1.7394, + "step": 282 + }, + { + "epoch": 0.07492415248647935, + "grad_norm": 4.371680736541748, + "learning_rate": 1.4666314956475866e-06, + "loss": 1.7925, + "step": 284 + }, + { + "epoch": 0.0754517873631447, + "grad_norm": 6.752421855926514, + "learning_rate": 1.47718280137167e-06, + "loss": 1.7954, + "step": 286 + }, + { + "epoch": 0.07597942223981005, + "grad_norm": 3.809807538986206, + "learning_rate": 1.4877341070957532e-06, + "loss": 1.8082, + "step": 288 + }, + { + "epoch": 0.0765070571164754, + "grad_norm": 6.299380302429199, + "learning_rate": 1.4982854128198366e-06, + "loss": 1.8125, + "step": 290 + }, + { + "epoch": 0.07703469199314075, + "grad_norm": 16.563379287719727, + "learning_rate": 1.5088367185439197e-06, + "loss": 1.8433, + "step": 292 + }, + { + "epoch": 0.07756232686980609, + "grad_norm": 5.329596519470215, + "learning_rate": 1.5193880242680033e-06, + "loss": 1.7686, + "step": 294 + }, + { + "epoch": 0.07808996174647144, + "grad_norm": 8.741602897644043, + "learning_rate": 1.5299393299920867e-06, + "loss": 1.7455, + "step": 296 + }, + { + "epoch": 0.07861759662313679, + "grad_norm": 27.244760513305664, + "learning_rate": 1.54049063571617e-06, + "loss": 1.7227, + "step": 298 + }, + { + "epoch": 0.07914523149980214, + "grad_norm": 10.486551284790039, + "learning_rate": 1.5510419414402533e-06, + "loss": 1.7092, + "step": 300 + }, + { + "epoch": 0.07967286637646749, + "grad_norm": 3.2602686882019043, + "learning_rate": 1.5615932471643366e-06, + "loss": 1.7452, + "step": 302 + }, + { + "epoch": 0.08020050125313283, + "grad_norm": 3.6072781085968018, + "learning_rate": 1.57214455288842e-06, + "loss": 1.79, + "step": 304 + }, + { + "epoch": 0.08072813612979818, + "grad_norm": 3.782972812652588, + "learning_rate": 1.5826958586125036e-06, + "loss": 1.7056, + "step": 306 + }, + { + "epoch": 0.08125577100646353, + "grad_norm": 6.9960246086120605, + "learning_rate": 1.5932471643365868e-06, + "loss": 1.7912, + "step": 308 + }, + { + "epoch": 0.08178340588312888, + "grad_norm": 4.737551689147949, + "learning_rate": 1.6037984700606702e-06, + "loss": 1.7585, + "step": 310 + }, + { + "epoch": 0.08231104075979422, + "grad_norm": 11.377225875854492, + "learning_rate": 1.6143497757847533e-06, + "loss": 1.6678, + "step": 312 + }, + { + "epoch": 0.08283867563645957, + "grad_norm": 19.506385803222656, + "learning_rate": 1.6249010815088367e-06, + "loss": 1.7232, + "step": 314 + }, + { + "epoch": 0.08336631051312492, + "grad_norm": 4.979662895202637, + "learning_rate": 1.6354523872329203e-06, + "loss": 1.6056, + "step": 316 + }, + { + "epoch": 0.08389394538979027, + "grad_norm": 5.651816368103027, + "learning_rate": 1.6460036929570037e-06, + "loss": 1.6662, + "step": 318 + }, + { + "epoch": 0.08442158026645562, + "grad_norm": 2.3985848426818848, + "learning_rate": 1.6565549986810869e-06, + "loss": 1.6362, + "step": 320 + }, + { + "epoch": 0.08494921514312095, + "grad_norm": 9.86822509765625, + "learning_rate": 1.6671063044051703e-06, + "loss": 1.6907, + "step": 322 + }, + { + "epoch": 0.0854768500197863, + "grad_norm": 13.873213768005371, + "learning_rate": 1.6776576101292536e-06, + "loss": 1.7455, + "step": 324 + }, + { + "epoch": 0.08600448489645166, + "grad_norm": 10.133545875549316, + "learning_rate": 1.688208915853337e-06, + "loss": 1.7055, + "step": 326 + }, + { + "epoch": 0.086532119773117, + "grad_norm": 40.831443786621094, + "learning_rate": 1.6987602215774204e-06, + "loss": 1.6672, + "step": 328 + }, + { + "epoch": 0.08705975464978236, + "grad_norm": 3.7306265830993652, + "learning_rate": 1.7093115273015038e-06, + "loss": 1.6786, + "step": 330 + }, + { + "epoch": 0.08758738952644769, + "grad_norm": 3.137363910675049, + "learning_rate": 1.719862833025587e-06, + "loss": 1.6882, + "step": 332 + }, + { + "epoch": 0.08811502440311304, + "grad_norm": 2.7168731689453125, + "learning_rate": 1.7304141387496703e-06, + "loss": 1.6841, + "step": 334 + }, + { + "epoch": 0.0886426592797784, + "grad_norm": 18.376760482788086, + "learning_rate": 1.7409654444737537e-06, + "loss": 1.6963, + "step": 336 + }, + { + "epoch": 0.08917029415644374, + "grad_norm": 1.890999674797058, + "learning_rate": 1.7515167501978373e-06, + "loss": 1.6207, + "step": 338 + }, + { + "epoch": 0.0896979290331091, + "grad_norm": 1.4951006174087524, + "learning_rate": 1.7620680559219205e-06, + "loss": 1.6304, + "step": 340 + }, + { + "epoch": 0.09022556390977443, + "grad_norm": 3.687925100326538, + "learning_rate": 1.7726193616460039e-06, + "loss": 1.6165, + "step": 342 + }, + { + "epoch": 0.09075319878643978, + "grad_norm": 27.191234588623047, + "learning_rate": 1.783170667370087e-06, + "loss": 1.6258, + "step": 344 + }, + { + "epoch": 0.09128083366310513, + "grad_norm": 10.416647911071777, + "learning_rate": 1.7937219730941704e-06, + "loss": 1.6175, + "step": 346 + }, + { + "epoch": 0.09180846853977048, + "grad_norm": 2.556605577468872, + "learning_rate": 1.804273278818254e-06, + "loss": 1.6309, + "step": 348 + }, + { + "epoch": 0.09233610341643583, + "grad_norm": 2.95297908782959, + "learning_rate": 1.8148245845423374e-06, + "loss": 1.6299, + "step": 350 + }, + { + "epoch": 0.09286373829310117, + "grad_norm": 3.305846691131592, + "learning_rate": 1.8253758902664206e-06, + "loss": 1.6792, + "step": 352 + }, + { + "epoch": 0.09339137316976652, + "grad_norm": 1.5464898347854614, + "learning_rate": 1.835927195990504e-06, + "loss": 1.6856, + "step": 354 + }, + { + "epoch": 0.09391900804643187, + "grad_norm": 2.6239209175109863, + "learning_rate": 1.8464785017145873e-06, + "loss": 1.6429, + "step": 356 + }, + { + "epoch": 0.09444664292309722, + "grad_norm": 7.56057596206665, + "learning_rate": 1.8570298074386705e-06, + "loss": 1.5169, + "step": 358 + }, + { + "epoch": 0.09497427779976256, + "grad_norm": 4.235796928405762, + "learning_rate": 1.8675811131627541e-06, + "loss": 1.6299, + "step": 360 + }, + { + "epoch": 0.09550191267642791, + "grad_norm": 247.50270080566406, + "learning_rate": 1.8781324188868375e-06, + "loss": 1.56, + "step": 362 + }, + { + "epoch": 0.09602954755309326, + "grad_norm": 5.296196937561035, + "learning_rate": 1.8886837246109207e-06, + "loss": 1.5345, + "step": 364 + }, + { + "epoch": 0.09655718242975861, + "grad_norm": 5.573033809661865, + "learning_rate": 1.899235030335004e-06, + "loss": 1.6738, + "step": 366 + }, + { + "epoch": 0.09708481730642396, + "grad_norm": 1.8204444646835327, + "learning_rate": 1.9097863360590872e-06, + "loss": 1.678, + "step": 368 + }, + { + "epoch": 0.0976124521830893, + "grad_norm": 2.9416329860687256, + "learning_rate": 1.920337641783171e-06, + "loss": 1.5768, + "step": 370 + }, + { + "epoch": 0.09814008705975465, + "grad_norm": 4.211336135864258, + "learning_rate": 1.9308889475072544e-06, + "loss": 1.5899, + "step": 372 + }, + { + "epoch": 0.09866772193642, + "grad_norm": 2.256227970123291, + "learning_rate": 1.9414402532313374e-06, + "loss": 1.6382, + "step": 374 + }, + { + "epoch": 0.09919535681308535, + "grad_norm": 1.497651219367981, + "learning_rate": 1.9519915589554208e-06, + "loss": 1.6308, + "step": 376 + }, + { + "epoch": 0.0997229916897507, + "grad_norm": 3.4664108753204346, + "learning_rate": 1.962542864679504e-06, + "loss": 1.6438, + "step": 378 + }, + { + "epoch": 0.10025062656641603, + "grad_norm": 19.30890655517578, + "learning_rate": 1.9730941704035875e-06, + "loss": 1.5797, + "step": 380 + }, + { + "epoch": 0.10077826144308139, + "grad_norm": 13.07797908782959, + "learning_rate": 1.983645476127671e-06, + "loss": 1.5735, + "step": 382 + }, + { + "epoch": 0.10130589631974674, + "grad_norm": 4.970568656921387, + "learning_rate": 1.9941967818517543e-06, + "loss": 1.546, + "step": 384 + }, + { + "epoch": 0.10183353119641209, + "grad_norm": 13.25097942352295, + "learning_rate": 2.0047480875758377e-06, + "loss": 1.6095, + "step": 386 + }, + { + "epoch": 0.10236116607307744, + "grad_norm": 3.28190016746521, + "learning_rate": 2.015299393299921e-06, + "loss": 1.5175, + "step": 388 + }, + { + "epoch": 0.10288880094974277, + "grad_norm": 5.094784259796143, + "learning_rate": 2.0258506990240044e-06, + "loss": 1.5121, + "step": 390 + }, + { + "epoch": 0.10341643582640812, + "grad_norm": 24.574045181274414, + "learning_rate": 2.036402004748088e-06, + "loss": 1.6044, + "step": 392 + }, + { + "epoch": 0.10394407070307347, + "grad_norm": 2.382647752761841, + "learning_rate": 2.046953310472171e-06, + "loss": 1.4917, + "step": 394 + }, + { + "epoch": 0.10447170557973882, + "grad_norm": 1.8582993745803833, + "learning_rate": 2.0575046161962546e-06, + "loss": 1.6484, + "step": 396 + }, + { + "epoch": 0.10499934045640417, + "grad_norm": 3.1315765380859375, + "learning_rate": 2.068055921920338e-06, + "loss": 1.5342, + "step": 398 + }, + { + "epoch": 0.10552697533306951, + "grad_norm": 2.774212121963501, + "learning_rate": 2.078607227644421e-06, + "loss": 1.5363, + "step": 400 + }, + { + "epoch": 0.10605461020973486, + "grad_norm": 2.7064402103424072, + "learning_rate": 2.0891585333685047e-06, + "loss": 1.4863, + "step": 402 + }, + { + "epoch": 0.10658224508640021, + "grad_norm": 2.168569803237915, + "learning_rate": 2.099709839092588e-06, + "loss": 1.6392, + "step": 404 + }, + { + "epoch": 0.10710987996306556, + "grad_norm": 5.667662620544434, + "learning_rate": 2.110261144816671e-06, + "loss": 1.5225, + "step": 406 + }, + { + "epoch": 0.1076375148397309, + "grad_norm": 1.2962943315505981, + "learning_rate": 2.1208124505407545e-06, + "loss": 1.5485, + "step": 408 + }, + { + "epoch": 0.10816514971639625, + "grad_norm": 5.817214012145996, + "learning_rate": 2.131363756264838e-06, + "loss": 1.5536, + "step": 410 + }, + { + "epoch": 0.1086927845930616, + "grad_norm": 15.700840950012207, + "learning_rate": 2.1419150619889212e-06, + "loss": 1.6034, + "step": 412 + }, + { + "epoch": 0.10922041946972695, + "grad_norm": 2.162670135498047, + "learning_rate": 2.1524663677130046e-06, + "loss": 1.5336, + "step": 414 + }, + { + "epoch": 0.1097480543463923, + "grad_norm": 3.747391939163208, + "learning_rate": 2.163017673437088e-06, + "loss": 1.5711, + "step": 416 + }, + { + "epoch": 0.11027568922305764, + "grad_norm": 3.417733907699585, + "learning_rate": 2.1735689791611714e-06, + "loss": 1.5508, + "step": 418 + }, + { + "epoch": 0.11080332409972299, + "grad_norm": 5.373349189758301, + "learning_rate": 2.1841202848852548e-06, + "loss": 1.4465, + "step": 420 + }, + { + "epoch": 0.11133095897638834, + "grad_norm": 5.1337738037109375, + "learning_rate": 2.194671590609338e-06, + "loss": 1.5474, + "step": 422 + }, + { + "epoch": 0.11185859385305369, + "grad_norm": 2.337843418121338, + "learning_rate": 2.2052228963334215e-06, + "loss": 1.5551, + "step": 424 + }, + { + "epoch": 0.11238622872971904, + "grad_norm": 4.951787948608398, + "learning_rate": 2.215774202057505e-06, + "loss": 1.4947, + "step": 426 + }, + { + "epoch": 0.11291386360638438, + "grad_norm": 1.8533304929733276, + "learning_rate": 2.2263255077815883e-06, + "loss": 1.4673, + "step": 428 + }, + { + "epoch": 0.11344149848304973, + "grad_norm": 16.626604080200195, + "learning_rate": 2.2368768135056717e-06, + "loss": 1.5264, + "step": 430 + }, + { + "epoch": 0.11396913335971508, + "grad_norm": 4.292245864868164, + "learning_rate": 2.2474281192297546e-06, + "loss": 1.4385, + "step": 432 + }, + { + "epoch": 0.11449676823638043, + "grad_norm": 4.345268249511719, + "learning_rate": 2.257979424953838e-06, + "loss": 1.4761, + "step": 434 + }, + { + "epoch": 0.11502440311304578, + "grad_norm": 2.6198110580444336, + "learning_rate": 2.268530730677922e-06, + "loss": 1.4935, + "step": 436 + }, + { + "epoch": 0.11555203798971111, + "grad_norm": 2.181818962097168, + "learning_rate": 2.2790820364020048e-06, + "loss": 1.4325, + "step": 438 + }, + { + "epoch": 0.11607967286637647, + "grad_norm": 7.270115852355957, + "learning_rate": 2.289633342126088e-06, + "loss": 1.489, + "step": 440 + }, + { + "epoch": 0.11660730774304182, + "grad_norm": 2.8824009895324707, + "learning_rate": 2.3001846478501715e-06, + "loss": 1.4946, + "step": 442 + }, + { + "epoch": 0.11713494261970717, + "grad_norm": 2.64613938331604, + "learning_rate": 2.310735953574255e-06, + "loss": 1.4334, + "step": 444 + }, + { + "epoch": 0.11766257749637252, + "grad_norm": 4.052391529083252, + "learning_rate": 2.3212872592983383e-06, + "loss": 1.4241, + "step": 446 + }, + { + "epoch": 0.11819021237303785, + "grad_norm": 24.023279190063477, + "learning_rate": 2.3318385650224217e-06, + "loss": 1.4259, + "step": 448 + }, + { + "epoch": 0.1187178472497032, + "grad_norm": 136.9192352294922, + "learning_rate": 2.342389870746505e-06, + "loss": 1.4558, + "step": 450 + }, + { + "epoch": 0.11924548212636855, + "grad_norm": 11.573638916015625, + "learning_rate": 2.3529411764705885e-06, + "loss": 1.5033, + "step": 452 + }, + { + "epoch": 0.1197731170030339, + "grad_norm": 6.5478315353393555, + "learning_rate": 2.363492482194672e-06, + "loss": 1.4251, + "step": 454 + }, + { + "epoch": 0.12030075187969924, + "grad_norm": 54.54530334472656, + "learning_rate": 2.374043787918755e-06, + "loss": 1.489, + "step": 456 + }, + { + "epoch": 0.12082838675636459, + "grad_norm": 2.7549355030059814, + "learning_rate": 2.3845950936428386e-06, + "loss": 1.4265, + "step": 458 + }, + { + "epoch": 0.12135602163302994, + "grad_norm": 15.779853820800781, + "learning_rate": 2.395146399366922e-06, + "loss": 1.3444, + "step": 460 + }, + { + "epoch": 0.12188365650969529, + "grad_norm": 8.674041748046875, + "learning_rate": 2.4056977050910054e-06, + "loss": 1.4139, + "step": 462 + }, + { + "epoch": 0.12241129138636064, + "grad_norm": 9.17253303527832, + "learning_rate": 2.4162490108150883e-06, + "loss": 1.3215, + "step": 464 + }, + { + "epoch": 0.12293892626302598, + "grad_norm": 5.170712471008301, + "learning_rate": 2.4268003165391717e-06, + "loss": 1.3424, + "step": 466 + }, + { + "epoch": 0.12346656113969133, + "grad_norm": 8.814146995544434, + "learning_rate": 2.4373516222632555e-06, + "loss": 1.3596, + "step": 468 + }, + { + "epoch": 0.12399419601635668, + "grad_norm": 4.5802321434021, + "learning_rate": 2.447902927987339e-06, + "loss": 1.327, + "step": 470 + }, + { + "epoch": 0.12452183089302203, + "grad_norm": 2.5747785568237305, + "learning_rate": 2.458454233711422e-06, + "loss": 1.3256, + "step": 472 + }, + { + "epoch": 0.12504946576968737, + "grad_norm": 8.199037551879883, + "learning_rate": 2.4690055394355052e-06, + "loss": 1.2421, + "step": 474 + }, + { + "epoch": 0.12557710064635272, + "grad_norm": 3.318483829498291, + "learning_rate": 2.4795568451595886e-06, + "loss": 1.2276, + "step": 476 + }, + { + "epoch": 0.12610473552301807, + "grad_norm": 37.76396942138672, + "learning_rate": 2.490108150883672e-06, + "loss": 1.2988, + "step": 478 + }, + { + "epoch": 0.12663237039968342, + "grad_norm": 27.113391876220703, + "learning_rate": 2.5006594566077554e-06, + "loss": 1.3125, + "step": 480 + }, + { + "epoch": 0.12716000527634877, + "grad_norm": 15.240288734436035, + "learning_rate": 2.5112107623318388e-06, + "loss": 1.3215, + "step": 482 + }, + { + "epoch": 0.12768764015301412, + "grad_norm": 2.7686173915863037, + "learning_rate": 2.5217620680559226e-06, + "loss": 1.2659, + "step": 484 + }, + { + "epoch": 0.12821527502967947, + "grad_norm": 4.50327205657959, + "learning_rate": 2.5323133737800055e-06, + "loss": 1.273, + "step": 486 + }, + { + "epoch": 0.12874290990634482, + "grad_norm": 2.5665090084075928, + "learning_rate": 2.542864679504089e-06, + "loss": 1.1886, + "step": 488 + }, + { + "epoch": 0.12927054478301014, + "grad_norm": 4.792305946350098, + "learning_rate": 2.553415985228172e-06, + "loss": 1.26, + "step": 490 + }, + { + "epoch": 0.1297981796596755, + "grad_norm": 56.62902069091797, + "learning_rate": 2.5639672909522557e-06, + "loss": 1.2188, + "step": 492 + }, + { + "epoch": 0.13032581453634084, + "grad_norm": 54.81526184082031, + "learning_rate": 2.5745185966763386e-06, + "loss": 1.2558, + "step": 494 + }, + { + "epoch": 0.1308534494130062, + "grad_norm": 4.103349685668945, + "learning_rate": 2.585069902400422e-06, + "loss": 1.1292, + "step": 496 + }, + { + "epoch": 0.13138108428967155, + "grad_norm": 2.215069532394409, + "learning_rate": 2.595621208124506e-06, + "loss": 1.1345, + "step": 498 + }, + { + "epoch": 0.1319087191663369, + "grad_norm": 4.5295820236206055, + "learning_rate": 2.606172513848589e-06, + "loss": 1.1742, + "step": 500 + }, + { + "epoch": 0.13243635404300225, + "grad_norm": 2.79809308052063, + "learning_rate": 2.6167238195726726e-06, + "loss": 1.1976, + "step": 502 + }, + { + "epoch": 0.1329639889196676, + "grad_norm": 4.327549457550049, + "learning_rate": 2.6272751252967556e-06, + "loss": 1.1648, + "step": 504 + }, + { + "epoch": 0.13349162379633295, + "grad_norm": 2.627274751663208, + "learning_rate": 2.637826431020839e-06, + "loss": 1.1836, + "step": 506 + }, + { + "epoch": 0.1340192586729983, + "grad_norm": 5.957850456237793, + "learning_rate": 2.6483777367449227e-06, + "loss": 1.1474, + "step": 508 + }, + { + "epoch": 0.13454689354966362, + "grad_norm": 14.636727333068848, + "learning_rate": 2.6589290424690057e-06, + "loss": 1.118, + "step": 510 + }, + { + "epoch": 0.13507452842632897, + "grad_norm": 6.498028755187988, + "learning_rate": 2.669480348193089e-06, + "loss": 1.132, + "step": 512 + }, + { + "epoch": 0.13560216330299432, + "grad_norm": 4.722952842712402, + "learning_rate": 2.680031653917172e-06, + "loss": 1.0637, + "step": 514 + }, + { + "epoch": 0.13612979817965967, + "grad_norm": 2.487074613571167, + "learning_rate": 2.690582959641256e-06, + "loss": 1.0959, + "step": 516 + }, + { + "epoch": 0.13665743305632502, + "grad_norm": 16.59700584411621, + "learning_rate": 2.7011342653653392e-06, + "loss": 1.0544, + "step": 518 + }, + { + "epoch": 0.13718506793299037, + "grad_norm": 2.066934585571289, + "learning_rate": 2.7116855710894226e-06, + "loss": 1.0277, + "step": 520 + }, + { + "epoch": 0.13771270280965572, + "grad_norm": 2.2459840774536133, + "learning_rate": 2.722236876813506e-06, + "loss": 1.0452, + "step": 522 + }, + { + "epoch": 0.13824033768632107, + "grad_norm": 11.845109939575195, + "learning_rate": 2.732788182537589e-06, + "loss": 1.0486, + "step": 524 + }, + { + "epoch": 0.13876797256298642, + "grad_norm": 4.819644451141357, + "learning_rate": 2.7433394882616728e-06, + "loss": 0.968, + "step": 526 + }, + { + "epoch": 0.13929560743965175, + "grad_norm": 16.332895278930664, + "learning_rate": 2.753890793985756e-06, + "loss": 0.9724, + "step": 528 + }, + { + "epoch": 0.1398232423163171, + "grad_norm": 4.831658840179443, + "learning_rate": 2.764442099709839e-06, + "loss": 1.0492, + "step": 530 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 1.931032657623291, + "learning_rate": 2.774993405433923e-06, + "loss": 1.0455, + "step": 532 + }, + { + "epoch": 0.1408785120696478, + "grad_norm": 11.323331832885742, + "learning_rate": 2.785544711158006e-06, + "loss": 0.983, + "step": 534 + }, + { + "epoch": 0.14140614694631315, + "grad_norm": 6.889852523803711, + "learning_rate": 2.7960960168820893e-06, + "loss": 0.947, + "step": 536 + }, + { + "epoch": 0.1419337818229785, + "grad_norm": 2.8129429817199707, + "learning_rate": 2.806647322606173e-06, + "loss": 0.9532, + "step": 538 + }, + { + "epoch": 0.14246141669964385, + "grad_norm": 2.8626015186309814, + "learning_rate": 2.817198628330256e-06, + "loss": 0.8526, + "step": 540 + }, + { + "epoch": 0.1429890515763092, + "grad_norm": 2.424870252609253, + "learning_rate": 2.8277499340543394e-06, + "loss": 0.9069, + "step": 542 + }, + { + "epoch": 0.14351668645297455, + "grad_norm": 94.4120864868164, + "learning_rate": 2.838301239778423e-06, + "loss": 0.8901, + "step": 544 + }, + { + "epoch": 0.1440443213296399, + "grad_norm": 3.694725275039673, + "learning_rate": 2.848852545502506e-06, + "loss": 0.9082, + "step": 546 + }, + { + "epoch": 0.14457195620630522, + "grad_norm": 7.248526096343994, + "learning_rate": 2.859403851226589e-06, + "loss": 0.8748, + "step": 548 + }, + { + "epoch": 0.14509959108297057, + "grad_norm": NaN, + "learning_rate": 2.869955156950673e-06, + "loss": 0.8596, + "step": 550 + }, + { + "epoch": 0.14562722595963593, + "grad_norm": 4.876184940338135, + "learning_rate": 2.8752308098127144e-06, + "loss": 0.8878, + "step": 552 + }, + { + "epoch": 0.14615486083630128, + "grad_norm": 10.614558219909668, + "learning_rate": 2.885782115536798e-06, + "loss": 0.8776, + "step": 554 + }, + { + "epoch": 0.14668249571296663, + "grad_norm": 3.354245901107788, + "learning_rate": 2.896333421260881e-06, + "loss": 0.8992, + "step": 556 + }, + { + "epoch": 0.14721013058963198, + "grad_norm": 2.4660451412200928, + "learning_rate": 2.906884726984965e-06, + "loss": 0.808, + "step": 558 + }, + { + "epoch": 0.14773776546629733, + "grad_norm": 1.956465721130371, + "learning_rate": 2.917436032709048e-06, + "loss": 0.836, + "step": 560 + }, + { + "epoch": 0.14826540034296268, + "grad_norm": 7.799830436706543, + "learning_rate": 2.9279873384331313e-06, + "loss": 0.848, + "step": 562 + }, + { + "epoch": 0.14879303521962803, + "grad_norm": 2.416813850402832, + "learning_rate": 2.9385386441572143e-06, + "loss": 0.7754, + "step": 564 + }, + { + "epoch": 0.14932067009629338, + "grad_norm": 3.172804594039917, + "learning_rate": 2.949089949881298e-06, + "loss": 0.7825, + "step": 566 + }, + { + "epoch": 0.1498483049729587, + "grad_norm": 3.6063547134399414, + "learning_rate": 2.9596412556053815e-06, + "loss": 0.7507, + "step": 568 + }, + { + "epoch": 0.15037593984962405, + "grad_norm": 2.5601212978363037, + "learning_rate": 2.9701925613294644e-06, + "loss": 0.7596, + "step": 570 + }, + { + "epoch": 0.1509035747262894, + "grad_norm": 91.54145812988281, + "learning_rate": 2.9807438670535483e-06, + "loss": 0.7267, + "step": 572 + }, + { + "epoch": 0.15143120960295475, + "grad_norm": 3.6042819023132324, + "learning_rate": 2.991295172777631e-06, + "loss": 0.7517, + "step": 574 + }, + { + "epoch": 0.1519588444796201, + "grad_norm": 2.3691625595092773, + "learning_rate": 3.001846478501715e-06, + "loss": 0.77, + "step": 576 + }, + { + "epoch": 0.15248647935628545, + "grad_norm": 9.689958572387695, + "learning_rate": 3.0123977842257984e-06, + "loss": 0.7846, + "step": 578 + }, + { + "epoch": 0.1530141142329508, + "grad_norm": 5.683976173400879, + "learning_rate": 3.0229490899498814e-06, + "loss": 0.7631, + "step": 580 + }, + { + "epoch": 0.15354174910961615, + "grad_norm": 19.685636520385742, + "learning_rate": 3.033500395673965e-06, + "loss": 0.7329, + "step": 582 + }, + { + "epoch": 0.1540693839862815, + "grad_norm": 2.090109348297119, + "learning_rate": 3.044051701398048e-06, + "loss": 0.7301, + "step": 584 + }, + { + "epoch": 0.15459701886294683, + "grad_norm": 4.50723123550415, + "learning_rate": 3.0546030071221315e-06, + "loss": 0.6949, + "step": 586 + }, + { + "epoch": 0.15512465373961218, + "grad_norm": 17.643939971923828, + "learning_rate": 3.0651543128462153e-06, + "loss": 0.7219, + "step": 588 + }, + { + "epoch": 0.15565228861627753, + "grad_norm": 2.3234877586364746, + "learning_rate": 3.0757056185702983e-06, + "loss": 0.6899, + "step": 590 + }, + { + "epoch": 0.15617992349294288, + "grad_norm": 3.1329805850982666, + "learning_rate": 3.0862569242943817e-06, + "loss": 0.6641, + "step": 592 + }, + { + "epoch": 0.15670755836960823, + "grad_norm": 3.47593092918396, + "learning_rate": 3.096808230018465e-06, + "loss": 0.7265, + "step": 594 + }, + { + "epoch": 0.15723519324627358, + "grad_norm": 4.066540718078613, + "learning_rate": 3.1073595357425484e-06, + "loss": 0.6685, + "step": 596 + }, + { + "epoch": 0.15776282812293893, + "grad_norm": 2.666475296020508, + "learning_rate": 3.1179108414666314e-06, + "loss": 0.6222, + "step": 598 + }, + { + "epoch": 0.15829046299960428, + "grad_norm": 2.599416732788086, + "learning_rate": 3.128462147190715e-06, + "loss": 0.6199, + "step": 600 + }, + { + "epoch": 0.15881809787626963, + "grad_norm": 2.887054443359375, + "learning_rate": 3.1390134529147986e-06, + "loss": 0.5884, + "step": 602 + }, + { + "epoch": 0.15934573275293498, + "grad_norm": 6.9668450355529785, + "learning_rate": 3.1495647586388815e-06, + "loss": 0.6222, + "step": 604 + }, + { + "epoch": 0.1598733676296003, + "grad_norm": 5.312816619873047, + "learning_rate": 3.1601160643629653e-06, + "loss": 0.6366, + "step": 606 + }, + { + "epoch": 0.16040100250626566, + "grad_norm": 6.542179107666016, + "learning_rate": 3.1706673700870483e-06, + "loss": 0.6347, + "step": 608 + }, + { + "epoch": 0.160928637382931, + "grad_norm": 4.025948524475098, + "learning_rate": 3.1812186758111317e-06, + "loss": 0.6201, + "step": 610 + }, + { + "epoch": 0.16145627225959636, + "grad_norm": 2.2054283618927, + "learning_rate": 3.1917699815352155e-06, + "loss": 0.5879, + "step": 612 + }, + { + "epoch": 0.1619839071362617, + "grad_norm": 3.447096109390259, + "learning_rate": 3.2023212872592984e-06, + "loss": 0.531, + "step": 614 + }, + { + "epoch": 0.16251154201292706, + "grad_norm": 5.069993019104004, + "learning_rate": 3.2128725929833823e-06, + "loss": 0.5894, + "step": 616 + }, + { + "epoch": 0.1630391768895924, + "grad_norm": 1.747745156288147, + "learning_rate": 3.223423898707465e-06, + "loss": 0.583, + "step": 618 + }, + { + "epoch": 0.16356681176625776, + "grad_norm": 3.4433414936065674, + "learning_rate": 3.2339752044315486e-06, + "loss": 0.5785, + "step": 620 + }, + { + "epoch": 0.1640944466429231, + "grad_norm": 1.9073948860168457, + "learning_rate": 3.2445265101556324e-06, + "loss": 0.5519, + "step": 622 + }, + { + "epoch": 0.16462208151958843, + "grad_norm": 2.2265708446502686, + "learning_rate": 3.2550778158797154e-06, + "loss": 0.5245, + "step": 624 + }, + { + "epoch": 0.16514971639625378, + "grad_norm": 2.1955602169036865, + "learning_rate": 3.2656291216037987e-06, + "loss": 0.5356, + "step": 626 + }, + { + "epoch": 0.16567735127291913, + "grad_norm": 2.6840920448303223, + "learning_rate": 3.2761804273278817e-06, + "loss": 0.5452, + "step": 628 + }, + { + "epoch": 0.16620498614958448, + "grad_norm": 3.3503615856170654, + "learning_rate": 3.2867317330519655e-06, + "loss": 0.5068, + "step": 630 + }, + { + "epoch": 0.16673262102624983, + "grad_norm": 2.302027463912964, + "learning_rate": 3.297283038776049e-06, + "loss": 0.5286, + "step": 632 + }, + { + "epoch": 0.16726025590291518, + "grad_norm": 2.589982271194458, + "learning_rate": 3.3078343445001323e-06, + "loss": 0.5066, + "step": 634 + }, + { + "epoch": 0.16778789077958053, + "grad_norm": 2.169400930404663, + "learning_rate": 3.3183856502242157e-06, + "loss": 0.4771, + "step": 636 + }, + { + "epoch": 0.16831552565624588, + "grad_norm": 53.32346725463867, + "learning_rate": 3.3289369559482986e-06, + "loss": 0.4538, + "step": 638 + }, + { + "epoch": 0.16884316053291124, + "grad_norm": 12.66275405883789, + "learning_rate": 3.3394882616723824e-06, + "loss": 0.4701, + "step": 640 + }, + { + "epoch": 0.16937079540957659, + "grad_norm": 64.25328063964844, + "learning_rate": 3.350039567396466e-06, + "loss": 0.4746, + "step": 642 + }, + { + "epoch": 0.1698984302862419, + "grad_norm": 6.899125576019287, + "learning_rate": 3.3605908731205488e-06, + "loss": 0.4691, + "step": 644 + }, + { + "epoch": 0.17042606516290726, + "grad_norm": 2.7724273204803467, + "learning_rate": 3.3711421788446326e-06, + "loss": 0.4601, + "step": 646 + }, + { + "epoch": 0.1709537000395726, + "grad_norm": 1.7114256620407104, + "learning_rate": 3.3816934845687155e-06, + "loss": 0.4336, + "step": 648 + }, + { + "epoch": 0.17148133491623796, + "grad_norm": 2.4038314819335938, + "learning_rate": 3.392244790292799e-06, + "loss": 0.4408, + "step": 650 + }, + { + "epoch": 0.1720089697929033, + "grad_norm": 3.504788398742676, + "learning_rate": 3.4027960960168823e-06, + "loss": 0.4492, + "step": 652 + }, + { + "epoch": 0.17253660466956866, + "grad_norm": 1.8247194290161133, + "learning_rate": 3.4133474017409657e-06, + "loss": 0.3848, + "step": 654 + }, + { + "epoch": 0.173064239546234, + "grad_norm": 2.3109757900238037, + "learning_rate": 3.423898707465049e-06, + "loss": 0.4182, + "step": 656 + }, + { + "epoch": 0.17359187442289936, + "grad_norm": 1.4789122343063354, + "learning_rate": 3.4344500131891324e-06, + "loss": 0.4105, + "step": 658 + }, + { + "epoch": 0.1741195092995647, + "grad_norm": 3.187047243118286, + "learning_rate": 3.445001318913216e-06, + "loss": 0.4255, + "step": 660 + }, + { + "epoch": 0.17464714417623006, + "grad_norm": 1.7560479640960693, + "learning_rate": 3.4555526246372988e-06, + "loss": 0.3828, + "step": 662 + }, + { + "epoch": 0.17517477905289539, + "grad_norm": 2.637230157852173, + "learning_rate": 3.4661039303613826e-06, + "loss": 0.3686, + "step": 664 + }, + { + "epoch": 0.17570241392956074, + "grad_norm": 1.821801781654358, + "learning_rate": 3.476655236085466e-06, + "loss": 0.3572, + "step": 666 + }, + { + "epoch": 0.1762300488062261, + "grad_norm": 2.7633509635925293, + "learning_rate": 3.487206541809549e-06, + "loss": 0.3709, + "step": 668 + }, + { + "epoch": 0.17675768368289144, + "grad_norm": 3.0109922885894775, + "learning_rate": 3.4977578475336327e-06, + "loss": 0.3824, + "step": 670 + }, + { + "epoch": 0.1772853185595568, + "grad_norm": 4.088319301605225, + "learning_rate": 3.5083091532577157e-06, + "loss": 0.3657, + "step": 672 + }, + { + "epoch": 0.17781295343622214, + "grad_norm": 2.996281623840332, + "learning_rate": 3.518860458981799e-06, + "loss": 0.3279, + "step": 674 + }, + { + "epoch": 0.1783405883128875, + "grad_norm": 10.290828704833984, + "learning_rate": 3.529411764705883e-06, + "loss": 0.3502, + "step": 676 + }, + { + "epoch": 0.17886822318955284, + "grad_norm": 2.4969208240509033, + "learning_rate": 3.539963070429966e-06, + "loss": 0.3395, + "step": 678 + }, + { + "epoch": 0.1793958580662182, + "grad_norm": 2.7871758937835693, + "learning_rate": 3.5505143761540497e-06, + "loss": 0.3444, + "step": 680 + }, + { + "epoch": 0.1799234929428835, + "grad_norm": 4.7528276443481445, + "learning_rate": 3.5610656818781326e-06, + "loss": 0.3271, + "step": 682 + }, + { + "epoch": 0.18045112781954886, + "grad_norm": 2.1504299640655518, + "learning_rate": 3.571616987602216e-06, + "loss": 0.3236, + "step": 684 + }, + { + "epoch": 0.1809787626962142, + "grad_norm": 3.5370121002197266, + "learning_rate": 3.5821682933263e-06, + "loss": 0.3444, + "step": 686 + }, + { + "epoch": 0.18150639757287956, + "grad_norm": 1.7846468687057495, + "learning_rate": 3.5927195990503828e-06, + "loss": 0.3077, + "step": 688 + }, + { + "epoch": 0.1820340324495449, + "grad_norm": 23.94403839111328, + "learning_rate": 3.603270904774466e-06, + "loss": 0.3131, + "step": 690 + }, + { + "epoch": 0.18256166732621026, + "grad_norm": 1.79655921459198, + "learning_rate": 3.613822210498549e-06, + "loss": 0.3025, + "step": 692 + }, + { + "epoch": 0.18308930220287561, + "grad_norm": 2.1305274963378906, + "learning_rate": 3.624373516222633e-06, + "loss": 0.3057, + "step": 694 + }, + { + "epoch": 0.18361693707954096, + "grad_norm": 1.2237046957015991, + "learning_rate": 3.6349248219467163e-06, + "loss": 0.2828, + "step": 696 + }, + { + "epoch": 0.18414457195620632, + "grad_norm": 53.01265335083008, + "learning_rate": 3.6454761276707997e-06, + "loss": 0.321, + "step": 698 + }, + { + "epoch": 0.18467220683287167, + "grad_norm": 2.0583603382110596, + "learning_rate": 3.656027433394883e-06, + "loss": 0.2826, + "step": 700 + }, + { + "epoch": 0.185199841709537, + "grad_norm": 1.6712664365768433, + "learning_rate": 3.666578739118966e-06, + "loss": 0.2866, + "step": 702 + }, + { + "epoch": 0.18572747658620234, + "grad_norm": 1.9138531684875488, + "learning_rate": 3.67713004484305e-06, + "loss": 0.2726, + "step": 704 + }, + { + "epoch": 0.1862551114628677, + "grad_norm": 2.3046839237213135, + "learning_rate": 3.6876813505671328e-06, + "loss": 0.2809, + "step": 706 + }, + { + "epoch": 0.18678274633953304, + "grad_norm": 1.4306049346923828, + "learning_rate": 3.698232656291216e-06, + "loss": 0.2588, + "step": 708 + }, + { + "epoch": 0.1873103812161984, + "grad_norm": 1.9603590965270996, + "learning_rate": 3.7087839620153e-06, + "loss": 0.2638, + "step": 710 + }, + { + "epoch": 0.18783801609286374, + "grad_norm": 2.91037917137146, + "learning_rate": 3.719335267739383e-06, + "loss": 0.2488, + "step": 712 + }, + { + "epoch": 0.1883656509695291, + "grad_norm": 2.0575973987579346, + "learning_rate": 3.7298865734634663e-06, + "loss": 0.2669, + "step": 714 + }, + { + "epoch": 0.18889328584619444, + "grad_norm": 2.3801112174987793, + "learning_rate": 3.7404378791875497e-06, + "loss": 0.2422, + "step": 716 + }, + { + "epoch": 0.1894209207228598, + "grad_norm": 1.5387905836105347, + "learning_rate": 3.750989184911633e-06, + "loss": 0.2482, + "step": 718 + }, + { + "epoch": 0.18994855559952512, + "grad_norm": 3.370729923248291, + "learning_rate": 3.761540490635717e-06, + "loss": 0.2356, + "step": 720 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 5.247089862823486, + "learning_rate": 3.7720917963598e-06, + "loss": 0.2409, + "step": 722 + }, + { + "epoch": 0.19100382535285582, + "grad_norm": 1.2888553142547607, + "learning_rate": 3.7826431020838832e-06, + "loss": 0.215, + "step": 724 + }, + { + "epoch": 0.19153146022952117, + "grad_norm": 1.3998925685882568, + "learning_rate": 3.793194407807966e-06, + "loss": 0.2456, + "step": 726 + }, + { + "epoch": 0.19205909510618652, + "grad_norm": 1.5595108270645142, + "learning_rate": 3.80374571353205e-06, + "loss": 0.2494, + "step": 728 + }, + { + "epoch": 0.19258672998285187, + "grad_norm": 1.1287614107131958, + "learning_rate": 3.8142970192561334e-06, + "loss": 0.2136, + "step": 730 + }, + { + "epoch": 0.19311436485951722, + "grad_norm": 8.712166786193848, + "learning_rate": 3.824848324980217e-06, + "loss": 0.2295, + "step": 732 + }, + { + "epoch": 0.19364199973618257, + "grad_norm": 1.1602486371994019, + "learning_rate": 3.8353996307043e-06, + "loss": 0.2023, + "step": 734 + }, + { + "epoch": 0.19416963461284792, + "grad_norm": 1.3671159744262695, + "learning_rate": 3.8459509364283835e-06, + "loss": 0.2085, + "step": 736 + }, + { + "epoch": 0.19469726948951327, + "grad_norm": 1.4168115854263306, + "learning_rate": 3.8565022421524665e-06, + "loss": 0.222, + "step": 738 + }, + { + "epoch": 0.1952249043661786, + "grad_norm": 1.4107491970062256, + "learning_rate": 3.86705354787655e-06, + "loss": 0.2001, + "step": 740 + }, + { + "epoch": 0.19575253924284394, + "grad_norm": 1.033146619796753, + "learning_rate": 3.877604853600633e-06, + "loss": 0.2033, + "step": 742 + }, + { + "epoch": 0.1962801741195093, + "grad_norm": 1.09994375705719, + "learning_rate": 3.888156159324717e-06, + "loss": 0.1887, + "step": 744 + }, + { + "epoch": 0.19680780899617464, + "grad_norm": 1.4930782318115234, + "learning_rate": 3.8987074650488e-06, + "loss": 0.2014, + "step": 746 + }, + { + "epoch": 0.19733544387284, + "grad_norm": 1.5405325889587402, + "learning_rate": 3.909258770772884e-06, + "loss": 0.2008, + "step": 748 + }, + { + "epoch": 0.19786307874950534, + "grad_norm": 1.7280300855636597, + "learning_rate": 3.919810076496967e-06, + "loss": 0.2117, + "step": 750 + }, + { + "epoch": 0.1983907136261707, + "grad_norm": 0.9177537560462952, + "learning_rate": 3.93036138222105e-06, + "loss": 0.188, + "step": 752 + }, + { + "epoch": 0.19891834850283605, + "grad_norm": 220.88548278808594, + "learning_rate": 3.9409126879451336e-06, + "loss": 0.1939, + "step": 754 + }, + { + "epoch": 0.1994459833795014, + "grad_norm": 1.2833611965179443, + "learning_rate": 3.9514639936692165e-06, + "loss": 0.1729, + "step": 756 + }, + { + "epoch": 0.19997361825616675, + "grad_norm": 1.2064625024795532, + "learning_rate": 3.9620152993933e-06, + "loss": 0.1789, + "step": 758 + }, + { + "epoch": 0.20050125313283207, + "grad_norm": 1.0610567331314087, + "learning_rate": 3.972566605117383e-06, + "loss": 0.1934, + "step": 760 + }, + { + "epoch": 0.20102888800949742, + "grad_norm": 1.187633752822876, + "learning_rate": 3.983117910841467e-06, + "loss": 0.1774, + "step": 762 + }, + { + "epoch": 0.20155652288616277, + "grad_norm": 0.7917526960372925, + "learning_rate": 3.993669216565551e-06, + "loss": 0.1526, + "step": 764 + }, + { + "epoch": 0.20208415776282812, + "grad_norm": 2.032256603240967, + "learning_rate": 4.004220522289634e-06, + "loss": 0.1708, + "step": 766 + }, + { + "epoch": 0.20261179263949347, + "grad_norm": 1.1914738416671753, + "learning_rate": 4.014771828013717e-06, + "loss": 0.1614, + "step": 768 + }, + { + "epoch": 0.20313942751615882, + "grad_norm": 1.4673463106155396, + "learning_rate": 4.0253231337378e-06, + "loss": 0.1596, + "step": 770 + }, + { + "epoch": 0.20366706239282417, + "grad_norm": 25.362707138061523, + "learning_rate": 4.0358744394618836e-06, + "loss": 0.1655, + "step": 772 + }, + { + "epoch": 0.20419469726948952, + "grad_norm": 2.3151445388793945, + "learning_rate": 4.046425745185967e-06, + "loss": 0.1627, + "step": 774 + }, + { + "epoch": 0.20472233214615487, + "grad_norm": 0.9245175123214722, + "learning_rate": 4.05697705091005e-06, + "loss": 0.1555, + "step": 776 + }, + { + "epoch": 0.2052499670228202, + "grad_norm": 1.4068682193756104, + "learning_rate": 4.067528356634134e-06, + "loss": 0.1452, + "step": 778 + }, + { + "epoch": 0.20577760189948555, + "grad_norm": 1.277017593383789, + "learning_rate": 4.078079662358217e-06, + "loss": 0.1466, + "step": 780 + }, + { + "epoch": 0.2063052367761509, + "grad_norm": 0.8282482028007507, + "learning_rate": 4.088630968082301e-06, + "loss": 0.1532, + "step": 782 + }, + { + "epoch": 0.20683287165281625, + "grad_norm": 2.998361587524414, + "learning_rate": 4.099182273806384e-06, + "loss": 0.1518, + "step": 784 + }, + { + "epoch": 0.2073605065294816, + "grad_norm": 1.0489484071731567, + "learning_rate": 4.109733579530467e-06, + "loss": 0.159, + "step": 786 + }, + { + "epoch": 0.20788814140614695, + "grad_norm": 1.9688533544540405, + "learning_rate": 4.120284885254551e-06, + "loss": 0.1474, + "step": 788 + }, + { + "epoch": 0.2084157762828123, + "grad_norm": 4.931417942047119, + "learning_rate": 4.130836190978634e-06, + "loss": 0.1405, + "step": 790 + }, + { + "epoch": 0.20894341115947765, + "grad_norm": 1.1365867853164673, + "learning_rate": 4.141387496702717e-06, + "loss": 0.1421, + "step": 792 + }, + { + "epoch": 0.209471046036143, + "grad_norm": 2.4245498180389404, + "learning_rate": 4.151938802426801e-06, + "loss": 0.1207, + "step": 794 + }, + { + "epoch": 0.20999868091280835, + "grad_norm": 1.8138560056686401, + "learning_rate": 4.162490108150884e-06, + "loss": 0.1453, + "step": 796 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 1.2403584718704224, + "learning_rate": 4.173041413874967e-06, + "loss": 0.1347, + "step": 798 + }, + { + "epoch": 0.21105395066613902, + "grad_norm": 1.1631813049316406, + "learning_rate": 4.183592719599051e-06, + "loss": 0.1195, + "step": 800 + }, + { + "epoch": 0.21158158554280437, + "grad_norm": 1.4087822437286377, + "learning_rate": 4.194144025323134e-06, + "loss": 0.1209, + "step": 802 + }, + { + "epoch": 0.21210922041946972, + "grad_norm": 2.7548561096191406, + "learning_rate": 4.204695331047217e-06, + "loss": 0.1235, + "step": 804 + }, + { + "epoch": 0.21263685529613507, + "grad_norm": 1.3945138454437256, + "learning_rate": 4.215246636771301e-06, + "loss": 0.1395, + "step": 806 + }, + { + "epoch": 0.21316449017280042, + "grad_norm": 0.9673140048980713, + "learning_rate": 4.2257979424953845e-06, + "loss": 0.1409, + "step": 808 + }, + { + "epoch": 0.21369212504946578, + "grad_norm": 1.673425316810608, + "learning_rate": 4.2363492482194674e-06, + "loss": 0.1196, + "step": 810 + }, + { + "epoch": 0.21421975992613113, + "grad_norm": 1.1667752265930176, + "learning_rate": 4.246900553943551e-06, + "loss": 0.1156, + "step": 812 + }, + { + "epoch": 0.21474739480279648, + "grad_norm": 1.1030246019363403, + "learning_rate": 4.257451859667634e-06, + "loss": 0.1123, + "step": 814 + }, + { + "epoch": 0.2152750296794618, + "grad_norm": 0.8521634936332703, + "learning_rate": 4.268003165391717e-06, + "loss": 0.111, + "step": 816 + }, + { + "epoch": 0.21580266455612715, + "grad_norm": 0.6844758987426758, + "learning_rate": 4.278554471115801e-06, + "loss": 0.1068, + "step": 818 + }, + { + "epoch": 0.2163302994327925, + "grad_norm": 0.9660982489585876, + "learning_rate": 4.289105776839884e-06, + "loss": 0.1035, + "step": 820 + }, + { + "epoch": 0.21685793430945785, + "grad_norm": 1.415231466293335, + "learning_rate": 4.299657082563968e-06, + "loss": 0.1397, + "step": 822 + }, + { + "epoch": 0.2173855691861232, + "grad_norm": 0.9347997903823853, + "learning_rate": 4.310208388288051e-06, + "loss": 0.1169, + "step": 824 + }, + { + "epoch": 0.21791320406278855, + "grad_norm": 0.9104371666908264, + "learning_rate": 4.3207596940121345e-06, + "loss": 0.121, + "step": 826 + }, + { + "epoch": 0.2184408389394539, + "grad_norm": 1.0964378118515015, + "learning_rate": 4.331310999736218e-06, + "loss": 0.1102, + "step": 828 + }, + { + "epoch": 0.21896847381611925, + "grad_norm": 3.012085437774658, + "learning_rate": 4.341862305460301e-06, + "loss": 0.1081, + "step": 830 + }, + { + "epoch": 0.2194961086927846, + "grad_norm": 0.8148002624511719, + "learning_rate": 4.352413611184384e-06, + "loss": 0.103, + "step": 832 + }, + { + "epoch": 0.22002374356944995, + "grad_norm": 1.045818567276001, + "learning_rate": 4.362964916908467e-06, + "loss": 0.1219, + "step": 834 + }, + { + "epoch": 0.22055137844611528, + "grad_norm": 1.0501551628112793, + "learning_rate": 4.373516222632551e-06, + "loss": 0.0964, + "step": 836 + }, + { + "epoch": 0.22107901332278063, + "grad_norm": 1.2143255472183228, + "learning_rate": 4.384067528356635e-06, + "loss": 0.105, + "step": 838 + }, + { + "epoch": 0.22160664819944598, + "grad_norm": 0.810645341873169, + "learning_rate": 4.394618834080718e-06, + "loss": 0.0918, + "step": 840 + }, + { + "epoch": 0.22213428307611133, + "grad_norm": 0.7934425473213196, + "learning_rate": 4.4051701398048016e-06, + "loss": 0.106, + "step": 842 + }, + { + "epoch": 0.22266191795277668, + "grad_norm": 0.8065547943115234, + "learning_rate": 4.4157214455288845e-06, + "loss": 0.087, + "step": 844 + }, + { + "epoch": 0.22318955282944203, + "grad_norm": 0.9062328934669495, + "learning_rate": 4.426272751252968e-06, + "loss": 0.0959, + "step": 846 + }, + { + "epoch": 0.22371718770610738, + "grad_norm": 0.834733247756958, + "learning_rate": 4.436824056977051e-06, + "loss": 0.0976, + "step": 848 + }, + { + "epoch": 0.22424482258277273, + "grad_norm": 0.526149332523346, + "learning_rate": 4.447375362701134e-06, + "loss": 0.0814, + "step": 850 + }, + { + "epoch": 0.22477245745943808, + "grad_norm": 0.7917191982269287, + "learning_rate": 4.457926668425218e-06, + "loss": 0.0814, + "step": 852 + }, + { + "epoch": 0.22530009233610343, + "grad_norm": 0.7492482662200928, + "learning_rate": 4.468477974149301e-06, + "loss": 0.0923, + "step": 854 + }, + { + "epoch": 0.22582772721276875, + "grad_norm": 1.7148336172103882, + "learning_rate": 4.479029279873385e-06, + "loss": 0.0937, + "step": 856 + }, + { + "epoch": 0.2263553620894341, + "grad_norm": 0.7509326338768005, + "learning_rate": 4.489580585597468e-06, + "loss": 0.0913, + "step": 858 + }, + { + "epoch": 0.22688299696609945, + "grad_norm": 1.7775815725326538, + "learning_rate": 4.5001318913215516e-06, + "loss": 0.0926, + "step": 860 + }, + { + "epoch": 0.2274106318427648, + "grad_norm": 0.9340442419052124, + "learning_rate": 4.5106831970456345e-06, + "loss": 0.0926, + "step": 862 + }, + { + "epoch": 0.22793826671943015, + "grad_norm": 1.9536547660827637, + "learning_rate": 4.521234502769718e-06, + "loss": 0.0933, + "step": 864 + }, + { + "epoch": 0.2284659015960955, + "grad_norm": 0.6481009125709534, + "learning_rate": 4.531785808493801e-06, + "loss": 0.0796, + "step": 866 + }, + { + "epoch": 0.22899353647276086, + "grad_norm": 0.6766083240509033, + "learning_rate": 4.542337114217884e-06, + "loss": 0.0809, + "step": 868 + }, + { + "epoch": 0.2295211713494262, + "grad_norm": 0.6013822555541992, + "learning_rate": 4.552888419941968e-06, + "loss": 0.0747, + "step": 870 + }, + { + "epoch": 0.23004880622609156, + "grad_norm": 0.8676326870918274, + "learning_rate": 4.563439725666052e-06, + "loss": 0.0743, + "step": 872 + }, + { + "epoch": 0.23057644110275688, + "grad_norm": 1.0277431011199951, + "learning_rate": 4.573991031390135e-06, + "loss": 0.0791, + "step": 874 + }, + { + "epoch": 0.23110407597942223, + "grad_norm": 0.9935094118118286, + "learning_rate": 4.584542337114219e-06, + "loss": 0.0828, + "step": 876 + }, + { + "epoch": 0.23163171085608758, + "grad_norm": 0.9960004091262817, + "learning_rate": 4.595093642838302e-06, + "loss": 0.0802, + "step": 878 + }, + { + "epoch": 0.23215934573275293, + "grad_norm": 1.7580475807189941, + "learning_rate": 4.6056449485623846e-06, + "loss": 0.0878, + "step": 880 + }, + { + "epoch": 0.23268698060941828, + "grad_norm": 1.4822311401367188, + "learning_rate": 4.616196254286468e-06, + "loss": 0.0826, + "step": 882 + }, + { + "epoch": 0.23321461548608363, + "grad_norm": 1.4140413999557495, + "learning_rate": 4.626747560010551e-06, + "loss": 0.0761, + "step": 884 + }, + { + "epoch": 0.23374225036274898, + "grad_norm": 1.0051692724227905, + "learning_rate": 4.637298865734635e-06, + "loss": 0.0827, + "step": 886 + }, + { + "epoch": 0.23426988523941433, + "grad_norm": 1.0092966556549072, + "learning_rate": 4.647850171458718e-06, + "loss": 0.0688, + "step": 888 + }, + { + "epoch": 0.23479752011607968, + "grad_norm": 0.752403199672699, + "learning_rate": 4.658401477182802e-06, + "loss": 0.0872, + "step": 890 + }, + { + "epoch": 0.23532515499274503, + "grad_norm": 0.8243374228477478, + "learning_rate": 4.668952782906886e-06, + "loss": 0.0682, + "step": 892 + }, + { + "epoch": 0.23585278986941036, + "grad_norm": 1.3188782930374146, + "learning_rate": 4.679504088630969e-06, + "loss": 0.0711, + "step": 894 + }, + { + "epoch": 0.2363804247460757, + "grad_norm": 0.6216471791267395, + "learning_rate": 4.690055394355052e-06, + "loss": 0.072, + "step": 896 + }, + { + "epoch": 0.23690805962274106, + "grad_norm": 1.1984801292419434, + "learning_rate": 4.700606700079135e-06, + "loss": 0.0733, + "step": 898 + }, + { + "epoch": 0.2374356944994064, + "grad_norm": 1.1605061292648315, + "learning_rate": 4.711158005803218e-06, + "loss": 0.0827, + "step": 900 + }, + { + "epoch": 0.23796332937607176, + "grad_norm": 1.0760524272918701, + "learning_rate": 4.721709311527302e-06, + "loss": 0.0723, + "step": 902 + }, + { + "epoch": 0.2384909642527371, + "grad_norm": 0.7429733872413635, + "learning_rate": 4.732260617251385e-06, + "loss": 0.0794, + "step": 904 + }, + { + "epoch": 0.23901859912940246, + "grad_norm": 1.2104941606521606, + "learning_rate": 4.742811922975469e-06, + "loss": 0.0738, + "step": 906 + }, + { + "epoch": 0.2395462340060678, + "grad_norm": 0.6629528999328613, + "learning_rate": 4.753363228699552e-06, + "loss": 0.0698, + "step": 908 + }, + { + "epoch": 0.24007386888273316, + "grad_norm": 0.3995175361633301, + "learning_rate": 4.763914534423636e-06, + "loss": 0.0648, + "step": 910 + }, + { + "epoch": 0.24060150375939848, + "grad_norm": 0.5352901220321655, + "learning_rate": 4.774465840147719e-06, + "loss": 0.0619, + "step": 912 + }, + { + "epoch": 0.24112913863606383, + "grad_norm": 0.618497371673584, + "learning_rate": 4.785017145871802e-06, + "loss": 0.0559, + "step": 914 + }, + { + "epoch": 0.24165677351272918, + "grad_norm": 1.7213947772979736, + "learning_rate": 4.7955684515958854e-06, + "loss": 0.0774, + "step": 916 + }, + { + "epoch": 0.24218440838939453, + "grad_norm": 0.6355694532394409, + "learning_rate": 4.806119757319968e-06, + "loss": 0.0597, + "step": 918 + }, + { + "epoch": 0.24271204326605988, + "grad_norm": 0.3530198633670807, + "learning_rate": 4.816671063044052e-06, + "loss": 0.062, + "step": 920 + }, + { + "epoch": 0.24323967814272524, + "grad_norm": 1.0778855085372925, + "learning_rate": 4.827222368768135e-06, + "loss": 0.0649, + "step": 922 + }, + { + "epoch": 0.24376731301939059, + "grad_norm": 0.7750348448753357, + "learning_rate": 4.837773674492219e-06, + "loss": 0.0625, + "step": 924 + }, + { + "epoch": 0.24429494789605594, + "grad_norm": 0.7860682010650635, + "learning_rate": 4.848324980216303e-06, + "loss": 0.0685, + "step": 926 + }, + { + "epoch": 0.2448225827727213, + "grad_norm": 0.5324569940567017, + "learning_rate": 4.858876285940386e-06, + "loss": 0.0635, + "step": 928 + }, + { + "epoch": 0.24535021764938664, + "grad_norm": 0.7473832964897156, + "learning_rate": 4.869427591664469e-06, + "loss": 0.0526, + "step": 930 + }, + { + "epoch": 0.24587785252605196, + "grad_norm": 0.9635046720504761, + "learning_rate": 4.879978897388552e-06, + "loss": 0.0572, + "step": 932 + }, + { + "epoch": 0.2464054874027173, + "grad_norm": 2.2916696071624756, + "learning_rate": 4.8905302031126355e-06, + "loss": 0.0673, + "step": 934 + }, + { + "epoch": 0.24693312227938266, + "grad_norm": 0.7010274529457092, + "learning_rate": 4.901081508836719e-06, + "loss": 0.059, + "step": 936 + }, + { + "epoch": 0.247460757156048, + "grad_norm": 0.5604134798049927, + "learning_rate": 4.911632814560802e-06, + "loss": 0.0557, + "step": 938 + }, + { + "epoch": 0.24798839203271336, + "grad_norm": 0.5728471279144287, + "learning_rate": 4.922184120284886e-06, + "loss": 0.0684, + "step": 940 + }, + { + "epoch": 0.2485160269093787, + "grad_norm": 0.5802444219589233, + "learning_rate": 4.932735426008969e-06, + "loss": 0.0578, + "step": 942 + }, + { + "epoch": 0.24904366178604406, + "grad_norm": 0.5074941515922546, + "learning_rate": 4.943286731733053e-06, + "loss": 0.0625, + "step": 944 + }, + { + "epoch": 0.2495712966627094, + "grad_norm": 0.7838951945304871, + "learning_rate": 4.953838037457136e-06, + "loss": 0.0585, + "step": 946 + }, + { + "epoch": 0.25009893153937474, + "grad_norm": 0.8502309918403625, + "learning_rate": 4.964389343181219e-06, + "loss": 0.0632, + "step": 948 + }, + { + "epoch": 0.2506265664160401, + "grad_norm": 0.5647032260894775, + "learning_rate": 4.9749406489053025e-06, + "loss": 0.049, + "step": 950 + }, + { + "epoch": 0.25115420129270544, + "grad_norm": 0.5735520124435425, + "learning_rate": 4.9854919546293855e-06, + "loss": 0.0665, + "step": 952 + }, + { + "epoch": 0.2516818361693708, + "grad_norm": 0.913120687007904, + "learning_rate": 4.996043260353469e-06, + "loss": 0.0525, + "step": 954 + }, + { + "epoch": 0.25220947104603614, + "grad_norm": 0.6849641799926758, + "learning_rate": 5.006594566077553e-06, + "loss": 0.0516, + "step": 956 + }, + { + "epoch": 0.2527371059227015, + "grad_norm": 0.3615977466106415, + "learning_rate": 5.017145871801636e-06, + "loss": 0.0581, + "step": 958 + }, + { + "epoch": 0.25326474079936684, + "grad_norm": 2.384124279022217, + "learning_rate": 5.027697177525719e-06, + "loss": 0.0566, + "step": 960 + }, + { + "epoch": 0.2537923756760322, + "grad_norm": 0.8687224388122559, + "learning_rate": 5.038248483249803e-06, + "loss": 0.0579, + "step": 962 + }, + { + "epoch": 0.25432001055269754, + "grad_norm": 0.6377274990081787, + "learning_rate": 5.048799788973886e-06, + "loss": 0.054, + "step": 964 + }, + { + "epoch": 0.2548476454293629, + "grad_norm": 2.5513367652893066, + "learning_rate": 5.05935109469797e-06, + "loss": 0.0579, + "step": 966 + }, + { + "epoch": 0.25537528030602824, + "grad_norm": 2.5262365341186523, + "learning_rate": 5.069902400422053e-06, + "loss": 0.0645, + "step": 968 + }, + { + "epoch": 0.2559029151826936, + "grad_norm": 0.5681880712509155, + "learning_rate": 5.0804537061461355e-06, + "loss": 0.0548, + "step": 970 + }, + { + "epoch": 0.25643055005935894, + "grad_norm": 4.053463459014893, + "learning_rate": 5.091005011870219e-06, + "loss": 0.0517, + "step": 972 + }, + { + "epoch": 0.2569581849360243, + "grad_norm": 4.111974716186523, + "learning_rate": 5.101556317594303e-06, + "loss": 0.0639, + "step": 974 + }, + { + "epoch": 0.25748581981268964, + "grad_norm": 2.6453771591186523, + "learning_rate": 5.112107623318386e-06, + "loss": 0.0634, + "step": 976 + }, + { + "epoch": 0.258013454689355, + "grad_norm": 0.3998512029647827, + "learning_rate": 5.12265892904247e-06, + "loss": 0.0541, + "step": 978 + }, + { + "epoch": 0.2585410895660203, + "grad_norm": 0.6305781602859497, + "learning_rate": 5.133210234766553e-06, + "loss": 0.0479, + "step": 980 + }, + { + "epoch": 0.25906872444268564, + "grad_norm": 0.6801307797431946, + "learning_rate": 5.143761540490636e-06, + "loss": 0.0479, + "step": 982 + }, + { + "epoch": 0.259596359319351, + "grad_norm": 1.778153419494629, + "learning_rate": 5.15431284621472e-06, + "loss": 0.0576, + "step": 984 + }, + { + "epoch": 0.26012399419601634, + "grad_norm": 1.4113526344299316, + "learning_rate": 5.1648641519388034e-06, + "loss": 0.0553, + "step": 986 + }, + { + "epoch": 0.2606516290726817, + "grad_norm": 0.5316371917724609, + "learning_rate": 5.1754154576628855e-06, + "loss": 0.0392, + "step": 988 + }, + { + "epoch": 0.26117926394934704, + "grad_norm": 0.7120594382286072, + "learning_rate": 5.185966763386969e-06, + "loss": 0.0519, + "step": 990 + }, + { + "epoch": 0.2617068988260124, + "grad_norm": 1.1813122034072876, + "learning_rate": 5.196518069111053e-06, + "loss": 0.0651, + "step": 992 + }, + { + "epoch": 0.26223453370267774, + "grad_norm": 0.6545190811157227, + "learning_rate": 5.207069374835136e-06, + "loss": 0.0564, + "step": 994 + }, + { + "epoch": 0.2627621685793431, + "grad_norm": 0.999932587146759, + "learning_rate": 5.21762068055922e-06, + "loss": 0.0505, + "step": 996 + }, + { + "epoch": 0.26328980345600844, + "grad_norm": 1.665206789970398, + "learning_rate": 5.228171986283303e-06, + "loss": 0.0454, + "step": 998 + }, + { + "epoch": 0.2638174383326738, + "grad_norm": 0.5754594206809998, + "learning_rate": 5.238723292007386e-06, + "loss": 0.0431, + "step": 1000 + }, + { + "epoch": 0.26434507320933914, + "grad_norm": 1.2234323024749756, + "learning_rate": 5.24927459773147e-06, + "loss": 0.0479, + "step": 1002 + }, + { + "epoch": 0.2648727080860045, + "grad_norm": 1.59165620803833, + "learning_rate": 5.2598259034555534e-06, + "loss": 0.0517, + "step": 1004 + }, + { + "epoch": 0.26540034296266984, + "grad_norm": 1.801876425743103, + "learning_rate": 5.270377209179636e-06, + "loss": 0.0573, + "step": 1006 + }, + { + "epoch": 0.2659279778393352, + "grad_norm": 0.3186013996601105, + "learning_rate": 5.280928514903719e-06, + "loss": 0.0454, + "step": 1008 + }, + { + "epoch": 0.26645561271600054, + "grad_norm": 1.9515973329544067, + "learning_rate": 5.291479820627803e-06, + "loss": 0.0429, + "step": 1010 + }, + { + "epoch": 0.2669832475926659, + "grad_norm": 2.71252179145813, + "learning_rate": 5.302031126351886e-06, + "loss": 0.0451, + "step": 1012 + }, + { + "epoch": 0.26751088246933125, + "grad_norm": 0.7453064918518066, + "learning_rate": 5.31258243207597e-06, + "loss": 0.0507, + "step": 1014 + }, + { + "epoch": 0.2680385173459966, + "grad_norm": 0.30787304043769836, + "learning_rate": 5.323133737800054e-06, + "loss": 0.0385, + "step": 1016 + }, + { + "epoch": 0.2685661522226619, + "grad_norm": 0.41635483503341675, + "learning_rate": 5.333685043524136e-06, + "loss": 0.0454, + "step": 1018 + }, + { + "epoch": 0.26909378709932724, + "grad_norm": 1.9098718166351318, + "learning_rate": 5.34423634924822e-06, + "loss": 0.0421, + "step": 1020 + }, + { + "epoch": 0.2696214219759926, + "grad_norm": 0.5296719670295715, + "learning_rate": 5.3547876549723035e-06, + "loss": 0.0489, + "step": 1022 + }, + { + "epoch": 0.27014905685265794, + "grad_norm": 0.28635528683662415, + "learning_rate": 5.3653389606963864e-06, + "loss": 0.0355, + "step": 1024 + }, + { + "epoch": 0.2706766917293233, + "grad_norm": 0.7303667068481445, + "learning_rate": 5.37589026642047e-06, + "loss": 0.0463, + "step": 1026 + }, + { + "epoch": 0.27120432660598864, + "grad_norm": 1.1404387950897217, + "learning_rate": 5.386441572144553e-06, + "loss": 0.0515, + "step": 1028 + }, + { + "epoch": 0.271731961482654, + "grad_norm": 0.9421586394309998, + "learning_rate": 5.396992877868636e-06, + "loss": 0.0473, + "step": 1030 + }, + { + "epoch": 0.27225959635931934, + "grad_norm": 2.9227235317230225, + "learning_rate": 5.40754418359272e-06, + "loss": 0.0441, + "step": 1032 + }, + { + "epoch": 0.2727872312359847, + "grad_norm": 1.1050968170166016, + "learning_rate": 5.418095489316804e-06, + "loss": 0.0541, + "step": 1034 + }, + { + "epoch": 0.27331486611265005, + "grad_norm": 0.8840624094009399, + "learning_rate": 5.4286467950408876e-06, + "loss": 0.0436, + "step": 1036 + }, + { + "epoch": 0.2738425009893154, + "grad_norm": 0.7363865375518799, + "learning_rate": 5.43919810076497e-06, + "loss": 0.0434, + "step": 1038 + }, + { + "epoch": 0.27437013586598075, + "grad_norm": 1.4358688592910767, + "learning_rate": 5.4497494064890535e-06, + "loss": 0.0528, + "step": 1040 + }, + { + "epoch": 0.2748977707426461, + "grad_norm": 0.8926219940185547, + "learning_rate": 5.4603007122131365e-06, + "loss": 0.044, + "step": 1042 + }, + { + "epoch": 0.27542540561931145, + "grad_norm": 1.0466747283935547, + "learning_rate": 5.47085201793722e-06, + "loss": 0.0509, + "step": 1044 + }, + { + "epoch": 0.2759530404959768, + "grad_norm": 1.012166976928711, + "learning_rate": 5.481403323661304e-06, + "loss": 0.0359, + "step": 1046 + }, + { + "epoch": 0.27648067537264215, + "grad_norm": 1.1418012380599976, + "learning_rate": 5.491954629385386e-06, + "loss": 0.0424, + "step": 1048 + }, + { + "epoch": 0.2770083102493075, + "grad_norm": 0.5276322960853577, + "learning_rate": 5.50250593510947e-06, + "loss": 0.0465, + "step": 1050 + }, + { + "epoch": 0.27753594512597285, + "grad_norm": 1.9678146839141846, + "learning_rate": 5.513057240833554e-06, + "loss": 0.0386, + "step": 1052 + }, + { + "epoch": 0.2780635800026382, + "grad_norm": 0.2721196711063385, + "learning_rate": 5.523608546557638e-06, + "loss": 0.0397, + "step": 1054 + }, + { + "epoch": 0.2785912148793035, + "grad_norm": 0.3728731572628021, + "learning_rate": 5.5341598522817206e-06, + "loss": 0.0355, + "step": 1056 + }, + { + "epoch": 0.27911884975596885, + "grad_norm": 2.7915401458740234, + "learning_rate": 5.5447111580058035e-06, + "loss": 0.0476, + "step": 1058 + }, + { + "epoch": 0.2796464846326342, + "grad_norm": 0.6177597045898438, + "learning_rate": 5.5552624637298865e-06, + "loss": 0.0456, + "step": 1060 + }, + { + "epoch": 0.28017411950929955, + "grad_norm": 1.0268305540084839, + "learning_rate": 5.56581376945397e-06, + "loss": 0.0406, + "step": 1062 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 2.0053372383117676, + "learning_rate": 5.576365075178054e-06, + "loss": 0.041, + "step": 1064 + }, + { + "epoch": 0.28122938926263025, + "grad_norm": 1.290460228919983, + "learning_rate": 5.586916380902138e-06, + "loss": 0.0445, + "step": 1066 + }, + { + "epoch": 0.2817570241392956, + "grad_norm": 0.2744552195072174, + "learning_rate": 5.59746768662622e-06, + "loss": 0.0398, + "step": 1068 + }, + { + "epoch": 0.28228465901596095, + "grad_norm": 1.329141616821289, + "learning_rate": 5.608018992350304e-06, + "loss": 0.0422, + "step": 1070 + }, + { + "epoch": 0.2828122938926263, + "grad_norm": 0.6233449578285217, + "learning_rate": 5.618570298074388e-06, + "loss": 0.0367, + "step": 1072 + }, + { + "epoch": 0.28333992876929165, + "grad_norm": 0.2690449059009552, + "learning_rate": 5.629121603798471e-06, + "loss": 0.0445, + "step": 1074 + }, + { + "epoch": 0.283867563645957, + "grad_norm": 0.3397813141345978, + "learning_rate": 5.639672909522554e-06, + "loss": 0.032, + "step": 1076 + }, + { + "epoch": 0.28439519852262235, + "grad_norm": 1.2334909439086914, + "learning_rate": 5.6502242152466365e-06, + "loss": 0.0591, + "step": 1078 + }, + { + "epoch": 0.2849228333992877, + "grad_norm": 0.5569453835487366, + "learning_rate": 5.66077552097072e-06, + "loss": 0.0327, + "step": 1080 + }, + { + "epoch": 0.28545046827595305, + "grad_norm": 0.48858642578125, + "learning_rate": 5.671326826694804e-06, + "loss": 0.0459, + "step": 1082 + }, + { + "epoch": 0.2859781031526184, + "grad_norm": 0.9978088140487671, + "learning_rate": 5.681878132418888e-06, + "loss": 0.036, + "step": 1084 + }, + { + "epoch": 0.28650573802928375, + "grad_norm": 0.7877616286277771, + "learning_rate": 5.69242943814297e-06, + "loss": 0.0381, + "step": 1086 + }, + { + "epoch": 0.2870333729059491, + "grad_norm": 0.4482375383377075, + "learning_rate": 5.702980743867054e-06, + "loss": 0.0345, + "step": 1088 + }, + { + "epoch": 0.28756100778261445, + "grad_norm": 1.690389633178711, + "learning_rate": 5.713532049591138e-06, + "loss": 0.0432, + "step": 1090 + }, + { + "epoch": 0.2880886426592798, + "grad_norm": 1.7604840993881226, + "learning_rate": 5.724083355315221e-06, + "loss": 0.0499, + "step": 1092 + }, + { + "epoch": 0.2886162775359451, + "grad_norm": 1.0771281719207764, + "learning_rate": 5.734634661039304e-06, + "loss": 0.0526, + "step": 1094 + }, + { + "epoch": 0.28914391241261045, + "grad_norm": 0.5685479640960693, + "learning_rate": 5.7451859667633865e-06, + "loss": 0.0381, + "step": 1096 + }, + { + "epoch": 0.2896715472892758, + "grad_norm": 1.9474658966064453, + "learning_rate": 5.75573727248747e-06, + "loss": 0.0431, + "step": 1098 + }, + { + "epoch": 0.29019918216594115, + "grad_norm": 1.2233418226242065, + "learning_rate": 5.766288578211554e-06, + "loss": 0.038, + "step": 1100 + }, + { + "epoch": 0.2907268170426065, + "grad_norm": 1.0332355499267578, + "learning_rate": 5.776839883935638e-06, + "loss": 0.0316, + "step": 1102 + }, + { + "epoch": 0.29125445191927185, + "grad_norm": 0.4058944880962372, + "learning_rate": 5.787391189659721e-06, + "loss": 0.0283, + "step": 1104 + }, + { + "epoch": 0.2917820867959372, + "grad_norm": 0.6655163764953613, + "learning_rate": 5.797942495383804e-06, + "loss": 0.047, + "step": 1106 + }, + { + "epoch": 0.29230972167260255, + "grad_norm": 3.1715102195739746, + "learning_rate": 5.808493801107888e-06, + "loss": 0.0341, + "step": 1108 + }, + { + "epoch": 0.2928373565492679, + "grad_norm": 0.6909581422805786, + "learning_rate": 5.819045106831971e-06, + "loss": 0.0479, + "step": 1110 + }, + { + "epoch": 0.29336499142593325, + "grad_norm": 1.4067739248275757, + "learning_rate": 5.8295964125560544e-06, + "loss": 0.0315, + "step": 1112 + }, + { + "epoch": 0.2938926263025986, + "grad_norm": 1.1267246007919312, + "learning_rate": 5.840147718280138e-06, + "loss": 0.0349, + "step": 1114 + }, + { + "epoch": 0.29442026117926395, + "grad_norm": 0.5281479358673096, + "learning_rate": 5.85069902400422e-06, + "loss": 0.0329, + "step": 1116 + }, + { + "epoch": 0.2949478960559293, + "grad_norm": 0.552584171295166, + "learning_rate": 5.861250329728304e-06, + "loss": 0.0297, + "step": 1118 + }, + { + "epoch": 0.29547553093259465, + "grad_norm": 2.4275403022766113, + "learning_rate": 5.871801635452388e-06, + "loss": 0.0408, + "step": 1120 + }, + { + "epoch": 0.29600316580926, + "grad_norm": 0.4563150107860565, + "learning_rate": 5.882352941176471e-06, + "loss": 0.0366, + "step": 1122 + }, + { + "epoch": 0.29653080068592536, + "grad_norm": 2.212120532989502, + "learning_rate": 5.892904246900555e-06, + "loss": 0.0328, + "step": 1124 + }, + { + "epoch": 0.2970584355625907, + "grad_norm": 1.6902319192886353, + "learning_rate": 5.903455552624638e-06, + "loss": 0.0368, + "step": 1126 + }, + { + "epoch": 0.29758607043925606, + "grad_norm": 0.8245353698730469, + "learning_rate": 5.914006858348721e-06, + "loss": 0.0326, + "step": 1128 + }, + { + "epoch": 0.2981137053159214, + "grad_norm": 4.051249027252197, + "learning_rate": 5.9245581640728045e-06, + "loss": 0.0327, + "step": 1130 + }, + { + "epoch": 0.29864134019258676, + "grad_norm": 0.36794641613960266, + "learning_rate": 5.935109469796888e-06, + "loss": 0.0333, + "step": 1132 + }, + { + "epoch": 0.29916897506925205, + "grad_norm": 0.5569996237754822, + "learning_rate": 5.945660775520972e-06, + "loss": 0.0302, + "step": 1134 + }, + { + "epoch": 0.2996966099459174, + "grad_norm": 0.23477977514266968, + "learning_rate": 5.956212081245054e-06, + "loss": 0.0249, + "step": 1136 + }, + { + "epoch": 0.30022424482258275, + "grad_norm": 0.8122280240058899, + "learning_rate": 5.966763386969138e-06, + "loss": 0.0387, + "step": 1138 + }, + { + "epoch": 0.3007518796992481, + "grad_norm": 1.7707135677337646, + "learning_rate": 5.977314692693221e-06, + "loss": 0.0554, + "step": 1140 + }, + { + "epoch": 0.30127951457591345, + "grad_norm": 3.3390674591064453, + "learning_rate": 5.987865998417305e-06, + "loss": 0.0308, + "step": 1142 + }, + { + "epoch": 0.3018071494525788, + "grad_norm": 0.3645137548446655, + "learning_rate": 5.9984173041413886e-06, + "loss": 0.0288, + "step": 1144 + }, + { + "epoch": 0.30233478432924416, + "grad_norm": 0.3672318458557129, + "learning_rate": 6.008968609865471e-06, + "loss": 0.029, + "step": 1146 + }, + { + "epoch": 0.3028624192059095, + "grad_norm": 1.172485113143921, + "learning_rate": 6.0195199155895545e-06, + "loss": 0.0345, + "step": 1148 + }, + { + "epoch": 0.30339005408257486, + "grad_norm": 0.626430332660675, + "learning_rate": 6.030071221313638e-06, + "loss": 0.037, + "step": 1150 + }, + { + "epoch": 0.3039176889592402, + "grad_norm": 0.9359550476074219, + "learning_rate": 6.040622527037722e-06, + "loss": 0.0275, + "step": 1152 + }, + { + "epoch": 0.30444532383590556, + "grad_norm": 1.4612756967544556, + "learning_rate": 6.051173832761805e-06, + "loss": 0.0346, + "step": 1154 + }, + { + "epoch": 0.3049729587125709, + "grad_norm": 0.30720800161361694, + "learning_rate": 6.061725138485888e-06, + "loss": 0.0266, + "step": 1156 + }, + { + "epoch": 0.30550059358923626, + "grad_norm": 1.444648027420044, + "learning_rate": 6.072276444209971e-06, + "loss": 0.0346, + "step": 1158 + }, + { + "epoch": 0.3060282284659016, + "grad_norm": 0.8414546251296997, + "learning_rate": 6.082827749934055e-06, + "loss": 0.0291, + "step": 1160 + }, + { + "epoch": 0.30655586334256696, + "grad_norm": 0.44687145948410034, + "learning_rate": 6.093379055658139e-06, + "loss": 0.025, + "step": 1162 + }, + { + "epoch": 0.3070834982192323, + "grad_norm": 0.8709312677383423, + "learning_rate": 6.103930361382222e-06, + "loss": 0.0322, + "step": 1164 + }, + { + "epoch": 0.30761113309589766, + "grad_norm": 0.6889014840126038, + "learning_rate": 6.1144816671063045e-06, + "loss": 0.0267, + "step": 1166 + }, + { + "epoch": 0.308138767972563, + "grad_norm": 0.5081378817558289, + "learning_rate": 6.125032972830388e-06, + "loss": 0.025, + "step": 1168 + }, + { + "epoch": 0.30866640284922836, + "grad_norm": 1.6178334951400757, + "learning_rate": 6.135584278554472e-06, + "loss": 0.0278, + "step": 1170 + }, + { + "epoch": 0.30919403772589366, + "grad_norm": 0.4411400854587555, + "learning_rate": 6.146135584278555e-06, + "loss": 0.0281, + "step": 1172 + }, + { + "epoch": 0.309721672602559, + "grad_norm": 1.9489634037017822, + "learning_rate": 6.156686890002639e-06, + "loss": 0.0489, + "step": 1174 + }, + { + "epoch": 0.31024930747922436, + "grad_norm": 0.6178317666053772, + "learning_rate": 6.167238195726721e-06, + "loss": 0.033, + "step": 1176 + }, + { + "epoch": 0.3107769423558897, + "grad_norm": 1.085808277130127, + "learning_rate": 6.177789501450805e-06, + "loss": 0.0382, + "step": 1178 + }, + { + "epoch": 0.31130457723255506, + "grad_norm": 1.4383749961853027, + "learning_rate": 6.188340807174889e-06, + "loss": 0.0396, + "step": 1180 + }, + { + "epoch": 0.3118322121092204, + "grad_norm": 0.5503590703010559, + "learning_rate": 6.198892112898972e-06, + "loss": 0.0459, + "step": 1182 + }, + { + "epoch": 0.31235984698588576, + "grad_norm": 0.2518857717514038, + "learning_rate": 6.209443418623055e-06, + "loss": 0.025, + "step": 1184 + }, + { + "epoch": 0.3128874818625511, + "grad_norm": 0.37523695826530457, + "learning_rate": 6.219994724347138e-06, + "loss": 0.0259, + "step": 1186 + }, + { + "epoch": 0.31341511673921646, + "grad_norm": 2.0915467739105225, + "learning_rate": 6.230546030071222e-06, + "loss": 0.0333, + "step": 1188 + }, + { + "epoch": 0.3139427516158818, + "grad_norm": 0.686527669429779, + "learning_rate": 6.241097335795305e-06, + "loss": 0.0379, + "step": 1190 + }, + { + "epoch": 0.31447038649254716, + "grad_norm": 0.5755025148391724, + "learning_rate": 6.251648641519389e-06, + "loss": 0.0345, + "step": 1192 + }, + { + "epoch": 0.3149980213692125, + "grad_norm": 9.857476234436035, + "learning_rate": 6.262199947243471e-06, + "loss": 0.0301, + "step": 1194 + }, + { + "epoch": 0.31552565624587786, + "grad_norm": 0.3449013829231262, + "learning_rate": 6.272751252967555e-06, + "loss": 0.0304, + "step": 1196 + }, + { + "epoch": 0.3160532911225432, + "grad_norm": 0.15941639244556427, + "learning_rate": 6.283302558691639e-06, + "loss": 0.0244, + "step": 1198 + }, + { + "epoch": 0.31658092599920856, + "grad_norm": 1.7402324676513672, + "learning_rate": 6.2938538644157224e-06, + "loss": 0.0261, + "step": 1200 + }, + { + "epoch": 0.3171085608758739, + "grad_norm": 0.6643465757369995, + "learning_rate": 6.304405170139805e-06, + "loss": 0.0405, + "step": 1202 + }, + { + "epoch": 0.31763619575253926, + "grad_norm": 1.147955298423767, + "learning_rate": 6.314956475863888e-06, + "loss": 0.0326, + "step": 1204 + }, + { + "epoch": 0.3181638306292046, + "grad_norm": 0.46380797028541565, + "learning_rate": 6.325507781587972e-06, + "loss": 0.0399, + "step": 1206 + }, + { + "epoch": 0.31869146550586996, + "grad_norm": 0.6398585438728333, + "learning_rate": 6.336059087312055e-06, + "loss": 0.0254, + "step": 1208 + }, + { + "epoch": 0.31921910038253526, + "grad_norm": 0.9494512677192688, + "learning_rate": 6.346610393036139e-06, + "loss": 0.0268, + "step": 1210 + }, + { + "epoch": 0.3197467352592006, + "grad_norm": 0.3231871724128723, + "learning_rate": 6.357161698760223e-06, + "loss": 0.0216, + "step": 1212 + }, + { + "epoch": 0.32027437013586596, + "grad_norm": 0.267789751291275, + "learning_rate": 6.367713004484305e-06, + "loss": 0.0282, + "step": 1214 + }, + { + "epoch": 0.3208020050125313, + "grad_norm": 0.8526958227157593, + "learning_rate": 6.378264310208389e-06, + "loss": 0.0333, + "step": 1216 + }, + { + "epoch": 0.32132963988919666, + "grad_norm": 0.27739831805229187, + "learning_rate": 6.3888156159324725e-06, + "loss": 0.032, + "step": 1218 + }, + { + "epoch": 0.321857274765862, + "grad_norm": 0.6286695003509521, + "learning_rate": 6.399366921656555e-06, + "loss": 0.027, + "step": 1220 + }, + { + "epoch": 0.32238490964252736, + "grad_norm": 0.5061628222465515, + "learning_rate": 6.409918227380639e-06, + "loss": 0.0239, + "step": 1222 + }, + { + "epoch": 0.3229125445191927, + "grad_norm": 0.15107756853103638, + "learning_rate": 6.420469533104722e-06, + "loss": 0.0235, + "step": 1224 + }, + { + "epoch": 0.32344017939585806, + "grad_norm": 0.3665449321269989, + "learning_rate": 6.431020838828805e-06, + "loss": 0.0219, + "step": 1226 + }, + { + "epoch": 0.3239678142725234, + "grad_norm": 0.2509494721889496, + "learning_rate": 6.441572144552889e-06, + "loss": 0.025, + "step": 1228 + }, + { + "epoch": 0.32449544914918876, + "grad_norm": 0.6836773753166199, + "learning_rate": 6.452123450276973e-06, + "loss": 0.0414, + "step": 1230 + }, + { + "epoch": 0.3250230840258541, + "grad_norm": 0.27808094024658203, + "learning_rate": 6.462674756001056e-06, + "loss": 0.0213, + "step": 1232 + }, + { + "epoch": 0.32555071890251946, + "grad_norm": 1.507602334022522, + "learning_rate": 6.473226061725139e-06, + "loss": 0.0327, + "step": 1234 + }, + { + "epoch": 0.3260783537791848, + "grad_norm": 0.19352573156356812, + "learning_rate": 6.4837773674492225e-06, + "loss": 0.0237, + "step": 1236 + }, + { + "epoch": 0.32660598865585017, + "grad_norm": 0.5673214793205261, + "learning_rate": 6.4943286731733054e-06, + "loss": 0.0201, + "step": 1238 + }, + { + "epoch": 0.3271336235325155, + "grad_norm": 0.27252402901649475, + "learning_rate": 6.504879978897389e-06, + "loss": 0.0184, + "step": 1240 + }, + { + "epoch": 0.32766125840918087, + "grad_norm": 1.15165376663208, + "learning_rate": 6.515431284621473e-06, + "loss": 0.0322, + "step": 1242 + }, + { + "epoch": 0.3281888932858462, + "grad_norm": 0.39212268590927124, + "learning_rate": 6.525982590345555e-06, + "loss": 0.0237, + "step": 1244 + }, + { + "epoch": 0.32871652816251157, + "grad_norm": 0.5088096261024475, + "learning_rate": 6.536533896069639e-06, + "loss": 0.0295, + "step": 1246 + }, + { + "epoch": 0.32924416303917686, + "grad_norm": 0.502945065498352, + "learning_rate": 6.547085201793723e-06, + "loss": 0.0329, + "step": 1248 + }, + { + "epoch": 0.3297717979158422, + "grad_norm": 0.45043301582336426, + "learning_rate": 6.557636507517806e-06, + "loss": 0.0247, + "step": 1250 + }, + { + "epoch": 0.33029943279250756, + "grad_norm": 0.299750953912735, + "learning_rate": 6.5681878132418895e-06, + "loss": 0.0205, + "step": 1252 + }, + { + "epoch": 0.3308270676691729, + "grad_norm": 0.2815335988998413, + "learning_rate": 6.5787391189659725e-06, + "loss": 0.0275, + "step": 1254 + }, + { + "epoch": 0.33135470254583826, + "grad_norm": 1.3794933557510376, + "learning_rate": 6.5892904246900555e-06, + "loss": 0.0296, + "step": 1256 + }, + { + "epoch": 0.3318823374225036, + "grad_norm": 1.089917540550232, + "learning_rate": 6.599841730414139e-06, + "loss": 0.0395, + "step": 1258 + }, + { + "epoch": 0.33240997229916897, + "grad_norm": 0.7570661306381226, + "learning_rate": 6.610393036138223e-06, + "loss": 0.0321, + "step": 1260 + }, + { + "epoch": 0.3329376071758343, + "grad_norm": 0.9091094136238098, + "learning_rate": 6.620944341862307e-06, + "loss": 0.0397, + "step": 1262 + }, + { + "epoch": 0.33346524205249967, + "grad_norm": 0.2698676884174347, + "learning_rate": 6.631495647586389e-06, + "loss": 0.031, + "step": 1264 + }, + { + "epoch": 0.333992876929165, + "grad_norm": 0.4132876396179199, + "learning_rate": 6.642046953310473e-06, + "loss": 0.0188, + "step": 1266 + }, + { + "epoch": 0.33452051180583037, + "grad_norm": 0.3041493892669678, + "learning_rate": 6.652598259034556e-06, + "loss": 0.0267, + "step": 1268 + }, + { + "epoch": 0.3350481466824957, + "grad_norm": 0.3686787784099579, + "learning_rate": 6.6631495647586396e-06, + "loss": 0.0306, + "step": 1270 + }, + { + "epoch": 0.33557578155916107, + "grad_norm": 0.3083116114139557, + "learning_rate": 6.673700870482723e-06, + "loss": 0.0299, + "step": 1272 + }, + { + "epoch": 0.3361034164358264, + "grad_norm": 0.8073036670684814, + "learning_rate": 6.6842521762068055e-06, + "loss": 0.0299, + "step": 1274 + }, + { + "epoch": 0.33663105131249177, + "grad_norm": 0.8422223329544067, + "learning_rate": 6.694803481930889e-06, + "loss": 0.037, + "step": 1276 + }, + { + "epoch": 0.3371586861891571, + "grad_norm": 0.3019554615020752, + "learning_rate": 6.705354787654973e-06, + "loss": 0.0459, + "step": 1278 + }, + { + "epoch": 0.33768632106582247, + "grad_norm": 1.9652557373046875, + "learning_rate": 6.715906093379057e-06, + "loss": 0.0316, + "step": 1280 + }, + { + "epoch": 0.3382139559424878, + "grad_norm": 2.2743470668792725, + "learning_rate": 6.72645739910314e-06, + "loss": 0.0289, + "step": 1282 + }, + { + "epoch": 0.33874159081915317, + "grad_norm": 0.6434329152107239, + "learning_rate": 6.737008704827223e-06, + "loss": 0.0304, + "step": 1284 + }, + { + "epoch": 0.33926922569581847, + "grad_norm": 1.6779897212982178, + "learning_rate": 6.747560010551306e-06, + "loss": 0.0271, + "step": 1286 + }, + { + "epoch": 0.3397968605724838, + "grad_norm": 1.3447341918945312, + "learning_rate": 6.75811131627539e-06, + "loss": 0.0291, + "step": 1288 + }, + { + "epoch": 0.34032449544914917, + "grad_norm": 0.391597181558609, + "learning_rate": 6.768662621999473e-06, + "loss": 0.0251, + "step": 1290 + }, + { + "epoch": 0.3408521303258145, + "grad_norm": 1.6673383712768555, + "learning_rate": 6.7792139277235555e-06, + "loss": 0.0318, + "step": 1292 + }, + { + "epoch": 0.34137976520247987, + "grad_norm": 0.7751551866531372, + "learning_rate": 6.789765233447639e-06, + "loss": 0.0267, + "step": 1294 + }, + { + "epoch": 0.3419074000791452, + "grad_norm": 1.173416018486023, + "learning_rate": 6.800316539171723e-06, + "loss": 0.027, + "step": 1296 + }, + { + "epoch": 0.34243503495581057, + "grad_norm": 0.8570774793624878, + "learning_rate": 6.810867844895807e-06, + "loss": 0.0318, + "step": 1298 + }, + { + "epoch": 0.3429626698324759, + "grad_norm": 2.196826696395874, + "learning_rate": 6.82141915061989e-06, + "loss": 0.0301, + "step": 1300 + }, + { + "epoch": 0.34349030470914127, + "grad_norm": 0.953108549118042, + "learning_rate": 6.831970456343973e-06, + "loss": 0.0255, + "step": 1302 + }, + { + "epoch": 0.3440179395858066, + "grad_norm": 0.4593990743160248, + "learning_rate": 6.842521762068056e-06, + "loss": 0.0194, + "step": 1304 + }, + { + "epoch": 0.34454557446247197, + "grad_norm": 0.6084488034248352, + "learning_rate": 6.85307306779214e-06, + "loss": 0.029, + "step": 1306 + }, + { + "epoch": 0.3450732093391373, + "grad_norm": 1.5489630699157715, + "learning_rate": 6.863624373516223e-06, + "loss": 0.0374, + "step": 1308 + }, + { + "epoch": 0.34560084421580267, + "grad_norm": 0.2551247775554657, + "learning_rate": 6.874175679240307e-06, + "loss": 0.0202, + "step": 1310 + }, + { + "epoch": 0.346128479092468, + "grad_norm": 0.9257856011390686, + "learning_rate": 6.884726984964389e-06, + "loss": 0.0354, + "step": 1312 + }, + { + "epoch": 0.3466561139691334, + "grad_norm": 0.31185540556907654, + "learning_rate": 6.895278290688473e-06, + "loss": 0.0312, + "step": 1314 + }, + { + "epoch": 0.3471837488457987, + "grad_norm": 0.35571998357772827, + "learning_rate": 6.905829596412557e-06, + "loss": 0.0282, + "step": 1316 + }, + { + "epoch": 0.3477113837224641, + "grad_norm": 0.35175853967666626, + "learning_rate": 6.91638090213664e-06, + "loss": 0.0187, + "step": 1318 + }, + { + "epoch": 0.3482390185991294, + "grad_norm": 0.13657696545124054, + "learning_rate": 6.926932207860724e-06, + "loss": 0.0212, + "step": 1320 + }, + { + "epoch": 0.3487666534757948, + "grad_norm": 0.21234242618083954, + "learning_rate": 6.937483513584806e-06, + "loss": 0.0191, + "step": 1322 + }, + { + "epoch": 0.3492942883524601, + "grad_norm": 0.5921971201896667, + "learning_rate": 6.94803481930889e-06, + "loss": 0.0196, + "step": 1324 + }, + { + "epoch": 0.3498219232291254, + "grad_norm": 0.5949119329452515, + "learning_rate": 6.9585861250329734e-06, + "loss": 0.0225, + "step": 1326 + }, + { + "epoch": 0.35034955810579077, + "grad_norm": 0.6263622045516968, + "learning_rate": 6.969137430757057e-06, + "loss": 0.021, + "step": 1328 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 0.3750636577606201, + "learning_rate": 6.97968873648114e-06, + "loss": 0.0283, + "step": 1330 + }, + { + "epoch": 0.35140482785912147, + "grad_norm": 0.18606123328208923, + "learning_rate": 6.990240042205223e-06, + "loss": 0.0222, + "step": 1332 + }, + { + "epoch": 0.3519324627357868, + "grad_norm": 0.41302475333213806, + "learning_rate": 7.000791347929307e-06, + "loss": 0.0164, + "step": 1334 + }, + { + "epoch": 0.3524600976124522, + "grad_norm": 0.617996096611023, + "learning_rate": 7.01134265365339e-06, + "loss": 0.0185, + "step": 1336 + }, + { + "epoch": 0.3529877324891175, + "grad_norm": 0.5795587301254272, + "learning_rate": 7.021893959377474e-06, + "loss": 0.0257, + "step": 1338 + }, + { + "epoch": 0.3535153673657829, + "grad_norm": 0.9421643018722534, + "learning_rate": 7.0324452651015575e-06, + "loss": 0.0206, + "step": 1340 + }, + { + "epoch": 0.3540430022424482, + "grad_norm": 0.6646486520767212, + "learning_rate": 7.04299657082564e-06, + "loss": 0.0226, + "step": 1342 + }, + { + "epoch": 0.3545706371191136, + "grad_norm": 0.4665279686450958, + "learning_rate": 7.0535478765497235e-06, + "loss": 0.0157, + "step": 1344 + }, + { + "epoch": 0.3550982719957789, + "grad_norm": 0.7262210845947266, + "learning_rate": 7.064099182273807e-06, + "loss": 0.0278, + "step": 1346 + }, + { + "epoch": 0.3556259068724443, + "grad_norm": 0.4123990535736084, + "learning_rate": 7.07465048799789e-06, + "loss": 0.0207, + "step": 1348 + }, + { + "epoch": 0.3561535417491096, + "grad_norm": 2.3166403770446777, + "learning_rate": 7.085201793721974e-06, + "loss": 0.0255, + "step": 1350 + }, + { + "epoch": 0.356681176625775, + "grad_norm": 0.22120358049869537, + "learning_rate": 7.095753099446057e-06, + "loss": 0.0169, + "step": 1352 + }, + { + "epoch": 0.3572088115024403, + "grad_norm": 0.5288012623786926, + "learning_rate": 7.10630440517014e-06, + "loss": 0.0225, + "step": 1354 + }, + { + "epoch": 0.3577364463791057, + "grad_norm": 0.226117804646492, + "learning_rate": 7.116855710894224e-06, + "loss": 0.0143, + "step": 1356 + }, + { + "epoch": 0.358264081255771, + "grad_norm": 2.1472008228302, + "learning_rate": 7.1274070166183076e-06, + "loss": 0.0326, + "step": 1358 + }, + { + "epoch": 0.3587917161324364, + "grad_norm": 0.11927317082881927, + "learning_rate": 7.137958322342391e-06, + "loss": 0.0211, + "step": 1360 + }, + { + "epoch": 0.35931935100910173, + "grad_norm": 0.3603033721446991, + "learning_rate": 7.1485096280664735e-06, + "loss": 0.0167, + "step": 1362 + }, + { + "epoch": 0.359846985885767, + "grad_norm": 0.1086275577545166, + "learning_rate": 7.159060933790557e-06, + "loss": 0.0164, + "step": 1364 + }, + { + "epoch": 0.3603746207624324, + "grad_norm": 0.249683678150177, + "learning_rate": 7.16961223951464e-06, + "loss": 0.016, + "step": 1366 + }, + { + "epoch": 0.3609022556390977, + "grad_norm": 0.11322273313999176, + "learning_rate": 7.180163545238724e-06, + "loss": 0.0138, + "step": 1368 + }, + { + "epoch": 0.3614298905157631, + "grad_norm": 0.3108185827732086, + "learning_rate": 7.190714850962808e-06, + "loss": 0.0159, + "step": 1370 + }, + { + "epoch": 0.3619575253924284, + "grad_norm": 0.2231854945421219, + "learning_rate": 7.20126615668689e-06, + "loss": 0.0218, + "step": 1372 + }, + { + "epoch": 0.3624851602690938, + "grad_norm": 0.7790166735649109, + "learning_rate": 7.211817462410974e-06, + "loss": 0.0143, + "step": 1374 + }, + { + "epoch": 0.3630127951457591, + "grad_norm": 0.5269120931625366, + "learning_rate": 7.222368768135058e-06, + "loss": 0.0182, + "step": 1376 + }, + { + "epoch": 0.3635404300224245, + "grad_norm": 0.23970767855644226, + "learning_rate": 7.232920073859141e-06, + "loss": 0.0148, + "step": 1378 + }, + { + "epoch": 0.3640680648990898, + "grad_norm": 1.2864912748336792, + "learning_rate": 7.243471379583224e-06, + "loss": 0.0194, + "step": 1380 + }, + { + "epoch": 0.3645956997757552, + "grad_norm": 0.1354512721300125, + "learning_rate": 7.254022685307307e-06, + "loss": 0.02, + "step": 1382 + }, + { + "epoch": 0.36512333465242053, + "grad_norm": 0.1514151394367218, + "learning_rate": 7.26457399103139e-06, + "loss": 0.0141, + "step": 1384 + }, + { + "epoch": 0.3656509695290859, + "grad_norm": 1.5703681707382202, + "learning_rate": 7.275125296755474e-06, + "loss": 0.0236, + "step": 1386 + }, + { + "epoch": 0.36617860440575123, + "grad_norm": 0.13251201808452606, + "learning_rate": 7.285676602479558e-06, + "loss": 0.0135, + "step": 1388 + }, + { + "epoch": 0.3667062392824166, + "grad_norm": 0.5599107146263123, + "learning_rate": 7.296227908203642e-06, + "loss": 0.0197, + "step": 1390 + }, + { + "epoch": 0.36723387415908193, + "grad_norm": 0.45969724655151367, + "learning_rate": 7.306779213927724e-06, + "loss": 0.0212, + "step": 1392 + }, + { + "epoch": 0.3677615090357473, + "grad_norm": 0.3939071297645569, + "learning_rate": 7.317330519651808e-06, + "loss": 0.0193, + "step": 1394 + }, + { + "epoch": 0.36828914391241263, + "grad_norm": 0.35675230622291565, + "learning_rate": 7.327881825375891e-06, + "loss": 0.0173, + "step": 1396 + }, + { + "epoch": 0.368816778789078, + "grad_norm": 0.7565270662307739, + "learning_rate": 7.338433131099974e-06, + "loss": 0.0149, + "step": 1398 + }, + { + "epoch": 0.36934441366574333, + "grad_norm": 1.4832677841186523, + "learning_rate": 7.348984436824057e-06, + "loss": 0.0219, + "step": 1400 + }, + { + "epoch": 0.3698720485424086, + "grad_norm": 0.36017075181007385, + "learning_rate": 7.35953574254814e-06, + "loss": 0.0158, + "step": 1402 + }, + { + "epoch": 0.370399683419074, + "grad_norm": 0.0964527428150177, + "learning_rate": 7.370087048272224e-06, + "loss": 0.0165, + "step": 1404 + }, + { + "epoch": 0.37092731829573933, + "grad_norm": 0.13446801900863647, + "learning_rate": 7.380638353996308e-06, + "loss": 0.0162, + "step": 1406 + }, + { + "epoch": 0.3714549531724047, + "grad_norm": 0.24998094141483307, + "learning_rate": 7.391189659720392e-06, + "loss": 0.0334, + "step": 1408 + }, + { + "epoch": 0.37198258804907003, + "grad_norm": 0.26966410875320435, + "learning_rate": 7.401740965444474e-06, + "loss": 0.0127, + "step": 1410 + }, + { + "epoch": 0.3725102229257354, + "grad_norm": 0.6430260539054871, + "learning_rate": 7.412292271168558e-06, + "loss": 0.0207, + "step": 1412 + }, + { + "epoch": 0.37303785780240073, + "grad_norm": 1.5259071588516235, + "learning_rate": 7.4228435768926414e-06, + "loss": 0.0258, + "step": 1414 + }, + { + "epoch": 0.3735654926790661, + "grad_norm": 1.9842904806137085, + "learning_rate": 7.433394882616724e-06, + "loss": 0.0186, + "step": 1416 + }, + { + "epoch": 0.37409312755573143, + "grad_norm": 0.5760218501091003, + "learning_rate": 7.443946188340808e-06, + "loss": 0.0186, + "step": 1418 + }, + { + "epoch": 0.3746207624323968, + "grad_norm": 0.364391028881073, + "learning_rate": 7.45449749406489e-06, + "loss": 0.0248, + "step": 1420 + }, + { + "epoch": 0.37514839730906213, + "grad_norm": 0.7437575459480286, + "learning_rate": 7.465048799788974e-06, + "loss": 0.0304, + "step": 1422 + }, + { + "epoch": 0.3756760321857275, + "grad_norm": 0.43987399339675903, + "learning_rate": 7.475600105513058e-06, + "loss": 0.0138, + "step": 1424 + }, + { + "epoch": 0.37620366706239283, + "grad_norm": 0.5894337892532349, + "learning_rate": 7.486151411237142e-06, + "loss": 0.0184, + "step": 1426 + }, + { + "epoch": 0.3767313019390582, + "grad_norm": 0.3034588694572449, + "learning_rate": 7.496702716961225e-06, + "loss": 0.014, + "step": 1428 + }, + { + "epoch": 0.37725893681572353, + "grad_norm": 0.7622604966163635, + "learning_rate": 7.507254022685308e-06, + "loss": 0.0218, + "step": 1430 + }, + { + "epoch": 0.3777865716923889, + "grad_norm": 0.9435436725616455, + "learning_rate": 7.5178053284093915e-06, + "loss": 0.0251, + "step": 1432 + }, + { + "epoch": 0.37831420656905423, + "grad_norm": 1.715097427368164, + "learning_rate": 7.528356634133474e-06, + "loss": 0.0169, + "step": 1434 + }, + { + "epoch": 0.3788418414457196, + "grad_norm": 0.6785725355148315, + "learning_rate": 7.538907939857558e-06, + "loss": 0.021, + "step": 1436 + }, + { + "epoch": 0.37936947632238494, + "grad_norm": 0.13539087772369385, + "learning_rate": 7.549459245581642e-06, + "loss": 0.013, + "step": 1438 + }, + { + "epoch": 0.37989711119905023, + "grad_norm": 0.6980010867118835, + "learning_rate": 7.560010551305724e-06, + "loss": 0.0219, + "step": 1440 + }, + { + "epoch": 0.3804247460757156, + "grad_norm": 0.18312445282936096, + "learning_rate": 7.570561857029808e-06, + "loss": 0.0157, + "step": 1442 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.4611237645149231, + "learning_rate": 7.581113162753892e-06, + "loss": 0.0137, + "step": 1444 + }, + { + "epoch": 0.3814800158290463, + "grad_norm": 1.5063396692276, + "learning_rate": 7.591664468477975e-06, + "loss": 0.033, + "step": 1446 + }, + { + "epoch": 0.38200765070571163, + "grad_norm": 0.7286021709442139, + "learning_rate": 7.6022157742020585e-06, + "loss": 0.014, + "step": 1448 + }, + { + "epoch": 0.382535285582377, + "grad_norm": 0.27706024050712585, + "learning_rate": 7.6127670799261415e-06, + "loss": 0.0328, + "step": 1450 + }, + { + "epoch": 0.38306292045904233, + "grad_norm": 0.36290669441223145, + "learning_rate": 7.6233183856502244e-06, + "loss": 0.0165, + "step": 1452 + }, + { + "epoch": 0.3835905553357077, + "grad_norm": 0.49073120951652527, + "learning_rate": 7.633869691374307e-06, + "loss": 0.0208, + "step": 1454 + }, + { + "epoch": 0.38411819021237303, + "grad_norm": 0.10496660321950912, + "learning_rate": 7.644420997098392e-06, + "loss": 0.013, + "step": 1456 + }, + { + "epoch": 0.3846458250890384, + "grad_norm": 0.30058005452156067, + "learning_rate": 7.654972302822475e-06, + "loss": 0.0131, + "step": 1458 + }, + { + "epoch": 0.38517345996570374, + "grad_norm": 0.11778497695922852, + "learning_rate": 7.665523608546558e-06, + "loss": 0.0181, + "step": 1460 + }, + { + "epoch": 0.3857010948423691, + "grad_norm": 0.9148674607276917, + "learning_rate": 7.676074914270641e-06, + "loss": 0.0184, + "step": 1462 + }, + { + "epoch": 0.38622872971903444, + "grad_norm": 0.6032678484916687, + "learning_rate": 7.686626219994726e-06, + "loss": 0.0253, + "step": 1464 + }, + { + "epoch": 0.3867563645956998, + "grad_norm": 0.7819039225578308, + "learning_rate": 7.697177525718809e-06, + "loss": 0.0164, + "step": 1466 + }, + { + "epoch": 0.38728399947236514, + "grad_norm": 0.11761981248855591, + "learning_rate": 7.707728831442891e-06, + "loss": 0.011, + "step": 1468 + }, + { + "epoch": 0.3878116343490305, + "grad_norm": 2.6420845985412598, + "learning_rate": 7.718280137166974e-06, + "loss": 0.0227, + "step": 1470 + }, + { + "epoch": 0.38833926922569584, + "grad_norm": 0.08210376650094986, + "learning_rate": 7.728831442891057e-06, + "loss": 0.0116, + "step": 1472 + }, + { + "epoch": 0.3888669041023612, + "grad_norm": 0.29762938618659973, + "learning_rate": 7.739382748615142e-06, + "loss": 0.0127, + "step": 1474 + }, + { + "epoch": 0.38939453897902654, + "grad_norm": 0.28734999895095825, + "learning_rate": 7.749934054339225e-06, + "loss": 0.0135, + "step": 1476 + }, + { + "epoch": 0.38992217385569183, + "grad_norm": 0.23187600076198578, + "learning_rate": 7.76048536006331e-06, + "loss": 0.0228, + "step": 1478 + }, + { + "epoch": 0.3904498087323572, + "grad_norm": 0.7318201661109924, + "learning_rate": 7.771036665787391e-06, + "loss": 0.0251, + "step": 1480 + }, + { + "epoch": 0.39097744360902253, + "grad_norm": 0.26413795351982117, + "learning_rate": 7.781587971511476e-06, + "loss": 0.0173, + "step": 1482 + }, + { + "epoch": 0.3915050784856879, + "grad_norm": 0.7819066047668457, + "learning_rate": 7.792139277235559e-06, + "loss": 0.0157, + "step": 1484 + }, + { + "epoch": 0.39203271336235324, + "grad_norm": 0.2174564003944397, + "learning_rate": 7.802690582959642e-06, + "loss": 0.0221, + "step": 1486 + }, + { + "epoch": 0.3925603482390186, + "grad_norm": 0.7904090285301208, + "learning_rate": 7.813241888683726e-06, + "loss": 0.0196, + "step": 1488 + }, + { + "epoch": 0.39308798311568394, + "grad_norm": 0.3852689862251282, + "learning_rate": 7.823793194407807e-06, + "loss": 0.0146, + "step": 1490 + }, + { + "epoch": 0.3936156179923493, + "grad_norm": 0.4111071527004242, + "learning_rate": 7.834344500131892e-06, + "loss": 0.0124, + "step": 1492 + }, + { + "epoch": 0.39414325286901464, + "grad_norm": 0.436271071434021, + "learning_rate": 7.844895805855975e-06, + "loss": 0.0239, + "step": 1494 + }, + { + "epoch": 0.39467088774568, + "grad_norm": 0.6265550851821899, + "learning_rate": 7.85544711158006e-06, + "loss": 0.0153, + "step": 1496 + }, + { + "epoch": 0.39519852262234534, + "grad_norm": 0.884762704372406, + "learning_rate": 7.865998417304141e-06, + "loss": 0.0154, + "step": 1498 + }, + { + "epoch": 0.3957261574990107, + "grad_norm": 0.10235284268856049, + "learning_rate": 7.876549723028226e-06, + "loss": 0.0106, + "step": 1500 + }, + { + "epoch": 0.39625379237567604, + "grad_norm": 0.27923789620399475, + "learning_rate": 7.887101028752309e-06, + "loss": 0.0279, + "step": 1502 + }, + { + "epoch": 0.3967814272523414, + "grad_norm": 1.2849159240722656, + "learning_rate": 7.897652334476392e-06, + "loss": 0.0403, + "step": 1504 + }, + { + "epoch": 0.39730906212900674, + "grad_norm": 0.17274488508701324, + "learning_rate": 7.908203640200476e-06, + "loss": 0.02, + "step": 1506 + }, + { + "epoch": 0.3978366970056721, + "grad_norm": 0.13662804663181305, + "learning_rate": 7.918754945924557e-06, + "loss": 0.0109, + "step": 1508 + }, + { + "epoch": 0.39836433188233744, + "grad_norm": 0.9093418121337891, + "learning_rate": 7.929306251648642e-06, + "loss": 0.0158, + "step": 1510 + }, + { + "epoch": 0.3988919667590028, + "grad_norm": 2.3841021060943604, + "learning_rate": 7.939857557372725e-06, + "loss": 0.0266, + "step": 1512 + }, + { + "epoch": 0.39941960163566814, + "grad_norm": 0.6121453642845154, + "learning_rate": 7.95040886309681e-06, + "loss": 0.0183, + "step": 1514 + }, + { + "epoch": 0.3999472365123335, + "grad_norm": 0.4481624364852905, + "learning_rate": 7.960960168820893e-06, + "loss": 0.0228, + "step": 1516 + }, + { + "epoch": 0.4004748713889988, + "grad_norm": 0.16407015919685364, + "learning_rate": 7.971511474544976e-06, + "loss": 0.0219, + "step": 1518 + }, + { + "epoch": 0.40100250626566414, + "grad_norm": 0.16393853724002838, + "learning_rate": 7.982062780269059e-06, + "loss": 0.0169, + "step": 1520 + }, + { + "epoch": 0.4015301411423295, + "grad_norm": 0.7462701201438904, + "learning_rate": 7.992614085993142e-06, + "loss": 0.0113, + "step": 1522 + }, + { + "epoch": 0.40205777601899484, + "grad_norm": 0.28890034556388855, + "learning_rate": 8.003165391717226e-06, + "loss": 0.0143, + "step": 1524 + }, + { + "epoch": 0.4025854108956602, + "grad_norm": 0.5456112027168274, + "learning_rate": 8.01371669744131e-06, + "loss": 0.0153, + "step": 1526 + }, + { + "epoch": 0.40311304577232554, + "grad_norm": 0.8814405798912048, + "learning_rate": 8.024268003165392e-06, + "loss": 0.0186, + "step": 1528 + }, + { + "epoch": 0.4036406806489909, + "grad_norm": 0.16500422358512878, + "learning_rate": 8.034819308889475e-06, + "loss": 0.0246, + "step": 1530 + }, + { + "epoch": 0.40416831552565624, + "grad_norm": 0.27641645073890686, + "learning_rate": 8.04537061461356e-06, + "loss": 0.0137, + "step": 1532 + }, + { + "epoch": 0.4046959504023216, + "grad_norm": 0.9207457304000854, + "learning_rate": 8.055921920337643e-06, + "loss": 0.02, + "step": 1534 + }, + { + "epoch": 0.40522358527898694, + "grad_norm": 0.10146048665046692, + "learning_rate": 8.066473226061726e-06, + "loss": 0.0225, + "step": 1536 + }, + { + "epoch": 0.4057512201556523, + "grad_norm": 0.10030649602413177, + "learning_rate": 8.077024531785809e-06, + "loss": 0.0241, + "step": 1538 + }, + { + "epoch": 0.40627885503231764, + "grad_norm": 2.6188690662384033, + "learning_rate": 8.087575837509892e-06, + "loss": 0.0179, + "step": 1540 + }, + { + "epoch": 0.406806489908983, + "grad_norm": 0.3091949224472046, + "learning_rate": 8.098127143233976e-06, + "loss": 0.024, + "step": 1542 + }, + { + "epoch": 0.40733412478564834, + "grad_norm": 0.5613223314285278, + "learning_rate": 8.10867844895806e-06, + "loss": 0.0125, + "step": 1544 + }, + { + "epoch": 0.4078617596623137, + "grad_norm": 0.456822007894516, + "learning_rate": 8.119229754682144e-06, + "loss": 0.0213, + "step": 1546 + }, + { + "epoch": 0.40838939453897904, + "grad_norm": 0.4767163395881653, + "learning_rate": 8.129781060406225e-06, + "loss": 0.0138, + "step": 1548 + }, + { + "epoch": 0.4089170294156444, + "grad_norm": 0.3066970109939575, + "learning_rate": 8.14033236613031e-06, + "loss": 0.0232, + "step": 1550 + }, + { + "epoch": 0.40944466429230975, + "grad_norm": 0.20647092163562775, + "learning_rate": 8.150883671854393e-06, + "loss": 0.0098, + "step": 1552 + }, + { + "epoch": 0.4099722991689751, + "grad_norm": 1.0253335237503052, + "learning_rate": 8.161434977578476e-06, + "loss": 0.0125, + "step": 1554 + }, + { + "epoch": 0.4104999340456404, + "grad_norm": 0.5392470359802246, + "learning_rate": 8.17198628330256e-06, + "loss": 0.0181, + "step": 1556 + }, + { + "epoch": 0.41102756892230574, + "grad_norm": 0.452311247587204, + "learning_rate": 8.182537589026642e-06, + "loss": 0.0136, + "step": 1558 + }, + { + "epoch": 0.4115552037989711, + "grad_norm": 1.0139259099960327, + "learning_rate": 8.193088894750726e-06, + "loss": 0.0324, + "step": 1560 + }, + { + "epoch": 0.41208283867563644, + "grad_norm": 0.09062158316373825, + "learning_rate": 8.20364020047481e-06, + "loss": 0.0145, + "step": 1562 + }, + { + "epoch": 0.4126104735523018, + "grad_norm": 0.09261652827262878, + "learning_rate": 8.214191506198894e-06, + "loss": 0.01, + "step": 1564 + }, + { + "epoch": 0.41313810842896714, + "grad_norm": 0.31005948781967163, + "learning_rate": 8.224742811922977e-06, + "loss": 0.0175, + "step": 1566 + }, + { + "epoch": 0.4136657433056325, + "grad_norm": 1.4196642637252808, + "learning_rate": 8.23529411764706e-06, + "loss": 0.0222, + "step": 1568 + }, + { + "epoch": 0.41419337818229784, + "grad_norm": 0.6666792035102844, + "learning_rate": 8.245845423371143e-06, + "loss": 0.0149, + "step": 1570 + }, + { + "epoch": 0.4147210130589632, + "grad_norm": 0.5725591778755188, + "learning_rate": 8.256396729095226e-06, + "loss": 0.0111, + "step": 1572 + }, + { + "epoch": 0.41524864793562855, + "grad_norm": 0.45843154191970825, + "learning_rate": 8.26694803481931e-06, + "loss": 0.0224, + "step": 1574 + }, + { + "epoch": 0.4157762828122939, + "grad_norm": 0.12068108469247818, + "learning_rate": 8.277499340543393e-06, + "loss": 0.0098, + "step": 1576 + }, + { + "epoch": 0.41630391768895925, + "grad_norm": 0.06340993195772171, + "learning_rate": 8.288050646267476e-06, + "loss": 0.0215, + "step": 1578 + }, + { + "epoch": 0.4168315525656246, + "grad_norm": 0.09006465971469879, + "learning_rate": 8.29860195199156e-06, + "loss": 0.0152, + "step": 1580 + }, + { + "epoch": 0.41735918744228995, + "grad_norm": 0.14839084446430206, + "learning_rate": 8.309153257715644e-06, + "loss": 0.01, + "step": 1582 + }, + { + "epoch": 0.4178868223189553, + "grad_norm": 0.269968181848526, + "learning_rate": 8.319704563439727e-06, + "loss": 0.0155, + "step": 1584 + }, + { + "epoch": 0.41841445719562065, + "grad_norm": 0.11887773126363754, + "learning_rate": 8.33025586916381e-06, + "loss": 0.018, + "step": 1586 + }, + { + "epoch": 0.418942092072286, + "grad_norm": 0.4868936240673065, + "learning_rate": 8.340807174887893e-06, + "loss": 0.0203, + "step": 1588 + }, + { + "epoch": 0.41946972694895135, + "grad_norm": 0.4860062897205353, + "learning_rate": 8.351358480611976e-06, + "loss": 0.0159, + "step": 1590 + }, + { + "epoch": 0.4199973618256167, + "grad_norm": 0.5116564035415649, + "learning_rate": 8.36190978633606e-06, + "loss": 0.0123, + "step": 1592 + }, + { + "epoch": 0.420524996702282, + "grad_norm": 0.272124707698822, + "learning_rate": 8.372461092060143e-06, + "loss": 0.0161, + "step": 1594 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 1.7219926118850708, + "learning_rate": 8.383012397784226e-06, + "loss": 0.0207, + "step": 1596 + }, + { + "epoch": 0.4215802664556127, + "grad_norm": 0.4026455581188202, + "learning_rate": 8.39356370350831e-06, + "loss": 0.0104, + "step": 1598 + }, + { + "epoch": 0.42210790133227805, + "grad_norm": 0.4155260920524597, + "learning_rate": 8.404115009232394e-06, + "loss": 0.011, + "step": 1600 + }, + { + "epoch": 0.4226355362089434, + "grad_norm": 0.10538157820701599, + "learning_rate": 8.414666314956477e-06, + "loss": 0.0173, + "step": 1602 + }, + { + "epoch": 0.42316317108560875, + "grad_norm": 1.127760887145996, + "learning_rate": 8.42521762068056e-06, + "loss": 0.0247, + "step": 1604 + }, + { + "epoch": 0.4236908059622741, + "grad_norm": 0.10938789695501328, + "learning_rate": 8.435768926404643e-06, + "loss": 0.0084, + "step": 1606 + }, + { + "epoch": 0.42421844083893945, + "grad_norm": 0.3502422273159027, + "learning_rate": 8.446320232128726e-06, + "loss": 0.0154, + "step": 1608 + }, + { + "epoch": 0.4247460757156048, + "grad_norm": 0.17728780210018158, + "learning_rate": 8.45687153785281e-06, + "loss": 0.0146, + "step": 1610 + }, + { + "epoch": 0.42527371059227015, + "grad_norm": 0.8387036919593811, + "learning_rate": 8.467422843576893e-06, + "loss": 0.0145, + "step": 1612 + }, + { + "epoch": 0.4258013454689355, + "grad_norm": 0.2885839343070984, + "learning_rate": 8.477974149300976e-06, + "loss": 0.0117, + "step": 1614 + }, + { + "epoch": 0.42632898034560085, + "grad_norm": 0.11423775553703308, + "learning_rate": 8.48852545502506e-06, + "loss": 0.0126, + "step": 1616 + }, + { + "epoch": 0.4268566152222662, + "grad_norm": 1.8938722610473633, + "learning_rate": 8.499076760749144e-06, + "loss": 0.0161, + "step": 1618 + }, + { + "epoch": 0.42738425009893155, + "grad_norm": 0.546311616897583, + "learning_rate": 8.509628066473227e-06, + "loss": 0.0161, + "step": 1620 + }, + { + "epoch": 0.4279118849755969, + "grad_norm": 0.634289026260376, + "learning_rate": 8.52017937219731e-06, + "loss": 0.0159, + "step": 1622 + }, + { + "epoch": 0.42843951985226225, + "grad_norm": 0.09591084718704224, + "learning_rate": 8.530730677921395e-06, + "loss": 0.0141, + "step": 1624 + }, + { + "epoch": 0.4289671547289276, + "grad_norm": 0.1123599112033844, + "learning_rate": 8.541281983645476e-06, + "loss": 0.0136, + "step": 1626 + }, + { + "epoch": 0.42949478960559295, + "grad_norm": 0.7390477657318115, + "learning_rate": 8.55183328936956e-06, + "loss": 0.0165, + "step": 1628 + }, + { + "epoch": 0.4300224244822583, + "grad_norm": 0.10843092948198318, + "learning_rate": 8.562384595093643e-06, + "loss": 0.0205, + "step": 1630 + }, + { + "epoch": 0.4305500593589236, + "grad_norm": 0.4334639608860016, + "learning_rate": 8.572935900817726e-06, + "loss": 0.0171, + "step": 1632 + }, + { + "epoch": 0.43107769423558895, + "grad_norm": 0.815296471118927, + "learning_rate": 8.583487206541811e-06, + "loss": 0.0222, + "step": 1634 + }, + { + "epoch": 0.4316053291122543, + "grad_norm": 0.35164573788642883, + "learning_rate": 8.594038512265894e-06, + "loss": 0.0104, + "step": 1636 + }, + { + "epoch": 0.43213296398891965, + "grad_norm": 0.13947808742523193, + "learning_rate": 8.604589817989977e-06, + "loss": 0.0079, + "step": 1638 + }, + { + "epoch": 0.432660598865585, + "grad_norm": 0.7040034532546997, + "learning_rate": 8.61514112371406e-06, + "loss": 0.0138, + "step": 1640 + }, + { + "epoch": 0.43318823374225035, + "grad_norm": 0.3585178554058075, + "learning_rate": 8.625692429438145e-06, + "loss": 0.0147, + "step": 1642 + }, + { + "epoch": 0.4337158686189157, + "grad_norm": 0.6445247530937195, + "learning_rate": 8.636243735162227e-06, + "loss": 0.0324, + "step": 1644 + }, + { + "epoch": 0.43424350349558105, + "grad_norm": 0.5184330344200134, + "learning_rate": 8.64679504088631e-06, + "loss": 0.0333, + "step": 1646 + }, + { + "epoch": 0.4347711383722464, + "grad_norm": 0.5199213624000549, + "learning_rate": 8.657346346610393e-06, + "loss": 0.0134, + "step": 1648 + }, + { + "epoch": 0.43529877324891175, + "grad_norm": 0.6605756878852844, + "learning_rate": 8.667897652334476e-06, + "loss": 0.0148, + "step": 1650 + }, + { + "epoch": 0.4358264081255771, + "grad_norm": 1.5662145614624023, + "learning_rate": 8.678448958058561e-06, + "loss": 0.0362, + "step": 1652 + }, + { + "epoch": 0.43635404300224245, + "grad_norm": 0.2376788705587387, + "learning_rate": 8.689000263782644e-06, + "loss": 0.0189, + "step": 1654 + }, + { + "epoch": 0.4368816778789078, + "grad_norm": 0.1157929077744484, + "learning_rate": 8.699551569506727e-06, + "loss": 0.0101, + "step": 1656 + }, + { + "epoch": 0.43740931275557315, + "grad_norm": 0.18137826025485992, + "learning_rate": 8.71010287523081e-06, + "loss": 0.0088, + "step": 1658 + }, + { + "epoch": 0.4379369476322385, + "grad_norm": 0.1170000210404396, + "learning_rate": 8.720654180954895e-06, + "loss": 0.0118, + "step": 1660 + }, + { + "epoch": 0.43846458250890386, + "grad_norm": 0.7544876933097839, + "learning_rate": 8.731205486678978e-06, + "loss": 0.0215, + "step": 1662 + }, + { + "epoch": 0.4389922173855692, + "grad_norm": 0.11736688762903214, + "learning_rate": 8.74175679240306e-06, + "loss": 0.0114, + "step": 1664 + }, + { + "epoch": 0.43951985226223456, + "grad_norm": 0.27250173687934875, + "learning_rate": 8.752308098127143e-06, + "loss": 0.0114, + "step": 1666 + }, + { + "epoch": 0.4400474871388999, + "grad_norm": 0.38562077283859253, + "learning_rate": 8.762859403851226e-06, + "loss": 0.012, + "step": 1668 + }, + { + "epoch": 0.4405751220155652, + "grad_norm": 0.1605842113494873, + "learning_rate": 8.773410709575311e-06, + "loss": 0.0133, + "step": 1670 + }, + { + "epoch": 0.44110275689223055, + "grad_norm": 0.2975110709667206, + "learning_rate": 8.783962015299394e-06, + "loss": 0.016, + "step": 1672 + }, + { + "epoch": 0.4416303917688959, + "grad_norm": 0.42133745551109314, + "learning_rate": 8.794513321023479e-06, + "loss": 0.0189, + "step": 1674 + }, + { + "epoch": 0.44215802664556125, + "grad_norm": 0.19585347175598145, + "learning_rate": 8.80506462674756e-06, + "loss": 0.0097, + "step": 1676 + }, + { + "epoch": 0.4426856615222266, + "grad_norm": 0.1066381186246872, + "learning_rate": 8.815615932471645e-06, + "loss": 0.0083, + "step": 1678 + }, + { + "epoch": 0.44321329639889195, + "grad_norm": 1.0969204902648926, + "learning_rate": 8.826167238195728e-06, + "loss": 0.0153, + "step": 1680 + }, + { + "epoch": 0.4437409312755573, + "grad_norm": 2.2851181030273438, + "learning_rate": 8.83671854391981e-06, + "loss": 0.0166, + "step": 1682 + }, + { + "epoch": 0.44426856615222265, + "grad_norm": 1.6325347423553467, + "learning_rate": 8.847269849643895e-06, + "loss": 0.0248, + "step": 1684 + }, + { + "epoch": 0.444796201028888, + "grad_norm": 0.42571890354156494, + "learning_rate": 8.857821155367976e-06, + "loss": 0.0183, + "step": 1686 + }, + { + "epoch": 0.44532383590555336, + "grad_norm": 1.4514309167861938, + "learning_rate": 8.868372461092061e-06, + "loss": 0.012, + "step": 1688 + }, + { + "epoch": 0.4458514707822187, + "grad_norm": 0.32670357823371887, + "learning_rate": 8.878923766816144e-06, + "loss": 0.0141, + "step": 1690 + }, + { + "epoch": 0.44637910565888406, + "grad_norm": 0.270956426858902, + "learning_rate": 8.889475072540229e-06, + "loss": 0.0104, + "step": 1692 + }, + { + "epoch": 0.4469067405355494, + "grad_norm": 0.17302672564983368, + "learning_rate": 8.900026378264312e-06, + "loss": 0.0079, + "step": 1694 + }, + { + "epoch": 0.44743437541221476, + "grad_norm": 0.4341029226779938, + "learning_rate": 8.910577683988395e-06, + "loss": 0.0277, + "step": 1696 + }, + { + "epoch": 0.4479620102888801, + "grad_norm": 0.3907453417778015, + "learning_rate": 8.921128989712478e-06, + "loss": 0.0162, + "step": 1698 + }, + { + "epoch": 0.44848964516554546, + "grad_norm": 1.2514222860336304, + "learning_rate": 8.93168029543656e-06, + "loss": 0.0223, + "step": 1700 + }, + { + "epoch": 0.4490172800422108, + "grad_norm": 0.3456608057022095, + "learning_rate": 8.942231601160645e-06, + "loss": 0.0149, + "step": 1702 + }, + { + "epoch": 0.44954491491887616, + "grad_norm": 0.6547033190727234, + "learning_rate": 8.952782906884726e-06, + "loss": 0.0125, + "step": 1704 + }, + { + "epoch": 0.4500725497955415, + "grad_norm": 0.7152488231658936, + "learning_rate": 8.963334212608811e-06, + "loss": 0.0179, + "step": 1706 + }, + { + "epoch": 0.45060018467220686, + "grad_norm": 0.2152302861213684, + "learning_rate": 8.973885518332894e-06, + "loss": 0.0198, + "step": 1708 + }, + { + "epoch": 0.45112781954887216, + "grad_norm": 0.6676157116889954, + "learning_rate": 8.984436824056979e-06, + "loss": 0.0136, + "step": 1710 + }, + { + "epoch": 0.4516554544255375, + "grad_norm": 0.21016037464141846, + "learning_rate": 8.994988129781062e-06, + "loss": 0.0182, + "step": 1712 + }, + { + "epoch": 0.45218308930220286, + "grad_norm": 0.4789685308933258, + "learning_rate": 9.005539435505145e-06, + "loss": 0.013, + "step": 1714 + }, + { + "epoch": 0.4527107241788682, + "grad_norm": 0.2094721645116806, + "learning_rate": 9.016090741229228e-06, + "loss": 0.0093, + "step": 1716 + }, + { + "epoch": 0.45323835905553356, + "grad_norm": 0.39645829796791077, + "learning_rate": 9.02664204695331e-06, + "loss": 0.0169, + "step": 1718 + }, + { + "epoch": 0.4537659939321989, + "grad_norm": 0.1471422016620636, + "learning_rate": 9.037193352677395e-06, + "loss": 0.0212, + "step": 1720 + }, + { + "epoch": 0.45429362880886426, + "grad_norm": 1.0059654712677002, + "learning_rate": 9.047744658401478e-06, + "loss": 0.0135, + "step": 1722 + }, + { + "epoch": 0.4548212636855296, + "grad_norm": 0.6521971821784973, + "learning_rate": 9.058295964125561e-06, + "loss": 0.0123, + "step": 1724 + }, + { + "epoch": 0.45534889856219496, + "grad_norm": 0.8546748757362366, + "learning_rate": 9.068847269849644e-06, + "loss": 0.0108, + "step": 1726 + }, + { + "epoch": 0.4558765334388603, + "grad_norm": 0.6999250650405884, + "learning_rate": 9.079398575573729e-06, + "loss": 0.0141, + "step": 1728 + }, + { + "epoch": 0.45640416831552566, + "grad_norm": 0.4508836269378662, + "learning_rate": 9.089949881297812e-06, + "loss": 0.0215, + "step": 1730 + }, + { + "epoch": 0.456931803192191, + "grad_norm": 0.21598108112812042, + "learning_rate": 9.100501187021895e-06, + "loss": 0.0086, + "step": 1732 + }, + { + "epoch": 0.45745943806885636, + "grad_norm": 0.1276172399520874, + "learning_rate": 9.111052492745978e-06, + "loss": 0.0093, + "step": 1734 + }, + { + "epoch": 0.4579870729455217, + "grad_norm": 0.7618444561958313, + "learning_rate": 9.12160379847006e-06, + "loss": 0.0181, + "step": 1736 + }, + { + "epoch": 0.45851470782218706, + "grad_norm": 0.19917139410972595, + "learning_rate": 9.132155104194145e-06, + "loss": 0.0103, + "step": 1738 + }, + { + "epoch": 0.4590423426988524, + "grad_norm": 0.3440724015235901, + "learning_rate": 9.142706409918228e-06, + "loss": 0.0225, + "step": 1740 + }, + { + "epoch": 0.45956997757551776, + "grad_norm": 0.22431769967079163, + "learning_rate": 9.153257715642311e-06, + "loss": 0.0091, + "step": 1742 + }, + { + "epoch": 0.4600976124521831, + "grad_norm": 0.4036844074726105, + "learning_rate": 9.163809021366394e-06, + "loss": 0.0201, + "step": 1744 + }, + { + "epoch": 0.46062524732884846, + "grad_norm": 0.39781710505485535, + "learning_rate": 9.174360327090479e-06, + "loss": 0.0107, + "step": 1746 + }, + { + "epoch": 0.46115288220551376, + "grad_norm": 2.4612462520599365, + "learning_rate": 9.184911632814562e-06, + "loss": 0.0245, + "step": 1748 + }, + { + "epoch": 0.4616805170821791, + "grad_norm": 0.13814905285835266, + "learning_rate": 9.195462938538645e-06, + "loss": 0.0071, + "step": 1750 + }, + { + "epoch": 0.46220815195884446, + "grad_norm": 0.6992248892784119, + "learning_rate": 9.20601424426273e-06, + "loss": 0.0108, + "step": 1752 + }, + { + "epoch": 0.4627357868355098, + "grad_norm": 0.09460562467575073, + "learning_rate": 9.21656554998681e-06, + "loss": 0.0075, + "step": 1754 + }, + { + "epoch": 0.46326342171217516, + "grad_norm": 0.1926947385072708, + "learning_rate": 9.227116855710895e-06, + "loss": 0.0083, + "step": 1756 + }, + { + "epoch": 0.4637910565888405, + "grad_norm": 0.10408731549978256, + "learning_rate": 9.237668161434978e-06, + "loss": 0.0162, + "step": 1758 + }, + { + "epoch": 0.46431869146550586, + "grad_norm": 0.8383923172950745, + "learning_rate": 9.248219467159061e-06, + "loss": 0.0161, + "step": 1760 + }, + { + "epoch": 0.4648463263421712, + "grad_norm": 2.856740951538086, + "learning_rate": 9.258770772883146e-06, + "loss": 0.0222, + "step": 1762 + }, + { + "epoch": 0.46537396121883656, + "grad_norm": 0.05746760591864586, + "learning_rate": 9.269322078607229e-06, + "loss": 0.0195, + "step": 1764 + }, + { + "epoch": 0.4659015960955019, + "grad_norm": 0.4145953357219696, + "learning_rate": 9.279873384331312e-06, + "loss": 0.0116, + "step": 1766 + }, + { + "epoch": 0.46642923097216726, + "grad_norm": 0.7865034341812134, + "learning_rate": 9.290424690055395e-06, + "loss": 0.0147, + "step": 1768 + }, + { + "epoch": 0.4669568658488326, + "grad_norm": 0.1101914495229721, + "learning_rate": 9.30097599577948e-06, + "loss": 0.0086, + "step": 1770 + }, + { + "epoch": 0.46748450072549796, + "grad_norm": 1.0748062133789062, + "learning_rate": 9.311527301503562e-06, + "loss": 0.0126, + "step": 1772 + }, + { + "epoch": 0.4680121356021633, + "grad_norm": 0.251146525144577, + "learning_rate": 9.322078607227645e-06, + "loss": 0.0118, + "step": 1774 + }, + { + "epoch": 0.46853977047882867, + "grad_norm": 0.2896649241447449, + "learning_rate": 9.332629912951728e-06, + "loss": 0.0117, + "step": 1776 + }, + { + "epoch": 0.469067405355494, + "grad_norm": 0.5549662709236145, + "learning_rate": 9.343181218675811e-06, + "loss": 0.0186, + "step": 1778 + }, + { + "epoch": 0.46959504023215937, + "grad_norm": 1.7876754999160767, + "learning_rate": 9.353732524399896e-06, + "loss": 0.018, + "step": 1780 + }, + { + "epoch": 0.4701226751088247, + "grad_norm": 0.5253521800041199, + "learning_rate": 9.364283830123979e-06, + "loss": 0.0195, + "step": 1782 + }, + { + "epoch": 0.47065030998549007, + "grad_norm": 0.4923895001411438, + "learning_rate": 9.374835135848062e-06, + "loss": 0.0296, + "step": 1784 + }, + { + "epoch": 0.47117794486215536, + "grad_norm": 0.7127161622047424, + "learning_rate": 9.385386441572145e-06, + "loss": 0.0124, + "step": 1786 + }, + { + "epoch": 0.4717055797388207, + "grad_norm": 0.4733903408050537, + "learning_rate": 9.39593774729623e-06, + "loss": 0.0103, + "step": 1788 + }, + { + "epoch": 0.47223321461548606, + "grad_norm": 0.24451479315757751, + "learning_rate": 9.406489053020312e-06, + "loss": 0.0127, + "step": 1790 + }, + { + "epoch": 0.4727608494921514, + "grad_norm": 0.09593214094638824, + "learning_rate": 9.417040358744395e-06, + "loss": 0.0127, + "step": 1792 + }, + { + "epoch": 0.47328848436881676, + "grad_norm": 0.3374280631542206, + "learning_rate": 9.427591664468478e-06, + "loss": 0.0153, + "step": 1794 + }, + { + "epoch": 0.4738161192454821, + "grad_norm": 0.31910622119903564, + "learning_rate": 9.438142970192561e-06, + "loss": 0.0154, + "step": 1796 + }, + { + "epoch": 0.47434375412214747, + "grad_norm": 0.167129784822464, + "learning_rate": 9.448694275916646e-06, + "loss": 0.0077, + "step": 1798 + }, + { + "epoch": 0.4748713889988128, + "grad_norm": 0.3367283344268799, + "learning_rate": 9.459245581640729e-06, + "loss": 0.0136, + "step": 1800 + }, + { + "epoch": 0.47539902387547817, + "grad_norm": 0.15962521731853485, + "learning_rate": 9.469796887364812e-06, + "loss": 0.0114, + "step": 1802 + }, + { + "epoch": 0.4759266587521435, + "grad_norm": 0.19470904767513275, + "learning_rate": 9.480348193088895e-06, + "loss": 0.0088, + "step": 1804 + }, + { + "epoch": 0.47645429362880887, + "grad_norm": 0.33173617720603943, + "learning_rate": 9.49089949881298e-06, + "loss": 0.0147, + "step": 1806 + }, + { + "epoch": 0.4769819285054742, + "grad_norm": 0.7537326216697693, + "learning_rate": 9.501450804537062e-06, + "loss": 0.0168, + "step": 1808 + }, + { + "epoch": 0.47750956338213957, + "grad_norm": 0.20148985087871552, + "learning_rate": 9.512002110261145e-06, + "loss": 0.012, + "step": 1810 + }, + { + "epoch": 0.4780371982588049, + "grad_norm": 0.8123595714569092, + "learning_rate": 9.522553415985228e-06, + "loss": 0.0106, + "step": 1812 + }, + { + "epoch": 0.47856483313547027, + "grad_norm": 0.9041925668716431, + "learning_rate": 9.533104721709311e-06, + "loss": 0.0188, + "step": 1814 + }, + { + "epoch": 0.4790924680121356, + "grad_norm": 1.168379306793213, + "learning_rate": 9.543656027433396e-06, + "loss": 0.0163, + "step": 1816 + }, + { + "epoch": 0.47962010288880097, + "grad_norm": 0.2034379243850708, + "learning_rate": 9.554207333157479e-06, + "loss": 0.023, + "step": 1818 + }, + { + "epoch": 0.4801477377654663, + "grad_norm": 0.31094080209732056, + "learning_rate": 9.564758638881563e-06, + "loss": 0.0079, + "step": 1820 + }, + { + "epoch": 0.48067537264213167, + "grad_norm": 0.292711079120636, + "learning_rate": 9.575309944605645e-06, + "loss": 0.0113, + "step": 1822 + }, + { + "epoch": 0.48120300751879697, + "grad_norm": 0.19901004433631897, + "learning_rate": 9.58586125032973e-06, + "loss": 0.0083, + "step": 1824 + }, + { + "epoch": 0.4817306423954623, + "grad_norm": 0.09616748988628387, + "learning_rate": 9.596412556053812e-06, + "loss": 0.0089, + "step": 1826 + }, + { + "epoch": 0.48225827727212767, + "grad_norm": 0.4310615360736847, + "learning_rate": 9.606963861777895e-06, + "loss": 0.0109, + "step": 1828 + }, + { + "epoch": 0.482785912148793, + "grad_norm": 0.15931418538093567, + "learning_rate": 9.61751516750198e-06, + "loss": 0.0146, + "step": 1830 + }, + { + "epoch": 0.48331354702545837, + "grad_norm": 0.11251848191022873, + "learning_rate": 9.628066473226061e-06, + "loss": 0.015, + "step": 1832 + }, + { + "epoch": 0.4838411819021237, + "grad_norm": 0.06394564360380173, + "learning_rate": 9.638617778950146e-06, + "loss": 0.0091, + "step": 1834 + }, + { + "epoch": 0.48436881677878907, + "grad_norm": 0.32680872082710266, + "learning_rate": 9.649169084674229e-06, + "loss": 0.0171, + "step": 1836 + }, + { + "epoch": 0.4848964516554544, + "grad_norm": 1.057300090789795, + "learning_rate": 9.659720390398314e-06, + "loss": 0.0201, + "step": 1838 + }, + { + "epoch": 0.48542408653211977, + "grad_norm": 0.5051817297935486, + "learning_rate": 9.670271696122396e-06, + "loss": 0.0164, + "step": 1840 + }, + { + "epoch": 0.4859517214087851, + "grad_norm": 0.8090242743492126, + "learning_rate": 9.68082300184648e-06, + "loss": 0.0105, + "step": 1842 + }, + { + "epoch": 0.48647935628545047, + "grad_norm": 0.07837384939193726, + "learning_rate": 9.691374307570562e-06, + "loss": 0.0107, + "step": 1844 + }, + { + "epoch": 0.4870069911621158, + "grad_norm": 0.4974774718284607, + "learning_rate": 9.701925613294645e-06, + "loss": 0.0081, + "step": 1846 + }, + { + "epoch": 0.48753462603878117, + "grad_norm": 0.3274565637111664, + "learning_rate": 9.71247691901873e-06, + "loss": 0.0189, + "step": 1848 + }, + { + "epoch": 0.4880622609154465, + "grad_norm": 0.49347957968711853, + "learning_rate": 9.723028224742813e-06, + "loss": 0.0069, + "step": 1850 + }, + { + "epoch": 0.4885898957921119, + "grad_norm": 0.46580466628074646, + "learning_rate": 9.733579530466896e-06, + "loss": 0.0135, + "step": 1852 + }, + { + "epoch": 0.4891175306687772, + "grad_norm": 0.2859683930873871, + "learning_rate": 9.744130836190979e-06, + "loss": 0.0094, + "step": 1854 + }, + { + "epoch": 0.4896451655454426, + "grad_norm": 0.24449887871742249, + "learning_rate": 9.754682141915064e-06, + "loss": 0.006, + "step": 1856 + }, + { + "epoch": 0.4901728004221079, + "grad_norm": 0.17851987481117249, + "learning_rate": 9.765233447639147e-06, + "loss": 0.0063, + "step": 1858 + }, + { + "epoch": 0.4907004352987733, + "grad_norm": 0.4718785285949707, + "learning_rate": 9.77578475336323e-06, + "loss": 0.0082, + "step": 1860 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 0.32451003789901733, + "learning_rate": 9.786336059087312e-06, + "loss": 0.0087, + "step": 1862 + }, + { + "epoch": 0.4917557050521039, + "grad_norm": 0.08860956132411957, + "learning_rate": 9.796887364811395e-06, + "loss": 0.0108, + "step": 1864 + }, + { + "epoch": 0.49228333992876927, + "grad_norm": 0.24264855682849884, + "learning_rate": 9.80743867053548e-06, + "loss": 0.0063, + "step": 1866 + }, + { + "epoch": 0.4928109748054346, + "grad_norm": 0.17336559295654297, + "learning_rate": 9.817989976259563e-06, + "loss": 0.0059, + "step": 1868 + }, + { + "epoch": 0.49333860968209997, + "grad_norm": 0.09189339727163315, + "learning_rate": 9.828541281983646e-06, + "loss": 0.0202, + "step": 1870 + }, + { + "epoch": 0.4938662445587653, + "grad_norm": 0.0643758475780487, + "learning_rate": 9.839092587707729e-06, + "loss": 0.0065, + "step": 1872 + }, + { + "epoch": 0.49439387943543067, + "grad_norm": 0.48895031213760376, + "learning_rate": 9.849643893431814e-06, + "loss": 0.0119, + "step": 1874 + }, + { + "epoch": 0.494921514312096, + "grad_norm": 0.40274956822395325, + "learning_rate": 9.860195199155897e-06, + "loss": 0.0088, + "step": 1876 + }, + { + "epoch": 0.4954491491887614, + "grad_norm": 0.0890677347779274, + "learning_rate": 9.87074650487998e-06, + "loss": 0.0258, + "step": 1878 + }, + { + "epoch": 0.4959767840654267, + "grad_norm": 0.045009102672338486, + "learning_rate": 9.881297810604064e-06, + "loss": 0.015, + "step": 1880 + }, + { + "epoch": 0.4965044189420921, + "grad_norm": 0.567461371421814, + "learning_rate": 9.891849116328145e-06, + "loss": 0.0142, + "step": 1882 + }, + { + "epoch": 0.4970320538187574, + "grad_norm": 0.29680585861206055, + "learning_rate": 9.90240042205223e-06, + "loss": 0.0132, + "step": 1884 + }, + { + "epoch": 0.4975596886954228, + "grad_norm": 0.06072726845741272, + "learning_rate": 9.912951727776313e-06, + "loss": 0.0155, + "step": 1886 + }, + { + "epoch": 0.4980873235720881, + "grad_norm": 1.5976675748825073, + "learning_rate": 9.923503033500396e-06, + "loss": 0.0198, + "step": 1888 + }, + { + "epoch": 0.4986149584487535, + "grad_norm": 0.3252725303173065, + "learning_rate": 9.93405433922448e-06, + "loss": 0.0104, + "step": 1890 + }, + { + "epoch": 0.4991425933254188, + "grad_norm": 0.6365008354187012, + "learning_rate": 9.944605644948564e-06, + "loss": 0.007, + "step": 1892 + }, + { + "epoch": 0.4996702282020842, + "grad_norm": 0.965272843837738, + "learning_rate": 9.955156950672647e-06, + "loss": 0.0143, + "step": 1894 + }, + { + "epoch": 0.5001978630787495, + "grad_norm": 0.5971518158912659, + "learning_rate": 9.96570825639673e-06, + "loss": 0.0166, + "step": 1896 + }, + { + "epoch": 0.5007254979554149, + "grad_norm": 3.6085622310638428, + "learning_rate": 9.976259562120814e-06, + "loss": 0.0164, + "step": 1898 + }, + { + "epoch": 0.5012531328320802, + "grad_norm": 0.6857232451438904, + "learning_rate": 9.986810867844897e-06, + "loss": 0.0113, + "step": 1900 + }, + { + "epoch": 0.5017807677087456, + "grad_norm": 0.9014392495155334, + "learning_rate": 9.99736217356898e-06, + "loss": 0.0166, + "step": 1902 + }, + { + "epoch": 0.5023084025854109, + "grad_norm": 0.3138945996761322, + "learning_rate": 1.0007913479293063e-05, + "loss": 0.0122, + "step": 1904 + }, + { + "epoch": 0.5028360374620763, + "grad_norm": 0.15424932539463043, + "learning_rate": 1.0018464785017146e-05, + "loss": 0.0151, + "step": 1906 + }, + { + "epoch": 0.5033636723387416, + "grad_norm": 0.1918104737997055, + "learning_rate": 1.002901609074123e-05, + "loss": 0.015, + "step": 1908 + }, + { + "epoch": 0.503891307215407, + "grad_norm": 0.8571024537086487, + "learning_rate": 1.0039567396465314e-05, + "loss": 0.0164, + "step": 1910 + }, + { + "epoch": 0.5044189420920723, + "grad_norm": 0.14762331545352936, + "learning_rate": 1.0050118702189398e-05, + "loss": 0.0119, + "step": 1912 + }, + { + "epoch": 0.5049465769687377, + "grad_norm": 0.1840706765651703, + "learning_rate": 1.006067000791348e-05, + "loss": 0.0095, + "step": 1914 + }, + { + "epoch": 0.505474211845403, + "grad_norm": 0.6544621586799622, + "learning_rate": 1.0071221313637562e-05, + "loss": 0.0297, + "step": 1916 + }, + { + "epoch": 0.5060018467220684, + "grad_norm": 1.0439072847366333, + "learning_rate": 1.0081772619361647e-05, + "loss": 0.0198, + "step": 1918 + }, + { + "epoch": 0.5065294815987337, + "grad_norm": 0.32813602685928345, + "learning_rate": 1.009232392508573e-05, + "loss": 0.0186, + "step": 1920 + }, + { + "epoch": 0.507057116475399, + "grad_norm": 0.3296583294868469, + "learning_rate": 1.0102875230809815e-05, + "loss": 0.0067, + "step": 1922 + }, + { + "epoch": 0.5075847513520644, + "grad_norm": 2.555208206176758, + "learning_rate": 1.0113426536533896e-05, + "loss": 0.0222, + "step": 1924 + }, + { + "epoch": 0.5081123862287297, + "grad_norm": 0.4918021261692047, + "learning_rate": 1.0123977842257979e-05, + "loss": 0.0072, + "step": 1926 + }, + { + "epoch": 0.5086400211053951, + "grad_norm": 0.24621514976024628, + "learning_rate": 1.0134529147982064e-05, + "loss": 0.015, + "step": 1928 + }, + { + "epoch": 0.5091676559820604, + "grad_norm": 0.27875441312789917, + "learning_rate": 1.0145080453706147e-05, + "loss": 0.0107, + "step": 1930 + }, + { + "epoch": 0.5096952908587258, + "grad_norm": 0.25439202785491943, + "learning_rate": 1.0155631759430231e-05, + "loss": 0.0143, + "step": 1932 + }, + { + "epoch": 0.5102229257353911, + "grad_norm": 0.10203098505735397, + "learning_rate": 1.0166183065154314e-05, + "loss": 0.0179, + "step": 1934 + }, + { + "epoch": 0.5107505606120565, + "grad_norm": 0.3436344861984253, + "learning_rate": 1.0176734370878395e-05, + "loss": 0.0155, + "step": 1936 + }, + { + "epoch": 0.5112781954887218, + "grad_norm": 1.0725481510162354, + "learning_rate": 1.018728567660248e-05, + "loss": 0.0174, + "step": 1938 + }, + { + "epoch": 0.5118058303653872, + "grad_norm": 0.3909111022949219, + "learning_rate": 1.0197836982326563e-05, + "loss": 0.0107, + "step": 1940 + }, + { + "epoch": 0.5123334652420525, + "grad_norm": 0.12147495150566101, + "learning_rate": 1.0208388288050648e-05, + "loss": 0.0058, + "step": 1942 + }, + { + "epoch": 0.5128611001187179, + "grad_norm": 0.8206871747970581, + "learning_rate": 1.021893959377473e-05, + "loss": 0.0109, + "step": 1944 + }, + { + "epoch": 0.5133887349953832, + "grad_norm": 0.07204122841358185, + "learning_rate": 1.0229490899498814e-05, + "loss": 0.0052, + "step": 1946 + }, + { + "epoch": 0.5139163698720486, + "grad_norm": 0.07046371698379517, + "learning_rate": 1.0240042205222898e-05, + "loss": 0.015, + "step": 1948 + }, + { + "epoch": 0.5144440047487139, + "grad_norm": 0.9566598534584045, + "learning_rate": 1.025059351094698e-05, + "loss": 0.0215, + "step": 1950 + }, + { + "epoch": 0.5149716396253793, + "grad_norm": 0.13647043704986572, + "learning_rate": 1.0261144816671064e-05, + "loss": 0.0058, + "step": 1952 + }, + { + "epoch": 0.5154992745020446, + "grad_norm": 0.1868666410446167, + "learning_rate": 1.0271696122395147e-05, + "loss": 0.0104, + "step": 1954 + }, + { + "epoch": 0.51602690937871, + "grad_norm": 0.8100932240486145, + "learning_rate": 1.028224742811923e-05, + "loss": 0.0079, + "step": 1956 + }, + { + "epoch": 0.5165545442553753, + "grad_norm": 1.1595476865768433, + "learning_rate": 1.0292798733843315e-05, + "loss": 0.006, + "step": 1958 + }, + { + "epoch": 0.5170821791320406, + "grad_norm": 0.4871109426021576, + "learning_rate": 1.0303350039567396e-05, + "loss": 0.0207, + "step": 1960 + }, + { + "epoch": 0.517609814008706, + "grad_norm": 0.3426220715045929, + "learning_rate": 1.0313901345291482e-05, + "loss": 0.0164, + "step": 1962 + }, + { + "epoch": 0.5181374488853713, + "grad_norm": 0.05801932141184807, + "learning_rate": 1.0324452651015564e-05, + "loss": 0.0054, + "step": 1964 + }, + { + "epoch": 0.5186650837620367, + "grad_norm": 1.2561349868774414, + "learning_rate": 1.0335003956739647e-05, + "loss": 0.0095, + "step": 1966 + }, + { + "epoch": 0.519192718638702, + "grad_norm": 0.5320897698402405, + "learning_rate": 1.0345555262463731e-05, + "loss": 0.0089, + "step": 1968 + }, + { + "epoch": 0.5197203535153674, + "grad_norm": 0.23239992558956146, + "learning_rate": 1.0356106568187814e-05, + "loss": 0.0076, + "step": 1970 + }, + { + "epoch": 0.5202479883920327, + "grad_norm": 0.16569556295871735, + "learning_rate": 1.0366657873911899e-05, + "loss": 0.0058, + "step": 1972 + }, + { + "epoch": 0.5207756232686981, + "grad_norm": 0.055448830127716064, + "learning_rate": 1.037720917963598e-05, + "loss": 0.0172, + "step": 1974 + }, + { + "epoch": 0.5213032581453634, + "grad_norm": 0.23831871151924133, + "learning_rate": 1.0387760485360063e-05, + "loss": 0.0267, + "step": 1976 + }, + { + "epoch": 0.5218308930220288, + "grad_norm": 1.4090138673782349, + "learning_rate": 1.0398311791084148e-05, + "loss": 0.0161, + "step": 1978 + }, + { + "epoch": 0.5223585278986941, + "grad_norm": 0.04835481569170952, + "learning_rate": 1.040886309680823e-05, + "loss": 0.0107, + "step": 1980 + }, + { + "epoch": 0.5228861627753595, + "grad_norm": 0.7403975129127502, + "learning_rate": 1.0419414402532315e-05, + "loss": 0.0181, + "step": 1982 + }, + { + "epoch": 0.5234137976520248, + "grad_norm": 0.47098278999328613, + "learning_rate": 1.0429965708256398e-05, + "loss": 0.0217, + "step": 1984 + }, + { + "epoch": 0.5239414325286902, + "grad_norm": 1.8398592472076416, + "learning_rate": 1.044051701398048e-05, + "loss": 0.0153, + "step": 1986 + }, + { + "epoch": 0.5244690674053555, + "grad_norm": 0.6742032170295715, + "learning_rate": 1.0451068319704564e-05, + "loss": 0.0409, + "step": 1988 + }, + { + "epoch": 0.5249967022820209, + "grad_norm": 0.5107528567314148, + "learning_rate": 1.0461619625428647e-05, + "loss": 0.0102, + "step": 1990 + }, + { + "epoch": 0.5255243371586862, + "grad_norm": 0.04487007483839989, + "learning_rate": 1.0472170931152732e-05, + "loss": 0.0121, + "step": 1992 + }, + { + "epoch": 0.5260519720353516, + "grad_norm": 0.14736199378967285, + "learning_rate": 1.0482722236876815e-05, + "loss": 0.009, + "step": 1994 + }, + { + "epoch": 0.5265796069120169, + "grad_norm": 0.06886344403028488, + "learning_rate": 1.0493273542600896e-05, + "loss": 0.0078, + "step": 1996 + }, + { + "epoch": 0.5271072417886822, + "grad_norm": 0.1030597910284996, + "learning_rate": 1.0503824848324982e-05, + "loss": 0.0074, + "step": 1998 + }, + { + "epoch": 0.5276348766653476, + "grad_norm": 0.5475274324417114, + "learning_rate": 1.0514376154049064e-05, + "loss": 0.0138, + "step": 2000 + }, + { + "epoch": 0.5276348766653476, + "eval_loss": 0.008970611728727818, + "eval_runtime": 309.2842, + "eval_samples_per_second": 697.226, + "eval_steps_per_second": 87.156, + "step": 2000 + }, + { + "epoch": 0.5281625115420129, + "grad_norm": 0.0697711855173111, + "learning_rate": 1.0524927459773148e-05, + "loss": 0.0117, + "step": 2002 + }, + { + "epoch": 0.5286901464186783, + "grad_norm": 0.6850720047950745, + "learning_rate": 1.0535478765497231e-05, + "loss": 0.0121, + "step": 2004 + }, + { + "epoch": 0.5292177812953436, + "grad_norm": 0.1346169263124466, + "learning_rate": 1.0546030071221314e-05, + "loss": 0.0071, + "step": 2006 + }, + { + "epoch": 0.529745416172009, + "grad_norm": 0.28213950991630554, + "learning_rate": 1.0556581376945399e-05, + "loss": 0.0093, + "step": 2008 + }, + { + "epoch": 0.5302730510486743, + "grad_norm": 0.45582300424575806, + "learning_rate": 1.056713268266948e-05, + "loss": 0.0061, + "step": 2010 + }, + { + "epoch": 0.5308006859253397, + "grad_norm": 0.8248752355575562, + "learning_rate": 1.0577683988393565e-05, + "loss": 0.012, + "step": 2012 + }, + { + "epoch": 0.531328320802005, + "grad_norm": 0.9385994672775269, + "learning_rate": 1.0588235294117648e-05, + "loss": 0.0105, + "step": 2014 + }, + { + "epoch": 0.5318559556786704, + "grad_norm": 0.3015196919441223, + "learning_rate": 1.059878659984173e-05, + "loss": 0.0199, + "step": 2016 + }, + { + "epoch": 0.5323835905553357, + "grad_norm": 0.05520825833082199, + "learning_rate": 1.0609337905565815e-05, + "loss": 0.0045, + "step": 2018 + }, + { + "epoch": 0.5329112254320011, + "grad_norm": 0.09589599817991257, + "learning_rate": 1.0619889211289898e-05, + "loss": 0.0047, + "step": 2020 + }, + { + "epoch": 0.5334388603086664, + "grad_norm": 0.4211236834526062, + "learning_rate": 1.0630440517013983e-05, + "loss": 0.0181, + "step": 2022 + }, + { + "epoch": 0.5339664951853318, + "grad_norm": 2.131204605102539, + "learning_rate": 1.0640991822738064e-05, + "loss": 0.0183, + "step": 2024 + }, + { + "epoch": 0.5344941300619971, + "grad_norm": 0.07125158607959747, + "learning_rate": 1.0651543128462147e-05, + "loss": 0.0046, + "step": 2026 + }, + { + "epoch": 0.5350217649386625, + "grad_norm": 0.14335665106773376, + "learning_rate": 1.0662094434186232e-05, + "loss": 0.0053, + "step": 2028 + }, + { + "epoch": 0.5355493998153278, + "grad_norm": 0.4094734191894531, + "learning_rate": 1.0672645739910315e-05, + "loss": 0.0079, + "step": 2030 + }, + { + "epoch": 0.5360770346919932, + "grad_norm": 0.10656462609767914, + "learning_rate": 1.06831970456344e-05, + "loss": 0.005, + "step": 2032 + }, + { + "epoch": 0.5366046695686585, + "grad_norm": 0.21685868501663208, + "learning_rate": 1.0693748351358483e-05, + "loss": 0.0049, + "step": 2034 + }, + { + "epoch": 0.5371323044453238, + "grad_norm": 0.16352297365665436, + "learning_rate": 1.0704299657082564e-05, + "loss": 0.0066, + "step": 2036 + }, + { + "epoch": 0.5376599393219892, + "grad_norm": 0.48057350516319275, + "learning_rate": 1.0714850962806648e-05, + "loss": 0.0104, + "step": 2038 + }, + { + "epoch": 0.5381875741986545, + "grad_norm": 0.5535579919815063, + "learning_rate": 1.0725402268530731e-05, + "loss": 0.0166, + "step": 2040 + }, + { + "epoch": 0.5387152090753199, + "grad_norm": 0.14474788308143616, + "learning_rate": 1.0735953574254816e-05, + "loss": 0.0056, + "step": 2042 + }, + { + "epoch": 0.5392428439519852, + "grad_norm": 0.31410789489746094, + "learning_rate": 1.0746504879978899e-05, + "loss": 0.0077, + "step": 2044 + }, + { + "epoch": 0.5397704788286506, + "grad_norm": 0.2153777778148651, + "learning_rate": 1.075705618570298e-05, + "loss": 0.0117, + "step": 2046 + }, + { + "epoch": 0.5402981137053159, + "grad_norm": 0.549472451210022, + "learning_rate": 1.0767607491427065e-05, + "loss": 0.023, + "step": 2048 + }, + { + "epoch": 0.5408257485819813, + "grad_norm": 0.05505160614848137, + "learning_rate": 1.0778158797151148e-05, + "loss": 0.006, + "step": 2050 + }, + { + "epoch": 0.5413533834586466, + "grad_norm": 0.07751892507076263, + "learning_rate": 1.0788710102875233e-05, + "loss": 0.0142, + "step": 2052 + }, + { + "epoch": 0.541881018335312, + "grad_norm": 0.07150750607252121, + "learning_rate": 1.0799261408599315e-05, + "loss": 0.0042, + "step": 2054 + }, + { + "epoch": 0.5424086532119773, + "grad_norm": 0.08824638277292252, + "learning_rate": 1.0809812714323398e-05, + "loss": 0.0083, + "step": 2056 + }, + { + "epoch": 0.5429362880886427, + "grad_norm": 0.65947425365448, + "learning_rate": 1.0820364020047483e-05, + "loss": 0.008, + "step": 2058 + }, + { + "epoch": 0.543463922965308, + "grad_norm": 0.08530624210834503, + "learning_rate": 1.0830915325771564e-05, + "loss": 0.0132, + "step": 2060 + }, + { + "epoch": 0.5439915578419734, + "grad_norm": 0.17977440357208252, + "learning_rate": 1.0841466631495647e-05, + "loss": 0.018, + "step": 2062 + }, + { + "epoch": 0.5445191927186387, + "grad_norm": 0.7156467437744141, + "learning_rate": 1.0852017937219732e-05, + "loss": 0.005, + "step": 2064 + }, + { + "epoch": 0.5450468275953041, + "grad_norm": 0.13476324081420898, + "learning_rate": 1.0862569242943815e-05, + "loss": 0.0042, + "step": 2066 + }, + { + "epoch": 0.5455744624719694, + "grad_norm": 1.2955881357192993, + "learning_rate": 1.08731205486679e-05, + "loss": 0.0071, + "step": 2068 + }, + { + "epoch": 0.5461020973486348, + "grad_norm": 0.5923560857772827, + "learning_rate": 1.0883671854391983e-05, + "loss": 0.0096, + "step": 2070 + }, + { + "epoch": 0.5466297322253001, + "grad_norm": 0.6820182800292969, + "learning_rate": 1.0894223160116064e-05, + "loss": 0.0077, + "step": 2072 + }, + { + "epoch": 0.5471573671019654, + "grad_norm": 0.3611571490764618, + "learning_rate": 1.0904774465840148e-05, + "loss": 0.0128, + "step": 2074 + }, + { + "epoch": 0.5476850019786308, + "grad_norm": 0.9315333962440491, + "learning_rate": 1.0915325771564231e-05, + "loss": 0.0083, + "step": 2076 + }, + { + "epoch": 0.5482126368552961, + "grad_norm": 0.7047861218452454, + "learning_rate": 1.0925877077288316e-05, + "loss": 0.0068, + "step": 2078 + }, + { + "epoch": 0.5487402717319615, + "grad_norm": 0.06301169842481613, + "learning_rate": 1.0936428383012399e-05, + "loss": 0.0047, + "step": 2080 + }, + { + "epoch": 0.5492679066086268, + "grad_norm": 0.5669522285461426, + "learning_rate": 1.094697968873648e-05, + "loss": 0.0187, + "step": 2082 + }, + { + "epoch": 0.5497955414852922, + "grad_norm": 0.07205159217119217, + "learning_rate": 1.0957530994460565e-05, + "loss": 0.0045, + "step": 2084 + }, + { + "epoch": 0.5503231763619575, + "grad_norm": 0.1079445630311966, + "learning_rate": 1.0968082300184648e-05, + "loss": 0.0042, + "step": 2086 + }, + { + "epoch": 0.5508508112386229, + "grad_norm": 0.4666207730770111, + "learning_rate": 1.0978633605908733e-05, + "loss": 0.0119, + "step": 2088 + }, + { + "epoch": 0.5513784461152882, + "grad_norm": 0.03765810653567314, + "learning_rate": 1.0989184911632816e-05, + "loss": 0.0046, + "step": 2090 + }, + { + "epoch": 0.5519060809919536, + "grad_norm": 0.6924009323120117, + "learning_rate": 1.0999736217356898e-05, + "loss": 0.0225, + "step": 2092 + }, + { + "epoch": 0.5524337158686189, + "grad_norm": 0.08210259675979614, + "learning_rate": 1.1010287523080983e-05, + "loss": 0.011, + "step": 2094 + }, + { + "epoch": 0.5529613507452843, + "grad_norm": 0.21477526426315308, + "learning_rate": 1.1020838828805064e-05, + "loss": 0.0129, + "step": 2096 + }, + { + "epoch": 0.5534889856219496, + "grad_norm": 0.08022928237915039, + "learning_rate": 1.1031390134529149e-05, + "loss": 0.0041, + "step": 2098 + }, + { + "epoch": 0.554016620498615, + "grad_norm": 0.21196378767490387, + "learning_rate": 1.1041941440253232e-05, + "loss": 0.0069, + "step": 2100 + }, + { + "epoch": 0.5545442553752803, + "grad_norm": 0.30629515647888184, + "learning_rate": 1.1052492745977315e-05, + "loss": 0.0123, + "step": 2102 + }, + { + "epoch": 0.5550718902519457, + "grad_norm": 0.9035661816596985, + "learning_rate": 1.10630440517014e-05, + "loss": 0.0061, + "step": 2104 + }, + { + "epoch": 0.555599525128611, + "grad_norm": 0.060377076268196106, + "learning_rate": 1.1073595357425483e-05, + "loss": 0.0043, + "step": 2106 + }, + { + "epoch": 0.5561271600052764, + "grad_norm": 0.3856221139431, + "learning_rate": 1.1084146663149567e-05, + "loss": 0.0054, + "step": 2108 + }, + { + "epoch": 0.5566547948819417, + "grad_norm": 0.5163595080375671, + "learning_rate": 1.1094697968873649e-05, + "loss": 0.0083, + "step": 2110 + }, + { + "epoch": 0.557182429758607, + "grad_norm": 0.3377070724964142, + "learning_rate": 1.1105249274597731e-05, + "loss": 0.0062, + "step": 2112 + }, + { + "epoch": 0.5577100646352724, + "grad_norm": 0.1740851104259491, + "learning_rate": 1.1115800580321816e-05, + "loss": 0.0047, + "step": 2114 + }, + { + "epoch": 0.5582376995119377, + "grad_norm": 0.3115760087966919, + "learning_rate": 1.1126351886045899e-05, + "loss": 0.0071, + "step": 2116 + }, + { + "epoch": 0.5587653343886031, + "grad_norm": 0.24874018132686615, + "learning_rate": 1.1136903191769984e-05, + "loss": 0.0045, + "step": 2118 + }, + { + "epoch": 0.5592929692652684, + "grad_norm": 0.5186723470687866, + "learning_rate": 1.1147454497494065e-05, + "loss": 0.0067, + "step": 2120 + }, + { + "epoch": 0.5598206041419338, + "grad_norm": 0.05948548763990402, + "learning_rate": 1.1158005803218148e-05, + "loss": 0.0128, + "step": 2122 + }, + { + "epoch": 0.5603482390185991, + "grad_norm": 0.18602532148361206, + "learning_rate": 1.1168557108942233e-05, + "loss": 0.0045, + "step": 2124 + }, + { + "epoch": 0.5608758738952645, + "grad_norm": 0.03578482195734978, + "learning_rate": 1.1179108414666316e-05, + "loss": 0.0037, + "step": 2126 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 0.3422965109348297, + "learning_rate": 1.11896597203904e-05, + "loss": 0.0201, + "step": 2128 + }, + { + "epoch": 0.5619311436485952, + "grad_norm": 0.25338688492774963, + "learning_rate": 1.1200211026114483e-05, + "loss": 0.0142, + "step": 2130 + }, + { + "epoch": 0.5624587785252605, + "grad_norm": 0.08165710419416428, + "learning_rate": 1.1210762331838564e-05, + "loss": 0.0039, + "step": 2132 + }, + { + "epoch": 0.5629864134019259, + "grad_norm": 0.29601916670799255, + "learning_rate": 1.1221313637562649e-05, + "loss": 0.0441, + "step": 2134 + }, + { + "epoch": 0.5635140482785912, + "grad_norm": 0.2386123389005661, + "learning_rate": 1.1231864943286732e-05, + "loss": 0.0053, + "step": 2136 + }, + { + "epoch": 0.5640416831552566, + "grad_norm": 0.06005110591650009, + "learning_rate": 1.1242416249010817e-05, + "loss": 0.0045, + "step": 2138 + }, + { + "epoch": 0.5645693180319219, + "grad_norm": 0.1331196427345276, + "learning_rate": 1.12529675547349e-05, + "loss": 0.0118, + "step": 2140 + }, + { + "epoch": 0.5650969529085873, + "grad_norm": 0.21248994767665863, + "learning_rate": 1.1263518860458983e-05, + "loss": 0.0114, + "step": 2142 + }, + { + "epoch": 0.5656245877852526, + "grad_norm": 0.44955915212631226, + "learning_rate": 1.1274070166183067e-05, + "loss": 0.0145, + "step": 2144 + }, + { + "epoch": 0.566152222661918, + "grad_norm": 0.43821075558662415, + "learning_rate": 1.1284621471907149e-05, + "loss": 0.0097, + "step": 2146 + }, + { + "epoch": 0.5666798575385833, + "grad_norm": 0.3931674063205719, + "learning_rate": 1.1295172777631233e-05, + "loss": 0.0213, + "step": 2148 + }, + { + "epoch": 0.5672074924152486, + "grad_norm": 0.0872742161154747, + "learning_rate": 1.1305724083355316e-05, + "loss": 0.0067, + "step": 2150 + }, + { + "epoch": 0.567735127291914, + "grad_norm": 0.21991033852100372, + "learning_rate": 1.1316275389079399e-05, + "loss": 0.0144, + "step": 2152 + }, + { + "epoch": 0.5682627621685793, + "grad_norm": 0.42157208919525146, + "learning_rate": 1.1326826694803484e-05, + "loss": 0.0129, + "step": 2154 + }, + { + "epoch": 0.5687903970452447, + "grad_norm": 0.7920743823051453, + "learning_rate": 1.1337378000527565e-05, + "loss": 0.0096, + "step": 2156 + }, + { + "epoch": 0.56931803192191, + "grad_norm": 0.5487315058708191, + "learning_rate": 1.134792930625165e-05, + "loss": 0.0099, + "step": 2158 + }, + { + "epoch": 0.5698456667985754, + "grad_norm": 0.43245670199394226, + "learning_rate": 1.1358480611975733e-05, + "loss": 0.0116, + "step": 2160 + }, + { + "epoch": 0.5703733016752407, + "grad_norm": 0.32896310091018677, + "learning_rate": 1.1369031917699816e-05, + "loss": 0.0075, + "step": 2162 + }, + { + "epoch": 0.5709009365519061, + "grad_norm": 0.0598653219640255, + "learning_rate": 1.13795832234239e-05, + "loss": 0.0074, + "step": 2164 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.06251257658004761, + "learning_rate": 1.1390134529147983e-05, + "loss": 0.0048, + "step": 2166 + }, + { + "epoch": 0.5719562063052368, + "grad_norm": 0.1044391542673111, + "learning_rate": 1.1400685834872068e-05, + "loss": 0.0108, + "step": 2168 + }, + { + "epoch": 0.5724838411819021, + "grad_norm": 0.6177201867103577, + "learning_rate": 1.1411237140596149e-05, + "loss": 0.0123, + "step": 2170 + }, + { + "epoch": 0.5730114760585675, + "grad_norm": 0.8150466680526733, + "learning_rate": 1.1421788446320232e-05, + "loss": 0.03, + "step": 2172 + }, + { + "epoch": 0.5735391109352328, + "grad_norm": 1.320302963256836, + "learning_rate": 1.1432339752044317e-05, + "loss": 0.0139, + "step": 2174 + }, + { + "epoch": 0.5740667458118982, + "grad_norm": 1.6171177625656128, + "learning_rate": 1.14428910577684e-05, + "loss": 0.0167, + "step": 2176 + }, + { + "epoch": 0.5745943806885635, + "grad_norm": 1.0324256420135498, + "learning_rate": 1.1453442363492484e-05, + "loss": 0.0169, + "step": 2178 + }, + { + "epoch": 0.5751220155652289, + "grad_norm": 0.34613466262817383, + "learning_rate": 1.1463993669216567e-05, + "loss": 0.0118, + "step": 2180 + }, + { + "epoch": 0.5756496504418942, + "grad_norm": 0.3895927965641022, + "learning_rate": 1.1474544974940649e-05, + "loss": 0.0072, + "step": 2182 + }, + { + "epoch": 0.5761772853185596, + "grad_norm": 0.9013178944587708, + "learning_rate": 1.1485096280664733e-05, + "loss": 0.012, + "step": 2184 + }, + { + "epoch": 0.5767049201952249, + "grad_norm": 0.4073829650878906, + "learning_rate": 1.1495647586388816e-05, + "loss": 0.0075, + "step": 2186 + }, + { + "epoch": 0.5772325550718902, + "grad_norm": 0.13119713962078094, + "learning_rate": 1.1506198892112901e-05, + "loss": 0.0042, + "step": 2188 + }, + { + "epoch": 0.5777601899485556, + "grad_norm": 3.673598289489746, + "learning_rate": 1.1516750197836984e-05, + "loss": 0.0085, + "step": 2190 + }, + { + "epoch": 0.5782878248252209, + "grad_norm": 0.5314809679985046, + "learning_rate": 1.1527301503561065e-05, + "loss": 0.0098, + "step": 2192 + }, + { + "epoch": 0.5788154597018863, + "grad_norm": 0.06846645474433899, + "learning_rate": 1.153785280928515e-05, + "loss": 0.0086, + "step": 2194 + }, + { + "epoch": 0.5793430945785516, + "grad_norm": 0.08881646394729614, + "learning_rate": 1.1548404115009233e-05, + "loss": 0.0152, + "step": 2196 + }, + { + "epoch": 0.579870729455217, + "grad_norm": 0.11040347814559937, + "learning_rate": 1.1558955420733317e-05, + "loss": 0.0055, + "step": 2198 + }, + { + "epoch": 0.5803983643318823, + "grad_norm": 0.2141551375389099, + "learning_rate": 1.15695067264574e-05, + "loss": 0.0049, + "step": 2200 + }, + { + "epoch": 0.5809259992085477, + "grad_norm": 0.043771062046289444, + "learning_rate": 1.1580058032181483e-05, + "loss": 0.0035, + "step": 2202 + }, + { + "epoch": 0.581453634085213, + "grad_norm": 0.34490323066711426, + "learning_rate": 1.1590609337905568e-05, + "loss": 0.0054, + "step": 2204 + }, + { + "epoch": 0.5819812689618784, + "grad_norm": 0.15889066457748413, + "learning_rate": 1.160116064362965e-05, + "loss": 0.0068, + "step": 2206 + }, + { + "epoch": 0.5825089038385437, + "grad_norm": 0.18002675473690033, + "learning_rate": 1.1611711949353734e-05, + "loss": 0.0072, + "step": 2208 + }, + { + "epoch": 0.5830365387152091, + "grad_norm": 0.2647058963775635, + "learning_rate": 1.1622263255077817e-05, + "loss": 0.0036, + "step": 2210 + }, + { + "epoch": 0.5835641735918744, + "grad_norm": 0.2886922061443329, + "learning_rate": 1.16328145608019e-05, + "loss": 0.0146, + "step": 2212 + }, + { + "epoch": 0.5840918084685398, + "grad_norm": 0.021714380010962486, + "learning_rate": 1.1643365866525984e-05, + "loss": 0.0038, + "step": 2214 + }, + { + "epoch": 0.5846194433452051, + "grad_norm": 0.4853258728981018, + "learning_rate": 1.1653917172250067e-05, + "loss": 0.0156, + "step": 2216 + }, + { + "epoch": 0.5851470782218705, + "grad_norm": 0.032712068408727646, + "learning_rate": 1.1664468477974152e-05, + "loss": 0.0039, + "step": 2218 + }, + { + "epoch": 0.5856747130985358, + "grad_norm": 0.1000470444560051, + "learning_rate": 1.1675019783698233e-05, + "loss": 0.004, + "step": 2220 + }, + { + "epoch": 0.5862023479752012, + "grad_norm": 0.1126992329955101, + "learning_rate": 1.1685571089422316e-05, + "loss": 0.0126, + "step": 2222 + }, + { + "epoch": 0.5867299828518665, + "grad_norm": 0.1741747409105301, + "learning_rate": 1.1696122395146401e-05, + "loss": 0.0042, + "step": 2224 + }, + { + "epoch": 0.5872576177285319, + "grad_norm": 0.646369218826294, + "learning_rate": 1.1706673700870484e-05, + "loss": 0.008, + "step": 2226 + }, + { + "epoch": 0.5877852526051972, + "grad_norm": 0.7997984290122986, + "learning_rate": 1.1717225006594569e-05, + "loss": 0.0049, + "step": 2228 + }, + { + "epoch": 0.5883128874818625, + "grad_norm": 0.04288318753242493, + "learning_rate": 1.172777631231865e-05, + "loss": 0.0034, + "step": 2230 + }, + { + "epoch": 0.5888405223585279, + "grad_norm": 0.0355897918343544, + "learning_rate": 1.1738327618042733e-05, + "loss": 0.0036, + "step": 2232 + }, + { + "epoch": 0.5893681572351932, + "grad_norm": 0.03065570630133152, + "learning_rate": 1.1748878923766817e-05, + "loss": 0.0036, + "step": 2234 + }, + { + "epoch": 0.5898957921118586, + "grad_norm": 0.04565052315592766, + "learning_rate": 1.17594302294909e-05, + "loss": 0.0041, + "step": 2236 + }, + { + "epoch": 0.5904234269885239, + "grad_norm": 0.24094437062740326, + "learning_rate": 1.1769981535214985e-05, + "loss": 0.0041, + "step": 2238 + }, + { + "epoch": 0.5909510618651893, + "grad_norm": 0.21862636506557465, + "learning_rate": 1.1780532840939068e-05, + "loss": 0.0109, + "step": 2240 + }, + { + "epoch": 0.5914786967418546, + "grad_norm": 0.34097686409950256, + "learning_rate": 1.179108414666315e-05, + "loss": 0.0051, + "step": 2242 + }, + { + "epoch": 0.59200633161852, + "grad_norm": 0.336953729391098, + "learning_rate": 1.1801635452387234e-05, + "loss": 0.0183, + "step": 2244 + }, + { + "epoch": 0.5925339664951853, + "grad_norm": 0.2091287076473236, + "learning_rate": 1.1812186758111317e-05, + "loss": 0.0102, + "step": 2246 + }, + { + "epoch": 0.5930616013718507, + "grad_norm": 0.8213464021682739, + "learning_rate": 1.1822738063835402e-05, + "loss": 0.0234, + "step": 2248 + }, + { + "epoch": 0.593589236248516, + "grad_norm": 0.16762769222259521, + "learning_rate": 1.1833289369559484e-05, + "loss": 0.0038, + "step": 2250 + }, + { + "epoch": 0.5941168711251814, + "grad_norm": 0.0402490571141243, + "learning_rate": 1.1843840675283567e-05, + "loss": 0.0032, + "step": 2252 + }, + { + "epoch": 0.5946445060018467, + "grad_norm": 0.3958369195461273, + "learning_rate": 1.1854391981007652e-05, + "loss": 0.011, + "step": 2254 + }, + { + "epoch": 0.5951721408785121, + "grad_norm": 0.1327764242887497, + "learning_rate": 1.1864943286731733e-05, + "loss": 0.0038, + "step": 2256 + }, + { + "epoch": 0.5956997757551774, + "grad_norm": 0.19824615120887756, + "learning_rate": 1.1875494592455818e-05, + "loss": 0.0058, + "step": 2258 + }, + { + "epoch": 0.5962274106318428, + "grad_norm": 0.1500352919101715, + "learning_rate": 1.1886045898179901e-05, + "loss": 0.0052, + "step": 2260 + }, + { + "epoch": 0.5967550455085081, + "grad_norm": 0.5038607120513916, + "learning_rate": 1.1896597203903984e-05, + "loss": 0.0062, + "step": 2262 + }, + { + "epoch": 0.5972826803851735, + "grad_norm": 0.044930361211299896, + "learning_rate": 1.1907148509628069e-05, + "loss": 0.0039, + "step": 2264 + }, + { + "epoch": 0.5978103152618388, + "grad_norm": 0.3278166949748993, + "learning_rate": 1.191769981535215e-05, + "loss": 0.0107, + "step": 2266 + }, + { + "epoch": 0.5983379501385041, + "grad_norm": 0.07416464388370514, + "learning_rate": 1.1928251121076233e-05, + "loss": 0.0034, + "step": 2268 + }, + { + "epoch": 0.5988655850151695, + "grad_norm": 0.05812099575996399, + "learning_rate": 1.1938802426800317e-05, + "loss": 0.0044, + "step": 2270 + }, + { + "epoch": 0.5993932198918348, + "grad_norm": 0.35704028606414795, + "learning_rate": 1.19493537325244e-05, + "loss": 0.0144, + "step": 2272 + }, + { + "epoch": 0.5999208547685002, + "grad_norm": 0.8661141395568848, + "learning_rate": 1.1959905038248485e-05, + "loss": 0.0088, + "step": 2274 + }, + { + "epoch": 0.6004484896451655, + "grad_norm": 0.28305771946907043, + "learning_rate": 1.1970456343972568e-05, + "loss": 0.0109, + "step": 2276 + }, + { + "epoch": 0.6009761245218309, + "grad_norm": 0.3033488392829895, + "learning_rate": 1.198100764969665e-05, + "loss": 0.0062, + "step": 2278 + }, + { + "epoch": 0.6015037593984962, + "grad_norm": 0.04655710980296135, + "learning_rate": 1.1991558955420734e-05, + "loss": 0.0088, + "step": 2280 + }, + { + "epoch": 0.6020313942751616, + "grad_norm": 0.09750309586524963, + "learning_rate": 1.2002110261144817e-05, + "loss": 0.0036, + "step": 2282 + }, + { + "epoch": 0.6025590291518269, + "grad_norm": 0.03421271592378616, + "learning_rate": 1.2012661566868902e-05, + "loss": 0.0138, + "step": 2284 + }, + { + "epoch": 0.6030866640284923, + "grad_norm": 0.10735774040222168, + "learning_rate": 1.2023212872592985e-05, + "loss": 0.004, + "step": 2286 + }, + { + "epoch": 0.6036142989051576, + "grad_norm": 0.07137402147054672, + "learning_rate": 1.2033764178317067e-05, + "loss": 0.0121, + "step": 2288 + }, + { + "epoch": 0.604141933781823, + "grad_norm": 0.23072344064712524, + "learning_rate": 1.2044315484041152e-05, + "loss": 0.0038, + "step": 2290 + }, + { + "epoch": 0.6046695686584883, + "grad_norm": 0.3488893210887909, + "learning_rate": 1.2054866789765233e-05, + "loss": 0.0039, + "step": 2292 + }, + { + "epoch": 0.6051972035351537, + "grad_norm": 5.6069560050964355, + "learning_rate": 1.2065418095489318e-05, + "loss": 0.0081, + "step": 2294 + }, + { + "epoch": 0.605724838411819, + "grad_norm": 0.047380659729242325, + "learning_rate": 1.2075969401213401e-05, + "loss": 0.0053, + "step": 2296 + }, + { + "epoch": 0.6062524732884844, + "grad_norm": 0.03437439724802971, + "learning_rate": 1.2086520706937484e-05, + "loss": 0.0034, + "step": 2298 + }, + { + "epoch": 0.6067801081651497, + "grad_norm": 0.04999234899878502, + "learning_rate": 1.2097072012661569e-05, + "loss": 0.0031, + "step": 2300 + }, + { + "epoch": 0.6073077430418151, + "grad_norm": 0.6298342943191528, + "learning_rate": 1.210762331838565e-05, + "loss": 0.0062, + "step": 2302 + }, + { + "epoch": 0.6078353779184804, + "grad_norm": 0.24233071506023407, + "learning_rate": 1.2118174624109735e-05, + "loss": 0.0033, + "step": 2304 + }, + { + "epoch": 0.6083630127951457, + "grad_norm": 0.6052067875862122, + "learning_rate": 1.2128725929833817e-05, + "loss": 0.0143, + "step": 2306 + }, + { + "epoch": 0.6088906476718111, + "grad_norm": 0.2998565137386322, + "learning_rate": 1.21392772355579e-05, + "loss": 0.0044, + "step": 2308 + }, + { + "epoch": 0.6094182825484764, + "grad_norm": 0.022360214963555336, + "learning_rate": 1.2149828541281985e-05, + "loss": 0.0035, + "step": 2310 + }, + { + "epoch": 0.6099459174251418, + "grad_norm": 0.2969961166381836, + "learning_rate": 1.2160379847006068e-05, + "loss": 0.0056, + "step": 2312 + }, + { + "epoch": 0.6104735523018071, + "grad_norm": 0.9142144322395325, + "learning_rate": 1.2170931152730153e-05, + "loss": 0.0105, + "step": 2314 + }, + { + "epoch": 0.6110011871784725, + "grad_norm": 0.5330867767333984, + "learning_rate": 1.2181482458454234e-05, + "loss": 0.0086, + "step": 2316 + }, + { + "epoch": 0.6115288220551378, + "grad_norm": 0.07397203147411346, + "learning_rate": 1.2192033764178317e-05, + "loss": 0.0054, + "step": 2318 + }, + { + "epoch": 0.6120564569318032, + "grad_norm": 0.06264911592006683, + "learning_rate": 1.2202585069902402e-05, + "loss": 0.0084, + "step": 2320 + }, + { + "epoch": 0.6125840918084685, + "grad_norm": 0.17866390943527222, + "learning_rate": 1.2213136375626485e-05, + "loss": 0.0065, + "step": 2322 + }, + { + "epoch": 0.6131117266851339, + "grad_norm": 0.7390683889389038, + "learning_rate": 1.222368768135057e-05, + "loss": 0.0086, + "step": 2324 + }, + { + "epoch": 0.6136393615617992, + "grad_norm": 0.8157547116279602, + "learning_rate": 1.2234238987074652e-05, + "loss": 0.0049, + "step": 2326 + }, + { + "epoch": 0.6141669964384646, + "grad_norm": 0.11880359798669815, + "learning_rate": 1.2244790292798733e-05, + "loss": 0.0067, + "step": 2328 + }, + { + "epoch": 0.6146946313151299, + "grad_norm": 0.3457139730453491, + "learning_rate": 1.2255341598522818e-05, + "loss": 0.0123, + "step": 2330 + }, + { + "epoch": 0.6152222661917953, + "grad_norm": 0.19722039997577667, + "learning_rate": 1.2265892904246901e-05, + "loss": 0.0067, + "step": 2332 + }, + { + "epoch": 0.6157499010684606, + "grad_norm": 0.8113455772399902, + "learning_rate": 1.2276444209970986e-05, + "loss": 0.0135, + "step": 2334 + }, + { + "epoch": 0.616277535945126, + "grad_norm": 0.06178893893957138, + "learning_rate": 1.2286995515695069e-05, + "loss": 0.0043, + "step": 2336 + }, + { + "epoch": 0.6168051708217913, + "grad_norm": 0.34115535020828247, + "learning_rate": 1.229754682141915e-05, + "loss": 0.0082, + "step": 2338 + }, + { + "epoch": 0.6173328056984567, + "grad_norm": 0.05153942108154297, + "learning_rate": 1.2308098127143235e-05, + "loss": 0.003, + "step": 2340 + }, + { + "epoch": 0.617860440575122, + "grad_norm": 0.6601436734199524, + "learning_rate": 1.2318649432867318e-05, + "loss": 0.0166, + "step": 2342 + }, + { + "epoch": 0.6183880754517873, + "grad_norm": 0.04382011294364929, + "learning_rate": 1.2329200738591402e-05, + "loss": 0.0029, + "step": 2344 + }, + { + "epoch": 0.6189157103284527, + "grad_norm": 0.03275752067565918, + "learning_rate": 1.2339752044315485e-05, + "loss": 0.0028, + "step": 2346 + }, + { + "epoch": 0.619443345205118, + "grad_norm": 0.08252448588609695, + "learning_rate": 1.2350303350039568e-05, + "loss": 0.0033, + "step": 2348 + }, + { + "epoch": 0.6199709800817834, + "grad_norm": 0.1512918919324875, + "learning_rate": 1.2360854655763653e-05, + "loss": 0.0036, + "step": 2350 + }, + { + "epoch": 0.6204986149584487, + "grad_norm": 0.07866047322750092, + "learning_rate": 1.2371405961487734e-05, + "loss": 0.0091, + "step": 2352 + }, + { + "epoch": 0.6210262498351141, + "grad_norm": 0.3957495093345642, + "learning_rate": 1.2381957267211819e-05, + "loss": 0.0039, + "step": 2354 + }, + { + "epoch": 0.6215538847117794, + "grad_norm": 0.6096582412719727, + "learning_rate": 1.2392508572935902e-05, + "loss": 0.0077, + "step": 2356 + }, + { + "epoch": 0.6220815195884448, + "grad_norm": 0.2118632048368454, + "learning_rate": 1.2403059878659985e-05, + "loss": 0.0036, + "step": 2358 + }, + { + "epoch": 0.6226091544651101, + "grad_norm": 0.12897926568984985, + "learning_rate": 1.241361118438407e-05, + "loss": 0.0069, + "step": 2360 + }, + { + "epoch": 0.6231367893417755, + "grad_norm": 0.23569300770759583, + "learning_rate": 1.2424162490108152e-05, + "loss": 0.0039, + "step": 2362 + }, + { + "epoch": 0.6236644242184408, + "grad_norm": 0.053267721086740494, + "learning_rate": 1.2434713795832237e-05, + "loss": 0.0029, + "step": 2364 + }, + { + "epoch": 0.6241920590951062, + "grad_norm": 0.5157940983772278, + "learning_rate": 1.2445265101556318e-05, + "loss": 0.0061, + "step": 2366 + }, + { + "epoch": 0.6247196939717715, + "grad_norm": 0.5263046622276306, + "learning_rate": 1.2455816407280401e-05, + "loss": 0.012, + "step": 2368 + }, + { + "epoch": 0.6252473288484369, + "grad_norm": 0.31121572852134705, + "learning_rate": 1.2466367713004486e-05, + "loss": 0.0159, + "step": 2370 + }, + { + "epoch": 0.6257749637251022, + "grad_norm": 0.06090144068002701, + "learning_rate": 1.2476919018728569e-05, + "loss": 0.003, + "step": 2372 + }, + { + "epoch": 0.6263025986017676, + "grad_norm": 0.15601250529289246, + "learning_rate": 1.2487470324452653e-05, + "loss": 0.0109, + "step": 2374 + }, + { + "epoch": 0.6268302334784329, + "grad_norm": 0.3610065281391144, + "learning_rate": 1.2498021630176735e-05, + "loss": 0.0123, + "step": 2376 + }, + { + "epoch": 0.6273578683550983, + "grad_norm": 0.14966998994350433, + "learning_rate": 1.2508572935900818e-05, + "loss": 0.0045, + "step": 2378 + }, + { + "epoch": 0.6278855032317636, + "grad_norm": 0.10792508721351624, + "learning_rate": 1.2519124241624902e-05, + "loss": 0.003, + "step": 2380 + }, + { + "epoch": 0.6284131381084289, + "grad_norm": 0.1676919162273407, + "learning_rate": 1.2529675547348985e-05, + "loss": 0.0136, + "step": 2382 + }, + { + "epoch": 0.6289407729850943, + "grad_norm": 0.7720479965209961, + "learning_rate": 1.254022685307307e-05, + "loss": 0.021, + "step": 2384 + }, + { + "epoch": 0.6294684078617596, + "grad_norm": 0.07912670820951462, + "learning_rate": 1.2550778158797153e-05, + "loss": 0.003, + "step": 2386 + }, + { + "epoch": 0.629996042738425, + "grad_norm": 0.7447213530540466, + "learning_rate": 1.2561329464521234e-05, + "loss": 0.0075, + "step": 2388 + }, + { + "epoch": 0.6305236776150903, + "grad_norm": 0.040261026471853256, + "learning_rate": 1.2571880770245319e-05, + "loss": 0.0028, + "step": 2390 + }, + { + "epoch": 0.6310513124917557, + "grad_norm": 0.4987450838088989, + "learning_rate": 1.2582432075969402e-05, + "loss": 0.0044, + "step": 2392 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 1.3082246780395508, + "learning_rate": 1.2592983381693486e-05, + "loss": 0.0266, + "step": 2394 + }, + { + "epoch": 0.6321065822450864, + "grad_norm": 0.03969935327768326, + "learning_rate": 1.260353468741757e-05, + "loss": 0.0028, + "step": 2396 + }, + { + "epoch": 0.6326342171217517, + "grad_norm": 0.46895965933799744, + "learning_rate": 1.2614085993141652e-05, + "loss": 0.005, + "step": 2398 + }, + { + "epoch": 0.6331618519984171, + "grad_norm": 0.6904812455177307, + "learning_rate": 1.2624637298865737e-05, + "loss": 0.0099, + "step": 2400 + }, + { + "epoch": 0.6336894868750824, + "grad_norm": 0.11429673433303833, + "learning_rate": 1.2635188604589818e-05, + "loss": 0.0079, + "step": 2402 + }, + { + "epoch": 0.6342171217517478, + "grad_norm": 0.051434289664030075, + "learning_rate": 1.2645739910313903e-05, + "loss": 0.0088, + "step": 2404 + }, + { + "epoch": 0.6347447566284131, + "grad_norm": 0.6672757863998413, + "learning_rate": 1.2656291216037986e-05, + "loss": 0.0095, + "step": 2406 + }, + { + "epoch": 0.6352723915050785, + "grad_norm": 0.28457197546958923, + "learning_rate": 1.2666842521762069e-05, + "loss": 0.0125, + "step": 2408 + }, + { + "epoch": 0.6358000263817438, + "grad_norm": 0.3738384246826172, + "learning_rate": 1.2677393827486153e-05, + "loss": 0.0122, + "step": 2410 + }, + { + "epoch": 0.6363276612584092, + "grad_norm": 0.29476436972618103, + "learning_rate": 1.2687945133210235e-05, + "loss": 0.0058, + "step": 2412 + }, + { + "epoch": 0.6368552961350745, + "grad_norm": 0.07562926411628723, + "learning_rate": 1.269849643893432e-05, + "loss": 0.0039, + "step": 2414 + }, + { + "epoch": 0.6373829310117399, + "grad_norm": 0.07539337873458862, + "learning_rate": 1.2709047744658402e-05, + "loss": 0.003, + "step": 2416 + }, + { + "epoch": 0.6379105658884052, + "grad_norm": 0.1663830429315567, + "learning_rate": 1.2719599050382485e-05, + "loss": 0.0145, + "step": 2418 + }, + { + "epoch": 0.6384382007650705, + "grad_norm": 0.3360435664653778, + "learning_rate": 1.273015035610657e-05, + "loss": 0.0185, + "step": 2420 + }, + { + "epoch": 0.6389658356417359, + "grad_norm": 0.18304643034934998, + "learning_rate": 1.2740701661830653e-05, + "loss": 0.0036, + "step": 2422 + }, + { + "epoch": 0.6394934705184012, + "grad_norm": 0.029904531314969063, + "learning_rate": 1.2751252967554738e-05, + "loss": 0.0057, + "step": 2424 + }, + { + "epoch": 0.6400211053950666, + "grad_norm": 0.25667449831962585, + "learning_rate": 1.2761804273278819e-05, + "loss": 0.0098, + "step": 2426 + }, + { + "epoch": 0.6405487402717319, + "grad_norm": 0.04895729944109917, + "learning_rate": 1.2772355579002902e-05, + "loss": 0.0046, + "step": 2428 + }, + { + "epoch": 0.6410763751483973, + "grad_norm": 0.16526949405670166, + "learning_rate": 1.2782906884726986e-05, + "loss": 0.0114, + "step": 2430 + }, + { + "epoch": 0.6416040100250626, + "grad_norm": 0.0960419774055481, + "learning_rate": 1.279345819045107e-05, + "loss": 0.0032, + "step": 2432 + }, + { + "epoch": 0.642131644901728, + "grad_norm": 0.08632221817970276, + "learning_rate": 1.2804009496175154e-05, + "loss": 0.0127, + "step": 2434 + }, + { + "epoch": 0.6426592797783933, + "grad_norm": 0.23698070645332336, + "learning_rate": 1.2814560801899237e-05, + "loss": 0.0068, + "step": 2436 + }, + { + "epoch": 0.6431869146550587, + "grad_norm": 0.039588116109371185, + "learning_rate": 1.2825112107623318e-05, + "loss": 0.0033, + "step": 2438 + }, + { + "epoch": 0.643714549531724, + "grad_norm": 1.0413559675216675, + "learning_rate": 1.2835663413347403e-05, + "loss": 0.0214, + "step": 2440 + }, + { + "epoch": 0.6442421844083894, + "grad_norm": 0.3941947817802429, + "learning_rate": 1.2846214719071486e-05, + "loss": 0.0104, + "step": 2442 + }, + { + "epoch": 0.6447698192850547, + "grad_norm": 0.3751527667045593, + "learning_rate": 1.285676602479557e-05, + "loss": 0.0043, + "step": 2444 + }, + { + "epoch": 0.6452974541617201, + "grad_norm": 0.04314848408102989, + "learning_rate": 1.2867317330519653e-05, + "loss": 0.0097, + "step": 2446 + }, + { + "epoch": 0.6458250890383854, + "grad_norm": 0.10433851182460785, + "learning_rate": 1.2877868636243735e-05, + "loss": 0.0038, + "step": 2448 + }, + { + "epoch": 0.6463527239150508, + "grad_norm": 0.1530594825744629, + "learning_rate": 1.288841994196782e-05, + "loss": 0.0029, + "step": 2450 + }, + { + "epoch": 0.6468803587917161, + "grad_norm": 0.1732487678527832, + "learning_rate": 1.2898971247691902e-05, + "loss": 0.0093, + "step": 2452 + }, + { + "epoch": 0.6474079936683815, + "grad_norm": 0.4638527035713196, + "learning_rate": 1.2909522553415987e-05, + "loss": 0.0075, + "step": 2454 + }, + { + "epoch": 0.6479356285450468, + "grad_norm": 0.05113803595304489, + "learning_rate": 1.292007385914007e-05, + "loss": 0.0042, + "step": 2456 + }, + { + "epoch": 0.6484632634217121, + "grad_norm": 0.2139173001050949, + "learning_rate": 1.2930625164864153e-05, + "loss": 0.0093, + "step": 2458 + }, + { + "epoch": 0.6489908982983775, + "grad_norm": 0.039393845945596695, + "learning_rate": 1.2941176470588238e-05, + "loss": 0.0038, + "step": 2460 + }, + { + "epoch": 0.6495185331750428, + "grad_norm": 0.04229651018977165, + "learning_rate": 1.2951727776312319e-05, + "loss": 0.0079, + "step": 2462 + }, + { + "epoch": 0.6500461680517082, + "grad_norm": 0.15886196494102478, + "learning_rate": 1.2962279082036403e-05, + "loss": 0.0052, + "step": 2464 + }, + { + "epoch": 0.6505738029283735, + "grad_norm": 0.03829830512404442, + "learning_rate": 1.2972830387760486e-05, + "loss": 0.0026, + "step": 2466 + }, + { + "epoch": 0.6511014378050389, + "grad_norm": 0.02561100758612156, + "learning_rate": 1.298338169348457e-05, + "loss": 0.0029, + "step": 2468 + }, + { + "epoch": 0.6516290726817042, + "grad_norm": 0.0350370854139328, + "learning_rate": 1.2993932999208654e-05, + "loss": 0.0059, + "step": 2470 + }, + { + "epoch": 0.6521567075583696, + "grad_norm": 0.18002623319625854, + "learning_rate": 1.3004484304932737e-05, + "loss": 0.0025, + "step": 2472 + }, + { + "epoch": 0.6526843424350349, + "grad_norm": 1.1091535091400146, + "learning_rate": 1.3015035610656818e-05, + "loss": 0.0078, + "step": 2474 + }, + { + "epoch": 0.6532119773117003, + "grad_norm": 0.4812288284301758, + "learning_rate": 1.3025586916380903e-05, + "loss": 0.0071, + "step": 2476 + }, + { + "epoch": 0.6537396121883656, + "grad_norm": 0.07512831687927246, + "learning_rate": 1.3036138222104986e-05, + "loss": 0.0027, + "step": 2478 + }, + { + "epoch": 0.654267247065031, + "grad_norm": 0.6827439069747925, + "learning_rate": 1.304668952782907e-05, + "loss": 0.0098, + "step": 2480 + }, + { + "epoch": 0.6547948819416963, + "grad_norm": 0.5691679120063782, + "learning_rate": 1.3057240833553153e-05, + "loss": 0.0095, + "step": 2482 + }, + { + "epoch": 0.6553225168183617, + "grad_norm": 0.07553357630968094, + "learning_rate": 1.3067792139277235e-05, + "loss": 0.0126, + "step": 2484 + }, + { + "epoch": 0.655850151695027, + "grad_norm": 0.31166696548461914, + "learning_rate": 1.307834344500132e-05, + "loss": 0.0036, + "step": 2486 + }, + { + "epoch": 0.6563777865716924, + "grad_norm": 0.16130106151103973, + "learning_rate": 1.3088894750725402e-05, + "loss": 0.0041, + "step": 2488 + }, + { + "epoch": 0.6569054214483577, + "grad_norm": 0.09298447519540787, + "learning_rate": 1.3099446056449487e-05, + "loss": 0.003, + "step": 2490 + }, + { + "epoch": 0.6574330563250231, + "grad_norm": 1.0399887561798096, + "learning_rate": 1.310999736217357e-05, + "loss": 0.01, + "step": 2492 + }, + { + "epoch": 0.6579606912016884, + "grad_norm": 0.02332381345331669, + "learning_rate": 1.3120548667897653e-05, + "loss": 0.0023, + "step": 2494 + }, + { + "epoch": 0.6584883260783537, + "grad_norm": 0.09759225696325302, + "learning_rate": 1.3131099973621738e-05, + "loss": 0.005, + "step": 2496 + }, + { + "epoch": 0.6590159609550191, + "grad_norm": 0.10926694422960281, + "learning_rate": 1.3141651279345819e-05, + "loss": 0.0043, + "step": 2498 + }, + { + "epoch": 0.6595435958316844, + "grad_norm": 0.6652600765228271, + "learning_rate": 1.3152202585069904e-05, + "loss": 0.0071, + "step": 2500 + }, + { + "epoch": 0.6600712307083498, + "grad_norm": 0.2818048894405365, + "learning_rate": 1.3162753890793986e-05, + "loss": 0.0031, + "step": 2502 + }, + { + "epoch": 0.6605988655850151, + "grad_norm": 0.4470961093902588, + "learning_rate": 1.317330519651807e-05, + "loss": 0.0061, + "step": 2504 + }, + { + "epoch": 0.6611265004616805, + "grad_norm": 0.20703017711639404, + "learning_rate": 1.3183856502242154e-05, + "loss": 0.0046, + "step": 2506 + }, + { + "epoch": 0.6616541353383458, + "grad_norm": 0.4324851930141449, + "learning_rate": 1.3194407807966237e-05, + "loss": 0.0094, + "step": 2508 + }, + { + "epoch": 0.6621817702150112, + "grad_norm": 0.18635980784893036, + "learning_rate": 1.3204959113690322e-05, + "loss": 0.0114, + "step": 2510 + }, + { + "epoch": 0.6627094050916765, + "grad_norm": 0.3455379903316498, + "learning_rate": 1.3215510419414403e-05, + "loss": 0.0144, + "step": 2512 + }, + { + "epoch": 0.6632370399683419, + "grad_norm": 0.18342149257659912, + "learning_rate": 1.3226061725138486e-05, + "loss": 0.0034, + "step": 2514 + }, + { + "epoch": 0.6637646748450072, + "grad_norm": 0.1753171980381012, + "learning_rate": 1.323661303086257e-05, + "loss": 0.0069, + "step": 2516 + }, + { + "epoch": 0.6642923097216726, + "grad_norm": 0.35500094294548035, + "learning_rate": 1.3247164336586654e-05, + "loss": 0.0079, + "step": 2518 + }, + { + "epoch": 0.6648199445983379, + "grad_norm": 0.5774187445640564, + "learning_rate": 1.3257715642310738e-05, + "loss": 0.0129, + "step": 2520 + }, + { + "epoch": 0.6653475794750033, + "grad_norm": 0.07281900197267532, + "learning_rate": 1.326826694803482e-05, + "loss": 0.0051, + "step": 2522 + }, + { + "epoch": 0.6658752143516686, + "grad_norm": 0.19720390439033508, + "learning_rate": 1.3278818253758902e-05, + "loss": 0.0071, + "step": 2524 + }, + { + "epoch": 0.666402849228334, + "grad_norm": 0.26355379819869995, + "learning_rate": 1.3289369559482987e-05, + "loss": 0.0101, + "step": 2526 + }, + { + "epoch": 0.6669304841049993, + "grad_norm": 0.0309640783816576, + "learning_rate": 1.329992086520707e-05, + "loss": 0.018, + "step": 2528 + }, + { + "epoch": 0.6674581189816647, + "grad_norm": 0.11476221680641174, + "learning_rate": 1.3310472170931155e-05, + "loss": 0.005, + "step": 2530 + }, + { + "epoch": 0.66798575385833, + "grad_norm": 4.0450215339660645, + "learning_rate": 1.3321023476655238e-05, + "loss": 0.0129, + "step": 2532 + }, + { + "epoch": 0.6685133887349953, + "grad_norm": 0.24265612661838531, + "learning_rate": 1.3331574782379319e-05, + "loss": 0.0046, + "step": 2534 + }, + { + "epoch": 0.6690410236116607, + "grad_norm": 0.32178211212158203, + "learning_rate": 1.3342126088103404e-05, + "loss": 0.0062, + "step": 2536 + }, + { + "epoch": 0.669568658488326, + "grad_norm": 0.5474958419799805, + "learning_rate": 1.3352677393827487e-05, + "loss": 0.0114, + "step": 2538 + }, + { + "epoch": 0.6700962933649914, + "grad_norm": 0.30484476685523987, + "learning_rate": 1.3363228699551571e-05, + "loss": 0.0174, + "step": 2540 + }, + { + "epoch": 0.6706239282416567, + "grad_norm": 0.2977488040924072, + "learning_rate": 1.3373780005275654e-05, + "loss": 0.0102, + "step": 2542 + }, + { + "epoch": 0.6711515631183221, + "grad_norm": 0.18393495678901672, + "learning_rate": 1.3384331310999737e-05, + "loss": 0.0038, + "step": 2544 + }, + { + "epoch": 0.6716791979949874, + "grad_norm": 0.09977661818265915, + "learning_rate": 1.3394882616723822e-05, + "loss": 0.003, + "step": 2546 + }, + { + "epoch": 0.6722068328716528, + "grad_norm": 0.3844533860683441, + "learning_rate": 1.3405433922447903e-05, + "loss": 0.0052, + "step": 2548 + }, + { + "epoch": 0.6727344677483181, + "grad_norm": 0.10022243112325668, + "learning_rate": 1.3415985228171988e-05, + "loss": 0.0026, + "step": 2550 + }, + { + "epoch": 0.6732621026249835, + "grad_norm": 0.474331796169281, + "learning_rate": 1.342653653389607e-05, + "loss": 0.0107, + "step": 2552 + }, + { + "epoch": 0.6737897375016488, + "grad_norm": 0.29082173109054565, + "learning_rate": 1.3437087839620154e-05, + "loss": 0.0073, + "step": 2554 + }, + { + "epoch": 0.6743173723783142, + "grad_norm": 0.48244187235832214, + "learning_rate": 1.3447639145344238e-05, + "loss": 0.0107, + "step": 2556 + }, + { + "epoch": 0.6748450072549795, + "grad_norm": 0.09153445065021515, + "learning_rate": 1.345819045106832e-05, + "loss": 0.0026, + "step": 2558 + }, + { + "epoch": 0.6753726421316449, + "grad_norm": 0.39827999472618103, + "learning_rate": 1.3468741756792406e-05, + "loss": 0.0095, + "step": 2560 + }, + { + "epoch": 0.6759002770083102, + "grad_norm": 0.2947092354297638, + "learning_rate": 1.3479293062516487e-05, + "loss": 0.0066, + "step": 2562 + }, + { + "epoch": 0.6764279118849756, + "grad_norm": 0.2210869938135147, + "learning_rate": 1.348984436824057e-05, + "loss": 0.0062, + "step": 2564 + }, + { + "epoch": 0.6769555467616409, + "grad_norm": 0.16169025003910065, + "learning_rate": 1.3500395673964655e-05, + "loss": 0.0062, + "step": 2566 + }, + { + "epoch": 0.6774831816383063, + "grad_norm": 0.1408298909664154, + "learning_rate": 1.3510946979688738e-05, + "loss": 0.004, + "step": 2568 + }, + { + "epoch": 0.6780108165149716, + "grad_norm": 0.395568311214447, + "learning_rate": 1.3521498285412822e-05, + "loss": 0.0046, + "step": 2570 + }, + { + "epoch": 0.6785384513916369, + "grad_norm": 0.06855584681034088, + "learning_rate": 1.3532049591136904e-05, + "loss": 0.0024, + "step": 2572 + }, + { + "epoch": 0.6790660862683023, + "grad_norm": 0.14829349517822266, + "learning_rate": 1.3542600896860987e-05, + "loss": 0.0039, + "step": 2574 + }, + { + "epoch": 0.6795937211449676, + "grad_norm": 0.03608287125825882, + "learning_rate": 1.3553152202585071e-05, + "loss": 0.0023, + "step": 2576 + }, + { + "epoch": 0.680121356021633, + "grad_norm": 0.028731664642691612, + "learning_rate": 1.3563703508309154e-05, + "loss": 0.0021, + "step": 2578 + }, + { + "epoch": 0.6806489908982983, + "grad_norm": 0.3863641619682312, + "learning_rate": 1.3574254814033239e-05, + "loss": 0.0053, + "step": 2580 + }, + { + "epoch": 0.6811766257749637, + "grad_norm": 0.03301887959241867, + "learning_rate": 1.3584806119757322e-05, + "loss": 0.0044, + "step": 2582 + }, + { + "epoch": 0.681704260651629, + "grad_norm": 0.3079659938812256, + "learning_rate": 1.3595357425481403e-05, + "loss": 0.0031, + "step": 2584 + }, + { + "epoch": 0.6822318955282944, + "grad_norm": 0.953937828540802, + "learning_rate": 1.3605908731205488e-05, + "loss": 0.0204, + "step": 2586 + }, + { + "epoch": 0.6827595304049597, + "grad_norm": 0.025959398597478867, + "learning_rate": 1.361646003692957e-05, + "loss": 0.006, + "step": 2588 + }, + { + "epoch": 0.6832871652816251, + "grad_norm": 0.0470987930893898, + "learning_rate": 1.3627011342653655e-05, + "loss": 0.0023, + "step": 2590 + }, + { + "epoch": 0.6838148001582904, + "grad_norm": 0.6920813918113708, + "learning_rate": 1.3637562648377738e-05, + "loss": 0.0037, + "step": 2592 + }, + { + "epoch": 0.6843424350349558, + "grad_norm": 0.4131225645542145, + "learning_rate": 1.364811395410182e-05, + "loss": 0.0048, + "step": 2594 + }, + { + "epoch": 0.6848700699116211, + "grad_norm": 0.10382501780986786, + "learning_rate": 1.3658665259825906e-05, + "loss": 0.0039, + "step": 2596 + }, + { + "epoch": 0.6853977047882865, + "grad_norm": 0.159744530916214, + "learning_rate": 1.3669216565549987e-05, + "loss": 0.0028, + "step": 2598 + }, + { + "epoch": 0.6859253396649518, + "grad_norm": 0.12795746326446533, + "learning_rate": 1.3679767871274072e-05, + "loss": 0.0026, + "step": 2600 + }, + { + "epoch": 0.6864529745416172, + "grad_norm": 0.02451411634683609, + "learning_rate": 1.3690319176998155e-05, + "loss": 0.0021, + "step": 2602 + }, + { + "epoch": 0.6869806094182825, + "grad_norm": 0.044708967208862305, + "learning_rate": 1.3700870482722238e-05, + "loss": 0.0022, + "step": 2604 + }, + { + "epoch": 0.687508244294948, + "grad_norm": 0.022962838411331177, + "learning_rate": 1.3711421788446322e-05, + "loss": 0.0069, + "step": 2606 + }, + { + "epoch": 0.6880358791716132, + "grad_norm": 0.5857008099555969, + "learning_rate": 1.3721973094170404e-05, + "loss": 0.0111, + "step": 2608 + }, + { + "epoch": 0.6885635140482786, + "grad_norm": 0.026238026097416878, + "learning_rate": 1.3732524399894488e-05, + "loss": 0.0021, + "step": 2610 + }, + { + "epoch": 0.6890911489249439, + "grad_norm": 0.12501491606235504, + "learning_rate": 1.3743075705618571e-05, + "loss": 0.0025, + "step": 2612 + }, + { + "epoch": 0.6896187838016092, + "grad_norm": 0.3262298107147217, + "learning_rate": 1.3753627011342654e-05, + "loss": 0.0143, + "step": 2614 + }, + { + "epoch": 0.6901464186782746, + "grad_norm": 0.4272230863571167, + "learning_rate": 1.3764178317066739e-05, + "loss": 0.025, + "step": 2616 + }, + { + "epoch": 0.6906740535549399, + "grad_norm": 0.7354425191879272, + "learning_rate": 1.3774729622790822e-05, + "loss": 0.0062, + "step": 2618 + }, + { + "epoch": 0.6912016884316053, + "grad_norm": 0.10975382477045059, + "learning_rate": 1.3785280928514906e-05, + "loss": 0.0033, + "step": 2620 + }, + { + "epoch": 0.6917293233082706, + "grad_norm": 0.040215864777565, + "learning_rate": 1.3795832234238988e-05, + "loss": 0.0059, + "step": 2622 + }, + { + "epoch": 0.692256958184936, + "grad_norm": 0.06408130377531052, + "learning_rate": 1.380638353996307e-05, + "loss": 0.0031, + "step": 2624 + }, + { + "epoch": 0.6927845930616013, + "grad_norm": 0.10935935378074646, + "learning_rate": 1.3816934845687155e-05, + "loss": 0.0111, + "step": 2626 + }, + { + "epoch": 0.6933122279382667, + "grad_norm": 0.4506552219390869, + "learning_rate": 1.3827486151411238e-05, + "loss": 0.0109, + "step": 2628 + }, + { + "epoch": 0.693839862814932, + "grad_norm": 0.06359558552503586, + "learning_rate": 1.3838037457135323e-05, + "loss": 0.0029, + "step": 2630 + }, + { + "epoch": 0.6943674976915974, + "grad_norm": 0.554663896560669, + "learning_rate": 1.3848588762859406e-05, + "loss": 0.0081, + "step": 2632 + }, + { + "epoch": 0.6948951325682627, + "grad_norm": 0.04934075102210045, + "learning_rate": 1.3859140068583487e-05, + "loss": 0.0035, + "step": 2634 + }, + { + "epoch": 0.6954227674449281, + "grad_norm": 0.030709434300661087, + "learning_rate": 1.3869691374307572e-05, + "loss": 0.0071, + "step": 2636 + }, + { + "epoch": 0.6959504023215934, + "grad_norm": 0.14254042506217957, + "learning_rate": 1.3880242680031655e-05, + "loss": 0.0031, + "step": 2638 + }, + { + "epoch": 0.6964780371982588, + "grad_norm": 0.16822658479213715, + "learning_rate": 1.389079398575574e-05, + "loss": 0.006, + "step": 2640 + }, + { + "epoch": 0.6970056720749241, + "grad_norm": 0.048302389681339264, + "learning_rate": 1.3901345291479822e-05, + "loss": 0.0064, + "step": 2642 + }, + { + "epoch": 0.6975333069515895, + "grad_norm": 0.6077779531478882, + "learning_rate": 1.3911896597203904e-05, + "loss": 0.0057, + "step": 2644 + }, + { + "epoch": 0.6980609418282548, + "grad_norm": 0.5254798531532288, + "learning_rate": 1.3922447902927988e-05, + "loss": 0.0046, + "step": 2646 + }, + { + "epoch": 0.6985885767049202, + "grad_norm": 0.042530231177806854, + "learning_rate": 1.3932999208652071e-05, + "loss": 0.0119, + "step": 2648 + }, + { + "epoch": 0.6991162115815855, + "grad_norm": 0.410663902759552, + "learning_rate": 1.3943550514376156e-05, + "loss": 0.0088, + "step": 2650 + }, + { + "epoch": 0.6996438464582508, + "grad_norm": 0.31983649730682373, + "learning_rate": 1.3954101820100239e-05, + "loss": 0.0067, + "step": 2652 + }, + { + "epoch": 0.7001714813349162, + "grad_norm": 0.7586281895637512, + "learning_rate": 1.3964653125824322e-05, + "loss": 0.0082, + "step": 2654 + }, + { + "epoch": 0.7006991162115815, + "grad_norm": 0.26345500349998474, + "learning_rate": 1.3975204431548407e-05, + "loss": 0.0059, + "step": 2656 + }, + { + "epoch": 0.701226751088247, + "grad_norm": 0.2116478830575943, + "learning_rate": 1.3985755737272488e-05, + "loss": 0.0055, + "step": 2658 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 0.1562986820936203, + "learning_rate": 1.3996307042996572e-05, + "loss": 0.0036, + "step": 2660 + }, + { + "epoch": 0.7022820208415776, + "grad_norm": 0.1014990583062172, + "learning_rate": 1.4006858348720655e-05, + "loss": 0.0022, + "step": 2662 + }, + { + "epoch": 0.7028096557182429, + "grad_norm": 0.4865214228630066, + "learning_rate": 1.4017409654444738e-05, + "loss": 0.013, + "step": 2664 + }, + { + "epoch": 0.7033372905949083, + "grad_norm": 0.4921734631061554, + "learning_rate": 1.4027960960168823e-05, + "loss": 0.0055, + "step": 2666 + }, + { + "epoch": 0.7038649254715736, + "grad_norm": 0.8906884789466858, + "learning_rate": 1.4038512265892906e-05, + "loss": 0.006, + "step": 2668 + }, + { + "epoch": 0.704392560348239, + "grad_norm": 0.6837649345397949, + "learning_rate": 1.404906357161699e-05, + "loss": 0.0093, + "step": 2670 + }, + { + "epoch": 0.7049201952249043, + "grad_norm": 0.10945390909910202, + "learning_rate": 1.4059614877341072e-05, + "loss": 0.0053, + "step": 2672 + }, + { + "epoch": 0.7054478301015698, + "grad_norm": 0.24208278954029083, + "learning_rate": 1.4070166183065155e-05, + "loss": 0.003, + "step": 2674 + }, + { + "epoch": 0.705975464978235, + "grad_norm": 0.14783506095409393, + "learning_rate": 1.408071748878924e-05, + "loss": 0.0137, + "step": 2676 + }, + { + "epoch": 0.7065030998549005, + "grad_norm": 0.10711310803890228, + "learning_rate": 1.4091268794513322e-05, + "loss": 0.0131, + "step": 2678 + }, + { + "epoch": 0.7070307347315657, + "grad_norm": 0.17332711815834045, + "learning_rate": 1.4101820100237404e-05, + "loss": 0.0032, + "step": 2680 + }, + { + "epoch": 0.7075583696082312, + "grad_norm": 0.6830866932868958, + "learning_rate": 1.4112371405961488e-05, + "loss": 0.0136, + "step": 2682 + }, + { + "epoch": 0.7080860044848964, + "grad_norm": 0.06810946017503738, + "learning_rate": 1.4122922711685571e-05, + "loss": 0.0022, + "step": 2684 + }, + { + "epoch": 0.7086136393615619, + "grad_norm": 0.5201871991157532, + "learning_rate": 1.4133474017409656e-05, + "loss": 0.0043, + "step": 2686 + }, + { + "epoch": 0.7091412742382271, + "grad_norm": 0.09244726598262787, + "learning_rate": 1.4144025323133739e-05, + "loss": 0.0022, + "step": 2688 + }, + { + "epoch": 0.7096689091148924, + "grad_norm": 0.540807843208313, + "learning_rate": 1.4154576628857822e-05, + "loss": 0.0093, + "step": 2690 + }, + { + "epoch": 0.7101965439915578, + "grad_norm": 0.03155829757452011, + "learning_rate": 1.4165127934581907e-05, + "loss": 0.0023, + "step": 2692 + }, + { + "epoch": 0.7107241788682231, + "grad_norm": 0.22798027098178864, + "learning_rate": 1.4175679240305988e-05, + "loss": 0.0073, + "step": 2694 + }, + { + "epoch": 0.7112518137448886, + "grad_norm": 0.5284186005592346, + "learning_rate": 1.4186230546030072e-05, + "loss": 0.0038, + "step": 2696 + }, + { + "epoch": 0.7117794486215538, + "grad_norm": 0.041939087212085724, + "learning_rate": 1.4196781851754155e-05, + "loss": 0.0021, + "step": 2698 + }, + { + "epoch": 0.7123070834982193, + "grad_norm": 0.3177500367164612, + "learning_rate": 1.4207333157478238e-05, + "loss": 0.0032, + "step": 2700 + }, + { + "epoch": 0.7128347183748845, + "grad_norm": 0.020451953634619713, + "learning_rate": 1.4217884463202323e-05, + "loss": 0.0023, + "step": 2702 + }, + { + "epoch": 0.71336235325155, + "grad_norm": 0.03644361346960068, + "learning_rate": 1.4228435768926406e-05, + "loss": 0.0018, + "step": 2704 + }, + { + "epoch": 0.7138899881282152, + "grad_norm": 0.030675729736685753, + "learning_rate": 1.423898707465049e-05, + "loss": 0.003, + "step": 2706 + }, + { + "epoch": 0.7144176230048807, + "grad_norm": 0.03699678182601929, + "learning_rate": 1.4249538380374572e-05, + "loss": 0.007, + "step": 2708 + }, + { + "epoch": 0.714945257881546, + "grad_norm": 0.04584222659468651, + "learning_rate": 1.4260089686098655e-05, + "loss": 0.0098, + "step": 2710 + }, + { + "epoch": 0.7154728927582114, + "grad_norm": 0.043041616678237915, + "learning_rate": 1.427064099182274e-05, + "loss": 0.0028, + "step": 2712 + }, + { + "epoch": 0.7160005276348766, + "grad_norm": 0.4044210612773895, + "learning_rate": 1.4281192297546823e-05, + "loss": 0.0094, + "step": 2714 + }, + { + "epoch": 0.716528162511542, + "grad_norm": 0.6979888677597046, + "learning_rate": 1.4291743603270907e-05, + "loss": 0.0137, + "step": 2716 + }, + { + "epoch": 0.7170557973882073, + "grad_norm": 0.1562553495168686, + "learning_rate": 1.4302294908994988e-05, + "loss": 0.0023, + "step": 2718 + }, + { + "epoch": 0.7175834322648728, + "grad_norm": 0.5136690735816956, + "learning_rate": 1.4312846214719071e-05, + "loss": 0.0112, + "step": 2720 + }, + { + "epoch": 0.718111067141538, + "grad_norm": 0.07132653892040253, + "learning_rate": 1.4323397520443156e-05, + "loss": 0.0047, + "step": 2722 + }, + { + "epoch": 0.7186387020182035, + "grad_norm": 0.06684327125549316, + "learning_rate": 1.4333948826167239e-05, + "loss": 0.0051, + "step": 2724 + }, + { + "epoch": 0.7191663368948688, + "grad_norm": 0.07251565158367157, + "learning_rate": 1.4344500131891324e-05, + "loss": 0.0024, + "step": 2726 + }, + { + "epoch": 0.719693971771534, + "grad_norm": 0.31686505675315857, + "learning_rate": 1.4355051437615407e-05, + "loss": 0.0096, + "step": 2728 + }, + { + "epoch": 0.7202216066481995, + "grad_norm": 0.2933167517185211, + "learning_rate": 1.4365602743339488e-05, + "loss": 0.0099, + "step": 2730 + }, + { + "epoch": 0.7207492415248647, + "grad_norm": 0.12726812064647675, + "learning_rate": 1.4376154049063573e-05, + "loss": 0.0075, + "step": 2732 + }, + { + "epoch": 0.7212768764015302, + "grad_norm": 1.0089287757873535, + "learning_rate": 1.4386705354787655e-05, + "loss": 0.0081, + "step": 2734 + }, + { + "epoch": 0.7218045112781954, + "grad_norm": 0.022879842668771744, + "learning_rate": 1.439725666051174e-05, + "loss": 0.0124, + "step": 2736 + }, + { + "epoch": 0.7223321461548609, + "grad_norm": 0.3487453758716583, + "learning_rate": 1.4407807966235823e-05, + "loss": 0.0079, + "step": 2738 + }, + { + "epoch": 0.7228597810315261, + "grad_norm": 0.04873011261224747, + "learning_rate": 1.4418359271959906e-05, + "loss": 0.0034, + "step": 2740 + }, + { + "epoch": 0.7233874159081916, + "grad_norm": 0.07939106971025467, + "learning_rate": 1.442891057768399e-05, + "loss": 0.0058, + "step": 2742 + }, + { + "epoch": 0.7239150507848569, + "grad_norm": 0.5920801758766174, + "learning_rate": 1.4439461883408072e-05, + "loss": 0.0048, + "step": 2744 + }, + { + "epoch": 0.7244426856615223, + "grad_norm": 0.12236537039279938, + "learning_rate": 1.4450013189132157e-05, + "loss": 0.0039, + "step": 2746 + }, + { + "epoch": 0.7249703205381876, + "grad_norm": 0.2089218944311142, + "learning_rate": 1.446056449485624e-05, + "loss": 0.0051, + "step": 2748 + }, + { + "epoch": 0.725497955414853, + "grad_norm": 0.07387931644916534, + "learning_rate": 1.4471115800580323e-05, + "loss": 0.0021, + "step": 2750 + }, + { + "epoch": 0.7260255902915183, + "grad_norm": 0.16933488845825195, + "learning_rate": 1.4481667106304407e-05, + "loss": 0.0023, + "step": 2752 + }, + { + "epoch": 0.7265532251681837, + "grad_norm": 0.025968873873353004, + "learning_rate": 1.4492218412028488e-05, + "loss": 0.0101, + "step": 2754 + }, + { + "epoch": 0.727080860044849, + "grad_norm": 0.14067091047763824, + "learning_rate": 1.4502769717752573e-05, + "loss": 0.0087, + "step": 2756 + }, + { + "epoch": 0.7276084949215144, + "grad_norm": 0.05542822182178497, + "learning_rate": 1.4513321023476656e-05, + "loss": 0.0052, + "step": 2758 + }, + { + "epoch": 0.7281361297981797, + "grad_norm": 0.037888094782829285, + "learning_rate": 1.4523872329200739e-05, + "loss": 0.0018, + "step": 2760 + }, + { + "epoch": 0.7286637646748451, + "grad_norm": 0.05691179260611534, + "learning_rate": 1.4534423634924824e-05, + "loss": 0.0058, + "step": 2762 + }, + { + "epoch": 0.7291913995515104, + "grad_norm": 0.02202732302248478, + "learning_rate": 1.4544974940648907e-05, + "loss": 0.0023, + "step": 2764 + }, + { + "epoch": 0.7297190344281757, + "grad_norm": 0.024148283526301384, + "learning_rate": 1.4555526246372991e-05, + "loss": 0.0018, + "step": 2766 + }, + { + "epoch": 0.7302466693048411, + "grad_norm": 0.38554856181144714, + "learning_rate": 1.4566077552097073e-05, + "loss": 0.0127, + "step": 2768 + }, + { + "epoch": 0.7307743041815064, + "grad_norm": 0.10364695638418198, + "learning_rate": 1.4576628857821156e-05, + "loss": 0.007, + "step": 2770 + }, + { + "epoch": 0.7313019390581718, + "grad_norm": 0.14119498431682587, + "learning_rate": 1.458718016354524e-05, + "loss": 0.0027, + "step": 2772 + }, + { + "epoch": 0.731829573934837, + "grad_norm": 0.16606128215789795, + "learning_rate": 1.4597731469269323e-05, + "loss": 0.0022, + "step": 2774 + }, + { + "epoch": 0.7323572088115025, + "grad_norm": 0.31702470779418945, + "learning_rate": 1.4608282774993408e-05, + "loss": 0.0059, + "step": 2776 + }, + { + "epoch": 0.7328848436881678, + "grad_norm": 0.11535658687353134, + "learning_rate": 1.461883408071749e-05, + "loss": 0.0022, + "step": 2778 + }, + { + "epoch": 0.7334124785648332, + "grad_norm": 0.03971853479743004, + "learning_rate": 1.4629385386441572e-05, + "loss": 0.0073, + "step": 2780 + }, + { + "epoch": 0.7339401134414985, + "grad_norm": 0.021127842366695404, + "learning_rate": 1.4639936692165657e-05, + "loss": 0.0072, + "step": 2782 + }, + { + "epoch": 0.7344677483181639, + "grad_norm": 0.10444173216819763, + "learning_rate": 1.465048799788974e-05, + "loss": 0.002, + "step": 2784 + }, + { + "epoch": 0.7349953831948292, + "grad_norm": 0.20803244411945343, + "learning_rate": 1.4661039303613824e-05, + "loss": 0.0092, + "step": 2786 + }, + { + "epoch": 0.7355230180714946, + "grad_norm": 0.20721256732940674, + "learning_rate": 1.4671590609337907e-05, + "loss": 0.0081, + "step": 2788 + }, + { + "epoch": 0.7360506529481599, + "grad_norm": 0.17056962847709656, + "learning_rate": 1.4682141915061989e-05, + "loss": 0.0145, + "step": 2790 + }, + { + "epoch": 0.7365782878248253, + "grad_norm": 0.4572083353996277, + "learning_rate": 1.4692693220786073e-05, + "loss": 0.0082, + "step": 2792 + }, + { + "epoch": 0.7371059227014906, + "grad_norm": 0.07527345418930054, + "learning_rate": 1.4703244526510156e-05, + "loss": 0.0103, + "step": 2794 + }, + { + "epoch": 0.737633557578156, + "grad_norm": 0.10900134593248367, + "learning_rate": 1.471379583223424e-05, + "loss": 0.0043, + "step": 2796 + }, + { + "epoch": 0.7381611924548213, + "grad_norm": 0.40556085109710693, + "learning_rate": 1.4724347137958324e-05, + "loss": 0.0062, + "step": 2798 + }, + { + "epoch": 0.7386888273314867, + "grad_norm": 0.13155905902385712, + "learning_rate": 1.4734898443682407e-05, + "loss": 0.003, + "step": 2800 + }, + { + "epoch": 0.739216462208152, + "grad_norm": 0.02027856931090355, + "learning_rate": 1.4745449749406491e-05, + "loss": 0.0024, + "step": 2802 + }, + { + "epoch": 0.7397440970848173, + "grad_norm": 0.5315604209899902, + "learning_rate": 1.4756001055130573e-05, + "loss": 0.003, + "step": 2804 + }, + { + "epoch": 0.7402717319614827, + "grad_norm": 0.13666830956935883, + "learning_rate": 1.4766552360854657e-05, + "loss": 0.0125, + "step": 2806 + }, + { + "epoch": 0.740799366838148, + "grad_norm": 0.10963459312915802, + "learning_rate": 1.477710366657874e-05, + "loss": 0.0089, + "step": 2808 + }, + { + "epoch": 0.7413270017148134, + "grad_norm": 0.4175257086753845, + "learning_rate": 1.4787654972302823e-05, + "loss": 0.022, + "step": 2810 + }, + { + "epoch": 0.7418546365914787, + "grad_norm": 0.08779008686542511, + "learning_rate": 1.4798206278026908e-05, + "loss": 0.0048, + "step": 2812 + }, + { + "epoch": 0.7423822714681441, + "grad_norm": 0.41957053542137146, + "learning_rate": 1.480875758375099e-05, + "loss": 0.003, + "step": 2814 + }, + { + "epoch": 0.7429099063448094, + "grad_norm": 0.05065023526549339, + "learning_rate": 1.4819308889475075e-05, + "loss": 0.0033, + "step": 2816 + }, + { + "epoch": 0.7434375412214748, + "grad_norm": 0.08802685141563416, + "learning_rate": 1.4829860195199157e-05, + "loss": 0.004, + "step": 2818 + }, + { + "epoch": 0.7439651760981401, + "grad_norm": 0.42396989464759827, + "learning_rate": 1.484041150092324e-05, + "loss": 0.0213, + "step": 2820 + }, + { + "epoch": 0.7444928109748055, + "grad_norm": 0.7796805500984192, + "learning_rate": 1.4850962806647324e-05, + "loss": 0.0061, + "step": 2822 + }, + { + "epoch": 0.7450204458514708, + "grad_norm": 0.0408904030919075, + "learning_rate": 1.4861514112371407e-05, + "loss": 0.0024, + "step": 2824 + }, + { + "epoch": 0.7455480807281362, + "grad_norm": 0.4320988357067108, + "learning_rate": 1.4872065418095492e-05, + "loss": 0.01, + "step": 2826 + }, + { + "epoch": 0.7460757156048015, + "grad_norm": 0.05633687227964401, + "learning_rate": 1.4882616723819573e-05, + "loss": 0.0089, + "step": 2828 + }, + { + "epoch": 0.7466033504814669, + "grad_norm": 0.13559383153915405, + "learning_rate": 1.4893168029543656e-05, + "loss": 0.0113, + "step": 2830 + }, + { + "epoch": 0.7471309853581322, + "grad_norm": 0.46073251962661743, + "learning_rate": 1.490371933526774e-05, + "loss": 0.0048, + "step": 2832 + }, + { + "epoch": 0.7476586202347976, + "grad_norm": 0.43054983019828796, + "learning_rate": 1.4914270640991824e-05, + "loss": 0.0047, + "step": 2834 + }, + { + "epoch": 0.7481862551114629, + "grad_norm": 0.13776323199272156, + "learning_rate": 1.4924821946715908e-05, + "loss": 0.0026, + "step": 2836 + }, + { + "epoch": 0.7487138899881283, + "grad_norm": 0.09226847440004349, + "learning_rate": 1.4935373252439991e-05, + "loss": 0.0027, + "step": 2838 + }, + { + "epoch": 0.7492415248647936, + "grad_norm": 0.5593143701553345, + "learning_rate": 1.4945924558164073e-05, + "loss": 0.0039, + "step": 2840 + }, + { + "epoch": 0.7497691597414589, + "grad_norm": 0.019928015768527985, + "learning_rate": 1.4956475863888157e-05, + "loss": 0.002, + "step": 2842 + }, + { + "epoch": 0.7502967946181243, + "grad_norm": 0.19999967515468597, + "learning_rate": 1.496702716961224e-05, + "loss": 0.0028, + "step": 2844 + }, + { + "epoch": 0.7508244294947896, + "grad_norm": 0.7811419367790222, + "learning_rate": 1.4977578475336325e-05, + "loss": 0.0107, + "step": 2846 + }, + { + "epoch": 0.751352064371455, + "grad_norm": 0.19400794804096222, + "learning_rate": 1.4988129781060408e-05, + "loss": 0.0033, + "step": 2848 + }, + { + "epoch": 0.7518796992481203, + "grad_norm": 0.05404343828558922, + "learning_rate": 1.4998681086784491e-05, + "loss": 0.005, + "step": 2850 + }, + { + "epoch": 0.7524073341247857, + "grad_norm": 0.05020793154835701, + "learning_rate": 1.5009232392508576e-05, + "loss": 0.0044, + "step": 2852 + }, + { + "epoch": 0.752934969001451, + "grad_norm": 0.816552996635437, + "learning_rate": 1.5019783698232657e-05, + "loss": 0.0062, + "step": 2854 + }, + { + "epoch": 0.7534626038781164, + "grad_norm": 0.2774405777454376, + "learning_rate": 1.5030335003956741e-05, + "loss": 0.006, + "step": 2856 + }, + { + "epoch": 0.7539902387547817, + "grad_norm": 0.19167901575565338, + "learning_rate": 1.5040886309680824e-05, + "loss": 0.0025, + "step": 2858 + }, + { + "epoch": 0.7545178736314471, + "grad_norm": 0.038384564220905304, + "learning_rate": 1.5051437615404907e-05, + "loss": 0.0058, + "step": 2860 + }, + { + "epoch": 0.7550455085081124, + "grad_norm": 0.16032138466835022, + "learning_rate": 1.5061988921128992e-05, + "loss": 0.0031, + "step": 2862 + }, + { + "epoch": 0.7555731433847778, + "grad_norm": 0.34740954637527466, + "learning_rate": 1.5072540226853073e-05, + "loss": 0.0225, + "step": 2864 + }, + { + "epoch": 0.7561007782614431, + "grad_norm": 0.12254014611244202, + "learning_rate": 1.5083091532577158e-05, + "loss": 0.0096, + "step": 2866 + }, + { + "epoch": 0.7566284131381085, + "grad_norm": 0.14817920327186584, + "learning_rate": 1.5093642838301241e-05, + "loss": 0.0155, + "step": 2868 + }, + { + "epoch": 0.7571560480147738, + "grad_norm": 0.6222885251045227, + "learning_rate": 1.5104194144025324e-05, + "loss": 0.0047, + "step": 2870 + }, + { + "epoch": 0.7576836828914392, + "grad_norm": 0.054396338760852814, + "learning_rate": 1.5114745449749408e-05, + "loss": 0.0018, + "step": 2872 + }, + { + "epoch": 0.7582113177681045, + "grad_norm": 1.01432466506958, + "learning_rate": 1.5125296755473491e-05, + "loss": 0.0046, + "step": 2874 + }, + { + "epoch": 0.7587389526447699, + "grad_norm": 0.7172766923904419, + "learning_rate": 1.5135848061197573e-05, + "loss": 0.0049, + "step": 2876 + }, + { + "epoch": 0.7592665875214352, + "grad_norm": 0.19136925041675568, + "learning_rate": 1.5146399366921657e-05, + "loss": 0.0075, + "step": 2878 + }, + { + "epoch": 0.7597942223981005, + "grad_norm": 0.18570387363433838, + "learning_rate": 1.515695067264574e-05, + "loss": 0.0078, + "step": 2880 + }, + { + "epoch": 0.7603218572747659, + "grad_norm": 0.20576144754886627, + "learning_rate": 1.5167501978369825e-05, + "loss": 0.006, + "step": 2882 + }, + { + "epoch": 0.7608494921514312, + "grad_norm": 0.4919429123401642, + "learning_rate": 1.5178053284093908e-05, + "loss": 0.0027, + "step": 2884 + }, + { + "epoch": 0.7613771270280966, + "grad_norm": 0.05406363680958748, + "learning_rate": 1.5188604589817991e-05, + "loss": 0.0121, + "step": 2886 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.10971064120531082, + "learning_rate": 1.5199155895542076e-05, + "loss": 0.0056, + "step": 2888 + }, + { + "epoch": 0.7624323967814273, + "grad_norm": 0.5997482538223267, + "learning_rate": 1.5209707201266157e-05, + "loss": 0.0135, + "step": 2890 + }, + { + "epoch": 0.7629600316580926, + "grad_norm": 0.02323010191321373, + "learning_rate": 1.5220258506990241e-05, + "loss": 0.0042, + "step": 2892 + }, + { + "epoch": 0.763487666534758, + "grad_norm": 0.3150457739830017, + "learning_rate": 1.5230809812714324e-05, + "loss": 0.0086, + "step": 2894 + }, + { + "epoch": 0.7640153014114233, + "grad_norm": 0.30018743872642517, + "learning_rate": 1.5241361118438407e-05, + "loss": 0.0031, + "step": 2896 + }, + { + "epoch": 0.7645429362880887, + "grad_norm": 0.04912113398313522, + "learning_rate": 1.5251912424162492e-05, + "loss": 0.0018, + "step": 2898 + }, + { + "epoch": 0.765070571164754, + "grad_norm": 0.29057180881500244, + "learning_rate": 1.5262463729886573e-05, + "loss": 0.012, + "step": 2900 + }, + { + "epoch": 0.7655982060414194, + "grad_norm": 0.39399492740631104, + "learning_rate": 1.527301503561066e-05, + "loss": 0.0038, + "step": 2902 + }, + { + "epoch": 0.7661258409180847, + "grad_norm": 0.37476950883865356, + "learning_rate": 1.5283566341334743e-05, + "loss": 0.0087, + "step": 2904 + }, + { + "epoch": 0.7666534757947501, + "grad_norm": 0.03973271697759628, + "learning_rate": 1.5294117647058822e-05, + "loss": 0.0021, + "step": 2906 + }, + { + "epoch": 0.7671811106714154, + "grad_norm": 0.027780963107943535, + "learning_rate": 1.530466895278291e-05, + "loss": 0.0144, + "step": 2908 + }, + { + "epoch": 0.7677087455480808, + "grad_norm": 0.050481297075748444, + "learning_rate": 1.531522025850699e-05, + "loss": 0.0054, + "step": 2910 + }, + { + "epoch": 0.7682363804247461, + "grad_norm": 0.14655888080596924, + "learning_rate": 1.5325771564231074e-05, + "loss": 0.0028, + "step": 2912 + }, + { + "epoch": 0.7687640153014115, + "grad_norm": 0.39922428131103516, + "learning_rate": 1.5336322869955157e-05, + "loss": 0.008, + "step": 2914 + }, + { + "epoch": 0.7692916501780768, + "grad_norm": 0.5646882057189941, + "learning_rate": 1.534687417567924e-05, + "loss": 0.0133, + "step": 2916 + }, + { + "epoch": 0.7698192850547421, + "grad_norm": 0.22014349699020386, + "learning_rate": 1.5357425481403327e-05, + "loss": 0.0046, + "step": 2918 + }, + { + "epoch": 0.7703469199314075, + "grad_norm": 0.035573314875364304, + "learning_rate": 1.5367976787127406e-05, + "loss": 0.0026, + "step": 2920 + }, + { + "epoch": 0.7708745548080728, + "grad_norm": 0.32239454984664917, + "learning_rate": 1.5378528092851493e-05, + "loss": 0.004, + "step": 2922 + }, + { + "epoch": 0.7714021896847382, + "grad_norm": 0.08451627939939499, + "learning_rate": 1.5389079398575576e-05, + "loss": 0.0023, + "step": 2924 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 0.42576420307159424, + "learning_rate": 1.539963070429966e-05, + "loss": 0.0036, + "step": 2926 + }, + { + "epoch": 0.7724574594380689, + "grad_norm": 0.2248750776052475, + "learning_rate": 1.541018201002374e-05, + "loss": 0.006, + "step": 2928 + }, + { + "epoch": 0.7729850943147342, + "grad_norm": 0.02868051640689373, + "learning_rate": 1.5420733315747824e-05, + "loss": 0.0105, + "step": 2930 + }, + { + "epoch": 0.7735127291913996, + "grad_norm": 0.3277808427810669, + "learning_rate": 1.543128462147191e-05, + "loss": 0.0094, + "step": 2932 + }, + { + "epoch": 0.7740403640680649, + "grad_norm": 0.3581709861755371, + "learning_rate": 1.544183592719599e-05, + "loss": 0.0045, + "step": 2934 + }, + { + "epoch": 0.7745679989447303, + "grad_norm": 0.05212489888072014, + "learning_rate": 1.5452387232920073e-05, + "loss": 0.0024, + "step": 2936 + }, + { + "epoch": 0.7750956338213956, + "grad_norm": 0.0723189264535904, + "learning_rate": 1.546293853864416e-05, + "loss": 0.0023, + "step": 2938 + }, + { + "epoch": 0.775623268698061, + "grad_norm": 0.4215906858444214, + "learning_rate": 1.5473489844368243e-05, + "loss": 0.0113, + "step": 2940 + }, + { + "epoch": 0.7761509035747263, + "grad_norm": 0.4603463113307953, + "learning_rate": 1.5484041150092326e-05, + "loss": 0.015, + "step": 2942 + }, + { + "epoch": 0.7766785384513917, + "grad_norm": 0.0940290242433548, + "learning_rate": 1.549459245581641e-05, + "loss": 0.0053, + "step": 2944 + }, + { + "epoch": 0.777206173328057, + "grad_norm": 0.23235604166984558, + "learning_rate": 1.550514376154049e-05, + "loss": 0.0042, + "step": 2946 + }, + { + "epoch": 0.7777338082047224, + "grad_norm": 0.15133434534072876, + "learning_rate": 1.5515695067264575e-05, + "loss": 0.0123, + "step": 2948 + }, + { + "epoch": 0.7782614430813877, + "grad_norm": 0.20531556010246277, + "learning_rate": 1.5526246372988657e-05, + "loss": 0.0075, + "step": 2950 + }, + { + "epoch": 0.7787890779580531, + "grad_norm": 0.5660015940666199, + "learning_rate": 1.5536797678712744e-05, + "loss": 0.0068, + "step": 2952 + }, + { + "epoch": 0.7793167128347184, + "grad_norm": 0.09381968528032303, + "learning_rate": 1.5547348984436827e-05, + "loss": 0.002, + "step": 2954 + }, + { + "epoch": 0.7798443477113837, + "grad_norm": 0.0408213846385479, + "learning_rate": 1.5557900290160906e-05, + "loss": 0.0079, + "step": 2956 + }, + { + "epoch": 0.7803719825880491, + "grad_norm": 0.07390326261520386, + "learning_rate": 1.5568451595884993e-05, + "loss": 0.0032, + "step": 2958 + }, + { + "epoch": 0.7808996174647144, + "grad_norm": 0.168019101023674, + "learning_rate": 1.5579002901609076e-05, + "loss": 0.0021, + "step": 2960 + }, + { + "epoch": 0.7814272523413798, + "grad_norm": 0.05616159737110138, + "learning_rate": 1.558955420733316e-05, + "loss": 0.0071, + "step": 2962 + }, + { + "epoch": 0.7819548872180451, + "grad_norm": 0.12861639261245728, + "learning_rate": 1.560010551305724e-05, + "loss": 0.0154, + "step": 2964 + }, + { + "epoch": 0.7824825220947105, + "grad_norm": 0.1286955177783966, + "learning_rate": 1.5610656818781325e-05, + "loss": 0.002, + "step": 2966 + }, + { + "epoch": 0.7830101569713758, + "grad_norm": 0.21297284960746765, + "learning_rate": 1.562120812450541e-05, + "loss": 0.0268, + "step": 2968 + }, + { + "epoch": 0.7835377918480412, + "grad_norm": 0.553989589214325, + "learning_rate": 1.563175943022949e-05, + "loss": 0.0148, + "step": 2970 + }, + { + "epoch": 0.7840654267247065, + "grad_norm": 0.11617293208837509, + "learning_rate": 1.5642310735953577e-05, + "loss": 0.0055, + "step": 2972 + }, + { + "epoch": 0.7845930616013719, + "grad_norm": 0.0947887971997261, + "learning_rate": 1.565286204167766e-05, + "loss": 0.0022, + "step": 2974 + }, + { + "epoch": 0.7851206964780372, + "grad_norm": 0.08728698641061783, + "learning_rate": 1.5663413347401743e-05, + "loss": 0.0067, + "step": 2976 + }, + { + "epoch": 0.7856483313547026, + "grad_norm": 4.816250801086426, + "learning_rate": 1.5673964653125826e-05, + "loss": 0.0091, + "step": 2978 + }, + { + "epoch": 0.7861759662313679, + "grad_norm": 0.14094673097133636, + "learning_rate": 1.568451595884991e-05, + "loss": 0.0017, + "step": 2980 + }, + { + "epoch": 0.7867036011080333, + "grad_norm": 0.3420872092247009, + "learning_rate": 1.569506726457399e-05, + "loss": 0.003, + "step": 2982 + }, + { + "epoch": 0.7872312359846986, + "grad_norm": 0.7650060057640076, + "learning_rate": 1.5705618570298075e-05, + "loss": 0.0117, + "step": 2984 + }, + { + "epoch": 0.787758870861364, + "grad_norm": 1.0275717973709106, + "learning_rate": 1.5716169876022158e-05, + "loss": 0.0216, + "step": 2986 + }, + { + "epoch": 0.7882865057380293, + "grad_norm": 0.14076285064220428, + "learning_rate": 1.5726721181746244e-05, + "loss": 0.0023, + "step": 2988 + }, + { + "epoch": 0.7888141406146947, + "grad_norm": 0.1932496875524521, + "learning_rate": 1.5737272487470327e-05, + "loss": 0.0101, + "step": 2990 + }, + { + "epoch": 0.78934177549136, + "grad_norm": 0.23963822424411774, + "learning_rate": 1.574782379319441e-05, + "loss": 0.0025, + "step": 2992 + }, + { + "epoch": 0.7898694103680254, + "grad_norm": 0.23144647479057312, + "learning_rate": 1.5758375098918493e-05, + "loss": 0.0093, + "step": 2994 + }, + { + "epoch": 0.7903970452446907, + "grad_norm": 0.3138801157474518, + "learning_rate": 1.5768926404642576e-05, + "loss": 0.0048, + "step": 2996 + }, + { + "epoch": 0.790924680121356, + "grad_norm": 0.35604971647262573, + "learning_rate": 1.577947771036666e-05, + "loss": 0.0072, + "step": 2998 + }, + { + "epoch": 0.7914523149980214, + "grad_norm": 0.09411001205444336, + "learning_rate": 1.579002901609074e-05, + "loss": 0.0178, + "step": 3000 + }, + { + "epoch": 0.7919799498746867, + "grad_norm": 0.07667393982410431, + "learning_rate": 1.5800580321814828e-05, + "loss": 0.0018, + "step": 3002 + }, + { + "epoch": 0.7925075847513521, + "grad_norm": 0.0416279211640358, + "learning_rate": 1.581113162753891e-05, + "loss": 0.0016, + "step": 3004 + }, + { + "epoch": 0.7930352196280174, + "grad_norm": 0.047279421240091324, + "learning_rate": 1.582168293326299e-05, + "loss": 0.0019, + "step": 3006 + }, + { + "epoch": 0.7935628545046828, + "grad_norm": 0.08425886183977127, + "learning_rate": 1.5832234238987077e-05, + "loss": 0.0017, + "step": 3008 + }, + { + "epoch": 0.7940904893813481, + "grad_norm": 0.6057291030883789, + "learning_rate": 1.584278554471116e-05, + "loss": 0.0146, + "step": 3010 + }, + { + "epoch": 0.7946181242580135, + "grad_norm": 0.6108447313308716, + "learning_rate": 1.5853336850435243e-05, + "loss": 0.0059, + "step": 3012 + }, + { + "epoch": 0.7951457591346788, + "grad_norm": 0.08885651081800461, + "learning_rate": 1.5863888156159326e-05, + "loss": 0.0094, + "step": 3014 + }, + { + "epoch": 0.7956733940113442, + "grad_norm": 0.27419647574424744, + "learning_rate": 1.587443946188341e-05, + "loss": 0.0059, + "step": 3016 + }, + { + "epoch": 0.7962010288880095, + "grad_norm": 0.32756346464157104, + "learning_rate": 1.588499076760749e-05, + "loss": 0.0104, + "step": 3018 + }, + { + "epoch": 0.7967286637646749, + "grad_norm": 0.05315312370657921, + "learning_rate": 1.5895542073331575e-05, + "loss": 0.0022, + "step": 3020 + }, + { + "epoch": 0.7972562986413402, + "grad_norm": 0.08410067856311798, + "learning_rate": 1.590609337905566e-05, + "loss": 0.0024, + "step": 3022 + }, + { + "epoch": 0.7977839335180056, + "grad_norm": 0.3047165274620056, + "learning_rate": 1.5916644684779744e-05, + "loss": 0.0031, + "step": 3024 + }, + { + "epoch": 0.7983115683946709, + "grad_norm": 0.02518041804432869, + "learning_rate": 1.5927195990503827e-05, + "loss": 0.0016, + "step": 3026 + }, + { + "epoch": 0.7988392032713363, + "grad_norm": 0.21065619587898254, + "learning_rate": 1.593774729622791e-05, + "loss": 0.0024, + "step": 3028 + }, + { + "epoch": 0.7993668381480016, + "grad_norm": 0.06084875389933586, + "learning_rate": 1.5948298601951993e-05, + "loss": 0.0046, + "step": 3030 + }, + { + "epoch": 0.799894473024667, + "grad_norm": 0.02586796134710312, + "learning_rate": 1.5958849907676076e-05, + "loss": 0.0024, + "step": 3032 + }, + { + "epoch": 0.8004221079013323, + "grad_norm": 0.023779932409524918, + "learning_rate": 1.596940121340016e-05, + "loss": 0.0048, + "step": 3034 + }, + { + "epoch": 0.8009497427779976, + "grad_norm": 0.0650366023182869, + "learning_rate": 1.597995251912424e-05, + "loss": 0.0017, + "step": 3036 + }, + { + "epoch": 0.801477377654663, + "grad_norm": 0.1412491798400879, + "learning_rate": 1.5990503824848328e-05, + "loss": 0.0145, + "step": 3038 + }, + { + "epoch": 0.8020050125313283, + "grad_norm": 0.1298144906759262, + "learning_rate": 1.600105513057241e-05, + "loss": 0.0139, + "step": 3040 + }, + { + "epoch": 0.8025326474079937, + "grad_norm": 0.07432260364294052, + "learning_rate": 1.6011606436296494e-05, + "loss": 0.0018, + "step": 3042 + }, + { + "epoch": 0.803060282284659, + "grad_norm": 0.14353355765342712, + "learning_rate": 1.6022157742020577e-05, + "loss": 0.0027, + "step": 3044 + }, + { + "epoch": 0.8035879171613244, + "grad_norm": 0.05570771545171738, + "learning_rate": 1.603270904774466e-05, + "loss": 0.0016, + "step": 3046 + }, + { + "epoch": 0.8041155520379897, + "grad_norm": 0.3205834627151489, + "learning_rate": 1.6043260353468743e-05, + "loss": 0.0049, + "step": 3048 + }, + { + "epoch": 0.8046431869146551, + "grad_norm": 0.019642507657408714, + "learning_rate": 1.6053811659192826e-05, + "loss": 0.0024, + "step": 3050 + }, + { + "epoch": 0.8051708217913204, + "grad_norm": 0.02542518824338913, + "learning_rate": 1.6064362964916912e-05, + "loss": 0.0015, + "step": 3052 + }, + { + "epoch": 0.8056984566679858, + "grad_norm": 0.24287626147270203, + "learning_rate": 1.607491427064099e-05, + "loss": 0.0131, + "step": 3054 + }, + { + "epoch": 0.8062260915446511, + "grad_norm": 0.04329773411154747, + "learning_rate": 1.6085465576365075e-05, + "loss": 0.0046, + "step": 3056 + }, + { + "epoch": 0.8067537264213165, + "grad_norm": 0.7549193501472473, + "learning_rate": 1.609601688208916e-05, + "loss": 0.0094, + "step": 3058 + }, + { + "epoch": 0.8072813612979818, + "grad_norm": 0.10666795819997787, + "learning_rate": 1.6106568187813244e-05, + "loss": 0.0094, + "step": 3060 + }, + { + "epoch": 0.8078089961746472, + "grad_norm": 0.025082509964704514, + "learning_rate": 1.6117119493537327e-05, + "loss": 0.003, + "step": 3062 + }, + { + "epoch": 0.8083366310513125, + "grad_norm": 0.23767520487308502, + "learning_rate": 1.612767079926141e-05, + "loss": 0.0027, + "step": 3064 + }, + { + "epoch": 0.8088642659279779, + "grad_norm": 0.1784301996231079, + "learning_rate": 1.6138222104985493e-05, + "loss": 0.0018, + "step": 3066 + }, + { + "epoch": 0.8093919008046432, + "grad_norm": 0.405828058719635, + "learning_rate": 1.6148773410709576e-05, + "loss": 0.0056, + "step": 3068 + }, + { + "epoch": 0.8099195356813086, + "grad_norm": 0.029604369774460793, + "learning_rate": 1.615932471643366e-05, + "loss": 0.0045, + "step": 3070 + }, + { + "epoch": 0.8104471705579739, + "grad_norm": 0.026707451790571213, + "learning_rate": 1.6169876022157745e-05, + "loss": 0.004, + "step": 3072 + }, + { + "epoch": 0.8109748054346392, + "grad_norm": 0.3798576295375824, + "learning_rate": 1.6180427327881828e-05, + "loss": 0.0112, + "step": 3074 + }, + { + "epoch": 0.8115024403113046, + "grad_norm": 0.0967569351196289, + "learning_rate": 1.619097863360591e-05, + "loss": 0.0058, + "step": 3076 + }, + { + "epoch": 0.8120300751879699, + "grad_norm": 0.016253920271992683, + "learning_rate": 1.6201529939329994e-05, + "loss": 0.0013, + "step": 3078 + }, + { + "epoch": 0.8125577100646353, + "grad_norm": 0.5715863108634949, + "learning_rate": 1.6212081245054077e-05, + "loss": 0.0113, + "step": 3080 + }, + { + "epoch": 0.8130853449413006, + "grad_norm": 0.0441461019217968, + "learning_rate": 1.622263255077816e-05, + "loss": 0.0165, + "step": 3082 + }, + { + "epoch": 0.813612979817966, + "grad_norm": 0.06163244694471359, + "learning_rate": 1.6233183856502243e-05, + "loss": 0.0027, + "step": 3084 + }, + { + "epoch": 0.8141406146946313, + "grad_norm": 0.08869308978319168, + "learning_rate": 1.6243735162226326e-05, + "loss": 0.0084, + "step": 3086 + }, + { + "epoch": 0.8146682495712967, + "grad_norm": 0.3035103976726532, + "learning_rate": 1.6254286467950412e-05, + "loss": 0.0124, + "step": 3088 + }, + { + "epoch": 0.815195884447962, + "grad_norm": 0.15321160852909088, + "learning_rate": 1.6264837773674492e-05, + "loss": 0.0036, + "step": 3090 + }, + { + "epoch": 0.8157235193246274, + "grad_norm": 0.31351539492607117, + "learning_rate": 1.6275389079398575e-05, + "loss": 0.013, + "step": 3092 + }, + { + "epoch": 0.8162511542012927, + "grad_norm": 0.050217241048812866, + "learning_rate": 1.628594038512266e-05, + "loss": 0.0041, + "step": 3094 + }, + { + "epoch": 0.8167787890779581, + "grad_norm": 0.2584534287452698, + "learning_rate": 1.6296491690846744e-05, + "loss": 0.0033, + "step": 3096 + }, + { + "epoch": 0.8173064239546234, + "grad_norm": 0.10340563952922821, + "learning_rate": 1.6307042996570827e-05, + "loss": 0.0127, + "step": 3098 + }, + { + "epoch": 0.8178340588312888, + "grad_norm": 0.03598645329475403, + "learning_rate": 1.631759430229491e-05, + "loss": 0.0019, + "step": 3100 + }, + { + "epoch": 0.8183616937079541, + "grad_norm": 0.12387575954198837, + "learning_rate": 1.6328145608018993e-05, + "loss": 0.0021, + "step": 3102 + }, + { + "epoch": 0.8188893285846195, + "grad_norm": 0.20104879140853882, + "learning_rate": 1.6338696913743076e-05, + "loss": 0.0093, + "step": 3104 + }, + { + "epoch": 0.8194169634612848, + "grad_norm": 0.10162994265556335, + "learning_rate": 1.634924821946716e-05, + "loss": 0.0037, + "step": 3106 + }, + { + "epoch": 0.8199445983379502, + "grad_norm": 0.11857999861240387, + "learning_rate": 1.6359799525191245e-05, + "loss": 0.0022, + "step": 3108 + }, + { + "epoch": 0.8204722332146155, + "grad_norm": 0.09013248980045319, + "learning_rate": 1.6370350830915328e-05, + "loss": 0.002, + "step": 3110 + }, + { + "epoch": 0.8209998680912808, + "grad_norm": 0.04606979340314865, + "learning_rate": 1.638090213663941e-05, + "loss": 0.0016, + "step": 3112 + }, + { + "epoch": 0.8215275029679462, + "grad_norm": 0.13465800881385803, + "learning_rate": 1.6391453442363494e-05, + "loss": 0.009, + "step": 3114 + }, + { + "epoch": 0.8220551378446115, + "grad_norm": 0.013811636716127396, + "learning_rate": 1.6402004748087577e-05, + "loss": 0.0046, + "step": 3116 + }, + { + "epoch": 0.8225827727212769, + "grad_norm": 0.24691824615001678, + "learning_rate": 1.641255605381166e-05, + "loss": 0.0046, + "step": 3118 + }, + { + "epoch": 0.8231104075979422, + "grad_norm": 0.10953506082296371, + "learning_rate": 1.6423107359535743e-05, + "loss": 0.0019, + "step": 3120 + }, + { + "epoch": 0.8236380424746076, + "grad_norm": 0.6315784454345703, + "learning_rate": 1.6433658665259826e-05, + "loss": 0.0111, + "step": 3122 + }, + { + "epoch": 0.8241656773512729, + "grad_norm": 0.05198473855853081, + "learning_rate": 1.6444209970983912e-05, + "loss": 0.0014, + "step": 3124 + }, + { + "epoch": 0.8246933122279383, + "grad_norm": 0.04873940348625183, + "learning_rate": 1.6454761276707992e-05, + "loss": 0.0016, + "step": 3126 + }, + { + "epoch": 0.8252209471046036, + "grad_norm": 0.11330199986696243, + "learning_rate": 1.6465312582432078e-05, + "loss": 0.0067, + "step": 3128 + }, + { + "epoch": 0.825748581981269, + "grad_norm": 0.2503969669342041, + "learning_rate": 1.647586388815616e-05, + "loss": 0.0046, + "step": 3130 + }, + { + "epoch": 0.8262762168579343, + "grad_norm": 0.05639534443616867, + "learning_rate": 1.6486415193880244e-05, + "loss": 0.0015, + "step": 3132 + }, + { + "epoch": 0.8268038517345997, + "grad_norm": 0.8204934000968933, + "learning_rate": 1.6496966499604327e-05, + "loss": 0.0048, + "step": 3134 + }, + { + "epoch": 0.827331486611265, + "grad_norm": 0.02355354279279709, + "learning_rate": 1.650751780532841e-05, + "loss": 0.0065, + "step": 3136 + }, + { + "epoch": 0.8278591214879304, + "grad_norm": 0.012933250516653061, + "learning_rate": 1.6518069111052496e-05, + "loss": 0.0013, + "step": 3138 + }, + { + "epoch": 0.8283867563645957, + "grad_norm": 0.3325115740299225, + "learning_rate": 1.6528620416776576e-05, + "loss": 0.009, + "step": 3140 + }, + { + "epoch": 0.8289143912412611, + "grad_norm": 0.30793407559394836, + "learning_rate": 1.653917172250066e-05, + "loss": 0.0067, + "step": 3142 + }, + { + "epoch": 0.8294420261179264, + "grad_norm": 0.04168511554598808, + "learning_rate": 1.6549723028224745e-05, + "loss": 0.0056, + "step": 3144 + }, + { + "epoch": 0.8299696609945918, + "grad_norm": 0.20189781486988068, + "learning_rate": 1.6560274333948828e-05, + "loss": 0.0037, + "step": 3146 + }, + { + "epoch": 0.8304972958712571, + "grad_norm": 0.9454149007797241, + "learning_rate": 1.657082563967291e-05, + "loss": 0.0153, + "step": 3148 + }, + { + "epoch": 0.8310249307479224, + "grad_norm": 0.2894841730594635, + "learning_rate": 1.6581376945396994e-05, + "loss": 0.003, + "step": 3150 + }, + { + "epoch": 0.8315525656245878, + "grad_norm": 0.3282797038555145, + "learning_rate": 1.6591928251121077e-05, + "loss": 0.0081, + "step": 3152 + }, + { + "epoch": 0.8320802005012531, + "grad_norm": 0.03916627913713455, + "learning_rate": 1.660247955684516e-05, + "loss": 0.0048, + "step": 3154 + }, + { + "epoch": 0.8326078353779185, + "grad_norm": 0.25089773535728455, + "learning_rate": 1.6613030862569243e-05, + "loss": 0.002, + "step": 3156 + }, + { + "epoch": 0.8331354702545838, + "grad_norm": 1.29254150390625, + "learning_rate": 1.662358216829333e-05, + "loss": 0.0047, + "step": 3158 + }, + { + "epoch": 0.8336631051312492, + "grad_norm": 0.04590693116188049, + "learning_rate": 1.6634133474017412e-05, + "loss": 0.0013, + "step": 3160 + }, + { + "epoch": 0.8341907400079145, + "grad_norm": 0.195514976978302, + "learning_rate": 1.6644684779741492e-05, + "loss": 0.0014, + "step": 3162 + }, + { + "epoch": 0.8347183748845799, + "grad_norm": 0.408733993768692, + "learning_rate": 1.6655236085465578e-05, + "loss": 0.0016, + "step": 3164 + }, + { + "epoch": 0.8352460097612452, + "grad_norm": 0.3357802927494049, + "learning_rate": 1.666578739118966e-05, + "loss": 0.0123, + "step": 3166 + }, + { + "epoch": 0.8357736446379106, + "grad_norm": 0.9664717316627502, + "learning_rate": 1.6676338696913744e-05, + "loss": 0.0039, + "step": 3168 + }, + { + "epoch": 0.8363012795145759, + "grad_norm": 0.5228005647659302, + "learning_rate": 1.6686890002637827e-05, + "loss": 0.0032, + "step": 3170 + }, + { + "epoch": 0.8368289143912413, + "grad_norm": 0.41807642579078674, + "learning_rate": 1.669744130836191e-05, + "loss": 0.0143, + "step": 3172 + }, + { + "epoch": 0.8373565492679066, + "grad_norm": 0.268572062253952, + "learning_rate": 1.6707992614085996e-05, + "loss": 0.0035, + "step": 3174 + }, + { + "epoch": 0.837884184144572, + "grad_norm": 0.03270929306745529, + "learning_rate": 1.6718543919810076e-05, + "loss": 0.0141, + "step": 3176 + }, + { + "epoch": 0.8384118190212373, + "grad_norm": 0.3899339735507965, + "learning_rate": 1.6729095225534162e-05, + "loss": 0.0037, + "step": 3178 + }, + { + "epoch": 0.8389394538979027, + "grad_norm": 0.34072381258010864, + "learning_rate": 1.6739646531258245e-05, + "loss": 0.0109, + "step": 3180 + }, + { + "epoch": 0.839467088774568, + "grad_norm": 0.024172091856598854, + "learning_rate": 1.6750197836982328e-05, + "loss": 0.0018, + "step": 3182 + }, + { + "epoch": 0.8399947236512334, + "grad_norm": 0.27254346013069153, + "learning_rate": 1.676074914270641e-05, + "loss": 0.0039, + "step": 3184 + }, + { + "epoch": 0.8405223585278987, + "grad_norm": 0.35917001962661743, + "learning_rate": 1.6771300448430494e-05, + "loss": 0.0142, + "step": 3186 + }, + { + "epoch": 0.841049993404564, + "grad_norm": 0.20339259505271912, + "learning_rate": 1.678185175415458e-05, + "loss": 0.0077, + "step": 3188 + }, + { + "epoch": 0.8415776282812294, + "grad_norm": 0.5988117456436157, + "learning_rate": 1.679240305987866e-05, + "loss": 0.0181, + "step": 3190 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.4188942611217499, + "learning_rate": 1.6802954365602743e-05, + "loss": 0.0098, + "step": 3192 + }, + { + "epoch": 0.8426328980345601, + "grad_norm": 0.21079596877098083, + "learning_rate": 1.681350567132683e-05, + "loss": 0.0024, + "step": 3194 + }, + { + "epoch": 0.8431605329112254, + "grad_norm": 0.6172802448272705, + "learning_rate": 1.6824056977050912e-05, + "loss": 0.0107, + "step": 3196 + }, + { + "epoch": 0.8436881677878908, + "grad_norm": 0.2282857447862625, + "learning_rate": 1.6834608282774995e-05, + "loss": 0.0065, + "step": 3198 + }, + { + "epoch": 0.8442158026645561, + "grad_norm": 0.039879992604255676, + "learning_rate": 1.6845159588499078e-05, + "loss": 0.0015, + "step": 3200 + }, + { + "epoch": 0.8447434375412215, + "grad_norm": 0.20140910148620605, + "learning_rate": 1.685571089422316e-05, + "loss": 0.0042, + "step": 3202 + }, + { + "epoch": 0.8452710724178868, + "grad_norm": 0.05336996167898178, + "learning_rate": 1.6866262199947244e-05, + "loss": 0.0084, + "step": 3204 + }, + { + "epoch": 0.8457987072945522, + "grad_norm": 0.03971035033464432, + "learning_rate": 1.6876813505671327e-05, + "loss": 0.0092, + "step": 3206 + }, + { + "epoch": 0.8463263421712175, + "grad_norm": 0.4800412952899933, + "learning_rate": 1.6887364811395413e-05, + "loss": 0.0091, + "step": 3208 + }, + { + "epoch": 0.8468539770478829, + "grad_norm": 0.10540560632944107, + "learning_rate": 1.6897916117119496e-05, + "loss": 0.0063, + "step": 3210 + }, + { + "epoch": 0.8473816119245482, + "grad_norm": 0.051558155566453934, + "learning_rate": 1.6908467422843576e-05, + "loss": 0.0015, + "step": 3212 + }, + { + "epoch": 0.8479092468012136, + "grad_norm": 0.07298706471920013, + "learning_rate": 1.6919018728567662e-05, + "loss": 0.0048, + "step": 3214 + }, + { + "epoch": 0.8484368816778789, + "grad_norm": 0.1622311770915985, + "learning_rate": 1.6929570034291745e-05, + "loss": 0.0037, + "step": 3216 + }, + { + "epoch": 0.8489645165545443, + "grad_norm": 0.05683651939034462, + "learning_rate": 1.6940121340015828e-05, + "loss": 0.0047, + "step": 3218 + }, + { + "epoch": 0.8494921514312096, + "grad_norm": 0.183262437582016, + "learning_rate": 1.695067264573991e-05, + "loss": 0.006, + "step": 3220 + }, + { + "epoch": 0.850019786307875, + "grad_norm": 0.15619830787181854, + "learning_rate": 1.6961223951463994e-05, + "loss": 0.0096, + "step": 3222 + }, + { + "epoch": 0.8505474211845403, + "grad_norm": 0.06146354600787163, + "learning_rate": 1.697177525718808e-05, + "loss": 0.0017, + "step": 3224 + }, + { + "epoch": 0.8510750560612056, + "grad_norm": 0.42020583152770996, + "learning_rate": 1.698232656291216e-05, + "loss": 0.0048, + "step": 3226 + }, + { + "epoch": 0.851602690937871, + "grad_norm": 0.1254711151123047, + "learning_rate": 1.6992877868636246e-05, + "loss": 0.0025, + "step": 3228 + }, + { + "epoch": 0.8521303258145363, + "grad_norm": 0.055820174515247345, + "learning_rate": 1.700342917436033e-05, + "loss": 0.0054, + "step": 3230 + }, + { + "epoch": 0.8526579606912017, + "grad_norm": 0.05323915183544159, + "learning_rate": 1.7013980480084412e-05, + "loss": 0.0015, + "step": 3232 + }, + { + "epoch": 0.853185595567867, + "grad_norm": 0.01721760630607605, + "learning_rate": 1.7024531785808495e-05, + "loss": 0.0012, + "step": 3234 + }, + { + "epoch": 0.8537132304445324, + "grad_norm": 0.26254191994667053, + "learning_rate": 1.7035083091532578e-05, + "loss": 0.0129, + "step": 3236 + }, + { + "epoch": 0.8542408653211977, + "grad_norm": 0.2080577164888382, + "learning_rate": 1.7045634397256665e-05, + "loss": 0.0021, + "step": 3238 + }, + { + "epoch": 0.8547685001978631, + "grad_norm": 0.03274788707494736, + "learning_rate": 1.7056185702980744e-05, + "loss": 0.0018, + "step": 3240 + }, + { + "epoch": 0.8552961350745284, + "grad_norm": 0.016651827841997147, + "learning_rate": 1.7066737008704827e-05, + "loss": 0.0011, + "step": 3242 + }, + { + "epoch": 0.8558237699511938, + "grad_norm": 0.08459331840276718, + "learning_rate": 1.7077288314428913e-05, + "loss": 0.0104, + "step": 3244 + }, + { + "epoch": 0.8563514048278591, + "grad_norm": 0.13235540688037872, + "learning_rate": 1.7087839620152996e-05, + "loss": 0.0072, + "step": 3246 + }, + { + "epoch": 0.8568790397045245, + "grad_norm": 0.022438060492277145, + "learning_rate": 1.709839092587708e-05, + "loss": 0.0066, + "step": 3248 + }, + { + "epoch": 0.8574066745811898, + "grad_norm": 0.47938621044158936, + "learning_rate": 1.7108942231601162e-05, + "loss": 0.0044, + "step": 3250 + }, + { + "epoch": 0.8579343094578552, + "grad_norm": 0.7874009013175964, + "learning_rate": 1.7119493537325245e-05, + "loss": 0.0122, + "step": 3252 + }, + { + "epoch": 0.8584619443345205, + "grad_norm": 0.10983125865459442, + "learning_rate": 1.7130044843049328e-05, + "loss": 0.002, + "step": 3254 + }, + { + "epoch": 0.8589895792111859, + "grad_norm": 0.9786165356636047, + "learning_rate": 1.714059614877341e-05, + "loss": 0.0051, + "step": 3256 + }, + { + "epoch": 0.8595172140878512, + "grad_norm": 0.02250172197818756, + "learning_rate": 1.7151147454497498e-05, + "loss": 0.0019, + "step": 3258 + }, + { + "epoch": 0.8600448489645166, + "grad_norm": 0.22797684371471405, + "learning_rate": 1.716169876022158e-05, + "loss": 0.0083, + "step": 3260 + }, + { + "epoch": 0.8605724838411819, + "grad_norm": 0.20547150075435638, + "learning_rate": 1.717225006594566e-05, + "loss": 0.007, + "step": 3262 + }, + { + "epoch": 0.8611001187178472, + "grad_norm": 0.05126518756151199, + "learning_rate": 1.7182801371669746e-05, + "loss": 0.0063, + "step": 3264 + }, + { + "epoch": 0.8616277535945126, + "grad_norm": 0.7907236814498901, + "learning_rate": 1.719335267739383e-05, + "loss": 0.0033, + "step": 3266 + }, + { + "epoch": 0.8621553884711779, + "grad_norm": 0.31476640701293945, + "learning_rate": 1.7203903983117912e-05, + "loss": 0.0097, + "step": 3268 + }, + { + "epoch": 0.8626830233478433, + "grad_norm": 0.051490116864442825, + "learning_rate": 1.7214455288841995e-05, + "loss": 0.0047, + "step": 3270 + }, + { + "epoch": 0.8632106582245086, + "grad_norm": 0.07529754191637039, + "learning_rate": 1.7225006594566078e-05, + "loss": 0.0013, + "step": 3272 + }, + { + "epoch": 0.863738293101174, + "grad_norm": 0.48510798811912537, + "learning_rate": 1.7235557900290165e-05, + "loss": 0.0024, + "step": 3274 + }, + { + "epoch": 0.8642659279778393, + "grad_norm": 0.11956419050693512, + "learning_rate": 1.7246109206014244e-05, + "loss": 0.0015, + "step": 3276 + }, + { + "epoch": 0.8647935628545047, + "grad_norm": 0.05031069740653038, + "learning_rate": 1.725666051173833e-05, + "loss": 0.0015, + "step": 3278 + }, + { + "epoch": 0.86532119773117, + "grad_norm": 0.5481807589530945, + "learning_rate": 1.7267211817462414e-05, + "loss": 0.0106, + "step": 3280 + }, + { + "epoch": 0.8658488326078354, + "grad_norm": 0.058540552854537964, + "learning_rate": 1.7277763123186496e-05, + "loss": 0.002, + "step": 3282 + }, + { + "epoch": 0.8663764674845007, + "grad_norm": 0.15154387056827545, + "learning_rate": 1.728831442891058e-05, + "loss": 0.0021, + "step": 3284 + }, + { + "epoch": 0.8669041023611661, + "grad_norm": 0.05600970983505249, + "learning_rate": 1.7298865734634662e-05, + "loss": 0.0125, + "step": 3286 + }, + { + "epoch": 0.8674317372378314, + "grad_norm": 0.6263447403907776, + "learning_rate": 1.7309417040358745e-05, + "loss": 0.0059, + "step": 3288 + }, + { + "epoch": 0.8679593721144968, + "grad_norm": 0.034129463136196136, + "learning_rate": 1.731996834608283e-05, + "loss": 0.0014, + "step": 3290 + }, + { + "epoch": 0.8684870069911621, + "grad_norm": 0.043068766593933105, + "learning_rate": 1.733051965180691e-05, + "loss": 0.0014, + "step": 3292 + }, + { + "epoch": 0.8690146418678275, + "grad_norm": 0.5312008261680603, + "learning_rate": 1.7341070957530998e-05, + "loss": 0.0031, + "step": 3294 + }, + { + "epoch": 0.8695422767444928, + "grad_norm": 0.18994882702827454, + "learning_rate": 1.735162226325508e-05, + "loss": 0.0194, + "step": 3296 + }, + { + "epoch": 0.8700699116211582, + "grad_norm": 0.06459280848503113, + "learning_rate": 1.736217356897916e-05, + "loss": 0.0018, + "step": 3298 + }, + { + "epoch": 0.8705975464978235, + "grad_norm": 0.02587042562663555, + "learning_rate": 1.7372724874703246e-05, + "loss": 0.0014, + "step": 3300 + }, + { + "epoch": 0.8711251813744888, + "grad_norm": 0.052058879286050797, + "learning_rate": 1.738327618042733e-05, + "loss": 0.0015, + "step": 3302 + }, + { + "epoch": 0.8716528162511542, + "grad_norm": 0.3902004659175873, + "learning_rate": 1.7393827486151412e-05, + "loss": 0.0096, + "step": 3304 + }, + { + "epoch": 0.8721804511278195, + "grad_norm": 0.1637732982635498, + "learning_rate": 1.7404378791875495e-05, + "loss": 0.0053, + "step": 3306 + }, + { + "epoch": 0.8727080860044849, + "grad_norm": 0.032368551939725876, + "learning_rate": 1.741493009759958e-05, + "loss": 0.0032, + "step": 3308 + }, + { + "epoch": 0.8732357208811502, + "grad_norm": 0.37035995721817017, + "learning_rate": 1.7425481403323665e-05, + "loss": 0.011, + "step": 3310 + }, + { + "epoch": 0.8737633557578156, + "grad_norm": 0.21920596063137054, + "learning_rate": 1.7436032709047744e-05, + "loss": 0.0061, + "step": 3312 + }, + { + "epoch": 0.8742909906344809, + "grad_norm": 0.7277883887290955, + "learning_rate": 1.744658401477183e-05, + "loss": 0.011, + "step": 3314 + }, + { + "epoch": 0.8748186255111463, + "grad_norm": 0.4284469485282898, + "learning_rate": 1.7457135320495914e-05, + "loss": 0.0172, + "step": 3316 + }, + { + "epoch": 0.8753462603878116, + "grad_norm": 0.08198241889476776, + "learning_rate": 1.7467686626219997e-05, + "loss": 0.0015, + "step": 3318 + }, + { + "epoch": 0.875873895264477, + "grad_norm": 0.046691134572029114, + "learning_rate": 1.747823793194408e-05, + "loss": 0.0015, + "step": 3320 + }, + { + "epoch": 0.8764015301411423, + "grad_norm": 0.07068780809640884, + "learning_rate": 1.7488789237668162e-05, + "loss": 0.0017, + "step": 3322 + }, + { + "epoch": 0.8769291650178077, + "grad_norm": 0.1979081630706787, + "learning_rate": 1.7499340543392245e-05, + "loss": 0.0033, + "step": 3324 + }, + { + "epoch": 0.877456799894473, + "grad_norm": 0.02895589917898178, + "learning_rate": 1.750989184911633e-05, + "loss": 0.0014, + "step": 3326 + }, + { + "epoch": 0.8779844347711384, + "grad_norm": 0.031131954863667488, + "learning_rate": 1.752044315484041e-05, + "loss": 0.0015, + "step": 3328 + }, + { + "epoch": 0.8785120696478037, + "grad_norm": 0.023628313094377518, + "learning_rate": 1.7530994460564498e-05, + "loss": 0.0021, + "step": 3330 + }, + { + "epoch": 0.8790397045244691, + "grad_norm": 0.01755370944738388, + "learning_rate": 1.754154576628858e-05, + "loss": 0.0015, + "step": 3332 + }, + { + "epoch": 0.8795673394011344, + "grad_norm": 0.0623902752995491, + "learning_rate": 1.7552097072012664e-05, + "loss": 0.0018, + "step": 3334 + }, + { + "epoch": 0.8800949742777998, + "grad_norm": 0.014745824038982391, + "learning_rate": 1.7562648377736747e-05, + "loss": 0.0011, + "step": 3336 + }, + { + "epoch": 0.8806226091544651, + "grad_norm": 0.6382738351821899, + "learning_rate": 1.757319968346083e-05, + "loss": 0.0154, + "step": 3338 + }, + { + "epoch": 0.8811502440311304, + "grad_norm": 0.15750491619110107, + "learning_rate": 1.7583750989184912e-05, + "loss": 0.0023, + "step": 3340 + }, + { + "epoch": 0.8816778789077958, + "grad_norm": 0.3142077326774597, + "learning_rate": 1.7594302294908995e-05, + "loss": 0.015, + "step": 3342 + }, + { + "epoch": 0.8822055137844611, + "grad_norm": 0.3397883176803589, + "learning_rate": 1.7604853600633082e-05, + "loss": 0.0035, + "step": 3344 + }, + { + "epoch": 0.8827331486611265, + "grad_norm": 0.5466795563697815, + "learning_rate": 1.7615404906357165e-05, + "loss": 0.0073, + "step": 3346 + }, + { + "epoch": 0.8832607835377918, + "grad_norm": 0.36123740673065186, + "learning_rate": 1.7625956212081244e-05, + "loss": 0.0097, + "step": 3348 + }, + { + "epoch": 0.8837884184144572, + "grad_norm": 0.5894849300384521, + "learning_rate": 1.763650751780533e-05, + "loss": 0.0111, + "step": 3350 + }, + { + "epoch": 0.8843160532911225, + "grad_norm": 0.15118646621704102, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.0069, + "step": 3352 + }, + { + "epoch": 0.8848436881677879, + "grad_norm": 0.16722875833511353, + "learning_rate": 1.7657610129253497e-05, + "loss": 0.0064, + "step": 3354 + }, + { + "epoch": 0.8853713230444532, + "grad_norm": 0.5628158450126648, + "learning_rate": 1.766816143497758e-05, + "loss": 0.007, + "step": 3356 + }, + { + "epoch": 0.8858989579211186, + "grad_norm": 0.09750360995531082, + "learning_rate": 1.7678712740701662e-05, + "loss": 0.0021, + "step": 3358 + }, + { + "epoch": 0.8864265927977839, + "grad_norm": 0.2477106899023056, + "learning_rate": 1.7689264046425745e-05, + "loss": 0.0045, + "step": 3360 + }, + { + "epoch": 0.8869542276744493, + "grad_norm": 0.19703027606010437, + "learning_rate": 1.769981535214983e-05, + "loss": 0.0073, + "step": 3362 + }, + { + "epoch": 0.8874818625511146, + "grad_norm": 0.22415338456630707, + "learning_rate": 1.7710366657873915e-05, + "loss": 0.0049, + "step": 3364 + }, + { + "epoch": 0.88800949742778, + "grad_norm": 0.1845943182706833, + "learning_rate": 1.7720917963597998e-05, + "loss": 0.0083, + "step": 3366 + }, + { + "epoch": 0.8885371323044453, + "grad_norm": 0.6948882341384888, + "learning_rate": 1.773146926932208e-05, + "loss": 0.0157, + "step": 3368 + }, + { + "epoch": 0.8890647671811107, + "grad_norm": 0.1758277267217636, + "learning_rate": 1.7742020575046164e-05, + "loss": 0.0092, + "step": 3370 + }, + { + "epoch": 0.889592402057776, + "grad_norm": 0.08961277455091476, + "learning_rate": 1.7752571880770247e-05, + "loss": 0.0017, + "step": 3372 + }, + { + "epoch": 0.8901200369344414, + "grad_norm": 0.1268688440322876, + "learning_rate": 1.776312318649433e-05, + "loss": 0.0029, + "step": 3374 + }, + { + "epoch": 0.8906476718111067, + "grad_norm": 0.08441256731748581, + "learning_rate": 1.7773674492218413e-05, + "loss": 0.0031, + "step": 3376 + }, + { + "epoch": 0.891175306687772, + "grad_norm": 0.10144013911485672, + "learning_rate": 1.7784225797942495e-05, + "loss": 0.0091, + "step": 3378 + }, + { + "epoch": 0.8917029415644374, + "grad_norm": 0.04015864059329033, + "learning_rate": 1.7794777103666582e-05, + "loss": 0.0016, + "step": 3380 + }, + { + "epoch": 0.8922305764411027, + "grad_norm": 0.14570122957229614, + "learning_rate": 1.7805328409390665e-05, + "loss": 0.0099, + "step": 3382 + }, + { + "epoch": 0.8927582113177681, + "grad_norm": 0.020515190437436104, + "learning_rate": 1.7815879715114748e-05, + "loss": 0.0033, + "step": 3384 + }, + { + "epoch": 0.8932858461944334, + "grad_norm": 0.14286717772483826, + "learning_rate": 1.782643102083883e-05, + "loss": 0.0112, + "step": 3386 + }, + { + "epoch": 0.8938134810710988, + "grad_norm": 0.12564054131507874, + "learning_rate": 1.7836982326562914e-05, + "loss": 0.0101, + "step": 3388 + }, + { + "epoch": 0.8943411159477641, + "grad_norm": 0.017604457214474678, + "learning_rate": 1.7847533632286997e-05, + "loss": 0.0022, + "step": 3390 + }, + { + "epoch": 0.8948687508244295, + "grad_norm": 0.6241884827613831, + "learning_rate": 1.785808493801108e-05, + "loss": 0.0127, + "step": 3392 + }, + { + "epoch": 0.8953963857010948, + "grad_norm": 0.02147512510418892, + "learning_rate": 1.7868636243735166e-05, + "loss": 0.0013, + "step": 3394 + }, + { + "epoch": 0.8959240205777602, + "grad_norm": 0.11909110844135284, + "learning_rate": 1.7879187549459245e-05, + "loss": 0.0019, + "step": 3396 + }, + { + "epoch": 0.8964516554544255, + "grad_norm": 0.10294556617736816, + "learning_rate": 1.788973885518333e-05, + "loss": 0.0021, + "step": 3398 + }, + { + "epoch": 0.8969792903310909, + "grad_norm": 0.17718160152435303, + "learning_rate": 1.7900290160907415e-05, + "loss": 0.0028, + "step": 3400 + }, + { + "epoch": 0.8975069252077562, + "grad_norm": 0.3011797070503235, + "learning_rate": 1.7910841466631498e-05, + "loss": 0.0065, + "step": 3402 + }, + { + "epoch": 0.8980345600844216, + "grad_norm": 0.040983814746141434, + "learning_rate": 1.792139277235558e-05, + "loss": 0.0012, + "step": 3404 + }, + { + "epoch": 0.8985621949610869, + "grad_norm": 0.26204192638397217, + "learning_rate": 1.7931944078079664e-05, + "loss": 0.0094, + "step": 3406 + }, + { + "epoch": 0.8990898298377523, + "grad_norm": 0.5210272669792175, + "learning_rate": 1.7942495383803747e-05, + "loss": 0.0081, + "step": 3408 + }, + { + "epoch": 0.8996174647144176, + "grad_norm": 0.028664328157901764, + "learning_rate": 1.795304668952783e-05, + "loss": 0.0012, + "step": 3410 + }, + { + "epoch": 0.900145099591083, + "grad_norm": 0.45829951763153076, + "learning_rate": 1.7963597995251913e-05, + "loss": 0.0155, + "step": 3412 + }, + { + "epoch": 0.9006727344677483, + "grad_norm": 0.19936859607696533, + "learning_rate": 1.7974149300976e-05, + "loss": 0.0106, + "step": 3414 + }, + { + "epoch": 0.9012003693444137, + "grad_norm": 0.09109888225793839, + "learning_rate": 1.7984700606700082e-05, + "loss": 0.0022, + "step": 3416 + }, + { + "epoch": 0.901728004221079, + "grad_norm": 0.4848134517669678, + "learning_rate": 1.7995251912424165e-05, + "loss": 0.0149, + "step": 3418 + }, + { + "epoch": 0.9022556390977443, + "grad_norm": 0.054335080087184906, + "learning_rate": 1.8005803218148248e-05, + "loss": 0.0037, + "step": 3420 + }, + { + "epoch": 0.9027832739744097, + "grad_norm": 0.07796861231327057, + "learning_rate": 1.801635452387233e-05, + "loss": 0.0031, + "step": 3422 + }, + { + "epoch": 0.903310908851075, + "grad_norm": 0.24112117290496826, + "learning_rate": 1.8026905829596414e-05, + "loss": 0.0025, + "step": 3424 + }, + { + "epoch": 0.9038385437277404, + "grad_norm": 0.08802921324968338, + "learning_rate": 1.8037457135320497e-05, + "loss": 0.0015, + "step": 3426 + }, + { + "epoch": 0.9043661786044057, + "grad_norm": 0.029533104971051216, + "learning_rate": 1.804800844104458e-05, + "loss": 0.0013, + "step": 3428 + }, + { + "epoch": 0.9048938134810711, + "grad_norm": 0.17745760083198547, + "learning_rate": 1.8058559746768666e-05, + "loss": 0.0099, + "step": 3430 + }, + { + "epoch": 0.9054214483577364, + "grad_norm": 0.03372675180435181, + "learning_rate": 1.8069111052492746e-05, + "loss": 0.0011, + "step": 3432 + }, + { + "epoch": 0.9059490832344018, + "grad_norm": 0.01744096353650093, + "learning_rate": 1.8079662358216832e-05, + "loss": 0.006, + "step": 3434 + }, + { + "epoch": 0.9064767181110671, + "grad_norm": 0.28556710481643677, + "learning_rate": 1.8090213663940915e-05, + "loss": 0.0098, + "step": 3436 + }, + { + "epoch": 0.9070043529877325, + "grad_norm": 0.048967018723487854, + "learning_rate": 1.8100764969664998e-05, + "loss": 0.0109, + "step": 3438 + }, + { + "epoch": 0.9075319878643978, + "grad_norm": 0.026655923575162888, + "learning_rate": 1.811131627538908e-05, + "loss": 0.001, + "step": 3440 + }, + { + "epoch": 0.9080596227410632, + "grad_norm": 0.03343930467963219, + "learning_rate": 1.8121867581113164e-05, + "loss": 0.01, + "step": 3442 + }, + { + "epoch": 0.9085872576177285, + "grad_norm": 0.28658193349838257, + "learning_rate": 1.813241888683725e-05, + "loss": 0.0026, + "step": 3444 + }, + { + "epoch": 0.9091148924943939, + "grad_norm": 0.07839014381170273, + "learning_rate": 1.814297019256133e-05, + "loss": 0.0109, + "step": 3446 + }, + { + "epoch": 0.9096425273710592, + "grad_norm": 0.17614541947841644, + "learning_rate": 1.8153521498285413e-05, + "loss": 0.0026, + "step": 3448 + }, + { + "epoch": 0.9101701622477246, + "grad_norm": 0.08472691476345062, + "learning_rate": 1.81640728040095e-05, + "loss": 0.0019, + "step": 3450 + }, + { + "epoch": 0.9106977971243899, + "grad_norm": 0.19740797579288483, + "learning_rate": 1.8174624109733582e-05, + "loss": 0.0105, + "step": 3452 + }, + { + "epoch": 0.9112254320010553, + "grad_norm": 0.01609707437455654, + "learning_rate": 1.8185175415457665e-05, + "loss": 0.0115, + "step": 3454 + }, + { + "epoch": 0.9117530668777206, + "grad_norm": 0.8820890784263611, + "learning_rate": 1.8195726721181748e-05, + "loss": 0.0261, + "step": 3456 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 0.3483427166938782, + "learning_rate": 1.820627802690583e-05, + "loss": 0.0031, + "step": 3458 + }, + { + "epoch": 0.9128083366310513, + "grad_norm": 0.10919449478387833, + "learning_rate": 1.8216829332629914e-05, + "loss": 0.0072, + "step": 3460 + }, + { + "epoch": 0.9133359715077166, + "grad_norm": 0.2052246481180191, + "learning_rate": 1.8227380638353997e-05, + "loss": 0.0024, + "step": 3462 + }, + { + "epoch": 0.913863606384382, + "grad_norm": 0.890872597694397, + "learning_rate": 1.8237931944078083e-05, + "loss": 0.0048, + "step": 3464 + }, + { + "epoch": 0.9143912412610473, + "grad_norm": 0.06243900582194328, + "learning_rate": 1.8248483249802166e-05, + "loss": 0.0017, + "step": 3466 + }, + { + "epoch": 0.9149188761377127, + "grad_norm": 0.4145548343658447, + "learning_rate": 1.8259034555526246e-05, + "loss": 0.0046, + "step": 3468 + }, + { + "epoch": 0.915446511014378, + "grad_norm": 5.250077724456787, + "learning_rate": 1.8269585861250332e-05, + "loss": 0.0037, + "step": 3470 + }, + { + "epoch": 0.9159741458910434, + "grad_norm": 0.3449462950229645, + "learning_rate": 1.8280137166974415e-05, + "loss": 0.0041, + "step": 3472 + }, + { + "epoch": 0.9165017807677087, + "grad_norm": 0.10878012329339981, + "learning_rate": 1.8290688472698498e-05, + "loss": 0.002, + "step": 3474 + }, + { + "epoch": 0.9170294156443741, + "grad_norm": 0.3727782070636749, + "learning_rate": 1.830123977842258e-05, + "loss": 0.0097, + "step": 3476 + }, + { + "epoch": 0.9175570505210394, + "grad_norm": 0.02584019862115383, + "learning_rate": 1.8311791084146664e-05, + "loss": 0.0014, + "step": 3478 + }, + { + "epoch": 0.9180846853977048, + "grad_norm": 0.4492707848548889, + "learning_rate": 1.832234238987075e-05, + "loss": 0.02, + "step": 3480 + }, + { + "epoch": 0.9186123202743701, + "grad_norm": 0.05649777129292488, + "learning_rate": 1.833289369559483e-05, + "loss": 0.0013, + "step": 3482 + }, + { + "epoch": 0.9191399551510355, + "grad_norm": 0.10951244086027145, + "learning_rate": 1.8343445001318916e-05, + "loss": 0.008, + "step": 3484 + }, + { + "epoch": 0.9196675900277008, + "grad_norm": 0.36643171310424805, + "learning_rate": 1.8353996307043e-05, + "loss": 0.0073, + "step": 3486 + }, + { + "epoch": 0.9201952249043662, + "grad_norm": 0.02769552357494831, + "learning_rate": 1.8364547612767082e-05, + "loss": 0.0046, + "step": 3488 + }, + { + "epoch": 0.9207228597810315, + "grad_norm": 0.03968926519155502, + "learning_rate": 1.8375098918491165e-05, + "loss": 0.0013, + "step": 3490 + }, + { + "epoch": 0.9212504946576969, + "grad_norm": 0.046263471245765686, + "learning_rate": 1.8385650224215248e-05, + "loss": 0.0023, + "step": 3492 + }, + { + "epoch": 0.9217781295343622, + "grad_norm": 0.20324988663196564, + "learning_rate": 1.839620152993933e-05, + "loss": 0.0166, + "step": 3494 + }, + { + "epoch": 0.9223057644110275, + "grad_norm": 0.0945301204919815, + "learning_rate": 1.8406752835663414e-05, + "loss": 0.0046, + "step": 3496 + }, + { + "epoch": 0.9228333992876929, + "grad_norm": 0.03280003368854523, + "learning_rate": 1.8417304141387497e-05, + "loss": 0.0011, + "step": 3498 + }, + { + "epoch": 0.9233610341643582, + "grad_norm": 0.4711993932723999, + "learning_rate": 1.8427855447111583e-05, + "loss": 0.0164, + "step": 3500 + }, + { + "epoch": 0.9238886690410236, + "grad_norm": 0.3989707827568054, + "learning_rate": 1.8438406752835666e-05, + "loss": 0.0081, + "step": 3502 + }, + { + "epoch": 0.9244163039176889, + "grad_norm": 0.08289652317762375, + "learning_rate": 1.8448958058559746e-05, + "loss": 0.0029, + "step": 3504 + }, + { + "epoch": 0.9249439387943543, + "grad_norm": 0.263421893119812, + "learning_rate": 1.8459509364283832e-05, + "loss": 0.0087, + "step": 3506 + }, + { + "epoch": 0.9254715736710196, + "grad_norm": 0.9962295293807983, + "learning_rate": 1.8470060670007915e-05, + "loss": 0.0027, + "step": 3508 + }, + { + "epoch": 0.925999208547685, + "grad_norm": 0.24460038542747498, + "learning_rate": 1.8480611975731998e-05, + "loss": 0.0041, + "step": 3510 + }, + { + "epoch": 0.9265268434243503, + "grad_norm": 0.038202982395887375, + "learning_rate": 1.849116328145608e-05, + "loss": 0.0067, + "step": 3512 + }, + { + "epoch": 0.9270544783010157, + "grad_norm": 0.06559828668832779, + "learning_rate": 1.8501714587180164e-05, + "loss": 0.0018, + "step": 3514 + }, + { + "epoch": 0.927582113177681, + "grad_norm": 0.03778517246246338, + "learning_rate": 1.851226589290425e-05, + "loss": 0.0017, + "step": 3516 + }, + { + "epoch": 0.9281097480543464, + "grad_norm": 0.6671909093856812, + "learning_rate": 1.852281719862833e-05, + "loss": 0.0168, + "step": 3518 + }, + { + "epoch": 0.9286373829310117, + "grad_norm": 0.08234617114067078, + "learning_rate": 1.8533368504352416e-05, + "loss": 0.0135, + "step": 3520 + }, + { + "epoch": 0.9291650178076771, + "grad_norm": 0.019701406359672546, + "learning_rate": 1.85439198100765e-05, + "loss": 0.0123, + "step": 3522 + }, + { + "epoch": 0.9296926526843424, + "grad_norm": 0.049765389412641525, + "learning_rate": 1.8554471115800582e-05, + "loss": 0.0066, + "step": 3524 + }, + { + "epoch": 0.9302202875610078, + "grad_norm": 0.2255292385816574, + "learning_rate": 1.8565022421524665e-05, + "loss": 0.015, + "step": 3526 + }, + { + "epoch": 0.9307479224376731, + "grad_norm": 0.07961148768663406, + "learning_rate": 1.8575573727248748e-05, + "loss": 0.002, + "step": 3528 + }, + { + "epoch": 0.9312755573143385, + "grad_norm": 0.594898521900177, + "learning_rate": 1.8586125032972834e-05, + "loss": 0.0064, + "step": 3530 + }, + { + "epoch": 0.9318031921910038, + "grad_norm": 0.3260881006717682, + "learning_rate": 1.8596676338696914e-05, + "loss": 0.0042, + "step": 3532 + }, + { + "epoch": 0.9323308270676691, + "grad_norm": 0.030940597876906395, + "learning_rate": 1.8607227644420997e-05, + "loss": 0.0054, + "step": 3534 + }, + { + "epoch": 0.9328584619443345, + "grad_norm": 0.08146202564239502, + "learning_rate": 1.8617778950145083e-05, + "loss": 0.0017, + "step": 3536 + }, + { + "epoch": 0.9333860968209998, + "grad_norm": 0.09209483116865158, + "learning_rate": 1.8628330255869166e-05, + "loss": 0.0035, + "step": 3538 + }, + { + "epoch": 0.9339137316976652, + "grad_norm": 0.06142236292362213, + "learning_rate": 1.863888156159325e-05, + "loss": 0.0022, + "step": 3540 + }, + { + "epoch": 0.9344413665743305, + "grad_norm": 0.06525867432355881, + "learning_rate": 1.8649432867317332e-05, + "loss": 0.0038, + "step": 3542 + }, + { + "epoch": 0.9349690014509959, + "grad_norm": 0.6628194451332092, + "learning_rate": 1.8659984173041415e-05, + "loss": 0.0082, + "step": 3544 + }, + { + "epoch": 0.9354966363276612, + "grad_norm": 0.07489562034606934, + "learning_rate": 1.8670535478765498e-05, + "loss": 0.0051, + "step": 3546 + }, + { + "epoch": 0.9360242712043266, + "grad_norm": 1.0894774198532104, + "learning_rate": 1.868108678448958e-05, + "loss": 0.0027, + "step": 3548 + }, + { + "epoch": 0.9365519060809919, + "grad_norm": 0.16387814283370972, + "learning_rate": 1.8691638090213667e-05, + "loss": 0.0014, + "step": 3550 + }, + { + "epoch": 0.9370795409576573, + "grad_norm": 0.16696599125862122, + "learning_rate": 1.870218939593775e-05, + "loss": 0.0016, + "step": 3552 + }, + { + "epoch": 0.9376071758343226, + "grad_norm": 0.04019898548722267, + "learning_rate": 1.871274070166183e-05, + "loss": 0.0011, + "step": 3554 + }, + { + "epoch": 0.938134810710988, + "grad_norm": 0.023985782638192177, + "learning_rate": 1.8723292007385916e-05, + "loss": 0.0009, + "step": 3556 + }, + { + "epoch": 0.9386624455876533, + "grad_norm": 0.022823380306363106, + "learning_rate": 1.873384331311e-05, + "loss": 0.0054, + "step": 3558 + }, + { + "epoch": 0.9391900804643187, + "grad_norm": 0.011874759569764137, + "learning_rate": 1.8744394618834082e-05, + "loss": 0.0011, + "step": 3560 + }, + { + "epoch": 0.939717715340984, + "grad_norm": 0.011695712804794312, + "learning_rate": 1.8754945924558165e-05, + "loss": 0.0032, + "step": 3562 + }, + { + "epoch": 0.9402453502176494, + "grad_norm": 1.0914076566696167, + "learning_rate": 1.8765497230282248e-05, + "loss": 0.0068, + "step": 3564 + }, + { + "epoch": 0.9407729850943147, + "grad_norm": 0.17373165488243103, + "learning_rate": 1.8776048536006334e-05, + "loss": 0.0148, + "step": 3566 + }, + { + "epoch": 0.9413006199709801, + "grad_norm": 0.013025229796767235, + "learning_rate": 1.8786599841730414e-05, + "loss": 0.0018, + "step": 3568 + }, + { + "epoch": 0.9418282548476454, + "grad_norm": 0.5242428183555603, + "learning_rate": 1.87971511474545e-05, + "loss": 0.0054, + "step": 3570 + }, + { + "epoch": 0.9423558897243107, + "grad_norm": 0.6339736580848694, + "learning_rate": 1.8807702453178583e-05, + "loss": 0.0027, + "step": 3572 + }, + { + "epoch": 0.9428835246009761, + "grad_norm": 0.3015701472759247, + "learning_rate": 1.8818253758902666e-05, + "loss": 0.0049, + "step": 3574 + }, + { + "epoch": 0.9434111594776414, + "grad_norm": 0.02226426638662815, + "learning_rate": 1.882880506462675e-05, + "loss": 0.0018, + "step": 3576 + }, + { + "epoch": 0.9439387943543068, + "grad_norm": 0.011592724360525608, + "learning_rate": 1.8839356370350832e-05, + "loss": 0.0043, + "step": 3578 + }, + { + "epoch": 0.9444664292309721, + "grad_norm": 0.11318743228912354, + "learning_rate": 1.8849907676074915e-05, + "loss": 0.0015, + "step": 3580 + }, + { + "epoch": 0.9449940641076375, + "grad_norm": 0.4035346806049347, + "learning_rate": 1.8860458981798998e-05, + "loss": 0.0017, + "step": 3582 + }, + { + "epoch": 0.9455216989843028, + "grad_norm": 0.13476403057575226, + "learning_rate": 1.887101028752308e-05, + "loss": 0.0016, + "step": 3584 + }, + { + "epoch": 0.9460493338609682, + "grad_norm": 0.3013429045677185, + "learning_rate": 1.8881561593247167e-05, + "loss": 0.0047, + "step": 3586 + }, + { + "epoch": 0.9465769687376335, + "grad_norm": 0.592116117477417, + "learning_rate": 1.889211289897125e-05, + "loss": 0.0093, + "step": 3588 + }, + { + "epoch": 0.9471046036142989, + "grad_norm": 0.6785051822662354, + "learning_rate": 1.8902664204695333e-05, + "loss": 0.0137, + "step": 3590 + }, + { + "epoch": 0.9476322384909642, + "grad_norm": 0.04777811840176582, + "learning_rate": 1.8913215510419416e-05, + "loss": 0.0017, + "step": 3592 + }, + { + "epoch": 0.9481598733676296, + "grad_norm": 0.023162763565778732, + "learning_rate": 1.89237668161435e-05, + "loss": 0.0009, + "step": 3594 + }, + { + "epoch": 0.9486875082442949, + "grad_norm": 0.05760585889220238, + "learning_rate": 1.8934318121867582e-05, + "loss": 0.001, + "step": 3596 + }, + { + "epoch": 0.9492151431209603, + "grad_norm": 0.7495129704475403, + "learning_rate": 1.8944869427591665e-05, + "loss": 0.0258, + "step": 3598 + }, + { + "epoch": 0.9497427779976256, + "grad_norm": 0.015208326280117035, + "learning_rate": 1.895542073331575e-05, + "loss": 0.0135, + "step": 3600 + }, + { + "epoch": 0.950270412874291, + "grad_norm": 0.4470962882041931, + "learning_rate": 1.8965972039039834e-05, + "loss": 0.009, + "step": 3602 + }, + { + "epoch": 0.9507980477509563, + "grad_norm": 0.5005911588668823, + "learning_rate": 1.8976523344763914e-05, + "loss": 0.0095, + "step": 3604 + }, + { + "epoch": 0.9513256826276217, + "grad_norm": 0.4783420264720917, + "learning_rate": 1.8987074650488e-05, + "loss": 0.0066, + "step": 3606 + }, + { + "epoch": 0.951853317504287, + "grad_norm": 0.014468241482973099, + "learning_rate": 1.8997625956212083e-05, + "loss": 0.0012, + "step": 3608 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.035120755434036255, + "learning_rate": 1.9008177261936166e-05, + "loss": 0.0014, + "step": 3610 + }, + { + "epoch": 0.9529085872576177, + "grad_norm": 0.2746121287345886, + "learning_rate": 1.901872856766025e-05, + "loss": 0.0143, + "step": 3612 + }, + { + "epoch": 0.953436222134283, + "grad_norm": 0.36493006348609924, + "learning_rate": 1.9029279873384332e-05, + "loss": 0.0099, + "step": 3614 + }, + { + "epoch": 0.9539638570109484, + "grad_norm": 0.1222774013876915, + "learning_rate": 1.9039831179108415e-05, + "loss": 0.0054, + "step": 3616 + }, + { + "epoch": 0.9544914918876137, + "grad_norm": 0.13346649706363678, + "learning_rate": 1.9050382484832498e-05, + "loss": 0.0025, + "step": 3618 + }, + { + "epoch": 0.9550191267642791, + "grad_norm": 0.10160134732723236, + "learning_rate": 1.9060933790556584e-05, + "loss": 0.0146, + "step": 3620 + }, + { + "epoch": 0.9555467616409444, + "grad_norm": 0.46191754937171936, + "learning_rate": 1.9071485096280667e-05, + "loss": 0.0132, + "step": 3622 + }, + { + "epoch": 0.9560743965176098, + "grad_norm": 0.11374475806951523, + "learning_rate": 1.908203640200475e-05, + "loss": 0.0024, + "step": 3624 + }, + { + "epoch": 0.9566020313942751, + "grad_norm": 0.533915102481842, + "learning_rate": 1.9092587707728833e-05, + "loss": 0.0102, + "step": 3626 + }, + { + "epoch": 0.9571296662709405, + "grad_norm": 0.14749102294445038, + "learning_rate": 1.9103139013452916e-05, + "loss": 0.0024, + "step": 3628 + }, + { + "epoch": 0.9576573011476058, + "grad_norm": 0.39464932680130005, + "learning_rate": 1.9113690319177e-05, + "loss": 0.0071, + "step": 3630 + }, + { + "epoch": 0.9581849360242712, + "grad_norm": 0.22604213654994965, + "learning_rate": 1.9124241624901082e-05, + "loss": 0.0068, + "step": 3632 + }, + { + "epoch": 0.9587125709009365, + "grad_norm": 0.0701683983206749, + "learning_rate": 1.9134792930625165e-05, + "loss": 0.0055, + "step": 3634 + }, + { + "epoch": 0.9592402057776019, + "grad_norm": 0.1592673361301422, + "learning_rate": 1.914534423634925e-05, + "loss": 0.0053, + "step": 3636 + }, + { + "epoch": 0.9597678406542672, + "grad_norm": 0.1602717936038971, + "learning_rate": 1.9155895542073334e-05, + "loss": 0.006, + "step": 3638 + }, + { + "epoch": 0.9602954755309326, + "grad_norm": 0.4302048981189728, + "learning_rate": 1.9166446847797417e-05, + "loss": 0.0153, + "step": 3640 + }, + { + "epoch": 0.9608231104075979, + "grad_norm": 0.3445238173007965, + "learning_rate": 1.91769981535215e-05, + "loss": 0.0072, + "step": 3642 + }, + { + "epoch": 0.9613507452842633, + "grad_norm": 0.024308159947395325, + "learning_rate": 1.9187549459245583e-05, + "loss": 0.001, + "step": 3644 + }, + { + "epoch": 0.9618783801609286, + "grad_norm": 0.021398138254880905, + "learning_rate": 1.9198100764969666e-05, + "loss": 0.0089, + "step": 3646 + }, + { + "epoch": 0.9624060150375939, + "grad_norm": 0.039106111973524094, + "learning_rate": 1.920865207069375e-05, + "loss": 0.0063, + "step": 3648 + }, + { + "epoch": 0.9629336499142593, + "grad_norm": 0.0387006551027298, + "learning_rate": 1.9219203376417836e-05, + "loss": 0.0135, + "step": 3650 + }, + { + "epoch": 0.9634612847909246, + "grad_norm": 0.20478267967700958, + "learning_rate": 1.9229754682141915e-05, + "loss": 0.0089, + "step": 3652 + }, + { + "epoch": 0.96398891966759, + "grad_norm": 0.5284172892570496, + "learning_rate": 1.9240305987865998e-05, + "loss": 0.0137, + "step": 3654 + }, + { + "epoch": 0.9645165545442553, + "grad_norm": 0.03765587881207466, + "learning_rate": 1.9250857293590084e-05, + "loss": 0.0029, + "step": 3656 + }, + { + "epoch": 0.9650441894209207, + "grad_norm": 0.6012107729911804, + "learning_rate": 1.9261408599314167e-05, + "loss": 0.0058, + "step": 3658 + }, + { + "epoch": 0.965571824297586, + "grad_norm": 0.06629011780023575, + "learning_rate": 1.927195990503825e-05, + "loss": 0.0036, + "step": 3660 + }, + { + "epoch": 0.9660994591742514, + "grad_norm": 0.2258056253194809, + "learning_rate": 1.9282511210762333e-05, + "loss": 0.0175, + "step": 3662 + }, + { + "epoch": 0.9666270940509167, + "grad_norm": 0.6153824329376221, + "learning_rate": 1.9293062516486416e-05, + "loss": 0.0074, + "step": 3664 + }, + { + "epoch": 0.9671547289275821, + "grad_norm": 0.4212040901184082, + "learning_rate": 1.93036138222105e-05, + "loss": 0.0042, + "step": 3666 + }, + { + "epoch": 0.9676823638042474, + "grad_norm": 0.2705156207084656, + "learning_rate": 1.9314165127934582e-05, + "loss": 0.0085, + "step": 3668 + }, + { + "epoch": 0.9682099986809128, + "grad_norm": 0.2982797622680664, + "learning_rate": 1.932471643365867e-05, + "loss": 0.0115, + "step": 3670 + }, + { + "epoch": 0.9687376335575781, + "grad_norm": 0.5717795491218567, + "learning_rate": 1.933526773938275e-05, + "loss": 0.0052, + "step": 3672 + }, + { + "epoch": 0.9692652684342435, + "grad_norm": 0.3362472653388977, + "learning_rate": 1.9345819045106834e-05, + "loss": 0.0057, + "step": 3674 + }, + { + "epoch": 0.9697929033109088, + "grad_norm": 0.3485587239265442, + "learning_rate": 1.9356370350830917e-05, + "loss": 0.009, + "step": 3676 + }, + { + "epoch": 0.9703205381875742, + "grad_norm": 0.19194796681404114, + "learning_rate": 1.9366921656555e-05, + "loss": 0.0027, + "step": 3678 + }, + { + "epoch": 0.9708481730642395, + "grad_norm": 0.3034185767173767, + "learning_rate": 1.9377472962279083e-05, + "loss": 0.0063, + "step": 3680 + }, + { + "epoch": 0.971375807940905, + "grad_norm": 0.09338180720806122, + "learning_rate": 1.9388024268003166e-05, + "loss": 0.0134, + "step": 3682 + }, + { + "epoch": 0.9719034428175702, + "grad_norm": 0.15050828456878662, + "learning_rate": 1.939857557372725e-05, + "loss": 0.0016, + "step": 3684 + }, + { + "epoch": 0.9724310776942355, + "grad_norm": 0.28643399477005005, + "learning_rate": 1.9409126879451336e-05, + "loss": 0.0035, + "step": 3686 + }, + { + "epoch": 0.9729587125709009, + "grad_norm": 0.04966983199119568, + "learning_rate": 1.9419678185175415e-05, + "loss": 0.0011, + "step": 3688 + }, + { + "epoch": 0.9734863474475662, + "grad_norm": 0.22900517284870148, + "learning_rate": 1.94302294908995e-05, + "loss": 0.0024, + "step": 3690 + }, + { + "epoch": 0.9740139823242316, + "grad_norm": 0.4220162034034729, + "learning_rate": 1.9440780796623584e-05, + "loss": 0.0035, + "step": 3692 + }, + { + "epoch": 0.9745416172008969, + "grad_norm": 0.5705257654190063, + "learning_rate": 1.9451332102347667e-05, + "loss": 0.0119, + "step": 3694 + }, + { + "epoch": 0.9750692520775623, + "grad_norm": 0.028218450024724007, + "learning_rate": 1.946188340807175e-05, + "loss": 0.0012, + "step": 3696 + }, + { + "epoch": 0.9755968869542276, + "grad_norm": 0.09797290712594986, + "learning_rate": 1.9472434713795833e-05, + "loss": 0.0012, + "step": 3698 + }, + { + "epoch": 0.976124521830893, + "grad_norm": 0.16098763048648834, + "learning_rate": 1.9482986019519916e-05, + "loss": 0.0038, + "step": 3700 + }, + { + "epoch": 0.9766521567075583, + "grad_norm": 0.7870072722434998, + "learning_rate": 1.9493537325244e-05, + "loss": 0.0144, + "step": 3702 + }, + { + "epoch": 0.9771797915842237, + "grad_norm": 0.7259330749511719, + "learning_rate": 1.9504088630968082e-05, + "loss": 0.0091, + "step": 3704 + }, + { + "epoch": 0.977707426460889, + "grad_norm": 0.014552721753716469, + "learning_rate": 1.951463993669217e-05, + "loss": 0.0009, + "step": 3706 + }, + { + "epoch": 0.9782350613375544, + "grad_norm": 0.03467230871319771, + "learning_rate": 1.952519124241625e-05, + "loss": 0.0011, + "step": 3708 + }, + { + "epoch": 0.9787626962142197, + "grad_norm": 0.0202826838940382, + "learning_rate": 1.9535742548140334e-05, + "loss": 0.0009, + "step": 3710 + }, + { + "epoch": 0.9792903310908851, + "grad_norm": 0.021304901689291, + "learning_rate": 1.9546293853864417e-05, + "loss": 0.0069, + "step": 3712 + }, + { + "epoch": 0.9798179659675504, + "grad_norm": 0.21580393612384796, + "learning_rate": 1.95568451595885e-05, + "loss": 0.0037, + "step": 3714 + }, + { + "epoch": 0.9803456008442158, + "grad_norm": 0.4028562307357788, + "learning_rate": 1.9567396465312583e-05, + "loss": 0.004, + "step": 3716 + }, + { + "epoch": 0.9808732357208811, + "grad_norm": 0.020414136350154877, + "learning_rate": 1.9577947771036666e-05, + "loss": 0.001, + "step": 3718 + }, + { + "epoch": 0.9814008705975465, + "grad_norm": 0.09334065765142441, + "learning_rate": 1.958849907676075e-05, + "loss": 0.002, + "step": 3720 + }, + { + "epoch": 0.9819285054742118, + "grad_norm": 0.35208913683891296, + "learning_rate": 1.9599050382484836e-05, + "loss": 0.0027, + "step": 3722 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 0.3167515695095062, + "learning_rate": 1.9609601688208915e-05, + "loss": 0.0059, + "step": 3724 + }, + { + "epoch": 0.9829837752275425, + "grad_norm": 0.6328586339950562, + "learning_rate": 1.9620152993933e-05, + "loss": 0.0103, + "step": 3726 + }, + { + "epoch": 0.9835114101042078, + "grad_norm": 0.11350728571414948, + "learning_rate": 1.9630704299657085e-05, + "loss": 0.0046, + "step": 3728 + }, + { + "epoch": 0.9840390449808732, + "grad_norm": 0.01881176233291626, + "learning_rate": 1.9641255605381167e-05, + "loss": 0.001, + "step": 3730 + }, + { + "epoch": 0.9845666798575385, + "grad_norm": 0.5679580569267273, + "learning_rate": 1.965180691110525e-05, + "loss": 0.0072, + "step": 3732 + }, + { + "epoch": 0.985094314734204, + "grad_norm": 0.3028451204299927, + "learning_rate": 1.9662358216829333e-05, + "loss": 0.0019, + "step": 3734 + }, + { + "epoch": 0.9856219496108692, + "grad_norm": 0.014588289894163609, + "learning_rate": 1.967290952255342e-05, + "loss": 0.0039, + "step": 3736 + }, + { + "epoch": 0.9861495844875346, + "grad_norm": 0.04131188243627548, + "learning_rate": 1.96834608282775e-05, + "loss": 0.0013, + "step": 3738 + }, + { + "epoch": 0.9866772193641999, + "grad_norm": 0.13171321153640747, + "learning_rate": 1.9694012134001582e-05, + "loss": 0.0014, + "step": 3740 + }, + { + "epoch": 0.9872048542408653, + "grad_norm": 0.15728172659873962, + "learning_rate": 1.970456343972567e-05, + "loss": 0.0051, + "step": 3742 + }, + { + "epoch": 0.9877324891175306, + "grad_norm": 0.031518202275037766, + "learning_rate": 1.971511474544975e-05, + "loss": 0.001, + "step": 3744 + }, + { + "epoch": 0.988260123994196, + "grad_norm": 0.18458528816699982, + "learning_rate": 1.9725666051173835e-05, + "loss": 0.009, + "step": 3746 + }, + { + "epoch": 0.9887877588708613, + "grad_norm": 0.015342510305345058, + "learning_rate": 1.9736217356897917e-05, + "loss": 0.0009, + "step": 3748 + }, + { + "epoch": 0.9893153937475268, + "grad_norm": 0.2849581241607666, + "learning_rate": 1.9746768662622e-05, + "loss": 0.0021, + "step": 3750 + }, + { + "epoch": 0.989843028624192, + "grad_norm": 0.10362743586301804, + "learning_rate": 1.9757319968346083e-05, + "loss": 0.0012, + "step": 3752 + }, + { + "epoch": 0.9903706635008575, + "grad_norm": 0.04435824230313301, + "learning_rate": 1.9767871274070166e-05, + "loss": 0.0034, + "step": 3754 + }, + { + "epoch": 0.9908982983775227, + "grad_norm": 0.3240700364112854, + "learning_rate": 1.9778422579794253e-05, + "loss": 0.002, + "step": 3756 + }, + { + "epoch": 0.9914259332541882, + "grad_norm": 0.4167688488960266, + "learning_rate": 1.9788973885518336e-05, + "loss": 0.0125, + "step": 3758 + }, + { + "epoch": 0.9919535681308534, + "grad_norm": 0.0730944499373436, + "learning_rate": 1.9799525191242415e-05, + "loss": 0.0124, + "step": 3760 + }, + { + "epoch": 0.9924812030075187, + "grad_norm": 0.038098182529211044, + "learning_rate": 1.98100764969665e-05, + "loss": 0.0009, + "step": 3762 + }, + { + "epoch": 0.9930088378841841, + "grad_norm": 0.03949872776865959, + "learning_rate": 1.9820627802690585e-05, + "loss": 0.002, + "step": 3764 + }, + { + "epoch": 0.9935364727608494, + "grad_norm": 0.4190538227558136, + "learning_rate": 1.9831179108414668e-05, + "loss": 0.0197, + "step": 3766 + }, + { + "epoch": 0.9940641076375148, + "grad_norm": 0.02329457364976406, + "learning_rate": 1.984173041413875e-05, + "loss": 0.002, + "step": 3768 + }, + { + "epoch": 0.9945917425141801, + "grad_norm": 0.17284929752349854, + "learning_rate": 1.9852281719862833e-05, + "loss": 0.0043, + "step": 3770 + }, + { + "epoch": 0.9951193773908455, + "grad_norm": 0.06589680910110474, + "learning_rate": 1.986283302558692e-05, + "loss": 0.001, + "step": 3772 + }, + { + "epoch": 0.9956470122675108, + "grad_norm": 0.5123501420021057, + "learning_rate": 1.9873384331311e-05, + "loss": 0.0106, + "step": 3774 + }, + { + "epoch": 0.9961746471441763, + "grad_norm": 0.03407685086131096, + "learning_rate": 1.9883935637035086e-05, + "loss": 0.0064, + "step": 3776 + }, + { + "epoch": 0.9967022820208415, + "grad_norm": 0.0825684443116188, + "learning_rate": 1.989448694275917e-05, + "loss": 0.0012, + "step": 3778 + }, + { + "epoch": 0.997229916897507, + "grad_norm": 0.3399464190006256, + "learning_rate": 1.990503824848325e-05, + "loss": 0.007, + "step": 3780 + }, + { + "epoch": 0.9977575517741722, + "grad_norm": 0.029990453273057938, + "learning_rate": 1.9915589554207335e-05, + "loss": 0.0033, + "step": 3782 + }, + { + "epoch": 0.9982851866508377, + "grad_norm": 1.1133619546890259, + "learning_rate": 1.9926140859931418e-05, + "loss": 0.0068, + "step": 3784 + }, + { + "epoch": 0.998812821527503, + "grad_norm": 0.017649371176958084, + "learning_rate": 1.9936692165655504e-05, + "loss": 0.0014, + "step": 3786 + }, + { + "epoch": 0.9993404564041684, + "grad_norm": 0.23266056180000305, + "learning_rate": 1.9947243471379583e-05, + "loss": 0.0072, + "step": 3788 + }, + { + "epoch": 0.9998680912808336, + "grad_norm": 0.07247746735811234, + "learning_rate": 1.9957794777103666e-05, + "loss": 0.0012, + "step": 3790 + }, + { + "epoch": 1.0002638174383327, + "grad_norm": 0.021650834009051323, + "learning_rate": 1.9968346082827753e-05, + "loss": 0.001, + "step": 3792 + }, + { + "epoch": 1.000791452314998, + "grad_norm": 0.04537678882479668, + "learning_rate": 1.9978897388551836e-05, + "loss": 0.0013, + "step": 3794 + }, + { + "epoch": 1.0013190871916633, + "grad_norm": 0.28567689657211304, + "learning_rate": 1.998944869427592e-05, + "loss": 0.007, + "step": 3796 + }, + { + "epoch": 1.0018467220683287, + "grad_norm": 0.6768865585327148, + "learning_rate": 2e-05, + "loss": 0.0104, + "step": 3798 + }, + { + "epoch": 1.002374356944994, + "grad_norm": 0.031064778566360474, + "learning_rate": 1.9998827632697328e-05, + "loss": 0.001, + "step": 3800 + }, + { + "epoch": 1.0029019918216595, + "grad_norm": 0.18485617637634277, + "learning_rate": 1.999765526539465e-05, + "loss": 0.0018, + "step": 3802 + }, + { + "epoch": 1.0034296266983247, + "grad_norm": 0.1632208675146103, + "learning_rate": 1.9996482898091973e-05, + "loss": 0.0044, + "step": 3804 + }, + { + "epoch": 1.00395726157499, + "grad_norm": 0.10618620365858078, + "learning_rate": 1.99953105307893e-05, + "loss": 0.0033, + "step": 3806 + }, + { + "epoch": 1.0044848964516555, + "grad_norm": 0.2557918429374695, + "learning_rate": 1.999413816348662e-05, + "loss": 0.0081, + "step": 3808 + }, + { + "epoch": 1.0050125313283207, + "grad_norm": 0.03563517704606056, + "learning_rate": 1.9992965796183944e-05, + "loss": 0.0084, + "step": 3810 + }, + { + "epoch": 1.005540166204986, + "grad_norm": 0.10475820302963257, + "learning_rate": 1.999179342888127e-05, + "loss": 0.0041, + "step": 3812 + }, + { + "epoch": 1.0060678010816515, + "grad_norm": 0.05446895584464073, + "learning_rate": 1.9990621061578596e-05, + "loss": 0.0054, + "step": 3814 + }, + { + "epoch": 1.006595435958317, + "grad_norm": 0.016544993966817856, + "learning_rate": 1.998944869427592e-05, + "loss": 0.0017, + "step": 3816 + }, + { + "epoch": 1.007123070834982, + "grad_norm": 0.2171528935432434, + "learning_rate": 1.998827632697324e-05, + "loss": 0.0029, + "step": 3818 + }, + { + "epoch": 1.0076507057116475, + "grad_norm": 0.35474029183387756, + "learning_rate": 1.9987103959670567e-05, + "loss": 0.011, + "step": 3820 + }, + { + "epoch": 1.008178340588313, + "grad_norm": 0.061367325484752655, + "learning_rate": 1.998593159236789e-05, + "loss": 0.0066, + "step": 3822 + }, + { + "epoch": 1.0087059754649783, + "grad_norm": 0.12692596018314362, + "learning_rate": 1.9984759225065216e-05, + "loss": 0.0059, + "step": 3824 + }, + { + "epoch": 1.0092336103416435, + "grad_norm": 0.16778327524662018, + "learning_rate": 1.998358685776254e-05, + "loss": 0.003, + "step": 3826 + }, + { + "epoch": 1.009761245218309, + "grad_norm": 0.373136430978775, + "learning_rate": 1.998241449045986e-05, + "loss": 0.0027, + "step": 3828 + }, + { + "epoch": 1.0102888800949743, + "grad_norm": 0.13660356402397156, + "learning_rate": 1.9981242123157187e-05, + "loss": 0.0077, + "step": 3830 + }, + { + "epoch": 1.0108165149716397, + "grad_norm": 0.40462595224380493, + "learning_rate": 1.998006975585451e-05, + "loss": 0.0045, + "step": 3832 + }, + { + "epoch": 1.011344149848305, + "grad_norm": 0.13271167874336243, + "learning_rate": 1.9978897388551836e-05, + "loss": 0.0133, + "step": 3834 + }, + { + "epoch": 1.0118717847249703, + "grad_norm": 0.4622926414012909, + "learning_rate": 1.997772502124916e-05, + "loss": 0.0079, + "step": 3836 + }, + { + "epoch": 1.0123994196016357, + "grad_norm": 0.16301193833351135, + "learning_rate": 1.9976552653946484e-05, + "loss": 0.0095, + "step": 3838 + }, + { + "epoch": 1.0129270544783011, + "grad_norm": 0.30279695987701416, + "learning_rate": 1.9975380286643807e-05, + "loss": 0.0043, + "step": 3840 + }, + { + "epoch": 1.0134546893549663, + "grad_norm": 0.1827424019575119, + "learning_rate": 1.997420791934113e-05, + "loss": 0.0022, + "step": 3842 + }, + { + "epoch": 1.0139823242316317, + "grad_norm": 0.42373672127723694, + "learning_rate": 1.9973035552038456e-05, + "loss": 0.005, + "step": 3844 + }, + { + "epoch": 1.014509959108297, + "grad_norm": 0.051390305161476135, + "learning_rate": 1.997186318473578e-05, + "loss": 0.0067, + "step": 3846 + }, + { + "epoch": 1.0150375939849625, + "grad_norm": 0.04981297627091408, + "learning_rate": 1.9970690817433104e-05, + "loss": 0.0013, + "step": 3848 + }, + { + "epoch": 1.0155652288616277, + "grad_norm": 0.026753809303045273, + "learning_rate": 1.9969518450130427e-05, + "loss": 0.0008, + "step": 3850 + }, + { + "epoch": 1.016092863738293, + "grad_norm": 0.3701239824295044, + "learning_rate": 1.9968346082827753e-05, + "loss": 0.0062, + "step": 3852 + }, + { + "epoch": 1.0166204986149585, + "grad_norm": 0.4049311876296997, + "learning_rate": 1.9967173715525075e-05, + "loss": 0.0019, + "step": 3854 + }, + { + "epoch": 1.0171481334916237, + "grad_norm": 0.11192572861909866, + "learning_rate": 1.9966001348222398e-05, + "loss": 0.007, + "step": 3856 + }, + { + "epoch": 1.017675768368289, + "grad_norm": 0.045226581394672394, + "learning_rate": 1.9964828980919724e-05, + "loss": 0.0015, + "step": 3858 + }, + { + "epoch": 1.0182034032449545, + "grad_norm": 0.06704352796077728, + "learning_rate": 1.996365661361705e-05, + "loss": 0.002, + "step": 3860 + }, + { + "epoch": 1.01873103812162, + "grad_norm": 0.010714714415371418, + "learning_rate": 1.996248424631437e-05, + "loss": 0.0011, + "step": 3862 + }, + { + "epoch": 1.019258672998285, + "grad_norm": 0.045244161039590836, + "learning_rate": 1.9961311879011695e-05, + "loss": 0.0013, + "step": 3864 + }, + { + "epoch": 1.0197863078749505, + "grad_norm": 0.015725260600447655, + "learning_rate": 1.996013951170902e-05, + "loss": 0.0007, + "step": 3866 + }, + { + "epoch": 1.020313942751616, + "grad_norm": 0.26336929202079773, + "learning_rate": 1.9958967144406344e-05, + "loss": 0.0092, + "step": 3868 + }, + { + "epoch": 1.0208415776282813, + "grad_norm": 0.01342934649437666, + "learning_rate": 1.9957794777103666e-05, + "loss": 0.0039, + "step": 3870 + }, + { + "epoch": 1.0213692125049465, + "grad_norm": 0.021566906943917274, + "learning_rate": 1.9956622409800992e-05, + "loss": 0.0034, + "step": 3872 + }, + { + "epoch": 1.021896847381612, + "grad_norm": 0.1488153338432312, + "learning_rate": 1.9955450042498315e-05, + "loss": 0.0011, + "step": 3874 + }, + { + "epoch": 1.0224244822582773, + "grad_norm": 0.056938640773296356, + "learning_rate": 1.995427767519564e-05, + "loss": 0.001, + "step": 3876 + }, + { + "epoch": 1.0229521171349427, + "grad_norm": 0.11323105543851852, + "learning_rate": 1.9953105307892964e-05, + "loss": 0.0013, + "step": 3878 + }, + { + "epoch": 1.023479752011608, + "grad_norm": 0.01680559664964676, + "learning_rate": 1.995193294059029e-05, + "loss": 0.0009, + "step": 3880 + }, + { + "epoch": 1.0240073868882733, + "grad_norm": 0.07252193242311478, + "learning_rate": 1.9950760573287612e-05, + "loss": 0.0037, + "step": 3882 + }, + { + "epoch": 1.0245350217649387, + "grad_norm": 0.05329541862010956, + "learning_rate": 1.9949588205984938e-05, + "loss": 0.0032, + "step": 3884 + }, + { + "epoch": 1.025062656641604, + "grad_norm": 1.0489230155944824, + "learning_rate": 1.994841583868226e-05, + "loss": 0.0017, + "step": 3886 + }, + { + "epoch": 1.0255902915182693, + "grad_norm": 0.04969344288110733, + "learning_rate": 1.9947243471379583e-05, + "loss": 0.0008, + "step": 3888 + }, + { + "epoch": 1.0261179263949347, + "grad_norm": 0.21497264504432678, + "learning_rate": 1.994607110407691e-05, + "loss": 0.0111, + "step": 3890 + }, + { + "epoch": 1.0266455612716001, + "grad_norm": 0.03788812458515167, + "learning_rate": 1.9944898736774232e-05, + "loss": 0.0079, + "step": 3892 + }, + { + "epoch": 1.0271731961482653, + "grad_norm": 0.01489559281617403, + "learning_rate": 1.9943726369471558e-05, + "loss": 0.0033, + "step": 3894 + }, + { + "epoch": 1.0277008310249307, + "grad_norm": 0.08693606406450272, + "learning_rate": 1.994255400216888e-05, + "loss": 0.0009, + "step": 3896 + }, + { + "epoch": 1.0282284659015961, + "grad_norm": 0.01082643587142229, + "learning_rate": 1.9941381634866207e-05, + "loss": 0.0016, + "step": 3898 + }, + { + "epoch": 1.0287561007782615, + "grad_norm": 0.1349562257528305, + "learning_rate": 1.994020926756353e-05, + "loss": 0.0065, + "step": 3900 + }, + { + "epoch": 1.0292837356549267, + "grad_norm": 0.37940558791160583, + "learning_rate": 1.9939036900260852e-05, + "loss": 0.0042, + "step": 3902 + }, + { + "epoch": 1.029811370531592, + "grad_norm": 0.3550264537334442, + "learning_rate": 1.9937864532958178e-05, + "loss": 0.0056, + "step": 3904 + }, + { + "epoch": 1.0303390054082575, + "grad_norm": 0.04579070210456848, + "learning_rate": 1.9936692165655504e-05, + "loss": 0.0011, + "step": 3906 + }, + { + "epoch": 1.030866640284923, + "grad_norm": 0.22546347975730896, + "learning_rate": 1.9935519798352826e-05, + "loss": 0.0037, + "step": 3908 + }, + { + "epoch": 1.031394275161588, + "grad_norm": 0.4377124607563019, + "learning_rate": 1.993434743105015e-05, + "loss": 0.0083, + "step": 3910 + }, + { + "epoch": 1.0319219100382535, + "grad_norm": 0.14292360842227936, + "learning_rate": 1.9933175063747475e-05, + "loss": 0.008, + "step": 3912 + }, + { + "epoch": 1.032449544914919, + "grad_norm": 0.2221398651599884, + "learning_rate": 1.9932002696444798e-05, + "loss": 0.0032, + "step": 3914 + }, + { + "epoch": 1.0329771797915843, + "grad_norm": 0.26594945788383484, + "learning_rate": 1.993083032914212e-05, + "loss": 0.0085, + "step": 3916 + }, + { + "epoch": 1.0335048146682495, + "grad_norm": 0.1715535968542099, + "learning_rate": 1.9929657961839446e-05, + "loss": 0.0027, + "step": 3918 + }, + { + "epoch": 1.034032449544915, + "grad_norm": 0.04784798622131348, + "learning_rate": 1.992848559453677e-05, + "loss": 0.0032, + "step": 3920 + }, + { + "epoch": 1.0345600844215803, + "grad_norm": 0.05379120633006096, + "learning_rate": 1.992731322723409e-05, + "loss": 0.0013, + "step": 3922 + }, + { + "epoch": 1.0350877192982457, + "grad_norm": 0.3587310314178467, + "learning_rate": 1.9926140859931418e-05, + "loss": 0.0172, + "step": 3924 + }, + { + "epoch": 1.035615354174911, + "grad_norm": 0.01790892705321312, + "learning_rate": 1.9924968492628744e-05, + "loss": 0.002, + "step": 3926 + }, + { + "epoch": 1.0361429890515763, + "grad_norm": 0.19401545822620392, + "learning_rate": 1.9923796125326066e-05, + "loss": 0.0015, + "step": 3928 + }, + { + "epoch": 1.0366706239282417, + "grad_norm": 0.04052598029375076, + "learning_rate": 1.992262375802339e-05, + "loss": 0.0008, + "step": 3930 + }, + { + "epoch": 1.037198258804907, + "grad_norm": 0.02557484060525894, + "learning_rate": 1.9921451390720715e-05, + "loss": 0.0026, + "step": 3932 + }, + { + "epoch": 1.0377258936815723, + "grad_norm": 0.010959075763821602, + "learning_rate": 1.9920279023418037e-05, + "loss": 0.0007, + "step": 3934 + }, + { + "epoch": 1.0382535285582377, + "grad_norm": 0.05375581979751587, + "learning_rate": 1.9919106656115363e-05, + "loss": 0.0128, + "step": 3936 + }, + { + "epoch": 1.0387811634349031, + "grad_norm": 0.016952015459537506, + "learning_rate": 1.9917934288812686e-05, + "loss": 0.0079, + "step": 3938 + }, + { + "epoch": 1.0393087983115683, + "grad_norm": 0.38105034828186035, + "learning_rate": 1.9916761921510012e-05, + "loss": 0.0042, + "step": 3940 + }, + { + "epoch": 1.0398364331882337, + "grad_norm": 0.4741811454296112, + "learning_rate": 1.9915589554207335e-05, + "loss": 0.0039, + "step": 3942 + }, + { + "epoch": 1.0403640680648991, + "grad_norm": 0.06460333615541458, + "learning_rate": 1.991441718690466e-05, + "loss": 0.0016, + "step": 3944 + }, + { + "epoch": 1.0408917029415645, + "grad_norm": 0.4989749491214752, + "learning_rate": 1.9913244819601983e-05, + "loss": 0.0037, + "step": 3946 + }, + { + "epoch": 1.0414193378182297, + "grad_norm": 0.6342661380767822, + "learning_rate": 1.9912072452299306e-05, + "loss": 0.0145, + "step": 3948 + }, + { + "epoch": 1.0419469726948951, + "grad_norm": 0.3427657186985016, + "learning_rate": 1.9910900084996632e-05, + "loss": 0.0074, + "step": 3950 + }, + { + "epoch": 1.0424746075715605, + "grad_norm": 0.03448035567998886, + "learning_rate": 1.9909727717693954e-05, + "loss": 0.0014, + "step": 3952 + }, + { + "epoch": 1.043002242448226, + "grad_norm": 0.137958362698555, + "learning_rate": 1.990855535039128e-05, + "loss": 0.0032, + "step": 3954 + }, + { + "epoch": 1.043529877324891, + "grad_norm": 0.14570990204811096, + "learning_rate": 1.9907382983088603e-05, + "loss": 0.0047, + "step": 3956 + }, + { + "epoch": 1.0440575122015565, + "grad_norm": 1.1293901205062866, + "learning_rate": 1.990621061578593e-05, + "loss": 0.0023, + "step": 3958 + }, + { + "epoch": 1.044585147078222, + "grad_norm": 0.061527468264102936, + "learning_rate": 1.990503824848325e-05, + "loss": 0.0027, + "step": 3960 + }, + { + "epoch": 1.045112781954887, + "grad_norm": 0.03190287947654724, + "learning_rate": 1.9903865881180574e-05, + "loss": 0.0024, + "step": 3962 + }, + { + "epoch": 1.0456404168315525, + "grad_norm": 0.29391562938690186, + "learning_rate": 1.99026935138779e-05, + "loss": 0.0016, + "step": 3964 + }, + { + "epoch": 1.046168051708218, + "grad_norm": 0.26868709921836853, + "learning_rate": 1.9901521146575223e-05, + "loss": 0.003, + "step": 3966 + }, + { + "epoch": 1.0466956865848833, + "grad_norm": 0.27394792437553406, + "learning_rate": 1.9900348779272545e-05, + "loss": 0.0015, + "step": 3968 + }, + { + "epoch": 1.0472233214615485, + "grad_norm": 0.06232082098722458, + "learning_rate": 1.989917641196987e-05, + "loss": 0.0014, + "step": 3970 + }, + { + "epoch": 1.047750956338214, + "grad_norm": 0.08107651770114899, + "learning_rate": 1.9898004044667197e-05, + "loss": 0.0015, + "step": 3972 + }, + { + "epoch": 1.0482785912148793, + "grad_norm": 0.05975128710269928, + "learning_rate": 1.989683167736452e-05, + "loss": 0.0013, + "step": 3974 + }, + { + "epoch": 1.0488062260915447, + "grad_norm": 0.022182825952768326, + "learning_rate": 1.9895659310061843e-05, + "loss": 0.0099, + "step": 3976 + }, + { + "epoch": 1.04933386096821, + "grad_norm": 0.3372931182384491, + "learning_rate": 1.989448694275917e-05, + "loss": 0.0046, + "step": 3978 + }, + { + "epoch": 1.0498614958448753, + "grad_norm": 0.3106849193572998, + "learning_rate": 1.989331457545649e-05, + "loss": 0.0065, + "step": 3980 + }, + { + "epoch": 1.0503891307215407, + "grad_norm": 0.06648941338062286, + "learning_rate": 1.9892142208153814e-05, + "loss": 0.0011, + "step": 3982 + }, + { + "epoch": 1.0509167655982061, + "grad_norm": 0.019482029601931572, + "learning_rate": 1.989096984085114e-05, + "loss": 0.001, + "step": 3984 + }, + { + "epoch": 1.0514444004748713, + "grad_norm": 0.04180721193552017, + "learning_rate": 1.9889797473548466e-05, + "loss": 0.0008, + "step": 3986 + }, + { + "epoch": 1.0519720353515367, + "grad_norm": 0.020760901272296906, + "learning_rate": 1.988862510624579e-05, + "loss": 0.0013, + "step": 3988 + }, + { + "epoch": 1.0524996702282021, + "grad_norm": 0.05617310851812363, + "learning_rate": 1.988745273894311e-05, + "loss": 0.0011, + "step": 3990 + }, + { + "epoch": 1.0530273051048675, + "grad_norm": 0.01675761118531227, + "learning_rate": 1.9886280371640437e-05, + "loss": 0.0021, + "step": 3992 + }, + { + "epoch": 1.0535549399815327, + "grad_norm": 0.008978634141385555, + "learning_rate": 1.988510800433776e-05, + "loss": 0.0007, + "step": 3994 + }, + { + "epoch": 1.0540825748581981, + "grad_norm": 0.007971346378326416, + "learning_rate": 1.9883935637035086e-05, + "loss": 0.0006, + "step": 3996 + }, + { + "epoch": 1.0546102097348635, + "grad_norm": 0.015262258239090443, + "learning_rate": 1.9882763269732408e-05, + "loss": 0.0018, + "step": 3998 + }, + { + "epoch": 1.055137844611529, + "grad_norm": 0.2013244479894638, + "learning_rate": 1.9881590902429734e-05, + "loss": 0.0011, + "step": 4000 + }, + { + "epoch": 1.055137844611529, + "eval_loss": 0.004242845345288515, + "eval_runtime": 310.618, + "eval_samples_per_second": 694.232, + "eval_steps_per_second": 86.782, + "step": 4000 + }, + { + "epoch": 1.0556654794881941, + "grad_norm": 0.01818699948489666, + "learning_rate": 1.9880418535127057e-05, + "loss": 0.0012, + "step": 4002 + }, + { + "epoch": 1.0561931143648595, + "grad_norm": 0.4573904871940613, + "learning_rate": 1.9879246167824383e-05, + "loss": 0.0087, + "step": 4004 + }, + { + "epoch": 1.056720749241525, + "grad_norm": 0.005082989577203989, + "learning_rate": 1.9878073800521706e-05, + "loss": 0.0005, + "step": 4006 + }, + { + "epoch": 1.05724838411819, + "grad_norm": 0.010203861631453037, + "learning_rate": 1.9876901433219028e-05, + "loss": 0.001, + "step": 4008 + }, + { + "epoch": 1.0577760189948555, + "grad_norm": 0.010465828701853752, + "learning_rate": 1.9875729065916354e-05, + "loss": 0.0034, + "step": 4010 + }, + { + "epoch": 1.058303653871521, + "grad_norm": 0.09463724493980408, + "learning_rate": 1.9874556698613677e-05, + "loss": 0.0055, + "step": 4012 + }, + { + "epoch": 1.0588312887481863, + "grad_norm": 0.012813962996006012, + "learning_rate": 1.9873384331311e-05, + "loss": 0.0014, + "step": 4014 + }, + { + "epoch": 1.0593589236248515, + "grad_norm": 0.12054309248924255, + "learning_rate": 1.9872211964008325e-05, + "loss": 0.0094, + "step": 4016 + }, + { + "epoch": 1.059886558501517, + "grad_norm": 0.02392805553972721, + "learning_rate": 1.987103959670565e-05, + "loss": 0.0008, + "step": 4018 + }, + { + "epoch": 1.0604141933781823, + "grad_norm": 0.8046829700469971, + "learning_rate": 1.9869867229402974e-05, + "loss": 0.0056, + "step": 4020 + }, + { + "epoch": 1.0609418282548477, + "grad_norm": 0.2820715606212616, + "learning_rate": 1.9868694862100297e-05, + "loss": 0.0059, + "step": 4022 + }, + { + "epoch": 1.061469463131513, + "grad_norm": 0.03334713727235794, + "learning_rate": 1.9867522494797623e-05, + "loss": 0.0007, + "step": 4024 + }, + { + "epoch": 1.0619970980081783, + "grad_norm": 0.050394367426633835, + "learning_rate": 1.9866350127494945e-05, + "loss": 0.0016, + "step": 4026 + }, + { + "epoch": 1.0625247328848437, + "grad_norm": 0.4727458953857422, + "learning_rate": 1.9865177760192268e-05, + "loss": 0.0058, + "step": 4028 + }, + { + "epoch": 1.0630523677615091, + "grad_norm": 0.11521153897047043, + "learning_rate": 1.9864005392889594e-05, + "loss": 0.0199, + "step": 4030 + }, + { + "epoch": 1.0635800026381743, + "grad_norm": 0.029339730739593506, + "learning_rate": 1.986283302558692e-05, + "loss": 0.0016, + "step": 4032 + }, + { + "epoch": 1.0641076375148397, + "grad_norm": 0.11106666922569275, + "learning_rate": 1.9861660658284242e-05, + "loss": 0.0075, + "step": 4034 + }, + { + "epoch": 1.0646352723915051, + "grad_norm": 0.13840313255786896, + "learning_rate": 1.9860488290981565e-05, + "loss": 0.0013, + "step": 4036 + }, + { + "epoch": 1.0651629072681703, + "grad_norm": 0.076719269156456, + "learning_rate": 1.985931592367889e-05, + "loss": 0.0033, + "step": 4038 + }, + { + "epoch": 1.0656905421448357, + "grad_norm": 0.059046268463134766, + "learning_rate": 1.9858143556376214e-05, + "loss": 0.0012, + "step": 4040 + }, + { + "epoch": 1.0662181770215011, + "grad_norm": 0.05011860653758049, + "learning_rate": 1.9856971189073536e-05, + "loss": 0.0012, + "step": 4042 + }, + { + "epoch": 1.0667458118981665, + "grad_norm": 0.2733030319213867, + "learning_rate": 1.9855798821770862e-05, + "loss": 0.0082, + "step": 4044 + }, + { + "epoch": 1.0672734467748317, + "grad_norm": 0.03197386488318443, + "learning_rate": 1.9854626454468188e-05, + "loss": 0.0009, + "step": 4046 + }, + { + "epoch": 1.0678010816514971, + "grad_norm": 1.0904115438461304, + "learning_rate": 1.985345408716551e-05, + "loss": 0.0111, + "step": 4048 + }, + { + "epoch": 1.0683287165281625, + "grad_norm": 0.06159580498933792, + "learning_rate": 1.9852281719862833e-05, + "loss": 0.0117, + "step": 4050 + }, + { + "epoch": 1.068856351404828, + "grad_norm": 2.6952881813049316, + "learning_rate": 1.985110935256016e-05, + "loss": 0.0095, + "step": 4052 + }, + { + "epoch": 1.0693839862814931, + "grad_norm": 0.040034644305706024, + "learning_rate": 1.9849936985257482e-05, + "loss": 0.0012, + "step": 4054 + }, + { + "epoch": 1.0699116211581585, + "grad_norm": 0.0879300907254219, + "learning_rate": 1.9848764617954808e-05, + "loss": 0.0107, + "step": 4056 + }, + { + "epoch": 1.070439256034824, + "grad_norm": 0.10867154598236084, + "learning_rate": 1.984759225065213e-05, + "loss": 0.0018, + "step": 4058 + }, + { + "epoch": 1.0709668909114893, + "grad_norm": 0.02464730478823185, + "learning_rate": 1.9846419883349453e-05, + "loss": 0.0015, + "step": 4060 + }, + { + "epoch": 1.0714945257881545, + "grad_norm": 0.4528050422668457, + "learning_rate": 1.984524751604678e-05, + "loss": 0.0017, + "step": 4062 + }, + { + "epoch": 1.07202216066482, + "grad_norm": 0.030589934438467026, + "learning_rate": 1.9844075148744105e-05, + "loss": 0.0028, + "step": 4064 + }, + { + "epoch": 1.0725497955414853, + "grad_norm": 0.045231062918901443, + "learning_rate": 1.9842902781441428e-05, + "loss": 0.0183, + "step": 4066 + }, + { + "epoch": 1.0730774304181507, + "grad_norm": 0.034026674926280975, + "learning_rate": 1.984173041413875e-05, + "loss": 0.0048, + "step": 4068 + }, + { + "epoch": 1.073605065294816, + "grad_norm": 4.06601619720459, + "learning_rate": 1.9840558046836076e-05, + "loss": 0.0048, + "step": 4070 + }, + { + "epoch": 1.0741327001714813, + "grad_norm": 0.046299949288368225, + "learning_rate": 1.98393856795334e-05, + "loss": 0.0009, + "step": 4072 + }, + { + "epoch": 1.0746603350481467, + "grad_norm": 0.09578470885753632, + "learning_rate": 1.983821331223072e-05, + "loss": 0.001, + "step": 4074 + }, + { + "epoch": 1.0751879699248121, + "grad_norm": 0.016801415011286736, + "learning_rate": 1.9837040944928048e-05, + "loss": 0.0027, + "step": 4076 + }, + { + "epoch": 1.0757156048014773, + "grad_norm": 0.030888568609952927, + "learning_rate": 1.9835868577625374e-05, + "loss": 0.0035, + "step": 4078 + }, + { + "epoch": 1.0762432396781427, + "grad_norm": 0.18055951595306396, + "learning_rate": 1.9834696210322696e-05, + "loss": 0.0017, + "step": 4080 + }, + { + "epoch": 1.0767708745548081, + "grad_norm": 0.126048281788826, + "learning_rate": 1.983352384302002e-05, + "loss": 0.0086, + "step": 4082 + }, + { + "epoch": 1.0772985094314733, + "grad_norm": 0.010279747657477856, + "learning_rate": 1.9832351475717345e-05, + "loss": 0.001, + "step": 4084 + }, + { + "epoch": 1.0778261443081387, + "grad_norm": 0.013232915662229061, + "learning_rate": 1.9831179108414668e-05, + "loss": 0.0022, + "step": 4086 + }, + { + "epoch": 1.0783537791848041, + "grad_norm": 0.4710542559623718, + "learning_rate": 1.983000674111199e-05, + "loss": 0.0074, + "step": 4088 + }, + { + "epoch": 1.0788814140614695, + "grad_norm": 0.027519047260284424, + "learning_rate": 1.9828834373809316e-05, + "loss": 0.0008, + "step": 4090 + }, + { + "epoch": 1.0794090489381347, + "grad_norm": 0.4888823330402374, + "learning_rate": 1.9827662006506642e-05, + "loss": 0.0084, + "step": 4092 + }, + { + "epoch": 1.0799366838148001, + "grad_norm": 0.17619143426418304, + "learning_rate": 1.9826489639203965e-05, + "loss": 0.004, + "step": 4094 + }, + { + "epoch": 1.0804643186914655, + "grad_norm": 0.05794669687747955, + "learning_rate": 1.9825317271901287e-05, + "loss": 0.0037, + "step": 4096 + }, + { + "epoch": 1.080991953568131, + "grad_norm": 0.014881374314427376, + "learning_rate": 1.9824144904598613e-05, + "loss": 0.0007, + "step": 4098 + }, + { + "epoch": 1.0815195884447961, + "grad_norm": 0.08063676953315735, + "learning_rate": 1.9822972537295936e-05, + "loss": 0.0016, + "step": 4100 + }, + { + "epoch": 1.0820472233214615, + "grad_norm": 0.3623463213443756, + "learning_rate": 1.982180016999326e-05, + "loss": 0.0059, + "step": 4102 + }, + { + "epoch": 1.082574858198127, + "grad_norm": 0.30920037627220154, + "learning_rate": 1.9820627802690585e-05, + "loss": 0.0069, + "step": 4104 + }, + { + "epoch": 1.0831024930747923, + "grad_norm": 0.014545424841344357, + "learning_rate": 1.9819455435387907e-05, + "loss": 0.0033, + "step": 4106 + }, + { + "epoch": 1.0836301279514575, + "grad_norm": 0.037064485251903534, + "learning_rate": 1.9818283068085233e-05, + "loss": 0.004, + "step": 4108 + }, + { + "epoch": 1.084157762828123, + "grad_norm": 0.24179254472255707, + "learning_rate": 1.9817110700782556e-05, + "loss": 0.0126, + "step": 4110 + }, + { + "epoch": 1.0846853977047883, + "grad_norm": 0.02128276787698269, + "learning_rate": 1.9815938333479882e-05, + "loss": 0.0037, + "step": 4112 + }, + { + "epoch": 1.0852130325814535, + "grad_norm": 0.014167227782309055, + "learning_rate": 1.9814765966177204e-05, + "loss": 0.0007, + "step": 4114 + }, + { + "epoch": 1.085740667458119, + "grad_norm": 0.6996743083000183, + "learning_rate": 1.981359359887453e-05, + "loss": 0.0028, + "step": 4116 + }, + { + "epoch": 1.0862683023347843, + "grad_norm": 0.2691616714000702, + "learning_rate": 1.9812421231571853e-05, + "loss": 0.0089, + "step": 4118 + }, + { + "epoch": 1.0867959372114497, + "grad_norm": 0.11062702536582947, + "learning_rate": 1.9811248864269176e-05, + "loss": 0.0032, + "step": 4120 + }, + { + "epoch": 1.0873235720881151, + "grad_norm": 0.061569202691316605, + "learning_rate": 1.98100764969665e-05, + "loss": 0.0062, + "step": 4122 + }, + { + "epoch": 1.0878512069647803, + "grad_norm": 0.2601865530014038, + "learning_rate": 1.9808904129663828e-05, + "loss": 0.0029, + "step": 4124 + }, + { + "epoch": 1.0883788418414457, + "grad_norm": 0.01001047808676958, + "learning_rate": 1.980773176236115e-05, + "loss": 0.0016, + "step": 4126 + }, + { + "epoch": 1.0889064767181111, + "grad_norm": 0.41593366861343384, + "learning_rate": 1.9806559395058473e-05, + "loss": 0.0025, + "step": 4128 + }, + { + "epoch": 1.0894341115947763, + "grad_norm": 0.023231210187077522, + "learning_rate": 1.98053870277558e-05, + "loss": 0.0017, + "step": 4130 + }, + { + "epoch": 1.0899617464714417, + "grad_norm": 0.2696290910243988, + "learning_rate": 1.980421466045312e-05, + "loss": 0.0031, + "step": 4132 + }, + { + "epoch": 1.0904893813481071, + "grad_norm": 0.01866723969578743, + "learning_rate": 1.9803042293150444e-05, + "loss": 0.0012, + "step": 4134 + }, + { + "epoch": 1.0910170162247725, + "grad_norm": 0.017459148541092873, + "learning_rate": 1.980186992584777e-05, + "loss": 0.0055, + "step": 4136 + }, + { + "epoch": 1.0915446511014377, + "grad_norm": 0.014426691457629204, + "learning_rate": 1.9800697558545096e-05, + "loss": 0.0007, + "step": 4138 + }, + { + "epoch": 1.0920722859781031, + "grad_norm": 0.0954601913690567, + "learning_rate": 1.9799525191242415e-05, + "loss": 0.001, + "step": 4140 + }, + { + "epoch": 1.0925999208547685, + "grad_norm": 0.060124579817056656, + "learning_rate": 1.979835282393974e-05, + "loss": 0.0011, + "step": 4142 + }, + { + "epoch": 1.093127555731434, + "grad_norm": 0.0058424505405128, + "learning_rate": 1.9797180456637067e-05, + "loss": 0.0013, + "step": 4144 + }, + { + "epoch": 1.0936551906080991, + "grad_norm": 0.5524073839187622, + "learning_rate": 1.979600808933439e-05, + "loss": 0.0126, + "step": 4146 + }, + { + "epoch": 1.0941828254847645, + "grad_norm": 0.006782438140362501, + "learning_rate": 1.9794835722031712e-05, + "loss": 0.0006, + "step": 4148 + }, + { + "epoch": 1.09471046036143, + "grad_norm": 0.6685076355934143, + "learning_rate": 1.979366335472904e-05, + "loss": 0.0043, + "step": 4150 + }, + { + "epoch": 1.0952380952380953, + "grad_norm": 0.028946850448846817, + "learning_rate": 1.979249098742636e-05, + "loss": 0.0012, + "step": 4152 + }, + { + "epoch": 1.0957657301147605, + "grad_norm": 0.13756325840950012, + "learning_rate": 1.9791318620123687e-05, + "loss": 0.0162, + "step": 4154 + }, + { + "epoch": 1.096293364991426, + "grad_norm": 0.3037167489528656, + "learning_rate": 1.979014625282101e-05, + "loss": 0.0046, + "step": 4156 + }, + { + "epoch": 1.0968209998680913, + "grad_norm": 0.18160422146320343, + "learning_rate": 1.9788973885518336e-05, + "loss": 0.0053, + "step": 4158 + }, + { + "epoch": 1.0973486347447565, + "grad_norm": 0.14974090456962585, + "learning_rate": 1.9787801518215658e-05, + "loss": 0.0017, + "step": 4160 + }, + { + "epoch": 1.097876269621422, + "grad_norm": 0.043448556214571, + "learning_rate": 1.9786629150912984e-05, + "loss": 0.006, + "step": 4162 + }, + { + "epoch": 1.0984039044980873, + "grad_norm": 0.0600067675113678, + "learning_rate": 1.9785456783610307e-05, + "loss": 0.0008, + "step": 4164 + }, + { + "epoch": 1.0989315393747527, + "grad_norm": 0.017555207014083862, + "learning_rate": 1.978428441630763e-05, + "loss": 0.001, + "step": 4166 + }, + { + "epoch": 1.099459174251418, + "grad_norm": 0.27504733204841614, + "learning_rate": 1.9783112049004955e-05, + "loss": 0.0089, + "step": 4168 + }, + { + "epoch": 1.0999868091280833, + "grad_norm": 0.4094257056713104, + "learning_rate": 1.9781939681702278e-05, + "loss": 0.0097, + "step": 4170 + }, + { + "epoch": 1.1005144440047487, + "grad_norm": 0.02040870301425457, + "learning_rate": 1.9780767314399604e-05, + "loss": 0.0064, + "step": 4172 + }, + { + "epoch": 1.1010420788814141, + "grad_norm": 0.04204443842172623, + "learning_rate": 1.9779594947096927e-05, + "loss": 0.0007, + "step": 4174 + }, + { + "epoch": 1.1015697137580793, + "grad_norm": 0.014184098690748215, + "learning_rate": 1.9778422579794253e-05, + "loss": 0.0088, + "step": 4176 + }, + { + "epoch": 1.1020973486347447, + "grad_norm": 0.2528325319290161, + "learning_rate": 1.9777250212491575e-05, + "loss": 0.0039, + "step": 4178 + }, + { + "epoch": 1.1026249835114101, + "grad_norm": 0.635211169719696, + "learning_rate": 1.9776077845188898e-05, + "loss": 0.0176, + "step": 4180 + }, + { + "epoch": 1.1031526183880755, + "grad_norm": 0.059470489621162415, + "learning_rate": 1.9774905477886224e-05, + "loss": 0.0014, + "step": 4182 + }, + { + "epoch": 1.1036802532647407, + "grad_norm": 0.11782565712928772, + "learning_rate": 1.977373311058355e-05, + "loss": 0.0139, + "step": 4184 + }, + { + "epoch": 1.1042078881414061, + "grad_norm": 0.02202679216861725, + "learning_rate": 1.977256074328087e-05, + "loss": 0.0041, + "step": 4186 + }, + { + "epoch": 1.1047355230180715, + "grad_norm": 0.7849301695823669, + "learning_rate": 1.9771388375978195e-05, + "loss": 0.0028, + "step": 4188 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.022316325455904007, + "learning_rate": 1.977021600867552e-05, + "loss": 0.001, + "step": 4190 + }, + { + "epoch": 1.1057907927714021, + "grad_norm": 0.0233805850148201, + "learning_rate": 1.9769043641372844e-05, + "loss": 0.0058, + "step": 4192 + }, + { + "epoch": 1.1063184276480675, + "grad_norm": 0.5482938289642334, + "learning_rate": 1.9767871274070166e-05, + "loss": 0.0036, + "step": 4194 + }, + { + "epoch": 1.106846062524733, + "grad_norm": 0.18467076122760773, + "learning_rate": 1.9766698906767492e-05, + "loss": 0.0085, + "step": 4196 + }, + { + "epoch": 1.1073736974013983, + "grad_norm": 0.023641308769583702, + "learning_rate": 1.9765526539464815e-05, + "loss": 0.0025, + "step": 4198 + }, + { + "epoch": 1.1079013322780635, + "grad_norm": 0.3724466860294342, + "learning_rate": 1.9764354172162138e-05, + "loss": 0.0054, + "step": 4200 + }, + { + "epoch": 1.108428967154729, + "grad_norm": 0.03183715417981148, + "learning_rate": 1.9763181804859464e-05, + "loss": 0.0035, + "step": 4202 + }, + { + "epoch": 1.1089566020313943, + "grad_norm": 0.36597728729248047, + "learning_rate": 1.976200943755679e-05, + "loss": 0.0043, + "step": 4204 + }, + { + "epoch": 1.1094842369080595, + "grad_norm": 0.019968349486589432, + "learning_rate": 1.9760837070254112e-05, + "loss": 0.0009, + "step": 4206 + }, + { + "epoch": 1.110011871784725, + "grad_norm": 0.060931768268346786, + "learning_rate": 1.9759664702951435e-05, + "loss": 0.0013, + "step": 4208 + }, + { + "epoch": 1.1105395066613903, + "grad_norm": 0.13000589609146118, + "learning_rate": 1.975849233564876e-05, + "loss": 0.0101, + "step": 4210 + }, + { + "epoch": 1.1110671415380557, + "grad_norm": 0.05712990462779999, + "learning_rate": 1.9757319968346083e-05, + "loss": 0.0008, + "step": 4212 + }, + { + "epoch": 1.111594776414721, + "grad_norm": 0.23221421241760254, + "learning_rate": 1.975614760104341e-05, + "loss": 0.0102, + "step": 4214 + }, + { + "epoch": 1.1121224112913863, + "grad_norm": 0.030436590313911438, + "learning_rate": 1.9754975233740732e-05, + "loss": 0.0015, + "step": 4216 + }, + { + "epoch": 1.1126500461680517, + "grad_norm": 0.14182989299297333, + "learning_rate": 1.9753802866438058e-05, + "loss": 0.0087, + "step": 4218 + }, + { + "epoch": 1.1131776810447171, + "grad_norm": 0.04153032973408699, + "learning_rate": 1.975263049913538e-05, + "loss": 0.0022, + "step": 4220 + }, + { + "epoch": 1.1137053159213823, + "grad_norm": 0.26868143677711487, + "learning_rate": 1.9751458131832707e-05, + "loss": 0.0026, + "step": 4222 + }, + { + "epoch": 1.1142329507980477, + "grad_norm": 0.31287696957588196, + "learning_rate": 1.975028576453003e-05, + "loss": 0.003, + "step": 4224 + }, + { + "epoch": 1.1147605856747131, + "grad_norm": 0.013582884334027767, + "learning_rate": 1.9749113397227352e-05, + "loss": 0.0015, + "step": 4226 + }, + { + "epoch": 1.1152882205513786, + "grad_norm": 0.01660732738673687, + "learning_rate": 1.9747941029924678e-05, + "loss": 0.0008, + "step": 4228 + }, + { + "epoch": 1.1158158554280437, + "grad_norm": 0.014375527389347553, + "learning_rate": 1.9746768662622e-05, + "loss": 0.0015, + "step": 4230 + }, + { + "epoch": 1.1163434903047091, + "grad_norm": 0.14472918212413788, + "learning_rate": 1.9745596295319326e-05, + "loss": 0.0014, + "step": 4232 + }, + { + "epoch": 1.1168711251813745, + "grad_norm": 0.06959904730319977, + "learning_rate": 1.974442392801665e-05, + "loss": 0.0095, + "step": 4234 + }, + { + "epoch": 1.1173987600580397, + "grad_norm": 0.0889083668589592, + "learning_rate": 1.9743251560713975e-05, + "loss": 0.0013, + "step": 4236 + }, + { + "epoch": 1.1179263949347051, + "grad_norm": 0.005525961518287659, + "learning_rate": 1.9742079193411298e-05, + "loss": 0.0038, + "step": 4238 + }, + { + "epoch": 1.1184540298113705, + "grad_norm": 0.315455824136734, + "learning_rate": 1.974090682610862e-05, + "loss": 0.0052, + "step": 4240 + }, + { + "epoch": 1.118981664688036, + "grad_norm": 0.010557172819972038, + "learning_rate": 1.9739734458805946e-05, + "loss": 0.0085, + "step": 4242 + }, + { + "epoch": 1.1195092995647011, + "grad_norm": 0.1809600442647934, + "learning_rate": 1.973856209150327e-05, + "loss": 0.0065, + "step": 4244 + }, + { + "epoch": 1.1200369344413665, + "grad_norm": 0.13298973441123962, + "learning_rate": 1.973738972420059e-05, + "loss": 0.0011, + "step": 4246 + }, + { + "epoch": 1.120564569318032, + "grad_norm": 0.10115810483694077, + "learning_rate": 1.9736217356897917e-05, + "loss": 0.0046, + "step": 4248 + }, + { + "epoch": 1.1210922041946974, + "grad_norm": 0.3022739291191101, + "learning_rate": 1.9735044989595243e-05, + "loss": 0.0029, + "step": 4250 + }, + { + "epoch": 1.1216198390713625, + "grad_norm": 0.021644119173288345, + "learning_rate": 1.9733872622292566e-05, + "loss": 0.0033, + "step": 4252 + }, + { + "epoch": 1.122147473948028, + "grad_norm": 0.02688606269657612, + "learning_rate": 1.973270025498989e-05, + "loss": 0.001, + "step": 4254 + }, + { + "epoch": 1.1226751088246933, + "grad_norm": 0.022296728566288948, + "learning_rate": 1.9731527887687215e-05, + "loss": 0.0007, + "step": 4256 + }, + { + "epoch": 1.1232027437013588, + "grad_norm": 0.1126367375254631, + "learning_rate": 1.9730355520384537e-05, + "loss": 0.0011, + "step": 4258 + }, + { + "epoch": 1.123730378578024, + "grad_norm": 0.0945095345377922, + "learning_rate": 1.972918315308186e-05, + "loss": 0.001, + "step": 4260 + }, + { + "epoch": 1.1242580134546893, + "grad_norm": 0.10825273394584656, + "learning_rate": 1.9728010785779186e-05, + "loss": 0.0011, + "step": 4262 + }, + { + "epoch": 1.1247856483313547, + "grad_norm": 0.11365882307291031, + "learning_rate": 1.9726838418476512e-05, + "loss": 0.0146, + "step": 4264 + }, + { + "epoch": 1.12531328320802, + "grad_norm": 0.012032778933644295, + "learning_rate": 1.9725666051173835e-05, + "loss": 0.0007, + "step": 4266 + }, + { + "epoch": 1.1258409180846853, + "grad_norm": 0.016207845881581306, + "learning_rate": 1.9724493683871157e-05, + "loss": 0.0006, + "step": 4268 + }, + { + "epoch": 1.1263685529613507, + "grad_norm": 0.402178019285202, + "learning_rate": 1.9723321316568483e-05, + "loss": 0.0054, + "step": 4270 + }, + { + "epoch": 1.1268961878380162, + "grad_norm": 0.02973657101392746, + "learning_rate": 1.9722148949265806e-05, + "loss": 0.0022, + "step": 4272 + }, + { + "epoch": 1.1274238227146816, + "grad_norm": 1.1714385747909546, + "learning_rate": 1.9720976581963132e-05, + "loss": 0.0181, + "step": 4274 + }, + { + "epoch": 1.1279514575913467, + "grad_norm": 0.6077834367752075, + "learning_rate": 1.9719804214660454e-05, + "loss": 0.0055, + "step": 4276 + }, + { + "epoch": 1.1284790924680121, + "grad_norm": 0.06869453936815262, + "learning_rate": 1.971863184735778e-05, + "loss": 0.0011, + "step": 4278 + }, + { + "epoch": 1.1290067273446776, + "grad_norm": 0.04445134848356247, + "learning_rate": 1.9717459480055103e-05, + "loss": 0.0039, + "step": 4280 + }, + { + "epoch": 1.1295343622213427, + "grad_norm": 0.013125338591635227, + "learning_rate": 1.971628711275243e-05, + "loss": 0.0006, + "step": 4282 + }, + { + "epoch": 1.1300619970980081, + "grad_norm": 0.3134661018848419, + "learning_rate": 1.971511474544975e-05, + "loss": 0.0078, + "step": 4284 + }, + { + "epoch": 1.1305896319746735, + "grad_norm": 0.4332166612148285, + "learning_rate": 1.9713942378147074e-05, + "loss": 0.0048, + "step": 4286 + }, + { + "epoch": 1.131117266851339, + "grad_norm": 0.07447940111160278, + "learning_rate": 1.97127700108444e-05, + "loss": 0.0043, + "step": 4288 + }, + { + "epoch": 1.1316449017280041, + "grad_norm": 0.023935411125421524, + "learning_rate": 1.9711597643541723e-05, + "loss": 0.0023, + "step": 4290 + }, + { + "epoch": 1.1321725366046695, + "grad_norm": 0.10658252239227295, + "learning_rate": 1.9710425276239045e-05, + "loss": 0.0036, + "step": 4292 + }, + { + "epoch": 1.132700171481335, + "grad_norm": 0.038972459733486176, + "learning_rate": 1.970925290893637e-05, + "loss": 0.0008, + "step": 4294 + }, + { + "epoch": 1.1332278063580004, + "grad_norm": 0.056606825441122055, + "learning_rate": 1.9708080541633697e-05, + "loss": 0.0009, + "step": 4296 + }, + { + "epoch": 1.1337554412346655, + "grad_norm": 0.2597362697124481, + "learning_rate": 1.970690817433102e-05, + "loss": 0.0059, + "step": 4298 + }, + { + "epoch": 1.134283076111331, + "grad_norm": 0.11770262569189072, + "learning_rate": 1.9705735807028343e-05, + "loss": 0.0024, + "step": 4300 + }, + { + "epoch": 1.1348107109879964, + "grad_norm": 0.021376240998506546, + "learning_rate": 1.970456343972567e-05, + "loss": 0.0049, + "step": 4302 + }, + { + "epoch": 1.1353383458646618, + "grad_norm": 0.29133427143096924, + "learning_rate": 1.970339107242299e-05, + "loss": 0.0017, + "step": 4304 + }, + { + "epoch": 1.135865980741327, + "grad_norm": 0.16660794615745544, + "learning_rate": 1.9702218705120314e-05, + "loss": 0.0015, + "step": 4306 + }, + { + "epoch": 1.1363936156179923, + "grad_norm": 0.4434588849544525, + "learning_rate": 1.970104633781764e-05, + "loss": 0.0126, + "step": 4308 + }, + { + "epoch": 1.1369212504946578, + "grad_norm": 0.18524836003780365, + "learning_rate": 1.9699873970514966e-05, + "loss": 0.0129, + "step": 4310 + }, + { + "epoch": 1.137448885371323, + "grad_norm": 0.393754243850708, + "learning_rate": 1.969870160321229e-05, + "loss": 0.0107, + "step": 4312 + }, + { + "epoch": 1.1379765202479883, + "grad_norm": 0.0070448825135827065, + "learning_rate": 1.969752923590961e-05, + "loss": 0.0063, + "step": 4314 + }, + { + "epoch": 1.1385041551246537, + "grad_norm": 0.024128155782818794, + "learning_rate": 1.9696356868606937e-05, + "loss": 0.0007, + "step": 4316 + }, + { + "epoch": 1.1390317900013192, + "grad_norm": 0.0857410728931427, + "learning_rate": 1.969518450130426e-05, + "loss": 0.0008, + "step": 4318 + }, + { + "epoch": 1.1395594248779843, + "grad_norm": 0.042044732719659805, + "learning_rate": 1.9694012134001582e-05, + "loss": 0.004, + "step": 4320 + }, + { + "epoch": 1.1400870597546497, + "grad_norm": 0.04413846135139465, + "learning_rate": 1.9692839766698908e-05, + "loss": 0.002, + "step": 4322 + }, + { + "epoch": 1.1406146946313152, + "grad_norm": 0.07810824364423752, + "learning_rate": 1.9691667399396234e-05, + "loss": 0.0013, + "step": 4324 + }, + { + "epoch": 1.1411423295079806, + "grad_norm": 0.4568030834197998, + "learning_rate": 1.9690495032093557e-05, + "loss": 0.0068, + "step": 4326 + }, + { + "epoch": 1.1416699643846457, + "grad_norm": 0.07198156416416168, + "learning_rate": 1.968932266479088e-05, + "loss": 0.003, + "step": 4328 + }, + { + "epoch": 1.1421975992613111, + "grad_norm": 0.07447123527526855, + "learning_rate": 1.9688150297488205e-05, + "loss": 0.0042, + "step": 4330 + }, + { + "epoch": 1.1427252341379766, + "grad_norm": 0.01063209306448698, + "learning_rate": 1.9686977930185528e-05, + "loss": 0.0007, + "step": 4332 + }, + { + "epoch": 1.143252869014642, + "grad_norm": 0.12687617540359497, + "learning_rate": 1.9685805562882854e-05, + "loss": 0.0009, + "step": 4334 + }, + { + "epoch": 1.1437805038913071, + "grad_norm": 0.017365315929055214, + "learning_rate": 1.9684633195580177e-05, + "loss": 0.001, + "step": 4336 + }, + { + "epoch": 1.1443081387679725, + "grad_norm": 0.13835787773132324, + "learning_rate": 1.96834608282775e-05, + "loss": 0.0009, + "step": 4338 + }, + { + "epoch": 1.144835773644638, + "grad_norm": 0.17692521214485168, + "learning_rate": 1.9682288460974825e-05, + "loss": 0.0012, + "step": 4340 + }, + { + "epoch": 1.1453634085213031, + "grad_norm": 0.4429665505886078, + "learning_rate": 1.968111609367215e-05, + "loss": 0.0021, + "step": 4342 + }, + { + "epoch": 1.1458910433979685, + "grad_norm": 0.8000689148902893, + "learning_rate": 1.9679943726369474e-05, + "loss": 0.0082, + "step": 4344 + }, + { + "epoch": 1.146418678274634, + "grad_norm": 0.32551059126853943, + "learning_rate": 1.9678771359066797e-05, + "loss": 0.0039, + "step": 4346 + }, + { + "epoch": 1.1469463131512994, + "grad_norm": 0.08477803319692612, + "learning_rate": 1.9677598991764123e-05, + "loss": 0.0008, + "step": 4348 + }, + { + "epoch": 1.1474739480279648, + "grad_norm": 0.004862100817263126, + "learning_rate": 1.9676426624461445e-05, + "loss": 0.0004, + "step": 4350 + }, + { + "epoch": 1.14800158290463, + "grad_norm": 0.05917839705944061, + "learning_rate": 1.9675254257158768e-05, + "loss": 0.0027, + "step": 4352 + }, + { + "epoch": 1.1485292177812954, + "grad_norm": 0.004015312530100346, + "learning_rate": 1.9674081889856094e-05, + "loss": 0.0005, + "step": 4354 + }, + { + "epoch": 1.1490568526579608, + "grad_norm": 0.03031083196401596, + "learning_rate": 1.967290952255342e-05, + "loss": 0.0046, + "step": 4356 + }, + { + "epoch": 1.149584487534626, + "grad_norm": 0.04025927186012268, + "learning_rate": 1.9671737155250742e-05, + "loss": 0.0007, + "step": 4358 + }, + { + "epoch": 1.1501121224112913, + "grad_norm": 0.1729552447795868, + "learning_rate": 1.9670564787948065e-05, + "loss": 0.0081, + "step": 4360 + }, + { + "epoch": 1.1506397572879568, + "grad_norm": 0.2491278499364853, + "learning_rate": 1.966939242064539e-05, + "loss": 0.0049, + "step": 4362 + }, + { + "epoch": 1.1511673921646222, + "grad_norm": 0.025110160931944847, + "learning_rate": 1.9668220053342714e-05, + "loss": 0.0052, + "step": 4364 + }, + { + "epoch": 1.1516950270412873, + "grad_norm": 0.028094789013266563, + "learning_rate": 1.9667047686040036e-05, + "loss": 0.0009, + "step": 4366 + }, + { + "epoch": 1.1522226619179528, + "grad_norm": 0.05064205080270767, + "learning_rate": 1.9665875318737362e-05, + "loss": 0.0022, + "step": 4368 + }, + { + "epoch": 1.1527502967946182, + "grad_norm": 0.43874481320381165, + "learning_rate": 1.9664702951434688e-05, + "loss": 0.0124, + "step": 4370 + }, + { + "epoch": 1.1532779316712836, + "grad_norm": 0.5690093636512756, + "learning_rate": 1.966353058413201e-05, + "loss": 0.0022, + "step": 4372 + }, + { + "epoch": 1.1538055665479487, + "grad_norm": 2.0636167526245117, + "learning_rate": 1.9662358216829333e-05, + "loss": 0.0049, + "step": 4374 + }, + { + "epoch": 1.1543332014246142, + "grad_norm": 0.2444980889558792, + "learning_rate": 1.966118584952666e-05, + "loss": 0.0139, + "step": 4376 + }, + { + "epoch": 1.1548608363012796, + "grad_norm": 0.01840645633637905, + "learning_rate": 1.9660013482223982e-05, + "loss": 0.0006, + "step": 4378 + }, + { + "epoch": 1.155388471177945, + "grad_norm": 0.02064526081085205, + "learning_rate": 1.9658841114921305e-05, + "loss": 0.0006, + "step": 4380 + }, + { + "epoch": 1.1559161060546101, + "grad_norm": 0.08911865204572678, + "learning_rate": 1.965766874761863e-05, + "loss": 0.0007, + "step": 4382 + }, + { + "epoch": 1.1564437409312756, + "grad_norm": 0.14936824142932892, + "learning_rate": 1.9656496380315953e-05, + "loss": 0.0126, + "step": 4384 + }, + { + "epoch": 1.156971375807941, + "grad_norm": 0.01521136611700058, + "learning_rate": 1.965532401301328e-05, + "loss": 0.0121, + "step": 4386 + }, + { + "epoch": 1.1574990106846061, + "grad_norm": 0.30815377831459045, + "learning_rate": 1.9654151645710602e-05, + "loss": 0.0013, + "step": 4388 + }, + { + "epoch": 1.1580266455612716, + "grad_norm": 0.19966404139995575, + "learning_rate": 1.9652979278407928e-05, + "loss": 0.0058, + "step": 4390 + }, + { + "epoch": 1.158554280437937, + "grad_norm": 0.013514398597180843, + "learning_rate": 1.965180691110525e-05, + "loss": 0.024, + "step": 4392 + }, + { + "epoch": 1.1590819153146024, + "grad_norm": 0.02790391817688942, + "learning_rate": 1.9650634543802576e-05, + "loss": 0.0016, + "step": 4394 + }, + { + "epoch": 1.1596095501912675, + "grad_norm": 0.3208989202976227, + "learning_rate": 1.96494621764999e-05, + "loss": 0.0034, + "step": 4396 + }, + { + "epoch": 1.160137185067933, + "grad_norm": 0.20949339866638184, + "learning_rate": 1.964828980919722e-05, + "loss": 0.0052, + "step": 4398 + }, + { + "epoch": 1.1606648199445984, + "grad_norm": 0.5572222471237183, + "learning_rate": 1.9647117441894548e-05, + "loss": 0.0034, + "step": 4400 + }, + { + "epoch": 1.1611924548212638, + "grad_norm": 0.13384489715099335, + "learning_rate": 1.9645945074591874e-05, + "loss": 0.0016, + "step": 4402 + }, + { + "epoch": 1.161720089697929, + "grad_norm": 0.24243025481700897, + "learning_rate": 1.9644772707289196e-05, + "loss": 0.0131, + "step": 4404 + }, + { + "epoch": 1.1622477245745944, + "grad_norm": 0.22543661296367645, + "learning_rate": 1.964360033998652e-05, + "loss": 0.0044, + "step": 4406 + }, + { + "epoch": 1.1627753594512598, + "grad_norm": 0.08378975838422775, + "learning_rate": 1.9642427972683845e-05, + "loss": 0.001, + "step": 4408 + }, + { + "epoch": 1.1633029943279252, + "grad_norm": 0.021007966250181198, + "learning_rate": 1.9641255605381167e-05, + "loss": 0.0008, + "step": 4410 + }, + { + "epoch": 1.1638306292045904, + "grad_norm": 0.013681617565453053, + "learning_rate": 1.964008323807849e-05, + "loss": 0.0007, + "step": 4412 + }, + { + "epoch": 1.1643582640812558, + "grad_norm": 0.14630456268787384, + "learning_rate": 1.9638910870775816e-05, + "loss": 0.0044, + "step": 4414 + }, + { + "epoch": 1.1648858989579212, + "grad_norm": 0.2778044044971466, + "learning_rate": 1.9637738503473142e-05, + "loss": 0.0011, + "step": 4416 + }, + { + "epoch": 1.1654135338345863, + "grad_norm": 0.23169730603694916, + "learning_rate": 1.963656613617046e-05, + "loss": 0.0093, + "step": 4418 + }, + { + "epoch": 1.1659411687112518, + "grad_norm": 0.02920382283627987, + "learning_rate": 1.9635393768867787e-05, + "loss": 0.0008, + "step": 4420 + }, + { + "epoch": 1.1664688035879172, + "grad_norm": 0.04556174948811531, + "learning_rate": 1.9634221401565113e-05, + "loss": 0.0012, + "step": 4422 + }, + { + "epoch": 1.1669964384645826, + "grad_norm": 0.014561996795237064, + "learning_rate": 1.9633049034262436e-05, + "loss": 0.0007, + "step": 4424 + }, + { + "epoch": 1.167524073341248, + "grad_norm": 0.09821402281522751, + "learning_rate": 1.963187666695976e-05, + "loss": 0.0159, + "step": 4426 + }, + { + "epoch": 1.1680517082179132, + "grad_norm": 0.0157514289021492, + "learning_rate": 1.9630704299657085e-05, + "loss": 0.0006, + "step": 4428 + }, + { + "epoch": 1.1685793430945786, + "grad_norm": 0.09060139954090118, + "learning_rate": 1.9629531932354407e-05, + "loss": 0.0014, + "step": 4430 + }, + { + "epoch": 1.169106977971244, + "grad_norm": 0.016944697126746178, + "learning_rate": 1.9628359565051733e-05, + "loss": 0.0007, + "step": 4432 + }, + { + "epoch": 1.1696346128479091, + "grad_norm": 0.02988281287252903, + "learning_rate": 1.9627187197749056e-05, + "loss": 0.0032, + "step": 4434 + }, + { + "epoch": 1.1701622477245746, + "grad_norm": 0.32507482171058655, + "learning_rate": 1.9626014830446382e-05, + "loss": 0.0053, + "step": 4436 + }, + { + "epoch": 1.17068988260124, + "grad_norm": 0.5766019225120544, + "learning_rate": 1.9624842463143704e-05, + "loss": 0.0041, + "step": 4438 + }, + { + "epoch": 1.1712175174779054, + "grad_norm": 0.03981131315231323, + "learning_rate": 1.9623670095841027e-05, + "loss": 0.0055, + "step": 4440 + }, + { + "epoch": 1.1717451523545706, + "grad_norm": 0.2502346932888031, + "learning_rate": 1.9622497728538353e-05, + "loss": 0.0018, + "step": 4442 + }, + { + "epoch": 1.172272787231236, + "grad_norm": 0.21483206748962402, + "learning_rate": 1.9621325361235676e-05, + "loss": 0.0177, + "step": 4444 + }, + { + "epoch": 1.1728004221079014, + "grad_norm": 0.6093438267707825, + "learning_rate": 1.9620152993933e-05, + "loss": 0.006, + "step": 4446 + }, + { + "epoch": 1.1733280569845668, + "grad_norm": 0.1699678599834442, + "learning_rate": 1.9618980626630324e-05, + "loss": 0.0094, + "step": 4448 + }, + { + "epoch": 1.173855691861232, + "grad_norm": 1.8108218908309937, + "learning_rate": 1.961780825932765e-05, + "loss": 0.0036, + "step": 4450 + }, + { + "epoch": 1.1743833267378974, + "grad_norm": 0.1308957040309906, + "learning_rate": 1.9616635892024973e-05, + "loss": 0.0126, + "step": 4452 + }, + { + "epoch": 1.1749109616145628, + "grad_norm": 0.023101404309272766, + "learning_rate": 1.96154635247223e-05, + "loss": 0.0008, + "step": 4454 + }, + { + "epoch": 1.1754385964912282, + "grad_norm": 0.025170007720589638, + "learning_rate": 1.961429115741962e-05, + "loss": 0.0019, + "step": 4456 + }, + { + "epoch": 1.1759662313678934, + "grad_norm": 0.11672040820121765, + "learning_rate": 1.9613118790116944e-05, + "loss": 0.002, + "step": 4458 + }, + { + "epoch": 1.1764938662445588, + "grad_norm": 0.21220766007900238, + "learning_rate": 1.961194642281427e-05, + "loss": 0.0023, + "step": 4460 + }, + { + "epoch": 1.1770215011212242, + "grad_norm": 0.7267486453056335, + "learning_rate": 1.9610774055511596e-05, + "loss": 0.0018, + "step": 4462 + }, + { + "epoch": 1.1775491359978894, + "grad_norm": 0.2415233701467514, + "learning_rate": 1.9609601688208915e-05, + "loss": 0.0217, + "step": 4464 + }, + { + "epoch": 1.1780767708745548, + "grad_norm": 0.05124617740511894, + "learning_rate": 1.960842932090624e-05, + "loss": 0.0017, + "step": 4466 + }, + { + "epoch": 1.1786044057512202, + "grad_norm": 0.298318475484848, + "learning_rate": 1.9607256953603567e-05, + "loss": 0.0027, + "step": 4468 + }, + { + "epoch": 1.1791320406278856, + "grad_norm": 0.18129241466522217, + "learning_rate": 1.960608458630089e-05, + "loss": 0.0028, + "step": 4470 + }, + { + "epoch": 1.179659675504551, + "grad_norm": 0.06919640302658081, + "learning_rate": 1.9604912218998212e-05, + "loss": 0.0022, + "step": 4472 + }, + { + "epoch": 1.1801873103812162, + "grad_norm": 0.10772089660167694, + "learning_rate": 1.960373985169554e-05, + "loss": 0.0205, + "step": 4474 + }, + { + "epoch": 1.1807149452578816, + "grad_norm": 0.2865675389766693, + "learning_rate": 1.960256748439286e-05, + "loss": 0.0163, + "step": 4476 + }, + { + "epoch": 1.181242580134547, + "grad_norm": 0.1230730190873146, + "learning_rate": 1.9601395117090184e-05, + "loss": 0.0014, + "step": 4478 + }, + { + "epoch": 1.1817702150112122, + "grad_norm": 0.129010871052742, + "learning_rate": 1.960022274978751e-05, + "loss": 0.002, + "step": 4480 + }, + { + "epoch": 1.1822978498878776, + "grad_norm": 0.07713605463504791, + "learning_rate": 1.9599050382484836e-05, + "loss": 0.0018, + "step": 4482 + }, + { + "epoch": 1.182825484764543, + "grad_norm": 0.013351364992558956, + "learning_rate": 1.9597878015182158e-05, + "loss": 0.0018, + "step": 4484 + }, + { + "epoch": 1.1833531196412084, + "grad_norm": 0.015451117418706417, + "learning_rate": 1.959670564787948e-05, + "loss": 0.0008, + "step": 4486 + }, + { + "epoch": 1.1838807545178736, + "grad_norm": 0.008069987408816814, + "learning_rate": 1.9595533280576807e-05, + "loss": 0.0008, + "step": 4488 + }, + { + "epoch": 1.184408389394539, + "grad_norm": 0.008663958869874477, + "learning_rate": 1.959436091327413e-05, + "loss": 0.001, + "step": 4490 + }, + { + "epoch": 1.1849360242712044, + "grad_norm": 0.00952655915170908, + "learning_rate": 1.9593188545971455e-05, + "loss": 0.0006, + "step": 4492 + }, + { + "epoch": 1.1854636591478696, + "grad_norm": 0.017387842759490013, + "learning_rate": 1.9592016178668778e-05, + "loss": 0.0006, + "step": 4494 + }, + { + "epoch": 1.185991294024535, + "grad_norm": 0.03666413947939873, + "learning_rate": 1.9590843811366104e-05, + "loss": 0.0008, + "step": 4496 + }, + { + "epoch": 1.1865189289012004, + "grad_norm": 0.4247264266014099, + "learning_rate": 1.9589671444063427e-05, + "loss": 0.0098, + "step": 4498 + }, + { + "epoch": 1.1870465637778658, + "grad_norm": 0.19646139442920685, + "learning_rate": 1.958849907676075e-05, + "loss": 0.0067, + "step": 4500 + }, + { + "epoch": 1.1875741986545312, + "grad_norm": 0.02188594825565815, + "learning_rate": 1.9587326709458075e-05, + "loss": 0.0006, + "step": 4502 + }, + { + "epoch": 1.1881018335311964, + "grad_norm": 0.08792304247617722, + "learning_rate": 1.9586154342155398e-05, + "loss": 0.0034, + "step": 4504 + }, + { + "epoch": 1.1886294684078618, + "grad_norm": 0.08937163650989532, + "learning_rate": 1.9584981974852724e-05, + "loss": 0.0011, + "step": 4506 + }, + { + "epoch": 1.1891571032845272, + "grad_norm": 0.23029430210590363, + "learning_rate": 1.9583809607550046e-05, + "loss": 0.0136, + "step": 4508 + }, + { + "epoch": 1.1896847381611924, + "grad_norm": 0.08281965553760529, + "learning_rate": 1.958263724024737e-05, + "loss": 0.001, + "step": 4510 + }, + { + "epoch": 1.1902123730378578, + "grad_norm": 0.025996103882789612, + "learning_rate": 1.9581464872944695e-05, + "loss": 0.0011, + "step": 4512 + }, + { + "epoch": 1.1907400079145232, + "grad_norm": 0.1115848496556282, + "learning_rate": 1.958029250564202e-05, + "loss": 0.0041, + "step": 4514 + }, + { + "epoch": 1.1912676427911886, + "grad_norm": 0.07224689424037933, + "learning_rate": 1.9579120138339344e-05, + "loss": 0.0036, + "step": 4516 + }, + { + "epoch": 1.1917952776678538, + "grad_norm": 0.224129319190979, + "learning_rate": 1.9577947771036666e-05, + "loss": 0.0034, + "step": 4518 + }, + { + "epoch": 1.1923229125445192, + "grad_norm": 0.02547801472246647, + "learning_rate": 1.9576775403733992e-05, + "loss": 0.0007, + "step": 4520 + }, + { + "epoch": 1.1928505474211846, + "grad_norm": 0.5800535678863525, + "learning_rate": 1.9575603036431315e-05, + "loss": 0.0194, + "step": 4522 + }, + { + "epoch": 1.19337818229785, + "grad_norm": 0.04614017531275749, + "learning_rate": 1.9574430669128638e-05, + "loss": 0.0037, + "step": 4524 + }, + { + "epoch": 1.1939058171745152, + "grad_norm": 0.3232923448085785, + "learning_rate": 1.9573258301825964e-05, + "loss": 0.0162, + "step": 4526 + }, + { + "epoch": 1.1944334520511806, + "grad_norm": 0.2898349165916443, + "learning_rate": 1.957208593452329e-05, + "loss": 0.0015, + "step": 4528 + }, + { + "epoch": 1.194961086927846, + "grad_norm": 0.02757585234940052, + "learning_rate": 1.9570913567220612e-05, + "loss": 0.0013, + "step": 4530 + }, + { + "epoch": 1.1954887218045114, + "grad_norm": 0.10668807476758957, + "learning_rate": 1.9569741199917935e-05, + "loss": 0.0138, + "step": 4532 + }, + { + "epoch": 1.1960163566811766, + "grad_norm": 0.16099338233470917, + "learning_rate": 1.956856883261526e-05, + "loss": 0.0018, + "step": 4534 + }, + { + "epoch": 1.196543991557842, + "grad_norm": 0.3917209208011627, + "learning_rate": 1.9567396465312583e-05, + "loss": 0.0043, + "step": 4536 + }, + { + "epoch": 1.1970716264345074, + "grad_norm": 0.04002007469534874, + "learning_rate": 1.9566224098009906e-05, + "loss": 0.0012, + "step": 4538 + }, + { + "epoch": 1.1975992613111726, + "grad_norm": 0.13336333632469177, + "learning_rate": 1.9565051730707232e-05, + "loss": 0.0017, + "step": 4540 + }, + { + "epoch": 1.198126896187838, + "grad_norm": 0.5118239521980286, + "learning_rate": 1.9563879363404558e-05, + "loss": 0.0022, + "step": 4542 + }, + { + "epoch": 1.1986545310645034, + "grad_norm": 0.023996012285351753, + "learning_rate": 1.956270699610188e-05, + "loss": 0.0007, + "step": 4544 + }, + { + "epoch": 1.1991821659411688, + "grad_norm": 0.030142780393362045, + "learning_rate": 1.9561534628799203e-05, + "loss": 0.0009, + "step": 4546 + }, + { + "epoch": 1.1997098008178342, + "grad_norm": 0.6261898279190063, + "learning_rate": 1.956036226149653e-05, + "loss": 0.0107, + "step": 4548 + }, + { + "epoch": 1.2002374356944994, + "grad_norm": 0.4043087363243103, + "learning_rate": 1.9559189894193852e-05, + "loss": 0.0054, + "step": 4550 + }, + { + "epoch": 1.2007650705711648, + "grad_norm": 0.013515640050172806, + "learning_rate": 1.9558017526891178e-05, + "loss": 0.0022, + "step": 4552 + }, + { + "epoch": 1.2012927054478302, + "grad_norm": 0.11312706023454666, + "learning_rate": 1.95568451595885e-05, + "loss": 0.0013, + "step": 4554 + }, + { + "epoch": 1.2018203403244954, + "grad_norm": 0.4656051695346832, + "learning_rate": 1.9555672792285826e-05, + "loss": 0.0022, + "step": 4556 + }, + { + "epoch": 1.2023479752011608, + "grad_norm": 0.9007090330123901, + "learning_rate": 1.955450042498315e-05, + "loss": 0.0097, + "step": 4558 + }, + { + "epoch": 1.2028756100778262, + "grad_norm": 0.07795869559049606, + "learning_rate": 1.955332805768047e-05, + "loss": 0.0056, + "step": 4560 + }, + { + "epoch": 1.2034032449544916, + "grad_norm": 0.08437392115592957, + "learning_rate": 1.9552155690377798e-05, + "loss": 0.0011, + "step": 4562 + }, + { + "epoch": 1.2039308798311568, + "grad_norm": 0.01121995598077774, + "learning_rate": 1.955098332307512e-05, + "loss": 0.001, + "step": 4564 + }, + { + "epoch": 1.2044585147078222, + "grad_norm": 0.8454673290252686, + "learning_rate": 1.9549810955772446e-05, + "loss": 0.0049, + "step": 4566 + }, + { + "epoch": 1.2049861495844876, + "grad_norm": 0.23644158244132996, + "learning_rate": 1.954863858846977e-05, + "loss": 0.002, + "step": 4568 + }, + { + "epoch": 1.2055137844611528, + "grad_norm": 1.2653322219848633, + "learning_rate": 1.954746622116709e-05, + "loss": 0.0037, + "step": 4570 + }, + { + "epoch": 1.2060414193378182, + "grad_norm": 0.1672057956457138, + "learning_rate": 1.9546293853864417e-05, + "loss": 0.0023, + "step": 4572 + }, + { + "epoch": 1.2065690542144836, + "grad_norm": 0.00591963529586792, + "learning_rate": 1.9545121486561743e-05, + "loss": 0.0007, + "step": 4574 + }, + { + "epoch": 1.207096689091149, + "grad_norm": 0.017217906191945076, + "learning_rate": 1.9543949119259066e-05, + "loss": 0.0007, + "step": 4576 + }, + { + "epoch": 1.2076243239678144, + "grad_norm": 0.1830136924982071, + "learning_rate": 1.954277675195639e-05, + "loss": 0.012, + "step": 4578 + }, + { + "epoch": 1.2081519588444796, + "grad_norm": 0.05350952222943306, + "learning_rate": 1.9541604384653715e-05, + "loss": 0.0032, + "step": 4580 + }, + { + "epoch": 1.208679593721145, + "grad_norm": 0.020193379372358322, + "learning_rate": 1.9540432017351037e-05, + "loss": 0.0045, + "step": 4582 + }, + { + "epoch": 1.2092072285978104, + "grad_norm": 0.01214122585952282, + "learning_rate": 1.953925965004836e-05, + "loss": 0.0006, + "step": 4584 + }, + { + "epoch": 1.2097348634744756, + "grad_norm": 0.06443080306053162, + "learning_rate": 1.9538087282745686e-05, + "loss": 0.0008, + "step": 4586 + }, + { + "epoch": 1.210262498351141, + "grad_norm": 0.012930672615766525, + "learning_rate": 1.9536914915443012e-05, + "loss": 0.0005, + "step": 4588 + }, + { + "epoch": 1.2107901332278064, + "grad_norm": 0.0904420018196106, + "learning_rate": 1.9535742548140334e-05, + "loss": 0.0111, + "step": 4590 + }, + { + "epoch": 1.2113177681044718, + "grad_norm": 0.2481929212808609, + "learning_rate": 1.9534570180837657e-05, + "loss": 0.0057, + "step": 4592 + }, + { + "epoch": 1.211845402981137, + "grad_norm": 0.05265621468424797, + "learning_rate": 1.9533397813534983e-05, + "loss": 0.0012, + "step": 4594 + }, + { + "epoch": 1.2123730378578024, + "grad_norm": 0.43796131014823914, + "learning_rate": 1.9532225446232306e-05, + "loss": 0.0092, + "step": 4596 + }, + { + "epoch": 1.2129006727344678, + "grad_norm": 0.05579087883234024, + "learning_rate": 1.953105307892963e-05, + "loss": 0.0011, + "step": 4598 + }, + { + "epoch": 1.2134283076111332, + "grad_norm": 0.4377579391002655, + "learning_rate": 1.9529880711626954e-05, + "loss": 0.016, + "step": 4600 + }, + { + "epoch": 1.2139559424877984, + "grad_norm": 0.13025163114070892, + "learning_rate": 1.952870834432428e-05, + "loss": 0.0105, + "step": 4602 + }, + { + "epoch": 1.2144835773644638, + "grad_norm": 0.1524581015110016, + "learning_rate": 1.9527535977021603e-05, + "loss": 0.0014, + "step": 4604 + }, + { + "epoch": 1.2150112122411292, + "grad_norm": 0.08781922608613968, + "learning_rate": 1.9526363609718926e-05, + "loss": 0.0034, + "step": 4606 + }, + { + "epoch": 1.2155388471177946, + "grad_norm": 0.07946493476629257, + "learning_rate": 1.952519124241625e-05, + "loss": 0.0033, + "step": 4608 + }, + { + "epoch": 1.2160664819944598, + "grad_norm": 0.43861034512519836, + "learning_rate": 1.9524018875113574e-05, + "loss": 0.0158, + "step": 4610 + }, + { + "epoch": 1.2165941168711252, + "grad_norm": 0.26088547706604004, + "learning_rate": 1.95228465078109e-05, + "loss": 0.0042, + "step": 4612 + }, + { + "epoch": 1.2171217517477906, + "grad_norm": 0.05912045016884804, + "learning_rate": 1.9521674140508223e-05, + "loss": 0.001, + "step": 4614 + }, + { + "epoch": 1.2176493866244558, + "grad_norm": 0.20740507543087006, + "learning_rate": 1.9520501773205545e-05, + "loss": 0.0079, + "step": 4616 + }, + { + "epoch": 1.2181770215011212, + "grad_norm": 0.21228305995464325, + "learning_rate": 1.951932940590287e-05, + "loss": 0.0028, + "step": 4618 + }, + { + "epoch": 1.2187046563777866, + "grad_norm": 0.04991602897644043, + "learning_rate": 1.9518157038600194e-05, + "loss": 0.0008, + "step": 4620 + }, + { + "epoch": 1.219232291254452, + "grad_norm": 0.013630307279527187, + "learning_rate": 1.951698467129752e-05, + "loss": 0.0009, + "step": 4622 + }, + { + "epoch": 1.2197599261311174, + "grad_norm": 0.5405182242393494, + "learning_rate": 1.9515812303994843e-05, + "loss": 0.0142, + "step": 4624 + }, + { + "epoch": 1.2202875610077826, + "grad_norm": 0.01963001675903797, + "learning_rate": 1.951463993669217e-05, + "loss": 0.0008, + "step": 4626 + }, + { + "epoch": 1.220815195884448, + "grad_norm": 0.024393413215875626, + "learning_rate": 1.951346756938949e-05, + "loss": 0.0008, + "step": 4628 + }, + { + "epoch": 1.2213428307611134, + "grad_norm": 0.1826399713754654, + "learning_rate": 1.9512295202086814e-05, + "loss": 0.0086, + "step": 4630 + }, + { + "epoch": 1.2218704656377786, + "grad_norm": 0.13212184607982635, + "learning_rate": 1.951112283478414e-05, + "loss": 0.0041, + "step": 4632 + }, + { + "epoch": 1.222398100514444, + "grad_norm": 0.09291022270917892, + "learning_rate": 1.9509950467481466e-05, + "loss": 0.0091, + "step": 4634 + }, + { + "epoch": 1.2229257353911094, + "grad_norm": 0.34990644454956055, + "learning_rate": 1.950877810017879e-05, + "loss": 0.0088, + "step": 4636 + }, + { + "epoch": 1.2234533702677748, + "grad_norm": 0.035748403519392014, + "learning_rate": 1.950760573287611e-05, + "loss": 0.0034, + "step": 4638 + }, + { + "epoch": 1.22398100514444, + "grad_norm": 0.016820790246129036, + "learning_rate": 1.9506433365573437e-05, + "loss": 0.0013, + "step": 4640 + }, + { + "epoch": 1.2245086400211054, + "grad_norm": 0.03421907499432564, + "learning_rate": 1.950526099827076e-05, + "loss": 0.0011, + "step": 4642 + }, + { + "epoch": 1.2250362748977708, + "grad_norm": 0.2629443109035492, + "learning_rate": 1.9504088630968082e-05, + "loss": 0.0111, + "step": 4644 + }, + { + "epoch": 1.225563909774436, + "grad_norm": 0.011865231208503246, + "learning_rate": 1.9502916263665408e-05, + "loss": 0.0005, + "step": 4646 + }, + { + "epoch": 1.2260915446511014, + "grad_norm": 0.38955605030059814, + "learning_rate": 1.9501743896362734e-05, + "loss": 0.0052, + "step": 4648 + }, + { + "epoch": 1.2266191795277668, + "grad_norm": 0.009797215461730957, + "learning_rate": 1.9500571529060053e-05, + "loss": 0.0019, + "step": 4650 + }, + { + "epoch": 1.2271468144044322, + "grad_norm": 0.1469116061925888, + "learning_rate": 1.949939916175738e-05, + "loss": 0.0022, + "step": 4652 + }, + { + "epoch": 1.2276744492810976, + "grad_norm": 0.369338721036911, + "learning_rate": 1.9498226794454705e-05, + "loss": 0.0086, + "step": 4654 + }, + { + "epoch": 1.2282020841577628, + "grad_norm": 0.015711989253759384, + "learning_rate": 1.9497054427152028e-05, + "loss": 0.0006, + "step": 4656 + }, + { + "epoch": 1.2287297190344282, + "grad_norm": 0.16590480506420135, + "learning_rate": 1.949588205984935e-05, + "loss": 0.0101, + "step": 4658 + }, + { + "epoch": 1.2292573539110936, + "grad_norm": 0.4206046462059021, + "learning_rate": 1.9494709692546677e-05, + "loss": 0.0103, + "step": 4660 + }, + { + "epoch": 1.2297849887877588, + "grad_norm": 0.0269597377628088, + "learning_rate": 1.9493537325244e-05, + "loss": 0.001, + "step": 4662 + }, + { + "epoch": 1.2303126236644242, + "grad_norm": 0.0655394196510315, + "learning_rate": 1.9492364957941325e-05, + "loss": 0.0008, + "step": 4664 + }, + { + "epoch": 1.2308402585410896, + "grad_norm": 0.04906943440437317, + "learning_rate": 1.9491192590638648e-05, + "loss": 0.0007, + "step": 4666 + }, + { + "epoch": 1.231367893417755, + "grad_norm": 0.1372591257095337, + "learning_rate": 1.9490020223335974e-05, + "loss": 0.0073, + "step": 4668 + }, + { + "epoch": 1.2318955282944202, + "grad_norm": 0.0492824912071228, + "learning_rate": 1.9488847856033296e-05, + "loss": 0.0008, + "step": 4670 + }, + { + "epoch": 1.2324231631710856, + "grad_norm": 0.01682964526116848, + "learning_rate": 1.9487675488730622e-05, + "loss": 0.0005, + "step": 4672 + }, + { + "epoch": 1.232950798047751, + "grad_norm": 0.47395285964012146, + "learning_rate": 1.9486503121427945e-05, + "loss": 0.0035, + "step": 4674 + }, + { + "epoch": 1.2334784329244164, + "grad_norm": 0.08078434318304062, + "learning_rate": 1.9485330754125268e-05, + "loss": 0.0096, + "step": 4676 + }, + { + "epoch": 1.2340060678010816, + "grad_norm": 0.2580515742301941, + "learning_rate": 1.9484158386822594e-05, + "loss": 0.0067, + "step": 4678 + }, + { + "epoch": 1.234533702677747, + "grad_norm": 0.006095229182392359, + "learning_rate": 1.9482986019519916e-05, + "loss": 0.0005, + "step": 4680 + }, + { + "epoch": 1.2350613375544124, + "grad_norm": 0.011479347944259644, + "learning_rate": 1.9481813652217242e-05, + "loss": 0.0027, + "step": 4682 + }, + { + "epoch": 1.2355889724310778, + "grad_norm": 0.6511725187301636, + "learning_rate": 1.9480641284914565e-05, + "loss": 0.0016, + "step": 4684 + }, + { + "epoch": 1.236116607307743, + "grad_norm": 0.44284966588020325, + "learning_rate": 1.947946891761189e-05, + "loss": 0.0026, + "step": 4686 + }, + { + "epoch": 1.2366442421844084, + "grad_norm": 0.0056256819516420364, + "learning_rate": 1.9478296550309214e-05, + "loss": 0.0009, + "step": 4688 + }, + { + "epoch": 1.2371718770610738, + "grad_norm": 0.3491106927394867, + "learning_rate": 1.9477124183006536e-05, + "loss": 0.0024, + "step": 4690 + }, + { + "epoch": 1.237699511937739, + "grad_norm": 0.004173721186816692, + "learning_rate": 1.9475951815703862e-05, + "loss": 0.0092, + "step": 4692 + }, + { + "epoch": 1.2382271468144044, + "grad_norm": 0.03360743820667267, + "learning_rate": 1.9474779448401188e-05, + "loss": 0.0006, + "step": 4694 + }, + { + "epoch": 1.2387547816910698, + "grad_norm": 0.02952989563345909, + "learning_rate": 1.9473607081098507e-05, + "loss": 0.0005, + "step": 4696 + }, + { + "epoch": 1.2392824165677352, + "grad_norm": 0.38060352206230164, + "learning_rate": 1.9472434713795833e-05, + "loss": 0.0032, + "step": 4698 + }, + { + "epoch": 1.2398100514444006, + "grad_norm": 0.18757019937038422, + "learning_rate": 1.947126234649316e-05, + "loss": 0.0112, + "step": 4700 + }, + { + "epoch": 1.2403376863210658, + "grad_norm": 0.5304321646690369, + "learning_rate": 1.9470089979190482e-05, + "loss": 0.0297, + "step": 4702 + }, + { + "epoch": 1.2408653211977312, + "grad_norm": 0.16375210881233215, + "learning_rate": 1.9468917611887805e-05, + "loss": 0.0098, + "step": 4704 + }, + { + "epoch": 1.2413929560743966, + "grad_norm": 0.04324977472424507, + "learning_rate": 1.946774524458513e-05, + "loss": 0.0014, + "step": 4706 + }, + { + "epoch": 1.2419205909510618, + "grad_norm": 0.14641128480434418, + "learning_rate": 1.9466572877282453e-05, + "loss": 0.0188, + "step": 4708 + }, + { + "epoch": 1.2424482258277272, + "grad_norm": 0.10031818598508835, + "learning_rate": 1.9465400509979776e-05, + "loss": 0.0015, + "step": 4710 + }, + { + "epoch": 1.2429758607043926, + "grad_norm": 0.047057826071977615, + "learning_rate": 1.9464228142677102e-05, + "loss": 0.0015, + "step": 4712 + }, + { + "epoch": 1.243503495581058, + "grad_norm": 0.10888471454381943, + "learning_rate": 1.9463055775374428e-05, + "loss": 0.0019, + "step": 4714 + }, + { + "epoch": 1.2440311304577232, + "grad_norm": 0.022414492443203926, + "learning_rate": 1.946188340807175e-05, + "loss": 0.0009, + "step": 4716 + }, + { + "epoch": 1.2445587653343886, + "grad_norm": 0.4452432692050934, + "learning_rate": 1.9460711040769073e-05, + "loss": 0.0012, + "step": 4718 + }, + { + "epoch": 1.245086400211054, + "grad_norm": 0.36267396807670593, + "learning_rate": 1.94595386734664e-05, + "loss": 0.005, + "step": 4720 + }, + { + "epoch": 1.2456140350877192, + "grad_norm": 0.028342435136437416, + "learning_rate": 1.945836630616372e-05, + "loss": 0.0008, + "step": 4722 + }, + { + "epoch": 1.2461416699643846, + "grad_norm": 0.01980924792587757, + "learning_rate": 1.9457193938861048e-05, + "loss": 0.0008, + "step": 4724 + }, + { + "epoch": 1.24666930484105, + "grad_norm": 0.09400346130132675, + "learning_rate": 1.945602157155837e-05, + "loss": 0.0008, + "step": 4726 + }, + { + "epoch": 1.2471969397177154, + "grad_norm": 0.09676839411258698, + "learning_rate": 1.9454849204255696e-05, + "loss": 0.0042, + "step": 4728 + }, + { + "epoch": 1.2477245745943808, + "grad_norm": 0.04999980330467224, + "learning_rate": 1.945367683695302e-05, + "loss": 0.0036, + "step": 4730 + }, + { + "epoch": 1.248252209471046, + "grad_norm": 0.30251386761665344, + "learning_rate": 1.9452504469650345e-05, + "loss": 0.0051, + "step": 4732 + }, + { + "epoch": 1.2487798443477114, + "grad_norm": 0.08462697267532349, + "learning_rate": 1.9451332102347667e-05, + "loss": 0.0086, + "step": 4734 + }, + { + "epoch": 1.2493074792243768, + "grad_norm": 0.1913071572780609, + "learning_rate": 1.945015973504499e-05, + "loss": 0.0024, + "step": 4736 + }, + { + "epoch": 1.249835114101042, + "grad_norm": 0.2722969949245453, + "learning_rate": 1.9448987367742316e-05, + "loss": 0.0036, + "step": 4738 + }, + { + "epoch": 1.2503627489777074, + "grad_norm": 0.06159104034304619, + "learning_rate": 1.9447815000439642e-05, + "loss": 0.0016, + "step": 4740 + }, + { + "epoch": 1.2508903838543728, + "grad_norm": 0.5482884645462036, + "learning_rate": 1.944664263313696e-05, + "loss": 0.0043, + "step": 4742 + }, + { + "epoch": 1.2514180187310382, + "grad_norm": 0.4108739495277405, + "learning_rate": 1.9445470265834287e-05, + "loss": 0.0066, + "step": 4744 + }, + { + "epoch": 1.2519456536077036, + "grad_norm": 0.18071553111076355, + "learning_rate": 1.9444297898531613e-05, + "loss": 0.0073, + "step": 4746 + }, + { + "epoch": 1.2524732884843688, + "grad_norm": 0.26256197690963745, + "learning_rate": 1.9443125531228936e-05, + "loss": 0.0047, + "step": 4748 + }, + { + "epoch": 1.2530009233610342, + "grad_norm": 0.04992462322115898, + "learning_rate": 1.944195316392626e-05, + "loss": 0.0041, + "step": 4750 + }, + { + "epoch": 1.2535285582376994, + "grad_norm": 0.017626818269491196, + "learning_rate": 1.9440780796623584e-05, + "loss": 0.0031, + "step": 4752 + }, + { + "epoch": 1.2540561931143648, + "grad_norm": 0.022440336644649506, + "learning_rate": 1.9439608429320907e-05, + "loss": 0.0012, + "step": 4754 + }, + { + "epoch": 1.2545838279910302, + "grad_norm": 0.028969792649149895, + "learning_rate": 1.943843606201823e-05, + "loss": 0.0009, + "step": 4756 + }, + { + "epoch": 1.2551114628676956, + "grad_norm": 0.01302251871675253, + "learning_rate": 1.9437263694715556e-05, + "loss": 0.0005, + "step": 4758 + }, + { + "epoch": 1.255639097744361, + "grad_norm": 0.010678023099899292, + "learning_rate": 1.943609132741288e-05, + "loss": 0.0039, + "step": 4760 + }, + { + "epoch": 1.2561667326210262, + "grad_norm": 0.044325415045022964, + "learning_rate": 1.9434918960110204e-05, + "loss": 0.0066, + "step": 4762 + }, + { + "epoch": 1.2566943674976916, + "grad_norm": 0.038872282952070236, + "learning_rate": 1.9433746592807527e-05, + "loss": 0.0007, + "step": 4764 + }, + { + "epoch": 1.257222002374357, + "grad_norm": 0.20847618579864502, + "learning_rate": 1.9432574225504853e-05, + "loss": 0.01, + "step": 4766 + }, + { + "epoch": 1.2577496372510222, + "grad_norm": 0.00967901386320591, + "learning_rate": 1.9431401858202176e-05, + "loss": 0.0006, + "step": 4768 + }, + { + "epoch": 1.2582772721276876, + "grad_norm": 0.14614887535572052, + "learning_rate": 1.94302294908995e-05, + "loss": 0.0084, + "step": 4770 + }, + { + "epoch": 1.258804907004353, + "grad_norm": 0.12532389163970947, + "learning_rate": 1.9429057123596824e-05, + "loss": 0.0013, + "step": 4772 + }, + { + "epoch": 1.2593325418810184, + "grad_norm": 0.04690774157643318, + "learning_rate": 1.942788475629415e-05, + "loss": 0.0012, + "step": 4774 + }, + { + "epoch": 1.2598601767576838, + "grad_norm": 0.11261089146137238, + "learning_rate": 1.9426712388991473e-05, + "loss": 0.0018, + "step": 4776 + }, + { + "epoch": 1.260387811634349, + "grad_norm": 0.04217657819390297, + "learning_rate": 1.9425540021688795e-05, + "loss": 0.0018, + "step": 4778 + }, + { + "epoch": 1.2609154465110144, + "grad_norm": 0.02432895079255104, + "learning_rate": 1.942436765438612e-05, + "loss": 0.0006, + "step": 4780 + }, + { + "epoch": 1.2614430813876798, + "grad_norm": 0.1275792121887207, + "learning_rate": 1.9423195287083444e-05, + "loss": 0.0043, + "step": 4782 + }, + { + "epoch": 1.261970716264345, + "grad_norm": 0.024477969855070114, + "learning_rate": 1.942202291978077e-05, + "loss": 0.0007, + "step": 4784 + }, + { + "epoch": 1.2624983511410104, + "grad_norm": 0.018669264391064644, + "learning_rate": 1.9420850552478093e-05, + "loss": 0.001, + "step": 4786 + }, + { + "epoch": 1.2630259860176758, + "grad_norm": 0.018440309911966324, + "learning_rate": 1.9419678185175415e-05, + "loss": 0.0181, + "step": 4788 + }, + { + "epoch": 1.2635536208943412, + "grad_norm": 0.03226732462644577, + "learning_rate": 1.941850581787274e-05, + "loss": 0.0008, + "step": 4790 + }, + { + "epoch": 1.2640812557710064, + "grad_norm": 0.014282464049756527, + "learning_rate": 1.9417333450570067e-05, + "loss": 0.0014, + "step": 4792 + }, + { + "epoch": 1.2646088906476718, + "grad_norm": 0.018436290323734283, + "learning_rate": 1.941616108326739e-05, + "loss": 0.0006, + "step": 4794 + }, + { + "epoch": 1.2651365255243372, + "grad_norm": 0.05917792394757271, + "learning_rate": 1.9414988715964712e-05, + "loss": 0.0009, + "step": 4796 + }, + { + "epoch": 1.2656641604010024, + "grad_norm": 0.0828830674290657, + "learning_rate": 1.941381634866204e-05, + "loss": 0.0007, + "step": 4798 + }, + { + "epoch": 1.2661917952776678, + "grad_norm": 0.5146016478538513, + "learning_rate": 1.941264398135936e-05, + "loss": 0.0025, + "step": 4800 + }, + { + "epoch": 1.2667194301543332, + "grad_norm": 0.010936693288385868, + "learning_rate": 1.9411471614056684e-05, + "loss": 0.0017, + "step": 4802 + }, + { + "epoch": 1.2672470650309986, + "grad_norm": 0.010953295975923538, + "learning_rate": 1.941029924675401e-05, + "loss": 0.0097, + "step": 4804 + }, + { + "epoch": 1.267774699907664, + "grad_norm": 0.4518735110759735, + "learning_rate": 1.9409126879451336e-05, + "loss": 0.0028, + "step": 4806 + }, + { + "epoch": 1.2683023347843292, + "grad_norm": 0.7513629794120789, + "learning_rate": 1.9407954512148658e-05, + "loss": 0.0087, + "step": 4808 + }, + { + "epoch": 1.2688299696609946, + "grad_norm": 0.3638790547847748, + "learning_rate": 1.940678214484598e-05, + "loss": 0.0014, + "step": 4810 + }, + { + "epoch": 1.26935760453766, + "grad_norm": 0.004035983234643936, + "learning_rate": 1.9405609777543307e-05, + "loss": 0.0005, + "step": 4812 + }, + { + "epoch": 1.2698852394143252, + "grad_norm": 0.06173601374030113, + "learning_rate": 1.940443741024063e-05, + "loss": 0.0007, + "step": 4814 + }, + { + "epoch": 1.2704128742909906, + "grad_norm": 0.03961976617574692, + "learning_rate": 1.9403265042937952e-05, + "loss": 0.0011, + "step": 4816 + }, + { + "epoch": 1.270940509167656, + "grad_norm": 0.13701103627681732, + "learning_rate": 1.9402092675635278e-05, + "loss": 0.0035, + "step": 4818 + }, + { + "epoch": 1.2714681440443214, + "grad_norm": 0.009741785004734993, + "learning_rate": 1.9400920308332604e-05, + "loss": 0.0008, + "step": 4820 + }, + { + "epoch": 1.2719957789209868, + "grad_norm": 0.30874067544937134, + "learning_rate": 1.9399747941029927e-05, + "loss": 0.004, + "step": 4822 + }, + { + "epoch": 1.272523413797652, + "grad_norm": 0.40807822346687317, + "learning_rate": 1.939857557372725e-05, + "loss": 0.0017, + "step": 4824 + }, + { + "epoch": 1.2730510486743174, + "grad_norm": 0.020976917818188667, + "learning_rate": 1.9397403206424575e-05, + "loss": 0.0016, + "step": 4826 + }, + { + "epoch": 1.2735786835509826, + "grad_norm": 0.026087122038006783, + "learning_rate": 1.9396230839121898e-05, + "loss": 0.0005, + "step": 4828 + }, + { + "epoch": 1.274106318427648, + "grad_norm": 0.006895229686051607, + "learning_rate": 1.9395058471819224e-05, + "loss": 0.0017, + "step": 4830 + }, + { + "epoch": 1.2746339533043134, + "grad_norm": 0.39805707335472107, + "learning_rate": 1.9393886104516546e-05, + "loss": 0.0046, + "step": 4832 + }, + { + "epoch": 1.2751615881809788, + "grad_norm": 0.19031864404678345, + "learning_rate": 1.939271373721387e-05, + "loss": 0.0018, + "step": 4834 + }, + { + "epoch": 1.2756892230576442, + "grad_norm": 0.18265698850154877, + "learning_rate": 1.9391541369911195e-05, + "loss": 0.0035, + "step": 4836 + }, + { + "epoch": 1.2762168579343094, + "grad_norm": 0.40202516317367554, + "learning_rate": 1.9390369002608518e-05, + "loss": 0.0087, + "step": 4838 + }, + { + "epoch": 1.2767444928109748, + "grad_norm": 0.014156774617731571, + "learning_rate": 1.9389196635305844e-05, + "loss": 0.0004, + "step": 4840 + }, + { + "epoch": 1.2772721276876402, + "grad_norm": 0.13179297745227814, + "learning_rate": 1.9388024268003166e-05, + "loss": 0.0027, + "step": 4842 + }, + { + "epoch": 1.2777997625643054, + "grad_norm": 0.37114593386650085, + "learning_rate": 1.9386851900700492e-05, + "loss": 0.0164, + "step": 4844 + }, + { + "epoch": 1.2783273974409708, + "grad_norm": 0.006744110956788063, + "learning_rate": 1.9385679533397815e-05, + "loss": 0.001, + "step": 4846 + }, + { + "epoch": 1.2788550323176362, + "grad_norm": 0.4363307058811188, + "learning_rate": 1.9384507166095137e-05, + "loss": 0.0079, + "step": 4848 + }, + { + "epoch": 1.2793826671943016, + "grad_norm": 0.1888546645641327, + "learning_rate": 1.9383334798792463e-05, + "loss": 0.0009, + "step": 4850 + }, + { + "epoch": 1.279910302070967, + "grad_norm": 0.08490395545959473, + "learning_rate": 1.938216243148979e-05, + "loss": 0.0071, + "step": 4852 + }, + { + "epoch": 1.2804379369476322, + "grad_norm": 0.47208237648010254, + "learning_rate": 1.9380990064187112e-05, + "loss": 0.0214, + "step": 4854 + }, + { + "epoch": 1.2809655718242976, + "grad_norm": 0.3146098554134369, + "learning_rate": 1.9379817696884435e-05, + "loss": 0.0021, + "step": 4856 + }, + { + "epoch": 1.281493206700963, + "grad_norm": 0.19368135929107666, + "learning_rate": 1.937864532958176e-05, + "loss": 0.0033, + "step": 4858 + }, + { + "epoch": 1.2820208415776282, + "grad_norm": 0.08768901228904724, + "learning_rate": 1.9377472962279083e-05, + "loss": 0.0015, + "step": 4860 + }, + { + "epoch": 1.2825484764542936, + "grad_norm": 0.27329254150390625, + "learning_rate": 1.9376300594976406e-05, + "loss": 0.0017, + "step": 4862 + }, + { + "epoch": 1.283076111330959, + "grad_norm": 0.03511716052889824, + "learning_rate": 1.9375128227673732e-05, + "loss": 0.0066, + "step": 4864 + }, + { + "epoch": 1.2836037462076244, + "grad_norm": 0.11148921400308609, + "learning_rate": 1.9373955860371058e-05, + "loss": 0.0008, + "step": 4866 + }, + { + "epoch": 1.2841313810842896, + "grad_norm": 0.01380465179681778, + "learning_rate": 1.937278349306838e-05, + "loss": 0.0013, + "step": 4868 + }, + { + "epoch": 1.284659015960955, + "grad_norm": 0.41046181321144104, + "learning_rate": 1.9371611125765703e-05, + "loss": 0.0128, + "step": 4870 + }, + { + "epoch": 1.2851866508376204, + "grad_norm": 0.012132758274674416, + "learning_rate": 1.937043875846303e-05, + "loss": 0.0004, + "step": 4872 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.19259263575077057, + "learning_rate": 1.9369266391160352e-05, + "loss": 0.0123, + "step": 4874 + }, + { + "epoch": 1.286241920590951, + "grad_norm": 0.39705386757850647, + "learning_rate": 1.9368094023857674e-05, + "loss": 0.0045, + "step": 4876 + }, + { + "epoch": 1.2867695554676164, + "grad_norm": 1.1260534524917603, + "learning_rate": 1.9366921656555e-05, + "loss": 0.0095, + "step": 4878 + }, + { + "epoch": 1.2872971903442818, + "grad_norm": 0.014004547148942947, + "learning_rate": 1.9365749289252326e-05, + "loss": 0.0012, + "step": 4880 + }, + { + "epoch": 1.2878248252209472, + "grad_norm": 0.013571846298873425, + "learning_rate": 1.936457692194965e-05, + "loss": 0.0088, + "step": 4882 + }, + { + "epoch": 1.2883524600976124, + "grad_norm": 0.31013497710227966, + "learning_rate": 1.936340455464697e-05, + "loss": 0.0028, + "step": 4884 + }, + { + "epoch": 1.2888800949742778, + "grad_norm": 1.3362929821014404, + "learning_rate": 1.9362232187344298e-05, + "loss": 0.0093, + "step": 4886 + }, + { + "epoch": 1.2894077298509432, + "grad_norm": 0.23813089728355408, + "learning_rate": 1.936105982004162e-05, + "loss": 0.0132, + "step": 4888 + }, + { + "epoch": 1.2899353647276084, + "grad_norm": 0.2755165994167328, + "learning_rate": 1.9359887452738946e-05, + "loss": 0.0063, + "step": 4890 + }, + { + "epoch": 1.2904629996042738, + "grad_norm": 0.11755944788455963, + "learning_rate": 1.935871508543627e-05, + "loss": 0.0014, + "step": 4892 + }, + { + "epoch": 1.2909906344809392, + "grad_norm": 0.021017085760831833, + "learning_rate": 1.935754271813359e-05, + "loss": 0.0015, + "step": 4894 + }, + { + "epoch": 1.2915182693576046, + "grad_norm": 0.018758736550807953, + "learning_rate": 1.9356370350830917e-05, + "loss": 0.0052, + "step": 4896 + }, + { + "epoch": 1.29204590423427, + "grad_norm": 0.4510372579097748, + "learning_rate": 1.935519798352824e-05, + "loss": 0.0102, + "step": 4898 + }, + { + "epoch": 1.2925735391109352, + "grad_norm": 0.036195553839206696, + "learning_rate": 1.9354025616225566e-05, + "loss": 0.0011, + "step": 4900 + }, + { + "epoch": 1.2931011739876006, + "grad_norm": 0.09039503335952759, + "learning_rate": 1.935285324892289e-05, + "loss": 0.0097, + "step": 4902 + }, + { + "epoch": 1.293628808864266, + "grad_norm": 0.017606297507882118, + "learning_rate": 1.9351680881620215e-05, + "loss": 0.0006, + "step": 4904 + }, + { + "epoch": 1.2941564437409312, + "grad_norm": 0.6053255796432495, + "learning_rate": 1.9350508514317537e-05, + "loss": 0.0049, + "step": 4906 + }, + { + "epoch": 1.2946840786175966, + "grad_norm": 0.6595610976219177, + "learning_rate": 1.934933614701486e-05, + "loss": 0.0071, + "step": 4908 + }, + { + "epoch": 1.295211713494262, + "grad_norm": 0.08770721405744553, + "learning_rate": 1.9348163779712186e-05, + "loss": 0.0013, + "step": 4910 + }, + { + "epoch": 1.2957393483709274, + "grad_norm": 0.17573341727256775, + "learning_rate": 1.9346991412409512e-05, + "loss": 0.0038, + "step": 4912 + }, + { + "epoch": 1.2962669832475926, + "grad_norm": 1.0530016422271729, + "learning_rate": 1.9345819045106834e-05, + "loss": 0.0093, + "step": 4914 + }, + { + "epoch": 1.296794618124258, + "grad_norm": 0.3746166527271271, + "learning_rate": 1.9344646677804157e-05, + "loss": 0.0038, + "step": 4916 + }, + { + "epoch": 1.2973222530009234, + "grad_norm": 0.16805846989154816, + "learning_rate": 1.9343474310501483e-05, + "loss": 0.0016, + "step": 4918 + }, + { + "epoch": 1.2978498878775886, + "grad_norm": 0.06688375771045685, + "learning_rate": 1.9342301943198806e-05, + "loss": 0.001, + "step": 4920 + }, + { + "epoch": 1.298377522754254, + "grad_norm": 0.7703543305397034, + "learning_rate": 1.9341129575896128e-05, + "loss": 0.0079, + "step": 4922 + }, + { + "epoch": 1.2989051576309194, + "grad_norm": 0.031059900298714638, + "learning_rate": 1.9339957208593454e-05, + "loss": 0.0008, + "step": 4924 + }, + { + "epoch": 1.2994327925075848, + "grad_norm": 0.012591095641255379, + "learning_rate": 1.933878484129078e-05, + "loss": 0.0006, + "step": 4926 + }, + { + "epoch": 1.2999604273842502, + "grad_norm": 0.13251452147960663, + "learning_rate": 1.93376124739881e-05, + "loss": 0.0017, + "step": 4928 + }, + { + "epoch": 1.3004880622609154, + "grad_norm": 0.08957184851169586, + "learning_rate": 1.9336440106685425e-05, + "loss": 0.0033, + "step": 4930 + }, + { + "epoch": 1.3010156971375808, + "grad_norm": 0.5410167574882507, + "learning_rate": 1.933526773938275e-05, + "loss": 0.0131, + "step": 4932 + }, + { + "epoch": 1.3015433320142462, + "grad_norm": 0.03097003512084484, + "learning_rate": 1.9334095372080074e-05, + "loss": 0.0006, + "step": 4934 + }, + { + "epoch": 1.3020709668909114, + "grad_norm": 0.08578983694314957, + "learning_rate": 1.9332923004777397e-05, + "loss": 0.0029, + "step": 4936 + }, + { + "epoch": 1.3025986017675768, + "grad_norm": 0.5100389719009399, + "learning_rate": 1.9331750637474723e-05, + "loss": 0.0144, + "step": 4938 + }, + { + "epoch": 1.3031262366442422, + "grad_norm": 0.02146683819591999, + "learning_rate": 1.9330578270172045e-05, + "loss": 0.0006, + "step": 4940 + }, + { + "epoch": 1.3036538715209076, + "grad_norm": 0.05902387946844101, + "learning_rate": 1.932940590286937e-05, + "loss": 0.0025, + "step": 4942 + }, + { + "epoch": 1.3041815063975728, + "grad_norm": 0.3526798188686371, + "learning_rate": 1.9328233535566694e-05, + "loss": 0.0013, + "step": 4944 + }, + { + "epoch": 1.3047091412742382, + "grad_norm": 0.03283384442329407, + "learning_rate": 1.932706116826402e-05, + "loss": 0.0032, + "step": 4946 + }, + { + "epoch": 1.3052367761509036, + "grad_norm": 0.03064693883061409, + "learning_rate": 1.9325888800961343e-05, + "loss": 0.0005, + "step": 4948 + }, + { + "epoch": 1.3057644110275688, + "grad_norm": 0.007316006813198328, + "learning_rate": 1.932471643365867e-05, + "loss": 0.0005, + "step": 4950 + }, + { + "epoch": 1.3062920459042342, + "grad_norm": 0.1301509439945221, + "learning_rate": 1.932354406635599e-05, + "loss": 0.0131, + "step": 4952 + }, + { + "epoch": 1.3068196807808996, + "grad_norm": 0.022498713806271553, + "learning_rate": 1.9322371699053314e-05, + "loss": 0.0093, + "step": 4954 + }, + { + "epoch": 1.307347315657565, + "grad_norm": 0.05275021493434906, + "learning_rate": 1.932119933175064e-05, + "loss": 0.012, + "step": 4956 + }, + { + "epoch": 1.3078749505342304, + "grad_norm": 0.10872893035411835, + "learning_rate": 1.9320026964447962e-05, + "loss": 0.0016, + "step": 4958 + }, + { + "epoch": 1.3084025854108956, + "grad_norm": 0.26513567566871643, + "learning_rate": 1.931885459714529e-05, + "loss": 0.0099, + "step": 4960 + }, + { + "epoch": 1.308930220287561, + "grad_norm": 0.012315474450588226, + "learning_rate": 1.931768222984261e-05, + "loss": 0.0011, + "step": 4962 + }, + { + "epoch": 1.3094578551642264, + "grad_norm": 0.0341062918305397, + "learning_rate": 1.9316509862539937e-05, + "loss": 0.0036, + "step": 4964 + }, + { + "epoch": 1.3099854900408916, + "grad_norm": 0.3517654240131378, + "learning_rate": 1.931533749523726e-05, + "loss": 0.0057, + "step": 4966 + }, + { + "epoch": 1.310513124917557, + "grad_norm": 0.30806270241737366, + "learning_rate": 1.9314165127934582e-05, + "loss": 0.0054, + "step": 4968 + }, + { + "epoch": 1.3110407597942224, + "grad_norm": 0.05088873207569122, + "learning_rate": 1.9312992760631908e-05, + "loss": 0.0038, + "step": 4970 + }, + { + "epoch": 1.3115683946708878, + "grad_norm": 0.025078793987631798, + "learning_rate": 1.9311820393329234e-05, + "loss": 0.0081, + "step": 4972 + }, + { + "epoch": 1.3120960295475532, + "grad_norm": 0.03940136730670929, + "learning_rate": 1.9310648026026553e-05, + "loss": 0.0011, + "step": 4974 + }, + { + "epoch": 1.3126236644242184, + "grad_norm": 0.03482584282755852, + "learning_rate": 1.930947565872388e-05, + "loss": 0.0037, + "step": 4976 + }, + { + "epoch": 1.3131512993008838, + "grad_norm": 0.05978667363524437, + "learning_rate": 1.9308303291421205e-05, + "loss": 0.0028, + "step": 4978 + }, + { + "epoch": 1.3136789341775492, + "grad_norm": 0.4986729919910431, + "learning_rate": 1.9307130924118528e-05, + "loss": 0.0061, + "step": 4980 + }, + { + "epoch": 1.3142065690542144, + "grad_norm": 0.23552601039409637, + "learning_rate": 1.930595855681585e-05, + "loss": 0.0024, + "step": 4982 + }, + { + "epoch": 1.3147342039308798, + "grad_norm": 0.013213904574513435, + "learning_rate": 1.9304786189513177e-05, + "loss": 0.0006, + "step": 4984 + }, + { + "epoch": 1.3152618388075452, + "grad_norm": 0.14388911426067352, + "learning_rate": 1.93036138222105e-05, + "loss": 0.0013, + "step": 4986 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.09997100383043289, + "learning_rate": 1.9302441454907822e-05, + "loss": 0.0043, + "step": 4988 + }, + { + "epoch": 1.3163171085608758, + "grad_norm": 0.08808951824903488, + "learning_rate": 1.9301269087605148e-05, + "loss": 0.0017, + "step": 4990 + }, + { + "epoch": 1.3168447434375412, + "grad_norm": 0.006911645643413067, + "learning_rate": 1.9300096720302474e-05, + "loss": 0.0006, + "step": 4992 + }, + { + "epoch": 1.3173723783142066, + "grad_norm": 0.31344467401504517, + "learning_rate": 1.9298924352999796e-05, + "loss": 0.0068, + "step": 4994 + }, + { + "epoch": 1.3179000131908718, + "grad_norm": 0.2534538805484772, + "learning_rate": 1.929775198569712e-05, + "loss": 0.0035, + "step": 4996 + }, + { + "epoch": 1.3184276480675372, + "grad_norm": 0.01373431645333767, + "learning_rate": 1.9296579618394445e-05, + "loss": 0.0019, + "step": 4998 + }, + { + "epoch": 1.3189552829442026, + "grad_norm": 0.009297431446611881, + "learning_rate": 1.9295407251091768e-05, + "loss": 0.0005, + "step": 5000 + }, + { + "epoch": 1.319482917820868, + "grad_norm": 0.3278530240058899, + "learning_rate": 1.9294234883789094e-05, + "loss": 0.0014, + "step": 5002 + }, + { + "epoch": 1.3200105526975334, + "grad_norm": 0.21866407990455627, + "learning_rate": 1.9293062516486416e-05, + "loss": 0.0011, + "step": 5004 + }, + { + "epoch": 1.3205381875741986, + "grad_norm": 0.009909108281135559, + "learning_rate": 1.9291890149183742e-05, + "loss": 0.0004, + "step": 5006 + }, + { + "epoch": 1.321065822450864, + "grad_norm": 0.006535748951137066, + "learning_rate": 1.9290717781881065e-05, + "loss": 0.0004, + "step": 5008 + }, + { + "epoch": 1.3215934573275294, + "grad_norm": 0.018728628754615784, + "learning_rate": 1.928954541457839e-05, + "loss": 0.0026, + "step": 5010 + }, + { + "epoch": 1.3221210922041946, + "grad_norm": 0.03306115046143532, + "learning_rate": 1.9288373047275713e-05, + "loss": 0.0034, + "step": 5012 + }, + { + "epoch": 1.32264872708086, + "grad_norm": 0.3944591283798218, + "learning_rate": 1.9287200679973036e-05, + "loss": 0.0137, + "step": 5014 + }, + { + "epoch": 1.3231763619575254, + "grad_norm": 0.01561917457729578, + "learning_rate": 1.9286028312670362e-05, + "loss": 0.0042, + "step": 5016 + }, + { + "epoch": 1.3237039968341908, + "grad_norm": 0.14224261045455933, + "learning_rate": 1.9284855945367685e-05, + "loss": 0.0056, + "step": 5018 + }, + { + "epoch": 1.324231631710856, + "grad_norm": 0.5248634219169617, + "learning_rate": 1.9283683578065007e-05, + "loss": 0.0152, + "step": 5020 + }, + { + "epoch": 1.3247592665875214, + "grad_norm": 0.1284434050321579, + "learning_rate": 1.9282511210762333e-05, + "loss": 0.0008, + "step": 5022 + }, + { + "epoch": 1.3252869014641868, + "grad_norm": 0.35938355326652527, + "learning_rate": 1.928133884345966e-05, + "loss": 0.0121, + "step": 5024 + }, + { + "epoch": 1.325814536340852, + "grad_norm": 0.10493459552526474, + "learning_rate": 1.9280166476156982e-05, + "loss": 0.0012, + "step": 5026 + }, + { + "epoch": 1.3263421712175174, + "grad_norm": 0.026147892698645592, + "learning_rate": 1.9278994108854305e-05, + "loss": 0.0031, + "step": 5028 + }, + { + "epoch": 1.3268698060941828, + "grad_norm": 0.024705087766051292, + "learning_rate": 1.927782174155163e-05, + "loss": 0.0005, + "step": 5030 + }, + { + "epoch": 1.3273974409708482, + "grad_norm": 0.029450498521327972, + "learning_rate": 1.9276649374248953e-05, + "loss": 0.0006, + "step": 5032 + }, + { + "epoch": 1.3279250758475136, + "grad_norm": 0.05919511988759041, + "learning_rate": 1.9275477006946276e-05, + "loss": 0.0009, + "step": 5034 + }, + { + "epoch": 1.3284527107241788, + "grad_norm": 0.16568461060523987, + "learning_rate": 1.9274304639643602e-05, + "loss": 0.0027, + "step": 5036 + }, + { + "epoch": 1.3289803456008442, + "grad_norm": 0.483662486076355, + "learning_rate": 1.9273132272340928e-05, + "loss": 0.0062, + "step": 5038 + }, + { + "epoch": 1.3295079804775096, + "grad_norm": 0.009735982865095139, + "learning_rate": 1.927195990503825e-05, + "loss": 0.0004, + "step": 5040 + }, + { + "epoch": 1.3300356153541748, + "grad_norm": 0.008577477186918259, + "learning_rate": 1.9270787537735573e-05, + "loss": 0.0004, + "step": 5042 + }, + { + "epoch": 1.3305632502308402, + "grad_norm": 0.0045388201251626015, + "learning_rate": 1.92696151704329e-05, + "loss": 0.0003, + "step": 5044 + }, + { + "epoch": 1.3310908851075056, + "grad_norm": 0.1710488200187683, + "learning_rate": 1.926844280313022e-05, + "loss": 0.0016, + "step": 5046 + }, + { + "epoch": 1.331618519984171, + "grad_norm": 0.005669235717505217, + "learning_rate": 1.9267270435827544e-05, + "loss": 0.0004, + "step": 5048 + }, + { + "epoch": 1.3321461548608364, + "grad_norm": 0.009066825732588768, + "learning_rate": 1.926609806852487e-05, + "loss": 0.0022, + "step": 5050 + }, + { + "epoch": 1.3326737897375016, + "grad_norm": 0.5445948243141174, + "learning_rate": 1.9264925701222196e-05, + "loss": 0.0034, + "step": 5052 + }, + { + "epoch": 1.333201424614167, + "grad_norm": 0.006511414889246225, + "learning_rate": 1.926375333391952e-05, + "loss": 0.0005, + "step": 5054 + }, + { + "epoch": 1.3337290594908324, + "grad_norm": 1.2733232975006104, + "learning_rate": 1.926258096661684e-05, + "loss": 0.0017, + "step": 5056 + }, + { + "epoch": 1.3342566943674976, + "grad_norm": 0.008071266114711761, + "learning_rate": 1.9261408599314167e-05, + "loss": 0.0013, + "step": 5058 + }, + { + "epoch": 1.334784329244163, + "grad_norm": 0.05180010199546814, + "learning_rate": 1.926023623201149e-05, + "loss": 0.0006, + "step": 5060 + }, + { + "epoch": 1.3353119641208284, + "grad_norm": 0.19953633844852448, + "learning_rate": 1.9259063864708816e-05, + "loss": 0.0026, + "step": 5062 + }, + { + "epoch": 1.3358395989974938, + "grad_norm": 0.3468491733074188, + "learning_rate": 1.925789149740614e-05, + "loss": 0.0042, + "step": 5064 + }, + { + "epoch": 1.336367233874159, + "grad_norm": 0.00893770344555378, + "learning_rate": 1.925671913010346e-05, + "loss": 0.0009, + "step": 5066 + }, + { + "epoch": 1.3368948687508244, + "grad_norm": 0.7014684081077576, + "learning_rate": 1.9255546762800787e-05, + "loss": 0.0036, + "step": 5068 + }, + { + "epoch": 1.3374225036274898, + "grad_norm": 0.011509379372000694, + "learning_rate": 1.9254374395498113e-05, + "loss": 0.0008, + "step": 5070 + }, + { + "epoch": 1.337950138504155, + "grad_norm": 0.2127939909696579, + "learning_rate": 1.9253202028195436e-05, + "loss": 0.0034, + "step": 5072 + }, + { + "epoch": 1.3384777733808204, + "grad_norm": 0.0034491587430238724, + "learning_rate": 1.925202966089276e-05, + "loss": 0.0003, + "step": 5074 + }, + { + "epoch": 1.3390054082574858, + "grad_norm": 0.003468960989266634, + "learning_rate": 1.9250857293590084e-05, + "loss": 0.0007, + "step": 5076 + }, + { + "epoch": 1.3395330431341512, + "grad_norm": 0.008809187449514866, + "learning_rate": 1.9249684926287407e-05, + "loss": 0.0003, + "step": 5078 + }, + { + "epoch": 1.3400606780108166, + "grad_norm": 0.2689136862754822, + "learning_rate": 1.924851255898473e-05, + "loss": 0.0061, + "step": 5080 + }, + { + "epoch": 1.3405883128874818, + "grad_norm": 0.0054505132138729095, + "learning_rate": 1.9247340191682056e-05, + "loss": 0.0064, + "step": 5082 + }, + { + "epoch": 1.3411159477641472, + "grad_norm": 0.25510239601135254, + "learning_rate": 1.924616782437938e-05, + "loss": 0.0142, + "step": 5084 + }, + { + "epoch": 1.3416435826408126, + "grad_norm": 0.02574404887855053, + "learning_rate": 1.9244995457076704e-05, + "loss": 0.0005, + "step": 5086 + }, + { + "epoch": 1.3421712175174778, + "grad_norm": 0.02211688458919525, + "learning_rate": 1.9243823089774027e-05, + "loss": 0.0045, + "step": 5088 + }, + { + "epoch": 1.3426988523941432, + "grad_norm": 0.04440069571137428, + "learning_rate": 1.9242650722471353e-05, + "loss": 0.004, + "step": 5090 + }, + { + "epoch": 1.3432264872708086, + "grad_norm": 0.027093151584267616, + "learning_rate": 1.9241478355168675e-05, + "loss": 0.0037, + "step": 5092 + }, + { + "epoch": 1.343754122147474, + "grad_norm": 0.11766494065523148, + "learning_rate": 1.9240305987865998e-05, + "loss": 0.0014, + "step": 5094 + }, + { + "epoch": 1.3442817570241394, + "grad_norm": 0.02389097400009632, + "learning_rate": 1.9239133620563324e-05, + "loss": 0.0012, + "step": 5096 + }, + { + "epoch": 1.3448093919008046, + "grad_norm": 0.3916052579879761, + "learning_rate": 1.923796125326065e-05, + "loss": 0.0066, + "step": 5098 + }, + { + "epoch": 1.34533702677747, + "grad_norm": 0.05326894298195839, + "learning_rate": 1.9236788885957973e-05, + "loss": 0.0007, + "step": 5100 + }, + { + "epoch": 1.3458646616541352, + "grad_norm": 0.03725988045334816, + "learning_rate": 1.9235616518655295e-05, + "loss": 0.0005, + "step": 5102 + }, + { + "epoch": 1.3463922965308006, + "grad_norm": 0.1004336029291153, + "learning_rate": 1.923444415135262e-05, + "loss": 0.0028, + "step": 5104 + }, + { + "epoch": 1.346919931407466, + "grad_norm": 0.04268919676542282, + "learning_rate": 1.9233271784049944e-05, + "loss": 0.0009, + "step": 5106 + }, + { + "epoch": 1.3474475662841314, + "grad_norm": 0.014321631751954556, + "learning_rate": 1.9232099416747267e-05, + "loss": 0.0006, + "step": 5108 + }, + { + "epoch": 1.3479752011607968, + "grad_norm": 0.46118903160095215, + "learning_rate": 1.9230927049444592e-05, + "loss": 0.0087, + "step": 5110 + }, + { + "epoch": 1.348502836037462, + "grad_norm": 0.024527549743652344, + "learning_rate": 1.9229754682141915e-05, + "loss": 0.0006, + "step": 5112 + }, + { + "epoch": 1.3490304709141274, + "grad_norm": 0.13622525334358215, + "learning_rate": 1.922858231483924e-05, + "loss": 0.0121, + "step": 5114 + }, + { + "epoch": 1.3495581057907928, + "grad_norm": 0.038575202226638794, + "learning_rate": 1.9227409947536564e-05, + "loss": 0.0009, + "step": 5116 + }, + { + "epoch": 1.350085740667458, + "grad_norm": 0.2541308104991913, + "learning_rate": 1.922623758023389e-05, + "loss": 0.0054, + "step": 5118 + }, + { + "epoch": 1.3506133755441234, + "grad_norm": 0.011619414202868938, + "learning_rate": 1.9225065212931212e-05, + "loss": 0.0011, + "step": 5120 + }, + { + "epoch": 1.3511410104207888, + "grad_norm": 0.01753954216837883, + "learning_rate": 1.922389284562854e-05, + "loss": 0.012, + "step": 5122 + }, + { + "epoch": 1.3516686452974542, + "grad_norm": 0.014991599135100842, + "learning_rate": 1.922272047832586e-05, + "loss": 0.0005, + "step": 5124 + }, + { + "epoch": 1.3521962801741196, + "grad_norm": 0.04039749875664711, + "learning_rate": 1.9221548111023184e-05, + "loss": 0.0007, + "step": 5126 + }, + { + "epoch": 1.3527239150507848, + "grad_norm": 0.043591368943452835, + "learning_rate": 1.922037574372051e-05, + "loss": 0.0037, + "step": 5128 + }, + { + "epoch": 1.3532515499274502, + "grad_norm": 0.0648285374045372, + "learning_rate": 1.9219203376417836e-05, + "loss": 0.0008, + "step": 5130 + }, + { + "epoch": 1.3537791848041156, + "grad_norm": 0.023104269057512283, + "learning_rate": 1.9218031009115158e-05, + "loss": 0.0023, + "step": 5132 + }, + { + "epoch": 1.3543068196807808, + "grad_norm": 0.04293006658554077, + "learning_rate": 1.921685864181248e-05, + "loss": 0.004, + "step": 5134 + }, + { + "epoch": 1.3548344545574462, + "grad_norm": 0.06763554364442825, + "learning_rate": 1.9215686274509807e-05, + "loss": 0.0008, + "step": 5136 + }, + { + "epoch": 1.3553620894341116, + "grad_norm": 0.040881089866161346, + "learning_rate": 1.921451390720713e-05, + "loss": 0.0005, + "step": 5138 + }, + { + "epoch": 1.355889724310777, + "grad_norm": 0.05510951206088066, + "learning_rate": 1.9213341539904452e-05, + "loss": 0.0005, + "step": 5140 + }, + { + "epoch": 1.3564173591874422, + "grad_norm": 0.4278220236301422, + "learning_rate": 1.9212169172601778e-05, + "loss": 0.0024, + "step": 5142 + }, + { + "epoch": 1.3569449940641076, + "grad_norm": 0.055364761501550674, + "learning_rate": 1.9210996805299104e-05, + "loss": 0.0013, + "step": 5144 + }, + { + "epoch": 1.357472628940773, + "grad_norm": 0.007752491161227226, + "learning_rate": 1.9209824437996427e-05, + "loss": 0.0003, + "step": 5146 + }, + { + "epoch": 1.3580002638174382, + "grad_norm": 0.0053154947236180305, + "learning_rate": 1.920865207069375e-05, + "loss": 0.002, + "step": 5148 + }, + { + "epoch": 1.3585278986941036, + "grad_norm": 0.2353431135416031, + "learning_rate": 1.9207479703391075e-05, + "loss": 0.005, + "step": 5150 + }, + { + "epoch": 1.359055533570769, + "grad_norm": 0.010713987052440643, + "learning_rate": 1.9206307336088398e-05, + "loss": 0.0004, + "step": 5152 + }, + { + "epoch": 1.3595831684474344, + "grad_norm": 0.34868526458740234, + "learning_rate": 1.920513496878572e-05, + "loss": 0.0019, + "step": 5154 + }, + { + "epoch": 1.3601108033240998, + "grad_norm": 0.00840698555111885, + "learning_rate": 1.9203962601483046e-05, + "loss": 0.0005, + "step": 5156 + }, + { + "epoch": 1.360638438200765, + "grad_norm": 0.009555659256875515, + "learning_rate": 1.920279023418037e-05, + "loss": 0.0042, + "step": 5158 + }, + { + "epoch": 1.3611660730774304, + "grad_norm": 0.0407467819750309, + "learning_rate": 1.9201617866877695e-05, + "loss": 0.001, + "step": 5160 + }, + { + "epoch": 1.3616937079540958, + "grad_norm": 0.23059286177158356, + "learning_rate": 1.9200445499575018e-05, + "loss": 0.0009, + "step": 5162 + }, + { + "epoch": 1.362221342830761, + "grad_norm": 0.007577048614621162, + "learning_rate": 1.9199273132272344e-05, + "loss": 0.004, + "step": 5164 + }, + { + "epoch": 1.3627489777074264, + "grad_norm": 0.00931719969958067, + "learning_rate": 1.9198100764969666e-05, + "loss": 0.0003, + "step": 5166 + }, + { + "epoch": 1.3632766125840918, + "grad_norm": 0.7442792057991028, + "learning_rate": 1.919692839766699e-05, + "loss": 0.0094, + "step": 5168 + }, + { + "epoch": 1.3638042474607572, + "grad_norm": 0.005360884591937065, + "learning_rate": 1.9195756030364315e-05, + "loss": 0.0213, + "step": 5170 + }, + { + "epoch": 1.3643318823374226, + "grad_norm": 0.3670794665813446, + "learning_rate": 1.9194583663061637e-05, + "loss": 0.0041, + "step": 5172 + }, + { + "epoch": 1.3648595172140878, + "grad_norm": 1.0198557376861572, + "learning_rate": 1.9193411295758963e-05, + "loss": 0.0042, + "step": 5174 + }, + { + "epoch": 1.3653871520907532, + "grad_norm": 0.00974691566079855, + "learning_rate": 1.9192238928456286e-05, + "loss": 0.0004, + "step": 5176 + }, + { + "epoch": 1.3659147869674184, + "grad_norm": 0.2430436611175537, + "learning_rate": 1.9191066561153612e-05, + "loss": 0.002, + "step": 5178 + }, + { + "epoch": 1.3664424218440838, + "grad_norm": 0.5251835584640503, + "learning_rate": 1.9189894193850935e-05, + "loss": 0.0028, + "step": 5180 + }, + { + "epoch": 1.3669700567207492, + "grad_norm": 0.16355793178081512, + "learning_rate": 1.918872182654826e-05, + "loss": 0.0069, + "step": 5182 + }, + { + "epoch": 1.3674976915974146, + "grad_norm": 0.07289214432239532, + "learning_rate": 1.9187549459245583e-05, + "loss": 0.0013, + "step": 5184 + }, + { + "epoch": 1.36802532647408, + "grad_norm": 0.07706675678491592, + "learning_rate": 1.9186377091942906e-05, + "loss": 0.0012, + "step": 5186 + }, + { + "epoch": 1.3685529613507452, + "grad_norm": 0.07186391204595566, + "learning_rate": 1.9185204724640232e-05, + "loss": 0.0013, + "step": 5188 + }, + { + "epoch": 1.3690805962274106, + "grad_norm": 0.04321333020925522, + "learning_rate": 1.9184032357337558e-05, + "loss": 0.0007, + "step": 5190 + }, + { + "epoch": 1.369608231104076, + "grad_norm": 0.44844987988471985, + "learning_rate": 1.918285999003488e-05, + "loss": 0.0105, + "step": 5192 + }, + { + "epoch": 1.3701358659807412, + "grad_norm": 0.018936634063720703, + "learning_rate": 1.9181687622732203e-05, + "loss": 0.0013, + "step": 5194 + }, + { + "epoch": 1.3706635008574066, + "grad_norm": 0.39055123925209045, + "learning_rate": 1.918051525542953e-05, + "loss": 0.0091, + "step": 5196 + }, + { + "epoch": 1.371191135734072, + "grad_norm": 0.01696842908859253, + "learning_rate": 1.9179342888126852e-05, + "loss": 0.0004, + "step": 5198 + }, + { + "epoch": 1.3717187706107374, + "grad_norm": 0.005804033018648624, + "learning_rate": 1.9178170520824174e-05, + "loss": 0.0005, + "step": 5200 + }, + { + "epoch": 1.3722464054874028, + "grad_norm": 0.14518903195858002, + "learning_rate": 1.91769981535215e-05, + "loss": 0.0097, + "step": 5202 + }, + { + "epoch": 1.372774040364068, + "grad_norm": 0.006137910299003124, + "learning_rate": 1.9175825786218826e-05, + "loss": 0.0004, + "step": 5204 + }, + { + "epoch": 1.3733016752407334, + "grad_norm": 0.009639519266784191, + "learning_rate": 1.9174653418916146e-05, + "loss": 0.0004, + "step": 5206 + }, + { + "epoch": 1.3738293101173988, + "grad_norm": 0.397595077753067, + "learning_rate": 1.917348105161347e-05, + "loss": 0.0081, + "step": 5208 + }, + { + "epoch": 1.374356944994064, + "grad_norm": 0.20612508058547974, + "learning_rate": 1.9172308684310798e-05, + "loss": 0.0031, + "step": 5210 + }, + { + "epoch": 1.3748845798707294, + "grad_norm": 0.7938282489776611, + "learning_rate": 1.917113631700812e-05, + "loss": 0.003, + "step": 5212 + }, + { + "epoch": 1.3754122147473948, + "grad_norm": 0.41988134384155273, + "learning_rate": 1.9169963949705443e-05, + "loss": 0.0045, + "step": 5214 + }, + { + "epoch": 1.3759398496240602, + "grad_norm": 0.15399272739887238, + "learning_rate": 1.916879158240277e-05, + "loss": 0.0052, + "step": 5216 + }, + { + "epoch": 1.3764674845007254, + "grad_norm": 0.34197691082954407, + "learning_rate": 1.916761921510009e-05, + "loss": 0.0087, + "step": 5218 + }, + { + "epoch": 1.3769951193773908, + "grad_norm": 0.09306208789348602, + "learning_rate": 1.9166446847797417e-05, + "loss": 0.0023, + "step": 5220 + }, + { + "epoch": 1.3775227542540562, + "grad_norm": 0.028884282335639, + "learning_rate": 1.916527448049474e-05, + "loss": 0.0093, + "step": 5222 + }, + { + "epoch": 1.3780503891307214, + "grad_norm": 0.13058091700077057, + "learning_rate": 1.9164102113192066e-05, + "loss": 0.0013, + "step": 5224 + }, + { + "epoch": 1.3785780240073868, + "grad_norm": 0.01742563769221306, + "learning_rate": 1.916292974588939e-05, + "loss": 0.002, + "step": 5226 + }, + { + "epoch": 1.3791056588840522, + "grad_norm": 0.019290631636977196, + "learning_rate": 1.916175737858671e-05, + "loss": 0.0006, + "step": 5228 + }, + { + "epoch": 1.3796332937607176, + "grad_norm": 0.024593127891421318, + "learning_rate": 1.9160585011284037e-05, + "loss": 0.0015, + "step": 5230 + }, + { + "epoch": 1.380160928637383, + "grad_norm": 0.18117815256118774, + "learning_rate": 1.915941264398136e-05, + "loss": 0.0048, + "step": 5232 + }, + { + "epoch": 1.3806885635140482, + "grad_norm": 0.08507144451141357, + "learning_rate": 1.9158240276678686e-05, + "loss": 0.0013, + "step": 5234 + }, + { + "epoch": 1.3812161983907136, + "grad_norm": 0.303389310836792, + "learning_rate": 1.915706790937601e-05, + "loss": 0.0014, + "step": 5236 + }, + { + "epoch": 1.381743833267379, + "grad_norm": 0.08746883273124695, + "learning_rate": 1.9155895542073334e-05, + "loss": 0.0008, + "step": 5238 + }, + { + "epoch": 1.3822714681440442, + "grad_norm": 0.010073178447782993, + "learning_rate": 1.9154723174770657e-05, + "loss": 0.0006, + "step": 5240 + }, + { + "epoch": 1.3827991030207096, + "grad_norm": 0.5333629250526428, + "learning_rate": 1.9153550807467983e-05, + "loss": 0.005, + "step": 5242 + }, + { + "epoch": 1.383326737897375, + "grad_norm": 0.1452150195837021, + "learning_rate": 1.9152378440165306e-05, + "loss": 0.0047, + "step": 5244 + }, + { + "epoch": 1.3838543727740404, + "grad_norm": 0.1855604201555252, + "learning_rate": 1.9151206072862628e-05, + "loss": 0.0024, + "step": 5246 + }, + { + "epoch": 1.3843820076507058, + "grad_norm": 0.014378699474036694, + "learning_rate": 1.9150033705559954e-05, + "loss": 0.007, + "step": 5248 + }, + { + "epoch": 1.384909642527371, + "grad_norm": 0.01760864444077015, + "learning_rate": 1.914886133825728e-05, + "loss": 0.0034, + "step": 5250 + }, + { + "epoch": 1.3854372774040364, + "grad_norm": 0.065128855407238, + "learning_rate": 1.91476889709546e-05, + "loss": 0.0013, + "step": 5252 + }, + { + "epoch": 1.3859649122807016, + "grad_norm": 0.08024023473262787, + "learning_rate": 1.9146516603651925e-05, + "loss": 0.0009, + "step": 5254 + }, + { + "epoch": 1.386492547157367, + "grad_norm": 0.10868772864341736, + "learning_rate": 1.914534423634925e-05, + "loss": 0.0026, + "step": 5256 + }, + { + "epoch": 1.3870201820340324, + "grad_norm": 0.46102476119995117, + "learning_rate": 1.9144171869046574e-05, + "loss": 0.0094, + "step": 5258 + }, + { + "epoch": 1.3875478169106978, + "grad_norm": 0.042071498930454254, + "learning_rate": 1.9142999501743897e-05, + "loss": 0.0006, + "step": 5260 + }, + { + "epoch": 1.3880754517873632, + "grad_norm": 0.02639266848564148, + "learning_rate": 1.9141827134441223e-05, + "loss": 0.0005, + "step": 5262 + }, + { + "epoch": 1.3886030866640284, + "grad_norm": 0.01984560675919056, + "learning_rate": 1.9140654767138545e-05, + "loss": 0.0005, + "step": 5264 + }, + { + "epoch": 1.3891307215406938, + "grad_norm": 0.17174974083900452, + "learning_rate": 1.9139482399835868e-05, + "loss": 0.0044, + "step": 5266 + }, + { + "epoch": 1.3896583564173592, + "grad_norm": 0.04167903587222099, + "learning_rate": 1.9138310032533194e-05, + "loss": 0.0014, + "step": 5268 + }, + { + "epoch": 1.3901859912940244, + "grad_norm": 0.17055293917655945, + "learning_rate": 1.913713766523052e-05, + "loss": 0.0017, + "step": 5270 + }, + { + "epoch": 1.3907136261706898, + "grad_norm": 0.08780313283205032, + "learning_rate": 1.9135965297927842e-05, + "loss": 0.0113, + "step": 5272 + }, + { + "epoch": 1.3912412610473552, + "grad_norm": 0.3959076404571533, + "learning_rate": 1.9134792930625165e-05, + "loss": 0.0042, + "step": 5274 + }, + { + "epoch": 1.3917688959240206, + "grad_norm": 0.23410718142986298, + "learning_rate": 1.913362056332249e-05, + "loss": 0.0127, + "step": 5276 + }, + { + "epoch": 1.392296530800686, + "grad_norm": 0.019314417615532875, + "learning_rate": 1.9132448196019814e-05, + "loss": 0.003, + "step": 5278 + }, + { + "epoch": 1.3928241656773512, + "grad_norm": 0.3464282751083374, + "learning_rate": 1.913127582871714e-05, + "loss": 0.0052, + "step": 5280 + }, + { + "epoch": 1.3933518005540166, + "grad_norm": 0.007188545074313879, + "learning_rate": 1.9130103461414462e-05, + "loss": 0.0036, + "step": 5282 + }, + { + "epoch": 1.393879435430682, + "grad_norm": 0.012399906292557716, + "learning_rate": 1.9128931094111788e-05, + "loss": 0.0004, + "step": 5284 + }, + { + "epoch": 1.3944070703073472, + "grad_norm": 0.009484251029789448, + "learning_rate": 1.912775872680911e-05, + "loss": 0.005, + "step": 5286 + }, + { + "epoch": 1.3949347051840126, + "grad_norm": 0.04744881018996239, + "learning_rate": 1.9126586359506434e-05, + "loss": 0.0035, + "step": 5288 + }, + { + "epoch": 1.395462340060678, + "grad_norm": 1.0681997537612915, + "learning_rate": 1.912541399220376e-05, + "loss": 0.0105, + "step": 5290 + }, + { + "epoch": 1.3959899749373434, + "grad_norm": 0.26983579993247986, + "learning_rate": 1.9124241624901082e-05, + "loss": 0.0024, + "step": 5292 + }, + { + "epoch": 1.3965176098140086, + "grad_norm": 0.15803904831409454, + "learning_rate": 1.9123069257598408e-05, + "loss": 0.0028, + "step": 5294 + }, + { + "epoch": 1.397045244690674, + "grad_norm": 0.13289304077625275, + "learning_rate": 1.912189689029573e-05, + "loss": 0.0047, + "step": 5296 + }, + { + "epoch": 1.3975728795673394, + "grad_norm": 0.032863982021808624, + "learning_rate": 1.9120724522993053e-05, + "loss": 0.0009, + "step": 5298 + }, + { + "epoch": 1.3981005144440046, + "grad_norm": 0.031412411481142044, + "learning_rate": 1.911955215569038e-05, + "loss": 0.0053, + "step": 5300 + }, + { + "epoch": 1.39862814932067, + "grad_norm": 0.7880898714065552, + "learning_rate": 1.9118379788387705e-05, + "loss": 0.0027, + "step": 5302 + }, + { + "epoch": 1.3991557841973354, + "grad_norm": 0.3537781238555908, + "learning_rate": 1.9117207421085028e-05, + "loss": 0.0036, + "step": 5304 + }, + { + "epoch": 1.3996834190740008, + "grad_norm": 0.006403452716767788, + "learning_rate": 1.911603505378235e-05, + "loss": 0.0004, + "step": 5306 + }, + { + "epoch": 1.4002110539506663, + "grad_norm": 0.31199249625205994, + "learning_rate": 1.9114862686479677e-05, + "loss": 0.0051, + "step": 5308 + }, + { + "epoch": 1.4007386888273314, + "grad_norm": 0.043244510889053345, + "learning_rate": 1.9113690319177e-05, + "loss": 0.0101, + "step": 5310 + }, + { + "epoch": 1.4012663237039968, + "grad_norm": 0.009928855113685131, + "learning_rate": 1.9112517951874322e-05, + "loss": 0.0007, + "step": 5312 + }, + { + "epoch": 1.4017939585806622, + "grad_norm": 0.4572758078575134, + "learning_rate": 1.9111345584571648e-05, + "loss": 0.0036, + "step": 5314 + }, + { + "epoch": 1.4023215934573274, + "grad_norm": 0.01749766618013382, + "learning_rate": 1.9110173217268974e-05, + "loss": 0.0005, + "step": 5316 + }, + { + "epoch": 1.4028492283339928, + "grad_norm": 0.020312823355197906, + "learning_rate": 1.9109000849966296e-05, + "loss": 0.0005, + "step": 5318 + }, + { + "epoch": 1.4033768632106582, + "grad_norm": 0.14007146656513214, + "learning_rate": 1.910782848266362e-05, + "loss": 0.0017, + "step": 5320 + }, + { + "epoch": 1.4039044980873236, + "grad_norm": 0.36081400513648987, + "learning_rate": 1.9106656115360945e-05, + "loss": 0.0061, + "step": 5322 + }, + { + "epoch": 1.404432132963989, + "grad_norm": 0.247731551527977, + "learning_rate": 1.9105483748058268e-05, + "loss": 0.0038, + "step": 5324 + }, + { + "epoch": 1.4049597678406542, + "grad_norm": 0.03252599760890007, + "learning_rate": 1.910431138075559e-05, + "loss": 0.0024, + "step": 5326 + }, + { + "epoch": 1.4054874027173196, + "grad_norm": 0.3769966959953308, + "learning_rate": 1.9103139013452916e-05, + "loss": 0.0066, + "step": 5328 + }, + { + "epoch": 1.4060150375939848, + "grad_norm": 0.0707608014345169, + "learning_rate": 1.9101966646150242e-05, + "loss": 0.0008, + "step": 5330 + }, + { + "epoch": 1.4065426724706502, + "grad_norm": 0.20177513360977173, + "learning_rate": 1.9100794278847565e-05, + "loss": 0.0087, + "step": 5332 + }, + { + "epoch": 1.4070703073473156, + "grad_norm": 0.25251907110214233, + "learning_rate": 1.9099621911544887e-05, + "loss": 0.0035, + "step": 5334 + }, + { + "epoch": 1.407597942223981, + "grad_norm": 0.13455195724964142, + "learning_rate": 1.9098449544242213e-05, + "loss": 0.0011, + "step": 5336 + }, + { + "epoch": 1.4081255771006465, + "grad_norm": 0.12193722277879715, + "learning_rate": 1.9097277176939536e-05, + "loss": 0.0013, + "step": 5338 + }, + { + "epoch": 1.4086532119773116, + "grad_norm": 0.031238576397299767, + "learning_rate": 1.9096104809636862e-05, + "loss": 0.0056, + "step": 5340 + }, + { + "epoch": 1.409180846853977, + "grad_norm": 0.19347801804542542, + "learning_rate": 1.9094932442334185e-05, + "loss": 0.0039, + "step": 5342 + }, + { + "epoch": 1.4097084817306424, + "grad_norm": 0.24438078701496124, + "learning_rate": 1.9093760075031507e-05, + "loss": 0.0026, + "step": 5344 + }, + { + "epoch": 1.4102361166073076, + "grad_norm": 0.16781306266784668, + "learning_rate": 1.9092587707728833e-05, + "loss": 0.003, + "step": 5346 + }, + { + "epoch": 1.410763751483973, + "grad_norm": 0.32990819215774536, + "learning_rate": 1.909141534042616e-05, + "loss": 0.008, + "step": 5348 + }, + { + "epoch": 1.4112913863606384, + "grad_norm": 0.15511208772659302, + "learning_rate": 1.9090242973123482e-05, + "loss": 0.0043, + "step": 5350 + }, + { + "epoch": 1.4118190212373039, + "grad_norm": 0.020693138241767883, + "learning_rate": 1.9089070605820804e-05, + "loss": 0.0015, + "step": 5352 + }, + { + "epoch": 1.4123466561139693, + "grad_norm": 0.255430668592453, + "learning_rate": 1.908789823851813e-05, + "loss": 0.0103, + "step": 5354 + }, + { + "epoch": 1.4128742909906344, + "grad_norm": 0.029810616746544838, + "learning_rate": 1.9086725871215453e-05, + "loss": 0.0007, + "step": 5356 + }, + { + "epoch": 1.4134019258672998, + "grad_norm": 0.39434653520584106, + "learning_rate": 1.9085553503912776e-05, + "loss": 0.0016, + "step": 5358 + }, + { + "epoch": 1.4139295607439653, + "grad_norm": 0.024035276845097542, + "learning_rate": 1.90843811366101e-05, + "loss": 0.0006, + "step": 5360 + }, + { + "epoch": 1.4144571956206304, + "grad_norm": 0.04563741758465767, + "learning_rate": 1.9083208769307428e-05, + "loss": 0.0097, + "step": 5362 + }, + { + "epoch": 1.4149848304972958, + "grad_norm": 0.015468639321625233, + "learning_rate": 1.908203640200475e-05, + "loss": 0.0031, + "step": 5364 + }, + { + "epoch": 1.4155124653739612, + "grad_norm": 0.032774098217487335, + "learning_rate": 1.9080864034702073e-05, + "loss": 0.001, + "step": 5366 + }, + { + "epoch": 1.4160401002506267, + "grad_norm": 0.009257463738322258, + "learning_rate": 1.90796916673994e-05, + "loss": 0.0005, + "step": 5368 + }, + { + "epoch": 1.4165677351272918, + "grad_norm": 0.033756475895643234, + "learning_rate": 1.907851930009672e-05, + "loss": 0.0005, + "step": 5370 + }, + { + "epoch": 1.4170953700039572, + "grad_norm": 0.04219432547688484, + "learning_rate": 1.9077346932794044e-05, + "loss": 0.0075, + "step": 5372 + }, + { + "epoch": 1.4176230048806227, + "grad_norm": 0.08834859728813171, + "learning_rate": 1.907617456549137e-05, + "loss": 0.0009, + "step": 5374 + }, + { + "epoch": 1.4181506397572878, + "grad_norm": 0.07056812196969986, + "learning_rate": 1.9075002198188696e-05, + "loss": 0.0019, + "step": 5376 + }, + { + "epoch": 1.4186782746339532, + "grad_norm": 0.013846034184098244, + "learning_rate": 1.907382983088602e-05, + "loss": 0.0004, + "step": 5378 + }, + { + "epoch": 1.4192059095106186, + "grad_norm": 0.01660386100411415, + "learning_rate": 1.907265746358334e-05, + "loss": 0.0004, + "step": 5380 + }, + { + "epoch": 1.419733544387284, + "grad_norm": 0.09208539873361588, + "learning_rate": 1.9071485096280667e-05, + "loss": 0.001, + "step": 5382 + }, + { + "epoch": 1.4202611792639495, + "grad_norm": 0.023244112730026245, + "learning_rate": 1.907031272897799e-05, + "loss": 0.0004, + "step": 5384 + }, + { + "epoch": 1.4207888141406146, + "grad_norm": 0.2804052531719208, + "learning_rate": 1.9069140361675313e-05, + "loss": 0.0019, + "step": 5386 + }, + { + "epoch": 1.42131644901728, + "grad_norm": 0.04289567098021507, + "learning_rate": 1.906796799437264e-05, + "loss": 0.0102, + "step": 5388 + }, + { + "epoch": 1.4218440838939455, + "grad_norm": 0.19960272312164307, + "learning_rate": 1.906679562706996e-05, + "loss": 0.0034, + "step": 5390 + }, + { + "epoch": 1.4223717187706106, + "grad_norm": 0.36826789379119873, + "learning_rate": 1.9065623259767287e-05, + "loss": 0.0122, + "step": 5392 + }, + { + "epoch": 1.422899353647276, + "grad_norm": 0.05551215261220932, + "learning_rate": 1.906445089246461e-05, + "loss": 0.0011, + "step": 5394 + }, + { + "epoch": 1.4234269885239414, + "grad_norm": 0.03763752058148384, + "learning_rate": 1.9063278525161936e-05, + "loss": 0.0007, + "step": 5396 + }, + { + "epoch": 1.4239546234006069, + "grad_norm": 0.05754663050174713, + "learning_rate": 1.906210615785926e-05, + "loss": 0.0083, + "step": 5398 + }, + { + "epoch": 1.4244822582772723, + "grad_norm": 0.15315808355808258, + "learning_rate": 1.9060933790556584e-05, + "loss": 0.0103, + "step": 5400 + }, + { + "epoch": 1.4250098931539374, + "grad_norm": 0.13212719559669495, + "learning_rate": 1.9059761423253907e-05, + "loss": 0.0017, + "step": 5402 + }, + { + "epoch": 1.4255375280306029, + "grad_norm": 0.3158784508705139, + "learning_rate": 1.905858905595123e-05, + "loss": 0.0039, + "step": 5404 + }, + { + "epoch": 1.426065162907268, + "grad_norm": 0.3631952106952667, + "learning_rate": 1.9057416688648556e-05, + "loss": 0.0043, + "step": 5406 + }, + { + "epoch": 1.4265927977839334, + "grad_norm": 0.01030691061168909, + "learning_rate": 1.905624432134588e-05, + "loss": 0.0004, + "step": 5408 + }, + { + "epoch": 1.4271204326605988, + "grad_norm": 0.010376742109656334, + "learning_rate": 1.9055071954043204e-05, + "loss": 0.0051, + "step": 5410 + }, + { + "epoch": 1.4276480675372643, + "grad_norm": 0.010181731544435024, + "learning_rate": 1.9053899586740527e-05, + "loss": 0.0087, + "step": 5412 + }, + { + "epoch": 1.4281757024139297, + "grad_norm": 0.20598697662353516, + "learning_rate": 1.9052727219437853e-05, + "loss": 0.0084, + "step": 5414 + }, + { + "epoch": 1.4287033372905948, + "grad_norm": 0.03067043237388134, + "learning_rate": 1.9051554852135175e-05, + "loss": 0.0006, + "step": 5416 + }, + { + "epoch": 1.4292309721672602, + "grad_norm": 0.009440252557396889, + "learning_rate": 1.9050382484832498e-05, + "loss": 0.0004, + "step": 5418 + }, + { + "epoch": 1.4297586070439257, + "grad_norm": 0.016593284904956818, + "learning_rate": 1.9049210117529824e-05, + "loss": 0.0005, + "step": 5420 + }, + { + "epoch": 1.4302862419205908, + "grad_norm": 0.08582646399736404, + "learning_rate": 1.904803775022715e-05, + "loss": 0.0092, + "step": 5422 + }, + { + "epoch": 1.4308138767972562, + "grad_norm": 0.2005874216556549, + "learning_rate": 1.9046865382924473e-05, + "loss": 0.0029, + "step": 5424 + }, + { + "epoch": 1.4313415116739217, + "grad_norm": 0.03094736859202385, + "learning_rate": 1.9045693015621795e-05, + "loss": 0.0034, + "step": 5426 + }, + { + "epoch": 1.431869146550587, + "grad_norm": 0.019116880372166634, + "learning_rate": 1.904452064831912e-05, + "loss": 0.0005, + "step": 5428 + }, + { + "epoch": 1.4323967814272525, + "grad_norm": 0.026820529252290726, + "learning_rate": 1.9043348281016444e-05, + "loss": 0.0012, + "step": 5430 + }, + { + "epoch": 1.4329244163039176, + "grad_norm": 0.01067369244992733, + "learning_rate": 1.9042175913713766e-05, + "loss": 0.0004, + "step": 5432 + }, + { + "epoch": 1.433452051180583, + "grad_norm": 0.022106271237134933, + "learning_rate": 1.9041003546411092e-05, + "loss": 0.0007, + "step": 5434 + }, + { + "epoch": 1.4339796860572485, + "grad_norm": 0.0517406240105629, + "learning_rate": 1.9039831179108415e-05, + "loss": 0.0005, + "step": 5436 + }, + { + "epoch": 1.4345073209339136, + "grad_norm": 0.010827853344380856, + "learning_rate": 1.903865881180574e-05, + "loss": 0.0003, + "step": 5438 + }, + { + "epoch": 1.435034955810579, + "grad_norm": 0.02475409395992756, + "learning_rate": 1.9037486444503064e-05, + "loss": 0.0004, + "step": 5440 + }, + { + "epoch": 1.4355625906872445, + "grad_norm": 0.3275138735771179, + "learning_rate": 1.903631407720039e-05, + "loss": 0.004, + "step": 5442 + }, + { + "epoch": 1.4360902255639099, + "grad_norm": 0.16859450936317444, + "learning_rate": 1.9035141709897712e-05, + "loss": 0.0137, + "step": 5444 + }, + { + "epoch": 1.436617860440575, + "grad_norm": 0.21477259695529938, + "learning_rate": 1.9033969342595035e-05, + "loss": 0.0109, + "step": 5446 + }, + { + "epoch": 1.4371454953172405, + "grad_norm": 0.019490456208586693, + "learning_rate": 1.903279697529236e-05, + "loss": 0.0004, + "step": 5448 + }, + { + "epoch": 1.4376731301939059, + "grad_norm": 0.15031567215919495, + "learning_rate": 1.9031624607989683e-05, + "loss": 0.0016, + "step": 5450 + }, + { + "epoch": 1.438200765070571, + "grad_norm": 0.693465530872345, + "learning_rate": 1.903045224068701e-05, + "loss": 0.0042, + "step": 5452 + }, + { + "epoch": 1.4387283999472364, + "grad_norm": 0.7023822665214539, + "learning_rate": 1.9029279873384332e-05, + "loss": 0.0081, + "step": 5454 + }, + { + "epoch": 1.4392560348239019, + "grad_norm": 0.030720656737685204, + "learning_rate": 1.9028107506081658e-05, + "loss": 0.0042, + "step": 5456 + }, + { + "epoch": 1.4397836697005673, + "grad_norm": 0.29723748564720154, + "learning_rate": 1.902693513877898e-05, + "loss": 0.0028, + "step": 5458 + }, + { + "epoch": 1.4403113045772327, + "grad_norm": 0.011653866618871689, + "learning_rate": 1.9025762771476307e-05, + "loss": 0.0003, + "step": 5460 + }, + { + "epoch": 1.4408389394538978, + "grad_norm": 0.9313302636146545, + "learning_rate": 1.902459040417363e-05, + "loss": 0.0055, + "step": 5462 + }, + { + "epoch": 1.4413665743305633, + "grad_norm": 0.010956582613289356, + "learning_rate": 1.9023418036870952e-05, + "loss": 0.0115, + "step": 5464 + }, + { + "epoch": 1.4418942092072287, + "grad_norm": 0.1546645164489746, + "learning_rate": 1.9022245669568278e-05, + "loss": 0.0075, + "step": 5466 + }, + { + "epoch": 1.4424218440838938, + "grad_norm": 0.006004640366882086, + "learning_rate": 1.9021073302265604e-05, + "loss": 0.0003, + "step": 5468 + }, + { + "epoch": 1.4429494789605593, + "grad_norm": 0.02607995644211769, + "learning_rate": 1.9019900934962927e-05, + "loss": 0.0016, + "step": 5470 + }, + { + "epoch": 1.4434771138372247, + "grad_norm": 0.6868783235549927, + "learning_rate": 1.901872856766025e-05, + "loss": 0.0081, + "step": 5472 + }, + { + "epoch": 1.44400474871389, + "grad_norm": 0.012324526906013489, + "learning_rate": 1.9017556200357575e-05, + "loss": 0.0007, + "step": 5474 + }, + { + "epoch": 1.4445323835905555, + "grad_norm": 0.11594768613576889, + "learning_rate": 1.9016383833054898e-05, + "loss": 0.0014, + "step": 5476 + }, + { + "epoch": 1.4450600184672207, + "grad_norm": 0.07715687155723572, + "learning_rate": 1.901521146575222e-05, + "loss": 0.0093, + "step": 5478 + }, + { + "epoch": 1.445587653343886, + "grad_norm": 0.030928894877433777, + "learning_rate": 1.9014039098449546e-05, + "loss": 0.0006, + "step": 5480 + }, + { + "epoch": 1.4461152882205512, + "grad_norm": 0.0251223873347044, + "learning_rate": 1.901286673114687e-05, + "loss": 0.0007, + "step": 5482 + }, + { + "epoch": 1.4466429230972166, + "grad_norm": 0.09245499223470688, + "learning_rate": 1.901169436384419e-05, + "loss": 0.0009, + "step": 5484 + }, + { + "epoch": 1.447170557973882, + "grad_norm": 0.03342321142554283, + "learning_rate": 1.9010521996541518e-05, + "loss": 0.0004, + "step": 5486 + }, + { + "epoch": 1.4476981928505475, + "grad_norm": 0.005240360274910927, + "learning_rate": 1.9009349629238844e-05, + "loss": 0.0032, + "step": 5488 + }, + { + "epoch": 1.4482258277272129, + "grad_norm": 0.08236131072044373, + "learning_rate": 1.9008177261936166e-05, + "loss": 0.022, + "step": 5490 + }, + { + "epoch": 1.448753462603878, + "grad_norm": 0.005273970775306225, + "learning_rate": 1.900700489463349e-05, + "loss": 0.0021, + "step": 5492 + }, + { + "epoch": 1.4492810974805435, + "grad_norm": 0.13369722664356232, + "learning_rate": 1.9005832527330815e-05, + "loss": 0.0007, + "step": 5494 + }, + { + "epoch": 1.4498087323572089, + "grad_norm": 0.015089641325175762, + "learning_rate": 1.9004660160028137e-05, + "loss": 0.0004, + "step": 5496 + }, + { + "epoch": 1.450336367233874, + "grad_norm": 0.036611080169677734, + "learning_rate": 1.9003487792725463e-05, + "loss": 0.0037, + "step": 5498 + }, + { + "epoch": 1.4508640021105395, + "grad_norm": 0.011139755137264729, + "learning_rate": 1.9002315425422786e-05, + "loss": 0.0011, + "step": 5500 + }, + { + "epoch": 1.4513916369872049, + "grad_norm": 0.21231354773044586, + "learning_rate": 1.9001143058120112e-05, + "loss": 0.002, + "step": 5502 + }, + { + "epoch": 1.4519192718638703, + "grad_norm": 0.043035950511693954, + "learning_rate": 1.8999970690817435e-05, + "loss": 0.0005, + "step": 5504 + }, + { + "epoch": 1.4524469067405357, + "grad_norm": 0.44102710485458374, + "learning_rate": 1.8998798323514757e-05, + "loss": 0.0117, + "step": 5506 + }, + { + "epoch": 1.4529745416172009, + "grad_norm": 0.10023780167102814, + "learning_rate": 1.8997625956212083e-05, + "loss": 0.0054, + "step": 5508 + }, + { + "epoch": 1.4535021764938663, + "grad_norm": 0.2718024253845215, + "learning_rate": 1.8996453588909406e-05, + "loss": 0.0017, + "step": 5510 + }, + { + "epoch": 1.4540298113705317, + "grad_norm": 0.34850195050239563, + "learning_rate": 1.8995281221606732e-05, + "loss": 0.0028, + "step": 5512 + }, + { + "epoch": 1.4545574462471969, + "grad_norm": 0.1116321012377739, + "learning_rate": 1.8994108854304054e-05, + "loss": 0.0008, + "step": 5514 + }, + { + "epoch": 1.4550850811238623, + "grad_norm": 0.34829816222190857, + "learning_rate": 1.899293648700138e-05, + "loss": 0.0084, + "step": 5516 + }, + { + "epoch": 1.4556127160005277, + "grad_norm": 0.09470077604055405, + "learning_rate": 1.8991764119698703e-05, + "loss": 0.0035, + "step": 5518 + }, + { + "epoch": 1.456140350877193, + "grad_norm": 0.019206156954169273, + "learning_rate": 1.899059175239603e-05, + "loss": 0.0043, + "step": 5520 + }, + { + "epoch": 1.4566679857538583, + "grad_norm": 0.004034935962408781, + "learning_rate": 1.898941938509335e-05, + "loss": 0.0003, + "step": 5522 + }, + { + "epoch": 1.4571956206305237, + "grad_norm": 0.1371040642261505, + "learning_rate": 1.8988247017790674e-05, + "loss": 0.0068, + "step": 5524 + }, + { + "epoch": 1.457723255507189, + "grad_norm": 0.03570571914315224, + "learning_rate": 1.8987074650488e-05, + "loss": 0.0006, + "step": 5526 + }, + { + "epoch": 1.4582508903838542, + "grad_norm": 0.1681358963251114, + "learning_rate": 1.8985902283185326e-05, + "loss": 0.0035, + "step": 5528 + }, + { + "epoch": 1.4587785252605197, + "grad_norm": 0.09154894202947617, + "learning_rate": 1.8984729915882645e-05, + "loss": 0.0008, + "step": 5530 + }, + { + "epoch": 1.459306160137185, + "grad_norm": 0.02420027181506157, + "learning_rate": 1.898355754857997e-05, + "loss": 0.0012, + "step": 5532 + }, + { + "epoch": 1.4598337950138505, + "grad_norm": 0.2531881034374237, + "learning_rate": 1.8982385181277297e-05, + "loss": 0.0011, + "step": 5534 + }, + { + "epoch": 1.4603614298905159, + "grad_norm": 0.07112879306077957, + "learning_rate": 1.898121281397462e-05, + "loss": 0.0006, + "step": 5536 + }, + { + "epoch": 1.460889064767181, + "grad_norm": 0.04752187803387642, + "learning_rate": 1.8980040446671943e-05, + "loss": 0.004, + "step": 5538 + }, + { + "epoch": 1.4614166996438465, + "grad_norm": 0.008648978546261787, + "learning_rate": 1.897886807936927e-05, + "loss": 0.0005, + "step": 5540 + }, + { + "epoch": 1.4619443345205119, + "grad_norm": 0.029379285871982574, + "learning_rate": 1.897769571206659e-05, + "loss": 0.0005, + "step": 5542 + }, + { + "epoch": 1.462471969397177, + "grad_norm": 0.6083777546882629, + "learning_rate": 1.8976523344763914e-05, + "loss": 0.015, + "step": 5544 + }, + { + "epoch": 1.4629996042738425, + "grad_norm": 0.17372579872608185, + "learning_rate": 1.897535097746124e-05, + "loss": 0.0032, + "step": 5546 + }, + { + "epoch": 1.4635272391505079, + "grad_norm": 0.016982747241854668, + "learning_rate": 1.8974178610158566e-05, + "loss": 0.0007, + "step": 5548 + }, + { + "epoch": 1.4640548740271733, + "grad_norm": 0.015794847160577774, + "learning_rate": 1.897300624285589e-05, + "loss": 0.0006, + "step": 5550 + }, + { + "epoch": 1.4645825089038387, + "grad_norm": 0.11127861589193344, + "learning_rate": 1.897183387555321e-05, + "loss": 0.0126, + "step": 5552 + }, + { + "epoch": 1.4651101437805039, + "grad_norm": 0.2920222580432892, + "learning_rate": 1.8970661508250537e-05, + "loss": 0.0018, + "step": 5554 + }, + { + "epoch": 1.4656377786571693, + "grad_norm": 0.010175485163927078, + "learning_rate": 1.896948914094786e-05, + "loss": 0.012, + "step": 5556 + }, + { + "epoch": 1.4661654135338344, + "grad_norm": 0.13616901636123657, + "learning_rate": 1.8968316773645186e-05, + "loss": 0.0022, + "step": 5558 + }, + { + "epoch": 1.4666930484104999, + "grad_norm": 0.009496335871517658, + "learning_rate": 1.896714440634251e-05, + "loss": 0.0046, + "step": 5560 + }, + { + "epoch": 1.4672206832871653, + "grad_norm": 0.07352566719055176, + "learning_rate": 1.8965972039039834e-05, + "loss": 0.0071, + "step": 5562 + }, + { + "epoch": 1.4677483181638307, + "grad_norm": 0.3441206216812134, + "learning_rate": 1.8964799671737157e-05, + "loss": 0.0065, + "step": 5564 + }, + { + "epoch": 1.468275953040496, + "grad_norm": 0.016587503254413605, + "learning_rate": 1.896362730443448e-05, + "loss": 0.0015, + "step": 5566 + }, + { + "epoch": 1.4688035879171613, + "grad_norm": 0.02011411264538765, + "learning_rate": 1.8962454937131806e-05, + "loss": 0.0006, + "step": 5568 + }, + { + "epoch": 1.4693312227938267, + "grad_norm": 0.04009369760751724, + "learning_rate": 1.8961282569829128e-05, + "loss": 0.0019, + "step": 5570 + }, + { + "epoch": 1.469858857670492, + "grad_norm": 0.04250234737992287, + "learning_rate": 1.8960110202526454e-05, + "loss": 0.0008, + "step": 5572 + }, + { + "epoch": 1.4703864925471573, + "grad_norm": 0.05308300256729126, + "learning_rate": 1.8958937835223777e-05, + "loss": 0.0006, + "step": 5574 + }, + { + "epoch": 1.4709141274238227, + "grad_norm": 0.02734431065618992, + "learning_rate": 1.89577654679211e-05, + "loss": 0.0045, + "step": 5576 + }, + { + "epoch": 1.471441762300488, + "grad_norm": 0.01663549244403839, + "learning_rate": 1.8956593100618425e-05, + "loss": 0.0018, + "step": 5578 + }, + { + "epoch": 1.4719693971771535, + "grad_norm": 0.005843727383762598, + "learning_rate": 1.895542073331575e-05, + "loss": 0.0003, + "step": 5580 + }, + { + "epoch": 1.4724970320538189, + "grad_norm": 0.10921648889780045, + "learning_rate": 1.8954248366013074e-05, + "loss": 0.0011, + "step": 5582 + }, + { + "epoch": 1.473024666930484, + "grad_norm": 0.16894614696502686, + "learning_rate": 1.8953075998710397e-05, + "loss": 0.0007, + "step": 5584 + }, + { + "epoch": 1.4735523018071495, + "grad_norm": 0.14418604969978333, + "learning_rate": 1.8951903631407723e-05, + "loss": 0.0147, + "step": 5586 + }, + { + "epoch": 1.4740799366838149, + "grad_norm": 0.011453059501945972, + "learning_rate": 1.8950731264105045e-05, + "loss": 0.0005, + "step": 5588 + }, + { + "epoch": 1.47460757156048, + "grad_norm": 0.03089250810444355, + "learning_rate": 1.8949558896802368e-05, + "loss": 0.0008, + "step": 5590 + }, + { + "epoch": 1.4751352064371455, + "grad_norm": 0.5141900777816772, + "learning_rate": 1.8948386529499694e-05, + "loss": 0.0099, + "step": 5592 + }, + { + "epoch": 1.4756628413138109, + "grad_norm": 0.007806441746652126, + "learning_rate": 1.894721416219702e-05, + "loss": 0.0004, + "step": 5594 + }, + { + "epoch": 1.4761904761904763, + "grad_norm": 0.2946335971355438, + "learning_rate": 1.8946041794894342e-05, + "loss": 0.0031, + "step": 5596 + }, + { + "epoch": 1.4767181110671415, + "grad_norm": 0.012489471584558487, + "learning_rate": 1.8944869427591665e-05, + "loss": 0.0004, + "step": 5598 + }, + { + "epoch": 1.4772457459438069, + "grad_norm": 0.35787537693977356, + "learning_rate": 1.894369706028899e-05, + "loss": 0.0028, + "step": 5600 + }, + { + "epoch": 1.4777733808204723, + "grad_norm": 0.11338352411985397, + "learning_rate": 1.8942524692986314e-05, + "loss": 0.0041, + "step": 5602 + }, + { + "epoch": 1.4783010156971375, + "grad_norm": 0.18031999468803406, + "learning_rate": 1.8941352325683636e-05, + "loss": 0.0011, + "step": 5604 + }, + { + "epoch": 1.4788286505738029, + "grad_norm": 0.007773229852318764, + "learning_rate": 1.8940179958380962e-05, + "loss": 0.002, + "step": 5606 + }, + { + "epoch": 1.4793562854504683, + "grad_norm": 0.3068021237850189, + "learning_rate": 1.8939007591078288e-05, + "loss": 0.0014, + "step": 5608 + }, + { + "epoch": 1.4798839203271337, + "grad_norm": 0.23446735739707947, + "learning_rate": 1.893783522377561e-05, + "loss": 0.0012, + "step": 5610 + }, + { + "epoch": 1.480411555203799, + "grad_norm": 0.2292475402355194, + "learning_rate": 1.8936662856472933e-05, + "loss": 0.0055, + "step": 5612 + }, + { + "epoch": 1.4809391900804643, + "grad_norm": 0.13097357749938965, + "learning_rate": 1.893549048917026e-05, + "loss": 0.0089, + "step": 5614 + }, + { + "epoch": 1.4814668249571297, + "grad_norm": 0.0037945732474327087, + "learning_rate": 1.8934318121867582e-05, + "loss": 0.0006, + "step": 5616 + }, + { + "epoch": 1.481994459833795, + "grad_norm": 0.023930413648486137, + "learning_rate": 1.8933145754564908e-05, + "loss": 0.0004, + "step": 5618 + }, + { + "epoch": 1.4825220947104603, + "grad_norm": 0.012107308022677898, + "learning_rate": 1.893197338726223e-05, + "loss": 0.0005, + "step": 5620 + }, + { + "epoch": 1.4830497295871257, + "grad_norm": 0.008919773623347282, + "learning_rate": 1.8930801019959553e-05, + "loss": 0.0003, + "step": 5622 + }, + { + "epoch": 1.483577364463791, + "grad_norm": 0.043380796909332275, + "learning_rate": 1.892962865265688e-05, + "loss": 0.0034, + "step": 5624 + }, + { + "epoch": 1.4841049993404565, + "grad_norm": 0.7829352617263794, + "learning_rate": 1.8928456285354202e-05, + "loss": 0.0033, + "step": 5626 + }, + { + "epoch": 1.4846326342171219, + "grad_norm": 0.276958703994751, + "learning_rate": 1.8927283918051528e-05, + "loss": 0.0056, + "step": 5628 + }, + { + "epoch": 1.485160269093787, + "grad_norm": 0.1890895664691925, + "learning_rate": 1.892611155074885e-05, + "loss": 0.0065, + "step": 5630 + }, + { + "epoch": 1.4856879039704525, + "grad_norm": 0.013564079999923706, + "learning_rate": 1.8924939183446177e-05, + "loss": 0.0006, + "step": 5632 + }, + { + "epoch": 1.4862155388471177, + "grad_norm": 0.2901380956172943, + "learning_rate": 1.89237668161435e-05, + "loss": 0.002, + "step": 5634 + }, + { + "epoch": 1.486743173723783, + "grad_norm": 1.1269328594207764, + "learning_rate": 1.8922594448840822e-05, + "loss": 0.0126, + "step": 5636 + }, + { + "epoch": 1.4872708086004485, + "grad_norm": 0.09038350731134415, + "learning_rate": 1.8921422081538148e-05, + "loss": 0.0022, + "step": 5638 + }, + { + "epoch": 1.4877984434771139, + "grad_norm": 0.22451190650463104, + "learning_rate": 1.8920249714235474e-05, + "loss": 0.0076, + "step": 5640 + }, + { + "epoch": 1.4883260783537793, + "grad_norm": 0.006245868280529976, + "learning_rate": 1.8919077346932796e-05, + "loss": 0.0004, + "step": 5642 + }, + { + "epoch": 1.4888537132304445, + "grad_norm": 0.29880404472351074, + "learning_rate": 1.891790497963012e-05, + "loss": 0.0071, + "step": 5644 + }, + { + "epoch": 1.4893813481071099, + "grad_norm": 0.14707712829113007, + "learning_rate": 1.8916732612327445e-05, + "loss": 0.0056, + "step": 5646 + }, + { + "epoch": 1.4899089829837753, + "grad_norm": 0.02768334187567234, + "learning_rate": 1.8915560245024768e-05, + "loss": 0.0075, + "step": 5648 + }, + { + "epoch": 1.4904366178604405, + "grad_norm": 0.5475381016731262, + "learning_rate": 1.891438787772209e-05, + "loss": 0.0148, + "step": 5650 + }, + { + "epoch": 1.4909642527371059, + "grad_norm": 0.036920033395290375, + "learning_rate": 1.8913215510419416e-05, + "loss": 0.0007, + "step": 5652 + }, + { + "epoch": 1.4914918876137713, + "grad_norm": 0.4409501850605011, + "learning_rate": 1.8912043143116742e-05, + "loss": 0.0007, + "step": 5654 + }, + { + "epoch": 1.4920195224904367, + "grad_norm": 0.032702140510082245, + "learning_rate": 1.891087077581406e-05, + "loss": 0.0008, + "step": 5656 + }, + { + "epoch": 1.492547157367102, + "grad_norm": 0.08464547246694565, + "learning_rate": 1.8909698408511387e-05, + "loss": 0.0016, + "step": 5658 + }, + { + "epoch": 1.4930747922437673, + "grad_norm": 0.01150527223944664, + "learning_rate": 1.8908526041208713e-05, + "loss": 0.0007, + "step": 5660 + }, + { + "epoch": 1.4936024271204327, + "grad_norm": 0.2066228836774826, + "learning_rate": 1.8907353673906036e-05, + "loss": 0.0023, + "step": 5662 + }, + { + "epoch": 1.494130061997098, + "grad_norm": 0.01408227439969778, + "learning_rate": 1.890618130660336e-05, + "loss": 0.0008, + "step": 5664 + }, + { + "epoch": 1.4946576968737633, + "grad_norm": 0.007458052132278681, + "learning_rate": 1.8905008939300685e-05, + "loss": 0.0004, + "step": 5666 + }, + { + "epoch": 1.4951853317504287, + "grad_norm": 0.004549333360046148, + "learning_rate": 1.8903836571998007e-05, + "loss": 0.0003, + "step": 5668 + }, + { + "epoch": 1.495712966627094, + "grad_norm": 0.033750779926776886, + "learning_rate": 1.8902664204695333e-05, + "loss": 0.0005, + "step": 5670 + }, + { + "epoch": 1.4962406015037595, + "grad_norm": 0.02826608344912529, + "learning_rate": 1.8901491837392656e-05, + "loss": 0.0018, + "step": 5672 + }, + { + "epoch": 1.4967682363804247, + "grad_norm": 0.014277038164436817, + "learning_rate": 1.8900319470089982e-05, + "loss": 0.0004, + "step": 5674 + }, + { + "epoch": 1.49729587125709, + "grad_norm": 0.10736554116010666, + "learning_rate": 1.8899147102787304e-05, + "loss": 0.0009, + "step": 5676 + }, + { + "epoch": 1.4978235061337555, + "grad_norm": 0.014047102071344852, + "learning_rate": 1.889797473548463e-05, + "loss": 0.0005, + "step": 5678 + }, + { + "epoch": 1.4983511410104207, + "grad_norm": 0.210733100771904, + "learning_rate": 1.8896802368181953e-05, + "loss": 0.0098, + "step": 5680 + }, + { + "epoch": 1.498878775887086, + "grad_norm": 0.06156830117106438, + "learning_rate": 1.8895630000879276e-05, + "loss": 0.0007, + "step": 5682 + }, + { + "epoch": 1.4994064107637515, + "grad_norm": 0.004277010913938284, + "learning_rate": 1.88944576335766e-05, + "loss": 0.0004, + "step": 5684 + }, + { + "epoch": 1.4999340456404169, + "grad_norm": 0.005853850860148668, + "learning_rate": 1.8893285266273924e-05, + "loss": 0.0005, + "step": 5686 + }, + { + "epoch": 1.5004616805170823, + "grad_norm": 0.015235447324812412, + "learning_rate": 1.889211289897125e-05, + "loss": 0.0014, + "step": 5688 + }, + { + "epoch": 1.5009893153937477, + "grad_norm": 0.013109652325510979, + "learning_rate": 1.8890940531668573e-05, + "loss": 0.0004, + "step": 5690 + }, + { + "epoch": 1.5015169502704129, + "grad_norm": 0.1913483589887619, + "learning_rate": 1.88897681643659e-05, + "loss": 0.0014, + "step": 5692 + }, + { + "epoch": 1.502044585147078, + "grad_norm": 0.6322714686393738, + "learning_rate": 1.888859579706322e-05, + "loss": 0.0088, + "step": 5694 + }, + { + "epoch": 1.5025722200237435, + "grad_norm": 0.034975092858076096, + "learning_rate": 1.8887423429760544e-05, + "loss": 0.0036, + "step": 5696 + }, + { + "epoch": 1.5030998549004089, + "grad_norm": 0.006718873046338558, + "learning_rate": 1.888625106245787e-05, + "loss": 0.0014, + "step": 5698 + }, + { + "epoch": 1.5036274897770743, + "grad_norm": 0.019820788875222206, + "learning_rate": 1.8885078695155196e-05, + "loss": 0.0128, + "step": 5700 + }, + { + "epoch": 1.5041551246537397, + "grad_norm": 0.04902283102273941, + "learning_rate": 1.8883906327852515e-05, + "loss": 0.0021, + "step": 5702 + }, + { + "epoch": 1.504682759530405, + "grad_norm": 0.05632961913943291, + "learning_rate": 1.888273396054984e-05, + "loss": 0.0066, + "step": 5704 + }, + { + "epoch": 1.5052103944070703, + "grad_norm": 0.005308329593390226, + "learning_rate": 1.8881561593247167e-05, + "loss": 0.0003, + "step": 5706 + }, + { + "epoch": 1.5057380292837357, + "grad_norm": 0.36166661977767944, + "learning_rate": 1.888038922594449e-05, + "loss": 0.0191, + "step": 5708 + }, + { + "epoch": 1.5062656641604009, + "grad_norm": 0.013291722163558006, + "learning_rate": 1.8879216858641813e-05, + "loss": 0.0072, + "step": 5710 + }, + { + "epoch": 1.5067932990370663, + "grad_norm": 0.005212400108575821, + "learning_rate": 1.887804449133914e-05, + "loss": 0.0004, + "step": 5712 + }, + { + "epoch": 1.5073209339137317, + "grad_norm": 0.011074794456362724, + "learning_rate": 1.887687212403646e-05, + "loss": 0.0006, + "step": 5714 + }, + { + "epoch": 1.507848568790397, + "grad_norm": 0.03477475792169571, + "learning_rate": 1.8875699756733784e-05, + "loss": 0.0009, + "step": 5716 + }, + { + "epoch": 1.5083762036670625, + "grad_norm": 0.010469415225088596, + "learning_rate": 1.887452738943111e-05, + "loss": 0.0007, + "step": 5718 + }, + { + "epoch": 1.508903838543728, + "grad_norm": 0.03858046978712082, + "learning_rate": 1.8873355022128436e-05, + "loss": 0.0006, + "step": 5720 + }, + { + "epoch": 1.509431473420393, + "grad_norm": 0.28303834795951843, + "learning_rate": 1.887218265482576e-05, + "loss": 0.0035, + "step": 5722 + }, + { + "epoch": 1.5099591082970585, + "grad_norm": 0.17064152657985687, + "learning_rate": 1.887101028752308e-05, + "loss": 0.0127, + "step": 5724 + }, + { + "epoch": 1.5104867431737237, + "grad_norm": 0.2660464942455292, + "learning_rate": 1.8869837920220407e-05, + "loss": 0.0082, + "step": 5726 + }, + { + "epoch": 1.511014378050389, + "grad_norm": 0.21817632019519806, + "learning_rate": 1.886866555291773e-05, + "loss": 0.007, + "step": 5728 + }, + { + "epoch": 1.5115420129270545, + "grad_norm": 0.48539072275161743, + "learning_rate": 1.8867493185615056e-05, + "loss": 0.0056, + "step": 5730 + }, + { + "epoch": 1.5120696478037199, + "grad_norm": 0.02615191973745823, + "learning_rate": 1.8866320818312378e-05, + "loss": 0.0008, + "step": 5732 + }, + { + "epoch": 1.5125972826803853, + "grad_norm": 0.009652979671955109, + "learning_rate": 1.8865148451009704e-05, + "loss": 0.0111, + "step": 5734 + }, + { + "epoch": 1.5131249175570505, + "grad_norm": 0.04219586402177811, + "learning_rate": 1.8863976083707027e-05, + "loss": 0.0015, + "step": 5736 + }, + { + "epoch": 1.5136525524337159, + "grad_norm": 0.027748236432671547, + "learning_rate": 1.8862803716404353e-05, + "loss": 0.0004, + "step": 5738 + }, + { + "epoch": 1.514180187310381, + "grad_norm": 0.01301342062652111, + "learning_rate": 1.8861631349101675e-05, + "loss": 0.012, + "step": 5740 + }, + { + "epoch": 1.5147078221870465, + "grad_norm": 0.008704913780093193, + "learning_rate": 1.8860458981798998e-05, + "loss": 0.0005, + "step": 5742 + }, + { + "epoch": 1.5152354570637119, + "grad_norm": 0.30720800161361694, + "learning_rate": 1.8859286614496324e-05, + "loss": 0.0024, + "step": 5744 + }, + { + "epoch": 1.5157630919403773, + "grad_norm": 0.6571658849716187, + "learning_rate": 1.8858114247193647e-05, + "loss": 0.0039, + "step": 5746 + }, + { + "epoch": 1.5162907268170427, + "grad_norm": 0.024840274825692177, + "learning_rate": 1.8856941879890973e-05, + "loss": 0.0006, + "step": 5748 + }, + { + "epoch": 1.516818361693708, + "grad_norm": 0.0077825975604355335, + "learning_rate": 1.8855769512588295e-05, + "loss": 0.0091, + "step": 5750 + }, + { + "epoch": 1.5173459965703733, + "grad_norm": 0.19086356461048126, + "learning_rate": 1.885459714528562e-05, + "loss": 0.0114, + "step": 5752 + }, + { + "epoch": 1.5178736314470387, + "grad_norm": 0.014176352880895138, + "learning_rate": 1.8853424777982944e-05, + "loss": 0.0104, + "step": 5754 + }, + { + "epoch": 1.5184012663237039, + "grad_norm": 0.04731947183609009, + "learning_rate": 1.8852252410680266e-05, + "loss": 0.0008, + "step": 5756 + }, + { + "epoch": 1.5189289012003693, + "grad_norm": 0.04819200932979584, + "learning_rate": 1.8851080043377592e-05, + "loss": 0.0009, + "step": 5758 + }, + { + "epoch": 1.5194565360770347, + "grad_norm": 0.08797182887792587, + "learning_rate": 1.8849907676074915e-05, + "loss": 0.0014, + "step": 5760 + }, + { + "epoch": 1.5199841709537, + "grad_norm": 0.12676826119422913, + "learning_rate": 1.8848735308772238e-05, + "loss": 0.0023, + "step": 5762 + }, + { + "epoch": 1.5205118058303655, + "grad_norm": 0.013049023225903511, + "learning_rate": 1.8847562941469564e-05, + "loss": 0.0005, + "step": 5764 + }, + { + "epoch": 1.521039440707031, + "grad_norm": 0.010793158784508705, + "learning_rate": 1.884639057416689e-05, + "loss": 0.0005, + "step": 5766 + }, + { + "epoch": 1.521567075583696, + "grad_norm": 0.2860878109931946, + "learning_rate": 1.8845218206864212e-05, + "loss": 0.0024, + "step": 5768 + }, + { + "epoch": 1.5220947104603613, + "grad_norm": 0.27011144161224365, + "learning_rate": 1.8844045839561535e-05, + "loss": 0.01, + "step": 5770 + }, + { + "epoch": 1.5226223453370267, + "grad_norm": 0.07913777977228165, + "learning_rate": 1.884287347225886e-05, + "loss": 0.0011, + "step": 5772 + }, + { + "epoch": 1.523149980213692, + "grad_norm": 0.02226688340306282, + "learning_rate": 1.8841701104956183e-05, + "loss": 0.0027, + "step": 5774 + }, + { + "epoch": 1.5236776150903575, + "grad_norm": 0.00899636559188366, + "learning_rate": 1.8840528737653506e-05, + "loss": 0.0003, + "step": 5776 + }, + { + "epoch": 1.524205249967023, + "grad_norm": 0.058686111122369766, + "learning_rate": 1.8839356370350832e-05, + "loss": 0.0018, + "step": 5778 + }, + { + "epoch": 1.5247328848436883, + "grad_norm": 0.12228047102689743, + "learning_rate": 1.8838184003048158e-05, + "loss": 0.01, + "step": 5780 + }, + { + "epoch": 1.5252605197203535, + "grad_norm": 0.00511889485642314, + "learning_rate": 1.883701163574548e-05, + "loss": 0.0024, + "step": 5782 + }, + { + "epoch": 1.5257881545970189, + "grad_norm": 0.007824452593922615, + "learning_rate": 1.8835839268442803e-05, + "loss": 0.0004, + "step": 5784 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 0.13534051179885864, + "learning_rate": 1.883466690114013e-05, + "loss": 0.0032, + "step": 5786 + }, + { + "epoch": 1.5268434243503495, + "grad_norm": 0.04476173222064972, + "learning_rate": 1.8833494533837452e-05, + "loss": 0.0005, + "step": 5788 + }, + { + "epoch": 1.5273710592270149, + "grad_norm": 0.0057015446946024895, + "learning_rate": 1.8832322166534778e-05, + "loss": 0.0013, + "step": 5790 + }, + { + "epoch": 1.5278986941036803, + "grad_norm": 0.006410823669284582, + "learning_rate": 1.88311497992321e-05, + "loss": 0.0004, + "step": 5792 + }, + { + "epoch": 1.5284263289803457, + "grad_norm": 0.005629014223814011, + "learning_rate": 1.8829977431929426e-05, + "loss": 0.0116, + "step": 5794 + }, + { + "epoch": 1.528953963857011, + "grad_norm": 0.21654051542282104, + "learning_rate": 1.882880506462675e-05, + "loss": 0.0009, + "step": 5796 + }, + { + "epoch": 1.5294815987336763, + "grad_norm": 0.17464546859264374, + "learning_rate": 1.8827632697324075e-05, + "loss": 0.0011, + "step": 5798 + }, + { + "epoch": 1.5300092336103417, + "grad_norm": 0.6457492113113403, + "learning_rate": 1.8826460330021398e-05, + "loss": 0.0104, + "step": 5800 + }, + { + "epoch": 1.5305368684870069, + "grad_norm": 0.3462197780609131, + "learning_rate": 1.882528796271872e-05, + "loss": 0.0042, + "step": 5802 + }, + { + "epoch": 1.5310645033636723, + "grad_norm": 0.6425003409385681, + "learning_rate": 1.8824115595416046e-05, + "loss": 0.0044, + "step": 5804 + }, + { + "epoch": 1.5315921382403377, + "grad_norm": 0.10116885602474213, + "learning_rate": 1.882294322811337e-05, + "loss": 0.0028, + "step": 5806 + }, + { + "epoch": 1.532119773117003, + "grad_norm": 0.11945269256830215, + "learning_rate": 1.882177086081069e-05, + "loss": 0.0129, + "step": 5808 + }, + { + "epoch": 1.5326474079936685, + "grad_norm": 0.1256973147392273, + "learning_rate": 1.8820598493508018e-05, + "loss": 0.0079, + "step": 5810 + }, + { + "epoch": 1.5331750428703337, + "grad_norm": 0.09959682077169418, + "learning_rate": 1.8819426126205344e-05, + "loss": 0.0014, + "step": 5812 + }, + { + "epoch": 1.533702677746999, + "grad_norm": 0.3884078562259674, + "learning_rate": 1.8818253758902666e-05, + "loss": 0.0065, + "step": 5814 + }, + { + "epoch": 1.5342303126236643, + "grad_norm": 0.3377397358417511, + "learning_rate": 1.881708139159999e-05, + "loss": 0.0104, + "step": 5816 + }, + { + "epoch": 1.5347579475003297, + "grad_norm": 0.09416592121124268, + "learning_rate": 1.8815909024297315e-05, + "loss": 0.0008, + "step": 5818 + }, + { + "epoch": 1.535285582376995, + "grad_norm": 0.2532274127006531, + "learning_rate": 1.8814736656994637e-05, + "loss": 0.0009, + "step": 5820 + }, + { + "epoch": 1.5358132172536605, + "grad_norm": 0.029942821711301804, + "learning_rate": 1.881356428969196e-05, + "loss": 0.0067, + "step": 5822 + }, + { + "epoch": 1.536340852130326, + "grad_norm": 0.19431860744953156, + "learning_rate": 1.8812391922389286e-05, + "loss": 0.0012, + "step": 5824 + }, + { + "epoch": 1.5368684870069913, + "grad_norm": 0.00762843806296587, + "learning_rate": 1.8811219555086612e-05, + "loss": 0.0007, + "step": 5826 + }, + { + "epoch": 1.5373961218836565, + "grad_norm": 0.3112473785877228, + "learning_rate": 1.8810047187783935e-05, + "loss": 0.0017, + "step": 5828 + }, + { + "epoch": 1.537923756760322, + "grad_norm": 0.052361417561769485, + "learning_rate": 1.8808874820481257e-05, + "loss": 0.0006, + "step": 5830 + }, + { + "epoch": 1.538451391636987, + "grad_norm": 0.21753019094467163, + "learning_rate": 1.8807702453178583e-05, + "loss": 0.0037, + "step": 5832 + }, + { + "epoch": 1.5389790265136525, + "grad_norm": 0.12100949883460999, + "learning_rate": 1.8806530085875906e-05, + "loss": 0.0083, + "step": 5834 + }, + { + "epoch": 1.5395066613903179, + "grad_norm": 0.01006680354475975, + "learning_rate": 1.880535771857323e-05, + "loss": 0.0053, + "step": 5836 + }, + { + "epoch": 1.5400342962669833, + "grad_norm": 0.2815290689468384, + "learning_rate": 1.8804185351270554e-05, + "loss": 0.0041, + "step": 5838 + }, + { + "epoch": 1.5405619311436487, + "grad_norm": 0.022390063852071762, + "learning_rate": 1.880301298396788e-05, + "loss": 0.0005, + "step": 5840 + }, + { + "epoch": 1.541089566020314, + "grad_norm": 0.22432836890220642, + "learning_rate": 1.8801840616665203e-05, + "loss": 0.0054, + "step": 5842 + }, + { + "epoch": 1.5416172008969793, + "grad_norm": 0.02922552265226841, + "learning_rate": 1.8800668249362526e-05, + "loss": 0.014, + "step": 5844 + }, + { + "epoch": 1.5421448357736445, + "grad_norm": 0.24552203714847565, + "learning_rate": 1.879949588205985e-05, + "loss": 0.007, + "step": 5846 + }, + { + "epoch": 1.5426724706503099, + "grad_norm": 0.09188135713338852, + "learning_rate": 1.8798323514757174e-05, + "loss": 0.0011, + "step": 5848 + }, + { + "epoch": 1.5432001055269753, + "grad_norm": 0.07066573947668076, + "learning_rate": 1.87971511474545e-05, + "loss": 0.0016, + "step": 5850 + }, + { + "epoch": 1.5437277404036407, + "grad_norm": 0.10649452358484268, + "learning_rate": 1.8795978780151823e-05, + "loss": 0.0042, + "step": 5852 + }, + { + "epoch": 1.544255375280306, + "grad_norm": 0.3823515474796295, + "learning_rate": 1.8794806412849145e-05, + "loss": 0.0055, + "step": 5854 + }, + { + "epoch": 1.5447830101569715, + "grad_norm": 0.12275680899620056, + "learning_rate": 1.879363404554647e-05, + "loss": 0.0052, + "step": 5856 + }, + { + "epoch": 1.5453106450336367, + "grad_norm": 0.4811203181743622, + "learning_rate": 1.8792461678243797e-05, + "loss": 0.003, + "step": 5858 + }, + { + "epoch": 1.545838279910302, + "grad_norm": 0.0725078359246254, + "learning_rate": 1.879128931094112e-05, + "loss": 0.0013, + "step": 5860 + }, + { + "epoch": 1.5463659147869673, + "grad_norm": 0.020125363022089005, + "learning_rate": 1.8790116943638443e-05, + "loss": 0.0007, + "step": 5862 + }, + { + "epoch": 1.5468935496636327, + "grad_norm": 0.07206309586763382, + "learning_rate": 1.878894457633577e-05, + "loss": 0.0012, + "step": 5864 + }, + { + "epoch": 1.547421184540298, + "grad_norm": 0.006034654565155506, + "learning_rate": 1.878777220903309e-05, + "loss": 0.007, + "step": 5866 + }, + { + "epoch": 1.5479488194169635, + "grad_norm": 0.6899508833885193, + "learning_rate": 1.8786599841730414e-05, + "loss": 0.0016, + "step": 5868 + }, + { + "epoch": 1.548476454293629, + "grad_norm": 0.023508839309215546, + "learning_rate": 1.878542747442774e-05, + "loss": 0.0004, + "step": 5870 + }, + { + "epoch": 1.5490040891702943, + "grad_norm": 0.4505625367164612, + "learning_rate": 1.8784255107125066e-05, + "loss": 0.0058, + "step": 5872 + }, + { + "epoch": 1.5495317240469595, + "grad_norm": 0.008210878819227219, + "learning_rate": 1.878308273982239e-05, + "loss": 0.003, + "step": 5874 + }, + { + "epoch": 1.550059358923625, + "grad_norm": 0.005712516605854034, + "learning_rate": 1.878191037251971e-05, + "loss": 0.0074, + "step": 5876 + }, + { + "epoch": 1.55058699380029, + "grad_norm": 0.06418018043041229, + "learning_rate": 1.8780738005217037e-05, + "loss": 0.0006, + "step": 5878 + }, + { + "epoch": 1.5511146286769555, + "grad_norm": 0.007718575652688742, + "learning_rate": 1.877956563791436e-05, + "loss": 0.0056, + "step": 5880 + }, + { + "epoch": 1.551642263553621, + "grad_norm": 0.02485973946750164, + "learning_rate": 1.8778393270611682e-05, + "loss": 0.0075, + "step": 5882 + }, + { + "epoch": 1.5521698984302863, + "grad_norm": 0.014024568721652031, + "learning_rate": 1.877722090330901e-05, + "loss": 0.0005, + "step": 5884 + }, + { + "epoch": 1.5526975333069517, + "grad_norm": 0.024446239694952965, + "learning_rate": 1.8776048536006334e-05, + "loss": 0.0009, + "step": 5886 + }, + { + "epoch": 1.5532251681836169, + "grad_norm": 0.011416632682085037, + "learning_rate": 1.8774876168703657e-05, + "loss": 0.0088, + "step": 5888 + }, + { + "epoch": 1.5537528030602823, + "grad_norm": 0.20224446058273315, + "learning_rate": 1.877370380140098e-05, + "loss": 0.0071, + "step": 5890 + }, + { + "epoch": 1.5542804379369475, + "grad_norm": 0.10025540739297867, + "learning_rate": 1.8772531434098306e-05, + "loss": 0.0009, + "step": 5892 + }, + { + "epoch": 1.5548080728136129, + "grad_norm": 0.02478892356157303, + "learning_rate": 1.8771359066795628e-05, + "loss": 0.0007, + "step": 5894 + }, + { + "epoch": 1.5553357076902783, + "grad_norm": 0.19795048236846924, + "learning_rate": 1.877018669949295e-05, + "loss": 0.0071, + "step": 5896 + }, + { + "epoch": 1.5558633425669437, + "grad_norm": 0.19339647889137268, + "learning_rate": 1.8769014332190277e-05, + "loss": 0.0014, + "step": 5898 + }, + { + "epoch": 1.556390977443609, + "grad_norm": 0.007641380187124014, + "learning_rate": 1.87678419648876e-05, + "loss": 0.0006, + "step": 5900 + }, + { + "epoch": 1.5569186123202745, + "grad_norm": 0.36557164788246155, + "learning_rate": 1.8766669597584925e-05, + "loss": 0.013, + "step": 5902 + }, + { + "epoch": 1.5574462471969397, + "grad_norm": 0.11998596787452698, + "learning_rate": 1.8765497230282248e-05, + "loss": 0.0018, + "step": 5904 + }, + { + "epoch": 1.557973882073605, + "grad_norm": 0.1477557122707367, + "learning_rate": 1.8764324862979574e-05, + "loss": 0.0097, + "step": 5906 + }, + { + "epoch": 1.5585015169502703, + "grad_norm": 0.02350522391498089, + "learning_rate": 1.8763152495676897e-05, + "loss": 0.0015, + "step": 5908 + }, + { + "epoch": 1.5590291518269357, + "grad_norm": 0.06435579806566238, + "learning_rate": 1.8761980128374223e-05, + "loss": 0.0024, + "step": 5910 + }, + { + "epoch": 1.559556786703601, + "grad_norm": 0.02967824600636959, + "learning_rate": 1.8760807761071545e-05, + "loss": 0.0021, + "step": 5912 + }, + { + "epoch": 1.5600844215802665, + "grad_norm": 0.0449422188103199, + "learning_rate": 1.8759635393768868e-05, + "loss": 0.0015, + "step": 5914 + }, + { + "epoch": 1.560612056456932, + "grad_norm": 0.013415842317044735, + "learning_rate": 1.8758463026466194e-05, + "loss": 0.001, + "step": 5916 + }, + { + "epoch": 1.5611396913335973, + "grad_norm": 0.019541151821613312, + "learning_rate": 1.875729065916352e-05, + "loss": 0.0044, + "step": 5918 + }, + { + "epoch": 1.5616673262102625, + "grad_norm": 0.05811833217740059, + "learning_rate": 1.8756118291860842e-05, + "loss": 0.0008, + "step": 5920 + }, + { + "epoch": 1.5621949610869277, + "grad_norm": 0.007999199442565441, + "learning_rate": 1.8754945924558165e-05, + "loss": 0.0061, + "step": 5922 + }, + { + "epoch": 1.562722595963593, + "grad_norm": 0.07043740153312683, + "learning_rate": 1.875377355725549e-05, + "loss": 0.0006, + "step": 5924 + }, + { + "epoch": 1.5632502308402585, + "grad_norm": 0.003592312103137374, + "learning_rate": 1.8752601189952814e-05, + "loss": 0.0118, + "step": 5926 + }, + { + "epoch": 1.563777865716924, + "grad_norm": 0.043701253831386566, + "learning_rate": 1.8751428822650136e-05, + "loss": 0.003, + "step": 5928 + }, + { + "epoch": 1.5643055005935893, + "grad_norm": 0.13176877796649933, + "learning_rate": 1.8750256455347462e-05, + "loss": 0.0079, + "step": 5930 + }, + { + "epoch": 1.5648331354702547, + "grad_norm": 0.3681084215641022, + "learning_rate": 1.8749084088044788e-05, + "loss": 0.0011, + "step": 5932 + }, + { + "epoch": 1.56536077034692, + "grad_norm": 0.01108029019087553, + "learning_rate": 1.8747911720742107e-05, + "loss": 0.0008, + "step": 5934 + }, + { + "epoch": 1.5658884052235853, + "grad_norm": 0.04626883938908577, + "learning_rate": 1.8746739353439433e-05, + "loss": 0.004, + "step": 5936 + }, + { + "epoch": 1.5664160401002505, + "grad_norm": 0.3085012137889862, + "learning_rate": 1.874556698613676e-05, + "loss": 0.0017, + "step": 5938 + }, + { + "epoch": 1.566943674976916, + "grad_norm": 0.054276738315820694, + "learning_rate": 1.8744394618834082e-05, + "loss": 0.0006, + "step": 5940 + }, + { + "epoch": 1.5674713098535813, + "grad_norm": 0.14644920825958252, + "learning_rate": 1.8743222251531405e-05, + "loss": 0.0053, + "step": 5942 + }, + { + "epoch": 1.5679989447302467, + "grad_norm": 0.04957154020667076, + "learning_rate": 1.874204988422873e-05, + "loss": 0.0063, + "step": 5944 + }, + { + "epoch": 1.568526579606912, + "grad_norm": 0.12779662013053894, + "learning_rate": 1.8740877516926053e-05, + "loss": 0.007, + "step": 5946 + }, + { + "epoch": 1.5690542144835775, + "grad_norm": 0.15178947150707245, + "learning_rate": 1.873970514962338e-05, + "loss": 0.0009, + "step": 5948 + }, + { + "epoch": 1.5695818493602427, + "grad_norm": 0.3528413772583008, + "learning_rate": 1.8738532782320702e-05, + "loss": 0.0048, + "step": 5950 + }, + { + "epoch": 1.570109484236908, + "grad_norm": 0.027547135949134827, + "learning_rate": 1.8737360415018028e-05, + "loss": 0.0029, + "step": 5952 + }, + { + "epoch": 1.5706371191135733, + "grad_norm": 0.006561142858117819, + "learning_rate": 1.873618804771535e-05, + "loss": 0.0004, + "step": 5954 + }, + { + "epoch": 1.5711647539902387, + "grad_norm": 7.106542587280273, + "learning_rate": 1.8735015680412676e-05, + "loss": 0.0087, + "step": 5956 + }, + { + "epoch": 1.571692388866904, + "grad_norm": 0.003567843232303858, + "learning_rate": 1.873384331311e-05, + "loss": 0.0008, + "step": 5958 + }, + { + "epoch": 1.5722200237435695, + "grad_norm": 0.25470900535583496, + "learning_rate": 1.873267094580732e-05, + "loss": 0.0109, + "step": 5960 + }, + { + "epoch": 1.572747658620235, + "grad_norm": 0.15623876452445984, + "learning_rate": 1.8731498578504648e-05, + "loss": 0.0065, + "step": 5962 + }, + { + "epoch": 1.5732752934969, + "grad_norm": 0.7987511157989502, + "learning_rate": 1.873032621120197e-05, + "loss": 0.0109, + "step": 5964 + }, + { + "epoch": 1.5738029283735655, + "grad_norm": 0.242361918091774, + "learning_rate": 1.8729153843899296e-05, + "loss": 0.0029, + "step": 5966 + }, + { + "epoch": 1.5743305632502307, + "grad_norm": 0.1501699984073639, + "learning_rate": 1.872798147659662e-05, + "loss": 0.0017, + "step": 5968 + }, + { + "epoch": 1.574858198126896, + "grad_norm": 0.32646259665489197, + "learning_rate": 1.8726809109293945e-05, + "loss": 0.021, + "step": 5970 + }, + { + "epoch": 1.5753858330035615, + "grad_norm": 0.3816893398761749, + "learning_rate": 1.8725636741991268e-05, + "loss": 0.0066, + "step": 5972 + }, + { + "epoch": 1.575913467880227, + "grad_norm": 0.09845875948667526, + "learning_rate": 1.872446437468859e-05, + "loss": 0.0012, + "step": 5974 + }, + { + "epoch": 1.5764411027568923, + "grad_norm": 0.012453741393983364, + "learning_rate": 1.8723292007385916e-05, + "loss": 0.0009, + "step": 5976 + }, + { + "epoch": 1.5769687376335577, + "grad_norm": 0.05013902485370636, + "learning_rate": 1.8722119640083242e-05, + "loss": 0.0049, + "step": 5978 + }, + { + "epoch": 1.577496372510223, + "grad_norm": 0.1773402839899063, + "learning_rate": 1.872094727278056e-05, + "loss": 0.011, + "step": 5980 + }, + { + "epoch": 1.5780240073868883, + "grad_norm": 0.021532800048589706, + "learning_rate": 1.8719774905477887e-05, + "loss": 0.0011, + "step": 5982 + }, + { + "epoch": 1.5785516422635535, + "grad_norm": 0.09052951633930206, + "learning_rate": 1.8718602538175213e-05, + "loss": 0.0008, + "step": 5984 + }, + { + "epoch": 1.579079277140219, + "grad_norm": 0.11292821913957596, + "learning_rate": 1.8717430170872536e-05, + "loss": 0.0021, + "step": 5986 + }, + { + "epoch": 1.5796069120168843, + "grad_norm": 0.03565918654203415, + "learning_rate": 1.871625780356986e-05, + "loss": 0.0006, + "step": 5988 + }, + { + "epoch": 1.5801345468935497, + "grad_norm": 0.009846494533121586, + "learning_rate": 1.8715085436267185e-05, + "loss": 0.0047, + "step": 5990 + }, + { + "epoch": 1.5806621817702151, + "grad_norm": 0.07288189232349396, + "learning_rate": 1.8713913068964507e-05, + "loss": 0.001, + "step": 5992 + }, + { + "epoch": 1.5811898166468805, + "grad_norm": 0.08560064435005188, + "learning_rate": 1.871274070166183e-05, + "loss": 0.0015, + "step": 5994 + }, + { + "epoch": 1.5817174515235457, + "grad_norm": 0.011911117471754551, + "learning_rate": 1.8711568334359156e-05, + "loss": 0.0005, + "step": 5996 + }, + { + "epoch": 1.582245086400211, + "grad_norm": 0.8961425423622131, + "learning_rate": 1.8710395967056482e-05, + "loss": 0.0036, + "step": 5998 + }, + { + "epoch": 1.5827727212768763, + "grad_norm": 0.010418999008834362, + "learning_rate": 1.8709223599753804e-05, + "loss": 0.001, + "step": 6000 + }, + { + "epoch": 1.5827727212768763, + "eval_loss": 0.002941887127235532, + "eval_runtime": 308.7262, + "eval_samples_per_second": 698.486, + "eval_steps_per_second": 87.314, + "step": 6000 + }, + { + "epoch": 1.5833003561535417, + "grad_norm": 0.5355306267738342, + "learning_rate": 1.8708051232451127e-05, + "loss": 0.0083, + "step": 6002 + }, + { + "epoch": 1.583827991030207, + "grad_norm": 0.15206676721572876, + "learning_rate": 1.8706878865148453e-05, + "loss": 0.0091, + "step": 6004 + }, + { + "epoch": 1.5843556259068725, + "grad_norm": 0.11292139440774918, + "learning_rate": 1.8705706497845776e-05, + "loss": 0.001, + "step": 6006 + }, + { + "epoch": 1.584883260783538, + "grad_norm": 0.10189344733953476, + "learning_rate": 1.87045341305431e-05, + "loss": 0.0041, + "step": 6008 + }, + { + "epoch": 1.585410895660203, + "grad_norm": 0.3290845453739166, + "learning_rate": 1.8703361763240424e-05, + "loss": 0.0038, + "step": 6010 + }, + { + "epoch": 1.5859385305368685, + "grad_norm": 0.16632696986198425, + "learning_rate": 1.870218939593775e-05, + "loss": 0.0037, + "step": 6012 + }, + { + "epoch": 1.5864661654135337, + "grad_norm": 0.1144455224275589, + "learning_rate": 1.8701017028635073e-05, + "loss": 0.0009, + "step": 6014 + }, + { + "epoch": 1.586993800290199, + "grad_norm": 0.01789017952978611, + "learning_rate": 1.86998446613324e-05, + "loss": 0.0005, + "step": 6016 + }, + { + "epoch": 1.5875214351668645, + "grad_norm": 0.009047349914908409, + "learning_rate": 1.869867229402972e-05, + "loss": 0.0008, + "step": 6018 + }, + { + "epoch": 1.58804907004353, + "grad_norm": 0.20084737241268158, + "learning_rate": 1.8697499926727044e-05, + "loss": 0.0119, + "step": 6020 + }, + { + "epoch": 1.5885767049201953, + "grad_norm": 0.18556097149848938, + "learning_rate": 1.869632755942437e-05, + "loss": 0.0027, + "step": 6022 + }, + { + "epoch": 1.5891043397968607, + "grad_norm": 0.07017727196216583, + "learning_rate": 1.8695155192121693e-05, + "loss": 0.0005, + "step": 6024 + }, + { + "epoch": 1.589631974673526, + "grad_norm": 0.032345011830329895, + "learning_rate": 1.8693982824819015e-05, + "loss": 0.0007, + "step": 6026 + }, + { + "epoch": 1.5901596095501913, + "grad_norm": 0.39468827843666077, + "learning_rate": 1.869281045751634e-05, + "loss": 0.0049, + "step": 6028 + }, + { + "epoch": 1.5906872444268565, + "grad_norm": 0.005316456779837608, + "learning_rate": 1.8691638090213667e-05, + "loss": 0.0005, + "step": 6030 + }, + { + "epoch": 1.591214879303522, + "grad_norm": 0.126144140958786, + "learning_rate": 1.869046572291099e-05, + "loss": 0.0016, + "step": 6032 + }, + { + "epoch": 1.5917425141801873, + "grad_norm": 0.051325082778930664, + "learning_rate": 1.8689293355608312e-05, + "loss": 0.001, + "step": 6034 + }, + { + "epoch": 1.5922701490568527, + "grad_norm": 0.01679389365017414, + "learning_rate": 1.868812098830564e-05, + "loss": 0.0006, + "step": 6036 + }, + { + "epoch": 1.5927977839335181, + "grad_norm": 0.22549816966056824, + "learning_rate": 1.868694862100296e-05, + "loss": 0.0097, + "step": 6038 + }, + { + "epoch": 1.5933254188101833, + "grad_norm": 0.09191488474607468, + "learning_rate": 1.8685776253700284e-05, + "loss": 0.0007, + "step": 6040 + }, + { + "epoch": 1.5938530536868487, + "grad_norm": 0.18129055202007294, + "learning_rate": 1.868460388639761e-05, + "loss": 0.002, + "step": 6042 + }, + { + "epoch": 1.594380688563514, + "grad_norm": 0.275145947933197, + "learning_rate": 1.8683431519094936e-05, + "loss": 0.0067, + "step": 6044 + }, + { + "epoch": 1.5949083234401793, + "grad_norm": 1.610514760017395, + "learning_rate": 1.8682259151792258e-05, + "loss": 0.0047, + "step": 6046 + }, + { + "epoch": 1.5954359583168447, + "grad_norm": 0.204642653465271, + "learning_rate": 1.868108678448958e-05, + "loss": 0.0061, + "step": 6048 + }, + { + "epoch": 1.59596359319351, + "grad_norm": 0.05179504305124283, + "learning_rate": 1.8679914417186907e-05, + "loss": 0.0006, + "step": 6050 + }, + { + "epoch": 1.5964912280701755, + "grad_norm": 1.7167541980743408, + "learning_rate": 1.867874204988423e-05, + "loss": 0.0048, + "step": 6052 + }, + { + "epoch": 1.597018862946841, + "grad_norm": 0.05379081517457962, + "learning_rate": 1.8677569682581552e-05, + "loss": 0.0032, + "step": 6054 + }, + { + "epoch": 1.597546497823506, + "grad_norm": 0.05557811260223389, + "learning_rate": 1.8676397315278878e-05, + "loss": 0.0014, + "step": 6056 + }, + { + "epoch": 1.5980741327001715, + "grad_norm": 0.018179122358560562, + "learning_rate": 1.8675224947976204e-05, + "loss": 0.0004, + "step": 6058 + }, + { + "epoch": 1.5986017675768367, + "grad_norm": 0.013257515616714954, + "learning_rate": 1.8674052580673527e-05, + "loss": 0.0142, + "step": 6060 + }, + { + "epoch": 1.599129402453502, + "grad_norm": 0.009112194180488586, + "learning_rate": 1.867288021337085e-05, + "loss": 0.0028, + "step": 6062 + }, + { + "epoch": 1.5996570373301675, + "grad_norm": 0.008557685650885105, + "learning_rate": 1.8671707846068175e-05, + "loss": 0.0026, + "step": 6064 + }, + { + "epoch": 1.600184672206833, + "grad_norm": 0.013301791623234749, + "learning_rate": 1.8670535478765498e-05, + "loss": 0.0041, + "step": 6066 + }, + { + "epoch": 1.6007123070834983, + "grad_norm": 0.009915720671415329, + "learning_rate": 1.8669363111462824e-05, + "loss": 0.0015, + "step": 6068 + }, + { + "epoch": 1.6012399419601637, + "grad_norm": 0.6671117544174194, + "learning_rate": 1.8668190744160147e-05, + "loss": 0.0026, + "step": 6070 + }, + { + "epoch": 1.601767576836829, + "grad_norm": 0.005996378138661385, + "learning_rate": 1.8667018376857473e-05, + "loss": 0.001, + "step": 6072 + }, + { + "epoch": 1.6022952117134943, + "grad_norm": 0.012032533064484596, + "learning_rate": 1.8665846009554795e-05, + "loss": 0.0027, + "step": 6074 + }, + { + "epoch": 1.6028228465901595, + "grad_norm": 0.01343823503702879, + "learning_rate": 1.866467364225212e-05, + "loss": 0.0067, + "step": 6076 + }, + { + "epoch": 1.603350481466825, + "grad_norm": 0.05256224423646927, + "learning_rate": 1.8663501274949444e-05, + "loss": 0.0064, + "step": 6078 + }, + { + "epoch": 1.6038781163434903, + "grad_norm": 0.18842105567455292, + "learning_rate": 1.8662328907646766e-05, + "loss": 0.0021, + "step": 6080 + }, + { + "epoch": 1.6044057512201557, + "grad_norm": 0.039050597697496414, + "learning_rate": 1.8661156540344092e-05, + "loss": 0.0006, + "step": 6082 + }, + { + "epoch": 1.6049333860968211, + "grad_norm": 0.00986960157752037, + "learning_rate": 1.8659984173041415e-05, + "loss": 0.0006, + "step": 6084 + }, + { + "epoch": 1.6054610209734863, + "grad_norm": 0.008508204482495785, + "learning_rate": 1.8658811805738738e-05, + "loss": 0.0009, + "step": 6086 + }, + { + "epoch": 1.6059886558501517, + "grad_norm": 0.06156516447663307, + "learning_rate": 1.8657639438436064e-05, + "loss": 0.0006, + "step": 6088 + }, + { + "epoch": 1.606516290726817, + "grad_norm": 0.29093343019485474, + "learning_rate": 1.865646707113339e-05, + "loss": 0.005, + "step": 6090 + }, + { + "epoch": 1.6070439256034823, + "grad_norm": 0.01715921051800251, + "learning_rate": 1.8655294703830712e-05, + "loss": 0.0005, + "step": 6092 + }, + { + "epoch": 1.6075715604801477, + "grad_norm": 0.11992287635803223, + "learning_rate": 1.8654122336528035e-05, + "loss": 0.0031, + "step": 6094 + }, + { + "epoch": 1.6080991953568131, + "grad_norm": 0.005962171591818333, + "learning_rate": 1.865294996922536e-05, + "loss": 0.0003, + "step": 6096 + }, + { + "epoch": 1.6086268302334785, + "grad_norm": 0.5153745412826538, + "learning_rate": 1.8651777601922683e-05, + "loss": 0.0158, + "step": 6098 + }, + { + "epoch": 1.609154465110144, + "grad_norm": 0.035137325525283813, + "learning_rate": 1.8650605234620006e-05, + "loss": 0.0004, + "step": 6100 + }, + { + "epoch": 1.6096820999868091, + "grad_norm": 0.00992047879844904, + "learning_rate": 1.8649432867317332e-05, + "loss": 0.0105, + "step": 6102 + }, + { + "epoch": 1.6102097348634745, + "grad_norm": 0.05307888612151146, + "learning_rate": 1.8648260500014658e-05, + "loss": 0.007, + "step": 6104 + }, + { + "epoch": 1.6107373697401397, + "grad_norm": 1.042203664779663, + "learning_rate": 1.864708813271198e-05, + "loss": 0.0125, + "step": 6106 + }, + { + "epoch": 1.611265004616805, + "grad_norm": 0.04026511684060097, + "learning_rate": 1.8645915765409303e-05, + "loss": 0.0047, + "step": 6108 + }, + { + "epoch": 1.6117926394934705, + "grad_norm": 0.03532523289322853, + "learning_rate": 1.864474339810663e-05, + "loss": 0.0054, + "step": 6110 + }, + { + "epoch": 1.612320274370136, + "grad_norm": 0.02961777150630951, + "learning_rate": 1.8643571030803952e-05, + "loss": 0.0013, + "step": 6112 + }, + { + "epoch": 1.6128479092468013, + "grad_norm": 0.25057464838027954, + "learning_rate": 1.8642398663501274e-05, + "loss": 0.0104, + "step": 6114 + }, + { + "epoch": 1.6133755441234665, + "grad_norm": 0.020945696160197258, + "learning_rate": 1.86412262961986e-05, + "loss": 0.0006, + "step": 6116 + }, + { + "epoch": 1.613903179000132, + "grad_norm": 0.05350898206233978, + "learning_rate": 1.8640053928895926e-05, + "loss": 0.0007, + "step": 6118 + }, + { + "epoch": 1.614430813876797, + "grad_norm": 0.013876456767320633, + "learning_rate": 1.863888156159325e-05, + "loss": 0.0006, + "step": 6120 + }, + { + "epoch": 1.6149584487534625, + "grad_norm": 0.027297817170619965, + "learning_rate": 1.863770919429057e-05, + "loss": 0.0012, + "step": 6122 + }, + { + "epoch": 1.615486083630128, + "grad_norm": 0.25000086426734924, + "learning_rate": 1.8636536826987898e-05, + "loss": 0.0052, + "step": 6124 + }, + { + "epoch": 1.6160137185067933, + "grad_norm": 0.01945166476070881, + "learning_rate": 1.863536445968522e-05, + "loss": 0.0004, + "step": 6126 + }, + { + "epoch": 1.6165413533834587, + "grad_norm": 0.09896495938301086, + "learning_rate": 1.8634192092382546e-05, + "loss": 0.0009, + "step": 6128 + }, + { + "epoch": 1.6170689882601241, + "grad_norm": 0.013040781021118164, + "learning_rate": 1.863301972507987e-05, + "loss": 0.0046, + "step": 6130 + }, + { + "epoch": 1.6175966231367893, + "grad_norm": 0.0682150274515152, + "learning_rate": 1.863184735777719e-05, + "loss": 0.0007, + "step": 6132 + }, + { + "epoch": 1.6181242580134547, + "grad_norm": 0.8218745589256287, + "learning_rate": 1.8630674990474517e-05, + "loss": 0.0075, + "step": 6134 + }, + { + "epoch": 1.61865189289012, + "grad_norm": 0.06230597943067551, + "learning_rate": 1.8629502623171843e-05, + "loss": 0.0008, + "step": 6136 + }, + { + "epoch": 1.6191795277667853, + "grad_norm": 0.19271320104599, + "learning_rate": 1.8628330255869166e-05, + "loss": 0.0024, + "step": 6138 + }, + { + "epoch": 1.6197071626434507, + "grad_norm": 0.0282739344984293, + "learning_rate": 1.862715788856649e-05, + "loss": 0.0131, + "step": 6140 + }, + { + "epoch": 1.6202347975201161, + "grad_norm": 0.09290173649787903, + "learning_rate": 1.8625985521263815e-05, + "loss": 0.0006, + "step": 6142 + }, + { + "epoch": 1.6207624323967815, + "grad_norm": 0.034042369574308395, + "learning_rate": 1.8624813153961137e-05, + "loss": 0.0005, + "step": 6144 + }, + { + "epoch": 1.621290067273447, + "grad_norm": 0.026185721158981323, + "learning_rate": 1.862364078665846e-05, + "loss": 0.0005, + "step": 6146 + }, + { + "epoch": 1.6218177021501121, + "grad_norm": 0.056929633021354675, + "learning_rate": 1.8622468419355786e-05, + "loss": 0.0028, + "step": 6148 + }, + { + "epoch": 1.6223453370267775, + "grad_norm": 0.3458031713962555, + "learning_rate": 1.8621296052053112e-05, + "loss": 0.0104, + "step": 6150 + }, + { + "epoch": 1.6228729719034427, + "grad_norm": 0.04630544036626816, + "learning_rate": 1.8620123684750435e-05, + "loss": 0.0018, + "step": 6152 + }, + { + "epoch": 1.6234006067801081, + "grad_norm": 0.34205734729766846, + "learning_rate": 1.8618951317447757e-05, + "loss": 0.0043, + "step": 6154 + }, + { + "epoch": 1.6239282416567735, + "grad_norm": 0.0060612657107412815, + "learning_rate": 1.8617778950145083e-05, + "loss": 0.0004, + "step": 6156 + }, + { + "epoch": 1.624455876533439, + "grad_norm": 0.009352442808449268, + "learning_rate": 1.8616606582842406e-05, + "loss": 0.0073, + "step": 6158 + }, + { + "epoch": 1.6249835114101043, + "grad_norm": 0.2256537526845932, + "learning_rate": 1.861543421553973e-05, + "loss": 0.0155, + "step": 6160 + }, + { + "epoch": 1.6255111462867695, + "grad_norm": 0.01726400852203369, + "learning_rate": 1.8614261848237054e-05, + "loss": 0.005, + "step": 6162 + }, + { + "epoch": 1.626038781163435, + "grad_norm": 0.45101800560951233, + "learning_rate": 1.861308948093438e-05, + "loss": 0.0075, + "step": 6164 + }, + { + "epoch": 1.6265664160401, + "grad_norm": 0.012207714840769768, + "learning_rate": 1.8611917113631703e-05, + "loss": 0.0036, + "step": 6166 + }, + { + "epoch": 1.6270940509167655, + "grad_norm": 0.017022451385855675, + "learning_rate": 1.8610744746329026e-05, + "loss": 0.0006, + "step": 6168 + }, + { + "epoch": 1.627621685793431, + "grad_norm": 0.06998085975646973, + "learning_rate": 1.860957237902635e-05, + "loss": 0.0031, + "step": 6170 + }, + { + "epoch": 1.6281493206700963, + "grad_norm": 0.016635837033391, + "learning_rate": 1.8608400011723674e-05, + "loss": 0.0006, + "step": 6172 + }, + { + "epoch": 1.6286769555467617, + "grad_norm": 0.1359051614999771, + "learning_rate": 1.8607227644420997e-05, + "loss": 0.0071, + "step": 6174 + }, + { + "epoch": 1.6292045904234271, + "grad_norm": 0.013755676336586475, + "learning_rate": 1.8606055277118323e-05, + "loss": 0.0009, + "step": 6176 + }, + { + "epoch": 1.6297322253000923, + "grad_norm": 0.14337027072906494, + "learning_rate": 1.8604882909815645e-05, + "loss": 0.0112, + "step": 6178 + }, + { + "epoch": 1.6302598601767577, + "grad_norm": 0.015028697438538074, + "learning_rate": 1.860371054251297e-05, + "loss": 0.0005, + "step": 6180 + }, + { + "epoch": 1.630787495053423, + "grad_norm": 0.0229660514742136, + "learning_rate": 1.8602538175210294e-05, + "loss": 0.0058, + "step": 6182 + }, + { + "epoch": 1.6313151299300883, + "grad_norm": 0.018481295555830002, + "learning_rate": 1.860136580790762e-05, + "loss": 0.0006, + "step": 6184 + }, + { + "epoch": 1.6318427648067537, + "grad_norm": 0.03273305296897888, + "learning_rate": 1.8600193440604943e-05, + "loss": 0.001, + "step": 6186 + }, + { + "epoch": 1.6323703996834191, + "grad_norm": 0.01712627150118351, + "learning_rate": 1.859902107330227e-05, + "loss": 0.0009, + "step": 6188 + }, + { + "epoch": 1.6328980345600845, + "grad_norm": 0.058565348386764526, + "learning_rate": 1.859784870599959e-05, + "loss": 0.0041, + "step": 6190 + }, + { + "epoch": 1.6334256694367497, + "grad_norm": 0.030375607311725616, + "learning_rate": 1.8596676338696914e-05, + "loss": 0.0004, + "step": 6192 + }, + { + "epoch": 1.6339533043134151, + "grad_norm": 0.3591495156288147, + "learning_rate": 1.859550397139424e-05, + "loss": 0.0181, + "step": 6194 + }, + { + "epoch": 1.6344809391900803, + "grad_norm": 0.2255410999059677, + "learning_rate": 1.8594331604091566e-05, + "loss": 0.003, + "step": 6196 + }, + { + "epoch": 1.6350085740667457, + "grad_norm": 0.009228875860571861, + "learning_rate": 1.859315923678889e-05, + "loss": 0.0004, + "step": 6198 + }, + { + "epoch": 1.6355362089434111, + "grad_norm": 0.015921302139759064, + "learning_rate": 1.859198686948621e-05, + "loss": 0.0007, + "step": 6200 + }, + { + "epoch": 1.6360638438200765, + "grad_norm": 0.010953971184790134, + "learning_rate": 1.8590814502183537e-05, + "loss": 0.0004, + "step": 6202 + }, + { + "epoch": 1.636591478696742, + "grad_norm": 0.0057242559269070625, + "learning_rate": 1.858964213488086e-05, + "loss": 0.0005, + "step": 6204 + }, + { + "epoch": 1.6371191135734073, + "grad_norm": 0.00948275439441204, + "learning_rate": 1.8588469767578182e-05, + "loss": 0.0005, + "step": 6206 + }, + { + "epoch": 1.6376467484500725, + "grad_norm": 0.1451857089996338, + "learning_rate": 1.8587297400275508e-05, + "loss": 0.0024, + "step": 6208 + }, + { + "epoch": 1.638174383326738, + "grad_norm": 0.18930499255657196, + "learning_rate": 1.8586125032972834e-05, + "loss": 0.0011, + "step": 6210 + }, + { + "epoch": 1.638702018203403, + "grad_norm": 0.04164375737309456, + "learning_rate": 1.8584952665670153e-05, + "loss": 0.0006, + "step": 6212 + }, + { + "epoch": 1.6392296530800685, + "grad_norm": 0.007484670728445053, + "learning_rate": 1.858378029836748e-05, + "loss": 0.0004, + "step": 6214 + }, + { + "epoch": 1.639757287956734, + "grad_norm": 0.00980053748935461, + "learning_rate": 1.8582607931064805e-05, + "loss": 0.0049, + "step": 6216 + }, + { + "epoch": 1.6402849228333993, + "grad_norm": 0.1848776936531067, + "learning_rate": 1.8581435563762128e-05, + "loss": 0.0075, + "step": 6218 + }, + { + "epoch": 1.6408125577100647, + "grad_norm": 0.037641335278749466, + "learning_rate": 1.858026319645945e-05, + "loss": 0.0006, + "step": 6220 + }, + { + "epoch": 1.6413401925867301, + "grad_norm": 0.1762790083885193, + "learning_rate": 1.8579090829156777e-05, + "loss": 0.0011, + "step": 6222 + }, + { + "epoch": 1.6418678274633953, + "grad_norm": 0.056518957018852234, + "learning_rate": 1.85779184618541e-05, + "loss": 0.0008, + "step": 6224 + }, + { + "epoch": 1.6423954623400607, + "grad_norm": 0.07108618319034576, + "learning_rate": 1.8576746094551425e-05, + "loss": 0.0005, + "step": 6226 + }, + { + "epoch": 1.642923097216726, + "grad_norm": 0.020141009241342545, + "learning_rate": 1.8575573727248748e-05, + "loss": 0.0009, + "step": 6228 + }, + { + "epoch": 1.6434507320933913, + "grad_norm": 0.02508915401995182, + "learning_rate": 1.8574401359946074e-05, + "loss": 0.0029, + "step": 6230 + }, + { + "epoch": 1.6439783669700567, + "grad_norm": 0.014157676137983799, + "learning_rate": 1.8573228992643397e-05, + "loss": 0.0004, + "step": 6232 + }, + { + "epoch": 1.6445060018467221, + "grad_norm": 0.056794289499521255, + "learning_rate": 1.857205662534072e-05, + "loss": 0.002, + "step": 6234 + }, + { + "epoch": 1.6450336367233875, + "grad_norm": 0.004592963960021734, + "learning_rate": 1.8570884258038045e-05, + "loss": 0.0011, + "step": 6236 + }, + { + "epoch": 1.6455612716000527, + "grad_norm": 0.12293826043605804, + "learning_rate": 1.8569711890735368e-05, + "loss": 0.0075, + "step": 6238 + }, + { + "epoch": 1.6460889064767181, + "grad_norm": 0.015110652893781662, + "learning_rate": 1.8568539523432694e-05, + "loss": 0.0005, + "step": 6240 + }, + { + "epoch": 1.6466165413533833, + "grad_norm": 0.014468848705291748, + "learning_rate": 1.8567367156130016e-05, + "loss": 0.0006, + "step": 6242 + }, + { + "epoch": 1.6471441762300487, + "grad_norm": 0.032307252287864685, + "learning_rate": 1.8566194788827342e-05, + "loss": 0.0006, + "step": 6244 + }, + { + "epoch": 1.6476718111067141, + "grad_norm": 0.009540961124002934, + "learning_rate": 1.8565022421524665e-05, + "loss": 0.0005, + "step": 6246 + }, + { + "epoch": 1.6481994459833795, + "grad_norm": 0.23522144556045532, + "learning_rate": 1.856385005422199e-05, + "loss": 0.0081, + "step": 6248 + }, + { + "epoch": 1.648727080860045, + "grad_norm": 0.031749043613672256, + "learning_rate": 1.8562677686919314e-05, + "loss": 0.0005, + "step": 6250 + }, + { + "epoch": 1.6492547157367103, + "grad_norm": 0.016144555062055588, + "learning_rate": 1.8561505319616636e-05, + "loss": 0.0018, + "step": 6252 + }, + { + "epoch": 1.6497823506133755, + "grad_norm": 0.5206400156021118, + "learning_rate": 1.8560332952313962e-05, + "loss": 0.0023, + "step": 6254 + }, + { + "epoch": 1.650309985490041, + "grad_norm": 0.010748305357992649, + "learning_rate": 1.8559160585011288e-05, + "loss": 0.0003, + "step": 6256 + }, + { + "epoch": 1.6508376203667061, + "grad_norm": 0.016911091282963753, + "learning_rate": 1.8557988217708607e-05, + "loss": 0.0004, + "step": 6258 + }, + { + "epoch": 1.6513652552433715, + "grad_norm": 0.0663478672504425, + "learning_rate": 1.8556815850405933e-05, + "loss": 0.0006, + "step": 6260 + }, + { + "epoch": 1.651892890120037, + "grad_norm": 0.1054966002702713, + "learning_rate": 1.855564348310326e-05, + "loss": 0.0136, + "step": 6262 + }, + { + "epoch": 1.6524205249967023, + "grad_norm": 0.13449043035507202, + "learning_rate": 1.8554471115800582e-05, + "loss": 0.0211, + "step": 6264 + }, + { + "epoch": 1.6529481598733677, + "grad_norm": 0.007811689283698797, + "learning_rate": 1.8553298748497905e-05, + "loss": 0.0003, + "step": 6266 + }, + { + "epoch": 1.653475794750033, + "grad_norm": 0.014362619258463383, + "learning_rate": 1.855212638119523e-05, + "loss": 0.0005, + "step": 6268 + }, + { + "epoch": 1.6540034296266983, + "grad_norm": 0.010328108444809914, + "learning_rate": 1.8550954013892553e-05, + "loss": 0.0004, + "step": 6270 + }, + { + "epoch": 1.6545310645033635, + "grad_norm": 0.009974258951842785, + "learning_rate": 1.8549781646589876e-05, + "loss": 0.0021, + "step": 6272 + }, + { + "epoch": 1.655058699380029, + "grad_norm": 0.010153263807296753, + "learning_rate": 1.8548609279287202e-05, + "loss": 0.0004, + "step": 6274 + }, + { + "epoch": 1.6555863342566943, + "grad_norm": 0.19647279381752014, + "learning_rate": 1.8547436911984528e-05, + "loss": 0.0017, + "step": 6276 + }, + { + "epoch": 1.6561139691333597, + "grad_norm": 0.15456217527389526, + "learning_rate": 1.854626454468185e-05, + "loss": 0.0009, + "step": 6278 + }, + { + "epoch": 1.6566416040100251, + "grad_norm": 0.014567425474524498, + "learning_rate": 1.8545092177379173e-05, + "loss": 0.0107, + "step": 6280 + }, + { + "epoch": 1.6571692388866905, + "grad_norm": 0.018135644495487213, + "learning_rate": 1.85439198100765e-05, + "loss": 0.0005, + "step": 6282 + }, + { + "epoch": 1.6576968737633557, + "grad_norm": 0.15634924173355103, + "learning_rate": 1.854274744277382e-05, + "loss": 0.0008, + "step": 6284 + }, + { + "epoch": 1.6582245086400211, + "grad_norm": 0.012932661920785904, + "learning_rate": 1.8541575075471148e-05, + "loss": 0.0005, + "step": 6286 + }, + { + "epoch": 1.6587521435166863, + "grad_norm": 0.008994466625154018, + "learning_rate": 1.854040270816847e-05, + "loss": 0.0004, + "step": 6288 + }, + { + "epoch": 1.6592797783933517, + "grad_norm": 0.055353790521621704, + "learning_rate": 1.8539230340865796e-05, + "loss": 0.0047, + "step": 6290 + }, + { + "epoch": 1.6598074132700171, + "grad_norm": 0.013505227863788605, + "learning_rate": 1.853805797356312e-05, + "loss": 0.0004, + "step": 6292 + }, + { + "epoch": 1.6603350481466825, + "grad_norm": 0.27849942445755005, + "learning_rate": 1.853688560626044e-05, + "loss": 0.008, + "step": 6294 + }, + { + "epoch": 1.660862683023348, + "grad_norm": 0.01829228363931179, + "learning_rate": 1.8535713238957767e-05, + "loss": 0.0029, + "step": 6296 + }, + { + "epoch": 1.6613903179000133, + "grad_norm": 0.4532518684864044, + "learning_rate": 1.853454087165509e-05, + "loss": 0.0016, + "step": 6298 + }, + { + "epoch": 1.6619179527766785, + "grad_norm": 0.025676213204860687, + "learning_rate": 1.8533368504352416e-05, + "loss": 0.0008, + "step": 6300 + }, + { + "epoch": 1.662445587653344, + "grad_norm": 0.006980431266129017, + "learning_rate": 1.853219613704974e-05, + "loss": 0.0004, + "step": 6302 + }, + { + "epoch": 1.6629732225300091, + "grad_norm": 0.013021113350987434, + "learning_rate": 1.853102376974706e-05, + "loss": 0.0004, + "step": 6304 + }, + { + "epoch": 1.6635008574066745, + "grad_norm": 0.19848906993865967, + "learning_rate": 1.8529851402444387e-05, + "loss": 0.0033, + "step": 6306 + }, + { + "epoch": 1.66402849228334, + "grad_norm": 0.2894245684146881, + "learning_rate": 1.8528679035141713e-05, + "loss": 0.0046, + "step": 6308 + }, + { + "epoch": 1.6645561271600053, + "grad_norm": 0.010163621045649052, + "learning_rate": 1.8527506667839036e-05, + "loss": 0.0012, + "step": 6310 + }, + { + "epoch": 1.6650837620366707, + "grad_norm": 0.14090478420257568, + "learning_rate": 1.852633430053636e-05, + "loss": 0.0078, + "step": 6312 + }, + { + "epoch": 1.665611396913336, + "grad_norm": 0.01992746815085411, + "learning_rate": 1.8525161933233685e-05, + "loss": 0.0007, + "step": 6314 + }, + { + "epoch": 1.6661390317900013, + "grad_norm": 0.012022549286484718, + "learning_rate": 1.8523989565931007e-05, + "loss": 0.0008, + "step": 6316 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.004381274804472923, + "learning_rate": 1.852281719862833e-05, + "loss": 0.0009, + "step": 6318 + }, + { + "epoch": 1.667194301543332, + "grad_norm": 0.08543431758880615, + "learning_rate": 1.8521644831325656e-05, + "loss": 0.0008, + "step": 6320 + }, + { + "epoch": 1.6677219364199973, + "grad_norm": 0.6862421035766602, + "learning_rate": 1.8520472464022982e-05, + "loss": 0.0023, + "step": 6322 + }, + { + "epoch": 1.6682495712966627, + "grad_norm": 0.5631045699119568, + "learning_rate": 1.8519300096720304e-05, + "loss": 0.0065, + "step": 6324 + }, + { + "epoch": 1.6687772061733281, + "grad_norm": 0.038508057594299316, + "learning_rate": 1.8518127729417627e-05, + "loss": 0.0004, + "step": 6326 + }, + { + "epoch": 1.6693048410499935, + "grad_norm": 0.1509505957365036, + "learning_rate": 1.8516955362114953e-05, + "loss": 0.006, + "step": 6328 + }, + { + "epoch": 1.6698324759266587, + "grad_norm": 0.008468752726912498, + "learning_rate": 1.8515782994812276e-05, + "loss": 0.0004, + "step": 6330 + }, + { + "epoch": 1.6703601108033241, + "grad_norm": 0.013263467699289322, + "learning_rate": 1.8514610627509598e-05, + "loss": 0.0065, + "step": 6332 + }, + { + "epoch": 1.6708877456799893, + "grad_norm": 0.13582897186279297, + "learning_rate": 1.8513438260206924e-05, + "loss": 0.0151, + "step": 6334 + }, + { + "epoch": 1.6714153805566547, + "grad_norm": 0.0344148650765419, + "learning_rate": 1.851226589290425e-05, + "loss": 0.0004, + "step": 6336 + }, + { + "epoch": 1.6719430154333201, + "grad_norm": 0.07547661662101746, + "learning_rate": 1.8511093525601573e-05, + "loss": 0.0005, + "step": 6338 + }, + { + "epoch": 1.6724706503099855, + "grad_norm": 0.06444069743156433, + "learning_rate": 1.8509921158298895e-05, + "loss": 0.0005, + "step": 6340 + }, + { + "epoch": 1.672998285186651, + "grad_norm": 0.007635175716131926, + "learning_rate": 1.850874879099622e-05, + "loss": 0.0003, + "step": 6342 + }, + { + "epoch": 1.6735259200633161, + "grad_norm": 0.2849859297275543, + "learning_rate": 1.8507576423693544e-05, + "loss": 0.0108, + "step": 6344 + }, + { + "epoch": 1.6740535549399815, + "grad_norm": 0.05331913009285927, + "learning_rate": 1.850640405639087e-05, + "loss": 0.0094, + "step": 6346 + }, + { + "epoch": 1.6745811898166467, + "grad_norm": 0.03366167098283768, + "learning_rate": 1.8505231689088193e-05, + "loss": 0.0007, + "step": 6348 + }, + { + "epoch": 1.6751088246933121, + "grad_norm": 0.025294475257396698, + "learning_rate": 1.8504059321785515e-05, + "loss": 0.0018, + "step": 6350 + }, + { + "epoch": 1.6756364595699775, + "grad_norm": 0.30981093645095825, + "learning_rate": 1.850288695448284e-05, + "loss": 0.0048, + "step": 6352 + }, + { + "epoch": 1.676164094446643, + "grad_norm": 0.012628673575818539, + "learning_rate": 1.8501714587180164e-05, + "loss": 0.0006, + "step": 6354 + }, + { + "epoch": 1.6766917293233083, + "grad_norm": 0.01593727245926857, + "learning_rate": 1.850054221987749e-05, + "loss": 0.0004, + "step": 6356 + }, + { + "epoch": 1.6772193641999738, + "grad_norm": 0.5299544930458069, + "learning_rate": 1.8499369852574812e-05, + "loss": 0.0101, + "step": 6358 + }, + { + "epoch": 1.677746999076639, + "grad_norm": 0.022422080859541893, + "learning_rate": 1.849819748527214e-05, + "loss": 0.0007, + "step": 6360 + }, + { + "epoch": 1.6782746339533043, + "grad_norm": 0.15836623311042786, + "learning_rate": 1.849702511796946e-05, + "loss": 0.0094, + "step": 6362 + }, + { + "epoch": 1.6788022688299695, + "grad_norm": 0.07601069658994675, + "learning_rate": 1.8495852750666784e-05, + "loss": 0.0015, + "step": 6364 + }, + { + "epoch": 1.679329903706635, + "grad_norm": 0.035747215151786804, + "learning_rate": 1.849468038336411e-05, + "loss": 0.0011, + "step": 6366 + }, + { + "epoch": 1.6798575385833003, + "grad_norm": 0.01548683736473322, + "learning_rate": 1.8493508016061436e-05, + "loss": 0.0004, + "step": 6368 + }, + { + "epoch": 1.6803851734599657, + "grad_norm": 0.029695207253098488, + "learning_rate": 1.8492335648758758e-05, + "loss": 0.0012, + "step": 6370 + }, + { + "epoch": 1.6809128083366311, + "grad_norm": 0.021312441676855087, + "learning_rate": 1.849116328145608e-05, + "loss": 0.0035, + "step": 6372 + }, + { + "epoch": 1.6814404432132966, + "grad_norm": 0.2591036260128021, + "learning_rate": 1.8489990914153407e-05, + "loss": 0.0062, + "step": 6374 + }, + { + "epoch": 1.6819680780899617, + "grad_norm": 0.3137872815132141, + "learning_rate": 1.848881854685073e-05, + "loss": 0.0027, + "step": 6376 + }, + { + "epoch": 1.6824957129666271, + "grad_norm": 0.012691608630120754, + "learning_rate": 1.8487646179548052e-05, + "loss": 0.0005, + "step": 6378 + }, + { + "epoch": 1.6830233478432923, + "grad_norm": 0.014620781876146793, + "learning_rate": 1.8486473812245378e-05, + "loss": 0.0004, + "step": 6380 + }, + { + "epoch": 1.6835509827199577, + "grad_norm": 0.08986234664916992, + "learning_rate": 1.8485301444942704e-05, + "loss": 0.0007, + "step": 6382 + }, + { + "epoch": 1.6840786175966231, + "grad_norm": 0.010890522971749306, + "learning_rate": 1.8484129077640027e-05, + "loss": 0.0009, + "step": 6384 + }, + { + "epoch": 1.6846062524732885, + "grad_norm": 0.02785666659474373, + "learning_rate": 1.848295671033735e-05, + "loss": 0.0004, + "step": 6386 + }, + { + "epoch": 1.685133887349954, + "grad_norm": 0.09183956682682037, + "learning_rate": 1.8481784343034675e-05, + "loss": 0.0185, + "step": 6388 + }, + { + "epoch": 1.6856615222266191, + "grad_norm": 0.09911280870437622, + "learning_rate": 1.8480611975731998e-05, + "loss": 0.009, + "step": 6390 + }, + { + "epoch": 1.6861891571032845, + "grad_norm": 0.008636616170406342, + "learning_rate": 1.847943960842932e-05, + "loss": 0.0007, + "step": 6392 + }, + { + "epoch": 1.6867167919799497, + "grad_norm": 0.12637196481227875, + "learning_rate": 1.8478267241126646e-05, + "loss": 0.0013, + "step": 6394 + }, + { + "epoch": 1.6872444268566151, + "grad_norm": 0.49157142639160156, + "learning_rate": 1.8477094873823972e-05, + "loss": 0.0044, + "step": 6396 + }, + { + "epoch": 1.6877720617332805, + "grad_norm": 0.008864770643413067, + "learning_rate": 1.8475922506521295e-05, + "loss": 0.0033, + "step": 6398 + }, + { + "epoch": 1.688299696609946, + "grad_norm": 0.0038172006607055664, + "learning_rate": 1.8474750139218618e-05, + "loss": 0.0003, + "step": 6400 + }, + { + "epoch": 1.6888273314866113, + "grad_norm": 0.040208783000707626, + "learning_rate": 1.8473577771915944e-05, + "loss": 0.0005, + "step": 6402 + }, + { + "epoch": 1.6893549663632768, + "grad_norm": 0.011767406016588211, + "learning_rate": 1.8472405404613266e-05, + "loss": 0.0153, + "step": 6404 + }, + { + "epoch": 1.689882601239942, + "grad_norm": 0.0386379100382328, + "learning_rate": 1.8471233037310592e-05, + "loss": 0.0066, + "step": 6406 + }, + { + "epoch": 1.6904102361166073, + "grad_norm": 0.1298230141401291, + "learning_rate": 1.8470060670007915e-05, + "loss": 0.0023, + "step": 6408 + }, + { + "epoch": 1.6909378709932725, + "grad_norm": 0.005807765293866396, + "learning_rate": 1.8468888302705238e-05, + "loss": 0.0008, + "step": 6410 + }, + { + "epoch": 1.691465505869938, + "grad_norm": 0.045278191566467285, + "learning_rate": 1.8467715935402564e-05, + "loss": 0.0059, + "step": 6412 + }, + { + "epoch": 1.6919931407466033, + "grad_norm": 0.03313782438635826, + "learning_rate": 1.8466543568099886e-05, + "loss": 0.0036, + "step": 6414 + }, + { + "epoch": 1.6925207756232687, + "grad_norm": 0.08842478692531586, + "learning_rate": 1.8465371200797212e-05, + "loss": 0.002, + "step": 6416 + }, + { + "epoch": 1.6930484104999342, + "grad_norm": 0.008865422569215298, + "learning_rate": 1.8464198833494535e-05, + "loss": 0.0004, + "step": 6418 + }, + { + "epoch": 1.6935760453765996, + "grad_norm": 0.5426661372184753, + "learning_rate": 1.846302646619186e-05, + "loss": 0.0078, + "step": 6420 + }, + { + "epoch": 1.6941036802532647, + "grad_norm": 0.035309359431266785, + "learning_rate": 1.8461854098889183e-05, + "loss": 0.0027, + "step": 6422 + }, + { + "epoch": 1.69463131512993, + "grad_norm": 0.1054605171084404, + "learning_rate": 1.8460681731586506e-05, + "loss": 0.0116, + "step": 6424 + }, + { + "epoch": 1.6951589500065953, + "grad_norm": 0.08602942526340485, + "learning_rate": 1.8459509364283832e-05, + "loss": 0.0009, + "step": 6426 + }, + { + "epoch": 1.6956865848832607, + "grad_norm": 0.010638231411576271, + "learning_rate": 1.8458336996981158e-05, + "loss": 0.0005, + "step": 6428 + }, + { + "epoch": 1.6962142197599261, + "grad_norm": 0.05947592109441757, + "learning_rate": 1.845716462967848e-05, + "loss": 0.002, + "step": 6430 + }, + { + "epoch": 1.6967418546365916, + "grad_norm": 0.10716279596090317, + "learning_rate": 1.8455992262375803e-05, + "loss": 0.001, + "step": 6432 + }, + { + "epoch": 1.697269489513257, + "grad_norm": 0.011456044390797615, + "learning_rate": 1.845481989507313e-05, + "loss": 0.0004, + "step": 6434 + }, + { + "epoch": 1.6977971243899221, + "grad_norm": 0.016792474314570427, + "learning_rate": 1.8453647527770452e-05, + "loss": 0.0004, + "step": 6436 + }, + { + "epoch": 1.6983247592665875, + "grad_norm": 0.008800787851214409, + "learning_rate": 1.8452475160467774e-05, + "loss": 0.0005, + "step": 6438 + }, + { + "epoch": 1.6988523941432527, + "grad_norm": 0.01181352511048317, + "learning_rate": 1.84513027931651e-05, + "loss": 0.001, + "step": 6440 + }, + { + "epoch": 1.6993800290199181, + "grad_norm": 0.02100989781320095, + "learning_rate": 1.8450130425862426e-05, + "loss": 0.0011, + "step": 6442 + }, + { + "epoch": 1.6999076638965835, + "grad_norm": 0.062303971499204636, + "learning_rate": 1.8448958058559746e-05, + "loss": 0.0006, + "step": 6444 + }, + { + "epoch": 1.700435298773249, + "grad_norm": 0.01606779731810093, + "learning_rate": 1.844778569125707e-05, + "loss": 0.002, + "step": 6446 + }, + { + "epoch": 1.7009629336499144, + "grad_norm": 0.20612609386444092, + "learning_rate": 1.8446613323954398e-05, + "loss": 0.0044, + "step": 6448 + }, + { + "epoch": 1.7014905685265798, + "grad_norm": 0.04406523331999779, + "learning_rate": 1.844544095665172e-05, + "loss": 0.0005, + "step": 6450 + }, + { + "epoch": 1.702018203403245, + "grad_norm": 0.011768176220357418, + "learning_rate": 1.8444268589349043e-05, + "loss": 0.0003, + "step": 6452 + }, + { + "epoch": 1.7025458382799104, + "grad_norm": 0.010588052682578564, + "learning_rate": 1.844309622204637e-05, + "loss": 0.0007, + "step": 6454 + }, + { + "epoch": 1.7030734731565755, + "grad_norm": 0.008965942077338696, + "learning_rate": 1.844192385474369e-05, + "loss": 0.0004, + "step": 6456 + }, + { + "epoch": 1.703601108033241, + "grad_norm": 0.046272698789834976, + "learning_rate": 1.8440751487441017e-05, + "loss": 0.0005, + "step": 6458 + }, + { + "epoch": 1.7041287429099063, + "grad_norm": 0.22031943500041962, + "learning_rate": 1.843957912013834e-05, + "loss": 0.0029, + "step": 6460 + }, + { + "epoch": 1.7046563777865718, + "grad_norm": 0.015555034391582012, + "learning_rate": 1.8438406752835666e-05, + "loss": 0.0008, + "step": 6462 + }, + { + "epoch": 1.7051840126632372, + "grad_norm": 0.01742929220199585, + "learning_rate": 1.843723438553299e-05, + "loss": 0.0003, + "step": 6464 + }, + { + "epoch": 1.7057116475399023, + "grad_norm": 0.13934750854969025, + "learning_rate": 1.8436062018230315e-05, + "loss": 0.017, + "step": 6466 + }, + { + "epoch": 1.7062392824165677, + "grad_norm": 0.020540181547403336, + "learning_rate": 1.8434889650927637e-05, + "loss": 0.0003, + "step": 6468 + }, + { + "epoch": 1.706766917293233, + "grad_norm": 0.006713407579809427, + "learning_rate": 1.843371728362496e-05, + "loss": 0.0003, + "step": 6470 + }, + { + "epoch": 1.7072945521698983, + "grad_norm": 0.04093889147043228, + "learning_rate": 1.8432544916322286e-05, + "loss": 0.002, + "step": 6472 + }, + { + "epoch": 1.7078221870465637, + "grad_norm": 0.3659558892250061, + "learning_rate": 1.843137254901961e-05, + "loss": 0.0051, + "step": 6474 + }, + { + "epoch": 1.7083498219232292, + "grad_norm": 0.00809901487082243, + "learning_rate": 1.8430200181716934e-05, + "loss": 0.0006, + "step": 6476 + }, + { + "epoch": 1.7088774567998946, + "grad_norm": 0.31730738282203674, + "learning_rate": 1.8429027814414257e-05, + "loss": 0.0031, + "step": 6478 + }, + { + "epoch": 1.70940509167656, + "grad_norm": 0.05793203040957451, + "learning_rate": 1.8427855447111583e-05, + "loss": 0.0007, + "step": 6480 + }, + { + "epoch": 1.7099327265532251, + "grad_norm": 0.009380633942782879, + "learning_rate": 1.8426683079808906e-05, + "loss": 0.0003, + "step": 6482 + }, + { + "epoch": 1.7104603614298906, + "grad_norm": 0.010393792763352394, + "learning_rate": 1.842551071250623e-05, + "loss": 0.0011, + "step": 6484 + }, + { + "epoch": 1.7109879963065557, + "grad_norm": 0.028540123254060745, + "learning_rate": 1.8424338345203554e-05, + "loss": 0.0003, + "step": 6486 + }, + { + "epoch": 1.7115156311832211, + "grad_norm": 0.26333922147750854, + "learning_rate": 1.842316597790088e-05, + "loss": 0.0056, + "step": 6488 + }, + { + "epoch": 1.7120432660598865, + "grad_norm": 0.005881460849195719, + "learning_rate": 1.84219936105982e-05, + "loss": 0.0005, + "step": 6490 + }, + { + "epoch": 1.712570900936552, + "grad_norm": 0.1419915407896042, + "learning_rate": 1.8420821243295526e-05, + "loss": 0.0068, + "step": 6492 + }, + { + "epoch": 1.7130985358132174, + "grad_norm": 0.023964492604136467, + "learning_rate": 1.841964887599285e-05, + "loss": 0.0005, + "step": 6494 + }, + { + "epoch": 1.7136261706898828, + "grad_norm": 0.15717941522598267, + "learning_rate": 1.8418476508690174e-05, + "loss": 0.0031, + "step": 6496 + }, + { + "epoch": 1.714153805566548, + "grad_norm": 0.03433115780353546, + "learning_rate": 1.8417304141387497e-05, + "loss": 0.0032, + "step": 6498 + }, + { + "epoch": 1.7146814404432131, + "grad_norm": 0.00469957385212183, + "learning_rate": 1.8416131774084823e-05, + "loss": 0.0003, + "step": 6500 + }, + { + "epoch": 1.7152090753198785, + "grad_norm": 0.07298038899898529, + "learning_rate": 1.8414959406782145e-05, + "loss": 0.0005, + "step": 6502 + }, + { + "epoch": 1.715736710196544, + "grad_norm": 0.0035736244171857834, + "learning_rate": 1.8413787039479468e-05, + "loss": 0.0004, + "step": 6504 + }, + { + "epoch": 1.7162643450732094, + "grad_norm": 0.02434380166232586, + "learning_rate": 1.8412614672176794e-05, + "loss": 0.0006, + "step": 6506 + }, + { + "epoch": 1.7167919799498748, + "grad_norm": 0.0142838004976511, + "learning_rate": 1.841144230487412e-05, + "loss": 0.0003, + "step": 6508 + }, + { + "epoch": 1.7173196148265402, + "grad_norm": 0.15018685162067413, + "learning_rate": 1.8410269937571443e-05, + "loss": 0.0133, + "step": 6510 + }, + { + "epoch": 1.7178472497032053, + "grad_norm": 0.015690233558416367, + "learning_rate": 1.8409097570268765e-05, + "loss": 0.0004, + "step": 6512 + }, + { + "epoch": 1.7183748845798708, + "grad_norm": 0.0404902920126915, + "learning_rate": 1.840792520296609e-05, + "loss": 0.0066, + "step": 6514 + }, + { + "epoch": 1.718902519456536, + "grad_norm": 0.1258140355348587, + "learning_rate": 1.8406752835663414e-05, + "loss": 0.0032, + "step": 6516 + }, + { + "epoch": 1.7194301543332013, + "grad_norm": 0.45092007517814636, + "learning_rate": 1.840558046836074e-05, + "loss": 0.0005, + "step": 6518 + }, + { + "epoch": 1.7199577892098667, + "grad_norm": 0.013225269503891468, + "learning_rate": 1.8404408101058062e-05, + "loss": 0.0003, + "step": 6520 + }, + { + "epoch": 1.7204854240865322, + "grad_norm": 0.06405302882194519, + "learning_rate": 1.840323573375539e-05, + "loss": 0.0096, + "step": 6522 + }, + { + "epoch": 1.7210130589631976, + "grad_norm": 0.0055902572348713875, + "learning_rate": 1.840206336645271e-05, + "loss": 0.0041, + "step": 6524 + }, + { + "epoch": 1.721540693839863, + "grad_norm": 0.3698117733001709, + "learning_rate": 1.8400890999150037e-05, + "loss": 0.0136, + "step": 6526 + }, + { + "epoch": 1.7220683287165282, + "grad_norm": 0.12906885147094727, + "learning_rate": 1.839971863184736e-05, + "loss": 0.0066, + "step": 6528 + }, + { + "epoch": 1.7225959635931936, + "grad_norm": 0.26267725229263306, + "learning_rate": 1.8398546264544682e-05, + "loss": 0.0022, + "step": 6530 + }, + { + "epoch": 1.7231235984698587, + "grad_norm": 2.6207990646362305, + "learning_rate": 1.8397373897242008e-05, + "loss": 0.0121, + "step": 6532 + }, + { + "epoch": 1.7236512333465241, + "grad_norm": 0.17383378744125366, + "learning_rate": 1.839620152993933e-05, + "loss": 0.0053, + "step": 6534 + }, + { + "epoch": 1.7241788682231896, + "grad_norm": 0.01655602641403675, + "learning_rate": 1.8395029162636653e-05, + "loss": 0.0005, + "step": 6536 + }, + { + "epoch": 1.724706503099855, + "grad_norm": 0.3648751974105835, + "learning_rate": 1.839385679533398e-05, + "loss": 0.0115, + "step": 6538 + }, + { + "epoch": 1.7252341379765204, + "grad_norm": 0.1666220724582672, + "learning_rate": 1.8392684428031305e-05, + "loss": 0.0016, + "step": 6540 + }, + { + "epoch": 1.7257617728531855, + "grad_norm": 0.02980104275047779, + "learning_rate": 1.8391512060728628e-05, + "loss": 0.0017, + "step": 6542 + }, + { + "epoch": 1.726289407729851, + "grad_norm": 0.08480598777532578, + "learning_rate": 1.839033969342595e-05, + "loss": 0.0011, + "step": 6544 + }, + { + "epoch": 1.7268170426065161, + "grad_norm": 0.17045576870441437, + "learning_rate": 1.8389167326123277e-05, + "loss": 0.0049, + "step": 6546 + }, + { + "epoch": 1.7273446774831815, + "grad_norm": 0.007672307547181845, + "learning_rate": 1.83879949588206e-05, + "loss": 0.0009, + "step": 6548 + }, + { + "epoch": 1.727872312359847, + "grad_norm": 0.008501170203089714, + "learning_rate": 1.8386822591517922e-05, + "loss": 0.0005, + "step": 6550 + }, + { + "epoch": 1.7283999472365124, + "grad_norm": 0.027737263590097427, + "learning_rate": 1.8385650224215248e-05, + "loss": 0.0005, + "step": 6552 + }, + { + "epoch": 1.7289275821131778, + "grad_norm": 0.0735059529542923, + "learning_rate": 1.8384477856912574e-05, + "loss": 0.0009, + "step": 6554 + }, + { + "epoch": 1.7294552169898432, + "grad_norm": 0.005471557378768921, + "learning_rate": 1.8383305489609896e-05, + "loss": 0.007, + "step": 6556 + }, + { + "epoch": 1.7299828518665084, + "grad_norm": 0.01381720881909132, + "learning_rate": 1.838213312230722e-05, + "loss": 0.0005, + "step": 6558 + }, + { + "epoch": 1.7305104867431738, + "grad_norm": 0.05204777792096138, + "learning_rate": 1.8380960755004545e-05, + "loss": 0.0009, + "step": 6560 + }, + { + "epoch": 1.731038121619839, + "grad_norm": 0.007230481132864952, + "learning_rate": 1.8379788387701868e-05, + "loss": 0.0004, + "step": 6562 + }, + { + "epoch": 1.7315657564965043, + "grad_norm": 0.025749707594513893, + "learning_rate": 1.837861602039919e-05, + "loss": 0.0032, + "step": 6564 + }, + { + "epoch": 1.7320933913731698, + "grad_norm": 0.07903726398944855, + "learning_rate": 1.8377443653096516e-05, + "loss": 0.0007, + "step": 6566 + }, + { + "epoch": 1.7326210262498352, + "grad_norm": 0.13579079508781433, + "learning_rate": 1.8376271285793842e-05, + "loss": 0.0005, + "step": 6568 + }, + { + "epoch": 1.7331486611265006, + "grad_norm": 0.008022514171898365, + "learning_rate": 1.8375098918491165e-05, + "loss": 0.0003, + "step": 6570 + }, + { + "epoch": 1.733676296003166, + "grad_norm": 0.43683403730392456, + "learning_rate": 1.8373926551188488e-05, + "loss": 0.0037, + "step": 6572 + }, + { + "epoch": 1.7342039308798312, + "grad_norm": 0.29441237449645996, + "learning_rate": 1.8372754183885814e-05, + "loss": 0.0055, + "step": 6574 + }, + { + "epoch": 1.7347315657564963, + "grad_norm": 0.15812216699123383, + "learning_rate": 1.8371581816583136e-05, + "loss": 0.0113, + "step": 6576 + }, + { + "epoch": 1.7352592006331617, + "grad_norm": 0.3005272150039673, + "learning_rate": 1.8370409449280462e-05, + "loss": 0.0039, + "step": 6578 + }, + { + "epoch": 1.7357868355098272, + "grad_norm": 0.058001358062028885, + "learning_rate": 1.8369237081977785e-05, + "loss": 0.0005, + "step": 6580 + }, + { + "epoch": 1.7363144703864926, + "grad_norm": 0.2180757224559784, + "learning_rate": 1.8368064714675107e-05, + "loss": 0.0014, + "step": 6582 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 0.05538041889667511, + "learning_rate": 1.8366892347372433e-05, + "loss": 0.0081, + "step": 6584 + }, + { + "epoch": 1.7373697401398234, + "grad_norm": 0.04219825193285942, + "learning_rate": 1.836571998006976e-05, + "loss": 0.0026, + "step": 6586 + }, + { + "epoch": 1.7378973750164886, + "grad_norm": 0.18852297961711884, + "learning_rate": 1.8364547612767082e-05, + "loss": 0.0023, + "step": 6588 + }, + { + "epoch": 1.738425009893154, + "grad_norm": 3.7201502323150635, + "learning_rate": 1.8363375245464405e-05, + "loss": 0.0057, + "step": 6590 + }, + { + "epoch": 1.7389526447698191, + "grad_norm": 0.1913967728614807, + "learning_rate": 1.836220287816173e-05, + "loss": 0.0017, + "step": 6592 + }, + { + "epoch": 1.7394802796464846, + "grad_norm": 0.028609849512577057, + "learning_rate": 1.8361030510859053e-05, + "loss": 0.0005, + "step": 6594 + }, + { + "epoch": 1.74000791452315, + "grad_norm": 0.013230391778051853, + "learning_rate": 1.8359858143556376e-05, + "loss": 0.0065, + "step": 6596 + }, + { + "epoch": 1.7405355493998154, + "grad_norm": 0.02510635368525982, + "learning_rate": 1.8358685776253702e-05, + "loss": 0.0012, + "step": 6598 + }, + { + "epoch": 1.7410631842764808, + "grad_norm": 0.02767692133784294, + "learning_rate": 1.8357513408951028e-05, + "loss": 0.0003, + "step": 6600 + }, + { + "epoch": 1.7415908191531462, + "grad_norm": 0.19940495491027832, + "learning_rate": 1.835634104164835e-05, + "loss": 0.0007, + "step": 6602 + }, + { + "epoch": 1.7421184540298114, + "grad_norm": 0.11583592742681503, + "learning_rate": 1.8355168674345673e-05, + "loss": 0.0005, + "step": 6604 + }, + { + "epoch": 1.7426460889064768, + "grad_norm": 0.9648275375366211, + "learning_rate": 1.8353996307043e-05, + "loss": 0.0076, + "step": 6606 + }, + { + "epoch": 1.743173723783142, + "grad_norm": 0.3421812653541565, + "learning_rate": 1.835282393974032e-05, + "loss": 0.0034, + "step": 6608 + }, + { + "epoch": 1.7437013586598074, + "grad_norm": 0.07171850651502609, + "learning_rate": 1.8351651572437644e-05, + "loss": 0.0008, + "step": 6610 + }, + { + "epoch": 1.7442289935364728, + "grad_norm": 0.01822054572403431, + "learning_rate": 1.835047920513497e-05, + "loss": 0.0004, + "step": 6612 + }, + { + "epoch": 1.7447566284131382, + "grad_norm": 0.008562415838241577, + "learning_rate": 1.8349306837832296e-05, + "loss": 0.0065, + "step": 6614 + }, + { + "epoch": 1.7452842632898036, + "grad_norm": 0.10606875270605087, + "learning_rate": 1.834813447052962e-05, + "loss": 0.0124, + "step": 6616 + }, + { + "epoch": 1.7458118981664688, + "grad_norm": 0.05063805356621742, + "learning_rate": 1.834696210322694e-05, + "loss": 0.0088, + "step": 6618 + }, + { + "epoch": 1.7463395330431342, + "grad_norm": 0.08795182406902313, + "learning_rate": 1.8345789735924267e-05, + "loss": 0.0017, + "step": 6620 + }, + { + "epoch": 1.7468671679197993, + "grad_norm": 0.20865991711616516, + "learning_rate": 1.834461736862159e-05, + "loss": 0.0058, + "step": 6622 + }, + { + "epoch": 1.7473948027964648, + "grad_norm": 0.1678265929222107, + "learning_rate": 1.8343445001318916e-05, + "loss": 0.0121, + "step": 6624 + }, + { + "epoch": 1.7479224376731302, + "grad_norm": 0.014898184686899185, + "learning_rate": 1.834227263401624e-05, + "loss": 0.0004, + "step": 6626 + }, + { + "epoch": 1.7484500725497956, + "grad_norm": 0.012179229408502579, + "learning_rate": 1.834110026671356e-05, + "loss": 0.0005, + "step": 6628 + }, + { + "epoch": 1.748977707426461, + "grad_norm": 0.016230447217822075, + "learning_rate": 1.8339927899410887e-05, + "loss": 0.0004, + "step": 6630 + }, + { + "epoch": 1.7495053423031264, + "grad_norm": 0.28911805152893066, + "learning_rate": 1.833875553210821e-05, + "loss": 0.0036, + "step": 6632 + }, + { + "epoch": 1.7500329771797916, + "grad_norm": 0.011694486252963543, + "learning_rate": 1.8337583164805536e-05, + "loss": 0.009, + "step": 6634 + }, + { + "epoch": 1.750560612056457, + "grad_norm": 0.14379459619522095, + "learning_rate": 1.833641079750286e-05, + "loss": 0.0018, + "step": 6636 + }, + { + "epoch": 1.7510882469331222, + "grad_norm": 0.006295908708125353, + "learning_rate": 1.8335238430200184e-05, + "loss": 0.0029, + "step": 6638 + }, + { + "epoch": 1.7516158818097876, + "grad_norm": 0.018990378826856613, + "learning_rate": 1.8334066062897507e-05, + "loss": 0.0006, + "step": 6640 + }, + { + "epoch": 1.752143516686453, + "grad_norm": 0.013579766266047955, + "learning_rate": 1.833289369559483e-05, + "loss": 0.0007, + "step": 6642 + }, + { + "epoch": 1.7526711515631184, + "grad_norm": 0.02301371842622757, + "learning_rate": 1.8331721328292156e-05, + "loss": 0.0005, + "step": 6644 + }, + { + "epoch": 1.7531987864397838, + "grad_norm": 0.2074609249830246, + "learning_rate": 1.833054896098948e-05, + "loss": 0.0025, + "step": 6646 + }, + { + "epoch": 1.7537264213164492, + "grad_norm": 0.010264679789543152, + "learning_rate": 1.8329376593686804e-05, + "loss": 0.0004, + "step": 6648 + }, + { + "epoch": 1.7542540561931144, + "grad_norm": 0.024084195494651794, + "learning_rate": 1.8328204226384127e-05, + "loss": 0.0011, + "step": 6650 + }, + { + "epoch": 1.7547816910697795, + "grad_norm": 0.11535364389419556, + "learning_rate": 1.8327031859081453e-05, + "loss": 0.0099, + "step": 6652 + }, + { + "epoch": 1.755309325946445, + "grad_norm": 0.03431674465537071, + "learning_rate": 1.8325859491778776e-05, + "loss": 0.0004, + "step": 6654 + }, + { + "epoch": 1.7558369608231104, + "grad_norm": 0.13122378289699554, + "learning_rate": 1.8324687124476098e-05, + "loss": 0.0022, + "step": 6656 + }, + { + "epoch": 1.7563645956997758, + "grad_norm": 0.07978076487779617, + "learning_rate": 1.8323514757173424e-05, + "loss": 0.001, + "step": 6658 + }, + { + "epoch": 1.7568922305764412, + "grad_norm": 0.45537805557250977, + "learning_rate": 1.832234238987075e-05, + "loss": 0.0076, + "step": 6660 + }, + { + "epoch": 1.7574198654531066, + "grad_norm": 0.011643096804618835, + "learning_rate": 1.8321170022568073e-05, + "loss": 0.0025, + "step": 6662 + }, + { + "epoch": 1.7579475003297718, + "grad_norm": 0.01111162081360817, + "learning_rate": 1.8319997655265395e-05, + "loss": 0.0039, + "step": 6664 + }, + { + "epoch": 1.7584751352064372, + "grad_norm": 0.0049167824909091, + "learning_rate": 1.831882528796272e-05, + "loss": 0.0004, + "step": 6666 + }, + { + "epoch": 1.7590027700831024, + "grad_norm": 0.011652066372334957, + "learning_rate": 1.8317652920660044e-05, + "loss": 0.0032, + "step": 6668 + }, + { + "epoch": 1.7595304049597678, + "grad_norm": 0.00915137305855751, + "learning_rate": 1.8316480553357367e-05, + "loss": 0.0005, + "step": 6670 + }, + { + "epoch": 1.7600580398364332, + "grad_norm": 0.28434762358665466, + "learning_rate": 1.8315308186054693e-05, + "loss": 0.0009, + "step": 6672 + }, + { + "epoch": 1.7605856747130986, + "grad_norm": 0.07388126105070114, + "learning_rate": 1.8314135818752015e-05, + "loss": 0.0135, + "step": 6674 + }, + { + "epoch": 1.761113309589764, + "grad_norm": 0.0072088115848600864, + "learning_rate": 1.831296345144934e-05, + "loss": 0.0013, + "step": 6676 + }, + { + "epoch": 1.7616409444664294, + "grad_norm": 0.04157235100865364, + "learning_rate": 1.8311791084146664e-05, + "loss": 0.0004, + "step": 6678 + }, + { + "epoch": 1.7621685793430946, + "grad_norm": 0.026083454489707947, + "learning_rate": 1.831061871684399e-05, + "loss": 0.0014, + "step": 6680 + }, + { + "epoch": 1.76269621421976, + "grad_norm": 0.004753231070935726, + "learning_rate": 1.8309446349541312e-05, + "loss": 0.0098, + "step": 6682 + }, + { + "epoch": 1.7632238490964252, + "grad_norm": 0.0054683992639184, + "learning_rate": 1.830827398223864e-05, + "loss": 0.0002, + "step": 6684 + }, + { + "epoch": 1.7637514839730906, + "grad_norm": 0.0051204185001552105, + "learning_rate": 1.830710161493596e-05, + "loss": 0.0013, + "step": 6686 + }, + { + "epoch": 1.764279118849756, + "grad_norm": 0.0034224698320031166, + "learning_rate": 1.8305929247633284e-05, + "loss": 0.0003, + "step": 6688 + }, + { + "epoch": 1.7648067537264214, + "grad_norm": 0.0034517610911279917, + "learning_rate": 1.830475688033061e-05, + "loss": 0.0002, + "step": 6690 + }, + { + "epoch": 1.7653343886030868, + "grad_norm": 0.34393632411956787, + "learning_rate": 1.8303584513027932e-05, + "loss": 0.0034, + "step": 6692 + }, + { + "epoch": 1.765862023479752, + "grad_norm": 0.22004517912864685, + "learning_rate": 1.8302412145725258e-05, + "loss": 0.0061, + "step": 6694 + }, + { + "epoch": 1.7663896583564174, + "grad_norm": 0.0035345633514225483, + "learning_rate": 1.830123977842258e-05, + "loss": 0.0002, + "step": 6696 + }, + { + "epoch": 1.7669172932330826, + "grad_norm": 0.41416269540786743, + "learning_rate": 1.8300067411119907e-05, + "loss": 0.0119, + "step": 6698 + }, + { + "epoch": 1.767444928109748, + "grad_norm": 0.013073817826807499, + "learning_rate": 1.829889504381723e-05, + "loss": 0.0004, + "step": 6700 + }, + { + "epoch": 1.7679725629864134, + "grad_norm": 0.13479626178741455, + "learning_rate": 1.8297722676514552e-05, + "loss": 0.0035, + "step": 6702 + }, + { + "epoch": 1.7685001978630788, + "grad_norm": 0.05497051030397415, + "learning_rate": 1.8296550309211878e-05, + "loss": 0.0007, + "step": 6704 + }, + { + "epoch": 1.7690278327397442, + "grad_norm": 0.21048641204833984, + "learning_rate": 1.8295377941909204e-05, + "loss": 0.0013, + "step": 6706 + }, + { + "epoch": 1.7695554676164096, + "grad_norm": 0.08526334166526794, + "learning_rate": 1.8294205574606527e-05, + "loss": 0.004, + "step": 6708 + }, + { + "epoch": 1.7700831024930748, + "grad_norm": 0.9502393007278442, + "learning_rate": 1.829303320730385e-05, + "loss": 0.0012, + "step": 6710 + }, + { + "epoch": 1.7706107373697402, + "grad_norm": 0.010381976142525673, + "learning_rate": 1.8291860840001175e-05, + "loss": 0.011, + "step": 6712 + }, + { + "epoch": 1.7711383722464054, + "grad_norm": 0.046834900975227356, + "learning_rate": 1.8290688472698498e-05, + "loss": 0.0005, + "step": 6714 + }, + { + "epoch": 1.7716660071230708, + "grad_norm": 0.06696611642837524, + "learning_rate": 1.828951610539582e-05, + "loss": 0.0033, + "step": 6716 + }, + { + "epoch": 1.7721936419997362, + "grad_norm": 0.41886475682258606, + "learning_rate": 1.8288343738093146e-05, + "loss": 0.0011, + "step": 6718 + }, + { + "epoch": 1.7727212768764016, + "grad_norm": 0.20020988583564758, + "learning_rate": 1.8287171370790472e-05, + "loss": 0.0116, + "step": 6720 + }, + { + "epoch": 1.773248911753067, + "grad_norm": 0.02505464106798172, + "learning_rate": 1.828599900348779e-05, + "loss": 0.0024, + "step": 6722 + }, + { + "epoch": 1.7737765466297324, + "grad_norm": 0.016288241371512413, + "learning_rate": 1.8284826636185118e-05, + "loss": 0.0005, + "step": 6724 + }, + { + "epoch": 1.7743041815063976, + "grad_norm": 0.03779410943388939, + "learning_rate": 1.8283654268882444e-05, + "loss": 0.0097, + "step": 6726 + }, + { + "epoch": 1.7748318163830628, + "grad_norm": 0.04058891907334328, + "learning_rate": 1.8282481901579766e-05, + "loss": 0.0004, + "step": 6728 + }, + { + "epoch": 1.7753594512597282, + "grad_norm": 0.04655598849058151, + "learning_rate": 1.828130953427709e-05, + "loss": 0.0031, + "step": 6730 + }, + { + "epoch": 1.7758870861363936, + "grad_norm": 0.01566285826265812, + "learning_rate": 1.8280137166974415e-05, + "loss": 0.0016, + "step": 6732 + }, + { + "epoch": 1.776414721013059, + "grad_norm": 0.4369746148586273, + "learning_rate": 1.8278964799671737e-05, + "loss": 0.0023, + "step": 6734 + }, + { + "epoch": 1.7769423558897244, + "grad_norm": 0.06179923936724663, + "learning_rate": 1.8277792432369063e-05, + "loss": 0.0076, + "step": 6736 + }, + { + "epoch": 1.7774699907663898, + "grad_norm": 0.2894945442676544, + "learning_rate": 1.8276620065066386e-05, + "loss": 0.0094, + "step": 6738 + }, + { + "epoch": 1.777997625643055, + "grad_norm": 0.017711730673909187, + "learning_rate": 1.8275447697763712e-05, + "loss": 0.0016, + "step": 6740 + }, + { + "epoch": 1.7785252605197204, + "grad_norm": 0.1321517378091812, + "learning_rate": 1.8274275330461035e-05, + "loss": 0.001, + "step": 6742 + }, + { + "epoch": 1.7790528953963856, + "grad_norm": 0.03632469102740288, + "learning_rate": 1.827310296315836e-05, + "loss": 0.0006, + "step": 6744 + }, + { + "epoch": 1.779580530273051, + "grad_norm": 0.15788309276103973, + "learning_rate": 1.8271930595855683e-05, + "loss": 0.0016, + "step": 6746 + }, + { + "epoch": 1.7801081651497164, + "grad_norm": 0.5645799040794373, + "learning_rate": 1.8270758228553006e-05, + "loss": 0.0061, + "step": 6748 + }, + { + "epoch": 1.7806358000263818, + "grad_norm": 0.14167168736457825, + "learning_rate": 1.8269585861250332e-05, + "loss": 0.0079, + "step": 6750 + }, + { + "epoch": 1.7811634349030472, + "grad_norm": 0.010547594167292118, + "learning_rate": 1.8268413493947655e-05, + "loss": 0.0005, + "step": 6752 + }, + { + "epoch": 1.7816910697797126, + "grad_norm": 0.06617714464664459, + "learning_rate": 1.826724112664498e-05, + "loss": 0.0005, + "step": 6754 + }, + { + "epoch": 1.7822187046563778, + "grad_norm": 0.028533602133393288, + "learning_rate": 1.8266068759342303e-05, + "loss": 0.0058, + "step": 6756 + }, + { + "epoch": 1.7827463395330432, + "grad_norm": 0.14960460364818573, + "learning_rate": 1.826489639203963e-05, + "loss": 0.0121, + "step": 6758 + }, + { + "epoch": 1.7832739744097084, + "grad_norm": 0.04719865322113037, + "learning_rate": 1.8263724024736952e-05, + "loss": 0.0004, + "step": 6760 + }, + { + "epoch": 1.7838016092863738, + "grad_norm": 0.019071372225880623, + "learning_rate": 1.8262551657434274e-05, + "loss": 0.0004, + "step": 6762 + }, + { + "epoch": 1.7843292441630392, + "grad_norm": 0.6327933669090271, + "learning_rate": 1.82613792901316e-05, + "loss": 0.0113, + "step": 6764 + }, + { + "epoch": 1.7848568790397046, + "grad_norm": 0.04834549129009247, + "learning_rate": 1.8260206922828926e-05, + "loss": 0.0016, + "step": 6766 + }, + { + "epoch": 1.78538451391637, + "grad_norm": 0.03147943690419197, + "learning_rate": 1.8259034555526246e-05, + "loss": 0.0037, + "step": 6768 + }, + { + "epoch": 1.7859121487930352, + "grad_norm": 0.14576388895511627, + "learning_rate": 1.825786218822357e-05, + "loss": 0.0086, + "step": 6770 + }, + { + "epoch": 1.7864397836697006, + "grad_norm": 0.01615813933312893, + "learning_rate": 1.8256689820920898e-05, + "loss": 0.0009, + "step": 6772 + }, + { + "epoch": 1.7869674185463658, + "grad_norm": 0.04726666584610939, + "learning_rate": 1.825551745361822e-05, + "loss": 0.0028, + "step": 6774 + }, + { + "epoch": 1.7874950534230312, + "grad_norm": 0.023256763815879822, + "learning_rate": 1.8254345086315543e-05, + "loss": 0.0008, + "step": 6776 + }, + { + "epoch": 1.7880226882996966, + "grad_norm": 0.4271971881389618, + "learning_rate": 1.825317271901287e-05, + "loss": 0.0036, + "step": 6778 + }, + { + "epoch": 1.788550323176362, + "grad_norm": 0.11530737578868866, + "learning_rate": 1.825200035171019e-05, + "loss": 0.0006, + "step": 6780 + }, + { + "epoch": 1.7890779580530274, + "grad_norm": 0.2193361520767212, + "learning_rate": 1.8250827984407514e-05, + "loss": 0.0013, + "step": 6782 + }, + { + "epoch": 1.7896055929296928, + "grad_norm": 0.014308559708297253, + "learning_rate": 1.824965561710484e-05, + "loss": 0.0091, + "step": 6784 + }, + { + "epoch": 1.790133227806358, + "grad_norm": 0.09522411972284317, + "learning_rate": 1.8248483249802166e-05, + "loss": 0.0009, + "step": 6786 + }, + { + "epoch": 1.7906608626830234, + "grad_norm": 0.010648383758962154, + "learning_rate": 1.824731088249949e-05, + "loss": 0.0005, + "step": 6788 + }, + { + "epoch": 1.7911884975596886, + "grad_norm": 0.09045738726854324, + "learning_rate": 1.824613851519681e-05, + "loss": 0.0018, + "step": 6790 + }, + { + "epoch": 1.791716132436354, + "grad_norm": 0.26487740874290466, + "learning_rate": 1.8244966147894137e-05, + "loss": 0.0047, + "step": 6792 + }, + { + "epoch": 1.7922437673130194, + "grad_norm": 0.2639865279197693, + "learning_rate": 1.824379378059146e-05, + "loss": 0.0039, + "step": 6794 + }, + { + "epoch": 1.7927714021896848, + "grad_norm": 0.035647373646497726, + "learning_rate": 1.8242621413288786e-05, + "loss": 0.0008, + "step": 6796 + }, + { + "epoch": 1.7932990370663502, + "grad_norm": 0.18093299865722656, + "learning_rate": 1.824144904598611e-05, + "loss": 0.0038, + "step": 6798 + }, + { + "epoch": 1.7938266719430156, + "grad_norm": 0.011242569424211979, + "learning_rate": 1.8240276678683434e-05, + "loss": 0.0005, + "step": 6800 + }, + { + "epoch": 1.7943543068196808, + "grad_norm": 0.16372032463550568, + "learning_rate": 1.8239104311380757e-05, + "loss": 0.0011, + "step": 6802 + }, + { + "epoch": 1.7948819416963462, + "grad_norm": 0.1611267775297165, + "learning_rate": 1.8237931944078083e-05, + "loss": 0.0029, + "step": 6804 + }, + { + "epoch": 1.7954095765730114, + "grad_norm": 0.006375662051141262, + "learning_rate": 1.8236759576775406e-05, + "loss": 0.0007, + "step": 6806 + }, + { + "epoch": 1.7959372114496768, + "grad_norm": 0.1907048225402832, + "learning_rate": 1.8235587209472728e-05, + "loss": 0.0021, + "step": 6808 + }, + { + "epoch": 1.7964648463263422, + "grad_norm": 0.010874224826693535, + "learning_rate": 1.8234414842170054e-05, + "loss": 0.0015, + "step": 6810 + }, + { + "epoch": 1.7969924812030076, + "grad_norm": 0.04357612133026123, + "learning_rate": 1.8233242474867377e-05, + "loss": 0.0015, + "step": 6812 + }, + { + "epoch": 1.797520116079673, + "grad_norm": 0.09300518780946732, + "learning_rate": 1.82320701075647e-05, + "loss": 0.0102, + "step": 6814 + }, + { + "epoch": 1.7980477509563382, + "grad_norm": 0.1868680715560913, + "learning_rate": 1.8230897740262025e-05, + "loss": 0.006, + "step": 6816 + }, + { + "epoch": 1.7985753858330036, + "grad_norm": 0.006914967205375433, + "learning_rate": 1.822972537295935e-05, + "loss": 0.0049, + "step": 6818 + }, + { + "epoch": 1.7991030207096688, + "grad_norm": 0.587604820728302, + "learning_rate": 1.8228553005656674e-05, + "loss": 0.0027, + "step": 6820 + }, + { + "epoch": 1.7996306555863342, + "grad_norm": 0.48509204387664795, + "learning_rate": 1.8227380638353997e-05, + "loss": 0.0053, + "step": 6822 + }, + { + "epoch": 1.8001582904629996, + "grad_norm": 0.6479736566543579, + "learning_rate": 1.8226208271051323e-05, + "loss": 0.0083, + "step": 6824 + }, + { + "epoch": 1.800685925339665, + "grad_norm": 0.015962205827236176, + "learning_rate": 1.8225035903748645e-05, + "loss": 0.0004, + "step": 6826 + }, + { + "epoch": 1.8012135602163304, + "grad_norm": 0.17311137914657593, + "learning_rate": 1.8223863536445968e-05, + "loss": 0.0061, + "step": 6828 + }, + { + "epoch": 1.8017411950929958, + "grad_norm": 0.03594956919550896, + "learning_rate": 1.8222691169143294e-05, + "loss": 0.0059, + "step": 6830 + }, + { + "epoch": 1.802268829969661, + "grad_norm": 0.2650473117828369, + "learning_rate": 1.822151880184062e-05, + "loss": 0.0071, + "step": 6832 + }, + { + "epoch": 1.8027964648463264, + "grad_norm": 0.07981809973716736, + "learning_rate": 1.8220346434537943e-05, + "loss": 0.0057, + "step": 6834 + }, + { + "epoch": 1.8033240997229916, + "grad_norm": 0.24360762536525726, + "learning_rate": 1.8219174067235265e-05, + "loss": 0.0079, + "step": 6836 + }, + { + "epoch": 1.803851734599657, + "grad_norm": 0.16078990697860718, + "learning_rate": 1.821800169993259e-05, + "loss": 0.0015, + "step": 6838 + }, + { + "epoch": 1.8043793694763224, + "grad_norm": 0.21431657671928406, + "learning_rate": 1.8216829332629914e-05, + "loss": 0.0056, + "step": 6840 + }, + { + "epoch": 1.8049070043529878, + "grad_norm": 0.06040050461888313, + "learning_rate": 1.8215656965327236e-05, + "loss": 0.0052, + "step": 6842 + }, + { + "epoch": 1.8054346392296532, + "grad_norm": 0.05081396922469139, + "learning_rate": 1.8214484598024562e-05, + "loss": 0.0006, + "step": 6844 + }, + { + "epoch": 1.8059622741063184, + "grad_norm": 0.13114970922470093, + "learning_rate": 1.821331223072189e-05, + "loss": 0.0019, + "step": 6846 + }, + { + "epoch": 1.8064899089829838, + "grad_norm": 0.022344762459397316, + "learning_rate": 1.821213986341921e-05, + "loss": 0.0005, + "step": 6848 + }, + { + "epoch": 1.807017543859649, + "grad_norm": 0.67961186170578, + "learning_rate": 1.8210967496116534e-05, + "loss": 0.0026, + "step": 6850 + }, + { + "epoch": 1.8075451787363144, + "grad_norm": 0.1975441724061966, + "learning_rate": 1.820979512881386e-05, + "loss": 0.0015, + "step": 6852 + }, + { + "epoch": 1.8080728136129798, + "grad_norm": 3.164761781692505, + "learning_rate": 1.8208622761511182e-05, + "loss": 0.0008, + "step": 6854 + }, + { + "epoch": 1.8086004484896452, + "grad_norm": 0.03595109283924103, + "learning_rate": 1.8207450394208508e-05, + "loss": 0.0042, + "step": 6856 + }, + { + "epoch": 1.8091280833663106, + "grad_norm": 0.07685098052024841, + "learning_rate": 1.820627802690583e-05, + "loss": 0.001, + "step": 6858 + }, + { + "epoch": 1.809655718242976, + "grad_norm": 0.42305469512939453, + "learning_rate": 1.8205105659603153e-05, + "loss": 0.0025, + "step": 6860 + }, + { + "epoch": 1.8101833531196412, + "grad_norm": 0.6038016676902771, + "learning_rate": 1.820393329230048e-05, + "loss": 0.005, + "step": 6862 + }, + { + "epoch": 1.8107109879963066, + "grad_norm": 0.009426327422261238, + "learning_rate": 1.8202760924997805e-05, + "loss": 0.0009, + "step": 6864 + }, + { + "epoch": 1.8112386228729718, + "grad_norm": 0.0045813312754035, + "learning_rate": 1.8201588557695128e-05, + "loss": 0.0003, + "step": 6866 + }, + { + "epoch": 1.8117662577496372, + "grad_norm": 0.03408043086528778, + "learning_rate": 1.820041619039245e-05, + "loss": 0.0005, + "step": 6868 + }, + { + "epoch": 1.8122938926263026, + "grad_norm": 0.4089607000350952, + "learning_rate": 1.8199243823089777e-05, + "loss": 0.0179, + "step": 6870 + }, + { + "epoch": 1.812821527502968, + "grad_norm": 0.6286260485649109, + "learning_rate": 1.81980714557871e-05, + "loss": 0.007, + "step": 6872 + }, + { + "epoch": 1.8133491623796334, + "grad_norm": 0.010273473337292671, + "learning_rate": 1.8196899088484422e-05, + "loss": 0.0003, + "step": 6874 + }, + { + "epoch": 1.8138767972562988, + "grad_norm": 0.011743769980967045, + "learning_rate": 1.8195726721181748e-05, + "loss": 0.0017, + "step": 6876 + }, + { + "epoch": 1.814404432132964, + "grad_norm": 0.06975103914737701, + "learning_rate": 1.8194554353879074e-05, + "loss": 0.0014, + "step": 6878 + }, + { + "epoch": 1.8149320670096294, + "grad_norm": 0.1616618037223816, + "learning_rate": 1.8193381986576396e-05, + "loss": 0.0119, + "step": 6880 + }, + { + "epoch": 1.8154597018862946, + "grad_norm": 0.4099276661872864, + "learning_rate": 1.819220961927372e-05, + "loss": 0.0031, + "step": 6882 + }, + { + "epoch": 1.81598733676296, + "grad_norm": 0.1004340872168541, + "learning_rate": 1.8191037251971045e-05, + "loss": 0.0012, + "step": 6884 + }, + { + "epoch": 1.8165149716396254, + "grad_norm": 0.23288467526435852, + "learning_rate": 1.8189864884668368e-05, + "loss": 0.0038, + "step": 6886 + }, + { + "epoch": 1.8170426065162908, + "grad_norm": 0.021967416629195213, + "learning_rate": 1.818869251736569e-05, + "loss": 0.0004, + "step": 6888 + }, + { + "epoch": 1.8175702413929562, + "grad_norm": 0.00945241004228592, + "learning_rate": 1.8187520150063016e-05, + "loss": 0.0006, + "step": 6890 + }, + { + "epoch": 1.8180978762696214, + "grad_norm": 0.008919056504964828, + "learning_rate": 1.8186347782760342e-05, + "loss": 0.0124, + "step": 6892 + }, + { + "epoch": 1.8186255111462868, + "grad_norm": 0.014181561768054962, + "learning_rate": 1.8185175415457665e-05, + "loss": 0.0004, + "step": 6894 + }, + { + "epoch": 1.819153146022952, + "grad_norm": 0.013785246759653091, + "learning_rate": 1.8184003048154987e-05, + "loss": 0.0004, + "step": 6896 + }, + { + "epoch": 1.8196807808996174, + "grad_norm": 0.011254502460360527, + "learning_rate": 1.8182830680852313e-05, + "loss": 0.0032, + "step": 6898 + }, + { + "epoch": 1.8202084157762828, + "grad_norm": 0.024370815604925156, + "learning_rate": 1.8181658313549636e-05, + "loss": 0.0005, + "step": 6900 + }, + { + "epoch": 1.8207360506529482, + "grad_norm": 0.16989441215991974, + "learning_rate": 1.818048594624696e-05, + "loss": 0.0019, + "step": 6902 + }, + { + "epoch": 1.8212636855296136, + "grad_norm": 0.2850690484046936, + "learning_rate": 1.8179313578944285e-05, + "loss": 0.0077, + "step": 6904 + }, + { + "epoch": 1.821791320406279, + "grad_norm": 0.0324210524559021, + "learning_rate": 1.8178141211641607e-05, + "loss": 0.0061, + "step": 6906 + }, + { + "epoch": 1.8223189552829442, + "grad_norm": 0.39608821272850037, + "learning_rate": 1.8176968844338933e-05, + "loss": 0.0043, + "step": 6908 + }, + { + "epoch": 1.8228465901596096, + "grad_norm": 0.19887489080429077, + "learning_rate": 1.8175796477036256e-05, + "loss": 0.013, + "step": 6910 + }, + { + "epoch": 1.8233742250362748, + "grad_norm": 0.017529500648379326, + "learning_rate": 1.8174624109733582e-05, + "loss": 0.0005, + "step": 6912 + }, + { + "epoch": 1.8239018599129402, + "grad_norm": 0.12959951162338257, + "learning_rate": 1.8173451742430905e-05, + "loss": 0.0118, + "step": 6914 + }, + { + "epoch": 1.8244294947896056, + "grad_norm": 0.5072873830795288, + "learning_rate": 1.817227937512823e-05, + "loss": 0.004, + "step": 6916 + }, + { + "epoch": 1.824957129666271, + "grad_norm": 0.022319907322525978, + "learning_rate": 1.8171107007825553e-05, + "loss": 0.0037, + "step": 6918 + }, + { + "epoch": 1.8254847645429364, + "grad_norm": 0.13644444942474365, + "learning_rate": 1.8169934640522876e-05, + "loss": 0.0018, + "step": 6920 + }, + { + "epoch": 1.8260123994196016, + "grad_norm": 0.10943406075239182, + "learning_rate": 1.8168762273220202e-05, + "loss": 0.0014, + "step": 6922 + }, + { + "epoch": 1.826540034296267, + "grad_norm": 0.010731354355812073, + "learning_rate": 1.8167589905917528e-05, + "loss": 0.0005, + "step": 6924 + }, + { + "epoch": 1.8270676691729322, + "grad_norm": 0.18362021446228027, + "learning_rate": 1.816641753861485e-05, + "loss": 0.0053, + "step": 6926 + }, + { + "epoch": 1.8275953040495976, + "grad_norm": 0.022186603397130966, + "learning_rate": 1.8165245171312173e-05, + "loss": 0.0007, + "step": 6928 + }, + { + "epoch": 1.828122938926263, + "grad_norm": 0.01574374921619892, + "learning_rate": 1.81640728040095e-05, + "loss": 0.0008, + "step": 6930 + }, + { + "epoch": 1.8286505738029284, + "grad_norm": 0.010588619858026505, + "learning_rate": 1.816290043670682e-05, + "loss": 0.0005, + "step": 6932 + }, + { + "epoch": 1.8291782086795938, + "grad_norm": 0.0908551961183548, + "learning_rate": 1.8161728069404144e-05, + "loss": 0.012, + "step": 6934 + }, + { + "epoch": 1.8297058435562592, + "grad_norm": 0.01477997750043869, + "learning_rate": 1.816055570210147e-05, + "loss": 0.0005, + "step": 6936 + }, + { + "epoch": 1.8302334784329244, + "grad_norm": 0.06503459811210632, + "learning_rate": 1.8159383334798796e-05, + "loss": 0.0006, + "step": 6938 + }, + { + "epoch": 1.8307611133095898, + "grad_norm": 0.44426488876342773, + "learning_rate": 1.815821096749612e-05, + "loss": 0.0088, + "step": 6940 + }, + { + "epoch": 1.831288748186255, + "grad_norm": 0.16117657721042633, + "learning_rate": 1.815703860019344e-05, + "loss": 0.0009, + "step": 6942 + }, + { + "epoch": 1.8318163830629204, + "grad_norm": 0.015637964010238647, + "learning_rate": 1.8155866232890767e-05, + "loss": 0.0004, + "step": 6944 + }, + { + "epoch": 1.8323440179395858, + "grad_norm": 0.06722085922956467, + "learning_rate": 1.815469386558809e-05, + "loss": 0.0005, + "step": 6946 + }, + { + "epoch": 1.8328716528162512, + "grad_norm": 0.11726824939250946, + "learning_rate": 1.8153521498285413e-05, + "loss": 0.0022, + "step": 6948 + }, + { + "epoch": 1.8333992876929166, + "grad_norm": 0.033471301198005676, + "learning_rate": 1.815234913098274e-05, + "loss": 0.0005, + "step": 6950 + }, + { + "epoch": 1.833926922569582, + "grad_norm": 0.3369801640510559, + "learning_rate": 1.815117676368006e-05, + "loss": 0.0022, + "step": 6952 + }, + { + "epoch": 1.8344545574462472, + "grad_norm": 0.12805591523647308, + "learning_rate": 1.8150004396377387e-05, + "loss": 0.0011, + "step": 6954 + }, + { + "epoch": 1.8349821923229126, + "grad_norm": 0.345303475856781, + "learning_rate": 1.814883202907471e-05, + "loss": 0.0011, + "step": 6956 + }, + { + "epoch": 1.8355098271995778, + "grad_norm": 0.006685962434858084, + "learning_rate": 1.8147659661772036e-05, + "loss": 0.0003, + "step": 6958 + }, + { + "epoch": 1.8360374620762432, + "grad_norm": 0.35644230246543884, + "learning_rate": 1.814648729446936e-05, + "loss": 0.0038, + "step": 6960 + }, + { + "epoch": 1.8365650969529086, + "grad_norm": 0.35807281732559204, + "learning_rate": 1.814531492716668e-05, + "loss": 0.0088, + "step": 6962 + }, + { + "epoch": 1.837092731829574, + "grad_norm": 0.4013753831386566, + "learning_rate": 1.8144142559864007e-05, + "loss": 0.0047, + "step": 6964 + }, + { + "epoch": 1.8376203667062394, + "grad_norm": 0.14724649488925934, + "learning_rate": 1.814297019256133e-05, + "loss": 0.0014, + "step": 6966 + }, + { + "epoch": 1.8381480015829046, + "grad_norm": 0.023847419768571854, + "learning_rate": 1.8141797825258656e-05, + "loss": 0.0004, + "step": 6968 + }, + { + "epoch": 1.83867563645957, + "grad_norm": 0.32796886563301086, + "learning_rate": 1.8140625457955978e-05, + "loss": 0.0022, + "step": 6970 + }, + { + "epoch": 1.8392032713362352, + "grad_norm": 0.32248014211654663, + "learning_rate": 1.8139453090653304e-05, + "loss": 0.0018, + "step": 6972 + }, + { + "epoch": 1.8397309062129006, + "grad_norm": 0.08439931273460388, + "learning_rate": 1.8138280723350627e-05, + "loss": 0.0006, + "step": 6974 + }, + { + "epoch": 1.840258541089566, + "grad_norm": 0.010320400819182396, + "learning_rate": 1.8137108356047953e-05, + "loss": 0.0027, + "step": 6976 + }, + { + "epoch": 1.8407861759662314, + "grad_norm": 0.12170366942882538, + "learning_rate": 1.8135935988745275e-05, + "loss": 0.0125, + "step": 6978 + }, + { + "epoch": 1.8413138108428968, + "grad_norm": 0.006993519142270088, + "learning_rate": 1.8134763621442598e-05, + "loss": 0.0002, + "step": 6980 + }, + { + "epoch": 1.8418414457195622, + "grad_norm": 0.005971113219857216, + "learning_rate": 1.8133591254139924e-05, + "loss": 0.0002, + "step": 6982 + }, + { + "epoch": 1.8423690805962274, + "grad_norm": 0.07958853244781494, + "learning_rate": 1.813241888683725e-05, + "loss": 0.002, + "step": 6984 + }, + { + "epoch": 1.8428967154728928, + "grad_norm": 0.005497721489518881, + "learning_rate": 1.8131246519534573e-05, + "loss": 0.0014, + "step": 6986 + }, + { + "epoch": 1.843424350349558, + "grad_norm": 0.0926695317029953, + "learning_rate": 1.8130074152231895e-05, + "loss": 0.0104, + "step": 6988 + }, + { + "epoch": 1.8439519852262234, + "grad_norm": 0.34864726662635803, + "learning_rate": 1.812890178492922e-05, + "loss": 0.0056, + "step": 6990 + }, + { + "epoch": 1.8444796201028888, + "grad_norm": 0.04201597720384598, + "learning_rate": 1.8127729417626544e-05, + "loss": 0.0048, + "step": 6992 + }, + { + "epoch": 1.8450072549795542, + "grad_norm": 0.22760024666786194, + "learning_rate": 1.8126557050323867e-05, + "loss": 0.0007, + "step": 6994 + }, + { + "epoch": 1.8455348898562196, + "grad_norm": 0.010212618857622147, + "learning_rate": 1.8125384683021193e-05, + "loss": 0.0019, + "step": 6996 + }, + { + "epoch": 1.8460625247328848, + "grad_norm": 0.011186582036316395, + "learning_rate": 1.8124212315718515e-05, + "loss": 0.0045, + "step": 6998 + }, + { + "epoch": 1.8465901596095502, + "grad_norm": 0.28672686219215393, + "learning_rate": 1.8123039948415838e-05, + "loss": 0.006, + "step": 7000 + }, + { + "epoch": 1.8471177944862154, + "grad_norm": 0.16144798696041107, + "learning_rate": 1.8121867581113164e-05, + "loss": 0.0024, + "step": 7002 + }, + { + "epoch": 1.8476454293628808, + "grad_norm": 0.26379677653312683, + "learning_rate": 1.812069521381049e-05, + "loss": 0.007, + "step": 7004 + }, + { + "epoch": 1.8481730642395462, + "grad_norm": 0.12319536507129669, + "learning_rate": 1.8119522846507812e-05, + "loss": 0.0011, + "step": 7006 + }, + { + "epoch": 1.8487006991162116, + "grad_norm": 0.04259222373366356, + "learning_rate": 1.8118350479205135e-05, + "loss": 0.0065, + "step": 7008 + }, + { + "epoch": 1.849228333992877, + "grad_norm": 0.38095083832740784, + "learning_rate": 1.811717811190246e-05, + "loss": 0.0046, + "step": 7010 + }, + { + "epoch": 1.8497559688695424, + "grad_norm": 0.04231986030936241, + "learning_rate": 1.8116005744599784e-05, + "loss": 0.0029, + "step": 7012 + }, + { + "epoch": 1.8502836037462076, + "grad_norm": 0.0030809177551418543, + "learning_rate": 1.811483337729711e-05, + "loss": 0.0003, + "step": 7014 + }, + { + "epoch": 1.850811238622873, + "grad_norm": 0.01778389886021614, + "learning_rate": 1.8113661009994432e-05, + "loss": 0.0003, + "step": 7016 + }, + { + "epoch": 1.8513388734995382, + "grad_norm": 0.02667192555963993, + "learning_rate": 1.8112488642691758e-05, + "loss": 0.0004, + "step": 7018 + }, + { + "epoch": 1.8518665083762036, + "grad_norm": 0.0357588529586792, + "learning_rate": 1.811131627538908e-05, + "loss": 0.0005, + "step": 7020 + }, + { + "epoch": 1.852394143252869, + "grad_norm": 0.007555139716714621, + "learning_rate": 1.8110143908086403e-05, + "loss": 0.0003, + "step": 7022 + }, + { + "epoch": 1.8529217781295344, + "grad_norm": 0.01046295277774334, + "learning_rate": 1.810897154078373e-05, + "loss": 0.0027, + "step": 7024 + }, + { + "epoch": 1.8534494130061998, + "grad_norm": 0.1500365287065506, + "learning_rate": 1.8107799173481052e-05, + "loss": 0.0036, + "step": 7026 + }, + { + "epoch": 1.8539770478828652, + "grad_norm": 0.02980682998895645, + "learning_rate": 1.8106626806178378e-05, + "loss": 0.0004, + "step": 7028 + }, + { + "epoch": 1.8545046827595304, + "grad_norm": 0.1583891063928604, + "learning_rate": 1.81054544388757e-05, + "loss": 0.0011, + "step": 7030 + }, + { + "epoch": 1.8550323176361958, + "grad_norm": 0.014865120872855186, + "learning_rate": 1.8104282071573027e-05, + "loss": 0.0049, + "step": 7032 + }, + { + "epoch": 1.855559952512861, + "grad_norm": 0.01237395964562893, + "learning_rate": 1.810310970427035e-05, + "loss": 0.0002, + "step": 7034 + }, + { + "epoch": 1.8560875873895264, + "grad_norm": 0.08708686381578445, + "learning_rate": 1.8101937336967675e-05, + "loss": 0.0115, + "step": 7036 + }, + { + "epoch": 1.8566152222661918, + "grad_norm": 0.01230090856552124, + "learning_rate": 1.8100764969664998e-05, + "loss": 0.0003, + "step": 7038 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.34664472937583923, + "learning_rate": 1.809959260236232e-05, + "loss": 0.005, + "step": 7040 + }, + { + "epoch": 1.8576704920195226, + "grad_norm": 0.007226404268294573, + "learning_rate": 1.8098420235059646e-05, + "loss": 0.0078, + "step": 7042 + }, + { + "epoch": 1.8581981268961878, + "grad_norm": 0.006759177893400192, + "learning_rate": 1.8097247867756972e-05, + "loss": 0.0027, + "step": 7044 + }, + { + "epoch": 1.8587257617728532, + "grad_norm": 0.0066576250828802586, + "learning_rate": 1.809607550045429e-05, + "loss": 0.0043, + "step": 7046 + }, + { + "epoch": 1.8592533966495184, + "grad_norm": 0.04919712245464325, + "learning_rate": 1.8094903133151618e-05, + "loss": 0.0017, + "step": 7048 + }, + { + "epoch": 1.8597810315261838, + "grad_norm": 0.10881973803043365, + "learning_rate": 1.8093730765848944e-05, + "loss": 0.0031, + "step": 7050 + }, + { + "epoch": 1.8603086664028492, + "grad_norm": 0.1521044671535492, + "learning_rate": 1.8092558398546266e-05, + "loss": 0.0026, + "step": 7052 + }, + { + "epoch": 1.8608363012795146, + "grad_norm": 1.1307224035263062, + "learning_rate": 1.809138603124359e-05, + "loss": 0.0056, + "step": 7054 + }, + { + "epoch": 1.86136393615618, + "grad_norm": 0.10302504897117615, + "learning_rate": 1.8090213663940915e-05, + "loss": 0.0006, + "step": 7056 + }, + { + "epoch": 1.8618915710328454, + "grad_norm": 0.008118889294564724, + "learning_rate": 1.8089041296638237e-05, + "loss": 0.0002, + "step": 7058 + }, + { + "epoch": 1.8624192059095106, + "grad_norm": 0.018293172121047974, + "learning_rate": 1.808786892933556e-05, + "loss": 0.0003, + "step": 7060 + }, + { + "epoch": 1.862946840786176, + "grad_norm": 0.004005722235888243, + "learning_rate": 1.8086696562032886e-05, + "loss": 0.0053, + "step": 7062 + }, + { + "epoch": 1.8634744756628412, + "grad_norm": 0.08297237753868103, + "learning_rate": 1.8085524194730212e-05, + "loss": 0.0006, + "step": 7064 + }, + { + "epoch": 1.8640021105395066, + "grad_norm": 0.016165204346179962, + "learning_rate": 1.8084351827427535e-05, + "loss": 0.0004, + "step": 7066 + }, + { + "epoch": 1.864529745416172, + "grad_norm": 0.021642865613102913, + "learning_rate": 1.8083179460124857e-05, + "loss": 0.0004, + "step": 7068 + }, + { + "epoch": 1.8650573802928374, + "grad_norm": 0.1450224667787552, + "learning_rate": 1.8082007092822183e-05, + "loss": 0.0011, + "step": 7070 + }, + { + "epoch": 1.8655850151695028, + "grad_norm": 0.03469037264585495, + "learning_rate": 1.8080834725519506e-05, + "loss": 0.0003, + "step": 7072 + }, + { + "epoch": 1.866112650046168, + "grad_norm": 0.3301146328449249, + "learning_rate": 1.8079662358216832e-05, + "loss": 0.0094, + "step": 7074 + }, + { + "epoch": 1.8666402849228334, + "grad_norm": 0.5115071535110474, + "learning_rate": 1.8078489990914154e-05, + "loss": 0.0052, + "step": 7076 + }, + { + "epoch": 1.8671679197994986, + "grad_norm": 0.1825496107339859, + "learning_rate": 1.807731762361148e-05, + "loss": 0.0038, + "step": 7078 + }, + { + "epoch": 1.867695554676164, + "grad_norm": 0.1600675731897354, + "learning_rate": 1.8076145256308803e-05, + "loss": 0.0147, + "step": 7080 + }, + { + "epoch": 1.8682231895528294, + "grad_norm": 0.07718092203140259, + "learning_rate": 1.8074972889006126e-05, + "loss": 0.0018, + "step": 7082 + }, + { + "epoch": 1.8687508244294948, + "grad_norm": 0.5116434097290039, + "learning_rate": 1.8073800521703452e-05, + "loss": 0.0042, + "step": 7084 + }, + { + "epoch": 1.8692784593061602, + "grad_norm": 0.2536894381046295, + "learning_rate": 1.8072628154400774e-05, + "loss": 0.0012, + "step": 7086 + }, + { + "epoch": 1.8698060941828256, + "grad_norm": 0.1283128559589386, + "learning_rate": 1.80714557870981e-05, + "loss": 0.0012, + "step": 7088 + }, + { + "epoch": 1.8703337290594908, + "grad_norm": 0.042905542999506, + "learning_rate": 1.8070283419795423e-05, + "loss": 0.0016, + "step": 7090 + }, + { + "epoch": 1.8708613639361562, + "grad_norm": 0.052285198122262955, + "learning_rate": 1.8069111052492746e-05, + "loss": 0.003, + "step": 7092 + }, + { + "epoch": 1.8713889988128214, + "grad_norm": 0.03684263303875923, + "learning_rate": 1.806793868519007e-05, + "loss": 0.0003, + "step": 7094 + }, + { + "epoch": 1.8719166336894868, + "grad_norm": 0.004486782941967249, + "learning_rate": 1.8066766317887398e-05, + "loss": 0.0004, + "step": 7096 + }, + { + "epoch": 1.8724442685661522, + "grad_norm": 0.009603680111467838, + "learning_rate": 1.806559395058472e-05, + "loss": 0.0008, + "step": 7098 + }, + { + "epoch": 1.8729719034428176, + "grad_norm": 0.02227553352713585, + "learning_rate": 1.8064421583282043e-05, + "loss": 0.0003, + "step": 7100 + }, + { + "epoch": 1.873499538319483, + "grad_norm": 0.10921172052621841, + "learning_rate": 1.806324921597937e-05, + "loss": 0.0066, + "step": 7102 + }, + { + "epoch": 1.8740271731961484, + "grad_norm": 0.0316980741918087, + "learning_rate": 1.806207684867669e-05, + "loss": 0.0061, + "step": 7104 + }, + { + "epoch": 1.8745548080728136, + "grad_norm": 0.006588486488908529, + "learning_rate": 1.8060904481374014e-05, + "loss": 0.0075, + "step": 7106 + }, + { + "epoch": 1.875082442949479, + "grad_norm": 0.24357470870018005, + "learning_rate": 1.805973211407134e-05, + "loss": 0.0038, + "step": 7108 + }, + { + "epoch": 1.8756100778261442, + "grad_norm": 4.469862937927246, + "learning_rate": 1.8058559746768666e-05, + "loss": 0.0009, + "step": 7110 + }, + { + "epoch": 1.8761377127028096, + "grad_norm": 0.21198910474777222, + "learning_rate": 1.805738737946599e-05, + "loss": 0.002, + "step": 7112 + }, + { + "epoch": 1.876665347579475, + "grad_norm": 0.021407533437013626, + "learning_rate": 1.805621501216331e-05, + "loss": 0.0006, + "step": 7114 + }, + { + "epoch": 1.8771929824561404, + "grad_norm": 0.02068307250738144, + "learning_rate": 1.8055042644860637e-05, + "loss": 0.0003, + "step": 7116 + }, + { + "epoch": 1.8777206173328058, + "grad_norm": 0.20435954630374908, + "learning_rate": 1.805387027755796e-05, + "loss": 0.0009, + "step": 7118 + }, + { + "epoch": 1.878248252209471, + "grad_norm": 0.013396043330430984, + "learning_rate": 1.8052697910255282e-05, + "loss": 0.0006, + "step": 7120 + }, + { + "epoch": 1.8787758870861364, + "grad_norm": 0.03222363069653511, + "learning_rate": 1.805152554295261e-05, + "loss": 0.0004, + "step": 7122 + }, + { + "epoch": 1.8793035219628016, + "grad_norm": 0.03240014612674713, + "learning_rate": 1.8050353175649934e-05, + "loss": 0.0004, + "step": 7124 + }, + { + "epoch": 1.879831156839467, + "grad_norm": 0.003021383658051491, + "learning_rate": 1.8049180808347257e-05, + "loss": 0.0002, + "step": 7126 + }, + { + "epoch": 1.8803587917161324, + "grad_norm": 0.003889300161972642, + "learning_rate": 1.804800844104458e-05, + "loss": 0.0002, + "step": 7128 + }, + { + "epoch": 1.8808864265927978, + "grad_norm": 0.022107020020484924, + "learning_rate": 1.8046836073741906e-05, + "loss": 0.0052, + "step": 7130 + }, + { + "epoch": 1.8814140614694632, + "grad_norm": 0.0050840540789067745, + "learning_rate": 1.8045663706439228e-05, + "loss": 0.0085, + "step": 7132 + }, + { + "epoch": 1.8819416963461286, + "grad_norm": 0.002366275293752551, + "learning_rate": 1.8044491339136554e-05, + "loss": 0.0002, + "step": 7134 + }, + { + "epoch": 1.8824693312227938, + "grad_norm": 0.003139687003567815, + "learning_rate": 1.8043318971833877e-05, + "loss": 0.0003, + "step": 7136 + }, + { + "epoch": 1.8829969660994592, + "grad_norm": 0.020776135846972466, + "learning_rate": 1.80421466045312e-05, + "loss": 0.0003, + "step": 7138 + }, + { + "epoch": 1.8835246009761244, + "grad_norm": 0.011611005291342735, + "learning_rate": 1.8040974237228525e-05, + "loss": 0.001, + "step": 7140 + }, + { + "epoch": 1.8840522358527898, + "grad_norm": 0.009785949252545834, + "learning_rate": 1.8039801869925848e-05, + "loss": 0.0064, + "step": 7142 + }, + { + "epoch": 1.8845798707294552, + "grad_norm": 0.38836464285850525, + "learning_rate": 1.8038629502623174e-05, + "loss": 0.0019, + "step": 7144 + }, + { + "epoch": 1.8851075056061206, + "grad_norm": 0.023253310471773148, + "learning_rate": 1.8037457135320497e-05, + "loss": 0.002, + "step": 7146 + }, + { + "epoch": 1.885635140482786, + "grad_norm": 0.012240994721651077, + "learning_rate": 1.8036284768017823e-05, + "loss": 0.0004, + "step": 7148 + }, + { + "epoch": 1.8861627753594512, + "grad_norm": 0.014823869802057743, + "learning_rate": 1.8035112400715145e-05, + "loss": 0.0041, + "step": 7150 + }, + { + "epoch": 1.8866904102361166, + "grad_norm": 0.013803822919726372, + "learning_rate": 1.8033940033412468e-05, + "loss": 0.0004, + "step": 7152 + }, + { + "epoch": 1.8872180451127818, + "grad_norm": 0.026179444044828415, + "learning_rate": 1.8032767666109794e-05, + "loss": 0.0004, + "step": 7154 + }, + { + "epoch": 1.8877456799894472, + "grad_norm": 0.18882253766059875, + "learning_rate": 1.803159529880712e-05, + "loss": 0.0019, + "step": 7156 + }, + { + "epoch": 1.8882733148661126, + "grad_norm": 0.012964013032615185, + "learning_rate": 1.8030422931504442e-05, + "loss": 0.0021, + "step": 7158 + }, + { + "epoch": 1.888800949742778, + "grad_norm": 0.25872802734375, + "learning_rate": 1.8029250564201765e-05, + "loss": 0.0161, + "step": 7160 + }, + { + "epoch": 1.8893285846194434, + "grad_norm": 0.187184140086174, + "learning_rate": 1.802807819689909e-05, + "loss": 0.0069, + "step": 7162 + }, + { + "epoch": 1.8898562194961088, + "grad_norm": 0.10768680274486542, + "learning_rate": 1.8026905829596414e-05, + "loss": 0.0182, + "step": 7164 + }, + { + "epoch": 1.890383854372774, + "grad_norm": 0.009697828441858292, + "learning_rate": 1.8025733462293736e-05, + "loss": 0.001, + "step": 7166 + }, + { + "epoch": 1.8909114892494394, + "grad_norm": 0.32593366503715515, + "learning_rate": 1.8024561094991062e-05, + "loss": 0.013, + "step": 7168 + }, + { + "epoch": 1.8914391241261046, + "grad_norm": 0.2141992747783661, + "learning_rate": 1.8023388727688388e-05, + "loss": 0.0068, + "step": 7170 + }, + { + "epoch": 1.89196675900277, + "grad_norm": 0.23333284258842468, + "learning_rate": 1.8022216360385708e-05, + "loss": 0.0055, + "step": 7172 + }, + { + "epoch": 1.8924943938794354, + "grad_norm": 0.10708301514387131, + "learning_rate": 1.8021043993083034e-05, + "loss": 0.001, + "step": 7174 + }, + { + "epoch": 1.8930220287561008, + "grad_norm": 0.05299970135092735, + "learning_rate": 1.801987162578036e-05, + "loss": 0.0006, + "step": 7176 + }, + { + "epoch": 1.8935496636327662, + "grad_norm": 0.05297483131289482, + "learning_rate": 1.8018699258477682e-05, + "loss": 0.0024, + "step": 7178 + }, + { + "epoch": 1.8940772985094316, + "grad_norm": 0.13516269624233246, + "learning_rate": 1.8017526891175005e-05, + "loss": 0.0094, + "step": 7180 + }, + { + "epoch": 1.8946049333860968, + "grad_norm": 0.030071739107370377, + "learning_rate": 1.801635452387233e-05, + "loss": 0.0091, + "step": 7182 + }, + { + "epoch": 1.8951325682627622, + "grad_norm": 0.04489728435873985, + "learning_rate": 1.8015182156569653e-05, + "loss": 0.0006, + "step": 7184 + }, + { + "epoch": 1.8956602031394274, + "grad_norm": 0.013912295922636986, + "learning_rate": 1.801400978926698e-05, + "loss": 0.0038, + "step": 7186 + }, + { + "epoch": 1.8961878380160928, + "grad_norm": 0.02499415911734104, + "learning_rate": 1.8012837421964302e-05, + "loss": 0.0025, + "step": 7188 + }, + { + "epoch": 1.8967154728927582, + "grad_norm": 0.23724079132080078, + "learning_rate": 1.8011665054661628e-05, + "loss": 0.0035, + "step": 7190 + }, + { + "epoch": 1.8972431077694236, + "grad_norm": 0.016383366659283638, + "learning_rate": 1.801049268735895e-05, + "loss": 0.0085, + "step": 7192 + }, + { + "epoch": 1.897770742646089, + "grad_norm": 0.32366663217544556, + "learning_rate": 1.8009320320056277e-05, + "loss": 0.0028, + "step": 7194 + }, + { + "epoch": 1.8982983775227542, + "grad_norm": 0.010187182575464249, + "learning_rate": 1.80081479527536e-05, + "loss": 0.0005, + "step": 7196 + }, + { + "epoch": 1.8988260123994196, + "grad_norm": 0.011455965228378773, + "learning_rate": 1.8006975585450922e-05, + "loss": 0.0006, + "step": 7198 + }, + { + "epoch": 1.8993536472760848, + "grad_norm": 0.0068322401493787766, + "learning_rate": 1.8005803218148248e-05, + "loss": 0.0043, + "step": 7200 + }, + { + "epoch": 1.8998812821527502, + "grad_norm": 0.18014657497406006, + "learning_rate": 1.8004630850845574e-05, + "loss": 0.0008, + "step": 7202 + }, + { + "epoch": 1.9004089170294156, + "grad_norm": 0.10777309536933899, + "learning_rate": 1.8003458483542896e-05, + "loss": 0.0007, + "step": 7204 + }, + { + "epoch": 1.900936551906081, + "grad_norm": 0.005596863105893135, + "learning_rate": 1.800228611624022e-05, + "loss": 0.0003, + "step": 7206 + }, + { + "epoch": 1.9014641867827464, + "grad_norm": 0.008813292719423771, + "learning_rate": 1.8001113748937545e-05, + "loss": 0.0003, + "step": 7208 + }, + { + "epoch": 1.9019918216594118, + "grad_norm": 0.09242135286331177, + "learning_rate": 1.7999941381634868e-05, + "loss": 0.0007, + "step": 7210 + }, + { + "epoch": 1.902519456536077, + "grad_norm": 0.008976446464657784, + "learning_rate": 1.799876901433219e-05, + "loss": 0.0011, + "step": 7212 + }, + { + "epoch": 1.9030470914127424, + "grad_norm": 0.5371431708335876, + "learning_rate": 1.7997596647029516e-05, + "loss": 0.0066, + "step": 7214 + }, + { + "epoch": 1.9035747262894076, + "grad_norm": 0.005459940526634455, + "learning_rate": 1.7996424279726842e-05, + "loss": 0.0015, + "step": 7216 + }, + { + "epoch": 1.904102361166073, + "grad_norm": 0.010271276347339153, + "learning_rate": 1.7995251912424165e-05, + "loss": 0.0007, + "step": 7218 + }, + { + "epoch": 1.9046299960427384, + "grad_norm": 0.061644092202186584, + "learning_rate": 1.7994079545121487e-05, + "loss": 0.0075, + "step": 7220 + }, + { + "epoch": 1.9051576309194038, + "grad_norm": 0.17367304861545563, + "learning_rate": 1.7992907177818813e-05, + "loss": 0.0057, + "step": 7222 + }, + { + "epoch": 1.9056852657960692, + "grad_norm": 0.10240955650806427, + "learning_rate": 1.7991734810516136e-05, + "loss": 0.0013, + "step": 7224 + }, + { + "epoch": 1.9062129006727346, + "grad_norm": 0.0722396969795227, + "learning_rate": 1.799056244321346e-05, + "loss": 0.0013, + "step": 7226 + }, + { + "epoch": 1.9067405355493998, + "grad_norm": 0.07041079550981522, + "learning_rate": 1.7989390075910785e-05, + "loss": 0.0026, + "step": 7228 + }, + { + "epoch": 1.907268170426065, + "grad_norm": 0.013208535499870777, + "learning_rate": 1.7988217708608107e-05, + "loss": 0.0003, + "step": 7230 + }, + { + "epoch": 1.9077958053027304, + "grad_norm": 0.3690620958805084, + "learning_rate": 1.7987045341305433e-05, + "loss": 0.0039, + "step": 7232 + }, + { + "epoch": 1.9083234401793958, + "grad_norm": 0.005696623586118221, + "learning_rate": 1.7985872974002756e-05, + "loss": 0.0002, + "step": 7234 + }, + { + "epoch": 1.9088510750560612, + "grad_norm": 0.026776023209095, + "learning_rate": 1.7984700606700082e-05, + "loss": 0.0005, + "step": 7236 + }, + { + "epoch": 1.9093787099327266, + "grad_norm": 0.10709245502948761, + "learning_rate": 1.7983528239397404e-05, + "loss": 0.0078, + "step": 7238 + }, + { + "epoch": 1.909906344809392, + "grad_norm": 0.16676399111747742, + "learning_rate": 1.7982355872094727e-05, + "loss": 0.0013, + "step": 7240 + }, + { + "epoch": 1.9104339796860572, + "grad_norm": 1.2264479398727417, + "learning_rate": 1.7981183504792053e-05, + "loss": 0.0015, + "step": 7242 + }, + { + "epoch": 1.9109616145627226, + "grad_norm": 0.03442002832889557, + "learning_rate": 1.7980011137489376e-05, + "loss": 0.0004, + "step": 7244 + }, + { + "epoch": 1.9114892494393878, + "grad_norm": 0.034007325768470764, + "learning_rate": 1.79788387701867e-05, + "loss": 0.0005, + "step": 7246 + }, + { + "epoch": 1.9120168843160532, + "grad_norm": 0.11994950473308563, + "learning_rate": 1.7977666402884024e-05, + "loss": 0.0093, + "step": 7248 + }, + { + "epoch": 1.9125445191927186, + "grad_norm": 0.41425490379333496, + "learning_rate": 1.797649403558135e-05, + "loss": 0.0032, + "step": 7250 + }, + { + "epoch": 1.913072154069384, + "grad_norm": 0.07744068652391434, + "learning_rate": 1.7975321668278673e-05, + "loss": 0.0005, + "step": 7252 + }, + { + "epoch": 1.9135997889460494, + "grad_norm": 0.006101813167333603, + "learning_rate": 1.7974149300976e-05, + "loss": 0.0017, + "step": 7254 + }, + { + "epoch": 1.9141274238227148, + "grad_norm": 0.036061033606529236, + "learning_rate": 1.797297693367332e-05, + "loss": 0.0005, + "step": 7256 + }, + { + "epoch": 1.91465505869938, + "grad_norm": 0.012624194845557213, + "learning_rate": 1.7971804566370644e-05, + "loss": 0.0003, + "step": 7258 + }, + { + "epoch": 1.9151826935760454, + "grad_norm": 0.17346535623073578, + "learning_rate": 1.797063219906797e-05, + "loss": 0.0078, + "step": 7260 + }, + { + "epoch": 1.9157103284527106, + "grad_norm": 0.015814831480383873, + "learning_rate": 1.7969459831765296e-05, + "loss": 0.0009, + "step": 7262 + }, + { + "epoch": 1.916237963329376, + "grad_norm": 0.00825114082545042, + "learning_rate": 1.796828746446262e-05, + "loss": 0.0003, + "step": 7264 + }, + { + "epoch": 1.9167655982060414, + "grad_norm": 0.21433797478675842, + "learning_rate": 1.796711509715994e-05, + "loss": 0.0053, + "step": 7266 + }, + { + "epoch": 1.9172932330827068, + "grad_norm": 0.023445971310138702, + "learning_rate": 1.7965942729857267e-05, + "loss": 0.0015, + "step": 7268 + }, + { + "epoch": 1.9178208679593722, + "grad_norm": 0.009142247959971428, + "learning_rate": 1.796477036255459e-05, + "loss": 0.0004, + "step": 7270 + }, + { + "epoch": 1.9183485028360374, + "grad_norm": 0.01765298843383789, + "learning_rate": 1.7963597995251913e-05, + "loss": 0.0003, + "step": 7272 + }, + { + "epoch": 1.9188761377127028, + "grad_norm": 0.006737933959811926, + "learning_rate": 1.796242562794924e-05, + "loss": 0.002, + "step": 7274 + }, + { + "epoch": 1.919403772589368, + "grad_norm": 0.020784657448530197, + "learning_rate": 1.796125326064656e-05, + "loss": 0.0003, + "step": 7276 + }, + { + "epoch": 1.9199314074660334, + "grad_norm": 0.10858873277902603, + "learning_rate": 1.7960080893343884e-05, + "loss": 0.0006, + "step": 7278 + }, + { + "epoch": 1.9204590423426988, + "grad_norm": 0.0025044241920113564, + "learning_rate": 1.795890852604121e-05, + "loss": 0.0002, + "step": 7280 + }, + { + "epoch": 1.9209866772193642, + "grad_norm": 0.009939394891262054, + "learning_rate": 1.7957736158738536e-05, + "loss": 0.0004, + "step": 7282 + }, + { + "epoch": 1.9215143120960296, + "grad_norm": 0.08425923436880112, + "learning_rate": 1.795656379143586e-05, + "loss": 0.0002, + "step": 7284 + }, + { + "epoch": 1.922041946972695, + "grad_norm": 0.005841344129294157, + "learning_rate": 1.795539142413318e-05, + "loss": 0.0008, + "step": 7286 + }, + { + "epoch": 1.9225695818493602, + "grad_norm": 0.009416710585355759, + "learning_rate": 1.7954219056830507e-05, + "loss": 0.013, + "step": 7288 + }, + { + "epoch": 1.9230972167260256, + "grad_norm": 0.015483101829886436, + "learning_rate": 1.795304668952783e-05, + "loss": 0.0002, + "step": 7290 + }, + { + "epoch": 1.9236248516026908, + "grad_norm": 0.0020029209554195404, + "learning_rate": 1.7951874322225156e-05, + "loss": 0.0001, + "step": 7292 + }, + { + "epoch": 1.9241524864793562, + "grad_norm": 0.0022536455653607845, + "learning_rate": 1.7950701954922478e-05, + "loss": 0.0002, + "step": 7294 + }, + { + "epoch": 1.9246801213560216, + "grad_norm": 0.023357493802905083, + "learning_rate": 1.7949529587619804e-05, + "loss": 0.0004, + "step": 7296 + }, + { + "epoch": 1.925207756232687, + "grad_norm": 0.007009007968008518, + "learning_rate": 1.7948357220317127e-05, + "loss": 0.0007, + "step": 7298 + }, + { + "epoch": 1.9257353911093524, + "grad_norm": 0.0027729100547730923, + "learning_rate": 1.794718485301445e-05, + "loss": 0.0084, + "step": 7300 + }, + { + "epoch": 1.9262630259860178, + "grad_norm": 0.27032706141471863, + "learning_rate": 1.7946012485711775e-05, + "loss": 0.0032, + "step": 7302 + }, + { + "epoch": 1.926790660862683, + "grad_norm": 0.006529037840664387, + "learning_rate": 1.7944840118409098e-05, + "loss": 0.0002, + "step": 7304 + }, + { + "epoch": 1.9273182957393482, + "grad_norm": 0.05289405584335327, + "learning_rate": 1.7943667751106424e-05, + "loss": 0.0111, + "step": 7306 + }, + { + "epoch": 1.9278459306160136, + "grad_norm": 0.35621315240859985, + "learning_rate": 1.7942495383803747e-05, + "loss": 0.0085, + "step": 7308 + }, + { + "epoch": 1.928373565492679, + "grad_norm": 0.004067050758749247, + "learning_rate": 1.7941323016501073e-05, + "loss": 0.0003, + "step": 7310 + }, + { + "epoch": 1.9289012003693444, + "grad_norm": 0.16044777631759644, + "learning_rate": 1.7940150649198395e-05, + "loss": 0.0048, + "step": 7312 + }, + { + "epoch": 1.9294288352460098, + "grad_norm": 0.03214329108595848, + "learning_rate": 1.793897828189572e-05, + "loss": 0.0003, + "step": 7314 + }, + { + "epoch": 1.9299564701226752, + "grad_norm": 0.014187702909111977, + "learning_rate": 1.7937805914593044e-05, + "loss": 0.0003, + "step": 7316 + }, + { + "epoch": 1.9304841049993404, + "grad_norm": 0.25511255860328674, + "learning_rate": 1.7936633547290366e-05, + "loss": 0.002, + "step": 7318 + }, + { + "epoch": 1.9310117398760058, + "grad_norm": 0.015508106909692287, + "learning_rate": 1.7935461179987692e-05, + "loss": 0.0002, + "step": 7320 + }, + { + "epoch": 1.931539374752671, + "grad_norm": 0.05145285278558731, + "learning_rate": 1.7934288812685015e-05, + "loss": 0.0011, + "step": 7322 + }, + { + "epoch": 1.9320670096293364, + "grad_norm": 0.19340483844280243, + "learning_rate": 1.7933116445382338e-05, + "loss": 0.0039, + "step": 7324 + }, + { + "epoch": 1.9325946445060018, + "grad_norm": 0.011896051466464996, + "learning_rate": 1.7931944078079664e-05, + "loss": 0.0002, + "step": 7326 + }, + { + "epoch": 1.9331222793826672, + "grad_norm": 0.003177223028615117, + "learning_rate": 1.793077171077699e-05, + "loss": 0.0004, + "step": 7328 + }, + { + "epoch": 1.9336499142593326, + "grad_norm": 0.024813929572701454, + "learning_rate": 1.7929599343474312e-05, + "loss": 0.0014, + "step": 7330 + }, + { + "epoch": 1.934177549135998, + "grad_norm": 0.02828967571258545, + "learning_rate": 1.7928426976171635e-05, + "loss": 0.0018, + "step": 7332 + }, + { + "epoch": 1.9347051840126632, + "grad_norm": 0.37448039650917053, + "learning_rate": 1.792725460886896e-05, + "loss": 0.0078, + "step": 7334 + }, + { + "epoch": 1.9352328188893286, + "grad_norm": 0.07077666372060776, + "learning_rate": 1.7926082241566283e-05, + "loss": 0.0007, + "step": 7336 + }, + { + "epoch": 1.9357604537659938, + "grad_norm": 0.013985364697873592, + "learning_rate": 1.7924909874263606e-05, + "loss": 0.0006, + "step": 7338 + }, + { + "epoch": 1.9362880886426592, + "grad_norm": 0.12837940454483032, + "learning_rate": 1.7923737506960932e-05, + "loss": 0.0038, + "step": 7340 + }, + { + "epoch": 1.9368157235193246, + "grad_norm": 0.10209247469902039, + "learning_rate": 1.7922565139658258e-05, + "loss": 0.0008, + "step": 7342 + }, + { + "epoch": 1.93734335839599, + "grad_norm": 0.004957541823387146, + "learning_rate": 1.792139277235558e-05, + "loss": 0.0004, + "step": 7344 + }, + { + "epoch": 1.9378709932726554, + "grad_norm": 0.011156374588608742, + "learning_rate": 1.7920220405052903e-05, + "loss": 0.0003, + "step": 7346 + }, + { + "epoch": 1.9383986281493206, + "grad_norm": 0.016495725139975548, + "learning_rate": 1.791904803775023e-05, + "loss": 0.0021, + "step": 7348 + }, + { + "epoch": 1.938926263025986, + "grad_norm": 0.03676951304078102, + "learning_rate": 1.7917875670447552e-05, + "loss": 0.003, + "step": 7350 + }, + { + "epoch": 1.9394538979026512, + "grad_norm": 0.041524097323417664, + "learning_rate": 1.7916703303144878e-05, + "loss": 0.0004, + "step": 7352 + }, + { + "epoch": 1.9399815327793166, + "grad_norm": 0.554339587688446, + "learning_rate": 1.79155309358422e-05, + "loss": 0.0045, + "step": 7354 + }, + { + "epoch": 1.940509167655982, + "grad_norm": 0.1485471874475479, + "learning_rate": 1.7914358568539527e-05, + "loss": 0.003, + "step": 7356 + }, + { + "epoch": 1.9410368025326474, + "grad_norm": 0.051429230719804764, + "learning_rate": 1.791318620123685e-05, + "loss": 0.0003, + "step": 7358 + }, + { + "epoch": 1.9415644374093128, + "grad_norm": 0.01328987069427967, + "learning_rate": 1.7912013833934172e-05, + "loss": 0.0056, + "step": 7360 + }, + { + "epoch": 1.9420920722859782, + "grad_norm": 0.3321481943130493, + "learning_rate": 1.7910841466631498e-05, + "loss": 0.0017, + "step": 7362 + }, + { + "epoch": 1.9426197071626434, + "grad_norm": 0.0299510695040226, + "learning_rate": 1.790966909932882e-05, + "loss": 0.0008, + "step": 7364 + }, + { + "epoch": 1.9431473420393088, + "grad_norm": 0.1129213497042656, + "learning_rate": 1.7908496732026146e-05, + "loss": 0.0101, + "step": 7366 + }, + { + "epoch": 1.943674976915974, + "grad_norm": 0.12228289246559143, + "learning_rate": 1.790732436472347e-05, + "loss": 0.0009, + "step": 7368 + }, + { + "epoch": 1.9442026117926394, + "grad_norm": 0.06976354122161865, + "learning_rate": 1.790615199742079e-05, + "loss": 0.0008, + "step": 7370 + }, + { + "epoch": 1.9447302466693048, + "grad_norm": 0.15563832223415375, + "learning_rate": 1.7904979630118118e-05, + "loss": 0.0088, + "step": 7372 + }, + { + "epoch": 1.9452578815459702, + "grad_norm": 0.2845628261566162, + "learning_rate": 1.7903807262815444e-05, + "loss": 0.0045, + "step": 7374 + }, + { + "epoch": 1.9457855164226356, + "grad_norm": 0.01980929635465145, + "learning_rate": 1.7902634895512766e-05, + "loss": 0.0003, + "step": 7376 + }, + { + "epoch": 1.946313151299301, + "grad_norm": 0.010176066309213638, + "learning_rate": 1.790146252821009e-05, + "loss": 0.0003, + "step": 7378 + }, + { + "epoch": 1.9468407861759662, + "grad_norm": 0.006474805995821953, + "learning_rate": 1.7900290160907415e-05, + "loss": 0.0002, + "step": 7380 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 0.006936275400221348, + "learning_rate": 1.7899117793604737e-05, + "loss": 0.0007, + "step": 7382 + }, + { + "epoch": 1.9478960559292968, + "grad_norm": 0.006414617411792278, + "learning_rate": 1.789794542630206e-05, + "loss": 0.0097, + "step": 7384 + }, + { + "epoch": 1.9484236908059622, + "grad_norm": 0.007175194099545479, + "learning_rate": 1.7896773058999386e-05, + "loss": 0.0003, + "step": 7386 + }, + { + "epoch": 1.9489513256826276, + "grad_norm": 0.11519164592027664, + "learning_rate": 1.7895600691696712e-05, + "loss": 0.0007, + "step": 7388 + }, + { + "epoch": 1.949478960559293, + "grad_norm": 0.010443245992064476, + "learning_rate": 1.7894428324394035e-05, + "loss": 0.0004, + "step": 7390 + }, + { + "epoch": 1.9500065954359584, + "grad_norm": 0.030347038060426712, + "learning_rate": 1.7893255957091357e-05, + "loss": 0.0038, + "step": 7392 + }, + { + "epoch": 1.9505342303126236, + "grad_norm": 0.0034457622095942497, + "learning_rate": 1.7892083589788683e-05, + "loss": 0.0002, + "step": 7394 + }, + { + "epoch": 1.951061865189289, + "grad_norm": 0.021227363497018814, + "learning_rate": 1.7890911222486006e-05, + "loss": 0.0003, + "step": 7396 + }, + { + "epoch": 1.9515895000659542, + "grad_norm": 0.6311706900596619, + "learning_rate": 1.788973885518333e-05, + "loss": 0.0025, + "step": 7398 + }, + { + "epoch": 1.9521171349426196, + "grad_norm": 0.02689104713499546, + "learning_rate": 1.7888566487880654e-05, + "loss": 0.0002, + "step": 7400 + }, + { + "epoch": 1.952644769819285, + "grad_norm": 0.0036804943811148405, + "learning_rate": 1.788739412057798e-05, + "loss": 0.0003, + "step": 7402 + }, + { + "epoch": 1.9531724046959504, + "grad_norm": 0.19550062716007233, + "learning_rate": 1.7886221753275303e-05, + "loss": 0.0005, + "step": 7404 + }, + { + "epoch": 1.9537000395726158, + "grad_norm": 0.21074849367141724, + "learning_rate": 1.7885049385972626e-05, + "loss": 0.0008, + "step": 7406 + }, + { + "epoch": 1.9542276744492812, + "grad_norm": 0.3534616231918335, + "learning_rate": 1.788387701866995e-05, + "loss": 0.0027, + "step": 7408 + }, + { + "epoch": 1.9547553093259464, + "grad_norm": 0.12438654154539108, + "learning_rate": 1.7882704651367274e-05, + "loss": 0.0008, + "step": 7410 + }, + { + "epoch": 1.9552829442026118, + "grad_norm": 0.07868499308824539, + "learning_rate": 1.78815322840646e-05, + "loss": 0.0034, + "step": 7412 + }, + { + "epoch": 1.955810579079277, + "grad_norm": 0.20892778038978577, + "learning_rate": 1.7880359916761923e-05, + "loss": 0.0032, + "step": 7414 + }, + { + "epoch": 1.9563382139559424, + "grad_norm": 0.005494598764926195, + "learning_rate": 1.7879187549459245e-05, + "loss": 0.0013, + "step": 7416 + }, + { + "epoch": 1.9568658488326078, + "grad_norm": 0.004491070285439491, + "learning_rate": 1.787801518215657e-05, + "loss": 0.0007, + "step": 7418 + }, + { + "epoch": 1.9573934837092732, + "grad_norm": 0.04194070026278496, + "learning_rate": 1.7876842814853894e-05, + "loss": 0.0045, + "step": 7420 + }, + { + "epoch": 1.9579211185859386, + "grad_norm": 0.21143831312656403, + "learning_rate": 1.787567044755122e-05, + "loss": 0.0077, + "step": 7422 + }, + { + "epoch": 1.9584487534626038, + "grad_norm": 0.09338465332984924, + "learning_rate": 1.7874498080248543e-05, + "loss": 0.0003, + "step": 7424 + }, + { + "epoch": 1.9589763883392692, + "grad_norm": 0.012464006431400776, + "learning_rate": 1.787332571294587e-05, + "loss": 0.0004, + "step": 7426 + }, + { + "epoch": 1.9595040232159344, + "grad_norm": 0.044137414544820786, + "learning_rate": 1.787215334564319e-05, + "loss": 0.0007, + "step": 7428 + }, + { + "epoch": 1.9600316580925998, + "grad_norm": 0.2985957860946655, + "learning_rate": 1.7870980978340514e-05, + "loss": 0.0082, + "step": 7430 + }, + { + "epoch": 1.9605592929692652, + "grad_norm": 0.06288174539804459, + "learning_rate": 1.786980861103784e-05, + "loss": 0.0059, + "step": 7432 + }, + { + "epoch": 1.9610869278459306, + "grad_norm": 0.263553649187088, + "learning_rate": 1.7868636243735166e-05, + "loss": 0.0038, + "step": 7434 + }, + { + "epoch": 1.961614562722596, + "grad_norm": 0.07399009168148041, + "learning_rate": 1.786746387643249e-05, + "loss": 0.0005, + "step": 7436 + }, + { + "epoch": 1.9621421975992615, + "grad_norm": 0.004276069812476635, + "learning_rate": 1.786629150912981e-05, + "loss": 0.0002, + "step": 7438 + }, + { + "epoch": 1.9626698324759266, + "grad_norm": 0.19198264181613922, + "learning_rate": 1.7865119141827137e-05, + "loss": 0.004, + "step": 7440 + }, + { + "epoch": 1.963197467352592, + "grad_norm": 0.010070943273603916, + "learning_rate": 1.786394677452446e-05, + "loss": 0.0005, + "step": 7442 + }, + { + "epoch": 1.9637251022292572, + "grad_norm": 0.26448243856430054, + "learning_rate": 1.7862774407221782e-05, + "loss": 0.0018, + "step": 7444 + }, + { + "epoch": 1.9642527371059226, + "grad_norm": 0.02397056482732296, + "learning_rate": 1.786160203991911e-05, + "loss": 0.0003, + "step": 7446 + }, + { + "epoch": 1.964780371982588, + "grad_norm": 0.014365466311573982, + "learning_rate": 1.7860429672616434e-05, + "loss": 0.0023, + "step": 7448 + }, + { + "epoch": 1.9653080068592534, + "grad_norm": 0.0072662788443267345, + "learning_rate": 1.7859257305313754e-05, + "loss": 0.0006, + "step": 7450 + }, + { + "epoch": 1.9658356417359188, + "grad_norm": 0.18923693895339966, + "learning_rate": 1.785808493801108e-05, + "loss": 0.0051, + "step": 7452 + }, + { + "epoch": 1.9663632766125843, + "grad_norm": 0.2360461950302124, + "learning_rate": 1.7856912570708406e-05, + "loss": 0.0013, + "step": 7454 + }, + { + "epoch": 1.9668909114892494, + "grad_norm": 0.2082657366991043, + "learning_rate": 1.7855740203405728e-05, + "loss": 0.0087, + "step": 7456 + }, + { + "epoch": 1.9674185463659146, + "grad_norm": 0.7068542242050171, + "learning_rate": 1.785456783610305e-05, + "loss": 0.009, + "step": 7458 + }, + { + "epoch": 1.96794618124258, + "grad_norm": 0.07699517160654068, + "learning_rate": 1.7853395468800377e-05, + "loss": 0.0009, + "step": 7460 + }, + { + "epoch": 1.9684738161192454, + "grad_norm": 0.3645283281803131, + "learning_rate": 1.78522231014977e-05, + "loss": 0.0037, + "step": 7462 + }, + { + "epoch": 1.9690014509959108, + "grad_norm": 0.5805326700210571, + "learning_rate": 1.7851050734195025e-05, + "loss": 0.0101, + "step": 7464 + }, + { + "epoch": 1.9695290858725762, + "grad_norm": 0.26085513830184937, + "learning_rate": 1.7849878366892348e-05, + "loss": 0.0212, + "step": 7466 + }, + { + "epoch": 1.9700567207492417, + "grad_norm": 0.03596356883645058, + "learning_rate": 1.7848705999589674e-05, + "loss": 0.0005, + "step": 7468 + }, + { + "epoch": 1.9705843556259068, + "grad_norm": 0.33648720383644104, + "learning_rate": 1.7847533632286997e-05, + "loss": 0.0102, + "step": 7470 + }, + { + "epoch": 1.9711119905025722, + "grad_norm": 0.020951127633452415, + "learning_rate": 1.7846361264984323e-05, + "loss": 0.0009, + "step": 7472 + }, + { + "epoch": 1.9716396253792374, + "grad_norm": 0.05152933672070503, + "learning_rate": 1.7845188897681645e-05, + "loss": 0.001, + "step": 7474 + }, + { + "epoch": 1.9721672602559028, + "grad_norm": 0.9259939193725586, + "learning_rate": 1.7844016530378968e-05, + "loss": 0.0037, + "step": 7476 + }, + { + "epoch": 1.9726948951325682, + "grad_norm": 0.35882171988487244, + "learning_rate": 1.7842844163076294e-05, + "loss": 0.0034, + "step": 7478 + }, + { + "epoch": 1.9732225300092336, + "grad_norm": 0.20324426889419556, + "learning_rate": 1.7841671795773616e-05, + "loss": 0.0014, + "step": 7480 + }, + { + "epoch": 1.973750164885899, + "grad_norm": 0.016597725450992584, + "learning_rate": 1.7840499428470942e-05, + "loss": 0.0009, + "step": 7482 + }, + { + "epoch": 1.9742777997625645, + "grad_norm": 0.016183961182832718, + "learning_rate": 1.7839327061168265e-05, + "loss": 0.0027, + "step": 7484 + }, + { + "epoch": 1.9748054346392296, + "grad_norm": 0.41295620799064636, + "learning_rate": 1.783815469386559e-05, + "loss": 0.0127, + "step": 7486 + }, + { + "epoch": 1.975333069515895, + "grad_norm": 0.024658149108290672, + "learning_rate": 1.7836982326562914e-05, + "loss": 0.0006, + "step": 7488 + }, + { + "epoch": 1.9758607043925602, + "grad_norm": 0.16167303919792175, + "learning_rate": 1.7835809959260236e-05, + "loss": 0.0021, + "step": 7490 + }, + { + "epoch": 1.9763883392692256, + "grad_norm": 0.03208820894360542, + "learning_rate": 1.7834637591957562e-05, + "loss": 0.0105, + "step": 7492 + }, + { + "epoch": 1.976915974145891, + "grad_norm": 0.061177633702754974, + "learning_rate": 1.7833465224654888e-05, + "loss": 0.0056, + "step": 7494 + }, + { + "epoch": 1.9774436090225564, + "grad_norm": 0.11590898036956787, + "learning_rate": 1.7832292857352207e-05, + "loss": 0.0033, + "step": 7496 + }, + { + "epoch": 1.9779712438992219, + "grad_norm": 0.2554647922515869, + "learning_rate": 1.7831120490049533e-05, + "loss": 0.0043, + "step": 7498 + }, + { + "epoch": 1.978498878775887, + "grad_norm": 0.009872869588434696, + "learning_rate": 1.782994812274686e-05, + "loss": 0.0004, + "step": 7500 + }, + { + "epoch": 1.9790265136525524, + "grad_norm": 0.016372084617614746, + "learning_rate": 1.7828775755444182e-05, + "loss": 0.0009, + "step": 7502 + }, + { + "epoch": 1.9795541485292176, + "grad_norm": 0.4275522232055664, + "learning_rate": 1.7827603388141505e-05, + "loss": 0.0012, + "step": 7504 + }, + { + "epoch": 1.980081783405883, + "grad_norm": 0.009024654515087605, + "learning_rate": 1.782643102083883e-05, + "loss": 0.0006, + "step": 7506 + }, + { + "epoch": 1.9806094182825484, + "grad_norm": 0.2415744811296463, + "learning_rate": 1.7825258653536153e-05, + "loss": 0.0021, + "step": 7508 + }, + { + "epoch": 1.9811370531592138, + "grad_norm": 0.008059966377913952, + "learning_rate": 1.7824086286233476e-05, + "loss": 0.0003, + "step": 7510 + }, + { + "epoch": 1.9816646880358793, + "grad_norm": 0.03965466096997261, + "learning_rate": 1.7822913918930802e-05, + "loss": 0.0005, + "step": 7512 + }, + { + "epoch": 1.9821923229125447, + "grad_norm": 0.059111129492521286, + "learning_rate": 1.7821741551628128e-05, + "loss": 0.0004, + "step": 7514 + }, + { + "epoch": 1.9827199577892098, + "grad_norm": 0.06316863745450974, + "learning_rate": 1.782056918432545e-05, + "loss": 0.0009, + "step": 7516 + }, + { + "epoch": 1.9832475926658752, + "grad_norm": 0.199612557888031, + "learning_rate": 1.7819396817022773e-05, + "loss": 0.0035, + "step": 7518 + }, + { + "epoch": 1.9837752275425404, + "grad_norm": 0.11776522547006607, + "learning_rate": 1.78182244497201e-05, + "loss": 0.0164, + "step": 7520 + }, + { + "epoch": 1.9843028624192058, + "grad_norm": 0.2308196872472763, + "learning_rate": 1.7817052082417422e-05, + "loss": 0.0072, + "step": 7522 + }, + { + "epoch": 1.9848304972958712, + "grad_norm": 0.010908435098826885, + "learning_rate": 1.7815879715114748e-05, + "loss": 0.0002, + "step": 7524 + }, + { + "epoch": 1.9853581321725366, + "grad_norm": 0.34048905968666077, + "learning_rate": 1.781470734781207e-05, + "loss": 0.0016, + "step": 7526 + }, + { + "epoch": 1.985885767049202, + "grad_norm": 0.14070934057235718, + "learning_rate": 1.7813534980509396e-05, + "loss": 0.0091, + "step": 7528 + }, + { + "epoch": 1.9864134019258675, + "grad_norm": 0.0369747020304203, + "learning_rate": 1.781236261320672e-05, + "loss": 0.0004, + "step": 7530 + }, + { + "epoch": 1.9869410368025326, + "grad_norm": 0.03394242748618126, + "learning_rate": 1.7811190245904045e-05, + "loss": 0.0071, + "step": 7532 + }, + { + "epoch": 1.9874686716791978, + "grad_norm": 0.07794075459241867, + "learning_rate": 1.7810017878601368e-05, + "loss": 0.0034, + "step": 7534 + }, + { + "epoch": 1.9879963065558632, + "grad_norm": 0.24210619926452637, + "learning_rate": 1.780884551129869e-05, + "loss": 0.0019, + "step": 7536 + }, + { + "epoch": 1.9885239414325286, + "grad_norm": 0.03404839709401131, + "learning_rate": 1.7807673143996016e-05, + "loss": 0.0004, + "step": 7538 + }, + { + "epoch": 1.989051576309194, + "grad_norm": 0.007631769869476557, + "learning_rate": 1.780650077669334e-05, + "loss": 0.0021, + "step": 7540 + }, + { + "epoch": 1.9895792111858595, + "grad_norm": 0.017602382227778435, + "learning_rate": 1.7805328409390665e-05, + "loss": 0.0004, + "step": 7542 + }, + { + "epoch": 1.9901068460625249, + "grad_norm": 0.11804769933223724, + "learning_rate": 1.7804156042087987e-05, + "loss": 0.0171, + "step": 7544 + }, + { + "epoch": 1.99063448093919, + "grad_norm": 0.0866435319185257, + "learning_rate": 1.7802983674785313e-05, + "loss": 0.0111, + "step": 7546 + }, + { + "epoch": 1.9911621158158554, + "grad_norm": 0.6962540149688721, + "learning_rate": 1.7801811307482636e-05, + "loss": 0.0035, + "step": 7548 + }, + { + "epoch": 1.9916897506925206, + "grad_norm": 0.010217134840786457, + "learning_rate": 1.780063894017996e-05, + "loss": 0.0004, + "step": 7550 + }, + { + "epoch": 1.992217385569186, + "grad_norm": 0.014760388061404228, + "learning_rate": 1.7799466572877285e-05, + "loss": 0.0004, + "step": 7552 + }, + { + "epoch": 1.9927450204458514, + "grad_norm": 0.1178998276591301, + "learning_rate": 1.7798294205574607e-05, + "loss": 0.0012, + "step": 7554 + }, + { + "epoch": 1.9932726553225169, + "grad_norm": 0.01017176266759634, + "learning_rate": 1.779712183827193e-05, + "loss": 0.0005, + "step": 7556 + }, + { + "epoch": 1.9938002901991823, + "grad_norm": 0.23902031779289246, + "learning_rate": 1.7795949470969256e-05, + "loss": 0.0093, + "step": 7558 + }, + { + "epoch": 1.9943279250758477, + "grad_norm": 0.04291283339262009, + "learning_rate": 1.7794777103666582e-05, + "loss": 0.0013, + "step": 7560 + }, + { + "epoch": 1.9948555599525128, + "grad_norm": 0.0077087851241230965, + "learning_rate": 1.7793604736363904e-05, + "loss": 0.0005, + "step": 7562 + }, + { + "epoch": 1.9953831948291783, + "grad_norm": 0.0032730933744460344, + "learning_rate": 1.7792432369061227e-05, + "loss": 0.0024, + "step": 7564 + }, + { + "epoch": 1.9959108297058434, + "grad_norm": 0.0037373185623437166, + "learning_rate": 1.7791260001758553e-05, + "loss": 0.002, + "step": 7566 + }, + { + "epoch": 1.9964384645825088, + "grad_norm": 0.00603551184758544, + "learning_rate": 1.7790087634455876e-05, + "loss": 0.0005, + "step": 7568 + }, + { + "epoch": 1.9969660994591742, + "grad_norm": 0.019521385431289673, + "learning_rate": 1.7788915267153198e-05, + "loss": 0.0129, + "step": 7570 + }, + { + "epoch": 1.9974937343358397, + "grad_norm": 0.0573885440826416, + "learning_rate": 1.7787742899850524e-05, + "loss": 0.0005, + "step": 7572 + }, + { + "epoch": 1.998021369212505, + "grad_norm": 0.9516845941543579, + "learning_rate": 1.778657053254785e-05, + "loss": 0.0055, + "step": 7574 + }, + { + "epoch": 1.9985490040891702, + "grad_norm": 0.039598383009433746, + "learning_rate": 1.7785398165245173e-05, + "loss": 0.001, + "step": 7576 + }, + { + "epoch": 1.9990766389658357, + "grad_norm": 0.5896232724189758, + "learning_rate": 1.7784225797942495e-05, + "loss": 0.0055, + "step": 7578 + }, + { + "epoch": 1.9996042738425008, + "grad_norm": 0.12815691530704498, + "learning_rate": 1.778305343063982e-05, + "loss": 0.0024, + "step": 7580 + }, + { + "epoch": 2.0, + "grad_norm": 0.05706343427300453, + "learning_rate": 1.7781881063337144e-05, + "loss": 0.0066, + "step": 7582 + }, + { + "epoch": 2.0005276348766654, + "grad_norm": 0.043731119483709335, + "learning_rate": 1.778070869603447e-05, + "loss": 0.0018, + "step": 7584 + }, + { + "epoch": 2.001055269753331, + "grad_norm": 0.11514376848936081, + "learning_rate": 1.7779536328731793e-05, + "loss": 0.001, + "step": 7586 + }, + { + "epoch": 2.001582904629996, + "grad_norm": 0.5194632411003113, + "learning_rate": 1.777836396142912e-05, + "loss": 0.0034, + "step": 7588 + }, + { + "epoch": 2.002110539506661, + "grad_norm": 0.09023495018482208, + "learning_rate": 1.777719159412644e-05, + "loss": 0.0008, + "step": 7590 + }, + { + "epoch": 2.0026381743833266, + "grad_norm": 0.007048253435641527, + "learning_rate": 1.7776019226823767e-05, + "loss": 0.0028, + "step": 7592 + }, + { + "epoch": 2.003165809259992, + "grad_norm": 0.03586477413773537, + "learning_rate": 1.777484685952109e-05, + "loss": 0.0054, + "step": 7594 + }, + { + "epoch": 2.0036934441366574, + "grad_norm": 0.005035530310124159, + "learning_rate": 1.7773674492218413e-05, + "loss": 0.0063, + "step": 7596 + }, + { + "epoch": 2.004221079013323, + "grad_norm": 0.025299936532974243, + "learning_rate": 1.777250212491574e-05, + "loss": 0.0003, + "step": 7598 + }, + { + "epoch": 2.004748713889988, + "grad_norm": 0.08914460241794586, + "learning_rate": 1.777132975761306e-05, + "loss": 0.0104, + "step": 7600 + }, + { + "epoch": 2.0052763487666536, + "grad_norm": 0.005756769794970751, + "learning_rate": 1.7770157390310384e-05, + "loss": 0.0003, + "step": 7602 + }, + { + "epoch": 2.005803983643319, + "grad_norm": 0.12721498310565948, + "learning_rate": 1.776898502300771e-05, + "loss": 0.0012, + "step": 7604 + }, + { + "epoch": 2.006331618519984, + "grad_norm": 0.0061981347389519215, + "learning_rate": 1.7767812655705036e-05, + "loss": 0.0003, + "step": 7606 + }, + { + "epoch": 2.0068592533966494, + "grad_norm": 0.006193746346980333, + "learning_rate": 1.776664028840236e-05, + "loss": 0.0002, + "step": 7608 + }, + { + "epoch": 2.007386888273315, + "grad_norm": 0.013050589710474014, + "learning_rate": 1.776546792109968e-05, + "loss": 0.0005, + "step": 7610 + }, + { + "epoch": 2.00791452314998, + "grad_norm": 0.411369264125824, + "learning_rate": 1.7764295553797007e-05, + "loss": 0.0021, + "step": 7612 + }, + { + "epoch": 2.0084421580266456, + "grad_norm": 0.05480822175741196, + "learning_rate": 1.776312318649433e-05, + "loss": 0.0007, + "step": 7614 + }, + { + "epoch": 2.008969792903311, + "grad_norm": 0.03680651634931564, + "learning_rate": 1.7761950819191652e-05, + "loss": 0.0019, + "step": 7616 + }, + { + "epoch": 2.0094974277799764, + "grad_norm": 0.007035048678517342, + "learning_rate": 1.7760778451888978e-05, + "loss": 0.0003, + "step": 7618 + }, + { + "epoch": 2.0100250626566414, + "grad_norm": 0.004490306135267019, + "learning_rate": 1.7759606084586304e-05, + "loss": 0.0026, + "step": 7620 + }, + { + "epoch": 2.010552697533307, + "grad_norm": 0.004056817386299372, + "learning_rate": 1.7758433717283627e-05, + "loss": 0.0002, + "step": 7622 + }, + { + "epoch": 2.011080332409972, + "grad_norm": 0.24461956322193146, + "learning_rate": 1.775726134998095e-05, + "loss": 0.0079, + "step": 7624 + }, + { + "epoch": 2.0116079672866376, + "grad_norm": 0.011805124580860138, + "learning_rate": 1.7756088982678275e-05, + "loss": 0.0003, + "step": 7626 + }, + { + "epoch": 2.012135602163303, + "grad_norm": 0.21490290760993958, + "learning_rate": 1.7754916615375598e-05, + "loss": 0.0022, + "step": 7628 + }, + { + "epoch": 2.0126632370399684, + "grad_norm": 0.0118615897372365, + "learning_rate": 1.775374424807292e-05, + "loss": 0.0003, + "step": 7630 + }, + { + "epoch": 2.013190871916634, + "grad_norm": 0.007818209007382393, + "learning_rate": 1.7752571880770247e-05, + "loss": 0.0003, + "step": 7632 + }, + { + "epoch": 2.0137185067932992, + "grad_norm": 0.005070534069091082, + "learning_rate": 1.7751399513467573e-05, + "loss": 0.0003, + "step": 7634 + }, + { + "epoch": 2.014246141669964, + "grad_norm": 0.00483890762552619, + "learning_rate": 1.7750227146164895e-05, + "loss": 0.0002, + "step": 7636 + }, + { + "epoch": 2.0147737765466296, + "grad_norm": 0.2554224133491516, + "learning_rate": 1.7749054778862218e-05, + "loss": 0.0049, + "step": 7638 + }, + { + "epoch": 2.015301411423295, + "grad_norm": 0.03303961455821991, + "learning_rate": 1.7747882411559544e-05, + "loss": 0.0005, + "step": 7640 + }, + { + "epoch": 2.0158290462999604, + "grad_norm": 0.0032045929692685604, + "learning_rate": 1.7746710044256866e-05, + "loss": 0.0002, + "step": 7642 + }, + { + "epoch": 2.016356681176626, + "grad_norm": 0.005471334792673588, + "learning_rate": 1.7745537676954192e-05, + "loss": 0.0045, + "step": 7644 + }, + { + "epoch": 2.016884316053291, + "grad_norm": 0.2835249602794647, + "learning_rate": 1.7744365309651515e-05, + "loss": 0.0024, + "step": 7646 + }, + { + "epoch": 2.0174119509299566, + "grad_norm": 0.21788336336612701, + "learning_rate": 1.7743192942348838e-05, + "loss": 0.002, + "step": 7648 + }, + { + "epoch": 2.017939585806622, + "grad_norm": 0.07795649021863937, + "learning_rate": 1.7742020575046164e-05, + "loss": 0.0085, + "step": 7650 + }, + { + "epoch": 2.018467220683287, + "grad_norm": 0.3349325358867645, + "learning_rate": 1.774084820774349e-05, + "loss": 0.0042, + "step": 7652 + }, + { + "epoch": 2.0189948555599524, + "grad_norm": 0.01976868510246277, + "learning_rate": 1.7739675840440812e-05, + "loss": 0.0051, + "step": 7654 + }, + { + "epoch": 2.019522490436618, + "grad_norm": 0.30020734667778015, + "learning_rate": 1.7738503473138135e-05, + "loss": 0.0057, + "step": 7656 + }, + { + "epoch": 2.020050125313283, + "grad_norm": 0.0024301132652908564, + "learning_rate": 1.773733110583546e-05, + "loss": 0.0002, + "step": 7658 + }, + { + "epoch": 2.0205777601899486, + "grad_norm": 0.004020344465970993, + "learning_rate": 1.7736158738532783e-05, + "loss": 0.0014, + "step": 7660 + }, + { + "epoch": 2.021105395066614, + "grad_norm": 0.003083598567172885, + "learning_rate": 1.7734986371230106e-05, + "loss": 0.0045, + "step": 7662 + }, + { + "epoch": 2.0216330299432794, + "grad_norm": 0.004122274462133646, + "learning_rate": 1.7733814003927432e-05, + "loss": 0.0002, + "step": 7664 + }, + { + "epoch": 2.0221606648199444, + "grad_norm": 0.0882505401968956, + "learning_rate": 1.7732641636624758e-05, + "loss": 0.0089, + "step": 7666 + }, + { + "epoch": 2.02268829969661, + "grad_norm": 0.053798794746398926, + "learning_rate": 1.773146926932208e-05, + "loss": 0.0005, + "step": 7668 + }, + { + "epoch": 2.023215934573275, + "grad_norm": 0.044346027076244354, + "learning_rate": 1.7730296902019403e-05, + "loss": 0.0004, + "step": 7670 + }, + { + "epoch": 2.0237435694499406, + "grad_norm": 0.01715412735939026, + "learning_rate": 1.772912453471673e-05, + "loss": 0.0003, + "step": 7672 + }, + { + "epoch": 2.024271204326606, + "grad_norm": 0.034891631454229355, + "learning_rate": 1.7727952167414052e-05, + "loss": 0.0007, + "step": 7674 + }, + { + "epoch": 2.0247988392032714, + "grad_norm": 0.0240153931081295, + "learning_rate": 1.7726779800111374e-05, + "loss": 0.0003, + "step": 7676 + }, + { + "epoch": 2.025326474079937, + "grad_norm": 0.004020186141133308, + "learning_rate": 1.77256074328087e-05, + "loss": 0.0007, + "step": 7678 + }, + { + "epoch": 2.0258541089566022, + "grad_norm": 0.021011963486671448, + "learning_rate": 1.7724435065506026e-05, + "loss": 0.0003, + "step": 7680 + }, + { + "epoch": 2.026381743833267, + "grad_norm": 0.1612364947795868, + "learning_rate": 1.772326269820335e-05, + "loss": 0.0055, + "step": 7682 + }, + { + "epoch": 2.0269093787099326, + "grad_norm": 0.28724542260169983, + "learning_rate": 1.7722090330900672e-05, + "loss": 0.0017, + "step": 7684 + }, + { + "epoch": 2.027437013586598, + "grad_norm": 0.03401792794466019, + "learning_rate": 1.7720917963597998e-05, + "loss": 0.0004, + "step": 7686 + }, + { + "epoch": 2.0279646484632634, + "grad_norm": 0.19915014505386353, + "learning_rate": 1.771974559629532e-05, + "loss": 0.0069, + "step": 7688 + }, + { + "epoch": 2.028492283339929, + "grad_norm": 0.19487136602401733, + "learning_rate": 1.7718573228992643e-05, + "loss": 0.0111, + "step": 7690 + }, + { + "epoch": 2.029019918216594, + "grad_norm": 0.0072587626054883, + "learning_rate": 1.771740086168997e-05, + "loss": 0.0101, + "step": 7692 + }, + { + "epoch": 2.0295475530932596, + "grad_norm": 0.016243930906057358, + "learning_rate": 1.771622849438729e-05, + "loss": 0.0005, + "step": 7694 + }, + { + "epoch": 2.030075187969925, + "grad_norm": 0.21269269287586212, + "learning_rate": 1.7715056127084618e-05, + "loss": 0.0043, + "step": 7696 + }, + { + "epoch": 2.03060282284659, + "grad_norm": 0.022471487522125244, + "learning_rate": 1.771388375978194e-05, + "loss": 0.0003, + "step": 7698 + }, + { + "epoch": 2.0311304577232554, + "grad_norm": 0.03991202265024185, + "learning_rate": 1.7712711392479266e-05, + "loss": 0.0011, + "step": 7700 + }, + { + "epoch": 2.031658092599921, + "grad_norm": 0.015903398394584656, + "learning_rate": 1.771153902517659e-05, + "loss": 0.0004, + "step": 7702 + }, + { + "epoch": 2.032185727476586, + "grad_norm": 0.13940119743347168, + "learning_rate": 1.7710366657873915e-05, + "loss": 0.0036, + "step": 7704 + }, + { + "epoch": 2.0327133623532516, + "grad_norm": 0.010681858286261559, + "learning_rate": 1.7709194290571237e-05, + "loss": 0.0036, + "step": 7706 + }, + { + "epoch": 2.033240997229917, + "grad_norm": 0.007025548722594976, + "learning_rate": 1.770802192326856e-05, + "loss": 0.0003, + "step": 7708 + }, + { + "epoch": 2.0337686321065824, + "grad_norm": 0.011629078537225723, + "learning_rate": 1.7706849555965886e-05, + "loss": 0.007, + "step": 7710 + }, + { + "epoch": 2.0342962669832474, + "grad_norm": 0.03311237692832947, + "learning_rate": 1.7705677188663212e-05, + "loss": 0.0006, + "step": 7712 + }, + { + "epoch": 2.034823901859913, + "grad_norm": 0.01446925476193428, + "learning_rate": 1.7704504821360535e-05, + "loss": 0.0071, + "step": 7714 + }, + { + "epoch": 2.035351536736578, + "grad_norm": 0.030091704800724983, + "learning_rate": 1.7703332454057857e-05, + "loss": 0.0005, + "step": 7716 + }, + { + "epoch": 2.0358791716132436, + "grad_norm": 0.06559839844703674, + "learning_rate": 1.7702160086755183e-05, + "loss": 0.0007, + "step": 7718 + }, + { + "epoch": 2.036406806489909, + "grad_norm": 0.06730816513299942, + "learning_rate": 1.7700987719452506e-05, + "loss": 0.001, + "step": 7720 + }, + { + "epoch": 2.0369344413665744, + "grad_norm": 0.0941486656665802, + "learning_rate": 1.769981535214983e-05, + "loss": 0.0013, + "step": 7722 + }, + { + "epoch": 2.03746207624324, + "grad_norm": 0.06850658357143402, + "learning_rate": 1.7698642984847154e-05, + "loss": 0.0008, + "step": 7724 + }, + { + "epoch": 2.0379897111199052, + "grad_norm": 0.009615582413971424, + "learning_rate": 1.769747061754448e-05, + "loss": 0.0007, + "step": 7726 + }, + { + "epoch": 2.03851734599657, + "grad_norm": 0.007046441547572613, + "learning_rate": 1.76962982502418e-05, + "loss": 0.0008, + "step": 7728 + }, + { + "epoch": 2.0390449808732356, + "grad_norm": 0.01955445297062397, + "learning_rate": 1.7695125882939126e-05, + "loss": 0.0004, + "step": 7730 + }, + { + "epoch": 2.039572615749901, + "grad_norm": 0.006549195386469364, + "learning_rate": 1.769395351563645e-05, + "loss": 0.0002, + "step": 7732 + }, + { + "epoch": 2.0401002506265664, + "grad_norm": 0.003639067057520151, + "learning_rate": 1.7692781148333774e-05, + "loss": 0.0007, + "step": 7734 + }, + { + "epoch": 2.040627885503232, + "grad_norm": 0.03916838392615318, + "learning_rate": 1.7691608781031097e-05, + "loss": 0.0007, + "step": 7736 + }, + { + "epoch": 2.0411555203798972, + "grad_norm": 0.13672104477882385, + "learning_rate": 1.7690436413728423e-05, + "loss": 0.002, + "step": 7738 + }, + { + "epoch": 2.0416831552565626, + "grad_norm": 0.017854155972599983, + "learning_rate": 1.7689264046425745e-05, + "loss": 0.0003, + "step": 7740 + }, + { + "epoch": 2.0422107901332276, + "grad_norm": 0.0020753282587975264, + "learning_rate": 1.768809167912307e-05, + "loss": 0.0019, + "step": 7742 + }, + { + "epoch": 2.042738425009893, + "grad_norm": 0.0736602395772934, + "learning_rate": 1.7686919311820394e-05, + "loss": 0.0012, + "step": 7744 + }, + { + "epoch": 2.0432660598865584, + "grad_norm": 0.04504906386137009, + "learning_rate": 1.768574694451772e-05, + "loss": 0.0007, + "step": 7746 + }, + { + "epoch": 2.043793694763224, + "grad_norm": 0.2511119842529297, + "learning_rate": 1.7684574577215043e-05, + "loss": 0.0012, + "step": 7748 + }, + { + "epoch": 2.044321329639889, + "grad_norm": 0.05018877610564232, + "learning_rate": 1.7683402209912365e-05, + "loss": 0.0004, + "step": 7750 + }, + { + "epoch": 2.0448489645165546, + "grad_norm": 0.03576037287712097, + "learning_rate": 1.768222984260969e-05, + "loss": 0.0013, + "step": 7752 + }, + { + "epoch": 2.04537659939322, + "grad_norm": 0.0036457981914281845, + "learning_rate": 1.7681057475307014e-05, + "loss": 0.0002, + "step": 7754 + }, + { + "epoch": 2.0459042342698854, + "grad_norm": 0.05702027678489685, + "learning_rate": 1.767988510800434e-05, + "loss": 0.0004, + "step": 7756 + }, + { + "epoch": 2.0464318691465504, + "grad_norm": 0.0019249101169407368, + "learning_rate": 1.7678712740701662e-05, + "loss": 0.0002, + "step": 7758 + }, + { + "epoch": 2.046959504023216, + "grad_norm": 0.006688141264021397, + "learning_rate": 1.767754037339899e-05, + "loss": 0.0002, + "step": 7760 + }, + { + "epoch": 2.047487138899881, + "grad_norm": 0.0028182528913021088, + "learning_rate": 1.767636800609631e-05, + "loss": 0.0018, + "step": 7762 + }, + { + "epoch": 2.0480147737765466, + "grad_norm": 0.5644163489341736, + "learning_rate": 1.7675195638793637e-05, + "loss": 0.0084, + "step": 7764 + }, + { + "epoch": 2.048542408653212, + "grad_norm": 0.0019200280075892806, + "learning_rate": 1.767402327149096e-05, + "loss": 0.0002, + "step": 7766 + }, + { + "epoch": 2.0490700435298774, + "grad_norm": 0.0013772060628980398, + "learning_rate": 1.7672850904188282e-05, + "loss": 0.0034, + "step": 7768 + }, + { + "epoch": 2.049597678406543, + "grad_norm": 0.07661620527505875, + "learning_rate": 1.767167853688561e-05, + "loss": 0.0009, + "step": 7770 + }, + { + "epoch": 2.050125313283208, + "grad_norm": 0.00909795705229044, + "learning_rate": 1.7670506169582934e-05, + "loss": 0.0042, + "step": 7772 + }, + { + "epoch": 2.050652948159873, + "grad_norm": 0.016068752855062485, + "learning_rate": 1.7669333802280254e-05, + "loss": 0.0003, + "step": 7774 + }, + { + "epoch": 2.0511805830365386, + "grad_norm": 0.10216818749904633, + "learning_rate": 1.766816143497758e-05, + "loss": 0.0032, + "step": 7776 + }, + { + "epoch": 2.051708217913204, + "grad_norm": 0.5623202919960022, + "learning_rate": 1.7666989067674906e-05, + "loss": 0.0041, + "step": 7778 + }, + { + "epoch": 2.0522358527898694, + "grad_norm": 0.061000630259513855, + "learning_rate": 1.7665816700372228e-05, + "loss": 0.0003, + "step": 7780 + }, + { + "epoch": 2.052763487666535, + "grad_norm": 0.03016815520823002, + "learning_rate": 1.766464433306955e-05, + "loss": 0.0003, + "step": 7782 + }, + { + "epoch": 2.0532911225432002, + "grad_norm": 0.14960385859012604, + "learning_rate": 1.7663471965766877e-05, + "loss": 0.0038, + "step": 7784 + }, + { + "epoch": 2.0538187574198656, + "grad_norm": 0.2640515863895416, + "learning_rate": 1.76622995984642e-05, + "loss": 0.0014, + "step": 7786 + }, + { + "epoch": 2.0543463922965306, + "grad_norm": 0.009522337466478348, + "learning_rate": 1.7661127231161522e-05, + "loss": 0.0027, + "step": 7788 + }, + { + "epoch": 2.054874027173196, + "grad_norm": 0.0063414983451366425, + "learning_rate": 1.7659954863858848e-05, + "loss": 0.0003, + "step": 7790 + }, + { + "epoch": 2.0554016620498614, + "grad_norm": 0.008193151094019413, + "learning_rate": 1.7658782496556174e-05, + "loss": 0.0003, + "step": 7792 + }, + { + "epoch": 2.055929296926527, + "grad_norm": 0.01448145229369402, + "learning_rate": 1.7657610129253497e-05, + "loss": 0.0033, + "step": 7794 + }, + { + "epoch": 2.0564569318031922, + "grad_norm": 0.7886919379234314, + "learning_rate": 1.765643776195082e-05, + "loss": 0.0031, + "step": 7796 + }, + { + "epoch": 2.0569845666798576, + "grad_norm": 0.0026795912999659777, + "learning_rate": 1.7655265394648145e-05, + "loss": 0.0002, + "step": 7798 + }, + { + "epoch": 2.057512201556523, + "grad_norm": 0.0042033204808831215, + "learning_rate": 1.7654093027345468e-05, + "loss": 0.0002, + "step": 7800 + }, + { + "epoch": 2.0580398364331884, + "grad_norm": 0.014915832318365574, + "learning_rate": 1.7652920660042794e-05, + "loss": 0.0003, + "step": 7802 + }, + { + "epoch": 2.0585674713098534, + "grad_norm": 0.004030031152069569, + "learning_rate": 1.7651748292740116e-05, + "loss": 0.0005, + "step": 7804 + }, + { + "epoch": 2.059095106186519, + "grad_norm": 0.01238662376999855, + "learning_rate": 1.7650575925437442e-05, + "loss": 0.0005, + "step": 7806 + }, + { + "epoch": 2.059622741063184, + "grad_norm": 0.05862395092844963, + "learning_rate": 1.7649403558134765e-05, + "loss": 0.0004, + "step": 7808 + }, + { + "epoch": 2.0601503759398496, + "grad_norm": 0.0925808772444725, + "learning_rate": 1.764823119083209e-05, + "loss": 0.0005, + "step": 7810 + }, + { + "epoch": 2.060678010816515, + "grad_norm": 0.0033906088210642338, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.0004, + "step": 7812 + }, + { + "epoch": 2.0612056456931804, + "grad_norm": 0.443379282951355, + "learning_rate": 1.7645886456226736e-05, + "loss": 0.0095, + "step": 7814 + }, + { + "epoch": 2.061733280569846, + "grad_norm": 0.2470417618751526, + "learning_rate": 1.7644714088924062e-05, + "loss": 0.0041, + "step": 7816 + }, + { + "epoch": 2.062260915446511, + "grad_norm": 0.28251609206199646, + "learning_rate": 1.7643541721621385e-05, + "loss": 0.0028, + "step": 7818 + }, + { + "epoch": 2.062788550323176, + "grad_norm": 0.008858172222971916, + "learning_rate": 1.7642369354318707e-05, + "loss": 0.0022, + "step": 7820 + }, + { + "epoch": 2.0633161851998416, + "grad_norm": 0.016373438760638237, + "learning_rate": 1.7641196987016033e-05, + "loss": 0.0045, + "step": 7822 + }, + { + "epoch": 2.063843820076507, + "grad_norm": 0.17540456354618073, + "learning_rate": 1.764002461971336e-05, + "loss": 0.0052, + "step": 7824 + }, + { + "epoch": 2.0643714549531724, + "grad_norm": 0.08325407654047012, + "learning_rate": 1.7638852252410682e-05, + "loss": 0.0004, + "step": 7826 + }, + { + "epoch": 2.064899089829838, + "grad_norm": 0.0028074770234525204, + "learning_rate": 1.7637679885108005e-05, + "loss": 0.0004, + "step": 7828 + }, + { + "epoch": 2.0654267247065032, + "grad_norm": 0.1781688630580902, + "learning_rate": 1.763650751780533e-05, + "loss": 0.0059, + "step": 7830 + }, + { + "epoch": 2.0659543595831686, + "grad_norm": 0.010833687148988247, + "learning_rate": 1.7635335150502653e-05, + "loss": 0.0002, + "step": 7832 + }, + { + "epoch": 2.0664819944598336, + "grad_norm": 0.0036682861391454935, + "learning_rate": 1.7634162783199976e-05, + "loss": 0.0022, + "step": 7834 + }, + { + "epoch": 2.067009629336499, + "grad_norm": 0.003406750736758113, + "learning_rate": 1.7632990415897302e-05, + "loss": 0.0052, + "step": 7836 + }, + { + "epoch": 2.0675372642131644, + "grad_norm": 0.046367038041353226, + "learning_rate": 1.7631818048594628e-05, + "loss": 0.0009, + "step": 7838 + }, + { + "epoch": 2.06806489908983, + "grad_norm": 0.002583463676273823, + "learning_rate": 1.763064568129195e-05, + "loss": 0.0002, + "step": 7840 + }, + { + "epoch": 2.0685925339664952, + "grad_norm": 0.004017992410808802, + "learning_rate": 1.7629473313989273e-05, + "loss": 0.0002, + "step": 7842 + }, + { + "epoch": 2.0691201688431606, + "grad_norm": 0.011463125236332417, + "learning_rate": 1.76283009466866e-05, + "loss": 0.0004, + "step": 7844 + }, + { + "epoch": 2.069647803719826, + "grad_norm": 0.022555721923708916, + "learning_rate": 1.762712857938392e-05, + "loss": 0.0003, + "step": 7846 + }, + { + "epoch": 2.0701754385964914, + "grad_norm": 0.008080684579908848, + "learning_rate": 1.7625956212081244e-05, + "loss": 0.001, + "step": 7848 + }, + { + "epoch": 2.0707030734731564, + "grad_norm": 0.8271945118904114, + "learning_rate": 1.762478384477857e-05, + "loss": 0.0082, + "step": 7850 + }, + { + "epoch": 2.071230708349822, + "grad_norm": 0.029022028669714928, + "learning_rate": 1.7623611477475896e-05, + "loss": 0.0006, + "step": 7852 + }, + { + "epoch": 2.071758343226487, + "grad_norm": 0.013419866561889648, + "learning_rate": 1.762243911017322e-05, + "loss": 0.0006, + "step": 7854 + }, + { + "epoch": 2.0722859781031526, + "grad_norm": 0.1846867799758911, + "learning_rate": 1.762126674287054e-05, + "loss": 0.0024, + "step": 7856 + }, + { + "epoch": 2.072813612979818, + "grad_norm": 0.020550699904561043, + "learning_rate": 1.7620094375567868e-05, + "loss": 0.0006, + "step": 7858 + }, + { + "epoch": 2.0733412478564834, + "grad_norm": 0.03675619140267372, + "learning_rate": 1.761892200826519e-05, + "loss": 0.0004, + "step": 7860 + }, + { + "epoch": 2.073868882733149, + "grad_norm": 0.01754334568977356, + "learning_rate": 1.7617749640962516e-05, + "loss": 0.0003, + "step": 7862 + }, + { + "epoch": 2.074396517609814, + "grad_norm": 0.02041751891374588, + "learning_rate": 1.761657727365984e-05, + "loss": 0.0003, + "step": 7864 + }, + { + "epoch": 2.074924152486479, + "grad_norm": 0.6638325452804565, + "learning_rate": 1.7615404906357165e-05, + "loss": 0.002, + "step": 7866 + }, + { + "epoch": 2.0754517873631446, + "grad_norm": 0.021029699593782425, + "learning_rate": 1.7614232539054487e-05, + "loss": 0.0009, + "step": 7868 + }, + { + "epoch": 2.07597942223981, + "grad_norm": 0.024617433547973633, + "learning_rate": 1.7613060171751813e-05, + "loss": 0.0004, + "step": 7870 + }, + { + "epoch": 2.0765070571164754, + "grad_norm": 0.05318721756339073, + "learning_rate": 1.7611887804449136e-05, + "loss": 0.0005, + "step": 7872 + }, + { + "epoch": 2.077034691993141, + "grad_norm": 0.00469506299123168, + "learning_rate": 1.761071543714646e-05, + "loss": 0.0013, + "step": 7874 + }, + { + "epoch": 2.0775623268698062, + "grad_norm": 0.3034417927265167, + "learning_rate": 1.7609543069843785e-05, + "loss": 0.0016, + "step": 7876 + }, + { + "epoch": 2.0780899617464716, + "grad_norm": 0.050685375928878784, + "learning_rate": 1.7608370702541107e-05, + "loss": 0.0004, + "step": 7878 + }, + { + "epoch": 2.0786175966231366, + "grad_norm": 0.011322707869112492, + "learning_rate": 1.760719833523843e-05, + "loss": 0.0002, + "step": 7880 + }, + { + "epoch": 2.079145231499802, + "grad_norm": 0.053103409707546234, + "learning_rate": 1.7606025967935756e-05, + "loss": 0.0004, + "step": 7882 + }, + { + "epoch": 2.0796728663764674, + "grad_norm": 0.11356016248464584, + "learning_rate": 1.7604853600633082e-05, + "loss": 0.0006, + "step": 7884 + }, + { + "epoch": 2.080200501253133, + "grad_norm": 0.11479809880256653, + "learning_rate": 1.7603681233330404e-05, + "loss": 0.0009, + "step": 7886 + }, + { + "epoch": 2.0807281361297982, + "grad_norm": 0.004316137172281742, + "learning_rate": 1.7602508866027727e-05, + "loss": 0.0002, + "step": 7888 + }, + { + "epoch": 2.0812557710064636, + "grad_norm": 0.0025221980176866055, + "learning_rate": 1.7601336498725053e-05, + "loss": 0.0003, + "step": 7890 + }, + { + "epoch": 2.081783405883129, + "grad_norm": 0.00520820077508688, + "learning_rate": 1.7600164131422376e-05, + "loss": 0.0002, + "step": 7892 + }, + { + "epoch": 2.082311040759794, + "grad_norm": 0.1634865701198578, + "learning_rate": 1.7598991764119698e-05, + "loss": 0.0006, + "step": 7894 + }, + { + "epoch": 2.0828386756364594, + "grad_norm": 0.003605786245316267, + "learning_rate": 1.7597819396817024e-05, + "loss": 0.0001, + "step": 7896 + }, + { + "epoch": 2.083366310513125, + "grad_norm": 0.00318514765240252, + "learning_rate": 1.759664702951435e-05, + "loss": 0.0105, + "step": 7898 + }, + { + "epoch": 2.0838939453897902, + "grad_norm": 0.012045840732753277, + "learning_rate": 1.7595474662211673e-05, + "loss": 0.0002, + "step": 7900 + }, + { + "epoch": 2.0844215802664556, + "grad_norm": 0.001852103159762919, + "learning_rate": 1.7594302294908995e-05, + "loss": 0.0055, + "step": 7902 + }, + { + "epoch": 2.084949215143121, + "grad_norm": 0.0024279814679175615, + "learning_rate": 1.759312992760632e-05, + "loss": 0.0002, + "step": 7904 + }, + { + "epoch": 2.0854768500197864, + "grad_norm": 0.005558750592172146, + "learning_rate": 1.7591957560303644e-05, + "loss": 0.0002, + "step": 7906 + }, + { + "epoch": 2.086004484896452, + "grad_norm": 0.26326513290405273, + "learning_rate": 1.7590785193000967e-05, + "loss": 0.0087, + "step": 7908 + }, + { + "epoch": 2.086532119773117, + "grad_norm": 0.02309824340045452, + "learning_rate": 1.7589612825698293e-05, + "loss": 0.0003, + "step": 7910 + }, + { + "epoch": 2.087059754649782, + "grad_norm": 0.1961548626422882, + "learning_rate": 1.758844045839562e-05, + "loss": 0.0008, + "step": 7912 + }, + { + "epoch": 2.0875873895264476, + "grad_norm": 0.0035171043127775192, + "learning_rate": 1.758726809109294e-05, + "loss": 0.0002, + "step": 7914 + }, + { + "epoch": 2.088115024403113, + "grad_norm": 0.01069781556725502, + "learning_rate": 1.7586095723790264e-05, + "loss": 0.0102, + "step": 7916 + }, + { + "epoch": 2.0886426592797784, + "grad_norm": 0.005428905598819256, + "learning_rate": 1.758492335648759e-05, + "loss": 0.0005, + "step": 7918 + }, + { + "epoch": 2.089170294156444, + "grad_norm": 0.016410453245043755, + "learning_rate": 1.7583750989184912e-05, + "loss": 0.0003, + "step": 7920 + }, + { + "epoch": 2.0896979290331092, + "grad_norm": 0.07373177260160446, + "learning_rate": 1.758257862188224e-05, + "loss": 0.0005, + "step": 7922 + }, + { + "epoch": 2.090225563909774, + "grad_norm": 0.20725752413272858, + "learning_rate": 1.758140625457956e-05, + "loss": 0.009, + "step": 7924 + }, + { + "epoch": 2.0907531987864396, + "grad_norm": 0.010864976793527603, + "learning_rate": 1.7580233887276884e-05, + "loss": 0.0004, + "step": 7926 + }, + { + "epoch": 2.091280833663105, + "grad_norm": 0.09038619697093964, + "learning_rate": 1.757906151997421e-05, + "loss": 0.0006, + "step": 7928 + }, + { + "epoch": 2.0918084685397704, + "grad_norm": 0.07868112623691559, + "learning_rate": 1.7577889152671536e-05, + "loss": 0.0003, + "step": 7930 + }, + { + "epoch": 2.092336103416436, + "grad_norm": 0.036038875579833984, + "learning_rate": 1.7576716785368858e-05, + "loss": 0.0004, + "step": 7932 + }, + { + "epoch": 2.0928637382931012, + "grad_norm": 0.005858131684362888, + "learning_rate": 1.757554441806618e-05, + "loss": 0.0002, + "step": 7934 + }, + { + "epoch": 2.0933913731697666, + "grad_norm": 0.006856112740933895, + "learning_rate": 1.7574372050763507e-05, + "loss": 0.0005, + "step": 7936 + }, + { + "epoch": 2.093919008046432, + "grad_norm": 0.006625899579375982, + "learning_rate": 1.757319968346083e-05, + "loss": 0.0067, + "step": 7938 + }, + { + "epoch": 2.094446642923097, + "grad_norm": 0.009338034316897392, + "learning_rate": 1.7572027316158152e-05, + "loss": 0.0003, + "step": 7940 + }, + { + "epoch": 2.0949742777997624, + "grad_norm": 0.0037717437371611595, + "learning_rate": 1.7570854948855478e-05, + "loss": 0.0014, + "step": 7942 + }, + { + "epoch": 2.095501912676428, + "grad_norm": 0.0021403534337878227, + "learning_rate": 1.7569682581552804e-05, + "loss": 0.0002, + "step": 7944 + }, + { + "epoch": 2.0960295475530932, + "grad_norm": 0.006298239808529615, + "learning_rate": 1.7568510214250127e-05, + "loss": 0.0004, + "step": 7946 + }, + { + "epoch": 2.0965571824297586, + "grad_norm": 0.0032639680430293083, + "learning_rate": 1.756733784694745e-05, + "loss": 0.0002, + "step": 7948 + }, + { + "epoch": 2.097084817306424, + "grad_norm": 0.0039595020934939384, + "learning_rate": 1.7566165479644775e-05, + "loss": 0.0002, + "step": 7950 + }, + { + "epoch": 2.0976124521830894, + "grad_norm": 0.24248412251472473, + "learning_rate": 1.7564993112342098e-05, + "loss": 0.0031, + "step": 7952 + }, + { + "epoch": 2.098140087059755, + "grad_norm": 0.1162724420428276, + "learning_rate": 1.756382074503942e-05, + "loss": 0.0023, + "step": 7954 + }, + { + "epoch": 2.09866772193642, + "grad_norm": 0.02425733208656311, + "learning_rate": 1.7562648377736747e-05, + "loss": 0.0005, + "step": 7956 + }, + { + "epoch": 2.0991953568130852, + "grad_norm": 0.0031621085945516825, + "learning_rate": 1.7561476010434073e-05, + "loss": 0.0002, + "step": 7958 + }, + { + "epoch": 2.0997229916897506, + "grad_norm": 0.005361306946724653, + "learning_rate": 1.7560303643131395e-05, + "loss": 0.0002, + "step": 7960 + }, + { + "epoch": 2.100250626566416, + "grad_norm": 0.03744201734662056, + "learning_rate": 1.7559131275828718e-05, + "loss": 0.0026, + "step": 7962 + }, + { + "epoch": 2.1007782614430814, + "grad_norm": 0.003668795106932521, + "learning_rate": 1.7557958908526044e-05, + "loss": 0.0002, + "step": 7964 + }, + { + "epoch": 2.101305896319747, + "grad_norm": 0.002393938135355711, + "learning_rate": 1.7556786541223366e-05, + "loss": 0.0004, + "step": 7966 + }, + { + "epoch": 2.1018335311964123, + "grad_norm": 0.0043885428458452225, + "learning_rate": 1.755561417392069e-05, + "loss": 0.0002, + "step": 7968 + }, + { + "epoch": 2.102361166073077, + "grad_norm": 0.0029720987658947706, + "learning_rate": 1.7554441806618015e-05, + "loss": 0.0001, + "step": 7970 + }, + { + "epoch": 2.1028888009497426, + "grad_norm": 0.14011943340301514, + "learning_rate": 1.7553269439315338e-05, + "loss": 0.0029, + "step": 7972 + }, + { + "epoch": 2.103416435826408, + "grad_norm": 0.007375968620181084, + "learning_rate": 1.7552097072012664e-05, + "loss": 0.0001, + "step": 7974 + }, + { + "epoch": 2.1039440707030734, + "grad_norm": 0.0028152219019830227, + "learning_rate": 1.7550924704709986e-05, + "loss": 0.0002, + "step": 7976 + }, + { + "epoch": 2.104471705579739, + "grad_norm": 0.0020251008681952953, + "learning_rate": 1.7549752337407312e-05, + "loss": 0.0004, + "step": 7978 + }, + { + "epoch": 2.1049993404564042, + "grad_norm": 0.005052583757787943, + "learning_rate": 1.7548579970104635e-05, + "loss": 0.0003, + "step": 7980 + }, + { + "epoch": 2.1055269753330697, + "grad_norm": 0.006904369220137596, + "learning_rate": 1.754740760280196e-05, + "loss": 0.0002, + "step": 7982 + }, + { + "epoch": 2.106054610209735, + "grad_norm": 0.03323788568377495, + "learning_rate": 1.7546235235499283e-05, + "loss": 0.0003, + "step": 7984 + }, + { + "epoch": 2.1065822450864, + "grad_norm": 0.0027055596001446247, + "learning_rate": 1.7545062868196606e-05, + "loss": 0.0018, + "step": 7986 + }, + { + "epoch": 2.1071098799630654, + "grad_norm": 0.10057362169027328, + "learning_rate": 1.7543890500893932e-05, + "loss": 0.0019, + "step": 7988 + }, + { + "epoch": 2.107637514839731, + "grad_norm": 0.020400555804371834, + "learning_rate": 1.7542718133591258e-05, + "loss": 0.0003, + "step": 7990 + }, + { + "epoch": 2.1081651497163962, + "grad_norm": 0.002448425395414233, + "learning_rate": 1.754154576628858e-05, + "loss": 0.0001, + "step": 7992 + }, + { + "epoch": 2.1086927845930616, + "grad_norm": 0.1274842917919159, + "learning_rate": 1.7540373398985903e-05, + "loss": 0.0101, + "step": 7994 + }, + { + "epoch": 2.109220419469727, + "grad_norm": 0.005240305792540312, + "learning_rate": 1.753920103168323e-05, + "loss": 0.0002, + "step": 7996 + }, + { + "epoch": 2.1097480543463925, + "grad_norm": 0.002792726969346404, + "learning_rate": 1.7538028664380552e-05, + "loss": 0.0002, + "step": 7998 + }, + { + "epoch": 2.110275689223058, + "grad_norm": 0.026782171800732613, + "learning_rate": 1.7536856297077874e-05, + "loss": 0.008, + "step": 8000 + }, + { + "epoch": 2.110275689223058, + "eval_loss": 0.0026942025870084763, + "eval_runtime": 310.1003, + "eval_samples_per_second": 695.391, + "eval_steps_per_second": 86.927, + "step": 8000 + }, + { + "epoch": 2.110803324099723, + "grad_norm": 0.006495743524283171, + "learning_rate": 1.75356839297752e-05, + "loss": 0.0002, + "step": 8002 + }, + { + "epoch": 2.1113309589763882, + "grad_norm": 0.007627195678651333, + "learning_rate": 1.7534511562472526e-05, + "loss": 0.0021, + "step": 8004 + }, + { + "epoch": 2.1118585938530536, + "grad_norm": 0.009033086709678173, + "learning_rate": 1.7533339195169846e-05, + "loss": 0.0005, + "step": 8006 + }, + { + "epoch": 2.112386228729719, + "grad_norm": 0.007790076080709696, + "learning_rate": 1.753216682786717e-05, + "loss": 0.0002, + "step": 8008 + }, + { + "epoch": 2.1129138636063844, + "grad_norm": 0.010753835551440716, + "learning_rate": 1.7530994460564498e-05, + "loss": 0.0003, + "step": 8010 + }, + { + "epoch": 2.11344149848305, + "grad_norm": 0.02424408309161663, + "learning_rate": 1.752982209326182e-05, + "loss": 0.0003, + "step": 8012 + }, + { + "epoch": 2.1139691333597153, + "grad_norm": 0.8324753046035767, + "learning_rate": 1.7528649725959143e-05, + "loss": 0.0031, + "step": 8014 + }, + { + "epoch": 2.11449676823638, + "grad_norm": 0.008838120847940445, + "learning_rate": 1.752747735865647e-05, + "loss": 0.0046, + "step": 8016 + }, + { + "epoch": 2.1150244031130456, + "grad_norm": 0.027697058394551277, + "learning_rate": 1.752630499135379e-05, + "loss": 0.0003, + "step": 8018 + }, + { + "epoch": 2.115552037989711, + "grad_norm": 0.18574540317058563, + "learning_rate": 1.7525132624051117e-05, + "loss": 0.0018, + "step": 8020 + }, + { + "epoch": 2.1160796728663764, + "grad_norm": 0.14674511551856995, + "learning_rate": 1.752396025674844e-05, + "loss": 0.0057, + "step": 8022 + }, + { + "epoch": 2.116607307743042, + "grad_norm": 1.763789176940918, + "learning_rate": 1.7522787889445766e-05, + "loss": 0.0081, + "step": 8024 + }, + { + "epoch": 2.1171349426197072, + "grad_norm": 0.00630482891574502, + "learning_rate": 1.752161552214309e-05, + "loss": 0.0073, + "step": 8026 + }, + { + "epoch": 2.1176625774963727, + "grad_norm": 0.11266094446182251, + "learning_rate": 1.752044315484041e-05, + "loss": 0.0008, + "step": 8028 + }, + { + "epoch": 2.118190212373038, + "grad_norm": 0.0050177546218037605, + "learning_rate": 1.7519270787537737e-05, + "loss": 0.0008, + "step": 8030 + }, + { + "epoch": 2.118717847249703, + "grad_norm": 0.019556552171707153, + "learning_rate": 1.751809842023506e-05, + "loss": 0.0003, + "step": 8032 + }, + { + "epoch": 2.1192454821263684, + "grad_norm": 0.2021467536687851, + "learning_rate": 1.7516926052932386e-05, + "loss": 0.0083, + "step": 8034 + }, + { + "epoch": 2.119773117003034, + "grad_norm": 0.010783171281218529, + "learning_rate": 1.751575368562971e-05, + "loss": 0.0003, + "step": 8036 + }, + { + "epoch": 2.1203007518796992, + "grad_norm": 0.1189090684056282, + "learning_rate": 1.7514581318327035e-05, + "loss": 0.0034, + "step": 8038 + }, + { + "epoch": 2.1208283867563646, + "grad_norm": 0.38898012042045593, + "learning_rate": 1.7513408951024357e-05, + "loss": 0.0101, + "step": 8040 + }, + { + "epoch": 2.12135602163303, + "grad_norm": 0.06966198980808258, + "learning_rate": 1.7512236583721683e-05, + "loss": 0.0012, + "step": 8042 + }, + { + "epoch": 2.1218836565096955, + "grad_norm": 0.08999565243721008, + "learning_rate": 1.7511064216419006e-05, + "loss": 0.0015, + "step": 8044 + }, + { + "epoch": 2.122411291386361, + "grad_norm": 0.18894805014133453, + "learning_rate": 1.750989184911633e-05, + "loss": 0.0015, + "step": 8046 + }, + { + "epoch": 2.122938926263026, + "grad_norm": 0.009925237856805325, + "learning_rate": 1.7508719481813654e-05, + "loss": 0.0003, + "step": 8048 + }, + { + "epoch": 2.1234665611396912, + "grad_norm": 0.16032087802886963, + "learning_rate": 1.750754711451098e-05, + "loss": 0.0014, + "step": 8050 + }, + { + "epoch": 2.1239941960163566, + "grad_norm": 0.026223696768283844, + "learning_rate": 1.75063747472083e-05, + "loss": 0.0045, + "step": 8052 + }, + { + "epoch": 2.124521830893022, + "grad_norm": 0.04483681544661522, + "learning_rate": 1.7505202379905626e-05, + "loss": 0.0023, + "step": 8054 + }, + { + "epoch": 2.1250494657696875, + "grad_norm": 0.006728039123117924, + "learning_rate": 1.750403001260295e-05, + "loss": 0.0003, + "step": 8056 + }, + { + "epoch": 2.125577100646353, + "grad_norm": 0.004715118557214737, + "learning_rate": 1.7502857645300274e-05, + "loss": 0.0003, + "step": 8058 + }, + { + "epoch": 2.1261047355230183, + "grad_norm": 0.031040411442518234, + "learning_rate": 1.7501685277997597e-05, + "loss": 0.0005, + "step": 8060 + }, + { + "epoch": 2.1266323703996832, + "grad_norm": 0.022905459627509117, + "learning_rate": 1.7500512910694923e-05, + "loss": 0.0021, + "step": 8062 + }, + { + "epoch": 2.1271600052763486, + "grad_norm": 0.06749645620584488, + "learning_rate": 1.7499340543392245e-05, + "loss": 0.0011, + "step": 8064 + }, + { + "epoch": 2.127687640153014, + "grad_norm": 0.16656595468521118, + "learning_rate": 1.7498168176089568e-05, + "loss": 0.0069, + "step": 8066 + }, + { + "epoch": 2.1282152750296794, + "grad_norm": 0.004540395922958851, + "learning_rate": 1.7496995808786894e-05, + "loss": 0.0025, + "step": 8068 + }, + { + "epoch": 2.128742909906345, + "grad_norm": 0.32752326130867004, + "learning_rate": 1.749582344148422e-05, + "loss": 0.0019, + "step": 8070 + }, + { + "epoch": 2.1292705447830103, + "grad_norm": 0.0027726581320166588, + "learning_rate": 1.7494651074181543e-05, + "loss": 0.0002, + "step": 8072 + }, + { + "epoch": 2.1297981796596757, + "grad_norm": 0.0061575183644890785, + "learning_rate": 1.7493478706878865e-05, + "loss": 0.0042, + "step": 8074 + }, + { + "epoch": 2.1303258145363406, + "grad_norm": 0.01863344945013523, + "learning_rate": 1.749230633957619e-05, + "loss": 0.0003, + "step": 8076 + }, + { + "epoch": 2.130853449413006, + "grad_norm": 0.0076566049829125404, + "learning_rate": 1.7491133972273514e-05, + "loss": 0.0076, + "step": 8078 + }, + { + "epoch": 2.1313810842896714, + "grad_norm": 0.009572909213602543, + "learning_rate": 1.748996160497084e-05, + "loss": 0.0057, + "step": 8080 + }, + { + "epoch": 2.131908719166337, + "grad_norm": 0.05516059696674347, + "learning_rate": 1.7488789237668162e-05, + "loss": 0.0004, + "step": 8082 + }, + { + "epoch": 2.1324363540430022, + "grad_norm": 0.00599326565861702, + "learning_rate": 1.748761687036549e-05, + "loss": 0.0002, + "step": 8084 + }, + { + "epoch": 2.1329639889196677, + "grad_norm": 0.013658788986504078, + "learning_rate": 1.748644450306281e-05, + "loss": 0.0003, + "step": 8086 + }, + { + "epoch": 2.133491623796333, + "grad_norm": 0.15239398181438446, + "learning_rate": 1.7485272135760134e-05, + "loss": 0.0012, + "step": 8088 + }, + { + "epoch": 2.1340192586729985, + "grad_norm": 0.0119075458496809, + "learning_rate": 1.748409976845746e-05, + "loss": 0.0002, + "step": 8090 + }, + { + "epoch": 2.1345468935496634, + "grad_norm": 0.4520379602909088, + "learning_rate": 1.7482927401154782e-05, + "loss": 0.0059, + "step": 8092 + }, + { + "epoch": 2.135074528426329, + "grad_norm": 0.006186032202094793, + "learning_rate": 1.7481755033852108e-05, + "loss": 0.0003, + "step": 8094 + }, + { + "epoch": 2.1356021633029942, + "grad_norm": 0.12649086117744446, + "learning_rate": 1.748058266654943e-05, + "loss": 0.0107, + "step": 8096 + }, + { + "epoch": 2.1361297981796596, + "grad_norm": 0.012948540970683098, + "learning_rate": 1.7479410299246753e-05, + "loss": 0.0049, + "step": 8098 + }, + { + "epoch": 2.136657433056325, + "grad_norm": 0.0020479147788137197, + "learning_rate": 1.747823793194408e-05, + "loss": 0.0002, + "step": 8100 + }, + { + "epoch": 2.1371850679329905, + "grad_norm": 0.016383633017539978, + "learning_rate": 1.7477065564641405e-05, + "loss": 0.0002, + "step": 8102 + }, + { + "epoch": 2.137712702809656, + "grad_norm": 0.0048639411106705666, + "learning_rate": 1.7475893197338728e-05, + "loss": 0.0066, + "step": 8104 + }, + { + "epoch": 2.1382403376863213, + "grad_norm": 0.020244812592864037, + "learning_rate": 1.747472083003605e-05, + "loss": 0.0003, + "step": 8106 + }, + { + "epoch": 2.1387679725629862, + "grad_norm": 0.0027151682879775763, + "learning_rate": 1.7473548462733377e-05, + "loss": 0.0014, + "step": 8108 + }, + { + "epoch": 2.1392956074396516, + "grad_norm": 0.005781772546470165, + "learning_rate": 1.74723760954307e-05, + "loss": 0.0002, + "step": 8110 + }, + { + "epoch": 2.139823242316317, + "grad_norm": 0.003922653384506702, + "learning_rate": 1.7471203728128022e-05, + "loss": 0.0003, + "step": 8112 + }, + { + "epoch": 2.1403508771929824, + "grad_norm": 0.04155828431248665, + "learning_rate": 1.7470031360825348e-05, + "loss": 0.0004, + "step": 8114 + }, + { + "epoch": 2.140878512069648, + "grad_norm": 0.11534885317087173, + "learning_rate": 1.7468858993522674e-05, + "loss": 0.0085, + "step": 8116 + }, + { + "epoch": 2.1414061469463133, + "grad_norm": 0.006908387411385775, + "learning_rate": 1.7467686626219997e-05, + "loss": 0.0045, + "step": 8118 + }, + { + "epoch": 2.1419337818229787, + "grad_norm": 0.10908922553062439, + "learning_rate": 1.746651425891732e-05, + "loss": 0.0117, + "step": 8120 + }, + { + "epoch": 2.1424614166996436, + "grad_norm": 0.011304124258458614, + "learning_rate": 1.7465341891614645e-05, + "loss": 0.0003, + "step": 8122 + }, + { + "epoch": 2.142989051576309, + "grad_norm": 0.021724045276641846, + "learning_rate": 1.7464169524311968e-05, + "loss": 0.0004, + "step": 8124 + }, + { + "epoch": 2.1435166864529744, + "grad_norm": 0.008705053478479385, + "learning_rate": 1.746299715700929e-05, + "loss": 0.0004, + "step": 8126 + }, + { + "epoch": 2.14404432132964, + "grad_norm": 0.01916433311998844, + "learning_rate": 1.7461824789706616e-05, + "loss": 0.0003, + "step": 8128 + }, + { + "epoch": 2.1445719562063053, + "grad_norm": 0.010376309975981712, + "learning_rate": 1.7460652422403942e-05, + "loss": 0.0009, + "step": 8130 + }, + { + "epoch": 2.1450995910829707, + "grad_norm": 0.25621047616004944, + "learning_rate": 1.7459480055101265e-05, + "loss": 0.0019, + "step": 8132 + }, + { + "epoch": 2.145627225959636, + "grad_norm": 0.2986365258693695, + "learning_rate": 1.7458307687798588e-05, + "loss": 0.0078, + "step": 8134 + }, + { + "epoch": 2.1461548608363015, + "grad_norm": 0.3477712571620941, + "learning_rate": 1.7457135320495914e-05, + "loss": 0.003, + "step": 8136 + }, + { + "epoch": 2.1466824957129664, + "grad_norm": 0.0077282218262553215, + "learning_rate": 1.7455962953193236e-05, + "loss": 0.0004, + "step": 8138 + }, + { + "epoch": 2.147210130589632, + "grad_norm": 0.009662240743637085, + "learning_rate": 1.7454790585890562e-05, + "loss": 0.0002, + "step": 8140 + }, + { + "epoch": 2.1477377654662972, + "grad_norm": 0.022783193737268448, + "learning_rate": 1.7453618218587885e-05, + "loss": 0.0003, + "step": 8142 + }, + { + "epoch": 2.1482654003429626, + "grad_norm": 0.003432955127209425, + "learning_rate": 1.7452445851285207e-05, + "loss": 0.0002, + "step": 8144 + }, + { + "epoch": 2.148793035219628, + "grad_norm": 0.005727430805563927, + "learning_rate": 1.7451273483982533e-05, + "loss": 0.0015, + "step": 8146 + }, + { + "epoch": 2.1493206700962935, + "grad_norm": 0.012469165027141571, + "learning_rate": 1.7450101116679856e-05, + "loss": 0.0003, + "step": 8148 + }, + { + "epoch": 2.149848304972959, + "grad_norm": 0.004769704304635525, + "learning_rate": 1.7448928749377182e-05, + "loss": 0.0056, + "step": 8150 + }, + { + "epoch": 2.1503759398496243, + "grad_norm": 0.05980290099978447, + "learning_rate": 1.7447756382074505e-05, + "loss": 0.0011, + "step": 8152 + }, + { + "epoch": 2.1509035747262892, + "grad_norm": 0.0227174311876297, + "learning_rate": 1.744658401477183e-05, + "loss": 0.0105, + "step": 8154 + }, + { + "epoch": 2.1514312096029546, + "grad_norm": 0.23851288855075836, + "learning_rate": 1.7445411647469153e-05, + "loss": 0.002, + "step": 8156 + }, + { + "epoch": 2.15195884447962, + "grad_norm": 0.01845904067158699, + "learning_rate": 1.7444239280166476e-05, + "loss": 0.0004, + "step": 8158 + }, + { + "epoch": 2.1524864793562855, + "grad_norm": 0.07472468167543411, + "learning_rate": 1.7443066912863802e-05, + "loss": 0.0006, + "step": 8160 + }, + { + "epoch": 2.153014114232951, + "grad_norm": 0.016812128946185112, + "learning_rate": 1.7441894545561128e-05, + "loss": 0.0004, + "step": 8162 + }, + { + "epoch": 2.1535417491096163, + "grad_norm": 0.17052102088928223, + "learning_rate": 1.744072217825845e-05, + "loss": 0.0034, + "step": 8164 + }, + { + "epoch": 2.1540693839862817, + "grad_norm": 0.10267151147127151, + "learning_rate": 1.7439549810955773e-05, + "loss": 0.0029, + "step": 8166 + }, + { + "epoch": 2.1545970188629466, + "grad_norm": 0.02579970471560955, + "learning_rate": 1.74383774436531e-05, + "loss": 0.0004, + "step": 8168 + }, + { + "epoch": 2.155124653739612, + "grad_norm": 0.1430259346961975, + "learning_rate": 1.743720507635042e-05, + "loss": 0.0051, + "step": 8170 + }, + { + "epoch": 2.1556522886162774, + "grad_norm": 0.03872134909033775, + "learning_rate": 1.7436032709047744e-05, + "loss": 0.0008, + "step": 8172 + }, + { + "epoch": 2.156179923492943, + "grad_norm": 0.015177864581346512, + "learning_rate": 1.743486034174507e-05, + "loss": 0.0007, + "step": 8174 + }, + { + "epoch": 2.1567075583696083, + "grad_norm": 0.008194171823561192, + "learning_rate": 1.7433687974442396e-05, + "loss": 0.0003, + "step": 8176 + }, + { + "epoch": 2.1572351932462737, + "grad_norm": 0.022825896739959717, + "learning_rate": 1.743251560713972e-05, + "loss": 0.0004, + "step": 8178 + }, + { + "epoch": 2.157762828122939, + "grad_norm": 0.12943489849567413, + "learning_rate": 1.743134323983704e-05, + "loss": 0.0038, + "step": 8180 + }, + { + "epoch": 2.1582904629996045, + "grad_norm": 0.058081384748220444, + "learning_rate": 1.7430170872534367e-05, + "loss": 0.0029, + "step": 8182 + }, + { + "epoch": 2.1588180978762694, + "grad_norm": 0.1707729697227478, + "learning_rate": 1.742899850523169e-05, + "loss": 0.0036, + "step": 8184 + }, + { + "epoch": 2.159345732752935, + "grad_norm": 0.0027978706639260054, + "learning_rate": 1.7427826137929013e-05, + "loss": 0.0002, + "step": 8186 + }, + { + "epoch": 2.1598733676296002, + "grad_norm": 0.0018246106337755919, + "learning_rate": 1.742665377062634e-05, + "loss": 0.0002, + "step": 8188 + }, + { + "epoch": 2.1604010025062657, + "grad_norm": 0.003922178875654936, + "learning_rate": 1.7425481403323665e-05, + "loss": 0.0002, + "step": 8190 + }, + { + "epoch": 2.160928637382931, + "grad_norm": 0.04641156271100044, + "learning_rate": 1.7424309036020987e-05, + "loss": 0.0006, + "step": 8192 + }, + { + "epoch": 2.1614562722595965, + "grad_norm": 0.4345264434814453, + "learning_rate": 1.742313666871831e-05, + "loss": 0.0044, + "step": 8194 + }, + { + "epoch": 2.161983907136262, + "grad_norm": 0.020978538319468498, + "learning_rate": 1.7421964301415636e-05, + "loss": 0.0002, + "step": 8196 + }, + { + "epoch": 2.1625115420129273, + "grad_norm": 0.3702141046524048, + "learning_rate": 1.742079193411296e-05, + "loss": 0.0064, + "step": 8198 + }, + { + "epoch": 2.1630391768895922, + "grad_norm": 0.3116953670978546, + "learning_rate": 1.7419619566810285e-05, + "loss": 0.0016, + "step": 8200 + }, + { + "epoch": 2.1635668117662576, + "grad_norm": 0.06433634459972382, + "learning_rate": 1.7418447199507607e-05, + "loss": 0.0006, + "step": 8202 + }, + { + "epoch": 2.164094446642923, + "grad_norm": 0.612733781337738, + "learning_rate": 1.741727483220493e-05, + "loss": 0.0066, + "step": 8204 + }, + { + "epoch": 2.1646220815195885, + "grad_norm": 0.00356648419983685, + "learning_rate": 1.7416102464902256e-05, + "loss": 0.0002, + "step": 8206 + }, + { + "epoch": 2.165149716396254, + "grad_norm": 0.10204338282346725, + "learning_rate": 1.741493009759958e-05, + "loss": 0.0005, + "step": 8208 + }, + { + "epoch": 2.1656773512729193, + "grad_norm": 0.028781607747077942, + "learning_rate": 1.7413757730296904e-05, + "loss": 0.0002, + "step": 8210 + }, + { + "epoch": 2.1662049861495847, + "grad_norm": 0.33207499980926514, + "learning_rate": 1.7412585362994227e-05, + "loss": 0.0019, + "step": 8212 + }, + { + "epoch": 2.1667326210262496, + "grad_norm": 0.0029544734861701727, + "learning_rate": 1.7411412995691553e-05, + "loss": 0.0002, + "step": 8214 + }, + { + "epoch": 2.167260255902915, + "grad_norm": 0.009660816751420498, + "learning_rate": 1.7410240628388876e-05, + "loss": 0.0002, + "step": 8216 + }, + { + "epoch": 2.1677878907795805, + "grad_norm": 0.2923685908317566, + "learning_rate": 1.7409068261086198e-05, + "loss": 0.006, + "step": 8218 + }, + { + "epoch": 2.168315525656246, + "grad_norm": 0.022095728665590286, + "learning_rate": 1.7407895893783524e-05, + "loss": 0.0005, + "step": 8220 + }, + { + "epoch": 2.1688431605329113, + "grad_norm": 0.06331009417772293, + "learning_rate": 1.740672352648085e-05, + "loss": 0.0005, + "step": 8222 + }, + { + "epoch": 2.1693707954095767, + "grad_norm": 0.11904364079236984, + "learning_rate": 1.7405551159178173e-05, + "loss": 0.0036, + "step": 8224 + }, + { + "epoch": 2.169898430286242, + "grad_norm": 0.5409579873085022, + "learning_rate": 1.7404378791875495e-05, + "loss": 0.0013, + "step": 8226 + }, + { + "epoch": 2.170426065162907, + "grad_norm": 0.027531666681170464, + "learning_rate": 1.740320642457282e-05, + "loss": 0.0067, + "step": 8228 + }, + { + "epoch": 2.1709537000395724, + "grad_norm": 0.006353079807013273, + "learning_rate": 1.7402034057270144e-05, + "loss": 0.0005, + "step": 8230 + }, + { + "epoch": 2.171481334916238, + "grad_norm": 0.003316829912364483, + "learning_rate": 1.7400861689967467e-05, + "loss": 0.0002, + "step": 8232 + }, + { + "epoch": 2.1720089697929033, + "grad_norm": 0.03160043805837631, + "learning_rate": 1.7399689322664793e-05, + "loss": 0.0003, + "step": 8234 + }, + { + "epoch": 2.1725366046695687, + "grad_norm": 0.02704039216041565, + "learning_rate": 1.739851695536212e-05, + "loss": 0.0003, + "step": 8236 + }, + { + "epoch": 2.173064239546234, + "grad_norm": 0.005741668865084648, + "learning_rate": 1.7397344588059438e-05, + "loss": 0.0002, + "step": 8238 + }, + { + "epoch": 2.1735918744228995, + "grad_norm": 0.31272339820861816, + "learning_rate": 1.7396172220756764e-05, + "loss": 0.0011, + "step": 8240 + }, + { + "epoch": 2.174119509299565, + "grad_norm": 0.0031915735453367233, + "learning_rate": 1.739499985345409e-05, + "loss": 0.0002, + "step": 8242 + }, + { + "epoch": 2.1746471441762303, + "grad_norm": 0.08107956498861313, + "learning_rate": 1.7393827486151412e-05, + "loss": 0.0015, + "step": 8244 + }, + { + "epoch": 2.1751747790528952, + "grad_norm": 0.03410398215055466, + "learning_rate": 1.7392655118848735e-05, + "loss": 0.002, + "step": 8246 + }, + { + "epoch": 2.1757024139295607, + "grad_norm": 0.00278744800016284, + "learning_rate": 1.739148275154606e-05, + "loss": 0.0094, + "step": 8248 + }, + { + "epoch": 2.176230048806226, + "grad_norm": 0.12658686935901642, + "learning_rate": 1.7390310384243384e-05, + "loss": 0.0106, + "step": 8250 + }, + { + "epoch": 2.1767576836828915, + "grad_norm": 0.014749069698154926, + "learning_rate": 1.738913801694071e-05, + "loss": 0.0003, + "step": 8252 + }, + { + "epoch": 2.177285318559557, + "grad_norm": 0.07460196316242218, + "learning_rate": 1.7387965649638032e-05, + "loss": 0.0005, + "step": 8254 + }, + { + "epoch": 2.1778129534362223, + "grad_norm": 0.06260213255882263, + "learning_rate": 1.7386793282335358e-05, + "loss": 0.0003, + "step": 8256 + }, + { + "epoch": 2.1783405883128877, + "grad_norm": 0.06230403855443001, + "learning_rate": 1.738562091503268e-05, + "loss": 0.0006, + "step": 8258 + }, + { + "epoch": 2.1788682231895526, + "grad_norm": 0.03944354131817818, + "learning_rate": 1.7384448547730007e-05, + "loss": 0.0005, + "step": 8260 + }, + { + "epoch": 2.179395858066218, + "grad_norm": 0.3355860710144043, + "learning_rate": 1.738327618042733e-05, + "loss": 0.0041, + "step": 8262 + }, + { + "epoch": 2.1799234929428835, + "grad_norm": 0.03746291622519493, + "learning_rate": 1.7382103813124652e-05, + "loss": 0.0016, + "step": 8264 + }, + { + "epoch": 2.180451127819549, + "grad_norm": 0.014512593857944012, + "learning_rate": 1.7380931445821978e-05, + "loss": 0.0004, + "step": 8266 + }, + { + "epoch": 2.1809787626962143, + "grad_norm": 0.03722699359059334, + "learning_rate": 1.73797590785193e-05, + "loss": 0.0011, + "step": 8268 + }, + { + "epoch": 2.1815063975728797, + "grad_norm": 0.09943712502717972, + "learning_rate": 1.7378586711216627e-05, + "loss": 0.0005, + "step": 8270 + }, + { + "epoch": 2.182034032449545, + "grad_norm": 0.015077874064445496, + "learning_rate": 1.737741434391395e-05, + "loss": 0.0005, + "step": 8272 + }, + { + "epoch": 2.18256166732621, + "grad_norm": 0.0038249820936471224, + "learning_rate": 1.7376241976611275e-05, + "loss": 0.0002, + "step": 8274 + }, + { + "epoch": 2.1830893022028754, + "grad_norm": 0.0040447101928293705, + "learning_rate": 1.7375069609308598e-05, + "loss": 0.0001, + "step": 8276 + }, + { + "epoch": 2.183616937079541, + "grad_norm": 0.005903555545955896, + "learning_rate": 1.737389724200592e-05, + "loss": 0.0009, + "step": 8278 + }, + { + "epoch": 2.1841445719562063, + "grad_norm": 0.3276078999042511, + "learning_rate": 1.7372724874703246e-05, + "loss": 0.0061, + "step": 8280 + }, + { + "epoch": 2.1846722068328717, + "grad_norm": 0.24011091887950897, + "learning_rate": 1.7371552507400572e-05, + "loss": 0.0088, + "step": 8282 + }, + { + "epoch": 2.185199841709537, + "grad_norm": 0.09066787362098694, + "learning_rate": 1.7370380140097892e-05, + "loss": 0.0003, + "step": 8284 + }, + { + "epoch": 2.1857274765862025, + "grad_norm": 0.00264868326485157, + "learning_rate": 1.7369207772795218e-05, + "loss": 0.0002, + "step": 8286 + }, + { + "epoch": 2.186255111462868, + "grad_norm": 0.1711413413286209, + "learning_rate": 1.7368035405492544e-05, + "loss": 0.0011, + "step": 8288 + }, + { + "epoch": 2.186782746339533, + "grad_norm": 0.008008982986211777, + "learning_rate": 1.7366863038189866e-05, + "loss": 0.0005, + "step": 8290 + }, + { + "epoch": 2.1873103812161983, + "grad_norm": 0.024440575391054153, + "learning_rate": 1.736569067088719e-05, + "loss": 0.0006, + "step": 8292 + }, + { + "epoch": 2.1878380160928637, + "grad_norm": 0.011570393107831478, + "learning_rate": 1.7364518303584515e-05, + "loss": 0.0003, + "step": 8294 + }, + { + "epoch": 2.188365650969529, + "grad_norm": 0.00882816780358553, + "learning_rate": 1.7363345936281838e-05, + "loss": 0.0003, + "step": 8296 + }, + { + "epoch": 2.1888932858461945, + "grad_norm": 0.01612241007387638, + "learning_rate": 1.736217356897916e-05, + "loss": 0.0015, + "step": 8298 + }, + { + "epoch": 2.18942092072286, + "grad_norm": 0.03550972416996956, + "learning_rate": 1.7361001201676486e-05, + "loss": 0.0017, + "step": 8300 + }, + { + "epoch": 2.1899485555995253, + "grad_norm": 0.18651825189590454, + "learning_rate": 1.7359828834373812e-05, + "loss": 0.0008, + "step": 8302 + }, + { + "epoch": 2.1904761904761907, + "grad_norm": 0.5077625513076782, + "learning_rate": 1.7358656467071135e-05, + "loss": 0.007, + "step": 8304 + }, + { + "epoch": 2.1910038253528556, + "grad_norm": 0.010617831721901894, + "learning_rate": 1.7357484099768457e-05, + "loss": 0.0002, + "step": 8306 + }, + { + "epoch": 2.191531460229521, + "grad_norm": 0.14464199542999268, + "learning_rate": 1.7356311732465783e-05, + "loss": 0.0102, + "step": 8308 + }, + { + "epoch": 2.1920590951061865, + "grad_norm": 0.0618228055536747, + "learning_rate": 1.7355139365163106e-05, + "loss": 0.0006, + "step": 8310 + }, + { + "epoch": 2.192586729982852, + "grad_norm": 0.19525811076164246, + "learning_rate": 1.7353966997860432e-05, + "loss": 0.0013, + "step": 8312 + }, + { + "epoch": 2.1931143648595173, + "grad_norm": 1.4175273180007935, + "learning_rate": 1.7352794630557755e-05, + "loss": 0.0185, + "step": 8314 + }, + { + "epoch": 2.1936419997361827, + "grad_norm": 0.028658468276262283, + "learning_rate": 1.735162226325508e-05, + "loss": 0.0042, + "step": 8316 + }, + { + "epoch": 2.194169634612848, + "grad_norm": 0.03678262233734131, + "learning_rate": 1.7350449895952403e-05, + "loss": 0.0008, + "step": 8318 + }, + { + "epoch": 2.194697269489513, + "grad_norm": 0.2617555558681488, + "learning_rate": 1.734927752864973e-05, + "loss": 0.0017, + "step": 8320 + }, + { + "epoch": 2.1952249043661785, + "grad_norm": 0.02262524887919426, + "learning_rate": 1.7348105161347052e-05, + "loss": 0.0005, + "step": 8322 + }, + { + "epoch": 2.195752539242844, + "grad_norm": 0.12419290095567703, + "learning_rate": 1.7346932794044374e-05, + "loss": 0.0009, + "step": 8324 + }, + { + "epoch": 2.1962801741195093, + "grad_norm": 0.11747438460588455, + "learning_rate": 1.73457604267417e-05, + "loss": 0.0072, + "step": 8326 + }, + { + "epoch": 2.1968078089961747, + "grad_norm": 0.027304664254188538, + "learning_rate": 1.7344588059439023e-05, + "loss": 0.0003, + "step": 8328 + }, + { + "epoch": 2.19733544387284, + "grad_norm": 0.022135354578495026, + "learning_rate": 1.7343415692136346e-05, + "loss": 0.0004, + "step": 8330 + }, + { + "epoch": 2.1978630787495055, + "grad_norm": 0.006422373931854963, + "learning_rate": 1.734224332483367e-05, + "loss": 0.0002, + "step": 8332 + }, + { + "epoch": 2.198390713626171, + "grad_norm": 1.4594578742980957, + "learning_rate": 1.7341070957530998e-05, + "loss": 0.0143, + "step": 8334 + }, + { + "epoch": 2.198918348502836, + "grad_norm": 0.019559001550078392, + "learning_rate": 1.733989859022832e-05, + "loss": 0.0029, + "step": 8336 + }, + { + "epoch": 2.1994459833795013, + "grad_norm": 0.032566118985414505, + "learning_rate": 1.7338726222925643e-05, + "loss": 0.0004, + "step": 8338 + }, + { + "epoch": 2.1999736182561667, + "grad_norm": 0.5356190204620361, + "learning_rate": 1.733755385562297e-05, + "loss": 0.0111, + "step": 8340 + }, + { + "epoch": 2.200501253132832, + "grad_norm": 0.03002321720123291, + "learning_rate": 1.733638148832029e-05, + "loss": 0.0005, + "step": 8342 + }, + { + "epoch": 2.2010288880094975, + "grad_norm": 0.192798912525177, + "learning_rate": 1.7335209121017614e-05, + "loss": 0.0008, + "step": 8344 + }, + { + "epoch": 2.201556522886163, + "grad_norm": 0.6622046232223511, + "learning_rate": 1.733403675371494e-05, + "loss": 0.0073, + "step": 8346 + }, + { + "epoch": 2.2020841577628283, + "grad_norm": 0.13941366970539093, + "learning_rate": 1.7332864386412266e-05, + "loss": 0.0037, + "step": 8348 + }, + { + "epoch": 2.2026117926394937, + "grad_norm": 0.08834230899810791, + "learning_rate": 1.733169201910959e-05, + "loss": 0.01, + "step": 8350 + }, + { + "epoch": 2.2031394275161587, + "grad_norm": 0.04364505037665367, + "learning_rate": 1.733051965180691e-05, + "loss": 0.0117, + "step": 8352 + }, + { + "epoch": 2.203667062392824, + "grad_norm": 0.0893743559718132, + "learning_rate": 1.7329347284504237e-05, + "loss": 0.0046, + "step": 8354 + }, + { + "epoch": 2.2041946972694895, + "grad_norm": 0.11718922108411789, + "learning_rate": 1.732817491720156e-05, + "loss": 0.0008, + "step": 8356 + }, + { + "epoch": 2.204722332146155, + "grad_norm": 0.01647164113819599, + "learning_rate": 1.7327002549898882e-05, + "loss": 0.0016, + "step": 8358 + }, + { + "epoch": 2.2052499670228203, + "grad_norm": 0.04288465902209282, + "learning_rate": 1.732583018259621e-05, + "loss": 0.0008, + "step": 8360 + }, + { + "epoch": 2.2057776018994857, + "grad_norm": 0.012847529724240303, + "learning_rate": 1.7324657815293534e-05, + "loss": 0.0073, + "step": 8362 + }, + { + "epoch": 2.206305236776151, + "grad_norm": 0.016835935413837433, + "learning_rate": 1.7323485447990857e-05, + "loss": 0.0004, + "step": 8364 + }, + { + "epoch": 2.206832871652816, + "grad_norm": 0.012083712965250015, + "learning_rate": 1.732231308068818e-05, + "loss": 0.0004, + "step": 8366 + }, + { + "epoch": 2.2073605065294815, + "grad_norm": 0.004254132974892855, + "learning_rate": 1.7321140713385506e-05, + "loss": 0.0003, + "step": 8368 + }, + { + "epoch": 2.207888141406147, + "grad_norm": 0.008116253651678562, + "learning_rate": 1.731996834608283e-05, + "loss": 0.0014, + "step": 8370 + }, + { + "epoch": 2.2084157762828123, + "grad_norm": 0.006012489553540945, + "learning_rate": 1.7318795978780154e-05, + "loss": 0.0002, + "step": 8372 + }, + { + "epoch": 2.2089434111594777, + "grad_norm": 0.170664444565773, + "learning_rate": 1.7317623611477477e-05, + "loss": 0.0019, + "step": 8374 + }, + { + "epoch": 2.209471046036143, + "grad_norm": 0.44156211614608765, + "learning_rate": 1.73164512441748e-05, + "loss": 0.0037, + "step": 8376 + }, + { + "epoch": 2.2099986809128085, + "grad_norm": 0.061122749000787735, + "learning_rate": 1.7315278876872126e-05, + "loss": 0.0003, + "step": 8378 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.008468052372336388, + "learning_rate": 1.731410650956945e-05, + "loss": 0.001, + "step": 8380 + }, + { + "epoch": 2.211053950666139, + "grad_norm": 0.0032512883190065622, + "learning_rate": 1.7312934142266774e-05, + "loss": 0.0045, + "step": 8382 + }, + { + "epoch": 2.2115815855428043, + "grad_norm": 0.017619645223021507, + "learning_rate": 1.7311761774964097e-05, + "loss": 0.0002, + "step": 8384 + }, + { + "epoch": 2.2121092204194697, + "grad_norm": 0.011626399122178555, + "learning_rate": 1.7310589407661423e-05, + "loss": 0.0003, + "step": 8386 + }, + { + "epoch": 2.212636855296135, + "grad_norm": 0.0021484466269612312, + "learning_rate": 1.7309417040358745e-05, + "loss": 0.0001, + "step": 8388 + }, + { + "epoch": 2.2131644901728005, + "grad_norm": 0.0023024659603834152, + "learning_rate": 1.7308244673056068e-05, + "loss": 0.0002, + "step": 8390 + }, + { + "epoch": 2.213692125049466, + "grad_norm": 0.0028187811840325594, + "learning_rate": 1.7307072305753394e-05, + "loss": 0.0061, + "step": 8392 + }, + { + "epoch": 2.2142197599261313, + "grad_norm": 0.1717222034931183, + "learning_rate": 1.730589993845072e-05, + "loss": 0.0076, + "step": 8394 + }, + { + "epoch": 2.2147473948027967, + "grad_norm": 0.007511578034609556, + "learning_rate": 1.7304727571148043e-05, + "loss": 0.0002, + "step": 8396 + }, + { + "epoch": 2.2152750296794617, + "grad_norm": 0.0033027920871973038, + "learning_rate": 1.7303555203845365e-05, + "loss": 0.0002, + "step": 8398 + }, + { + "epoch": 2.215802664556127, + "grad_norm": 0.3797455430030823, + "learning_rate": 1.730238283654269e-05, + "loss": 0.002, + "step": 8400 + }, + { + "epoch": 2.2163302994327925, + "grad_norm": 0.002651918213814497, + "learning_rate": 1.7301210469240014e-05, + "loss": 0.0001, + "step": 8402 + }, + { + "epoch": 2.216857934309458, + "grad_norm": 0.10071304440498352, + "learning_rate": 1.7300038101937336e-05, + "loss": 0.0028, + "step": 8404 + }, + { + "epoch": 2.2173855691861233, + "grad_norm": 0.0017244184855371714, + "learning_rate": 1.7298865734634662e-05, + "loss": 0.0002, + "step": 8406 + }, + { + "epoch": 2.2179132040627887, + "grad_norm": 0.22156721353530884, + "learning_rate": 1.729769336733199e-05, + "loss": 0.0057, + "step": 8408 + }, + { + "epoch": 2.218440838939454, + "grad_norm": 0.06260412931442261, + "learning_rate": 1.729652100002931e-05, + "loss": 0.0013, + "step": 8410 + }, + { + "epoch": 2.218968473816119, + "grad_norm": 0.01302344724535942, + "learning_rate": 1.7295348632726634e-05, + "loss": 0.001, + "step": 8412 + }, + { + "epoch": 2.2194961086927845, + "grad_norm": 0.017931411042809486, + "learning_rate": 1.729417626542396e-05, + "loss": 0.0003, + "step": 8414 + }, + { + "epoch": 2.22002374356945, + "grad_norm": 0.17726323008537292, + "learning_rate": 1.7293003898121282e-05, + "loss": 0.0056, + "step": 8416 + }, + { + "epoch": 2.2205513784461153, + "grad_norm": 0.12579889595508575, + "learning_rate": 1.7291831530818608e-05, + "loss": 0.001, + "step": 8418 + }, + { + "epoch": 2.2210790133227807, + "grad_norm": 0.004720974247902632, + "learning_rate": 1.729065916351593e-05, + "loss": 0.0002, + "step": 8420 + }, + { + "epoch": 2.221606648199446, + "grad_norm": 0.16032040119171143, + "learning_rate": 1.7289486796213253e-05, + "loss": 0.0019, + "step": 8422 + }, + { + "epoch": 2.2221342830761115, + "grad_norm": 0.04143715277314186, + "learning_rate": 1.728831442891058e-05, + "loss": 0.0006, + "step": 8424 + }, + { + "epoch": 2.2226619179527765, + "grad_norm": 0.2847982347011566, + "learning_rate": 1.7287142061607902e-05, + "loss": 0.0094, + "step": 8426 + }, + { + "epoch": 2.223189552829442, + "grad_norm": 0.03789972513914108, + "learning_rate": 1.7285969694305228e-05, + "loss": 0.0008, + "step": 8428 + }, + { + "epoch": 2.2237171877061073, + "grad_norm": 0.05017007142305374, + "learning_rate": 1.728479732700255e-05, + "loss": 0.0006, + "step": 8430 + }, + { + "epoch": 2.2242448225827727, + "grad_norm": 0.012689475901424885, + "learning_rate": 1.7283624959699877e-05, + "loss": 0.0003, + "step": 8432 + }, + { + "epoch": 2.224772457459438, + "grad_norm": 0.05252321437001228, + "learning_rate": 1.72824525923972e-05, + "loss": 0.0007, + "step": 8434 + }, + { + "epoch": 2.2253000923361035, + "grad_norm": 0.00894249975681305, + "learning_rate": 1.7281280225094522e-05, + "loss": 0.0003, + "step": 8436 + }, + { + "epoch": 2.225827727212769, + "grad_norm": 0.42244189977645874, + "learning_rate": 1.7280107857791848e-05, + "loss": 0.0042, + "step": 8438 + }, + { + "epoch": 2.2263553620894343, + "grad_norm": 0.0065462179481983185, + "learning_rate": 1.7278935490489174e-05, + "loss": 0.0004, + "step": 8440 + }, + { + "epoch": 2.2268829969660993, + "grad_norm": 0.03127770870923996, + "learning_rate": 1.7277763123186496e-05, + "loss": 0.0005, + "step": 8442 + }, + { + "epoch": 2.2274106318427647, + "grad_norm": 0.005151296500116587, + "learning_rate": 1.727659075588382e-05, + "loss": 0.0002, + "step": 8444 + }, + { + "epoch": 2.22793826671943, + "grad_norm": 0.010845806449651718, + "learning_rate": 1.7275418388581145e-05, + "loss": 0.0075, + "step": 8446 + }, + { + "epoch": 2.2284659015960955, + "grad_norm": 0.009361195378005505, + "learning_rate": 1.7274246021278468e-05, + "loss": 0.0003, + "step": 8448 + }, + { + "epoch": 2.228993536472761, + "grad_norm": 0.005443581845611334, + "learning_rate": 1.727307365397579e-05, + "loss": 0.0031, + "step": 8450 + }, + { + "epoch": 2.2295211713494263, + "grad_norm": 0.495936781167984, + "learning_rate": 1.7271901286673116e-05, + "loss": 0.0019, + "step": 8452 + }, + { + "epoch": 2.2300488062260917, + "grad_norm": 0.010094784200191498, + "learning_rate": 1.7270728919370442e-05, + "loss": 0.0002, + "step": 8454 + }, + { + "epoch": 2.230576441102757, + "grad_norm": 0.010054224170744419, + "learning_rate": 1.7269556552067765e-05, + "loss": 0.0074, + "step": 8456 + }, + { + "epoch": 2.231104075979422, + "grad_norm": 0.08069716393947601, + "learning_rate": 1.7268384184765088e-05, + "loss": 0.0009, + "step": 8458 + }, + { + "epoch": 2.2316317108560875, + "grad_norm": 0.14826379716396332, + "learning_rate": 1.7267211817462414e-05, + "loss": 0.0015, + "step": 8460 + }, + { + "epoch": 2.232159345732753, + "grad_norm": 0.5938806533813477, + "learning_rate": 1.7266039450159736e-05, + "loss": 0.0051, + "step": 8462 + }, + { + "epoch": 2.2326869806094183, + "grad_norm": 0.2141047865152359, + "learning_rate": 1.726486708285706e-05, + "loss": 0.004, + "step": 8464 + }, + { + "epoch": 2.2332146154860837, + "grad_norm": 0.006489451043307781, + "learning_rate": 1.7263694715554385e-05, + "loss": 0.0003, + "step": 8466 + }, + { + "epoch": 2.233742250362749, + "grad_norm": 0.021053776144981384, + "learning_rate": 1.7262522348251707e-05, + "loss": 0.0004, + "step": 8468 + }, + { + "epoch": 2.2342698852394145, + "grad_norm": 0.01064358651638031, + "learning_rate": 1.7261349980949033e-05, + "loss": 0.0003, + "step": 8470 + }, + { + "epoch": 2.2347975201160795, + "grad_norm": 0.016864456236362457, + "learning_rate": 1.7260177613646356e-05, + "loss": 0.0003, + "step": 8472 + }, + { + "epoch": 2.235325154992745, + "grad_norm": 0.10281476378440857, + "learning_rate": 1.7259005246343682e-05, + "loss": 0.0007, + "step": 8474 + }, + { + "epoch": 2.2358527898694103, + "grad_norm": 0.017733648419380188, + "learning_rate": 1.7257832879041005e-05, + "loss": 0.0004, + "step": 8476 + }, + { + "epoch": 2.2363804247460757, + "grad_norm": 0.11674916744232178, + "learning_rate": 1.725666051173833e-05, + "loss": 0.0069, + "step": 8478 + }, + { + "epoch": 2.236908059622741, + "grad_norm": 0.03259913623332977, + "learning_rate": 1.7255488144435653e-05, + "loss": 0.001, + "step": 8480 + }, + { + "epoch": 2.2374356944994065, + "grad_norm": 0.3094223141670227, + "learning_rate": 1.7254315777132976e-05, + "loss": 0.0046, + "step": 8482 + }, + { + "epoch": 2.237963329376072, + "grad_norm": 0.2831077575683594, + "learning_rate": 1.7253143409830302e-05, + "loss": 0.0028, + "step": 8484 + }, + { + "epoch": 2.2384909642527373, + "grad_norm": 0.10720396786928177, + "learning_rate": 1.7251971042527624e-05, + "loss": 0.0005, + "step": 8486 + }, + { + "epoch": 2.2390185991294023, + "grad_norm": 0.1971769481897354, + "learning_rate": 1.725079867522495e-05, + "loss": 0.0018, + "step": 8488 + }, + { + "epoch": 2.2395462340060677, + "grad_norm": 0.6521124243736267, + "learning_rate": 1.7249626307922273e-05, + "loss": 0.0038, + "step": 8490 + }, + { + "epoch": 2.240073868882733, + "grad_norm": 0.0052849226631224155, + "learning_rate": 1.72484539406196e-05, + "loss": 0.0002, + "step": 8492 + }, + { + "epoch": 2.2406015037593985, + "grad_norm": 0.26846250891685486, + "learning_rate": 1.724728157331692e-05, + "loss": 0.0044, + "step": 8494 + }, + { + "epoch": 2.241129138636064, + "grad_norm": 0.20567865669727325, + "learning_rate": 1.7246109206014244e-05, + "loss": 0.0014, + "step": 8496 + }, + { + "epoch": 2.2416567735127293, + "grad_norm": 0.02122206799685955, + "learning_rate": 1.724493683871157e-05, + "loss": 0.0014, + "step": 8498 + }, + { + "epoch": 2.2421844083893947, + "grad_norm": 0.21814444661140442, + "learning_rate": 1.7243764471408896e-05, + "loss": 0.0061, + "step": 8500 + }, + { + "epoch": 2.24271204326606, + "grad_norm": 0.07184194773435593, + "learning_rate": 1.724259210410622e-05, + "loss": 0.0006, + "step": 8502 + }, + { + "epoch": 2.243239678142725, + "grad_norm": 0.24902264773845673, + "learning_rate": 1.724141973680354e-05, + "loss": 0.0012, + "step": 8504 + }, + { + "epoch": 2.2437673130193905, + "grad_norm": 0.02024879865348339, + "learning_rate": 1.7240247369500867e-05, + "loss": 0.0004, + "step": 8506 + }, + { + "epoch": 2.244294947896056, + "grad_norm": 0.022230759263038635, + "learning_rate": 1.723907500219819e-05, + "loss": 0.0007, + "step": 8508 + }, + { + "epoch": 2.2448225827727213, + "grad_norm": 0.01263556256890297, + "learning_rate": 1.7237902634895513e-05, + "loss": 0.0008, + "step": 8510 + }, + { + "epoch": 2.2453502176493867, + "grad_norm": 0.005056761670857668, + "learning_rate": 1.723673026759284e-05, + "loss": 0.0002, + "step": 8512 + }, + { + "epoch": 2.245877852526052, + "grad_norm": 0.006085589062422514, + "learning_rate": 1.7235557900290165e-05, + "loss": 0.0003, + "step": 8514 + }, + { + "epoch": 2.2464054874027175, + "grad_norm": 0.007154401857405901, + "learning_rate": 1.7234385532987484e-05, + "loss": 0.0002, + "step": 8516 + }, + { + "epoch": 2.2469331222793825, + "grad_norm": 0.1991087794303894, + "learning_rate": 1.723321316568481e-05, + "loss": 0.001, + "step": 8518 + }, + { + "epoch": 2.247460757156048, + "grad_norm": 0.016829099506139755, + "learning_rate": 1.7232040798382136e-05, + "loss": 0.0002, + "step": 8520 + }, + { + "epoch": 2.2479883920327133, + "grad_norm": 0.2834281623363495, + "learning_rate": 1.723086843107946e-05, + "loss": 0.0015, + "step": 8522 + }, + { + "epoch": 2.2485160269093787, + "grad_norm": 0.010330363176763058, + "learning_rate": 1.722969606377678e-05, + "loss": 0.0002, + "step": 8524 + }, + { + "epoch": 2.249043661786044, + "grad_norm": 0.0046426053158938885, + "learning_rate": 1.7228523696474107e-05, + "loss": 0.0001, + "step": 8526 + }, + { + "epoch": 2.2495712966627095, + "grad_norm": 0.6302744150161743, + "learning_rate": 1.722735132917143e-05, + "loss": 0.003, + "step": 8528 + }, + { + "epoch": 2.250098931539375, + "grad_norm": 0.0742327868938446, + "learning_rate": 1.7226178961868756e-05, + "loss": 0.0002, + "step": 8530 + }, + { + "epoch": 2.25062656641604, + "grad_norm": 0.0008886553696356714, + "learning_rate": 1.7225006594566078e-05, + "loss": 0.0001, + "step": 8532 + }, + { + "epoch": 2.2511542012927053, + "grad_norm": 0.007921081967651844, + "learning_rate": 1.7223834227263404e-05, + "loss": 0.0002, + "step": 8534 + }, + { + "epoch": 2.2516818361693707, + "grad_norm": 0.3019741475582123, + "learning_rate": 1.7222661859960727e-05, + "loss": 0.0043, + "step": 8536 + }, + { + "epoch": 2.252209471046036, + "grad_norm": 0.0031368406489491463, + "learning_rate": 1.7221489492658053e-05, + "loss": 0.001, + "step": 8538 + }, + { + "epoch": 2.2527371059227015, + "grad_norm": 0.0046815937384963036, + "learning_rate": 1.7220317125355376e-05, + "loss": 0.0006, + "step": 8540 + }, + { + "epoch": 2.253264740799367, + "grad_norm": 0.015242725610733032, + "learning_rate": 1.7219144758052698e-05, + "loss": 0.0002, + "step": 8542 + }, + { + "epoch": 2.2537923756760323, + "grad_norm": 0.15768827497959137, + "learning_rate": 1.7217972390750024e-05, + "loss": 0.004, + "step": 8544 + }, + { + "epoch": 2.2543200105526977, + "grad_norm": 0.004094903822988272, + "learning_rate": 1.7216800023447347e-05, + "loss": 0.0043, + "step": 8546 + }, + { + "epoch": 2.254847645429363, + "grad_norm": 0.010110889561474323, + "learning_rate": 1.7215627656144673e-05, + "loss": 0.0002, + "step": 8548 + }, + { + "epoch": 2.255375280306028, + "grad_norm": 0.005376966204494238, + "learning_rate": 1.7214455288841995e-05, + "loss": 0.0002, + "step": 8550 + }, + { + "epoch": 2.2559029151826935, + "grad_norm": 0.2026139199733734, + "learning_rate": 1.721328292153932e-05, + "loss": 0.0012, + "step": 8552 + }, + { + "epoch": 2.256430550059359, + "grad_norm": 0.34359434247016907, + "learning_rate": 1.7212110554236644e-05, + "loss": 0.0118, + "step": 8554 + }, + { + "epoch": 2.2569581849360243, + "grad_norm": 0.0014667082577943802, + "learning_rate": 1.7210938186933967e-05, + "loss": 0.0001, + "step": 8556 + }, + { + "epoch": 2.2574858198126897, + "grad_norm": 0.2532248795032501, + "learning_rate": 1.7209765819631293e-05, + "loss": 0.0067, + "step": 8558 + }, + { + "epoch": 2.258013454689355, + "grad_norm": 0.005477135069668293, + "learning_rate": 1.720859345232862e-05, + "loss": 0.0081, + "step": 8560 + }, + { + "epoch": 2.2585410895660205, + "grad_norm": 0.36042720079421997, + "learning_rate": 1.7207421085025938e-05, + "loss": 0.0042, + "step": 8562 + }, + { + "epoch": 2.2590687244426855, + "grad_norm": 0.40170541405677795, + "learning_rate": 1.7206248717723264e-05, + "loss": 0.0024, + "step": 8564 + }, + { + "epoch": 2.259596359319351, + "grad_norm": 0.17989274859428406, + "learning_rate": 1.720507635042059e-05, + "loss": 0.0138, + "step": 8566 + }, + { + "epoch": 2.2601239941960163, + "grad_norm": 0.013326388783752918, + "learning_rate": 1.7203903983117912e-05, + "loss": 0.0007, + "step": 8568 + }, + { + "epoch": 2.2606516290726817, + "grad_norm": 0.04390653222799301, + "learning_rate": 1.7202731615815235e-05, + "loss": 0.0003, + "step": 8570 + }, + { + "epoch": 2.261179263949347, + "grad_norm": 0.014507790096104145, + "learning_rate": 1.720155924851256e-05, + "loss": 0.0005, + "step": 8572 + }, + { + "epoch": 2.2617068988260125, + "grad_norm": 0.06532935053110123, + "learning_rate": 1.7200386881209884e-05, + "loss": 0.0007, + "step": 8574 + }, + { + "epoch": 2.262234533702678, + "grad_norm": 0.05766468495130539, + "learning_rate": 1.7199214513907206e-05, + "loss": 0.0029, + "step": 8576 + }, + { + "epoch": 2.262762168579343, + "grad_norm": 0.2963242828845978, + "learning_rate": 1.7198042146604532e-05, + "loss": 0.0067, + "step": 8578 + }, + { + "epoch": 2.2632898034560083, + "grad_norm": 0.009045944549143314, + "learning_rate": 1.7196869779301858e-05, + "loss": 0.0003, + "step": 8580 + }, + { + "epoch": 2.2638174383326737, + "grad_norm": 0.009453500621020794, + "learning_rate": 1.719569741199918e-05, + "loss": 0.0002, + "step": 8582 + }, + { + "epoch": 2.264345073209339, + "grad_norm": 0.05829896405339241, + "learning_rate": 1.7194525044696503e-05, + "loss": 0.0004, + "step": 8584 + }, + { + "epoch": 2.2648727080860045, + "grad_norm": 0.020013872534036636, + "learning_rate": 1.719335267739383e-05, + "loss": 0.0108, + "step": 8586 + }, + { + "epoch": 2.26540034296267, + "grad_norm": 0.022712763398885727, + "learning_rate": 1.7192180310091152e-05, + "loss": 0.0004, + "step": 8588 + }, + { + "epoch": 2.2659279778393353, + "grad_norm": 0.021926838904619217, + "learning_rate": 1.7191007942788478e-05, + "loss": 0.0016, + "step": 8590 + }, + { + "epoch": 2.2664556127160007, + "grad_norm": 0.027179623022675514, + "learning_rate": 1.71898355754858e-05, + "loss": 0.0005, + "step": 8592 + }, + { + "epoch": 2.266983247592666, + "grad_norm": 0.023839786648750305, + "learning_rate": 1.7188663208183127e-05, + "loss": 0.0011, + "step": 8594 + }, + { + "epoch": 2.267510882469331, + "grad_norm": 0.01749989576637745, + "learning_rate": 1.718749084088045e-05, + "loss": 0.0004, + "step": 8596 + }, + { + "epoch": 2.2680385173459965, + "grad_norm": 0.0546785444021225, + "learning_rate": 1.7186318473577775e-05, + "loss": 0.0003, + "step": 8598 + }, + { + "epoch": 2.268566152222662, + "grad_norm": 0.00834907777607441, + "learning_rate": 1.7185146106275098e-05, + "loss": 0.0004, + "step": 8600 + }, + { + "epoch": 2.2690937870993273, + "grad_norm": 0.36372390389442444, + "learning_rate": 1.718397373897242e-05, + "loss": 0.0019, + "step": 8602 + }, + { + "epoch": 2.2696214219759927, + "grad_norm": 0.024942703545093536, + "learning_rate": 1.7182801371669746e-05, + "loss": 0.0002, + "step": 8604 + }, + { + "epoch": 2.270149056852658, + "grad_norm": 0.2211991846561432, + "learning_rate": 1.718162900436707e-05, + "loss": 0.0005, + "step": 8606 + }, + { + "epoch": 2.2706766917293235, + "grad_norm": 0.02513742446899414, + "learning_rate": 1.718045663706439e-05, + "loss": 0.0006, + "step": 8608 + }, + { + "epoch": 2.2712043266059885, + "grad_norm": 0.1610202044248581, + "learning_rate": 1.7179284269761718e-05, + "loss": 0.0007, + "step": 8610 + }, + { + "epoch": 2.271731961482654, + "grad_norm": 0.0201695766299963, + "learning_rate": 1.7178111902459044e-05, + "loss": 0.0002, + "step": 8612 + }, + { + "epoch": 2.2722595963593193, + "grad_norm": 0.23173178732395172, + "learning_rate": 1.7176939535156366e-05, + "loss": 0.0014, + "step": 8614 + }, + { + "epoch": 2.2727872312359847, + "grad_norm": 0.015247530303895473, + "learning_rate": 1.717576716785369e-05, + "loss": 0.0002, + "step": 8616 + }, + { + "epoch": 2.27331486611265, + "grad_norm": 0.00740092433989048, + "learning_rate": 1.7174594800551015e-05, + "loss": 0.0002, + "step": 8618 + }, + { + "epoch": 2.2738425009893155, + "grad_norm": 0.14538057148456573, + "learning_rate": 1.7173422433248337e-05, + "loss": 0.0007, + "step": 8620 + }, + { + "epoch": 2.274370135865981, + "grad_norm": 0.005770462565124035, + "learning_rate": 1.717225006594566e-05, + "loss": 0.0002, + "step": 8622 + }, + { + "epoch": 2.274897770742646, + "grad_norm": 0.003978466149419546, + "learning_rate": 1.7171077698642986e-05, + "loss": 0.0022, + "step": 8624 + }, + { + "epoch": 2.2754254056193113, + "grad_norm": 0.005687805823981762, + "learning_rate": 1.7169905331340312e-05, + "loss": 0.0002, + "step": 8626 + }, + { + "epoch": 2.2759530404959767, + "grad_norm": 0.07244762778282166, + "learning_rate": 1.7168732964037635e-05, + "loss": 0.0004, + "step": 8628 + }, + { + "epoch": 2.276480675372642, + "grad_norm": 0.005557919852435589, + "learning_rate": 1.7167560596734957e-05, + "loss": 0.0002, + "step": 8630 + }, + { + "epoch": 2.2770083102493075, + "grad_norm": 0.001629962120205164, + "learning_rate": 1.7166388229432283e-05, + "loss": 0.0045, + "step": 8632 + }, + { + "epoch": 2.277535945125973, + "grad_norm": 0.04354281350970268, + "learning_rate": 1.7165215862129606e-05, + "loss": 0.0002, + "step": 8634 + }, + { + "epoch": 2.2780635800026383, + "grad_norm": 0.011091741733253002, + "learning_rate": 1.716404349482693e-05, + "loss": 0.0002, + "step": 8636 + }, + { + "epoch": 2.2785912148793033, + "grad_norm": 0.4520401060581207, + "learning_rate": 1.7162871127524255e-05, + "loss": 0.0045, + "step": 8638 + }, + { + "epoch": 2.2791188497559687, + "grad_norm": 0.012247256934642792, + "learning_rate": 1.716169876022158e-05, + "loss": 0.0002, + "step": 8640 + }, + { + "epoch": 2.279646484632634, + "grad_norm": 0.3082554340362549, + "learning_rate": 1.7160526392918903e-05, + "loss": 0.0037, + "step": 8642 + }, + { + "epoch": 2.2801741195092995, + "grad_norm": 0.003101289737969637, + "learning_rate": 1.7159354025616226e-05, + "loss": 0.0002, + "step": 8644 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 0.27153927087783813, + "learning_rate": 1.7158181658313552e-05, + "loss": 0.0095, + "step": 8646 + }, + { + "epoch": 2.2812293892626303, + "grad_norm": 0.006360986735671759, + "learning_rate": 1.7157009291010874e-05, + "loss": 0.0023, + "step": 8648 + }, + { + "epoch": 2.2817570241392957, + "grad_norm": 0.09717493504285812, + "learning_rate": 1.71558369237082e-05, + "loss": 0.0013, + "step": 8650 + }, + { + "epoch": 2.282284659015961, + "grad_norm": 0.07839959859848022, + "learning_rate": 1.7154664556405523e-05, + "loss": 0.0006, + "step": 8652 + }, + { + "epoch": 2.2828122938926265, + "grad_norm": 0.10605234652757645, + "learning_rate": 1.7153492189102846e-05, + "loss": 0.0006, + "step": 8654 + }, + { + "epoch": 2.2833399287692915, + "grad_norm": 0.015104962512850761, + "learning_rate": 1.715231982180017e-05, + "loss": 0.0002, + "step": 8656 + }, + { + "epoch": 2.283867563645957, + "grad_norm": 0.04328875616192818, + "learning_rate": 1.7151147454497498e-05, + "loss": 0.0003, + "step": 8658 + }, + { + "epoch": 2.2843951985226223, + "grad_norm": 0.008640826679766178, + "learning_rate": 1.714997508719482e-05, + "loss": 0.0002, + "step": 8660 + }, + { + "epoch": 2.2849228333992877, + "grad_norm": 0.30677077174186707, + "learning_rate": 1.7148802719892143e-05, + "loss": 0.0107, + "step": 8662 + }, + { + "epoch": 2.285450468275953, + "grad_norm": 0.0031739152036607265, + "learning_rate": 1.714763035258947e-05, + "loss": 0.0122, + "step": 8664 + }, + { + "epoch": 2.2859781031526185, + "grad_norm": 0.013510925695300102, + "learning_rate": 1.714645798528679e-05, + "loss": 0.0109, + "step": 8666 + }, + { + "epoch": 2.286505738029284, + "grad_norm": 0.18234370648860931, + "learning_rate": 1.7145285617984114e-05, + "loss": 0.0109, + "step": 8668 + }, + { + "epoch": 2.287033372905949, + "grad_norm": 0.0214503426104784, + "learning_rate": 1.714411325068144e-05, + "loss": 0.0003, + "step": 8670 + }, + { + "epoch": 2.2875610077826143, + "grad_norm": 0.0161298681050539, + "learning_rate": 1.7142940883378766e-05, + "loss": 0.0015, + "step": 8672 + }, + { + "epoch": 2.2880886426592797, + "grad_norm": 0.016513794660568237, + "learning_rate": 1.714176851607609e-05, + "loss": 0.0004, + "step": 8674 + }, + { + "epoch": 2.288616277535945, + "grad_norm": 0.012697076424956322, + "learning_rate": 1.714059614877341e-05, + "loss": 0.0004, + "step": 8676 + }, + { + "epoch": 2.2891439124126105, + "grad_norm": 0.035870011895895004, + "learning_rate": 1.7139423781470737e-05, + "loss": 0.0059, + "step": 8678 + }, + { + "epoch": 2.289671547289276, + "grad_norm": 0.016736825928092003, + "learning_rate": 1.713825141416806e-05, + "loss": 0.0005, + "step": 8680 + }, + { + "epoch": 2.2901991821659413, + "grad_norm": 0.014034248888492584, + "learning_rate": 1.7137079046865382e-05, + "loss": 0.0004, + "step": 8682 + }, + { + "epoch": 2.2907268170426063, + "grad_norm": 0.06229068338871002, + "learning_rate": 1.713590667956271e-05, + "loss": 0.0007, + "step": 8684 + }, + { + "epoch": 2.2912544519192717, + "grad_norm": 0.007007832173258066, + "learning_rate": 1.7134734312260034e-05, + "loss": 0.0003, + "step": 8686 + }, + { + "epoch": 2.291782086795937, + "grad_norm": 0.01804090477526188, + "learning_rate": 1.7133561944957357e-05, + "loss": 0.0003, + "step": 8688 + }, + { + "epoch": 2.2923097216726025, + "grad_norm": 0.00967433676123619, + "learning_rate": 1.713238957765468e-05, + "loss": 0.0006, + "step": 8690 + }, + { + "epoch": 2.292837356549268, + "grad_norm": 0.01801360957324505, + "learning_rate": 1.7131217210352006e-05, + "loss": 0.0003, + "step": 8692 + }, + { + "epoch": 2.2933649914259333, + "grad_norm": 0.24138811230659485, + "learning_rate": 1.7130044843049328e-05, + "loss": 0.0081, + "step": 8694 + }, + { + "epoch": 2.2938926263025987, + "grad_norm": 0.06099320575594902, + "learning_rate": 1.712887247574665e-05, + "loss": 0.0021, + "step": 8696 + }, + { + "epoch": 2.294420261179264, + "grad_norm": 0.02074822597205639, + "learning_rate": 1.7127700108443977e-05, + "loss": 0.0003, + "step": 8698 + }, + { + "epoch": 2.2949478960559295, + "grad_norm": 0.0046663968823850155, + "learning_rate": 1.71265277411413e-05, + "loss": 0.0026, + "step": 8700 + }, + { + "epoch": 2.2954755309325945, + "grad_norm": 0.011029638350009918, + "learning_rate": 1.7125355373838625e-05, + "loss": 0.0031, + "step": 8702 + }, + { + "epoch": 2.29600316580926, + "grad_norm": 0.013731816783547401, + "learning_rate": 1.7124183006535948e-05, + "loss": 0.0002, + "step": 8704 + }, + { + "epoch": 2.2965308006859253, + "grad_norm": 0.09160952270030975, + "learning_rate": 1.7123010639233274e-05, + "loss": 0.0031, + "step": 8706 + }, + { + "epoch": 2.2970584355625907, + "grad_norm": 0.007079300470650196, + "learning_rate": 1.7121838271930597e-05, + "loss": 0.0003, + "step": 8708 + }, + { + "epoch": 2.297586070439256, + "grad_norm": 0.01574878580868244, + "learning_rate": 1.7120665904627923e-05, + "loss": 0.0003, + "step": 8710 + }, + { + "epoch": 2.2981137053159215, + "grad_norm": 0.020566239953041077, + "learning_rate": 1.7119493537325245e-05, + "loss": 0.0007, + "step": 8712 + }, + { + "epoch": 2.298641340192587, + "grad_norm": 0.0051191081292927265, + "learning_rate": 1.7118321170022568e-05, + "loss": 0.0009, + "step": 8714 + }, + { + "epoch": 2.299168975069252, + "grad_norm": 0.038277242332696915, + "learning_rate": 1.7117148802719894e-05, + "loss": 0.0015, + "step": 8716 + }, + { + "epoch": 2.2996966099459173, + "grad_norm": 0.2927088141441345, + "learning_rate": 1.711597643541722e-05, + "loss": 0.006, + "step": 8718 + }, + { + "epoch": 2.3002242448225827, + "grad_norm": 0.009145825169980526, + "learning_rate": 1.7114804068114543e-05, + "loss": 0.0057, + "step": 8720 + }, + { + "epoch": 2.300751879699248, + "grad_norm": 0.2945740520954132, + "learning_rate": 1.7113631700811865e-05, + "loss": 0.0073, + "step": 8722 + }, + { + "epoch": 2.3012795145759135, + "grad_norm": 0.02507849782705307, + "learning_rate": 1.711245933350919e-05, + "loss": 0.0004, + "step": 8724 + }, + { + "epoch": 2.301807149452579, + "grad_norm": 0.1252179890871048, + "learning_rate": 1.7111286966206514e-05, + "loss": 0.0101, + "step": 8726 + }, + { + "epoch": 2.3023347843292443, + "grad_norm": 0.03797069564461708, + "learning_rate": 1.7110114598903836e-05, + "loss": 0.0005, + "step": 8728 + }, + { + "epoch": 2.3028624192059093, + "grad_norm": 0.01681564375758171, + "learning_rate": 1.7108942231601162e-05, + "loss": 0.0004, + "step": 8730 + }, + { + "epoch": 2.3033900540825747, + "grad_norm": 0.33913475275039673, + "learning_rate": 1.710776986429849e-05, + "loss": 0.0221, + "step": 8732 + }, + { + "epoch": 2.30391768895924, + "grad_norm": 0.026551201939582825, + "learning_rate": 1.710659749699581e-05, + "loss": 0.0051, + "step": 8734 + }, + { + "epoch": 2.3044453238359055, + "grad_norm": 0.0071457866579294205, + "learning_rate": 1.7105425129693134e-05, + "loss": 0.0005, + "step": 8736 + }, + { + "epoch": 2.304972958712571, + "grad_norm": 0.34810084104537964, + "learning_rate": 1.710425276239046e-05, + "loss": 0.0069, + "step": 8738 + }, + { + "epoch": 2.3055005935892363, + "grad_norm": 0.1534891575574875, + "learning_rate": 1.7103080395087782e-05, + "loss": 0.0063, + "step": 8740 + }, + { + "epoch": 2.3060282284659017, + "grad_norm": 0.008494382724165916, + "learning_rate": 1.7101908027785105e-05, + "loss": 0.0014, + "step": 8742 + }, + { + "epoch": 2.306555863342567, + "grad_norm": 0.06633397191762924, + "learning_rate": 1.710073566048243e-05, + "loss": 0.0008, + "step": 8744 + }, + { + "epoch": 2.3070834982192325, + "grad_norm": 0.5429632663726807, + "learning_rate": 1.7099563293179753e-05, + "loss": 0.0048, + "step": 8746 + }, + { + "epoch": 2.3076111330958975, + "grad_norm": 0.3907237648963928, + "learning_rate": 1.709839092587708e-05, + "loss": 0.0018, + "step": 8748 + }, + { + "epoch": 2.308138767972563, + "grad_norm": 0.2820742428302765, + "learning_rate": 1.7097218558574402e-05, + "loss": 0.003, + "step": 8750 + }, + { + "epoch": 2.3086664028492283, + "grad_norm": 0.024010175839066505, + "learning_rate": 1.7096046191271728e-05, + "loss": 0.0008, + "step": 8752 + }, + { + "epoch": 2.3091940377258937, + "grad_norm": 0.04958511143922806, + "learning_rate": 1.709487382396905e-05, + "loss": 0.0046, + "step": 8754 + }, + { + "epoch": 2.309721672602559, + "grad_norm": 0.01586506888270378, + "learning_rate": 1.7093701456666373e-05, + "loss": 0.0006, + "step": 8756 + }, + { + "epoch": 2.3102493074792245, + "grad_norm": 0.012422863394021988, + "learning_rate": 1.70925290893637e-05, + "loss": 0.0005, + "step": 8758 + }, + { + "epoch": 2.31077694235589, + "grad_norm": 0.07050211727619171, + "learning_rate": 1.7091356722061022e-05, + "loss": 0.0007, + "step": 8760 + }, + { + "epoch": 2.311304577232555, + "grad_norm": 0.00716213695704937, + "learning_rate": 1.7090184354758348e-05, + "loss": 0.0003, + "step": 8762 + }, + { + "epoch": 2.3118322121092203, + "grad_norm": 0.0253651924431324, + "learning_rate": 1.708901198745567e-05, + "loss": 0.0007, + "step": 8764 + }, + { + "epoch": 2.3123598469858857, + "grad_norm": 0.044641777873039246, + "learning_rate": 1.7087839620152996e-05, + "loss": 0.0008, + "step": 8766 + }, + { + "epoch": 2.312887481862551, + "grad_norm": 0.00816809106618166, + "learning_rate": 1.708666725285032e-05, + "loss": 0.0005, + "step": 8768 + }, + { + "epoch": 2.3134151167392165, + "grad_norm": 0.0042972080409526825, + "learning_rate": 1.7085494885547645e-05, + "loss": 0.0002, + "step": 8770 + }, + { + "epoch": 2.313942751615882, + "grad_norm": 0.003884693142026663, + "learning_rate": 1.7084322518244968e-05, + "loss": 0.0094, + "step": 8772 + }, + { + "epoch": 2.3144703864925473, + "grad_norm": 0.024725040420889854, + "learning_rate": 1.708315015094229e-05, + "loss": 0.0005, + "step": 8774 + }, + { + "epoch": 2.3149980213692123, + "grad_norm": 0.3317440152168274, + "learning_rate": 1.7081977783639616e-05, + "loss": 0.0043, + "step": 8776 + }, + { + "epoch": 2.3155256562458777, + "grad_norm": 0.032181985676288605, + "learning_rate": 1.7080805416336942e-05, + "loss": 0.0098, + "step": 8778 + }, + { + "epoch": 2.316053291122543, + "grad_norm": 0.08577673137187958, + "learning_rate": 1.7079633049034265e-05, + "loss": 0.0013, + "step": 8780 + }, + { + "epoch": 2.3165809259992085, + "grad_norm": 0.07081153988838196, + "learning_rate": 1.7078460681731587e-05, + "loss": 0.0009, + "step": 8782 + }, + { + "epoch": 2.317108560875874, + "grad_norm": 0.15181352198123932, + "learning_rate": 1.7077288314428913e-05, + "loss": 0.0153, + "step": 8784 + }, + { + "epoch": 2.3176361957525393, + "grad_norm": 0.09813249856233597, + "learning_rate": 1.7076115947126236e-05, + "loss": 0.0049, + "step": 8786 + }, + { + "epoch": 2.3181638306292047, + "grad_norm": 0.32996541261672974, + "learning_rate": 1.707494357982356e-05, + "loss": 0.0049, + "step": 8788 + }, + { + "epoch": 2.31869146550587, + "grad_norm": 0.037944596260786057, + "learning_rate": 1.7073771212520885e-05, + "loss": 0.0007, + "step": 8790 + }, + { + "epoch": 2.319219100382535, + "grad_norm": 0.006118190009146929, + "learning_rate": 1.7072598845218207e-05, + "loss": 0.0025, + "step": 8792 + }, + { + "epoch": 2.3197467352592005, + "grad_norm": 0.01830967515707016, + "learning_rate": 1.707142647791553e-05, + "loss": 0.0002, + "step": 8794 + }, + { + "epoch": 2.320274370135866, + "grad_norm": 0.18636950850486755, + "learning_rate": 1.7070254110612856e-05, + "loss": 0.0044, + "step": 8796 + }, + { + "epoch": 2.3208020050125313, + "grad_norm": 0.003553044982254505, + "learning_rate": 1.7069081743310182e-05, + "loss": 0.0001, + "step": 8798 + }, + { + "epoch": 2.3213296398891967, + "grad_norm": 0.058749277144670486, + "learning_rate": 1.7067909376007505e-05, + "loss": 0.0108, + "step": 8800 + }, + { + "epoch": 2.321857274765862, + "grad_norm": 0.018077919259667397, + "learning_rate": 1.7066737008704827e-05, + "loss": 0.0004, + "step": 8802 + }, + { + "epoch": 2.3223849096425275, + "grad_norm": 0.012046977877616882, + "learning_rate": 1.7065564641402153e-05, + "loss": 0.0002, + "step": 8804 + }, + { + "epoch": 2.322912544519193, + "grad_norm": 0.012318301945924759, + "learning_rate": 1.7064392274099476e-05, + "loss": 0.0004, + "step": 8806 + }, + { + "epoch": 2.323440179395858, + "grad_norm": 0.007255181670188904, + "learning_rate": 1.7063219906796802e-05, + "loss": 0.0003, + "step": 8808 + }, + { + "epoch": 2.3239678142725233, + "grad_norm": 0.005244013853371143, + "learning_rate": 1.7062047539494124e-05, + "loss": 0.0006, + "step": 8810 + }, + { + "epoch": 2.3244954491491887, + "grad_norm": 0.2406160831451416, + "learning_rate": 1.706087517219145e-05, + "loss": 0.0045, + "step": 8812 + }, + { + "epoch": 2.325023084025854, + "grad_norm": 0.011626848950982094, + "learning_rate": 1.7059702804888773e-05, + "loss": 0.0002, + "step": 8814 + }, + { + "epoch": 2.3255507189025195, + "grad_norm": 0.0021792587358504534, + "learning_rate": 1.7058530437586096e-05, + "loss": 0.0053, + "step": 8816 + }, + { + "epoch": 2.326078353779185, + "grad_norm": 0.005340514704585075, + "learning_rate": 1.705735807028342e-05, + "loss": 0.0049, + "step": 8818 + }, + { + "epoch": 2.3266059886558503, + "grad_norm": 0.07369290292263031, + "learning_rate": 1.7056185702980744e-05, + "loss": 0.0044, + "step": 8820 + }, + { + "epoch": 2.3271336235325153, + "grad_norm": 0.010532677173614502, + "learning_rate": 1.705501333567807e-05, + "loss": 0.0003, + "step": 8822 + }, + { + "epoch": 2.3276612584091807, + "grad_norm": 0.21929380297660828, + "learning_rate": 1.7053840968375393e-05, + "loss": 0.0011, + "step": 8824 + }, + { + "epoch": 2.328188893285846, + "grad_norm": 0.05185902491211891, + "learning_rate": 1.705266860107272e-05, + "loss": 0.0005, + "step": 8826 + }, + { + "epoch": 2.3287165281625115, + "grad_norm": 0.007229796145111322, + "learning_rate": 1.705149623377004e-05, + "loss": 0.0014, + "step": 8828 + }, + { + "epoch": 2.329244163039177, + "grad_norm": 0.06337716430425644, + "learning_rate": 1.7050323866467367e-05, + "loss": 0.0006, + "step": 8830 + }, + { + "epoch": 2.3297717979158423, + "grad_norm": 0.0030299897771328688, + "learning_rate": 1.704915149916469e-05, + "loss": 0.0002, + "step": 8832 + }, + { + "epoch": 2.3302994327925077, + "grad_norm": 0.023154232650995255, + "learning_rate": 1.7047979131862013e-05, + "loss": 0.0004, + "step": 8834 + }, + { + "epoch": 2.3308270676691727, + "grad_norm": 0.0044453563168644905, + "learning_rate": 1.704680676455934e-05, + "loss": 0.0002, + "step": 8836 + }, + { + "epoch": 2.331354702545838, + "grad_norm": 0.0025377320125699043, + "learning_rate": 1.7045634397256665e-05, + "loss": 0.0011, + "step": 8838 + }, + { + "epoch": 2.3318823374225035, + "grad_norm": 0.149427130818367, + "learning_rate": 1.7044462029953984e-05, + "loss": 0.0008, + "step": 8840 + }, + { + "epoch": 2.332409972299169, + "grad_norm": 0.0950622409582138, + "learning_rate": 1.704328966265131e-05, + "loss": 0.0009, + "step": 8842 + }, + { + "epoch": 2.3329376071758343, + "grad_norm": 0.29611751437187195, + "learning_rate": 1.7042117295348636e-05, + "loss": 0.0052, + "step": 8844 + }, + { + "epoch": 2.3334652420524997, + "grad_norm": 0.0573396198451519, + "learning_rate": 1.704094492804596e-05, + "loss": 0.0025, + "step": 8846 + }, + { + "epoch": 2.333992876929165, + "grad_norm": 0.009508030489087105, + "learning_rate": 1.703977256074328e-05, + "loss": 0.0002, + "step": 8848 + }, + { + "epoch": 2.3345205118058305, + "grad_norm": 0.0030831745825707912, + "learning_rate": 1.7038600193440607e-05, + "loss": 0.0002, + "step": 8850 + }, + { + "epoch": 2.335048146682496, + "grad_norm": 0.004662224557250738, + "learning_rate": 1.703742782613793e-05, + "loss": 0.0001, + "step": 8852 + }, + { + "epoch": 2.335575781559161, + "grad_norm": 0.0061960164457559586, + "learning_rate": 1.7036255458835252e-05, + "loss": 0.0055, + "step": 8854 + }, + { + "epoch": 2.3361034164358263, + "grad_norm": 0.004389876965433359, + "learning_rate": 1.7035083091532578e-05, + "loss": 0.0003, + "step": 8856 + }, + { + "epoch": 2.3366310513124917, + "grad_norm": 0.009068218991160393, + "learning_rate": 1.7033910724229904e-05, + "loss": 0.0002, + "step": 8858 + }, + { + "epoch": 2.337158686189157, + "grad_norm": 0.019032711163163185, + "learning_rate": 1.7032738356927227e-05, + "loss": 0.0005, + "step": 8860 + }, + { + "epoch": 2.3376863210658225, + "grad_norm": 0.023978449404239655, + "learning_rate": 1.703156598962455e-05, + "loss": 0.0004, + "step": 8862 + }, + { + "epoch": 2.338213955942488, + "grad_norm": 0.05627436563372612, + "learning_rate": 1.7030393622321875e-05, + "loss": 0.0007, + "step": 8864 + }, + { + "epoch": 2.3387415908191533, + "grad_norm": 0.023336036130785942, + "learning_rate": 1.7029221255019198e-05, + "loss": 0.0002, + "step": 8866 + }, + { + "epoch": 2.3392692256958183, + "grad_norm": 0.0021718943025916815, + "learning_rate": 1.7028048887716524e-05, + "loss": 0.0028, + "step": 8868 + }, + { + "epoch": 2.3397968605724837, + "grad_norm": 0.30372801423072815, + "learning_rate": 1.7026876520413847e-05, + "loss": 0.0104, + "step": 8870 + }, + { + "epoch": 2.340324495449149, + "grad_norm": 0.0038023022934794426, + "learning_rate": 1.7025704153111173e-05, + "loss": 0.0004, + "step": 8872 + }, + { + "epoch": 2.3408521303258145, + "grad_norm": 0.03494805097579956, + "learning_rate": 1.7024531785808495e-05, + "loss": 0.0003, + "step": 8874 + }, + { + "epoch": 2.34137976520248, + "grad_norm": 0.30475419759750366, + "learning_rate": 1.7023359418505818e-05, + "loss": 0.0048, + "step": 8876 + }, + { + "epoch": 2.3419074000791453, + "grad_norm": 0.004616297781467438, + "learning_rate": 1.7022187051203144e-05, + "loss": 0.0002, + "step": 8878 + }, + { + "epoch": 2.3424350349558107, + "grad_norm": 0.0030902225989848375, + "learning_rate": 1.7021014683900467e-05, + "loss": 0.0003, + "step": 8880 + }, + { + "epoch": 2.3429626698324757, + "grad_norm": 0.005047997925430536, + "learning_rate": 1.7019842316597793e-05, + "loss": 0.0006, + "step": 8882 + }, + { + "epoch": 2.343490304709141, + "grad_norm": 0.08900605142116547, + "learning_rate": 1.7018669949295115e-05, + "loss": 0.0006, + "step": 8884 + }, + { + "epoch": 2.3440179395858065, + "grad_norm": 0.005886254832148552, + "learning_rate": 1.7017497581992438e-05, + "loss": 0.0002, + "step": 8886 + }, + { + "epoch": 2.344545574462472, + "grad_norm": 0.015987582504749298, + "learning_rate": 1.7016325214689764e-05, + "loss": 0.0083, + "step": 8888 + }, + { + "epoch": 2.3450732093391373, + "grad_norm": 0.01659177429974079, + "learning_rate": 1.701515284738709e-05, + "loss": 0.0002, + "step": 8890 + }, + { + "epoch": 2.3456008442158027, + "grad_norm": 0.03383832424879074, + "learning_rate": 1.7013980480084412e-05, + "loss": 0.0033, + "step": 8892 + }, + { + "epoch": 2.346128479092468, + "grad_norm": 0.44073301553726196, + "learning_rate": 1.7012808112781735e-05, + "loss": 0.0008, + "step": 8894 + }, + { + "epoch": 2.3466561139691335, + "grad_norm": 0.1484086662530899, + "learning_rate": 1.701163574547906e-05, + "loss": 0.003, + "step": 8896 + }, + { + "epoch": 2.347183748845799, + "grad_norm": 0.2799950838088989, + "learning_rate": 1.7010463378176384e-05, + "loss": 0.0034, + "step": 8898 + }, + { + "epoch": 2.347711383722464, + "grad_norm": 0.004083006177097559, + "learning_rate": 1.7009291010873706e-05, + "loss": 0.0003, + "step": 8900 + }, + { + "epoch": 2.3482390185991293, + "grad_norm": 0.027032487094402313, + "learning_rate": 1.7008118643571032e-05, + "loss": 0.002, + "step": 8902 + }, + { + "epoch": 2.3487666534757947, + "grad_norm": 0.01755022443830967, + "learning_rate": 1.7006946276268358e-05, + "loss": 0.0003, + "step": 8904 + }, + { + "epoch": 2.34929428835246, + "grad_norm": 0.025943998247385025, + "learning_rate": 1.700577390896568e-05, + "loss": 0.0003, + "step": 8906 + }, + { + "epoch": 2.3498219232291255, + "grad_norm": 0.014207236468791962, + "learning_rate": 1.7004601541663003e-05, + "loss": 0.0002, + "step": 8908 + }, + { + "epoch": 2.350349558105791, + "grad_norm": 0.015059361234307289, + "learning_rate": 1.700342917436033e-05, + "loss": 0.0057, + "step": 8910 + }, + { + "epoch": 2.3508771929824563, + "grad_norm": 0.018209874629974365, + "learning_rate": 1.7002256807057652e-05, + "loss": 0.0004, + "step": 8912 + }, + { + "epoch": 2.3514048278591213, + "grad_norm": 0.0061115603893995285, + "learning_rate": 1.7001084439754975e-05, + "loss": 0.0002, + "step": 8914 + }, + { + "epoch": 2.3519324627357867, + "grad_norm": 0.020205168053507805, + "learning_rate": 1.69999120724523e-05, + "loss": 0.0004, + "step": 8916 + }, + { + "epoch": 2.352460097612452, + "grad_norm": 0.007048334926366806, + "learning_rate": 1.6998739705149627e-05, + "loss": 0.0002, + "step": 8918 + }, + { + "epoch": 2.3529877324891175, + "grad_norm": 0.18581195175647736, + "learning_rate": 1.699756733784695e-05, + "loss": 0.0008, + "step": 8920 + }, + { + "epoch": 2.353515367365783, + "grad_norm": 0.016938256099820137, + "learning_rate": 1.6996394970544272e-05, + "loss": 0.0003, + "step": 8922 + }, + { + "epoch": 2.3540430022424483, + "grad_norm": 0.05215206369757652, + "learning_rate": 1.6995222603241598e-05, + "loss": 0.001, + "step": 8924 + }, + { + "epoch": 2.3545706371191137, + "grad_norm": 0.14737114310264587, + "learning_rate": 1.699405023593892e-05, + "loss": 0.0021, + "step": 8926 + }, + { + "epoch": 2.3550982719957787, + "grad_norm": 0.008213197812438011, + "learning_rate": 1.6992877868636246e-05, + "loss": 0.0002, + "step": 8928 + }, + { + "epoch": 2.355625906872444, + "grad_norm": 0.004104366060346365, + "learning_rate": 1.699170550133357e-05, + "loss": 0.0005, + "step": 8930 + }, + { + "epoch": 2.3561535417491095, + "grad_norm": 0.21622860431671143, + "learning_rate": 1.699053313403089e-05, + "loss": 0.001, + "step": 8932 + }, + { + "epoch": 2.356681176625775, + "grad_norm": 0.28702548146247864, + "learning_rate": 1.6989360766728218e-05, + "loss": 0.001, + "step": 8934 + }, + { + "epoch": 2.3572088115024403, + "grad_norm": 0.07636792212724686, + "learning_rate": 1.698818839942554e-05, + "loss": 0.0003, + "step": 8936 + }, + { + "epoch": 2.3577364463791057, + "grad_norm": 0.0061761923134326935, + "learning_rate": 1.6987016032122866e-05, + "loss": 0.0002, + "step": 8938 + }, + { + "epoch": 2.358264081255771, + "grad_norm": 0.0051612574607133865, + "learning_rate": 1.698584366482019e-05, + "loss": 0.0002, + "step": 8940 + }, + { + "epoch": 2.3587917161324365, + "grad_norm": 0.01783754490315914, + "learning_rate": 1.6984671297517515e-05, + "loss": 0.0002, + "step": 8942 + }, + { + "epoch": 2.359319351009102, + "grad_norm": 0.008214151486754417, + "learning_rate": 1.6983498930214837e-05, + "loss": 0.0004, + "step": 8944 + }, + { + "epoch": 2.359846985885767, + "grad_norm": 0.050342585891485214, + "learning_rate": 1.698232656291216e-05, + "loss": 0.0017, + "step": 8946 + }, + { + "epoch": 2.3603746207624323, + "grad_norm": 0.003461831249296665, + "learning_rate": 1.6981154195609486e-05, + "loss": 0.0001, + "step": 8948 + }, + { + "epoch": 2.3609022556390977, + "grad_norm": 0.027303488925099373, + "learning_rate": 1.6979981828306812e-05, + "loss": 0.0003, + "step": 8950 + }, + { + "epoch": 2.361429890515763, + "grad_norm": 0.006023859139531851, + "learning_rate": 1.6978809461004135e-05, + "loss": 0.0001, + "step": 8952 + }, + { + "epoch": 2.3619575253924285, + "grad_norm": 0.021322283893823624, + "learning_rate": 1.6977637093701457e-05, + "loss": 0.0024, + "step": 8954 + }, + { + "epoch": 2.362485160269094, + "grad_norm": 0.009086718782782555, + "learning_rate": 1.6976464726398783e-05, + "loss": 0.0003, + "step": 8956 + }, + { + "epoch": 2.3630127951457593, + "grad_norm": 0.0018345831194892526, + "learning_rate": 1.6975292359096106e-05, + "loss": 0.0109, + "step": 8958 + }, + { + "epoch": 2.3635404300224243, + "grad_norm": 0.04745766147971153, + "learning_rate": 1.697411999179343e-05, + "loss": 0.0002, + "step": 8960 + }, + { + "epoch": 2.3640680648990897, + "grad_norm": 0.019619625061750412, + "learning_rate": 1.6972947624490754e-05, + "loss": 0.003, + "step": 8962 + }, + { + "epoch": 2.364595699775755, + "grad_norm": 0.0016931211575865746, + "learning_rate": 1.697177525718808e-05, + "loss": 0.0001, + "step": 8964 + }, + { + "epoch": 2.3651233346524205, + "grad_norm": 0.0017400970682501793, + "learning_rate": 1.69706028898854e-05, + "loss": 0.0001, + "step": 8966 + }, + { + "epoch": 2.365650969529086, + "grad_norm": 0.00999564491212368, + "learning_rate": 1.6969430522582726e-05, + "loss": 0.0001, + "step": 8968 + }, + { + "epoch": 2.3661786044057513, + "grad_norm": 0.041785236448049545, + "learning_rate": 1.6968258155280052e-05, + "loss": 0.0003, + "step": 8970 + }, + { + "epoch": 2.3667062392824167, + "grad_norm": 0.014018344692885876, + "learning_rate": 1.6967085787977374e-05, + "loss": 0.0002, + "step": 8972 + }, + { + "epoch": 2.3672338741590817, + "grad_norm": 0.00253113079816103, + "learning_rate": 1.6965913420674697e-05, + "loss": 0.0033, + "step": 8974 + }, + { + "epoch": 2.367761509035747, + "grad_norm": 0.010420892387628555, + "learning_rate": 1.6964741053372023e-05, + "loss": 0.0001, + "step": 8976 + }, + { + "epoch": 2.3682891439124125, + "grad_norm": 0.00231918110512197, + "learning_rate": 1.6963568686069346e-05, + "loss": 0.0002, + "step": 8978 + }, + { + "epoch": 2.368816778789078, + "grad_norm": 0.12598662078380585, + "learning_rate": 1.696239631876667e-05, + "loss": 0.0008, + "step": 8980 + }, + { + "epoch": 2.3693444136657433, + "grad_norm": 0.06743644922971725, + "learning_rate": 1.6961223951463994e-05, + "loss": 0.0004, + "step": 8982 + }, + { + "epoch": 2.3698720485424087, + "grad_norm": 0.5221057534217834, + "learning_rate": 1.696005158416132e-05, + "loss": 0.002, + "step": 8984 + }, + { + "epoch": 2.370399683419074, + "grad_norm": 0.008935595862567425, + "learning_rate": 1.6958879216858643e-05, + "loss": 0.0018, + "step": 8986 + }, + { + "epoch": 2.370927318295739, + "grad_norm": 0.0992436632514, + "learning_rate": 1.695770684955597e-05, + "loss": 0.0039, + "step": 8988 + }, + { + "epoch": 2.3714549531724045, + "grad_norm": 0.0024427459575235844, + "learning_rate": 1.695653448225329e-05, + "loss": 0.0001, + "step": 8990 + }, + { + "epoch": 2.37198258804907, + "grad_norm": 0.013090673834085464, + "learning_rate": 1.6955362114950614e-05, + "loss": 0.0002, + "step": 8992 + }, + { + "epoch": 2.3725102229257353, + "grad_norm": 0.1943458616733551, + "learning_rate": 1.695418974764794e-05, + "loss": 0.0029, + "step": 8994 + }, + { + "epoch": 2.3730378578024007, + "grad_norm": 0.20202000439167023, + "learning_rate": 1.6953017380345263e-05, + "loss": 0.0077, + "step": 8996 + }, + { + "epoch": 2.373565492679066, + "grad_norm": 0.35753920674324036, + "learning_rate": 1.695184501304259e-05, + "loss": 0.0004, + "step": 8998 + }, + { + "epoch": 2.3740931275557315, + "grad_norm": 0.1078399121761322, + "learning_rate": 1.695067264573991e-05, + "loss": 0.0072, + "step": 9000 + }, + { + "epoch": 2.374620762432397, + "grad_norm": 0.002445266582071781, + "learning_rate": 1.6949500278437237e-05, + "loss": 0.0008, + "step": 9002 + }, + { + "epoch": 2.3751483973090624, + "grad_norm": 0.050866883248090744, + "learning_rate": 1.694832791113456e-05, + "loss": 0.0003, + "step": 9004 + }, + { + "epoch": 2.3756760321857273, + "grad_norm": 0.3033812642097473, + "learning_rate": 1.6947155543831882e-05, + "loss": 0.0087, + "step": 9006 + }, + { + "epoch": 2.3762036670623927, + "grad_norm": 0.005489705130457878, + "learning_rate": 1.694598317652921e-05, + "loss": 0.0002, + "step": 9008 + }, + { + "epoch": 2.376731301939058, + "grad_norm": 0.03051944449543953, + "learning_rate": 1.6944810809226534e-05, + "loss": 0.0003, + "step": 9010 + }, + { + "epoch": 2.3772589368157235, + "grad_norm": 0.009996149688959122, + "learning_rate": 1.6943638441923857e-05, + "loss": 0.0004, + "step": 9012 + }, + { + "epoch": 2.377786571692389, + "grad_norm": 0.005307971499860287, + "learning_rate": 1.694246607462118e-05, + "loss": 0.0002, + "step": 9014 + }, + { + "epoch": 2.3783142065690543, + "grad_norm": 0.030547233298420906, + "learning_rate": 1.6941293707318506e-05, + "loss": 0.0012, + "step": 9016 + }, + { + "epoch": 2.3788418414457198, + "grad_norm": 0.00681493803858757, + "learning_rate": 1.6940121340015828e-05, + "loss": 0.002, + "step": 9018 + }, + { + "epoch": 2.3793694763223847, + "grad_norm": 0.008427380584180355, + "learning_rate": 1.693894897271315e-05, + "loss": 0.0003, + "step": 9020 + }, + { + "epoch": 2.37989711119905, + "grad_norm": 0.008326015435159206, + "learning_rate": 1.6937776605410477e-05, + "loss": 0.0072, + "step": 9022 + }, + { + "epoch": 2.3804247460757155, + "grad_norm": 0.04544336721301079, + "learning_rate": 1.69366042381078e-05, + "loss": 0.0009, + "step": 9024 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 0.025767285376787186, + "learning_rate": 1.6935431870805125e-05, + "loss": 0.0011, + "step": 9026 + }, + { + "epoch": 2.3814800158290463, + "grad_norm": 0.014800574630498886, + "learning_rate": 1.6934259503502448e-05, + "loss": 0.0004, + "step": 9028 + }, + { + "epoch": 2.3820076507057117, + "grad_norm": 0.016011182218790054, + "learning_rate": 1.6933087136199774e-05, + "loss": 0.0052, + "step": 9030 + }, + { + "epoch": 2.382535285582377, + "grad_norm": 0.039803650230169296, + "learning_rate": 1.6931914768897097e-05, + "loss": 0.0004, + "step": 9032 + }, + { + "epoch": 2.383062920459042, + "grad_norm": 0.010276750661432743, + "learning_rate": 1.693074240159442e-05, + "loss": 0.0004, + "step": 9034 + }, + { + "epoch": 2.3835905553357075, + "grad_norm": 0.38473019003868103, + "learning_rate": 1.6929570034291745e-05, + "loss": 0.001, + "step": 9036 + }, + { + "epoch": 2.384118190212373, + "grad_norm": 0.004558812361210585, + "learning_rate": 1.6928397666989068e-05, + "loss": 0.0002, + "step": 9038 + }, + { + "epoch": 2.3846458250890383, + "grad_norm": 0.002488242695108056, + "learning_rate": 1.6927225299686394e-05, + "loss": 0.0001, + "step": 9040 + }, + { + "epoch": 2.3851734599657037, + "grad_norm": 0.07008720934391022, + "learning_rate": 1.6926052932383716e-05, + "loss": 0.0014, + "step": 9042 + }, + { + "epoch": 2.385701094842369, + "grad_norm": 0.018190467730164528, + "learning_rate": 1.6924880565081042e-05, + "loss": 0.0003, + "step": 9044 + }, + { + "epoch": 2.3862287297190345, + "grad_norm": 0.003838401986286044, + "learning_rate": 1.6923708197778365e-05, + "loss": 0.0011, + "step": 9046 + }, + { + "epoch": 2.3867563645957, + "grad_norm": 0.02004591003060341, + "learning_rate": 1.692253583047569e-05, + "loss": 0.0003, + "step": 9048 + }, + { + "epoch": 2.3872839994723654, + "grad_norm": 0.0034093286376446486, + "learning_rate": 1.6921363463173014e-05, + "loss": 0.0002, + "step": 9050 + }, + { + "epoch": 2.3878116343490303, + "grad_norm": 0.14892537891864777, + "learning_rate": 1.6920191095870336e-05, + "loss": 0.0023, + "step": 9052 + }, + { + "epoch": 2.3883392692256957, + "grad_norm": 0.016084741801023483, + "learning_rate": 1.6919018728567662e-05, + "loss": 0.0002, + "step": 9054 + }, + { + "epoch": 2.388866904102361, + "grad_norm": 0.010993991047143936, + "learning_rate": 1.6917846361264988e-05, + "loss": 0.0002, + "step": 9056 + }, + { + "epoch": 2.3893945389790265, + "grad_norm": 0.0035159969702363014, + "learning_rate": 1.691667399396231e-05, + "loss": 0.0001, + "step": 9058 + }, + { + "epoch": 2.389922173855692, + "grad_norm": 0.037432920187711716, + "learning_rate": 1.6915501626659634e-05, + "loss": 0.0003, + "step": 9060 + }, + { + "epoch": 2.3904498087323574, + "grad_norm": 0.008157477714121342, + "learning_rate": 1.691432925935696e-05, + "loss": 0.0028, + "step": 9062 + }, + { + "epoch": 2.3909774436090228, + "grad_norm": 0.014338602311909199, + "learning_rate": 1.6913156892054282e-05, + "loss": 0.0013, + "step": 9064 + }, + { + "epoch": 2.3915050784856877, + "grad_norm": 0.0018875645473599434, + "learning_rate": 1.6911984524751605e-05, + "loss": 0.0003, + "step": 9066 + }, + { + "epoch": 2.392032713362353, + "grad_norm": 0.3544996976852417, + "learning_rate": 1.691081215744893e-05, + "loss": 0.0017, + "step": 9068 + }, + { + "epoch": 2.3925603482390185, + "grad_norm": 0.07048368453979492, + "learning_rate": 1.6909639790146253e-05, + "loss": 0.0022, + "step": 9070 + }, + { + "epoch": 2.393087983115684, + "grad_norm": 0.07621606439352036, + "learning_rate": 1.6908467422843576e-05, + "loss": 0.0048, + "step": 9072 + }, + { + "epoch": 2.3936156179923493, + "grad_norm": 0.0014117989921942353, + "learning_rate": 1.6907295055540902e-05, + "loss": 0.0001, + "step": 9074 + }, + { + "epoch": 2.3941432528690147, + "grad_norm": 0.0014381016371771693, + "learning_rate": 1.6906122688238228e-05, + "loss": 0.0001, + "step": 9076 + }, + { + "epoch": 2.39467088774568, + "grad_norm": 0.009844469837844372, + "learning_rate": 1.690495032093555e-05, + "loss": 0.0001, + "step": 9078 + }, + { + "epoch": 2.395198522622345, + "grad_norm": 0.003543374128639698, + "learning_rate": 1.6903777953632873e-05, + "loss": 0.0001, + "step": 9080 + }, + { + "epoch": 2.3957261574990105, + "grad_norm": 0.0016742433654144406, + "learning_rate": 1.69026055863302e-05, + "loss": 0.0001, + "step": 9082 + }, + { + "epoch": 2.396253792375676, + "grad_norm": 0.0027444157749414444, + "learning_rate": 1.6901433219027522e-05, + "loss": 0.0002, + "step": 9084 + }, + { + "epoch": 2.3967814272523413, + "grad_norm": 0.6376184821128845, + "learning_rate": 1.6900260851724848e-05, + "loss": 0.0007, + "step": 9086 + }, + { + "epoch": 2.3973090621290067, + "grad_norm": 0.07292628288269043, + "learning_rate": 1.689908848442217e-05, + "loss": 0.0005, + "step": 9088 + }, + { + "epoch": 2.397836697005672, + "grad_norm": 0.0014919193927198648, + "learning_rate": 1.6897916117119496e-05, + "loss": 0.0001, + "step": 9090 + }, + { + "epoch": 2.3983643318823376, + "grad_norm": 0.12422525882720947, + "learning_rate": 1.689674374981682e-05, + "loss": 0.0007, + "step": 9092 + }, + { + "epoch": 2.398891966759003, + "grad_norm": 1.1027252674102783, + "learning_rate": 1.689557138251414e-05, + "loss": 0.0019, + "step": 9094 + }, + { + "epoch": 2.3994196016356684, + "grad_norm": 0.3027414381504059, + "learning_rate": 1.6894399015211468e-05, + "loss": 0.0027, + "step": 9096 + }, + { + "epoch": 2.3999472365123333, + "grad_norm": 0.006389942951500416, + "learning_rate": 1.689322664790879e-05, + "loss": 0.0017, + "step": 9098 + }, + { + "epoch": 2.4004748713889987, + "grad_norm": 0.0641697570681572, + "learning_rate": 1.6892054280606116e-05, + "loss": 0.0004, + "step": 9100 + }, + { + "epoch": 2.401002506265664, + "grad_norm": 0.0014114094665274024, + "learning_rate": 1.689088191330344e-05, + "loss": 0.0002, + "step": 9102 + }, + { + "epoch": 2.4015301411423295, + "grad_norm": 0.021117953583598137, + "learning_rate": 1.6889709546000765e-05, + "loss": 0.0003, + "step": 9104 + }, + { + "epoch": 2.402057776018995, + "grad_norm": 0.2830766439437866, + "learning_rate": 1.6888537178698087e-05, + "loss": 0.005, + "step": 9106 + }, + { + "epoch": 2.4025854108956604, + "grad_norm": 0.010317099280655384, + "learning_rate": 1.6887364811395413e-05, + "loss": 0.0002, + "step": 9108 + }, + { + "epoch": 2.4031130457723258, + "grad_norm": 0.02758117765188217, + "learning_rate": 1.6886192444092736e-05, + "loss": 0.0002, + "step": 9110 + }, + { + "epoch": 2.4036406806489907, + "grad_norm": 0.02360902726650238, + "learning_rate": 1.688502007679006e-05, + "loss": 0.0008, + "step": 9112 + }, + { + "epoch": 2.404168315525656, + "grad_norm": 0.14790229499340057, + "learning_rate": 1.6883847709487385e-05, + "loss": 0.0004, + "step": 9114 + }, + { + "epoch": 2.4046959504023215, + "grad_norm": 0.01613958179950714, + "learning_rate": 1.6882675342184707e-05, + "loss": 0.0002, + "step": 9116 + }, + { + "epoch": 2.405223585278987, + "grad_norm": 0.0075847995467484, + "learning_rate": 1.688150297488203e-05, + "loss": 0.0004, + "step": 9118 + }, + { + "epoch": 2.4057512201556523, + "grad_norm": 0.004681904800236225, + "learning_rate": 1.6880330607579356e-05, + "loss": 0.0002, + "step": 9120 + }, + { + "epoch": 2.4062788550323178, + "grad_norm": 0.02857520990073681, + "learning_rate": 1.6879158240276682e-05, + "loss": 0.0023, + "step": 9122 + }, + { + "epoch": 2.406806489908983, + "grad_norm": 0.014689316973090172, + "learning_rate": 1.6877985872974004e-05, + "loss": 0.0002, + "step": 9124 + }, + { + "epoch": 2.407334124785648, + "grad_norm": 0.0024262226652354, + "learning_rate": 1.6876813505671327e-05, + "loss": 0.0003, + "step": 9126 + }, + { + "epoch": 2.4078617596623135, + "grad_norm": 0.028655417263507843, + "learning_rate": 1.6875641138368653e-05, + "loss": 0.0032, + "step": 9128 + }, + { + "epoch": 2.408389394538979, + "grad_norm": 0.02096669003367424, + "learning_rate": 1.6874468771065976e-05, + "loss": 0.0007, + "step": 9130 + }, + { + "epoch": 2.4089170294156443, + "grad_norm": 0.03783147409558296, + "learning_rate": 1.6873296403763298e-05, + "loss": 0.0032, + "step": 9132 + }, + { + "epoch": 2.4094446642923097, + "grad_norm": 0.08230985701084137, + "learning_rate": 1.6872124036460624e-05, + "loss": 0.0009, + "step": 9134 + }, + { + "epoch": 2.409972299168975, + "grad_norm": 0.05844524875283241, + "learning_rate": 1.687095166915795e-05, + "loss": 0.0005, + "step": 9136 + }, + { + "epoch": 2.4104999340456406, + "grad_norm": 0.024634091183543205, + "learning_rate": 1.6869779301855273e-05, + "loss": 0.0068, + "step": 9138 + }, + { + "epoch": 2.4110275689223055, + "grad_norm": 0.10886643081903458, + "learning_rate": 1.6868606934552596e-05, + "loss": 0.0005, + "step": 9140 + }, + { + "epoch": 2.411555203798971, + "grad_norm": 0.006662083324044943, + "learning_rate": 1.686743456724992e-05, + "loss": 0.0001, + "step": 9142 + }, + { + "epoch": 2.4120828386756363, + "grad_norm": 0.0033776958007365465, + "learning_rate": 1.6866262199947244e-05, + "loss": 0.0005, + "step": 9144 + }, + { + "epoch": 2.4126104735523017, + "grad_norm": 0.1218060851097107, + "learning_rate": 1.686508983264457e-05, + "loss": 0.0013, + "step": 9146 + }, + { + "epoch": 2.413138108428967, + "grad_norm": 0.055462609976530075, + "learning_rate": 1.6863917465341893e-05, + "loss": 0.0007, + "step": 9148 + }, + { + "epoch": 2.4136657433056325, + "grad_norm": 0.045771900564432144, + "learning_rate": 1.686274509803922e-05, + "loss": 0.0005, + "step": 9150 + }, + { + "epoch": 2.414193378182298, + "grad_norm": 0.016693580895662308, + "learning_rate": 1.686157273073654e-05, + "loss": 0.0016, + "step": 9152 + }, + { + "epoch": 2.4147210130589634, + "grad_norm": 0.22334334254264832, + "learning_rate": 1.6860400363433864e-05, + "loss": 0.0031, + "step": 9154 + }, + { + "epoch": 2.4152486479356288, + "grad_norm": 0.002539668930694461, + "learning_rate": 1.685922799613119e-05, + "loss": 0.0003, + "step": 9156 + }, + { + "epoch": 2.4157762828122937, + "grad_norm": 0.023463338613510132, + "learning_rate": 1.6858055628828513e-05, + "loss": 0.0022, + "step": 9158 + }, + { + "epoch": 2.416303917688959, + "grad_norm": 0.005270007066428661, + "learning_rate": 1.685688326152584e-05, + "loss": 0.0003, + "step": 9160 + }, + { + "epoch": 2.4168315525656245, + "grad_norm": 0.029114514589309692, + "learning_rate": 1.685571089422316e-05, + "loss": 0.0004, + "step": 9162 + }, + { + "epoch": 2.41735918744229, + "grad_norm": 0.6854694485664368, + "learning_rate": 1.6854538526920484e-05, + "loss": 0.0109, + "step": 9164 + }, + { + "epoch": 2.4178868223189554, + "grad_norm": 0.00858831126242876, + "learning_rate": 1.685336615961781e-05, + "loss": 0.0027, + "step": 9166 + }, + { + "epoch": 2.4184144571956208, + "grad_norm": 0.04779059439897537, + "learning_rate": 1.6852193792315136e-05, + "loss": 0.0003, + "step": 9168 + }, + { + "epoch": 2.418942092072286, + "grad_norm": 1.208021640777588, + "learning_rate": 1.685102142501246e-05, + "loss": 0.0102, + "step": 9170 + }, + { + "epoch": 2.419469726948951, + "grad_norm": 0.8460263013839722, + "learning_rate": 1.684984905770978e-05, + "loss": 0.0058, + "step": 9172 + }, + { + "epoch": 2.4199973618256165, + "grad_norm": 0.019398147240281105, + "learning_rate": 1.6848676690407107e-05, + "loss": 0.0004, + "step": 9174 + }, + { + "epoch": 2.420524996702282, + "grad_norm": 0.40069329738616943, + "learning_rate": 1.684750432310443e-05, + "loss": 0.0026, + "step": 9176 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.9921723008155823, + "learning_rate": 1.6846331955801752e-05, + "loss": 0.0076, + "step": 9178 + }, + { + "epoch": 2.4215802664556128, + "grad_norm": 0.05548262223601341, + "learning_rate": 1.6845159588499078e-05, + "loss": 0.0084, + "step": 9180 + }, + { + "epoch": 2.422107901332278, + "grad_norm": 0.18111373484134674, + "learning_rate": 1.6843987221196404e-05, + "loss": 0.0089, + "step": 9182 + }, + { + "epoch": 2.4226355362089436, + "grad_norm": 0.14118556678295135, + "learning_rate": 1.6842814853893727e-05, + "loss": 0.0022, + "step": 9184 + }, + { + "epoch": 2.4231631710856085, + "grad_norm": 0.07000702619552612, + "learning_rate": 1.684164248659105e-05, + "loss": 0.0006, + "step": 9186 + }, + { + "epoch": 2.423690805962274, + "grad_norm": 0.011507155373692513, + "learning_rate": 1.6840470119288375e-05, + "loss": 0.0002, + "step": 9188 + }, + { + "epoch": 2.4242184408389393, + "grad_norm": 0.2206389605998993, + "learning_rate": 1.6839297751985698e-05, + "loss": 0.0093, + "step": 9190 + }, + { + "epoch": 2.4247460757156047, + "grad_norm": 0.44201621413230896, + "learning_rate": 1.683812538468302e-05, + "loss": 0.003, + "step": 9192 + }, + { + "epoch": 2.42527371059227, + "grad_norm": 0.00819023884832859, + "learning_rate": 1.6836953017380347e-05, + "loss": 0.0048, + "step": 9194 + }, + { + "epoch": 2.4258013454689356, + "grad_norm": 0.15250594913959503, + "learning_rate": 1.6835780650077673e-05, + "loss": 0.001, + "step": 9196 + }, + { + "epoch": 2.426328980345601, + "grad_norm": 0.5134063363075256, + "learning_rate": 1.6834608282774995e-05, + "loss": 0.0071, + "step": 9198 + }, + { + "epoch": 2.4268566152222664, + "grad_norm": 0.03875105828046799, + "learning_rate": 1.6833435915472318e-05, + "loss": 0.0005, + "step": 9200 + }, + { + "epoch": 2.4273842500989318, + "grad_norm": 0.10165303945541382, + "learning_rate": 1.6832263548169644e-05, + "loss": 0.0007, + "step": 9202 + }, + { + "epoch": 2.4279118849755967, + "grad_norm": 0.22536171972751617, + "learning_rate": 1.6831091180866966e-05, + "loss": 0.0032, + "step": 9204 + }, + { + "epoch": 2.428439519852262, + "grad_norm": 0.020537465810775757, + "learning_rate": 1.6829918813564292e-05, + "loss": 0.0058, + "step": 9206 + }, + { + "epoch": 2.4289671547289275, + "grad_norm": 0.021596545353531837, + "learning_rate": 1.6828746446261615e-05, + "loss": 0.0004, + "step": 9208 + }, + { + "epoch": 2.429494789605593, + "grad_norm": 0.141325905919075, + "learning_rate": 1.6827574078958938e-05, + "loss": 0.0014, + "step": 9210 + }, + { + "epoch": 2.4300224244822584, + "grad_norm": 0.008369579911231995, + "learning_rate": 1.6826401711656264e-05, + "loss": 0.0001, + "step": 9212 + }, + { + "epoch": 2.4305500593589238, + "grad_norm": 0.007947342470288277, + "learning_rate": 1.6825229344353586e-05, + "loss": 0.0002, + "step": 9214 + }, + { + "epoch": 2.431077694235589, + "grad_norm": 0.16778509318828583, + "learning_rate": 1.6824056977050912e-05, + "loss": 0.0005, + "step": 9216 + }, + { + "epoch": 2.431605329112254, + "grad_norm": 0.004605477210134268, + "learning_rate": 1.6822884609748235e-05, + "loss": 0.0001, + "step": 9218 + }, + { + "epoch": 2.4321329639889195, + "grad_norm": 0.006066730245947838, + "learning_rate": 1.682171224244556e-05, + "loss": 0.0002, + "step": 9220 + }, + { + "epoch": 2.432660598865585, + "grad_norm": 0.6318947076797485, + "learning_rate": 1.6820539875142884e-05, + "loss": 0.014, + "step": 9222 + }, + { + "epoch": 2.4331882337422504, + "grad_norm": 0.003302713157609105, + "learning_rate": 1.6819367507840206e-05, + "loss": 0.0001, + "step": 9224 + }, + { + "epoch": 2.4337158686189158, + "grad_norm": 0.010634688660502434, + "learning_rate": 1.6818195140537532e-05, + "loss": 0.0002, + "step": 9226 + }, + { + "epoch": 2.434243503495581, + "grad_norm": 0.09502138942480087, + "learning_rate": 1.6817022773234858e-05, + "loss": 0.0019, + "step": 9228 + }, + { + "epoch": 2.4347711383722466, + "grad_norm": 0.05818304419517517, + "learning_rate": 1.681585040593218e-05, + "loss": 0.0004, + "step": 9230 + }, + { + "epoch": 2.4352987732489115, + "grad_norm": 0.01341144647449255, + "learning_rate": 1.6814678038629503e-05, + "loss": 0.0002, + "step": 9232 + }, + { + "epoch": 2.435826408125577, + "grad_norm": 0.008065571077167988, + "learning_rate": 1.681350567132683e-05, + "loss": 0.0002, + "step": 9234 + }, + { + "epoch": 2.4363540430022423, + "grad_norm": 0.38571593165397644, + "learning_rate": 1.6812333304024152e-05, + "loss": 0.0015, + "step": 9236 + }, + { + "epoch": 2.4368816778789077, + "grad_norm": 0.6666185855865479, + "learning_rate": 1.6811160936721475e-05, + "loss": 0.0057, + "step": 9238 + }, + { + "epoch": 2.437409312755573, + "grad_norm": 0.04863313212990761, + "learning_rate": 1.68099885694188e-05, + "loss": 0.004, + "step": 9240 + }, + { + "epoch": 2.4379369476322386, + "grad_norm": 0.007487938739359379, + "learning_rate": 1.6808816202116127e-05, + "loss": 0.0005, + "step": 9242 + }, + { + "epoch": 2.438464582508904, + "grad_norm": 0.34428083896636963, + "learning_rate": 1.6807643834813446e-05, + "loss": 0.0036, + "step": 9244 + }, + { + "epoch": 2.4389922173855694, + "grad_norm": 0.1395961195230484, + "learning_rate": 1.6806471467510772e-05, + "loss": 0.0017, + "step": 9246 + }, + { + "epoch": 2.439519852262235, + "grad_norm": 0.004308372735977173, + "learning_rate": 1.6805299100208098e-05, + "loss": 0.0001, + "step": 9248 + }, + { + "epoch": 2.4400474871388997, + "grad_norm": 0.007883957587182522, + "learning_rate": 1.680412673290542e-05, + "loss": 0.0002, + "step": 9250 + }, + { + "epoch": 2.440575122015565, + "grad_norm": 0.013309923931956291, + "learning_rate": 1.6802954365602743e-05, + "loss": 0.0022, + "step": 9252 + }, + { + "epoch": 2.4411027568922306, + "grad_norm": 0.3653549551963806, + "learning_rate": 1.680178199830007e-05, + "loss": 0.0015, + "step": 9254 + }, + { + "epoch": 2.441630391768896, + "grad_norm": 0.003017079783603549, + "learning_rate": 1.680060963099739e-05, + "loss": 0.0002, + "step": 9256 + }, + { + "epoch": 2.4421580266455614, + "grad_norm": 0.0099405562505126, + "learning_rate": 1.6799437263694718e-05, + "loss": 0.0002, + "step": 9258 + }, + { + "epoch": 2.4426856615222268, + "grad_norm": 0.20215536653995514, + "learning_rate": 1.679826489639204e-05, + "loss": 0.0009, + "step": 9260 + }, + { + "epoch": 2.443213296398892, + "grad_norm": 0.008201219141483307, + "learning_rate": 1.6797092529089366e-05, + "loss": 0.0002, + "step": 9262 + }, + { + "epoch": 2.443740931275557, + "grad_norm": 0.08198649436235428, + "learning_rate": 1.679592016178669e-05, + "loss": 0.0045, + "step": 9264 + }, + { + "epoch": 2.4442685661522225, + "grad_norm": 0.0026970868930220604, + "learning_rate": 1.6794747794484015e-05, + "loss": 0.0003, + "step": 9266 + }, + { + "epoch": 2.444796201028888, + "grad_norm": 0.0072012245655059814, + "learning_rate": 1.6793575427181337e-05, + "loss": 0.0002, + "step": 9268 + }, + { + "epoch": 2.4453238359055534, + "grad_norm": 0.3436579704284668, + "learning_rate": 1.679240305987866e-05, + "loss": 0.0047, + "step": 9270 + }, + { + "epoch": 2.4458514707822188, + "grad_norm": 0.3960976004600525, + "learning_rate": 1.6791230692575986e-05, + "loss": 0.0025, + "step": 9272 + }, + { + "epoch": 2.446379105658884, + "grad_norm": 0.018092375248670578, + "learning_rate": 1.679005832527331e-05, + "loss": 0.0022, + "step": 9274 + }, + { + "epoch": 2.4469067405355496, + "grad_norm": 0.006752198562026024, + "learning_rate": 1.6788885957970635e-05, + "loss": 0.0001, + "step": 9276 + }, + { + "epoch": 2.4474343754122145, + "grad_norm": 0.004366693086922169, + "learning_rate": 1.6787713590667957e-05, + "loss": 0.0021, + "step": 9278 + }, + { + "epoch": 2.44796201028888, + "grad_norm": 0.02020297758281231, + "learning_rate": 1.6786541223365283e-05, + "loss": 0.0068, + "step": 9280 + }, + { + "epoch": 2.4484896451655453, + "grad_norm": 0.02791844867169857, + "learning_rate": 1.6785368856062606e-05, + "loss": 0.0002, + "step": 9282 + }, + { + "epoch": 2.4490172800422108, + "grad_norm": 0.011341969482600689, + "learning_rate": 1.678419648875993e-05, + "loss": 0.0003, + "step": 9284 + }, + { + "epoch": 2.449544914918876, + "grad_norm": 0.4226863980293274, + "learning_rate": 1.6783024121457254e-05, + "loss": 0.0029, + "step": 9286 + }, + { + "epoch": 2.4500725497955416, + "grad_norm": 0.9957269430160522, + "learning_rate": 1.678185175415458e-05, + "loss": 0.0075, + "step": 9288 + }, + { + "epoch": 2.450600184672207, + "grad_norm": 0.1621064394712448, + "learning_rate": 1.67806793868519e-05, + "loss": 0.0006, + "step": 9290 + }, + { + "epoch": 2.451127819548872, + "grad_norm": 0.005461583845317364, + "learning_rate": 1.6779507019549226e-05, + "loss": 0.0002, + "step": 9292 + }, + { + "epoch": 2.4516554544255373, + "grad_norm": 1.136325716972351, + "learning_rate": 1.677833465224655e-05, + "loss": 0.0085, + "step": 9294 + }, + { + "epoch": 2.4521830893022027, + "grad_norm": 0.003695097519084811, + "learning_rate": 1.6777162284943874e-05, + "loss": 0.0043, + "step": 9296 + }, + { + "epoch": 2.452710724178868, + "grad_norm": 0.005649776663631201, + "learning_rate": 1.6775989917641197e-05, + "loss": 0.0006, + "step": 9298 + }, + { + "epoch": 2.4532383590555336, + "grad_norm": 0.3886825442314148, + "learning_rate": 1.6774817550338523e-05, + "loss": 0.0009, + "step": 9300 + }, + { + "epoch": 2.453765993932199, + "grad_norm": 0.026244360953569412, + "learning_rate": 1.6773645183035845e-05, + "loss": 0.0004, + "step": 9302 + }, + { + "epoch": 2.4542936288088644, + "grad_norm": 0.19980710744857788, + "learning_rate": 1.6772472815733168e-05, + "loss": 0.0006, + "step": 9304 + }, + { + "epoch": 2.4548212636855298, + "grad_norm": 0.026571646332740784, + "learning_rate": 1.6771300448430494e-05, + "loss": 0.008, + "step": 9306 + }, + { + "epoch": 2.455348898562195, + "grad_norm": 0.011387823149561882, + "learning_rate": 1.677012808112782e-05, + "loss": 0.0047, + "step": 9308 + }, + { + "epoch": 2.45587653343886, + "grad_norm": 0.052641719579696655, + "learning_rate": 1.6768955713825143e-05, + "loss": 0.0038, + "step": 9310 + }, + { + "epoch": 2.4564041683155255, + "grad_norm": 0.019698671996593475, + "learning_rate": 1.6767783346522465e-05, + "loss": 0.0012, + "step": 9312 + }, + { + "epoch": 2.456931803192191, + "grad_norm": 0.26046809554100037, + "learning_rate": 1.676661097921979e-05, + "loss": 0.0063, + "step": 9314 + }, + { + "epoch": 2.4574594380688564, + "grad_norm": 0.03079398348927498, + "learning_rate": 1.6765438611917114e-05, + "loss": 0.0046, + "step": 9316 + }, + { + "epoch": 2.4579870729455218, + "grad_norm": 0.06162635236978531, + "learning_rate": 1.676426624461444e-05, + "loss": 0.0008, + "step": 9318 + }, + { + "epoch": 2.458514707822187, + "grad_norm": 0.025299768894910812, + "learning_rate": 1.6763093877311763e-05, + "loss": 0.0009, + "step": 9320 + }, + { + "epoch": 2.4590423426988526, + "grad_norm": 0.12470518052577972, + "learning_rate": 1.676192151000909e-05, + "loss": 0.001, + "step": 9322 + }, + { + "epoch": 2.4595699775755175, + "grad_norm": 0.4547266364097595, + "learning_rate": 1.676074914270641e-05, + "loss": 0.0046, + "step": 9324 + }, + { + "epoch": 2.460097612452183, + "grad_norm": 0.058585282415151596, + "learning_rate": 1.6759576775403737e-05, + "loss": 0.0007, + "step": 9326 + }, + { + "epoch": 2.4606252473288484, + "grad_norm": 0.03333139792084694, + "learning_rate": 1.675840440810106e-05, + "loss": 0.0006, + "step": 9328 + }, + { + "epoch": 2.4611528822055138, + "grad_norm": 0.2916986644268036, + "learning_rate": 1.6757232040798382e-05, + "loss": 0.0117, + "step": 9330 + }, + { + "epoch": 2.461680517082179, + "grad_norm": 0.004363080952316523, + "learning_rate": 1.675605967349571e-05, + "loss": 0.0053, + "step": 9332 + }, + { + "epoch": 2.4622081519588446, + "grad_norm": 0.4187963902950287, + "learning_rate": 1.675488730619303e-05, + "loss": 0.0107, + "step": 9334 + }, + { + "epoch": 2.46273578683551, + "grad_norm": 0.0018102851463481784, + "learning_rate": 1.6753714938890357e-05, + "loss": 0.003, + "step": 9336 + }, + { + "epoch": 2.463263421712175, + "grad_norm": 0.002815830521285534, + "learning_rate": 1.675254257158768e-05, + "loss": 0.0002, + "step": 9338 + }, + { + "epoch": 2.4637910565888403, + "grad_norm": 0.0646885484457016, + "learning_rate": 1.6751370204285006e-05, + "loss": 0.0075, + "step": 9340 + }, + { + "epoch": 2.4643186914655058, + "grad_norm": 0.33836156129837036, + "learning_rate": 1.6750197836982328e-05, + "loss": 0.0047, + "step": 9342 + }, + { + "epoch": 2.464846326342171, + "grad_norm": 0.1042192131280899, + "learning_rate": 1.674902546967965e-05, + "loss": 0.0009, + "step": 9344 + }, + { + "epoch": 2.4653739612188366, + "grad_norm": 0.010540610179305077, + "learning_rate": 1.6747853102376977e-05, + "loss": 0.0025, + "step": 9346 + }, + { + "epoch": 2.465901596095502, + "grad_norm": 0.018727745860815048, + "learning_rate": 1.67466807350743e-05, + "loss": 0.0002, + "step": 9348 + }, + { + "epoch": 2.4664292309721674, + "grad_norm": 0.1406499296426773, + "learning_rate": 1.6745508367771622e-05, + "loss": 0.0015, + "step": 9350 + }, + { + "epoch": 2.466956865848833, + "grad_norm": 0.03258772939443588, + "learning_rate": 1.6744336000468948e-05, + "loss": 0.0072, + "step": 9352 + }, + { + "epoch": 2.467484500725498, + "grad_norm": 0.007048359606415033, + "learning_rate": 1.6743163633166274e-05, + "loss": 0.0002, + "step": 9354 + }, + { + "epoch": 2.468012135602163, + "grad_norm": 0.003047083504498005, + "learning_rate": 1.6741991265863597e-05, + "loss": 0.0033, + "step": 9356 + }, + { + "epoch": 2.4685397704788286, + "grad_norm": 0.03468075022101402, + "learning_rate": 1.674081889856092e-05, + "loss": 0.0006, + "step": 9358 + }, + { + "epoch": 2.469067405355494, + "grad_norm": 0.030984779819846153, + "learning_rate": 1.6739646531258245e-05, + "loss": 0.0003, + "step": 9360 + }, + { + "epoch": 2.4695950402321594, + "grad_norm": 0.026463979855179787, + "learning_rate": 1.6738474163955568e-05, + "loss": 0.0003, + "step": 9362 + }, + { + "epoch": 2.4701226751088248, + "grad_norm": 0.0018108999356627464, + "learning_rate": 1.673730179665289e-05, + "loss": 0.0002, + "step": 9364 + }, + { + "epoch": 2.47065030998549, + "grad_norm": 0.0051316674798727036, + "learning_rate": 1.6736129429350216e-05, + "loss": 0.0037, + "step": 9366 + }, + { + "epoch": 2.4711779448621556, + "grad_norm": 0.07227342575788498, + "learning_rate": 1.6734957062047542e-05, + "loss": 0.0006, + "step": 9368 + }, + { + "epoch": 2.4717055797388205, + "grad_norm": 0.2776590585708618, + "learning_rate": 1.6733784694744865e-05, + "loss": 0.0065, + "step": 9370 + }, + { + "epoch": 2.472233214615486, + "grad_norm": 0.008138594217598438, + "learning_rate": 1.6732612327442188e-05, + "loss": 0.0014, + "step": 9372 + }, + { + "epoch": 2.4727608494921514, + "grad_norm": 0.1591642051935196, + "learning_rate": 1.6731439960139514e-05, + "loss": 0.0021, + "step": 9374 + }, + { + "epoch": 2.4732884843688168, + "grad_norm": 0.0064796702936291695, + "learning_rate": 1.6730267592836836e-05, + "loss": 0.0001, + "step": 9376 + }, + { + "epoch": 2.473816119245482, + "grad_norm": 0.02427678368985653, + "learning_rate": 1.6729095225534162e-05, + "loss": 0.0002, + "step": 9378 + }, + { + "epoch": 2.4743437541221476, + "grad_norm": 0.004847113508731127, + "learning_rate": 1.6727922858231485e-05, + "loss": 0.0002, + "step": 9380 + }, + { + "epoch": 2.474871388998813, + "grad_norm": 0.0010172673501074314, + "learning_rate": 1.672675049092881e-05, + "loss": 0.0001, + "step": 9382 + }, + { + "epoch": 2.475399023875478, + "grad_norm": 0.003989586606621742, + "learning_rate": 1.6725578123626133e-05, + "loss": 0.0001, + "step": 9384 + }, + { + "epoch": 2.4759266587521434, + "grad_norm": 0.18078838288784027, + "learning_rate": 1.672440575632346e-05, + "loss": 0.0047, + "step": 9386 + }, + { + "epoch": 2.4764542936288088, + "grad_norm": 0.0065165129490196705, + "learning_rate": 1.6723233389020782e-05, + "loss": 0.0002, + "step": 9388 + }, + { + "epoch": 2.476981928505474, + "grad_norm": 0.02254066802561283, + "learning_rate": 1.6722061021718105e-05, + "loss": 0.0002, + "step": 9390 + }, + { + "epoch": 2.4775095633821396, + "grad_norm": 0.007134158164262772, + "learning_rate": 1.672088865441543e-05, + "loss": 0.0004, + "step": 9392 + }, + { + "epoch": 2.478037198258805, + "grad_norm": 0.013494729064404964, + "learning_rate": 1.6719716287112753e-05, + "loss": 0.0003, + "step": 9394 + }, + { + "epoch": 2.4785648331354704, + "grad_norm": 0.0018716251943260431, + "learning_rate": 1.6718543919810076e-05, + "loss": 0.0001, + "step": 9396 + }, + { + "epoch": 2.479092468012136, + "grad_norm": 0.08298640698194504, + "learning_rate": 1.6717371552507402e-05, + "loss": 0.0069, + "step": 9398 + }, + { + "epoch": 2.479620102888801, + "grad_norm": 0.37329158186912537, + "learning_rate": 1.6716199185204728e-05, + "loss": 0.0054, + "step": 9400 + }, + { + "epoch": 2.480147737765466, + "grad_norm": 0.07034184783697128, + "learning_rate": 1.671502681790205e-05, + "loss": 0.0008, + "step": 9402 + }, + { + "epoch": 2.4806753726421316, + "grad_norm": 0.0058713918551802635, + "learning_rate": 1.6713854450599373e-05, + "loss": 0.0059, + "step": 9404 + }, + { + "epoch": 2.481203007518797, + "grad_norm": 0.006083893124014139, + "learning_rate": 1.67126820832967e-05, + "loss": 0.0109, + "step": 9406 + }, + { + "epoch": 2.4817306423954624, + "grad_norm": 0.005639077629894018, + "learning_rate": 1.6711509715994022e-05, + "loss": 0.0002, + "step": 9408 + }, + { + "epoch": 2.4822582772721278, + "grad_norm": 0.5495672225952148, + "learning_rate": 1.6710337348691344e-05, + "loss": 0.0063, + "step": 9410 + }, + { + "epoch": 2.482785912148793, + "grad_norm": 0.006487678270787001, + "learning_rate": 1.670916498138867e-05, + "loss": 0.0008, + "step": 9412 + }, + { + "epoch": 2.4833135470254586, + "grad_norm": 0.05350331962108612, + "learning_rate": 1.6707992614085996e-05, + "loss": 0.0006, + "step": 9414 + }, + { + "epoch": 2.4838411819021236, + "grad_norm": 0.025691106915473938, + "learning_rate": 1.670682024678332e-05, + "loss": 0.0003, + "step": 9416 + }, + { + "epoch": 2.484368816778789, + "grad_norm": 0.0038280095905065536, + "learning_rate": 1.670564787948064e-05, + "loss": 0.0002, + "step": 9418 + }, + { + "epoch": 2.4848964516554544, + "grad_norm": 0.011154145002365112, + "learning_rate": 1.6704475512177968e-05, + "loss": 0.0002, + "step": 9420 + }, + { + "epoch": 2.4854240865321198, + "grad_norm": 0.020482273772358894, + "learning_rate": 1.670330314487529e-05, + "loss": 0.0003, + "step": 9422 + }, + { + "epoch": 2.485951721408785, + "grad_norm": 0.028010152280330658, + "learning_rate": 1.6702130777572613e-05, + "loss": 0.0004, + "step": 9424 + }, + { + "epoch": 2.4864793562854506, + "grad_norm": 0.0037195298355072737, + "learning_rate": 1.670095841026994e-05, + "loss": 0.0001, + "step": 9426 + }, + { + "epoch": 2.487006991162116, + "grad_norm": 0.2354094535112381, + "learning_rate": 1.6699786042967265e-05, + "loss": 0.0092, + "step": 9428 + }, + { + "epoch": 2.487534626038781, + "grad_norm": 0.010718251578509808, + "learning_rate": 1.6698613675664587e-05, + "loss": 0.0029, + "step": 9430 + }, + { + "epoch": 2.4880622609154464, + "grad_norm": 0.00544942170381546, + "learning_rate": 1.669744130836191e-05, + "loss": 0.0029, + "step": 9432 + }, + { + "epoch": 2.4885898957921118, + "grad_norm": 0.10040728747844696, + "learning_rate": 1.6696268941059236e-05, + "loss": 0.0005, + "step": 9434 + }, + { + "epoch": 2.489117530668777, + "grad_norm": 0.23260246217250824, + "learning_rate": 1.669509657375656e-05, + "loss": 0.0032, + "step": 9436 + }, + { + "epoch": 2.4896451655454426, + "grad_norm": 0.005314048379659653, + "learning_rate": 1.6693924206453885e-05, + "loss": 0.0002, + "step": 9438 + }, + { + "epoch": 2.490172800422108, + "grad_norm": 0.005336363334208727, + "learning_rate": 1.6692751839151207e-05, + "loss": 0.0003, + "step": 9440 + }, + { + "epoch": 2.4907004352987734, + "grad_norm": 0.13996407389640808, + "learning_rate": 1.669157947184853e-05, + "loss": 0.0006, + "step": 9442 + }, + { + "epoch": 2.4912280701754383, + "grad_norm": 0.029071981087327003, + "learning_rate": 1.6690407104545856e-05, + "loss": 0.0003, + "step": 9444 + }, + { + "epoch": 2.4917557050521038, + "grad_norm": 0.006268467754125595, + "learning_rate": 1.6689234737243182e-05, + "loss": 0.0002, + "step": 9446 + }, + { + "epoch": 2.492283339928769, + "grad_norm": 0.0035095280036330223, + "learning_rate": 1.6688062369940504e-05, + "loss": 0.003, + "step": 9448 + }, + { + "epoch": 2.4928109748054346, + "grad_norm": 0.012223076075315475, + "learning_rate": 1.6686890002637827e-05, + "loss": 0.0037, + "step": 9450 + }, + { + "epoch": 2.4933386096821, + "grad_norm": 0.02970989979803562, + "learning_rate": 1.6685717635335153e-05, + "loss": 0.0028, + "step": 9452 + }, + { + "epoch": 2.4938662445587654, + "grad_norm": 0.07860152423381805, + "learning_rate": 1.6684545268032476e-05, + "loss": 0.001, + "step": 9454 + }, + { + "epoch": 2.494393879435431, + "grad_norm": 0.027177076786756516, + "learning_rate": 1.6683372900729798e-05, + "loss": 0.0004, + "step": 9456 + }, + { + "epoch": 2.494921514312096, + "grad_norm": 0.05742025375366211, + "learning_rate": 1.6682200533427124e-05, + "loss": 0.0004, + "step": 9458 + }, + { + "epoch": 2.4954491491887616, + "grad_norm": 0.008458547294139862, + "learning_rate": 1.668102816612445e-05, + "loss": 0.006, + "step": 9460 + }, + { + "epoch": 2.4959767840654266, + "grad_norm": 0.01657716929912567, + "learning_rate": 1.6679855798821773e-05, + "loss": 0.0003, + "step": 9462 + }, + { + "epoch": 2.496504418942092, + "grad_norm": 0.0636911541223526, + "learning_rate": 1.6678683431519095e-05, + "loss": 0.0005, + "step": 9464 + }, + { + "epoch": 2.4970320538187574, + "grad_norm": 0.15328659117221832, + "learning_rate": 1.667751106421642e-05, + "loss": 0.0044, + "step": 9466 + }, + { + "epoch": 2.4975596886954228, + "grad_norm": 0.012865841388702393, + "learning_rate": 1.6676338696913744e-05, + "loss": 0.0005, + "step": 9468 + }, + { + "epoch": 2.498087323572088, + "grad_norm": 0.1790451556444168, + "learning_rate": 1.6675166329611067e-05, + "loss": 0.0012, + "step": 9470 + }, + { + "epoch": 2.4986149584487536, + "grad_norm": 0.3441483974456787, + "learning_rate": 1.6673993962308393e-05, + "loss": 0.0036, + "step": 9472 + }, + { + "epoch": 2.499142593325419, + "grad_norm": 0.014697222039103508, + "learning_rate": 1.667282159500572e-05, + "loss": 0.0003, + "step": 9474 + }, + { + "epoch": 2.499670228202084, + "grad_norm": 0.04395194724202156, + "learning_rate": 1.667164922770304e-05, + "loss": 0.0008, + "step": 9476 + }, + { + "epoch": 2.5001978630787494, + "grad_norm": 0.17228545248508453, + "learning_rate": 1.6670476860400364e-05, + "loss": 0.0044, + "step": 9478 + }, + { + "epoch": 2.5007254979554148, + "grad_norm": 0.002534177154302597, + "learning_rate": 1.666930449309769e-05, + "loss": 0.0001, + "step": 9480 + }, + { + "epoch": 2.50125313283208, + "grad_norm": 0.011408338323235512, + "learning_rate": 1.6668132125795013e-05, + "loss": 0.0002, + "step": 9482 + }, + { + "epoch": 2.5017807677087456, + "grad_norm": 0.0334080345928669, + "learning_rate": 1.6666959758492335e-05, + "loss": 0.0003, + "step": 9484 + }, + { + "epoch": 2.502308402585411, + "grad_norm": 0.24819427728652954, + "learning_rate": 1.666578739118966e-05, + "loss": 0.0036, + "step": 9486 + }, + { + "epoch": 2.5028360374620764, + "grad_norm": 0.18555974960327148, + "learning_rate": 1.6664615023886984e-05, + "loss": 0.0011, + "step": 9488 + }, + { + "epoch": 2.5033636723387414, + "grad_norm": 0.00876891240477562, + "learning_rate": 1.666344265658431e-05, + "loss": 0.0001, + "step": 9490 + }, + { + "epoch": 2.503891307215407, + "grad_norm": 0.017297247424721718, + "learning_rate": 1.6662270289281632e-05, + "loss": 0.0005, + "step": 9492 + }, + { + "epoch": 2.504418942092072, + "grad_norm": 0.12508869171142578, + "learning_rate": 1.666109792197896e-05, + "loss": 0.0013, + "step": 9494 + }, + { + "epoch": 2.5049465769687376, + "grad_norm": 0.37148183584213257, + "learning_rate": 1.665992555467628e-05, + "loss": 0.0062, + "step": 9496 + }, + { + "epoch": 2.505474211845403, + "grad_norm": 0.0059496620669960976, + "learning_rate": 1.6658753187373607e-05, + "loss": 0.0001, + "step": 9498 + }, + { + "epoch": 2.5060018467220684, + "grad_norm": 0.009422766044735909, + "learning_rate": 1.665758082007093e-05, + "loss": 0.0008, + "step": 9500 + }, + { + "epoch": 2.506529481598734, + "grad_norm": 0.018785420805215836, + "learning_rate": 1.6656408452768252e-05, + "loss": 0.0003, + "step": 9502 + }, + { + "epoch": 2.5070571164753988, + "grad_norm": 0.0033814962953329086, + "learning_rate": 1.6655236085465578e-05, + "loss": 0.0002, + "step": 9504 + }, + { + "epoch": 2.5075847513520646, + "grad_norm": 0.0033701013308018446, + "learning_rate": 1.6654063718162904e-05, + "loss": 0.0004, + "step": 9506 + }, + { + "epoch": 2.5081123862287296, + "grad_norm": 0.016685698181390762, + "learning_rate": 1.6652891350860227e-05, + "loss": 0.0002, + "step": 9508 + }, + { + "epoch": 2.508640021105395, + "grad_norm": 0.23940151929855347, + "learning_rate": 1.665171898355755e-05, + "loss": 0.0146, + "step": 9510 + }, + { + "epoch": 2.5091676559820604, + "grad_norm": 0.056298647075891495, + "learning_rate": 1.6650546616254875e-05, + "loss": 0.0009, + "step": 9512 + }, + { + "epoch": 2.509695290858726, + "grad_norm": 0.10868874937295914, + "learning_rate": 1.6649374248952198e-05, + "loss": 0.0007, + "step": 9514 + }, + { + "epoch": 2.510222925735391, + "grad_norm": 0.023534031584858894, + "learning_rate": 1.664820188164952e-05, + "loss": 0.003, + "step": 9516 + }, + { + "epoch": 2.5107505606120566, + "grad_norm": 0.006730915047228336, + "learning_rate": 1.6647029514346847e-05, + "loss": 0.0006, + "step": 9518 + }, + { + "epoch": 2.511278195488722, + "grad_norm": 0.0035659843124449253, + "learning_rate": 1.6645857147044173e-05, + "loss": 0.0002, + "step": 9520 + }, + { + "epoch": 2.511805830365387, + "grad_norm": 0.007884289138019085, + "learning_rate": 1.6644684779741492e-05, + "loss": 0.0003, + "step": 9522 + }, + { + "epoch": 2.5123334652420524, + "grad_norm": 0.005365150980651379, + "learning_rate": 1.6643512412438818e-05, + "loss": 0.0001, + "step": 9524 + }, + { + "epoch": 2.5128611001187178, + "grad_norm": 0.0018677827902138233, + "learning_rate": 1.6642340045136144e-05, + "loss": 0.0001, + "step": 9526 + }, + { + "epoch": 2.513388734995383, + "grad_norm": 0.011807176284492016, + "learning_rate": 1.6641167677833466e-05, + "loss": 0.0002, + "step": 9528 + }, + { + "epoch": 2.5139163698720486, + "grad_norm": 0.0016902342904359102, + "learning_rate": 1.663999531053079e-05, + "loss": 0.0001, + "step": 9530 + }, + { + "epoch": 2.514444004748714, + "grad_norm": 0.139547199010849, + "learning_rate": 1.6638822943228115e-05, + "loss": 0.0155, + "step": 9532 + }, + { + "epoch": 2.5149716396253794, + "grad_norm": 0.008497368544340134, + "learning_rate": 1.6637650575925438e-05, + "loss": 0.0038, + "step": 9534 + }, + { + "epoch": 2.5154992745020444, + "grad_norm": 0.2899165153503418, + "learning_rate": 1.6636478208622764e-05, + "loss": 0.0027, + "step": 9536 + }, + { + "epoch": 2.51602690937871, + "grad_norm": 0.0081414096057415, + "learning_rate": 1.6635305841320086e-05, + "loss": 0.0001, + "step": 9538 + }, + { + "epoch": 2.516554544255375, + "grad_norm": 0.0019478361355140805, + "learning_rate": 1.6634133474017412e-05, + "loss": 0.0002, + "step": 9540 + }, + { + "epoch": 2.5170821791320406, + "grad_norm": 0.04081087186932564, + "learning_rate": 1.6632961106714735e-05, + "loss": 0.003, + "step": 9542 + }, + { + "epoch": 2.517609814008706, + "grad_norm": 0.007489334791898727, + "learning_rate": 1.6631788739412057e-05, + "loss": 0.0003, + "step": 9544 + }, + { + "epoch": 2.5181374488853714, + "grad_norm": 0.2291010171175003, + "learning_rate": 1.6630616372109383e-05, + "loss": 0.0009, + "step": 9546 + }, + { + "epoch": 2.518665083762037, + "grad_norm": 0.021971233189105988, + "learning_rate": 1.6629444004806706e-05, + "loss": 0.0068, + "step": 9548 + }, + { + "epoch": 2.5191927186387018, + "grad_norm": 0.1972694993019104, + "learning_rate": 1.6628271637504032e-05, + "loss": 0.0066, + "step": 9550 + }, + { + "epoch": 2.5197203535153676, + "grad_norm": 0.00809588935226202, + "learning_rate": 1.6627099270201355e-05, + "loss": 0.0001, + "step": 9552 + }, + { + "epoch": 2.5202479883920326, + "grad_norm": 0.005624981131404638, + "learning_rate": 1.662592690289868e-05, + "loss": 0.0003, + "step": 9554 + }, + { + "epoch": 2.520775623268698, + "grad_norm": 0.007743275724351406, + "learning_rate": 1.6624754535596003e-05, + "loss": 0.0004, + "step": 9556 + }, + { + "epoch": 2.5213032581453634, + "grad_norm": 0.018774710595607758, + "learning_rate": 1.662358216829333e-05, + "loss": 0.0003, + "step": 9558 + }, + { + "epoch": 2.521830893022029, + "grad_norm": 0.15425056219100952, + "learning_rate": 1.6622409800990652e-05, + "loss": 0.0015, + "step": 9560 + }, + { + "epoch": 2.522358527898694, + "grad_norm": 0.3397856652736664, + "learning_rate": 1.6621237433687974e-05, + "loss": 0.0064, + "step": 9562 + }, + { + "epoch": 2.5228861627753596, + "grad_norm": 0.01374813076108694, + "learning_rate": 1.66200650663853e-05, + "loss": 0.0004, + "step": 9564 + }, + { + "epoch": 2.523413797652025, + "grad_norm": 0.004012230318039656, + "learning_rate": 1.6618892699082626e-05, + "loss": 0.0002, + "step": 9566 + }, + { + "epoch": 2.52394143252869, + "grad_norm": 0.06820479780435562, + "learning_rate": 1.6617720331779946e-05, + "loss": 0.0006, + "step": 9568 + }, + { + "epoch": 2.5244690674053554, + "grad_norm": 0.06958476454019547, + "learning_rate": 1.6616547964477272e-05, + "loss": 0.0095, + "step": 9570 + }, + { + "epoch": 2.5249967022820208, + "grad_norm": 0.006613235920667648, + "learning_rate": 1.6615375597174598e-05, + "loss": 0.0002, + "step": 9572 + }, + { + "epoch": 2.525524337158686, + "grad_norm": 0.011316427029669285, + "learning_rate": 1.661420322987192e-05, + "loss": 0.0002, + "step": 9574 + }, + { + "epoch": 2.5260519720353516, + "grad_norm": 0.004074623342603445, + "learning_rate": 1.6613030862569243e-05, + "loss": 0.0003, + "step": 9576 + }, + { + "epoch": 2.526579606912017, + "grad_norm": 0.43739163875579834, + "learning_rate": 1.661185849526657e-05, + "loss": 0.0018, + "step": 9578 + }, + { + "epoch": 2.5271072417886824, + "grad_norm": 0.017294347286224365, + "learning_rate": 1.661068612796389e-05, + "loss": 0.0004, + "step": 9580 + }, + { + "epoch": 2.5276348766653474, + "grad_norm": 0.06484173983335495, + "learning_rate": 1.6609513760661214e-05, + "loss": 0.0013, + "step": 9582 + }, + { + "epoch": 2.5281625115420128, + "grad_norm": 0.22692450881004333, + "learning_rate": 1.660834139335854e-05, + "loss": 0.0033, + "step": 9584 + }, + { + "epoch": 2.528690146418678, + "grad_norm": 0.005588836036622524, + "learning_rate": 1.6607169026055866e-05, + "loss": 0.0041, + "step": 9586 + }, + { + "epoch": 2.5292177812953436, + "grad_norm": 0.004349463153630495, + "learning_rate": 1.660599665875319e-05, + "loss": 0.0002, + "step": 9588 + }, + { + "epoch": 2.529745416172009, + "grad_norm": 0.05936414375901222, + "learning_rate": 1.660482429145051e-05, + "loss": 0.0011, + "step": 9590 + }, + { + "epoch": 2.5302730510486744, + "grad_norm": 0.15258927643299103, + "learning_rate": 1.6603651924147837e-05, + "loss": 0.0005, + "step": 9592 + }, + { + "epoch": 2.53080068592534, + "grad_norm": 0.0033793626353144646, + "learning_rate": 1.660247955684516e-05, + "loss": 0.0029, + "step": 9594 + }, + { + "epoch": 2.5313283208020048, + "grad_norm": 0.13862307369709015, + "learning_rate": 1.6601307189542486e-05, + "loss": 0.003, + "step": 9596 + }, + { + "epoch": 2.5318559556786706, + "grad_norm": 0.13449600338935852, + "learning_rate": 1.660013482223981e-05, + "loss": 0.001, + "step": 9598 + }, + { + "epoch": 2.5323835905553356, + "grad_norm": 0.04955890029668808, + "learning_rate": 1.6598962454937135e-05, + "loss": 0.0006, + "step": 9600 + }, + { + "epoch": 2.532911225432001, + "grad_norm": 0.09493255615234375, + "learning_rate": 1.6597790087634457e-05, + "loss": 0.0055, + "step": 9602 + }, + { + "epoch": 2.5334388603086664, + "grad_norm": 0.33913856744766235, + "learning_rate": 1.659661772033178e-05, + "loss": 0.0102, + "step": 9604 + }, + { + "epoch": 2.533966495185332, + "grad_norm": 0.05484268069267273, + "learning_rate": 1.6595445353029106e-05, + "loss": 0.0005, + "step": 9606 + }, + { + "epoch": 2.534494130061997, + "grad_norm": 0.014818795956671238, + "learning_rate": 1.659427298572643e-05, + "loss": 0.0002, + "step": 9608 + }, + { + "epoch": 2.5350217649386626, + "grad_norm": 0.024758731946349144, + "learning_rate": 1.6593100618423754e-05, + "loss": 0.0004, + "step": 9610 + }, + { + "epoch": 2.535549399815328, + "grad_norm": 0.018077844753861427, + "learning_rate": 1.6591928251121077e-05, + "loss": 0.0004, + "step": 9612 + }, + { + "epoch": 2.536077034691993, + "grad_norm": 0.021629979833960533, + "learning_rate": 1.65907558838184e-05, + "loss": 0.0069, + "step": 9614 + }, + { + "epoch": 2.5366046695686584, + "grad_norm": 0.08224817365407944, + "learning_rate": 1.6589583516515726e-05, + "loss": 0.0012, + "step": 9616 + }, + { + "epoch": 2.537132304445324, + "grad_norm": 0.027562394738197327, + "learning_rate": 1.658841114921305e-05, + "loss": 0.0006, + "step": 9618 + }, + { + "epoch": 2.537659939321989, + "grad_norm": 0.2341773509979248, + "learning_rate": 1.6587238781910374e-05, + "loss": 0.0068, + "step": 9620 + }, + { + "epoch": 2.5381875741986546, + "grad_norm": 0.11779732257127762, + "learning_rate": 1.6586066414607697e-05, + "loss": 0.0017, + "step": 9622 + }, + { + "epoch": 2.53871520907532, + "grad_norm": 0.24856792390346527, + "learning_rate": 1.6584894047305023e-05, + "loss": 0.0088, + "step": 9624 + }, + { + "epoch": 2.5392428439519854, + "grad_norm": 0.01034040842205286, + "learning_rate": 1.6583721680002345e-05, + "loss": 0.0002, + "step": 9626 + }, + { + "epoch": 2.5397704788286504, + "grad_norm": 0.0038982536643743515, + "learning_rate": 1.6582549312699668e-05, + "loss": 0.0002, + "step": 9628 + }, + { + "epoch": 2.5402981137053158, + "grad_norm": 0.007398985326290131, + "learning_rate": 1.6581376945396994e-05, + "loss": 0.0002, + "step": 9630 + }, + { + "epoch": 2.540825748581981, + "grad_norm": 0.019152125343680382, + "learning_rate": 1.658020457809432e-05, + "loss": 0.0017, + "step": 9632 + }, + { + "epoch": 2.5413533834586466, + "grad_norm": 0.03404833748936653, + "learning_rate": 1.6579032210791643e-05, + "loss": 0.0107, + "step": 9634 + }, + { + "epoch": 2.541881018335312, + "grad_norm": 0.012329939752817154, + "learning_rate": 1.6577859843488965e-05, + "loss": 0.0003, + "step": 9636 + }, + { + "epoch": 2.5424086532119774, + "grad_norm": 0.010087992995977402, + "learning_rate": 1.657668747618629e-05, + "loss": 0.0003, + "step": 9638 + }, + { + "epoch": 2.542936288088643, + "grad_norm": 0.0047765919007360935, + "learning_rate": 1.6575515108883614e-05, + "loss": 0.0005, + "step": 9640 + }, + { + "epoch": 2.5434639229653078, + "grad_norm": 0.008882895112037659, + "learning_rate": 1.6574342741580936e-05, + "loss": 0.0002, + "step": 9642 + }, + { + "epoch": 2.5439915578419736, + "grad_norm": 0.07256074249744415, + "learning_rate": 1.6573170374278262e-05, + "loss": 0.0005, + "step": 9644 + }, + { + "epoch": 2.5445191927186386, + "grad_norm": 0.2480725646018982, + "learning_rate": 1.657199800697559e-05, + "loss": 0.0023, + "step": 9646 + }, + { + "epoch": 2.545046827595304, + "grad_norm": 0.03330860286951065, + "learning_rate": 1.657082563967291e-05, + "loss": 0.0071, + "step": 9648 + }, + { + "epoch": 2.5455744624719694, + "grad_norm": 0.028806673362851143, + "learning_rate": 1.6569653272370234e-05, + "loss": 0.0012, + "step": 9650 + }, + { + "epoch": 2.546102097348635, + "grad_norm": 0.07213879376649857, + "learning_rate": 1.656848090506756e-05, + "loss": 0.0008, + "step": 9652 + }, + { + "epoch": 2.5466297322253, + "grad_norm": 0.093746118247509, + "learning_rate": 1.6567308537764882e-05, + "loss": 0.0012, + "step": 9654 + }, + { + "epoch": 2.547157367101965, + "grad_norm": 0.05565306171774864, + "learning_rate": 1.656613617046221e-05, + "loss": 0.0008, + "step": 9656 + }, + { + "epoch": 2.547685001978631, + "grad_norm": 0.020585007965564728, + "learning_rate": 1.656496380315953e-05, + "loss": 0.0003, + "step": 9658 + }, + { + "epoch": 2.548212636855296, + "grad_norm": 0.1695980727672577, + "learning_rate": 1.6563791435856857e-05, + "loss": 0.005, + "step": 9660 + }, + { + "epoch": 2.5487402717319614, + "grad_norm": 0.02401200868189335, + "learning_rate": 1.656261906855418e-05, + "loss": 0.0003, + "step": 9662 + }, + { + "epoch": 2.549267906608627, + "grad_norm": 0.006177500821650028, + "learning_rate": 1.6561446701251506e-05, + "loss": 0.0003, + "step": 9664 + }, + { + "epoch": 2.549795541485292, + "grad_norm": 0.12185177206993103, + "learning_rate": 1.6560274333948828e-05, + "loss": 0.0031, + "step": 9666 + }, + { + "epoch": 2.5503231763619576, + "grad_norm": 0.3116644024848938, + "learning_rate": 1.655910196664615e-05, + "loss": 0.0016, + "step": 9668 + }, + { + "epoch": 2.550850811238623, + "grad_norm": 0.0021274194587022066, + "learning_rate": 1.6557929599343477e-05, + "loss": 0.0003, + "step": 9670 + }, + { + "epoch": 2.5513784461152884, + "grad_norm": 0.08868689090013504, + "learning_rate": 1.65567572320408e-05, + "loss": 0.0005, + "step": 9672 + }, + { + "epoch": 2.5519060809919534, + "grad_norm": 0.004746333230286837, + "learning_rate": 1.6555584864738122e-05, + "loss": 0.0001, + "step": 9674 + }, + { + "epoch": 2.552433715868619, + "grad_norm": 0.005199959967285395, + "learning_rate": 1.6554412497435448e-05, + "loss": 0.0008, + "step": 9676 + }, + { + "epoch": 2.552961350745284, + "grad_norm": 0.026399310678243637, + "learning_rate": 1.6553240130132774e-05, + "loss": 0.0003, + "step": 9678 + }, + { + "epoch": 2.5534889856219496, + "grad_norm": 0.008152100257575512, + "learning_rate": 1.6552067762830097e-05, + "loss": 0.0002, + "step": 9680 + }, + { + "epoch": 2.554016620498615, + "grad_norm": 0.006323425564914942, + "learning_rate": 1.655089539552742e-05, + "loss": 0.0002, + "step": 9682 + }, + { + "epoch": 2.5545442553752804, + "grad_norm": 0.005357113666832447, + "learning_rate": 1.6549723028224745e-05, + "loss": 0.0001, + "step": 9684 + }, + { + "epoch": 2.555071890251946, + "grad_norm": 0.00402083620429039, + "learning_rate": 1.6548550660922068e-05, + "loss": 0.0002, + "step": 9686 + }, + { + "epoch": 2.5555995251286108, + "grad_norm": 0.008026905357837677, + "learning_rate": 1.654737829361939e-05, + "loss": 0.0001, + "step": 9688 + }, + { + "epoch": 2.5561271600052766, + "grad_norm": 0.0027914566453546286, + "learning_rate": 1.6546205926316716e-05, + "loss": 0.0001, + "step": 9690 + }, + { + "epoch": 2.5566547948819416, + "grad_norm": 0.0025485896039754152, + "learning_rate": 1.6545033559014042e-05, + "loss": 0.0001, + "step": 9692 + }, + { + "epoch": 2.557182429758607, + "grad_norm": 0.24056215584278107, + "learning_rate": 1.6543861191711365e-05, + "loss": 0.0139, + "step": 9694 + }, + { + "epoch": 2.5577100646352724, + "grad_norm": 0.09547711163759232, + "learning_rate": 1.6542688824408688e-05, + "loss": 0.0075, + "step": 9696 + }, + { + "epoch": 2.558237699511938, + "grad_norm": 0.01557842269539833, + "learning_rate": 1.6541516457106014e-05, + "loss": 0.0002, + "step": 9698 + }, + { + "epoch": 2.558765334388603, + "grad_norm": 0.008439709432423115, + "learning_rate": 1.6540344089803336e-05, + "loss": 0.0003, + "step": 9700 + }, + { + "epoch": 2.559292969265268, + "grad_norm": 0.009050078690052032, + "learning_rate": 1.653917172250066e-05, + "loss": 0.0125, + "step": 9702 + }, + { + "epoch": 2.559820604141934, + "grad_norm": 0.013367865234613419, + "learning_rate": 1.6537999355197985e-05, + "loss": 0.0002, + "step": 9704 + }, + { + "epoch": 2.560348239018599, + "grad_norm": 0.03466128185391426, + "learning_rate": 1.653682698789531e-05, + "loss": 0.0004, + "step": 9706 + }, + { + "epoch": 2.5608758738952644, + "grad_norm": 0.010566949844360352, + "learning_rate": 1.6535654620592633e-05, + "loss": 0.0003, + "step": 9708 + }, + { + "epoch": 2.56140350877193, + "grad_norm": 0.005684318020939827, + "learning_rate": 1.6534482253289956e-05, + "loss": 0.0002, + "step": 9710 + }, + { + "epoch": 2.561931143648595, + "grad_norm": 0.00777710834518075, + "learning_rate": 1.6533309885987282e-05, + "loss": 0.0002, + "step": 9712 + }, + { + "epoch": 2.5624587785252606, + "grad_norm": 0.0243221502751112, + "learning_rate": 1.6532137518684605e-05, + "loss": 0.0004, + "step": 9714 + }, + { + "epoch": 2.562986413401926, + "grad_norm": 0.171961709856987, + "learning_rate": 1.653096515138193e-05, + "loss": 0.01, + "step": 9716 + }, + { + "epoch": 2.5635140482785914, + "grad_norm": 0.04855077713727951, + "learning_rate": 1.6529792784079253e-05, + "loss": 0.0012, + "step": 9718 + }, + { + "epoch": 2.5640416831552564, + "grad_norm": 0.13165555894374847, + "learning_rate": 1.6528620416776576e-05, + "loss": 0.0049, + "step": 9720 + }, + { + "epoch": 2.564569318031922, + "grad_norm": 0.30192631483078003, + "learning_rate": 1.6527448049473902e-05, + "loss": 0.0058, + "step": 9722 + }, + { + "epoch": 2.565096952908587, + "grad_norm": 0.013419780880212784, + "learning_rate": 1.6526275682171228e-05, + "loss": 0.0003, + "step": 9724 + }, + { + "epoch": 2.5656245877852526, + "grad_norm": 0.1516687422990799, + "learning_rate": 1.652510331486855e-05, + "loss": 0.0014, + "step": 9726 + }, + { + "epoch": 2.566152222661918, + "grad_norm": 0.24470046162605286, + "learning_rate": 1.6523930947565873e-05, + "loss": 0.0031, + "step": 9728 + }, + { + "epoch": 2.5666798575385834, + "grad_norm": 0.018749363720417023, + "learning_rate": 1.65227585802632e-05, + "loss": 0.0092, + "step": 9730 + }, + { + "epoch": 2.567207492415249, + "grad_norm": 0.004574873019009829, + "learning_rate": 1.652158621296052e-05, + "loss": 0.0005, + "step": 9732 + }, + { + "epoch": 2.5677351272919138, + "grad_norm": 0.07448487728834152, + "learning_rate": 1.6520413845657844e-05, + "loss": 0.0014, + "step": 9734 + }, + { + "epoch": 2.568262762168579, + "grad_norm": 0.007786344271153212, + "learning_rate": 1.651924147835517e-05, + "loss": 0.0002, + "step": 9736 + }, + { + "epoch": 2.5687903970452446, + "grad_norm": 0.011006612330675125, + "learning_rate": 1.6518069111052496e-05, + "loss": 0.0029, + "step": 9738 + }, + { + "epoch": 2.56931803192191, + "grad_norm": 0.008857950568199158, + "learning_rate": 1.651689674374982e-05, + "loss": 0.0002, + "step": 9740 + }, + { + "epoch": 2.5698456667985754, + "grad_norm": 0.008529018610715866, + "learning_rate": 1.651572437644714e-05, + "loss": 0.0024, + "step": 9742 + }, + { + "epoch": 2.570373301675241, + "grad_norm": 0.0160621777176857, + "learning_rate": 1.6514552009144468e-05, + "loss": 0.0003, + "step": 9744 + }, + { + "epoch": 2.570900936551906, + "grad_norm": 0.008703318424522877, + "learning_rate": 1.651337964184179e-05, + "loss": 0.0003, + "step": 9746 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.23932011425495148, + "learning_rate": 1.6512207274539113e-05, + "loss": 0.0025, + "step": 9748 + }, + { + "epoch": 2.571956206305237, + "grad_norm": 0.10578377544879913, + "learning_rate": 1.651103490723644e-05, + "loss": 0.0039, + "step": 9750 + }, + { + "epoch": 2.572483841181902, + "grad_norm": 0.12305889278650284, + "learning_rate": 1.6509862539933765e-05, + "loss": 0.0049, + "step": 9752 + }, + { + "epoch": 2.5730114760585674, + "grad_norm": 0.006996443960815668, + "learning_rate": 1.6508690172631087e-05, + "loss": 0.0005, + "step": 9754 + }, + { + "epoch": 2.573539110935233, + "grad_norm": 0.2827017605304718, + "learning_rate": 1.650751780532841e-05, + "loss": 0.0097, + "step": 9756 + }, + { + "epoch": 2.574066745811898, + "grad_norm": 0.14397376775741577, + "learning_rate": 1.6506345438025736e-05, + "loss": 0.005, + "step": 9758 + }, + { + "epoch": 2.5745943806885636, + "grad_norm": 0.015623304061591625, + "learning_rate": 1.650517307072306e-05, + "loss": 0.0008, + "step": 9760 + }, + { + "epoch": 2.575122015565229, + "grad_norm": 0.04646063596010208, + "learning_rate": 1.650400070342038e-05, + "loss": 0.0006, + "step": 9762 + }, + { + "epoch": 2.5756496504418944, + "grad_norm": 0.02906743250787258, + "learning_rate": 1.6502828336117707e-05, + "loss": 0.0009, + "step": 9764 + }, + { + "epoch": 2.5761772853185594, + "grad_norm": 0.025903712958097458, + "learning_rate": 1.650165596881503e-05, + "loss": 0.0005, + "step": 9766 + }, + { + "epoch": 2.576704920195225, + "grad_norm": 0.5589832663536072, + "learning_rate": 1.6500483601512356e-05, + "loss": 0.0029, + "step": 9768 + }, + { + "epoch": 2.57723255507189, + "grad_norm": 0.01622377708554268, + "learning_rate": 1.649931123420968e-05, + "loss": 0.0045, + "step": 9770 + }, + { + "epoch": 2.5777601899485556, + "grad_norm": 0.03536967560648918, + "learning_rate": 1.6498138866907004e-05, + "loss": 0.0004, + "step": 9772 + }, + { + "epoch": 2.578287824825221, + "grad_norm": 0.1639387011528015, + "learning_rate": 1.6496966499604327e-05, + "loss": 0.0006, + "step": 9774 + }, + { + "epoch": 2.5788154597018864, + "grad_norm": 0.006639734376221895, + "learning_rate": 1.6495794132301653e-05, + "loss": 0.0003, + "step": 9776 + }, + { + "epoch": 2.579343094578552, + "grad_norm": 0.03641989082098007, + "learning_rate": 1.6494621764998976e-05, + "loss": 0.0033, + "step": 9778 + }, + { + "epoch": 2.579870729455217, + "grad_norm": 0.009277958422899246, + "learning_rate": 1.6493449397696298e-05, + "loss": 0.0034, + "step": 9780 + }, + { + "epoch": 2.580398364331882, + "grad_norm": 0.0034552288707345724, + "learning_rate": 1.6492277030393624e-05, + "loss": 0.0004, + "step": 9782 + }, + { + "epoch": 2.5809259992085476, + "grad_norm": 0.3658374547958374, + "learning_rate": 1.649110466309095e-05, + "loss": 0.0012, + "step": 9784 + }, + { + "epoch": 2.581453634085213, + "grad_norm": 0.0871390774846077, + "learning_rate": 1.6489932295788273e-05, + "loss": 0.0004, + "step": 9786 + }, + { + "epoch": 2.5819812689618784, + "grad_norm": 0.011050754226744175, + "learning_rate": 1.6488759928485595e-05, + "loss": 0.0004, + "step": 9788 + }, + { + "epoch": 2.582508903838544, + "grad_norm": 0.08357113599777222, + "learning_rate": 1.648758756118292e-05, + "loss": 0.0021, + "step": 9790 + }, + { + "epoch": 2.583036538715209, + "grad_norm": 0.003778203623369336, + "learning_rate": 1.6486415193880244e-05, + "loss": 0.0002, + "step": 9792 + }, + { + "epoch": 2.583564173591874, + "grad_norm": 0.00443718396127224, + "learning_rate": 1.6485242826577567e-05, + "loss": 0.0001, + "step": 9794 + }, + { + "epoch": 2.58409180846854, + "grad_norm": 0.11180481314659119, + "learning_rate": 1.6484070459274893e-05, + "loss": 0.0065, + "step": 9796 + }, + { + "epoch": 2.584619443345205, + "grad_norm": 0.005293809808790684, + "learning_rate": 1.648289809197222e-05, + "loss": 0.0002, + "step": 9798 + }, + { + "epoch": 2.5851470782218704, + "grad_norm": 0.01901061274111271, + "learning_rate": 1.6481725724669538e-05, + "loss": 0.0002, + "step": 9800 + }, + { + "epoch": 2.585674713098536, + "grad_norm": 0.00859886221587658, + "learning_rate": 1.6480553357366864e-05, + "loss": 0.0002, + "step": 9802 + }, + { + "epoch": 2.586202347975201, + "grad_norm": 0.0017383871600031853, + "learning_rate": 1.647938099006419e-05, + "loss": 0.0001, + "step": 9804 + }, + { + "epoch": 2.5867299828518666, + "grad_norm": 0.006640384905040264, + "learning_rate": 1.6478208622761512e-05, + "loss": 0.0002, + "step": 9806 + }, + { + "epoch": 2.587257617728532, + "grad_norm": 0.003067792160436511, + "learning_rate": 1.6477036255458835e-05, + "loss": 0.0019, + "step": 9808 + }, + { + "epoch": 2.5877852526051974, + "grad_norm": 0.002364028710871935, + "learning_rate": 1.647586388815616e-05, + "loss": 0.0002, + "step": 9810 + }, + { + "epoch": 2.5883128874818624, + "grad_norm": 0.07476820051670074, + "learning_rate": 1.6474691520853484e-05, + "loss": 0.0032, + "step": 9812 + }, + { + "epoch": 2.588840522358528, + "grad_norm": 0.08680161088705063, + "learning_rate": 1.647351915355081e-05, + "loss": 0.0069, + "step": 9814 + }, + { + "epoch": 2.589368157235193, + "grad_norm": 0.004166629631072283, + "learning_rate": 1.6472346786248132e-05, + "loss": 0.0001, + "step": 9816 + }, + { + "epoch": 2.5898957921118586, + "grad_norm": 0.0032974372152239084, + "learning_rate": 1.6471174418945458e-05, + "loss": 0.0002, + "step": 9818 + }, + { + "epoch": 2.590423426988524, + "grad_norm": 0.0033933401573449373, + "learning_rate": 1.647000205164278e-05, + "loss": 0.0007, + "step": 9820 + }, + { + "epoch": 2.5909510618651894, + "grad_norm": 0.0026077122893184423, + "learning_rate": 1.6468829684340104e-05, + "loss": 0.0001, + "step": 9822 + }, + { + "epoch": 2.591478696741855, + "grad_norm": 0.1755773276090622, + "learning_rate": 1.646765731703743e-05, + "loss": 0.0037, + "step": 9824 + }, + { + "epoch": 2.59200633161852, + "grad_norm": 0.11718112975358963, + "learning_rate": 1.6466484949734752e-05, + "loss": 0.0012, + "step": 9826 + }, + { + "epoch": 2.592533966495185, + "grad_norm": 0.4655302166938782, + "learning_rate": 1.6465312582432078e-05, + "loss": 0.0028, + "step": 9828 + }, + { + "epoch": 2.5930616013718506, + "grad_norm": 0.027750913053750992, + "learning_rate": 1.64641402151294e-05, + "loss": 0.0022, + "step": 9830 + }, + { + "epoch": 2.593589236248516, + "grad_norm": 0.0059098717756569386, + "learning_rate": 1.6462967847826727e-05, + "loss": 0.0002, + "step": 9832 + }, + { + "epoch": 2.5941168711251814, + "grad_norm": 0.003288449952378869, + "learning_rate": 1.646179548052405e-05, + "loss": 0.0002, + "step": 9834 + }, + { + "epoch": 2.594644506001847, + "grad_norm": 0.2385736107826233, + "learning_rate": 1.6460623113221375e-05, + "loss": 0.002, + "step": 9836 + }, + { + "epoch": 2.5951721408785122, + "grad_norm": 0.03499762341380119, + "learning_rate": 1.6459450745918698e-05, + "loss": 0.0005, + "step": 9838 + }, + { + "epoch": 2.595699775755177, + "grad_norm": 0.15292370319366455, + "learning_rate": 1.645827837861602e-05, + "loss": 0.0005, + "step": 9840 + }, + { + "epoch": 2.596227410631843, + "grad_norm": 0.22140714526176453, + "learning_rate": 1.6457106011313347e-05, + "loss": 0.0153, + "step": 9842 + }, + { + "epoch": 2.596755045508508, + "grad_norm": 0.1991986334323883, + "learning_rate": 1.6455933644010673e-05, + "loss": 0.0049, + "step": 9844 + }, + { + "epoch": 2.5972826803851734, + "grad_norm": 0.4067796468734741, + "learning_rate": 1.6454761276707992e-05, + "loss": 0.0015, + "step": 9846 + }, + { + "epoch": 2.597810315261839, + "grad_norm": 0.03251716494560242, + "learning_rate": 1.6453588909405318e-05, + "loss": 0.0035, + "step": 9848 + }, + { + "epoch": 2.598337950138504, + "grad_norm": 0.01423176284879446, + "learning_rate": 1.6452416542102644e-05, + "loss": 0.0003, + "step": 9850 + }, + { + "epoch": 2.5988655850151696, + "grad_norm": 0.018275078386068344, + "learning_rate": 1.6451244174799966e-05, + "loss": 0.0006, + "step": 9852 + }, + { + "epoch": 2.5993932198918346, + "grad_norm": 0.01341279223561287, + "learning_rate": 1.645007180749729e-05, + "loss": 0.0003, + "step": 9854 + }, + { + "epoch": 2.5999208547685004, + "grad_norm": 0.1207752376794815, + "learning_rate": 1.6448899440194615e-05, + "loss": 0.0007, + "step": 9856 + }, + { + "epoch": 2.6004484896451654, + "grad_norm": 0.006441516801714897, + "learning_rate": 1.6447727072891938e-05, + "loss": 0.0005, + "step": 9858 + }, + { + "epoch": 2.600976124521831, + "grad_norm": 0.004766987171024084, + "learning_rate": 1.644655470558926e-05, + "loss": 0.0059, + "step": 9860 + }, + { + "epoch": 2.601503759398496, + "grad_norm": 0.13077092170715332, + "learning_rate": 1.6445382338286586e-05, + "loss": 0.0024, + "step": 9862 + }, + { + "epoch": 2.6020313942751616, + "grad_norm": 0.09028980880975723, + "learning_rate": 1.6444209970983912e-05, + "loss": 0.0007, + "step": 9864 + }, + { + "epoch": 2.602559029151827, + "grad_norm": 0.005553381983190775, + "learning_rate": 1.6443037603681235e-05, + "loss": 0.0003, + "step": 9866 + }, + { + "epoch": 2.6030866640284924, + "grad_norm": 0.01431367453187704, + "learning_rate": 1.6441865236378557e-05, + "loss": 0.0003, + "step": 9868 + }, + { + "epoch": 2.603614298905158, + "grad_norm": 0.04340754821896553, + "learning_rate": 1.6440692869075883e-05, + "loss": 0.0063, + "step": 9870 + }, + { + "epoch": 2.604141933781823, + "grad_norm": 0.006669537164270878, + "learning_rate": 1.6439520501773206e-05, + "loss": 0.0002, + "step": 9872 + }, + { + "epoch": 2.604669568658488, + "grad_norm": 0.004659675061702728, + "learning_rate": 1.6438348134470532e-05, + "loss": 0.0002, + "step": 9874 + }, + { + "epoch": 2.6051972035351536, + "grad_norm": 0.023347053676843643, + "learning_rate": 1.6437175767167855e-05, + "loss": 0.0102, + "step": 9876 + }, + { + "epoch": 2.605724838411819, + "grad_norm": 0.012424042448401451, + "learning_rate": 1.643600339986518e-05, + "loss": 0.0006, + "step": 9878 + }, + { + "epoch": 2.6062524732884844, + "grad_norm": 0.34962546825408936, + "learning_rate": 1.6434831032562503e-05, + "loss": 0.0035, + "step": 9880 + }, + { + "epoch": 2.60678010816515, + "grad_norm": 0.020256688818335533, + "learning_rate": 1.6433658665259826e-05, + "loss": 0.0005, + "step": 9882 + }, + { + "epoch": 2.6073077430418152, + "grad_norm": 0.02613479271531105, + "learning_rate": 1.6432486297957152e-05, + "loss": 0.0075, + "step": 9884 + }, + { + "epoch": 2.60783537791848, + "grad_norm": 0.04111049696803093, + "learning_rate": 1.6431313930654474e-05, + "loss": 0.001, + "step": 9886 + }, + { + "epoch": 2.6083630127951456, + "grad_norm": 0.0850418210029602, + "learning_rate": 1.64301415633518e-05, + "loss": 0.0005, + "step": 9888 + }, + { + "epoch": 2.608890647671811, + "grad_norm": 0.04537215456366539, + "learning_rate": 1.6428969196049123e-05, + "loss": 0.0011, + "step": 9890 + }, + { + "epoch": 2.6094182825484764, + "grad_norm": 0.022001300007104874, + "learning_rate": 1.6427796828746446e-05, + "loss": 0.0019, + "step": 9892 + }, + { + "epoch": 2.609945917425142, + "grad_norm": 0.07847125828266144, + "learning_rate": 1.642662446144377e-05, + "loss": 0.0016, + "step": 9894 + }, + { + "epoch": 2.610473552301807, + "grad_norm": 0.003445217153057456, + "learning_rate": 1.6425452094141098e-05, + "loss": 0.0005, + "step": 9896 + }, + { + "epoch": 2.6110011871784726, + "grad_norm": 0.010225677862763405, + "learning_rate": 1.642427972683842e-05, + "loss": 0.0002, + "step": 9898 + }, + { + "epoch": 2.6115288220551376, + "grad_norm": 0.07064041495323181, + "learning_rate": 1.6423107359535743e-05, + "loss": 0.001, + "step": 9900 + }, + { + "epoch": 2.6120564569318034, + "grad_norm": 0.00183390686288476, + "learning_rate": 1.642193499223307e-05, + "loss": 0.0033, + "step": 9902 + }, + { + "epoch": 2.6125840918084684, + "grad_norm": 0.03953688591718674, + "learning_rate": 1.642076262493039e-05, + "loss": 0.0038, + "step": 9904 + }, + { + "epoch": 2.613111726685134, + "grad_norm": 0.3316164016723633, + "learning_rate": 1.6419590257627714e-05, + "loss": 0.0008, + "step": 9906 + }, + { + "epoch": 2.613639361561799, + "grad_norm": 0.0015285153640434146, + "learning_rate": 1.641841789032504e-05, + "loss": 0.0036, + "step": 9908 + }, + { + "epoch": 2.6141669964384646, + "grad_norm": 0.0447615310549736, + "learning_rate": 1.6417245523022366e-05, + "loss": 0.0065, + "step": 9910 + }, + { + "epoch": 2.61469463131513, + "grad_norm": 0.21195656061172485, + "learning_rate": 1.641607315571969e-05, + "loss": 0.002, + "step": 9912 + }, + { + "epoch": 2.6152222661917954, + "grad_norm": 0.044800110161304474, + "learning_rate": 1.641490078841701e-05, + "loss": 0.0019, + "step": 9914 + }, + { + "epoch": 2.615749901068461, + "grad_norm": 0.0091624204069376, + "learning_rate": 1.6413728421114337e-05, + "loss": 0.0001, + "step": 9916 + }, + { + "epoch": 2.616277535945126, + "grad_norm": 0.003908704966306686, + "learning_rate": 1.641255605381166e-05, + "loss": 0.0002, + "step": 9918 + }, + { + "epoch": 2.616805170821791, + "grad_norm": 0.029375510290265083, + "learning_rate": 1.6411383686508983e-05, + "loss": 0.0004, + "step": 9920 + }, + { + "epoch": 2.6173328056984566, + "grad_norm": 0.08823104202747345, + "learning_rate": 1.641021131920631e-05, + "loss": 0.0013, + "step": 9922 + }, + { + "epoch": 2.617860440575122, + "grad_norm": 0.0017494471976533532, + "learning_rate": 1.6409038951903635e-05, + "loss": 0.0002, + "step": 9924 + }, + { + "epoch": 2.6183880754517874, + "grad_norm": 0.5635382533073425, + "learning_rate": 1.6407866584600957e-05, + "loss": 0.0021, + "step": 9926 + }, + { + "epoch": 2.618915710328453, + "grad_norm": 0.3426331579685211, + "learning_rate": 1.640669421729828e-05, + "loss": 0.0088, + "step": 9928 + }, + { + "epoch": 2.6194433452051182, + "grad_norm": 0.247876837849617, + "learning_rate": 1.6405521849995606e-05, + "loss": 0.0017, + "step": 9930 + }, + { + "epoch": 2.619970980081783, + "grad_norm": 0.002633191179484129, + "learning_rate": 1.640434948269293e-05, + "loss": 0.0008, + "step": 9932 + }, + { + "epoch": 2.6204986149584486, + "grad_norm": 0.1955457478761673, + "learning_rate": 1.6403177115390254e-05, + "loss": 0.0115, + "step": 9934 + }, + { + "epoch": 2.621026249835114, + "grad_norm": 0.005066971760243177, + "learning_rate": 1.6402004748087577e-05, + "loss": 0.0004, + "step": 9936 + }, + { + "epoch": 2.6215538847117794, + "grad_norm": 0.012644967064261436, + "learning_rate": 1.64008323807849e-05, + "loss": 0.0002, + "step": 9938 + }, + { + "epoch": 2.622081519588445, + "grad_norm": 0.061107587069272995, + "learning_rate": 1.6399660013482226e-05, + "loss": 0.0005, + "step": 9940 + }, + { + "epoch": 2.6226091544651102, + "grad_norm": 0.023915711790323257, + "learning_rate": 1.6398487646179548e-05, + "loss": 0.0009, + "step": 9942 + }, + { + "epoch": 2.6231367893417756, + "grad_norm": 0.22851884365081787, + "learning_rate": 1.6397315278876874e-05, + "loss": 0.0008, + "step": 9944 + }, + { + "epoch": 2.6236644242184406, + "grad_norm": 0.4863438308238983, + "learning_rate": 1.6396142911574197e-05, + "loss": 0.0039, + "step": 9946 + }, + { + "epoch": 2.6241920590951064, + "grad_norm": 0.02737777680158615, + "learning_rate": 1.6394970544271523e-05, + "loss": 0.0027, + "step": 9948 + }, + { + "epoch": 2.6247196939717714, + "grad_norm": 0.044537123292684555, + "learning_rate": 1.6393798176968845e-05, + "loss": 0.0004, + "step": 9950 + }, + { + "epoch": 2.625247328848437, + "grad_norm": 0.5841182470321655, + "learning_rate": 1.6392625809666168e-05, + "loss": 0.0028, + "step": 9952 + }, + { + "epoch": 2.625774963725102, + "grad_norm": 0.026244329288601875, + "learning_rate": 1.6391453442363494e-05, + "loss": 0.0067, + "step": 9954 + }, + { + "epoch": 2.6263025986017676, + "grad_norm": 0.17140232026576996, + "learning_rate": 1.639028107506082e-05, + "loss": 0.0013, + "step": 9956 + }, + { + "epoch": 2.626830233478433, + "grad_norm": 0.017104007303714752, + "learning_rate": 1.6389108707758143e-05, + "loss": 0.0002, + "step": 9958 + }, + { + "epoch": 2.6273578683550984, + "grad_norm": 0.3238576352596283, + "learning_rate": 1.6387936340455465e-05, + "loss": 0.0007, + "step": 9960 + }, + { + "epoch": 2.627885503231764, + "grad_norm": 0.006917305290699005, + "learning_rate": 1.638676397315279e-05, + "loss": 0.0024, + "step": 9962 + }, + { + "epoch": 2.628413138108429, + "grad_norm": 0.34022966027259827, + "learning_rate": 1.6385591605850114e-05, + "loss": 0.0022, + "step": 9964 + }, + { + "epoch": 2.628940772985094, + "grad_norm": 0.022137712687253952, + "learning_rate": 1.6384419238547436e-05, + "loss": 0.0002, + "step": 9966 + }, + { + "epoch": 2.6294684078617596, + "grad_norm": 0.0022004691418260336, + "learning_rate": 1.6383246871244762e-05, + "loss": 0.0059, + "step": 9968 + }, + { + "epoch": 2.629996042738425, + "grad_norm": 0.008918183855712414, + "learning_rate": 1.638207450394209e-05, + "loss": 0.0004, + "step": 9970 + }, + { + "epoch": 2.6305236776150904, + "grad_norm": 0.08313160389661789, + "learning_rate": 1.638090213663941e-05, + "loss": 0.0044, + "step": 9972 + }, + { + "epoch": 2.631051312491756, + "grad_norm": 0.004591488745063543, + "learning_rate": 1.6379729769336734e-05, + "loss": 0.0002, + "step": 9974 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.006297009531408548, + "learning_rate": 1.637855740203406e-05, + "loss": 0.0002, + "step": 9976 + }, + { + "epoch": 2.632106582245086, + "grad_norm": 0.6015627384185791, + "learning_rate": 1.6377385034731382e-05, + "loss": 0.0141, + "step": 9978 + }, + { + "epoch": 2.6326342171217516, + "grad_norm": 0.013838340528309345, + "learning_rate": 1.6376212667428705e-05, + "loss": 0.0004, + "step": 9980 + }, + { + "epoch": 2.633161851998417, + "grad_norm": 0.5751420259475708, + "learning_rate": 1.637504030012603e-05, + "loss": 0.0047, + "step": 9982 + }, + { + "epoch": 2.6336894868750824, + "grad_norm": 0.9119733572006226, + "learning_rate": 1.6373867932823357e-05, + "loss": 0.0067, + "step": 9984 + }, + { + "epoch": 2.634217121751748, + "grad_norm": 0.005480596795678139, + "learning_rate": 1.637269556552068e-05, + "loss": 0.0011, + "step": 9986 + }, + { + "epoch": 2.6347447566284132, + "grad_norm": 0.11044011265039444, + "learning_rate": 1.6371523198218002e-05, + "loss": 0.0007, + "step": 9988 + }, + { + "epoch": 2.6352723915050786, + "grad_norm": 0.014372819103300571, + "learning_rate": 1.6370350830915328e-05, + "loss": 0.0002, + "step": 9990 + }, + { + "epoch": 2.6358000263817436, + "grad_norm": 0.16572780907154083, + "learning_rate": 1.636917846361265e-05, + "loss": 0.0028, + "step": 9992 + }, + { + "epoch": 2.6363276612584094, + "grad_norm": 0.15653656423091888, + "learning_rate": 1.6368006096309977e-05, + "loss": 0.001, + "step": 9994 + }, + { + "epoch": 2.6368552961350744, + "grad_norm": 0.006345195230096579, + "learning_rate": 1.63668337290073e-05, + "loss": 0.0017, + "step": 9996 + }, + { + "epoch": 2.63738293101174, + "grad_norm": 0.12379516661167145, + "learning_rate": 1.6365661361704622e-05, + "loss": 0.0009, + "step": 9998 + }, + { + "epoch": 2.6379105658884052, + "grad_norm": 0.01540498249232769, + "learning_rate": 1.6364488994401948e-05, + "loss": 0.0011, + "step": 10000 + }, + { + "epoch": 2.6379105658884052, + "eval_loss": 0.003012891625985503, + "eval_runtime": 304.8984, + "eval_samples_per_second": 707.255, + "eval_steps_per_second": 88.41, + "step": 10000 + }, + { + "epoch": 2.6384382007650706, + "grad_norm": 0.040459901094436646, + "learning_rate": 1.636331662709927e-05, + "loss": 0.0007, + "step": 10002 + }, + { + "epoch": 2.638965835641736, + "grad_norm": 0.00928921066224575, + "learning_rate": 1.6362144259796597e-05, + "loss": 0.0004, + "step": 10004 + }, + { + "epoch": 2.639493470518401, + "grad_norm": 0.9446401596069336, + "learning_rate": 1.636097189249392e-05, + "loss": 0.0095, + "step": 10006 + }, + { + "epoch": 2.640021105395067, + "grad_norm": 0.004121256526559591, + "learning_rate": 1.6359799525191245e-05, + "loss": 0.0062, + "step": 10008 + }, + { + "epoch": 2.640548740271732, + "grad_norm": 0.14831162989139557, + "learning_rate": 1.6358627157888568e-05, + "loss": 0.0087, + "step": 10010 + }, + { + "epoch": 2.641076375148397, + "grad_norm": 0.006429074797779322, + "learning_rate": 1.635745479058589e-05, + "loss": 0.0002, + "step": 10012 + }, + { + "epoch": 2.6416040100250626, + "grad_norm": 0.008413128554821014, + "learning_rate": 1.6356282423283216e-05, + "loss": 0.0033, + "step": 10014 + }, + { + "epoch": 2.642131644901728, + "grad_norm": 0.4481828212738037, + "learning_rate": 1.6355110055980542e-05, + "loss": 0.0059, + "step": 10016 + }, + { + "epoch": 2.6426592797783934, + "grad_norm": 0.073452427983284, + "learning_rate": 1.6353937688677865e-05, + "loss": 0.0082, + "step": 10018 + }, + { + "epoch": 2.643186914655059, + "grad_norm": 0.00503569096326828, + "learning_rate": 1.6352765321375188e-05, + "loss": 0.0026, + "step": 10020 + }, + { + "epoch": 2.6437145495317242, + "grad_norm": 0.010714642703533173, + "learning_rate": 1.6351592954072514e-05, + "loss": 0.0005, + "step": 10022 + }, + { + "epoch": 2.644242184408389, + "grad_norm": 0.011665343306958675, + "learning_rate": 1.6350420586769836e-05, + "loss": 0.0003, + "step": 10024 + }, + { + "epoch": 2.6447698192850546, + "grad_norm": 0.6757835745811462, + "learning_rate": 1.634924821946716e-05, + "loss": 0.0038, + "step": 10026 + }, + { + "epoch": 2.64529745416172, + "grad_norm": 0.6998052000999451, + "learning_rate": 1.6348075852164485e-05, + "loss": 0.0082, + "step": 10028 + }, + { + "epoch": 2.6458250890383854, + "grad_norm": 0.10739561170339584, + "learning_rate": 1.634690348486181e-05, + "loss": 0.0011, + "step": 10030 + }, + { + "epoch": 2.646352723915051, + "grad_norm": 0.009563351050019264, + "learning_rate": 1.634573111755913e-05, + "loss": 0.0004, + "step": 10032 + }, + { + "epoch": 2.6468803587917162, + "grad_norm": 0.055194444954395294, + "learning_rate": 1.6344558750256456e-05, + "loss": 0.002, + "step": 10034 + }, + { + "epoch": 2.6474079936683816, + "grad_norm": 0.022763976827263832, + "learning_rate": 1.6343386382953782e-05, + "loss": 0.0005, + "step": 10036 + }, + { + "epoch": 2.6479356285450466, + "grad_norm": 0.01972128450870514, + "learning_rate": 1.6342214015651105e-05, + "loss": 0.001, + "step": 10038 + }, + { + "epoch": 2.648463263421712, + "grad_norm": 0.024166271090507507, + "learning_rate": 1.6341041648348427e-05, + "loss": 0.0022, + "step": 10040 + }, + { + "epoch": 2.6489908982983774, + "grad_norm": 0.010368158109486103, + "learning_rate": 1.6339869281045753e-05, + "loss": 0.0004, + "step": 10042 + }, + { + "epoch": 2.649518533175043, + "grad_norm": 0.011331546120345592, + "learning_rate": 1.6338696913743076e-05, + "loss": 0.0008, + "step": 10044 + }, + { + "epoch": 2.6500461680517082, + "grad_norm": 0.01633613370358944, + "learning_rate": 1.6337524546440402e-05, + "loss": 0.0004, + "step": 10046 + }, + { + "epoch": 2.6505738029283736, + "grad_norm": 0.1244087889790535, + "learning_rate": 1.6336352179137724e-05, + "loss": 0.0023, + "step": 10048 + }, + { + "epoch": 2.651101437805039, + "grad_norm": 0.02273700200021267, + "learning_rate": 1.633517981183505e-05, + "loss": 0.0052, + "step": 10050 + }, + { + "epoch": 2.651629072681704, + "grad_norm": 0.002857713960111141, + "learning_rate": 1.6334007444532373e-05, + "loss": 0.0035, + "step": 10052 + }, + { + "epoch": 2.65215670755837, + "grad_norm": 0.01920654997229576, + "learning_rate": 1.63328350772297e-05, + "loss": 0.0002, + "step": 10054 + }, + { + "epoch": 2.652684342435035, + "grad_norm": 0.070858895778656, + "learning_rate": 1.633166270992702e-05, + "loss": 0.0053, + "step": 10056 + }, + { + "epoch": 2.6532119773117, + "grad_norm": 0.010704652406275272, + "learning_rate": 1.6330490342624344e-05, + "loss": 0.0003, + "step": 10058 + }, + { + "epoch": 2.6537396121883656, + "grad_norm": 0.009833483956754208, + "learning_rate": 1.632931797532167e-05, + "loss": 0.0002, + "step": 10060 + }, + { + "epoch": 2.654267247065031, + "grad_norm": 0.011340968310832977, + "learning_rate": 1.6328145608018993e-05, + "loss": 0.0002, + "step": 10062 + }, + { + "epoch": 2.6547948819416964, + "grad_norm": 0.26045575737953186, + "learning_rate": 1.632697324071632e-05, + "loss": 0.0004, + "step": 10064 + }, + { + "epoch": 2.655322516818362, + "grad_norm": 0.020322537049651146, + "learning_rate": 1.632580087341364e-05, + "loss": 0.0003, + "step": 10066 + }, + { + "epoch": 2.6558501516950273, + "grad_norm": 0.00863407738506794, + "learning_rate": 1.6324628506110967e-05, + "loss": 0.0019, + "step": 10068 + }, + { + "epoch": 2.656377786571692, + "grad_norm": 0.0037915457505732775, + "learning_rate": 1.632345613880829e-05, + "loss": 0.0001, + "step": 10070 + }, + { + "epoch": 2.6569054214483576, + "grad_norm": 0.008761012926697731, + "learning_rate": 1.6322283771505613e-05, + "loss": 0.0003, + "step": 10072 + }, + { + "epoch": 2.657433056325023, + "grad_norm": 0.13541939854621887, + "learning_rate": 1.632111140420294e-05, + "loss": 0.0142, + "step": 10074 + }, + { + "epoch": 2.6579606912016884, + "grad_norm": 0.3594376742839813, + "learning_rate": 1.6319939036900265e-05, + "loss": 0.0079, + "step": 10076 + }, + { + "epoch": 2.658488326078354, + "grad_norm": 0.012882583774626255, + "learning_rate": 1.6318766669597584e-05, + "loss": 0.0026, + "step": 10078 + }, + { + "epoch": 2.6590159609550192, + "grad_norm": 0.0031964373774826527, + "learning_rate": 1.631759430229491e-05, + "loss": 0.0001, + "step": 10080 + }, + { + "epoch": 2.6595435958316846, + "grad_norm": 0.024077754467725754, + "learning_rate": 1.6316421934992236e-05, + "loss": 0.0003, + "step": 10082 + }, + { + "epoch": 2.6600712307083496, + "grad_norm": 0.16193130612373352, + "learning_rate": 1.631524956768956e-05, + "loss": 0.0007, + "step": 10084 + }, + { + "epoch": 2.660598865585015, + "grad_norm": 0.020724685862660408, + "learning_rate": 1.631407720038688e-05, + "loss": 0.0004, + "step": 10086 + }, + { + "epoch": 2.6611265004616804, + "grad_norm": 0.049928389489650726, + "learning_rate": 1.6312904833084207e-05, + "loss": 0.0005, + "step": 10088 + }, + { + "epoch": 2.661654135338346, + "grad_norm": 0.011409396305680275, + "learning_rate": 1.631173246578153e-05, + "loss": 0.0006, + "step": 10090 + }, + { + "epoch": 2.6621817702150112, + "grad_norm": 0.041459888219833374, + "learning_rate": 1.6310560098478852e-05, + "loss": 0.0032, + "step": 10092 + }, + { + "epoch": 2.6627094050916766, + "grad_norm": 0.06831348687410355, + "learning_rate": 1.630938773117618e-05, + "loss": 0.0007, + "step": 10094 + }, + { + "epoch": 2.663237039968342, + "grad_norm": 0.03501872345805168, + "learning_rate": 1.6308215363873504e-05, + "loss": 0.0014, + "step": 10096 + }, + { + "epoch": 2.663764674845007, + "grad_norm": 0.005276770796626806, + "learning_rate": 1.6307042996570827e-05, + "loss": 0.0002, + "step": 10098 + }, + { + "epoch": 2.664292309721673, + "grad_norm": 0.00911620445549488, + "learning_rate": 1.630587062926815e-05, + "loss": 0.0002, + "step": 10100 + }, + { + "epoch": 2.664819944598338, + "grad_norm": 0.10924285650253296, + "learning_rate": 1.6304698261965476e-05, + "loss": 0.0008, + "step": 10102 + }, + { + "epoch": 2.6653475794750032, + "grad_norm": 0.0018379648681730032, + "learning_rate": 1.6303525894662798e-05, + "loss": 0.001, + "step": 10104 + }, + { + "epoch": 2.6658752143516686, + "grad_norm": 0.026479706168174744, + "learning_rate": 1.6302353527360124e-05, + "loss": 0.0002, + "step": 10106 + }, + { + "epoch": 2.666402849228334, + "grad_norm": 0.13763563334941864, + "learning_rate": 1.6301181160057447e-05, + "loss": 0.0049, + "step": 10108 + }, + { + "epoch": 2.6669304841049994, + "grad_norm": 0.29197925329208374, + "learning_rate": 1.6300008792754773e-05, + "loss": 0.0095, + "step": 10110 + }, + { + "epoch": 2.667458118981665, + "grad_norm": 0.0057456353679299355, + "learning_rate": 1.6298836425452095e-05, + "loss": 0.0003, + "step": 10112 + }, + { + "epoch": 2.6679857538583303, + "grad_norm": 0.012873589992523193, + "learning_rate": 1.629766405814942e-05, + "loss": 0.0003, + "step": 10114 + }, + { + "epoch": 2.668513388734995, + "grad_norm": 0.20723286271095276, + "learning_rate": 1.6296491690846744e-05, + "loss": 0.0038, + "step": 10116 + }, + { + "epoch": 2.6690410236116606, + "grad_norm": 0.023152600973844528, + "learning_rate": 1.6295319323544067e-05, + "loss": 0.0006, + "step": 10118 + }, + { + "epoch": 2.669568658488326, + "grad_norm": 0.01455459464341402, + "learning_rate": 1.6294146956241393e-05, + "loss": 0.0004, + "step": 10120 + }, + { + "epoch": 2.6700962933649914, + "grad_norm": 0.1119035929441452, + "learning_rate": 1.6292974588938715e-05, + "loss": 0.0005, + "step": 10122 + }, + { + "epoch": 2.670623928241657, + "grad_norm": 0.011391349136829376, + "learning_rate": 1.6291802221636038e-05, + "loss": 0.0003, + "step": 10124 + }, + { + "epoch": 2.6711515631183222, + "grad_norm": 0.014707144349813461, + "learning_rate": 1.6290629854333364e-05, + "loss": 0.0003, + "step": 10126 + }, + { + "epoch": 2.6716791979949877, + "grad_norm": 0.01622041128575802, + "learning_rate": 1.628945748703069e-05, + "loss": 0.006, + "step": 10128 + }, + { + "epoch": 2.6722068328716526, + "grad_norm": 0.039411697536706924, + "learning_rate": 1.6288285119728012e-05, + "loss": 0.0007, + "step": 10130 + }, + { + "epoch": 2.672734467748318, + "grad_norm": 0.5889595150947571, + "learning_rate": 1.6287112752425335e-05, + "loss": 0.0065, + "step": 10132 + }, + { + "epoch": 2.6732621026249834, + "grad_norm": 0.07541930675506592, + "learning_rate": 1.628594038512266e-05, + "loss": 0.001, + "step": 10134 + }, + { + "epoch": 2.673789737501649, + "grad_norm": 0.29502058029174805, + "learning_rate": 1.6284768017819984e-05, + "loss": 0.0061, + "step": 10136 + }, + { + "epoch": 2.6743173723783142, + "grad_norm": 0.0662885457277298, + "learning_rate": 1.6283595650517306e-05, + "loss": 0.0007, + "step": 10138 + }, + { + "epoch": 2.6748450072549796, + "grad_norm": 0.08191285282373428, + "learning_rate": 1.6282423283214632e-05, + "loss": 0.0008, + "step": 10140 + }, + { + "epoch": 2.675372642131645, + "grad_norm": 0.02166077494621277, + "learning_rate": 1.6281250915911958e-05, + "loss": 0.0066, + "step": 10142 + }, + { + "epoch": 2.67590027700831, + "grad_norm": 0.10464536398649216, + "learning_rate": 1.628007854860928e-05, + "loss": 0.0004, + "step": 10144 + }, + { + "epoch": 2.676427911884976, + "grad_norm": 0.35096871852874756, + "learning_rate": 1.6278906181306603e-05, + "loss": 0.0107, + "step": 10146 + }, + { + "epoch": 2.676955546761641, + "grad_norm": 0.1145530715584755, + "learning_rate": 1.627773381400393e-05, + "loss": 0.001, + "step": 10148 + }, + { + "epoch": 2.6774831816383062, + "grad_norm": 0.02205895446240902, + "learning_rate": 1.6276561446701252e-05, + "loss": 0.0019, + "step": 10150 + }, + { + "epoch": 2.6780108165149716, + "grad_norm": 0.0519212931394577, + "learning_rate": 1.6275389079398575e-05, + "loss": 0.003, + "step": 10152 + }, + { + "epoch": 2.678538451391637, + "grad_norm": 0.4521586000919342, + "learning_rate": 1.62742167120959e-05, + "loss": 0.0072, + "step": 10154 + }, + { + "epoch": 2.6790660862683024, + "grad_norm": 0.026832319796085358, + "learning_rate": 1.6273044344793227e-05, + "loss": 0.0006, + "step": 10156 + }, + { + "epoch": 2.6795937211449674, + "grad_norm": 0.0217572133988142, + "learning_rate": 1.627187197749055e-05, + "loss": 0.0032, + "step": 10158 + }, + { + "epoch": 2.6801213560216333, + "grad_norm": 0.5107687711715698, + "learning_rate": 1.6270699610187872e-05, + "loss": 0.0043, + "step": 10160 + }, + { + "epoch": 2.6806489908982982, + "grad_norm": 0.02537829801440239, + "learning_rate": 1.6269527242885198e-05, + "loss": 0.0029, + "step": 10162 + }, + { + "epoch": 2.6811766257749636, + "grad_norm": 0.015179174952208996, + "learning_rate": 1.626835487558252e-05, + "loss": 0.0003, + "step": 10164 + }, + { + "epoch": 2.681704260651629, + "grad_norm": 0.005159683991223574, + "learning_rate": 1.6267182508279847e-05, + "loss": 0.0002, + "step": 10166 + }, + { + "epoch": 2.6822318955282944, + "grad_norm": 0.004316482227295637, + "learning_rate": 1.626601014097717e-05, + "loss": 0.0002, + "step": 10168 + }, + { + "epoch": 2.68275953040496, + "grad_norm": 0.037827711552381516, + "learning_rate": 1.6264837773674492e-05, + "loss": 0.0006, + "step": 10170 + }, + { + "epoch": 2.6832871652816253, + "grad_norm": 0.005982946138828993, + "learning_rate": 1.6263665406371818e-05, + "loss": 0.0006, + "step": 10172 + }, + { + "epoch": 2.6838148001582907, + "grad_norm": 0.24059918522834778, + "learning_rate": 1.6262493039069144e-05, + "loss": 0.0047, + "step": 10174 + }, + { + "epoch": 2.6843424350349556, + "grad_norm": 0.21524173021316528, + "learning_rate": 1.6261320671766466e-05, + "loss": 0.0008, + "step": 10176 + }, + { + "epoch": 2.684870069911621, + "grad_norm": 0.42408064007759094, + "learning_rate": 1.626014830446379e-05, + "loss": 0.0012, + "step": 10178 + }, + { + "epoch": 2.6853977047882864, + "grad_norm": 0.027813291177153587, + "learning_rate": 1.6258975937161115e-05, + "loss": 0.0008, + "step": 10180 + }, + { + "epoch": 2.685925339664952, + "grad_norm": 0.029931776225566864, + "learning_rate": 1.6257803569858438e-05, + "loss": 0.0004, + "step": 10182 + }, + { + "epoch": 2.6864529745416172, + "grad_norm": 0.22920307517051697, + "learning_rate": 1.625663120255576e-05, + "loss": 0.0039, + "step": 10184 + }, + { + "epoch": 2.6869806094182827, + "grad_norm": 0.26193779706954956, + "learning_rate": 1.6255458835253086e-05, + "loss": 0.0039, + "step": 10186 + }, + { + "epoch": 2.687508244294948, + "grad_norm": 0.018776025623083115, + "learning_rate": 1.6254286467950412e-05, + "loss": 0.0002, + "step": 10188 + }, + { + "epoch": 2.688035879171613, + "grad_norm": 0.08109041303396225, + "learning_rate": 1.6253114100647735e-05, + "loss": 0.0005, + "step": 10190 + }, + { + "epoch": 2.688563514048279, + "grad_norm": 0.07377723604440689, + "learning_rate": 1.6251941733345057e-05, + "loss": 0.0064, + "step": 10192 + }, + { + "epoch": 2.689091148924944, + "grad_norm": 0.2626207172870636, + "learning_rate": 1.6250769366042383e-05, + "loss": 0.0031, + "step": 10194 + }, + { + "epoch": 2.6896187838016092, + "grad_norm": 0.06434417515993118, + "learning_rate": 1.6249596998739706e-05, + "loss": 0.0025, + "step": 10196 + }, + { + "epoch": 2.6901464186782746, + "grad_norm": 0.011479714885354042, + "learning_rate": 1.624842463143703e-05, + "loss": 0.0021, + "step": 10198 + }, + { + "epoch": 2.69067405355494, + "grad_norm": 0.1808626353740692, + "learning_rate": 1.6247252264134355e-05, + "loss": 0.0008, + "step": 10200 + }, + { + "epoch": 2.6912016884316055, + "grad_norm": 0.21125920116901398, + "learning_rate": 1.624607989683168e-05, + "loss": 0.0012, + "step": 10202 + }, + { + "epoch": 2.6917293233082704, + "grad_norm": 0.006954110227525234, + "learning_rate": 1.6244907529529003e-05, + "loss": 0.0015, + "step": 10204 + }, + { + "epoch": 2.6922569581849363, + "grad_norm": 0.023724660277366638, + "learning_rate": 1.6243735162226326e-05, + "loss": 0.0002, + "step": 10206 + }, + { + "epoch": 2.6927845930616012, + "grad_norm": 0.01998404785990715, + "learning_rate": 1.6242562794923652e-05, + "loss": 0.0002, + "step": 10208 + }, + { + "epoch": 2.6933122279382666, + "grad_norm": 0.01249097939580679, + "learning_rate": 1.6241390427620974e-05, + "loss": 0.0002, + "step": 10210 + }, + { + "epoch": 2.693839862814932, + "grad_norm": 0.00532314321026206, + "learning_rate": 1.6240218060318297e-05, + "loss": 0.0021, + "step": 10212 + }, + { + "epoch": 2.6943674976915974, + "grad_norm": 0.008576758205890656, + "learning_rate": 1.6239045693015623e-05, + "loss": 0.0001, + "step": 10214 + }, + { + "epoch": 2.694895132568263, + "grad_norm": 0.37752214074134827, + "learning_rate": 1.6237873325712946e-05, + "loss": 0.0026, + "step": 10216 + }, + { + "epoch": 2.6954227674449283, + "grad_norm": 0.0719999298453331, + "learning_rate": 1.623670095841027e-05, + "loss": 0.0034, + "step": 10218 + }, + { + "epoch": 2.6959504023215937, + "grad_norm": 0.0043886820785701275, + "learning_rate": 1.6235528591107594e-05, + "loss": 0.0002, + "step": 10220 + }, + { + "epoch": 2.6964780371982586, + "grad_norm": 0.0025643054395914078, + "learning_rate": 1.623435622380492e-05, + "loss": 0.0001, + "step": 10222 + }, + { + "epoch": 2.697005672074924, + "grad_norm": 0.0026266088243573904, + "learning_rate": 1.6233183856502243e-05, + "loss": 0.0076, + "step": 10224 + }, + { + "epoch": 2.6975333069515894, + "grad_norm": 0.028669361025094986, + "learning_rate": 1.623201148919957e-05, + "loss": 0.0001, + "step": 10226 + }, + { + "epoch": 2.698060941828255, + "grad_norm": 0.004915098659694195, + "learning_rate": 1.623083912189689e-05, + "loss": 0.0002, + "step": 10228 + }, + { + "epoch": 2.6985885767049202, + "grad_norm": 0.20986366271972656, + "learning_rate": 1.6229666754594214e-05, + "loss": 0.007, + "step": 10230 + }, + { + "epoch": 2.6991162115815857, + "grad_norm": 0.2567131221294403, + "learning_rate": 1.622849438729154e-05, + "loss": 0.0017, + "step": 10232 + }, + { + "epoch": 2.699643846458251, + "grad_norm": 0.5779173374176025, + "learning_rate": 1.6227322019988866e-05, + "loss": 0.0104, + "step": 10234 + }, + { + "epoch": 2.700171481334916, + "grad_norm": 0.029341069981455803, + "learning_rate": 1.622614965268619e-05, + "loss": 0.0004, + "step": 10236 + }, + { + "epoch": 2.7006991162115814, + "grad_norm": 0.024440644308924675, + "learning_rate": 1.622497728538351e-05, + "loss": 0.0003, + "step": 10238 + }, + { + "epoch": 2.701226751088247, + "grad_norm": 0.1657402515411377, + "learning_rate": 1.6223804918080837e-05, + "loss": 0.003, + "step": 10240 + }, + { + "epoch": 2.7017543859649122, + "grad_norm": 0.11054570227861404, + "learning_rate": 1.622263255077816e-05, + "loss": 0.0099, + "step": 10242 + }, + { + "epoch": 2.7022820208415776, + "grad_norm": 0.04331626370549202, + "learning_rate": 1.6221460183475482e-05, + "loss": 0.0011, + "step": 10244 + }, + { + "epoch": 2.702809655718243, + "grad_norm": 0.2792474031448364, + "learning_rate": 1.622028781617281e-05, + "loss": 0.0045, + "step": 10246 + }, + { + "epoch": 2.7033372905949085, + "grad_norm": 0.39037200808525085, + "learning_rate": 1.6219115448870134e-05, + "loss": 0.0047, + "step": 10248 + }, + { + "epoch": 2.7038649254715734, + "grad_norm": 0.1614951342344284, + "learning_rate": 1.6217943081567457e-05, + "loss": 0.0009, + "step": 10250 + }, + { + "epoch": 2.7043925603482393, + "grad_norm": 0.38243216276168823, + "learning_rate": 1.621677071426478e-05, + "loss": 0.0042, + "step": 10252 + }, + { + "epoch": 2.7049201952249042, + "grad_norm": 0.09769252687692642, + "learning_rate": 1.6215598346962106e-05, + "loss": 0.0006, + "step": 10254 + }, + { + "epoch": 2.7054478301015696, + "grad_norm": 0.013055678457021713, + "learning_rate": 1.621442597965943e-05, + "loss": 0.0002, + "step": 10256 + }, + { + "epoch": 2.705975464978235, + "grad_norm": 0.005415745545178652, + "learning_rate": 1.621325361235675e-05, + "loss": 0.0001, + "step": 10258 + }, + { + "epoch": 2.7065030998549005, + "grad_norm": 0.01072960440069437, + "learning_rate": 1.6212081245054077e-05, + "loss": 0.0002, + "step": 10260 + }, + { + "epoch": 2.707030734731566, + "grad_norm": 0.0034734054934233427, + "learning_rate": 1.62109088777514e-05, + "loss": 0.0001, + "step": 10262 + }, + { + "epoch": 2.7075583696082313, + "grad_norm": 0.0229923315346241, + "learning_rate": 1.6209736510448726e-05, + "loss": 0.0003, + "step": 10264 + }, + { + "epoch": 2.7080860044848967, + "grad_norm": 0.5536501407623291, + "learning_rate": 1.6208564143146048e-05, + "loss": 0.0041, + "step": 10266 + }, + { + "epoch": 2.7086136393615616, + "grad_norm": 0.0037044298369437456, + "learning_rate": 1.6207391775843374e-05, + "loss": 0.0001, + "step": 10268 + }, + { + "epoch": 2.709141274238227, + "grad_norm": 0.01903834380209446, + "learning_rate": 1.6206219408540697e-05, + "loss": 0.0002, + "step": 10270 + }, + { + "epoch": 2.7096689091148924, + "grad_norm": 0.438747376203537, + "learning_rate": 1.6205047041238023e-05, + "loss": 0.0095, + "step": 10272 + }, + { + "epoch": 2.710196543991558, + "grad_norm": 0.0027053984813392162, + "learning_rate": 1.6203874673935345e-05, + "loss": 0.0001, + "step": 10274 + }, + { + "epoch": 2.7107241788682233, + "grad_norm": 0.4959735572338104, + "learning_rate": 1.6202702306632668e-05, + "loss": 0.0053, + "step": 10276 + }, + { + "epoch": 2.7112518137448887, + "grad_norm": 0.2716568112373352, + "learning_rate": 1.6201529939329994e-05, + "loss": 0.0019, + "step": 10278 + }, + { + "epoch": 2.711779448621554, + "grad_norm": 0.042891975492239, + "learning_rate": 1.6200357572027317e-05, + "loss": 0.0005, + "step": 10280 + }, + { + "epoch": 2.712307083498219, + "grad_norm": 0.14142274856567383, + "learning_rate": 1.6199185204724643e-05, + "loss": 0.0019, + "step": 10282 + }, + { + "epoch": 2.7128347183748844, + "grad_norm": 0.017956344410777092, + "learning_rate": 1.6198012837421965e-05, + "loss": 0.001, + "step": 10284 + }, + { + "epoch": 2.71336235325155, + "grad_norm": 0.011953428387641907, + "learning_rate": 1.619684047011929e-05, + "loss": 0.0003, + "step": 10286 + }, + { + "epoch": 2.7138899881282152, + "grad_norm": 0.01387231145054102, + "learning_rate": 1.6195668102816614e-05, + "loss": 0.0004, + "step": 10288 + }, + { + "epoch": 2.7144176230048807, + "grad_norm": 0.05084547400474548, + "learning_rate": 1.6194495735513936e-05, + "loss": 0.0027, + "step": 10290 + }, + { + "epoch": 2.714945257881546, + "grad_norm": 0.13179543614387512, + "learning_rate": 1.6193323368211262e-05, + "loss": 0.0008, + "step": 10292 + }, + { + "epoch": 2.7154728927582115, + "grad_norm": 0.007650289684534073, + "learning_rate": 1.619215100090859e-05, + "loss": 0.0007, + "step": 10294 + }, + { + "epoch": 2.7160005276348764, + "grad_norm": 0.0030638708267360926, + "learning_rate": 1.619097863360591e-05, + "loss": 0.0002, + "step": 10296 + }, + { + "epoch": 2.7165281625115423, + "grad_norm": 0.015649886801838875, + "learning_rate": 1.6189806266303234e-05, + "loss": 0.0002, + "step": 10298 + }, + { + "epoch": 2.7170557973882072, + "grad_norm": 0.003158345352858305, + "learning_rate": 1.618863389900056e-05, + "loss": 0.0001, + "step": 10300 + }, + { + "epoch": 2.7175834322648726, + "grad_norm": 0.1383063793182373, + "learning_rate": 1.6187461531697882e-05, + "loss": 0.0072, + "step": 10302 + }, + { + "epoch": 2.718111067141538, + "grad_norm": 0.05545736476778984, + "learning_rate": 1.6186289164395205e-05, + "loss": 0.0006, + "step": 10304 + }, + { + "epoch": 2.7186387020182035, + "grad_norm": 0.007084875367581844, + "learning_rate": 1.618511679709253e-05, + "loss": 0.0055, + "step": 10306 + }, + { + "epoch": 2.719166336894869, + "grad_norm": 0.27538004517555237, + "learning_rate": 1.6183944429789857e-05, + "loss": 0.0089, + "step": 10308 + }, + { + "epoch": 2.719693971771534, + "grad_norm": 0.13659940659999847, + "learning_rate": 1.6182772062487176e-05, + "loss": 0.0072, + "step": 10310 + }, + { + "epoch": 2.7202216066481997, + "grad_norm": 0.003937129396945238, + "learning_rate": 1.6181599695184502e-05, + "loss": 0.0001, + "step": 10312 + }, + { + "epoch": 2.7207492415248646, + "grad_norm": 0.010266292840242386, + "learning_rate": 1.6180427327881828e-05, + "loss": 0.0003, + "step": 10314 + }, + { + "epoch": 2.72127687640153, + "grad_norm": 0.0061270506121218204, + "learning_rate": 1.617925496057915e-05, + "loss": 0.0035, + "step": 10316 + }, + { + "epoch": 2.7218045112781954, + "grad_norm": 0.012609126046299934, + "learning_rate": 1.6178082593276473e-05, + "loss": 0.0002, + "step": 10318 + }, + { + "epoch": 2.722332146154861, + "grad_norm": 0.03763534501194954, + "learning_rate": 1.61769102259738e-05, + "loss": 0.0007, + "step": 10320 + }, + { + "epoch": 2.7228597810315263, + "grad_norm": 0.03857172653079033, + "learning_rate": 1.6175737858671122e-05, + "loss": 0.0006, + "step": 10322 + }, + { + "epoch": 2.7233874159081917, + "grad_norm": 0.11698652058839798, + "learning_rate": 1.6174565491368448e-05, + "loss": 0.0005, + "step": 10324 + }, + { + "epoch": 2.723915050784857, + "grad_norm": 0.026443185284733772, + "learning_rate": 1.617339312406577e-05, + "loss": 0.0007, + "step": 10326 + }, + { + "epoch": 2.724442685661522, + "grad_norm": 0.009037015028297901, + "learning_rate": 1.6172220756763096e-05, + "loss": 0.0002, + "step": 10328 + }, + { + "epoch": 2.7249703205381874, + "grad_norm": 0.024594329297542572, + "learning_rate": 1.617104838946042e-05, + "loss": 0.0002, + "step": 10330 + }, + { + "epoch": 2.725497955414853, + "grad_norm": 0.004508471116423607, + "learning_rate": 1.6169876022157745e-05, + "loss": 0.0005, + "step": 10332 + }, + { + "epoch": 2.7260255902915183, + "grad_norm": 0.19159966707229614, + "learning_rate": 1.6168703654855068e-05, + "loss": 0.0041, + "step": 10334 + }, + { + "epoch": 2.7265532251681837, + "grad_norm": 0.11250832676887512, + "learning_rate": 1.616753128755239e-05, + "loss": 0.0026, + "step": 10336 + }, + { + "epoch": 2.727080860044849, + "grad_norm": 0.08989267796278, + "learning_rate": 1.6166358920249716e-05, + "loss": 0.0008, + "step": 10338 + }, + { + "epoch": 2.7276084949215145, + "grad_norm": 0.035604219883680344, + "learning_rate": 1.616518655294704e-05, + "loss": 0.0003, + "step": 10340 + }, + { + "epoch": 2.7281361297981794, + "grad_norm": 0.15006737411022186, + "learning_rate": 1.6164014185644365e-05, + "loss": 0.0058, + "step": 10342 + }, + { + "epoch": 2.7286637646748453, + "grad_norm": 0.6424149870872498, + "learning_rate": 1.6162841818341688e-05, + "loss": 0.0059, + "step": 10344 + }, + { + "epoch": 2.7291913995515102, + "grad_norm": 0.0375511534512043, + "learning_rate": 1.6161669451039014e-05, + "loss": 0.0003, + "step": 10346 + }, + { + "epoch": 2.7297190344281757, + "grad_norm": 0.008648638613522053, + "learning_rate": 1.6160497083736336e-05, + "loss": 0.0003, + "step": 10348 + }, + { + "epoch": 2.730246669304841, + "grad_norm": 0.0036959433928132057, + "learning_rate": 1.615932471643366e-05, + "loss": 0.0002, + "step": 10350 + }, + { + "epoch": 2.7307743041815065, + "grad_norm": 0.11710254848003387, + "learning_rate": 1.6158152349130985e-05, + "loss": 0.0006, + "step": 10352 + }, + { + "epoch": 2.731301939058172, + "grad_norm": 0.4867843687534332, + "learning_rate": 1.615697998182831e-05, + "loss": 0.0024, + "step": 10354 + }, + { + "epoch": 2.731829573934837, + "grad_norm": 0.12093371152877808, + "learning_rate": 1.615580761452563e-05, + "loss": 0.0033, + "step": 10356 + }, + { + "epoch": 2.7323572088115027, + "grad_norm": 0.009218224324285984, + "learning_rate": 1.6154635247222956e-05, + "loss": 0.0001, + "step": 10358 + }, + { + "epoch": 2.7328848436881676, + "grad_norm": 0.007711457554250956, + "learning_rate": 1.6153462879920282e-05, + "loss": 0.0002, + "step": 10360 + }, + { + "epoch": 2.733412478564833, + "grad_norm": 0.0020836717449128628, + "learning_rate": 1.6152290512617605e-05, + "loss": 0.0001, + "step": 10362 + }, + { + "epoch": 2.7339401134414985, + "grad_norm": 0.01181034930050373, + "learning_rate": 1.6151118145314927e-05, + "loss": 0.0002, + "step": 10364 + }, + { + "epoch": 2.734467748318164, + "grad_norm": 0.299978107213974, + "learning_rate": 1.6149945778012253e-05, + "loss": 0.0044, + "step": 10366 + }, + { + "epoch": 2.7349953831948293, + "grad_norm": 0.1510164737701416, + "learning_rate": 1.6148773410709576e-05, + "loss": 0.0087, + "step": 10368 + }, + { + "epoch": 2.7355230180714947, + "grad_norm": 0.04417930170893669, + "learning_rate": 1.61476010434069e-05, + "loss": 0.0004, + "step": 10370 + }, + { + "epoch": 2.73605065294816, + "grad_norm": 0.36642682552337646, + "learning_rate": 1.6146428676104224e-05, + "loss": 0.007, + "step": 10372 + }, + { + "epoch": 2.736578287824825, + "grad_norm": 0.25396209955215454, + "learning_rate": 1.614525630880155e-05, + "loss": 0.003, + "step": 10374 + }, + { + "epoch": 2.7371059227014904, + "grad_norm": 0.03319845721125603, + "learning_rate": 1.6144083941498873e-05, + "loss": 0.0008, + "step": 10376 + }, + { + "epoch": 2.737633557578156, + "grad_norm": 0.08722386509180069, + "learning_rate": 1.6142911574196196e-05, + "loss": 0.0005, + "step": 10378 + }, + { + "epoch": 2.7381611924548213, + "grad_norm": 0.04443521797657013, + "learning_rate": 1.614173920689352e-05, + "loss": 0.0007, + "step": 10380 + }, + { + "epoch": 2.7386888273314867, + "grad_norm": 0.1583508849143982, + "learning_rate": 1.6140566839590844e-05, + "loss": 0.0011, + "step": 10382 + }, + { + "epoch": 2.739216462208152, + "grad_norm": 0.10489816218614578, + "learning_rate": 1.613939447228817e-05, + "loss": 0.0103, + "step": 10384 + }, + { + "epoch": 2.7397440970848175, + "grad_norm": 0.16231128573417664, + "learning_rate": 1.6138222104985493e-05, + "loss": 0.003, + "step": 10386 + }, + { + "epoch": 2.7402717319614824, + "grad_norm": 0.14324724674224854, + "learning_rate": 1.613704973768282e-05, + "loss": 0.0031, + "step": 10388 + }, + { + "epoch": 2.740799366838148, + "grad_norm": 0.42572712898254395, + "learning_rate": 1.613587737038014e-05, + "loss": 0.0069, + "step": 10390 + }, + { + "epoch": 2.7413270017148132, + "grad_norm": 0.07803104817867279, + "learning_rate": 1.6134705003077467e-05, + "loss": 0.0007, + "step": 10392 + }, + { + "epoch": 2.7418546365914787, + "grad_norm": 0.017962772399187088, + "learning_rate": 1.613353263577479e-05, + "loss": 0.0019, + "step": 10394 + }, + { + "epoch": 2.742382271468144, + "grad_norm": 0.010436495766043663, + "learning_rate": 1.6132360268472113e-05, + "loss": 0.0002, + "step": 10396 + }, + { + "epoch": 2.7429099063448095, + "grad_norm": 0.006540838163346052, + "learning_rate": 1.613118790116944e-05, + "loss": 0.0002, + "step": 10398 + }, + { + "epoch": 2.743437541221475, + "grad_norm": 0.04525614157319069, + "learning_rate": 1.613001553386676e-05, + "loss": 0.0004, + "step": 10400 + }, + { + "epoch": 2.74396517609814, + "grad_norm": 0.18133945763111115, + "learning_rate": 1.6128843166564084e-05, + "loss": 0.0007, + "step": 10402 + }, + { + "epoch": 2.7444928109748057, + "grad_norm": 0.02008386142551899, + "learning_rate": 1.612767079926141e-05, + "loss": 0.0003, + "step": 10404 + }, + { + "epoch": 2.7450204458514706, + "grad_norm": 0.03378121554851532, + "learning_rate": 1.6126498431958736e-05, + "loss": 0.0003, + "step": 10406 + }, + { + "epoch": 2.745548080728136, + "grad_norm": 0.0044477046467363834, + "learning_rate": 1.612532606465606e-05, + "loss": 0.0003, + "step": 10408 + }, + { + "epoch": 2.7460757156048015, + "grad_norm": 0.003295032074674964, + "learning_rate": 1.612415369735338e-05, + "loss": 0.0001, + "step": 10410 + }, + { + "epoch": 2.746603350481467, + "grad_norm": 0.029876194894313812, + "learning_rate": 1.6122981330050707e-05, + "loss": 0.002, + "step": 10412 + }, + { + "epoch": 2.7471309853581323, + "grad_norm": 0.15428918600082397, + "learning_rate": 1.612180896274803e-05, + "loss": 0.0089, + "step": 10414 + }, + { + "epoch": 2.7476586202347977, + "grad_norm": 0.1032402291893959, + "learning_rate": 1.6120636595445352e-05, + "loss": 0.0004, + "step": 10416 + }, + { + "epoch": 2.748186255111463, + "grad_norm": 0.04639940708875656, + "learning_rate": 1.6119464228142678e-05, + "loss": 0.0003, + "step": 10418 + }, + { + "epoch": 2.748713889988128, + "grad_norm": 0.002372640883550048, + "learning_rate": 1.6118291860840004e-05, + "loss": 0.0001, + "step": 10420 + }, + { + "epoch": 2.7492415248647935, + "grad_norm": 0.0038341491017490625, + "learning_rate": 1.6117119493537327e-05, + "loss": 0.0001, + "step": 10422 + }, + { + "epoch": 2.749769159741459, + "grad_norm": 0.1997923105955124, + "learning_rate": 1.611594712623465e-05, + "loss": 0.0009, + "step": 10424 + }, + { + "epoch": 2.7502967946181243, + "grad_norm": 0.0021572911646217108, + "learning_rate": 1.6114774758931976e-05, + "loss": 0.0001, + "step": 10426 + }, + { + "epoch": 2.7508244294947897, + "grad_norm": 0.14753365516662598, + "learning_rate": 1.6113602391629298e-05, + "loss": 0.0047, + "step": 10428 + }, + { + "epoch": 2.751352064371455, + "grad_norm": 0.3258956968784332, + "learning_rate": 1.611243002432662e-05, + "loss": 0.0015, + "step": 10430 + }, + { + "epoch": 2.7518796992481205, + "grad_norm": 0.0017480694223195314, + "learning_rate": 1.6111257657023947e-05, + "loss": 0.0001, + "step": 10432 + }, + { + "epoch": 2.7524073341247854, + "grad_norm": 0.19851310551166534, + "learning_rate": 1.6110085289721273e-05, + "loss": 0.0059, + "step": 10434 + }, + { + "epoch": 2.752934969001451, + "grad_norm": 0.0034043213818222284, + "learning_rate": 1.6108912922418595e-05, + "loss": 0.0001, + "step": 10436 + }, + { + "epoch": 2.7534626038781163, + "grad_norm": 0.014115131460130215, + "learning_rate": 1.6107740555115918e-05, + "loss": 0.0011, + "step": 10438 + }, + { + "epoch": 2.7539902387547817, + "grad_norm": 0.03450223430991173, + "learning_rate": 1.6106568187813244e-05, + "loss": 0.0003, + "step": 10440 + }, + { + "epoch": 2.754517873631447, + "grad_norm": 0.0375853031873703, + "learning_rate": 1.6105395820510567e-05, + "loss": 0.0027, + "step": 10442 + }, + { + "epoch": 2.7550455085081125, + "grad_norm": 0.5102871656417847, + "learning_rate": 1.6104223453207893e-05, + "loss": 0.0077, + "step": 10444 + }, + { + "epoch": 2.755573143384778, + "grad_norm": 0.08819139748811722, + "learning_rate": 1.6103051085905215e-05, + "loss": 0.0007, + "step": 10446 + }, + { + "epoch": 2.756100778261443, + "grad_norm": 0.03190590813755989, + "learning_rate": 1.6101878718602538e-05, + "loss": 0.0044, + "step": 10448 + }, + { + "epoch": 2.7566284131381087, + "grad_norm": 0.015115033835172653, + "learning_rate": 1.6100706351299864e-05, + "loss": 0.0001, + "step": 10450 + }, + { + "epoch": 2.7571560480147737, + "grad_norm": 0.3068906366825104, + "learning_rate": 1.609953398399719e-05, + "loss": 0.0011, + "step": 10452 + }, + { + "epoch": 2.757683682891439, + "grad_norm": 0.010568120516836643, + "learning_rate": 1.6098361616694512e-05, + "loss": 0.0141, + "step": 10454 + }, + { + "epoch": 2.7582113177681045, + "grad_norm": 0.012142041698098183, + "learning_rate": 1.6097189249391835e-05, + "loss": 0.0048, + "step": 10456 + }, + { + "epoch": 2.75873895264477, + "grad_norm": 0.08339930325746536, + "learning_rate": 1.609601688208916e-05, + "loss": 0.0016, + "step": 10458 + }, + { + "epoch": 2.7592665875214353, + "grad_norm": 0.049694646149873734, + "learning_rate": 1.6094844514786484e-05, + "loss": 0.0014, + "step": 10460 + }, + { + "epoch": 2.7597942223981002, + "grad_norm": 0.006534635089337826, + "learning_rate": 1.6093672147483806e-05, + "loss": 0.0002, + "step": 10462 + }, + { + "epoch": 2.760321857274766, + "grad_norm": 0.39165571331977844, + "learning_rate": 1.6092499780181132e-05, + "loss": 0.0106, + "step": 10464 + }, + { + "epoch": 2.760849492151431, + "grad_norm": 0.04891330748796463, + "learning_rate": 1.6091327412878458e-05, + "loss": 0.0083, + "step": 10466 + }, + { + "epoch": 2.7613771270280965, + "grad_norm": 0.08482888340950012, + "learning_rate": 1.609015504557578e-05, + "loss": 0.0006, + "step": 10468 + }, + { + "epoch": 2.761904761904762, + "grad_norm": 0.021887386217713356, + "learning_rate": 1.6088982678273103e-05, + "loss": 0.0003, + "step": 10470 + }, + { + "epoch": 2.7624323967814273, + "grad_norm": 0.17294719815254211, + "learning_rate": 1.608781031097043e-05, + "loss": 0.0007, + "step": 10472 + }, + { + "epoch": 2.7629600316580927, + "grad_norm": 0.03563355281949043, + "learning_rate": 1.6086637943667752e-05, + "loss": 0.0015, + "step": 10474 + }, + { + "epoch": 2.763487666534758, + "grad_norm": 0.032150380313396454, + "learning_rate": 1.6085465576365075e-05, + "loss": 0.0033, + "step": 10476 + }, + { + "epoch": 2.7640153014114235, + "grad_norm": 0.019264180213212967, + "learning_rate": 1.60842932090624e-05, + "loss": 0.0004, + "step": 10478 + }, + { + "epoch": 2.7645429362880884, + "grad_norm": 0.010080166161060333, + "learning_rate": 1.6083120841759727e-05, + "loss": 0.0009, + "step": 10480 + }, + { + "epoch": 2.765070571164754, + "grad_norm": 0.013606147840619087, + "learning_rate": 1.608194847445705e-05, + "loss": 0.0003, + "step": 10482 + }, + { + "epoch": 2.7655982060414193, + "grad_norm": 0.006229972932487726, + "learning_rate": 1.6080776107154372e-05, + "loss": 0.0006, + "step": 10484 + }, + { + "epoch": 2.7661258409180847, + "grad_norm": 0.16056688129901886, + "learning_rate": 1.6079603739851698e-05, + "loss": 0.0005, + "step": 10486 + }, + { + "epoch": 2.76665347579475, + "grad_norm": 0.007320140954107046, + "learning_rate": 1.607843137254902e-05, + "loss": 0.0002, + "step": 10488 + }, + { + "epoch": 2.7671811106714155, + "grad_norm": 0.00509006530046463, + "learning_rate": 1.6077259005246343e-05, + "loss": 0.0002, + "step": 10490 + }, + { + "epoch": 2.767708745548081, + "grad_norm": 0.18615444004535675, + "learning_rate": 1.607608663794367e-05, + "loss": 0.0035, + "step": 10492 + }, + { + "epoch": 2.768236380424746, + "grad_norm": 0.002615657402202487, + "learning_rate": 1.607491427064099e-05, + "loss": 0.0001, + "step": 10494 + }, + { + "epoch": 2.7687640153014117, + "grad_norm": 0.0018918621353805065, + "learning_rate": 1.6073741903338318e-05, + "loss": 0.0001, + "step": 10496 + }, + { + "epoch": 2.7692916501780767, + "grad_norm": 0.0015118569135665894, + "learning_rate": 1.607256953603564e-05, + "loss": 0.0006, + "step": 10498 + }, + { + "epoch": 2.769819285054742, + "grad_norm": 0.0017918499652296305, + "learning_rate": 1.6071397168732966e-05, + "loss": 0.0026, + "step": 10500 + }, + { + "epoch": 2.7703469199314075, + "grad_norm": 0.0016898721223697066, + "learning_rate": 1.607022480143029e-05, + "loss": 0.0001, + "step": 10502 + }, + { + "epoch": 2.770874554808073, + "grad_norm": 0.005674637388437986, + "learning_rate": 1.6069052434127615e-05, + "loss": 0.0001, + "step": 10504 + }, + { + "epoch": 2.7714021896847383, + "grad_norm": 0.2660583257675171, + "learning_rate": 1.6067880066824937e-05, + "loss": 0.0026, + "step": 10506 + }, + { + "epoch": 2.7719298245614032, + "grad_norm": 0.14115945994853973, + "learning_rate": 1.606670769952226e-05, + "loss": 0.0003, + "step": 10508 + }, + { + "epoch": 2.772457459438069, + "grad_norm": 0.03925144299864769, + "learning_rate": 1.6065535332219586e-05, + "loss": 0.0003, + "step": 10510 + }, + { + "epoch": 2.772985094314734, + "grad_norm": 0.01029877457767725, + "learning_rate": 1.6064362964916912e-05, + "loss": 0.0069, + "step": 10512 + }, + { + "epoch": 2.7735127291913995, + "grad_norm": 0.805678129196167, + "learning_rate": 1.6063190597614235e-05, + "loss": 0.0021, + "step": 10514 + }, + { + "epoch": 2.774040364068065, + "grad_norm": 0.04742409661412239, + "learning_rate": 1.6062018230311557e-05, + "loss": 0.0003, + "step": 10516 + }, + { + "epoch": 2.7745679989447303, + "grad_norm": 0.13494116067886353, + "learning_rate": 1.6060845863008883e-05, + "loss": 0.0038, + "step": 10518 + }, + { + "epoch": 2.7750956338213957, + "grad_norm": 0.003706324379891157, + "learning_rate": 1.6059673495706206e-05, + "loss": 0.0001, + "step": 10520 + }, + { + "epoch": 2.775623268698061, + "grad_norm": 0.031015997752547264, + "learning_rate": 1.605850112840353e-05, + "loss": 0.0004, + "step": 10522 + }, + { + "epoch": 2.7761509035747265, + "grad_norm": 0.16272807121276855, + "learning_rate": 1.6057328761100855e-05, + "loss": 0.0025, + "step": 10524 + }, + { + "epoch": 2.7766785384513915, + "grad_norm": 0.17956775426864624, + "learning_rate": 1.605615639379818e-05, + "loss": 0.0171, + "step": 10526 + }, + { + "epoch": 2.777206173328057, + "grad_norm": 0.1253015100955963, + "learning_rate": 1.6054984026495503e-05, + "loss": 0.003, + "step": 10528 + }, + { + "epoch": 2.7777338082047223, + "grad_norm": 0.015360220335423946, + "learning_rate": 1.6053811659192826e-05, + "loss": 0.0008, + "step": 10530 + }, + { + "epoch": 2.7782614430813877, + "grad_norm": 0.0259291660040617, + "learning_rate": 1.6052639291890152e-05, + "loss": 0.0004, + "step": 10532 + }, + { + "epoch": 2.778789077958053, + "grad_norm": 0.36748161911964417, + "learning_rate": 1.6051466924587474e-05, + "loss": 0.0011, + "step": 10534 + }, + { + "epoch": 2.7793167128347185, + "grad_norm": 0.38274404406547546, + "learning_rate": 1.6050294557284797e-05, + "loss": 0.0035, + "step": 10536 + }, + { + "epoch": 2.779844347711384, + "grad_norm": 0.0038161841221153736, + "learning_rate": 1.6049122189982123e-05, + "loss": 0.0004, + "step": 10538 + }, + { + "epoch": 2.780371982588049, + "grad_norm": 0.06628759205341339, + "learning_rate": 1.6047949822679446e-05, + "loss": 0.0005, + "step": 10540 + }, + { + "epoch": 2.7808996174647143, + "grad_norm": 0.25100472569465637, + "learning_rate": 1.604677745537677e-05, + "loss": 0.0081, + "step": 10542 + }, + { + "epoch": 2.7814272523413797, + "grad_norm": 0.046735141426324844, + "learning_rate": 1.6045605088074094e-05, + "loss": 0.0005, + "step": 10544 + }, + { + "epoch": 2.781954887218045, + "grad_norm": 0.21183516085147858, + "learning_rate": 1.604443272077142e-05, + "loss": 0.0042, + "step": 10546 + }, + { + "epoch": 2.7824825220947105, + "grad_norm": 0.004465840756893158, + "learning_rate": 1.6043260353468743e-05, + "loss": 0.0002, + "step": 10548 + }, + { + "epoch": 2.783010156971376, + "grad_norm": 0.16219407320022583, + "learning_rate": 1.6042087986166065e-05, + "loss": 0.0038, + "step": 10550 + }, + { + "epoch": 2.7835377918480413, + "grad_norm": 0.2728232443332672, + "learning_rate": 1.604091561886339e-05, + "loss": 0.0017, + "step": 10552 + }, + { + "epoch": 2.7840654267247062, + "grad_norm": 0.0023152651265263557, + "learning_rate": 1.6039743251560714e-05, + "loss": 0.0009, + "step": 10554 + }, + { + "epoch": 2.784593061601372, + "grad_norm": 0.15543006360530853, + "learning_rate": 1.603857088425804e-05, + "loss": 0.0029, + "step": 10556 + }, + { + "epoch": 2.785120696478037, + "grad_norm": 0.015677418559789658, + "learning_rate": 1.6037398516955363e-05, + "loss": 0.0007, + "step": 10558 + }, + { + "epoch": 2.7856483313547025, + "grad_norm": 0.05882008746266365, + "learning_rate": 1.603622614965269e-05, + "loss": 0.0004, + "step": 10560 + }, + { + "epoch": 2.786175966231368, + "grad_norm": 0.011346589773893356, + "learning_rate": 1.603505378235001e-05, + "loss": 0.0002, + "step": 10562 + }, + { + "epoch": 2.7867036011080333, + "grad_norm": 0.03339455649256706, + "learning_rate": 1.6033881415047337e-05, + "loss": 0.0037, + "step": 10564 + }, + { + "epoch": 2.7872312359846987, + "grad_norm": 0.034989334642887115, + "learning_rate": 1.603270904774466e-05, + "loss": 0.0004, + "step": 10566 + }, + { + "epoch": 2.787758870861364, + "grad_norm": 0.39145269989967346, + "learning_rate": 1.6031536680441982e-05, + "loss": 0.0024, + "step": 10568 + }, + { + "epoch": 2.7882865057380295, + "grad_norm": 0.023640668019652367, + "learning_rate": 1.603036431313931e-05, + "loss": 0.0056, + "step": 10570 + }, + { + "epoch": 2.7888141406146945, + "grad_norm": 0.33624720573425293, + "learning_rate": 1.6029191945836634e-05, + "loss": 0.0017, + "step": 10572 + }, + { + "epoch": 2.78934177549136, + "grad_norm": 0.17937953770160675, + "learning_rate": 1.6028019578533957e-05, + "loss": 0.0037, + "step": 10574 + }, + { + "epoch": 2.7898694103680253, + "grad_norm": 0.015493113547563553, + "learning_rate": 1.602684721123128e-05, + "loss": 0.0018, + "step": 10576 + }, + { + "epoch": 2.7903970452446907, + "grad_norm": 0.07171630859375, + "learning_rate": 1.6025674843928606e-05, + "loss": 0.0037, + "step": 10578 + }, + { + "epoch": 2.790924680121356, + "grad_norm": 0.020815186202526093, + "learning_rate": 1.6024502476625928e-05, + "loss": 0.0007, + "step": 10580 + }, + { + "epoch": 2.7914523149980215, + "grad_norm": 0.33588045835494995, + "learning_rate": 1.602333010932325e-05, + "loss": 0.0138, + "step": 10582 + }, + { + "epoch": 2.791979949874687, + "grad_norm": 0.014632471837103367, + "learning_rate": 1.6022157742020577e-05, + "loss": 0.0004, + "step": 10584 + }, + { + "epoch": 2.792507584751352, + "grad_norm": 0.012326742522418499, + "learning_rate": 1.60209853747179e-05, + "loss": 0.0032, + "step": 10586 + }, + { + "epoch": 2.7930352196280173, + "grad_norm": 0.009965041652321815, + "learning_rate": 1.6019813007415222e-05, + "loss": 0.0014, + "step": 10588 + }, + { + "epoch": 2.7935628545046827, + "grad_norm": 0.2284841537475586, + "learning_rate": 1.6018640640112548e-05, + "loss": 0.0018, + "step": 10590 + }, + { + "epoch": 2.794090489381348, + "grad_norm": 0.01972680538892746, + "learning_rate": 1.6017468272809874e-05, + "loss": 0.0003, + "step": 10592 + }, + { + "epoch": 2.7946181242580135, + "grad_norm": 0.011329953558743, + "learning_rate": 1.6016295905507197e-05, + "loss": 0.0004, + "step": 10594 + }, + { + "epoch": 2.795145759134679, + "grad_norm": 0.35841625928878784, + "learning_rate": 1.601512353820452e-05, + "loss": 0.0003, + "step": 10596 + }, + { + "epoch": 2.7956733940113443, + "grad_norm": 0.025331657379865646, + "learning_rate": 1.6013951170901845e-05, + "loss": 0.0061, + "step": 10598 + }, + { + "epoch": 2.7962010288880093, + "grad_norm": 0.27243486046791077, + "learning_rate": 1.6012778803599168e-05, + "loss": 0.0018, + "step": 10600 + }, + { + "epoch": 2.796728663764675, + "grad_norm": 0.05267871916294098, + "learning_rate": 1.6011606436296494e-05, + "loss": 0.0005, + "step": 10602 + }, + { + "epoch": 2.79725629864134, + "grad_norm": 0.39887478947639465, + "learning_rate": 1.6010434068993817e-05, + "loss": 0.0077, + "step": 10604 + }, + { + "epoch": 2.7977839335180055, + "grad_norm": 0.015155128203332424, + "learning_rate": 1.6009261701691143e-05, + "loss": 0.0003, + "step": 10606 + }, + { + "epoch": 2.798311568394671, + "grad_norm": 0.03480479121208191, + "learning_rate": 1.6008089334388465e-05, + "loss": 0.0005, + "step": 10608 + }, + { + "epoch": 2.7988392032713363, + "grad_norm": 0.052348051220178604, + "learning_rate": 1.6006916967085788e-05, + "loss": 0.0006, + "step": 10610 + }, + { + "epoch": 2.7993668381480017, + "grad_norm": 0.00547346007078886, + "learning_rate": 1.6005744599783114e-05, + "loss": 0.0002, + "step": 10612 + }, + { + "epoch": 2.799894473024667, + "grad_norm": 0.11003998667001724, + "learning_rate": 1.6004572232480436e-05, + "loss": 0.0051, + "step": 10614 + }, + { + "epoch": 2.8004221079013325, + "grad_norm": 0.08329156041145325, + "learning_rate": 1.6003399865177762e-05, + "loss": 0.0005, + "step": 10616 + }, + { + "epoch": 2.8009497427779975, + "grad_norm": 0.012772937305271626, + "learning_rate": 1.6002227497875085e-05, + "loss": 0.0052, + "step": 10618 + }, + { + "epoch": 2.801477377654663, + "grad_norm": 0.050190918147563934, + "learning_rate": 1.600105513057241e-05, + "loss": 0.0013, + "step": 10620 + }, + { + "epoch": 2.8020050125313283, + "grad_norm": 0.009493930265307426, + "learning_rate": 1.5999882763269734e-05, + "loss": 0.0027, + "step": 10622 + }, + { + "epoch": 2.8025326474079937, + "grad_norm": 0.2898407578468323, + "learning_rate": 1.599871039596706e-05, + "loss": 0.0093, + "step": 10624 + }, + { + "epoch": 2.803060282284659, + "grad_norm": 0.08289454877376556, + "learning_rate": 1.5997538028664382e-05, + "loss": 0.0009, + "step": 10626 + }, + { + "epoch": 2.8035879171613245, + "grad_norm": 0.021880408748984337, + "learning_rate": 1.5996365661361705e-05, + "loss": 0.0048, + "step": 10628 + }, + { + "epoch": 2.80411555203799, + "grad_norm": 0.0278635211288929, + "learning_rate": 1.599519329405903e-05, + "loss": 0.0015, + "step": 10630 + }, + { + "epoch": 2.804643186914655, + "grad_norm": 0.03385419398546219, + "learning_rate": 1.5994020926756357e-05, + "loss": 0.0011, + "step": 10632 + }, + { + "epoch": 2.8051708217913203, + "grad_norm": 0.3429679870605469, + "learning_rate": 1.5992848559453676e-05, + "loss": 0.0023, + "step": 10634 + }, + { + "epoch": 2.8056984566679857, + "grad_norm": 0.029681889340281487, + "learning_rate": 1.5991676192151002e-05, + "loss": 0.0018, + "step": 10636 + }, + { + "epoch": 2.806226091544651, + "grad_norm": 0.012384260073304176, + "learning_rate": 1.5990503824848328e-05, + "loss": 0.0005, + "step": 10638 + }, + { + "epoch": 2.8067537264213165, + "grad_norm": 0.028774341568350792, + "learning_rate": 1.598933145754565e-05, + "loss": 0.0007, + "step": 10640 + }, + { + "epoch": 2.807281361297982, + "grad_norm": 0.01638384349644184, + "learning_rate": 1.5988159090242973e-05, + "loss": 0.0047, + "step": 10642 + }, + { + "epoch": 2.8078089961746473, + "grad_norm": 0.007681482005864382, + "learning_rate": 1.59869867229403e-05, + "loss": 0.0007, + "step": 10644 + }, + { + "epoch": 2.8083366310513123, + "grad_norm": 0.002761618699878454, + "learning_rate": 1.5985814355637622e-05, + "loss": 0.0002, + "step": 10646 + }, + { + "epoch": 2.808864265927978, + "grad_norm": 0.17323370277881622, + "learning_rate": 1.5984641988334944e-05, + "loss": 0.0005, + "step": 10648 + }, + { + "epoch": 2.809391900804643, + "grad_norm": 0.013480155728757381, + "learning_rate": 1.598346962103227e-05, + "loss": 0.0003, + "step": 10650 + }, + { + "epoch": 2.8099195356813085, + "grad_norm": 0.030156243592500687, + "learning_rate": 1.5982297253729596e-05, + "loss": 0.0001, + "step": 10652 + }, + { + "epoch": 2.810447170557974, + "grad_norm": 0.20130038261413574, + "learning_rate": 1.598112488642692e-05, + "loss": 0.001, + "step": 10654 + }, + { + "epoch": 2.8109748054346393, + "grad_norm": 0.08426833152770996, + "learning_rate": 1.597995251912424e-05, + "loss": 0.0004, + "step": 10656 + }, + { + "epoch": 2.8115024403113047, + "grad_norm": 0.1856847107410431, + "learning_rate": 1.5978780151821568e-05, + "loss": 0.0061, + "step": 10658 + }, + { + "epoch": 2.8120300751879697, + "grad_norm": 0.10218340903520584, + "learning_rate": 1.597760778451889e-05, + "loss": 0.0012, + "step": 10660 + }, + { + "epoch": 2.8125577100646355, + "grad_norm": 0.0022376873530447483, + "learning_rate": 1.5976435417216216e-05, + "loss": 0.0001, + "step": 10662 + }, + { + "epoch": 2.8130853449413005, + "grad_norm": 0.1578328162431717, + "learning_rate": 1.597526304991354e-05, + "loss": 0.0036, + "step": 10664 + }, + { + "epoch": 2.813612979817966, + "grad_norm": 0.007641657255589962, + "learning_rate": 1.5974090682610865e-05, + "loss": 0.0001, + "step": 10666 + }, + { + "epoch": 2.8141406146946313, + "grad_norm": 0.0774262472987175, + "learning_rate": 1.5972918315308187e-05, + "loss": 0.0005, + "step": 10668 + }, + { + "epoch": 2.8146682495712967, + "grad_norm": 0.037771906703710556, + "learning_rate": 1.597174594800551e-05, + "loss": 0.0024, + "step": 10670 + }, + { + "epoch": 2.815195884447962, + "grad_norm": 0.0038277218118309975, + "learning_rate": 1.5970573580702836e-05, + "loss": 0.0017, + "step": 10672 + }, + { + "epoch": 2.8157235193246275, + "grad_norm": 0.016744032502174377, + "learning_rate": 1.596940121340016e-05, + "loss": 0.0002, + "step": 10674 + }, + { + "epoch": 2.816251154201293, + "grad_norm": 0.010669780895113945, + "learning_rate": 1.5968228846097485e-05, + "loss": 0.0003, + "step": 10676 + }, + { + "epoch": 2.816778789077958, + "grad_norm": 0.0047254362143576145, + "learning_rate": 1.5967056478794807e-05, + "loss": 0.0004, + "step": 10678 + }, + { + "epoch": 2.8173064239546233, + "grad_norm": 0.003408439690247178, + "learning_rate": 1.596588411149213e-05, + "loss": 0.0001, + "step": 10680 + }, + { + "epoch": 2.8178340588312887, + "grad_norm": 0.13666285574436188, + "learning_rate": 1.5964711744189456e-05, + "loss": 0.0064, + "step": 10682 + }, + { + "epoch": 2.818361693707954, + "grad_norm": 0.020895155146718025, + "learning_rate": 1.5963539376886782e-05, + "loss": 0.0033, + "step": 10684 + }, + { + "epoch": 2.8188893285846195, + "grad_norm": 0.2160155177116394, + "learning_rate": 1.5962367009584105e-05, + "loss": 0.0008, + "step": 10686 + }, + { + "epoch": 2.819416963461285, + "grad_norm": 0.5639979243278503, + "learning_rate": 1.5961194642281427e-05, + "loss": 0.0014, + "step": 10688 + }, + { + "epoch": 2.8199445983379503, + "grad_norm": 0.0364290326833725, + "learning_rate": 1.5960022274978753e-05, + "loss": 0.0017, + "step": 10690 + }, + { + "epoch": 2.8204722332146153, + "grad_norm": 0.015265332534909248, + "learning_rate": 1.5958849907676076e-05, + "loss": 0.0002, + "step": 10692 + }, + { + "epoch": 2.8209998680912807, + "grad_norm": 0.06762682646512985, + "learning_rate": 1.59576775403734e-05, + "loss": 0.0058, + "step": 10694 + }, + { + "epoch": 2.821527502967946, + "grad_norm": 0.07732943445444107, + "learning_rate": 1.5956505173070724e-05, + "loss": 0.0004, + "step": 10696 + }, + { + "epoch": 2.8220551378446115, + "grad_norm": 0.015548141673207283, + "learning_rate": 1.595533280576805e-05, + "loss": 0.0003, + "step": 10698 + }, + { + "epoch": 2.822582772721277, + "grad_norm": 0.010333742015063763, + "learning_rate": 1.5954160438465373e-05, + "loss": 0.0001, + "step": 10700 + }, + { + "epoch": 2.8231104075979423, + "grad_norm": 0.15977101027965546, + "learning_rate": 1.5952988071162696e-05, + "loss": 0.0116, + "step": 10702 + }, + { + "epoch": 2.8236380424746077, + "grad_norm": 0.05018147453665733, + "learning_rate": 1.595181570386002e-05, + "loss": 0.0025, + "step": 10704 + }, + { + "epoch": 2.8241656773512727, + "grad_norm": 0.05570510774850845, + "learning_rate": 1.5950643336557344e-05, + "loss": 0.0004, + "step": 10706 + }, + { + "epoch": 2.8246933122279385, + "grad_norm": 0.4764038622379303, + "learning_rate": 1.5949470969254667e-05, + "loss": 0.007, + "step": 10708 + }, + { + "epoch": 2.8252209471046035, + "grad_norm": 0.009195162914693356, + "learning_rate": 1.5948298601951993e-05, + "loss": 0.0003, + "step": 10710 + }, + { + "epoch": 2.825748581981269, + "grad_norm": 0.01314073521643877, + "learning_rate": 1.594712623464932e-05, + "loss": 0.0016, + "step": 10712 + }, + { + "epoch": 2.8262762168579343, + "grad_norm": 0.07665744423866272, + "learning_rate": 1.594595386734664e-05, + "loss": 0.0022, + "step": 10714 + }, + { + "epoch": 2.8268038517345997, + "grad_norm": 0.017766010016202927, + "learning_rate": 1.5944781500043964e-05, + "loss": 0.0002, + "step": 10716 + }, + { + "epoch": 2.827331486611265, + "grad_norm": 0.016065388917922974, + "learning_rate": 1.594360913274129e-05, + "loss": 0.0004, + "step": 10718 + }, + { + "epoch": 2.8278591214879305, + "grad_norm": 0.06622691452503204, + "learning_rate": 1.5942436765438613e-05, + "loss": 0.0008, + "step": 10720 + }, + { + "epoch": 2.828386756364596, + "grad_norm": 0.02156776934862137, + "learning_rate": 1.594126439813594e-05, + "loss": 0.0004, + "step": 10722 + }, + { + "epoch": 2.828914391241261, + "grad_norm": 0.045699942857027054, + "learning_rate": 1.594009203083326e-05, + "loss": 0.0005, + "step": 10724 + }, + { + "epoch": 2.8294420261179263, + "grad_norm": 0.07818799465894699, + "learning_rate": 1.5938919663530584e-05, + "loss": 0.0007, + "step": 10726 + }, + { + "epoch": 2.8299696609945917, + "grad_norm": 0.010485724546015263, + "learning_rate": 1.593774729622791e-05, + "loss": 0.0003, + "step": 10728 + }, + { + "epoch": 2.830497295871257, + "grad_norm": 0.012283282354474068, + "learning_rate": 1.5936574928925232e-05, + "loss": 0.0003, + "step": 10730 + }, + { + "epoch": 2.8310249307479225, + "grad_norm": 0.16864606738090515, + "learning_rate": 1.593540256162256e-05, + "loss": 0.0013, + "step": 10732 + }, + { + "epoch": 2.831552565624588, + "grad_norm": 0.07562873512506485, + "learning_rate": 1.593423019431988e-05, + "loss": 0.0048, + "step": 10734 + }, + { + "epoch": 2.8320802005012533, + "grad_norm": 0.08227592706680298, + "learning_rate": 1.5933057827017207e-05, + "loss": 0.0016, + "step": 10736 + }, + { + "epoch": 2.8326078353779183, + "grad_norm": 0.020665368065238, + "learning_rate": 1.593188545971453e-05, + "loss": 0.0002, + "step": 10738 + }, + { + "epoch": 2.8331354702545837, + "grad_norm": 1.1406469345092773, + "learning_rate": 1.5930713092411852e-05, + "loss": 0.0046, + "step": 10740 + }, + { + "epoch": 2.833663105131249, + "grad_norm": 0.002767177764326334, + "learning_rate": 1.5929540725109178e-05, + "loss": 0.0001, + "step": 10742 + }, + { + "epoch": 2.8341907400079145, + "grad_norm": 0.015608085319399834, + "learning_rate": 1.5928368357806504e-05, + "loss": 0.0002, + "step": 10744 + }, + { + "epoch": 2.83471837488458, + "grad_norm": 0.010982811450958252, + "learning_rate": 1.5927195990503827e-05, + "loss": 0.0002, + "step": 10746 + }, + { + "epoch": 2.8352460097612453, + "grad_norm": 0.0062458934262394905, + "learning_rate": 1.592602362320115e-05, + "loss": 0.0001, + "step": 10748 + }, + { + "epoch": 2.8357736446379107, + "grad_norm": 0.07974641025066376, + "learning_rate": 1.5924851255898475e-05, + "loss": 0.005, + "step": 10750 + }, + { + "epoch": 2.8363012795145757, + "grad_norm": 0.12513215839862823, + "learning_rate": 1.5923678888595798e-05, + "loss": 0.0046, + "step": 10752 + }, + { + "epoch": 2.8368289143912415, + "grad_norm": 0.014552476815879345, + "learning_rate": 1.592250652129312e-05, + "loss": 0.0002, + "step": 10754 + }, + { + "epoch": 2.8373565492679065, + "grad_norm": 0.004124457482248545, + "learning_rate": 1.5921334153990447e-05, + "loss": 0.0001, + "step": 10756 + }, + { + "epoch": 2.837884184144572, + "grad_norm": 0.001947185955941677, + "learning_rate": 1.5920161786687773e-05, + "loss": 0.0001, + "step": 10758 + }, + { + "epoch": 2.8384118190212373, + "grad_norm": 0.1726454198360443, + "learning_rate": 1.5918989419385092e-05, + "loss": 0.0032, + "step": 10760 + }, + { + "epoch": 2.8389394538979027, + "grad_norm": 0.030814502388238907, + "learning_rate": 1.5917817052082418e-05, + "loss": 0.0029, + "step": 10762 + }, + { + "epoch": 2.839467088774568, + "grad_norm": 0.49308958649635315, + "learning_rate": 1.5916644684779744e-05, + "loss": 0.0034, + "step": 10764 + }, + { + "epoch": 2.8399947236512335, + "grad_norm": 0.2221287488937378, + "learning_rate": 1.5915472317477067e-05, + "loss": 0.011, + "step": 10766 + }, + { + "epoch": 2.840522358527899, + "grad_norm": 0.02190147340297699, + "learning_rate": 1.591429995017439e-05, + "loss": 0.0005, + "step": 10768 + }, + { + "epoch": 2.841049993404564, + "grad_norm": 0.20059210062026978, + "learning_rate": 1.5913127582871715e-05, + "loss": 0.008, + "step": 10770 + }, + { + "epoch": 2.8415776282812293, + "grad_norm": 0.03163395822048187, + "learning_rate": 1.5911955215569038e-05, + "loss": 0.0023, + "step": 10772 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.03655963018536568, + "learning_rate": 1.5910782848266364e-05, + "loss": 0.0011, + "step": 10774 + }, + { + "epoch": 2.84263289803456, + "grad_norm": 0.2834987938404083, + "learning_rate": 1.5909610480963686e-05, + "loss": 0.0011, + "step": 10776 + }, + { + "epoch": 2.8431605329112255, + "grad_norm": 0.5971429944038391, + "learning_rate": 1.5908438113661012e-05, + "loss": 0.0065, + "step": 10778 + }, + { + "epoch": 2.843688167787891, + "grad_norm": 0.03370390087366104, + "learning_rate": 1.5907265746358335e-05, + "loss": 0.0018, + "step": 10780 + }, + { + "epoch": 2.8442158026645563, + "grad_norm": 0.018630212172865868, + "learning_rate": 1.590609337905566e-05, + "loss": 0.0018, + "step": 10782 + }, + { + "epoch": 2.8447434375412213, + "grad_norm": 0.04749193787574768, + "learning_rate": 1.5904921011752984e-05, + "loss": 0.0011, + "step": 10784 + }, + { + "epoch": 2.8452710724178867, + "grad_norm": 0.008454281836748123, + "learning_rate": 1.5903748644450306e-05, + "loss": 0.0001, + "step": 10786 + }, + { + "epoch": 2.845798707294552, + "grad_norm": 0.0013105194084346294, + "learning_rate": 1.5902576277147632e-05, + "loss": 0.0061, + "step": 10788 + }, + { + "epoch": 2.8463263421712175, + "grad_norm": 0.0011695845751091838, + "learning_rate": 1.5901403909844955e-05, + "loss": 0.0001, + "step": 10790 + }, + { + "epoch": 2.846853977047883, + "grad_norm": 0.009269044734537601, + "learning_rate": 1.590023154254228e-05, + "loss": 0.0001, + "step": 10792 + }, + { + "epoch": 2.8473816119245483, + "grad_norm": 0.018281759694218636, + "learning_rate": 1.5899059175239603e-05, + "loss": 0.0042, + "step": 10794 + }, + { + "epoch": 2.8479092468012137, + "grad_norm": 0.017839957028627396, + "learning_rate": 1.589788680793693e-05, + "loss": 0.0002, + "step": 10796 + }, + { + "epoch": 2.8484368816778787, + "grad_norm": 0.002466953359544277, + "learning_rate": 1.5896714440634252e-05, + "loss": 0.0001, + "step": 10798 + }, + { + "epoch": 2.8489645165545445, + "grad_norm": 0.015297019854187965, + "learning_rate": 1.5895542073331575e-05, + "loss": 0.0051, + "step": 10800 + }, + { + "epoch": 2.8494921514312095, + "grad_norm": 0.010651654563844204, + "learning_rate": 1.58943697060289e-05, + "loss": 0.0031, + "step": 10802 + }, + { + "epoch": 2.850019786307875, + "grad_norm": 0.003497797530144453, + "learning_rate": 1.5893197338726227e-05, + "loss": 0.0004, + "step": 10804 + }, + { + "epoch": 2.8505474211845403, + "grad_norm": 0.2859157919883728, + "learning_rate": 1.5892024971423546e-05, + "loss": 0.0014, + "step": 10806 + }, + { + "epoch": 2.8510750560612057, + "grad_norm": 0.009120682254433632, + "learning_rate": 1.5890852604120872e-05, + "loss": 0.0032, + "step": 10808 + }, + { + "epoch": 2.851602690937871, + "grad_norm": 0.017628595232963562, + "learning_rate": 1.5889680236818198e-05, + "loss": 0.0072, + "step": 10810 + }, + { + "epoch": 2.852130325814536, + "grad_norm": 0.04462342709302902, + "learning_rate": 1.588850786951552e-05, + "loss": 0.0051, + "step": 10812 + }, + { + "epoch": 2.852657960691202, + "grad_norm": 0.004943850450217724, + "learning_rate": 1.5887335502212843e-05, + "loss": 0.0003, + "step": 10814 + }, + { + "epoch": 2.853185595567867, + "grad_norm": 0.0016857556765899062, + "learning_rate": 1.588616313491017e-05, + "loss": 0.0047, + "step": 10816 + }, + { + "epoch": 2.8537132304445323, + "grad_norm": 0.0029732519760727882, + "learning_rate": 1.588499076760749e-05, + "loss": 0.0001, + "step": 10818 + }, + { + "epoch": 2.8542408653211977, + "grad_norm": 0.012442219071090221, + "learning_rate": 1.5883818400304814e-05, + "loss": 0.0043, + "step": 10820 + }, + { + "epoch": 2.854768500197863, + "grad_norm": 0.031861815601587296, + "learning_rate": 1.588264603300214e-05, + "loss": 0.0003, + "step": 10822 + }, + { + "epoch": 2.8552961350745285, + "grad_norm": 0.29266056418418884, + "learning_rate": 1.5881473665699466e-05, + "loss": 0.0068, + "step": 10824 + }, + { + "epoch": 2.855823769951194, + "grad_norm": 0.007192881312221289, + "learning_rate": 1.588030129839679e-05, + "loss": 0.0001, + "step": 10826 + }, + { + "epoch": 2.8563514048278593, + "grad_norm": 0.5382595658302307, + "learning_rate": 1.587912893109411e-05, + "loss": 0.0068, + "step": 10828 + }, + { + "epoch": 2.8568790397045243, + "grad_norm": 0.0061733173206448555, + "learning_rate": 1.5877956563791437e-05, + "loss": 0.0028, + "step": 10830 + }, + { + "epoch": 2.8574066745811897, + "grad_norm": 0.03274013474583626, + "learning_rate": 1.587678419648876e-05, + "loss": 0.0003, + "step": 10832 + }, + { + "epoch": 2.857934309457855, + "grad_norm": 0.10017738491296768, + "learning_rate": 1.5875611829186086e-05, + "loss": 0.0012, + "step": 10834 + }, + { + "epoch": 2.8584619443345205, + "grad_norm": 0.00397944962605834, + "learning_rate": 1.587443946188341e-05, + "loss": 0.0005, + "step": 10836 + }, + { + "epoch": 2.858989579211186, + "grad_norm": 0.006913243792951107, + "learning_rate": 1.5873267094580735e-05, + "loss": 0.0005, + "step": 10838 + }, + { + "epoch": 2.8595172140878513, + "grad_norm": 0.027362482622265816, + "learning_rate": 1.5872094727278057e-05, + "loss": 0.0004, + "step": 10840 + }, + { + "epoch": 2.8600448489645167, + "grad_norm": 0.13989081978797913, + "learning_rate": 1.5870922359975383e-05, + "loss": 0.0034, + "step": 10842 + }, + { + "epoch": 2.8605724838411817, + "grad_norm": 0.004744218662381172, + "learning_rate": 1.5869749992672706e-05, + "loss": 0.0001, + "step": 10844 + }, + { + "epoch": 2.861100118717847, + "grad_norm": 0.08737576752901077, + "learning_rate": 1.586857762537003e-05, + "loss": 0.0005, + "step": 10846 + }, + { + "epoch": 2.8616277535945125, + "grad_norm": 0.007107593584805727, + "learning_rate": 1.5867405258067354e-05, + "loss": 0.0002, + "step": 10848 + }, + { + "epoch": 2.862155388471178, + "grad_norm": 0.01175332348793745, + "learning_rate": 1.5866232890764677e-05, + "loss": 0.0002, + "step": 10850 + }, + { + "epoch": 2.8626830233478433, + "grad_norm": 0.03193758428096771, + "learning_rate": 1.5865060523462003e-05, + "loss": 0.0004, + "step": 10852 + }, + { + "epoch": 2.8632106582245087, + "grad_norm": 0.14482370018959045, + "learning_rate": 1.5863888156159326e-05, + "loss": 0.0038, + "step": 10854 + }, + { + "epoch": 2.863738293101174, + "grad_norm": 0.05368214100599289, + "learning_rate": 1.5862715788856652e-05, + "loss": 0.0007, + "step": 10856 + }, + { + "epoch": 2.864265927977839, + "grad_norm": 0.04897599667310715, + "learning_rate": 1.5861543421553974e-05, + "loss": 0.0003, + "step": 10858 + }, + { + "epoch": 2.864793562854505, + "grad_norm": 0.7585223913192749, + "learning_rate": 1.5860371054251297e-05, + "loss": 0.0047, + "step": 10860 + }, + { + "epoch": 2.86532119773117, + "grad_norm": 0.0030547333881258965, + "learning_rate": 1.5859198686948623e-05, + "loss": 0.0004, + "step": 10862 + }, + { + "epoch": 2.8658488326078353, + "grad_norm": 0.48899152874946594, + "learning_rate": 1.5858026319645946e-05, + "loss": 0.0061, + "step": 10864 + }, + { + "epoch": 2.8663764674845007, + "grad_norm": 0.29632559418678284, + "learning_rate": 1.5856853952343268e-05, + "loss": 0.0008, + "step": 10866 + }, + { + "epoch": 2.866904102361166, + "grad_norm": 0.006052087992429733, + "learning_rate": 1.5855681585040594e-05, + "loss": 0.0001, + "step": 10868 + }, + { + "epoch": 2.8674317372378315, + "grad_norm": 0.002743009477853775, + "learning_rate": 1.585450921773792e-05, + "loss": 0.0027, + "step": 10870 + }, + { + "epoch": 2.867959372114497, + "grad_norm": 0.004623222630470991, + "learning_rate": 1.5853336850435243e-05, + "loss": 0.0001, + "step": 10872 + }, + { + "epoch": 2.8684870069911623, + "grad_norm": 0.05664484202861786, + "learning_rate": 1.5852164483132565e-05, + "loss": 0.0021, + "step": 10874 + }, + { + "epoch": 2.8690146418678273, + "grad_norm": 0.26450127363204956, + "learning_rate": 1.585099211582989e-05, + "loss": 0.0009, + "step": 10876 + }, + { + "epoch": 2.8695422767444927, + "grad_norm": 0.007257516495883465, + "learning_rate": 1.5849819748527214e-05, + "loss": 0.0001, + "step": 10878 + }, + { + "epoch": 2.870069911621158, + "grad_norm": 0.012499837204813957, + "learning_rate": 1.584864738122454e-05, + "loss": 0.0002, + "step": 10880 + }, + { + "epoch": 2.8705975464978235, + "grad_norm": 0.0028711857739835978, + "learning_rate": 1.5847475013921863e-05, + "loss": 0.0027, + "step": 10882 + }, + { + "epoch": 2.871125181374489, + "grad_norm": 0.002746378071606159, + "learning_rate": 1.584630264661919e-05, + "loss": 0.0003, + "step": 10884 + }, + { + "epoch": 2.8716528162511543, + "grad_norm": 0.0865766853094101, + "learning_rate": 1.584513027931651e-05, + "loss": 0.0005, + "step": 10886 + }, + { + "epoch": 2.8721804511278197, + "grad_norm": 0.0022712056525051594, + "learning_rate": 1.5843957912013834e-05, + "loss": 0.0001, + "step": 10888 + }, + { + "epoch": 2.8727080860044847, + "grad_norm": 0.0039640162140131, + "learning_rate": 1.584278554471116e-05, + "loss": 0.0001, + "step": 10890 + }, + { + "epoch": 2.87323572088115, + "grad_norm": 0.030182836577296257, + "learning_rate": 1.5841613177408482e-05, + "loss": 0.0031, + "step": 10892 + }, + { + "epoch": 2.8737633557578155, + "grad_norm": 0.0447503924369812, + "learning_rate": 1.584044081010581e-05, + "loss": 0.0004, + "step": 10894 + }, + { + "epoch": 2.874290990634481, + "grad_norm": 0.011140438728034496, + "learning_rate": 1.583926844280313e-05, + "loss": 0.0002, + "step": 10896 + }, + { + "epoch": 2.8748186255111463, + "grad_norm": 0.004503864794969559, + "learning_rate": 1.5838096075500457e-05, + "loss": 0.0001, + "step": 10898 + }, + { + "epoch": 2.8753462603878117, + "grad_norm": 0.0024113422259688377, + "learning_rate": 1.583692370819778e-05, + "loss": 0.0019, + "step": 10900 + }, + { + "epoch": 2.875873895264477, + "grad_norm": 0.003327008569613099, + "learning_rate": 1.5835751340895106e-05, + "loss": 0.0004, + "step": 10902 + }, + { + "epoch": 2.876401530141142, + "grad_norm": 0.0678999274969101, + "learning_rate": 1.5834578973592428e-05, + "loss": 0.0072, + "step": 10904 + }, + { + "epoch": 2.876929165017808, + "grad_norm": 0.003056969027966261, + "learning_rate": 1.583340660628975e-05, + "loss": 0.0001, + "step": 10906 + }, + { + "epoch": 2.877456799894473, + "grad_norm": 0.06247151270508766, + "learning_rate": 1.5832234238987077e-05, + "loss": 0.0008, + "step": 10908 + }, + { + "epoch": 2.8779844347711383, + "grad_norm": 0.022841012105345726, + "learning_rate": 1.58310618716844e-05, + "loss": 0.0004, + "step": 10910 + }, + { + "epoch": 2.8785120696478037, + "grad_norm": 0.03277462348341942, + "learning_rate": 1.5829889504381722e-05, + "loss": 0.0002, + "step": 10912 + }, + { + "epoch": 2.879039704524469, + "grad_norm": 0.01569521240890026, + "learning_rate": 1.5828717137079048e-05, + "loss": 0.0004, + "step": 10914 + }, + { + "epoch": 2.8795673394011345, + "grad_norm": 0.012667302042245865, + "learning_rate": 1.5827544769776374e-05, + "loss": 0.0002, + "step": 10916 + }, + { + "epoch": 2.8800949742778, + "grad_norm": 0.007481901906430721, + "learning_rate": 1.5826372402473697e-05, + "loss": 0.0003, + "step": 10918 + }, + { + "epoch": 2.8806226091544653, + "grad_norm": 0.11798073351383209, + "learning_rate": 1.582520003517102e-05, + "loss": 0.0074, + "step": 10920 + }, + { + "epoch": 2.8811502440311303, + "grad_norm": 0.13841570913791656, + "learning_rate": 1.5824027667868345e-05, + "loss": 0.0182, + "step": 10922 + }, + { + "epoch": 2.8816778789077957, + "grad_norm": 0.003934998530894518, + "learning_rate": 1.5822855300565668e-05, + "loss": 0.0001, + "step": 10924 + }, + { + "epoch": 2.882205513784461, + "grad_norm": 0.01007485669106245, + "learning_rate": 1.582168293326299e-05, + "loss": 0.0001, + "step": 10926 + }, + { + "epoch": 2.8827331486611265, + "grad_norm": 0.002635871758684516, + "learning_rate": 1.5820510565960316e-05, + "loss": 0.0001, + "step": 10928 + }, + { + "epoch": 2.883260783537792, + "grad_norm": 0.004766147118061781, + "learning_rate": 1.5819338198657642e-05, + "loss": 0.0002, + "step": 10930 + }, + { + "epoch": 2.8837884184144573, + "grad_norm": 0.004046139307320118, + "learning_rate": 1.5818165831354965e-05, + "loss": 0.0031, + "step": 10932 + }, + { + "epoch": 2.8843160532911227, + "grad_norm": 0.10610795766115189, + "learning_rate": 1.5816993464052288e-05, + "loss": 0.0013, + "step": 10934 + }, + { + "epoch": 2.8848436881677877, + "grad_norm": 0.011090212501585484, + "learning_rate": 1.5815821096749614e-05, + "loss": 0.0048, + "step": 10936 + }, + { + "epoch": 2.885371323044453, + "grad_norm": 0.017151953652501106, + "learning_rate": 1.5814648729446936e-05, + "loss": 0.0002, + "step": 10938 + }, + { + "epoch": 2.8858989579211185, + "grad_norm": 0.006261609029024839, + "learning_rate": 1.5813476362144262e-05, + "loss": 0.0004, + "step": 10940 + }, + { + "epoch": 2.886426592797784, + "grad_norm": 0.025163013488054276, + "learning_rate": 1.5812303994841585e-05, + "loss": 0.0005, + "step": 10942 + }, + { + "epoch": 2.8869542276744493, + "grad_norm": 0.11151667684316635, + "learning_rate": 1.581113162753891e-05, + "loss": 0.0004, + "step": 10944 + }, + { + "epoch": 2.8874818625511147, + "grad_norm": 0.006350723560899496, + "learning_rate": 1.5809959260236234e-05, + "loss": 0.0002, + "step": 10946 + }, + { + "epoch": 2.88800949742778, + "grad_norm": 0.0034588684793561697, + "learning_rate": 1.5808786892933556e-05, + "loss": 0.0002, + "step": 10948 + }, + { + "epoch": 2.888537132304445, + "grad_norm": 0.03132856264710426, + "learning_rate": 1.5807614525630882e-05, + "loss": 0.0003, + "step": 10950 + }, + { + "epoch": 2.889064767181111, + "grad_norm": 0.023768778890371323, + "learning_rate": 1.5806442158328205e-05, + "loss": 0.0003, + "step": 10952 + }, + { + "epoch": 2.889592402057776, + "grad_norm": 0.0026694766711443663, + "learning_rate": 1.580526979102553e-05, + "loss": 0.0003, + "step": 10954 + }, + { + "epoch": 2.8901200369344413, + "grad_norm": 0.03390858322381973, + "learning_rate": 1.5804097423722853e-05, + "loss": 0.0002, + "step": 10956 + }, + { + "epoch": 2.8906476718111067, + "grad_norm": 0.005540776066482067, + "learning_rate": 1.5802925056420176e-05, + "loss": 0.0006, + "step": 10958 + }, + { + "epoch": 2.891175306687772, + "grad_norm": 0.08213181048631668, + "learning_rate": 1.5801752689117502e-05, + "loss": 0.0016, + "step": 10960 + }, + { + "epoch": 2.8917029415644375, + "grad_norm": 0.13124386966228485, + "learning_rate": 1.5800580321814828e-05, + "loss": 0.0012, + "step": 10962 + }, + { + "epoch": 2.8922305764411025, + "grad_norm": 0.12559263408184052, + "learning_rate": 1.579940795451215e-05, + "loss": 0.0047, + "step": 10964 + }, + { + "epoch": 2.8927582113177683, + "grad_norm": 0.09788770228624344, + "learning_rate": 1.5798235587209473e-05, + "loss": 0.0059, + "step": 10966 + }, + { + "epoch": 2.8932858461944333, + "grad_norm": 0.036642178893089294, + "learning_rate": 1.57970632199068e-05, + "loss": 0.0003, + "step": 10968 + }, + { + "epoch": 2.8938134810710987, + "grad_norm": 0.030749913305044174, + "learning_rate": 1.5795890852604122e-05, + "loss": 0.0039, + "step": 10970 + }, + { + "epoch": 2.894341115947764, + "grad_norm": 0.12599237263202667, + "learning_rate": 1.5794718485301444e-05, + "loss": 0.0008, + "step": 10972 + }, + { + "epoch": 2.8948687508244295, + "grad_norm": 0.014618049375712872, + "learning_rate": 1.579354611799877e-05, + "loss": 0.0014, + "step": 10974 + }, + { + "epoch": 2.895396385701095, + "grad_norm": 0.05166430398821831, + "learning_rate": 1.5792373750696096e-05, + "loss": 0.0037, + "step": 10976 + }, + { + "epoch": 2.8959240205777603, + "grad_norm": 0.19108550250530243, + "learning_rate": 1.579120138339342e-05, + "loss": 0.0024, + "step": 10978 + }, + { + "epoch": 2.8964516554544257, + "grad_norm": 0.020237991586327553, + "learning_rate": 1.579002901609074e-05, + "loss": 0.0003, + "step": 10980 + }, + { + "epoch": 2.8969792903310907, + "grad_norm": 0.01646234467625618, + "learning_rate": 1.5788856648788068e-05, + "loss": 0.0003, + "step": 10982 + }, + { + "epoch": 2.897506925207756, + "grad_norm": 0.21350039541721344, + "learning_rate": 1.578768428148539e-05, + "loss": 0.0055, + "step": 10984 + }, + { + "epoch": 2.8980345600844215, + "grad_norm": 0.01290766429156065, + "learning_rate": 1.5786511914182713e-05, + "loss": 0.0002, + "step": 10986 + }, + { + "epoch": 2.898562194961087, + "grad_norm": 0.0034698082599788904, + "learning_rate": 1.578533954688004e-05, + "loss": 0.0001, + "step": 10988 + }, + { + "epoch": 2.8990898298377523, + "grad_norm": 0.23771171271800995, + "learning_rate": 1.5784167179577365e-05, + "loss": 0.0017, + "step": 10990 + }, + { + "epoch": 2.8996174647144177, + "grad_norm": 0.007456687279045582, + "learning_rate": 1.5782994812274687e-05, + "loss": 0.0033, + "step": 10992 + }, + { + "epoch": 2.900145099591083, + "grad_norm": 0.11656972765922546, + "learning_rate": 1.578182244497201e-05, + "loss": 0.0004, + "step": 10994 + }, + { + "epoch": 2.900672734467748, + "grad_norm": 0.03721444681286812, + "learning_rate": 1.5780650077669336e-05, + "loss": 0.0003, + "step": 10996 + }, + { + "epoch": 2.901200369344414, + "grad_norm": 0.008118205703794956, + "learning_rate": 1.577947771036666e-05, + "loss": 0.0002, + "step": 10998 + }, + { + "epoch": 2.901728004221079, + "grad_norm": 0.021038711071014404, + "learning_rate": 1.5778305343063985e-05, + "loss": 0.0017, + "step": 11000 + }, + { + "epoch": 2.9022556390977443, + "grad_norm": 0.009313001297414303, + "learning_rate": 1.5777132975761307e-05, + "loss": 0.0007, + "step": 11002 + }, + { + "epoch": 2.9027832739744097, + "grad_norm": 0.11831189692020416, + "learning_rate": 1.577596060845863e-05, + "loss": 0.0009, + "step": 11004 + }, + { + "epoch": 2.903310908851075, + "grad_norm": 0.015744134783744812, + "learning_rate": 1.5774788241155956e-05, + "loss": 0.0003, + "step": 11006 + }, + { + "epoch": 2.9038385437277405, + "grad_norm": 0.009818023070693016, + "learning_rate": 1.577361587385328e-05, + "loss": 0.0001, + "step": 11008 + }, + { + "epoch": 2.9043661786044055, + "grad_norm": 0.015668800100684166, + "learning_rate": 1.5772443506550604e-05, + "loss": 0.0028, + "step": 11010 + }, + { + "epoch": 2.9048938134810713, + "grad_norm": 0.03990878909826279, + "learning_rate": 1.5771271139247927e-05, + "loss": 0.0003, + "step": 11012 + }, + { + "epoch": 2.9054214483577363, + "grad_norm": 0.24578945338726044, + "learning_rate": 1.5770098771945253e-05, + "loss": 0.0139, + "step": 11014 + }, + { + "epoch": 2.9059490832344017, + "grad_norm": 0.012393361888825893, + "learning_rate": 1.5768926404642576e-05, + "loss": 0.0007, + "step": 11016 + }, + { + "epoch": 2.906476718111067, + "grad_norm": 0.01778719201683998, + "learning_rate": 1.5767754037339898e-05, + "loss": 0.0002, + "step": 11018 + }, + { + "epoch": 2.9070043529877325, + "grad_norm": 0.08904173225164413, + "learning_rate": 1.5766581670037224e-05, + "loss": 0.0004, + "step": 11020 + }, + { + "epoch": 2.907531987864398, + "grad_norm": 0.3796539008617401, + "learning_rate": 1.576540930273455e-05, + "loss": 0.0069, + "step": 11022 + }, + { + "epoch": 2.9080596227410633, + "grad_norm": 0.3943403363227844, + "learning_rate": 1.5764236935431873e-05, + "loss": 0.0087, + "step": 11024 + }, + { + "epoch": 2.9085872576177287, + "grad_norm": 0.006933401804417372, + "learning_rate": 1.5763064568129196e-05, + "loss": 0.0001, + "step": 11026 + }, + { + "epoch": 2.9091148924943937, + "grad_norm": 0.0027540167793631554, + "learning_rate": 1.576189220082652e-05, + "loss": 0.0001, + "step": 11028 + }, + { + "epoch": 2.909642527371059, + "grad_norm": 0.521648108959198, + "learning_rate": 1.5760719833523844e-05, + "loss": 0.0012, + "step": 11030 + }, + { + "epoch": 2.9101701622477245, + "grad_norm": 0.004037758801132441, + "learning_rate": 1.5759547466221167e-05, + "loss": 0.0002, + "step": 11032 + }, + { + "epoch": 2.91069779712439, + "grad_norm": 0.007231828756630421, + "learning_rate": 1.5758375098918493e-05, + "loss": 0.0068, + "step": 11034 + }, + { + "epoch": 2.9112254320010553, + "grad_norm": 0.22193394601345062, + "learning_rate": 1.575720273161582e-05, + "loss": 0.0093, + "step": 11036 + }, + { + "epoch": 2.9117530668777207, + "grad_norm": 0.011287095956504345, + "learning_rate": 1.5756030364313138e-05, + "loss": 0.0002, + "step": 11038 + }, + { + "epoch": 2.912280701754386, + "grad_norm": 0.023821281269192696, + "learning_rate": 1.5754857997010464e-05, + "loss": 0.0003, + "step": 11040 + }, + { + "epoch": 2.912808336631051, + "grad_norm": 0.17033597826957703, + "learning_rate": 1.575368562970779e-05, + "loss": 0.0031, + "step": 11042 + }, + { + "epoch": 2.9133359715077165, + "grad_norm": 0.01828545518219471, + "learning_rate": 1.5752513262405113e-05, + "loss": 0.0003, + "step": 11044 + }, + { + "epoch": 2.913863606384382, + "grad_norm": 0.015825552865862846, + "learning_rate": 1.5751340895102435e-05, + "loss": 0.0003, + "step": 11046 + }, + { + "epoch": 2.9143912412610473, + "grad_norm": 0.013684647157788277, + "learning_rate": 1.575016852779976e-05, + "loss": 0.0003, + "step": 11048 + }, + { + "epoch": 2.9149188761377127, + "grad_norm": 0.012006234377622604, + "learning_rate": 1.5748996160497084e-05, + "loss": 0.0003, + "step": 11050 + }, + { + "epoch": 2.915446511014378, + "grad_norm": 0.11970895528793335, + "learning_rate": 1.574782379319441e-05, + "loss": 0.0051, + "step": 11052 + }, + { + "epoch": 2.9159741458910435, + "grad_norm": 0.00855785608291626, + "learning_rate": 1.5746651425891732e-05, + "loss": 0.0003, + "step": 11054 + }, + { + "epoch": 2.9165017807677085, + "grad_norm": 0.23897992074489594, + "learning_rate": 1.574547905858906e-05, + "loss": 0.0012, + "step": 11056 + }, + { + "epoch": 2.9170294156443743, + "grad_norm": 0.01058815885335207, + "learning_rate": 1.574430669128638e-05, + "loss": 0.0002, + "step": 11058 + }, + { + "epoch": 2.9175570505210393, + "grad_norm": 0.006022877525538206, + "learning_rate": 1.5743134323983707e-05, + "loss": 0.0002, + "step": 11060 + }, + { + "epoch": 2.9180846853977047, + "grad_norm": 0.008581925183534622, + "learning_rate": 1.574196195668103e-05, + "loss": 0.0002, + "step": 11062 + }, + { + "epoch": 2.91861232027437, + "grad_norm": 0.16375499963760376, + "learning_rate": 1.5740789589378352e-05, + "loss": 0.001, + "step": 11064 + }, + { + "epoch": 2.9191399551510355, + "grad_norm": 0.015455804765224457, + "learning_rate": 1.5739617222075678e-05, + "loss": 0.0003, + "step": 11066 + }, + { + "epoch": 2.919667590027701, + "grad_norm": 0.0607217513024807, + "learning_rate": 1.5738444854773e-05, + "loss": 0.0023, + "step": 11068 + }, + { + "epoch": 2.9201952249043663, + "grad_norm": 0.004027269314974546, + "learning_rate": 1.5737272487470327e-05, + "loss": 0.0015, + "step": 11070 + }, + { + "epoch": 2.9207228597810317, + "grad_norm": 0.10812457650899887, + "learning_rate": 1.573610012016765e-05, + "loss": 0.0038, + "step": 11072 + }, + { + "epoch": 2.9212504946576967, + "grad_norm": 0.014642500318586826, + "learning_rate": 1.5734927752864975e-05, + "loss": 0.0004, + "step": 11074 + }, + { + "epoch": 2.921778129534362, + "grad_norm": 0.0036884990986436605, + "learning_rate": 1.5733755385562298e-05, + "loss": 0.0002, + "step": 11076 + }, + { + "epoch": 2.9223057644110275, + "grad_norm": 0.05014617741107941, + "learning_rate": 1.573258301825962e-05, + "loss": 0.0029, + "step": 11078 + }, + { + "epoch": 2.922833399287693, + "grad_norm": 0.006173725705593824, + "learning_rate": 1.5731410650956947e-05, + "loss": 0.0003, + "step": 11080 + }, + { + "epoch": 2.9233610341643583, + "grad_norm": 0.001959455432370305, + "learning_rate": 1.5730238283654273e-05, + "loss": 0.0001, + "step": 11082 + }, + { + "epoch": 2.9238886690410237, + "grad_norm": 0.009915990754961967, + "learning_rate": 1.5729065916351592e-05, + "loss": 0.0001, + "step": 11084 + }, + { + "epoch": 2.924416303917689, + "grad_norm": 0.08379388600587845, + "learning_rate": 1.5727893549048918e-05, + "loss": 0.0029, + "step": 11086 + }, + { + "epoch": 2.924943938794354, + "grad_norm": 0.004508547950536013, + "learning_rate": 1.5726721181746244e-05, + "loss": 0.0084, + "step": 11088 + }, + { + "epoch": 2.9254715736710195, + "grad_norm": 0.16827456653118134, + "learning_rate": 1.5725548814443566e-05, + "loss": 0.0005, + "step": 11090 + }, + { + "epoch": 2.925999208547685, + "grad_norm": 0.009216914884746075, + "learning_rate": 1.572437644714089e-05, + "loss": 0.0003, + "step": 11092 + }, + { + "epoch": 2.9265268434243503, + "grad_norm": 0.20953458547592163, + "learning_rate": 1.5723204079838215e-05, + "loss": 0.0006, + "step": 11094 + }, + { + "epoch": 2.9270544783010157, + "grad_norm": 0.006858900189399719, + "learning_rate": 1.5722031712535538e-05, + "loss": 0.0003, + "step": 11096 + }, + { + "epoch": 2.927582113177681, + "grad_norm": 0.0038812614511698484, + "learning_rate": 1.572085934523286e-05, + "loss": 0.0005, + "step": 11098 + }, + { + "epoch": 2.9281097480543465, + "grad_norm": 0.006412880029529333, + "learning_rate": 1.5719686977930186e-05, + "loss": 0.0001, + "step": 11100 + }, + { + "epoch": 2.9286373829310115, + "grad_norm": 0.005061767064034939, + "learning_rate": 1.5718514610627512e-05, + "loss": 0.007, + "step": 11102 + }, + { + "epoch": 2.9291650178076774, + "grad_norm": 0.00681583397090435, + "learning_rate": 1.5717342243324835e-05, + "loss": 0.0002, + "step": 11104 + }, + { + "epoch": 2.9296926526843423, + "grad_norm": 0.005065695848315954, + "learning_rate": 1.5716169876022158e-05, + "loss": 0.0001, + "step": 11106 + }, + { + "epoch": 2.9302202875610077, + "grad_norm": 0.3890315294265747, + "learning_rate": 1.5714997508719484e-05, + "loss": 0.0041, + "step": 11108 + }, + { + "epoch": 2.930747922437673, + "grad_norm": 0.0014304176438599825, + "learning_rate": 1.5713825141416806e-05, + "loss": 0.0001, + "step": 11110 + }, + { + "epoch": 2.9312755573143385, + "grad_norm": 0.017068948596715927, + "learning_rate": 1.5712652774114132e-05, + "loss": 0.0007, + "step": 11112 + }, + { + "epoch": 2.931803192191004, + "grad_norm": 0.0025766384787857533, + "learning_rate": 1.5711480406811455e-05, + "loss": 0.0015, + "step": 11114 + }, + { + "epoch": 2.932330827067669, + "grad_norm": 0.008516393601894379, + "learning_rate": 1.571030803950878e-05, + "loss": 0.0007, + "step": 11116 + }, + { + "epoch": 2.9328584619443347, + "grad_norm": 0.004232816398143768, + "learning_rate": 1.5709135672206103e-05, + "loss": 0.0001, + "step": 11118 + }, + { + "epoch": 2.9333860968209997, + "grad_norm": 0.1559467762708664, + "learning_rate": 1.570796330490343e-05, + "loss": 0.0053, + "step": 11120 + }, + { + "epoch": 2.933913731697665, + "grad_norm": 0.21909527480602264, + "learning_rate": 1.5706790937600752e-05, + "loss": 0.0049, + "step": 11122 + }, + { + "epoch": 2.9344413665743305, + "grad_norm": 0.15304869413375854, + "learning_rate": 1.5705618570298075e-05, + "loss": 0.0021, + "step": 11124 + }, + { + "epoch": 2.934969001450996, + "grad_norm": 0.007808706723153591, + "learning_rate": 1.57044462029954e-05, + "loss": 0.0002, + "step": 11126 + }, + { + "epoch": 2.9354966363276613, + "grad_norm": 0.007191672921180725, + "learning_rate": 1.5703273835692723e-05, + "loss": 0.0003, + "step": 11128 + }, + { + "epoch": 2.9360242712043267, + "grad_norm": 0.004520385060459375, + "learning_rate": 1.5702101468390046e-05, + "loss": 0.0002, + "step": 11130 + }, + { + "epoch": 2.936551906080992, + "grad_norm": 0.06667023152112961, + "learning_rate": 1.5700929101087372e-05, + "loss": 0.0006, + "step": 11132 + }, + { + "epoch": 2.937079540957657, + "grad_norm": 0.2178720086812973, + "learning_rate": 1.5699756733784698e-05, + "loss": 0.0008, + "step": 11134 + }, + { + "epoch": 2.9376071758343225, + "grad_norm": 0.12209979444742203, + "learning_rate": 1.569858436648202e-05, + "loss": 0.0005, + "step": 11136 + }, + { + "epoch": 2.938134810710988, + "grad_norm": 0.38294342160224915, + "learning_rate": 1.5697411999179343e-05, + "loss": 0.0036, + "step": 11138 + }, + { + "epoch": 2.9386624455876533, + "grad_norm": 0.006181091535836458, + "learning_rate": 1.569623963187667e-05, + "loss": 0.0002, + "step": 11140 + }, + { + "epoch": 2.9391900804643187, + "grad_norm": 0.003815221134573221, + "learning_rate": 1.569506726457399e-05, + "loss": 0.0001, + "step": 11142 + }, + { + "epoch": 2.939717715340984, + "grad_norm": 0.26199641823768616, + "learning_rate": 1.5693894897271314e-05, + "loss": 0.0053, + "step": 11144 + }, + { + "epoch": 2.9402453502176495, + "grad_norm": 0.02983473427593708, + "learning_rate": 1.569272252996864e-05, + "loss": 0.0013, + "step": 11146 + }, + { + "epoch": 2.9407729850943145, + "grad_norm": 0.007775311358273029, + "learning_rate": 1.5691550162665966e-05, + "loss": 0.0002, + "step": 11148 + }, + { + "epoch": 2.9413006199709804, + "grad_norm": 0.038429372012615204, + "learning_rate": 1.569037779536329e-05, + "loss": 0.0033, + "step": 11150 + }, + { + "epoch": 2.9418282548476453, + "grad_norm": 0.008728312328457832, + "learning_rate": 1.568920542806061e-05, + "loss": 0.0002, + "step": 11152 + }, + { + "epoch": 2.9423558897243107, + "grad_norm": 0.015016240999102592, + "learning_rate": 1.5688033060757937e-05, + "loss": 0.0046, + "step": 11154 + }, + { + "epoch": 2.942883524600976, + "grad_norm": 0.11416613310575485, + "learning_rate": 1.568686069345526e-05, + "loss": 0.0009, + "step": 11156 + }, + { + "epoch": 2.9434111594776415, + "grad_norm": 0.013484964147210121, + "learning_rate": 1.5685688326152583e-05, + "loss": 0.0005, + "step": 11158 + }, + { + "epoch": 2.943938794354307, + "grad_norm": 0.00478522339835763, + "learning_rate": 1.568451595884991e-05, + "loss": 0.0113, + "step": 11160 + }, + { + "epoch": 2.944466429230972, + "grad_norm": 0.026466330513358116, + "learning_rate": 1.5683343591547235e-05, + "loss": 0.0031, + "step": 11162 + }, + { + "epoch": 2.9449940641076378, + "grad_norm": 0.19358190894126892, + "learning_rate": 1.5682171224244557e-05, + "loss": 0.0016, + "step": 11164 + }, + { + "epoch": 2.9455216989843027, + "grad_norm": 0.0275163222104311, + "learning_rate": 1.568099885694188e-05, + "loss": 0.0011, + "step": 11166 + }, + { + "epoch": 2.946049333860968, + "grad_norm": 0.14555466175079346, + "learning_rate": 1.5679826489639206e-05, + "loss": 0.0031, + "step": 11168 + }, + { + "epoch": 2.9465769687376335, + "grad_norm": 0.026780959218740463, + "learning_rate": 1.567865412233653e-05, + "loss": 0.0003, + "step": 11170 + }, + { + "epoch": 2.947104603614299, + "grad_norm": 0.5165127515792847, + "learning_rate": 1.5677481755033854e-05, + "loss": 0.0041, + "step": 11172 + }, + { + "epoch": 2.9476322384909643, + "grad_norm": 0.720551073551178, + "learning_rate": 1.5676309387731177e-05, + "loss": 0.0046, + "step": 11174 + }, + { + "epoch": 2.9481598733676297, + "grad_norm": 0.541237473487854, + "learning_rate": 1.5675137020428503e-05, + "loss": 0.0043, + "step": 11176 + }, + { + "epoch": 2.948687508244295, + "grad_norm": 0.005525035783648491, + "learning_rate": 1.5673964653125826e-05, + "loss": 0.0002, + "step": 11178 + }, + { + "epoch": 2.94921514312096, + "grad_norm": 0.05756855756044388, + "learning_rate": 1.567279228582315e-05, + "loss": 0.0018, + "step": 11180 + }, + { + "epoch": 2.9497427779976255, + "grad_norm": 0.006616607308387756, + "learning_rate": 1.5671619918520474e-05, + "loss": 0.0002, + "step": 11182 + }, + { + "epoch": 2.950270412874291, + "grad_norm": 0.009078210219740868, + "learning_rate": 1.5670447551217797e-05, + "loss": 0.0071, + "step": 11184 + }, + { + "epoch": 2.9507980477509563, + "grad_norm": 0.005579438991844654, + "learning_rate": 1.5669275183915123e-05, + "loss": 0.0033, + "step": 11186 + }, + { + "epoch": 2.9513256826276217, + "grad_norm": 0.3364255130290985, + "learning_rate": 1.5668102816612445e-05, + "loss": 0.0208, + "step": 11188 + }, + { + "epoch": 2.951853317504287, + "grad_norm": 0.1309358924627304, + "learning_rate": 1.5666930449309768e-05, + "loss": 0.0016, + "step": 11190 + }, + { + "epoch": 2.9523809523809526, + "grad_norm": 0.015405184589326382, + "learning_rate": 1.5665758082007094e-05, + "loss": 0.0004, + "step": 11192 + }, + { + "epoch": 2.9529085872576175, + "grad_norm": 0.16502195596694946, + "learning_rate": 1.566458571470442e-05, + "loss": 0.0046, + "step": 11194 + }, + { + "epoch": 2.953436222134283, + "grad_norm": 0.0168463047593832, + "learning_rate": 1.5663413347401743e-05, + "loss": 0.0003, + "step": 11196 + }, + { + "epoch": 2.9539638570109483, + "grad_norm": 0.2203052043914795, + "learning_rate": 1.5662240980099065e-05, + "loss": 0.0013, + "step": 11198 + }, + { + "epoch": 2.9544914918876137, + "grad_norm": 0.07115744799375534, + "learning_rate": 1.566106861279639e-05, + "loss": 0.0006, + "step": 11200 + }, + { + "epoch": 2.955019126764279, + "grad_norm": 0.39305418729782104, + "learning_rate": 1.5659896245493714e-05, + "loss": 0.002, + "step": 11202 + }, + { + "epoch": 2.9555467616409445, + "grad_norm": 0.019015908241271973, + "learning_rate": 1.5658723878191037e-05, + "loss": 0.0022, + "step": 11204 + }, + { + "epoch": 2.95607439651761, + "grad_norm": 0.00974256545305252, + "learning_rate": 1.5657551510888363e-05, + "loss": 0.0003, + "step": 11206 + }, + { + "epoch": 2.956602031394275, + "grad_norm": 0.006788923405110836, + "learning_rate": 1.565637914358569e-05, + "loss": 0.0019, + "step": 11208 + }, + { + "epoch": 2.9571296662709408, + "grad_norm": 0.004304150585085154, + "learning_rate": 1.565520677628301e-05, + "loss": 0.0014, + "step": 11210 + }, + { + "epoch": 2.9576573011476057, + "grad_norm": 0.0029966968577355146, + "learning_rate": 1.5654034408980334e-05, + "loss": 0.0001, + "step": 11212 + }, + { + "epoch": 2.958184936024271, + "grad_norm": 0.22117862105369568, + "learning_rate": 1.565286204167766e-05, + "loss": 0.0101, + "step": 11214 + }, + { + "epoch": 2.9587125709009365, + "grad_norm": 0.011935076676309109, + "learning_rate": 1.5651689674374982e-05, + "loss": 0.0052, + "step": 11216 + }, + { + "epoch": 2.959240205777602, + "grad_norm": 0.008753105998039246, + "learning_rate": 1.5650517307072305e-05, + "loss": 0.0075, + "step": 11218 + }, + { + "epoch": 2.9597678406542673, + "grad_norm": 0.07808756828308105, + "learning_rate": 1.564934493976963e-05, + "loss": 0.0006, + "step": 11220 + }, + { + "epoch": 2.9602954755309328, + "grad_norm": 0.07702696323394775, + "learning_rate": 1.5648172572466957e-05, + "loss": 0.0005, + "step": 11222 + }, + { + "epoch": 2.960823110407598, + "grad_norm": 0.008874021470546722, + "learning_rate": 1.564700020516428e-05, + "loss": 0.0003, + "step": 11224 + }, + { + "epoch": 2.961350745284263, + "grad_norm": 0.18609319627285004, + "learning_rate": 1.5645827837861602e-05, + "loss": 0.0018, + "step": 11226 + }, + { + "epoch": 2.9618783801609285, + "grad_norm": 0.031928278505802155, + "learning_rate": 1.5644655470558928e-05, + "loss": 0.0004, + "step": 11228 + }, + { + "epoch": 2.962406015037594, + "grad_norm": 0.03219984099268913, + "learning_rate": 1.564348310325625e-05, + "loss": 0.0007, + "step": 11230 + }, + { + "epoch": 2.9629336499142593, + "grad_norm": 0.03667648881673813, + "learning_rate": 1.5642310735953577e-05, + "loss": 0.0011, + "step": 11232 + }, + { + "epoch": 2.9634612847909247, + "grad_norm": 0.061341892927885056, + "learning_rate": 1.56411383686509e-05, + "loss": 0.0014, + "step": 11234 + }, + { + "epoch": 2.96398891966759, + "grad_norm": 0.28050392866134644, + "learning_rate": 1.5639966001348222e-05, + "loss": 0.0015, + "step": 11236 + }, + { + "epoch": 2.9645165545442556, + "grad_norm": 0.12663063406944275, + "learning_rate": 1.5638793634045548e-05, + "loss": 0.0036, + "step": 11238 + }, + { + "epoch": 2.9650441894209205, + "grad_norm": 0.006915643345564604, + "learning_rate": 1.5637621266742874e-05, + "loss": 0.0001, + "step": 11240 + }, + { + "epoch": 2.965571824297586, + "grad_norm": 0.0031396690756082535, + "learning_rate": 1.5636448899440197e-05, + "loss": 0.0001, + "step": 11242 + }, + { + "epoch": 2.9660994591742513, + "grad_norm": 0.1089482232928276, + "learning_rate": 1.563527653213752e-05, + "loss": 0.0031, + "step": 11244 + }, + { + "epoch": 2.9666270940509167, + "grad_norm": 0.047662850469350815, + "learning_rate": 1.5634104164834845e-05, + "loss": 0.0007, + "step": 11246 + }, + { + "epoch": 2.967154728927582, + "grad_norm": 0.3276481032371521, + "learning_rate": 1.5632931797532168e-05, + "loss": 0.0105, + "step": 11248 + }, + { + "epoch": 2.9676823638042475, + "grad_norm": 0.02770773135125637, + "learning_rate": 1.563175943022949e-05, + "loss": 0.0062, + "step": 11250 + }, + { + "epoch": 2.968209998680913, + "grad_norm": 0.00141725514549762, + "learning_rate": 1.5630587062926816e-05, + "loss": 0.0006, + "step": 11252 + }, + { + "epoch": 2.968737633557578, + "grad_norm": 0.006298349238932133, + "learning_rate": 1.5629414695624142e-05, + "loss": 0.0007, + "step": 11254 + }, + { + "epoch": 2.9692652684342438, + "grad_norm": 0.004630073439329863, + "learning_rate": 1.5628242328321465e-05, + "loss": 0.0014, + "step": 11256 + }, + { + "epoch": 2.9697929033109087, + "grad_norm": 0.011088870465755463, + "learning_rate": 1.5627069961018788e-05, + "loss": 0.0006, + "step": 11258 + }, + { + "epoch": 2.970320538187574, + "grad_norm": 0.004627760499715805, + "learning_rate": 1.5625897593716114e-05, + "loss": 0.0002, + "step": 11260 + }, + { + "epoch": 2.9708481730642395, + "grad_norm": 0.006615020800381899, + "learning_rate": 1.5624725226413436e-05, + "loss": 0.0002, + "step": 11262 + }, + { + "epoch": 2.971375807940905, + "grad_norm": 0.005432829260826111, + "learning_rate": 1.562355285911076e-05, + "loss": 0.0001, + "step": 11264 + }, + { + "epoch": 2.9719034428175704, + "grad_norm": 0.024777311831712723, + "learning_rate": 1.5622380491808085e-05, + "loss": 0.0027, + "step": 11266 + }, + { + "epoch": 2.9724310776942353, + "grad_norm": 0.0016248209867626429, + "learning_rate": 1.562120812450541e-05, + "loss": 0.0002, + "step": 11268 + }, + { + "epoch": 2.972958712570901, + "grad_norm": 0.0014966666931286454, + "learning_rate": 1.5620035757202733e-05, + "loss": 0.0005, + "step": 11270 + }, + { + "epoch": 2.973486347447566, + "grad_norm": 0.004056157544255257, + "learning_rate": 1.5618863389900056e-05, + "loss": 0.0002, + "step": 11272 + }, + { + "epoch": 2.9740139823242315, + "grad_norm": 0.019159922376275063, + "learning_rate": 1.5617691022597382e-05, + "loss": 0.0002, + "step": 11274 + }, + { + "epoch": 2.974541617200897, + "grad_norm": 0.016227202489972115, + "learning_rate": 1.5616518655294705e-05, + "loss": 0.0002, + "step": 11276 + }, + { + "epoch": 2.9750692520775623, + "grad_norm": 0.02758737839758396, + "learning_rate": 1.5615346287992027e-05, + "loss": 0.0035, + "step": 11278 + }, + { + "epoch": 2.9755968869542277, + "grad_norm": 0.005215683951973915, + "learning_rate": 1.5614173920689353e-05, + "loss": 0.0002, + "step": 11280 + }, + { + "epoch": 2.976124521830893, + "grad_norm": 0.002744237193837762, + "learning_rate": 1.5613001553386676e-05, + "loss": 0.0002, + "step": 11282 + }, + { + "epoch": 2.9766521567075586, + "grad_norm": 0.01570221036672592, + "learning_rate": 1.5611829186084002e-05, + "loss": 0.0002, + "step": 11284 + }, + { + "epoch": 2.9771797915842235, + "grad_norm": 0.002422186778858304, + "learning_rate": 1.5610656818781325e-05, + "loss": 0.0003, + "step": 11286 + }, + { + "epoch": 2.977707426460889, + "grad_norm": 0.21965956687927246, + "learning_rate": 1.560948445147865e-05, + "loss": 0.0016, + "step": 11288 + }, + { + "epoch": 2.9782350613375543, + "grad_norm": 0.4069344401359558, + "learning_rate": 1.5608312084175973e-05, + "loss": 0.0079, + "step": 11290 + }, + { + "epoch": 2.9787626962142197, + "grad_norm": 0.04899465665221214, + "learning_rate": 1.56071397168733e-05, + "loss": 0.0026, + "step": 11292 + }, + { + "epoch": 2.979290331090885, + "grad_norm": 0.004071435425430536, + "learning_rate": 1.5605967349570622e-05, + "loss": 0.0001, + "step": 11294 + }, + { + "epoch": 2.9798179659675506, + "grad_norm": 0.4186365008354187, + "learning_rate": 1.5604794982267944e-05, + "loss": 0.0043, + "step": 11296 + }, + { + "epoch": 2.980345600844216, + "grad_norm": 0.04099571332335472, + "learning_rate": 1.560362261496527e-05, + "loss": 0.0053, + "step": 11298 + }, + { + "epoch": 2.980873235720881, + "grad_norm": 0.11042546480894089, + "learning_rate": 1.5602450247662596e-05, + "loss": 0.0004, + "step": 11300 + }, + { + "epoch": 2.9814008705975468, + "grad_norm": 0.252606600522995, + "learning_rate": 1.560127788035992e-05, + "loss": 0.002, + "step": 11302 + }, + { + "epoch": 2.9819285054742117, + "grad_norm": 0.3244939148426056, + "learning_rate": 1.560010551305724e-05, + "loss": 0.0024, + "step": 11304 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 0.4238820970058441, + "learning_rate": 1.5598933145754568e-05, + "loss": 0.0018, + "step": 11306 + }, + { + "epoch": 2.9829837752275425, + "grad_norm": 0.06754470616579056, + "learning_rate": 1.559776077845189e-05, + "loss": 0.0005, + "step": 11308 + }, + { + "epoch": 2.983511410104208, + "grad_norm": 0.21483127772808075, + "learning_rate": 1.5596588411149213e-05, + "loss": 0.0023, + "step": 11310 + }, + { + "epoch": 2.9840390449808734, + "grad_norm": 0.0012577598681673408, + "learning_rate": 1.559541604384654e-05, + "loss": 0.0001, + "step": 11312 + }, + { + "epoch": 2.9845666798575383, + "grad_norm": 0.003689805045723915, + "learning_rate": 1.5594243676543865e-05, + "loss": 0.0003, + "step": 11314 + }, + { + "epoch": 2.985094314734204, + "grad_norm": 0.39582765102386475, + "learning_rate": 1.5593071309241184e-05, + "loss": 0.0151, + "step": 11316 + }, + { + "epoch": 2.985621949610869, + "grad_norm": 0.01934988610446453, + "learning_rate": 1.559189894193851e-05, + "loss": 0.0038, + "step": 11318 + }, + { + "epoch": 2.9861495844875345, + "grad_norm": 0.0024530210066586733, + "learning_rate": 1.5590726574635836e-05, + "loss": 0.0011, + "step": 11320 + }, + { + "epoch": 2.9866772193642, + "grad_norm": 0.15320320427417755, + "learning_rate": 1.558955420733316e-05, + "loss": 0.0066, + "step": 11322 + }, + { + "epoch": 2.9872048542408653, + "grad_norm": 0.024962181225419044, + "learning_rate": 1.558838184003048e-05, + "loss": 0.0004, + "step": 11324 + }, + { + "epoch": 2.9877324891175308, + "grad_norm": 0.003324150573462248, + "learning_rate": 1.5587209472727807e-05, + "loss": 0.0001, + "step": 11326 + }, + { + "epoch": 2.988260123994196, + "grad_norm": 0.0015680735232308507, + "learning_rate": 1.558603710542513e-05, + "loss": 0.0001, + "step": 11328 + }, + { + "epoch": 2.9887877588708616, + "grad_norm": 0.004195955116301775, + "learning_rate": 1.5584864738122456e-05, + "loss": 0.0002, + "step": 11330 + }, + { + "epoch": 2.9893153937475265, + "grad_norm": 0.06602656096220016, + "learning_rate": 1.558369237081978e-05, + "loss": 0.0019, + "step": 11332 + }, + { + "epoch": 2.989843028624192, + "grad_norm": 0.055718179792165756, + "learning_rate": 1.5582520003517104e-05, + "loss": 0.0003, + "step": 11334 + }, + { + "epoch": 2.9903706635008573, + "grad_norm": 0.0009874050738289952, + "learning_rate": 1.5581347636214427e-05, + "loss": 0.0001, + "step": 11336 + }, + { + "epoch": 2.9908982983775227, + "grad_norm": 0.08447689563035965, + "learning_rate": 1.558017526891175e-05, + "loss": 0.0022, + "step": 11338 + }, + { + "epoch": 2.991425933254188, + "grad_norm": 0.01458085235208273, + "learning_rate": 1.5579002901609076e-05, + "loss": 0.0013, + "step": 11340 + }, + { + "epoch": 2.9919535681308536, + "grad_norm": 0.00550368195399642, + "learning_rate": 1.5577830534306398e-05, + "loss": 0.0002, + "step": 11342 + }, + { + "epoch": 2.992481203007519, + "grad_norm": 0.24843333661556244, + "learning_rate": 1.5576658167003724e-05, + "loss": 0.0057, + "step": 11344 + }, + { + "epoch": 2.993008837884184, + "grad_norm": 0.198090061545372, + "learning_rate": 1.5575485799701047e-05, + "loss": 0.0029, + "step": 11346 + }, + { + "epoch": 2.9935364727608493, + "grad_norm": 0.00995627697557211, + "learning_rate": 1.5574313432398373e-05, + "loss": 0.0018, + "step": 11348 + }, + { + "epoch": 2.9940641076375147, + "grad_norm": 0.007098629139363766, + "learning_rate": 1.5573141065095695e-05, + "loss": 0.0002, + "step": 11350 + }, + { + "epoch": 2.99459174251418, + "grad_norm": 0.020151540637016296, + "learning_rate": 1.557196869779302e-05, + "loss": 0.0003, + "step": 11352 + }, + { + "epoch": 2.9951193773908455, + "grad_norm": 0.0529305562376976, + "learning_rate": 1.5570796330490344e-05, + "loss": 0.0017, + "step": 11354 + }, + { + "epoch": 2.995647012267511, + "grad_norm": 0.009790640324354172, + "learning_rate": 1.5569623963187667e-05, + "loss": 0.0003, + "step": 11356 + }, + { + "epoch": 2.9961746471441764, + "grad_norm": 0.37103816866874695, + "learning_rate": 1.5568451595884993e-05, + "loss": 0.0066, + "step": 11358 + }, + { + "epoch": 2.9967022820208413, + "grad_norm": 0.02228919416666031, + "learning_rate": 1.556727922858232e-05, + "loss": 0.0017, + "step": 11360 + }, + { + "epoch": 2.997229916897507, + "grad_norm": 0.04665565490722656, + "learning_rate": 1.5566106861279638e-05, + "loss": 0.0004, + "step": 11362 + }, + { + "epoch": 2.997757551774172, + "grad_norm": 0.015165266580879688, + "learning_rate": 1.5564934493976964e-05, + "loss": 0.001, + "step": 11364 + }, + { + "epoch": 2.9982851866508375, + "grad_norm": 0.023136671632528305, + "learning_rate": 1.556376212667429e-05, + "loss": 0.0003, + "step": 11366 + }, + { + "epoch": 2.998812821527503, + "grad_norm": 0.007359964307397604, + "learning_rate": 1.5562589759371613e-05, + "loss": 0.0002, + "step": 11368 + }, + { + "epoch": 2.9993404564041684, + "grad_norm": 0.013571026735007763, + "learning_rate": 1.5561417392068935e-05, + "loss": 0.0002, + "step": 11370 + }, + { + "epoch": 2.9998680912808338, + "grad_norm": 0.06981614977121353, + "learning_rate": 1.556024502476626e-05, + "loss": 0.0011, + "step": 11372 + }, + { + "epoch": 3.0002638174383325, + "grad_norm": 0.021488675847649574, + "learning_rate": 1.5559072657463584e-05, + "loss": 0.0008, + "step": 11374 + }, + { + "epoch": 3.000791452314998, + "grad_norm": 0.019794074818491936, + "learning_rate": 1.5557900290160906e-05, + "loss": 0.0004, + "step": 11376 + }, + { + "epoch": 3.0013190871916633, + "grad_norm": 0.030113747343420982, + "learning_rate": 1.5556727922858232e-05, + "loss": 0.0062, + "step": 11378 + }, + { + "epoch": 3.0018467220683287, + "grad_norm": 0.0034959784243255854, + "learning_rate": 1.555555555555556e-05, + "loss": 0.0036, + "step": 11380 + }, + { + "epoch": 3.002374356944994, + "grad_norm": 0.01317757647484541, + "learning_rate": 1.555438318825288e-05, + "loss": 0.0001, + "step": 11382 + }, + { + "epoch": 3.0029019918216595, + "grad_norm": 0.061670441180467606, + "learning_rate": 1.5553210820950204e-05, + "loss": 0.0012, + "step": 11384 + }, + { + "epoch": 3.003429626698325, + "grad_norm": 0.007171541452407837, + "learning_rate": 1.555203845364753e-05, + "loss": 0.0001, + "step": 11386 + }, + { + "epoch": 3.0039572615749903, + "grad_norm": 0.007401262875646353, + "learning_rate": 1.5550866086344852e-05, + "loss": 0.0002, + "step": 11388 + }, + { + "epoch": 3.0044848964516553, + "grad_norm": 0.0020341144409030676, + "learning_rate": 1.5549693719042178e-05, + "loss": 0.0002, + "step": 11390 + }, + { + "epoch": 3.0050125313283207, + "grad_norm": 0.007756334729492664, + "learning_rate": 1.55485213517395e-05, + "loss": 0.0002, + "step": 11392 + }, + { + "epoch": 3.005540166204986, + "grad_norm": 0.003032946027815342, + "learning_rate": 1.5547348984436827e-05, + "loss": 0.0001, + "step": 11394 + }, + { + "epoch": 3.0060678010816515, + "grad_norm": 0.009681744500994682, + "learning_rate": 1.554617661713415e-05, + "loss": 0.0002, + "step": 11396 + }, + { + "epoch": 3.006595435958317, + "grad_norm": 0.022650370374321938, + "learning_rate": 1.5545004249831472e-05, + "loss": 0.0011, + "step": 11398 + }, + { + "epoch": 3.0071230708349823, + "grad_norm": 0.00500065041705966, + "learning_rate": 1.5543831882528798e-05, + "loss": 0.0001, + "step": 11400 + }, + { + "epoch": 3.0076507057116477, + "grad_norm": 0.007939276285469532, + "learning_rate": 1.554265951522612e-05, + "loss": 0.0001, + "step": 11402 + }, + { + "epoch": 3.0081783405883127, + "grad_norm": 0.012807793915271759, + "learning_rate": 1.5541487147923447e-05, + "loss": 0.0002, + "step": 11404 + }, + { + "epoch": 3.008705975464978, + "grad_norm": 0.05623207241296768, + "learning_rate": 1.554031478062077e-05, + "loss": 0.0022, + "step": 11406 + }, + { + "epoch": 3.0092336103416435, + "grad_norm": 0.006399143021553755, + "learning_rate": 1.5539142413318092e-05, + "loss": 0.0036, + "step": 11408 + }, + { + "epoch": 3.009761245218309, + "grad_norm": 0.2005932331085205, + "learning_rate": 1.5537970046015418e-05, + "loss": 0.0022, + "step": 11410 + }, + { + "epoch": 3.0102888800949743, + "grad_norm": 0.011010454967617989, + "learning_rate": 1.5536797678712744e-05, + "loss": 0.0001, + "step": 11412 + }, + { + "epoch": 3.0108165149716397, + "grad_norm": 0.003048637183383107, + "learning_rate": 1.5535625311410066e-05, + "loss": 0.0001, + "step": 11414 + }, + { + "epoch": 3.011344149848305, + "grad_norm": 0.11074810475111008, + "learning_rate": 1.553445294410739e-05, + "loss": 0.0017, + "step": 11416 + }, + { + "epoch": 3.0118717847249705, + "grad_norm": 0.19511237740516663, + "learning_rate": 1.5533280576804715e-05, + "loss": 0.0014, + "step": 11418 + }, + { + "epoch": 3.0123994196016355, + "grad_norm": 0.04731442406773567, + "learning_rate": 1.5532108209502038e-05, + "loss": 0.0006, + "step": 11420 + }, + { + "epoch": 3.012927054478301, + "grad_norm": 0.04997008293867111, + "learning_rate": 1.553093584219936e-05, + "loss": 0.0003, + "step": 11422 + }, + { + "epoch": 3.0134546893549663, + "grad_norm": 0.5642150044441223, + "learning_rate": 1.5529763474896686e-05, + "loss": 0.0042, + "step": 11424 + }, + { + "epoch": 3.0139823242316317, + "grad_norm": 0.004857398569583893, + "learning_rate": 1.5528591107594012e-05, + "loss": 0.0005, + "step": 11426 + }, + { + "epoch": 3.014509959108297, + "grad_norm": 0.009887230582535267, + "learning_rate": 1.5527418740291335e-05, + "loss": 0.0005, + "step": 11428 + }, + { + "epoch": 3.0150375939849625, + "grad_norm": 0.016168469563126564, + "learning_rate": 1.5526246372988657e-05, + "loss": 0.0022, + "step": 11430 + }, + { + "epoch": 3.015565228861628, + "grad_norm": 0.107755646109581, + "learning_rate": 1.5525074005685983e-05, + "loss": 0.0062, + "step": 11432 + }, + { + "epoch": 3.016092863738293, + "grad_norm": 0.0035601858980953693, + "learning_rate": 1.5523901638383306e-05, + "loss": 0.003, + "step": 11434 + }, + { + "epoch": 3.0166204986149583, + "grad_norm": 0.008732947520911694, + "learning_rate": 1.552272927108063e-05, + "loss": 0.0002, + "step": 11436 + }, + { + "epoch": 3.0171481334916237, + "grad_norm": 0.0027263257652521133, + "learning_rate": 1.5521556903777955e-05, + "loss": 0.0014, + "step": 11438 + }, + { + "epoch": 3.017675768368289, + "grad_norm": 0.0017486122669652104, + "learning_rate": 1.552038453647528e-05, + "loss": 0.0002, + "step": 11440 + }, + { + "epoch": 3.0182034032449545, + "grad_norm": 0.14423972368240356, + "learning_rate": 1.5519212169172603e-05, + "loss": 0.0083, + "step": 11442 + }, + { + "epoch": 3.01873103812162, + "grad_norm": 0.2075878381729126, + "learning_rate": 1.5518039801869926e-05, + "loss": 0.003, + "step": 11444 + }, + { + "epoch": 3.0192586729982853, + "grad_norm": 0.004610868636518717, + "learning_rate": 1.5516867434567252e-05, + "loss": 0.0001, + "step": 11446 + }, + { + "epoch": 3.0197863078749507, + "grad_norm": 0.0017839831998571754, + "learning_rate": 1.5515695067264575e-05, + "loss": 0.0001, + "step": 11448 + }, + { + "epoch": 3.0203139427516157, + "grad_norm": 0.023723160848021507, + "learning_rate": 1.55145226999619e-05, + "loss": 0.0004, + "step": 11450 + }, + { + "epoch": 3.020841577628281, + "grad_norm": 0.008730652742087841, + "learning_rate": 1.5513350332659223e-05, + "loss": 0.0003, + "step": 11452 + }, + { + "epoch": 3.0213692125049465, + "grad_norm": 0.28876784443855286, + "learning_rate": 1.5512177965356546e-05, + "loss": 0.0019, + "step": 11454 + }, + { + "epoch": 3.021896847381612, + "grad_norm": 0.03897785767912865, + "learning_rate": 1.5511005598053872e-05, + "loss": 0.0005, + "step": 11456 + }, + { + "epoch": 3.0224244822582773, + "grad_norm": 0.09493458271026611, + "learning_rate": 1.5509833230751194e-05, + "loss": 0.0008, + "step": 11458 + }, + { + "epoch": 3.0229521171349427, + "grad_norm": 0.039185523986816406, + "learning_rate": 1.550866086344852e-05, + "loss": 0.0033, + "step": 11460 + }, + { + "epoch": 3.023479752011608, + "grad_norm": 0.02747596800327301, + "learning_rate": 1.5507488496145843e-05, + "loss": 0.0003, + "step": 11462 + }, + { + "epoch": 3.0240073868882735, + "grad_norm": 0.01118041854351759, + "learning_rate": 1.550631612884317e-05, + "loss": 0.0002, + "step": 11464 + }, + { + "epoch": 3.0245350217649385, + "grad_norm": 0.0024622688069939613, + "learning_rate": 1.550514376154049e-05, + "loss": 0.0001, + "step": 11466 + }, + { + "epoch": 3.025062656641604, + "grad_norm": 0.0026577196549624205, + "learning_rate": 1.5503971394237814e-05, + "loss": 0.0002, + "step": 11468 + }, + { + "epoch": 3.0255902915182693, + "grad_norm": 0.002125085098668933, + "learning_rate": 1.550279902693514e-05, + "loss": 0.0001, + "step": 11470 + }, + { + "epoch": 3.0261179263949347, + "grad_norm": 0.007845243439078331, + "learning_rate": 1.5501626659632466e-05, + "loss": 0.0002, + "step": 11472 + }, + { + "epoch": 3.0266455612716, + "grad_norm": 0.27712756395339966, + "learning_rate": 1.550045429232979e-05, + "loss": 0.0013, + "step": 11474 + }, + { + "epoch": 3.0271731961482655, + "grad_norm": 0.04938041791319847, + "learning_rate": 1.549928192502711e-05, + "loss": 0.0002, + "step": 11476 + }, + { + "epoch": 3.027700831024931, + "grad_norm": 0.03571852669119835, + "learning_rate": 1.5498109557724437e-05, + "loss": 0.0041, + "step": 11478 + }, + { + "epoch": 3.028228465901596, + "grad_norm": 0.014635531231760979, + "learning_rate": 1.549693719042176e-05, + "loss": 0.0001, + "step": 11480 + }, + { + "epoch": 3.0287561007782613, + "grad_norm": 0.011164351366460323, + "learning_rate": 1.5495764823119083e-05, + "loss": 0.0001, + "step": 11482 + }, + { + "epoch": 3.0292837356549267, + "grad_norm": 0.0011985040036961436, + "learning_rate": 1.549459245581641e-05, + "loss": 0.0002, + "step": 11484 + }, + { + "epoch": 3.029811370531592, + "grad_norm": 0.0075470502488315105, + "learning_rate": 1.5493420088513735e-05, + "loss": 0.0001, + "step": 11486 + }, + { + "epoch": 3.0303390054082575, + "grad_norm": 0.1668025553226471, + "learning_rate": 1.5492247721211057e-05, + "loss": 0.0005, + "step": 11488 + }, + { + "epoch": 3.030866640284923, + "grad_norm": 0.0021117967553436756, + "learning_rate": 1.549107535390838e-05, + "loss": 0.0002, + "step": 11490 + }, + { + "epoch": 3.0313942751615883, + "grad_norm": 0.13342005014419556, + "learning_rate": 1.5489902986605706e-05, + "loss": 0.0007, + "step": 11492 + }, + { + "epoch": 3.0319219100382537, + "grad_norm": 0.002101855119690299, + "learning_rate": 1.548873061930303e-05, + "loss": 0.0001, + "step": 11494 + }, + { + "epoch": 3.0324495449149187, + "grad_norm": 0.013079481199383736, + "learning_rate": 1.548755825200035e-05, + "loss": 0.0035, + "step": 11496 + }, + { + "epoch": 3.032977179791584, + "grad_norm": 0.0035403193905949593, + "learning_rate": 1.5486385884697677e-05, + "loss": 0.0001, + "step": 11498 + }, + { + "epoch": 3.0335048146682495, + "grad_norm": 0.0027877544052898884, + "learning_rate": 1.5485213517395003e-05, + "loss": 0.0001, + "step": 11500 + }, + { + "epoch": 3.034032449544915, + "grad_norm": 0.33902907371520996, + "learning_rate": 1.5484041150092326e-05, + "loss": 0.0006, + "step": 11502 + }, + { + "epoch": 3.0345600844215803, + "grad_norm": 0.1151464655995369, + "learning_rate": 1.5482868782789648e-05, + "loss": 0.0019, + "step": 11504 + }, + { + "epoch": 3.0350877192982457, + "grad_norm": 0.20107294619083405, + "learning_rate": 1.5481696415486974e-05, + "loss": 0.0041, + "step": 11506 + }, + { + "epoch": 3.035615354174911, + "grad_norm": 0.32543253898620605, + "learning_rate": 1.5480524048184297e-05, + "loss": 0.0092, + "step": 11508 + }, + { + "epoch": 3.036142989051576, + "grad_norm": 0.0037066019140183926, + "learning_rate": 1.5479351680881623e-05, + "loss": 0.0001, + "step": 11510 + }, + { + "epoch": 3.0366706239282415, + "grad_norm": 0.5956739187240601, + "learning_rate": 1.5478179313578945e-05, + "loss": 0.006, + "step": 11512 + }, + { + "epoch": 3.037198258804907, + "grad_norm": 0.0744740292429924, + "learning_rate": 1.5477006946276268e-05, + "loss": 0.0011, + "step": 11514 + }, + { + "epoch": 3.0377258936815723, + "grad_norm": 0.23514948785305023, + "learning_rate": 1.5475834578973594e-05, + "loss": 0.0053, + "step": 11516 + }, + { + "epoch": 3.0382535285582377, + "grad_norm": 0.06304159760475159, + "learning_rate": 1.547466221167092e-05, + "loss": 0.0009, + "step": 11518 + }, + { + "epoch": 3.038781163434903, + "grad_norm": 0.040896542370319366, + "learning_rate": 1.5473489844368243e-05, + "loss": 0.0008, + "step": 11520 + }, + { + "epoch": 3.0393087983115685, + "grad_norm": 0.01098975446075201, + "learning_rate": 1.5472317477065565e-05, + "loss": 0.012, + "step": 11522 + }, + { + "epoch": 3.039836433188234, + "grad_norm": 0.003964156378060579, + "learning_rate": 1.547114510976289e-05, + "loss": 0.0002, + "step": 11524 + }, + { + "epoch": 3.040364068064899, + "grad_norm": 0.0023480216041207314, + "learning_rate": 1.5469972742460214e-05, + "loss": 0.0001, + "step": 11526 + }, + { + "epoch": 3.0408917029415643, + "grad_norm": 0.004111128393560648, + "learning_rate": 1.5468800375157536e-05, + "loss": 0.0003, + "step": 11528 + }, + { + "epoch": 3.0414193378182297, + "grad_norm": 0.02091490849852562, + "learning_rate": 1.5467628007854862e-05, + "loss": 0.0002, + "step": 11530 + }, + { + "epoch": 3.041946972694895, + "grad_norm": 0.004143743775784969, + "learning_rate": 1.546645564055219e-05, + "loss": 0.0019, + "step": 11532 + }, + { + "epoch": 3.0424746075715605, + "grad_norm": 0.02904115989804268, + "learning_rate": 1.546528327324951e-05, + "loss": 0.0002, + "step": 11534 + }, + { + "epoch": 3.043002242448226, + "grad_norm": 0.04356355965137482, + "learning_rate": 1.5464110905946834e-05, + "loss": 0.0034, + "step": 11536 + }, + { + "epoch": 3.0435298773248913, + "grad_norm": 0.0035824051592499018, + "learning_rate": 1.546293853864416e-05, + "loss": 0.0001, + "step": 11538 + }, + { + "epoch": 3.0440575122015567, + "grad_norm": 0.002788296202197671, + "learning_rate": 1.5461766171341482e-05, + "loss": 0.0002, + "step": 11540 + }, + { + "epoch": 3.0445851470782217, + "grad_norm": 0.0026206013280898333, + "learning_rate": 1.5460593804038805e-05, + "loss": 0.0001, + "step": 11542 + }, + { + "epoch": 3.045112781954887, + "grad_norm": 0.007308779284358025, + "learning_rate": 1.545942143673613e-05, + "loss": 0.0028, + "step": 11544 + }, + { + "epoch": 3.0456404168315525, + "grad_norm": 0.0043159653432667255, + "learning_rate": 1.5458249069433457e-05, + "loss": 0.0023, + "step": 11546 + }, + { + "epoch": 3.046168051708218, + "grad_norm": 0.02044234611093998, + "learning_rate": 1.545707670213078e-05, + "loss": 0.0056, + "step": 11548 + }, + { + "epoch": 3.0466956865848833, + "grad_norm": 0.003655163338407874, + "learning_rate": 1.5455904334828102e-05, + "loss": 0.0001, + "step": 11550 + }, + { + "epoch": 3.0472233214615487, + "grad_norm": 0.005533859133720398, + "learning_rate": 1.5454731967525428e-05, + "loss": 0.0032, + "step": 11552 + }, + { + "epoch": 3.047750956338214, + "grad_norm": 0.31409427523612976, + "learning_rate": 1.545355960022275e-05, + "loss": 0.0047, + "step": 11554 + }, + { + "epoch": 3.048278591214879, + "grad_norm": 0.08668719977140427, + "learning_rate": 1.5452387232920073e-05, + "loss": 0.002, + "step": 11556 + }, + { + "epoch": 3.0488062260915445, + "grad_norm": 0.47945910692214966, + "learning_rate": 1.54512148656174e-05, + "loss": 0.0136, + "step": 11558 + }, + { + "epoch": 3.04933386096821, + "grad_norm": 0.10474639385938644, + "learning_rate": 1.5450042498314722e-05, + "loss": 0.005, + "step": 11560 + }, + { + "epoch": 3.0498614958448753, + "grad_norm": 0.027111830189824104, + "learning_rate": 1.5448870131012048e-05, + "loss": 0.0027, + "step": 11562 + }, + { + "epoch": 3.0503891307215407, + "grad_norm": 0.14468255639076233, + "learning_rate": 1.544769776370937e-05, + "loss": 0.0024, + "step": 11564 + }, + { + "epoch": 3.050916765598206, + "grad_norm": 0.49737587571144104, + "learning_rate": 1.5446525396406697e-05, + "loss": 0.0098, + "step": 11566 + }, + { + "epoch": 3.0514444004748715, + "grad_norm": 0.035601794719696045, + "learning_rate": 1.544535302910402e-05, + "loss": 0.0005, + "step": 11568 + }, + { + "epoch": 3.051972035351537, + "grad_norm": 0.22918394207954407, + "learning_rate": 1.5444180661801345e-05, + "loss": 0.0023, + "step": 11570 + }, + { + "epoch": 3.052499670228202, + "grad_norm": 0.08990082144737244, + "learning_rate": 1.5443008294498668e-05, + "loss": 0.0008, + "step": 11572 + }, + { + "epoch": 3.0530273051048673, + "grad_norm": 0.16990241408348083, + "learning_rate": 1.544183592719599e-05, + "loss": 0.0035, + "step": 11574 + }, + { + "epoch": 3.0535549399815327, + "grad_norm": 0.08150006830692291, + "learning_rate": 1.5440663559893316e-05, + "loss": 0.0056, + "step": 11576 + }, + { + "epoch": 3.054082574858198, + "grad_norm": 0.010103391483426094, + "learning_rate": 1.5439491192590642e-05, + "loss": 0.0042, + "step": 11578 + }, + { + "epoch": 3.0546102097348635, + "grad_norm": 0.016477899625897408, + "learning_rate": 1.5438318825287965e-05, + "loss": 0.0005, + "step": 11580 + }, + { + "epoch": 3.055137844611529, + "grad_norm": 0.06367174535989761, + "learning_rate": 1.5437146457985288e-05, + "loss": 0.0028, + "step": 11582 + }, + { + "epoch": 3.0556654794881943, + "grad_norm": 0.023623596876859665, + "learning_rate": 1.5435974090682614e-05, + "loss": 0.0033, + "step": 11584 + }, + { + "epoch": 3.0561931143648593, + "grad_norm": 0.002079389290884137, + "learning_rate": 1.5434801723379936e-05, + "loss": 0.0003, + "step": 11586 + }, + { + "epoch": 3.0567207492415247, + "grad_norm": 0.2250768095254898, + "learning_rate": 1.543362935607726e-05, + "loss": 0.0029, + "step": 11588 + }, + { + "epoch": 3.05724838411819, + "grad_norm": 0.008881489746272564, + "learning_rate": 1.5432456988774585e-05, + "loss": 0.0004, + "step": 11590 + }, + { + "epoch": 3.0577760189948555, + "grad_norm": 0.012733422219753265, + "learning_rate": 1.543128462147191e-05, + "loss": 0.0007, + "step": 11592 + }, + { + "epoch": 3.058303653871521, + "grad_norm": 0.006398745812475681, + "learning_rate": 1.543011225416923e-05, + "loss": 0.0019, + "step": 11594 + }, + { + "epoch": 3.0588312887481863, + "grad_norm": 0.005636992398649454, + "learning_rate": 1.5428939886866556e-05, + "loss": 0.0002, + "step": 11596 + }, + { + "epoch": 3.0593589236248517, + "grad_norm": 0.0043172528967261314, + "learning_rate": 1.5427767519563882e-05, + "loss": 0.0004, + "step": 11598 + }, + { + "epoch": 3.059886558501517, + "grad_norm": 0.004348721820861101, + "learning_rate": 1.5426595152261205e-05, + "loss": 0.0001, + "step": 11600 + }, + { + "epoch": 3.060414193378182, + "grad_norm": 0.1319580376148224, + "learning_rate": 1.5425422784958527e-05, + "loss": 0.0073, + "step": 11602 + }, + { + "epoch": 3.0609418282548475, + "grad_norm": 0.01119422260671854, + "learning_rate": 1.5424250417655853e-05, + "loss": 0.0002, + "step": 11604 + }, + { + "epoch": 3.061469463131513, + "grad_norm": 0.00403070542961359, + "learning_rate": 1.5423078050353176e-05, + "loss": 0.0003, + "step": 11606 + }, + { + "epoch": 3.0619970980081783, + "grad_norm": 0.010463337413966656, + "learning_rate": 1.5421905683050502e-05, + "loss": 0.0002, + "step": 11608 + }, + { + "epoch": 3.0625247328848437, + "grad_norm": 0.008040067739784718, + "learning_rate": 1.5420733315747824e-05, + "loss": 0.0002, + "step": 11610 + }, + { + "epoch": 3.063052367761509, + "grad_norm": 0.06578750163316727, + "learning_rate": 1.541956094844515e-05, + "loss": 0.0005, + "step": 11612 + }, + { + "epoch": 3.0635800026381745, + "grad_norm": 0.13349579274654388, + "learning_rate": 1.5418388581142473e-05, + "loss": 0.0011, + "step": 11614 + }, + { + "epoch": 3.06410763751484, + "grad_norm": 0.08762696385383606, + "learning_rate": 1.5417216213839796e-05, + "loss": 0.0006, + "step": 11616 + }, + { + "epoch": 3.064635272391505, + "grad_norm": 0.06276495009660721, + "learning_rate": 1.541604384653712e-05, + "loss": 0.0034, + "step": 11618 + }, + { + "epoch": 3.0651629072681703, + "grad_norm": 0.09590120613574982, + "learning_rate": 1.5414871479234444e-05, + "loss": 0.0026, + "step": 11620 + }, + { + "epoch": 3.0656905421448357, + "grad_norm": 0.0038352380506694317, + "learning_rate": 1.541369911193177e-05, + "loss": 0.0002, + "step": 11622 + }, + { + "epoch": 3.066218177021501, + "grad_norm": 0.007045881822705269, + "learning_rate": 1.5412526744629093e-05, + "loss": 0.0016, + "step": 11624 + }, + { + "epoch": 3.0667458118981665, + "grad_norm": 0.5029280781745911, + "learning_rate": 1.541135437732642e-05, + "loss": 0.0038, + "step": 11626 + }, + { + "epoch": 3.067273446774832, + "grad_norm": 0.7557677626609802, + "learning_rate": 1.541018201002374e-05, + "loss": 0.0061, + "step": 11628 + }, + { + "epoch": 3.0678010816514973, + "grad_norm": 0.006119227036833763, + "learning_rate": 1.5409009642721068e-05, + "loss": 0.0001, + "step": 11630 + }, + { + "epoch": 3.0683287165281623, + "grad_norm": 0.03954951837658882, + "learning_rate": 1.540783727541839e-05, + "loss": 0.0022, + "step": 11632 + }, + { + "epoch": 3.0688563514048277, + "grad_norm": 0.027077922597527504, + "learning_rate": 1.5406664908115713e-05, + "loss": 0.0002, + "step": 11634 + }, + { + "epoch": 3.069383986281493, + "grad_norm": 0.002893306314945221, + "learning_rate": 1.540549254081304e-05, + "loss": 0.0026, + "step": 11636 + }, + { + "epoch": 3.0699116211581585, + "grad_norm": 0.02394948899745941, + "learning_rate": 1.5404320173510365e-05, + "loss": 0.0004, + "step": 11638 + }, + { + "epoch": 3.070439256034824, + "grad_norm": 0.002354920608922839, + "learning_rate": 1.5403147806207684e-05, + "loss": 0.0061, + "step": 11640 + }, + { + "epoch": 3.0709668909114893, + "grad_norm": 0.0028961573261767626, + "learning_rate": 1.540197543890501e-05, + "loss": 0.0001, + "step": 11642 + }, + { + "epoch": 3.0714945257881547, + "grad_norm": 0.037267912179231644, + "learning_rate": 1.5400803071602336e-05, + "loss": 0.0003, + "step": 11644 + }, + { + "epoch": 3.07202216066482, + "grad_norm": 0.006822964176535606, + "learning_rate": 1.539963070429966e-05, + "loss": 0.0018, + "step": 11646 + }, + { + "epoch": 3.072549795541485, + "grad_norm": 0.0014879250666126609, + "learning_rate": 1.539845833699698e-05, + "loss": 0.0001, + "step": 11648 + }, + { + "epoch": 3.0730774304181505, + "grad_norm": 0.12266230583190918, + "learning_rate": 1.5397285969694307e-05, + "loss": 0.0005, + "step": 11650 + }, + { + "epoch": 3.073605065294816, + "grad_norm": 0.035257309675216675, + "learning_rate": 1.539611360239163e-05, + "loss": 0.0057, + "step": 11652 + }, + { + "epoch": 3.0741327001714813, + "grad_norm": 0.001422407804057002, + "learning_rate": 1.5394941235088952e-05, + "loss": 0.0002, + "step": 11654 + }, + { + "epoch": 3.0746603350481467, + "grad_norm": 0.007696301210671663, + "learning_rate": 1.539376886778628e-05, + "loss": 0.0002, + "step": 11656 + }, + { + "epoch": 3.075187969924812, + "grad_norm": 0.007886717095971107, + "learning_rate": 1.5392596500483604e-05, + "loss": 0.0002, + "step": 11658 + }, + { + "epoch": 3.0757156048014775, + "grad_norm": 0.3243177533149719, + "learning_rate": 1.5391424133180927e-05, + "loss": 0.0064, + "step": 11660 + }, + { + "epoch": 3.076243239678143, + "grad_norm": 0.00961129367351532, + "learning_rate": 1.539025176587825e-05, + "loss": 0.002, + "step": 11662 + }, + { + "epoch": 3.076770874554808, + "grad_norm": 0.057567697018384933, + "learning_rate": 1.5389079398575576e-05, + "loss": 0.0005, + "step": 11664 + }, + { + "epoch": 3.0772985094314733, + "grad_norm": 0.015755487605929375, + "learning_rate": 1.5387907031272898e-05, + "loss": 0.0003, + "step": 11666 + }, + { + "epoch": 3.0778261443081387, + "grad_norm": 0.3144618570804596, + "learning_rate": 1.5386734663970224e-05, + "loss": 0.0038, + "step": 11668 + }, + { + "epoch": 3.078353779184804, + "grad_norm": 0.001117972657084465, + "learning_rate": 1.5385562296667547e-05, + "loss": 0.0001, + "step": 11670 + }, + { + "epoch": 3.0788814140614695, + "grad_norm": 0.01160969864577055, + "learning_rate": 1.5384389929364873e-05, + "loss": 0.0083, + "step": 11672 + }, + { + "epoch": 3.079409048938135, + "grad_norm": 0.04004651680588722, + "learning_rate": 1.5383217562062195e-05, + "loss": 0.0003, + "step": 11674 + }, + { + "epoch": 3.0799366838148003, + "grad_norm": 0.01592238061130047, + "learning_rate": 1.5382045194759518e-05, + "loss": 0.0002, + "step": 11676 + }, + { + "epoch": 3.0804643186914653, + "grad_norm": 0.006149986293166876, + "learning_rate": 1.5380872827456844e-05, + "loss": 0.0004, + "step": 11678 + }, + { + "epoch": 3.0809919535681307, + "grad_norm": 0.03429195284843445, + "learning_rate": 1.5379700460154167e-05, + "loss": 0.001, + "step": 11680 + }, + { + "epoch": 3.081519588444796, + "grad_norm": 0.2062884271144867, + "learning_rate": 1.5378528092851493e-05, + "loss": 0.0099, + "step": 11682 + }, + { + "epoch": 3.0820472233214615, + "grad_norm": 0.005498053506016731, + "learning_rate": 1.5377355725548815e-05, + "loss": 0.0007, + "step": 11684 + }, + { + "epoch": 3.082574858198127, + "grad_norm": 0.031737230718135834, + "learning_rate": 1.5376183358246138e-05, + "loss": 0.0004, + "step": 11686 + }, + { + "epoch": 3.0831024930747923, + "grad_norm": 0.03331172093749046, + "learning_rate": 1.5375010990943464e-05, + "loss": 0.0005, + "step": 11688 + }, + { + "epoch": 3.0836301279514577, + "grad_norm": 0.02962432987987995, + "learning_rate": 1.537383862364079e-05, + "loss": 0.0041, + "step": 11690 + }, + { + "epoch": 3.084157762828123, + "grad_norm": 0.012639682739973068, + "learning_rate": 1.5372666256338112e-05, + "loss": 0.0004, + "step": 11692 + }, + { + "epoch": 3.084685397704788, + "grad_norm": 0.011564895510673523, + "learning_rate": 1.5371493889035435e-05, + "loss": 0.0002, + "step": 11694 + }, + { + "epoch": 3.0852130325814535, + "grad_norm": 0.01220796536654234, + "learning_rate": 1.537032152173276e-05, + "loss": 0.0002, + "step": 11696 + }, + { + "epoch": 3.085740667458119, + "grad_norm": 0.004776576533913612, + "learning_rate": 1.5369149154430084e-05, + "loss": 0.0001, + "step": 11698 + }, + { + "epoch": 3.0862683023347843, + "grad_norm": 0.01065440196543932, + "learning_rate": 1.5367976787127406e-05, + "loss": 0.0002, + "step": 11700 + }, + { + "epoch": 3.0867959372114497, + "grad_norm": 0.029106101021170616, + "learning_rate": 1.5366804419824732e-05, + "loss": 0.0005, + "step": 11702 + }, + { + "epoch": 3.087323572088115, + "grad_norm": 0.015952477231621742, + "learning_rate": 1.5365632052522058e-05, + "loss": 0.0003, + "step": 11704 + }, + { + "epoch": 3.0878512069647805, + "grad_norm": 0.007320077158510685, + "learning_rate": 1.536445968521938e-05, + "loss": 0.0002, + "step": 11706 + }, + { + "epoch": 3.0883788418414455, + "grad_norm": 0.0011458293301984668, + "learning_rate": 1.5363287317916704e-05, + "loss": 0.0001, + "step": 11708 + }, + { + "epoch": 3.088906476718111, + "grad_norm": 0.0015402563149109483, + "learning_rate": 1.536211495061403e-05, + "loss": 0.0002, + "step": 11710 + }, + { + "epoch": 3.0894341115947763, + "grad_norm": 0.32783427834510803, + "learning_rate": 1.5360942583311352e-05, + "loss": 0.0009, + "step": 11712 + }, + { + "epoch": 3.0899617464714417, + "grad_norm": 0.03682337701320648, + "learning_rate": 1.5359770216008675e-05, + "loss": 0.0003, + "step": 11714 + }, + { + "epoch": 3.090489381348107, + "grad_norm": 0.0020456979982554913, + "learning_rate": 1.5358597848706e-05, + "loss": 0.0001, + "step": 11716 + }, + { + "epoch": 3.0910170162247725, + "grad_norm": 0.0005837947246618569, + "learning_rate": 1.5357425481403327e-05, + "loss": 0.0003, + "step": 11718 + }, + { + "epoch": 3.091544651101438, + "grad_norm": 0.002236252650618553, + "learning_rate": 1.535625311410065e-05, + "loss": 0.0001, + "step": 11720 + }, + { + "epoch": 3.0920722859781034, + "grad_norm": 0.0011007218854501843, + "learning_rate": 1.5355080746797972e-05, + "loss": 0.0006, + "step": 11722 + }, + { + "epoch": 3.0925999208547683, + "grad_norm": 0.0008354330202564597, + "learning_rate": 1.5353908379495298e-05, + "loss": 0.0001, + "step": 11724 + }, + { + "epoch": 3.0931275557314337, + "grad_norm": 0.0027659162878990173, + "learning_rate": 1.535273601219262e-05, + "loss": 0.0001, + "step": 11726 + }, + { + "epoch": 3.093655190608099, + "grad_norm": 0.004651054739952087, + "learning_rate": 1.5351563644889947e-05, + "loss": 0.0001, + "step": 11728 + }, + { + "epoch": 3.0941828254847645, + "grad_norm": 0.015414467081427574, + "learning_rate": 1.535039127758727e-05, + "loss": 0.0002, + "step": 11730 + }, + { + "epoch": 3.09471046036143, + "grad_norm": 0.325246125459671, + "learning_rate": 1.5349218910284592e-05, + "loss": 0.0059, + "step": 11732 + }, + { + "epoch": 3.0952380952380953, + "grad_norm": 0.0011983104050159454, + "learning_rate": 1.5348046542981918e-05, + "loss": 0.0001, + "step": 11734 + }, + { + "epoch": 3.0957657301147607, + "grad_norm": 0.023902354761958122, + "learning_rate": 1.534687417567924e-05, + "loss": 0.0003, + "step": 11736 + }, + { + "epoch": 3.0962933649914257, + "grad_norm": 0.010586334392428398, + "learning_rate": 1.5345701808376566e-05, + "loss": 0.0031, + "step": 11738 + }, + { + "epoch": 3.096820999868091, + "grad_norm": 0.018998868763446808, + "learning_rate": 1.534452944107389e-05, + "loss": 0.001, + "step": 11740 + }, + { + "epoch": 3.0973486347447565, + "grad_norm": 1.3149949312210083, + "learning_rate": 1.5343357073771215e-05, + "loss": 0.0031, + "step": 11742 + }, + { + "epoch": 3.097876269621422, + "grad_norm": 0.024523666128516197, + "learning_rate": 1.5342184706468538e-05, + "loss": 0.0004, + "step": 11744 + }, + { + "epoch": 3.0984039044980873, + "grad_norm": 0.007934467867016792, + "learning_rate": 1.534101233916586e-05, + "loss": 0.0003, + "step": 11746 + }, + { + "epoch": 3.0989315393747527, + "grad_norm": 0.003031771630048752, + "learning_rate": 1.5339839971863186e-05, + "loss": 0.0001, + "step": 11748 + }, + { + "epoch": 3.099459174251418, + "grad_norm": 0.0018329420126974583, + "learning_rate": 1.5338667604560512e-05, + "loss": 0.0001, + "step": 11750 + }, + { + "epoch": 3.0999868091280836, + "grad_norm": 0.004394008778035641, + "learning_rate": 1.5337495237257835e-05, + "loss": 0.0001, + "step": 11752 + }, + { + "epoch": 3.1005144440047485, + "grad_norm": 0.002556506311520934, + "learning_rate": 1.5336322869955157e-05, + "loss": 0.0001, + "step": 11754 + }, + { + "epoch": 3.101042078881414, + "grad_norm": 0.0038725915364921093, + "learning_rate": 1.5335150502652483e-05, + "loss": 0.0005, + "step": 11756 + }, + { + "epoch": 3.1015697137580793, + "grad_norm": 0.01065925881266594, + "learning_rate": 1.5333978135349806e-05, + "loss": 0.0002, + "step": 11758 + }, + { + "epoch": 3.1020973486347447, + "grad_norm": 0.10335531085729599, + "learning_rate": 1.533280576804713e-05, + "loss": 0.0003, + "step": 11760 + }, + { + "epoch": 3.10262498351141, + "grad_norm": 0.0017042135586962104, + "learning_rate": 1.5331633400744455e-05, + "loss": 0.0001, + "step": 11762 + }, + { + "epoch": 3.1031526183880755, + "grad_norm": 0.009348143823444843, + "learning_rate": 1.533046103344178e-05, + "loss": 0.0002, + "step": 11764 + }, + { + "epoch": 3.103680253264741, + "grad_norm": 0.001547979423776269, + "learning_rate": 1.5329288666139103e-05, + "loss": 0.0001, + "step": 11766 + }, + { + "epoch": 3.1042078881414064, + "grad_norm": 0.1131911426782608, + "learning_rate": 1.5328116298836426e-05, + "loss": 0.0017, + "step": 11768 + }, + { + "epoch": 3.1047355230180713, + "grad_norm": 0.03236907348036766, + "learning_rate": 1.5326943931533752e-05, + "loss": 0.0017, + "step": 11770 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 0.011050299741327763, + "learning_rate": 1.5325771564231074e-05, + "loss": 0.0001, + "step": 11772 + }, + { + "epoch": 3.105790792771402, + "grad_norm": 0.005555733572691679, + "learning_rate": 1.5324599196928397e-05, + "loss": 0.0001, + "step": 11774 + }, + { + "epoch": 3.1063184276480675, + "grad_norm": 0.05601205676794052, + "learning_rate": 1.5323426829625723e-05, + "loss": 0.0007, + "step": 11776 + }, + { + "epoch": 3.106846062524733, + "grad_norm": 0.0033258546609431505, + "learning_rate": 1.5322254462323046e-05, + "loss": 0.0001, + "step": 11778 + }, + { + "epoch": 3.1073736974013983, + "grad_norm": 0.005336197093129158, + "learning_rate": 1.532108209502037e-05, + "loss": 0.0001, + "step": 11780 + }, + { + "epoch": 3.1079013322780638, + "grad_norm": 0.1893468052148819, + "learning_rate": 1.5319909727717694e-05, + "loss": 0.0017, + "step": 11782 + }, + { + "epoch": 3.1084289671547287, + "grad_norm": 0.001791690243408084, + "learning_rate": 1.531873736041502e-05, + "loss": 0.0001, + "step": 11784 + }, + { + "epoch": 3.108956602031394, + "grad_norm": 0.007714635226875544, + "learning_rate": 1.5317564993112343e-05, + "loss": 0.0002, + "step": 11786 + }, + { + "epoch": 3.1094842369080595, + "grad_norm": 0.0010252463398501277, + "learning_rate": 1.531639262580967e-05, + "loss": 0.0001, + "step": 11788 + }, + { + "epoch": 3.110011871784725, + "grad_norm": 0.001679701148532331, + "learning_rate": 1.531522025850699e-05, + "loss": 0.0017, + "step": 11790 + }, + { + "epoch": 3.1105395066613903, + "grad_norm": 0.0017678564181551337, + "learning_rate": 1.5314047891204314e-05, + "loss": 0.0001, + "step": 11792 + }, + { + "epoch": 3.1110671415380557, + "grad_norm": 0.001329536666162312, + "learning_rate": 1.531287552390164e-05, + "loss": 0.0001, + "step": 11794 + }, + { + "epoch": 3.111594776414721, + "grad_norm": 0.1822463423013687, + "learning_rate": 1.5311703156598963e-05, + "loss": 0.0079, + "step": 11796 + }, + { + "epoch": 3.1121224112913866, + "grad_norm": 0.0009631540742702782, + "learning_rate": 1.531053078929629e-05, + "loss": 0.0001, + "step": 11798 + }, + { + "epoch": 3.1126500461680515, + "grad_norm": 0.001030605984851718, + "learning_rate": 1.530935842199361e-05, + "loss": 0.0011, + "step": 11800 + }, + { + "epoch": 3.113177681044717, + "grad_norm": 0.001085309311747551, + "learning_rate": 1.5308186054690937e-05, + "loss": 0.0007, + "step": 11802 + }, + { + "epoch": 3.1137053159213823, + "grad_norm": 0.0013074937742203474, + "learning_rate": 1.530701368738826e-05, + "loss": 0.0001, + "step": 11804 + }, + { + "epoch": 3.1142329507980477, + "grad_norm": 0.006058864761143923, + "learning_rate": 1.5305841320085583e-05, + "loss": 0.0001, + "step": 11806 + }, + { + "epoch": 3.114760585674713, + "grad_norm": 0.0030822535045444965, + "learning_rate": 1.530466895278291e-05, + "loss": 0.0128, + "step": 11808 + }, + { + "epoch": 3.1152882205513786, + "grad_norm": 0.0034193464089185, + "learning_rate": 1.5303496585480235e-05, + "loss": 0.0001, + "step": 11810 + }, + { + "epoch": 3.115815855428044, + "grad_norm": 0.015667103230953217, + "learning_rate": 1.5302324218177557e-05, + "loss": 0.0002, + "step": 11812 + }, + { + "epoch": 3.1163434903047094, + "grad_norm": 0.030205370858311653, + "learning_rate": 1.530115185087488e-05, + "loss": 0.002, + "step": 11814 + }, + { + "epoch": 3.1168711251813743, + "grad_norm": 0.34952518343925476, + "learning_rate": 1.5299979483572206e-05, + "loss": 0.0016, + "step": 11816 + }, + { + "epoch": 3.1173987600580397, + "grad_norm": 0.027052605524659157, + "learning_rate": 1.529880711626953e-05, + "loss": 0.0003, + "step": 11818 + }, + { + "epoch": 3.117926394934705, + "grad_norm": 0.010953436605632305, + "learning_rate": 1.529763474896685e-05, + "loss": 0.0001, + "step": 11820 + }, + { + "epoch": 3.1184540298113705, + "grad_norm": 0.01681903935968876, + "learning_rate": 1.5296462381664177e-05, + "loss": 0.0002, + "step": 11822 + }, + { + "epoch": 3.118981664688036, + "grad_norm": 0.007377699017524719, + "learning_rate": 1.5295290014361503e-05, + "loss": 0.0048, + "step": 11824 + }, + { + "epoch": 3.1195092995647014, + "grad_norm": 0.0355173759162426, + "learning_rate": 1.5294117647058822e-05, + "loss": 0.0015, + "step": 11826 + }, + { + "epoch": 3.1200369344413668, + "grad_norm": 0.28742384910583496, + "learning_rate": 1.5292945279756148e-05, + "loss": 0.0026, + "step": 11828 + }, + { + "epoch": 3.1205645693180317, + "grad_norm": 0.00792448129504919, + "learning_rate": 1.5291772912453474e-05, + "loss": 0.0002, + "step": 11830 + }, + { + "epoch": 3.121092204194697, + "grad_norm": 0.005995727144181728, + "learning_rate": 1.5290600545150797e-05, + "loss": 0.0001, + "step": 11832 + }, + { + "epoch": 3.1216198390713625, + "grad_norm": 0.2250320315361023, + "learning_rate": 1.528942817784812e-05, + "loss": 0.0139, + "step": 11834 + }, + { + "epoch": 3.122147473948028, + "grad_norm": 0.06941168010234833, + "learning_rate": 1.5288255810545445e-05, + "loss": 0.0006, + "step": 11836 + }, + { + "epoch": 3.1226751088246933, + "grad_norm": 0.012062967754900455, + "learning_rate": 1.5287083443242768e-05, + "loss": 0.0002, + "step": 11838 + }, + { + "epoch": 3.1232027437013588, + "grad_norm": 0.04815351963043213, + "learning_rate": 1.5285911075940094e-05, + "loss": 0.0006, + "step": 11840 + }, + { + "epoch": 3.123730378578024, + "grad_norm": 0.05840311199426651, + "learning_rate": 1.5284738708637417e-05, + "loss": 0.0002, + "step": 11842 + }, + { + "epoch": 3.1242580134546896, + "grad_norm": 0.18326854705810547, + "learning_rate": 1.5283566341334743e-05, + "loss": 0.0037, + "step": 11844 + }, + { + "epoch": 3.1247856483313545, + "grad_norm": 0.005397849716246128, + "learning_rate": 1.5282393974032065e-05, + "loss": 0.0002, + "step": 11846 + }, + { + "epoch": 3.12531328320802, + "grad_norm": 0.4723694622516632, + "learning_rate": 1.528122160672939e-05, + "loss": 0.0027, + "step": 11848 + }, + { + "epoch": 3.1258409180846853, + "grad_norm": 0.013642877340316772, + "learning_rate": 1.5280049239426714e-05, + "loss": 0.0002, + "step": 11850 + }, + { + "epoch": 3.1263685529613507, + "grad_norm": 0.0055252849124372005, + "learning_rate": 1.5278876872124036e-05, + "loss": 0.0001, + "step": 11852 + }, + { + "epoch": 3.126896187838016, + "grad_norm": 0.010852250270545483, + "learning_rate": 1.5277704504821362e-05, + "loss": 0.0018, + "step": 11854 + }, + { + "epoch": 3.1274238227146816, + "grad_norm": 0.0354408361017704, + "learning_rate": 1.5276532137518685e-05, + "loss": 0.0004, + "step": 11856 + }, + { + "epoch": 3.127951457591347, + "grad_norm": 0.10583585500717163, + "learning_rate": 1.527535977021601e-05, + "loss": 0.0012, + "step": 11858 + }, + { + "epoch": 3.1284790924680124, + "grad_norm": 0.004090205766260624, + "learning_rate": 1.5274187402913334e-05, + "loss": 0.0001, + "step": 11860 + }, + { + "epoch": 3.1290067273446773, + "grad_norm": 0.008335566148161888, + "learning_rate": 1.527301503561066e-05, + "loss": 0.0002, + "step": 11862 + }, + { + "epoch": 3.1295343622213427, + "grad_norm": 0.0030485244933515787, + "learning_rate": 1.5271842668307982e-05, + "loss": 0.0003, + "step": 11864 + }, + { + "epoch": 3.130061997098008, + "grad_norm": 0.01130819134414196, + "learning_rate": 1.5270670301005305e-05, + "loss": 0.0001, + "step": 11866 + }, + { + "epoch": 3.1305896319746735, + "grad_norm": 0.006212105508893728, + "learning_rate": 1.526949793370263e-05, + "loss": 0.0012, + "step": 11868 + }, + { + "epoch": 3.131117266851339, + "grad_norm": 0.6454027891159058, + "learning_rate": 1.5268325566399957e-05, + "loss": 0.0026, + "step": 11870 + }, + { + "epoch": 3.1316449017280044, + "grad_norm": 0.0344306156039238, + "learning_rate": 1.5267153199097276e-05, + "loss": 0.0001, + "step": 11872 + }, + { + "epoch": 3.1321725366046698, + "grad_norm": 0.004505176097154617, + "learning_rate": 1.5265980831794602e-05, + "loss": 0.0001, + "step": 11874 + }, + { + "epoch": 3.1327001714813347, + "grad_norm": 0.015104983001947403, + "learning_rate": 1.5264808464491928e-05, + "loss": 0.0001, + "step": 11876 + }, + { + "epoch": 3.133227806358, + "grad_norm": 0.11799124628305435, + "learning_rate": 1.526363609718925e-05, + "loss": 0.0047, + "step": 11878 + }, + { + "epoch": 3.1337554412346655, + "grad_norm": 0.16081315279006958, + "learning_rate": 1.5262463729886573e-05, + "loss": 0.0095, + "step": 11880 + }, + { + "epoch": 3.134283076111331, + "grad_norm": 0.0023301290348172188, + "learning_rate": 1.52612913625839e-05, + "loss": 0.0001, + "step": 11882 + }, + { + "epoch": 3.1348107109879964, + "grad_norm": 0.07921656221151352, + "learning_rate": 1.5260118995281222e-05, + "loss": 0.003, + "step": 11884 + }, + { + "epoch": 3.1353383458646618, + "grad_norm": 0.002433034125715494, + "learning_rate": 1.5258946627978545e-05, + "loss": 0.0002, + "step": 11886 + }, + { + "epoch": 3.135865980741327, + "grad_norm": 0.003261640900745988, + "learning_rate": 1.525777426067587e-05, + "loss": 0.0001, + "step": 11888 + }, + { + "epoch": 3.136393615617992, + "grad_norm": 0.012253541499376297, + "learning_rate": 1.5256601893373195e-05, + "loss": 0.0002, + "step": 11890 + }, + { + "epoch": 3.1369212504946575, + "grad_norm": 0.005106658674776554, + "learning_rate": 1.5255429526070519e-05, + "loss": 0.0041, + "step": 11892 + }, + { + "epoch": 3.137448885371323, + "grad_norm": 0.09745527803897858, + "learning_rate": 1.5254257158767842e-05, + "loss": 0.0006, + "step": 11894 + }, + { + "epoch": 3.1379765202479883, + "grad_norm": 0.021479250863194466, + "learning_rate": 1.5253084791465168e-05, + "loss": 0.0002, + "step": 11896 + }, + { + "epoch": 3.1385041551246537, + "grad_norm": 0.0159311480820179, + "learning_rate": 1.5251912424162492e-05, + "loss": 0.0002, + "step": 11898 + }, + { + "epoch": 3.139031790001319, + "grad_norm": 0.003550906665623188, + "learning_rate": 1.5250740056859816e-05, + "loss": 0.0002, + "step": 11900 + }, + { + "epoch": 3.1395594248779846, + "grad_norm": 0.14598795771598816, + "learning_rate": 1.5249567689557139e-05, + "loss": 0.0048, + "step": 11902 + }, + { + "epoch": 3.14008705975465, + "grad_norm": 0.2873014211654663, + "learning_rate": 1.5248395322254463e-05, + "loss": 0.0009, + "step": 11904 + }, + { + "epoch": 3.140614694631315, + "grad_norm": 0.0024331710301339626, + "learning_rate": 1.5247222954951788e-05, + "loss": 0.0001, + "step": 11906 + }, + { + "epoch": 3.1411423295079803, + "grad_norm": 0.047581322491168976, + "learning_rate": 1.5246050587649114e-05, + "loss": 0.0003, + "step": 11908 + }, + { + "epoch": 3.1416699643846457, + "grad_norm": 0.21058610081672668, + "learning_rate": 1.5244878220346434e-05, + "loss": 0.0112, + "step": 11910 + }, + { + "epoch": 3.142197599261311, + "grad_norm": 0.17583981156349182, + "learning_rate": 1.524370585304376e-05, + "loss": 0.0018, + "step": 11912 + }, + { + "epoch": 3.1427252341379766, + "grad_norm": 0.22496606409549713, + "learning_rate": 1.5242533485741085e-05, + "loss": 0.0016, + "step": 11914 + }, + { + "epoch": 3.143252869014642, + "grad_norm": 0.0014363236259669065, + "learning_rate": 1.5241361118438407e-05, + "loss": 0.0001, + "step": 11916 + }, + { + "epoch": 3.1437805038913074, + "grad_norm": 0.013002214953303337, + "learning_rate": 1.5240188751135732e-05, + "loss": 0.0002, + "step": 11918 + }, + { + "epoch": 3.1443081387679728, + "grad_norm": 0.003435534192249179, + "learning_rate": 1.5239016383833056e-05, + "loss": 0.0002, + "step": 11920 + }, + { + "epoch": 3.1448357736446377, + "grad_norm": 0.005466964095830917, + "learning_rate": 1.523784401653038e-05, + "loss": 0.0002, + "step": 11922 + }, + { + "epoch": 3.145363408521303, + "grad_norm": 0.058066703379154205, + "learning_rate": 1.5236671649227703e-05, + "loss": 0.0003, + "step": 11924 + }, + { + "epoch": 3.1458910433979685, + "grad_norm": 0.019075877964496613, + "learning_rate": 1.5235499281925027e-05, + "loss": 0.0007, + "step": 11926 + }, + { + "epoch": 3.146418678274634, + "grad_norm": 0.004056169185787439, + "learning_rate": 1.5234326914622353e-05, + "loss": 0.0002, + "step": 11928 + }, + { + "epoch": 3.1469463131512994, + "grad_norm": 0.0047959680669009686, + "learning_rate": 1.5233154547319678e-05, + "loss": 0.0001, + "step": 11930 + }, + { + "epoch": 3.1474739480279648, + "grad_norm": 0.06213916093111038, + "learning_rate": 1.5231982180017e-05, + "loss": 0.0003, + "step": 11932 + }, + { + "epoch": 3.14800158290463, + "grad_norm": 0.010586760006844997, + "learning_rate": 1.5230809812714324e-05, + "loss": 0.0002, + "step": 11934 + }, + { + "epoch": 3.148529217781295, + "grad_norm": 0.0036941145081073046, + "learning_rate": 1.5229637445411649e-05, + "loss": 0.0002, + "step": 11936 + }, + { + "epoch": 3.1490568526579605, + "grad_norm": 0.08355434983968735, + "learning_rate": 1.5228465078108973e-05, + "loss": 0.0004, + "step": 11938 + }, + { + "epoch": 3.149584487534626, + "grad_norm": 0.0016524738166481256, + "learning_rate": 1.5227292710806296e-05, + "loss": 0.0001, + "step": 11940 + }, + { + "epoch": 3.1501121224112913, + "grad_norm": 0.0027809597086161375, + "learning_rate": 1.5226120343503622e-05, + "loss": 0.0001, + "step": 11942 + }, + { + "epoch": 3.1506397572879568, + "grad_norm": 0.0028866317588835955, + "learning_rate": 1.5224947976200946e-05, + "loss": 0.0004, + "step": 11944 + }, + { + "epoch": 3.151167392164622, + "grad_norm": 0.0028033247217535973, + "learning_rate": 1.5223775608898269e-05, + "loss": 0.0001, + "step": 11946 + }, + { + "epoch": 3.1516950270412876, + "grad_norm": 0.001166435074992478, + "learning_rate": 1.5222603241595593e-05, + "loss": 0.0001, + "step": 11948 + }, + { + "epoch": 3.152222661917953, + "grad_norm": 0.0057763271033763885, + "learning_rate": 1.5221430874292917e-05, + "loss": 0.0042, + "step": 11950 + }, + { + "epoch": 3.152750296794618, + "grad_norm": 0.0014458176447078586, + "learning_rate": 1.5220258506990241e-05, + "loss": 0.0001, + "step": 11952 + }, + { + "epoch": 3.1532779316712833, + "grad_norm": 0.011435562744736671, + "learning_rate": 1.5219086139687564e-05, + "loss": 0.0001, + "step": 11954 + }, + { + "epoch": 3.1538055665479487, + "grad_norm": 0.01526802871376276, + "learning_rate": 1.5217913772384888e-05, + "loss": 0.0055, + "step": 11956 + }, + { + "epoch": 3.154333201424614, + "grad_norm": 0.00219574267975986, + "learning_rate": 1.5216741405082214e-05, + "loss": 0.0001, + "step": 11958 + }, + { + "epoch": 3.1548608363012796, + "grad_norm": 0.0033928437624126673, + "learning_rate": 1.5215569037779539e-05, + "loss": 0.0001, + "step": 11960 + }, + { + "epoch": 3.155388471177945, + "grad_norm": 0.002702942118048668, + "learning_rate": 1.5214396670476861e-05, + "loss": 0.0001, + "step": 11962 + }, + { + "epoch": 3.1559161060546104, + "grad_norm": 0.0022177088540047407, + "learning_rate": 1.5213224303174186e-05, + "loss": 0.0002, + "step": 11964 + }, + { + "epoch": 3.1564437409312758, + "grad_norm": 0.04372172802686691, + "learning_rate": 1.521205193587151e-05, + "loss": 0.0002, + "step": 11966 + }, + { + "epoch": 3.1569713758079407, + "grad_norm": 0.00401318771764636, + "learning_rate": 1.5210879568568834e-05, + "loss": 0.0033, + "step": 11968 + }, + { + "epoch": 3.157499010684606, + "grad_norm": 0.0022605787962675095, + "learning_rate": 1.5209707201266157e-05, + "loss": 0.0001, + "step": 11970 + }, + { + "epoch": 3.1580266455612716, + "grad_norm": 0.0019305865280330181, + "learning_rate": 1.5208534833963481e-05, + "loss": 0.0006, + "step": 11972 + }, + { + "epoch": 3.158554280437937, + "grad_norm": 0.010764563456177711, + "learning_rate": 1.5207362466660807e-05, + "loss": 0.0022, + "step": 11974 + }, + { + "epoch": 3.1590819153146024, + "grad_norm": 0.006860312074422836, + "learning_rate": 1.520619009935813e-05, + "loss": 0.0006, + "step": 11976 + }, + { + "epoch": 3.1596095501912678, + "grad_norm": 0.0035384141374379396, + "learning_rate": 1.5205017732055454e-05, + "loss": 0.0001, + "step": 11978 + }, + { + "epoch": 3.160137185067933, + "grad_norm": 0.0021234164014458656, + "learning_rate": 1.5203845364752778e-05, + "loss": 0.0001, + "step": 11980 + }, + { + "epoch": 3.160664819944598, + "grad_norm": 0.0026004472747445107, + "learning_rate": 1.5202672997450103e-05, + "loss": 0.0001, + "step": 11982 + }, + { + "epoch": 3.1611924548212635, + "grad_norm": 0.010281438007950783, + "learning_rate": 1.5201500630147425e-05, + "loss": 0.0002, + "step": 11984 + }, + { + "epoch": 3.161720089697929, + "grad_norm": 0.0035370506811887026, + "learning_rate": 1.520032826284475e-05, + "loss": 0.0002, + "step": 11986 + }, + { + "epoch": 3.1622477245745944, + "grad_norm": 0.27215510606765747, + "learning_rate": 1.5199155895542076e-05, + "loss": 0.0069, + "step": 11988 + }, + { + "epoch": 3.1627753594512598, + "grad_norm": 0.000987038016319275, + "learning_rate": 1.51979835282394e-05, + "loss": 0.0001, + "step": 11990 + }, + { + "epoch": 3.163302994327925, + "grad_norm": 0.18366742134094238, + "learning_rate": 1.5196811160936722e-05, + "loss": 0.0023, + "step": 11992 + }, + { + "epoch": 3.1638306292045906, + "grad_norm": 0.0042994278483092785, + "learning_rate": 1.5195638793634047e-05, + "loss": 0.0001, + "step": 11994 + }, + { + "epoch": 3.164358264081256, + "grad_norm": 0.016926344484090805, + "learning_rate": 1.5194466426331371e-05, + "loss": 0.0002, + "step": 11996 + }, + { + "epoch": 3.164885898957921, + "grad_norm": 0.024253925308585167, + "learning_rate": 1.5193294059028695e-05, + "loss": 0.0002, + "step": 11998 + }, + { + "epoch": 3.1654135338345863, + "grad_norm": 0.04582114890217781, + "learning_rate": 1.5192121691726018e-05, + "loss": 0.0004, + "step": 12000 + }, + { + "epoch": 3.1654135338345863, + "eval_loss": 0.002039920538663864, + "eval_runtime": 308.5259, + "eval_samples_per_second": 698.94, + "eval_steps_per_second": 87.37, + "step": 12000 + }, + { + "epoch": 3.1659411687112518, + "grad_norm": 0.02741740271449089, + "learning_rate": 1.5190949324423342e-05, + "loss": 0.0031, + "step": 12002 + }, + { + "epoch": 3.166468803587917, + "grad_norm": 0.03169642761349678, + "learning_rate": 1.5189776957120668e-05, + "loss": 0.0002, + "step": 12004 + }, + { + "epoch": 3.1669964384645826, + "grad_norm": 0.01121450960636139, + "learning_rate": 1.5188604589817991e-05, + "loss": 0.0011, + "step": 12006 + }, + { + "epoch": 3.167524073341248, + "grad_norm": 0.150194451212883, + "learning_rate": 1.5187432222515315e-05, + "loss": 0.0012, + "step": 12008 + }, + { + "epoch": 3.1680517082179134, + "grad_norm": 0.01531602256000042, + "learning_rate": 1.518625985521264e-05, + "loss": 0.0001, + "step": 12010 + }, + { + "epoch": 3.168579343094579, + "grad_norm": 0.0067245797254145145, + "learning_rate": 1.5185087487909964e-05, + "loss": 0.0001, + "step": 12012 + }, + { + "epoch": 3.1691069779712437, + "grad_norm": 0.014143621549010277, + "learning_rate": 1.5183915120607286e-05, + "loss": 0.0001, + "step": 12014 + }, + { + "epoch": 3.169634612847909, + "grad_norm": 0.0016880101757124066, + "learning_rate": 1.518274275330461e-05, + "loss": 0.0001, + "step": 12016 + }, + { + "epoch": 3.1701622477245746, + "grad_norm": 0.01461319625377655, + "learning_rate": 1.5181570386001937e-05, + "loss": 0.0061, + "step": 12018 + }, + { + "epoch": 3.17068988260124, + "grad_norm": 0.00572096137329936, + "learning_rate": 1.5180398018699261e-05, + "loss": 0.0006, + "step": 12020 + }, + { + "epoch": 3.1712175174779054, + "grad_norm": 0.0033124915789812803, + "learning_rate": 1.5179225651396584e-05, + "loss": 0.0001, + "step": 12022 + }, + { + "epoch": 3.1717451523545708, + "grad_norm": 0.0009871027432382107, + "learning_rate": 1.5178053284093908e-05, + "loss": 0.002, + "step": 12024 + }, + { + "epoch": 3.172272787231236, + "grad_norm": 0.0033296041656285524, + "learning_rate": 1.5176880916791232e-05, + "loss": 0.0004, + "step": 12026 + }, + { + "epoch": 3.172800422107901, + "grad_norm": 0.002510115969926119, + "learning_rate": 1.5175708549488557e-05, + "loss": 0.0003, + "step": 12028 + }, + { + "epoch": 3.1733280569845665, + "grad_norm": 0.05402478203177452, + "learning_rate": 1.517453618218588e-05, + "loss": 0.0002, + "step": 12030 + }, + { + "epoch": 3.173855691861232, + "grad_norm": 0.03085239976644516, + "learning_rate": 1.5173363814883203e-05, + "loss": 0.0005, + "step": 12032 + }, + { + "epoch": 3.1743833267378974, + "grad_norm": 0.10808102786540985, + "learning_rate": 1.517219144758053e-05, + "loss": 0.0054, + "step": 12034 + }, + { + "epoch": 3.1749109616145628, + "grad_norm": 0.006559455767273903, + "learning_rate": 1.517101908027785e-05, + "loss": 0.0002, + "step": 12036 + }, + { + "epoch": 3.175438596491228, + "grad_norm": 0.0011487703304737806, + "learning_rate": 1.5169846712975176e-05, + "loss": 0.0001, + "step": 12038 + }, + { + "epoch": 3.1759662313678936, + "grad_norm": 0.015999285504221916, + "learning_rate": 1.51686743456725e-05, + "loss": 0.0035, + "step": 12040 + }, + { + "epoch": 3.1764938662445585, + "grad_norm": 0.2489738017320633, + "learning_rate": 1.5167501978369825e-05, + "loss": 0.0092, + "step": 12042 + }, + { + "epoch": 3.177021501121224, + "grad_norm": 0.0026754143182188272, + "learning_rate": 1.5166329611067148e-05, + "loss": 0.0001, + "step": 12044 + }, + { + "epoch": 3.1775491359978894, + "grad_norm": 0.005852822680026293, + "learning_rate": 1.5165157243764472e-05, + "loss": 0.0001, + "step": 12046 + }, + { + "epoch": 3.1780767708745548, + "grad_norm": 0.532707929611206, + "learning_rate": 1.5163984876461796e-05, + "loss": 0.0059, + "step": 12048 + }, + { + "epoch": 3.17860440575122, + "grad_norm": 0.004174140281975269, + "learning_rate": 1.5162812509159122e-05, + "loss": 0.0001, + "step": 12050 + }, + { + "epoch": 3.1791320406278856, + "grad_norm": 0.33380019664764404, + "learning_rate": 1.5161640141856445e-05, + "loss": 0.0059, + "step": 12052 + }, + { + "epoch": 3.179659675504551, + "grad_norm": 0.12799042463302612, + "learning_rate": 1.5160467774553769e-05, + "loss": 0.0026, + "step": 12054 + }, + { + "epoch": 3.1801873103812164, + "grad_norm": 0.21793706715106964, + "learning_rate": 1.5159295407251093e-05, + "loss": 0.001, + "step": 12056 + }, + { + "epoch": 3.1807149452578813, + "grad_norm": 0.010548199526965618, + "learning_rate": 1.5158123039948418e-05, + "loss": 0.0007, + "step": 12058 + }, + { + "epoch": 3.1812425801345467, + "grad_norm": 0.012888855300843716, + "learning_rate": 1.515695067264574e-05, + "loss": 0.0005, + "step": 12060 + }, + { + "epoch": 3.181770215011212, + "grad_norm": 0.0034977206960320473, + "learning_rate": 1.5155778305343065e-05, + "loss": 0.0001, + "step": 12062 + }, + { + "epoch": 3.1822978498878776, + "grad_norm": 0.2514473795890808, + "learning_rate": 1.515460593804039e-05, + "loss": 0.0034, + "step": 12064 + }, + { + "epoch": 3.182825484764543, + "grad_norm": 0.017513137310743332, + "learning_rate": 1.5153433570737712e-05, + "loss": 0.0003, + "step": 12066 + }, + { + "epoch": 3.1833531196412084, + "grad_norm": 0.10942038148641586, + "learning_rate": 1.5152261203435038e-05, + "loss": 0.0007, + "step": 12068 + }, + { + "epoch": 3.183880754517874, + "grad_norm": 0.041366588324308395, + "learning_rate": 1.5151088836132362e-05, + "loss": 0.0099, + "step": 12070 + }, + { + "epoch": 3.184408389394539, + "grad_norm": 0.007316990755498409, + "learning_rate": 1.5149916468829686e-05, + "loss": 0.0007, + "step": 12072 + }, + { + "epoch": 3.184936024271204, + "grad_norm": 0.012736713513731956, + "learning_rate": 1.5148744101527009e-05, + "loss": 0.0002, + "step": 12074 + }, + { + "epoch": 3.1854636591478696, + "grad_norm": 0.007493920158594847, + "learning_rate": 1.5147571734224333e-05, + "loss": 0.0001, + "step": 12076 + }, + { + "epoch": 3.185991294024535, + "grad_norm": 0.2460278868675232, + "learning_rate": 1.5146399366921657e-05, + "loss": 0.001, + "step": 12078 + }, + { + "epoch": 3.1865189289012004, + "grad_norm": 0.20282800495624542, + "learning_rate": 1.5145226999618983e-05, + "loss": 0.0009, + "step": 12080 + }, + { + "epoch": 3.1870465637778658, + "grad_norm": 0.05235022306442261, + "learning_rate": 1.5144054632316304e-05, + "loss": 0.0033, + "step": 12082 + }, + { + "epoch": 3.187574198654531, + "grad_norm": 0.006419491954147816, + "learning_rate": 1.514288226501363e-05, + "loss": 0.0002, + "step": 12084 + }, + { + "epoch": 3.1881018335311966, + "grad_norm": 0.013754473999142647, + "learning_rate": 1.5141709897710955e-05, + "loss": 0.0002, + "step": 12086 + }, + { + "epoch": 3.1886294684078615, + "grad_norm": 0.002300675492733717, + "learning_rate": 1.5140537530408279e-05, + "loss": 0.0002, + "step": 12088 + }, + { + "epoch": 3.189157103284527, + "grad_norm": 0.0023352010175585747, + "learning_rate": 1.5139365163105601e-05, + "loss": 0.0023, + "step": 12090 + }, + { + "epoch": 3.1896847381611924, + "grad_norm": 0.0023213389795273542, + "learning_rate": 1.5138192795802926e-05, + "loss": 0.0001, + "step": 12092 + }, + { + "epoch": 3.1902123730378578, + "grad_norm": 0.014940359629690647, + "learning_rate": 1.513702042850025e-05, + "loss": 0.0002, + "step": 12094 + }, + { + "epoch": 3.190740007914523, + "grad_norm": 0.0013320365687832236, + "learning_rate": 1.5135848061197573e-05, + "loss": 0.003, + "step": 12096 + }, + { + "epoch": 3.1912676427911886, + "grad_norm": 0.004757832735776901, + "learning_rate": 1.5134675693894899e-05, + "loss": 0.0001, + "step": 12098 + }, + { + "epoch": 3.191795277667854, + "grad_norm": 0.014275246299803257, + "learning_rate": 1.5133503326592223e-05, + "loss": 0.0002, + "step": 12100 + }, + { + "epoch": 3.1923229125445194, + "grad_norm": 0.003216400509700179, + "learning_rate": 1.5132330959289547e-05, + "loss": 0.0002, + "step": 12102 + }, + { + "epoch": 3.1928505474211843, + "grad_norm": 0.21964530646800995, + "learning_rate": 1.513115859198687e-05, + "loss": 0.0033, + "step": 12104 + }, + { + "epoch": 3.1933781822978498, + "grad_norm": 0.002763207536190748, + "learning_rate": 1.5129986224684194e-05, + "loss": 0.0001, + "step": 12106 + }, + { + "epoch": 3.193905817174515, + "grad_norm": 0.04957830533385277, + "learning_rate": 1.5128813857381519e-05, + "loss": 0.0003, + "step": 12108 + }, + { + "epoch": 3.1944334520511806, + "grad_norm": 0.0699155405163765, + "learning_rate": 1.5127641490078845e-05, + "loss": 0.0002, + "step": 12110 + }, + { + "epoch": 3.194961086927846, + "grad_norm": 0.0015858011320233345, + "learning_rate": 1.5126469122776165e-05, + "loss": 0.0001, + "step": 12112 + }, + { + "epoch": 3.1954887218045114, + "grad_norm": 0.2079681009054184, + "learning_rate": 1.5125296755473491e-05, + "loss": 0.0118, + "step": 12114 + }, + { + "epoch": 3.196016356681177, + "grad_norm": 0.01839447394013405, + "learning_rate": 1.5124124388170816e-05, + "loss": 0.0002, + "step": 12116 + }, + { + "epoch": 3.196543991557842, + "grad_norm": 0.014736006036400795, + "learning_rate": 1.512295202086814e-05, + "loss": 0.0003, + "step": 12118 + }, + { + "epoch": 3.197071626434507, + "grad_norm": 0.0161014162003994, + "learning_rate": 1.5121779653565463e-05, + "loss": 0.0004, + "step": 12120 + }, + { + "epoch": 3.1975992613111726, + "grad_norm": 0.013091795146465302, + "learning_rate": 1.5120607286262787e-05, + "loss": 0.0013, + "step": 12122 + }, + { + "epoch": 3.198126896187838, + "grad_norm": 0.0391085110604763, + "learning_rate": 1.5119434918960111e-05, + "loss": 0.0002, + "step": 12124 + }, + { + "epoch": 3.1986545310645034, + "grad_norm": 0.036706581711769104, + "learning_rate": 1.5118262551657437e-05, + "loss": 0.0003, + "step": 12126 + }, + { + "epoch": 3.1991821659411688, + "grad_norm": 0.009892258793115616, + "learning_rate": 1.511709018435476e-05, + "loss": 0.0002, + "step": 12128 + }, + { + "epoch": 3.199709800817834, + "grad_norm": 0.004223641008138657, + "learning_rate": 1.5115917817052084e-05, + "loss": 0.0004, + "step": 12130 + }, + { + "epoch": 3.2002374356944996, + "grad_norm": 0.002446049591526389, + "learning_rate": 1.5114745449749408e-05, + "loss": 0.0002, + "step": 12132 + }, + { + "epoch": 3.2007650705711646, + "grad_norm": 0.005069136619567871, + "learning_rate": 1.5113573082446731e-05, + "loss": 0.0028, + "step": 12134 + }, + { + "epoch": 3.20129270544783, + "grad_norm": 0.05175362154841423, + "learning_rate": 1.5112400715144055e-05, + "loss": 0.0008, + "step": 12136 + }, + { + "epoch": 3.2018203403244954, + "grad_norm": 0.005984514951705933, + "learning_rate": 1.511122834784138e-05, + "loss": 0.0002, + "step": 12138 + }, + { + "epoch": 3.2023479752011608, + "grad_norm": 0.04686838015913963, + "learning_rate": 1.5110055980538704e-05, + "loss": 0.0003, + "step": 12140 + }, + { + "epoch": 3.202875610077826, + "grad_norm": 0.002270226599648595, + "learning_rate": 1.5108883613236027e-05, + "loss": 0.0003, + "step": 12142 + }, + { + "epoch": 3.2034032449544916, + "grad_norm": 0.1066778227686882, + "learning_rate": 1.5107711245933353e-05, + "loss": 0.0005, + "step": 12144 + }, + { + "epoch": 3.203930879831157, + "grad_norm": 0.0019624645356088877, + "learning_rate": 1.5106538878630677e-05, + "loss": 0.0001, + "step": 12146 + }, + { + "epoch": 3.2044585147078224, + "grad_norm": 0.0032647696789354086, + "learning_rate": 1.5105366511328001e-05, + "loss": 0.0002, + "step": 12148 + }, + { + "epoch": 3.2049861495844874, + "grad_norm": 0.0035716600250452757, + "learning_rate": 1.5104194144025324e-05, + "loss": 0.0001, + "step": 12150 + }, + { + "epoch": 3.2055137844611528, + "grad_norm": 0.024023395031690598, + "learning_rate": 1.5103021776722648e-05, + "loss": 0.0004, + "step": 12152 + }, + { + "epoch": 3.206041419337818, + "grad_norm": 0.030033422634005547, + "learning_rate": 1.5101849409419972e-05, + "loss": 0.0018, + "step": 12154 + }, + { + "epoch": 3.2065690542144836, + "grad_norm": 0.3583436906337738, + "learning_rate": 1.5100677042117298e-05, + "loss": 0.0041, + "step": 12156 + }, + { + "epoch": 3.207096689091149, + "grad_norm": 0.0038007202092558146, + "learning_rate": 1.509950467481462e-05, + "loss": 0.0001, + "step": 12158 + }, + { + "epoch": 3.2076243239678144, + "grad_norm": 0.0017299172468483448, + "learning_rate": 1.5098332307511945e-05, + "loss": 0.0026, + "step": 12160 + }, + { + "epoch": 3.20815195884448, + "grad_norm": 0.02666393667459488, + "learning_rate": 1.509715994020927e-05, + "loss": 0.0002, + "step": 12162 + }, + { + "epoch": 3.208679593721145, + "grad_norm": 0.008027177304029465, + "learning_rate": 1.5095987572906592e-05, + "loss": 0.0003, + "step": 12164 + }, + { + "epoch": 3.20920722859781, + "grad_norm": 0.0009763955022208393, + "learning_rate": 1.5094815205603917e-05, + "loss": 0.0002, + "step": 12166 + }, + { + "epoch": 3.2097348634744756, + "grad_norm": 0.005770031828433275, + "learning_rate": 1.5093642838301241e-05, + "loss": 0.0001, + "step": 12168 + }, + { + "epoch": 3.210262498351141, + "grad_norm": 0.002708855550736189, + "learning_rate": 1.5092470470998565e-05, + "loss": 0.0001, + "step": 12170 + }, + { + "epoch": 3.2107901332278064, + "grad_norm": 0.5139384269714355, + "learning_rate": 1.5091298103695888e-05, + "loss": 0.0023, + "step": 12172 + }, + { + "epoch": 3.211317768104472, + "grad_norm": 0.033553920686244965, + "learning_rate": 1.5090125736393214e-05, + "loss": 0.0002, + "step": 12174 + }, + { + "epoch": 3.211845402981137, + "grad_norm": 0.020696455612778664, + "learning_rate": 1.5088953369090538e-05, + "loss": 0.0002, + "step": 12176 + }, + { + "epoch": 3.2123730378578026, + "grad_norm": 0.20340518653392792, + "learning_rate": 1.5087781001787862e-05, + "loss": 0.0077, + "step": 12178 + }, + { + "epoch": 3.2129006727344676, + "grad_norm": 0.0032597393728792667, + "learning_rate": 1.5086608634485185e-05, + "loss": 0.0001, + "step": 12180 + }, + { + "epoch": 3.213428307611133, + "grad_norm": 0.0016624248819425702, + "learning_rate": 1.508543626718251e-05, + "loss": 0.0002, + "step": 12182 + }, + { + "epoch": 3.2139559424877984, + "grad_norm": 0.003420014400035143, + "learning_rate": 1.5084263899879834e-05, + "loss": 0.0001, + "step": 12184 + }, + { + "epoch": 3.2144835773644638, + "grad_norm": 0.002668896922841668, + "learning_rate": 1.5083091532577158e-05, + "loss": 0.0001, + "step": 12186 + }, + { + "epoch": 3.215011212241129, + "grad_norm": 0.002591104246675968, + "learning_rate": 1.508191916527448e-05, + "loss": 0.0001, + "step": 12188 + }, + { + "epoch": 3.2155388471177946, + "grad_norm": 0.0019997265189886093, + "learning_rate": 1.5080746797971807e-05, + "loss": 0.0001, + "step": 12190 + }, + { + "epoch": 3.21606648199446, + "grad_norm": 0.05192222073674202, + "learning_rate": 1.507957443066913e-05, + "loss": 0.0039, + "step": 12192 + }, + { + "epoch": 3.216594116871125, + "grad_norm": 0.013520834036171436, + "learning_rate": 1.5078402063366453e-05, + "loss": 0.0003, + "step": 12194 + }, + { + "epoch": 3.2171217517477904, + "grad_norm": 0.38278353214263916, + "learning_rate": 1.5077229696063778e-05, + "loss": 0.0096, + "step": 12196 + }, + { + "epoch": 3.2176493866244558, + "grad_norm": 0.4144228398799896, + "learning_rate": 1.5076057328761102e-05, + "loss": 0.0016, + "step": 12198 + }, + { + "epoch": 3.218177021501121, + "grad_norm": 0.013922172598540783, + "learning_rate": 1.5074884961458426e-05, + "loss": 0.0001, + "step": 12200 + }, + { + "epoch": 3.2187046563777866, + "grad_norm": 0.5144954919815063, + "learning_rate": 1.5073712594155749e-05, + "loss": 0.0055, + "step": 12202 + }, + { + "epoch": 3.219232291254452, + "grad_norm": 0.07431253045797348, + "learning_rate": 1.5072540226853073e-05, + "loss": 0.0008, + "step": 12204 + }, + { + "epoch": 3.2197599261311174, + "grad_norm": 0.0031412863172590733, + "learning_rate": 1.50713678595504e-05, + "loss": 0.0004, + "step": 12206 + }, + { + "epoch": 3.220287561007783, + "grad_norm": 0.04459472373127937, + "learning_rate": 1.5070195492247724e-05, + "loss": 0.0003, + "step": 12208 + }, + { + "epoch": 3.2208151958844478, + "grad_norm": 0.4638805091381073, + "learning_rate": 1.5069023124945046e-05, + "loss": 0.0018, + "step": 12210 + }, + { + "epoch": 3.221342830761113, + "grad_norm": 0.06791482120752335, + "learning_rate": 1.506785075764237e-05, + "loss": 0.0003, + "step": 12212 + }, + { + "epoch": 3.2218704656377786, + "grad_norm": 0.044803038239479065, + "learning_rate": 1.5066678390339695e-05, + "loss": 0.0036, + "step": 12214 + }, + { + "epoch": 3.222398100514444, + "grad_norm": 0.03378104791045189, + "learning_rate": 1.5065506023037019e-05, + "loss": 0.0005, + "step": 12216 + }, + { + "epoch": 3.2229257353911094, + "grad_norm": 0.001428286894224584, + "learning_rate": 1.5064333655734342e-05, + "loss": 0.0001, + "step": 12218 + }, + { + "epoch": 3.223453370267775, + "grad_norm": 0.049654796719551086, + "learning_rate": 1.5063161288431668e-05, + "loss": 0.0003, + "step": 12220 + }, + { + "epoch": 3.22398100514444, + "grad_norm": 0.012941641733050346, + "learning_rate": 1.5061988921128992e-05, + "loss": 0.0032, + "step": 12222 + }, + { + "epoch": 3.2245086400211056, + "grad_norm": 0.047891292721033096, + "learning_rate": 1.5060816553826315e-05, + "loss": 0.0008, + "step": 12224 + }, + { + "epoch": 3.2250362748977706, + "grad_norm": 0.13839474320411682, + "learning_rate": 1.5059644186523639e-05, + "loss": 0.0004, + "step": 12226 + }, + { + "epoch": 3.225563909774436, + "grad_norm": 0.46760061383247375, + "learning_rate": 1.5058471819220963e-05, + "loss": 0.0018, + "step": 12228 + }, + { + "epoch": 3.2260915446511014, + "grad_norm": 0.0011263181222602725, + "learning_rate": 1.5057299451918288e-05, + "loss": 0.0001, + "step": 12230 + }, + { + "epoch": 3.226619179527767, + "grad_norm": 0.0012674794998019934, + "learning_rate": 1.505612708461561e-05, + "loss": 0.0012, + "step": 12232 + }, + { + "epoch": 3.227146814404432, + "grad_norm": 0.2884463667869568, + "learning_rate": 1.5054954717312934e-05, + "loss": 0.0011, + "step": 12234 + }, + { + "epoch": 3.2276744492810976, + "grad_norm": 0.02706562541425228, + "learning_rate": 1.505378235001026e-05, + "loss": 0.0019, + "step": 12236 + }, + { + "epoch": 3.228202084157763, + "grad_norm": 0.1451983004808426, + "learning_rate": 1.5052609982707585e-05, + "loss": 0.0027, + "step": 12238 + }, + { + "epoch": 3.228729719034428, + "grad_norm": 0.02086603082716465, + "learning_rate": 1.5051437615404907e-05, + "loss": 0.0001, + "step": 12240 + }, + { + "epoch": 3.2292573539110934, + "grad_norm": 0.005232056602835655, + "learning_rate": 1.5050265248102232e-05, + "loss": 0.0001, + "step": 12242 + }, + { + "epoch": 3.2297849887877588, + "grad_norm": 0.0006145601510070264, + "learning_rate": 1.5049092880799556e-05, + "loss": 0.0001, + "step": 12244 + }, + { + "epoch": 3.230312623664424, + "grad_norm": 0.20089608430862427, + "learning_rate": 1.504792051349688e-05, + "loss": 0.0004, + "step": 12246 + }, + { + "epoch": 3.2308402585410896, + "grad_norm": 0.1176416203379631, + "learning_rate": 1.5046748146194203e-05, + "loss": 0.0018, + "step": 12248 + }, + { + "epoch": 3.231367893417755, + "grad_norm": 0.025387024506926537, + "learning_rate": 1.5045575778891527e-05, + "loss": 0.0004, + "step": 12250 + }, + { + "epoch": 3.2318955282944204, + "grad_norm": 0.002067725406959653, + "learning_rate": 1.5044403411588853e-05, + "loss": 0.0001, + "step": 12252 + }, + { + "epoch": 3.232423163171086, + "grad_norm": 0.15761461853981018, + "learning_rate": 1.5043231044286176e-05, + "loss": 0.0011, + "step": 12254 + }, + { + "epoch": 3.2329507980477508, + "grad_norm": 0.0036229921970516443, + "learning_rate": 1.50420586769835e-05, + "loss": 0.0001, + "step": 12256 + }, + { + "epoch": 3.233478432924416, + "grad_norm": 0.034720439463853836, + "learning_rate": 1.5040886309680824e-05, + "loss": 0.0005, + "step": 12258 + }, + { + "epoch": 3.2340060678010816, + "grad_norm": 0.0008475804352201521, + "learning_rate": 1.5039713942378149e-05, + "loss": 0.0002, + "step": 12260 + }, + { + "epoch": 3.234533702677747, + "grad_norm": 0.1771378219127655, + "learning_rate": 1.5038541575075471e-05, + "loss": 0.0013, + "step": 12262 + }, + { + "epoch": 3.2350613375544124, + "grad_norm": 0.004701185971498489, + "learning_rate": 1.5037369207772796e-05, + "loss": 0.0001, + "step": 12264 + }, + { + "epoch": 3.235588972431078, + "grad_norm": 0.0011438896181061864, + "learning_rate": 1.5036196840470122e-05, + "loss": 0.0001, + "step": 12266 + }, + { + "epoch": 3.236116607307743, + "grad_norm": 0.001746654394082725, + "learning_rate": 1.5035024473167446e-05, + "loss": 0.0001, + "step": 12268 + }, + { + "epoch": 3.2366442421844086, + "grad_norm": 0.0016371987294405699, + "learning_rate": 1.5033852105864769e-05, + "loss": 0.0002, + "step": 12270 + }, + { + "epoch": 3.2371718770610736, + "grad_norm": 0.0006116978474892676, + "learning_rate": 1.5032679738562093e-05, + "loss": 0.0017, + "step": 12272 + }, + { + "epoch": 3.237699511937739, + "grad_norm": 0.11051228642463684, + "learning_rate": 1.5031507371259417e-05, + "loss": 0.0005, + "step": 12274 + }, + { + "epoch": 3.2382271468144044, + "grad_norm": 0.004848892334848642, + "learning_rate": 1.5030335003956741e-05, + "loss": 0.0001, + "step": 12276 + }, + { + "epoch": 3.23875478169107, + "grad_norm": 0.010012451559305191, + "learning_rate": 1.5029162636654064e-05, + "loss": 0.0002, + "step": 12278 + }, + { + "epoch": 3.239282416567735, + "grad_norm": 0.174339160323143, + "learning_rate": 1.5027990269351388e-05, + "loss": 0.0008, + "step": 12280 + }, + { + "epoch": 3.2398100514444006, + "grad_norm": 0.0006109951646067202, + "learning_rate": 1.5026817902048714e-05, + "loss": 0.0001, + "step": 12282 + }, + { + "epoch": 3.240337686321066, + "grad_norm": 0.04474085196852684, + "learning_rate": 1.5025645534746037e-05, + "loss": 0.0002, + "step": 12284 + }, + { + "epoch": 3.240865321197731, + "grad_norm": 0.17108899354934692, + "learning_rate": 1.5024473167443361e-05, + "loss": 0.0012, + "step": 12286 + }, + { + "epoch": 3.2413929560743964, + "grad_norm": 0.0032411266583949327, + "learning_rate": 1.5023300800140686e-05, + "loss": 0.0005, + "step": 12288 + }, + { + "epoch": 3.2419205909510618, + "grad_norm": 0.0016545936232432723, + "learning_rate": 1.502212843283801e-05, + "loss": 0.0001, + "step": 12290 + }, + { + "epoch": 3.242448225827727, + "grad_norm": 0.005003671161830425, + "learning_rate": 1.5020956065535332e-05, + "loss": 0.002, + "step": 12292 + }, + { + "epoch": 3.2429758607043926, + "grad_norm": 0.05869731307029724, + "learning_rate": 1.5019783698232657e-05, + "loss": 0.0021, + "step": 12294 + }, + { + "epoch": 3.243503495581058, + "grad_norm": 0.0041139014065265656, + "learning_rate": 1.5018611330929981e-05, + "loss": 0.0001, + "step": 12296 + }, + { + "epoch": 3.2440311304577234, + "grad_norm": 0.41744595766067505, + "learning_rate": 1.5017438963627307e-05, + "loss": 0.0054, + "step": 12298 + }, + { + "epoch": 3.244558765334389, + "grad_norm": 0.10777255892753601, + "learning_rate": 1.501626659632463e-05, + "loss": 0.0013, + "step": 12300 + }, + { + "epoch": 3.2450864002110538, + "grad_norm": 0.10238875448703766, + "learning_rate": 1.5015094229021954e-05, + "loss": 0.0011, + "step": 12302 + }, + { + "epoch": 3.245614035087719, + "grad_norm": 0.0026119293179363012, + "learning_rate": 1.5013921861719278e-05, + "loss": 0.0008, + "step": 12304 + }, + { + "epoch": 3.2461416699643846, + "grad_norm": 0.508979082107544, + "learning_rate": 1.5012749494416603e-05, + "loss": 0.0042, + "step": 12306 + }, + { + "epoch": 3.24666930484105, + "grad_norm": 0.002053396310657263, + "learning_rate": 1.5011577127113925e-05, + "loss": 0.0001, + "step": 12308 + }, + { + "epoch": 3.2471969397177154, + "grad_norm": 0.002549069467931986, + "learning_rate": 1.501040475981125e-05, + "loss": 0.0001, + "step": 12310 + }, + { + "epoch": 3.247724574594381, + "grad_norm": 0.1785450577735901, + "learning_rate": 1.5009232392508576e-05, + "loss": 0.0038, + "step": 12312 + }, + { + "epoch": 3.248252209471046, + "grad_norm": 0.01727011241018772, + "learning_rate": 1.5008060025205896e-05, + "loss": 0.0002, + "step": 12314 + }, + { + "epoch": 3.2487798443477116, + "grad_norm": 0.0032584399450570345, + "learning_rate": 1.5006887657903222e-05, + "loss": 0.0001, + "step": 12316 + }, + { + "epoch": 3.2493074792243766, + "grad_norm": 0.018787860870361328, + "learning_rate": 1.5005715290600547e-05, + "loss": 0.0124, + "step": 12318 + }, + { + "epoch": 3.249835114101042, + "grad_norm": 0.038121547549963, + "learning_rate": 1.5004542923297871e-05, + "loss": 0.0032, + "step": 12320 + }, + { + "epoch": 3.2503627489777074, + "grad_norm": 0.14056818187236786, + "learning_rate": 1.5003370555995194e-05, + "loss": 0.0127, + "step": 12322 + }, + { + "epoch": 3.250890383854373, + "grad_norm": 0.019827954471111298, + "learning_rate": 1.5002198188692518e-05, + "loss": 0.0002, + "step": 12324 + }, + { + "epoch": 3.251418018731038, + "grad_norm": 0.05516277998685837, + "learning_rate": 1.5001025821389842e-05, + "loss": 0.0004, + "step": 12326 + }, + { + "epoch": 3.2519456536077036, + "grad_norm": 0.07574236392974854, + "learning_rate": 1.4999853454087168e-05, + "loss": 0.0007, + "step": 12328 + }, + { + "epoch": 3.252473288484369, + "grad_norm": 0.013323014602065086, + "learning_rate": 1.4998681086784491e-05, + "loss": 0.0002, + "step": 12330 + }, + { + "epoch": 3.253000923361034, + "grad_norm": 0.09978456050157547, + "learning_rate": 1.4997508719481815e-05, + "loss": 0.0003, + "step": 12332 + }, + { + "epoch": 3.2535285582376994, + "grad_norm": 0.02635672688484192, + "learning_rate": 1.499633635217914e-05, + "loss": 0.0003, + "step": 12334 + }, + { + "epoch": 3.254056193114365, + "grad_norm": 0.058461714535951614, + "learning_rate": 1.4995163984876464e-05, + "loss": 0.0045, + "step": 12336 + }, + { + "epoch": 3.25458382799103, + "grad_norm": 0.06053803116083145, + "learning_rate": 1.4993991617573786e-05, + "loss": 0.0007, + "step": 12338 + }, + { + "epoch": 3.2551114628676956, + "grad_norm": 0.029077967628836632, + "learning_rate": 1.499281925027111e-05, + "loss": 0.0002, + "step": 12340 + }, + { + "epoch": 3.255639097744361, + "grad_norm": 0.00357701163738966, + "learning_rate": 1.4991646882968437e-05, + "loss": 0.0002, + "step": 12342 + }, + { + "epoch": 3.2561667326210264, + "grad_norm": 0.009385957382619381, + "learning_rate": 1.4990474515665758e-05, + "loss": 0.0001, + "step": 12344 + }, + { + "epoch": 3.2566943674976914, + "grad_norm": 0.0021948108915239573, + "learning_rate": 1.4989302148363084e-05, + "loss": 0.0001, + "step": 12346 + }, + { + "epoch": 3.2572220023743568, + "grad_norm": 0.14248931407928467, + "learning_rate": 1.4988129781060408e-05, + "loss": 0.0008, + "step": 12348 + }, + { + "epoch": 3.257749637251022, + "grad_norm": 0.003939746413379908, + "learning_rate": 1.4986957413757732e-05, + "loss": 0.0002, + "step": 12350 + }, + { + "epoch": 3.2582772721276876, + "grad_norm": 0.01158890500664711, + "learning_rate": 1.4985785046455055e-05, + "loss": 0.0001, + "step": 12352 + }, + { + "epoch": 3.258804907004353, + "grad_norm": 0.00510030472651124, + "learning_rate": 1.4984612679152379e-05, + "loss": 0.0002, + "step": 12354 + }, + { + "epoch": 3.2593325418810184, + "grad_norm": 0.01859450154006481, + "learning_rate": 1.4983440311849703e-05, + "loss": 0.0003, + "step": 12356 + }, + { + "epoch": 3.259860176757684, + "grad_norm": 0.0227781031280756, + "learning_rate": 1.498226794454703e-05, + "loss": 0.0013, + "step": 12358 + }, + { + "epoch": 3.260387811634349, + "grad_norm": 0.008518476970493793, + "learning_rate": 1.498109557724435e-05, + "loss": 0.0022, + "step": 12360 + }, + { + "epoch": 3.2609154465110146, + "grad_norm": 0.0017300822073593736, + "learning_rate": 1.4979923209941676e-05, + "loss": 0.0001, + "step": 12362 + }, + { + "epoch": 3.2614430813876796, + "grad_norm": 0.029293764382600784, + "learning_rate": 1.4978750842639e-05, + "loss": 0.0002, + "step": 12364 + }, + { + "epoch": 3.261970716264345, + "grad_norm": 0.001236802781932056, + "learning_rate": 1.4977578475336325e-05, + "loss": 0.0021, + "step": 12366 + }, + { + "epoch": 3.2624983511410104, + "grad_norm": 0.0011440407251939178, + "learning_rate": 1.4976406108033648e-05, + "loss": 0.005, + "step": 12368 + }, + { + "epoch": 3.263025986017676, + "grad_norm": 0.035897668451070786, + "learning_rate": 1.4975233740730972e-05, + "loss": 0.0001, + "step": 12370 + }, + { + "epoch": 3.263553620894341, + "grad_norm": 0.002174224006012082, + "learning_rate": 1.4974061373428296e-05, + "loss": 0.0024, + "step": 12372 + }, + { + "epoch": 3.2640812557710066, + "grad_norm": 0.027618562802672386, + "learning_rate": 1.4972889006125619e-05, + "loss": 0.0002, + "step": 12374 + }, + { + "epoch": 3.264608890647672, + "grad_norm": 0.3274710774421692, + "learning_rate": 1.4971716638822945e-05, + "loss": 0.0036, + "step": 12376 + }, + { + "epoch": 3.265136525524337, + "grad_norm": 0.2488120049238205, + "learning_rate": 1.4970544271520269e-05, + "loss": 0.0008, + "step": 12378 + }, + { + "epoch": 3.2656641604010024, + "grad_norm": 0.0051090712659060955, + "learning_rate": 1.4969371904217593e-05, + "loss": 0.0002, + "step": 12380 + }, + { + "epoch": 3.266191795277668, + "grad_norm": 0.002219813410192728, + "learning_rate": 1.4968199536914916e-05, + "loss": 0.0051, + "step": 12382 + }, + { + "epoch": 3.266719430154333, + "grad_norm": 0.027404747903347015, + "learning_rate": 1.496702716961224e-05, + "loss": 0.0004, + "step": 12384 + }, + { + "epoch": 3.2672470650309986, + "grad_norm": 0.005713648162782192, + "learning_rate": 1.4965854802309565e-05, + "loss": 0.0001, + "step": 12386 + }, + { + "epoch": 3.267774699907664, + "grad_norm": 0.00118094717618078, + "learning_rate": 1.496468243500689e-05, + "loss": 0.0017, + "step": 12388 + }, + { + "epoch": 3.2683023347843294, + "grad_norm": 0.013989376835525036, + "learning_rate": 1.4963510067704212e-05, + "loss": 0.001, + "step": 12390 + }, + { + "epoch": 3.2688299696609944, + "grad_norm": 0.007033671252429485, + "learning_rate": 1.4962337700401538e-05, + "loss": 0.0002, + "step": 12392 + }, + { + "epoch": 3.26935760453766, + "grad_norm": 0.003224356332793832, + "learning_rate": 1.4961165333098862e-05, + "loss": 0.0001, + "step": 12394 + }, + { + "epoch": 3.269885239414325, + "grad_norm": 0.01259429007768631, + "learning_rate": 1.4959992965796186e-05, + "loss": 0.0002, + "step": 12396 + }, + { + "epoch": 3.2704128742909906, + "grad_norm": 0.14940667152404785, + "learning_rate": 1.4958820598493509e-05, + "loss": 0.0141, + "step": 12398 + }, + { + "epoch": 3.270940509167656, + "grad_norm": 0.0026125777512788773, + "learning_rate": 1.4957648231190833e-05, + "loss": 0.0002, + "step": 12400 + }, + { + "epoch": 3.2714681440443214, + "grad_norm": 0.057590942829847336, + "learning_rate": 1.4956475863888157e-05, + "loss": 0.0006, + "step": 12402 + }, + { + "epoch": 3.271995778920987, + "grad_norm": 0.002450880827382207, + "learning_rate": 1.495530349658548e-05, + "loss": 0.0008, + "step": 12404 + }, + { + "epoch": 3.272523413797652, + "grad_norm": 0.07523937523365021, + "learning_rate": 1.4954131129282804e-05, + "loss": 0.0007, + "step": 12406 + }, + { + "epoch": 3.2730510486743176, + "grad_norm": 0.0047121415846049786, + "learning_rate": 1.495295876198013e-05, + "loss": 0.0001, + "step": 12408 + }, + { + "epoch": 3.2735786835509826, + "grad_norm": 0.0020530130714178085, + "learning_rate": 1.4951786394677455e-05, + "loss": 0.0001, + "step": 12410 + }, + { + "epoch": 3.274106318427648, + "grad_norm": 0.09238512068986893, + "learning_rate": 1.4950614027374777e-05, + "loss": 0.0004, + "step": 12412 + }, + { + "epoch": 3.2746339533043134, + "grad_norm": 0.001209335750900209, + "learning_rate": 1.4949441660072101e-05, + "loss": 0.0001, + "step": 12414 + }, + { + "epoch": 3.275161588180979, + "grad_norm": 0.012459367513656616, + "learning_rate": 1.4948269292769426e-05, + "loss": 0.0001, + "step": 12416 + }, + { + "epoch": 3.275689223057644, + "grad_norm": 0.003964018542319536, + "learning_rate": 1.494709692546675e-05, + "loss": 0.0001, + "step": 12418 + }, + { + "epoch": 3.2762168579343096, + "grad_norm": 0.03751324489712715, + "learning_rate": 1.4945924558164073e-05, + "loss": 0.0008, + "step": 12420 + }, + { + "epoch": 3.276744492810975, + "grad_norm": 0.0020675123669207096, + "learning_rate": 1.4944752190861399e-05, + "loss": 0.0001, + "step": 12422 + }, + { + "epoch": 3.27727212768764, + "grad_norm": 0.009002125822007656, + "learning_rate": 1.4943579823558723e-05, + "loss": 0.0001, + "step": 12424 + }, + { + "epoch": 3.2777997625643054, + "grad_norm": 0.0017681281315162778, + "learning_rate": 1.4942407456256047e-05, + "loss": 0.0001, + "step": 12426 + }, + { + "epoch": 3.278327397440971, + "grad_norm": 0.08797478675842285, + "learning_rate": 1.494123508895337e-05, + "loss": 0.0016, + "step": 12428 + }, + { + "epoch": 3.278855032317636, + "grad_norm": 0.18811021745204926, + "learning_rate": 1.4940062721650694e-05, + "loss": 0.0037, + "step": 12430 + }, + { + "epoch": 3.2793826671943016, + "grad_norm": 0.16305945813655853, + "learning_rate": 1.4938890354348018e-05, + "loss": 0.0046, + "step": 12432 + }, + { + "epoch": 3.279910302070967, + "grad_norm": 0.0025748517364263535, + "learning_rate": 1.4937717987045341e-05, + "loss": 0.0001, + "step": 12434 + }, + { + "epoch": 3.2804379369476324, + "grad_norm": 0.00346167734824121, + "learning_rate": 1.4936545619742665e-05, + "loss": 0.0001, + "step": 12436 + }, + { + "epoch": 3.2809655718242974, + "grad_norm": 0.0029026209376752377, + "learning_rate": 1.4935373252439991e-05, + "loss": 0.0001, + "step": 12438 + }, + { + "epoch": 3.281493206700963, + "grad_norm": 0.008669101633131504, + "learning_rate": 1.4934200885137316e-05, + "loss": 0.0003, + "step": 12440 + }, + { + "epoch": 3.282020841577628, + "grad_norm": 0.004440275486558676, + "learning_rate": 1.4933028517834638e-05, + "loss": 0.0001, + "step": 12442 + }, + { + "epoch": 3.2825484764542936, + "grad_norm": 0.002320939442142844, + "learning_rate": 1.4931856150531963e-05, + "loss": 0.0023, + "step": 12444 + }, + { + "epoch": 3.283076111330959, + "grad_norm": 0.002513415180146694, + "learning_rate": 1.4930683783229287e-05, + "loss": 0.0002, + "step": 12446 + }, + { + "epoch": 3.2836037462076244, + "grad_norm": 0.13825605809688568, + "learning_rate": 1.4929511415926611e-05, + "loss": 0.0003, + "step": 12448 + }, + { + "epoch": 3.28413138108429, + "grad_norm": 0.014046830125153065, + "learning_rate": 1.4928339048623934e-05, + "loss": 0.0001, + "step": 12450 + }, + { + "epoch": 3.2846590159609548, + "grad_norm": 0.004170242231339216, + "learning_rate": 1.492716668132126e-05, + "loss": 0.0001, + "step": 12452 + }, + { + "epoch": 3.28518665083762, + "grad_norm": 0.0009637702605687082, + "learning_rate": 1.4925994314018584e-05, + "loss": 0.0001, + "step": 12454 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 0.13543041050434113, + "learning_rate": 1.4924821946715908e-05, + "loss": 0.0029, + "step": 12456 + }, + { + "epoch": 3.286241920590951, + "grad_norm": 0.0006229165010154247, + "learning_rate": 1.4923649579413231e-05, + "loss": 0.0001, + "step": 12458 + }, + { + "epoch": 3.2867695554676164, + "grad_norm": 0.28710153698921204, + "learning_rate": 1.4922477212110555e-05, + "loss": 0.0047, + "step": 12460 + }, + { + "epoch": 3.287297190344282, + "grad_norm": 0.0009922959143295884, + "learning_rate": 1.492130484480788e-05, + "loss": 0.0001, + "step": 12462 + }, + { + "epoch": 3.287824825220947, + "grad_norm": 0.40714046359062195, + "learning_rate": 1.4920132477505202e-05, + "loss": 0.0021, + "step": 12464 + }, + { + "epoch": 3.2883524600976126, + "grad_norm": 0.0010696036042645574, + "learning_rate": 1.4918960110202527e-05, + "loss": 0.0001, + "step": 12466 + }, + { + "epoch": 3.288880094974278, + "grad_norm": 0.021075481548905373, + "learning_rate": 1.4917787742899853e-05, + "loss": 0.0002, + "step": 12468 + }, + { + "epoch": 3.289407729850943, + "grad_norm": 0.0019422649638727307, + "learning_rate": 1.4916615375597177e-05, + "loss": 0.0001, + "step": 12470 + }, + { + "epoch": 3.2899353647276084, + "grad_norm": 0.10500095784664154, + "learning_rate": 1.49154430082945e-05, + "loss": 0.0008, + "step": 12472 + }, + { + "epoch": 3.290462999604274, + "grad_norm": 0.0039474377408623695, + "learning_rate": 1.4914270640991824e-05, + "loss": 0.004, + "step": 12474 + }, + { + "epoch": 3.290990634480939, + "grad_norm": 0.04439942166209221, + "learning_rate": 1.4913098273689148e-05, + "loss": 0.0002, + "step": 12476 + }, + { + "epoch": 3.2915182693576046, + "grad_norm": 0.4068252444267273, + "learning_rate": 1.4911925906386472e-05, + "loss": 0.0034, + "step": 12478 + }, + { + "epoch": 3.29204590423427, + "grad_norm": 0.00803406722843647, + "learning_rate": 1.4910753539083795e-05, + "loss": 0.0001, + "step": 12480 + }, + { + "epoch": 3.2925735391109354, + "grad_norm": 1.1087733507156372, + "learning_rate": 1.490958117178112e-05, + "loss": 0.0094, + "step": 12482 + }, + { + "epoch": 3.2931011739876004, + "grad_norm": 0.00770940724760294, + "learning_rate": 1.4908408804478445e-05, + "loss": 0.0001, + "step": 12484 + }, + { + "epoch": 3.293628808864266, + "grad_norm": 0.013796641491353512, + "learning_rate": 1.490723643717577e-05, + "loss": 0.0002, + "step": 12486 + }, + { + "epoch": 3.294156443740931, + "grad_norm": 0.00208406918682158, + "learning_rate": 1.4906064069873092e-05, + "loss": 0.0001, + "step": 12488 + }, + { + "epoch": 3.2946840786175966, + "grad_norm": 0.011195470578968525, + "learning_rate": 1.4904891702570417e-05, + "loss": 0.0002, + "step": 12490 + }, + { + "epoch": 3.295211713494262, + "grad_norm": 0.0039724926464259624, + "learning_rate": 1.490371933526774e-05, + "loss": 0.0001, + "step": 12492 + }, + { + "epoch": 3.2957393483709274, + "grad_norm": 0.00260384869761765, + "learning_rate": 1.4902546967965063e-05, + "loss": 0.0001, + "step": 12494 + }, + { + "epoch": 3.296266983247593, + "grad_norm": 0.0020923137199133635, + "learning_rate": 1.4901374600662388e-05, + "loss": 0.0001, + "step": 12496 + }, + { + "epoch": 3.296794618124258, + "grad_norm": 0.003998976666480303, + "learning_rate": 1.4900202233359714e-05, + "loss": 0.0001, + "step": 12498 + }, + { + "epoch": 3.297322253000923, + "grad_norm": 0.13673880696296692, + "learning_rate": 1.4899029866057038e-05, + "loss": 0.0004, + "step": 12500 + }, + { + "epoch": 3.2978498878775886, + "grad_norm": 0.0035080404486507177, + "learning_rate": 1.489785749875436e-05, + "loss": 0.0001, + "step": 12502 + }, + { + "epoch": 3.298377522754254, + "grad_norm": 0.025078099220991135, + "learning_rate": 1.4896685131451685e-05, + "loss": 0.0002, + "step": 12504 + }, + { + "epoch": 3.2989051576309194, + "grad_norm": 0.0019240042893216014, + "learning_rate": 1.489551276414901e-05, + "loss": 0.0002, + "step": 12506 + }, + { + "epoch": 3.299432792507585, + "grad_norm": 0.006981111131608486, + "learning_rate": 1.4894340396846334e-05, + "loss": 0.0001, + "step": 12508 + }, + { + "epoch": 3.29996042738425, + "grad_norm": 0.005336382891982794, + "learning_rate": 1.4893168029543656e-05, + "loss": 0.0001, + "step": 12510 + }, + { + "epoch": 3.3004880622609156, + "grad_norm": 0.0020687696523964405, + "learning_rate": 1.489199566224098e-05, + "loss": 0.0001, + "step": 12512 + }, + { + "epoch": 3.301015697137581, + "grad_norm": 0.005006611347198486, + "learning_rate": 1.4890823294938306e-05, + "loss": 0.0041, + "step": 12514 + }, + { + "epoch": 3.301543332014246, + "grad_norm": 0.3650480806827545, + "learning_rate": 1.488965092763563e-05, + "loss": 0.0015, + "step": 12516 + }, + { + "epoch": 3.3020709668909114, + "grad_norm": 0.005965156946331263, + "learning_rate": 1.4888478560332953e-05, + "loss": 0.0001, + "step": 12518 + }, + { + "epoch": 3.302598601767577, + "grad_norm": 0.005529665853828192, + "learning_rate": 1.4887306193030278e-05, + "loss": 0.0102, + "step": 12520 + }, + { + "epoch": 3.303126236644242, + "grad_norm": 0.009291482158005238, + "learning_rate": 1.4886133825727602e-05, + "loss": 0.0001, + "step": 12522 + }, + { + "epoch": 3.3036538715209076, + "grad_norm": 0.0029265526682138443, + "learning_rate": 1.4884961458424925e-05, + "loss": 0.0001, + "step": 12524 + }, + { + "epoch": 3.304181506397573, + "grad_norm": 0.024419665336608887, + "learning_rate": 1.4883789091122249e-05, + "loss": 0.0027, + "step": 12526 + }, + { + "epoch": 3.3047091412742384, + "grad_norm": 0.005416937172412872, + "learning_rate": 1.4882616723819573e-05, + "loss": 0.0001, + "step": 12528 + }, + { + "epoch": 3.3052367761509034, + "grad_norm": 0.16119758784770966, + "learning_rate": 1.48814443565169e-05, + "loss": 0.0039, + "step": 12530 + }, + { + "epoch": 3.305764411027569, + "grad_norm": 0.005041640717536211, + "learning_rate": 1.4880271989214222e-05, + "loss": 0.0002, + "step": 12532 + }, + { + "epoch": 3.306292045904234, + "grad_norm": 0.013903765007853508, + "learning_rate": 1.4879099621911546e-05, + "loss": 0.0003, + "step": 12534 + }, + { + "epoch": 3.3068196807808996, + "grad_norm": 0.01569296233355999, + "learning_rate": 1.487792725460887e-05, + "loss": 0.0002, + "step": 12536 + }, + { + "epoch": 3.307347315657565, + "grad_norm": 0.00729235727339983, + "learning_rate": 1.4876754887306195e-05, + "loss": 0.0019, + "step": 12538 + }, + { + "epoch": 3.3078749505342304, + "grad_norm": 0.005361164454370737, + "learning_rate": 1.4875582520003517e-05, + "loss": 0.0001, + "step": 12540 + }, + { + "epoch": 3.308402585410896, + "grad_norm": 0.20352213084697723, + "learning_rate": 1.4874410152700842e-05, + "loss": 0.0004, + "step": 12542 + }, + { + "epoch": 3.308930220287561, + "grad_norm": 0.08736811578273773, + "learning_rate": 1.4873237785398168e-05, + "loss": 0.0004, + "step": 12544 + }, + { + "epoch": 3.309457855164226, + "grad_norm": 0.0036761609371751547, + "learning_rate": 1.4872065418095492e-05, + "loss": 0.0002, + "step": 12546 + }, + { + "epoch": 3.3099854900408916, + "grad_norm": 0.1401059478521347, + "learning_rate": 1.4870893050792815e-05, + "loss": 0.0023, + "step": 12548 + }, + { + "epoch": 3.310513124917557, + "grad_norm": 0.0033016044180840254, + "learning_rate": 1.4869720683490139e-05, + "loss": 0.0108, + "step": 12550 + }, + { + "epoch": 3.3110407597942224, + "grad_norm": 0.1808210015296936, + "learning_rate": 1.4868548316187463e-05, + "loss": 0.0022, + "step": 12552 + }, + { + "epoch": 3.311568394670888, + "grad_norm": 0.32862740755081177, + "learning_rate": 1.4867375948884786e-05, + "loss": 0.0055, + "step": 12554 + }, + { + "epoch": 3.312096029547553, + "grad_norm": 0.12876714766025543, + "learning_rate": 1.486620358158211e-05, + "loss": 0.0005, + "step": 12556 + }, + { + "epoch": 3.3126236644242186, + "grad_norm": 0.08027168363332748, + "learning_rate": 1.4865031214279434e-05, + "loss": 0.0024, + "step": 12558 + }, + { + "epoch": 3.313151299300884, + "grad_norm": 0.003544918494299054, + "learning_rate": 1.486385884697676e-05, + "loss": 0.0001, + "step": 12560 + }, + { + "epoch": 3.313678934177549, + "grad_norm": 0.003792038420215249, + "learning_rate": 1.4862686479674083e-05, + "loss": 0.0001, + "step": 12562 + }, + { + "epoch": 3.3142065690542144, + "grad_norm": 0.019001798704266548, + "learning_rate": 1.4861514112371407e-05, + "loss": 0.0001, + "step": 12564 + }, + { + "epoch": 3.31473420393088, + "grad_norm": 0.006841033697128296, + "learning_rate": 1.4860341745068732e-05, + "loss": 0.0001, + "step": 12566 + }, + { + "epoch": 3.315261838807545, + "grad_norm": 0.026511667296290398, + "learning_rate": 1.4859169377766056e-05, + "loss": 0.0005, + "step": 12568 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 0.2217414379119873, + "learning_rate": 1.4857997010463379e-05, + "loss": 0.0015, + "step": 12570 + }, + { + "epoch": 3.316317108560876, + "grad_norm": 0.03422966226935387, + "learning_rate": 1.4856824643160703e-05, + "loss": 0.0002, + "step": 12572 + }, + { + "epoch": 3.3168447434375414, + "grad_norm": 0.004725426435470581, + "learning_rate": 1.4855652275858027e-05, + "loss": 0.0001, + "step": 12574 + }, + { + "epoch": 3.3173723783142064, + "grad_norm": 0.03426062688231468, + "learning_rate": 1.4854479908555353e-05, + "loss": 0.0005, + "step": 12576 + }, + { + "epoch": 3.317900013190872, + "grad_norm": 0.002099931240081787, + "learning_rate": 1.4853307541252676e-05, + "loss": 0.0001, + "step": 12578 + }, + { + "epoch": 3.318427648067537, + "grad_norm": 0.03257502242922783, + "learning_rate": 1.485213517395e-05, + "loss": 0.008, + "step": 12580 + }, + { + "epoch": 3.3189552829442026, + "grad_norm": 0.002998597454279661, + "learning_rate": 1.4850962806647324e-05, + "loss": 0.0001, + "step": 12582 + }, + { + "epoch": 3.319482917820868, + "grad_norm": 0.005152563098818064, + "learning_rate": 1.4849790439344647e-05, + "loss": 0.0001, + "step": 12584 + }, + { + "epoch": 3.3200105526975334, + "grad_norm": 0.0031630960293114185, + "learning_rate": 1.4848618072041971e-05, + "loss": 0.0002, + "step": 12586 + }, + { + "epoch": 3.320538187574199, + "grad_norm": 0.17322860658168793, + "learning_rate": 1.4847445704739296e-05, + "loss": 0.0021, + "step": 12588 + }, + { + "epoch": 3.321065822450864, + "grad_norm": 0.03282801806926727, + "learning_rate": 1.4846273337436622e-05, + "loss": 0.0025, + "step": 12590 + }, + { + "epoch": 3.321593457327529, + "grad_norm": 0.004693582653999329, + "learning_rate": 1.4845100970133942e-05, + "loss": 0.0067, + "step": 12592 + }, + { + "epoch": 3.3221210922041946, + "grad_norm": 0.2551272213459015, + "learning_rate": 1.4843928602831268e-05, + "loss": 0.0019, + "step": 12594 + }, + { + "epoch": 3.32264872708086, + "grad_norm": 0.03309916332364082, + "learning_rate": 1.4842756235528593e-05, + "loss": 0.0003, + "step": 12596 + }, + { + "epoch": 3.3231763619575254, + "grad_norm": 0.21877454221248627, + "learning_rate": 1.4841583868225917e-05, + "loss": 0.0009, + "step": 12598 + }, + { + "epoch": 3.323703996834191, + "grad_norm": 0.3162223994731903, + "learning_rate": 1.484041150092324e-05, + "loss": 0.0014, + "step": 12600 + }, + { + "epoch": 3.3242316317108562, + "grad_norm": 0.016141049563884735, + "learning_rate": 1.4839239133620564e-05, + "loss": 0.0005, + "step": 12602 + }, + { + "epoch": 3.324759266587521, + "grad_norm": 0.007980220951139927, + "learning_rate": 1.4838066766317888e-05, + "loss": 0.0002, + "step": 12604 + }, + { + "epoch": 3.3252869014641866, + "grad_norm": 0.004292087629437447, + "learning_rate": 1.4836894399015214e-05, + "loss": 0.0001, + "step": 12606 + }, + { + "epoch": 3.325814536340852, + "grad_norm": 0.17255344986915588, + "learning_rate": 1.4835722031712537e-05, + "loss": 0.0017, + "step": 12608 + }, + { + "epoch": 3.3263421712175174, + "grad_norm": 0.044512998312711716, + "learning_rate": 1.4834549664409861e-05, + "loss": 0.0003, + "step": 12610 + }, + { + "epoch": 3.326869806094183, + "grad_norm": 0.0023226544726639986, + "learning_rate": 1.4833377297107186e-05, + "loss": 0.0016, + "step": 12612 + }, + { + "epoch": 3.327397440970848, + "grad_norm": 0.050715573132038116, + "learning_rate": 1.4832204929804508e-05, + "loss": 0.0023, + "step": 12614 + }, + { + "epoch": 3.3279250758475136, + "grad_norm": 0.0011589010246098042, + "learning_rate": 1.4831032562501832e-05, + "loss": 0.0001, + "step": 12616 + }, + { + "epoch": 3.328452710724179, + "grad_norm": 0.05072222650051117, + "learning_rate": 1.4829860195199157e-05, + "loss": 0.0005, + "step": 12618 + }, + { + "epoch": 3.3289803456008444, + "grad_norm": 0.08030836284160614, + "learning_rate": 1.4828687827896481e-05, + "loss": 0.0004, + "step": 12620 + }, + { + "epoch": 3.3295079804775094, + "grad_norm": 0.172357976436615, + "learning_rate": 1.4827515460593804e-05, + "loss": 0.0026, + "step": 12622 + }, + { + "epoch": 3.330035615354175, + "grad_norm": 0.352999746799469, + "learning_rate": 1.482634309329113e-05, + "loss": 0.0056, + "step": 12624 + }, + { + "epoch": 3.33056325023084, + "grad_norm": 0.003950738348066807, + "learning_rate": 1.4825170725988454e-05, + "loss": 0.0001, + "step": 12626 + }, + { + "epoch": 3.3310908851075056, + "grad_norm": 0.00645467871800065, + "learning_rate": 1.4823998358685778e-05, + "loss": 0.0001, + "step": 12628 + }, + { + "epoch": 3.331618519984171, + "grad_norm": 0.1287345290184021, + "learning_rate": 1.4822825991383101e-05, + "loss": 0.0008, + "step": 12630 + }, + { + "epoch": 3.3321461548608364, + "grad_norm": 0.0050504254177212715, + "learning_rate": 1.4821653624080425e-05, + "loss": 0.0001, + "step": 12632 + }, + { + "epoch": 3.332673789737502, + "grad_norm": 0.031992655247449875, + "learning_rate": 1.482048125677775e-05, + "loss": 0.0027, + "step": 12634 + }, + { + "epoch": 3.333201424614167, + "grad_norm": 1.345658779144287, + "learning_rate": 1.4819308889475075e-05, + "loss": 0.0063, + "step": 12636 + }, + { + "epoch": 3.333729059490832, + "grad_norm": 0.0028618883807212114, + "learning_rate": 1.4818136522172396e-05, + "loss": 0.0018, + "step": 12638 + }, + { + "epoch": 3.3342566943674976, + "grad_norm": 0.010029694996774197, + "learning_rate": 1.4816964154869722e-05, + "loss": 0.0001, + "step": 12640 + }, + { + "epoch": 3.334784329244163, + "grad_norm": 0.2903737425804138, + "learning_rate": 1.4815791787567047e-05, + "loss": 0.0062, + "step": 12642 + }, + { + "epoch": 3.3353119641208284, + "grad_norm": 0.20396688580513, + "learning_rate": 1.481461942026437e-05, + "loss": 0.0101, + "step": 12644 + }, + { + "epoch": 3.335839598997494, + "grad_norm": 0.004620454739779234, + "learning_rate": 1.4813447052961694e-05, + "loss": 0.0001, + "step": 12646 + }, + { + "epoch": 3.3363672338741592, + "grad_norm": 0.047083497047424316, + "learning_rate": 1.4812274685659018e-05, + "loss": 0.0084, + "step": 12648 + }, + { + "epoch": 3.336894868750824, + "grad_norm": 0.016685141250491142, + "learning_rate": 1.4811102318356342e-05, + "loss": 0.0002, + "step": 12650 + }, + { + "epoch": 3.3374225036274896, + "grad_norm": 0.003992026671767235, + "learning_rate": 1.4809929951053665e-05, + "loss": 0.001, + "step": 12652 + }, + { + "epoch": 3.337950138504155, + "grad_norm": 0.0042436025105416775, + "learning_rate": 1.480875758375099e-05, + "loss": 0.001, + "step": 12654 + }, + { + "epoch": 3.3384777733808204, + "grad_norm": 0.39976218342781067, + "learning_rate": 1.4807585216448315e-05, + "loss": 0.005, + "step": 12656 + }, + { + "epoch": 3.339005408257486, + "grad_norm": 0.028542930260300636, + "learning_rate": 1.480641284914564e-05, + "loss": 0.0002, + "step": 12658 + }, + { + "epoch": 3.3395330431341512, + "grad_norm": 0.02068397030234337, + "learning_rate": 1.4805240481842962e-05, + "loss": 0.0002, + "step": 12660 + }, + { + "epoch": 3.3400606780108166, + "grad_norm": 0.002585349604487419, + "learning_rate": 1.4804068114540286e-05, + "loss": 0.0002, + "step": 12662 + }, + { + "epoch": 3.340588312887482, + "grad_norm": 0.001763110514730215, + "learning_rate": 1.480289574723761e-05, + "loss": 0.0001, + "step": 12664 + }, + { + "epoch": 3.3411159477641474, + "grad_norm": 0.04954471066594124, + "learning_rate": 1.4801723379934937e-05, + "loss": 0.0046, + "step": 12666 + }, + { + "epoch": 3.3416435826408124, + "grad_norm": 0.008199494332075119, + "learning_rate": 1.4800551012632258e-05, + "loss": 0.0002, + "step": 12668 + }, + { + "epoch": 3.342171217517478, + "grad_norm": 0.008017565123736858, + "learning_rate": 1.4799378645329584e-05, + "loss": 0.0002, + "step": 12670 + }, + { + "epoch": 3.342698852394143, + "grad_norm": 0.04349033161997795, + "learning_rate": 1.4798206278026908e-05, + "loss": 0.0002, + "step": 12672 + }, + { + "epoch": 3.3432264872708086, + "grad_norm": 0.02996407262980938, + "learning_rate": 1.479703391072423e-05, + "loss": 0.0032, + "step": 12674 + }, + { + "epoch": 3.343754122147474, + "grad_norm": 0.003738545114174485, + "learning_rate": 1.4795861543421555e-05, + "loss": 0.0001, + "step": 12676 + }, + { + "epoch": 3.3442817570241394, + "grad_norm": 0.0020718472078442574, + "learning_rate": 1.4794689176118879e-05, + "loss": 0.0001, + "step": 12678 + }, + { + "epoch": 3.344809391900805, + "grad_norm": 0.0044201030395925045, + "learning_rate": 1.4793516808816203e-05, + "loss": 0.0002, + "step": 12680 + }, + { + "epoch": 3.34533702677747, + "grad_norm": 0.3261224329471588, + "learning_rate": 1.4792344441513526e-05, + "loss": 0.0116, + "step": 12682 + }, + { + "epoch": 3.345864661654135, + "grad_norm": 0.0033223724458366632, + "learning_rate": 1.479117207421085e-05, + "loss": 0.0001, + "step": 12684 + }, + { + "epoch": 3.3463922965308006, + "grad_norm": 0.052436091005802155, + "learning_rate": 1.4789999706908176e-05, + "loss": 0.0069, + "step": 12686 + }, + { + "epoch": 3.346919931407466, + "grad_norm": 0.0447532944381237, + "learning_rate": 1.47888273396055e-05, + "loss": 0.0002, + "step": 12688 + }, + { + "epoch": 3.3474475662841314, + "grad_norm": 0.020323678851127625, + "learning_rate": 1.4787654972302823e-05, + "loss": 0.0002, + "step": 12690 + }, + { + "epoch": 3.347975201160797, + "grad_norm": 0.018404141068458557, + "learning_rate": 1.4786482605000148e-05, + "loss": 0.0003, + "step": 12692 + }, + { + "epoch": 3.3485028360374622, + "grad_norm": 0.00866026896983385, + "learning_rate": 1.4785310237697472e-05, + "loss": 0.0001, + "step": 12694 + }, + { + "epoch": 3.349030470914127, + "grad_norm": 0.004135155119001865, + "learning_rate": 1.4784137870394796e-05, + "loss": 0.0001, + "step": 12696 + }, + { + "epoch": 3.3495581057907926, + "grad_norm": 0.2388627976179123, + "learning_rate": 1.4782965503092119e-05, + "loss": 0.0069, + "step": 12698 + }, + { + "epoch": 3.350085740667458, + "grad_norm": 0.5722746849060059, + "learning_rate": 1.4781793135789445e-05, + "loss": 0.0006, + "step": 12700 + }, + { + "epoch": 3.3506133755441234, + "grad_norm": 0.047123175114393234, + "learning_rate": 1.4780620768486769e-05, + "loss": 0.0005, + "step": 12702 + }, + { + "epoch": 3.351141010420789, + "grad_norm": 0.006197458598762751, + "learning_rate": 1.4779448401184092e-05, + "loss": 0.002, + "step": 12704 + }, + { + "epoch": 3.3516686452974542, + "grad_norm": 0.03968854248523712, + "learning_rate": 1.4778276033881416e-05, + "loss": 0.003, + "step": 12706 + }, + { + "epoch": 3.3521962801741196, + "grad_norm": 0.008455358445644379, + "learning_rate": 1.477710366657874e-05, + "loss": 0.0002, + "step": 12708 + }, + { + "epoch": 3.352723915050785, + "grad_norm": 0.1344848871231079, + "learning_rate": 1.4775931299276065e-05, + "loss": 0.0006, + "step": 12710 + }, + { + "epoch": 3.3532515499274504, + "grad_norm": 0.005897244904190302, + "learning_rate": 1.4774758931973387e-05, + "loss": 0.0001, + "step": 12712 + }, + { + "epoch": 3.3537791848041154, + "grad_norm": 0.005361002869904041, + "learning_rate": 1.4773586564670711e-05, + "loss": 0.0001, + "step": 12714 + }, + { + "epoch": 3.354306819680781, + "grad_norm": 0.009035392664372921, + "learning_rate": 1.4772414197368037e-05, + "loss": 0.0002, + "step": 12716 + }, + { + "epoch": 3.354834454557446, + "grad_norm": 0.008323458023369312, + "learning_rate": 1.4771241830065362e-05, + "loss": 0.0003, + "step": 12718 + }, + { + "epoch": 3.3553620894341116, + "grad_norm": 0.0027956597041338682, + "learning_rate": 1.4770069462762684e-05, + "loss": 0.0001, + "step": 12720 + }, + { + "epoch": 3.355889724310777, + "grad_norm": 0.3719315528869629, + "learning_rate": 1.4768897095460009e-05, + "loss": 0.0015, + "step": 12722 + }, + { + "epoch": 3.3564173591874424, + "grad_norm": 0.021669995039701462, + "learning_rate": 1.4767724728157333e-05, + "loss": 0.0001, + "step": 12724 + }, + { + "epoch": 3.356944994064108, + "grad_norm": 0.017765892669558525, + "learning_rate": 1.4766552360854657e-05, + "loss": 0.0001, + "step": 12726 + }, + { + "epoch": 3.357472628940773, + "grad_norm": 0.003827114589512348, + "learning_rate": 1.476537999355198e-05, + "loss": 0.0001, + "step": 12728 + }, + { + "epoch": 3.358000263817438, + "grad_norm": 0.06088089942932129, + "learning_rate": 1.4764207626249304e-05, + "loss": 0.0005, + "step": 12730 + }, + { + "epoch": 3.3585278986941036, + "grad_norm": 0.0036704139783978462, + "learning_rate": 1.476303525894663e-05, + "loss": 0.0001, + "step": 12732 + }, + { + "epoch": 3.359055533570769, + "grad_norm": 0.0010699181584641337, + "learning_rate": 1.4761862891643954e-05, + "loss": 0.0001, + "step": 12734 + }, + { + "epoch": 3.3595831684474344, + "grad_norm": 0.005475872196257114, + "learning_rate": 1.4760690524341277e-05, + "loss": 0.0009, + "step": 12736 + }, + { + "epoch": 3.3601108033241, + "grad_norm": 0.040631961077451706, + "learning_rate": 1.4759518157038601e-05, + "loss": 0.0003, + "step": 12738 + }, + { + "epoch": 3.3606384382007652, + "grad_norm": 0.19272790849208832, + "learning_rate": 1.4758345789735926e-05, + "loss": 0.0011, + "step": 12740 + }, + { + "epoch": 3.36116607307743, + "grad_norm": 0.02627319097518921, + "learning_rate": 1.4757173422433248e-05, + "loss": 0.0003, + "step": 12742 + }, + { + "epoch": 3.3616937079540956, + "grad_norm": 0.03006032481789589, + "learning_rate": 1.4756001055130573e-05, + "loss": 0.0048, + "step": 12744 + }, + { + "epoch": 3.362221342830761, + "grad_norm": 0.0020829015411436558, + "learning_rate": 1.4754828687827899e-05, + "loss": 0.0023, + "step": 12746 + }, + { + "epoch": 3.3627489777074264, + "grad_norm": 0.07738564908504486, + "learning_rate": 1.4753656320525223e-05, + "loss": 0.0046, + "step": 12748 + }, + { + "epoch": 3.363276612584092, + "grad_norm": 0.002525971969589591, + "learning_rate": 1.4752483953222546e-05, + "loss": 0.0003, + "step": 12750 + }, + { + "epoch": 3.3638042474607572, + "grad_norm": 0.11692377179861069, + "learning_rate": 1.475131158591987e-05, + "loss": 0.0007, + "step": 12752 + }, + { + "epoch": 3.3643318823374226, + "grad_norm": 0.028156951069831848, + "learning_rate": 1.4750139218617194e-05, + "loss": 0.0002, + "step": 12754 + }, + { + "epoch": 3.3648595172140876, + "grad_norm": 0.5504355430603027, + "learning_rate": 1.4748966851314518e-05, + "loss": 0.0015, + "step": 12756 + }, + { + "epoch": 3.365387152090753, + "grad_norm": 0.3660854995250702, + "learning_rate": 1.4747794484011841e-05, + "loss": 0.0019, + "step": 12758 + }, + { + "epoch": 3.3659147869674184, + "grad_norm": 0.6300755143165588, + "learning_rate": 1.4746622116709165e-05, + "loss": 0.0001, + "step": 12760 + }, + { + "epoch": 3.366442421844084, + "grad_norm": 0.14891372621059418, + "learning_rate": 1.4745449749406491e-05, + "loss": 0.0008, + "step": 12762 + }, + { + "epoch": 3.3669700567207492, + "grad_norm": 0.007339715026319027, + "learning_rate": 1.4744277382103816e-05, + "loss": 0.0001, + "step": 12764 + }, + { + "epoch": 3.3674976915974146, + "grad_norm": 0.04765629023313522, + "learning_rate": 1.4743105014801138e-05, + "loss": 0.002, + "step": 12766 + }, + { + "epoch": 3.36802532647408, + "grad_norm": 0.0041613029316067696, + "learning_rate": 1.4741932647498463e-05, + "loss": 0.0001, + "step": 12768 + }, + { + "epoch": 3.3685529613507454, + "grad_norm": 0.0009416475659236312, + "learning_rate": 1.4740760280195787e-05, + "loss": 0.0001, + "step": 12770 + }, + { + "epoch": 3.369080596227411, + "grad_norm": 0.0021900166757404804, + "learning_rate": 1.473958791289311e-05, + "loss": 0.0039, + "step": 12772 + }, + { + "epoch": 3.369608231104076, + "grad_norm": 0.0010894982842728496, + "learning_rate": 1.4738415545590434e-05, + "loss": 0.0001, + "step": 12774 + }, + { + "epoch": 3.370135865980741, + "grad_norm": 0.0024835504591464996, + "learning_rate": 1.473724317828776e-05, + "loss": 0.0001, + "step": 12776 + }, + { + "epoch": 3.3706635008574066, + "grad_norm": 0.0305930208414793, + "learning_rate": 1.4736070810985084e-05, + "loss": 0.0018, + "step": 12778 + }, + { + "epoch": 3.371191135734072, + "grad_norm": 0.0031344820745289326, + "learning_rate": 1.4734898443682407e-05, + "loss": 0.0001, + "step": 12780 + }, + { + "epoch": 3.3717187706107374, + "grad_norm": 0.3904370665550232, + "learning_rate": 1.4733726076379731e-05, + "loss": 0.0013, + "step": 12782 + }, + { + "epoch": 3.372246405487403, + "grad_norm": 0.0014828727580606937, + "learning_rate": 1.4732553709077055e-05, + "loss": 0.0001, + "step": 12784 + }, + { + "epoch": 3.3727740403640682, + "grad_norm": 0.008116373792290688, + "learning_rate": 1.473138134177438e-05, + "loss": 0.0005, + "step": 12786 + }, + { + "epoch": 3.373301675240733, + "grad_norm": 0.001104878494516015, + "learning_rate": 1.4730208974471702e-05, + "loss": 0.0001, + "step": 12788 + }, + { + "epoch": 3.3738293101173986, + "grad_norm": 0.010129696689546108, + "learning_rate": 1.4729036607169027e-05, + "loss": 0.0001, + "step": 12790 + }, + { + "epoch": 3.374356944994064, + "grad_norm": 0.003463742323219776, + "learning_rate": 1.4727864239866353e-05, + "loss": 0.0001, + "step": 12792 + }, + { + "epoch": 3.3748845798707294, + "grad_norm": 0.008967684581875801, + "learning_rate": 1.4726691872563677e-05, + "loss": 0.0001, + "step": 12794 + }, + { + "epoch": 3.375412214747395, + "grad_norm": 0.0550786629319191, + "learning_rate": 1.4725519505261e-05, + "loss": 0.007, + "step": 12796 + }, + { + "epoch": 3.3759398496240602, + "grad_norm": 0.04960915073752403, + "learning_rate": 1.4724347137958324e-05, + "loss": 0.0044, + "step": 12798 + }, + { + "epoch": 3.3764674845007256, + "grad_norm": 0.2250358909368515, + "learning_rate": 1.4723174770655648e-05, + "loss": 0.0008, + "step": 12800 + }, + { + "epoch": 3.3769951193773906, + "grad_norm": 0.0197929535061121, + "learning_rate": 1.472200240335297e-05, + "loss": 0.0002, + "step": 12802 + }, + { + "epoch": 3.377522754254056, + "grad_norm": 0.047493577003479004, + "learning_rate": 1.4720830036050295e-05, + "loss": 0.0008, + "step": 12804 + }, + { + "epoch": 3.3780503891307214, + "grad_norm": 0.0035266748163849115, + "learning_rate": 1.471965766874762e-05, + "loss": 0.0001, + "step": 12806 + }, + { + "epoch": 3.378578024007387, + "grad_norm": 0.003191528841853142, + "learning_rate": 1.4718485301444945e-05, + "loss": 0.0004, + "step": 12808 + }, + { + "epoch": 3.3791056588840522, + "grad_norm": 0.003136835992336273, + "learning_rate": 1.4717312934142268e-05, + "loss": 0.0001, + "step": 12810 + }, + { + "epoch": 3.3796332937607176, + "grad_norm": 0.2909621596336365, + "learning_rate": 1.4716140566839592e-05, + "loss": 0.0045, + "step": 12812 + }, + { + "epoch": 3.380160928637383, + "grad_norm": 0.0032200112473219633, + "learning_rate": 1.4714968199536916e-05, + "loss": 0.0007, + "step": 12814 + }, + { + "epoch": 3.3806885635140485, + "grad_norm": 0.0008903958951123059, + "learning_rate": 1.471379583223424e-05, + "loss": 0.0001, + "step": 12816 + }, + { + "epoch": 3.381216198390714, + "grad_norm": 0.22698207199573517, + "learning_rate": 1.4712623464931563e-05, + "loss": 0.0052, + "step": 12818 + }, + { + "epoch": 3.381743833267379, + "grad_norm": 0.02921552211046219, + "learning_rate": 1.4711451097628888e-05, + "loss": 0.0025, + "step": 12820 + }, + { + "epoch": 3.3822714681440442, + "grad_norm": 0.028323380276560783, + "learning_rate": 1.4710278730326214e-05, + "loss": 0.0018, + "step": 12822 + }, + { + "epoch": 3.3827991030207096, + "grad_norm": 0.005693587474524975, + "learning_rate": 1.4709106363023538e-05, + "loss": 0.0031, + "step": 12824 + }, + { + "epoch": 3.383326737897375, + "grad_norm": 0.01528213731944561, + "learning_rate": 1.470793399572086e-05, + "loss": 0.0001, + "step": 12826 + }, + { + "epoch": 3.3838543727740404, + "grad_norm": 0.27821752429008484, + "learning_rate": 1.4706761628418185e-05, + "loss": 0.002, + "step": 12828 + }, + { + "epoch": 3.384382007650706, + "grad_norm": 0.2859189808368683, + "learning_rate": 1.470558926111551e-05, + "loss": 0.0009, + "step": 12830 + }, + { + "epoch": 3.3849096425273713, + "grad_norm": 0.023879719898104668, + "learning_rate": 1.4704416893812832e-05, + "loss": 0.0001, + "step": 12832 + }, + { + "epoch": 3.385437277404036, + "grad_norm": 0.0563436783850193, + "learning_rate": 1.4703244526510156e-05, + "loss": 0.0003, + "step": 12834 + }, + { + "epoch": 3.3859649122807016, + "grad_norm": 0.02108338475227356, + "learning_rate": 1.470207215920748e-05, + "loss": 0.0002, + "step": 12836 + }, + { + "epoch": 3.386492547157367, + "grad_norm": 0.006681269966065884, + "learning_rate": 1.4700899791904806e-05, + "loss": 0.0001, + "step": 12838 + }, + { + "epoch": 3.3870201820340324, + "grad_norm": 0.005389468278735876, + "learning_rate": 1.4699727424602129e-05, + "loss": 0.0001, + "step": 12840 + }, + { + "epoch": 3.387547816910698, + "grad_norm": 0.17887243628501892, + "learning_rate": 1.4698555057299453e-05, + "loss": 0.0005, + "step": 12842 + }, + { + "epoch": 3.3880754517873632, + "grad_norm": 0.23914456367492676, + "learning_rate": 1.4697382689996778e-05, + "loss": 0.0061, + "step": 12844 + }, + { + "epoch": 3.3886030866640287, + "grad_norm": 0.007782400585711002, + "learning_rate": 1.4696210322694102e-05, + "loss": 0.0003, + "step": 12846 + }, + { + "epoch": 3.3891307215406936, + "grad_norm": 0.3987472653388977, + "learning_rate": 1.4695037955391425e-05, + "loss": 0.0036, + "step": 12848 + }, + { + "epoch": 3.389658356417359, + "grad_norm": 0.0004978901124559343, + "learning_rate": 1.4693865588088749e-05, + "loss": 0.0, + "step": 12850 + }, + { + "epoch": 3.3901859912940244, + "grad_norm": 0.004095248878002167, + "learning_rate": 1.4692693220786073e-05, + "loss": 0.0036, + "step": 12852 + }, + { + "epoch": 3.39071362617069, + "grad_norm": 0.008316470310091972, + "learning_rate": 1.46915208534834e-05, + "loss": 0.0018, + "step": 12854 + }, + { + "epoch": 3.3912412610473552, + "grad_norm": 0.00938620138913393, + "learning_rate": 1.4690348486180722e-05, + "loss": 0.0001, + "step": 12856 + }, + { + "epoch": 3.3917688959240206, + "grad_norm": 0.19050489366054535, + "learning_rate": 1.4689176118878046e-05, + "loss": 0.0013, + "step": 12858 + }, + { + "epoch": 3.392296530800686, + "grad_norm": 0.018561897799372673, + "learning_rate": 1.468800375157537e-05, + "loss": 0.0002, + "step": 12860 + }, + { + "epoch": 3.3928241656773515, + "grad_norm": 0.03019341640174389, + "learning_rate": 1.4686831384272693e-05, + "loss": 0.0007, + "step": 12862 + }, + { + "epoch": 3.393351800554017, + "grad_norm": 0.10786980390548706, + "learning_rate": 1.4685659016970017e-05, + "loss": 0.0007, + "step": 12864 + }, + { + "epoch": 3.393879435430682, + "grad_norm": 0.010366533882915974, + "learning_rate": 1.4684486649667342e-05, + "loss": 0.0002, + "step": 12866 + }, + { + "epoch": 3.3944070703073472, + "grad_norm": 0.12161105871200562, + "learning_rate": 1.4683314282364668e-05, + "loss": 0.0007, + "step": 12868 + }, + { + "epoch": 3.3949347051840126, + "grad_norm": 0.0025886318180710077, + "learning_rate": 1.4682141915061989e-05, + "loss": 0.0001, + "step": 12870 + }, + { + "epoch": 3.395462340060678, + "grad_norm": 0.003819168545305729, + "learning_rate": 1.4680969547759315e-05, + "loss": 0.0001, + "step": 12872 + }, + { + "epoch": 3.3959899749373434, + "grad_norm": 0.27307358384132385, + "learning_rate": 1.4679797180456639e-05, + "loss": 0.0019, + "step": 12874 + }, + { + "epoch": 3.396517609814009, + "grad_norm": 0.0312928669154644, + "learning_rate": 1.4678624813153963e-05, + "loss": 0.0042, + "step": 12876 + }, + { + "epoch": 3.3970452446906743, + "grad_norm": 0.07344041019678116, + "learning_rate": 1.4677452445851286e-05, + "loss": 0.0062, + "step": 12878 + }, + { + "epoch": 3.397572879567339, + "grad_norm": 0.02616173028945923, + "learning_rate": 1.467628007854861e-05, + "loss": 0.0007, + "step": 12880 + }, + { + "epoch": 3.3981005144440046, + "grad_norm": 1.115526795387268, + "learning_rate": 1.4675107711245934e-05, + "loss": 0.0026, + "step": 12882 + }, + { + "epoch": 3.39862814932067, + "grad_norm": 0.0024312317837029696, + "learning_rate": 1.467393534394326e-05, + "loss": 0.0001, + "step": 12884 + }, + { + "epoch": 3.3991557841973354, + "grad_norm": 0.06243127956986427, + "learning_rate": 1.4672762976640583e-05, + "loss": 0.0004, + "step": 12886 + }, + { + "epoch": 3.399683419074001, + "grad_norm": 0.04226028546690941, + "learning_rate": 1.4671590609337907e-05, + "loss": 0.0037, + "step": 12888 + }, + { + "epoch": 3.4002110539506663, + "grad_norm": 0.0008993876399472356, + "learning_rate": 1.4670418242035232e-05, + "loss": 0.0001, + "step": 12890 + }, + { + "epoch": 3.4007386888273317, + "grad_norm": 0.0013941143406555057, + "learning_rate": 1.4669245874732554e-05, + "loss": 0.0001, + "step": 12892 + }, + { + "epoch": 3.4012663237039966, + "grad_norm": 0.0025026986841112375, + "learning_rate": 1.4668073507429878e-05, + "loss": 0.0002, + "step": 12894 + }, + { + "epoch": 3.401793958580662, + "grad_norm": 0.025497181341052055, + "learning_rate": 1.4666901140127203e-05, + "loss": 0.0004, + "step": 12896 + }, + { + "epoch": 3.4023215934573274, + "grad_norm": 0.002218756126239896, + "learning_rate": 1.4665728772824527e-05, + "loss": 0.0002, + "step": 12898 + }, + { + "epoch": 3.402849228333993, + "grad_norm": 0.168950155377388, + "learning_rate": 1.466455640552185e-05, + "loss": 0.001, + "step": 12900 + }, + { + "epoch": 3.4033768632106582, + "grad_norm": 0.0011952592758461833, + "learning_rate": 1.4663384038219176e-05, + "loss": 0.0001, + "step": 12902 + }, + { + "epoch": 3.4039044980873236, + "grad_norm": 0.03807021677494049, + "learning_rate": 1.46622116709165e-05, + "loss": 0.0018, + "step": 12904 + }, + { + "epoch": 3.404432132963989, + "grad_norm": 0.002987172454595566, + "learning_rate": 1.4661039303613824e-05, + "loss": 0.0001, + "step": 12906 + }, + { + "epoch": 3.4049597678406545, + "grad_norm": 0.7772179245948792, + "learning_rate": 1.4659866936311147e-05, + "loss": 0.0082, + "step": 12908 + }, + { + "epoch": 3.4054874027173194, + "grad_norm": 0.014136211946606636, + "learning_rate": 1.4658694569008471e-05, + "loss": 0.0008, + "step": 12910 + }, + { + "epoch": 3.406015037593985, + "grad_norm": 0.0004706461331807077, + "learning_rate": 1.4657522201705796e-05, + "loss": 0.0002, + "step": 12912 + }, + { + "epoch": 3.4065426724706502, + "grad_norm": 0.05921865999698639, + "learning_rate": 1.4656349834403122e-05, + "loss": 0.0004, + "step": 12914 + }, + { + "epoch": 3.4070703073473156, + "grad_norm": 0.18930469453334808, + "learning_rate": 1.4655177467100442e-05, + "loss": 0.0007, + "step": 12916 + }, + { + "epoch": 3.407597942223981, + "grad_norm": 0.003198957536369562, + "learning_rate": 1.4654005099797768e-05, + "loss": 0.0001, + "step": 12918 + }, + { + "epoch": 3.4081255771006465, + "grad_norm": 0.0012372960336506367, + "learning_rate": 1.4652832732495093e-05, + "loss": 0.0023, + "step": 12920 + }, + { + "epoch": 3.408653211977312, + "grad_norm": 0.0007213413482531905, + "learning_rate": 1.4651660365192415e-05, + "loss": 0.0001, + "step": 12922 + }, + { + "epoch": 3.4091808468539773, + "grad_norm": 0.0004380934115033597, + "learning_rate": 1.465048799788974e-05, + "loss": 0.0001, + "step": 12924 + }, + { + "epoch": 3.4097084817306422, + "grad_norm": 0.052899863570928574, + "learning_rate": 1.4649315630587064e-05, + "loss": 0.0001, + "step": 12926 + }, + { + "epoch": 3.4102361166073076, + "grad_norm": 0.17565637826919556, + "learning_rate": 1.4648143263284388e-05, + "loss": 0.0095, + "step": 12928 + }, + { + "epoch": 3.410763751483973, + "grad_norm": 0.1983254849910736, + "learning_rate": 1.4646970895981711e-05, + "loss": 0.0072, + "step": 12930 + }, + { + "epoch": 3.4112913863606384, + "grad_norm": 0.007500369101762772, + "learning_rate": 1.4645798528679037e-05, + "loss": 0.0001, + "step": 12932 + }, + { + "epoch": 3.411819021237304, + "grad_norm": 0.06700573116540909, + "learning_rate": 1.4644626161376361e-05, + "loss": 0.0009, + "step": 12934 + }, + { + "epoch": 3.4123466561139693, + "grad_norm": 0.021525762975215912, + "learning_rate": 1.4643453794073685e-05, + "loss": 0.0004, + "step": 12936 + }, + { + "epoch": 3.4128742909906347, + "grad_norm": 0.02362729422748089, + "learning_rate": 1.4642281426771008e-05, + "loss": 0.0002, + "step": 12938 + }, + { + "epoch": 3.4134019258672996, + "grad_norm": 0.02769460342824459, + "learning_rate": 1.4641109059468332e-05, + "loss": 0.0005, + "step": 12940 + }, + { + "epoch": 3.413929560743965, + "grad_norm": 0.22016781568527222, + "learning_rate": 1.4639936692165657e-05, + "loss": 0.0014, + "step": 12942 + }, + { + "epoch": 3.4144571956206304, + "grad_norm": 0.03808465227484703, + "learning_rate": 1.4638764324862981e-05, + "loss": 0.0005, + "step": 12944 + }, + { + "epoch": 3.414984830497296, + "grad_norm": 0.22575907409191132, + "learning_rate": 1.4637591957560304e-05, + "loss": 0.0112, + "step": 12946 + }, + { + "epoch": 3.4155124653739612, + "grad_norm": 0.25544652342796326, + "learning_rate": 1.463641959025763e-05, + "loss": 0.0013, + "step": 12948 + }, + { + "epoch": 3.4160401002506267, + "grad_norm": 0.054190728813409805, + "learning_rate": 1.4635247222954954e-05, + "loss": 0.0029, + "step": 12950 + }, + { + "epoch": 3.416567735127292, + "grad_norm": 0.16938740015029907, + "learning_rate": 1.4634074855652277e-05, + "loss": 0.0015, + "step": 12952 + }, + { + "epoch": 3.417095370003957, + "grad_norm": 0.006905209738761187, + "learning_rate": 1.46329024883496e-05, + "loss": 0.0008, + "step": 12954 + }, + { + "epoch": 3.4176230048806224, + "grad_norm": 0.11812205612659454, + "learning_rate": 1.4631730121046925e-05, + "loss": 0.0031, + "step": 12956 + }, + { + "epoch": 3.418150639757288, + "grad_norm": 0.011428671889007092, + "learning_rate": 1.463055775374425e-05, + "loss": 0.0002, + "step": 12958 + }, + { + "epoch": 3.4186782746339532, + "grad_norm": 0.07749740779399872, + "learning_rate": 1.4629385386441572e-05, + "loss": 0.0007, + "step": 12960 + }, + { + "epoch": 3.4192059095106186, + "grad_norm": 0.018794246017932892, + "learning_rate": 1.4628213019138896e-05, + "loss": 0.0002, + "step": 12962 + }, + { + "epoch": 3.419733544387284, + "grad_norm": 0.0018723801476880908, + "learning_rate": 1.4627040651836222e-05, + "loss": 0.0002, + "step": 12964 + }, + { + "epoch": 3.4202611792639495, + "grad_norm": 0.00544018717482686, + "learning_rate": 1.4625868284533547e-05, + "loss": 0.0001, + "step": 12966 + }, + { + "epoch": 3.420788814140615, + "grad_norm": 0.10312362760305405, + "learning_rate": 1.462469591723087e-05, + "loss": 0.0054, + "step": 12968 + }, + { + "epoch": 3.4213164490172803, + "grad_norm": 0.0009536227444186807, + "learning_rate": 1.4623523549928194e-05, + "loss": 0.0001, + "step": 12970 + }, + { + "epoch": 3.4218440838939452, + "grad_norm": 0.012317696586251259, + "learning_rate": 1.4622351182625518e-05, + "loss": 0.0003, + "step": 12972 + }, + { + "epoch": 3.4223717187706106, + "grad_norm": 0.00248960149474442, + "learning_rate": 1.4621178815322842e-05, + "loss": 0.0001, + "step": 12974 + }, + { + "epoch": 3.422899353647276, + "grad_norm": 0.03216012194752693, + "learning_rate": 1.4620006448020165e-05, + "loss": 0.0035, + "step": 12976 + }, + { + "epoch": 3.4234269885239414, + "grad_norm": 0.004353682044893503, + "learning_rate": 1.461883408071749e-05, + "loss": 0.0001, + "step": 12978 + }, + { + "epoch": 3.423954623400607, + "grad_norm": 0.004445212427526712, + "learning_rate": 1.4617661713414815e-05, + "loss": 0.0001, + "step": 12980 + }, + { + "epoch": 3.4244822582772723, + "grad_norm": 0.021008435636758804, + "learning_rate": 1.4616489346112138e-05, + "loss": 0.0005, + "step": 12982 + }, + { + "epoch": 3.4250098931539377, + "grad_norm": 0.16950669884681702, + "learning_rate": 1.4615316978809462e-05, + "loss": 0.0011, + "step": 12984 + }, + { + "epoch": 3.4255375280306026, + "grad_norm": 0.02024170011281967, + "learning_rate": 1.4614144611506786e-05, + "loss": 0.0002, + "step": 12986 + }, + { + "epoch": 3.426065162907268, + "grad_norm": 0.0055969590321183205, + "learning_rate": 1.461297224420411e-05, + "loss": 0.001, + "step": 12988 + }, + { + "epoch": 3.4265927977839334, + "grad_norm": 0.3322409391403198, + "learning_rate": 1.4611799876901433e-05, + "loss": 0.0053, + "step": 12990 + }, + { + "epoch": 3.427120432660599, + "grad_norm": 0.014525067061185837, + "learning_rate": 1.4610627509598758e-05, + "loss": 0.0004, + "step": 12992 + }, + { + "epoch": 3.4276480675372643, + "grad_norm": 0.11429799348115921, + "learning_rate": 1.4609455142296084e-05, + "loss": 0.0037, + "step": 12994 + }, + { + "epoch": 3.4281757024139297, + "grad_norm": 0.02411283366382122, + "learning_rate": 1.4608282774993408e-05, + "loss": 0.0004, + "step": 12996 + }, + { + "epoch": 3.428703337290595, + "grad_norm": 0.026028091087937355, + "learning_rate": 1.460711040769073e-05, + "loss": 0.0013, + "step": 12998 + }, + { + "epoch": 3.42923097216726, + "grad_norm": 0.025786278769373894, + "learning_rate": 1.4605938040388055e-05, + "loss": 0.0005, + "step": 13000 + }, + { + "epoch": 3.4297586070439254, + "grad_norm": 0.06100715696811676, + "learning_rate": 1.4604765673085379e-05, + "loss": 0.0074, + "step": 13002 + }, + { + "epoch": 3.430286241920591, + "grad_norm": 0.029771259054541588, + "learning_rate": 1.4603593305782703e-05, + "loss": 0.0004, + "step": 13004 + }, + { + "epoch": 3.4308138767972562, + "grad_norm": 0.014716899953782558, + "learning_rate": 1.4602420938480026e-05, + "loss": 0.0002, + "step": 13006 + }, + { + "epoch": 3.4313415116739217, + "grad_norm": 0.010676209814846516, + "learning_rate": 1.460124857117735e-05, + "loss": 0.0003, + "step": 13008 + }, + { + "epoch": 3.431869146550587, + "grad_norm": 0.0028787024784833193, + "learning_rate": 1.4600076203874676e-05, + "loss": 0.0001, + "step": 13010 + }, + { + "epoch": 3.4323967814272525, + "grad_norm": 0.006652758456766605, + "learning_rate": 1.4598903836571999e-05, + "loss": 0.0003, + "step": 13012 + }, + { + "epoch": 3.432924416303918, + "grad_norm": 0.006779851857572794, + "learning_rate": 1.4597731469269323e-05, + "loss": 0.0002, + "step": 13014 + }, + { + "epoch": 3.4334520511805833, + "grad_norm": 0.0048193056136369705, + "learning_rate": 1.4596559101966647e-05, + "loss": 0.0002, + "step": 13016 + }, + { + "epoch": 3.4339796860572482, + "grad_norm": 0.004912851378321648, + "learning_rate": 1.4595386734663972e-05, + "loss": 0.0001, + "step": 13018 + }, + { + "epoch": 3.4345073209339136, + "grad_norm": 0.015390881337225437, + "learning_rate": 1.4594214367361294e-05, + "loss": 0.0002, + "step": 13020 + }, + { + "epoch": 3.435034955810579, + "grad_norm": 0.0016472802963107824, + "learning_rate": 1.4593042000058619e-05, + "loss": 0.0001, + "step": 13022 + }, + { + "epoch": 3.4355625906872445, + "grad_norm": 0.16624978184700012, + "learning_rate": 1.4591869632755945e-05, + "loss": 0.0057, + "step": 13024 + }, + { + "epoch": 3.43609022556391, + "grad_norm": 0.002382928505539894, + "learning_rate": 1.4590697265453269e-05, + "loss": 0.0002, + "step": 13026 + }, + { + "epoch": 3.4366178604405753, + "grad_norm": 0.014473461546003819, + "learning_rate": 1.4589524898150592e-05, + "loss": 0.0001, + "step": 13028 + }, + { + "epoch": 3.4371454953172407, + "grad_norm": 0.0010075210593640804, + "learning_rate": 1.4588352530847916e-05, + "loss": 0.0001, + "step": 13030 + }, + { + "epoch": 3.4376731301939056, + "grad_norm": 0.0008455117349512875, + "learning_rate": 1.458718016354524e-05, + "loss": 0.0005, + "step": 13032 + }, + { + "epoch": 3.438200765070571, + "grad_norm": 0.0019998580683022738, + "learning_rate": 1.4586007796242564e-05, + "loss": 0.0001, + "step": 13034 + }, + { + "epoch": 3.4387283999472364, + "grad_norm": 0.03806300833821297, + "learning_rate": 1.4584835428939887e-05, + "loss": 0.0061, + "step": 13036 + }, + { + "epoch": 3.439256034823902, + "grad_norm": 0.002800910733640194, + "learning_rate": 1.4583663061637211e-05, + "loss": 0.0001, + "step": 13038 + }, + { + "epoch": 3.4397836697005673, + "grad_norm": 0.12121741473674774, + "learning_rate": 1.4582490694334537e-05, + "loss": 0.0157, + "step": 13040 + }, + { + "epoch": 3.4403113045772327, + "grad_norm": 0.00597397843375802, + "learning_rate": 1.458131832703186e-05, + "loss": 0.0002, + "step": 13042 + }, + { + "epoch": 3.440838939453898, + "grad_norm": 0.06579842418432236, + "learning_rate": 1.4580145959729184e-05, + "loss": 0.0043, + "step": 13044 + }, + { + "epoch": 3.441366574330563, + "grad_norm": 0.3799944818019867, + "learning_rate": 1.4578973592426509e-05, + "loss": 0.0014, + "step": 13046 + }, + { + "epoch": 3.4418942092072284, + "grad_norm": 0.0035905202385038137, + "learning_rate": 1.4577801225123833e-05, + "loss": 0.0001, + "step": 13048 + }, + { + "epoch": 3.442421844083894, + "grad_norm": 0.7379227876663208, + "learning_rate": 1.4576628857821156e-05, + "loss": 0.003, + "step": 13050 + }, + { + "epoch": 3.4429494789605593, + "grad_norm": 0.11414764821529388, + "learning_rate": 1.457545649051848e-05, + "loss": 0.0061, + "step": 13052 + }, + { + "epoch": 3.4434771138372247, + "grad_norm": 0.07504340261220932, + "learning_rate": 1.4574284123215804e-05, + "loss": 0.0005, + "step": 13054 + }, + { + "epoch": 3.44400474871389, + "grad_norm": 0.00634445995092392, + "learning_rate": 1.457311175591313e-05, + "loss": 0.0001, + "step": 13056 + }, + { + "epoch": 3.4445323835905555, + "grad_norm": 0.06721663475036621, + "learning_rate": 1.4571939388610453e-05, + "loss": 0.0016, + "step": 13058 + }, + { + "epoch": 3.445060018467221, + "grad_norm": 0.16639603674411774, + "learning_rate": 1.4570767021307777e-05, + "loss": 0.0009, + "step": 13060 + }, + { + "epoch": 3.445587653343886, + "grad_norm": 0.02787509374320507, + "learning_rate": 1.4569594654005101e-05, + "loss": 0.0003, + "step": 13062 + }, + { + "epoch": 3.4461152882205512, + "grad_norm": 0.48299503326416016, + "learning_rate": 1.4568422286702426e-05, + "loss": 0.0019, + "step": 13064 + }, + { + "epoch": 3.4466429230972166, + "grad_norm": 0.0036132936365902424, + "learning_rate": 1.4567249919399748e-05, + "loss": 0.0002, + "step": 13066 + }, + { + "epoch": 3.447170557973882, + "grad_norm": 0.32871299982070923, + "learning_rate": 1.4566077552097073e-05, + "loss": 0.0046, + "step": 13068 + }, + { + "epoch": 3.4476981928505475, + "grad_norm": 0.15009737014770508, + "learning_rate": 1.4564905184794399e-05, + "loss": 0.0015, + "step": 13070 + }, + { + "epoch": 3.448225827727213, + "grad_norm": 0.09231700003147125, + "learning_rate": 1.456373281749172e-05, + "loss": 0.0107, + "step": 13072 + }, + { + "epoch": 3.4487534626038783, + "grad_norm": 0.009320828132331371, + "learning_rate": 1.4562560450189045e-05, + "loss": 0.0004, + "step": 13074 + }, + { + "epoch": 3.4492810974805437, + "grad_norm": 1.248985767364502, + "learning_rate": 1.456138808288637e-05, + "loss": 0.006, + "step": 13076 + }, + { + "epoch": 3.4498087323572086, + "grad_norm": 0.022630739957094193, + "learning_rate": 1.4560215715583694e-05, + "loss": 0.0003, + "step": 13078 + }, + { + "epoch": 3.450336367233874, + "grad_norm": 0.12244158238172531, + "learning_rate": 1.4559043348281017e-05, + "loss": 0.004, + "step": 13080 + }, + { + "epoch": 3.4508640021105395, + "grad_norm": 0.008567840792238712, + "learning_rate": 1.4557870980978341e-05, + "loss": 0.001, + "step": 13082 + }, + { + "epoch": 3.451391636987205, + "grad_norm": 0.025737915188074112, + "learning_rate": 1.4556698613675665e-05, + "loss": 0.0011, + "step": 13084 + }, + { + "epoch": 3.4519192718638703, + "grad_norm": 0.029184643179178238, + "learning_rate": 1.4555526246372991e-05, + "loss": 0.0005, + "step": 13086 + }, + { + "epoch": 3.4524469067405357, + "grad_norm": 0.5094318389892578, + "learning_rate": 1.4554353879070314e-05, + "loss": 0.0025, + "step": 13088 + }, + { + "epoch": 3.452974541617201, + "grad_norm": 0.3287552297115326, + "learning_rate": 1.4553181511767638e-05, + "loss": 0.0018, + "step": 13090 + }, + { + "epoch": 3.453502176493866, + "grad_norm": 0.30590054392814636, + "learning_rate": 1.4552009144464963e-05, + "loss": 0.0098, + "step": 13092 + }, + { + "epoch": 3.4540298113705314, + "grad_norm": 0.004925344604998827, + "learning_rate": 1.4550836777162287e-05, + "loss": 0.0001, + "step": 13094 + }, + { + "epoch": 3.454557446247197, + "grad_norm": 0.006997205782681704, + "learning_rate": 1.454966440985961e-05, + "loss": 0.0002, + "step": 13096 + }, + { + "epoch": 3.4550850811238623, + "grad_norm": 0.004840676207095385, + "learning_rate": 1.4548492042556934e-05, + "loss": 0.0001, + "step": 13098 + }, + { + "epoch": 3.4556127160005277, + "grad_norm": 0.0034233364276587963, + "learning_rate": 1.454731967525426e-05, + "loss": 0.0002, + "step": 13100 + }, + { + "epoch": 3.456140350877193, + "grad_norm": 0.4646050035953522, + "learning_rate": 1.454614730795158e-05, + "loss": 0.0022, + "step": 13102 + }, + { + "epoch": 3.4566679857538585, + "grad_norm": 0.4383513033390045, + "learning_rate": 1.4544974940648907e-05, + "loss": 0.0065, + "step": 13104 + }, + { + "epoch": 3.4571956206305234, + "grad_norm": 0.006174925249069929, + "learning_rate": 1.4543802573346231e-05, + "loss": 0.0002, + "step": 13106 + }, + { + "epoch": 3.457723255507189, + "grad_norm": 0.013052839785814285, + "learning_rate": 1.4542630206043555e-05, + "loss": 0.0004, + "step": 13108 + }, + { + "epoch": 3.4582508903838542, + "grad_norm": 0.016419610008597374, + "learning_rate": 1.4541457838740878e-05, + "loss": 0.0002, + "step": 13110 + }, + { + "epoch": 3.4587785252605197, + "grad_norm": 0.097843237221241, + "learning_rate": 1.4540285471438202e-05, + "loss": 0.0039, + "step": 13112 + }, + { + "epoch": 3.459306160137185, + "grad_norm": 0.013639349490404129, + "learning_rate": 1.4539113104135526e-05, + "loss": 0.0003, + "step": 13114 + }, + { + "epoch": 3.4598337950138505, + "grad_norm": 0.009579506702721119, + "learning_rate": 1.4537940736832852e-05, + "loss": 0.0003, + "step": 13116 + }, + { + "epoch": 3.460361429890516, + "grad_norm": 0.017497645691037178, + "learning_rate": 1.4536768369530173e-05, + "loss": 0.0003, + "step": 13118 + }, + { + "epoch": 3.4608890647671813, + "grad_norm": 0.4386647641658783, + "learning_rate": 1.45355960022275e-05, + "loss": 0.0069, + "step": 13120 + }, + { + "epoch": 3.4614166996438467, + "grad_norm": 0.093409463763237, + "learning_rate": 1.4534423634924824e-05, + "loss": 0.0002, + "step": 13122 + }, + { + "epoch": 3.4619443345205116, + "grad_norm": 0.0126325823366642, + "learning_rate": 1.4533251267622148e-05, + "loss": 0.0035, + "step": 13124 + }, + { + "epoch": 3.462471969397177, + "grad_norm": 0.004116280935704708, + "learning_rate": 1.453207890031947e-05, + "loss": 0.0002, + "step": 13126 + }, + { + "epoch": 3.4629996042738425, + "grad_norm": 0.11507988721132278, + "learning_rate": 1.4530906533016795e-05, + "loss": 0.0041, + "step": 13128 + }, + { + "epoch": 3.463527239150508, + "grad_norm": 0.5472546219825745, + "learning_rate": 1.452973416571412e-05, + "loss": 0.0096, + "step": 13130 + }, + { + "epoch": 3.4640548740271733, + "grad_norm": 0.005371617153286934, + "learning_rate": 1.4528561798411442e-05, + "loss": 0.0002, + "step": 13132 + }, + { + "epoch": 3.4645825089038387, + "grad_norm": 0.016764603555202484, + "learning_rate": 1.4527389431108768e-05, + "loss": 0.0002, + "step": 13134 + }, + { + "epoch": 3.465110143780504, + "grad_norm": 0.008996683172881603, + "learning_rate": 1.4526217063806092e-05, + "loss": 0.0022, + "step": 13136 + }, + { + "epoch": 3.465637778657169, + "grad_norm": 0.0050375922583043575, + "learning_rate": 1.4525044696503416e-05, + "loss": 0.0001, + "step": 13138 + }, + { + "epoch": 3.4661654135338344, + "grad_norm": 0.05399443954229355, + "learning_rate": 1.4523872329200739e-05, + "loss": 0.0008, + "step": 13140 + }, + { + "epoch": 3.4666930484105, + "grad_norm": 0.02366846799850464, + "learning_rate": 1.4522699961898063e-05, + "loss": 0.0004, + "step": 13142 + }, + { + "epoch": 3.4672206832871653, + "grad_norm": 0.03233412280678749, + "learning_rate": 1.4521527594595388e-05, + "loss": 0.001, + "step": 13144 + }, + { + "epoch": 3.4677483181638307, + "grad_norm": 0.008782139979302883, + "learning_rate": 1.4520355227292714e-05, + "loss": 0.0002, + "step": 13146 + }, + { + "epoch": 3.468275953040496, + "grad_norm": 0.0011673751287162304, + "learning_rate": 1.4519182859990035e-05, + "loss": 0.0038, + "step": 13148 + }, + { + "epoch": 3.4688035879171615, + "grad_norm": 0.01102442666888237, + "learning_rate": 1.451801049268736e-05, + "loss": 0.0055, + "step": 13150 + }, + { + "epoch": 3.4693312227938264, + "grad_norm": 0.016549458727240562, + "learning_rate": 1.4516838125384685e-05, + "loss": 0.0002, + "step": 13152 + }, + { + "epoch": 3.469858857670492, + "grad_norm": 0.005932432599365711, + "learning_rate": 1.451566575808201e-05, + "loss": 0.0001, + "step": 13154 + }, + { + "epoch": 3.4703864925471573, + "grad_norm": 0.006412939168512821, + "learning_rate": 1.4514493390779332e-05, + "loss": 0.0001, + "step": 13156 + }, + { + "epoch": 3.4709141274238227, + "grad_norm": 0.21586424112319946, + "learning_rate": 1.4513321023476656e-05, + "loss": 0.003, + "step": 13158 + }, + { + "epoch": 3.471441762300488, + "grad_norm": 0.038032032549381256, + "learning_rate": 1.451214865617398e-05, + "loss": 0.0003, + "step": 13160 + }, + { + "epoch": 3.4719693971771535, + "grad_norm": 0.01949276402592659, + "learning_rate": 1.4510976288871303e-05, + "loss": 0.0002, + "step": 13162 + }, + { + "epoch": 3.472497032053819, + "grad_norm": 0.08945074677467346, + "learning_rate": 1.4509803921568629e-05, + "loss": 0.0008, + "step": 13164 + }, + { + "epoch": 3.4730246669304843, + "grad_norm": 0.028650889173150063, + "learning_rate": 1.4508631554265953e-05, + "loss": 0.0003, + "step": 13166 + }, + { + "epoch": 3.4735523018071497, + "grad_norm": 0.00624432135373354, + "learning_rate": 1.4507459186963278e-05, + "loss": 0.0001, + "step": 13168 + }, + { + "epoch": 3.4740799366838147, + "grad_norm": 0.19685311615467072, + "learning_rate": 1.45062868196606e-05, + "loss": 0.0007, + "step": 13170 + }, + { + "epoch": 3.47460757156048, + "grad_norm": 0.03589513525366783, + "learning_rate": 1.4505114452357925e-05, + "loss": 0.0004, + "step": 13172 + }, + { + "epoch": 3.4751352064371455, + "grad_norm": 0.3839118480682373, + "learning_rate": 1.4503942085055249e-05, + "loss": 0.0006, + "step": 13174 + }, + { + "epoch": 3.475662841313811, + "grad_norm": 0.02023867331445217, + "learning_rate": 1.4502769717752573e-05, + "loss": 0.0002, + "step": 13176 + }, + { + "epoch": 3.4761904761904763, + "grad_norm": 0.0022804499603807926, + "learning_rate": 1.4501597350449896e-05, + "loss": 0.0021, + "step": 13178 + }, + { + "epoch": 3.4767181110671417, + "grad_norm": 0.011299386620521545, + "learning_rate": 1.4500424983147222e-05, + "loss": 0.0002, + "step": 13180 + }, + { + "epoch": 3.477245745943807, + "grad_norm": 0.0038669004570692778, + "learning_rate": 1.4499252615844546e-05, + "loss": 0.0003, + "step": 13182 + }, + { + "epoch": 3.477773380820472, + "grad_norm": 0.0022868807427585125, + "learning_rate": 1.449808024854187e-05, + "loss": 0.0001, + "step": 13184 + }, + { + "epoch": 3.4783010156971375, + "grad_norm": 0.011131820268929005, + "learning_rate": 1.4496907881239193e-05, + "loss": 0.0002, + "step": 13186 + }, + { + "epoch": 3.478828650573803, + "grad_norm": 0.0008949352777563035, + "learning_rate": 1.4495735513936517e-05, + "loss": 0.0001, + "step": 13188 + }, + { + "epoch": 3.4793562854504683, + "grad_norm": 0.0012651582947000861, + "learning_rate": 1.4494563146633842e-05, + "loss": 0.0001, + "step": 13190 + }, + { + "epoch": 3.4798839203271337, + "grad_norm": 0.23505857586860657, + "learning_rate": 1.4493390779331164e-05, + "loss": 0.0062, + "step": 13192 + }, + { + "epoch": 3.480411555203799, + "grad_norm": 0.0015460343565791845, + "learning_rate": 1.4492218412028488e-05, + "loss": 0.0001, + "step": 13194 + }, + { + "epoch": 3.4809391900804645, + "grad_norm": 0.0018347389996051788, + "learning_rate": 1.4491046044725814e-05, + "loss": 0.0001, + "step": 13196 + }, + { + "epoch": 3.4814668249571294, + "grad_norm": 0.002981681376695633, + "learning_rate": 1.4489873677423139e-05, + "loss": 0.0001, + "step": 13198 + }, + { + "epoch": 3.481994459833795, + "grad_norm": 0.19510982930660248, + "learning_rate": 1.4488701310120461e-05, + "loss": 0.0031, + "step": 13200 + }, + { + "epoch": 3.4825220947104603, + "grad_norm": 4.50567102432251, + "learning_rate": 1.4487528942817786e-05, + "loss": 0.0005, + "step": 13202 + }, + { + "epoch": 3.4830497295871257, + "grad_norm": 0.004772757645696402, + "learning_rate": 1.448635657551511e-05, + "loss": 0.0001, + "step": 13204 + }, + { + "epoch": 3.483577364463791, + "grad_norm": 0.0043159048072993755, + "learning_rate": 1.4485184208212434e-05, + "loss": 0.0067, + "step": 13206 + }, + { + "epoch": 3.4841049993404565, + "grad_norm": 0.009085065685212612, + "learning_rate": 1.4484011840909757e-05, + "loss": 0.0043, + "step": 13208 + }, + { + "epoch": 3.484632634217122, + "grad_norm": 0.012796040624380112, + "learning_rate": 1.4482839473607083e-05, + "loss": 0.0001, + "step": 13210 + }, + { + "epoch": 3.4851602690937873, + "grad_norm": 0.17922332882881165, + "learning_rate": 1.4481667106304407e-05, + "loss": 0.0028, + "step": 13212 + }, + { + "epoch": 3.4856879039704527, + "grad_norm": 0.02387390471994877, + "learning_rate": 1.4480494739001732e-05, + "loss": 0.002, + "step": 13214 + }, + { + "epoch": 3.4862155388471177, + "grad_norm": 0.04464009031653404, + "learning_rate": 1.4479322371699054e-05, + "loss": 0.0001, + "step": 13216 + }, + { + "epoch": 3.486743173723783, + "grad_norm": 0.02738698571920395, + "learning_rate": 1.4478150004396378e-05, + "loss": 0.0015, + "step": 13218 + }, + { + "epoch": 3.4872708086004485, + "grad_norm": 0.009192301891744137, + "learning_rate": 1.4476977637093703e-05, + "loss": 0.0027, + "step": 13220 + }, + { + "epoch": 3.487798443477114, + "grad_norm": 0.3036627173423767, + "learning_rate": 1.4475805269791025e-05, + "loss": 0.0053, + "step": 13222 + }, + { + "epoch": 3.4883260783537793, + "grad_norm": 0.0513317696750164, + "learning_rate": 1.447463290248835e-05, + "loss": 0.0002, + "step": 13224 + }, + { + "epoch": 3.4888537132304447, + "grad_norm": 0.018894528970122337, + "learning_rate": 1.4473460535185676e-05, + "loss": 0.0001, + "step": 13226 + }, + { + "epoch": 3.48938134810711, + "grad_norm": 0.006444321013987064, + "learning_rate": 1.4472288167883e-05, + "loss": 0.0001, + "step": 13228 + }, + { + "epoch": 3.489908982983775, + "grad_norm": 0.028665438294410706, + "learning_rate": 1.4471115800580323e-05, + "loss": 0.0003, + "step": 13230 + }, + { + "epoch": 3.4904366178604405, + "grad_norm": 0.007472584024071693, + "learning_rate": 1.4469943433277647e-05, + "loss": 0.0002, + "step": 13232 + }, + { + "epoch": 3.490964252737106, + "grad_norm": 0.0035983072593808174, + "learning_rate": 1.4468771065974971e-05, + "loss": 0.0001, + "step": 13234 + }, + { + "epoch": 3.4914918876137713, + "grad_norm": 0.010211063548922539, + "learning_rate": 1.4467598698672295e-05, + "loss": 0.0001, + "step": 13236 + }, + { + "epoch": 3.4920195224904367, + "grad_norm": 0.18137823045253754, + "learning_rate": 1.4466426331369618e-05, + "loss": 0.0009, + "step": 13238 + }, + { + "epoch": 3.492547157367102, + "grad_norm": 0.17839407920837402, + "learning_rate": 1.4465253964066942e-05, + "loss": 0.005, + "step": 13240 + }, + { + "epoch": 3.4930747922437675, + "grad_norm": 0.0032987992744892836, + "learning_rate": 1.4464081596764268e-05, + "loss": 0.0054, + "step": 13242 + }, + { + "epoch": 3.4936024271204325, + "grad_norm": 0.13308247923851013, + "learning_rate": 1.4462909229461593e-05, + "loss": 0.0024, + "step": 13244 + }, + { + "epoch": 3.494130061997098, + "grad_norm": 0.03800034523010254, + "learning_rate": 1.4461736862158915e-05, + "loss": 0.0005, + "step": 13246 + }, + { + "epoch": 3.4946576968737633, + "grad_norm": 0.002311323070898652, + "learning_rate": 1.446056449485624e-05, + "loss": 0.0001, + "step": 13248 + }, + { + "epoch": 3.4951853317504287, + "grad_norm": 0.0017550018383190036, + "learning_rate": 1.4459392127553564e-05, + "loss": 0.0001, + "step": 13250 + }, + { + "epoch": 3.495712966627094, + "grad_norm": 0.01275341585278511, + "learning_rate": 1.4458219760250887e-05, + "loss": 0.0053, + "step": 13252 + }, + { + "epoch": 3.4962406015037595, + "grad_norm": 0.06619631499052048, + "learning_rate": 1.445704739294821e-05, + "loss": 0.0034, + "step": 13254 + }, + { + "epoch": 3.496768236380425, + "grad_norm": 1.4277780055999756, + "learning_rate": 1.4455875025645537e-05, + "loss": 0.0013, + "step": 13256 + }, + { + "epoch": 3.49729587125709, + "grad_norm": 0.02170044369995594, + "learning_rate": 1.4454702658342861e-05, + "loss": 0.0002, + "step": 13258 + }, + { + "epoch": 3.4978235061337553, + "grad_norm": 0.00341522297821939, + "learning_rate": 1.4453530291040184e-05, + "loss": 0.0001, + "step": 13260 + }, + { + "epoch": 3.4983511410104207, + "grad_norm": 0.004657794255763292, + "learning_rate": 1.4452357923737508e-05, + "loss": 0.0001, + "step": 13262 + }, + { + "epoch": 3.498878775887086, + "grad_norm": 0.018005700781941414, + "learning_rate": 1.4451185556434832e-05, + "loss": 0.0002, + "step": 13264 + }, + { + "epoch": 3.4994064107637515, + "grad_norm": 0.005933028645813465, + "learning_rate": 1.4450013189132157e-05, + "loss": 0.0001, + "step": 13266 + }, + { + "epoch": 3.499934045640417, + "grad_norm": 0.0036303706001490355, + "learning_rate": 1.444884082182948e-05, + "loss": 0.0029, + "step": 13268 + }, + { + "epoch": 3.5004616805170823, + "grad_norm": 0.2860598564147949, + "learning_rate": 1.4447668454526804e-05, + "loss": 0.0017, + "step": 13270 + }, + { + "epoch": 3.5009893153937477, + "grad_norm": 0.004898716229945421, + "learning_rate": 1.444649608722413e-05, + "loss": 0.0001, + "step": 13272 + }, + { + "epoch": 3.501516950270413, + "grad_norm": 0.4070478081703186, + "learning_rate": 1.4445323719921454e-05, + "loss": 0.0046, + "step": 13274 + }, + { + "epoch": 3.502044585147078, + "grad_norm": 0.0025595163460820913, + "learning_rate": 1.4444151352618776e-05, + "loss": 0.0006, + "step": 13276 + }, + { + "epoch": 3.5025722200237435, + "grad_norm": 0.003280411008745432, + "learning_rate": 1.44429789853161e-05, + "loss": 0.0017, + "step": 13278 + }, + { + "epoch": 3.503099854900409, + "grad_norm": 0.3190287947654724, + "learning_rate": 1.4441806618013425e-05, + "loss": 0.0067, + "step": 13280 + }, + { + "epoch": 3.5036274897770743, + "grad_norm": 0.04928690567612648, + "learning_rate": 1.4440634250710748e-05, + "loss": 0.0003, + "step": 13282 + }, + { + "epoch": 3.5041551246537397, + "grad_norm": 0.24481868743896484, + "learning_rate": 1.4439461883408072e-05, + "loss": 0.0006, + "step": 13284 + }, + { + "epoch": 3.504682759530405, + "grad_norm": 0.0026504986453801394, + "learning_rate": 1.4438289516105396e-05, + "loss": 0.0001, + "step": 13286 + }, + { + "epoch": 3.5052103944070705, + "grad_norm": 0.17425015568733215, + "learning_rate": 1.4437117148802722e-05, + "loss": 0.0007, + "step": 13288 + }, + { + "epoch": 3.5057380292837355, + "grad_norm": 0.006609808653593063, + "learning_rate": 1.4435944781500045e-05, + "loss": 0.0001, + "step": 13290 + }, + { + "epoch": 3.506265664160401, + "grad_norm": 0.019675543531775475, + "learning_rate": 1.443477241419737e-05, + "loss": 0.0003, + "step": 13292 + }, + { + "epoch": 3.5067932990370663, + "grad_norm": 0.03719085454940796, + "learning_rate": 1.4433600046894694e-05, + "loss": 0.0002, + "step": 13294 + }, + { + "epoch": 3.5073209339137317, + "grad_norm": 0.07534006237983704, + "learning_rate": 1.4432427679592018e-05, + "loss": 0.0006, + "step": 13296 + }, + { + "epoch": 3.507848568790397, + "grad_norm": 0.006638506427407265, + "learning_rate": 1.443125531228934e-05, + "loss": 0.0001, + "step": 13298 + }, + { + "epoch": 3.5083762036670625, + "grad_norm": 0.019556934013962746, + "learning_rate": 1.4430082944986665e-05, + "loss": 0.0006, + "step": 13300 + }, + { + "epoch": 3.508903838543728, + "grad_norm": 0.002903567161411047, + "learning_rate": 1.442891057768399e-05, + "loss": 0.0001, + "step": 13302 + }, + { + "epoch": 3.509431473420393, + "grad_norm": 0.01289418712258339, + "learning_rate": 1.4427738210381315e-05, + "loss": 0.0001, + "step": 13304 + }, + { + "epoch": 3.5099591082970587, + "grad_norm": 0.0030618137679994106, + "learning_rate": 1.4426565843078638e-05, + "loss": 0.0003, + "step": 13306 + }, + { + "epoch": 3.5104867431737237, + "grad_norm": 0.012666366994380951, + "learning_rate": 1.4425393475775962e-05, + "loss": 0.0003, + "step": 13308 + }, + { + "epoch": 3.511014378050389, + "grad_norm": 0.0012785899452865124, + "learning_rate": 1.4424221108473286e-05, + "loss": 0.0006, + "step": 13310 + }, + { + "epoch": 3.5115420129270545, + "grad_norm": 0.0012210054555907845, + "learning_rate": 1.4423048741170609e-05, + "loss": 0.0034, + "step": 13312 + }, + { + "epoch": 3.51206964780372, + "grad_norm": 0.007756946142762899, + "learning_rate": 1.4421876373867933e-05, + "loss": 0.0001, + "step": 13314 + }, + { + "epoch": 3.5125972826803853, + "grad_norm": 0.1112484410405159, + "learning_rate": 1.4420704006565257e-05, + "loss": 0.0033, + "step": 13316 + }, + { + "epoch": 3.5131249175570503, + "grad_norm": 0.039543043822050095, + "learning_rate": 1.4419531639262583e-05, + "loss": 0.0004, + "step": 13318 + }, + { + "epoch": 3.513652552433716, + "grad_norm": 0.0030271250288933516, + "learning_rate": 1.4418359271959906e-05, + "loss": 0.0002, + "step": 13320 + }, + { + "epoch": 3.514180187310381, + "grad_norm": 0.08772625029087067, + "learning_rate": 1.441718690465723e-05, + "loss": 0.0004, + "step": 13322 + }, + { + "epoch": 3.5147078221870465, + "grad_norm": 0.0013510428834706545, + "learning_rate": 1.4416014537354555e-05, + "loss": 0.0007, + "step": 13324 + }, + { + "epoch": 3.515235457063712, + "grad_norm": 0.0006485724588856101, + "learning_rate": 1.4414842170051879e-05, + "loss": 0.0001, + "step": 13326 + }, + { + "epoch": 3.5157630919403773, + "grad_norm": 0.006814165972173214, + "learning_rate": 1.4413669802749202e-05, + "loss": 0.0018, + "step": 13328 + }, + { + "epoch": 3.5162907268170427, + "grad_norm": 0.00044526145211420953, + "learning_rate": 1.4412497435446526e-05, + "loss": 0.0, + "step": 13330 + }, + { + "epoch": 3.516818361693708, + "grad_norm": 0.004090779926627874, + "learning_rate": 1.441132506814385e-05, + "loss": 0.0003, + "step": 13332 + }, + { + "epoch": 3.5173459965703735, + "grad_norm": 0.01732165738940239, + "learning_rate": 1.4410152700841176e-05, + "loss": 0.0002, + "step": 13334 + }, + { + "epoch": 3.5178736314470385, + "grad_norm": 0.0008789632120169699, + "learning_rate": 1.4408980333538499e-05, + "loss": 0.0001, + "step": 13336 + }, + { + "epoch": 3.518401266323704, + "grad_norm": 0.05112196132540703, + "learning_rate": 1.4407807966235823e-05, + "loss": 0.0002, + "step": 13338 + }, + { + "epoch": 3.5189289012003693, + "grad_norm": 0.0009929134976118803, + "learning_rate": 1.4406635598933147e-05, + "loss": 0.0004, + "step": 13340 + }, + { + "epoch": 3.5194565360770347, + "grad_norm": 0.00731823593378067, + "learning_rate": 1.4405463231630472e-05, + "loss": 0.0002, + "step": 13342 + }, + { + "epoch": 3.5199841709537, + "grad_norm": 0.034352321177721024, + "learning_rate": 1.4404290864327794e-05, + "loss": 0.0021, + "step": 13344 + }, + { + "epoch": 3.5205118058303655, + "grad_norm": 0.3127502501010895, + "learning_rate": 1.4403118497025119e-05, + "loss": 0.0019, + "step": 13346 + }, + { + "epoch": 3.521039440707031, + "grad_norm": 0.006871634162962437, + "learning_rate": 1.4401946129722445e-05, + "loss": 0.0001, + "step": 13348 + }, + { + "epoch": 3.521567075583696, + "grad_norm": 0.22444264590740204, + "learning_rate": 1.4400773762419766e-05, + "loss": 0.0073, + "step": 13350 + }, + { + "epoch": 3.5220947104603613, + "grad_norm": 0.0016749515198171139, + "learning_rate": 1.4399601395117092e-05, + "loss": 0.0001, + "step": 13352 + }, + { + "epoch": 3.5226223453370267, + "grad_norm": 0.12579713761806488, + "learning_rate": 1.4398429027814416e-05, + "loss": 0.0003, + "step": 13354 + }, + { + "epoch": 3.523149980213692, + "grad_norm": 0.0021081198938190937, + "learning_rate": 1.439725666051174e-05, + "loss": 0.0009, + "step": 13356 + }, + { + "epoch": 3.5236776150903575, + "grad_norm": 0.11233742535114288, + "learning_rate": 1.4396084293209063e-05, + "loss": 0.0042, + "step": 13358 + }, + { + "epoch": 3.524205249967023, + "grad_norm": 0.0015413176734000444, + "learning_rate": 1.4394911925906387e-05, + "loss": 0.0, + "step": 13360 + }, + { + "epoch": 3.5247328848436883, + "grad_norm": 0.0008430299931205809, + "learning_rate": 1.4393739558603711e-05, + "loss": 0.0001, + "step": 13362 + }, + { + "epoch": 3.5252605197203533, + "grad_norm": 0.056977529078722, + "learning_rate": 1.4392567191301037e-05, + "loss": 0.002, + "step": 13364 + }, + { + "epoch": 3.525788154597019, + "grad_norm": 0.01077105849981308, + "learning_rate": 1.439139482399836e-05, + "loss": 0.0001, + "step": 13366 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 0.00034250164753757417, + "learning_rate": 1.4390222456695684e-05, + "loss": 0.003, + "step": 13368 + }, + { + "epoch": 3.5268434243503495, + "grad_norm": 0.2205660194158554, + "learning_rate": 1.4389050089393009e-05, + "loss": 0.0017, + "step": 13370 + }, + { + "epoch": 3.527371059227015, + "grad_norm": 0.00620070518925786, + "learning_rate": 1.4387877722090333e-05, + "loss": 0.0001, + "step": 13372 + }, + { + "epoch": 3.5278986941036803, + "grad_norm": 0.004268507473170757, + "learning_rate": 1.4386705354787655e-05, + "loss": 0.0001, + "step": 13374 + }, + { + "epoch": 3.5284263289803457, + "grad_norm": 0.0017061511753126979, + "learning_rate": 1.438553298748498e-05, + "loss": 0.002, + "step": 13376 + }, + { + "epoch": 3.528953963857011, + "grad_norm": 0.0014201455051079392, + "learning_rate": 1.4384360620182304e-05, + "loss": 0.0003, + "step": 13378 + }, + { + "epoch": 3.5294815987336765, + "grad_norm": 0.007614693604409695, + "learning_rate": 1.4383188252879627e-05, + "loss": 0.0005, + "step": 13380 + }, + { + "epoch": 3.5300092336103415, + "grad_norm": 0.20587721467018127, + "learning_rate": 1.4382015885576953e-05, + "loss": 0.0019, + "step": 13382 + }, + { + "epoch": 3.530536868487007, + "grad_norm": 0.02001318521797657, + "learning_rate": 1.4380843518274277e-05, + "loss": 0.0005, + "step": 13384 + }, + { + "epoch": 3.5310645033636723, + "grad_norm": 0.002753373235464096, + "learning_rate": 1.4379671150971601e-05, + "loss": 0.0003, + "step": 13386 + }, + { + "epoch": 3.5315921382403377, + "grad_norm": 0.0030659090261906385, + "learning_rate": 1.4378498783668924e-05, + "loss": 0.0001, + "step": 13388 + }, + { + "epoch": 3.532119773117003, + "grad_norm": 0.011605043895542622, + "learning_rate": 1.4377326416366248e-05, + "loss": 0.0001, + "step": 13390 + }, + { + "epoch": 3.5326474079936685, + "grad_norm": 0.0013540658401325345, + "learning_rate": 1.4376154049063573e-05, + "loss": 0.0, + "step": 13392 + }, + { + "epoch": 3.533175042870334, + "grad_norm": 0.003989151678979397, + "learning_rate": 1.4374981681760899e-05, + "loss": 0.0038, + "step": 13394 + }, + { + "epoch": 3.533702677746999, + "grad_norm": 0.007641843985766172, + "learning_rate": 1.437380931445822e-05, + "loss": 0.0025, + "step": 13396 + }, + { + "epoch": 3.5342303126236643, + "grad_norm": 0.09746041893959045, + "learning_rate": 1.4372636947155545e-05, + "loss": 0.0009, + "step": 13398 + }, + { + "epoch": 3.5347579475003297, + "grad_norm": 0.0014472653856500983, + "learning_rate": 1.437146457985287e-05, + "loss": 0.0001, + "step": 13400 + }, + { + "epoch": 3.535285582376995, + "grad_norm": 0.016034388914704323, + "learning_rate": 1.4370292212550194e-05, + "loss": 0.0008, + "step": 13402 + }, + { + "epoch": 3.5358132172536605, + "grad_norm": 0.008160603232681751, + "learning_rate": 1.4369119845247517e-05, + "loss": 0.0001, + "step": 13404 + }, + { + "epoch": 3.536340852130326, + "grad_norm": 0.19877073168754578, + "learning_rate": 1.4367947477944841e-05, + "loss": 0.0081, + "step": 13406 + }, + { + "epoch": 3.5368684870069913, + "grad_norm": 0.0011256674770265818, + "learning_rate": 1.4366775110642165e-05, + "loss": 0.0001, + "step": 13408 + }, + { + "epoch": 3.5373961218836563, + "grad_norm": 0.0733499750494957, + "learning_rate": 1.4365602743339488e-05, + "loss": 0.002, + "step": 13410 + }, + { + "epoch": 3.537923756760322, + "grad_norm": 0.011633023619651794, + "learning_rate": 1.4364430376036814e-05, + "loss": 0.0001, + "step": 13412 + }, + { + "epoch": 3.538451391636987, + "grad_norm": 0.0023335644509643316, + "learning_rate": 1.4363258008734138e-05, + "loss": 0.0001, + "step": 13414 + }, + { + "epoch": 3.5389790265136525, + "grad_norm": 0.004412886220961809, + "learning_rate": 1.4362085641431462e-05, + "loss": 0.0001, + "step": 13416 + }, + { + "epoch": 3.539506661390318, + "grad_norm": 0.004577846731990576, + "learning_rate": 1.4360913274128785e-05, + "loss": 0.0001, + "step": 13418 + }, + { + "epoch": 3.5400342962669833, + "grad_norm": 0.002333554206416011, + "learning_rate": 1.435974090682611e-05, + "loss": 0.0008, + "step": 13420 + }, + { + "epoch": 3.5405619311436487, + "grad_norm": 0.016421331092715263, + "learning_rate": 1.4358568539523434e-05, + "loss": 0.0002, + "step": 13422 + }, + { + "epoch": 3.541089566020314, + "grad_norm": 0.020917583256959915, + "learning_rate": 1.435739617222076e-05, + "loss": 0.0002, + "step": 13424 + }, + { + "epoch": 3.5416172008969795, + "grad_norm": 0.0014604252064600587, + "learning_rate": 1.435622380491808e-05, + "loss": 0.0001, + "step": 13426 + }, + { + "epoch": 3.5421448357736445, + "grad_norm": 0.02527245692908764, + "learning_rate": 1.4355051437615407e-05, + "loss": 0.0002, + "step": 13428 + }, + { + "epoch": 3.54267247065031, + "grad_norm": 0.02336183935403824, + "learning_rate": 1.4353879070312731e-05, + "loss": 0.0003, + "step": 13430 + }, + { + "epoch": 3.5432001055269753, + "grad_norm": 0.16205239295959473, + "learning_rate": 1.4352706703010055e-05, + "loss": 0.0019, + "step": 13432 + }, + { + "epoch": 3.5437277404036407, + "grad_norm": 0.04707127436995506, + "learning_rate": 1.4351534335707378e-05, + "loss": 0.0003, + "step": 13434 + }, + { + "epoch": 3.544255375280306, + "grad_norm": 0.19462910294532776, + "learning_rate": 1.4350361968404702e-05, + "loss": 0.0023, + "step": 13436 + }, + { + "epoch": 3.5447830101569715, + "grad_norm": 0.007460389751940966, + "learning_rate": 1.4349189601102026e-05, + "loss": 0.0001, + "step": 13438 + }, + { + "epoch": 3.545310645033637, + "grad_norm": 0.0010438116732984781, + "learning_rate": 1.4348017233799349e-05, + "loss": 0.0001, + "step": 13440 + }, + { + "epoch": 3.545838279910302, + "grad_norm": 0.001672861399129033, + "learning_rate": 1.4346844866496673e-05, + "loss": 0.0001, + "step": 13442 + }, + { + "epoch": 3.5463659147869673, + "grad_norm": 0.06308780610561371, + "learning_rate": 1.4345672499194e-05, + "loss": 0.0006, + "step": 13444 + }, + { + "epoch": 3.5468935496636327, + "grad_norm": 0.016390418633818626, + "learning_rate": 1.4344500131891324e-05, + "loss": 0.0001, + "step": 13446 + }, + { + "epoch": 3.547421184540298, + "grad_norm": 0.0038369616959244013, + "learning_rate": 1.4343327764588646e-05, + "loss": 0.0041, + "step": 13448 + }, + { + "epoch": 3.5479488194169635, + "grad_norm": 0.0011918328236788511, + "learning_rate": 1.434215539728597e-05, + "loss": 0.0001, + "step": 13450 + }, + { + "epoch": 3.548476454293629, + "grad_norm": 0.5406104326248169, + "learning_rate": 1.4340983029983295e-05, + "loss": 0.0145, + "step": 13452 + }, + { + "epoch": 3.5490040891702943, + "grad_norm": 0.0031193136237561703, + "learning_rate": 1.433981066268062e-05, + "loss": 0.0001, + "step": 13454 + }, + { + "epoch": 3.5495317240469593, + "grad_norm": 0.017968712374567986, + "learning_rate": 1.4338638295377942e-05, + "loss": 0.0004, + "step": 13456 + }, + { + "epoch": 3.550059358923625, + "grad_norm": 0.0475284717977047, + "learning_rate": 1.4337465928075268e-05, + "loss": 0.0002, + "step": 13458 + }, + { + "epoch": 3.55058699380029, + "grad_norm": 0.033159852027893066, + "learning_rate": 1.4336293560772592e-05, + "loss": 0.0022, + "step": 13460 + }, + { + "epoch": 3.5511146286769555, + "grad_norm": 0.02973903901875019, + "learning_rate": 1.4335121193469916e-05, + "loss": 0.0003, + "step": 13462 + }, + { + "epoch": 3.551642263553621, + "grad_norm": 0.11619208008050919, + "learning_rate": 1.4333948826167239e-05, + "loss": 0.0016, + "step": 13464 + }, + { + "epoch": 3.5521698984302863, + "grad_norm": 0.15278464555740356, + "learning_rate": 1.4332776458864563e-05, + "loss": 0.0008, + "step": 13466 + }, + { + "epoch": 3.5526975333069517, + "grad_norm": 0.6708641052246094, + "learning_rate": 1.4331604091561888e-05, + "loss": 0.0049, + "step": 13468 + }, + { + "epoch": 3.5532251681836167, + "grad_norm": 0.02877923473715782, + "learning_rate": 1.433043172425921e-05, + "loss": 0.0002, + "step": 13470 + }, + { + "epoch": 3.5537528030602825, + "grad_norm": 0.035219475626945496, + "learning_rate": 1.4329259356956535e-05, + "loss": 0.0002, + "step": 13472 + }, + { + "epoch": 3.5542804379369475, + "grad_norm": 0.006910056807100773, + "learning_rate": 1.432808698965386e-05, + "loss": 0.0001, + "step": 13474 + }, + { + "epoch": 3.554808072813613, + "grad_norm": 0.002969237044453621, + "learning_rate": 1.4326914622351185e-05, + "loss": 0.0038, + "step": 13476 + }, + { + "epoch": 3.5553357076902783, + "grad_norm": 0.033056143671274185, + "learning_rate": 1.4325742255048507e-05, + "loss": 0.0001, + "step": 13478 + }, + { + "epoch": 3.5558633425669437, + "grad_norm": 0.009481310844421387, + "learning_rate": 1.4324569887745832e-05, + "loss": 0.0001, + "step": 13480 + }, + { + "epoch": 3.556390977443609, + "grad_norm": 0.27951571345329285, + "learning_rate": 1.4323397520443156e-05, + "loss": 0.0005, + "step": 13482 + }, + { + "epoch": 3.5569186123202745, + "grad_norm": 0.002152862260118127, + "learning_rate": 1.432222515314048e-05, + "loss": 0.0001, + "step": 13484 + }, + { + "epoch": 3.55744624719694, + "grad_norm": 0.016592184081673622, + "learning_rate": 1.4321052785837803e-05, + "loss": 0.0001, + "step": 13486 + }, + { + "epoch": 3.557973882073605, + "grad_norm": 0.015287335962057114, + "learning_rate": 1.4319880418535129e-05, + "loss": 0.0001, + "step": 13488 + }, + { + "epoch": 3.5585015169502703, + "grad_norm": 0.0022955210879445076, + "learning_rate": 1.4318708051232453e-05, + "loss": 0.0016, + "step": 13490 + }, + { + "epoch": 3.5590291518269357, + "grad_norm": 0.0028626907151192427, + "learning_rate": 1.4317535683929778e-05, + "loss": 0.0007, + "step": 13492 + }, + { + "epoch": 3.559556786703601, + "grad_norm": 0.003049307269975543, + "learning_rate": 1.43163633166271e-05, + "loss": 0.0001, + "step": 13494 + }, + { + "epoch": 3.5600844215802665, + "grad_norm": 0.001203340943902731, + "learning_rate": 1.4315190949324424e-05, + "loss": 0.0001, + "step": 13496 + }, + { + "epoch": 3.560612056456932, + "grad_norm": 0.009576382115483284, + "learning_rate": 1.4314018582021749e-05, + "loss": 0.0023, + "step": 13498 + }, + { + "epoch": 3.5611396913335973, + "grad_norm": 0.006663038861006498, + "learning_rate": 1.4312846214719071e-05, + "loss": 0.0001, + "step": 13500 + }, + { + "epoch": 3.5616673262102623, + "grad_norm": 0.011624319478869438, + "learning_rate": 1.4311673847416396e-05, + "loss": 0.0003, + "step": 13502 + }, + { + "epoch": 3.5621949610869277, + "grad_norm": 0.001771369599737227, + "learning_rate": 1.4310501480113722e-05, + "loss": 0.0015, + "step": 13504 + }, + { + "epoch": 3.562722595963593, + "grad_norm": 0.002516245935112238, + "learning_rate": 1.4309329112811046e-05, + "loss": 0.0001, + "step": 13506 + }, + { + "epoch": 3.5632502308402585, + "grad_norm": 0.0295269712805748, + "learning_rate": 1.4308156745508369e-05, + "loss": 0.0003, + "step": 13508 + }, + { + "epoch": 3.563777865716924, + "grad_norm": 0.056787505745887756, + "learning_rate": 1.4306984378205693e-05, + "loss": 0.0004, + "step": 13510 + }, + { + "epoch": 3.5643055005935893, + "grad_norm": 0.24250495433807373, + "learning_rate": 1.4305812010903017e-05, + "loss": 0.0108, + "step": 13512 + }, + { + "epoch": 3.5648331354702547, + "grad_norm": 0.891004741191864, + "learning_rate": 1.4304639643600342e-05, + "loss": 0.0084, + "step": 13514 + }, + { + "epoch": 3.5653607703469197, + "grad_norm": 0.2238980084657669, + "learning_rate": 1.4303467276297664e-05, + "loss": 0.0062, + "step": 13516 + }, + { + "epoch": 3.5658884052235855, + "grad_norm": 0.022016050294041634, + "learning_rate": 1.4302294908994988e-05, + "loss": 0.0001, + "step": 13518 + }, + { + "epoch": 3.5664160401002505, + "grad_norm": 0.11320075392723083, + "learning_rate": 1.4301122541692314e-05, + "loss": 0.0022, + "step": 13520 + }, + { + "epoch": 3.566943674976916, + "grad_norm": 0.08724847435951233, + "learning_rate": 1.4299950174389639e-05, + "loss": 0.0006, + "step": 13522 + }, + { + "epoch": 3.5674713098535813, + "grad_norm": 0.06757239252328873, + "learning_rate": 1.4298777807086961e-05, + "loss": 0.0004, + "step": 13524 + }, + { + "epoch": 3.5679989447302467, + "grad_norm": 0.03070792369544506, + "learning_rate": 1.4297605439784286e-05, + "loss": 0.0009, + "step": 13526 + }, + { + "epoch": 3.568526579606912, + "grad_norm": 0.006289472803473473, + "learning_rate": 1.429643307248161e-05, + "loss": 0.0002, + "step": 13528 + }, + { + "epoch": 3.5690542144835775, + "grad_norm": 0.43815097212791443, + "learning_rate": 1.4295260705178933e-05, + "loss": 0.004, + "step": 13530 + }, + { + "epoch": 3.569581849360243, + "grad_norm": 0.10000594705343246, + "learning_rate": 1.4294088337876257e-05, + "loss": 0.0007, + "step": 13532 + }, + { + "epoch": 3.570109484236908, + "grad_norm": 0.003102256450802088, + "learning_rate": 1.4292915970573583e-05, + "loss": 0.0001, + "step": 13534 + }, + { + "epoch": 3.5706371191135733, + "grad_norm": 0.005605553742498159, + "learning_rate": 1.4291743603270907e-05, + "loss": 0.0002, + "step": 13536 + }, + { + "epoch": 3.5711647539902387, + "grad_norm": 0.006956430152058601, + "learning_rate": 1.429057123596823e-05, + "loss": 0.0001, + "step": 13538 + }, + { + "epoch": 3.571692388866904, + "grad_norm": 0.011766090989112854, + "learning_rate": 1.4289398868665554e-05, + "loss": 0.004, + "step": 13540 + }, + { + "epoch": 3.5722200237435695, + "grad_norm": 0.002993211383000016, + "learning_rate": 1.4288226501362878e-05, + "loss": 0.0002, + "step": 13542 + }, + { + "epoch": 3.572747658620235, + "grad_norm": 0.013599567115306854, + "learning_rate": 1.4287054134060203e-05, + "loss": 0.0001, + "step": 13544 + }, + { + "epoch": 3.5732752934969003, + "grad_norm": 0.0035114630591124296, + "learning_rate": 1.4285881766757525e-05, + "loss": 0.003, + "step": 13546 + }, + { + "epoch": 3.5738029283735653, + "grad_norm": 0.28234735131263733, + "learning_rate": 1.428470939945485e-05, + "loss": 0.002, + "step": 13548 + }, + { + "epoch": 3.5743305632502307, + "grad_norm": 0.18516358733177185, + "learning_rate": 1.4283537032152176e-05, + "loss": 0.0034, + "step": 13550 + }, + { + "epoch": 3.574858198126896, + "grad_norm": 0.020420948043465614, + "learning_rate": 1.42823646648495e-05, + "loss": 0.0002, + "step": 13552 + }, + { + "epoch": 3.5753858330035615, + "grad_norm": 0.11613316088914871, + "learning_rate": 1.4281192297546823e-05, + "loss": 0.0029, + "step": 13554 + }, + { + "epoch": 3.575913467880227, + "grad_norm": 0.0056850239634513855, + "learning_rate": 1.4280019930244147e-05, + "loss": 0.0001, + "step": 13556 + }, + { + "epoch": 3.5764411027568923, + "grad_norm": 0.034527555108070374, + "learning_rate": 1.4278847562941471e-05, + "loss": 0.0002, + "step": 13558 + }, + { + "epoch": 3.5769687376335577, + "grad_norm": 0.00563775235787034, + "learning_rate": 1.4277675195638794e-05, + "loss": 0.0003, + "step": 13560 + }, + { + "epoch": 3.5774963725102227, + "grad_norm": 0.07833846658468246, + "learning_rate": 1.4276502828336118e-05, + "loss": 0.0004, + "step": 13562 + }, + { + "epoch": 3.5780240073868885, + "grad_norm": 0.42317861318588257, + "learning_rate": 1.4275330461033442e-05, + "loss": 0.0077, + "step": 13564 + }, + { + "epoch": 3.5785516422635535, + "grad_norm": 0.04198991507291794, + "learning_rate": 1.4274158093730768e-05, + "loss": 0.0054, + "step": 13566 + }, + { + "epoch": 3.579079277140219, + "grad_norm": 0.006637962069362402, + "learning_rate": 1.4272985726428091e-05, + "loss": 0.0001, + "step": 13568 + }, + { + "epoch": 3.5796069120168843, + "grad_norm": 0.017879415303468704, + "learning_rate": 1.4271813359125415e-05, + "loss": 0.0002, + "step": 13570 + }, + { + "epoch": 3.5801345468935497, + "grad_norm": 0.026664411649107933, + "learning_rate": 1.427064099182274e-05, + "loss": 0.0008, + "step": 13572 + }, + { + "epoch": 3.580662181770215, + "grad_norm": 0.004283298272639513, + "learning_rate": 1.4269468624520064e-05, + "loss": 0.0002, + "step": 13574 + }, + { + "epoch": 3.5811898166468805, + "grad_norm": 0.005414664279669523, + "learning_rate": 1.4268296257217386e-05, + "loss": 0.0013, + "step": 13576 + }, + { + "epoch": 3.581717451523546, + "grad_norm": 0.0037524274084717035, + "learning_rate": 1.426712388991471e-05, + "loss": 0.0001, + "step": 13578 + }, + { + "epoch": 3.582245086400211, + "grad_norm": 0.0028813467361032963, + "learning_rate": 1.4265951522612037e-05, + "loss": 0.0036, + "step": 13580 + }, + { + "epoch": 3.5827727212768763, + "grad_norm": 0.6913414001464844, + "learning_rate": 1.4264779155309361e-05, + "loss": 0.0033, + "step": 13582 + }, + { + "epoch": 3.5833003561535417, + "grad_norm": 0.01214907318353653, + "learning_rate": 1.4263606788006684e-05, + "loss": 0.0001, + "step": 13584 + }, + { + "epoch": 3.583827991030207, + "grad_norm": 0.0016866248333826661, + "learning_rate": 1.4262434420704008e-05, + "loss": 0.0005, + "step": 13586 + }, + { + "epoch": 3.5843556259068725, + "grad_norm": 0.039499782025814056, + "learning_rate": 1.4261262053401332e-05, + "loss": 0.0002, + "step": 13588 + }, + { + "epoch": 3.584883260783538, + "grad_norm": 0.5196143388748169, + "learning_rate": 1.4260089686098655e-05, + "loss": 0.0016, + "step": 13590 + }, + { + "epoch": 3.5854108956602033, + "grad_norm": 0.02723328396677971, + "learning_rate": 1.425891731879598e-05, + "loss": 0.0002, + "step": 13592 + }, + { + "epoch": 3.5859385305368683, + "grad_norm": 0.002471050014719367, + "learning_rate": 1.4257744951493304e-05, + "loss": 0.0058, + "step": 13594 + }, + { + "epoch": 3.5864661654135337, + "grad_norm": 0.007327945437282324, + "learning_rate": 1.425657258419063e-05, + "loss": 0.0001, + "step": 13596 + }, + { + "epoch": 3.586993800290199, + "grad_norm": 0.007348271552473307, + "learning_rate": 1.4255400216887952e-05, + "loss": 0.0009, + "step": 13598 + }, + { + "epoch": 3.5875214351668645, + "grad_norm": 0.007340410724282265, + "learning_rate": 1.4254227849585276e-05, + "loss": 0.0027, + "step": 13600 + }, + { + "epoch": 3.58804907004353, + "grad_norm": 0.007482288405299187, + "learning_rate": 1.42530554822826e-05, + "loss": 0.0001, + "step": 13602 + }, + { + "epoch": 3.5885767049201953, + "grad_norm": 0.15879511833190918, + "learning_rate": 1.4251883114979925e-05, + "loss": 0.0022, + "step": 13604 + }, + { + "epoch": 3.5891043397968607, + "grad_norm": 0.010256988927721977, + "learning_rate": 1.4250710747677248e-05, + "loss": 0.0002, + "step": 13606 + }, + { + "epoch": 3.5896319746735257, + "grad_norm": 0.004293687175959349, + "learning_rate": 1.4249538380374572e-05, + "loss": 0.0036, + "step": 13608 + }, + { + "epoch": 3.5901596095501915, + "grad_norm": 0.5537733435630798, + "learning_rate": 1.4248366013071896e-05, + "loss": 0.0017, + "step": 13610 + }, + { + "epoch": 3.5906872444268565, + "grad_norm": 0.0017727587837725878, + "learning_rate": 1.4247193645769222e-05, + "loss": 0.0001, + "step": 13612 + }, + { + "epoch": 3.591214879303522, + "grad_norm": 0.008389841765165329, + "learning_rate": 1.4246021278466545e-05, + "loss": 0.0001, + "step": 13614 + }, + { + "epoch": 3.5917425141801873, + "grad_norm": 0.06261681765317917, + "learning_rate": 1.424484891116387e-05, + "loss": 0.0004, + "step": 13616 + }, + { + "epoch": 3.5922701490568527, + "grad_norm": 0.002119483193382621, + "learning_rate": 1.4243676543861193e-05, + "loss": 0.0001, + "step": 13618 + }, + { + "epoch": 3.592797783933518, + "grad_norm": 0.017841886729002, + "learning_rate": 1.4242504176558516e-05, + "loss": 0.0001, + "step": 13620 + }, + { + "epoch": 3.593325418810183, + "grad_norm": 0.01631525158882141, + "learning_rate": 1.424133180925584e-05, + "loss": 0.0001, + "step": 13622 + }, + { + "epoch": 3.593853053686849, + "grad_norm": 0.07498383522033691, + "learning_rate": 1.4240159441953165e-05, + "loss": 0.0002, + "step": 13624 + }, + { + "epoch": 3.594380688563514, + "grad_norm": 0.12542247772216797, + "learning_rate": 1.423898707465049e-05, + "loss": 0.0096, + "step": 13626 + }, + { + "epoch": 3.5949083234401793, + "grad_norm": 0.15024369955062866, + "learning_rate": 1.4237814707347812e-05, + "loss": 0.0021, + "step": 13628 + }, + { + "epoch": 3.5954359583168447, + "grad_norm": 0.004627245478332043, + "learning_rate": 1.4236642340045138e-05, + "loss": 0.0005, + "step": 13630 + }, + { + "epoch": 3.59596359319351, + "grad_norm": 0.003503324231132865, + "learning_rate": 1.4235469972742462e-05, + "loss": 0.0001, + "step": 13632 + }, + { + "epoch": 3.5964912280701755, + "grad_norm": 0.00668094540014863, + "learning_rate": 1.4234297605439786e-05, + "loss": 0.0001, + "step": 13634 + }, + { + "epoch": 3.597018862946841, + "grad_norm": 0.046305395662784576, + "learning_rate": 1.4233125238137109e-05, + "loss": 0.0006, + "step": 13636 + }, + { + "epoch": 3.5975464978235063, + "grad_norm": 0.33384087681770325, + "learning_rate": 1.4231952870834433e-05, + "loss": 0.0016, + "step": 13638 + }, + { + "epoch": 3.5980741327001713, + "grad_norm": 0.02464880421757698, + "learning_rate": 1.4230780503531757e-05, + "loss": 0.0003, + "step": 13640 + }, + { + "epoch": 3.5986017675768367, + "grad_norm": 0.1855979561805725, + "learning_rate": 1.4229608136229083e-05, + "loss": 0.0087, + "step": 13642 + }, + { + "epoch": 3.599129402453502, + "grad_norm": 0.294509619474411, + "learning_rate": 1.4228435768926406e-05, + "loss": 0.0013, + "step": 13644 + }, + { + "epoch": 3.5996570373301675, + "grad_norm": 0.24224154651165009, + "learning_rate": 1.422726340162373e-05, + "loss": 0.0028, + "step": 13646 + }, + { + "epoch": 3.600184672206833, + "grad_norm": 0.019307157024741173, + "learning_rate": 1.4226091034321055e-05, + "loss": 0.0002, + "step": 13648 + }, + { + "epoch": 3.6007123070834983, + "grad_norm": 0.004628595430403948, + "learning_rate": 1.4224918667018377e-05, + "loss": 0.0001, + "step": 13650 + }, + { + "epoch": 3.6012399419601637, + "grad_norm": 0.01668064296245575, + "learning_rate": 1.4223746299715702e-05, + "loss": 0.0044, + "step": 13652 + }, + { + "epoch": 3.6017675768368287, + "grad_norm": 0.10748594999313354, + "learning_rate": 1.4222573932413026e-05, + "loss": 0.0058, + "step": 13654 + }, + { + "epoch": 3.6022952117134945, + "grad_norm": 0.009482168592512608, + "learning_rate": 1.422140156511035e-05, + "loss": 0.0001, + "step": 13656 + }, + { + "epoch": 3.6028228465901595, + "grad_norm": 0.13630633056163788, + "learning_rate": 1.4220229197807673e-05, + "loss": 0.0031, + "step": 13658 + }, + { + "epoch": 3.603350481466825, + "grad_norm": 0.012034938670694828, + "learning_rate": 1.4219056830504999e-05, + "loss": 0.0003, + "step": 13660 + }, + { + "epoch": 3.6038781163434903, + "grad_norm": 0.013477620668709278, + "learning_rate": 1.4217884463202323e-05, + "loss": 0.0002, + "step": 13662 + }, + { + "epoch": 3.6044057512201557, + "grad_norm": 0.021344821900129318, + "learning_rate": 1.4216712095899647e-05, + "loss": 0.0003, + "step": 13664 + }, + { + "epoch": 3.604933386096821, + "grad_norm": 0.022437922656536102, + "learning_rate": 1.421553972859697e-05, + "loss": 0.0053, + "step": 13666 + }, + { + "epoch": 3.605461020973486, + "grad_norm": 0.047965701669454575, + "learning_rate": 1.4214367361294294e-05, + "loss": 0.0009, + "step": 13668 + }, + { + "epoch": 3.605988655850152, + "grad_norm": 0.03075437992811203, + "learning_rate": 1.4213194993991619e-05, + "loss": 0.0002, + "step": 13670 + }, + { + "epoch": 3.606516290726817, + "grad_norm": 0.03520682081580162, + "learning_rate": 1.4212022626688945e-05, + "loss": 0.0006, + "step": 13672 + }, + { + "epoch": 3.6070439256034823, + "grad_norm": 0.003198059508576989, + "learning_rate": 1.4210850259386266e-05, + "loss": 0.0001, + "step": 13674 + }, + { + "epoch": 3.6075715604801477, + "grad_norm": 0.009106059558689594, + "learning_rate": 1.4209677892083591e-05, + "loss": 0.0031, + "step": 13676 + }, + { + "epoch": 3.608099195356813, + "grad_norm": 0.004883518908172846, + "learning_rate": 1.4208505524780916e-05, + "loss": 0.0001, + "step": 13678 + }, + { + "epoch": 3.6086268302334785, + "grad_norm": 0.15545164048671722, + "learning_rate": 1.4207333157478238e-05, + "loss": 0.0044, + "step": 13680 + }, + { + "epoch": 3.609154465110144, + "grad_norm": 0.060108933597803116, + "learning_rate": 1.4206160790175563e-05, + "loss": 0.0005, + "step": 13682 + }, + { + "epoch": 3.6096820999868093, + "grad_norm": 0.3105689585208893, + "learning_rate": 1.4204988422872887e-05, + "loss": 0.0049, + "step": 13684 + }, + { + "epoch": 3.6102097348634743, + "grad_norm": 0.006968844216316938, + "learning_rate": 1.4203816055570211e-05, + "loss": 0.0031, + "step": 13686 + }, + { + "epoch": 3.6107373697401397, + "grad_norm": 0.08934616297483444, + "learning_rate": 1.4202643688267534e-05, + "loss": 0.0004, + "step": 13688 + }, + { + "epoch": 3.611265004616805, + "grad_norm": 0.12971992790699005, + "learning_rate": 1.420147132096486e-05, + "loss": 0.0007, + "step": 13690 + }, + { + "epoch": 3.6117926394934705, + "grad_norm": 0.10447660833597183, + "learning_rate": 1.4200298953662184e-05, + "loss": 0.0125, + "step": 13692 + }, + { + "epoch": 3.612320274370136, + "grad_norm": 0.00939419399946928, + "learning_rate": 1.4199126586359509e-05, + "loss": 0.0027, + "step": 13694 + }, + { + "epoch": 3.6128479092468013, + "grad_norm": 0.00948688481003046, + "learning_rate": 1.4197954219056831e-05, + "loss": 0.0003, + "step": 13696 + }, + { + "epoch": 3.6133755441234667, + "grad_norm": 0.014697683975100517, + "learning_rate": 1.4196781851754155e-05, + "loss": 0.0003, + "step": 13698 + }, + { + "epoch": 3.6139031790001317, + "grad_norm": 0.02223571017384529, + "learning_rate": 1.419560948445148e-05, + "loss": 0.0004, + "step": 13700 + }, + { + "epoch": 3.614430813876797, + "grad_norm": 0.011471452191472054, + "learning_rate": 1.4194437117148804e-05, + "loss": 0.0003, + "step": 13702 + }, + { + "epoch": 3.6149584487534625, + "grad_norm": 0.01984969712793827, + "learning_rate": 1.4193264749846127e-05, + "loss": 0.0003, + "step": 13704 + }, + { + "epoch": 3.615486083630128, + "grad_norm": 0.0411917120218277, + "learning_rate": 1.4192092382543453e-05, + "loss": 0.0007, + "step": 13706 + }, + { + "epoch": 3.6160137185067933, + "grad_norm": 0.26495519280433655, + "learning_rate": 1.4190920015240777e-05, + "loss": 0.0012, + "step": 13708 + }, + { + "epoch": 3.6165413533834587, + "grad_norm": 0.00799752026796341, + "learning_rate": 1.41897476479381e-05, + "loss": 0.0001, + "step": 13710 + }, + { + "epoch": 3.617068988260124, + "grad_norm": 0.018395960330963135, + "learning_rate": 1.4188575280635424e-05, + "loss": 0.0003, + "step": 13712 + }, + { + "epoch": 3.617596623136789, + "grad_norm": 0.22137223184108734, + "learning_rate": 1.4187402913332748e-05, + "loss": 0.0155, + "step": 13714 + }, + { + "epoch": 3.618124258013455, + "grad_norm": 0.015872566029429436, + "learning_rate": 1.4186230546030072e-05, + "loss": 0.0003, + "step": 13716 + }, + { + "epoch": 3.61865189289012, + "grad_norm": 0.01204270776361227, + "learning_rate": 1.4185058178727395e-05, + "loss": 0.0008, + "step": 13718 + }, + { + "epoch": 3.6191795277667853, + "grad_norm": 0.02816845290362835, + "learning_rate": 1.418388581142472e-05, + "loss": 0.0006, + "step": 13720 + }, + { + "epoch": 3.6197071626434507, + "grad_norm": 0.013066752813756466, + "learning_rate": 1.4182713444122045e-05, + "loss": 0.0023, + "step": 13722 + }, + { + "epoch": 3.620234797520116, + "grad_norm": 0.1766463667154312, + "learning_rate": 1.418154107681937e-05, + "loss": 0.0014, + "step": 13724 + }, + { + "epoch": 3.6207624323967815, + "grad_norm": 0.013522560708224773, + "learning_rate": 1.4180368709516692e-05, + "loss": 0.0004, + "step": 13726 + }, + { + "epoch": 3.621290067273447, + "grad_norm": 0.008264630101621151, + "learning_rate": 1.4179196342214017e-05, + "loss": 0.0002, + "step": 13728 + }, + { + "epoch": 3.6218177021501123, + "grad_norm": 0.3534570634365082, + "learning_rate": 1.4178023974911341e-05, + "loss": 0.0019, + "step": 13730 + }, + { + "epoch": 3.6223453370267773, + "grad_norm": 0.010552404448390007, + "learning_rate": 1.4176851607608665e-05, + "loss": 0.0002, + "step": 13732 + }, + { + "epoch": 3.6228729719034427, + "grad_norm": 0.2339635193347931, + "learning_rate": 1.4175679240305988e-05, + "loss": 0.0021, + "step": 13734 + }, + { + "epoch": 3.623400606780108, + "grad_norm": 0.00953048188239336, + "learning_rate": 1.4174506873003314e-05, + "loss": 0.0002, + "step": 13736 + }, + { + "epoch": 3.6239282416567735, + "grad_norm": 0.001122951740399003, + "learning_rate": 1.4173334505700638e-05, + "loss": 0.0001, + "step": 13738 + }, + { + "epoch": 3.624455876533439, + "grad_norm": 0.0019843855407088995, + "learning_rate": 1.417216213839796e-05, + "loss": 0.0061, + "step": 13740 + }, + { + "epoch": 3.6249835114101043, + "grad_norm": 0.183063343167305, + "learning_rate": 1.4170989771095285e-05, + "loss": 0.0043, + "step": 13742 + }, + { + "epoch": 3.6255111462867697, + "grad_norm": 0.0019649858586490154, + "learning_rate": 1.416981740379261e-05, + "loss": 0.0001, + "step": 13744 + }, + { + "epoch": 3.6260387811634347, + "grad_norm": 0.3792394697666168, + "learning_rate": 1.4168645036489934e-05, + "loss": 0.0017, + "step": 13746 + }, + { + "epoch": 3.6265664160401, + "grad_norm": 0.00143834063783288, + "learning_rate": 1.4167472669187256e-05, + "loss": 0.0001, + "step": 13748 + }, + { + "epoch": 3.6270940509167655, + "grad_norm": 0.010883752256631851, + "learning_rate": 1.416630030188458e-05, + "loss": 0.0002, + "step": 13750 + }, + { + "epoch": 3.627621685793431, + "grad_norm": 0.024765748530626297, + "learning_rate": 1.4165127934581907e-05, + "loss": 0.0015, + "step": 13752 + }, + { + "epoch": 3.6281493206700963, + "grad_norm": 0.0022054947912693024, + "learning_rate": 1.4163955567279231e-05, + "loss": 0.0003, + "step": 13754 + }, + { + "epoch": 3.6286769555467617, + "grad_norm": 0.004032871685922146, + "learning_rate": 1.4162783199976553e-05, + "loss": 0.0007, + "step": 13756 + }, + { + "epoch": 3.629204590423427, + "grad_norm": 0.013433301821351051, + "learning_rate": 1.4161610832673878e-05, + "loss": 0.0002, + "step": 13758 + }, + { + "epoch": 3.629732225300092, + "grad_norm": 0.09449783712625504, + "learning_rate": 1.4160438465371202e-05, + "loss": 0.0011, + "step": 13760 + }, + { + "epoch": 3.630259860176758, + "grad_norm": 0.005851709749549627, + "learning_rate": 1.4159266098068526e-05, + "loss": 0.0003, + "step": 13762 + }, + { + "epoch": 3.630787495053423, + "grad_norm": 0.004724827595055103, + "learning_rate": 1.4158093730765849e-05, + "loss": 0.0001, + "step": 13764 + }, + { + "epoch": 3.6313151299300883, + "grad_norm": 0.13648100197315216, + "learning_rate": 1.4156921363463173e-05, + "loss": 0.0015, + "step": 13766 + }, + { + "epoch": 3.6318427648067537, + "grad_norm": 0.08404086530208588, + "learning_rate": 1.41557489961605e-05, + "loss": 0.0005, + "step": 13768 + }, + { + "epoch": 3.632370399683419, + "grad_norm": 0.028905104845762253, + "learning_rate": 1.4154576628857822e-05, + "loss": 0.0003, + "step": 13770 + }, + { + "epoch": 3.6328980345600845, + "grad_norm": 0.006415505427867174, + "learning_rate": 1.4153404261555146e-05, + "loss": 0.0001, + "step": 13772 + }, + { + "epoch": 3.6334256694367495, + "grad_norm": 0.005787878762930632, + "learning_rate": 1.415223189425247e-05, + "loss": 0.0002, + "step": 13774 + }, + { + "epoch": 3.6339533043134153, + "grad_norm": 0.0031470763497054577, + "learning_rate": 1.4151059526949795e-05, + "loss": 0.0002, + "step": 13776 + }, + { + "epoch": 3.6344809391900803, + "grad_norm": 0.002646668581292033, + "learning_rate": 1.4149887159647117e-05, + "loss": 0.0001, + "step": 13778 + }, + { + "epoch": 3.6350085740667457, + "grad_norm": 0.0034304936416447163, + "learning_rate": 1.4148714792344442e-05, + "loss": 0.0001, + "step": 13780 + }, + { + "epoch": 3.635536208943411, + "grad_norm": 0.002961629070341587, + "learning_rate": 1.4147542425041768e-05, + "loss": 0.0001, + "step": 13782 + }, + { + "epoch": 3.6360638438200765, + "grad_norm": 0.018391866236925125, + "learning_rate": 1.4146370057739092e-05, + "loss": 0.0002, + "step": 13784 + }, + { + "epoch": 3.636591478696742, + "grad_norm": 0.002856276696547866, + "learning_rate": 1.4145197690436415e-05, + "loss": 0.0001, + "step": 13786 + }, + { + "epoch": 3.6371191135734073, + "grad_norm": 0.0219675675034523, + "learning_rate": 1.4144025323133739e-05, + "loss": 0.0001, + "step": 13788 + }, + { + "epoch": 3.6376467484500727, + "grad_norm": 0.37723588943481445, + "learning_rate": 1.4142852955831063e-05, + "loss": 0.0012, + "step": 13790 + }, + { + "epoch": 3.6381743833267377, + "grad_norm": 0.5295794010162354, + "learning_rate": 1.4141680588528388e-05, + "loss": 0.004, + "step": 13792 + }, + { + "epoch": 3.638702018203403, + "grad_norm": 0.02267521806061268, + "learning_rate": 1.414050822122571e-05, + "loss": 0.0029, + "step": 13794 + }, + { + "epoch": 3.6392296530800685, + "grad_norm": 0.002397047821432352, + "learning_rate": 1.4139335853923034e-05, + "loss": 0.0001, + "step": 13796 + }, + { + "epoch": 3.639757287956734, + "grad_norm": 0.1735205054283142, + "learning_rate": 1.413816348662036e-05, + "loss": 0.0022, + "step": 13798 + }, + { + "epoch": 3.6402849228333993, + "grad_norm": 0.00504866661503911, + "learning_rate": 1.4136991119317683e-05, + "loss": 0.0001, + "step": 13800 + }, + { + "epoch": 3.6408125577100647, + "grad_norm": 0.0030311548616737127, + "learning_rate": 1.4135818752015007e-05, + "loss": 0.0001, + "step": 13802 + }, + { + "epoch": 3.64134019258673, + "grad_norm": 0.0081120440736413, + "learning_rate": 1.4134646384712332e-05, + "loss": 0.0, + "step": 13804 + }, + { + "epoch": 3.641867827463395, + "grad_norm": 0.02334555797278881, + "learning_rate": 1.4133474017409656e-05, + "loss": 0.0001, + "step": 13806 + }, + { + "epoch": 3.642395462340061, + "grad_norm": 0.0036491972859948874, + "learning_rate": 1.4132301650106979e-05, + "loss": 0.004, + "step": 13808 + }, + { + "epoch": 3.642923097216726, + "grad_norm": 0.06673484295606613, + "learning_rate": 1.4131129282804303e-05, + "loss": 0.0013, + "step": 13810 + }, + { + "epoch": 3.6434507320933913, + "grad_norm": 0.006967675406485796, + "learning_rate": 1.4129956915501629e-05, + "loss": 0.0001, + "step": 13812 + }, + { + "epoch": 3.6439783669700567, + "grad_norm": 0.004350700881332159, + "learning_rate": 1.4128784548198953e-05, + "loss": 0.0001, + "step": 13814 + }, + { + "epoch": 3.644506001846722, + "grad_norm": 0.0043449182994663715, + "learning_rate": 1.4127612180896276e-05, + "loss": 0.0007, + "step": 13816 + }, + { + "epoch": 3.6450336367233875, + "grad_norm": 0.0061339144594967365, + "learning_rate": 1.41264398135936e-05, + "loss": 0.0001, + "step": 13818 + }, + { + "epoch": 3.6455612716000525, + "grad_norm": 0.11970734596252441, + "learning_rate": 1.4125267446290924e-05, + "loss": 0.0004, + "step": 13820 + }, + { + "epoch": 3.6460889064767183, + "grad_norm": 0.32739880681037903, + "learning_rate": 1.4124095078988249e-05, + "loss": 0.0044, + "step": 13822 + }, + { + "epoch": 3.6466165413533833, + "grad_norm": 0.004153251647949219, + "learning_rate": 1.4122922711685571e-05, + "loss": 0.0002, + "step": 13824 + }, + { + "epoch": 3.6471441762300487, + "grad_norm": 0.0011028964072465897, + "learning_rate": 1.4121750344382896e-05, + "loss": 0.0001, + "step": 13826 + }, + { + "epoch": 3.647671811106714, + "grad_norm": 0.0010912081925198436, + "learning_rate": 1.4120577977080222e-05, + "loss": 0.0001, + "step": 13828 + }, + { + "epoch": 3.6481994459833795, + "grad_norm": 0.19258898496627808, + "learning_rate": 1.4119405609777543e-05, + "loss": 0.005, + "step": 13830 + }, + { + "epoch": 3.648727080860045, + "grad_norm": 0.0013623092090710998, + "learning_rate": 1.4118233242474869e-05, + "loss": 0.0001, + "step": 13832 + }, + { + "epoch": 3.6492547157367103, + "grad_norm": 0.001515248091891408, + "learning_rate": 1.4117060875172193e-05, + "loss": 0.0008, + "step": 13834 + }, + { + "epoch": 3.6497823506133757, + "grad_norm": 0.0022917520254850388, + "learning_rate": 1.4115888507869517e-05, + "loss": 0.0016, + "step": 13836 + }, + { + "epoch": 3.6503099854900407, + "grad_norm": 0.014699428342282772, + "learning_rate": 1.411471614056684e-05, + "loss": 0.0001, + "step": 13838 + }, + { + "epoch": 3.650837620366706, + "grad_norm": 0.035116687417030334, + "learning_rate": 1.4113543773264164e-05, + "loss": 0.0008, + "step": 13840 + }, + { + "epoch": 3.6513652552433715, + "grad_norm": 0.007497469894587994, + "learning_rate": 1.4112371405961488e-05, + "loss": 0.0002, + "step": 13842 + }, + { + "epoch": 3.651892890120037, + "grad_norm": 0.18829631805419922, + "learning_rate": 1.4111199038658814e-05, + "loss": 0.0006, + "step": 13844 + }, + { + "epoch": 3.6524205249967023, + "grad_norm": 0.06156857684254646, + "learning_rate": 1.4110026671356137e-05, + "loss": 0.0003, + "step": 13846 + }, + { + "epoch": 3.6529481598733677, + "grad_norm": 0.01057380624115467, + "learning_rate": 1.4108854304053461e-05, + "loss": 0.0001, + "step": 13848 + }, + { + "epoch": 3.653475794750033, + "grad_norm": 0.004653078503906727, + "learning_rate": 1.4107681936750786e-05, + "loss": 0.0001, + "step": 13850 + }, + { + "epoch": 3.654003429626698, + "grad_norm": 0.0035690541844815016, + "learning_rate": 1.410650956944811e-05, + "loss": 0.0006, + "step": 13852 + }, + { + "epoch": 3.6545310645033635, + "grad_norm": 0.0008233352564275265, + "learning_rate": 1.4105337202145433e-05, + "loss": 0.0001, + "step": 13854 + }, + { + "epoch": 3.655058699380029, + "grad_norm": 0.036134589463472366, + "learning_rate": 1.4104164834842757e-05, + "loss": 0.0001, + "step": 13856 + }, + { + "epoch": 3.6555863342566943, + "grad_norm": 0.02701704017817974, + "learning_rate": 1.4102992467540083e-05, + "loss": 0.0002, + "step": 13858 + }, + { + "epoch": 3.6561139691333597, + "grad_norm": 0.0640455037355423, + "learning_rate": 1.4101820100237404e-05, + "loss": 0.0008, + "step": 13860 + }, + { + "epoch": 3.656641604010025, + "grad_norm": 0.0013334552058950067, + "learning_rate": 1.410064773293473e-05, + "loss": 0.0002, + "step": 13862 + }, + { + "epoch": 3.6571692388866905, + "grad_norm": 0.003034617519006133, + "learning_rate": 1.4099475365632054e-05, + "loss": 0.0001, + "step": 13864 + }, + { + "epoch": 3.6576968737633555, + "grad_norm": 0.0010099539067596197, + "learning_rate": 1.4098302998329378e-05, + "loss": 0.0001, + "step": 13866 + }, + { + "epoch": 3.6582245086400214, + "grad_norm": 0.007576422765851021, + "learning_rate": 1.4097130631026701e-05, + "loss": 0.0001, + "step": 13868 + }, + { + "epoch": 3.6587521435166863, + "grad_norm": 0.009512494318187237, + "learning_rate": 1.4095958263724025e-05, + "loss": 0.005, + "step": 13870 + }, + { + "epoch": 3.6592797783933517, + "grad_norm": 0.27978515625, + "learning_rate": 1.409478589642135e-05, + "loss": 0.0068, + "step": 13872 + }, + { + "epoch": 3.659807413270017, + "grad_norm": 0.0014386852271854877, + "learning_rate": 1.4093613529118676e-05, + "loss": 0.0061, + "step": 13874 + }, + { + "epoch": 3.6603350481466825, + "grad_norm": 0.018548712134361267, + "learning_rate": 1.4092441161815996e-05, + "loss": 0.0002, + "step": 13876 + }, + { + "epoch": 3.660862683023348, + "grad_norm": 0.003604686353355646, + "learning_rate": 1.4091268794513322e-05, + "loss": 0.0001, + "step": 13878 + }, + { + "epoch": 3.6613903179000133, + "grad_norm": 0.02170572057366371, + "learning_rate": 1.4090096427210647e-05, + "loss": 0.0002, + "step": 13880 + }, + { + "epoch": 3.6619179527766788, + "grad_norm": 0.05046401172876358, + "learning_rate": 1.4088924059907971e-05, + "loss": 0.0005, + "step": 13882 + }, + { + "epoch": 3.6624455876533437, + "grad_norm": 0.04031582176685333, + "learning_rate": 1.4087751692605294e-05, + "loss": 0.0004, + "step": 13884 + }, + { + "epoch": 3.662973222530009, + "grad_norm": 0.09003745764493942, + "learning_rate": 1.4086579325302618e-05, + "loss": 0.0045, + "step": 13886 + }, + { + "epoch": 3.6635008574066745, + "grad_norm": 0.05134955793619156, + "learning_rate": 1.4085406957999942e-05, + "loss": 0.0012, + "step": 13888 + }, + { + "epoch": 3.66402849228334, + "grad_norm": 0.2717305123806, + "learning_rate": 1.4084234590697265e-05, + "loss": 0.0011, + "step": 13890 + }, + { + "epoch": 3.6645561271600053, + "grad_norm": 0.05086902529001236, + "learning_rate": 1.4083062223394591e-05, + "loss": 0.0003, + "step": 13892 + }, + { + "epoch": 3.6650837620366707, + "grad_norm": 0.010640780441462994, + "learning_rate": 1.4081889856091915e-05, + "loss": 0.0002, + "step": 13894 + }, + { + "epoch": 3.665611396913336, + "grad_norm": 0.011064468882977962, + "learning_rate": 1.408071748878924e-05, + "loss": 0.0009, + "step": 13896 + }, + { + "epoch": 3.666139031790001, + "grad_norm": 0.04145946353673935, + "learning_rate": 1.4079545121486562e-05, + "loss": 0.0027, + "step": 13898 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.32541096210479736, + "learning_rate": 1.4078372754183886e-05, + "loss": 0.0054, + "step": 13900 + }, + { + "epoch": 3.667194301543332, + "grad_norm": 0.11257703602313995, + "learning_rate": 1.407720038688121e-05, + "loss": 0.002, + "step": 13902 + }, + { + "epoch": 3.6677219364199973, + "grad_norm": 0.013694314286112785, + "learning_rate": 1.4076028019578537e-05, + "loss": 0.0001, + "step": 13904 + }, + { + "epoch": 3.6682495712966627, + "grad_norm": 0.013745635747909546, + "learning_rate": 1.4074855652275858e-05, + "loss": 0.0003, + "step": 13906 + }, + { + "epoch": 3.668777206173328, + "grad_norm": 0.015585111454129219, + "learning_rate": 1.4073683284973184e-05, + "loss": 0.0002, + "step": 13908 + }, + { + "epoch": 3.6693048410499935, + "grad_norm": 0.07881342619657516, + "learning_rate": 1.4072510917670508e-05, + "loss": 0.0005, + "step": 13910 + }, + { + "epoch": 3.6698324759266585, + "grad_norm": 0.007845714688301086, + "learning_rate": 1.4071338550367832e-05, + "loss": 0.002, + "step": 13912 + }, + { + "epoch": 3.6703601108033244, + "grad_norm": 0.028746647760272026, + "learning_rate": 1.4070166183065155e-05, + "loss": 0.0002, + "step": 13914 + }, + { + "epoch": 3.6708877456799893, + "grad_norm": 0.0023138062097132206, + "learning_rate": 1.406899381576248e-05, + "loss": 0.0003, + "step": 13916 + }, + { + "epoch": 3.6714153805566547, + "grad_norm": 0.10824259370565414, + "learning_rate": 1.4067821448459803e-05, + "loss": 0.0013, + "step": 13918 + }, + { + "epoch": 3.67194301543332, + "grad_norm": 0.00152233662083745, + "learning_rate": 1.4066649081157126e-05, + "loss": 0.0001, + "step": 13920 + }, + { + "epoch": 3.6724706503099855, + "grad_norm": 0.02607865259051323, + "learning_rate": 1.4065476713854452e-05, + "loss": 0.0002, + "step": 13922 + }, + { + "epoch": 3.672998285186651, + "grad_norm": 0.18996022641658783, + "learning_rate": 1.4064304346551776e-05, + "loss": 0.001, + "step": 13924 + }, + { + "epoch": 3.673525920063316, + "grad_norm": 0.5789695978164673, + "learning_rate": 1.40631319792491e-05, + "loss": 0.0056, + "step": 13926 + }, + { + "epoch": 3.6740535549399818, + "grad_norm": 0.0025197905488312244, + "learning_rate": 1.4061959611946423e-05, + "loss": 0.0002, + "step": 13928 + }, + { + "epoch": 3.6745811898166467, + "grad_norm": 0.00178276386577636, + "learning_rate": 1.4060787244643748e-05, + "loss": 0.0001, + "step": 13930 + }, + { + "epoch": 3.675108824693312, + "grad_norm": 0.00755200395360589, + "learning_rate": 1.4059614877341072e-05, + "loss": 0.0001, + "step": 13932 + }, + { + "epoch": 3.6756364595699775, + "grad_norm": 0.01000160351395607, + "learning_rate": 1.4058442510038396e-05, + "loss": 0.0001, + "step": 13934 + }, + { + "epoch": 3.676164094446643, + "grad_norm": 0.02098625712096691, + "learning_rate": 1.4057270142735719e-05, + "loss": 0.0012, + "step": 13936 + }, + { + "epoch": 3.6766917293233083, + "grad_norm": 0.010152781382203102, + "learning_rate": 1.4056097775433045e-05, + "loss": 0.0004, + "step": 13938 + }, + { + "epoch": 3.6772193641999738, + "grad_norm": 0.003417849773541093, + "learning_rate": 1.4054925408130369e-05, + "loss": 0.0001, + "step": 13940 + }, + { + "epoch": 3.677746999076639, + "grad_norm": 0.042149223387241364, + "learning_rate": 1.4053753040827693e-05, + "loss": 0.0009, + "step": 13942 + }, + { + "epoch": 3.678274633953304, + "grad_norm": 0.017616625875234604, + "learning_rate": 1.4052580673525016e-05, + "loss": 0.0002, + "step": 13944 + }, + { + "epoch": 3.6788022688299695, + "grad_norm": 0.006542342249304056, + "learning_rate": 1.405140830622234e-05, + "loss": 0.0012, + "step": 13946 + }, + { + "epoch": 3.679329903706635, + "grad_norm": 0.0015869534108787775, + "learning_rate": 1.4050235938919665e-05, + "loss": 0.0001, + "step": 13948 + }, + { + "epoch": 3.6798575385833003, + "grad_norm": 0.0016107434639707208, + "learning_rate": 1.404906357161699e-05, + "loss": 0.0001, + "step": 13950 + }, + { + "epoch": 3.6803851734599657, + "grad_norm": 0.002640083432197571, + "learning_rate": 1.4047891204314312e-05, + "loss": 0.0001, + "step": 13952 + }, + { + "epoch": 3.680912808336631, + "grad_norm": 0.09958534687757492, + "learning_rate": 1.4046718837011638e-05, + "loss": 0.0007, + "step": 13954 + }, + { + "epoch": 3.6814404432132966, + "grad_norm": 0.3673099875450134, + "learning_rate": 1.4045546469708962e-05, + "loss": 0.0006, + "step": 13956 + }, + { + "epoch": 3.6819680780899615, + "grad_norm": 0.0008897238876670599, + "learning_rate": 1.4044374102406284e-05, + "loss": 0.0001, + "step": 13958 + }, + { + "epoch": 3.6824957129666274, + "grad_norm": 0.0007637497037649155, + "learning_rate": 1.4043201735103609e-05, + "loss": 0.0001, + "step": 13960 + }, + { + "epoch": 3.6830233478432923, + "grad_norm": 0.0011942581040784717, + "learning_rate": 1.4042029367800933e-05, + "loss": 0.0001, + "step": 13962 + }, + { + "epoch": 3.6835509827199577, + "grad_norm": 0.0009379456168971956, + "learning_rate": 1.4040857000498257e-05, + "loss": 0.0003, + "step": 13964 + }, + { + "epoch": 3.684078617596623, + "grad_norm": 0.009028236381709576, + "learning_rate": 1.403968463319558e-05, + "loss": 0.0001, + "step": 13966 + }, + { + "epoch": 3.6846062524732885, + "grad_norm": 0.005668005906045437, + "learning_rate": 1.4038512265892906e-05, + "loss": 0.0003, + "step": 13968 + }, + { + "epoch": 3.685133887349954, + "grad_norm": 0.22349634766578674, + "learning_rate": 1.403733989859023e-05, + "loss": 0.0008, + "step": 13970 + }, + { + "epoch": 3.685661522226619, + "grad_norm": 0.12780922651290894, + "learning_rate": 1.4036167531287555e-05, + "loss": 0.0007, + "step": 13972 + }, + { + "epoch": 3.6861891571032848, + "grad_norm": 0.002578982152044773, + "learning_rate": 1.4034995163984877e-05, + "loss": 0.0043, + "step": 13974 + }, + { + "epoch": 3.6867167919799497, + "grad_norm": 0.10857314616441727, + "learning_rate": 1.4033822796682202e-05, + "loss": 0.0003, + "step": 13976 + }, + { + "epoch": 3.687244426856615, + "grad_norm": 0.007991681806743145, + "learning_rate": 1.4032650429379526e-05, + "loss": 0.0032, + "step": 13978 + }, + { + "epoch": 3.6877720617332805, + "grad_norm": 0.006617802660912275, + "learning_rate": 1.403147806207685e-05, + "loss": 0.0001, + "step": 13980 + }, + { + "epoch": 3.688299696609946, + "grad_norm": 0.053022295236587524, + "learning_rate": 1.4030305694774173e-05, + "loss": 0.0002, + "step": 13982 + }, + { + "epoch": 3.6888273314866113, + "grad_norm": 0.015283863991498947, + "learning_rate": 1.4029133327471499e-05, + "loss": 0.0008, + "step": 13984 + }, + { + "epoch": 3.6893549663632768, + "grad_norm": 0.2940078377723694, + "learning_rate": 1.4027960960168823e-05, + "loss": 0.0018, + "step": 13986 + }, + { + "epoch": 3.689882601239942, + "grad_norm": 0.20188693702220917, + "learning_rate": 1.4026788592866146e-05, + "loss": 0.0067, + "step": 13988 + }, + { + "epoch": 3.690410236116607, + "grad_norm": 0.2887450158596039, + "learning_rate": 1.402561622556347e-05, + "loss": 0.001, + "step": 13990 + }, + { + "epoch": 3.6909378709932725, + "grad_norm": 0.0008537604589946568, + "learning_rate": 1.4024443858260794e-05, + "loss": 0.0008, + "step": 13992 + }, + { + "epoch": 3.691465505869938, + "grad_norm": 0.04034432768821716, + "learning_rate": 1.4023271490958119e-05, + "loss": 0.0001, + "step": 13994 + }, + { + "epoch": 3.6919931407466033, + "grad_norm": 0.0028434675186872482, + "learning_rate": 1.4022099123655441e-05, + "loss": 0.0001, + "step": 13996 + }, + { + "epoch": 3.6925207756232687, + "grad_norm": 0.0009975123684853315, + "learning_rate": 1.4020926756352765e-05, + "loss": 0.0068, + "step": 13998 + }, + { + "epoch": 3.693048410499934, + "grad_norm": 0.0013286283938214183, + "learning_rate": 1.4019754389050091e-05, + "loss": 0.0008, + "step": 14000 + }, + { + "epoch": 3.693048410499934, + "eval_loss": 0.001845505554229021, + "eval_runtime": 304.9476, + "eval_samples_per_second": 707.141, + "eval_steps_per_second": 88.396, + "step": 14000 + }, + { + "epoch": 3.6935760453765996, + "grad_norm": 0.0028150046709924936, + "learning_rate": 1.4018582021747416e-05, + "loss": 0.0001, + "step": 14002 + }, + { + "epoch": 3.6941036802532645, + "grad_norm": 0.014950588345527649, + "learning_rate": 1.4017409654444738e-05, + "loss": 0.0001, + "step": 14004 + }, + { + "epoch": 3.69463131512993, + "grad_norm": 0.21773792803287506, + "learning_rate": 1.4016237287142063e-05, + "loss": 0.0043, + "step": 14006 + }, + { + "epoch": 3.6951589500065953, + "grad_norm": 0.005476482678204775, + "learning_rate": 1.4015064919839387e-05, + "loss": 0.0001, + "step": 14008 + }, + { + "epoch": 3.6956865848832607, + "grad_norm": 0.003828542772680521, + "learning_rate": 1.4013892552536711e-05, + "loss": 0.0001, + "step": 14010 + }, + { + "epoch": 3.696214219759926, + "grad_norm": 0.005846434738487005, + "learning_rate": 1.4012720185234034e-05, + "loss": 0.0001, + "step": 14012 + }, + { + "epoch": 3.6967418546365916, + "grad_norm": 0.1585274189710617, + "learning_rate": 1.401154781793136e-05, + "loss": 0.0018, + "step": 14014 + }, + { + "epoch": 3.697269489513257, + "grad_norm": 0.0049894871190190315, + "learning_rate": 1.4010375450628684e-05, + "loss": 0.0001, + "step": 14016 + }, + { + "epoch": 3.697797124389922, + "grad_norm": 0.006693379022181034, + "learning_rate": 1.4009203083326007e-05, + "loss": 0.0005, + "step": 14018 + }, + { + "epoch": 3.6983247592665878, + "grad_norm": 0.2098798304796219, + "learning_rate": 1.4008030716023331e-05, + "loss": 0.001, + "step": 14020 + }, + { + "epoch": 3.6988523941432527, + "grad_norm": 0.10144098103046417, + "learning_rate": 1.4006858348720655e-05, + "loss": 0.0013, + "step": 14022 + }, + { + "epoch": 3.699380029019918, + "grad_norm": 0.02444670908153057, + "learning_rate": 1.400568598141798e-05, + "loss": 0.0001, + "step": 14024 + }, + { + "epoch": 3.6999076638965835, + "grad_norm": 0.18576550483703613, + "learning_rate": 1.4004513614115302e-05, + "loss": 0.0036, + "step": 14026 + }, + { + "epoch": 3.700435298773249, + "grad_norm": 0.010743264108896255, + "learning_rate": 1.4003341246812627e-05, + "loss": 0.0001, + "step": 14028 + }, + { + "epoch": 3.7009629336499144, + "grad_norm": 0.34003373980522156, + "learning_rate": 1.4002168879509953e-05, + "loss": 0.0009, + "step": 14030 + }, + { + "epoch": 3.7014905685265798, + "grad_norm": 0.031538575887680054, + "learning_rate": 1.4000996512207277e-05, + "loss": 0.0031, + "step": 14032 + }, + { + "epoch": 3.702018203403245, + "grad_norm": 0.0012906535994261503, + "learning_rate": 1.39998241449046e-05, + "loss": 0.0001, + "step": 14034 + }, + { + "epoch": 3.70254583827991, + "grad_norm": 0.5654388666152954, + "learning_rate": 1.3998651777601924e-05, + "loss": 0.0012, + "step": 14036 + }, + { + "epoch": 3.7030734731565755, + "grad_norm": 0.0011966429883614182, + "learning_rate": 1.3997479410299248e-05, + "loss": 0.0002, + "step": 14038 + }, + { + "epoch": 3.703601108033241, + "grad_norm": 0.0008918678504414856, + "learning_rate": 1.3996307042996572e-05, + "loss": 0.0001, + "step": 14040 + }, + { + "epoch": 3.7041287429099063, + "grad_norm": 0.001838468131609261, + "learning_rate": 1.3995134675693895e-05, + "loss": 0.0, + "step": 14042 + }, + { + "epoch": 3.7046563777865718, + "grad_norm": 0.001980544300749898, + "learning_rate": 1.399396230839122e-05, + "loss": 0.0, + "step": 14044 + }, + { + "epoch": 3.705184012663237, + "grad_norm": 0.11307946592569351, + "learning_rate": 1.3992789941088545e-05, + "loss": 0.0006, + "step": 14046 + }, + { + "epoch": 3.7057116475399026, + "grad_norm": 0.0029789162799715996, + "learning_rate": 1.3991617573785868e-05, + "loss": 0.0001, + "step": 14048 + }, + { + "epoch": 3.7062392824165675, + "grad_norm": 0.6105914115905762, + "learning_rate": 1.3990445206483192e-05, + "loss": 0.0023, + "step": 14050 + }, + { + "epoch": 3.706766917293233, + "grad_norm": 0.0077101727947592735, + "learning_rate": 1.3989272839180517e-05, + "loss": 0.0001, + "step": 14052 + }, + { + "epoch": 3.7072945521698983, + "grad_norm": 0.6214226484298706, + "learning_rate": 1.3988100471877841e-05, + "loss": 0.012, + "step": 14054 + }, + { + "epoch": 3.7078221870465637, + "grad_norm": 0.02082904241979122, + "learning_rate": 1.3986928104575163e-05, + "loss": 0.0002, + "step": 14056 + }, + { + "epoch": 3.708349821923229, + "grad_norm": 0.0015204312512651086, + "learning_rate": 1.3985755737272488e-05, + "loss": 0.0001, + "step": 14058 + }, + { + "epoch": 3.7088774567998946, + "grad_norm": 0.002872937358915806, + "learning_rate": 1.3984583369969814e-05, + "loss": 0.0, + "step": 14060 + }, + { + "epoch": 3.70940509167656, + "grad_norm": 0.006177556701004505, + "learning_rate": 1.3983411002667138e-05, + "loss": 0.0016, + "step": 14062 + }, + { + "epoch": 3.709932726553225, + "grad_norm": 0.0011766963871195912, + "learning_rate": 1.398223863536446e-05, + "loss": 0.0003, + "step": 14064 + }, + { + "epoch": 3.7104603614298908, + "grad_norm": 0.001509245135821402, + "learning_rate": 1.3981066268061785e-05, + "loss": 0.0033, + "step": 14066 + }, + { + "epoch": 3.7109879963065557, + "grad_norm": 0.7631971836090088, + "learning_rate": 1.397989390075911e-05, + "loss": 0.0022, + "step": 14068 + }, + { + "epoch": 3.711515631183221, + "grad_norm": 0.10245063900947571, + "learning_rate": 1.3978721533456434e-05, + "loss": 0.0004, + "step": 14070 + }, + { + "epoch": 3.7120432660598865, + "grad_norm": 0.0013841738691553473, + "learning_rate": 1.3977549166153756e-05, + "loss": 0.0001, + "step": 14072 + }, + { + "epoch": 3.712570900936552, + "grad_norm": 0.0011771214194595814, + "learning_rate": 1.397637679885108e-05, + "loss": 0.0001, + "step": 14074 + }, + { + "epoch": 3.7130985358132174, + "grad_norm": 0.043655868619680405, + "learning_rate": 1.3975204431548407e-05, + "loss": 0.0003, + "step": 14076 + }, + { + "epoch": 3.7136261706898828, + "grad_norm": 0.023115048184990883, + "learning_rate": 1.3974032064245729e-05, + "loss": 0.0031, + "step": 14078 + }, + { + "epoch": 3.714153805566548, + "grad_norm": 0.2030753493309021, + "learning_rate": 1.3972859696943053e-05, + "loss": 0.006, + "step": 14080 + }, + { + "epoch": 3.714681440443213, + "grad_norm": 0.022028565406799316, + "learning_rate": 1.3971687329640378e-05, + "loss": 0.0004, + "step": 14082 + }, + { + "epoch": 3.7152090753198785, + "grad_norm": 0.05795411393046379, + "learning_rate": 1.3970514962337702e-05, + "loss": 0.0007, + "step": 14084 + }, + { + "epoch": 3.715736710196544, + "grad_norm": 0.03858013451099396, + "learning_rate": 1.3969342595035025e-05, + "loss": 0.0003, + "step": 14086 + }, + { + "epoch": 3.7162643450732094, + "grad_norm": 0.21064554154872894, + "learning_rate": 1.3968170227732349e-05, + "loss": 0.0055, + "step": 14088 + }, + { + "epoch": 3.7167919799498748, + "grad_norm": 0.0026948503218591213, + "learning_rate": 1.3966997860429673e-05, + "loss": 0.0001, + "step": 14090 + }, + { + "epoch": 3.71731961482654, + "grad_norm": 0.0073941354639828205, + "learning_rate": 1.3965825493127e-05, + "loss": 0.0019, + "step": 14092 + }, + { + "epoch": 3.7178472497032056, + "grad_norm": 0.027295323088765144, + "learning_rate": 1.3964653125824322e-05, + "loss": 0.002, + "step": 14094 + }, + { + "epoch": 3.7183748845798705, + "grad_norm": 0.03045400232076645, + "learning_rate": 1.3963480758521646e-05, + "loss": 0.0015, + "step": 14096 + }, + { + "epoch": 3.718902519456536, + "grad_norm": 0.006279811263084412, + "learning_rate": 1.396230839121897e-05, + "loss": 0.0001, + "step": 14098 + }, + { + "epoch": 3.7194301543332013, + "grad_norm": 0.002510065445676446, + "learning_rate": 1.3961136023916295e-05, + "loss": 0.0005, + "step": 14100 + }, + { + "epoch": 3.7199577892098667, + "grad_norm": 0.0035955675411969423, + "learning_rate": 1.3959963656613617e-05, + "loss": 0.0001, + "step": 14102 + }, + { + "epoch": 3.720485424086532, + "grad_norm": 0.002318868413567543, + "learning_rate": 1.3958791289310942e-05, + "loss": 0.0001, + "step": 14104 + }, + { + "epoch": 3.7210130589631976, + "grad_norm": 0.12906184792518616, + "learning_rate": 1.3957618922008268e-05, + "loss": 0.0039, + "step": 14106 + }, + { + "epoch": 3.721540693839863, + "grad_norm": 0.0043855165131390095, + "learning_rate": 1.3956446554705589e-05, + "loss": 0.0001, + "step": 14108 + }, + { + "epoch": 3.722068328716528, + "grad_norm": 0.0020545630250126123, + "learning_rate": 1.3955274187402915e-05, + "loss": 0.0001, + "step": 14110 + }, + { + "epoch": 3.722595963593194, + "grad_norm": 0.17266526818275452, + "learning_rate": 1.3954101820100239e-05, + "loss": 0.0008, + "step": 14112 + }, + { + "epoch": 3.7231235984698587, + "grad_norm": 0.025535300374031067, + "learning_rate": 1.3952929452797563e-05, + "loss": 0.0004, + "step": 14114 + }, + { + "epoch": 3.723651233346524, + "grad_norm": 0.003654324682429433, + "learning_rate": 1.3951757085494886e-05, + "loss": 0.001, + "step": 14116 + }, + { + "epoch": 3.7241788682231896, + "grad_norm": 0.0016648194286972284, + "learning_rate": 1.395058471819221e-05, + "loss": 0.0001, + "step": 14118 + }, + { + "epoch": 3.724706503099855, + "grad_norm": 0.024226464331150055, + "learning_rate": 1.3949412350889534e-05, + "loss": 0.0002, + "step": 14120 + }, + { + "epoch": 3.7252341379765204, + "grad_norm": 0.001037978450767696, + "learning_rate": 1.394823998358686e-05, + "loss": 0.0001, + "step": 14122 + }, + { + "epoch": 3.7257617728531853, + "grad_norm": 0.38217854499816895, + "learning_rate": 1.3947067616284183e-05, + "loss": 0.006, + "step": 14124 + }, + { + "epoch": 3.726289407729851, + "grad_norm": 0.005137371364980936, + "learning_rate": 1.3945895248981507e-05, + "loss": 0.0001, + "step": 14126 + }, + { + "epoch": 3.726817042606516, + "grad_norm": 0.027384096756577492, + "learning_rate": 1.3944722881678832e-05, + "loss": 0.0002, + "step": 14128 + }, + { + "epoch": 3.7273446774831815, + "grad_norm": 0.00370484939776361, + "learning_rate": 1.3943550514376156e-05, + "loss": 0.0001, + "step": 14130 + }, + { + "epoch": 3.727872312359847, + "grad_norm": 0.010618140920996666, + "learning_rate": 1.3942378147073479e-05, + "loss": 0.0002, + "step": 14132 + }, + { + "epoch": 3.7283999472365124, + "grad_norm": 0.09845705330371857, + "learning_rate": 1.3941205779770803e-05, + "loss": 0.0006, + "step": 14134 + }, + { + "epoch": 3.7289275821131778, + "grad_norm": 0.011253150179982185, + "learning_rate": 1.3940033412468129e-05, + "loss": 0.0001, + "step": 14136 + }, + { + "epoch": 3.729455216989843, + "grad_norm": 0.004153664689511061, + "learning_rate": 1.393886104516545e-05, + "loss": 0.0002, + "step": 14138 + }, + { + "epoch": 3.7299828518665086, + "grad_norm": 0.03594054654240608, + "learning_rate": 1.3937688677862776e-05, + "loss": 0.0029, + "step": 14140 + }, + { + "epoch": 3.7305104867431735, + "grad_norm": 0.027152713388204575, + "learning_rate": 1.39365163105601e-05, + "loss": 0.0017, + "step": 14142 + }, + { + "epoch": 3.731038121619839, + "grad_norm": 0.0019186835270375013, + "learning_rate": 1.3935343943257424e-05, + "loss": 0.0002, + "step": 14144 + }, + { + "epoch": 3.7315657564965043, + "grad_norm": 0.007169646210968494, + "learning_rate": 1.3934171575954747e-05, + "loss": 0.0015, + "step": 14146 + }, + { + "epoch": 3.7320933913731698, + "grad_norm": 0.0042944373562932014, + "learning_rate": 1.3932999208652071e-05, + "loss": 0.0001, + "step": 14148 + }, + { + "epoch": 3.732621026249835, + "grad_norm": 0.0012568800011649728, + "learning_rate": 1.3931826841349396e-05, + "loss": 0.0001, + "step": 14150 + }, + { + "epoch": 3.7331486611265006, + "grad_norm": 0.03491229936480522, + "learning_rate": 1.3930654474046722e-05, + "loss": 0.0002, + "step": 14152 + }, + { + "epoch": 3.733676296003166, + "grad_norm": 0.005973242688924074, + "learning_rate": 1.3929482106744043e-05, + "loss": 0.0001, + "step": 14154 + }, + { + "epoch": 3.734203930879831, + "grad_norm": 0.01136226486414671, + "learning_rate": 1.3928309739441369e-05, + "loss": 0.0001, + "step": 14156 + }, + { + "epoch": 3.7347315657564963, + "grad_norm": 0.35139980912208557, + "learning_rate": 1.3927137372138693e-05, + "loss": 0.0058, + "step": 14158 + }, + { + "epoch": 3.7352592006331617, + "grad_norm": 0.20161810517311096, + "learning_rate": 1.3925965004836017e-05, + "loss": 0.0061, + "step": 14160 + }, + { + "epoch": 3.735786835509827, + "grad_norm": 0.003766945330426097, + "learning_rate": 1.392479263753334e-05, + "loss": 0.0001, + "step": 14162 + }, + { + "epoch": 3.7363144703864926, + "grad_norm": 0.004283728543668985, + "learning_rate": 1.3923620270230664e-05, + "loss": 0.0001, + "step": 14164 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 0.4410621225833893, + "learning_rate": 1.3922447902927988e-05, + "loss": 0.0045, + "step": 14166 + }, + { + "epoch": 3.7373697401398234, + "grad_norm": 0.02408297173678875, + "learning_rate": 1.3921275535625311e-05, + "loss": 0.0003, + "step": 14168 + }, + { + "epoch": 3.7378973750164883, + "grad_norm": 0.006640077102929354, + "learning_rate": 1.3920103168322637e-05, + "loss": 0.0078, + "step": 14170 + }, + { + "epoch": 3.738425009893154, + "grad_norm": 0.39527642726898193, + "learning_rate": 1.3918930801019961e-05, + "loss": 0.0012, + "step": 14172 + }, + { + "epoch": 3.738952644769819, + "grad_norm": 0.16964207589626312, + "learning_rate": 1.3917758433717286e-05, + "loss": 0.0009, + "step": 14174 + }, + { + "epoch": 3.7394802796464846, + "grad_norm": 0.0011794700985774398, + "learning_rate": 1.3916586066414608e-05, + "loss": 0.0006, + "step": 14176 + }, + { + "epoch": 3.74000791452315, + "grad_norm": 0.06876937299966812, + "learning_rate": 1.3915413699111932e-05, + "loss": 0.0004, + "step": 14178 + }, + { + "epoch": 3.7405355493998154, + "grad_norm": 0.3787221908569336, + "learning_rate": 1.3914241331809257e-05, + "loss": 0.004, + "step": 14180 + }, + { + "epoch": 3.7410631842764808, + "grad_norm": 0.006636624690145254, + "learning_rate": 1.3913068964506583e-05, + "loss": 0.0039, + "step": 14182 + }, + { + "epoch": 3.741590819153146, + "grad_norm": 0.01972862333059311, + "learning_rate": 1.3911896597203904e-05, + "loss": 0.0001, + "step": 14184 + }, + { + "epoch": 3.7421184540298116, + "grad_norm": 0.0022930665872991085, + "learning_rate": 1.391072422990123e-05, + "loss": 0.0001, + "step": 14186 + }, + { + "epoch": 3.7426460889064765, + "grad_norm": 0.019529998302459717, + "learning_rate": 1.3909551862598554e-05, + "loss": 0.0007, + "step": 14188 + }, + { + "epoch": 3.743173723783142, + "grad_norm": 0.2372877597808838, + "learning_rate": 1.3908379495295878e-05, + "loss": 0.0022, + "step": 14190 + }, + { + "epoch": 3.7437013586598074, + "grad_norm": 0.023598141968250275, + "learning_rate": 1.3907207127993201e-05, + "loss": 0.0005, + "step": 14192 + }, + { + "epoch": 3.7442289935364728, + "grad_norm": 0.0018776459619402885, + "learning_rate": 1.3906034760690525e-05, + "loss": 0.0006, + "step": 14194 + }, + { + "epoch": 3.744756628413138, + "grad_norm": 0.013261916115880013, + "learning_rate": 1.390486239338785e-05, + "loss": 0.0001, + "step": 14196 + }, + { + "epoch": 3.7452842632898036, + "grad_norm": 0.009747841395437717, + "learning_rate": 1.3903690026085172e-05, + "loss": 0.0001, + "step": 14198 + }, + { + "epoch": 3.745811898166469, + "grad_norm": 0.0007802019244991243, + "learning_rate": 1.3902517658782496e-05, + "loss": 0.0001, + "step": 14200 + }, + { + "epoch": 3.746339533043134, + "grad_norm": 0.16681823134422302, + "learning_rate": 1.3901345291479822e-05, + "loss": 0.0019, + "step": 14202 + }, + { + "epoch": 3.7468671679197993, + "grad_norm": 0.03772736340761185, + "learning_rate": 1.3900172924177147e-05, + "loss": 0.0032, + "step": 14204 + }, + { + "epoch": 3.7473948027964648, + "grad_norm": 0.03549661859869957, + "learning_rate": 1.389900055687447e-05, + "loss": 0.0002, + "step": 14206 + }, + { + "epoch": 3.74792243767313, + "grad_norm": 0.002422419609501958, + "learning_rate": 1.3897828189571794e-05, + "loss": 0.0001, + "step": 14208 + }, + { + "epoch": 3.7484500725497956, + "grad_norm": 0.004447773098945618, + "learning_rate": 1.3896655822269118e-05, + "loss": 0.0006, + "step": 14210 + }, + { + "epoch": 3.748977707426461, + "grad_norm": 0.03541899845004082, + "learning_rate": 1.3895483454966442e-05, + "loss": 0.0003, + "step": 14212 + }, + { + "epoch": 3.7495053423031264, + "grad_norm": 0.003472092328593135, + "learning_rate": 1.3894311087663765e-05, + "loss": 0.0001, + "step": 14214 + }, + { + "epoch": 3.7500329771797913, + "grad_norm": 0.005149820353835821, + "learning_rate": 1.3893138720361091e-05, + "loss": 0.0004, + "step": 14216 + }, + { + "epoch": 3.750560612056457, + "grad_norm": 0.014302995055913925, + "learning_rate": 1.3891966353058415e-05, + "loss": 0.0001, + "step": 14218 + }, + { + "epoch": 3.751088246933122, + "grad_norm": 0.002443007891997695, + "learning_rate": 1.389079398575574e-05, + "loss": 0.0001, + "step": 14220 + }, + { + "epoch": 3.7516158818097876, + "grad_norm": 0.010081557556986809, + "learning_rate": 1.3889621618453062e-05, + "loss": 0.0001, + "step": 14222 + }, + { + "epoch": 3.752143516686453, + "grad_norm": 0.0008749486296437681, + "learning_rate": 1.3888449251150386e-05, + "loss": 0.0001, + "step": 14224 + }, + { + "epoch": 3.7526711515631184, + "grad_norm": 0.032801058143377304, + "learning_rate": 1.388727688384771e-05, + "loss": 0.0002, + "step": 14226 + }, + { + "epoch": 3.7531987864397838, + "grad_norm": 0.10997653007507324, + "learning_rate": 1.3886104516545033e-05, + "loss": 0.0002, + "step": 14228 + }, + { + "epoch": 3.753726421316449, + "grad_norm": 0.0011817143531516194, + "learning_rate": 1.3884932149242358e-05, + "loss": 0.0001, + "step": 14230 + }, + { + "epoch": 3.7542540561931146, + "grad_norm": 0.0007789197843521833, + "learning_rate": 1.3883759781939684e-05, + "loss": 0.0, + "step": 14232 + }, + { + "epoch": 3.7547816910697795, + "grad_norm": 0.011626013554632664, + "learning_rate": 1.3882587414637008e-05, + "loss": 0.0001, + "step": 14234 + }, + { + "epoch": 3.755309325946445, + "grad_norm": 0.0012134802527725697, + "learning_rate": 1.388141504733433e-05, + "loss": 0.0, + "step": 14236 + }, + { + "epoch": 3.7558369608231104, + "grad_norm": 0.043431662023067474, + "learning_rate": 1.3880242680031655e-05, + "loss": 0.003, + "step": 14238 + }, + { + "epoch": 3.7563645956997758, + "grad_norm": 0.007815525867044926, + "learning_rate": 1.3879070312728979e-05, + "loss": 0.0003, + "step": 14240 + }, + { + "epoch": 3.756892230576441, + "grad_norm": 0.0030723297968506813, + "learning_rate": 1.3877897945426303e-05, + "loss": 0.0001, + "step": 14242 + }, + { + "epoch": 3.7574198654531066, + "grad_norm": 0.0013697237009182572, + "learning_rate": 1.3876725578123626e-05, + "loss": 0.0012, + "step": 14244 + }, + { + "epoch": 3.757947500329772, + "grad_norm": 0.0042142788879573345, + "learning_rate": 1.3875553210820952e-05, + "loss": 0.0001, + "step": 14246 + }, + { + "epoch": 3.758475135206437, + "grad_norm": 0.02796071581542492, + "learning_rate": 1.3874380843518276e-05, + "loss": 0.0013, + "step": 14248 + }, + { + "epoch": 3.7590027700831024, + "grad_norm": 0.0045418632216751575, + "learning_rate": 1.38732084762156e-05, + "loss": 0.0001, + "step": 14250 + }, + { + "epoch": 3.7595304049597678, + "grad_norm": 0.00033236114541068673, + "learning_rate": 1.3872036108912923e-05, + "loss": 0.0, + "step": 14252 + }, + { + "epoch": 3.760058039836433, + "grad_norm": 0.0009394205990247428, + "learning_rate": 1.3870863741610248e-05, + "loss": 0.0, + "step": 14254 + }, + { + "epoch": 3.7605856747130986, + "grad_norm": 0.0033845044672489166, + "learning_rate": 1.3869691374307572e-05, + "loss": 0.0043, + "step": 14256 + }, + { + "epoch": 3.761113309589764, + "grad_norm": 0.5621153116226196, + "learning_rate": 1.3868519007004894e-05, + "loss": 0.0004, + "step": 14258 + }, + { + "epoch": 3.7616409444664294, + "grad_norm": 0.960089385509491, + "learning_rate": 1.3867346639702219e-05, + "loss": 0.0111, + "step": 14260 + }, + { + "epoch": 3.7621685793430943, + "grad_norm": 0.028056735172867775, + "learning_rate": 1.3866174272399545e-05, + "loss": 0.0003, + "step": 14262 + }, + { + "epoch": 3.76269621421976, + "grad_norm": 0.02650287374854088, + "learning_rate": 1.3865001905096869e-05, + "loss": 0.0003, + "step": 14264 + }, + { + "epoch": 3.763223849096425, + "grad_norm": 0.03659752383828163, + "learning_rate": 1.3863829537794192e-05, + "loss": 0.0002, + "step": 14266 + }, + { + "epoch": 3.7637514839730906, + "grad_norm": 0.08674832433462143, + "learning_rate": 1.3862657170491516e-05, + "loss": 0.0007, + "step": 14268 + }, + { + "epoch": 3.764279118849756, + "grad_norm": 0.06463878601789474, + "learning_rate": 1.386148480318884e-05, + "loss": 0.0005, + "step": 14270 + }, + { + "epoch": 3.7648067537264214, + "grad_norm": 0.04420682042837143, + "learning_rate": 1.3860312435886165e-05, + "loss": 0.0012, + "step": 14272 + }, + { + "epoch": 3.765334388603087, + "grad_norm": 0.33160367608070374, + "learning_rate": 1.3859140068583487e-05, + "loss": 0.0046, + "step": 14274 + }, + { + "epoch": 3.7658620234797517, + "grad_norm": 0.008876286447048187, + "learning_rate": 1.3857967701280812e-05, + "loss": 0.0004, + "step": 14276 + }, + { + "epoch": 3.7663896583564176, + "grad_norm": 0.012743016704916954, + "learning_rate": 1.3856795333978138e-05, + "loss": 0.0002, + "step": 14278 + }, + { + "epoch": 3.7669172932330826, + "grad_norm": 0.33691149950027466, + "learning_rate": 1.3855622966675462e-05, + "loss": 0.0093, + "step": 14280 + }, + { + "epoch": 3.767444928109748, + "grad_norm": 0.2590779662132263, + "learning_rate": 1.3854450599372784e-05, + "loss": 0.0078, + "step": 14282 + }, + { + "epoch": 3.7679725629864134, + "grad_norm": 0.015889707952737808, + "learning_rate": 1.3853278232070109e-05, + "loss": 0.0002, + "step": 14284 + }, + { + "epoch": 3.7685001978630788, + "grad_norm": 0.007317593786865473, + "learning_rate": 1.3852105864767433e-05, + "loss": 0.001, + "step": 14286 + }, + { + "epoch": 3.769027832739744, + "grad_norm": 0.01726665534079075, + "learning_rate": 1.3850933497464756e-05, + "loss": 0.0003, + "step": 14288 + }, + { + "epoch": 3.7695554676164096, + "grad_norm": 0.030377088114619255, + "learning_rate": 1.384976113016208e-05, + "loss": 0.0034, + "step": 14290 + }, + { + "epoch": 3.770083102493075, + "grad_norm": 0.03348901867866516, + "learning_rate": 1.3848588762859406e-05, + "loss": 0.0002, + "step": 14292 + }, + { + "epoch": 3.77061073736974, + "grad_norm": 0.011084666475653648, + "learning_rate": 1.384741639555673e-05, + "loss": 0.0001, + "step": 14294 + }, + { + "epoch": 3.7711383722464054, + "grad_norm": 0.00126194441691041, + "learning_rate": 1.3846244028254053e-05, + "loss": 0.0001, + "step": 14296 + }, + { + "epoch": 3.7716660071230708, + "grad_norm": 0.01557447575032711, + "learning_rate": 1.3845071660951377e-05, + "loss": 0.0001, + "step": 14298 + }, + { + "epoch": 3.772193641999736, + "grad_norm": 0.009340454824268818, + "learning_rate": 1.3843899293648701e-05, + "loss": 0.0001, + "step": 14300 + }, + { + "epoch": 3.7727212768764016, + "grad_norm": 0.005849413573741913, + "learning_rate": 1.3842726926346026e-05, + "loss": 0.0002, + "step": 14302 + }, + { + "epoch": 3.773248911753067, + "grad_norm": 0.0017391141736879945, + "learning_rate": 1.3841554559043348e-05, + "loss": 0.0001, + "step": 14304 + }, + { + "epoch": 3.7737765466297324, + "grad_norm": 0.0045512886717915535, + "learning_rate": 1.3840382191740673e-05, + "loss": 0.0001, + "step": 14306 + }, + { + "epoch": 3.7743041815063973, + "grad_norm": 0.0034732643980532885, + "learning_rate": 1.3839209824437999e-05, + "loss": 0.0131, + "step": 14308 + }, + { + "epoch": 3.7748318163830628, + "grad_norm": 0.0025681033730506897, + "learning_rate": 1.3838037457135323e-05, + "loss": 0.0001, + "step": 14310 + }, + { + "epoch": 3.775359451259728, + "grad_norm": 0.008556224405765533, + "learning_rate": 1.3836865089832646e-05, + "loss": 0.0002, + "step": 14312 + }, + { + "epoch": 3.7758870861363936, + "grad_norm": 0.08488000929355621, + "learning_rate": 1.383569272252997e-05, + "loss": 0.0003, + "step": 14314 + }, + { + "epoch": 3.776414721013059, + "grad_norm": 0.004361926112323999, + "learning_rate": 1.3834520355227294e-05, + "loss": 0.0001, + "step": 14316 + }, + { + "epoch": 3.7769423558897244, + "grad_norm": 0.003949291072785854, + "learning_rate": 1.3833347987924617e-05, + "loss": 0.0001, + "step": 14318 + }, + { + "epoch": 3.77746999076639, + "grad_norm": 0.01158733107149601, + "learning_rate": 1.3832175620621941e-05, + "loss": 0.0013, + "step": 14320 + }, + { + "epoch": 3.7779976256430547, + "grad_norm": 0.007217090576887131, + "learning_rate": 1.3831003253319265e-05, + "loss": 0.0002, + "step": 14322 + }, + { + "epoch": 3.7785252605197206, + "grad_norm": 0.006419263314455748, + "learning_rate": 1.3829830886016591e-05, + "loss": 0.0001, + "step": 14324 + }, + { + "epoch": 3.7790528953963856, + "grad_norm": 0.0016788585344329476, + "learning_rate": 1.3828658518713914e-05, + "loss": 0.0004, + "step": 14326 + }, + { + "epoch": 3.779580530273051, + "grad_norm": 0.006729661021381617, + "learning_rate": 1.3827486151411238e-05, + "loss": 0.0001, + "step": 14328 + }, + { + "epoch": 3.7801081651497164, + "grad_norm": 0.013921287842094898, + "learning_rate": 1.3826313784108563e-05, + "loss": 0.0046, + "step": 14330 + }, + { + "epoch": 3.7806358000263818, + "grad_norm": 0.0025145940016955137, + "learning_rate": 1.3825141416805887e-05, + "loss": 0.003, + "step": 14332 + }, + { + "epoch": 3.781163434903047, + "grad_norm": 0.03405098244547844, + "learning_rate": 1.382396904950321e-05, + "loss": 0.0002, + "step": 14334 + }, + { + "epoch": 3.7816910697797126, + "grad_norm": 0.007062812801450491, + "learning_rate": 1.3822796682200534e-05, + "loss": 0.0001, + "step": 14336 + }, + { + "epoch": 3.782218704656378, + "grad_norm": 0.007364463526755571, + "learning_rate": 1.382162431489786e-05, + "loss": 0.0005, + "step": 14338 + }, + { + "epoch": 3.782746339533043, + "grad_norm": 0.00928264856338501, + "learning_rate": 1.3820451947595184e-05, + "loss": 0.0001, + "step": 14340 + }, + { + "epoch": 3.7832739744097084, + "grad_norm": 0.008622189983725548, + "learning_rate": 1.3819279580292507e-05, + "loss": 0.0001, + "step": 14342 + }, + { + "epoch": 3.7838016092863738, + "grad_norm": 0.36814817786216736, + "learning_rate": 1.3818107212989831e-05, + "loss": 0.0023, + "step": 14344 + }, + { + "epoch": 3.784329244163039, + "grad_norm": 0.02009514719247818, + "learning_rate": 1.3816934845687155e-05, + "loss": 0.0002, + "step": 14346 + }, + { + "epoch": 3.7848568790397046, + "grad_norm": 0.015683254227042198, + "learning_rate": 1.3815762478384478e-05, + "loss": 0.0062, + "step": 14348 + }, + { + "epoch": 3.78538451391637, + "grad_norm": 0.004624547436833382, + "learning_rate": 1.3814590111081802e-05, + "loss": 0.0001, + "step": 14350 + }, + { + "epoch": 3.7859121487930354, + "grad_norm": 0.18266111612319946, + "learning_rate": 1.3813417743779127e-05, + "loss": 0.0025, + "step": 14352 + }, + { + "epoch": 3.7864397836697004, + "grad_norm": 0.002216882072389126, + "learning_rate": 1.3812245376476453e-05, + "loss": 0.0001, + "step": 14354 + }, + { + "epoch": 3.7869674185463658, + "grad_norm": 0.00406081136316061, + "learning_rate": 1.3811073009173775e-05, + "loss": 0.0001, + "step": 14356 + }, + { + "epoch": 3.787495053423031, + "grad_norm": 0.005777600221335888, + "learning_rate": 1.38099006418711e-05, + "loss": 0.0001, + "step": 14358 + }, + { + "epoch": 3.7880226882996966, + "grad_norm": 0.05936310812830925, + "learning_rate": 1.3808728274568424e-05, + "loss": 0.0003, + "step": 14360 + }, + { + "epoch": 3.788550323176362, + "grad_norm": 0.003314559580758214, + "learning_rate": 1.3807555907265748e-05, + "loss": 0.0001, + "step": 14362 + }, + { + "epoch": 3.7890779580530274, + "grad_norm": 0.012976057827472687, + "learning_rate": 1.380638353996307e-05, + "loss": 0.0001, + "step": 14364 + }, + { + "epoch": 3.789605592929693, + "grad_norm": 0.03157113119959831, + "learning_rate": 1.3805211172660395e-05, + "loss": 0.0002, + "step": 14366 + }, + { + "epoch": 3.7901332278063578, + "grad_norm": 0.037867672741413116, + "learning_rate": 1.380403880535772e-05, + "loss": 0.0051, + "step": 14368 + }, + { + "epoch": 3.7906608626830236, + "grad_norm": 0.03219502046704292, + "learning_rate": 1.3802866438055045e-05, + "loss": 0.0002, + "step": 14370 + }, + { + "epoch": 3.7911884975596886, + "grad_norm": 0.0017691005486994982, + "learning_rate": 1.3801694070752368e-05, + "loss": 0.0001, + "step": 14372 + }, + { + "epoch": 3.791716132436354, + "grad_norm": 0.0979076698422432, + "learning_rate": 1.3800521703449692e-05, + "loss": 0.0011, + "step": 14374 + }, + { + "epoch": 3.7922437673130194, + "grad_norm": 0.0022621515672653913, + "learning_rate": 1.3799349336147017e-05, + "loss": 0.0001, + "step": 14376 + }, + { + "epoch": 3.792771402189685, + "grad_norm": 0.003681938163936138, + "learning_rate": 1.3798176968844339e-05, + "loss": 0.0001, + "step": 14378 + }, + { + "epoch": 3.79329903706635, + "grad_norm": 0.0031618562061339617, + "learning_rate": 1.3797004601541663e-05, + "loss": 0.0001, + "step": 14380 + }, + { + "epoch": 3.7938266719430156, + "grad_norm": 0.006957904435694218, + "learning_rate": 1.3795832234238988e-05, + "loss": 0.0002, + "step": 14382 + }, + { + "epoch": 3.794354306819681, + "grad_norm": 0.005527916364371777, + "learning_rate": 1.3794659866936314e-05, + "loss": 0.0005, + "step": 14384 + }, + { + "epoch": 3.794881941696346, + "grad_norm": 0.009730218909680843, + "learning_rate": 1.3793487499633635e-05, + "loss": 0.0003, + "step": 14386 + }, + { + "epoch": 3.7954095765730114, + "grad_norm": 0.0017075673677027225, + "learning_rate": 1.379231513233096e-05, + "loss": 0.0001, + "step": 14388 + }, + { + "epoch": 3.7959372114496768, + "grad_norm": 0.053774647414684296, + "learning_rate": 1.3791142765028285e-05, + "loss": 0.0006, + "step": 14390 + }, + { + "epoch": 3.796464846326342, + "grad_norm": 0.003844248363748193, + "learning_rate": 1.378997039772561e-05, + "loss": 0.0001, + "step": 14392 + }, + { + "epoch": 3.7969924812030076, + "grad_norm": 0.005973429884761572, + "learning_rate": 1.3788798030422932e-05, + "loss": 0.0001, + "step": 14394 + }, + { + "epoch": 3.797520116079673, + "grad_norm": 0.0032552534248679876, + "learning_rate": 1.3787625663120256e-05, + "loss": 0.0002, + "step": 14396 + }, + { + "epoch": 3.7980477509563384, + "grad_norm": 0.005206180736422539, + "learning_rate": 1.378645329581758e-05, + "loss": 0.0001, + "step": 14398 + }, + { + "epoch": 3.7985753858330034, + "grad_norm": 0.02106003649532795, + "learning_rate": 1.3785280928514906e-05, + "loss": 0.0002, + "step": 14400 + }, + { + "epoch": 3.7991030207096688, + "grad_norm": 0.7805802822113037, + "learning_rate": 1.3784108561212229e-05, + "loss": 0.0009, + "step": 14402 + }, + { + "epoch": 3.799630655586334, + "grad_norm": 0.0007250352646224201, + "learning_rate": 1.3782936193909553e-05, + "loss": 0.0, + "step": 14404 + }, + { + "epoch": 3.8001582904629996, + "grad_norm": 0.003311963751912117, + "learning_rate": 1.3781763826606878e-05, + "loss": 0.0015, + "step": 14406 + }, + { + "epoch": 3.800685925339665, + "grad_norm": 0.4164274334907532, + "learning_rate": 1.37805914593042e-05, + "loss": 0.004, + "step": 14408 + }, + { + "epoch": 3.8012135602163304, + "grad_norm": 0.006405637599527836, + "learning_rate": 1.3779419092001525e-05, + "loss": 0.0002, + "step": 14410 + }, + { + "epoch": 3.801741195092996, + "grad_norm": 0.005195148289203644, + "learning_rate": 1.3778246724698849e-05, + "loss": 0.0001, + "step": 14412 + }, + { + "epoch": 3.8022688299696608, + "grad_norm": 0.02608470432460308, + "learning_rate": 1.3777074357396173e-05, + "loss": 0.0015, + "step": 14414 + }, + { + "epoch": 3.8027964648463266, + "grad_norm": 0.0005006646388210356, + "learning_rate": 1.3775901990093496e-05, + "loss": 0.0001, + "step": 14416 + }, + { + "epoch": 3.8033240997229916, + "grad_norm": 0.3431406319141388, + "learning_rate": 1.3774729622790822e-05, + "loss": 0.0017, + "step": 14418 + }, + { + "epoch": 3.803851734599657, + "grad_norm": 0.004085719119757414, + "learning_rate": 1.3773557255488146e-05, + "loss": 0.0001, + "step": 14420 + }, + { + "epoch": 3.8043793694763224, + "grad_norm": 0.1857244074344635, + "learning_rate": 1.377238488818547e-05, + "loss": 0.0052, + "step": 14422 + }, + { + "epoch": 3.804907004352988, + "grad_norm": 0.0009484086185693741, + "learning_rate": 1.3771212520882793e-05, + "loss": 0.0002, + "step": 14424 + }, + { + "epoch": 3.805434639229653, + "grad_norm": 0.10977201163768768, + "learning_rate": 1.3770040153580117e-05, + "loss": 0.0009, + "step": 14426 + }, + { + "epoch": 3.805962274106318, + "grad_norm": 0.2059626430273056, + "learning_rate": 1.3768867786277442e-05, + "loss": 0.0013, + "step": 14428 + }, + { + "epoch": 3.806489908982984, + "grad_norm": 0.07119429856538773, + "learning_rate": 1.3767695418974768e-05, + "loss": 0.0003, + "step": 14430 + }, + { + "epoch": 3.807017543859649, + "grad_norm": 0.2405288964509964, + "learning_rate": 1.3766523051672089e-05, + "loss": 0.0111, + "step": 14432 + }, + { + "epoch": 3.8075451787363144, + "grad_norm": 0.012929144315421581, + "learning_rate": 1.3765350684369415e-05, + "loss": 0.0031, + "step": 14434 + }, + { + "epoch": 3.80807281361298, + "grad_norm": 0.09251060336828232, + "learning_rate": 1.3764178317066739e-05, + "loss": 0.0044, + "step": 14436 + }, + { + "epoch": 3.808600448489645, + "grad_norm": 0.08757459372282028, + "learning_rate": 1.3763005949764061e-05, + "loss": 0.0008, + "step": 14438 + }, + { + "epoch": 3.8091280833663106, + "grad_norm": 0.24828989803791046, + "learning_rate": 1.3761833582461386e-05, + "loss": 0.0003, + "step": 14440 + }, + { + "epoch": 3.809655718242976, + "grad_norm": 0.11139433830976486, + "learning_rate": 1.376066121515871e-05, + "loss": 0.0007, + "step": 14442 + }, + { + "epoch": 3.8101833531196414, + "grad_norm": 0.002838022541254759, + "learning_rate": 1.3759488847856034e-05, + "loss": 0.0001, + "step": 14444 + }, + { + "epoch": 3.8107109879963064, + "grad_norm": 0.06943986564874649, + "learning_rate": 1.3758316480553357e-05, + "loss": 0.0002, + "step": 14446 + }, + { + "epoch": 3.8112386228729718, + "grad_norm": 0.032127752900123596, + "learning_rate": 1.3757144113250683e-05, + "loss": 0.0002, + "step": 14448 + }, + { + "epoch": 3.811766257749637, + "grad_norm": 0.0048413933254778385, + "learning_rate": 1.3755971745948007e-05, + "loss": 0.0001, + "step": 14450 + }, + { + "epoch": 3.8122938926263026, + "grad_norm": 0.0348435677587986, + "learning_rate": 1.3754799378645332e-05, + "loss": 0.0009, + "step": 14452 + }, + { + "epoch": 3.812821527502968, + "grad_norm": 0.0029791928827762604, + "learning_rate": 1.3753627011342654e-05, + "loss": 0.0004, + "step": 14454 + }, + { + "epoch": 3.8133491623796334, + "grad_norm": 0.02453823946416378, + "learning_rate": 1.3752454644039979e-05, + "loss": 0.0007, + "step": 14456 + }, + { + "epoch": 3.813876797256299, + "grad_norm": 0.0015861624851822853, + "learning_rate": 1.3751282276737303e-05, + "loss": 0.0001, + "step": 14458 + }, + { + "epoch": 3.8144044321329638, + "grad_norm": 0.0010864367941394448, + "learning_rate": 1.3750109909434629e-05, + "loss": 0.0001, + "step": 14460 + }, + { + "epoch": 3.8149320670096296, + "grad_norm": 0.0010637968080118299, + "learning_rate": 1.374893754213195e-05, + "loss": 0.0053, + "step": 14462 + }, + { + "epoch": 3.8154597018862946, + "grad_norm": 0.04416875168681145, + "learning_rate": 1.3747765174829276e-05, + "loss": 0.0005, + "step": 14464 + }, + { + "epoch": 3.81598733676296, + "grad_norm": 0.003718830645084381, + "learning_rate": 1.37465928075266e-05, + "loss": 0.0001, + "step": 14466 + }, + { + "epoch": 3.8165149716396254, + "grad_norm": 0.0012728129513561726, + "learning_rate": 1.3745420440223923e-05, + "loss": 0.0006, + "step": 14468 + }, + { + "epoch": 3.817042606516291, + "grad_norm": 0.0016254454385489225, + "learning_rate": 1.3744248072921247e-05, + "loss": 0.0001, + "step": 14470 + }, + { + "epoch": 3.817570241392956, + "grad_norm": 0.0031498235184699297, + "learning_rate": 1.3743075705618571e-05, + "loss": 0.0001, + "step": 14472 + }, + { + "epoch": 3.818097876269621, + "grad_norm": 0.0008763721561990678, + "learning_rate": 1.3741903338315896e-05, + "loss": 0.0, + "step": 14474 + }, + { + "epoch": 3.818625511146287, + "grad_norm": 0.03789638355374336, + "learning_rate": 1.3740730971013218e-05, + "loss": 0.0002, + "step": 14476 + }, + { + "epoch": 3.819153146022952, + "grad_norm": 0.007539605721831322, + "learning_rate": 1.3739558603710542e-05, + "loss": 0.0001, + "step": 14478 + }, + { + "epoch": 3.8196807808996174, + "grad_norm": 1.7487980127334595, + "learning_rate": 1.3738386236407868e-05, + "loss": 0.0005, + "step": 14480 + }, + { + "epoch": 3.820208415776283, + "grad_norm": 0.04539061337709427, + "learning_rate": 1.3737213869105193e-05, + "loss": 0.0037, + "step": 14482 + }, + { + "epoch": 3.820736050652948, + "grad_norm": 0.005208558402955532, + "learning_rate": 1.3736041501802515e-05, + "loss": 0.0001, + "step": 14484 + }, + { + "epoch": 3.8212636855296136, + "grad_norm": 0.0026461840607225895, + "learning_rate": 1.373486913449984e-05, + "loss": 0.0, + "step": 14486 + }, + { + "epoch": 3.821791320406279, + "grad_norm": 0.035850148648023605, + "learning_rate": 1.3733696767197164e-05, + "loss": 0.0003, + "step": 14488 + }, + { + "epoch": 3.8223189552829444, + "grad_norm": 0.04542425274848938, + "learning_rate": 1.3732524399894488e-05, + "loss": 0.0005, + "step": 14490 + }, + { + "epoch": 3.8228465901596094, + "grad_norm": 0.012237070128321648, + "learning_rate": 1.3731352032591811e-05, + "loss": 0.0001, + "step": 14492 + }, + { + "epoch": 3.8233742250362748, + "grad_norm": 0.113248310983181, + "learning_rate": 1.3730179665289137e-05, + "loss": 0.0024, + "step": 14494 + }, + { + "epoch": 3.82390185991294, + "grad_norm": 0.0022337548434734344, + "learning_rate": 1.3729007297986461e-05, + "loss": 0.0, + "step": 14496 + }, + { + "epoch": 3.8244294947896056, + "grad_norm": 0.013390691950917244, + "learning_rate": 1.3727834930683784e-05, + "loss": 0.0004, + "step": 14498 + }, + { + "epoch": 3.824957129666271, + "grad_norm": 0.001759099424816668, + "learning_rate": 1.3726662563381108e-05, + "loss": 0.0001, + "step": 14500 + }, + { + "epoch": 3.8254847645429364, + "grad_norm": 0.00687001459300518, + "learning_rate": 1.3725490196078432e-05, + "loss": 0.0001, + "step": 14502 + }, + { + "epoch": 3.826012399419602, + "grad_norm": 0.02239808440208435, + "learning_rate": 1.3724317828775757e-05, + "loss": 0.0003, + "step": 14504 + }, + { + "epoch": 3.8265400342962668, + "grad_norm": 6.918285846710205, + "learning_rate": 1.372314546147308e-05, + "loss": 0.0005, + "step": 14506 + }, + { + "epoch": 3.827067669172932, + "grad_norm": 0.23737861216068268, + "learning_rate": 1.3721973094170404e-05, + "loss": 0.0067, + "step": 14508 + }, + { + "epoch": 3.8275953040495976, + "grad_norm": 0.02790769934654236, + "learning_rate": 1.372080072686773e-05, + "loss": 0.0096, + "step": 14510 + }, + { + "epoch": 3.828122938926263, + "grad_norm": 0.020099403336644173, + "learning_rate": 1.3719628359565054e-05, + "loss": 0.0001, + "step": 14512 + }, + { + "epoch": 3.8286505738029284, + "grad_norm": 0.7676112651824951, + "learning_rate": 1.3718455992262377e-05, + "loss": 0.0005, + "step": 14514 + }, + { + "epoch": 3.829178208679594, + "grad_norm": 0.08059567213058472, + "learning_rate": 1.3717283624959701e-05, + "loss": 0.0004, + "step": 14516 + }, + { + "epoch": 3.829705843556259, + "grad_norm": 0.16028167307376862, + "learning_rate": 1.3716111257657025e-05, + "loss": 0.0019, + "step": 14518 + }, + { + "epoch": 3.830233478432924, + "grad_norm": 0.2121679186820984, + "learning_rate": 1.371493889035435e-05, + "loss": 0.001, + "step": 14520 + }, + { + "epoch": 3.83076111330959, + "grad_norm": 0.06911444664001465, + "learning_rate": 1.3713766523051672e-05, + "loss": 0.0003, + "step": 14522 + }, + { + "epoch": 3.831288748186255, + "grad_norm": 0.008951637893915176, + "learning_rate": 1.3712594155748996e-05, + "loss": 0.0001, + "step": 14524 + }, + { + "epoch": 3.8318163830629204, + "grad_norm": 0.09599300473928452, + "learning_rate": 1.3711421788446322e-05, + "loss": 0.0004, + "step": 14526 + }, + { + "epoch": 3.832344017939586, + "grad_norm": 0.017999550327658653, + "learning_rate": 1.3710249421143645e-05, + "loss": 0.0006, + "step": 14528 + }, + { + "epoch": 3.832871652816251, + "grad_norm": 0.0060646445490419865, + "learning_rate": 1.370907705384097e-05, + "loss": 0.0002, + "step": 14530 + }, + { + "epoch": 3.8333992876929166, + "grad_norm": 0.05510145053267479, + "learning_rate": 1.3707904686538294e-05, + "loss": 0.0001, + "step": 14532 + }, + { + "epoch": 3.833926922569582, + "grad_norm": 0.004138099029660225, + "learning_rate": 1.3706732319235618e-05, + "loss": 0.0001, + "step": 14534 + }, + { + "epoch": 3.8344545574462474, + "grad_norm": 0.009946796111762524, + "learning_rate": 1.370555995193294e-05, + "loss": 0.0001, + "step": 14536 + }, + { + "epoch": 3.8349821923229124, + "grad_norm": 0.030021708458662033, + "learning_rate": 1.3704387584630265e-05, + "loss": 0.0016, + "step": 14538 + }, + { + "epoch": 3.835509827199578, + "grad_norm": 0.6220201253890991, + "learning_rate": 1.370321521732759e-05, + "loss": 0.0001, + "step": 14540 + }, + { + "epoch": 3.836037462076243, + "grad_norm": 0.0030101591255515814, + "learning_rate": 1.3702042850024915e-05, + "loss": 0.0001, + "step": 14542 + }, + { + "epoch": 3.8365650969529086, + "grad_norm": 0.0014702747575938702, + "learning_rate": 1.3700870482722238e-05, + "loss": 0.0001, + "step": 14544 + }, + { + "epoch": 3.837092731829574, + "grad_norm": 0.007259028498083353, + "learning_rate": 1.3699698115419562e-05, + "loss": 0.0, + "step": 14546 + }, + { + "epoch": 3.8376203667062394, + "grad_norm": 0.4390477240085602, + "learning_rate": 1.3698525748116886e-05, + "loss": 0.0058, + "step": 14548 + }, + { + "epoch": 3.838148001582905, + "grad_norm": 0.004533727653324604, + "learning_rate": 1.369735338081421e-05, + "loss": 0.0002, + "step": 14550 + }, + { + "epoch": 3.8386756364595698, + "grad_norm": 0.1513035148382187, + "learning_rate": 1.3696181013511533e-05, + "loss": 0.0001, + "step": 14552 + }, + { + "epoch": 3.839203271336235, + "grad_norm": 0.15303729474544525, + "learning_rate": 1.3695008646208858e-05, + "loss": 0.0009, + "step": 14554 + }, + { + "epoch": 3.8397309062129006, + "grad_norm": 0.0015585552901029587, + "learning_rate": 1.3693836278906184e-05, + "loss": 0.0, + "step": 14556 + }, + { + "epoch": 3.840258541089566, + "grad_norm": 0.0051107509061694145, + "learning_rate": 1.3692663911603508e-05, + "loss": 0.0, + "step": 14558 + }, + { + "epoch": 3.8407861759662314, + "grad_norm": 0.11623746156692505, + "learning_rate": 1.369149154430083e-05, + "loss": 0.0056, + "step": 14560 + }, + { + "epoch": 3.841313810842897, + "grad_norm": 0.00725823687389493, + "learning_rate": 1.3690319176998155e-05, + "loss": 0.0001, + "step": 14562 + }, + { + "epoch": 3.841841445719562, + "grad_norm": 0.022417431697249413, + "learning_rate": 1.3689146809695479e-05, + "loss": 0.0002, + "step": 14564 + }, + { + "epoch": 3.842369080596227, + "grad_norm": 0.12038617581129074, + "learning_rate": 1.3687974442392802e-05, + "loss": 0.0035, + "step": 14566 + }, + { + "epoch": 3.842896715472893, + "grad_norm": 0.0031853842083364725, + "learning_rate": 1.3686802075090126e-05, + "loss": 0.0001, + "step": 14568 + }, + { + "epoch": 3.843424350349558, + "grad_norm": 0.00257253460586071, + "learning_rate": 1.3685629707787452e-05, + "loss": 0.0071, + "step": 14570 + }, + { + "epoch": 3.8439519852262234, + "grad_norm": 0.0016841379692777991, + "learning_rate": 1.3684457340484776e-05, + "loss": 0.0001, + "step": 14572 + }, + { + "epoch": 3.844479620102889, + "grad_norm": 0.16024647653102875, + "learning_rate": 1.3683284973182099e-05, + "loss": 0.0036, + "step": 14574 + }, + { + "epoch": 3.845007254979554, + "grad_norm": 0.002694039372727275, + "learning_rate": 1.3682112605879423e-05, + "loss": 0.0007, + "step": 14576 + }, + { + "epoch": 3.8455348898562196, + "grad_norm": 0.1133408397436142, + "learning_rate": 1.3680940238576748e-05, + "loss": 0.0066, + "step": 14578 + }, + { + "epoch": 3.8460625247328846, + "grad_norm": 0.012758426368236542, + "learning_rate": 1.3679767871274072e-05, + "loss": 0.0002, + "step": 14580 + }, + { + "epoch": 3.8465901596095504, + "grad_norm": 0.006811277940869331, + "learning_rate": 1.3678595503971394e-05, + "loss": 0.0016, + "step": 14582 + }, + { + "epoch": 3.8471177944862154, + "grad_norm": 0.03731268644332886, + "learning_rate": 1.3677423136668719e-05, + "loss": 0.0006, + "step": 14584 + }, + { + "epoch": 3.847645429362881, + "grad_norm": 0.05045279115438461, + "learning_rate": 1.3676250769366045e-05, + "loss": 0.0016, + "step": 14586 + }, + { + "epoch": 3.848173064239546, + "grad_norm": 0.008124659769237041, + "learning_rate": 1.3675078402063369e-05, + "loss": 0.0061, + "step": 14588 + }, + { + "epoch": 3.8487006991162116, + "grad_norm": 0.0034072939306497574, + "learning_rate": 1.3673906034760692e-05, + "loss": 0.0001, + "step": 14590 + }, + { + "epoch": 3.849228333992877, + "grad_norm": 0.006437212228775024, + "learning_rate": 1.3672733667458016e-05, + "loss": 0.0001, + "step": 14592 + }, + { + "epoch": 3.8497559688695424, + "grad_norm": 0.06471032649278641, + "learning_rate": 1.367156130015534e-05, + "loss": 0.0019, + "step": 14594 + }, + { + "epoch": 3.850283603746208, + "grad_norm": 0.05689052864909172, + "learning_rate": 1.3670388932852663e-05, + "loss": 0.0004, + "step": 14596 + }, + { + "epoch": 3.850811238622873, + "grad_norm": 0.01825176738202572, + "learning_rate": 1.3669216565549987e-05, + "loss": 0.0002, + "step": 14598 + }, + { + "epoch": 3.851338873499538, + "grad_norm": 0.044351931661367416, + "learning_rate": 1.3668044198247311e-05, + "loss": 0.0011, + "step": 14600 + }, + { + "epoch": 3.8518665083762036, + "grad_norm": 0.020565014332532883, + "learning_rate": 1.3666871830944637e-05, + "loss": 0.0002, + "step": 14602 + }, + { + "epoch": 3.852394143252869, + "grad_norm": 0.6295856833457947, + "learning_rate": 1.366569946364196e-05, + "loss": 0.0018, + "step": 14604 + }, + { + "epoch": 3.8529217781295344, + "grad_norm": 0.012559297494590282, + "learning_rate": 1.3664527096339284e-05, + "loss": 0.0004, + "step": 14606 + }, + { + "epoch": 3.8534494130062, + "grad_norm": 0.0031455582939088345, + "learning_rate": 1.3663354729036609e-05, + "loss": 0.0001, + "step": 14608 + }, + { + "epoch": 3.853977047882865, + "grad_norm": 0.019151562824845314, + "learning_rate": 1.3662182361733933e-05, + "loss": 0.0028, + "step": 14610 + }, + { + "epoch": 3.85450468275953, + "grad_norm": 0.0055540939792990685, + "learning_rate": 1.3661009994431256e-05, + "loss": 0.0001, + "step": 14612 + }, + { + "epoch": 3.855032317636196, + "grad_norm": 0.0027961861342191696, + "learning_rate": 1.365983762712858e-05, + "loss": 0.0001, + "step": 14614 + }, + { + "epoch": 3.855559952512861, + "grad_norm": 0.008316111750900745, + "learning_rate": 1.3658665259825906e-05, + "loss": 0.0001, + "step": 14616 + }, + { + "epoch": 3.8560875873895264, + "grad_norm": 0.040856778621673584, + "learning_rate": 1.365749289252323e-05, + "loss": 0.0001, + "step": 14618 + }, + { + "epoch": 3.856615222266192, + "grad_norm": 0.007940635085105896, + "learning_rate": 1.3656320525220553e-05, + "loss": 0.0001, + "step": 14620 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.2032942920923233, + "learning_rate": 1.3655148157917877e-05, + "loss": 0.0041, + "step": 14622 + }, + { + "epoch": 3.8576704920195226, + "grad_norm": 0.0022664563730359077, + "learning_rate": 1.3653975790615201e-05, + "loss": 0.0001, + "step": 14624 + }, + { + "epoch": 3.8581981268961876, + "grad_norm": 0.0023012154269963503, + "learning_rate": 1.3652803423312524e-05, + "loss": 0.0001, + "step": 14626 + }, + { + "epoch": 3.8587257617728534, + "grad_norm": 0.0006822922150604427, + "learning_rate": 1.3651631056009848e-05, + "loss": 0.0002, + "step": 14628 + }, + { + "epoch": 3.8592533966495184, + "grad_norm": 0.00881223101168871, + "learning_rate": 1.3650458688707173e-05, + "loss": 0.0002, + "step": 14630 + }, + { + "epoch": 3.859781031526184, + "grad_norm": 0.01166177075356245, + "learning_rate": 1.3649286321404499e-05, + "loss": 0.0001, + "step": 14632 + }, + { + "epoch": 3.860308666402849, + "grad_norm": 0.713156521320343, + "learning_rate": 1.364811395410182e-05, + "loss": 0.0006, + "step": 14634 + }, + { + "epoch": 3.8608363012795146, + "grad_norm": 0.0030546344351023436, + "learning_rate": 1.3646941586799146e-05, + "loss": 0.0001, + "step": 14636 + }, + { + "epoch": 3.86136393615618, + "grad_norm": 0.0006297017680481076, + "learning_rate": 1.364576921949647e-05, + "loss": 0.0, + "step": 14638 + }, + { + "epoch": 3.8618915710328454, + "grad_norm": 0.0015058747958391905, + "learning_rate": 1.3644596852193794e-05, + "loss": 0.0005, + "step": 14640 + }, + { + "epoch": 3.862419205909511, + "grad_norm": 0.02932639792561531, + "learning_rate": 1.3643424484891117e-05, + "loss": 0.0016, + "step": 14642 + }, + { + "epoch": 3.862946840786176, + "grad_norm": 0.23088403046131134, + "learning_rate": 1.3642252117588441e-05, + "loss": 0.0015, + "step": 14644 + }, + { + "epoch": 3.863474475662841, + "grad_norm": 0.0032686309423297644, + "learning_rate": 1.3641079750285765e-05, + "loss": 0.0001, + "step": 14646 + }, + { + "epoch": 3.8640021105395066, + "grad_norm": 0.00438228202983737, + "learning_rate": 1.3639907382983091e-05, + "loss": 0.0001, + "step": 14648 + }, + { + "epoch": 3.864529745416172, + "grad_norm": 0.007330704014748335, + "learning_rate": 1.3638735015680414e-05, + "loss": 0.0001, + "step": 14650 + }, + { + "epoch": 3.8650573802928374, + "grad_norm": 0.13244742155075073, + "learning_rate": 1.3637562648377738e-05, + "loss": 0.0009, + "step": 14652 + }, + { + "epoch": 3.865585015169503, + "grad_norm": 0.003967417869716883, + "learning_rate": 1.3636390281075063e-05, + "loss": 0.0, + "step": 14654 + }, + { + "epoch": 3.866112650046168, + "grad_norm": 0.0010778425494208932, + "learning_rate": 1.3635217913772385e-05, + "loss": 0.0001, + "step": 14656 + }, + { + "epoch": 3.866640284922833, + "grad_norm": 0.005595079623162746, + "learning_rate": 1.363404554646971e-05, + "loss": 0.0001, + "step": 14658 + }, + { + "epoch": 3.8671679197994986, + "grad_norm": 0.005547080188989639, + "learning_rate": 1.3632873179167034e-05, + "loss": 0.0005, + "step": 14660 + }, + { + "epoch": 3.867695554676164, + "grad_norm": 0.11734657734632492, + "learning_rate": 1.363170081186436e-05, + "loss": 0.0013, + "step": 14662 + }, + { + "epoch": 3.8682231895528294, + "grad_norm": 0.3294873833656311, + "learning_rate": 1.363052844456168e-05, + "loss": 0.0045, + "step": 14664 + }, + { + "epoch": 3.868750824429495, + "grad_norm": 0.09647127985954285, + "learning_rate": 1.3629356077259007e-05, + "loss": 0.0003, + "step": 14666 + }, + { + "epoch": 3.86927845930616, + "grad_norm": 0.008308762684464455, + "learning_rate": 1.3628183709956331e-05, + "loss": 0.0001, + "step": 14668 + }, + { + "epoch": 3.8698060941828256, + "grad_norm": 0.449252724647522, + "learning_rate": 1.3627011342653655e-05, + "loss": 0.0067, + "step": 14670 + }, + { + "epoch": 3.8703337290594906, + "grad_norm": 0.059709981083869934, + "learning_rate": 1.3625838975350978e-05, + "loss": 0.0003, + "step": 14672 + }, + { + "epoch": 3.8708613639361564, + "grad_norm": 0.006811710074543953, + "learning_rate": 1.3624666608048302e-05, + "loss": 0.0001, + "step": 14674 + }, + { + "epoch": 3.8713889988128214, + "grad_norm": 0.007771049160510302, + "learning_rate": 1.3623494240745627e-05, + "loss": 0.0001, + "step": 14676 + }, + { + "epoch": 3.871916633689487, + "grad_norm": 0.03308536484837532, + "learning_rate": 1.3622321873442953e-05, + "loss": 0.0022, + "step": 14678 + }, + { + "epoch": 3.872444268566152, + "grad_norm": 0.003402247792109847, + "learning_rate": 1.3621149506140275e-05, + "loss": 0.0001, + "step": 14680 + }, + { + "epoch": 3.8729719034428176, + "grad_norm": 0.039501842111349106, + "learning_rate": 1.36199771388376e-05, + "loss": 0.0003, + "step": 14682 + }, + { + "epoch": 3.873499538319483, + "grad_norm": 0.04692642390727997, + "learning_rate": 1.3618804771534924e-05, + "loss": 0.0002, + "step": 14684 + }, + { + "epoch": 3.8740271731961484, + "grad_norm": 0.007190990261733532, + "learning_rate": 1.3617632404232246e-05, + "loss": 0.0001, + "step": 14686 + }, + { + "epoch": 3.874554808072814, + "grad_norm": 0.02029905840754509, + "learning_rate": 1.361646003692957e-05, + "loss": 0.0004, + "step": 14688 + }, + { + "epoch": 3.875082442949479, + "grad_norm": 0.15358220040798187, + "learning_rate": 1.3615287669626895e-05, + "loss": 0.0007, + "step": 14690 + }, + { + "epoch": 3.875610077826144, + "grad_norm": 0.02488172985613346, + "learning_rate": 1.361411530232422e-05, + "loss": 0.0001, + "step": 14692 + }, + { + "epoch": 3.8761377127028096, + "grad_norm": 0.05808882415294647, + "learning_rate": 1.3612942935021542e-05, + "loss": 0.0037, + "step": 14694 + }, + { + "epoch": 3.876665347579475, + "grad_norm": 0.018146958202123642, + "learning_rate": 1.3611770567718868e-05, + "loss": 0.0003, + "step": 14696 + }, + { + "epoch": 3.8771929824561404, + "grad_norm": 0.019015097990632057, + "learning_rate": 1.3610598200416192e-05, + "loss": 0.0002, + "step": 14698 + }, + { + "epoch": 3.877720617332806, + "grad_norm": 0.7600139379501343, + "learning_rate": 1.3609425833113516e-05, + "loss": 0.0033, + "step": 14700 + }, + { + "epoch": 3.8782482522094712, + "grad_norm": 0.006832681130617857, + "learning_rate": 1.3608253465810839e-05, + "loss": 0.0001, + "step": 14702 + }, + { + "epoch": 3.878775887086136, + "grad_norm": 0.005951474886387587, + "learning_rate": 1.3607081098508163e-05, + "loss": 0.0001, + "step": 14704 + }, + { + "epoch": 3.8793035219628016, + "grad_norm": 0.03229880332946777, + "learning_rate": 1.3605908731205488e-05, + "loss": 0.0002, + "step": 14706 + }, + { + "epoch": 3.879831156839467, + "grad_norm": 0.006008047144860029, + "learning_rate": 1.3604736363902814e-05, + "loss": 0.0001, + "step": 14708 + }, + { + "epoch": 3.8803587917161324, + "grad_norm": 0.003760995576158166, + "learning_rate": 1.3603563996600135e-05, + "loss": 0.0001, + "step": 14710 + }, + { + "epoch": 3.880886426592798, + "grad_norm": 0.010409155860543251, + "learning_rate": 1.360239162929746e-05, + "loss": 0.0004, + "step": 14712 + }, + { + "epoch": 3.881414061469463, + "grad_norm": 0.31129875779151917, + "learning_rate": 1.3601219261994785e-05, + "loss": 0.0016, + "step": 14714 + }, + { + "epoch": 3.8819416963461286, + "grad_norm": 0.0015966014470905066, + "learning_rate": 1.3600046894692108e-05, + "loss": 0.0001, + "step": 14716 + }, + { + "epoch": 3.8824693312227936, + "grad_norm": 0.0013628399465233088, + "learning_rate": 1.3598874527389432e-05, + "loss": 0.0001, + "step": 14718 + }, + { + "epoch": 3.8829969660994594, + "grad_norm": 0.12852150201797485, + "learning_rate": 1.3597702160086756e-05, + "loss": 0.0017, + "step": 14720 + }, + { + "epoch": 3.8835246009761244, + "grad_norm": 0.05963204801082611, + "learning_rate": 1.359652979278408e-05, + "loss": 0.0002, + "step": 14722 + }, + { + "epoch": 3.88405223585279, + "grad_norm": 0.01048602256923914, + "learning_rate": 1.3595357425481403e-05, + "loss": 0.0011, + "step": 14724 + }, + { + "epoch": 3.884579870729455, + "grad_norm": 0.014486536383628845, + "learning_rate": 1.3594185058178729e-05, + "loss": 0.0002, + "step": 14726 + }, + { + "epoch": 3.8851075056061206, + "grad_norm": 0.0027457450050860643, + "learning_rate": 1.3593012690876053e-05, + "loss": 0.0015, + "step": 14728 + }, + { + "epoch": 3.885635140482786, + "grad_norm": 0.01718330755829811, + "learning_rate": 1.3591840323573378e-05, + "loss": 0.0001, + "step": 14730 + }, + { + "epoch": 3.886162775359451, + "grad_norm": 0.16835439205169678, + "learning_rate": 1.35906679562707e-05, + "loss": 0.0013, + "step": 14732 + }, + { + "epoch": 3.886690410236117, + "grad_norm": 0.004611199721693993, + "learning_rate": 1.3589495588968025e-05, + "loss": 0.0001, + "step": 14734 + }, + { + "epoch": 3.887218045112782, + "grad_norm": 0.0025666821748018265, + "learning_rate": 1.3588323221665349e-05, + "loss": 0.0001, + "step": 14736 + }, + { + "epoch": 3.887745679989447, + "grad_norm": 0.014943049289286137, + "learning_rate": 1.3587150854362673e-05, + "loss": 0.0001, + "step": 14738 + }, + { + "epoch": 3.8882733148661126, + "grad_norm": 0.08570202440023422, + "learning_rate": 1.3585978487059996e-05, + "loss": 0.0003, + "step": 14740 + }, + { + "epoch": 3.888800949742778, + "grad_norm": 0.11530959606170654, + "learning_rate": 1.3584806119757322e-05, + "loss": 0.0009, + "step": 14742 + }, + { + "epoch": 3.8893285846194434, + "grad_norm": 0.2859169542789459, + "learning_rate": 1.3583633752454646e-05, + "loss": 0.0019, + "step": 14744 + }, + { + "epoch": 3.889856219496109, + "grad_norm": 0.004372305702418089, + "learning_rate": 1.3582461385151969e-05, + "loss": 0.0, + "step": 14746 + }, + { + "epoch": 3.8903838543727742, + "grad_norm": 0.00748022086918354, + "learning_rate": 1.3581289017849293e-05, + "loss": 0.0002, + "step": 14748 + }, + { + "epoch": 3.890911489249439, + "grad_norm": 0.07864648848772049, + "learning_rate": 1.3580116650546617e-05, + "loss": 0.0002, + "step": 14750 + }, + { + "epoch": 3.8914391241261046, + "grad_norm": 0.36951878666877747, + "learning_rate": 1.3578944283243942e-05, + "loss": 0.0019, + "step": 14752 + }, + { + "epoch": 3.89196675900277, + "grad_norm": 0.16592109203338623, + "learning_rate": 1.3577771915941264e-05, + "loss": 0.0038, + "step": 14754 + }, + { + "epoch": 3.8924943938794354, + "grad_norm": 0.05567670613527298, + "learning_rate": 1.3576599548638589e-05, + "loss": 0.0038, + "step": 14756 + }, + { + "epoch": 3.893022028756101, + "grad_norm": 0.0034812872763723135, + "learning_rate": 1.3575427181335915e-05, + "loss": 0.0001, + "step": 14758 + }, + { + "epoch": 3.8935496636327662, + "grad_norm": 0.006085993722081184, + "learning_rate": 1.3574254814033239e-05, + "loss": 0.0002, + "step": 14760 + }, + { + "epoch": 3.8940772985094316, + "grad_norm": 0.017944155260920525, + "learning_rate": 1.3573082446730561e-05, + "loss": 0.0001, + "step": 14762 + }, + { + "epoch": 3.8946049333860966, + "grad_norm": 0.0143324825912714, + "learning_rate": 1.3571910079427886e-05, + "loss": 0.0002, + "step": 14764 + }, + { + "epoch": 3.8951325682627624, + "grad_norm": 0.054539065808057785, + "learning_rate": 1.357073771212521e-05, + "loss": 0.0004, + "step": 14766 + }, + { + "epoch": 3.8956602031394274, + "grad_norm": 0.04801550880074501, + "learning_rate": 1.3569565344822534e-05, + "loss": 0.0003, + "step": 14768 + }, + { + "epoch": 3.896187838016093, + "grad_norm": 0.00961683876812458, + "learning_rate": 1.3568392977519857e-05, + "loss": 0.0001, + "step": 14770 + }, + { + "epoch": 3.896715472892758, + "grad_norm": 0.005236964672803879, + "learning_rate": 1.3567220610217183e-05, + "loss": 0.0001, + "step": 14772 + }, + { + "epoch": 3.8972431077694236, + "grad_norm": 0.046738311648368835, + "learning_rate": 1.3566048242914507e-05, + "loss": 0.0016, + "step": 14774 + }, + { + "epoch": 3.897770742646089, + "grad_norm": 0.008736594580113888, + "learning_rate": 1.356487587561183e-05, + "loss": 0.0002, + "step": 14776 + }, + { + "epoch": 3.898298377522754, + "grad_norm": 0.00993509404361248, + "learning_rate": 1.3563703508309154e-05, + "loss": 0.001, + "step": 14778 + }, + { + "epoch": 3.89882601239942, + "grad_norm": 0.0017182957381010056, + "learning_rate": 1.3562531141006478e-05, + "loss": 0.0005, + "step": 14780 + }, + { + "epoch": 3.899353647276085, + "grad_norm": 0.0011957300594076514, + "learning_rate": 1.3561358773703803e-05, + "loss": 0.0001, + "step": 14782 + }, + { + "epoch": 3.89988128215275, + "grad_norm": 0.0021484652534127235, + "learning_rate": 1.3560186406401125e-05, + "loss": 0.0, + "step": 14784 + }, + { + "epoch": 3.9004089170294156, + "grad_norm": 0.0005789226852357388, + "learning_rate": 1.355901403909845e-05, + "loss": 0.0002, + "step": 14786 + }, + { + "epoch": 3.900936551906081, + "grad_norm": 0.004779341630637646, + "learning_rate": 1.3557841671795776e-05, + "loss": 0.0002, + "step": 14788 + }, + { + "epoch": 3.9014641867827464, + "grad_norm": 2.1212430000305176, + "learning_rate": 1.35566693044931e-05, + "loss": 0.0002, + "step": 14790 + }, + { + "epoch": 3.901991821659412, + "grad_norm": 0.0022663751151412725, + "learning_rate": 1.3555496937190423e-05, + "loss": 0.0001, + "step": 14792 + }, + { + "epoch": 3.9025194565360772, + "grad_norm": 0.02898290939629078, + "learning_rate": 1.3554324569887747e-05, + "loss": 0.0023, + "step": 14794 + }, + { + "epoch": 3.903047091412742, + "grad_norm": 0.001691908109933138, + "learning_rate": 1.3553152202585071e-05, + "loss": 0.0001, + "step": 14796 + }, + { + "epoch": 3.9035747262894076, + "grad_norm": 0.005016708746552467, + "learning_rate": 1.3551979835282396e-05, + "loss": 0.0036, + "step": 14798 + }, + { + "epoch": 3.904102361166073, + "grad_norm": 0.000852007360663265, + "learning_rate": 1.3550807467979718e-05, + "loss": 0.0001, + "step": 14800 + }, + { + "epoch": 3.9046299960427384, + "grad_norm": 0.0011864261468872428, + "learning_rate": 1.3549635100677042e-05, + "loss": 0.0001, + "step": 14802 + }, + { + "epoch": 3.905157630919404, + "grad_norm": 0.1495230346918106, + "learning_rate": 1.3548462733374368e-05, + "loss": 0.005, + "step": 14804 + }, + { + "epoch": 3.9056852657960692, + "grad_norm": 0.030188437551259995, + "learning_rate": 1.3547290366071691e-05, + "loss": 0.0052, + "step": 14806 + }, + { + "epoch": 3.9062129006727346, + "grad_norm": 0.09561850875616074, + "learning_rate": 1.3546117998769015e-05, + "loss": 0.0052, + "step": 14808 + }, + { + "epoch": 3.9067405355493996, + "grad_norm": 0.0025484771467745304, + "learning_rate": 1.354494563146634e-05, + "loss": 0.0001, + "step": 14810 + }, + { + "epoch": 3.907268170426065, + "grad_norm": 0.002242503222078085, + "learning_rate": 1.3543773264163664e-05, + "loss": 0.0001, + "step": 14812 + }, + { + "epoch": 3.9077958053027304, + "grad_norm": 0.19364088773727417, + "learning_rate": 1.3542600896860987e-05, + "loss": 0.002, + "step": 14814 + }, + { + "epoch": 3.908323440179396, + "grad_norm": 0.004076015204191208, + "learning_rate": 1.3541428529558311e-05, + "loss": 0.0001, + "step": 14816 + }, + { + "epoch": 3.908851075056061, + "grad_norm": 0.0027317802887409925, + "learning_rate": 1.3540256162255637e-05, + "loss": 0.0038, + "step": 14818 + }, + { + "epoch": 3.9093787099327266, + "grad_norm": 0.0730706974864006, + "learning_rate": 1.3539083794952961e-05, + "loss": 0.001, + "step": 14820 + }, + { + "epoch": 3.909906344809392, + "grad_norm": 0.009211280383169651, + "learning_rate": 1.3537911427650284e-05, + "loss": 0.0002, + "step": 14822 + }, + { + "epoch": 3.910433979686057, + "grad_norm": 0.05045577511191368, + "learning_rate": 1.3536739060347608e-05, + "loss": 0.0002, + "step": 14824 + }, + { + "epoch": 3.910961614562723, + "grad_norm": 0.08424458652734756, + "learning_rate": 1.3535566693044932e-05, + "loss": 0.0072, + "step": 14826 + }, + { + "epoch": 3.911489249439388, + "grad_norm": 0.005206217989325523, + "learning_rate": 1.3534394325742257e-05, + "loss": 0.0008, + "step": 14828 + }, + { + "epoch": 3.912016884316053, + "grad_norm": 0.013967098668217659, + "learning_rate": 1.353322195843958e-05, + "loss": 0.0003, + "step": 14830 + }, + { + "epoch": 3.9125445191927186, + "grad_norm": 0.00453093321993947, + "learning_rate": 1.3532049591136904e-05, + "loss": 0.0001, + "step": 14832 + }, + { + "epoch": 3.913072154069384, + "grad_norm": 0.028476838022470474, + "learning_rate": 1.353087722383423e-05, + "loss": 0.0025, + "step": 14834 + }, + { + "epoch": 3.9135997889460494, + "grad_norm": 0.11647181957960129, + "learning_rate": 1.3529704856531552e-05, + "loss": 0.0045, + "step": 14836 + }, + { + "epoch": 3.914127423822715, + "grad_norm": 0.008927443064749241, + "learning_rate": 1.3528532489228877e-05, + "loss": 0.0003, + "step": 14838 + }, + { + "epoch": 3.9146550586993802, + "grad_norm": 0.011752820573747158, + "learning_rate": 1.35273601219262e-05, + "loss": 0.0002, + "step": 14840 + }, + { + "epoch": 3.915182693576045, + "grad_norm": 0.04721174016594887, + "learning_rate": 1.3526187754623525e-05, + "loss": 0.0003, + "step": 14842 + }, + { + "epoch": 3.9157103284527106, + "grad_norm": 0.002559396205469966, + "learning_rate": 1.3525015387320848e-05, + "loss": 0.0005, + "step": 14844 + }, + { + "epoch": 3.916237963329376, + "grad_norm": 0.00206625834107399, + "learning_rate": 1.3523843020018172e-05, + "loss": 0.0001, + "step": 14846 + }, + { + "epoch": 3.9167655982060414, + "grad_norm": 0.006121226120740175, + "learning_rate": 1.3522670652715496e-05, + "loss": 0.0001, + "step": 14848 + }, + { + "epoch": 3.917293233082707, + "grad_norm": 0.029143813997507095, + "learning_rate": 1.3521498285412822e-05, + "loss": 0.0003, + "step": 14850 + }, + { + "epoch": 3.9178208679593722, + "grad_norm": 0.0022505817469209433, + "learning_rate": 1.3520325918110145e-05, + "loss": 0.0018, + "step": 14852 + }, + { + "epoch": 3.9183485028360376, + "grad_norm": 0.9499567151069641, + "learning_rate": 1.351915355080747e-05, + "loss": 0.0086, + "step": 14854 + }, + { + "epoch": 3.9188761377127026, + "grad_norm": 0.04350998252630234, + "learning_rate": 1.3517981183504794e-05, + "loss": 0.0008, + "step": 14856 + }, + { + "epoch": 3.919403772589368, + "grad_norm": 0.04656441882252693, + "learning_rate": 1.3516808816202118e-05, + "loss": 0.0002, + "step": 14858 + }, + { + "epoch": 3.9199314074660334, + "grad_norm": 0.36140432953834534, + "learning_rate": 1.351563644889944e-05, + "loss": 0.0035, + "step": 14860 + }, + { + "epoch": 3.920459042342699, + "grad_norm": 0.010649587027728558, + "learning_rate": 1.3514464081596765e-05, + "loss": 0.0016, + "step": 14862 + }, + { + "epoch": 3.9209866772193642, + "grad_norm": 0.11443762481212616, + "learning_rate": 1.351329171429409e-05, + "loss": 0.0019, + "step": 14864 + }, + { + "epoch": 3.9215143120960296, + "grad_norm": 0.03190314397215843, + "learning_rate": 1.3512119346991412e-05, + "loss": 0.0045, + "step": 14866 + }, + { + "epoch": 3.922041946972695, + "grad_norm": 0.0030058673582971096, + "learning_rate": 1.3510946979688738e-05, + "loss": 0.0004, + "step": 14868 + }, + { + "epoch": 3.92256958184936, + "grad_norm": 0.0835033431649208, + "learning_rate": 1.3509774612386062e-05, + "loss": 0.0006, + "step": 14870 + }, + { + "epoch": 3.923097216726026, + "grad_norm": 0.0058918665163218975, + "learning_rate": 1.3508602245083386e-05, + "loss": 0.0001, + "step": 14872 + }, + { + "epoch": 3.923624851602691, + "grad_norm": 0.010112016461789608, + "learning_rate": 1.3507429877780709e-05, + "loss": 0.0002, + "step": 14874 + }, + { + "epoch": 3.924152486479356, + "grad_norm": 0.03875430300831795, + "learning_rate": 1.3506257510478033e-05, + "loss": 0.0003, + "step": 14876 + }, + { + "epoch": 3.9246801213560216, + "grad_norm": 0.021130118519067764, + "learning_rate": 1.3505085143175358e-05, + "loss": 0.0003, + "step": 14878 + }, + { + "epoch": 3.925207756232687, + "grad_norm": 0.006040602456778288, + "learning_rate": 1.3503912775872684e-05, + "loss": 0.0007, + "step": 14880 + }, + { + "epoch": 3.9257353911093524, + "grad_norm": 0.010756335221230984, + "learning_rate": 1.3502740408570006e-05, + "loss": 0.0002, + "step": 14882 + }, + { + "epoch": 3.926263025986018, + "grad_norm": 0.09263692051172256, + "learning_rate": 1.350156804126733e-05, + "loss": 0.0005, + "step": 14884 + }, + { + "epoch": 3.9267906608626832, + "grad_norm": 0.37902045249938965, + "learning_rate": 1.3500395673964655e-05, + "loss": 0.0067, + "step": 14886 + }, + { + "epoch": 3.927318295739348, + "grad_norm": 0.002021918073296547, + "learning_rate": 1.3499223306661979e-05, + "loss": 0.0001, + "step": 14888 + }, + { + "epoch": 3.9278459306160136, + "grad_norm": 0.00543420622125268, + "learning_rate": 1.3498050939359302e-05, + "loss": 0.0001, + "step": 14890 + }, + { + "epoch": 3.928373565492679, + "grad_norm": 0.008292348124086857, + "learning_rate": 1.3496878572056626e-05, + "loss": 0.0001, + "step": 14892 + }, + { + "epoch": 3.9289012003693444, + "grad_norm": 0.37946847081184387, + "learning_rate": 1.3495706204753952e-05, + "loss": 0.0015, + "step": 14894 + }, + { + "epoch": 3.92942883524601, + "grad_norm": 0.0015909166540950537, + "learning_rate": 1.3494533837451273e-05, + "loss": 0.0001, + "step": 14896 + }, + { + "epoch": 3.9299564701226752, + "grad_norm": 0.034152887761592865, + "learning_rate": 1.3493361470148599e-05, + "loss": 0.0002, + "step": 14898 + }, + { + "epoch": 3.9304841049993406, + "grad_norm": 0.014499831013381481, + "learning_rate": 1.3492189102845923e-05, + "loss": 0.0003, + "step": 14900 + }, + { + "epoch": 3.9310117398760056, + "grad_norm": 0.012566502206027508, + "learning_rate": 1.3491016735543247e-05, + "loss": 0.0002, + "step": 14902 + }, + { + "epoch": 3.931539374752671, + "grad_norm": 0.6902835965156555, + "learning_rate": 1.348984436824057e-05, + "loss": 0.0032, + "step": 14904 + }, + { + "epoch": 3.9320670096293364, + "grad_norm": 0.007184337824583054, + "learning_rate": 1.3488672000937894e-05, + "loss": 0.0001, + "step": 14906 + }, + { + "epoch": 3.932594644506002, + "grad_norm": 0.004766768775880337, + "learning_rate": 1.3487499633635219e-05, + "loss": 0.0001, + "step": 14908 + }, + { + "epoch": 3.9331222793826672, + "grad_norm": 0.0010302638402208686, + "learning_rate": 1.3486327266332545e-05, + "loss": 0.0001, + "step": 14910 + }, + { + "epoch": 3.9336499142593326, + "grad_norm": 0.0018155022989958525, + "learning_rate": 1.3485154899029866e-05, + "loss": 0.0, + "step": 14912 + }, + { + "epoch": 3.934177549135998, + "grad_norm": 0.03613011911511421, + "learning_rate": 1.3483982531727192e-05, + "loss": 0.0001, + "step": 14914 + }, + { + "epoch": 3.934705184012663, + "grad_norm": 0.12355940043926239, + "learning_rate": 1.3482810164424516e-05, + "loss": 0.0005, + "step": 14916 + }, + { + "epoch": 3.935232818889329, + "grad_norm": 0.010800695978105068, + "learning_rate": 1.348163779712184e-05, + "loss": 0.0001, + "step": 14918 + }, + { + "epoch": 3.935760453765994, + "grad_norm": 0.3574387729167938, + "learning_rate": 1.3480465429819163e-05, + "loss": 0.0014, + "step": 14920 + }, + { + "epoch": 3.936288088642659, + "grad_norm": 0.0011758245527744293, + "learning_rate": 1.3479293062516487e-05, + "loss": 0.0003, + "step": 14922 + }, + { + "epoch": 3.9368157235193246, + "grad_norm": 0.007671814877539873, + "learning_rate": 1.3478120695213811e-05, + "loss": 0.0001, + "step": 14924 + }, + { + "epoch": 3.93734335839599, + "grad_norm": 0.003561123739928007, + "learning_rate": 1.3476948327911134e-05, + "loss": 0.0001, + "step": 14926 + }, + { + "epoch": 3.9378709932726554, + "grad_norm": 0.020067308098077774, + "learning_rate": 1.347577596060846e-05, + "loss": 0.0028, + "step": 14928 + }, + { + "epoch": 3.9383986281493204, + "grad_norm": 0.2561016082763672, + "learning_rate": 1.3474603593305784e-05, + "loss": 0.0021, + "step": 14930 + }, + { + "epoch": 3.9389262630259863, + "grad_norm": 0.004383671563118696, + "learning_rate": 1.3473431226003109e-05, + "loss": 0.0062, + "step": 14932 + }, + { + "epoch": 3.939453897902651, + "grad_norm": 0.004680974408984184, + "learning_rate": 1.3472258858700431e-05, + "loss": 0.0008, + "step": 14934 + }, + { + "epoch": 3.9399815327793166, + "grad_norm": 0.012729281559586525, + "learning_rate": 1.3471086491397756e-05, + "loss": 0.0055, + "step": 14936 + }, + { + "epoch": 3.940509167655982, + "grad_norm": 0.0012612231075763702, + "learning_rate": 1.346991412409508e-05, + "loss": 0.0035, + "step": 14938 + }, + { + "epoch": 3.9410368025326474, + "grad_norm": 0.014396997168660164, + "learning_rate": 1.3468741756792406e-05, + "loss": 0.0002, + "step": 14940 + }, + { + "epoch": 3.941564437409313, + "grad_norm": 0.07620096206665039, + "learning_rate": 1.3467569389489727e-05, + "loss": 0.0008, + "step": 14942 + }, + { + "epoch": 3.9420920722859782, + "grad_norm": 0.08241496235132217, + "learning_rate": 1.3466397022187053e-05, + "loss": 0.0016, + "step": 14944 + }, + { + "epoch": 3.9426197071626436, + "grad_norm": 0.00415373919531703, + "learning_rate": 1.3465224654884377e-05, + "loss": 0.0024, + "step": 14946 + }, + { + "epoch": 3.9431473420393086, + "grad_norm": 0.0829877257347107, + "learning_rate": 1.3464052287581701e-05, + "loss": 0.0014, + "step": 14948 + }, + { + "epoch": 3.943674976915974, + "grad_norm": 0.38163572549819946, + "learning_rate": 1.3462879920279024e-05, + "loss": 0.0038, + "step": 14950 + }, + { + "epoch": 3.9442026117926394, + "grad_norm": 0.06502123177051544, + "learning_rate": 1.3461707552976348e-05, + "loss": 0.0028, + "step": 14952 + }, + { + "epoch": 3.944730246669305, + "grad_norm": 0.08019313961267471, + "learning_rate": 1.3460535185673673e-05, + "loss": 0.0003, + "step": 14954 + }, + { + "epoch": 3.9452578815459702, + "grad_norm": 0.018394960090517998, + "learning_rate": 1.3459362818370995e-05, + "loss": 0.0007, + "step": 14956 + }, + { + "epoch": 3.9457855164226356, + "grad_norm": 0.0058549074456095695, + "learning_rate": 1.345819045106832e-05, + "loss": 0.0001, + "step": 14958 + }, + { + "epoch": 3.946313151299301, + "grad_norm": 0.17996414005756378, + "learning_rate": 1.3457018083765645e-05, + "loss": 0.0096, + "step": 14960 + }, + { + "epoch": 3.946840786175966, + "grad_norm": 0.015942147001624107, + "learning_rate": 1.345584571646297e-05, + "loss": 0.0002, + "step": 14962 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.0021300215739756823, + "learning_rate": 1.3454673349160292e-05, + "loss": 0.0001, + "step": 14964 + }, + { + "epoch": 3.947896055929297, + "grad_norm": 0.005482084583491087, + "learning_rate": 1.3453500981857617e-05, + "loss": 0.0002, + "step": 14966 + }, + { + "epoch": 3.9484236908059622, + "grad_norm": 0.001046805060468614, + "learning_rate": 1.3452328614554941e-05, + "loss": 0.0001, + "step": 14968 + }, + { + "epoch": 3.9489513256826276, + "grad_norm": 0.0019793505780398846, + "learning_rate": 1.3451156247252265e-05, + "loss": 0.0001, + "step": 14970 + }, + { + "epoch": 3.949478960559293, + "grad_norm": 0.01578979752957821, + "learning_rate": 1.3449983879949588e-05, + "loss": 0.0001, + "step": 14972 + }, + { + "epoch": 3.9500065954359584, + "grad_norm": 0.00482450844720006, + "learning_rate": 1.3448811512646914e-05, + "loss": 0.0001, + "step": 14974 + }, + { + "epoch": 3.9505342303126234, + "grad_norm": 0.004241123795509338, + "learning_rate": 1.3447639145344238e-05, + "loss": 0.0001, + "step": 14976 + }, + { + "epoch": 3.9510618651892893, + "grad_norm": 0.004233637358993292, + "learning_rate": 1.3446466778041563e-05, + "loss": 0.0001, + "step": 14978 + }, + { + "epoch": 3.951589500065954, + "grad_norm": 0.041176117956638336, + "learning_rate": 1.3445294410738885e-05, + "loss": 0.0003, + "step": 14980 + }, + { + "epoch": 3.9521171349426196, + "grad_norm": 0.28047534823417664, + "learning_rate": 1.344412204343621e-05, + "loss": 0.0035, + "step": 14982 + }, + { + "epoch": 3.952644769819285, + "grad_norm": 0.006834066938608885, + "learning_rate": 1.3442949676133534e-05, + "loss": 0.0001, + "step": 14984 + }, + { + "epoch": 3.9531724046959504, + "grad_norm": 0.6634602546691895, + "learning_rate": 1.3441777308830856e-05, + "loss": 0.006, + "step": 14986 + }, + { + "epoch": 3.953700039572616, + "grad_norm": 0.007702800445258617, + "learning_rate": 1.344060494152818e-05, + "loss": 0.0001, + "step": 14988 + }, + { + "epoch": 3.9542276744492812, + "grad_norm": 0.004459279589354992, + "learning_rate": 1.3439432574225507e-05, + "loss": 0.0004, + "step": 14990 + }, + { + "epoch": 3.9547553093259467, + "grad_norm": 0.004105039406567812, + "learning_rate": 1.3438260206922831e-05, + "loss": 0.0002, + "step": 14992 + }, + { + "epoch": 3.9552829442026116, + "grad_norm": 0.011828457936644554, + "learning_rate": 1.3437087839620154e-05, + "loss": 0.0007, + "step": 14994 + }, + { + "epoch": 3.955810579079277, + "grad_norm": 0.02315891534090042, + "learning_rate": 1.3435915472317478e-05, + "loss": 0.0003, + "step": 14996 + }, + { + "epoch": 3.9563382139559424, + "grad_norm": 0.9284602403640747, + "learning_rate": 1.3434743105014802e-05, + "loss": 0.0055, + "step": 14998 + }, + { + "epoch": 3.956865848832608, + "grad_norm": 0.007030523847788572, + "learning_rate": 1.3433570737712126e-05, + "loss": 0.0001, + "step": 15000 + }, + { + "epoch": 3.9573934837092732, + "grad_norm": 0.001905642682686448, + "learning_rate": 1.3432398370409449e-05, + "loss": 0.0001, + "step": 15002 + }, + { + "epoch": 3.9579211185859386, + "grad_norm": 0.12915997207164764, + "learning_rate": 1.3431226003106775e-05, + "loss": 0.0057, + "step": 15004 + }, + { + "epoch": 3.958448753462604, + "grad_norm": 0.004022705368697643, + "learning_rate": 1.34300536358041e-05, + "loss": 0.0002, + "step": 15006 + }, + { + "epoch": 3.958976388339269, + "grad_norm": 0.005507016088813543, + "learning_rate": 1.3428881268501424e-05, + "loss": 0.0001, + "step": 15008 + }, + { + "epoch": 3.9595040232159344, + "grad_norm": 0.0021422032732516527, + "learning_rate": 1.3427708901198746e-05, + "loss": 0.0001, + "step": 15010 + }, + { + "epoch": 3.9600316580926, + "grad_norm": 0.016851376742124557, + "learning_rate": 1.342653653389607e-05, + "loss": 0.0002, + "step": 15012 + }, + { + "epoch": 3.9605592929692652, + "grad_norm": 0.002977975644171238, + "learning_rate": 1.3425364166593395e-05, + "loss": 0.0001, + "step": 15014 + }, + { + "epoch": 3.9610869278459306, + "grad_norm": 0.2609853446483612, + "learning_rate": 1.3424191799290718e-05, + "loss": 0.0043, + "step": 15016 + }, + { + "epoch": 3.961614562722596, + "grad_norm": 0.06725091487169266, + "learning_rate": 1.3423019431988042e-05, + "loss": 0.0006, + "step": 15018 + }, + { + "epoch": 3.9621421975992615, + "grad_norm": 0.024307135492563248, + "learning_rate": 1.3421847064685368e-05, + "loss": 0.0002, + "step": 15020 + }, + { + "epoch": 3.9626698324759264, + "grad_norm": 0.5832816362380981, + "learning_rate": 1.3420674697382692e-05, + "loss": 0.0027, + "step": 15022 + }, + { + "epoch": 3.9631974673525923, + "grad_norm": 0.009921093471348286, + "learning_rate": 1.3419502330080015e-05, + "loss": 0.0003, + "step": 15024 + }, + { + "epoch": 3.9637251022292572, + "grad_norm": 0.0015157593879848719, + "learning_rate": 1.3418329962777339e-05, + "loss": 0.0, + "step": 15026 + }, + { + "epoch": 3.9642527371059226, + "grad_norm": 0.004175608977675438, + "learning_rate": 1.3417157595474663e-05, + "loss": 0.0016, + "step": 15028 + }, + { + "epoch": 3.964780371982588, + "grad_norm": 0.07378964871168137, + "learning_rate": 1.3415985228171988e-05, + "loss": 0.0003, + "step": 15030 + }, + { + "epoch": 3.9653080068592534, + "grad_norm": 0.0037587916012853384, + "learning_rate": 1.341481286086931e-05, + "loss": 0.0001, + "step": 15032 + }, + { + "epoch": 3.965835641735919, + "grad_norm": 0.004202570766210556, + "learning_rate": 1.3413640493566635e-05, + "loss": 0.0086, + "step": 15034 + }, + { + "epoch": 3.9663632766125843, + "grad_norm": 0.0051659089513123035, + "learning_rate": 1.341246812626396e-05, + "loss": 0.0009, + "step": 15036 + }, + { + "epoch": 3.9668909114892497, + "grad_norm": 0.08970528095960617, + "learning_rate": 1.3411295758961285e-05, + "loss": 0.0012, + "step": 15038 + }, + { + "epoch": 3.9674185463659146, + "grad_norm": 0.002292582532390952, + "learning_rate": 1.3410123391658607e-05, + "loss": 0.0001, + "step": 15040 + }, + { + "epoch": 3.96794618124258, + "grad_norm": 0.0031645058188587427, + "learning_rate": 1.3408951024355932e-05, + "loss": 0.0001, + "step": 15042 + }, + { + "epoch": 3.9684738161192454, + "grad_norm": 0.026685649529099464, + "learning_rate": 1.3407778657053256e-05, + "loss": 0.0002, + "step": 15044 + }, + { + "epoch": 3.969001450995911, + "grad_norm": 0.0029142473358660936, + "learning_rate": 1.3406606289750579e-05, + "loss": 0.0002, + "step": 15046 + }, + { + "epoch": 3.9695290858725762, + "grad_norm": 0.002773748245090246, + "learning_rate": 1.3405433922447903e-05, + "loss": 0.0001, + "step": 15048 + }, + { + "epoch": 3.9700567207492417, + "grad_norm": 0.31219568848609924, + "learning_rate": 1.3404261555145229e-05, + "loss": 0.0025, + "step": 15050 + }, + { + "epoch": 3.970584355625907, + "grad_norm": 0.00590544193983078, + "learning_rate": 1.3403089187842553e-05, + "loss": 0.0022, + "step": 15052 + }, + { + "epoch": 3.971111990502572, + "grad_norm": 0.004519733600318432, + "learning_rate": 1.3401916820539876e-05, + "loss": 0.0002, + "step": 15054 + }, + { + "epoch": 3.9716396253792374, + "grad_norm": 0.004168051760643721, + "learning_rate": 1.34007444532372e-05, + "loss": 0.0001, + "step": 15056 + }, + { + "epoch": 3.972167260255903, + "grad_norm": 0.002351196249946952, + "learning_rate": 1.3399572085934525e-05, + "loss": 0.0001, + "step": 15058 + }, + { + "epoch": 3.9726948951325682, + "grad_norm": 0.22801721096038818, + "learning_rate": 1.3398399718631849e-05, + "loss": 0.018, + "step": 15060 + }, + { + "epoch": 3.9732225300092336, + "grad_norm": 0.010084046050906181, + "learning_rate": 1.3397227351329171e-05, + "loss": 0.0002, + "step": 15062 + }, + { + "epoch": 3.973750164885899, + "grad_norm": 0.007496823091059923, + "learning_rate": 1.3396054984026496e-05, + "loss": 0.0001, + "step": 15064 + }, + { + "epoch": 3.9742777997625645, + "grad_norm": 0.09471915662288666, + "learning_rate": 1.3394882616723822e-05, + "loss": 0.0015, + "step": 15066 + }, + { + "epoch": 3.9748054346392294, + "grad_norm": 0.1997969001531601, + "learning_rate": 1.3393710249421146e-05, + "loss": 0.0023, + "step": 15068 + }, + { + "epoch": 3.9753330695158953, + "grad_norm": 0.004659629892557859, + "learning_rate": 1.3392537882118469e-05, + "loss": 0.0001, + "step": 15070 + }, + { + "epoch": 3.9758607043925602, + "grad_norm": 0.12437045574188232, + "learning_rate": 1.3391365514815793e-05, + "loss": 0.002, + "step": 15072 + }, + { + "epoch": 3.9763883392692256, + "grad_norm": 0.011642537079751492, + "learning_rate": 1.3390193147513117e-05, + "loss": 0.0003, + "step": 15074 + }, + { + "epoch": 3.976915974145891, + "grad_norm": 0.22609427571296692, + "learning_rate": 1.338902078021044e-05, + "loss": 0.0001, + "step": 15076 + }, + { + "epoch": 3.9774436090225564, + "grad_norm": 0.060207072645425797, + "learning_rate": 1.3387848412907764e-05, + "loss": 0.0044, + "step": 15078 + }, + { + "epoch": 3.977971243899222, + "grad_norm": 0.08422323316335678, + "learning_rate": 1.3386676045605088e-05, + "loss": 0.0039, + "step": 15080 + }, + { + "epoch": 3.978498878775887, + "grad_norm": 0.003685818752273917, + "learning_rate": 1.3385503678302414e-05, + "loss": 0.0001, + "step": 15082 + }, + { + "epoch": 3.9790265136525527, + "grad_norm": 0.22552357614040375, + "learning_rate": 1.3384331310999737e-05, + "loss": 0.0009, + "step": 15084 + }, + { + "epoch": 3.9795541485292176, + "grad_norm": 0.004547953139990568, + "learning_rate": 1.3383158943697061e-05, + "loss": 0.0001, + "step": 15086 + }, + { + "epoch": 3.980081783405883, + "grad_norm": 0.05659684166312218, + "learning_rate": 1.3381986576394386e-05, + "loss": 0.0006, + "step": 15088 + }, + { + "epoch": 3.9806094182825484, + "grad_norm": 0.04912148788571358, + "learning_rate": 1.338081420909171e-05, + "loss": 0.0038, + "step": 15090 + }, + { + "epoch": 3.981137053159214, + "grad_norm": 0.003866287413984537, + "learning_rate": 1.3379641841789033e-05, + "loss": 0.0013, + "step": 15092 + }, + { + "epoch": 3.9816646880358793, + "grad_norm": 0.008992412127554417, + "learning_rate": 1.3378469474486357e-05, + "loss": 0.0002, + "step": 15094 + }, + { + "epoch": 3.9821923229125447, + "grad_norm": 0.24338138103485107, + "learning_rate": 1.3377297107183683e-05, + "loss": 0.0027, + "step": 15096 + }, + { + "epoch": 3.98271995778921, + "grad_norm": 0.02189248986542225, + "learning_rate": 1.3376124739881007e-05, + "loss": 0.0002, + "step": 15098 + }, + { + "epoch": 3.983247592665875, + "grad_norm": 0.005750944372266531, + "learning_rate": 1.337495237257833e-05, + "loss": 0.0026, + "step": 15100 + }, + { + "epoch": 3.9837752275425404, + "grad_norm": 0.0011337802279740572, + "learning_rate": 1.3373780005275654e-05, + "loss": 0.0001, + "step": 15102 + }, + { + "epoch": 3.984302862419206, + "grad_norm": 0.18299853801727295, + "learning_rate": 1.3372607637972978e-05, + "loss": 0.0022, + "step": 15104 + }, + { + "epoch": 3.9848304972958712, + "grad_norm": 0.0034336771350353956, + "learning_rate": 1.3371435270670301e-05, + "loss": 0.0001, + "step": 15106 + }, + { + "epoch": 3.9853581321725366, + "grad_norm": 0.12666937708854675, + "learning_rate": 1.3370262903367625e-05, + "loss": 0.0006, + "step": 15108 + }, + { + "epoch": 3.985885767049202, + "grad_norm": 0.0017748059472069144, + "learning_rate": 1.336909053606495e-05, + "loss": 0.0001, + "step": 15110 + }, + { + "epoch": 3.9864134019258675, + "grad_norm": 0.003911977168172598, + "learning_rate": 1.3367918168762276e-05, + "loss": 0.0001, + "step": 15112 + }, + { + "epoch": 3.9869410368025324, + "grad_norm": 0.4291995167732239, + "learning_rate": 1.3366745801459598e-05, + "loss": 0.002, + "step": 15114 + }, + { + "epoch": 3.987468671679198, + "grad_norm": 0.0015181900234892964, + "learning_rate": 1.3365573434156923e-05, + "loss": 0.0001, + "step": 15116 + }, + { + "epoch": 3.9879963065558632, + "grad_norm": 0.22531576454639435, + "learning_rate": 1.3364401066854247e-05, + "loss": 0.0024, + "step": 15118 + }, + { + "epoch": 3.9885239414325286, + "grad_norm": 0.013400672934949398, + "learning_rate": 1.3363228699551571e-05, + "loss": 0.0001, + "step": 15120 + }, + { + "epoch": 3.989051576309194, + "grad_norm": 0.1486029028892517, + "learning_rate": 1.3362056332248894e-05, + "loss": 0.0004, + "step": 15122 + }, + { + "epoch": 3.9895792111858595, + "grad_norm": 0.36068031191825867, + "learning_rate": 1.3360883964946218e-05, + "loss": 0.0005, + "step": 15124 + }, + { + "epoch": 3.990106846062525, + "grad_norm": 0.14037247002124786, + "learning_rate": 1.3359711597643542e-05, + "loss": 0.0055, + "step": 15126 + }, + { + "epoch": 3.99063448093919, + "grad_norm": 0.23174108564853668, + "learning_rate": 1.3358539230340868e-05, + "loss": 0.0049, + "step": 15128 + }, + { + "epoch": 3.9911621158158557, + "grad_norm": 0.33077841997146606, + "learning_rate": 1.3357366863038191e-05, + "loss": 0.0051, + "step": 15130 + }, + { + "epoch": 3.9916897506925206, + "grad_norm": 0.009519876912236214, + "learning_rate": 1.3356194495735515e-05, + "loss": 0.0001, + "step": 15132 + }, + { + "epoch": 3.992217385569186, + "grad_norm": 0.0070053222589194775, + "learning_rate": 1.335502212843284e-05, + "loss": 0.0001, + "step": 15134 + }, + { + "epoch": 3.9927450204458514, + "grad_norm": 0.026665784418582916, + "learning_rate": 1.3353849761130162e-05, + "loss": 0.0003, + "step": 15136 + }, + { + "epoch": 3.993272655322517, + "grad_norm": 0.01163627952337265, + "learning_rate": 1.3352677393827487e-05, + "loss": 0.0002, + "step": 15138 + }, + { + "epoch": 3.9938002901991823, + "grad_norm": 0.007813025265932083, + "learning_rate": 1.335150502652481e-05, + "loss": 0.0002, + "step": 15140 + }, + { + "epoch": 3.9943279250758477, + "grad_norm": 0.022641699761152267, + "learning_rate": 1.3350332659222137e-05, + "loss": 0.0004, + "step": 15142 + }, + { + "epoch": 3.994855559952513, + "grad_norm": 0.05334793031215668, + "learning_rate": 1.3349160291919458e-05, + "loss": 0.0044, + "step": 15144 + }, + { + "epoch": 3.995383194829178, + "grad_norm": 0.015593692660331726, + "learning_rate": 1.3347987924616784e-05, + "loss": 0.0002, + "step": 15146 + }, + { + "epoch": 3.9959108297058434, + "grad_norm": 0.012570114806294441, + "learning_rate": 1.3346815557314108e-05, + "loss": 0.0003, + "step": 15148 + }, + { + "epoch": 3.996438464582509, + "grad_norm": 0.008545254357159138, + "learning_rate": 1.3345643190011432e-05, + "loss": 0.0002, + "step": 15150 + }, + { + "epoch": 3.9969660994591742, + "grad_norm": 0.057266898453235626, + "learning_rate": 1.3344470822708755e-05, + "loss": 0.0021, + "step": 15152 + }, + { + "epoch": 3.9974937343358397, + "grad_norm": 0.014180850237607956, + "learning_rate": 1.334329845540608e-05, + "loss": 0.0002, + "step": 15154 + }, + { + "epoch": 3.998021369212505, + "grad_norm": 0.03419080376625061, + "learning_rate": 1.3342126088103404e-05, + "loss": 0.0003, + "step": 15156 + }, + { + "epoch": 3.9985490040891705, + "grad_norm": 0.0070443362928926945, + "learning_rate": 1.334095372080073e-05, + "loss": 0.0003, + "step": 15158 + }, + { + "epoch": 3.9990766389658354, + "grad_norm": 0.010190690867602825, + "learning_rate": 1.3339781353498052e-05, + "loss": 0.0002, + "step": 15160 + }, + { + "epoch": 3.999604273842501, + "grad_norm": 0.013317078351974487, + "learning_rate": 1.3338608986195376e-05, + "loss": 0.0002, + "step": 15162 + }, + { + "epoch": 4.0, + "grad_norm": 0.00995288323611021, + "learning_rate": 1.33374366188927e-05, + "loss": 0.002, + "step": 15164 + }, + { + "epoch": 4.000527634876665, + "grad_norm": 0.009225658141076565, + "learning_rate": 1.3336264251590023e-05, + "loss": 0.0002, + "step": 15166 + }, + { + "epoch": 4.001055269753331, + "grad_norm": 0.05081173777580261, + "learning_rate": 1.3335091884287348e-05, + "loss": 0.0016, + "step": 15168 + }, + { + "epoch": 4.001582904629996, + "grad_norm": 0.0013967931736260653, + "learning_rate": 1.3333919516984672e-05, + "loss": 0.003, + "step": 15170 + }, + { + "epoch": 4.002110539506662, + "grad_norm": 0.004454005043953657, + "learning_rate": 1.3332747149681996e-05, + "loss": 0.0015, + "step": 15172 + }, + { + "epoch": 4.002638174383327, + "grad_norm": 0.01065692026168108, + "learning_rate": 1.3331574782379319e-05, + "loss": 0.0002, + "step": 15174 + }, + { + "epoch": 4.003165809259992, + "grad_norm": 0.0032159648835659027, + "learning_rate": 1.3330402415076645e-05, + "loss": 0.0002, + "step": 15176 + }, + { + "epoch": 4.003693444136657, + "grad_norm": 0.010800892487168312, + "learning_rate": 1.332923004777397e-05, + "loss": 0.0002, + "step": 15178 + }, + { + "epoch": 4.004221079013322, + "grad_norm": 0.17849749326705933, + "learning_rate": 1.3328057680471294e-05, + "loss": 0.0009, + "step": 15180 + }, + { + "epoch": 4.004748713889988, + "grad_norm": 0.005418187007308006, + "learning_rate": 1.3326885313168616e-05, + "loss": 0.0001, + "step": 15182 + }, + { + "epoch": 4.005276348766653, + "grad_norm": 0.05606290325522423, + "learning_rate": 1.332571294586594e-05, + "loss": 0.0005, + "step": 15184 + }, + { + "epoch": 4.005803983643319, + "grad_norm": 0.0035412325523793697, + "learning_rate": 1.3324540578563265e-05, + "loss": 0.0008, + "step": 15186 + }, + { + "epoch": 4.006331618519984, + "grad_norm": 0.00972820445895195, + "learning_rate": 1.332336821126059e-05, + "loss": 0.0003, + "step": 15188 + }, + { + "epoch": 4.00685925339665, + "grad_norm": 0.004559183493256569, + "learning_rate": 1.3322195843957912e-05, + "loss": 0.0002, + "step": 15190 + }, + { + "epoch": 4.007386888273315, + "grad_norm": 0.005124215502291918, + "learning_rate": 1.3321023476655238e-05, + "loss": 0.0036, + "step": 15192 + }, + { + "epoch": 4.007914523149981, + "grad_norm": 0.0023602107539772987, + "learning_rate": 1.3319851109352562e-05, + "loss": 0.0001, + "step": 15194 + }, + { + "epoch": 4.008442158026646, + "grad_norm": 0.008937584236264229, + "learning_rate": 1.3318678742049886e-05, + "loss": 0.0002, + "step": 15196 + }, + { + "epoch": 4.008969792903311, + "grad_norm": 0.0021123362239450216, + "learning_rate": 1.3317506374747209e-05, + "loss": 0.0001, + "step": 15198 + }, + { + "epoch": 4.009497427779976, + "grad_norm": 0.002478984883055091, + "learning_rate": 1.3316334007444533e-05, + "loss": 0.0001, + "step": 15200 + }, + { + "epoch": 4.010025062656641, + "grad_norm": 0.0017048751469701529, + "learning_rate": 1.3315161640141857e-05, + "loss": 0.0, + "step": 15202 + }, + { + "epoch": 4.010552697533307, + "grad_norm": 0.0009081019088625908, + "learning_rate": 1.331398927283918e-05, + "loss": 0.0001, + "step": 15204 + }, + { + "epoch": 4.011080332409972, + "grad_norm": 0.007187469396740198, + "learning_rate": 1.3312816905536506e-05, + "loss": 0.0001, + "step": 15206 + }, + { + "epoch": 4.011607967286638, + "grad_norm": 0.014444404281675816, + "learning_rate": 1.331164453823383e-05, + "loss": 0.0001, + "step": 15208 + }, + { + "epoch": 4.012135602163303, + "grad_norm": 0.0030194264836609364, + "learning_rate": 1.3310472170931155e-05, + "loss": 0.0002, + "step": 15210 + }, + { + "epoch": 4.012663237039968, + "grad_norm": 0.10653707385063171, + "learning_rate": 1.3309299803628477e-05, + "loss": 0.0004, + "step": 15212 + }, + { + "epoch": 4.013190871916634, + "grad_norm": 0.023691676557064056, + "learning_rate": 1.3308127436325802e-05, + "loss": 0.0002, + "step": 15214 + }, + { + "epoch": 4.013718506793299, + "grad_norm": 0.018418122082948685, + "learning_rate": 1.3306955069023126e-05, + "loss": 0.0001, + "step": 15216 + }, + { + "epoch": 4.014246141669965, + "grad_norm": 0.02929682657122612, + "learning_rate": 1.3305782701720452e-05, + "loss": 0.0002, + "step": 15218 + }, + { + "epoch": 4.01477377654663, + "grad_norm": 0.0017398445634171367, + "learning_rate": 1.3304610334417773e-05, + "loss": 0.0001, + "step": 15220 + }, + { + "epoch": 4.015301411423295, + "grad_norm": 0.002498262794688344, + "learning_rate": 1.3303437967115099e-05, + "loss": 0.0001, + "step": 15222 + }, + { + "epoch": 4.01582904629996, + "grad_norm": 0.001746327499859035, + "learning_rate": 1.3302265599812423e-05, + "loss": 0.0, + "step": 15224 + }, + { + "epoch": 4.016356681176625, + "grad_norm": 0.009980903007090092, + "learning_rate": 1.3301093232509747e-05, + "loss": 0.0002, + "step": 15226 + }, + { + "epoch": 4.016884316053291, + "grad_norm": 0.002579951658844948, + "learning_rate": 1.329992086520707e-05, + "loss": 0.0001, + "step": 15228 + }, + { + "epoch": 4.017411950929956, + "grad_norm": 0.0030029877088963985, + "learning_rate": 1.3298748497904394e-05, + "loss": 0.0001, + "step": 15230 + }, + { + "epoch": 4.017939585806622, + "grad_norm": 0.08613691478967667, + "learning_rate": 1.3297576130601719e-05, + "loss": 0.0024, + "step": 15232 + }, + { + "epoch": 4.018467220683287, + "grad_norm": 0.04925110563635826, + "learning_rate": 1.3296403763299041e-05, + "loss": 0.0041, + "step": 15234 + }, + { + "epoch": 4.018994855559953, + "grad_norm": 0.36063191294670105, + "learning_rate": 1.3295231395996366e-05, + "loss": 0.0048, + "step": 15236 + }, + { + "epoch": 4.019522490436618, + "grad_norm": 0.2555639445781708, + "learning_rate": 1.3294059028693692e-05, + "loss": 0.0009, + "step": 15238 + }, + { + "epoch": 4.020050125313283, + "grad_norm": 0.025852471590042114, + "learning_rate": 1.3292886661391016e-05, + "loss": 0.0003, + "step": 15240 + }, + { + "epoch": 4.020577760189949, + "grad_norm": 0.08291608840227127, + "learning_rate": 1.3291714294088338e-05, + "loss": 0.0026, + "step": 15242 + }, + { + "epoch": 4.021105395066614, + "grad_norm": 0.0008642249158583581, + "learning_rate": 1.3290541926785663e-05, + "loss": 0.0001, + "step": 15244 + }, + { + "epoch": 4.021633029943279, + "grad_norm": 0.19730712473392487, + "learning_rate": 1.3289369559482987e-05, + "loss": 0.0019, + "step": 15246 + }, + { + "epoch": 4.022160664819944, + "grad_norm": 0.002253951271995902, + "learning_rate": 1.3288197192180311e-05, + "loss": 0.0078, + "step": 15248 + }, + { + "epoch": 4.02268829969661, + "grad_norm": 0.04258657619357109, + "learning_rate": 1.3287024824877634e-05, + "loss": 0.0015, + "step": 15250 + }, + { + "epoch": 4.023215934573275, + "grad_norm": 0.025795284658670425, + "learning_rate": 1.328585245757496e-05, + "loss": 0.0001, + "step": 15252 + }, + { + "epoch": 4.023743569449941, + "grad_norm": 0.0012531987158581614, + "learning_rate": 1.3284680090272284e-05, + "loss": 0.0001, + "step": 15254 + }, + { + "epoch": 4.024271204326606, + "grad_norm": 0.06133928894996643, + "learning_rate": 1.3283507722969609e-05, + "loss": 0.0017, + "step": 15256 + }, + { + "epoch": 4.024798839203271, + "grad_norm": 0.0012946963543072343, + "learning_rate": 1.3282335355666931e-05, + "loss": 0.0001, + "step": 15258 + }, + { + "epoch": 4.025326474079937, + "grad_norm": 0.05359594523906708, + "learning_rate": 1.3281162988364255e-05, + "loss": 0.0023, + "step": 15260 + }, + { + "epoch": 4.025854108956602, + "grad_norm": 0.015672072768211365, + "learning_rate": 1.327999062106158e-05, + "loss": 0.0006, + "step": 15262 + }, + { + "epoch": 4.026381743833268, + "grad_norm": 0.021889278665184975, + "learning_rate": 1.3278818253758902e-05, + "loss": 0.0001, + "step": 15264 + }, + { + "epoch": 4.026909378709933, + "grad_norm": 0.08525523543357849, + "learning_rate": 1.3277645886456227e-05, + "loss": 0.0013, + "step": 15266 + }, + { + "epoch": 4.0274370135865984, + "grad_norm": 0.34208229184150696, + "learning_rate": 1.3276473519153553e-05, + "loss": 0.0005, + "step": 15268 + }, + { + "epoch": 4.027964648463263, + "grad_norm": 0.004145155195146799, + "learning_rate": 1.3275301151850877e-05, + "loss": 0.0025, + "step": 15270 + }, + { + "epoch": 4.028492283339928, + "grad_norm": 0.0014970471384003758, + "learning_rate": 1.32741287845482e-05, + "loss": 0.0001, + "step": 15272 + }, + { + "epoch": 4.029019918216594, + "grad_norm": 0.002969795372337103, + "learning_rate": 1.3272956417245524e-05, + "loss": 0.0006, + "step": 15274 + }, + { + "epoch": 4.029547553093259, + "grad_norm": 0.0035834850277751684, + "learning_rate": 1.3271784049942848e-05, + "loss": 0.0001, + "step": 15276 + }, + { + "epoch": 4.030075187969925, + "grad_norm": 0.0020770994015038013, + "learning_rate": 1.3270611682640173e-05, + "loss": 0.0001, + "step": 15278 + }, + { + "epoch": 4.03060282284659, + "grad_norm": 0.029133878648281097, + "learning_rate": 1.3269439315337495e-05, + "loss": 0.0016, + "step": 15280 + }, + { + "epoch": 4.031130457723256, + "grad_norm": 0.033301617950201035, + "learning_rate": 1.326826694803482e-05, + "loss": 0.0003, + "step": 15282 + }, + { + "epoch": 4.031658092599921, + "grad_norm": 0.002699842443689704, + "learning_rate": 1.3267094580732145e-05, + "loss": 0.0003, + "step": 15284 + }, + { + "epoch": 4.032185727476586, + "grad_norm": 0.002302297856658697, + "learning_rate": 1.326592221342947e-05, + "loss": 0.0001, + "step": 15286 + }, + { + "epoch": 4.032713362353252, + "grad_norm": 0.010125968605279922, + "learning_rate": 1.3264749846126792e-05, + "loss": 0.0001, + "step": 15288 + }, + { + "epoch": 4.033240997229917, + "grad_norm": 0.37221378087997437, + "learning_rate": 1.3263577478824117e-05, + "loss": 0.0008, + "step": 15290 + }, + { + "epoch": 4.033768632106582, + "grad_norm": 0.2485513538122177, + "learning_rate": 1.3262405111521441e-05, + "loss": 0.0018, + "step": 15292 + }, + { + "epoch": 4.034296266983247, + "grad_norm": 0.014063111506402493, + "learning_rate": 1.3261232744218764e-05, + "loss": 0.0001, + "step": 15294 + }, + { + "epoch": 4.034823901859913, + "grad_norm": 0.13838979601860046, + "learning_rate": 1.3260060376916088e-05, + "loss": 0.0027, + "step": 15296 + }, + { + "epoch": 4.035351536736578, + "grad_norm": 0.09412360936403275, + "learning_rate": 1.3258888009613414e-05, + "loss": 0.0004, + "step": 15298 + }, + { + "epoch": 4.035879171613244, + "grad_norm": 0.003836988937109709, + "learning_rate": 1.3257715642310738e-05, + "loss": 0.0001, + "step": 15300 + }, + { + "epoch": 4.036406806489909, + "grad_norm": 0.30206069350242615, + "learning_rate": 1.325654327500806e-05, + "loss": 0.0014, + "step": 15302 + }, + { + "epoch": 4.036934441366574, + "grad_norm": 0.007204712834209204, + "learning_rate": 1.3255370907705385e-05, + "loss": 0.0002, + "step": 15304 + }, + { + "epoch": 4.03746207624324, + "grad_norm": 0.003934894222766161, + "learning_rate": 1.325419854040271e-05, + "loss": 0.0, + "step": 15306 + }, + { + "epoch": 4.037989711119905, + "grad_norm": 0.0026154271326959133, + "learning_rate": 1.3253026173100034e-05, + "loss": 0.0001, + "step": 15308 + }, + { + "epoch": 4.038517345996571, + "grad_norm": 0.0056159403175115585, + "learning_rate": 1.3251853805797356e-05, + "loss": 0.0001, + "step": 15310 + }, + { + "epoch": 4.039044980873236, + "grad_norm": 0.0013140825321897864, + "learning_rate": 1.325068143849468e-05, + "loss": 0.0001, + "step": 15312 + }, + { + "epoch": 4.0395726157499015, + "grad_norm": 0.10881314426660538, + "learning_rate": 1.3249509071192007e-05, + "loss": 0.0011, + "step": 15314 + }, + { + "epoch": 4.040100250626566, + "grad_norm": 0.02000448666512966, + "learning_rate": 1.3248336703889331e-05, + "loss": 0.0001, + "step": 15316 + }, + { + "epoch": 4.040627885503231, + "grad_norm": 0.0023947441950440407, + "learning_rate": 1.3247164336586654e-05, + "loss": 0.0001, + "step": 15318 + }, + { + "epoch": 4.041155520379897, + "grad_norm": 0.0015868849586695433, + "learning_rate": 1.3245991969283978e-05, + "loss": 0.0001, + "step": 15320 + }, + { + "epoch": 4.041683155256562, + "grad_norm": 0.006118504796177149, + "learning_rate": 1.3244819601981302e-05, + "loss": 0.0001, + "step": 15322 + }, + { + "epoch": 4.042210790133228, + "grad_norm": 0.1588916927576065, + "learning_rate": 1.3243647234678625e-05, + "loss": 0.0034, + "step": 15324 + }, + { + "epoch": 4.042738425009893, + "grad_norm": 0.0010501369833946228, + "learning_rate": 1.3242474867375949e-05, + "loss": 0.0001, + "step": 15326 + }, + { + "epoch": 4.043266059886559, + "grad_norm": 0.0009680298389866948, + "learning_rate": 1.3241302500073275e-05, + "loss": 0.0001, + "step": 15328 + }, + { + "epoch": 4.043793694763224, + "grad_norm": 0.033697254955768585, + "learning_rate": 1.32401301327706e-05, + "loss": 0.0005, + "step": 15330 + }, + { + "epoch": 4.044321329639889, + "grad_norm": 0.009384039789438248, + "learning_rate": 1.3238957765467922e-05, + "loss": 0.0001, + "step": 15332 + }, + { + "epoch": 4.044848964516555, + "grad_norm": 0.2832116484642029, + "learning_rate": 1.3237785398165246e-05, + "loss": 0.0012, + "step": 15334 + }, + { + "epoch": 4.04537659939322, + "grad_norm": 0.005319125019013882, + "learning_rate": 1.323661303086257e-05, + "loss": 0.0001, + "step": 15336 + }, + { + "epoch": 4.045904234269885, + "grad_norm": 0.002320067258551717, + "learning_rate": 1.3235440663559895e-05, + "loss": 0.0001, + "step": 15338 + }, + { + "epoch": 4.04643186914655, + "grad_norm": 0.00619850168004632, + "learning_rate": 1.3234268296257217e-05, + "loss": 0.0002, + "step": 15340 + }, + { + "epoch": 4.046959504023216, + "grad_norm": 0.016891608014702797, + "learning_rate": 1.3233095928954542e-05, + "loss": 0.0001, + "step": 15342 + }, + { + "epoch": 4.047487138899881, + "grad_norm": 0.0016669717151671648, + "learning_rate": 1.3231923561651868e-05, + "loss": 0.0004, + "step": 15344 + }, + { + "epoch": 4.048014773776547, + "grad_norm": 0.2228674739599228, + "learning_rate": 1.3230751194349192e-05, + "loss": 0.0014, + "step": 15346 + }, + { + "epoch": 4.048542408653212, + "grad_norm": 0.004366709850728512, + "learning_rate": 1.3229578827046515e-05, + "loss": 0.0001, + "step": 15348 + }, + { + "epoch": 4.049070043529877, + "grad_norm": 0.012724676169455051, + "learning_rate": 1.3228406459743839e-05, + "loss": 0.0001, + "step": 15350 + }, + { + "epoch": 4.049597678406543, + "grad_norm": 0.003434449201449752, + "learning_rate": 1.3227234092441163e-05, + "loss": 0.0001, + "step": 15352 + }, + { + "epoch": 4.050125313283208, + "grad_norm": 0.010336434468626976, + "learning_rate": 1.3226061725138486e-05, + "loss": 0.0001, + "step": 15354 + }, + { + "epoch": 4.050652948159874, + "grad_norm": 0.16177089512348175, + "learning_rate": 1.322488935783581e-05, + "loss": 0.008, + "step": 15356 + }, + { + "epoch": 4.051180583036539, + "grad_norm": 0.0420273132622242, + "learning_rate": 1.3223716990533135e-05, + "loss": 0.0001, + "step": 15358 + }, + { + "epoch": 4.0517082179132045, + "grad_norm": 0.023458966985344887, + "learning_rate": 1.322254462323046e-05, + "loss": 0.0001, + "step": 15360 + }, + { + "epoch": 4.052235852789869, + "grad_norm": 0.0012740399688482285, + "learning_rate": 1.3221372255927783e-05, + "loss": 0.0001, + "step": 15362 + }, + { + "epoch": 4.052763487666534, + "grad_norm": 0.07945076376199722, + "learning_rate": 1.3220199888625107e-05, + "loss": 0.0005, + "step": 15364 + }, + { + "epoch": 4.0532911225432, + "grad_norm": 0.001195566263049841, + "learning_rate": 1.3219027521322432e-05, + "loss": 0.0001, + "step": 15366 + }, + { + "epoch": 4.053818757419865, + "grad_norm": 0.0015988522209227085, + "learning_rate": 1.3217855154019756e-05, + "loss": 0.0001, + "step": 15368 + }, + { + "epoch": 4.054346392296531, + "grad_norm": 0.0027340834494680166, + "learning_rate": 1.3216682786717079e-05, + "loss": 0.0001, + "step": 15370 + }, + { + "epoch": 4.054874027173196, + "grad_norm": 0.005646685604006052, + "learning_rate": 1.3215510419414403e-05, + "loss": 0.0005, + "step": 15372 + }, + { + "epoch": 4.055401662049862, + "grad_norm": 0.0004942418308928609, + "learning_rate": 1.3214338052111729e-05, + "loss": 0.0, + "step": 15374 + }, + { + "epoch": 4.055929296926527, + "grad_norm": 0.0012316845823079348, + "learning_rate": 1.3213165684809053e-05, + "loss": 0.0002, + "step": 15376 + }, + { + "epoch": 4.056456931803192, + "grad_norm": 0.0010415895376354456, + "learning_rate": 1.3211993317506376e-05, + "loss": 0.002, + "step": 15378 + }, + { + "epoch": 4.056984566679858, + "grad_norm": 0.0013498847838491201, + "learning_rate": 1.32108209502037e-05, + "loss": 0.0, + "step": 15380 + }, + { + "epoch": 4.057512201556523, + "grad_norm": 0.0009853998199105263, + "learning_rate": 1.3209648582901024e-05, + "loss": 0.0001, + "step": 15382 + }, + { + "epoch": 4.058039836433188, + "grad_norm": 0.0009410151978954673, + "learning_rate": 1.3208476215598347e-05, + "loss": 0.0, + "step": 15384 + }, + { + "epoch": 4.058567471309853, + "grad_norm": 0.27912190556526184, + "learning_rate": 1.3207303848295671e-05, + "loss": 0.0016, + "step": 15386 + }, + { + "epoch": 4.059095106186519, + "grad_norm": 0.004083642736077309, + "learning_rate": 1.3206131480992996e-05, + "loss": 0.0002, + "step": 15388 + }, + { + "epoch": 4.059622741063184, + "grad_norm": 0.004838280845433474, + "learning_rate": 1.3204959113690322e-05, + "loss": 0.0001, + "step": 15390 + }, + { + "epoch": 4.06015037593985, + "grad_norm": 0.002894169883802533, + "learning_rate": 1.3203786746387644e-05, + "loss": 0.0001, + "step": 15392 + }, + { + "epoch": 4.060678010816515, + "grad_norm": 0.002105585765093565, + "learning_rate": 1.3202614379084969e-05, + "loss": 0.0001, + "step": 15394 + }, + { + "epoch": 4.06120564569318, + "grad_norm": 0.004383400548249483, + "learning_rate": 1.3201442011782293e-05, + "loss": 0.0001, + "step": 15396 + }, + { + "epoch": 4.061733280569846, + "grad_norm": 0.045180972665548325, + "learning_rate": 1.3200269644479617e-05, + "loss": 0.0006, + "step": 15398 + }, + { + "epoch": 4.062260915446511, + "grad_norm": 0.037854380905628204, + "learning_rate": 1.319909727717694e-05, + "loss": 0.0002, + "step": 15400 + }, + { + "epoch": 4.062788550323177, + "grad_norm": 0.038657158613204956, + "learning_rate": 1.3197924909874264e-05, + "loss": 0.0007, + "step": 15402 + }, + { + "epoch": 4.063316185199842, + "grad_norm": 0.002741442294791341, + "learning_rate": 1.3196752542571588e-05, + "loss": 0.0002, + "step": 15404 + }, + { + "epoch": 4.0638438200765075, + "grad_norm": 0.15474063158035278, + "learning_rate": 1.3195580175268914e-05, + "loss": 0.0012, + "step": 15406 + }, + { + "epoch": 4.064371454953172, + "grad_norm": 0.22338221967220306, + "learning_rate": 1.3194407807966237e-05, + "loss": 0.0016, + "step": 15408 + }, + { + "epoch": 4.064899089829837, + "grad_norm": 0.02920624427497387, + "learning_rate": 1.3193235440663561e-05, + "loss": 0.0001, + "step": 15410 + }, + { + "epoch": 4.065426724706503, + "grad_norm": 0.0008252039551734924, + "learning_rate": 1.3192063073360886e-05, + "loss": 0.0001, + "step": 15412 + }, + { + "epoch": 4.065954359583168, + "grad_norm": 0.008136224932968616, + "learning_rate": 1.3190890706058208e-05, + "loss": 0.0001, + "step": 15414 + }, + { + "epoch": 4.066481994459834, + "grad_norm": 0.0013850268442183733, + "learning_rate": 1.3189718338755533e-05, + "loss": 0.0001, + "step": 15416 + }, + { + "epoch": 4.067009629336499, + "grad_norm": 0.0006918342551216483, + "learning_rate": 1.3188545971452857e-05, + "loss": 0.0021, + "step": 15418 + }, + { + "epoch": 4.067537264213165, + "grad_norm": 0.0202775988727808, + "learning_rate": 1.3187373604150183e-05, + "loss": 0.0, + "step": 15420 + }, + { + "epoch": 4.06806489908983, + "grad_norm": 0.008062735199928284, + "learning_rate": 1.3186201236847504e-05, + "loss": 0.0038, + "step": 15422 + }, + { + "epoch": 4.068592533966495, + "grad_norm": 0.0035700714215636253, + "learning_rate": 1.318502886954483e-05, + "loss": 0.0, + "step": 15424 + }, + { + "epoch": 4.069120168843161, + "grad_norm": 0.017712336033582687, + "learning_rate": 1.3183856502242154e-05, + "loss": 0.0001, + "step": 15426 + }, + { + "epoch": 4.069647803719826, + "grad_norm": 0.0069413334131240845, + "learning_rate": 1.3182684134939478e-05, + "loss": 0.0118, + "step": 15428 + }, + { + "epoch": 4.0701754385964914, + "grad_norm": 0.0018116496503353119, + "learning_rate": 1.3181511767636801e-05, + "loss": 0.0, + "step": 15430 + }, + { + "epoch": 4.070703073473156, + "grad_norm": 0.12590447068214417, + "learning_rate": 1.3180339400334125e-05, + "loss": 0.0041, + "step": 15432 + }, + { + "epoch": 4.071230708349822, + "grad_norm": 0.004738084971904755, + "learning_rate": 1.317916703303145e-05, + "loss": 0.0014, + "step": 15434 + }, + { + "epoch": 4.071758343226487, + "grad_norm": 0.0015357262454926968, + "learning_rate": 1.3177994665728776e-05, + "loss": 0.0001, + "step": 15436 + }, + { + "epoch": 4.072285978103152, + "grad_norm": 0.002163910074159503, + "learning_rate": 1.3176822298426098e-05, + "loss": 0.0001, + "step": 15438 + }, + { + "epoch": 4.072813612979818, + "grad_norm": 0.026269011199474335, + "learning_rate": 1.3175649931123423e-05, + "loss": 0.0001, + "step": 15440 + }, + { + "epoch": 4.073341247856483, + "grad_norm": 0.2591992914676666, + "learning_rate": 1.3174477563820747e-05, + "loss": 0.0034, + "step": 15442 + }, + { + "epoch": 4.073868882733149, + "grad_norm": 0.010535857640206814, + "learning_rate": 1.317330519651807e-05, + "loss": 0.0023, + "step": 15444 + }, + { + "epoch": 4.074396517609814, + "grad_norm": 0.19528786838054657, + "learning_rate": 1.3172132829215394e-05, + "loss": 0.0014, + "step": 15446 + }, + { + "epoch": 4.07492415248648, + "grad_norm": 0.020290737971663475, + "learning_rate": 1.3170960461912718e-05, + "loss": 0.0004, + "step": 15448 + }, + { + "epoch": 4.075451787363145, + "grad_norm": 0.001656879554502666, + "learning_rate": 1.3169788094610042e-05, + "loss": 0.0001, + "step": 15450 + }, + { + "epoch": 4.0759794222398105, + "grad_norm": 0.0013021946651861072, + "learning_rate": 1.3168615727307365e-05, + "loss": 0.0005, + "step": 15452 + }, + { + "epoch": 4.076507057116475, + "grad_norm": 0.0831322968006134, + "learning_rate": 1.3167443360004691e-05, + "loss": 0.0004, + "step": 15454 + }, + { + "epoch": 4.07703469199314, + "grad_norm": 0.006405011285096407, + "learning_rate": 1.3166270992702015e-05, + "loss": 0.0046, + "step": 15456 + }, + { + "epoch": 4.077562326869806, + "grad_norm": 0.22495262324810028, + "learning_rate": 1.316509862539934e-05, + "loss": 0.008, + "step": 15458 + }, + { + "epoch": 4.078089961746471, + "grad_norm": 0.006631402298808098, + "learning_rate": 1.3163926258096662e-05, + "loss": 0.0001, + "step": 15460 + }, + { + "epoch": 4.078617596623137, + "grad_norm": 0.011691302992403507, + "learning_rate": 1.3162753890793986e-05, + "loss": 0.0031, + "step": 15462 + }, + { + "epoch": 4.079145231499802, + "grad_norm": 0.007336670532822609, + "learning_rate": 1.316158152349131e-05, + "loss": 0.0002, + "step": 15464 + }, + { + "epoch": 4.079672866376468, + "grad_norm": 0.003976788371801376, + "learning_rate": 1.3160409156188637e-05, + "loss": 0.0011, + "step": 15466 + }, + { + "epoch": 4.080200501253133, + "grad_norm": 0.03667692840099335, + "learning_rate": 1.3159236788885958e-05, + "loss": 0.0003, + "step": 15468 + }, + { + "epoch": 4.080728136129798, + "grad_norm": 0.025268441066145897, + "learning_rate": 1.3158064421583284e-05, + "loss": 0.0001, + "step": 15470 + }, + { + "epoch": 4.081255771006464, + "grad_norm": 0.007343456149101257, + "learning_rate": 1.3156892054280608e-05, + "loss": 0.0002, + "step": 15472 + }, + { + "epoch": 4.081783405883129, + "grad_norm": 0.020872358232736588, + "learning_rate": 1.315571968697793e-05, + "loss": 0.0002, + "step": 15474 + }, + { + "epoch": 4.0823110407597945, + "grad_norm": 0.014893791638314724, + "learning_rate": 1.3154547319675255e-05, + "loss": 0.0002, + "step": 15476 + }, + { + "epoch": 4.082838675636459, + "grad_norm": 0.14082171022891998, + "learning_rate": 1.315337495237258e-05, + "loss": 0.0004, + "step": 15478 + }, + { + "epoch": 4.083366310513125, + "grad_norm": 0.04346031695604324, + "learning_rate": 1.3152202585069904e-05, + "loss": 0.0002, + "step": 15480 + }, + { + "epoch": 4.08389394538979, + "grad_norm": 0.005450617987662554, + "learning_rate": 1.3151030217767226e-05, + "loss": 0.0003, + "step": 15482 + }, + { + "epoch": 4.084421580266455, + "grad_norm": 0.0441313199698925, + "learning_rate": 1.3149857850464552e-05, + "loss": 0.0004, + "step": 15484 + }, + { + "epoch": 4.084949215143121, + "grad_norm": 0.00640102056786418, + "learning_rate": 1.3148685483161876e-05, + "loss": 0.0001, + "step": 15486 + }, + { + "epoch": 4.085476850019786, + "grad_norm": 0.012147153727710247, + "learning_rate": 1.31475131158592e-05, + "loss": 0.0001, + "step": 15488 + }, + { + "epoch": 4.086004484896452, + "grad_norm": 0.03470825031399727, + "learning_rate": 1.3146340748556523e-05, + "loss": 0.0002, + "step": 15490 + }, + { + "epoch": 4.086532119773117, + "grad_norm": 0.16406966745853424, + "learning_rate": 1.3145168381253848e-05, + "loss": 0.0006, + "step": 15492 + }, + { + "epoch": 4.087059754649783, + "grad_norm": 0.003366863587871194, + "learning_rate": 1.3143996013951172e-05, + "loss": 0.0001, + "step": 15494 + }, + { + "epoch": 4.087587389526448, + "grad_norm": 0.024572577327489853, + "learning_rate": 1.3142823646648496e-05, + "loss": 0.0004, + "step": 15496 + }, + { + "epoch": 4.0881150244031135, + "grad_norm": 0.007883534766733646, + "learning_rate": 1.3141651279345819e-05, + "loss": 0.0002, + "step": 15498 + }, + { + "epoch": 4.088642659279778, + "grad_norm": 0.001329144462943077, + "learning_rate": 1.3140478912043145e-05, + "loss": 0.0, + "step": 15500 + }, + { + "epoch": 4.089170294156443, + "grad_norm": 0.0042323702946305275, + "learning_rate": 1.313930654474047e-05, + "loss": 0.0001, + "step": 15502 + }, + { + "epoch": 4.089697929033109, + "grad_norm": 0.0032398891635239124, + "learning_rate": 1.3138134177437792e-05, + "loss": 0.0001, + "step": 15504 + }, + { + "epoch": 4.090225563909774, + "grad_norm": 0.0006235394976101816, + "learning_rate": 1.3136961810135116e-05, + "loss": 0.0, + "step": 15506 + }, + { + "epoch": 4.09075319878644, + "grad_norm": 0.0019131904700770974, + "learning_rate": 1.313578944283244e-05, + "loss": 0.0001, + "step": 15508 + }, + { + "epoch": 4.091280833663105, + "grad_norm": 0.015203478746116161, + "learning_rate": 1.3134617075529765e-05, + "loss": 0.0001, + "step": 15510 + }, + { + "epoch": 4.091808468539771, + "grad_norm": 0.0032666262704879045, + "learning_rate": 1.3133444708227087e-05, + "loss": 0.0001, + "step": 15512 + }, + { + "epoch": 4.092336103416436, + "grad_norm": 0.0008493408095091581, + "learning_rate": 1.3132272340924412e-05, + "loss": 0.0041, + "step": 15514 + }, + { + "epoch": 4.092863738293101, + "grad_norm": 0.006861924659460783, + "learning_rate": 1.3131099973621738e-05, + "loss": 0.0001, + "step": 15516 + }, + { + "epoch": 4.093391373169767, + "grad_norm": 0.001008913153782487, + "learning_rate": 1.3129927606319062e-05, + "loss": 0.0031, + "step": 15518 + }, + { + "epoch": 4.093919008046432, + "grad_norm": 0.11946624517440796, + "learning_rate": 1.3128755239016385e-05, + "loss": 0.0134, + "step": 15520 + }, + { + "epoch": 4.0944466429230975, + "grad_norm": 0.0052342843264341354, + "learning_rate": 1.3127582871713709e-05, + "loss": 0.0001, + "step": 15522 + }, + { + "epoch": 4.094974277799762, + "grad_norm": 0.019718846306204796, + "learning_rate": 1.3126410504411033e-05, + "loss": 0.0001, + "step": 15524 + }, + { + "epoch": 4.095501912676428, + "grad_norm": 0.02601725608110428, + "learning_rate": 1.3125238137108357e-05, + "loss": 0.0001, + "step": 15526 + }, + { + "epoch": 4.096029547553093, + "grad_norm": 0.00525417597964406, + "learning_rate": 1.312406576980568e-05, + "loss": 0.0001, + "step": 15528 + }, + { + "epoch": 4.096557182429758, + "grad_norm": 0.005518827587366104, + "learning_rate": 1.3122893402503006e-05, + "loss": 0.0001, + "step": 15530 + }, + { + "epoch": 4.097084817306424, + "grad_norm": 0.0022606453858315945, + "learning_rate": 1.312172103520033e-05, + "loss": 0.001, + "step": 15532 + }, + { + "epoch": 4.097612452183089, + "grad_norm": 0.02963593788444996, + "learning_rate": 1.3120548667897653e-05, + "loss": 0.0001, + "step": 15534 + }, + { + "epoch": 4.098140087059755, + "grad_norm": 0.004993618931621313, + "learning_rate": 1.3119376300594977e-05, + "loss": 0.0001, + "step": 15536 + }, + { + "epoch": 4.09866772193642, + "grad_norm": 0.004700666759163141, + "learning_rate": 1.3118203933292302e-05, + "loss": 0.0001, + "step": 15538 + }, + { + "epoch": 4.099195356813086, + "grad_norm": 0.00284180766902864, + "learning_rate": 1.3117031565989626e-05, + "loss": 0.0003, + "step": 15540 + }, + { + "epoch": 4.099722991689751, + "grad_norm": 0.14283499121665955, + "learning_rate": 1.3115859198686948e-05, + "loss": 0.0009, + "step": 15542 + }, + { + "epoch": 4.100250626566416, + "grad_norm": 0.0013788514770567417, + "learning_rate": 1.3114686831384273e-05, + "loss": 0.0009, + "step": 15544 + }, + { + "epoch": 4.100778261443081, + "grad_norm": 0.013716611079871655, + "learning_rate": 1.3113514464081599e-05, + "loss": 0.0001, + "step": 15546 + }, + { + "epoch": 4.101305896319746, + "grad_norm": 0.005106321536004543, + "learning_rate": 1.3112342096778923e-05, + "loss": 0.0001, + "step": 15548 + }, + { + "epoch": 4.101833531196412, + "grad_norm": 0.0030225501395761967, + "learning_rate": 1.3111169729476246e-05, + "loss": 0.0001, + "step": 15550 + }, + { + "epoch": 4.102361166073077, + "grad_norm": 0.014415538869798183, + "learning_rate": 1.310999736217357e-05, + "loss": 0.0013, + "step": 15552 + }, + { + "epoch": 4.102888800949743, + "grad_norm": 0.0016030516708269715, + "learning_rate": 1.3108824994870894e-05, + "loss": 0.0001, + "step": 15554 + }, + { + "epoch": 4.103416435826408, + "grad_norm": 0.02567782811820507, + "learning_rate": 1.3107652627568219e-05, + "loss": 0.0003, + "step": 15556 + }, + { + "epoch": 4.103944070703074, + "grad_norm": 0.04384177178144455, + "learning_rate": 1.3106480260265541e-05, + "loss": 0.0047, + "step": 15558 + }, + { + "epoch": 4.104471705579739, + "grad_norm": 0.004560254514217377, + "learning_rate": 1.3105307892962866e-05, + "loss": 0.0001, + "step": 15560 + }, + { + "epoch": 4.104999340456404, + "grad_norm": 0.1513758897781372, + "learning_rate": 1.3104135525660191e-05, + "loss": 0.0004, + "step": 15562 + }, + { + "epoch": 4.10552697533307, + "grad_norm": 0.005554235074669123, + "learning_rate": 1.3102963158357514e-05, + "loss": 0.0001, + "step": 15564 + }, + { + "epoch": 4.106054610209735, + "grad_norm": 0.002709906082600355, + "learning_rate": 1.3101790791054838e-05, + "loss": 0.0001, + "step": 15566 + }, + { + "epoch": 4.1065822450864005, + "grad_norm": 0.002839852124452591, + "learning_rate": 1.3100618423752163e-05, + "loss": 0.0002, + "step": 15568 + }, + { + "epoch": 4.107109879963065, + "grad_norm": 0.004678781609982252, + "learning_rate": 1.3099446056449487e-05, + "loss": 0.0001, + "step": 15570 + }, + { + "epoch": 4.107637514839731, + "grad_norm": 0.0017273527337238193, + "learning_rate": 1.309827368914681e-05, + "loss": 0.0058, + "step": 15572 + }, + { + "epoch": 4.108165149716396, + "grad_norm": 0.002415443304926157, + "learning_rate": 1.3097101321844134e-05, + "loss": 0.0001, + "step": 15574 + }, + { + "epoch": 4.108692784593061, + "grad_norm": 0.0699409693479538, + "learning_rate": 1.309592895454146e-05, + "loss": 0.0004, + "step": 15576 + }, + { + "epoch": 4.109220419469727, + "grad_norm": 0.025547703728079796, + "learning_rate": 1.3094756587238784e-05, + "loss": 0.0019, + "step": 15578 + }, + { + "epoch": 4.109748054346392, + "grad_norm": 0.012668807059526443, + "learning_rate": 1.3093584219936107e-05, + "loss": 0.0002, + "step": 15580 + }, + { + "epoch": 4.110275689223058, + "grad_norm": 0.0018764743581414223, + "learning_rate": 1.3092411852633431e-05, + "loss": 0.0001, + "step": 15582 + }, + { + "epoch": 4.110803324099723, + "grad_norm": 0.1952487975358963, + "learning_rate": 1.3091239485330755e-05, + "loss": 0.0017, + "step": 15584 + }, + { + "epoch": 4.111330958976389, + "grad_norm": 0.009499293752014637, + "learning_rate": 1.309006711802808e-05, + "loss": 0.0004, + "step": 15586 + }, + { + "epoch": 4.111858593853054, + "grad_norm": 0.6807006001472473, + "learning_rate": 1.3088894750725402e-05, + "loss": 0.0005, + "step": 15588 + }, + { + "epoch": 4.112386228729719, + "grad_norm": 0.004021158441901207, + "learning_rate": 1.3087722383422727e-05, + "loss": 0.0001, + "step": 15590 + }, + { + "epoch": 4.1129138636063844, + "grad_norm": 0.007960683666169643, + "learning_rate": 1.3086550016120053e-05, + "loss": 0.0006, + "step": 15592 + }, + { + "epoch": 4.113441498483049, + "grad_norm": 0.04832495376467705, + "learning_rate": 1.3085377648817375e-05, + "loss": 0.0004, + "step": 15594 + }, + { + "epoch": 4.113969133359715, + "grad_norm": 0.0014687615912407637, + "learning_rate": 1.30842052815147e-05, + "loss": 0.0021, + "step": 15596 + }, + { + "epoch": 4.11449676823638, + "grad_norm": 0.004367939196527004, + "learning_rate": 1.3083032914212024e-05, + "loss": 0.0012, + "step": 15598 + }, + { + "epoch": 4.115024403113046, + "grad_norm": 0.002302649198099971, + "learning_rate": 1.3081860546909348e-05, + "loss": 0.0, + "step": 15600 + }, + { + "epoch": 4.115552037989711, + "grad_norm": 0.947074830532074, + "learning_rate": 1.308068817960667e-05, + "loss": 0.0057, + "step": 15602 + }, + { + "epoch": 4.116079672866377, + "grad_norm": 0.004949087742716074, + "learning_rate": 1.3079515812303995e-05, + "loss": 0.0001, + "step": 15604 + }, + { + "epoch": 4.116607307743042, + "grad_norm": 0.0009397987159900367, + "learning_rate": 1.307834344500132e-05, + "loss": 0.0, + "step": 15606 + }, + { + "epoch": 4.117134942619707, + "grad_norm": 0.014565974473953247, + "learning_rate": 1.3077171077698645e-05, + "loss": 0.0002, + "step": 15608 + }, + { + "epoch": 4.117662577496373, + "grad_norm": 0.017899250611662865, + "learning_rate": 1.3075998710395968e-05, + "loss": 0.0002, + "step": 15610 + }, + { + "epoch": 4.118190212373038, + "grad_norm": 0.0012007283512502909, + "learning_rate": 1.3074826343093292e-05, + "loss": 0.0, + "step": 15612 + }, + { + "epoch": 4.1187178472497035, + "grad_norm": 0.11349900811910629, + "learning_rate": 1.3073653975790617e-05, + "loss": 0.0005, + "step": 15614 + }, + { + "epoch": 4.119245482126368, + "grad_norm": 0.0005847360007464886, + "learning_rate": 1.3072481608487941e-05, + "loss": 0.0, + "step": 15616 + }, + { + "epoch": 4.119773117003034, + "grad_norm": 0.005893051158636808, + "learning_rate": 1.3071309241185264e-05, + "loss": 0.0003, + "step": 15618 + }, + { + "epoch": 4.120300751879699, + "grad_norm": 0.021364310756325722, + "learning_rate": 1.3070136873882588e-05, + "loss": 0.0001, + "step": 15620 + }, + { + "epoch": 4.120828386756364, + "grad_norm": 0.0034174227621406317, + "learning_rate": 1.3068964506579914e-05, + "loss": 0.0001, + "step": 15622 + }, + { + "epoch": 4.12135602163303, + "grad_norm": 0.0011173009406775236, + "learning_rate": 1.3067792139277235e-05, + "loss": 0.0001, + "step": 15624 + }, + { + "epoch": 4.121883656509695, + "grad_norm": 0.3055115044116974, + "learning_rate": 1.306661977197456e-05, + "loss": 0.013, + "step": 15626 + }, + { + "epoch": 4.122411291386361, + "grad_norm": 0.28618666529655457, + "learning_rate": 1.3065447404671885e-05, + "loss": 0.0008, + "step": 15628 + }, + { + "epoch": 4.122938926263026, + "grad_norm": 0.0012229068670421839, + "learning_rate": 1.306427503736921e-05, + "loss": 0.0, + "step": 15630 + }, + { + "epoch": 4.123466561139692, + "grad_norm": 0.004367697983980179, + "learning_rate": 1.3063102670066532e-05, + "loss": 0.0002, + "step": 15632 + }, + { + "epoch": 4.123994196016357, + "grad_norm": 0.02370538003742695, + "learning_rate": 1.3061930302763856e-05, + "loss": 0.007, + "step": 15634 + }, + { + "epoch": 4.124521830893022, + "grad_norm": 0.0032113634515553713, + "learning_rate": 1.306075793546118e-05, + "loss": 0.0001, + "step": 15636 + }, + { + "epoch": 4.1250494657696875, + "grad_norm": 0.003972488455474377, + "learning_rate": 1.3059585568158507e-05, + "loss": 0.0002, + "step": 15638 + }, + { + "epoch": 4.125577100646352, + "grad_norm": 0.07454736530780792, + "learning_rate": 1.305841320085583e-05, + "loss": 0.0004, + "step": 15640 + }, + { + "epoch": 4.126104735523018, + "grad_norm": 0.009650299325585365, + "learning_rate": 1.3057240833553153e-05, + "loss": 0.0001, + "step": 15642 + }, + { + "epoch": 4.126632370399683, + "grad_norm": 0.0018526421627029777, + "learning_rate": 1.3056068466250478e-05, + "loss": 0.0005, + "step": 15644 + }, + { + "epoch": 4.127160005276349, + "grad_norm": 0.049239419400691986, + "learning_rate": 1.3054896098947802e-05, + "loss": 0.0002, + "step": 15646 + }, + { + "epoch": 4.127687640153014, + "grad_norm": 0.2869059443473816, + "learning_rate": 1.3053723731645125e-05, + "loss": 0.0006, + "step": 15648 + }, + { + "epoch": 4.12821527502968, + "grad_norm": 0.002778234425932169, + "learning_rate": 1.3052551364342449e-05, + "loss": 0.0001, + "step": 15650 + }, + { + "epoch": 4.128742909906345, + "grad_norm": 0.0030425337608903646, + "learning_rate": 1.3051378997039775e-05, + "loss": 0.0001, + "step": 15652 + }, + { + "epoch": 4.12927054478301, + "grad_norm": 0.10459502041339874, + "learning_rate": 1.3050206629737096e-05, + "loss": 0.0005, + "step": 15654 + }, + { + "epoch": 4.129798179659676, + "grad_norm": 0.0020717275328934193, + "learning_rate": 1.3049034262434422e-05, + "loss": 0.0039, + "step": 15656 + }, + { + "epoch": 4.130325814536341, + "grad_norm": 0.002591552212834358, + "learning_rate": 1.3047861895131746e-05, + "loss": 0.0001, + "step": 15658 + }, + { + "epoch": 4.1308534494130065, + "grad_norm": 0.0806497260928154, + "learning_rate": 1.304668952782907e-05, + "loss": 0.0004, + "step": 15660 + }, + { + "epoch": 4.131381084289671, + "grad_norm": 0.00566619960591197, + "learning_rate": 1.3045517160526393e-05, + "loss": 0.0001, + "step": 15662 + }, + { + "epoch": 4.131908719166337, + "grad_norm": 0.00576653191819787, + "learning_rate": 1.3044344793223717e-05, + "loss": 0.0001, + "step": 15664 + }, + { + "epoch": 4.132436354043002, + "grad_norm": 0.002869607647880912, + "learning_rate": 1.3043172425921042e-05, + "loss": 0.0001, + "step": 15666 + }, + { + "epoch": 4.132963988919667, + "grad_norm": 0.003051342675462365, + "learning_rate": 1.3042000058618368e-05, + "loss": 0.0001, + "step": 15668 + }, + { + "epoch": 4.133491623796333, + "grad_norm": 0.0014229852240532637, + "learning_rate": 1.3040827691315689e-05, + "loss": 0.0045, + "step": 15670 + }, + { + "epoch": 4.134019258672998, + "grad_norm": 0.0017826241673901677, + "learning_rate": 1.3039655324013015e-05, + "loss": 0.0001, + "step": 15672 + }, + { + "epoch": 4.134546893549664, + "grad_norm": 0.008421148173511028, + "learning_rate": 1.3038482956710339e-05, + "loss": 0.0001, + "step": 15674 + }, + { + "epoch": 4.135074528426329, + "grad_norm": 0.0029305655043572187, + "learning_rate": 1.3037310589407663e-05, + "loss": 0.0001, + "step": 15676 + }, + { + "epoch": 4.135602163302995, + "grad_norm": 0.0077714985236525536, + "learning_rate": 1.3036138222104986e-05, + "loss": 0.0001, + "step": 15678 + }, + { + "epoch": 4.13612979817966, + "grad_norm": 0.03842655569314957, + "learning_rate": 1.303496585480231e-05, + "loss": 0.0002, + "step": 15680 + }, + { + "epoch": 4.136657433056325, + "grad_norm": 0.010322073474526405, + "learning_rate": 1.3033793487499634e-05, + "loss": 0.0001, + "step": 15682 + }, + { + "epoch": 4.1371850679329905, + "grad_norm": 0.0022868760861456394, + "learning_rate": 1.3032621120196957e-05, + "loss": 0.0002, + "step": 15684 + }, + { + "epoch": 4.137712702809655, + "grad_norm": 0.1534622758626938, + "learning_rate": 1.3031448752894283e-05, + "loss": 0.006, + "step": 15686 + }, + { + "epoch": 4.138240337686321, + "grad_norm": 0.00962491799145937, + "learning_rate": 1.3030276385591607e-05, + "loss": 0.0001, + "step": 15688 + }, + { + "epoch": 4.138767972562986, + "grad_norm": 0.04138824716210365, + "learning_rate": 1.3029104018288932e-05, + "loss": 0.0002, + "step": 15690 + }, + { + "epoch": 4.139295607439652, + "grad_norm": 0.003085311269387603, + "learning_rate": 1.3027931650986254e-05, + "loss": 0.0016, + "step": 15692 + }, + { + "epoch": 4.139823242316317, + "grad_norm": 0.001551962224766612, + "learning_rate": 1.3026759283683579e-05, + "loss": 0.0006, + "step": 15694 + }, + { + "epoch": 4.140350877192983, + "grad_norm": 0.01111072301864624, + "learning_rate": 1.3025586916380903e-05, + "loss": 0.0006, + "step": 15696 + }, + { + "epoch": 4.140878512069648, + "grad_norm": 0.001258597243577242, + "learning_rate": 1.3024414549078229e-05, + "loss": 0.0031, + "step": 15698 + }, + { + "epoch": 4.141406146946313, + "grad_norm": 0.002947789616882801, + "learning_rate": 1.302324218177555e-05, + "loss": 0.0001, + "step": 15700 + }, + { + "epoch": 4.141933781822979, + "grad_norm": 0.03119937889277935, + "learning_rate": 1.3022069814472876e-05, + "loss": 0.0002, + "step": 15702 + }, + { + "epoch": 4.142461416699644, + "grad_norm": 0.0017199978465214372, + "learning_rate": 1.30208974471702e-05, + "loss": 0.0001, + "step": 15704 + }, + { + "epoch": 4.1429890515763095, + "grad_norm": 0.014957441948354244, + "learning_rate": 1.3019725079867524e-05, + "loss": 0.0003, + "step": 15706 + }, + { + "epoch": 4.143516686452974, + "grad_norm": 0.0009391532512381673, + "learning_rate": 1.3018552712564847e-05, + "loss": 0.0, + "step": 15708 + }, + { + "epoch": 4.14404432132964, + "grad_norm": 0.01566348783671856, + "learning_rate": 1.3017380345262171e-05, + "loss": 0.0002, + "step": 15710 + }, + { + "epoch": 4.144571956206305, + "grad_norm": 0.005994252860546112, + "learning_rate": 1.3016207977959496e-05, + "loss": 0.0002, + "step": 15712 + }, + { + "epoch": 4.14509959108297, + "grad_norm": 0.08747296035289764, + "learning_rate": 1.3015035610656818e-05, + "loss": 0.0009, + "step": 15714 + }, + { + "epoch": 4.145627225959636, + "grad_norm": 0.009726748801767826, + "learning_rate": 1.3013863243354144e-05, + "loss": 0.0, + "step": 15716 + }, + { + "epoch": 4.146154860836301, + "grad_norm": 0.0008720628684386611, + "learning_rate": 1.3012690876051469e-05, + "loss": 0.0, + "step": 15718 + }, + { + "epoch": 4.146682495712967, + "grad_norm": 0.001419119886122644, + "learning_rate": 1.3011518508748793e-05, + "loss": 0.0, + "step": 15720 + }, + { + "epoch": 4.147210130589632, + "grad_norm": 0.03347448632121086, + "learning_rate": 1.3010346141446115e-05, + "loss": 0.0001, + "step": 15722 + }, + { + "epoch": 4.147737765466298, + "grad_norm": 0.001078226137906313, + "learning_rate": 1.300917377414344e-05, + "loss": 0.0073, + "step": 15724 + }, + { + "epoch": 4.148265400342963, + "grad_norm": 0.0005571740912273526, + "learning_rate": 1.3008001406840764e-05, + "loss": 0.0001, + "step": 15726 + }, + { + "epoch": 4.148793035219628, + "grad_norm": 0.0015106478240340948, + "learning_rate": 1.3006829039538088e-05, + "loss": 0.0022, + "step": 15728 + }, + { + "epoch": 4.1493206700962935, + "grad_norm": 0.0005815731710754335, + "learning_rate": 1.3005656672235411e-05, + "loss": 0.0, + "step": 15730 + }, + { + "epoch": 4.149848304972958, + "grad_norm": 0.0041384403593838215, + "learning_rate": 1.3004484304932737e-05, + "loss": 0.0, + "step": 15732 + }, + { + "epoch": 4.150375939849624, + "grad_norm": 0.038259416818618774, + "learning_rate": 1.3003311937630061e-05, + "loss": 0.0034, + "step": 15734 + }, + { + "epoch": 4.150903574726289, + "grad_norm": 0.0019343801541253924, + "learning_rate": 1.3002139570327386e-05, + "loss": 0.0, + "step": 15736 + }, + { + "epoch": 4.151431209602955, + "grad_norm": 0.006550285499542952, + "learning_rate": 1.3000967203024708e-05, + "loss": 0.0001, + "step": 15738 + }, + { + "epoch": 4.15195884447962, + "grad_norm": 0.0018134111305698752, + "learning_rate": 1.2999794835722033e-05, + "loss": 0.0, + "step": 15740 + }, + { + "epoch": 4.152486479356286, + "grad_norm": 0.026869967579841614, + "learning_rate": 1.2998622468419357e-05, + "loss": 0.0002, + "step": 15742 + }, + { + "epoch": 4.153014114232951, + "grad_norm": 0.0005614230176433921, + "learning_rate": 1.299745010111668e-05, + "loss": 0.0, + "step": 15744 + }, + { + "epoch": 4.153541749109616, + "grad_norm": 0.005095919594168663, + "learning_rate": 1.2996277733814004e-05, + "loss": 0.0001, + "step": 15746 + }, + { + "epoch": 4.154069383986282, + "grad_norm": 0.0008720498299226165, + "learning_rate": 1.299510536651133e-05, + "loss": 0.0, + "step": 15748 + }, + { + "epoch": 4.154597018862947, + "grad_norm": 1.0848042964935303, + "learning_rate": 1.2993932999208654e-05, + "loss": 0.0044, + "step": 15750 + }, + { + "epoch": 4.1551246537396125, + "grad_norm": 0.002074531512334943, + "learning_rate": 1.2992760631905977e-05, + "loss": 0.0057, + "step": 15752 + }, + { + "epoch": 4.1556522886162774, + "grad_norm": 0.0033820499666035175, + "learning_rate": 1.2991588264603301e-05, + "loss": 0.0008, + "step": 15754 + }, + { + "epoch": 4.156179923492943, + "grad_norm": 0.0005950726917944849, + "learning_rate": 1.2990415897300625e-05, + "loss": 0.0001, + "step": 15756 + }, + { + "epoch": 4.156707558369608, + "grad_norm": 0.0010467965621501207, + "learning_rate": 1.298924352999795e-05, + "loss": 0.0, + "step": 15758 + }, + { + "epoch": 4.157235193246273, + "grad_norm": 0.004139173310250044, + "learning_rate": 1.2988071162695272e-05, + "loss": 0.0001, + "step": 15760 + }, + { + "epoch": 4.157762828122939, + "grad_norm": 0.0030192958656698465, + "learning_rate": 1.2986898795392598e-05, + "loss": 0.001, + "step": 15762 + }, + { + "epoch": 4.158290462999604, + "grad_norm": 0.001126273418776691, + "learning_rate": 1.2985726428089922e-05, + "loss": 0.0004, + "step": 15764 + }, + { + "epoch": 4.15881809787627, + "grad_norm": 0.001994447084143758, + "learning_rate": 1.2984554060787247e-05, + "loss": 0.0001, + "step": 15766 + }, + { + "epoch": 4.159345732752935, + "grad_norm": 0.008269269950687885, + "learning_rate": 1.298338169348457e-05, + "loss": 0.0014, + "step": 15768 + }, + { + "epoch": 4.159873367629601, + "grad_norm": 0.0011373615125194192, + "learning_rate": 1.2982209326181894e-05, + "loss": 0.0001, + "step": 15770 + }, + { + "epoch": 4.160401002506266, + "grad_norm": 0.002102244645357132, + "learning_rate": 1.2981036958879218e-05, + "loss": 0.0001, + "step": 15772 + }, + { + "epoch": 4.160928637382931, + "grad_norm": 0.0028972045984119177, + "learning_rate": 1.297986459157654e-05, + "loss": 0.0001, + "step": 15774 + }, + { + "epoch": 4.1614562722595965, + "grad_norm": 0.024645982310175896, + "learning_rate": 1.2978692224273865e-05, + "loss": 0.0076, + "step": 15776 + }, + { + "epoch": 4.161983907136261, + "grad_norm": 0.037142567336559296, + "learning_rate": 1.2977519856971191e-05, + "loss": 0.0003, + "step": 15778 + }, + { + "epoch": 4.162511542012927, + "grad_norm": 0.002460889518260956, + "learning_rate": 1.2976347489668515e-05, + "loss": 0.0002, + "step": 15780 + }, + { + "epoch": 4.163039176889592, + "grad_norm": 0.002220917260274291, + "learning_rate": 1.2975175122365838e-05, + "loss": 0.0001, + "step": 15782 + }, + { + "epoch": 4.163566811766258, + "grad_norm": 0.0014376918552443385, + "learning_rate": 1.2974002755063162e-05, + "loss": 0.0003, + "step": 15784 + }, + { + "epoch": 4.164094446642923, + "grad_norm": 0.002706419210880995, + "learning_rate": 1.2972830387760486e-05, + "loss": 0.0016, + "step": 15786 + }, + { + "epoch": 4.164622081519588, + "grad_norm": 0.0012668909039348364, + "learning_rate": 1.297165802045781e-05, + "loss": 0.0, + "step": 15788 + }, + { + "epoch": 4.165149716396254, + "grad_norm": 0.004393466282635927, + "learning_rate": 1.2970485653155133e-05, + "loss": 0.0001, + "step": 15790 + }, + { + "epoch": 4.165677351272919, + "grad_norm": 0.003242257982492447, + "learning_rate": 1.2969313285852458e-05, + "loss": 0.0001, + "step": 15792 + }, + { + "epoch": 4.166204986149585, + "grad_norm": 0.004368876572698355, + "learning_rate": 1.2968140918549784e-05, + "loss": 0.0001, + "step": 15794 + }, + { + "epoch": 4.16673262102625, + "grad_norm": 0.007634536363184452, + "learning_rate": 1.2966968551247108e-05, + "loss": 0.0001, + "step": 15796 + }, + { + "epoch": 4.1672602559029155, + "grad_norm": 0.029953492805361748, + "learning_rate": 1.296579618394443e-05, + "loss": 0.0001, + "step": 15798 + }, + { + "epoch": 4.1677878907795805, + "grad_norm": 0.00416643125936389, + "learning_rate": 1.2964623816641755e-05, + "loss": 0.0003, + "step": 15800 + }, + { + "epoch": 4.168315525656246, + "grad_norm": 0.0024100972805172205, + "learning_rate": 1.296345144933908e-05, + "loss": 0.0001, + "step": 15802 + }, + { + "epoch": 4.168843160532911, + "grad_norm": 0.018897851929068565, + "learning_rate": 1.2962279082036403e-05, + "loss": 0.0002, + "step": 15804 + }, + { + "epoch": 4.169370795409576, + "grad_norm": 0.019562579691410065, + "learning_rate": 1.2961106714733726e-05, + "loss": 0.0001, + "step": 15806 + }, + { + "epoch": 4.169898430286242, + "grad_norm": 0.0037618076894432306, + "learning_rate": 1.2959934347431052e-05, + "loss": 0.0001, + "step": 15808 + }, + { + "epoch": 4.170426065162907, + "grad_norm": 0.00928465649485588, + "learning_rate": 1.2958761980128376e-05, + "loss": 0.0001, + "step": 15810 + }, + { + "epoch": 4.170953700039573, + "grad_norm": 0.013268117792904377, + "learning_rate": 1.2957589612825699e-05, + "loss": 0.0001, + "step": 15812 + }, + { + "epoch": 4.171481334916238, + "grad_norm": 0.0023080739192664623, + "learning_rate": 1.2956417245523023e-05, + "loss": 0.0001, + "step": 15814 + }, + { + "epoch": 4.172008969792904, + "grad_norm": 0.041829660534858704, + "learning_rate": 1.2955244878220348e-05, + "loss": 0.0038, + "step": 15816 + }, + { + "epoch": 4.172536604669569, + "grad_norm": 0.02040237933397293, + "learning_rate": 1.2954072510917672e-05, + "loss": 0.0004, + "step": 15818 + }, + { + "epoch": 4.173064239546234, + "grad_norm": 0.0431012362241745, + "learning_rate": 1.2952900143614995e-05, + "loss": 0.0002, + "step": 15820 + }, + { + "epoch": 4.1735918744228995, + "grad_norm": 0.0007759176660329103, + "learning_rate": 1.2951727776312319e-05, + "loss": 0.0, + "step": 15822 + }, + { + "epoch": 4.174119509299564, + "grad_norm": 0.03843316063284874, + "learning_rate": 1.2950555409009645e-05, + "loss": 0.0002, + "step": 15824 + }, + { + "epoch": 4.17464714417623, + "grad_norm": 0.22589650750160217, + "learning_rate": 1.2949383041706969e-05, + "loss": 0.0114, + "step": 15826 + }, + { + "epoch": 4.175174779052895, + "grad_norm": 0.0008018459775485098, + "learning_rate": 1.2948210674404292e-05, + "loss": 0.0, + "step": 15828 + }, + { + "epoch": 4.175702413929561, + "grad_norm": 0.18325282633304596, + "learning_rate": 1.2947038307101616e-05, + "loss": 0.0062, + "step": 15830 + }, + { + "epoch": 4.176230048806226, + "grad_norm": 0.002526241121813655, + "learning_rate": 1.294586593979894e-05, + "loss": 0.0001, + "step": 15832 + }, + { + "epoch": 4.176757683682891, + "grad_norm": 0.005069173872470856, + "learning_rate": 1.2944693572496265e-05, + "loss": 0.0001, + "step": 15834 + }, + { + "epoch": 4.177285318559557, + "grad_norm": 0.28874045610427856, + "learning_rate": 1.2943521205193587e-05, + "loss": 0.0044, + "step": 15836 + }, + { + "epoch": 4.177812953436222, + "grad_norm": 0.014800256118178368, + "learning_rate": 1.2942348837890912e-05, + "loss": 0.003, + "step": 15838 + }, + { + "epoch": 4.178340588312888, + "grad_norm": 0.008538762107491493, + "learning_rate": 1.2941176470588238e-05, + "loss": 0.0001, + "step": 15840 + }, + { + "epoch": 4.178868223189553, + "grad_norm": 0.05366460978984833, + "learning_rate": 1.294000410328556e-05, + "loss": 0.0005, + "step": 15842 + }, + { + "epoch": 4.1793958580662185, + "grad_norm": 0.015897240489721298, + "learning_rate": 1.2938831735982884e-05, + "loss": 0.002, + "step": 15844 + }, + { + "epoch": 4.1799234929428835, + "grad_norm": 0.22279994189739227, + "learning_rate": 1.2937659368680209e-05, + "loss": 0.0032, + "step": 15846 + }, + { + "epoch": 4.180451127819548, + "grad_norm": 0.002454928820952773, + "learning_rate": 1.2936487001377533e-05, + "loss": 0.0001, + "step": 15848 + }, + { + "epoch": 4.180978762696214, + "grad_norm": 0.007064308039844036, + "learning_rate": 1.2935314634074856e-05, + "loss": 0.0001, + "step": 15850 + }, + { + "epoch": 4.181506397572879, + "grad_norm": 0.050196509808301926, + "learning_rate": 1.293414226677218e-05, + "loss": 0.0035, + "step": 15852 + }, + { + "epoch": 4.182034032449545, + "grad_norm": 0.0058953966945409775, + "learning_rate": 1.2932969899469506e-05, + "loss": 0.0005, + "step": 15854 + }, + { + "epoch": 4.18256166732621, + "grad_norm": 0.006276480853557587, + "learning_rate": 1.293179753216683e-05, + "loss": 0.0001, + "step": 15856 + }, + { + "epoch": 4.183089302202876, + "grad_norm": 0.011316078715026379, + "learning_rate": 1.2930625164864153e-05, + "loss": 0.0001, + "step": 15858 + }, + { + "epoch": 4.183616937079541, + "grad_norm": 0.09874352812767029, + "learning_rate": 1.2929452797561477e-05, + "loss": 0.0003, + "step": 15860 + }, + { + "epoch": 4.184144571956207, + "grad_norm": 0.002563171787187457, + "learning_rate": 1.2928280430258802e-05, + "loss": 0.0001, + "step": 15862 + }, + { + "epoch": 4.184672206832872, + "grad_norm": 0.0035813755821436644, + "learning_rate": 1.2927108062956126e-05, + "loss": 0.0001, + "step": 15864 + }, + { + "epoch": 4.185199841709537, + "grad_norm": 0.045391131192445755, + "learning_rate": 1.2925935695653448e-05, + "loss": 0.0003, + "step": 15866 + }, + { + "epoch": 4.1857274765862025, + "grad_norm": 0.04828207194805145, + "learning_rate": 1.2924763328350773e-05, + "loss": 0.0003, + "step": 15868 + }, + { + "epoch": 4.186255111462867, + "grad_norm": 0.008315686136484146, + "learning_rate": 1.2923590961048099e-05, + "loss": 0.0001, + "step": 15870 + }, + { + "epoch": 4.186782746339533, + "grad_norm": 0.013004160486161709, + "learning_rate": 1.2922418593745421e-05, + "loss": 0.0001, + "step": 15872 + }, + { + "epoch": 4.187310381216198, + "grad_norm": 0.026001011952757835, + "learning_rate": 1.2921246226442746e-05, + "loss": 0.0002, + "step": 15874 + }, + { + "epoch": 4.187838016092864, + "grad_norm": 0.005617877002805471, + "learning_rate": 1.292007385914007e-05, + "loss": 0.0001, + "step": 15876 + }, + { + "epoch": 4.188365650969529, + "grad_norm": 0.0023083018604665995, + "learning_rate": 1.2918901491837394e-05, + "loss": 0.0001, + "step": 15878 + }, + { + "epoch": 4.188893285846194, + "grad_norm": 0.10062679648399353, + "learning_rate": 1.2917729124534717e-05, + "loss": 0.0003, + "step": 15880 + }, + { + "epoch": 4.18942092072286, + "grad_norm": 0.03072880208492279, + "learning_rate": 1.2916556757232041e-05, + "loss": 0.0003, + "step": 15882 + }, + { + "epoch": 4.189948555599525, + "grad_norm": 0.001091520069167018, + "learning_rate": 1.2915384389929365e-05, + "loss": 0.0001, + "step": 15884 + }, + { + "epoch": 4.190476190476191, + "grad_norm": 0.037556521594524384, + "learning_rate": 1.2914212022626691e-05, + "loss": 0.0001, + "step": 15886 + }, + { + "epoch": 4.191003825352856, + "grad_norm": 0.05876323953270912, + "learning_rate": 1.2913039655324014e-05, + "loss": 0.0011, + "step": 15888 + }, + { + "epoch": 4.1915314602295215, + "grad_norm": 0.0009555640863254666, + "learning_rate": 1.2911867288021338e-05, + "loss": 0.0, + "step": 15890 + }, + { + "epoch": 4.1920590951061865, + "grad_norm": 0.004386589862406254, + "learning_rate": 1.2910694920718663e-05, + "loss": 0.0001, + "step": 15892 + }, + { + "epoch": 4.192586729982851, + "grad_norm": 0.0014118129620328546, + "learning_rate": 1.2909522553415987e-05, + "loss": 0.0001, + "step": 15894 + }, + { + "epoch": 4.193114364859517, + "grad_norm": 0.0006854152888990939, + "learning_rate": 1.290835018611331e-05, + "loss": 0.0, + "step": 15896 + }, + { + "epoch": 4.193641999736182, + "grad_norm": 0.007369672413915396, + "learning_rate": 1.2907177818810634e-05, + "loss": 0.0001, + "step": 15898 + }, + { + "epoch": 4.194169634612848, + "grad_norm": 0.0011711445404216647, + "learning_rate": 1.290600545150796e-05, + "loss": 0.0, + "step": 15900 + }, + { + "epoch": 4.194697269489513, + "grad_norm": 0.004404676612466574, + "learning_rate": 1.290483308420528e-05, + "loss": 0.0008, + "step": 15902 + }, + { + "epoch": 4.195224904366179, + "grad_norm": 0.0025274010840803385, + "learning_rate": 1.2903660716902607e-05, + "loss": 0.0001, + "step": 15904 + }, + { + "epoch": 4.195752539242844, + "grad_norm": 0.0007574207265861332, + "learning_rate": 1.2902488349599931e-05, + "loss": 0.0, + "step": 15906 + }, + { + "epoch": 4.19628017411951, + "grad_norm": 0.015255835838615894, + "learning_rate": 1.2901315982297255e-05, + "loss": 0.0001, + "step": 15908 + }, + { + "epoch": 4.196807808996175, + "grad_norm": 0.00032669337815605104, + "learning_rate": 1.2900143614994578e-05, + "loss": 0.0001, + "step": 15910 + }, + { + "epoch": 4.19733544387284, + "grad_norm": 0.0020058448426425457, + "learning_rate": 1.2898971247691902e-05, + "loss": 0.0, + "step": 15912 + }, + { + "epoch": 4.1978630787495055, + "grad_norm": 0.001745337271131575, + "learning_rate": 1.2897798880389227e-05, + "loss": 0.0001, + "step": 15914 + }, + { + "epoch": 4.1983907136261704, + "grad_norm": 0.0012449378846213222, + "learning_rate": 1.2896626513086553e-05, + "loss": 0.0, + "step": 15916 + }, + { + "epoch": 4.198918348502836, + "grad_norm": 0.000539153057616204, + "learning_rate": 1.2895454145783875e-05, + "loss": 0.0027, + "step": 15918 + }, + { + "epoch": 4.199445983379501, + "grad_norm": 0.008818540722131729, + "learning_rate": 1.28942817784812e-05, + "loss": 0.0024, + "step": 15920 + }, + { + "epoch": 4.199973618256167, + "grad_norm": 0.009742221795022488, + "learning_rate": 1.2893109411178524e-05, + "loss": 0.0001, + "step": 15922 + }, + { + "epoch": 4.200501253132832, + "grad_norm": 0.0009820982813835144, + "learning_rate": 1.2891937043875848e-05, + "loss": 0.0001, + "step": 15924 + }, + { + "epoch": 4.201028888009497, + "grad_norm": 0.07700215280056, + "learning_rate": 1.289076467657317e-05, + "loss": 0.0004, + "step": 15926 + }, + { + "epoch": 4.201556522886163, + "grad_norm": 0.009662200696766376, + "learning_rate": 1.2889592309270495e-05, + "loss": 0.0001, + "step": 15928 + }, + { + "epoch": 4.202084157762828, + "grad_norm": 0.0007864765939302742, + "learning_rate": 1.288841994196782e-05, + "loss": 0.0, + "step": 15930 + }, + { + "epoch": 4.202611792639494, + "grad_norm": 0.0007621030090376735, + "learning_rate": 1.2887247574665142e-05, + "loss": 0.0, + "step": 15932 + }, + { + "epoch": 4.203139427516159, + "grad_norm": 0.0008439971134066582, + "learning_rate": 1.2886075207362468e-05, + "loss": 0.0001, + "step": 15934 + }, + { + "epoch": 4.2036670623928245, + "grad_norm": 0.0024930420331656933, + "learning_rate": 1.2884902840059792e-05, + "loss": 0.0001, + "step": 15936 + }, + { + "epoch": 4.2041946972694895, + "grad_norm": 0.01104381587356329, + "learning_rate": 1.2883730472757117e-05, + "loss": 0.0029, + "step": 15938 + }, + { + "epoch": 4.204722332146154, + "grad_norm": 0.0007042757351882756, + "learning_rate": 1.288255810545444e-05, + "loss": 0.0, + "step": 15940 + }, + { + "epoch": 4.20524996702282, + "grad_norm": 0.0015464883763343096, + "learning_rate": 1.2881385738151763e-05, + "loss": 0.0, + "step": 15942 + }, + { + "epoch": 4.205777601899485, + "grad_norm": 0.0013548635179176927, + "learning_rate": 1.2880213370849088e-05, + "loss": 0.0, + "step": 15944 + }, + { + "epoch": 4.206305236776151, + "grad_norm": 0.002262615133076906, + "learning_rate": 1.2879041003546414e-05, + "loss": 0.0001, + "step": 15946 + }, + { + "epoch": 4.206832871652816, + "grad_norm": 0.0045971618965268135, + "learning_rate": 1.2877868636243735e-05, + "loss": 0.0002, + "step": 15948 + }, + { + "epoch": 4.207360506529482, + "grad_norm": 0.001846670056693256, + "learning_rate": 1.287669626894106e-05, + "loss": 0.0, + "step": 15950 + }, + { + "epoch": 4.207888141406147, + "grad_norm": 0.0007287993212230504, + "learning_rate": 1.2875523901638385e-05, + "loss": 0.0001, + "step": 15952 + }, + { + "epoch": 4.208415776282813, + "grad_norm": 0.0026874204631894827, + "learning_rate": 1.287435153433571e-05, + "loss": 0.0001, + "step": 15954 + }, + { + "epoch": 4.208943411159478, + "grad_norm": 0.0016768089262768626, + "learning_rate": 1.2873179167033032e-05, + "loss": 0.0, + "step": 15956 + }, + { + "epoch": 4.209471046036143, + "grad_norm": 0.21395598351955414, + "learning_rate": 1.2872006799730356e-05, + "loss": 0.0011, + "step": 15958 + }, + { + "epoch": 4.2099986809128085, + "grad_norm": 0.0023773068096488714, + "learning_rate": 1.287083443242768e-05, + "loss": 0.0134, + "step": 15960 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 0.04263174533843994, + "learning_rate": 1.2869662065125003e-05, + "loss": 0.0076, + "step": 15962 + }, + { + "epoch": 4.211053950666139, + "grad_norm": 0.044686030596494675, + "learning_rate": 1.2868489697822329e-05, + "loss": 0.0002, + "step": 15964 + }, + { + "epoch": 4.211581585542804, + "grad_norm": 0.014803688041865826, + "learning_rate": 1.2867317330519653e-05, + "loss": 0.0002, + "step": 15966 + }, + { + "epoch": 4.21210922041947, + "grad_norm": 0.000959709461312741, + "learning_rate": 1.2866144963216978e-05, + "loss": 0.0001, + "step": 15968 + }, + { + "epoch": 4.212636855296135, + "grad_norm": 0.5267980098724365, + "learning_rate": 1.28649725959143e-05, + "loss": 0.0023, + "step": 15970 + }, + { + "epoch": 4.2131644901728, + "grad_norm": 0.014420335181057453, + "learning_rate": 1.2863800228611625e-05, + "loss": 0.0002, + "step": 15972 + }, + { + "epoch": 4.213692125049466, + "grad_norm": 0.015478896908462048, + "learning_rate": 1.2862627861308949e-05, + "loss": 0.0003, + "step": 15974 + }, + { + "epoch": 4.214219759926131, + "grad_norm": 0.012834546156227589, + "learning_rate": 1.2861455494006275e-05, + "loss": 0.0005, + "step": 15976 + }, + { + "epoch": 4.214747394802797, + "grad_norm": 0.03960798680782318, + "learning_rate": 1.2860283126703596e-05, + "loss": 0.0002, + "step": 15978 + }, + { + "epoch": 4.215275029679462, + "grad_norm": 0.002689837943762541, + "learning_rate": 1.2859110759400922e-05, + "loss": 0.0001, + "step": 15980 + }, + { + "epoch": 4.2158026645561275, + "grad_norm": 0.008107393980026245, + "learning_rate": 1.2857938392098246e-05, + "loss": 0.0001, + "step": 15982 + }, + { + "epoch": 4.2163302994327925, + "grad_norm": 0.12946538627147675, + "learning_rate": 1.285676602479557e-05, + "loss": 0.0002, + "step": 15984 + }, + { + "epoch": 4.216857934309457, + "grad_norm": 0.0016346280463039875, + "learning_rate": 1.2855593657492893e-05, + "loss": 0.0001, + "step": 15986 + }, + { + "epoch": 4.217385569186123, + "grad_norm": 0.0010354232508689165, + "learning_rate": 1.2854421290190217e-05, + "loss": 0.0001, + "step": 15988 + }, + { + "epoch": 4.217913204062788, + "grad_norm": 0.01606285758316517, + "learning_rate": 1.2853248922887542e-05, + "loss": 0.0077, + "step": 15990 + }, + { + "epoch": 4.218440838939454, + "grad_norm": 0.001061116810888052, + "learning_rate": 1.2852076555584864e-05, + "loss": 0.0, + "step": 15992 + }, + { + "epoch": 4.218968473816119, + "grad_norm": 0.001917182351462543, + "learning_rate": 1.2850904188282189e-05, + "loss": 0.0001, + "step": 15994 + }, + { + "epoch": 4.219496108692785, + "grad_norm": 0.016338393092155457, + "learning_rate": 1.2849731820979515e-05, + "loss": 0.0011, + "step": 15996 + }, + { + "epoch": 4.22002374356945, + "grad_norm": 0.010344215668737888, + "learning_rate": 1.2848559453676839e-05, + "loss": 0.0002, + "step": 15998 + }, + { + "epoch": 4.220551378446116, + "grad_norm": 0.015559982508420944, + "learning_rate": 1.2847387086374162e-05, + "loss": 0.0047, + "step": 16000 + }, + { + "epoch": 4.220551378446116, + "eval_loss": 0.001628478872589767, + "eval_runtime": 304.7799, + "eval_samples_per_second": 707.53, + "eval_steps_per_second": 88.444, + "step": 16000 + }, + { + "epoch": 4.221079013322781, + "grad_norm": 0.004138896241784096, + "learning_rate": 1.2846214719071486e-05, + "loss": 0.0002, + "step": 16002 + }, + { + "epoch": 4.221606648199446, + "grad_norm": 0.0007855274598114192, + "learning_rate": 1.284504235176881e-05, + "loss": 0.0001, + "step": 16004 + }, + { + "epoch": 4.2221342830761115, + "grad_norm": 0.0038384513463824987, + "learning_rate": 1.2843869984466134e-05, + "loss": 0.0001, + "step": 16006 + }, + { + "epoch": 4.2226619179527765, + "grad_norm": 0.002627554116770625, + "learning_rate": 1.2842697617163457e-05, + "loss": 0.0001, + "step": 16008 + }, + { + "epoch": 4.223189552829442, + "grad_norm": 0.018182555213570595, + "learning_rate": 1.2841525249860783e-05, + "loss": 0.0002, + "step": 16010 + }, + { + "epoch": 4.223717187706107, + "grad_norm": 0.013200230896472931, + "learning_rate": 1.2840352882558107e-05, + "loss": 0.0003, + "step": 16012 + }, + { + "epoch": 4.224244822582773, + "grad_norm": 0.0008737283060327172, + "learning_rate": 1.2839180515255432e-05, + "loss": 0.0001, + "step": 16014 + }, + { + "epoch": 4.224772457459438, + "grad_norm": 0.0023589995689690113, + "learning_rate": 1.2838008147952754e-05, + "loss": 0.0001, + "step": 16016 + }, + { + "epoch": 4.225300092336103, + "grad_norm": 0.0031381126027554274, + "learning_rate": 1.2836835780650079e-05, + "loss": 0.0001, + "step": 16018 + }, + { + "epoch": 4.225827727212769, + "grad_norm": 0.0018686528783291578, + "learning_rate": 1.2835663413347403e-05, + "loss": 0.0001, + "step": 16020 + }, + { + "epoch": 4.226355362089434, + "grad_norm": 0.0026857436168938875, + "learning_rate": 1.2834491046044725e-05, + "loss": 0.0001, + "step": 16022 + }, + { + "epoch": 4.2268829969661, + "grad_norm": 0.0011714451247826219, + "learning_rate": 1.283331867874205e-05, + "loss": 0.0001, + "step": 16024 + }, + { + "epoch": 4.227410631842765, + "grad_norm": 0.2176285684108734, + "learning_rate": 1.2832146311439376e-05, + "loss": 0.0047, + "step": 16026 + }, + { + "epoch": 4.2279382667194305, + "grad_norm": 0.0023238584399223328, + "learning_rate": 1.28309739441367e-05, + "loss": 0.0001, + "step": 16028 + }, + { + "epoch": 4.2284659015960955, + "grad_norm": 0.00217427802272141, + "learning_rate": 1.2829801576834023e-05, + "loss": 0.0001, + "step": 16030 + }, + { + "epoch": 4.22899353647276, + "grad_norm": 0.0005576609401032329, + "learning_rate": 1.2828629209531347e-05, + "loss": 0.0004, + "step": 16032 + }, + { + "epoch": 4.229521171349426, + "grad_norm": 0.0022878714371472597, + "learning_rate": 1.2827456842228671e-05, + "loss": 0.0002, + "step": 16034 + }, + { + "epoch": 4.230048806226091, + "grad_norm": 0.0046708593145012856, + "learning_rate": 1.2826284474925996e-05, + "loss": 0.0001, + "step": 16036 + }, + { + "epoch": 4.230576441102757, + "grad_norm": 0.12334366887807846, + "learning_rate": 1.2825112107623318e-05, + "loss": 0.0006, + "step": 16038 + }, + { + "epoch": 4.231104075979422, + "grad_norm": 0.039127349853515625, + "learning_rate": 1.2823939740320644e-05, + "loss": 0.0002, + "step": 16040 + }, + { + "epoch": 4.231631710856088, + "grad_norm": 0.005474094767123461, + "learning_rate": 1.2822767373017969e-05, + "loss": 0.0001, + "step": 16042 + }, + { + "epoch": 4.232159345732753, + "grad_norm": 0.0027912193909287453, + "learning_rate": 1.2821595005715293e-05, + "loss": 0.0001, + "step": 16044 + }, + { + "epoch": 4.232686980609419, + "grad_norm": 0.005101568531244993, + "learning_rate": 1.2820422638412615e-05, + "loss": 0.0001, + "step": 16046 + }, + { + "epoch": 4.233214615486084, + "grad_norm": 0.0011214233236387372, + "learning_rate": 1.281925027110994e-05, + "loss": 0.0001, + "step": 16048 + }, + { + "epoch": 4.233742250362749, + "grad_norm": 0.0020728919189423323, + "learning_rate": 1.2818077903807264e-05, + "loss": 0.0001, + "step": 16050 + }, + { + "epoch": 4.2342698852394145, + "grad_norm": 0.004153089132159948, + "learning_rate": 1.2816905536504587e-05, + "loss": 0.0001, + "step": 16052 + }, + { + "epoch": 4.2347975201160795, + "grad_norm": 0.1529855728149414, + "learning_rate": 1.2815733169201911e-05, + "loss": 0.001, + "step": 16054 + }, + { + "epoch": 4.235325154992745, + "grad_norm": 0.003455069148913026, + "learning_rate": 1.2814560801899237e-05, + "loss": 0.0001, + "step": 16056 + }, + { + "epoch": 4.23585278986941, + "grad_norm": 0.0029849905986338854, + "learning_rate": 1.2813388434596561e-05, + "loss": 0.0001, + "step": 16058 + }, + { + "epoch": 4.236380424746076, + "grad_norm": 0.0013718802947551012, + "learning_rate": 1.2812216067293884e-05, + "loss": 0.0001, + "step": 16060 + }, + { + "epoch": 4.236908059622741, + "grad_norm": 0.019649337977170944, + "learning_rate": 1.2811043699991208e-05, + "loss": 0.0001, + "step": 16062 + }, + { + "epoch": 4.237435694499406, + "grad_norm": 0.0008774359012022614, + "learning_rate": 1.2809871332688532e-05, + "loss": 0.0001, + "step": 16064 + }, + { + "epoch": 4.237963329376072, + "grad_norm": 0.04580655321478844, + "learning_rate": 1.2808698965385857e-05, + "loss": 0.0003, + "step": 16066 + }, + { + "epoch": 4.238490964252737, + "grad_norm": 0.0004456449532881379, + "learning_rate": 1.280752659808318e-05, + "loss": 0.0001, + "step": 16068 + }, + { + "epoch": 4.239018599129403, + "grad_norm": 0.10676955431699753, + "learning_rate": 1.2806354230780504e-05, + "loss": 0.0071, + "step": 16070 + }, + { + "epoch": 4.239546234006068, + "grad_norm": 0.0021716959308832884, + "learning_rate": 1.280518186347783e-05, + "loss": 0.0, + "step": 16072 + }, + { + "epoch": 4.2400738688827335, + "grad_norm": 0.006033746525645256, + "learning_rate": 1.2804009496175154e-05, + "loss": 0.0001, + "step": 16074 + }, + { + "epoch": 4.2406015037593985, + "grad_norm": 0.0014848780119791627, + "learning_rate": 1.2802837128872477e-05, + "loss": 0.0001, + "step": 16076 + }, + { + "epoch": 4.2411291386360634, + "grad_norm": 0.0007894466398283839, + "learning_rate": 1.2801664761569801e-05, + "loss": 0.0, + "step": 16078 + }, + { + "epoch": 4.241656773512729, + "grad_norm": 0.001847301609814167, + "learning_rate": 1.2800492394267125e-05, + "loss": 0.0002, + "step": 16080 + }, + { + "epoch": 4.242184408389394, + "grad_norm": 0.09788081794977188, + "learning_rate": 1.2799320026964448e-05, + "loss": 0.0003, + "step": 16082 + }, + { + "epoch": 4.24271204326606, + "grad_norm": 0.0029557072557508945, + "learning_rate": 1.2798147659661772e-05, + "loss": 0.0001, + "step": 16084 + }, + { + "epoch": 4.243239678142725, + "grad_norm": 0.0007309162174351513, + "learning_rate": 1.2796975292359098e-05, + "loss": 0.0, + "step": 16086 + }, + { + "epoch": 4.243767313019391, + "grad_norm": 0.0036570567172020674, + "learning_rate": 1.2795802925056422e-05, + "loss": 0.0001, + "step": 16088 + }, + { + "epoch": 4.244294947896056, + "grad_norm": 0.5292673707008362, + "learning_rate": 1.2794630557753745e-05, + "loss": 0.0019, + "step": 16090 + }, + { + "epoch": 4.244822582772722, + "grad_norm": 0.023659568279981613, + "learning_rate": 1.279345819045107e-05, + "loss": 0.0016, + "step": 16092 + }, + { + "epoch": 4.245350217649387, + "grad_norm": 0.22330693900585175, + "learning_rate": 1.2792285823148394e-05, + "loss": 0.0073, + "step": 16094 + }, + { + "epoch": 4.245877852526052, + "grad_norm": 0.0029950065072625875, + "learning_rate": 1.2791113455845718e-05, + "loss": 0.0001, + "step": 16096 + }, + { + "epoch": 4.2464054874027175, + "grad_norm": 0.003855984192341566, + "learning_rate": 1.278994108854304e-05, + "loss": 0.0001, + "step": 16098 + }, + { + "epoch": 4.2469331222793825, + "grad_norm": 0.26822108030319214, + "learning_rate": 1.2788768721240365e-05, + "loss": 0.0027, + "step": 16100 + }, + { + "epoch": 4.247460757156048, + "grad_norm": 0.043197277933359146, + "learning_rate": 1.2787596353937691e-05, + "loss": 0.0001, + "step": 16102 + }, + { + "epoch": 4.247988392032713, + "grad_norm": 0.004132931586354971, + "learning_rate": 1.2786423986635015e-05, + "loss": 0.0001, + "step": 16104 + }, + { + "epoch": 4.248516026909379, + "grad_norm": 0.0011667234357446432, + "learning_rate": 1.2785251619332338e-05, + "loss": 0.0001, + "step": 16106 + }, + { + "epoch": 4.249043661786044, + "grad_norm": 0.2174082249403, + "learning_rate": 1.2784079252029662e-05, + "loss": 0.0033, + "step": 16108 + }, + { + "epoch": 4.249571296662709, + "grad_norm": 0.03972123935818672, + "learning_rate": 1.2782906884726986e-05, + "loss": 0.0034, + "step": 16110 + }, + { + "epoch": 4.250098931539375, + "grad_norm": 0.014917965978384018, + "learning_rate": 1.2781734517424309e-05, + "loss": 0.0001, + "step": 16112 + }, + { + "epoch": 4.25062656641604, + "grad_norm": 0.0005491700139828026, + "learning_rate": 1.2780562150121633e-05, + "loss": 0.0, + "step": 16114 + }, + { + "epoch": 4.251154201292706, + "grad_norm": 0.0011712624691426754, + "learning_rate": 1.2779389782818958e-05, + "loss": 0.0001, + "step": 16116 + }, + { + "epoch": 4.251681836169371, + "grad_norm": 0.00863533467054367, + "learning_rate": 1.2778217415516284e-05, + "loss": 0.0001, + "step": 16118 + }, + { + "epoch": 4.2522094710460365, + "grad_norm": 0.04879792779684067, + "learning_rate": 1.2777045048213606e-05, + "loss": 0.0003, + "step": 16120 + }, + { + "epoch": 4.2527371059227015, + "grad_norm": 0.0025548862759023905, + "learning_rate": 1.277587268091093e-05, + "loss": 0.0002, + "step": 16122 + }, + { + "epoch": 4.2532647407993665, + "grad_norm": 0.3214723467826843, + "learning_rate": 1.2774700313608255e-05, + "loss": 0.0056, + "step": 16124 + }, + { + "epoch": 4.253792375676032, + "grad_norm": 0.0041298335418105125, + "learning_rate": 1.2773527946305579e-05, + "loss": 0.0004, + "step": 16126 + }, + { + "epoch": 4.254320010552697, + "grad_norm": 0.004136952571570873, + "learning_rate": 1.2772355579002902e-05, + "loss": 0.0001, + "step": 16128 + }, + { + "epoch": 4.254847645429363, + "grad_norm": 0.0062667615711688995, + "learning_rate": 1.2771183211700226e-05, + "loss": 0.0003, + "step": 16130 + }, + { + "epoch": 4.255375280306028, + "grad_norm": 0.03108730912208557, + "learning_rate": 1.2770010844397552e-05, + "loss": 0.0003, + "step": 16132 + }, + { + "epoch": 4.255902915182694, + "grad_norm": 0.0017510892357677221, + "learning_rate": 1.2768838477094876e-05, + "loss": 0.0001, + "step": 16134 + }, + { + "epoch": 4.256430550059359, + "grad_norm": 0.006779746152460575, + "learning_rate": 1.2767666109792199e-05, + "loss": 0.0001, + "step": 16136 + }, + { + "epoch": 4.256958184936025, + "grad_norm": 0.01259431429207325, + "learning_rate": 1.2766493742489523e-05, + "loss": 0.0015, + "step": 16138 + }, + { + "epoch": 4.25748581981269, + "grad_norm": 0.05973977595567703, + "learning_rate": 1.2765321375186848e-05, + "loss": 0.0005, + "step": 16140 + }, + { + "epoch": 4.258013454689355, + "grad_norm": 0.0020020189695060253, + "learning_rate": 1.276414900788417e-05, + "loss": 0.0001, + "step": 16142 + }, + { + "epoch": 4.2585410895660205, + "grad_norm": 0.0314338356256485, + "learning_rate": 1.2762976640581494e-05, + "loss": 0.0001, + "step": 16144 + }, + { + "epoch": 4.2590687244426855, + "grad_norm": 0.04699266701936722, + "learning_rate": 1.2761804273278819e-05, + "loss": 0.0002, + "step": 16146 + }, + { + "epoch": 4.259596359319351, + "grad_norm": 0.0015107267536222935, + "learning_rate": 1.2760631905976145e-05, + "loss": 0.0, + "step": 16148 + }, + { + "epoch": 4.260123994196016, + "grad_norm": 0.004791730549186468, + "learning_rate": 1.2759459538673467e-05, + "loss": 0.0001, + "step": 16150 + }, + { + "epoch": 4.260651629072681, + "grad_norm": 0.007208667229861021, + "learning_rate": 1.2758287171370792e-05, + "loss": 0.0002, + "step": 16152 + }, + { + "epoch": 4.261179263949347, + "grad_norm": 0.0029295149724930525, + "learning_rate": 1.2757114804068116e-05, + "loss": 0.0001, + "step": 16154 + }, + { + "epoch": 4.261706898826012, + "grad_norm": 0.014263274148106575, + "learning_rate": 1.275594243676544e-05, + "loss": 0.0001, + "step": 16156 + }, + { + "epoch": 4.262234533702678, + "grad_norm": 0.0011811167933046818, + "learning_rate": 1.2754770069462763e-05, + "loss": 0.0, + "step": 16158 + }, + { + "epoch": 4.262762168579343, + "grad_norm": 0.001975195948034525, + "learning_rate": 1.2753597702160087e-05, + "loss": 0.0001, + "step": 16160 + }, + { + "epoch": 4.263289803456009, + "grad_norm": 0.12004264444112778, + "learning_rate": 1.2752425334857412e-05, + "loss": 0.0014, + "step": 16162 + }, + { + "epoch": 4.263817438332674, + "grad_norm": 0.002285122638568282, + "learning_rate": 1.2751252967554738e-05, + "loss": 0.0001, + "step": 16164 + }, + { + "epoch": 4.2643450732093395, + "grad_norm": 0.0007249966729432344, + "learning_rate": 1.275008060025206e-05, + "loss": 0.0001, + "step": 16166 + }, + { + "epoch": 4.2648727080860045, + "grad_norm": 0.0006582079804502428, + "learning_rate": 1.2748908232949384e-05, + "loss": 0.0001, + "step": 16168 + }, + { + "epoch": 4.2654003429626695, + "grad_norm": 0.013743549585342407, + "learning_rate": 1.2747735865646709e-05, + "loss": 0.0007, + "step": 16170 + }, + { + "epoch": 4.265927977839335, + "grad_norm": 0.03853803873062134, + "learning_rate": 1.2746563498344031e-05, + "loss": 0.0001, + "step": 16172 + }, + { + "epoch": 4.266455612716, + "grad_norm": 0.001438034581951797, + "learning_rate": 1.2745391131041356e-05, + "loss": 0.0, + "step": 16174 + }, + { + "epoch": 4.266983247592666, + "grad_norm": 0.02631726674735546, + "learning_rate": 1.274421876373868e-05, + "loss": 0.0016, + "step": 16176 + }, + { + "epoch": 4.267510882469331, + "grad_norm": 0.11805441230535507, + "learning_rate": 1.2743046396436006e-05, + "loss": 0.0004, + "step": 16178 + }, + { + "epoch": 4.268038517345997, + "grad_norm": 0.007701314054429531, + "learning_rate": 1.2741874029133327e-05, + "loss": 0.0001, + "step": 16180 + }, + { + "epoch": 4.268566152222662, + "grad_norm": 0.0026588812470436096, + "learning_rate": 1.2740701661830653e-05, + "loss": 0.0001, + "step": 16182 + }, + { + "epoch": 4.269093787099327, + "grad_norm": 0.013690237887203693, + "learning_rate": 1.2739529294527977e-05, + "loss": 0.0001, + "step": 16184 + }, + { + "epoch": 4.269621421975993, + "grad_norm": 0.001870851032435894, + "learning_rate": 1.2738356927225301e-05, + "loss": 0.0, + "step": 16186 + }, + { + "epoch": 4.270149056852658, + "grad_norm": 0.0048044538125395775, + "learning_rate": 1.2737184559922624e-05, + "loss": 0.0001, + "step": 16188 + }, + { + "epoch": 4.2706766917293235, + "grad_norm": 0.04635639488697052, + "learning_rate": 1.2736012192619948e-05, + "loss": 0.0001, + "step": 16190 + }, + { + "epoch": 4.2712043266059885, + "grad_norm": 0.0022115539759397507, + "learning_rate": 1.2734839825317273e-05, + "loss": 0.0, + "step": 16192 + }, + { + "epoch": 4.271731961482654, + "grad_norm": 0.06869552284479141, + "learning_rate": 1.2733667458014599e-05, + "loss": 0.0003, + "step": 16194 + }, + { + "epoch": 4.272259596359319, + "grad_norm": 0.011104350909590721, + "learning_rate": 1.2732495090711921e-05, + "loss": 0.0001, + "step": 16196 + }, + { + "epoch": 4.272787231235984, + "grad_norm": 0.0009204825619235635, + "learning_rate": 1.2731322723409246e-05, + "loss": 0.0, + "step": 16198 + }, + { + "epoch": 4.27331486611265, + "grad_norm": 0.4945656955242157, + "learning_rate": 1.273015035610657e-05, + "loss": 0.0012, + "step": 16200 + }, + { + "epoch": 4.273842500989315, + "grad_norm": 0.01113305427134037, + "learning_rate": 1.2728977988803893e-05, + "loss": 0.0001, + "step": 16202 + }, + { + "epoch": 4.274370135865981, + "grad_norm": 0.001302440301515162, + "learning_rate": 1.2727805621501217e-05, + "loss": 0.0009, + "step": 16204 + }, + { + "epoch": 4.274897770742646, + "grad_norm": 0.013020270504057407, + "learning_rate": 1.2726633254198541e-05, + "loss": 0.0017, + "step": 16206 + }, + { + "epoch": 4.275425405619312, + "grad_norm": 0.001384008675813675, + "learning_rate": 1.2725460886895865e-05, + "loss": 0.0009, + "step": 16208 + }, + { + "epoch": 4.275953040495977, + "grad_norm": 0.0007057594484649599, + "learning_rate": 1.2724288519593188e-05, + "loss": 0.0001, + "step": 16210 + }, + { + "epoch": 4.2764806753726425, + "grad_norm": 0.04192199185490608, + "learning_rate": 1.2723116152290514e-05, + "loss": 0.0001, + "step": 16212 + }, + { + "epoch": 4.2770083102493075, + "grad_norm": 0.0006844938616268337, + "learning_rate": 1.2721943784987838e-05, + "loss": 0.0, + "step": 16214 + }, + { + "epoch": 4.2775359451259725, + "grad_norm": 0.020414330065250397, + "learning_rate": 1.2720771417685163e-05, + "loss": 0.0001, + "step": 16216 + }, + { + "epoch": 4.278063580002638, + "grad_norm": 0.0004601288528647274, + "learning_rate": 1.2719599050382485e-05, + "loss": 0.0, + "step": 16218 + }, + { + "epoch": 4.278591214879303, + "grad_norm": 0.008360051549971104, + "learning_rate": 1.271842668307981e-05, + "loss": 0.0, + "step": 16220 + }, + { + "epoch": 4.279118849755969, + "grad_norm": 0.0007286676554940641, + "learning_rate": 1.2717254315777134e-05, + "loss": 0.0, + "step": 16222 + }, + { + "epoch": 4.279646484632634, + "grad_norm": 0.002138698473572731, + "learning_rate": 1.271608194847446e-05, + "loss": 0.0001, + "step": 16224 + }, + { + "epoch": 4.2801741195093, + "grad_norm": 0.004397241864353418, + "learning_rate": 1.271490958117178e-05, + "loss": 0.0006, + "step": 16226 + }, + { + "epoch": 4.280701754385965, + "grad_norm": 0.00024928717175498605, + "learning_rate": 1.2713737213869107e-05, + "loss": 0.0028, + "step": 16228 + }, + { + "epoch": 4.28122938926263, + "grad_norm": 0.007929799146950245, + "learning_rate": 1.2712564846566431e-05, + "loss": 0.0, + "step": 16230 + }, + { + "epoch": 4.281757024139296, + "grad_norm": 0.005223092623054981, + "learning_rate": 1.2711392479263754e-05, + "loss": 0.0005, + "step": 16232 + }, + { + "epoch": 4.282284659015961, + "grad_norm": 0.007431755308061838, + "learning_rate": 1.2710220111961078e-05, + "loss": 0.0007, + "step": 16234 + }, + { + "epoch": 4.2828122938926265, + "grad_norm": 0.015044864267110825, + "learning_rate": 1.2709047744658402e-05, + "loss": 0.0001, + "step": 16236 + }, + { + "epoch": 4.2833399287692915, + "grad_norm": 0.0004175734065938741, + "learning_rate": 1.2707875377355727e-05, + "loss": 0.0001, + "step": 16238 + }, + { + "epoch": 4.283867563645957, + "grad_norm": 0.001789992325939238, + "learning_rate": 1.270670301005305e-05, + "loss": 0.0, + "step": 16240 + }, + { + "epoch": 4.284395198522622, + "grad_norm": 0.0008133240044116974, + "learning_rate": 1.2705530642750375e-05, + "loss": 0.0, + "step": 16242 + }, + { + "epoch": 4.284922833399287, + "grad_norm": 0.13188156485557556, + "learning_rate": 1.27043582754477e-05, + "loss": 0.0005, + "step": 16244 + }, + { + "epoch": 4.285450468275953, + "grad_norm": 0.0028459886088967323, + "learning_rate": 1.2703185908145024e-05, + "loss": 0.0, + "step": 16246 + }, + { + "epoch": 4.285978103152618, + "grad_norm": 0.150262713432312, + "learning_rate": 1.2702013540842346e-05, + "loss": 0.0064, + "step": 16248 + }, + { + "epoch": 4.286505738029284, + "grad_norm": 0.0008036138606257737, + "learning_rate": 1.270084117353967e-05, + "loss": 0.0054, + "step": 16250 + }, + { + "epoch": 4.287033372905949, + "grad_norm": 0.0028453944250941277, + "learning_rate": 1.2699668806236995e-05, + "loss": 0.0013, + "step": 16252 + }, + { + "epoch": 4.287561007782615, + "grad_norm": 0.0023659595753997564, + "learning_rate": 1.269849643893432e-05, + "loss": 0.0001, + "step": 16254 + }, + { + "epoch": 4.28808864265928, + "grad_norm": 0.002073482610285282, + "learning_rate": 1.2697324071631642e-05, + "loss": 0.0001, + "step": 16256 + }, + { + "epoch": 4.2886162775359455, + "grad_norm": 0.0018242208752781153, + "learning_rate": 1.2696151704328968e-05, + "loss": 0.0001, + "step": 16258 + }, + { + "epoch": 4.2891439124126105, + "grad_norm": 0.002069185022264719, + "learning_rate": 1.2694979337026292e-05, + "loss": 0.0001, + "step": 16260 + }, + { + "epoch": 4.2896715472892755, + "grad_norm": 0.34239494800567627, + "learning_rate": 1.2693806969723615e-05, + "loss": 0.0027, + "step": 16262 + }, + { + "epoch": 4.290199182165941, + "grad_norm": 0.003018076764419675, + "learning_rate": 1.2692634602420939e-05, + "loss": 0.0011, + "step": 16264 + }, + { + "epoch": 4.290726817042606, + "grad_norm": 0.0015718488721176982, + "learning_rate": 1.2691462235118263e-05, + "loss": 0.0001, + "step": 16266 + }, + { + "epoch": 4.291254451919272, + "grad_norm": 0.008593758568167686, + "learning_rate": 1.2690289867815588e-05, + "loss": 0.0001, + "step": 16268 + }, + { + "epoch": 4.291782086795937, + "grad_norm": 0.008449259214103222, + "learning_rate": 1.268911750051291e-05, + "loss": 0.0015, + "step": 16270 + }, + { + "epoch": 4.292309721672603, + "grad_norm": 0.0037150925491005182, + "learning_rate": 1.2687945133210235e-05, + "loss": 0.0001, + "step": 16272 + }, + { + "epoch": 4.292837356549268, + "grad_norm": 0.012653865851461887, + "learning_rate": 1.268677276590756e-05, + "loss": 0.0001, + "step": 16274 + }, + { + "epoch": 4.293364991425933, + "grad_norm": 0.0029602847062051296, + "learning_rate": 1.2685600398604885e-05, + "loss": 0.0001, + "step": 16276 + }, + { + "epoch": 4.293892626302599, + "grad_norm": 0.34393447637557983, + "learning_rate": 1.2684428031302208e-05, + "loss": 0.0067, + "step": 16278 + }, + { + "epoch": 4.294420261179264, + "grad_norm": 0.21612541377544403, + "learning_rate": 1.2683255663999532e-05, + "loss": 0.007, + "step": 16280 + }, + { + "epoch": 4.2949478960559295, + "grad_norm": 0.11637351661920547, + "learning_rate": 1.2682083296696856e-05, + "loss": 0.0051, + "step": 16282 + }, + { + "epoch": 4.2954755309325945, + "grad_norm": 0.028560953214764595, + "learning_rate": 1.268091092939418e-05, + "loss": 0.0002, + "step": 16284 + }, + { + "epoch": 4.29600316580926, + "grad_norm": 0.019480645656585693, + "learning_rate": 1.2679738562091503e-05, + "loss": 0.0004, + "step": 16286 + }, + { + "epoch": 4.296530800685925, + "grad_norm": 0.00864739902317524, + "learning_rate": 1.2678566194788829e-05, + "loss": 0.0001, + "step": 16288 + }, + { + "epoch": 4.29705843556259, + "grad_norm": 0.2783288359642029, + "learning_rate": 1.2677393827486153e-05, + "loss": 0.0035, + "step": 16290 + }, + { + "epoch": 4.297586070439256, + "grad_norm": 0.2572425305843353, + "learning_rate": 1.2676221460183476e-05, + "loss": 0.0056, + "step": 16292 + }, + { + "epoch": 4.298113705315921, + "grad_norm": 0.008424678817391396, + "learning_rate": 1.26750490928808e-05, + "loss": 0.0001, + "step": 16294 + }, + { + "epoch": 4.298641340192587, + "grad_norm": 0.03624173626303673, + "learning_rate": 1.2673876725578125e-05, + "loss": 0.0002, + "step": 16296 + }, + { + "epoch": 4.299168975069252, + "grad_norm": 0.007149527780711651, + "learning_rate": 1.2672704358275449e-05, + "loss": 0.0002, + "step": 16298 + }, + { + "epoch": 4.299696609945918, + "grad_norm": 0.007057357579469681, + "learning_rate": 1.2671531990972772e-05, + "loss": 0.0004, + "step": 16300 + }, + { + "epoch": 4.300224244822583, + "grad_norm": 0.08096778392791748, + "learning_rate": 1.2670359623670096e-05, + "loss": 0.0002, + "step": 16302 + }, + { + "epoch": 4.3007518796992485, + "grad_norm": 0.0028427885845303535, + "learning_rate": 1.2669187256367422e-05, + "loss": 0.0007, + "step": 16304 + }, + { + "epoch": 4.3012795145759135, + "grad_norm": 0.01840594969689846, + "learning_rate": 1.2668014889064746e-05, + "loss": 0.0034, + "step": 16306 + }, + { + "epoch": 4.3018071494525785, + "grad_norm": 0.0041985884308815, + "learning_rate": 1.2666842521762069e-05, + "loss": 0.0001, + "step": 16308 + }, + { + "epoch": 4.302334784329244, + "grad_norm": 0.02285381592810154, + "learning_rate": 1.2665670154459393e-05, + "loss": 0.0001, + "step": 16310 + }, + { + "epoch": 4.302862419205909, + "grad_norm": 0.0017753519350662827, + "learning_rate": 1.2664497787156717e-05, + "loss": 0.0001, + "step": 16312 + }, + { + "epoch": 4.303390054082575, + "grad_norm": 0.03464468568563461, + "learning_rate": 1.2663325419854042e-05, + "loss": 0.002, + "step": 16314 + }, + { + "epoch": 4.30391768895924, + "grad_norm": 0.008633697405457497, + "learning_rate": 1.2662153052551364e-05, + "loss": 0.0013, + "step": 16316 + }, + { + "epoch": 4.304445323835906, + "grad_norm": 0.0009529105736874044, + "learning_rate": 1.2660980685248689e-05, + "loss": 0.0001, + "step": 16318 + }, + { + "epoch": 4.304972958712571, + "grad_norm": 0.002023228444159031, + "learning_rate": 1.2659808317946015e-05, + "loss": 0.0001, + "step": 16320 + }, + { + "epoch": 4.305500593589236, + "grad_norm": 0.006983259227126837, + "learning_rate": 1.2658635950643337e-05, + "loss": 0.0001, + "step": 16322 + }, + { + "epoch": 4.306028228465902, + "grad_norm": 0.008717787452042103, + "learning_rate": 1.2657463583340661e-05, + "loss": 0.0002, + "step": 16324 + }, + { + "epoch": 4.306555863342567, + "grad_norm": 0.001836817362345755, + "learning_rate": 1.2656291216037986e-05, + "loss": 0.0001, + "step": 16326 + }, + { + "epoch": 4.3070834982192325, + "grad_norm": 0.004542944487184286, + "learning_rate": 1.265511884873531e-05, + "loss": 0.0001, + "step": 16328 + }, + { + "epoch": 4.3076111330958975, + "grad_norm": 0.008612029254436493, + "learning_rate": 1.2653946481432633e-05, + "loss": 0.0002, + "step": 16330 + }, + { + "epoch": 4.308138767972563, + "grad_norm": 0.004438708070665598, + "learning_rate": 1.2652774114129957e-05, + "loss": 0.0035, + "step": 16332 + }, + { + "epoch": 4.308666402849228, + "grad_norm": 0.011475911363959312, + "learning_rate": 1.2651601746827283e-05, + "loss": 0.0044, + "step": 16334 + }, + { + "epoch": 4.309194037725893, + "grad_norm": 0.01258676964789629, + "learning_rate": 1.2650429379524607e-05, + "loss": 0.0001, + "step": 16336 + }, + { + "epoch": 4.309721672602559, + "grad_norm": 0.003663057927042246, + "learning_rate": 1.264925701222193e-05, + "loss": 0.0001, + "step": 16338 + }, + { + "epoch": 4.310249307479224, + "grad_norm": 0.18511749804019928, + "learning_rate": 1.2648084644919254e-05, + "loss": 0.0056, + "step": 16340 + }, + { + "epoch": 4.31077694235589, + "grad_norm": 0.004304013680666685, + "learning_rate": 1.2646912277616579e-05, + "loss": 0.0002, + "step": 16342 + }, + { + "epoch": 4.311304577232555, + "grad_norm": 0.0051460592076182365, + "learning_rate": 1.2645739910313903e-05, + "loss": 0.0014, + "step": 16344 + }, + { + "epoch": 4.311832212109221, + "grad_norm": 0.13038672506809235, + "learning_rate": 1.2644567543011225e-05, + "loss": 0.0005, + "step": 16346 + }, + { + "epoch": 4.312359846985886, + "grad_norm": 0.007402149960398674, + "learning_rate": 1.264339517570855e-05, + "loss": 0.0001, + "step": 16348 + }, + { + "epoch": 4.3128874818625516, + "grad_norm": 0.008007209748029709, + "learning_rate": 1.2642222808405876e-05, + "loss": 0.0003, + "step": 16350 + }, + { + "epoch": 4.3134151167392165, + "grad_norm": 0.014766646549105644, + "learning_rate": 1.2641050441103198e-05, + "loss": 0.0003, + "step": 16352 + }, + { + "epoch": 4.3139427516158815, + "grad_norm": 0.003026959951967001, + "learning_rate": 1.2639878073800523e-05, + "loss": 0.0001, + "step": 16354 + }, + { + "epoch": 4.314470386492547, + "grad_norm": 0.004982261452823877, + "learning_rate": 1.2638705706497847e-05, + "loss": 0.0001, + "step": 16356 + }, + { + "epoch": 4.314998021369212, + "grad_norm": 0.002488305326551199, + "learning_rate": 1.2637533339195171e-05, + "loss": 0.0001, + "step": 16358 + }, + { + "epoch": 4.315525656245878, + "grad_norm": 0.00296875205822289, + "learning_rate": 1.2636360971892494e-05, + "loss": 0.0001, + "step": 16360 + }, + { + "epoch": 4.316053291122543, + "grad_norm": 0.0015387928579002619, + "learning_rate": 1.2635188604589818e-05, + "loss": 0.0048, + "step": 16362 + }, + { + "epoch": 4.316580925999209, + "grad_norm": 0.07206343114376068, + "learning_rate": 1.2634016237287144e-05, + "loss": 0.0009, + "step": 16364 + }, + { + "epoch": 4.317108560875874, + "grad_norm": 0.029301071539521217, + "learning_rate": 1.2632843869984468e-05, + "loss": 0.0002, + "step": 16366 + }, + { + "epoch": 4.317636195752539, + "grad_norm": 0.0022865182254463434, + "learning_rate": 1.2631671502681791e-05, + "loss": 0.0005, + "step": 16368 + }, + { + "epoch": 4.318163830629205, + "grad_norm": 0.029646312817931175, + "learning_rate": 1.2630499135379115e-05, + "loss": 0.0024, + "step": 16370 + }, + { + "epoch": 4.31869146550587, + "grad_norm": 0.017193568870425224, + "learning_rate": 1.262932676807644e-05, + "loss": 0.0002, + "step": 16372 + }, + { + "epoch": 4.3192191003825355, + "grad_norm": 0.21449431777000427, + "learning_rate": 1.2628154400773764e-05, + "loss": 0.0047, + "step": 16374 + }, + { + "epoch": 4.3197467352592005, + "grad_norm": 0.04160043224692345, + "learning_rate": 1.2626982033471087e-05, + "loss": 0.0002, + "step": 16376 + }, + { + "epoch": 4.320274370135866, + "grad_norm": 0.005521779414266348, + "learning_rate": 1.2625809666168411e-05, + "loss": 0.0015, + "step": 16378 + }, + { + "epoch": 4.320802005012531, + "grad_norm": 0.08554708957672119, + "learning_rate": 1.2624637298865737e-05, + "loss": 0.0009, + "step": 16380 + }, + { + "epoch": 4.321329639889196, + "grad_norm": 0.0009445333853363991, + "learning_rate": 1.2623464931563058e-05, + "loss": 0.0001, + "step": 16382 + }, + { + "epoch": 4.321857274765862, + "grad_norm": 0.01791195571422577, + "learning_rate": 1.2622292564260384e-05, + "loss": 0.0002, + "step": 16384 + }, + { + "epoch": 4.322384909642527, + "grad_norm": 0.016730323433876038, + "learning_rate": 1.2621120196957708e-05, + "loss": 0.0004, + "step": 16386 + }, + { + "epoch": 4.322912544519193, + "grad_norm": 0.0012660876382142305, + "learning_rate": 1.2619947829655032e-05, + "loss": 0.0119, + "step": 16388 + }, + { + "epoch": 4.323440179395858, + "grad_norm": 0.03501764312386513, + "learning_rate": 1.2618775462352355e-05, + "loss": 0.0002, + "step": 16390 + }, + { + "epoch": 4.323967814272524, + "grad_norm": 0.005710313096642494, + "learning_rate": 1.261760309504968e-05, + "loss": 0.0097, + "step": 16392 + }, + { + "epoch": 4.324495449149189, + "grad_norm": 0.005708620417863131, + "learning_rate": 1.2616430727747004e-05, + "loss": 0.0001, + "step": 16394 + }, + { + "epoch": 4.325023084025855, + "grad_norm": 0.0661827027797699, + "learning_rate": 1.261525836044433e-05, + "loss": 0.0008, + "step": 16396 + }, + { + "epoch": 4.3255507189025195, + "grad_norm": 0.04769759625196457, + "learning_rate": 1.2614085993141652e-05, + "loss": 0.007, + "step": 16398 + }, + { + "epoch": 4.3260783537791845, + "grad_norm": 0.037915147840976715, + "learning_rate": 1.2612913625838977e-05, + "loss": 0.0009, + "step": 16400 + }, + { + "epoch": 4.32660598865585, + "grad_norm": 0.19338782131671906, + "learning_rate": 1.2611741258536301e-05, + "loss": 0.0009, + "step": 16402 + }, + { + "epoch": 4.327133623532515, + "grad_norm": 0.009552828036248684, + "learning_rate": 1.2610568891233625e-05, + "loss": 0.0003, + "step": 16404 + }, + { + "epoch": 4.327661258409181, + "grad_norm": 0.013166277669370174, + "learning_rate": 1.2609396523930948e-05, + "loss": 0.0002, + "step": 16406 + }, + { + "epoch": 4.328188893285846, + "grad_norm": 0.013497265055775642, + "learning_rate": 1.2608224156628272e-05, + "loss": 0.0002, + "step": 16408 + }, + { + "epoch": 4.328716528162512, + "grad_norm": 0.016285352408885956, + "learning_rate": 1.2607051789325598e-05, + "loss": 0.0002, + "step": 16410 + }, + { + "epoch": 4.329244163039177, + "grad_norm": 0.004054031800478697, + "learning_rate": 1.2605879422022922e-05, + "loss": 0.0007, + "step": 16412 + }, + { + "epoch": 4.329771797915842, + "grad_norm": 0.03641070798039436, + "learning_rate": 1.2604707054720245e-05, + "loss": 0.0002, + "step": 16414 + }, + { + "epoch": 4.330299432792508, + "grad_norm": 0.016026681289076805, + "learning_rate": 1.260353468741757e-05, + "loss": 0.0003, + "step": 16416 + }, + { + "epoch": 4.330827067669173, + "grad_norm": 0.0026840155478566885, + "learning_rate": 1.2602362320114894e-05, + "loss": 0.0001, + "step": 16418 + }, + { + "epoch": 4.3313547025458385, + "grad_norm": 0.004889281466603279, + "learning_rate": 1.2601189952812216e-05, + "loss": 0.0003, + "step": 16420 + }, + { + "epoch": 4.3318823374225035, + "grad_norm": 0.10416630655527115, + "learning_rate": 1.260001758550954e-05, + "loss": 0.0005, + "step": 16422 + }, + { + "epoch": 4.332409972299169, + "grad_norm": 0.005025269463658333, + "learning_rate": 1.2598845218206865e-05, + "loss": 0.0001, + "step": 16424 + }, + { + "epoch": 4.332937607175834, + "grad_norm": 0.0038575499784201384, + "learning_rate": 1.259767285090419e-05, + "loss": 0.0001, + "step": 16426 + }, + { + "epoch": 4.333465242052499, + "grad_norm": 0.0027150188107043505, + "learning_rate": 1.2596500483601512e-05, + "loss": 0.0002, + "step": 16428 + }, + { + "epoch": 4.333992876929165, + "grad_norm": 0.12891899049282074, + "learning_rate": 1.2595328116298838e-05, + "loss": 0.0033, + "step": 16430 + }, + { + "epoch": 4.33452051180583, + "grad_norm": 0.005280107259750366, + "learning_rate": 1.2594155748996162e-05, + "loss": 0.0059, + "step": 16432 + }, + { + "epoch": 4.335048146682496, + "grad_norm": 0.005467228125780821, + "learning_rate": 1.2592983381693486e-05, + "loss": 0.0065, + "step": 16434 + }, + { + "epoch": 4.335575781559161, + "grad_norm": 0.006659385748207569, + "learning_rate": 1.2591811014390809e-05, + "loss": 0.0019, + "step": 16436 + }, + { + "epoch": 4.336103416435827, + "grad_norm": 0.005516672972589731, + "learning_rate": 1.2590638647088133e-05, + "loss": 0.0001, + "step": 16438 + }, + { + "epoch": 4.336631051312492, + "grad_norm": 0.008924138732254505, + "learning_rate": 1.2589466279785458e-05, + "loss": 0.0002, + "step": 16440 + }, + { + "epoch": 4.337158686189158, + "grad_norm": 0.007944897748529911, + "learning_rate": 1.2588293912482784e-05, + "loss": 0.0001, + "step": 16442 + }, + { + "epoch": 4.3376863210658225, + "grad_norm": 0.016812989488244057, + "learning_rate": 1.2587121545180106e-05, + "loss": 0.0002, + "step": 16444 + }, + { + "epoch": 4.3382139559424875, + "grad_norm": 0.029099885374307632, + "learning_rate": 1.258594917787743e-05, + "loss": 0.0039, + "step": 16446 + }, + { + "epoch": 4.338741590819153, + "grad_norm": 0.053333014249801636, + "learning_rate": 1.2584776810574755e-05, + "loss": 0.0002, + "step": 16448 + }, + { + "epoch": 4.339269225695818, + "grad_norm": 0.07230465859174728, + "learning_rate": 1.2583604443272077e-05, + "loss": 0.0003, + "step": 16450 + }, + { + "epoch": 4.339796860572484, + "grad_norm": 0.011719915084540844, + "learning_rate": 1.2582432075969402e-05, + "loss": 0.0002, + "step": 16452 + }, + { + "epoch": 4.340324495449149, + "grad_norm": 0.23644722998142242, + "learning_rate": 1.2581259708666726e-05, + "loss": 0.0004, + "step": 16454 + }, + { + "epoch": 4.340852130325814, + "grad_norm": 0.13957349956035614, + "learning_rate": 1.2580087341364052e-05, + "loss": 0.0029, + "step": 16456 + }, + { + "epoch": 4.34137976520248, + "grad_norm": 0.002380247227847576, + "learning_rate": 1.2578914974061373e-05, + "loss": 0.0001, + "step": 16458 + }, + { + "epoch": 4.341907400079145, + "grad_norm": 0.021458573639392853, + "learning_rate": 1.2577742606758699e-05, + "loss": 0.0002, + "step": 16460 + }, + { + "epoch": 4.342435034955811, + "grad_norm": 0.006074420642107725, + "learning_rate": 1.2576570239456023e-05, + "loss": 0.005, + "step": 16462 + }, + { + "epoch": 4.342962669832476, + "grad_norm": 0.006319534033536911, + "learning_rate": 1.2575397872153348e-05, + "loss": 0.0001, + "step": 16464 + }, + { + "epoch": 4.3434903047091415, + "grad_norm": 0.006781982257962227, + "learning_rate": 1.257422550485067e-05, + "loss": 0.0002, + "step": 16466 + }, + { + "epoch": 4.3440179395858065, + "grad_norm": 0.08398887515068054, + "learning_rate": 1.2573053137547994e-05, + "loss": 0.0002, + "step": 16468 + }, + { + "epoch": 4.344545574462472, + "grad_norm": 0.0028994502499699593, + "learning_rate": 1.2571880770245319e-05, + "loss": 0.0009, + "step": 16470 + }, + { + "epoch": 4.345073209339137, + "grad_norm": 0.004891007207334042, + "learning_rate": 1.2570708402942645e-05, + "loss": 0.0029, + "step": 16472 + }, + { + "epoch": 4.345600844215802, + "grad_norm": 0.0011011603055521846, + "learning_rate": 1.2569536035639967e-05, + "loss": 0.0001, + "step": 16474 + }, + { + "epoch": 4.346128479092468, + "grad_norm": 0.0018791407346725464, + "learning_rate": 1.2568363668337292e-05, + "loss": 0.0001, + "step": 16476 + }, + { + "epoch": 4.346656113969133, + "grad_norm": 0.0008103107684291899, + "learning_rate": 1.2567191301034616e-05, + "loss": 0.0001, + "step": 16478 + }, + { + "epoch": 4.347183748845799, + "grad_norm": 0.001412118668667972, + "learning_rate": 1.2566018933731939e-05, + "loss": 0.0001, + "step": 16480 + }, + { + "epoch": 4.347711383722464, + "grad_norm": 0.0037910600658506155, + "learning_rate": 1.2564846566429263e-05, + "loss": 0.0001, + "step": 16482 + }, + { + "epoch": 4.34823901859913, + "grad_norm": 0.0287577286362648, + "learning_rate": 1.2563674199126587e-05, + "loss": 0.0002, + "step": 16484 + }, + { + "epoch": 4.348766653475795, + "grad_norm": 0.003706831019371748, + "learning_rate": 1.2562501831823911e-05, + "loss": 0.0011, + "step": 16486 + }, + { + "epoch": 4.349294288352461, + "grad_norm": 0.07087112963199615, + "learning_rate": 1.2561329464521234e-05, + "loss": 0.0002, + "step": 16488 + }, + { + "epoch": 4.3498219232291255, + "grad_norm": 0.002353292889893055, + "learning_rate": 1.256015709721856e-05, + "loss": 0.0003, + "step": 16490 + }, + { + "epoch": 4.3503495581057905, + "grad_norm": 0.030998792499303818, + "learning_rate": 1.2558984729915884e-05, + "loss": 0.0026, + "step": 16492 + }, + { + "epoch": 4.350877192982456, + "grad_norm": 0.0020111198537051678, + "learning_rate": 1.2557812362613209e-05, + "loss": 0.0, + "step": 16494 + }, + { + "epoch": 4.351404827859121, + "grad_norm": 0.0013221576809883118, + "learning_rate": 1.2556639995310531e-05, + "loss": 0.0001, + "step": 16496 + }, + { + "epoch": 4.351932462735787, + "grad_norm": 0.044753096997737885, + "learning_rate": 1.2555467628007856e-05, + "loss": 0.0011, + "step": 16498 + }, + { + "epoch": 4.352460097612452, + "grad_norm": 0.014686549082398415, + "learning_rate": 1.255429526070518e-05, + "loss": 0.0043, + "step": 16500 + }, + { + "epoch": 4.352987732489117, + "grad_norm": 0.003980942070484161, + "learning_rate": 1.2553122893402506e-05, + "loss": 0.0001, + "step": 16502 + }, + { + "epoch": 4.353515367365783, + "grad_norm": 0.0019878866150975227, + "learning_rate": 1.2551950526099827e-05, + "loss": 0.0017, + "step": 16504 + }, + { + "epoch": 4.354043002242448, + "grad_norm": 0.00917685218155384, + "learning_rate": 1.2550778158797153e-05, + "loss": 0.0001, + "step": 16506 + }, + { + "epoch": 4.354570637119114, + "grad_norm": 0.07211722433567047, + "learning_rate": 1.2549605791494477e-05, + "loss": 0.0003, + "step": 16508 + }, + { + "epoch": 4.355098271995779, + "grad_norm": 0.003165674163028598, + "learning_rate": 1.25484334241918e-05, + "loss": 0.0046, + "step": 16510 + }, + { + "epoch": 4.3556259068724446, + "grad_norm": 0.006146665196865797, + "learning_rate": 1.2547261056889124e-05, + "loss": 0.0004, + "step": 16512 + }, + { + "epoch": 4.3561535417491095, + "grad_norm": 0.014602693729102612, + "learning_rate": 1.2546088689586448e-05, + "loss": 0.0022, + "step": 16514 + }, + { + "epoch": 4.356681176625775, + "grad_norm": 0.0024765145499259233, + "learning_rate": 1.2544916322283773e-05, + "loss": 0.0002, + "step": 16516 + }, + { + "epoch": 4.35720881150244, + "grad_norm": 0.0064014168456196785, + "learning_rate": 1.2543743954981095e-05, + "loss": 0.0003, + "step": 16518 + }, + { + "epoch": 4.357736446379105, + "grad_norm": 0.0043909368105232716, + "learning_rate": 1.2542571587678421e-05, + "loss": 0.0001, + "step": 16520 + }, + { + "epoch": 4.358264081255771, + "grad_norm": 0.016086257994174957, + "learning_rate": 1.2541399220375746e-05, + "loss": 0.0004, + "step": 16522 + }, + { + "epoch": 4.358791716132436, + "grad_norm": 0.004913870245218277, + "learning_rate": 1.254022685307307e-05, + "loss": 0.0001, + "step": 16524 + }, + { + "epoch": 4.359319351009102, + "grad_norm": 0.017460009083151817, + "learning_rate": 1.2539054485770392e-05, + "loss": 0.0014, + "step": 16526 + }, + { + "epoch": 4.359846985885767, + "grad_norm": 0.003116914303973317, + "learning_rate": 1.2537882118467717e-05, + "loss": 0.0001, + "step": 16528 + }, + { + "epoch": 4.360374620762433, + "grad_norm": 0.007793054450303316, + "learning_rate": 1.2536709751165041e-05, + "loss": 0.0001, + "step": 16530 + }, + { + "epoch": 4.360902255639098, + "grad_norm": 0.00553861353546381, + "learning_rate": 1.2535537383862365e-05, + "loss": 0.0001, + "step": 16532 + }, + { + "epoch": 4.361429890515763, + "grad_norm": 0.0020341358613222837, + "learning_rate": 1.2534365016559688e-05, + "loss": 0.0, + "step": 16534 + }, + { + "epoch": 4.3619575253924285, + "grad_norm": 0.0076889172196388245, + "learning_rate": 1.2533192649257014e-05, + "loss": 0.0001, + "step": 16536 + }, + { + "epoch": 4.3624851602690935, + "grad_norm": 0.0027884419541805983, + "learning_rate": 1.2532020281954338e-05, + "loss": 0.0002, + "step": 16538 + }, + { + "epoch": 4.363012795145759, + "grad_norm": 0.009003886952996254, + "learning_rate": 1.2530847914651661e-05, + "loss": 0.0001, + "step": 16540 + }, + { + "epoch": 4.363540430022424, + "grad_norm": 0.0017165020108222961, + "learning_rate": 1.2529675547348985e-05, + "loss": 0.0, + "step": 16542 + }, + { + "epoch": 4.36406806489909, + "grad_norm": 0.008033297955989838, + "learning_rate": 1.252850318004631e-05, + "loss": 0.0001, + "step": 16544 + }, + { + "epoch": 4.364595699775755, + "grad_norm": 0.003176603000611067, + "learning_rate": 1.2527330812743634e-05, + "loss": 0.0003, + "step": 16546 + }, + { + "epoch": 4.36512333465242, + "grad_norm": 0.0011032193433493376, + "learning_rate": 1.2526158445440956e-05, + "loss": 0.0019, + "step": 16548 + }, + { + "epoch": 4.365650969529086, + "grad_norm": 0.11701949685811996, + "learning_rate": 1.252498607813828e-05, + "loss": 0.0044, + "step": 16550 + }, + { + "epoch": 4.366178604405751, + "grad_norm": 0.01121470145881176, + "learning_rate": 1.2523813710835607e-05, + "loss": 0.0001, + "step": 16552 + }, + { + "epoch": 4.366706239282417, + "grad_norm": 0.0022858555894345045, + "learning_rate": 1.2522641343532931e-05, + "loss": 0.0001, + "step": 16554 + }, + { + "epoch": 4.367233874159082, + "grad_norm": 0.0060573904775083065, + "learning_rate": 1.2521468976230254e-05, + "loss": 0.0001, + "step": 16556 + }, + { + "epoch": 4.367761509035748, + "grad_norm": 0.003957309760153294, + "learning_rate": 1.2520296608927578e-05, + "loss": 0.0001, + "step": 16558 + }, + { + "epoch": 4.3682891439124125, + "grad_norm": 0.04780803993344307, + "learning_rate": 1.2519124241624902e-05, + "loss": 0.0002, + "step": 16560 + }, + { + "epoch": 4.368816778789078, + "grad_norm": 0.003346038283780217, + "learning_rate": 1.2517951874322227e-05, + "loss": 0.0001, + "step": 16562 + }, + { + "epoch": 4.369344413665743, + "grad_norm": 0.08637099713087082, + "learning_rate": 1.2516779507019549e-05, + "loss": 0.0003, + "step": 16564 + }, + { + "epoch": 4.369872048542408, + "grad_norm": 0.000770020589698106, + "learning_rate": 1.2515607139716875e-05, + "loss": 0.0, + "step": 16566 + }, + { + "epoch": 4.370399683419074, + "grad_norm": 0.005130287725478411, + "learning_rate": 1.25144347724142e-05, + "loss": 0.0008, + "step": 16568 + }, + { + "epoch": 4.370927318295739, + "grad_norm": 0.018028879538178444, + "learning_rate": 1.2513262405111522e-05, + "loss": 0.0002, + "step": 16570 + }, + { + "epoch": 4.371454953172405, + "grad_norm": 0.0033185556530952454, + "learning_rate": 1.2512090037808846e-05, + "loss": 0.0001, + "step": 16572 + }, + { + "epoch": 4.37198258804907, + "grad_norm": 0.0007473429432138801, + "learning_rate": 1.251091767050617e-05, + "loss": 0.0013, + "step": 16574 + }, + { + "epoch": 4.372510222925736, + "grad_norm": 0.016092410311102867, + "learning_rate": 1.2509745303203495e-05, + "loss": 0.0001, + "step": 16576 + }, + { + "epoch": 4.373037857802401, + "grad_norm": 0.005129302851855755, + "learning_rate": 1.2508572935900818e-05, + "loss": 0.0001, + "step": 16578 + }, + { + "epoch": 4.373565492679066, + "grad_norm": 0.005626943428069353, + "learning_rate": 1.2507400568598142e-05, + "loss": 0.0001, + "step": 16580 + }, + { + "epoch": 4.3740931275557315, + "grad_norm": 0.0058740959502756596, + "learning_rate": 1.2506228201295468e-05, + "loss": 0.0001, + "step": 16582 + }, + { + "epoch": 4.3746207624323965, + "grad_norm": 0.0009643409866839647, + "learning_rate": 1.2505055833992792e-05, + "loss": 0.0, + "step": 16584 + }, + { + "epoch": 4.375148397309062, + "grad_norm": 0.005377665162086487, + "learning_rate": 1.2503883466690115e-05, + "loss": 0.0001, + "step": 16586 + }, + { + "epoch": 4.375676032185727, + "grad_norm": 0.004002218600362539, + "learning_rate": 1.2502711099387439e-05, + "loss": 0.0002, + "step": 16588 + }, + { + "epoch": 4.376203667062393, + "grad_norm": 0.15943823754787445, + "learning_rate": 1.2501538732084763e-05, + "loss": 0.0004, + "step": 16590 + }, + { + "epoch": 4.376731301939058, + "grad_norm": 0.0004960877122357488, + "learning_rate": 1.2500366364782088e-05, + "loss": 0.0, + "step": 16592 + }, + { + "epoch": 4.377258936815723, + "grad_norm": 0.002260599285364151, + "learning_rate": 1.249919399747941e-05, + "loss": 0.0002, + "step": 16594 + }, + { + "epoch": 4.377786571692389, + "grad_norm": 0.051909189671278, + "learning_rate": 1.2498021630176735e-05, + "loss": 0.0001, + "step": 16596 + }, + { + "epoch": 4.378314206569054, + "grad_norm": 0.04558038339018822, + "learning_rate": 1.249684926287406e-05, + "loss": 0.0004, + "step": 16598 + }, + { + "epoch": 4.37884184144572, + "grad_norm": 0.011716163717210293, + "learning_rate": 1.2495676895571383e-05, + "loss": 0.0001, + "step": 16600 + }, + { + "epoch": 4.379369476322385, + "grad_norm": 0.003117143642157316, + "learning_rate": 1.2494504528268708e-05, + "loss": 0.0001, + "step": 16602 + }, + { + "epoch": 4.379897111199051, + "grad_norm": 0.014421438798308372, + "learning_rate": 1.2493332160966032e-05, + "loss": 0.0001, + "step": 16604 + }, + { + "epoch": 4.3804247460757155, + "grad_norm": 0.0022172133903950453, + "learning_rate": 1.2492159793663356e-05, + "loss": 0.0045, + "step": 16606 + }, + { + "epoch": 4.380952380952381, + "grad_norm": 0.0020737210288643837, + "learning_rate": 1.2490987426360679e-05, + "loss": 0.0001, + "step": 16608 + }, + { + "epoch": 4.381480015829046, + "grad_norm": 0.04114295914769173, + "learning_rate": 1.2489815059058003e-05, + "loss": 0.0003, + "step": 16610 + }, + { + "epoch": 4.382007650705711, + "grad_norm": 0.003100705798715353, + "learning_rate": 1.2488642691755329e-05, + "loss": 0.0016, + "step": 16612 + }, + { + "epoch": 4.382535285582377, + "grad_norm": 0.0010450861882418394, + "learning_rate": 1.2487470324452653e-05, + "loss": 0.0, + "step": 16614 + }, + { + "epoch": 4.383062920459042, + "grad_norm": 0.014343170449137688, + "learning_rate": 1.2486297957149976e-05, + "loss": 0.0013, + "step": 16616 + }, + { + "epoch": 4.383590555335708, + "grad_norm": 0.00037335784872993827, + "learning_rate": 1.24851255898473e-05, + "loss": 0.0, + "step": 16618 + }, + { + "epoch": 4.384118190212373, + "grad_norm": 0.31569942831993103, + "learning_rate": 1.2483953222544625e-05, + "loss": 0.0028, + "step": 16620 + }, + { + "epoch": 4.384645825089039, + "grad_norm": 0.000908342597540468, + "learning_rate": 1.2482780855241949e-05, + "loss": 0.0, + "step": 16622 + }, + { + "epoch": 4.385173459965704, + "grad_norm": 0.01878373883664608, + "learning_rate": 1.2481608487939271e-05, + "loss": 0.0001, + "step": 16624 + }, + { + "epoch": 4.385701094842369, + "grad_norm": 0.0010402980260550976, + "learning_rate": 1.2480436120636596e-05, + "loss": 0.0, + "step": 16626 + }, + { + "epoch": 4.3862287297190345, + "grad_norm": 0.0012774895876646042, + "learning_rate": 1.2479263753333922e-05, + "loss": 0.0073, + "step": 16628 + }, + { + "epoch": 4.3867563645956995, + "grad_norm": 0.5679046511650085, + "learning_rate": 1.2478091386031244e-05, + "loss": 0.0033, + "step": 16630 + }, + { + "epoch": 4.387283999472365, + "grad_norm": 0.001912726555019617, + "learning_rate": 1.2476919018728569e-05, + "loss": 0.0022, + "step": 16632 + }, + { + "epoch": 4.38781163434903, + "grad_norm": 0.038386326283216476, + "learning_rate": 1.2475746651425893e-05, + "loss": 0.0034, + "step": 16634 + }, + { + "epoch": 4.388339269225696, + "grad_norm": 0.0019191765459254384, + "learning_rate": 1.2474574284123217e-05, + "loss": 0.0001, + "step": 16636 + }, + { + "epoch": 4.388866904102361, + "grad_norm": 0.479262113571167, + "learning_rate": 1.247340191682054e-05, + "loss": 0.003, + "step": 16638 + }, + { + "epoch": 4.389394538979026, + "grad_norm": 0.01563306525349617, + "learning_rate": 1.2472229549517864e-05, + "loss": 0.0001, + "step": 16640 + }, + { + "epoch": 4.389922173855692, + "grad_norm": 0.0511145293712616, + "learning_rate": 1.2471057182215189e-05, + "loss": 0.0002, + "step": 16642 + }, + { + "epoch": 4.390449808732357, + "grad_norm": 0.25695544481277466, + "learning_rate": 1.2469884814912515e-05, + "loss": 0.0036, + "step": 16644 + }, + { + "epoch": 4.390977443609023, + "grad_norm": 0.07390792667865753, + "learning_rate": 1.2468712447609837e-05, + "loss": 0.0005, + "step": 16646 + }, + { + "epoch": 4.391505078485688, + "grad_norm": 0.10802898555994034, + "learning_rate": 1.2467540080307161e-05, + "loss": 0.0006, + "step": 16648 + }, + { + "epoch": 4.392032713362354, + "grad_norm": 0.03169655054807663, + "learning_rate": 1.2466367713004486e-05, + "loss": 0.0006, + "step": 16650 + }, + { + "epoch": 4.3925603482390185, + "grad_norm": 0.03370866924524307, + "learning_rate": 1.246519534570181e-05, + "loss": 0.0034, + "step": 16652 + }, + { + "epoch": 4.393087983115684, + "grad_norm": 0.048287682235240936, + "learning_rate": 1.2464022978399133e-05, + "loss": 0.0004, + "step": 16654 + }, + { + "epoch": 4.393615617992349, + "grad_norm": 0.004223335068672895, + "learning_rate": 1.2462850611096457e-05, + "loss": 0.0001, + "step": 16656 + }, + { + "epoch": 4.394143252869014, + "grad_norm": 0.08193990588188171, + "learning_rate": 1.2461678243793783e-05, + "loss": 0.0004, + "step": 16658 + }, + { + "epoch": 4.39467088774568, + "grad_norm": 0.13843117654323578, + "learning_rate": 1.2460505876491104e-05, + "loss": 0.0004, + "step": 16660 + }, + { + "epoch": 4.395198522622345, + "grad_norm": 0.027519773691892624, + "learning_rate": 1.245933350918843e-05, + "loss": 0.0002, + "step": 16662 + }, + { + "epoch": 4.395726157499011, + "grad_norm": 0.03865988925099373, + "learning_rate": 1.2458161141885754e-05, + "loss": 0.0002, + "step": 16664 + }, + { + "epoch": 4.396253792375676, + "grad_norm": 0.006355302408337593, + "learning_rate": 1.2456988774583078e-05, + "loss": 0.0001, + "step": 16666 + }, + { + "epoch": 4.396781427252342, + "grad_norm": 0.0023784153163433075, + "learning_rate": 1.2455816407280401e-05, + "loss": 0.0002, + "step": 16668 + }, + { + "epoch": 4.397309062129007, + "grad_norm": 0.05418502539396286, + "learning_rate": 1.2454644039977725e-05, + "loss": 0.0, + "step": 16670 + }, + { + "epoch": 4.397836697005672, + "grad_norm": 0.010200180113315582, + "learning_rate": 1.245347167267505e-05, + "loss": 0.0001, + "step": 16672 + }, + { + "epoch": 4.3983643318823376, + "grad_norm": 0.024729229509830475, + "learning_rate": 1.2452299305372376e-05, + "loss": 0.0022, + "step": 16674 + }, + { + "epoch": 4.3988919667590025, + "grad_norm": 0.001934180036187172, + "learning_rate": 1.2451126938069698e-05, + "loss": 0.0, + "step": 16676 + }, + { + "epoch": 4.399419601635668, + "grad_norm": 0.13520587980747223, + "learning_rate": 1.2449954570767023e-05, + "loss": 0.0036, + "step": 16678 + }, + { + "epoch": 4.399947236512333, + "grad_norm": 0.0009392160573042929, + "learning_rate": 1.2448782203464347e-05, + "loss": 0.0001, + "step": 16680 + }, + { + "epoch": 4.400474871388999, + "grad_norm": 0.0005444478592835367, + "learning_rate": 1.2447609836161671e-05, + "loss": 0.0, + "step": 16682 + }, + { + "epoch": 4.401002506265664, + "grad_norm": 0.39196574687957764, + "learning_rate": 1.2446437468858994e-05, + "loss": 0.0012, + "step": 16684 + }, + { + "epoch": 4.401530141142329, + "grad_norm": 0.0008232410764321685, + "learning_rate": 1.2445265101556318e-05, + "loss": 0.0011, + "step": 16686 + }, + { + "epoch": 4.402057776018995, + "grad_norm": 0.0039504459127783775, + "learning_rate": 1.2444092734253644e-05, + "loss": 0.0001, + "step": 16688 + }, + { + "epoch": 4.40258541089566, + "grad_norm": 0.000809910474345088, + "learning_rate": 1.2442920366950965e-05, + "loss": 0.0001, + "step": 16690 + }, + { + "epoch": 4.403113045772326, + "grad_norm": 0.0029620006680488586, + "learning_rate": 1.2441747999648291e-05, + "loss": 0.0038, + "step": 16692 + }, + { + "epoch": 4.403640680648991, + "grad_norm": 0.04757542163133621, + "learning_rate": 1.2440575632345615e-05, + "loss": 0.0001, + "step": 16694 + }, + { + "epoch": 4.404168315525657, + "grad_norm": 0.0077940491028130054, + "learning_rate": 1.243940326504294e-05, + "loss": 0.0001, + "step": 16696 + }, + { + "epoch": 4.4046959504023215, + "grad_norm": 0.002716985298320651, + "learning_rate": 1.2438230897740262e-05, + "loss": 0.0001, + "step": 16698 + }, + { + "epoch": 4.405223585278987, + "grad_norm": 0.7753250598907471, + "learning_rate": 1.2437058530437587e-05, + "loss": 0.0054, + "step": 16700 + }, + { + "epoch": 4.405751220155652, + "grad_norm": 0.001813902286812663, + "learning_rate": 1.2435886163134911e-05, + "loss": 0.0084, + "step": 16702 + }, + { + "epoch": 4.406278855032317, + "grad_norm": 0.006366528104990721, + "learning_rate": 1.2434713795832237e-05, + "loss": 0.0002, + "step": 16704 + }, + { + "epoch": 4.406806489908983, + "grad_norm": 0.050171978771686554, + "learning_rate": 1.2433541428529558e-05, + "loss": 0.0004, + "step": 16706 + }, + { + "epoch": 4.407334124785648, + "grad_norm": 0.21542266011238098, + "learning_rate": 1.2432369061226884e-05, + "loss": 0.0032, + "step": 16708 + }, + { + "epoch": 4.407861759662314, + "grad_norm": 0.08128513395786285, + "learning_rate": 1.2431196693924208e-05, + "loss": 0.0059, + "step": 16710 + }, + { + "epoch": 4.408389394538979, + "grad_norm": 0.14975526928901672, + "learning_rate": 1.2430024326621532e-05, + "loss": 0.0014, + "step": 16712 + }, + { + "epoch": 4.408917029415645, + "grad_norm": 0.019762087613344193, + "learning_rate": 1.2428851959318855e-05, + "loss": 0.0001, + "step": 16714 + }, + { + "epoch": 4.40944466429231, + "grad_norm": 0.025432655587792397, + "learning_rate": 1.242767959201618e-05, + "loss": 0.0004, + "step": 16716 + }, + { + "epoch": 4.409972299168975, + "grad_norm": 0.0032157800160348415, + "learning_rate": 1.2426507224713504e-05, + "loss": 0.0001, + "step": 16718 + }, + { + "epoch": 4.410499934045641, + "grad_norm": 0.004361779894679785, + "learning_rate": 1.2425334857410826e-05, + "loss": 0.0001, + "step": 16720 + }, + { + "epoch": 4.4110275689223055, + "grad_norm": 0.00961346086114645, + "learning_rate": 1.2424162490108152e-05, + "loss": 0.0001, + "step": 16722 + }, + { + "epoch": 4.411555203798971, + "grad_norm": 0.03052450157701969, + "learning_rate": 1.2422990122805477e-05, + "loss": 0.0003, + "step": 16724 + }, + { + "epoch": 4.412082838675636, + "grad_norm": 0.01195231918245554, + "learning_rate": 1.24218177555028e-05, + "loss": 0.0002, + "step": 16726 + }, + { + "epoch": 4.412610473552302, + "grad_norm": 0.002798034343868494, + "learning_rate": 1.2420645388200123e-05, + "loss": 0.0036, + "step": 16728 + }, + { + "epoch": 4.413138108428967, + "grad_norm": 0.05809195712208748, + "learning_rate": 1.2419473020897448e-05, + "loss": 0.0003, + "step": 16730 + }, + { + "epoch": 4.413665743305632, + "grad_norm": 0.0015065913321450353, + "learning_rate": 1.2418300653594772e-05, + "loss": 0.0001, + "step": 16732 + }, + { + "epoch": 4.414193378182298, + "grad_norm": 0.1569632887840271, + "learning_rate": 1.2417128286292098e-05, + "loss": 0.0036, + "step": 16734 + }, + { + "epoch": 4.414721013058963, + "grad_norm": 0.0025149586144834757, + "learning_rate": 1.2415955918989419e-05, + "loss": 0.0001, + "step": 16736 + }, + { + "epoch": 4.415248647935629, + "grad_norm": 0.008539733476936817, + "learning_rate": 1.2414783551686745e-05, + "loss": 0.0002, + "step": 16738 + }, + { + "epoch": 4.415776282812294, + "grad_norm": 0.014310828410089016, + "learning_rate": 1.241361118438407e-05, + "loss": 0.0014, + "step": 16740 + }, + { + "epoch": 4.41630391768896, + "grad_norm": 0.8037586808204651, + "learning_rate": 1.2412438817081394e-05, + "loss": 0.0034, + "step": 16742 + }, + { + "epoch": 4.4168315525656245, + "grad_norm": 0.012836168520152569, + "learning_rate": 1.2411266449778716e-05, + "loss": 0.0003, + "step": 16744 + }, + { + "epoch": 4.41735918744229, + "grad_norm": 0.0074829598888754845, + "learning_rate": 1.241009408247604e-05, + "loss": 0.0001, + "step": 16746 + }, + { + "epoch": 4.417886822318955, + "grad_norm": 0.012590906582772732, + "learning_rate": 1.2408921715173365e-05, + "loss": 0.0002, + "step": 16748 + }, + { + "epoch": 4.41841445719562, + "grad_norm": 0.002063954947516322, + "learning_rate": 1.2407749347870687e-05, + "loss": 0.0001, + "step": 16750 + }, + { + "epoch": 4.418942092072286, + "grad_norm": 0.044815581291913986, + "learning_rate": 1.2406576980568012e-05, + "loss": 0.0004, + "step": 16752 + }, + { + "epoch": 4.419469726948951, + "grad_norm": 0.30925342440605164, + "learning_rate": 1.2405404613265338e-05, + "loss": 0.0059, + "step": 16754 + }, + { + "epoch": 4.419997361825617, + "grad_norm": 0.005355185829102993, + "learning_rate": 1.2404232245962662e-05, + "loss": 0.0001, + "step": 16756 + }, + { + "epoch": 4.420524996702282, + "grad_norm": 0.0028675373177975416, + "learning_rate": 1.2403059878659985e-05, + "loss": 0.0001, + "step": 16758 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.01722910813987255, + "learning_rate": 1.2401887511357309e-05, + "loss": 0.0001, + "step": 16760 + }, + { + "epoch": 4.421580266455613, + "grad_norm": 0.05583811551332474, + "learning_rate": 1.2400715144054633e-05, + "loss": 0.0005, + "step": 16762 + }, + { + "epoch": 4.422107901332278, + "grad_norm": 0.002180106472223997, + "learning_rate": 1.2399542776751958e-05, + "loss": 0.0001, + "step": 16764 + }, + { + "epoch": 4.422635536208944, + "grad_norm": 0.4395788908004761, + "learning_rate": 1.239837040944928e-05, + "loss": 0.0019, + "step": 16766 + }, + { + "epoch": 4.4231631710856085, + "grad_norm": 0.0019811380188912153, + "learning_rate": 1.2397198042146606e-05, + "loss": 0.0001, + "step": 16768 + }, + { + "epoch": 4.423690805962274, + "grad_norm": 0.175025075674057, + "learning_rate": 1.239602567484393e-05, + "loss": 0.0085, + "step": 16770 + }, + { + "epoch": 4.424218440838939, + "grad_norm": 0.0019280231790617108, + "learning_rate": 1.2394853307541255e-05, + "loss": 0.0002, + "step": 16772 + }, + { + "epoch": 4.424746075715605, + "grad_norm": 0.0035781385377049446, + "learning_rate": 1.2393680940238577e-05, + "loss": 0.0001, + "step": 16774 + }, + { + "epoch": 4.42527371059227, + "grad_norm": 0.00370834581553936, + "learning_rate": 1.2392508572935902e-05, + "loss": 0.0001, + "step": 16776 + }, + { + "epoch": 4.425801345468935, + "grad_norm": 0.005401809699833393, + "learning_rate": 1.2391336205633226e-05, + "loss": 0.003, + "step": 16778 + }, + { + "epoch": 4.426328980345601, + "grad_norm": 0.015942204743623734, + "learning_rate": 1.2390163838330549e-05, + "loss": 0.0037, + "step": 16780 + }, + { + "epoch": 4.426856615222266, + "grad_norm": 0.0023992478381842375, + "learning_rate": 1.2388991471027873e-05, + "loss": 0.0001, + "step": 16782 + }, + { + "epoch": 4.427384250098932, + "grad_norm": 0.037936072796583176, + "learning_rate": 1.2387819103725199e-05, + "loss": 0.0058, + "step": 16784 + }, + { + "epoch": 4.427911884975597, + "grad_norm": 0.15641137957572937, + "learning_rate": 1.2386646736422523e-05, + "loss": 0.0021, + "step": 16786 + }, + { + "epoch": 4.428439519852263, + "grad_norm": 0.009009133093059063, + "learning_rate": 1.2385474369119846e-05, + "loss": 0.0002, + "step": 16788 + }, + { + "epoch": 4.4289671547289275, + "grad_norm": 0.0027225338853895664, + "learning_rate": 1.238430200181717e-05, + "loss": 0.0001, + "step": 16790 + }, + { + "epoch": 4.429494789605593, + "grad_norm": 0.0023304305505007505, + "learning_rate": 1.2383129634514494e-05, + "loss": 0.0005, + "step": 16792 + }, + { + "epoch": 4.430022424482258, + "grad_norm": 0.022865723818540573, + "learning_rate": 1.2381957267211819e-05, + "loss": 0.0002, + "step": 16794 + }, + { + "epoch": 4.430550059358923, + "grad_norm": 0.012795113027095795, + "learning_rate": 1.2380784899909141e-05, + "loss": 0.0012, + "step": 16796 + }, + { + "epoch": 4.431077694235589, + "grad_norm": 0.015546767972409725, + "learning_rate": 1.2379612532606467e-05, + "loss": 0.0001, + "step": 16798 + }, + { + "epoch": 4.431605329112254, + "grad_norm": 0.43398189544677734, + "learning_rate": 1.2378440165303792e-05, + "loss": 0.0066, + "step": 16800 + }, + { + "epoch": 4.43213296398892, + "grad_norm": 0.00283744721673429, + "learning_rate": 1.2377267798001116e-05, + "loss": 0.0001, + "step": 16802 + }, + { + "epoch": 4.432660598865585, + "grad_norm": 0.010992015711963177, + "learning_rate": 1.2376095430698439e-05, + "loss": 0.0002, + "step": 16804 + }, + { + "epoch": 4.43318823374225, + "grad_norm": 0.07124587148427963, + "learning_rate": 1.2374923063395763e-05, + "loss": 0.0004, + "step": 16806 + }, + { + "epoch": 4.433715868618916, + "grad_norm": 0.10547497123479843, + "learning_rate": 1.2373750696093087e-05, + "loss": 0.0005, + "step": 16808 + }, + { + "epoch": 4.434243503495581, + "grad_norm": 0.3290728032588959, + "learning_rate": 1.237257832879041e-05, + "loss": 0.0047, + "step": 16810 + }, + { + "epoch": 4.434771138372247, + "grad_norm": 0.014012959785759449, + "learning_rate": 1.2371405961487734e-05, + "loss": 0.0002, + "step": 16812 + }, + { + "epoch": 4.4352987732489115, + "grad_norm": 0.0031181061640381813, + "learning_rate": 1.237023359418506e-05, + "loss": 0.0001, + "step": 16814 + }, + { + "epoch": 4.435826408125577, + "grad_norm": 0.008355469442903996, + "learning_rate": 1.2369061226882384e-05, + "loss": 0.0058, + "step": 16816 + }, + { + "epoch": 4.436354043002242, + "grad_norm": 0.0027487576007843018, + "learning_rate": 1.2367888859579707e-05, + "loss": 0.0001, + "step": 16818 + }, + { + "epoch": 4.436881677878908, + "grad_norm": 0.0009705013944767416, + "learning_rate": 1.2366716492277031e-05, + "loss": 0.0001, + "step": 16820 + }, + { + "epoch": 4.437409312755573, + "grad_norm": 0.007629431318491697, + "learning_rate": 1.2365544124974356e-05, + "loss": 0.0001, + "step": 16822 + }, + { + "epoch": 4.437936947632238, + "grad_norm": 0.004660394508391619, + "learning_rate": 1.236437175767168e-05, + "loss": 0.0001, + "step": 16824 + }, + { + "epoch": 4.438464582508904, + "grad_norm": 0.003906898200511932, + "learning_rate": 1.2363199390369002e-05, + "loss": 0.0001, + "step": 16826 + }, + { + "epoch": 4.438992217385569, + "grad_norm": 0.030255068093538284, + "learning_rate": 1.2362027023066327e-05, + "loss": 0.0003, + "step": 16828 + }, + { + "epoch": 4.439519852262235, + "grad_norm": 0.0034638368524610996, + "learning_rate": 1.2360854655763653e-05, + "loss": 0.0001, + "step": 16830 + }, + { + "epoch": 4.4400474871389, + "grad_norm": 0.008755982853472233, + "learning_rate": 1.2359682288460977e-05, + "loss": 0.0001, + "step": 16832 + }, + { + "epoch": 4.440575122015566, + "grad_norm": 0.004643904510885477, + "learning_rate": 1.23585099211583e-05, + "loss": 0.0001, + "step": 16834 + }, + { + "epoch": 4.4411027568922306, + "grad_norm": 0.11559121310710907, + "learning_rate": 1.2357337553855624e-05, + "loss": 0.006, + "step": 16836 + }, + { + "epoch": 4.4416303917688955, + "grad_norm": 0.8473979830741882, + "learning_rate": 1.2356165186552948e-05, + "loss": 0.0022, + "step": 16838 + }, + { + "epoch": 4.442158026645561, + "grad_norm": 0.008994701318442822, + "learning_rate": 1.2354992819250271e-05, + "loss": 0.0001, + "step": 16840 + }, + { + "epoch": 4.442685661522226, + "grad_norm": 0.2062123715877533, + "learning_rate": 1.2353820451947595e-05, + "loss": 0.0038, + "step": 16842 + }, + { + "epoch": 4.443213296398892, + "grad_norm": 0.004609155468642712, + "learning_rate": 1.2352648084644921e-05, + "loss": 0.0001, + "step": 16844 + }, + { + "epoch": 4.443740931275557, + "grad_norm": 0.004127544816583395, + "learning_rate": 1.2351475717342245e-05, + "loss": 0.0001, + "step": 16846 + }, + { + "epoch": 4.444268566152223, + "grad_norm": 0.3026742935180664, + "learning_rate": 1.2350303350039568e-05, + "loss": 0.0034, + "step": 16848 + }, + { + "epoch": 4.444796201028888, + "grad_norm": 0.3463081121444702, + "learning_rate": 1.2349130982736892e-05, + "loss": 0.0006, + "step": 16850 + }, + { + "epoch": 4.445323835905553, + "grad_norm": 0.0066457181237638, + "learning_rate": 1.2347958615434217e-05, + "loss": 0.0001, + "step": 16852 + }, + { + "epoch": 4.445851470782219, + "grad_norm": 0.14020898938179016, + "learning_rate": 1.2346786248131541e-05, + "loss": 0.0003, + "step": 16854 + }, + { + "epoch": 4.446379105658884, + "grad_norm": 0.02811536192893982, + "learning_rate": 1.2345613880828864e-05, + "loss": 0.0019, + "step": 16856 + }, + { + "epoch": 4.44690674053555, + "grad_norm": 0.14583683013916016, + "learning_rate": 1.2344441513526188e-05, + "loss": 0.0006, + "step": 16858 + }, + { + "epoch": 4.4474343754122145, + "grad_norm": 0.7113361954689026, + "learning_rate": 1.234385532987485e-05, + "loss": 0.0089, + "step": 16860 + }, + { + "epoch": 4.44796201028888, + "grad_norm": 0.02759428508579731, + "learning_rate": 1.2342682962572175e-05, + "loss": 0.0002, + "step": 16862 + }, + { + "epoch": 4.448489645165545, + "grad_norm": 0.0837317481637001, + "learning_rate": 1.23415105952695e-05, + "loss": 0.0005, + "step": 16864 + }, + { + "epoch": 4.449017280042211, + "grad_norm": 0.02221359871327877, + "learning_rate": 1.2340338227966824e-05, + "loss": 0.0006, + "step": 16866 + }, + { + "epoch": 4.449544914918876, + "grad_norm": 0.02344363182783127, + "learning_rate": 1.2339165860664146e-05, + "loss": 0.0059, + "step": 16868 + }, + { + "epoch": 4.450072549795541, + "grad_norm": 0.005118831526488066, + "learning_rate": 1.233799349336147e-05, + "loss": 0.0001, + "step": 16870 + }, + { + "epoch": 4.450600184672207, + "grad_norm": 0.005412136670202017, + "learning_rate": 1.2336821126058795e-05, + "loss": 0.0001, + "step": 16872 + }, + { + "epoch": 4.451127819548872, + "grad_norm": 0.12848514318466187, + "learning_rate": 1.2335648758756121e-05, + "loss": 0.004, + "step": 16874 + }, + { + "epoch": 4.451655454425538, + "grad_norm": 0.006821206770837307, + "learning_rate": 1.2334476391453442e-05, + "loss": 0.0005, + "step": 16876 + }, + { + "epoch": 4.452183089302203, + "grad_norm": 0.005309354513883591, + "learning_rate": 1.2333304024150768e-05, + "loss": 0.0001, + "step": 16878 + }, + { + "epoch": 4.452710724178869, + "grad_norm": 0.0054693156853318214, + "learning_rate": 1.2332131656848092e-05, + "loss": 0.0008, + "step": 16880 + }, + { + "epoch": 4.453238359055534, + "grad_norm": 0.031240347772836685, + "learning_rate": 1.2330959289545417e-05, + "loss": 0.0005, + "step": 16882 + }, + { + "epoch": 4.4537659939321985, + "grad_norm": 0.010766278952360153, + "learning_rate": 1.232978692224274e-05, + "loss": 0.0001, + "step": 16884 + }, + { + "epoch": 4.454293628808864, + "grad_norm": 0.11548792570829391, + "learning_rate": 1.2328614554940063e-05, + "loss": 0.0008, + "step": 16886 + }, + { + "epoch": 4.454821263685529, + "grad_norm": 0.0020615067332983017, + "learning_rate": 1.2327442187637388e-05, + "loss": 0.0001, + "step": 16888 + }, + { + "epoch": 4.455348898562195, + "grad_norm": 0.0032392435241490602, + "learning_rate": 1.232626982033471e-05, + "loss": 0.0001, + "step": 16890 + }, + { + "epoch": 4.45587653343886, + "grad_norm": 0.025367798283696175, + "learning_rate": 1.2325097453032035e-05, + "loss": 0.0014, + "step": 16892 + }, + { + "epoch": 4.456404168315526, + "grad_norm": 0.001802516053430736, + "learning_rate": 1.232392508572936e-05, + "loss": 0.0001, + "step": 16894 + }, + { + "epoch": 4.456931803192191, + "grad_norm": 0.0023608270566910505, + "learning_rate": 1.2322752718426685e-05, + "loss": 0.0001, + "step": 16896 + }, + { + "epoch": 4.457459438068856, + "grad_norm": 0.03042379580438137, + "learning_rate": 1.2321580351124008e-05, + "loss": 0.0002, + "step": 16898 + }, + { + "epoch": 4.457987072945522, + "grad_norm": 0.06366676837205887, + "learning_rate": 1.2320407983821332e-05, + "loss": 0.0017, + "step": 16900 + }, + { + "epoch": 4.458514707822187, + "grad_norm": 0.0006543230847455561, + "learning_rate": 1.2319235616518656e-05, + "loss": 0.0, + "step": 16902 + }, + { + "epoch": 4.459042342698853, + "grad_norm": 0.0037916768342256546, + "learning_rate": 1.231806324921598e-05, + "loss": 0.0002, + "step": 16904 + }, + { + "epoch": 4.4595699775755175, + "grad_norm": 0.0008070140611380339, + "learning_rate": 1.2316890881913303e-05, + "loss": 0.0035, + "step": 16906 + }, + { + "epoch": 4.460097612452183, + "grad_norm": 0.0029534990899264812, + "learning_rate": 1.2315718514610629e-05, + "loss": 0.0001, + "step": 16908 + }, + { + "epoch": 4.460625247328848, + "grad_norm": 0.01205621100962162, + "learning_rate": 1.2314546147307953e-05, + "loss": 0.0001, + "step": 16910 + }, + { + "epoch": 4.461152882205514, + "grad_norm": 0.0015102281467989087, + "learning_rate": 1.2313373780005278e-05, + "loss": 0.0, + "step": 16912 + }, + { + "epoch": 4.461680517082179, + "grad_norm": 0.0019722417928278446, + "learning_rate": 1.23122014127026e-05, + "loss": 0.0027, + "step": 16914 + }, + { + "epoch": 4.462208151958844, + "grad_norm": 0.09895485639572144, + "learning_rate": 1.2311029045399925e-05, + "loss": 0.0006, + "step": 16916 + }, + { + "epoch": 4.46273578683551, + "grad_norm": 0.028873464092612267, + "learning_rate": 1.2309856678097249e-05, + "loss": 0.0042, + "step": 16918 + }, + { + "epoch": 4.463263421712175, + "grad_norm": 0.002094908384606242, + "learning_rate": 1.2308684310794572e-05, + "loss": 0.0001, + "step": 16920 + }, + { + "epoch": 4.463791056588841, + "grad_norm": 0.002603776752948761, + "learning_rate": 1.2307511943491896e-05, + "loss": 0.0001, + "step": 16922 + }, + { + "epoch": 4.464318691465506, + "grad_norm": 0.22691921889781952, + "learning_rate": 1.2306339576189222e-05, + "loss": 0.0011, + "step": 16924 + }, + { + "epoch": 4.464846326342172, + "grad_norm": 0.008862046524882317, + "learning_rate": 1.2305167208886546e-05, + "loss": 0.0001, + "step": 16926 + }, + { + "epoch": 4.465373961218837, + "grad_norm": 0.004284392111003399, + "learning_rate": 1.2303994841583869e-05, + "loss": 0.0023, + "step": 16928 + }, + { + "epoch": 4.4659015960955015, + "grad_norm": 0.03009166195988655, + "learning_rate": 1.2302822474281193e-05, + "loss": 0.0031, + "step": 16930 + }, + { + "epoch": 4.466429230972167, + "grad_norm": 0.0012381133856251836, + "learning_rate": 1.2301650106978517e-05, + "loss": 0.0001, + "step": 16932 + }, + { + "epoch": 4.466956865848832, + "grad_norm": 0.001913287676870823, + "learning_rate": 1.2300477739675842e-05, + "loss": 0.0001, + "step": 16934 + }, + { + "epoch": 4.467484500725498, + "grad_norm": 0.0046948934905231, + "learning_rate": 1.2299305372373164e-05, + "loss": 0.0001, + "step": 16936 + }, + { + "epoch": 4.468012135602163, + "grad_norm": 0.007992113009095192, + "learning_rate": 1.2298133005070489e-05, + "loss": 0.001, + "step": 16938 + }, + { + "epoch": 4.468539770478829, + "grad_norm": 0.0007658631657250226, + "learning_rate": 1.2296960637767815e-05, + "loss": 0.0014, + "step": 16940 + }, + { + "epoch": 4.469067405355494, + "grad_norm": 0.4326348602771759, + "learning_rate": 1.2295788270465139e-05, + "loss": 0.0022, + "step": 16942 + }, + { + "epoch": 4.469595040232159, + "grad_norm": 0.0014124009758234024, + "learning_rate": 1.2294615903162462e-05, + "loss": 0.0005, + "step": 16944 + }, + { + "epoch": 4.470122675108825, + "grad_norm": 0.0034416536800563335, + "learning_rate": 1.2293443535859786e-05, + "loss": 0.0001, + "step": 16946 + }, + { + "epoch": 4.47065030998549, + "grad_norm": 0.001930899452418089, + "learning_rate": 1.229227116855711e-05, + "loss": 0.0022, + "step": 16948 + }, + { + "epoch": 4.471177944862156, + "grad_norm": 0.01168053038418293, + "learning_rate": 1.2291098801254433e-05, + "loss": 0.0001, + "step": 16950 + }, + { + "epoch": 4.4717055797388205, + "grad_norm": 0.0035668062046170235, + "learning_rate": 1.2289926433951757e-05, + "loss": 0.0, + "step": 16952 + }, + { + "epoch": 4.472233214615486, + "grad_norm": 0.004187426529824734, + "learning_rate": 1.2288754066649083e-05, + "loss": 0.0001, + "step": 16954 + }, + { + "epoch": 4.472760849492151, + "grad_norm": 0.0005548663320951164, + "learning_rate": 1.2287581699346407e-05, + "loss": 0.0, + "step": 16956 + }, + { + "epoch": 4.473288484368817, + "grad_norm": 0.002325110835954547, + "learning_rate": 1.228640933204373e-05, + "loss": 0.0, + "step": 16958 + }, + { + "epoch": 4.473816119245482, + "grad_norm": 0.0061767431907355785, + "learning_rate": 1.2285236964741054e-05, + "loss": 0.0001, + "step": 16960 + }, + { + "epoch": 4.474343754122147, + "grad_norm": 0.0009432589868083596, + "learning_rate": 1.2284064597438379e-05, + "loss": 0.0, + "step": 16962 + }, + { + "epoch": 4.474871388998813, + "grad_norm": 0.0017205107724294066, + "learning_rate": 1.2282892230135703e-05, + "loss": 0.0, + "step": 16964 + }, + { + "epoch": 4.475399023875478, + "grad_norm": 0.0008922766428440809, + "learning_rate": 1.2281719862833025e-05, + "loss": 0.0, + "step": 16966 + }, + { + "epoch": 4.475926658752144, + "grad_norm": 0.0014743966748937964, + "learning_rate": 1.228054749553035e-05, + "loss": 0.0001, + "step": 16968 + }, + { + "epoch": 4.476454293628809, + "grad_norm": 0.002019101521000266, + "learning_rate": 1.2279375128227676e-05, + "loss": 0.0063, + "step": 16970 + }, + { + "epoch": 4.476981928505475, + "grad_norm": 0.0004510096914600581, + "learning_rate": 1.2278202760925e-05, + "loss": 0.0057, + "step": 16972 + }, + { + "epoch": 4.47750956338214, + "grad_norm": 0.1421649158000946, + "learning_rate": 1.2277030393622323e-05, + "loss": 0.0004, + "step": 16974 + }, + { + "epoch": 4.4780371982588045, + "grad_norm": 0.0026437039487063885, + "learning_rate": 1.2275858026319647e-05, + "loss": 0.0031, + "step": 16976 + }, + { + "epoch": 4.47856483313547, + "grad_norm": 0.0019187885336577892, + "learning_rate": 1.2274685659016971e-05, + "loss": 0.0001, + "step": 16978 + }, + { + "epoch": 4.479092468012135, + "grad_norm": 0.014604649506509304, + "learning_rate": 1.2273513291714294e-05, + "loss": 0.0003, + "step": 16980 + }, + { + "epoch": 4.479620102888801, + "grad_norm": 0.0045859008096158504, + "learning_rate": 1.2272340924411618e-05, + "loss": 0.0001, + "step": 16982 + }, + { + "epoch": 4.480147737765466, + "grad_norm": 0.001395334373228252, + "learning_rate": 1.2271168557108944e-05, + "loss": 0.0, + "step": 16984 + }, + { + "epoch": 4.480675372642132, + "grad_norm": 0.006242763251066208, + "learning_rate": 1.2269996189806269e-05, + "loss": 0.0001, + "step": 16986 + }, + { + "epoch": 4.481203007518797, + "grad_norm": 0.014551707543432713, + "learning_rate": 1.2268823822503591e-05, + "loss": 0.0028, + "step": 16988 + }, + { + "epoch": 4.481730642395462, + "grad_norm": 0.004002216272056103, + "learning_rate": 1.2267651455200915e-05, + "loss": 0.0016, + "step": 16990 + }, + { + "epoch": 4.482258277272128, + "grad_norm": 0.021284740418195724, + "learning_rate": 1.226647908789824e-05, + "loss": 0.0003, + "step": 16992 + }, + { + "epoch": 4.482785912148793, + "grad_norm": 0.001264500431716442, + "learning_rate": 1.2265306720595564e-05, + "loss": 0.0001, + "step": 16994 + }, + { + "epoch": 4.483313547025459, + "grad_norm": 0.001677594380453229, + "learning_rate": 1.2264134353292887e-05, + "loss": 0.0002, + "step": 16996 + }, + { + "epoch": 4.4838411819021236, + "grad_norm": 0.10268788039684296, + "learning_rate": 1.2262961985990211e-05, + "loss": 0.0014, + "step": 16998 + }, + { + "epoch": 4.484368816778789, + "grad_norm": 0.08509688079357147, + "learning_rate": 1.2261789618687537e-05, + "loss": 0.0005, + "step": 17000 + }, + { + "epoch": 4.484896451655454, + "grad_norm": 0.01894952356815338, + "learning_rate": 1.2260617251384861e-05, + "loss": 0.0004, + "step": 17002 + }, + { + "epoch": 4.48542408653212, + "grad_norm": 0.0012080579763278365, + "learning_rate": 1.2259444884082184e-05, + "loss": 0.0001, + "step": 17004 + }, + { + "epoch": 4.485951721408785, + "grad_norm": 0.01126923318952322, + "learning_rate": 1.2258272516779508e-05, + "loss": 0.0002, + "step": 17006 + }, + { + "epoch": 4.48647935628545, + "grad_norm": 0.003787759691476822, + "learning_rate": 1.2257100149476832e-05, + "loss": 0.0001, + "step": 17008 + }, + { + "epoch": 4.487006991162116, + "grad_norm": 0.010567214339971542, + "learning_rate": 1.2255927782174155e-05, + "loss": 0.0001, + "step": 17010 + }, + { + "epoch": 4.487534626038781, + "grad_norm": 0.010093620046973228, + "learning_rate": 1.225475541487148e-05, + "loss": 0.0001, + "step": 17012 + }, + { + "epoch": 4.488062260915447, + "grad_norm": 0.0034116439055651426, + "learning_rate": 1.2253583047568804e-05, + "loss": 0.0002, + "step": 17014 + }, + { + "epoch": 4.488589895792112, + "grad_norm": 0.11458490043878555, + "learning_rate": 1.225241068026613e-05, + "loss": 0.0027, + "step": 17016 + }, + { + "epoch": 4.489117530668778, + "grad_norm": 0.014849178493022919, + "learning_rate": 1.2251238312963452e-05, + "loss": 0.0014, + "step": 17018 + }, + { + "epoch": 4.489645165545443, + "grad_norm": 0.035068709403276443, + "learning_rate": 1.2250065945660777e-05, + "loss": 0.0003, + "step": 17020 + }, + { + "epoch": 4.4901728004221075, + "grad_norm": 0.008283092640340328, + "learning_rate": 1.2248893578358101e-05, + "loss": 0.0001, + "step": 17022 + }, + { + "epoch": 4.490700435298773, + "grad_norm": 0.0010360273299738765, + "learning_rate": 1.2247721211055425e-05, + "loss": 0.0001, + "step": 17024 + }, + { + "epoch": 4.491228070175438, + "grad_norm": 0.006921628024429083, + "learning_rate": 1.2246548843752748e-05, + "loss": 0.0001, + "step": 17026 + }, + { + "epoch": 4.491755705052104, + "grad_norm": 0.0009907865896821022, + "learning_rate": 1.2245376476450072e-05, + "loss": 0.0035, + "step": 17028 + }, + { + "epoch": 4.492283339928769, + "grad_norm": 0.0006553842686116695, + "learning_rate": 1.2244204109147398e-05, + "loss": 0.0001, + "step": 17030 + }, + { + "epoch": 4.492810974805435, + "grad_norm": 0.00479769054800272, + "learning_rate": 1.2243031741844722e-05, + "loss": 0.0001, + "step": 17032 + }, + { + "epoch": 4.4933386096821, + "grad_norm": 0.013119714334607124, + "learning_rate": 1.2241859374542045e-05, + "loss": 0.0003, + "step": 17034 + }, + { + "epoch": 4.493866244558765, + "grad_norm": 0.006910274270921946, + "learning_rate": 1.224068700723937e-05, + "loss": 0.0001, + "step": 17036 + }, + { + "epoch": 4.494393879435431, + "grad_norm": 0.0014927634038031101, + "learning_rate": 1.2239514639936694e-05, + "loss": 0.0001, + "step": 17038 + }, + { + "epoch": 4.494921514312096, + "grad_norm": 0.0009359411778859794, + "learning_rate": 1.2238342272634016e-05, + "loss": 0.0001, + "step": 17040 + }, + { + "epoch": 4.495449149188762, + "grad_norm": 0.000934794545173645, + "learning_rate": 1.223716990533134e-05, + "loss": 0.0, + "step": 17042 + }, + { + "epoch": 4.495976784065427, + "grad_norm": 0.0007269515190273523, + "learning_rate": 1.2235997538028665e-05, + "loss": 0.0004, + "step": 17044 + }, + { + "epoch": 4.496504418942092, + "grad_norm": 0.0016129366122186184, + "learning_rate": 1.2234825170725991e-05, + "loss": 0.0001, + "step": 17046 + }, + { + "epoch": 4.497032053818757, + "grad_norm": 0.0021956206765025854, + "learning_rate": 1.2233652803423313e-05, + "loss": 0.0, + "step": 17048 + }, + { + "epoch": 4.497559688695423, + "grad_norm": 0.007165482733398676, + "learning_rate": 1.2232480436120638e-05, + "loss": 0.0001, + "step": 17050 + }, + { + "epoch": 4.498087323572088, + "grad_norm": 0.11256399005651474, + "learning_rate": 1.2231308068817962e-05, + "loss": 0.0054, + "step": 17052 + }, + { + "epoch": 4.498614958448753, + "grad_norm": 0.01060547400265932, + "learning_rate": 1.2230135701515286e-05, + "loss": 0.0001, + "step": 17054 + }, + { + "epoch": 4.499142593325419, + "grad_norm": 0.0039916206151247025, + "learning_rate": 1.2228963334212609e-05, + "loss": 0.0, + "step": 17056 + }, + { + "epoch": 4.499670228202084, + "grad_norm": 0.006872459314763546, + "learning_rate": 1.2227790966909933e-05, + "loss": 0.0001, + "step": 17058 + }, + { + "epoch": 4.50019786307875, + "grad_norm": 0.005774370860308409, + "learning_rate": 1.2226618599607258e-05, + "loss": 0.0001, + "step": 17060 + }, + { + "epoch": 4.500725497955415, + "grad_norm": 0.0017449666047468781, + "learning_rate": 1.2225446232304584e-05, + "loss": 0.0009, + "step": 17062 + }, + { + "epoch": 4.50125313283208, + "grad_norm": 0.6630313992500305, + "learning_rate": 1.2224273865001906e-05, + "loss": 0.0019, + "step": 17064 + }, + { + "epoch": 4.501780767708746, + "grad_norm": 0.004705359227955341, + "learning_rate": 1.222310149769923e-05, + "loss": 0.0001, + "step": 17066 + }, + { + "epoch": 4.5023084025854105, + "grad_norm": 0.010287079028785229, + "learning_rate": 1.2221929130396555e-05, + "loss": 0.0068, + "step": 17068 + }, + { + "epoch": 4.502836037462076, + "grad_norm": 0.019996976479887962, + "learning_rate": 1.2220756763093877e-05, + "loss": 0.0001, + "step": 17070 + }, + { + "epoch": 4.503363672338741, + "grad_norm": 0.0011120080016553402, + "learning_rate": 1.2219584395791202e-05, + "loss": 0.0001, + "step": 17072 + }, + { + "epoch": 4.503891307215407, + "grad_norm": 0.00487684179097414, + "learning_rate": 1.2218412028488526e-05, + "loss": 0.0003, + "step": 17074 + }, + { + "epoch": 4.504418942092072, + "grad_norm": 0.0019148924620822072, + "learning_rate": 1.2217239661185852e-05, + "loss": 0.0001, + "step": 17076 + }, + { + "epoch": 4.504946576968738, + "grad_norm": 0.010183017700910568, + "learning_rate": 1.2216067293883173e-05, + "loss": 0.0001, + "step": 17078 + }, + { + "epoch": 4.505474211845403, + "grad_norm": 0.009199154563248158, + "learning_rate": 1.2214894926580499e-05, + "loss": 0.0002, + "step": 17080 + }, + { + "epoch": 4.506001846722068, + "grad_norm": 0.006954750511795282, + "learning_rate": 1.2213722559277823e-05, + "loss": 0.0015, + "step": 17082 + }, + { + "epoch": 4.506529481598734, + "grad_norm": 0.2218155860900879, + "learning_rate": 1.2212550191975148e-05, + "loss": 0.0034, + "step": 17084 + }, + { + "epoch": 4.507057116475399, + "grad_norm": 0.05933155491948128, + "learning_rate": 1.221137782467247e-05, + "loss": 0.0012, + "step": 17086 + }, + { + "epoch": 4.507584751352065, + "grad_norm": 0.002135356655344367, + "learning_rate": 1.2210205457369794e-05, + "loss": 0.0001, + "step": 17088 + }, + { + "epoch": 4.50811238622873, + "grad_norm": 0.3811802566051483, + "learning_rate": 1.2209033090067119e-05, + "loss": 0.0013, + "step": 17090 + }, + { + "epoch": 4.508640021105395, + "grad_norm": 0.009644033387303352, + "learning_rate": 1.2207860722764445e-05, + "loss": 0.0003, + "step": 17092 + }, + { + "epoch": 4.50916765598206, + "grad_norm": 0.003328990424051881, + "learning_rate": 1.2206688355461767e-05, + "loss": 0.0001, + "step": 17094 + }, + { + "epoch": 4.509695290858726, + "grad_norm": 0.004603061359375715, + "learning_rate": 1.2205515988159092e-05, + "loss": 0.0001, + "step": 17096 + }, + { + "epoch": 4.510222925735391, + "grad_norm": 0.03651316836476326, + "learning_rate": 1.2204343620856416e-05, + "loss": 0.0003, + "step": 17098 + }, + { + "epoch": 4.510750560612056, + "grad_norm": 0.016302751377224922, + "learning_rate": 1.2203171253553739e-05, + "loss": 0.0004, + "step": 17100 + }, + { + "epoch": 4.511278195488722, + "grad_norm": 0.8376855254173279, + "learning_rate": 1.2201998886251063e-05, + "loss": 0.0084, + "step": 17102 + }, + { + "epoch": 4.511805830365387, + "grad_norm": 0.23097814619541168, + "learning_rate": 1.2200826518948387e-05, + "loss": 0.001, + "step": 17104 + }, + { + "epoch": 4.512333465242053, + "grad_norm": 0.0047801886685192585, + "learning_rate": 1.2199654151645712e-05, + "loss": 0.0001, + "step": 17106 + }, + { + "epoch": 4.512861100118718, + "grad_norm": 0.00866168737411499, + "learning_rate": 1.2198481784343034e-05, + "loss": 0.002, + "step": 17108 + }, + { + "epoch": 4.513388734995383, + "grad_norm": 0.010976283811032772, + "learning_rate": 1.219730941704036e-05, + "loss": 0.0002, + "step": 17110 + }, + { + "epoch": 4.513916369872049, + "grad_norm": 0.27301889657974243, + "learning_rate": 1.2196137049737684e-05, + "loss": 0.0082, + "step": 17112 + }, + { + "epoch": 4.5144440047487135, + "grad_norm": 0.0946468859910965, + "learning_rate": 1.2194964682435009e-05, + "loss": 0.0004, + "step": 17114 + }, + { + "epoch": 4.514971639625379, + "grad_norm": 0.18599507212638855, + "learning_rate": 1.2193792315132331e-05, + "loss": 0.0015, + "step": 17116 + }, + { + "epoch": 4.515499274502044, + "grad_norm": 0.08409346640110016, + "learning_rate": 1.2192619947829656e-05, + "loss": 0.0006, + "step": 17118 + }, + { + "epoch": 4.51602690937871, + "grad_norm": 0.29748231172561646, + "learning_rate": 1.219144758052698e-05, + "loss": 0.0028, + "step": 17120 + }, + { + "epoch": 4.516554544255375, + "grad_norm": 0.04852217808365822, + "learning_rate": 1.2190275213224306e-05, + "loss": 0.0003, + "step": 17122 + }, + { + "epoch": 4.517082179132041, + "grad_norm": 0.007688898593187332, + "learning_rate": 1.2189102845921627e-05, + "loss": 0.0003, + "step": 17124 + }, + { + "epoch": 4.517609814008706, + "grad_norm": 0.025381576269865036, + "learning_rate": 1.2187930478618953e-05, + "loss": 0.0003, + "step": 17126 + }, + { + "epoch": 4.518137448885371, + "grad_norm": 0.010018588043749332, + "learning_rate": 1.2186758111316277e-05, + "loss": 0.0001, + "step": 17128 + }, + { + "epoch": 4.518665083762037, + "grad_norm": 0.006788682658225298, + "learning_rate": 1.21855857440136e-05, + "loss": 0.0001, + "step": 17130 + }, + { + "epoch": 4.519192718638702, + "grad_norm": 0.7276675701141357, + "learning_rate": 1.2184413376710924e-05, + "loss": 0.0056, + "step": 17132 + }, + { + "epoch": 4.519720353515368, + "grad_norm": 0.005129619035869837, + "learning_rate": 1.2183241009408248e-05, + "loss": 0.0001, + "step": 17134 + }, + { + "epoch": 4.520247988392033, + "grad_norm": 0.004041648004204035, + "learning_rate": 1.2182068642105573e-05, + "loss": 0.0003, + "step": 17136 + }, + { + "epoch": 4.520775623268698, + "grad_norm": 0.006168779451400042, + "learning_rate": 1.2180896274802895e-05, + "loss": 0.0003, + "step": 17138 + }, + { + "epoch": 4.521303258145363, + "grad_norm": 0.0038138513918966055, + "learning_rate": 1.2179723907500221e-05, + "loss": 0.0002, + "step": 17140 + }, + { + "epoch": 4.521830893022029, + "grad_norm": 0.0018834486836567521, + "learning_rate": 1.2178551540197546e-05, + "loss": 0.0027, + "step": 17142 + }, + { + "epoch": 4.522358527898694, + "grad_norm": 0.005451798439025879, + "learning_rate": 1.217737917289487e-05, + "loss": 0.0001, + "step": 17144 + }, + { + "epoch": 4.522886162775359, + "grad_norm": 0.0067068166099488735, + "learning_rate": 1.2176206805592193e-05, + "loss": 0.0001, + "step": 17146 + }, + { + "epoch": 4.523413797652025, + "grad_norm": 0.033007629215717316, + "learning_rate": 1.2175034438289517e-05, + "loss": 0.0002, + "step": 17148 + }, + { + "epoch": 4.52394143252869, + "grad_norm": 0.059722889214754105, + "learning_rate": 1.2173862070986841e-05, + "loss": 0.0006, + "step": 17150 + }, + { + "epoch": 4.524469067405356, + "grad_norm": 0.004098460078239441, + "learning_rate": 1.2172689703684165e-05, + "loss": 0.0001, + "step": 17152 + }, + { + "epoch": 4.524996702282021, + "grad_norm": 0.022733837366104126, + "learning_rate": 1.2171517336381488e-05, + "loss": 0.0004, + "step": 17154 + }, + { + "epoch": 4.525524337158686, + "grad_norm": 0.007264971733093262, + "learning_rate": 1.2170344969078814e-05, + "loss": 0.0025, + "step": 17156 + }, + { + "epoch": 4.526051972035352, + "grad_norm": 0.005327705293893814, + "learning_rate": 1.2169172601776138e-05, + "loss": 0.0001, + "step": 17158 + }, + { + "epoch": 4.5265796069120166, + "grad_norm": 0.7711681127548218, + "learning_rate": 1.2168000234473461e-05, + "loss": 0.0025, + "step": 17160 + }, + { + "epoch": 4.527107241788682, + "grad_norm": 0.022732457146048546, + "learning_rate": 1.2166827867170785e-05, + "loss": 0.0032, + "step": 17162 + }, + { + "epoch": 4.527634876665347, + "grad_norm": 0.004113480914384127, + "learning_rate": 1.216565549986811e-05, + "loss": 0.0001, + "step": 17164 + }, + { + "epoch": 4.528162511542013, + "grad_norm": 0.05041518062353134, + "learning_rate": 1.2164483132565434e-05, + "loss": 0.0002, + "step": 17166 + }, + { + "epoch": 4.528690146418678, + "grad_norm": 0.0057698520831763744, + "learning_rate": 1.2163310765262756e-05, + "loss": 0.0001, + "step": 17168 + }, + { + "epoch": 4.529217781295344, + "grad_norm": 0.001530640758574009, + "learning_rate": 1.216213839796008e-05, + "loss": 0.0002, + "step": 17170 + }, + { + "epoch": 4.529745416172009, + "grad_norm": 0.0005221842438913882, + "learning_rate": 1.2160966030657407e-05, + "loss": 0.0001, + "step": 17172 + }, + { + "epoch": 4.530273051048674, + "grad_norm": 0.6235304474830627, + "learning_rate": 1.2159793663354731e-05, + "loss": 0.0021, + "step": 17174 + }, + { + "epoch": 4.53080068592534, + "grad_norm": 0.008165555074810982, + "learning_rate": 1.2158621296052054e-05, + "loss": 0.0, + "step": 17176 + }, + { + "epoch": 4.531328320802005, + "grad_norm": 0.004033291712403297, + "learning_rate": 1.2157448928749378e-05, + "loss": 0.0001, + "step": 17178 + }, + { + "epoch": 4.531855955678671, + "grad_norm": 0.02647573873400688, + "learning_rate": 1.2156276561446702e-05, + "loss": 0.0001, + "step": 17180 + }, + { + "epoch": 4.532383590555336, + "grad_norm": 0.19630655646324158, + "learning_rate": 1.2155104194144027e-05, + "loss": 0.0005, + "step": 17182 + }, + { + "epoch": 4.532911225432001, + "grad_norm": 0.061981212347745895, + "learning_rate": 1.215393182684135e-05, + "loss": 0.0041, + "step": 17184 + }, + { + "epoch": 4.533438860308666, + "grad_norm": 0.32606664299964905, + "learning_rate": 1.2152759459538675e-05, + "loss": 0.0026, + "step": 17186 + }, + { + "epoch": 4.533966495185332, + "grad_norm": 0.000944809929933399, + "learning_rate": 1.2151587092236e-05, + "loss": 0.0, + "step": 17188 + }, + { + "epoch": 4.534494130061997, + "grad_norm": 0.08566979318857193, + "learning_rate": 1.2150414724933322e-05, + "loss": 0.0006, + "step": 17190 + }, + { + "epoch": 4.535021764938662, + "grad_norm": 0.010514969006180763, + "learning_rate": 1.2149242357630646e-05, + "loss": 0.0002, + "step": 17192 + }, + { + "epoch": 4.535549399815328, + "grad_norm": 0.2142045795917511, + "learning_rate": 1.214806999032797e-05, + "loss": 0.0073, + "step": 17194 + }, + { + "epoch": 4.536077034691993, + "grad_norm": 0.011913549154996872, + "learning_rate": 1.2146897623025295e-05, + "loss": 0.0037, + "step": 17196 + }, + { + "epoch": 4.536604669568659, + "grad_norm": 0.0029313606210052967, + "learning_rate": 1.2145725255722618e-05, + "loss": 0.0001, + "step": 17198 + }, + { + "epoch": 4.537132304445324, + "grad_norm": 0.0017312833806499839, + "learning_rate": 1.2144552888419942e-05, + "loss": 0.0019, + "step": 17200 + }, + { + "epoch": 4.537659939321989, + "grad_norm": 0.0027327092830091715, + "learning_rate": 1.2143380521117268e-05, + "loss": 0.0001, + "step": 17202 + }, + { + "epoch": 4.538187574198655, + "grad_norm": 0.025091493502259254, + "learning_rate": 1.2142208153814592e-05, + "loss": 0.0004, + "step": 17204 + }, + { + "epoch": 4.53871520907532, + "grad_norm": 0.0061592222191393375, + "learning_rate": 1.2141035786511915e-05, + "loss": 0.0011, + "step": 17206 + }, + { + "epoch": 4.539242843951985, + "grad_norm": 0.006099005229771137, + "learning_rate": 1.2139863419209239e-05, + "loss": 0.0001, + "step": 17208 + }, + { + "epoch": 4.53977047882865, + "grad_norm": 0.0110753970220685, + "learning_rate": 1.2138691051906563e-05, + "loss": 0.0028, + "step": 17210 + }, + { + "epoch": 4.540298113705316, + "grad_norm": 0.00403424259275198, + "learning_rate": 1.2137518684603888e-05, + "loss": 0.0005, + "step": 17212 + }, + { + "epoch": 4.540825748581981, + "grad_norm": 0.004718739539384842, + "learning_rate": 1.213634631730121e-05, + "loss": 0.0001, + "step": 17214 + }, + { + "epoch": 4.541353383458647, + "grad_norm": 0.006748524494469166, + "learning_rate": 1.2135173949998535e-05, + "loss": 0.0001, + "step": 17216 + }, + { + "epoch": 4.541881018335312, + "grad_norm": 0.004292789846658707, + "learning_rate": 1.213400158269586e-05, + "loss": 0.0001, + "step": 17218 + }, + { + "epoch": 4.542408653211977, + "grad_norm": 0.004566709045320749, + "learning_rate": 1.2132829215393183e-05, + "loss": 0.0001, + "step": 17220 + }, + { + "epoch": 4.542936288088643, + "grad_norm": 0.006605999078601599, + "learning_rate": 1.2131656848090508e-05, + "loss": 0.0001, + "step": 17222 + }, + { + "epoch": 4.543463922965308, + "grad_norm": 0.018411295488476753, + "learning_rate": 1.2130484480787832e-05, + "loss": 0.0032, + "step": 17224 + }, + { + "epoch": 4.543991557841974, + "grad_norm": 0.03475944697856903, + "learning_rate": 1.2129312113485156e-05, + "loss": 0.0003, + "step": 17226 + }, + { + "epoch": 4.544519192718639, + "grad_norm": 0.020499899983406067, + "learning_rate": 1.2128139746182479e-05, + "loss": 0.0002, + "step": 17228 + }, + { + "epoch": 4.545046827595304, + "grad_norm": 0.002131714718416333, + "learning_rate": 1.2126967378879803e-05, + "loss": 0.0001, + "step": 17230 + }, + { + "epoch": 4.545574462471969, + "grad_norm": 0.04171101003885269, + "learning_rate": 1.2125795011577129e-05, + "loss": 0.0042, + "step": 17232 + }, + { + "epoch": 4.546102097348635, + "grad_norm": 0.025724826380610466, + "learning_rate": 1.2124622644274453e-05, + "loss": 0.0003, + "step": 17234 + }, + { + "epoch": 4.5466297322253, + "grad_norm": 0.0016384943155571818, + "learning_rate": 1.2123450276971776e-05, + "loss": 0.0001, + "step": 17236 + }, + { + "epoch": 4.547157367101965, + "grad_norm": 0.011239515617489815, + "learning_rate": 1.21222779096691e-05, + "loss": 0.0003, + "step": 17238 + }, + { + "epoch": 4.547685001978631, + "grad_norm": 0.0017286422662436962, + "learning_rate": 1.2121105542366425e-05, + "loss": 0.0001, + "step": 17240 + }, + { + "epoch": 4.548212636855296, + "grad_norm": 0.003288209903985262, + "learning_rate": 1.2119933175063749e-05, + "loss": 0.0001, + "step": 17242 + }, + { + "epoch": 4.548740271731962, + "grad_norm": 0.003959619905799627, + "learning_rate": 1.2118760807761072e-05, + "loss": 0.0099, + "step": 17244 + }, + { + "epoch": 4.549267906608627, + "grad_norm": 0.06091010570526123, + "learning_rate": 1.2117588440458396e-05, + "loss": 0.0023, + "step": 17246 + }, + { + "epoch": 4.549795541485292, + "grad_norm": 0.0068654161877930164, + "learning_rate": 1.2116416073155722e-05, + "loss": 0.0016, + "step": 17248 + }, + { + "epoch": 4.550323176361958, + "grad_norm": 0.0049444641917943954, + "learning_rate": 1.2115243705853044e-05, + "loss": 0.0001, + "step": 17250 + }, + { + "epoch": 4.550850811238623, + "grad_norm": 0.005819454789161682, + "learning_rate": 1.2114071338550369e-05, + "loss": 0.002, + "step": 17252 + }, + { + "epoch": 4.551378446115288, + "grad_norm": 2.1742703914642334, + "learning_rate": 1.2112898971247693e-05, + "loss": 0.0051, + "step": 17254 + }, + { + "epoch": 4.551906080991953, + "grad_norm": 0.011209788732230663, + "learning_rate": 1.2111726603945017e-05, + "loss": 0.0005, + "step": 17256 + }, + { + "epoch": 4.552433715868619, + "grad_norm": 0.0471874475479126, + "learning_rate": 1.211055423664234e-05, + "loss": 0.0003, + "step": 17258 + }, + { + "epoch": 4.552961350745284, + "grad_norm": 0.052041586488485336, + "learning_rate": 1.2109381869339664e-05, + "loss": 0.0004, + "step": 17260 + }, + { + "epoch": 4.55348898562195, + "grad_norm": 0.015476140193641186, + "learning_rate": 1.2108209502036989e-05, + "loss": 0.0001, + "step": 17262 + }, + { + "epoch": 4.554016620498615, + "grad_norm": 0.002393885515630245, + "learning_rate": 1.2107037134734315e-05, + "loss": 0.0001, + "step": 17264 + }, + { + "epoch": 4.55454425537528, + "grad_norm": 0.008450114168226719, + "learning_rate": 1.2105864767431637e-05, + "loss": 0.0001, + "step": 17266 + }, + { + "epoch": 4.555071890251946, + "grad_norm": 0.0025776163674890995, + "learning_rate": 1.2104692400128961e-05, + "loss": 0.0013, + "step": 17268 + }, + { + "epoch": 4.555599525128611, + "grad_norm": 0.0015362434787675738, + "learning_rate": 1.2103520032826286e-05, + "loss": 0.0001, + "step": 17270 + }, + { + "epoch": 4.556127160005277, + "grad_norm": 0.0028759862761944532, + "learning_rate": 1.210234766552361e-05, + "loss": 0.0001, + "step": 17272 + }, + { + "epoch": 4.556654794881942, + "grad_norm": 0.010073896497488022, + "learning_rate": 1.2101175298220933e-05, + "loss": 0.0052, + "step": 17274 + }, + { + "epoch": 4.5571824297586065, + "grad_norm": 0.03906293958425522, + "learning_rate": 1.2100002930918257e-05, + "loss": 0.0041, + "step": 17276 + }, + { + "epoch": 4.557710064635272, + "grad_norm": 0.05948324874043465, + "learning_rate": 1.2098830563615583e-05, + "loss": 0.0004, + "step": 17278 + }, + { + "epoch": 4.558237699511937, + "grad_norm": 0.0010722404113039374, + "learning_rate": 1.2097658196312904e-05, + "loss": 0.0001, + "step": 17280 + }, + { + "epoch": 4.558765334388603, + "grad_norm": 0.007005289196968079, + "learning_rate": 1.209648582901023e-05, + "loss": 0.0001, + "step": 17282 + }, + { + "epoch": 4.559292969265268, + "grad_norm": 0.004731977358460426, + "learning_rate": 1.2095313461707554e-05, + "loss": 0.0001, + "step": 17284 + }, + { + "epoch": 4.559820604141934, + "grad_norm": 0.5052949786186218, + "learning_rate": 1.2094141094404879e-05, + "loss": 0.0043, + "step": 17286 + }, + { + "epoch": 4.560348239018599, + "grad_norm": 0.0023210481740534306, + "learning_rate": 1.2092968727102201e-05, + "loss": 0.0, + "step": 17288 + }, + { + "epoch": 4.560875873895265, + "grad_norm": 0.0035265760961920023, + "learning_rate": 1.2091796359799525e-05, + "loss": 0.0001, + "step": 17290 + }, + { + "epoch": 4.56140350877193, + "grad_norm": 0.00509805791079998, + "learning_rate": 1.209062399249685e-05, + "loss": 0.0001, + "step": 17292 + }, + { + "epoch": 4.561931143648595, + "grad_norm": 0.001112203928641975, + "learning_rate": 1.2089451625194176e-05, + "loss": 0.0001, + "step": 17294 + }, + { + "epoch": 4.562458778525261, + "grad_norm": 0.0034694657661020756, + "learning_rate": 1.2088279257891498e-05, + "loss": 0.0001, + "step": 17296 + }, + { + "epoch": 4.562986413401926, + "grad_norm": 0.18566766381263733, + "learning_rate": 1.2087106890588823e-05, + "loss": 0.0025, + "step": 17298 + }, + { + "epoch": 4.563514048278591, + "grad_norm": 0.0026534495409578085, + "learning_rate": 1.2085934523286147e-05, + "loss": 0.0001, + "step": 17300 + }, + { + "epoch": 4.564041683155256, + "grad_norm": 0.005353722255676985, + "learning_rate": 1.2084762155983471e-05, + "loss": 0.0001, + "step": 17302 + }, + { + "epoch": 4.564569318031922, + "grad_norm": 0.03532061353325844, + "learning_rate": 1.2083589788680794e-05, + "loss": 0.0003, + "step": 17304 + }, + { + "epoch": 4.565096952908587, + "grad_norm": 0.017758328467607498, + "learning_rate": 1.2082417421378118e-05, + "loss": 0.0001, + "step": 17306 + }, + { + "epoch": 4.565624587785253, + "grad_norm": 0.014686382375657558, + "learning_rate": 1.2081245054075444e-05, + "loss": 0.0002, + "step": 17308 + }, + { + "epoch": 4.566152222661918, + "grad_norm": 0.0032733348198235035, + "learning_rate": 1.2080072686772765e-05, + "loss": 0.0001, + "step": 17310 + }, + { + "epoch": 4.566679857538583, + "grad_norm": 0.041481006890535355, + "learning_rate": 1.2078900319470091e-05, + "loss": 0.0002, + "step": 17312 + }, + { + "epoch": 4.567207492415249, + "grad_norm": 0.003446645103394985, + "learning_rate": 1.2077727952167415e-05, + "loss": 0.0, + "step": 17314 + }, + { + "epoch": 4.567735127291914, + "grad_norm": 0.003764841705560684, + "learning_rate": 1.207655558486474e-05, + "loss": 0.0005, + "step": 17316 + }, + { + "epoch": 4.56826276216858, + "grad_norm": 0.008948116563260555, + "learning_rate": 1.2075383217562062e-05, + "loss": 0.0001, + "step": 17318 + }, + { + "epoch": 4.568790397045245, + "grad_norm": 0.003276508767157793, + "learning_rate": 1.2074210850259387e-05, + "loss": 0.0001, + "step": 17320 + }, + { + "epoch": 4.5693180319219096, + "grad_norm": 0.004800219554454088, + "learning_rate": 1.2073038482956711e-05, + "loss": 0.0001, + "step": 17322 + }, + { + "epoch": 4.569845666798575, + "grad_norm": 0.004597678780555725, + "learning_rate": 1.2071866115654037e-05, + "loss": 0.0001, + "step": 17324 + }, + { + "epoch": 4.57037330167524, + "grad_norm": 0.0528780035674572, + "learning_rate": 1.2070693748351358e-05, + "loss": 0.0043, + "step": 17326 + }, + { + "epoch": 4.570900936551906, + "grad_norm": 0.0010812036925926805, + "learning_rate": 1.2069521381048684e-05, + "loss": 0.0, + "step": 17328 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.04201241955161095, + "learning_rate": 1.2068349013746008e-05, + "loss": 0.0019, + "step": 17330 + }, + { + "epoch": 4.571956206305237, + "grad_norm": 0.0013544086832553148, + "learning_rate": 1.2067176646443332e-05, + "loss": 0.0001, + "step": 17332 + }, + { + "epoch": 4.572483841181902, + "grad_norm": 0.0013539179926738143, + "learning_rate": 1.2066004279140655e-05, + "loss": 0.0003, + "step": 17334 + }, + { + "epoch": 4.573011476058568, + "grad_norm": 0.0006535224383696914, + "learning_rate": 1.206483191183798e-05, + "loss": 0.0, + "step": 17336 + }, + { + "epoch": 4.573539110935233, + "grad_norm": 0.0009682629606686532, + "learning_rate": 1.2063659544535304e-05, + "loss": 0.0, + "step": 17338 + }, + { + "epoch": 4.574066745811898, + "grad_norm": 0.02553674578666687, + "learning_rate": 1.206248717723263e-05, + "loss": 0.0009, + "step": 17340 + }, + { + "epoch": 4.574594380688564, + "grad_norm": 0.0032636835239827633, + "learning_rate": 1.2061314809929952e-05, + "loss": 0.0001, + "step": 17342 + }, + { + "epoch": 4.575122015565229, + "grad_norm": 0.0008711128612048924, + "learning_rate": 1.2060142442627277e-05, + "loss": 0.0001, + "step": 17344 + }, + { + "epoch": 4.575649650441894, + "grad_norm": 0.002280960325151682, + "learning_rate": 1.2058970075324601e-05, + "loss": 0.0001, + "step": 17346 + }, + { + "epoch": 4.576177285318559, + "grad_norm": 0.4022705554962158, + "learning_rate": 1.2057797708021923e-05, + "loss": 0.0105, + "step": 17348 + }, + { + "epoch": 4.576704920195225, + "grad_norm": 0.000989214051514864, + "learning_rate": 1.2056625340719248e-05, + "loss": 0.0001, + "step": 17350 + }, + { + "epoch": 4.57723255507189, + "grad_norm": 0.001316392794251442, + "learning_rate": 1.2055452973416572e-05, + "loss": 0.0, + "step": 17352 + }, + { + "epoch": 4.577760189948556, + "grad_norm": 0.0014219810254871845, + "learning_rate": 1.2054280606113898e-05, + "loss": 0.0006, + "step": 17354 + }, + { + "epoch": 4.578287824825221, + "grad_norm": 0.0029235477559268475, + "learning_rate": 1.2053108238811219e-05, + "loss": 0.0001, + "step": 17356 + }, + { + "epoch": 4.578815459701886, + "grad_norm": 0.025728341192007065, + "learning_rate": 1.2051935871508545e-05, + "loss": 0.0002, + "step": 17358 + }, + { + "epoch": 4.579343094578552, + "grad_norm": 0.1333659291267395, + "learning_rate": 1.205076350420587e-05, + "loss": 0.0138, + "step": 17360 + }, + { + "epoch": 4.579870729455217, + "grad_norm": 0.01370059885084629, + "learning_rate": 1.2049591136903194e-05, + "loss": 0.0001, + "step": 17362 + }, + { + "epoch": 4.580398364331883, + "grad_norm": 0.003973167855292559, + "learning_rate": 1.2048418769600516e-05, + "loss": 0.0002, + "step": 17364 + }, + { + "epoch": 4.580925999208548, + "grad_norm": 0.0022028738167136908, + "learning_rate": 1.204724640229784e-05, + "loss": 0.0001, + "step": 17366 + }, + { + "epoch": 4.581453634085213, + "grad_norm": 0.0771087184548378, + "learning_rate": 1.2046074034995165e-05, + "loss": 0.0003, + "step": 17368 + }, + { + "epoch": 4.581981268961878, + "grad_norm": 0.09617656469345093, + "learning_rate": 1.204490166769249e-05, + "loss": 0.0004, + "step": 17370 + }, + { + "epoch": 4.582508903838543, + "grad_norm": 0.008432221598923206, + "learning_rate": 1.2043729300389813e-05, + "loss": 0.0001, + "step": 17372 + }, + { + "epoch": 4.583036538715209, + "grad_norm": 0.6860820651054382, + "learning_rate": 1.2042556933087138e-05, + "loss": 0.0062, + "step": 17374 + }, + { + "epoch": 4.583564173591874, + "grad_norm": 0.011497551575303078, + "learning_rate": 1.2041384565784462e-05, + "loss": 0.0001, + "step": 17376 + }, + { + "epoch": 4.58409180846854, + "grad_norm": 0.018863771110773087, + "learning_rate": 1.2040212198481785e-05, + "loss": 0.0001, + "step": 17378 + }, + { + "epoch": 4.584619443345205, + "grad_norm": 0.14027626812458038, + "learning_rate": 1.2039039831179109e-05, + "loss": 0.0079, + "step": 17380 + }, + { + "epoch": 4.585147078221871, + "grad_norm": 0.0019208411686122417, + "learning_rate": 1.2037867463876433e-05, + "loss": 0.0001, + "step": 17382 + }, + { + "epoch": 4.585674713098536, + "grad_norm": 0.008341366425156593, + "learning_rate": 1.2036695096573758e-05, + "loss": 0.0001, + "step": 17384 + }, + { + "epoch": 4.586202347975201, + "grad_norm": 0.040268637239933014, + "learning_rate": 1.203552272927108e-05, + "loss": 0.0001, + "step": 17386 + }, + { + "epoch": 4.586729982851867, + "grad_norm": 0.0765393078327179, + "learning_rate": 1.2034350361968406e-05, + "loss": 0.0003, + "step": 17388 + }, + { + "epoch": 4.587257617728532, + "grad_norm": 0.0034813741222023964, + "learning_rate": 1.203317799466573e-05, + "loss": 0.0001, + "step": 17390 + }, + { + "epoch": 4.587785252605197, + "grad_norm": 0.007290379609912634, + "learning_rate": 1.2032005627363055e-05, + "loss": 0.0001, + "step": 17392 + }, + { + "epoch": 4.588312887481862, + "grad_norm": 0.3271102011203766, + "learning_rate": 1.2030833260060377e-05, + "loss": 0.0021, + "step": 17394 + }, + { + "epoch": 4.588840522358528, + "grad_norm": 0.0029716063290834427, + "learning_rate": 1.2029660892757702e-05, + "loss": 0.0001, + "step": 17396 + }, + { + "epoch": 4.589368157235193, + "grad_norm": 0.20211449265480042, + "learning_rate": 1.2028488525455026e-05, + "loss": 0.0024, + "step": 17398 + }, + { + "epoch": 4.589895792111859, + "grad_norm": 0.0031074017751961946, + "learning_rate": 1.2027316158152352e-05, + "loss": 0.0001, + "step": 17400 + }, + { + "epoch": 4.590423426988524, + "grad_norm": 0.312497615814209, + "learning_rate": 1.2026143790849673e-05, + "loss": 0.0029, + "step": 17402 + }, + { + "epoch": 4.590951061865189, + "grad_norm": 0.002147004706785083, + "learning_rate": 1.2024971423546999e-05, + "loss": 0.0001, + "step": 17404 + }, + { + "epoch": 4.591478696741855, + "grad_norm": 0.019577965140342712, + "learning_rate": 1.2023799056244323e-05, + "loss": 0.0002, + "step": 17406 + }, + { + "epoch": 4.59200633161852, + "grad_norm": 0.0077316430397331715, + "learning_rate": 1.2022626688941646e-05, + "loss": 0.0021, + "step": 17408 + }, + { + "epoch": 4.592533966495186, + "grad_norm": 0.004609152674674988, + "learning_rate": 1.202145432163897e-05, + "loss": 0.0001, + "step": 17410 + }, + { + "epoch": 4.593061601371851, + "grad_norm": 0.00687670661136508, + "learning_rate": 1.2020281954336294e-05, + "loss": 0.0001, + "step": 17412 + }, + { + "epoch": 4.593589236248516, + "grad_norm": 0.0031379875726997852, + "learning_rate": 1.2019109587033619e-05, + "loss": 0.0001, + "step": 17414 + }, + { + "epoch": 4.594116871125181, + "grad_norm": 0.001626557088457048, + "learning_rate": 1.2017937219730941e-05, + "loss": 0.0029, + "step": 17416 + }, + { + "epoch": 4.594644506001846, + "grad_norm": 0.11500902473926544, + "learning_rate": 1.2016764852428267e-05, + "loss": 0.006, + "step": 17418 + }, + { + "epoch": 4.595172140878512, + "grad_norm": 0.005331123247742653, + "learning_rate": 1.2015592485125592e-05, + "loss": 0.0001, + "step": 17420 + }, + { + "epoch": 4.595699775755177, + "grad_norm": 0.004732578061521053, + "learning_rate": 1.2014420117822916e-05, + "loss": 0.0001, + "step": 17422 + }, + { + "epoch": 4.596227410631843, + "grad_norm": 0.007041702512651682, + "learning_rate": 1.2013247750520239e-05, + "loss": 0.0002, + "step": 17424 + }, + { + "epoch": 4.596755045508508, + "grad_norm": 0.004732453729957342, + "learning_rate": 1.2012075383217563e-05, + "loss": 0.0001, + "step": 17426 + }, + { + "epoch": 4.597282680385174, + "grad_norm": 0.004626433830708265, + "learning_rate": 1.2010903015914887e-05, + "loss": 0.0001, + "step": 17428 + }, + { + "epoch": 4.597810315261839, + "grad_norm": 0.033502042293548584, + "learning_rate": 1.2009730648612211e-05, + "loss": 0.0027, + "step": 17430 + }, + { + "epoch": 4.598337950138504, + "grad_norm": 0.003412596881389618, + "learning_rate": 1.2008558281309534e-05, + "loss": 0.0001, + "step": 17432 + }, + { + "epoch": 4.59886558501517, + "grad_norm": 0.14854809641838074, + "learning_rate": 1.200738591400686e-05, + "loss": 0.0052, + "step": 17434 + }, + { + "epoch": 4.599393219891835, + "grad_norm": 1.1784288883209229, + "learning_rate": 1.2006213546704184e-05, + "loss": 0.0043, + "step": 17436 + }, + { + "epoch": 4.5999208547685, + "grad_norm": 0.3090318441390991, + "learning_rate": 1.2005041179401507e-05, + "loss": 0.0004, + "step": 17438 + }, + { + "epoch": 4.600448489645165, + "grad_norm": 0.23273399472236633, + "learning_rate": 1.2003868812098831e-05, + "loss": 0.0013, + "step": 17440 + }, + { + "epoch": 4.600976124521831, + "grad_norm": 0.0015038627898320556, + "learning_rate": 1.2002696444796156e-05, + "loss": 0.0002, + "step": 17442 + }, + { + "epoch": 4.601503759398496, + "grad_norm": 0.15795229375362396, + "learning_rate": 1.200152407749348e-05, + "loss": 0.0019, + "step": 17444 + }, + { + "epoch": 4.602031394275162, + "grad_norm": 0.01368867140263319, + "learning_rate": 1.2000351710190803e-05, + "loss": 0.0002, + "step": 17446 + }, + { + "epoch": 4.602559029151827, + "grad_norm": 0.11747279763221741, + "learning_rate": 1.1999179342888127e-05, + "loss": 0.0005, + "step": 17448 + }, + { + "epoch": 4.603086664028492, + "grad_norm": 0.05666331201791763, + "learning_rate": 1.1998006975585453e-05, + "loss": 0.0017, + "step": 17450 + }, + { + "epoch": 4.603614298905158, + "grad_norm": 0.0036228769458830357, + "learning_rate": 1.1996834608282777e-05, + "loss": 0.0002, + "step": 17452 + }, + { + "epoch": 4.604141933781823, + "grad_norm": 0.0047815409488976, + "learning_rate": 1.19956622409801e-05, + "loss": 0.0002, + "step": 17454 + }, + { + "epoch": 4.604669568658489, + "grad_norm": 0.0020070821046829224, + "learning_rate": 1.1994489873677424e-05, + "loss": 0.0001, + "step": 17456 + }, + { + "epoch": 4.605197203535154, + "grad_norm": 0.028203284367918968, + "learning_rate": 1.1993317506374748e-05, + "loss": 0.0002, + "step": 17458 + }, + { + "epoch": 4.605724838411819, + "grad_norm": 0.004398321732878685, + "learning_rate": 1.1992145139072073e-05, + "loss": 0.0001, + "step": 17460 + }, + { + "epoch": 4.606252473288484, + "grad_norm": 0.010769236832857132, + "learning_rate": 1.1990972771769395e-05, + "loss": 0.0005, + "step": 17462 + }, + { + "epoch": 4.606780108165149, + "grad_norm": 0.003012352157384157, + "learning_rate": 1.1989800404466721e-05, + "loss": 0.0001, + "step": 17464 + }, + { + "epoch": 4.607307743041815, + "grad_norm": 0.027808330953121185, + "learning_rate": 1.1988628037164046e-05, + "loss": 0.0037, + "step": 17466 + }, + { + "epoch": 4.60783537791848, + "grad_norm": 0.15360024571418762, + "learning_rate": 1.1987455669861368e-05, + "loss": 0.0003, + "step": 17468 + }, + { + "epoch": 4.608363012795146, + "grad_norm": 0.0018770075403153896, + "learning_rate": 1.1986283302558692e-05, + "loss": 0.0026, + "step": 17470 + }, + { + "epoch": 4.608890647671811, + "grad_norm": 0.3910122811794281, + "learning_rate": 1.1985110935256017e-05, + "loss": 0.0044, + "step": 17472 + }, + { + "epoch": 4.609418282548477, + "grad_norm": 0.012356400489807129, + "learning_rate": 1.1983938567953341e-05, + "loss": 0.0001, + "step": 17474 + }, + { + "epoch": 4.609945917425142, + "grad_norm": 0.002764717908576131, + "learning_rate": 1.1982766200650664e-05, + "loss": 0.0003, + "step": 17476 + }, + { + "epoch": 4.610473552301807, + "grad_norm": 0.004233905579894781, + "learning_rate": 1.1981593833347988e-05, + "loss": 0.0001, + "step": 17478 + }, + { + "epoch": 4.611001187178473, + "grad_norm": 0.0018341336399316788, + "learning_rate": 1.1980421466045314e-05, + "loss": 0.0002, + "step": 17480 + }, + { + "epoch": 4.611528822055138, + "grad_norm": 0.3463330864906311, + "learning_rate": 1.1979249098742638e-05, + "loss": 0.0027, + "step": 17482 + }, + { + "epoch": 4.612056456931803, + "grad_norm": 0.009097079746425152, + "learning_rate": 1.1978076731439961e-05, + "loss": 0.0002, + "step": 17484 + }, + { + "epoch": 4.612584091808468, + "grad_norm": 0.0254985261708498, + "learning_rate": 1.1976904364137285e-05, + "loss": 0.0003, + "step": 17486 + }, + { + "epoch": 4.613111726685134, + "grad_norm": 0.0015209561679512262, + "learning_rate": 1.197573199683461e-05, + "loss": 0.0001, + "step": 17488 + }, + { + "epoch": 4.613639361561799, + "grad_norm": 0.7931376695632935, + "learning_rate": 1.1974559629531934e-05, + "loss": 0.0058, + "step": 17490 + }, + { + "epoch": 4.614166996438465, + "grad_norm": 0.04680952429771423, + "learning_rate": 1.1973387262229256e-05, + "loss": 0.0002, + "step": 17492 + }, + { + "epoch": 4.61469463131513, + "grad_norm": 0.0027319188229739666, + "learning_rate": 1.197221489492658e-05, + "loss": 0.0001, + "step": 17494 + }, + { + "epoch": 4.615222266191795, + "grad_norm": 0.06504647433757782, + "learning_rate": 1.1971042527623907e-05, + "loss": 0.0034, + "step": 17496 + }, + { + "epoch": 4.615749901068461, + "grad_norm": 0.013296159915626049, + "learning_rate": 1.196987016032123e-05, + "loss": 0.0001, + "step": 17498 + }, + { + "epoch": 4.616277535945126, + "grad_norm": 0.061996255069971085, + "learning_rate": 1.1968697793018554e-05, + "loss": 0.0066, + "step": 17500 + }, + { + "epoch": 4.616805170821792, + "grad_norm": 0.014699934981763363, + "learning_rate": 1.1967525425715878e-05, + "loss": 0.0012, + "step": 17502 + }, + { + "epoch": 4.617332805698457, + "grad_norm": 0.0017244035843759775, + "learning_rate": 1.1966353058413202e-05, + "loss": 0.0037, + "step": 17504 + }, + { + "epoch": 4.617860440575122, + "grad_norm": 0.0032661205623298883, + "learning_rate": 1.1965180691110525e-05, + "loss": 0.0001, + "step": 17506 + }, + { + "epoch": 4.618388075451787, + "grad_norm": 0.011200918816030025, + "learning_rate": 1.1964008323807849e-05, + "loss": 0.0001, + "step": 17508 + }, + { + "epoch": 4.618915710328452, + "grad_norm": 0.21671481430530548, + "learning_rate": 1.1962835956505175e-05, + "loss": 0.0039, + "step": 17510 + }, + { + "epoch": 4.619443345205118, + "grad_norm": 0.0024188754614442587, + "learning_rate": 1.19616635892025e-05, + "loss": 0.0001, + "step": 17512 + }, + { + "epoch": 4.619970980081783, + "grad_norm": 0.0015472150407731533, + "learning_rate": 1.1960491221899822e-05, + "loss": 0.0023, + "step": 17514 + }, + { + "epoch": 4.620498614958449, + "grad_norm": 0.0024210941046476364, + "learning_rate": 1.1959318854597146e-05, + "loss": 0.0001, + "step": 17516 + }, + { + "epoch": 4.621026249835114, + "grad_norm": 0.0012509319931268692, + "learning_rate": 1.195814648729447e-05, + "loss": 0.0085, + "step": 17518 + }, + { + "epoch": 4.62155388471178, + "grad_norm": 0.0009422924485988915, + "learning_rate": 1.1956974119991795e-05, + "loss": 0.0011, + "step": 17520 + }, + { + "epoch": 4.622081519588445, + "grad_norm": 0.03635656088590622, + "learning_rate": 1.1955801752689118e-05, + "loss": 0.0002, + "step": 17522 + }, + { + "epoch": 4.62260915446511, + "grad_norm": 0.00209934264421463, + "learning_rate": 1.1954629385386442e-05, + "loss": 0.0001, + "step": 17524 + }, + { + "epoch": 4.623136789341776, + "grad_norm": 0.004680742975324392, + "learning_rate": 1.1953457018083768e-05, + "loss": 0.0001, + "step": 17526 + }, + { + "epoch": 4.623664424218441, + "grad_norm": 0.002432829700410366, + "learning_rate": 1.195228465078109e-05, + "loss": 0.0001, + "step": 17528 + }, + { + "epoch": 4.624192059095106, + "grad_norm": 0.05404594913125038, + "learning_rate": 1.1951112283478415e-05, + "loss": 0.0019, + "step": 17530 + }, + { + "epoch": 4.624719693971771, + "grad_norm": 0.025440281257033348, + "learning_rate": 1.1949939916175739e-05, + "loss": 0.0004, + "step": 17532 + }, + { + "epoch": 4.625247328848437, + "grad_norm": 0.039701852947473526, + "learning_rate": 1.1948767548873063e-05, + "loss": 0.0002, + "step": 17534 + }, + { + "epoch": 4.625774963725102, + "grad_norm": 0.0037624724209308624, + "learning_rate": 1.1947595181570386e-05, + "loss": 0.002, + "step": 17536 + }, + { + "epoch": 4.626302598601768, + "grad_norm": 0.14120762050151825, + "learning_rate": 1.194642281426771e-05, + "loss": 0.0032, + "step": 17538 + }, + { + "epoch": 4.626830233478433, + "grad_norm": 0.0017376708565279841, + "learning_rate": 1.1945250446965035e-05, + "loss": 0.0001, + "step": 17540 + }, + { + "epoch": 4.627357868355098, + "grad_norm": 0.003109022043645382, + "learning_rate": 1.194407807966236e-05, + "loss": 0.0001, + "step": 17542 + }, + { + "epoch": 4.627885503231764, + "grad_norm": 0.004340969026088715, + "learning_rate": 1.1942905712359683e-05, + "loss": 0.0001, + "step": 17544 + }, + { + "epoch": 4.628413138108429, + "grad_norm": 0.004221005365252495, + "learning_rate": 1.1941733345057008e-05, + "loss": 0.0002, + "step": 17546 + }, + { + "epoch": 4.628940772985095, + "grad_norm": 0.007747005205601454, + "learning_rate": 1.1940560977754332e-05, + "loss": 0.0001, + "step": 17548 + }, + { + "epoch": 4.62946840786176, + "grad_norm": 0.006833392195403576, + "learning_rate": 1.1939388610451656e-05, + "loss": 0.0001, + "step": 17550 + }, + { + "epoch": 4.629996042738425, + "grad_norm": 0.003137740306556225, + "learning_rate": 1.1938216243148979e-05, + "loss": 0.001, + "step": 17552 + }, + { + "epoch": 4.63052367761509, + "grad_norm": 0.020741572603583336, + "learning_rate": 1.1937043875846303e-05, + "loss": 0.0002, + "step": 17554 + }, + { + "epoch": 4.631051312491755, + "grad_norm": 0.017283469438552856, + "learning_rate": 1.1935871508543629e-05, + "loss": 0.0001, + "step": 17556 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 0.006956164725124836, + "learning_rate": 1.193469914124095e-05, + "loss": 0.0, + "step": 17558 + }, + { + "epoch": 4.632106582245086, + "grad_norm": 0.01953393593430519, + "learning_rate": 1.1933526773938276e-05, + "loss": 0.0002, + "step": 17560 + }, + { + "epoch": 4.632634217121752, + "grad_norm": 0.030199376866221428, + "learning_rate": 1.19323544066356e-05, + "loss": 0.0003, + "step": 17562 + }, + { + "epoch": 4.633161851998417, + "grad_norm": 0.0012844196753576398, + "learning_rate": 1.1931182039332925e-05, + "loss": 0.0003, + "step": 17564 + }, + { + "epoch": 4.633689486875083, + "grad_norm": 0.0020292475819587708, + "learning_rate": 1.1930009672030247e-05, + "loss": 0.0001, + "step": 17566 + }, + { + "epoch": 4.634217121751748, + "grad_norm": 0.00416170759126544, + "learning_rate": 1.1928837304727571e-05, + "loss": 0.0001, + "step": 17568 + }, + { + "epoch": 4.634744756628413, + "grad_norm": 0.016035281121730804, + "learning_rate": 1.1927664937424896e-05, + "loss": 0.0001, + "step": 17570 + }, + { + "epoch": 4.635272391505079, + "grad_norm": 0.0005862379912286997, + "learning_rate": 1.1926492570122222e-05, + "loss": 0.0001, + "step": 17572 + }, + { + "epoch": 4.635800026381744, + "grad_norm": 0.0015166038647294044, + "learning_rate": 1.1925320202819544e-05, + "loss": 0.0001, + "step": 17574 + }, + { + "epoch": 4.6363276612584094, + "grad_norm": 0.00038153064087964594, + "learning_rate": 1.1924147835516869e-05, + "loss": 0.0001, + "step": 17576 + }, + { + "epoch": 4.636855296135074, + "grad_norm": 0.0038189650513231754, + "learning_rate": 1.1922975468214193e-05, + "loss": 0.0001, + "step": 17578 + }, + { + "epoch": 4.63738293101174, + "grad_norm": 0.042723141610622406, + "learning_rate": 1.1921803100911517e-05, + "loss": 0.0002, + "step": 17580 + }, + { + "epoch": 4.637910565888405, + "grad_norm": 0.002469847211614251, + "learning_rate": 1.192063073360884e-05, + "loss": 0.0001, + "step": 17582 + }, + { + "epoch": 4.63843820076507, + "grad_norm": 0.0012627310352399945, + "learning_rate": 1.1919458366306164e-05, + "loss": 0.0001, + "step": 17584 + }, + { + "epoch": 4.638965835641736, + "grad_norm": 0.00265111424960196, + "learning_rate": 1.1918285999003489e-05, + "loss": 0.0001, + "step": 17586 + }, + { + "epoch": 4.639493470518401, + "grad_norm": 0.1528344452381134, + "learning_rate": 1.1917113631700811e-05, + "loss": 0.0047, + "step": 17588 + }, + { + "epoch": 4.640021105395067, + "grad_norm": 0.0018857797840610147, + "learning_rate": 1.1915941264398137e-05, + "loss": 0.0001, + "step": 17590 + }, + { + "epoch": 4.640548740271732, + "grad_norm": 0.00217316672205925, + "learning_rate": 1.1914768897095461e-05, + "loss": 0.0, + "step": 17592 + }, + { + "epoch": 4.641076375148398, + "grad_norm": 0.0012284278636798263, + "learning_rate": 1.1913596529792786e-05, + "loss": 0.0019, + "step": 17594 + }, + { + "epoch": 4.641604010025063, + "grad_norm": 0.022955147549510002, + "learning_rate": 1.1912424162490108e-05, + "loss": 0.0018, + "step": 17596 + }, + { + "epoch": 4.642131644901728, + "grad_norm": 0.002541962778195739, + "learning_rate": 1.1911251795187433e-05, + "loss": 0.0001, + "step": 17598 + }, + { + "epoch": 4.642659279778393, + "grad_norm": 0.002216736786067486, + "learning_rate": 1.1910079427884757e-05, + "loss": 0.0022, + "step": 17600 + }, + { + "epoch": 4.643186914655058, + "grad_norm": 0.0032877507619559765, + "learning_rate": 1.1908907060582083e-05, + "loss": 0.0007, + "step": 17602 + }, + { + "epoch": 4.643714549531724, + "grad_norm": 0.14225521683692932, + "learning_rate": 1.1907734693279404e-05, + "loss": 0.0046, + "step": 17604 + }, + { + "epoch": 4.644242184408389, + "grad_norm": 0.0030671257991343737, + "learning_rate": 1.190656232597673e-05, + "loss": 0.0001, + "step": 17606 + }, + { + "epoch": 4.644769819285055, + "grad_norm": 0.49305373430252075, + "learning_rate": 1.1905389958674054e-05, + "loss": 0.0037, + "step": 17608 + }, + { + "epoch": 4.64529745416172, + "grad_norm": 0.38777878880500793, + "learning_rate": 1.1904217591371378e-05, + "loss": 0.0077, + "step": 17610 + }, + { + "epoch": 4.645825089038386, + "grad_norm": 0.0034676091745495796, + "learning_rate": 1.1903045224068701e-05, + "loss": 0.002, + "step": 17612 + }, + { + "epoch": 4.646352723915051, + "grad_norm": 0.021497800946235657, + "learning_rate": 1.1901872856766025e-05, + "loss": 0.0002, + "step": 17614 + }, + { + "epoch": 4.646880358791716, + "grad_norm": 0.062479641288518906, + "learning_rate": 1.190070048946335e-05, + "loss": 0.0053, + "step": 17616 + }, + { + "epoch": 4.647407993668382, + "grad_norm": 0.003479254897683859, + "learning_rate": 1.1899528122160672e-05, + "loss": 0.0001, + "step": 17618 + }, + { + "epoch": 4.647935628545047, + "grad_norm": 0.004729834385216236, + "learning_rate": 1.1898355754857998e-05, + "loss": 0.0001, + "step": 17620 + }, + { + "epoch": 4.6484632634217125, + "grad_norm": 0.004598015919327736, + "learning_rate": 1.1897183387555323e-05, + "loss": 0.0001, + "step": 17622 + }, + { + "epoch": 4.648990898298377, + "grad_norm": 0.013192166574299335, + "learning_rate": 1.1896011020252647e-05, + "loss": 0.0002, + "step": 17624 + }, + { + "epoch": 4.649518533175042, + "grad_norm": 0.005220395978540182, + "learning_rate": 1.189483865294997e-05, + "loss": 0.0001, + "step": 17626 + }, + { + "epoch": 4.650046168051708, + "grad_norm": 0.08835229277610779, + "learning_rate": 1.1893666285647294e-05, + "loss": 0.0025, + "step": 17628 + }, + { + "epoch": 4.650573802928373, + "grad_norm": 0.1211179792881012, + "learning_rate": 1.1892493918344618e-05, + "loss": 0.0003, + "step": 17630 + }, + { + "epoch": 4.651101437805039, + "grad_norm": 0.005800396669656038, + "learning_rate": 1.1891321551041944e-05, + "loss": 0.0001, + "step": 17632 + }, + { + "epoch": 4.651629072681704, + "grad_norm": 0.0030688343103975058, + "learning_rate": 1.1890149183739265e-05, + "loss": 0.0002, + "step": 17634 + }, + { + "epoch": 4.65215670755837, + "grad_norm": 0.04765557870268822, + "learning_rate": 1.1888976816436591e-05, + "loss": 0.0027, + "step": 17636 + }, + { + "epoch": 4.652684342435035, + "grad_norm": 0.002846222370862961, + "learning_rate": 1.1887804449133915e-05, + "loss": 0.0001, + "step": 17638 + }, + { + "epoch": 4.653211977311701, + "grad_norm": 0.010314605198800564, + "learning_rate": 1.188663208183124e-05, + "loss": 0.0002, + "step": 17640 + }, + { + "epoch": 4.653739612188366, + "grad_norm": 0.003993054386228323, + "learning_rate": 1.1885459714528562e-05, + "loss": 0.0003, + "step": 17642 + }, + { + "epoch": 4.654267247065031, + "grad_norm": 0.00644712895154953, + "learning_rate": 1.1884287347225887e-05, + "loss": 0.0001, + "step": 17644 + }, + { + "epoch": 4.654794881941696, + "grad_norm": 0.004037715494632721, + "learning_rate": 1.1883114979923211e-05, + "loss": 0.0001, + "step": 17646 + }, + { + "epoch": 4.655322516818361, + "grad_norm": 0.002443782752379775, + "learning_rate": 1.1881942612620533e-05, + "loss": 0.0001, + "step": 17648 + }, + { + "epoch": 4.655850151695027, + "grad_norm": 0.026321668177843094, + "learning_rate": 1.1880770245317858e-05, + "loss": 0.0018, + "step": 17650 + }, + { + "epoch": 4.656377786571692, + "grad_norm": 0.038685012608766556, + "learning_rate": 1.1879597878015184e-05, + "loss": 0.0003, + "step": 17652 + }, + { + "epoch": 4.656905421448358, + "grad_norm": 0.09935443848371506, + "learning_rate": 1.1878425510712508e-05, + "loss": 0.0003, + "step": 17654 + }, + { + "epoch": 4.657433056325023, + "grad_norm": 0.02455829828977585, + "learning_rate": 1.187725314340983e-05, + "loss": 0.0013, + "step": 17656 + }, + { + "epoch": 4.657960691201689, + "grad_norm": 0.008726024068892002, + "learning_rate": 1.1876080776107155e-05, + "loss": 0.0005, + "step": 17658 + }, + { + "epoch": 4.658488326078354, + "grad_norm": 0.0019632186740636826, + "learning_rate": 1.187490840880448e-05, + "loss": 0.0002, + "step": 17660 + }, + { + "epoch": 4.659015960955019, + "grad_norm": 0.010620662942528725, + "learning_rate": 1.1873736041501804e-05, + "loss": 0.0001, + "step": 17662 + }, + { + "epoch": 4.659543595831685, + "grad_norm": 0.0070631979033350945, + "learning_rate": 1.1872563674199126e-05, + "loss": 0.0001, + "step": 17664 + }, + { + "epoch": 4.66007123070835, + "grad_norm": 0.027598118409514427, + "learning_rate": 1.1871391306896452e-05, + "loss": 0.0011, + "step": 17666 + }, + { + "epoch": 4.6605988655850155, + "grad_norm": 0.002734602428972721, + "learning_rate": 1.1870218939593777e-05, + "loss": 0.0, + "step": 17668 + }, + { + "epoch": 4.66112650046168, + "grad_norm": 0.2641873359680176, + "learning_rate": 1.18690465722911e-05, + "loss": 0.0009, + "step": 17670 + }, + { + "epoch": 4.661654135338345, + "grad_norm": 0.003166806884109974, + "learning_rate": 1.1867874204988423e-05, + "loss": 0.0001, + "step": 17672 + }, + { + "epoch": 4.662181770215011, + "grad_norm": 0.013490970246493816, + "learning_rate": 1.1866701837685748e-05, + "loss": 0.0001, + "step": 17674 + }, + { + "epoch": 4.662709405091676, + "grad_norm": 0.0013876123121008277, + "learning_rate": 1.1865529470383072e-05, + "loss": 0.0001, + "step": 17676 + }, + { + "epoch": 4.663237039968342, + "grad_norm": 0.00919520016759634, + "learning_rate": 1.1864357103080395e-05, + "loss": 0.0001, + "step": 17678 + }, + { + "epoch": 4.663764674845007, + "grad_norm": 0.0016035635489970446, + "learning_rate": 1.1863184735777719e-05, + "loss": 0.0001, + "step": 17680 + }, + { + "epoch": 4.664292309721673, + "grad_norm": 0.056879233568906784, + "learning_rate": 1.1862012368475045e-05, + "loss": 0.0003, + "step": 17682 + }, + { + "epoch": 4.664819944598338, + "grad_norm": 0.0023206479381769896, + "learning_rate": 1.186084000117237e-05, + "loss": 0.0001, + "step": 17684 + }, + { + "epoch": 4.665347579475004, + "grad_norm": 0.008817306719720364, + "learning_rate": 1.1859667633869692e-05, + "loss": 0.0001, + "step": 17686 + }, + { + "epoch": 4.665875214351669, + "grad_norm": 0.000888954265974462, + "learning_rate": 1.1858495266567016e-05, + "loss": 0.0, + "step": 17688 + }, + { + "epoch": 4.666402849228334, + "grad_norm": 0.004177425056695938, + "learning_rate": 1.185732289926434e-05, + "loss": 0.0, + "step": 17690 + }, + { + "epoch": 4.666930484104999, + "grad_norm": 0.0017631802475079894, + "learning_rate": 1.1856150531961665e-05, + "loss": 0.0032, + "step": 17692 + }, + { + "epoch": 4.667458118981664, + "grad_norm": 0.00045325650717131793, + "learning_rate": 1.1854978164658987e-05, + "loss": 0.0, + "step": 17694 + }, + { + "epoch": 4.66798575385833, + "grad_norm": 0.0018539674347266555, + "learning_rate": 1.1853805797356313e-05, + "loss": 0.0001, + "step": 17696 + }, + { + "epoch": 4.668513388734995, + "grad_norm": 0.05623412877321243, + "learning_rate": 1.1852633430053638e-05, + "loss": 0.0002, + "step": 17698 + }, + { + "epoch": 4.669041023611661, + "grad_norm": 0.013947281055152416, + "learning_rate": 1.1851461062750962e-05, + "loss": 0.0002, + "step": 17700 + }, + { + "epoch": 4.669568658488326, + "grad_norm": 0.0007302664453163743, + "learning_rate": 1.1850288695448285e-05, + "loss": 0.0, + "step": 17702 + }, + { + "epoch": 4.670096293364992, + "grad_norm": 0.0007491898722946644, + "learning_rate": 1.1849116328145609e-05, + "loss": 0.0, + "step": 17704 + }, + { + "epoch": 4.670623928241657, + "grad_norm": 0.0006512539694085717, + "learning_rate": 1.1847943960842933e-05, + "loss": 0.0, + "step": 17706 + }, + { + "epoch": 4.671151563118322, + "grad_norm": 0.0020678367000073195, + "learning_rate": 1.1846771593540256e-05, + "loss": 0.0001, + "step": 17708 + }, + { + "epoch": 4.671679197994988, + "grad_norm": 0.001386971795000136, + "learning_rate": 1.184559922623758e-05, + "loss": 0.0031, + "step": 17710 + }, + { + "epoch": 4.672206832871653, + "grad_norm": 0.008797744289040565, + "learning_rate": 1.1844426858934906e-05, + "loss": 0.0001, + "step": 17712 + }, + { + "epoch": 4.6727344677483185, + "grad_norm": 0.0025297950487583876, + "learning_rate": 1.184325449163223e-05, + "loss": 0.0001, + "step": 17714 + }, + { + "epoch": 4.673262102624983, + "grad_norm": 0.14502421021461487, + "learning_rate": 1.1842082124329553e-05, + "loss": 0.0091, + "step": 17716 + }, + { + "epoch": 4.673789737501648, + "grad_norm": 0.2914828658103943, + "learning_rate": 1.1840909757026877e-05, + "loss": 0.0009, + "step": 17718 + }, + { + "epoch": 4.674317372378314, + "grad_norm": 0.001298045157454908, + "learning_rate": 1.1839737389724202e-05, + "loss": 0.0, + "step": 17720 + }, + { + "epoch": 4.674845007254979, + "grad_norm": 0.001044301432557404, + "learning_rate": 1.1838565022421526e-05, + "loss": 0.0, + "step": 17722 + }, + { + "epoch": 4.675372642131645, + "grad_norm": 0.0008230286184698343, + "learning_rate": 1.1837392655118849e-05, + "loss": 0.0, + "step": 17724 + }, + { + "epoch": 4.67590027700831, + "grad_norm": 0.005909441038966179, + "learning_rate": 1.1836220287816173e-05, + "loss": 0.0106, + "step": 17726 + }, + { + "epoch": 4.676427911884976, + "grad_norm": 0.03539324924349785, + "learning_rate": 1.1835047920513499e-05, + "loss": 0.0026, + "step": 17728 + }, + { + "epoch": 4.676955546761641, + "grad_norm": 0.0006617540493607521, + "learning_rate": 1.1833875553210823e-05, + "loss": 0.0001, + "step": 17730 + }, + { + "epoch": 4.677483181638307, + "grad_norm": 0.0022325594909489155, + "learning_rate": 1.1832703185908146e-05, + "loss": 0.0, + "step": 17732 + }, + { + "epoch": 4.678010816514972, + "grad_norm": 0.36086180806159973, + "learning_rate": 1.183153081860547e-05, + "loss": 0.0018, + "step": 17734 + }, + { + "epoch": 4.678538451391637, + "grad_norm": 0.39365053176879883, + "learning_rate": 1.1830358451302794e-05, + "loss": 0.006, + "step": 17736 + }, + { + "epoch": 4.6790660862683024, + "grad_norm": 0.0904511883854866, + "learning_rate": 1.1829186084000117e-05, + "loss": 0.0005, + "step": 17738 + }, + { + "epoch": 4.679593721144967, + "grad_norm": 0.0015662115765735507, + "learning_rate": 1.1828013716697441e-05, + "loss": 0.0001, + "step": 17740 + }, + { + "epoch": 4.680121356021633, + "grad_norm": 0.002498243935406208, + "learning_rate": 1.1826841349394767e-05, + "loss": 0.0041, + "step": 17742 + }, + { + "epoch": 4.680648990898298, + "grad_norm": 0.018232738599181175, + "learning_rate": 1.1825668982092092e-05, + "loss": 0.0002, + "step": 17744 + }, + { + "epoch": 4.681176625774964, + "grad_norm": 0.21946081519126892, + "learning_rate": 1.1824496614789414e-05, + "loss": 0.0008, + "step": 17746 + }, + { + "epoch": 4.681704260651629, + "grad_norm": 0.05726639926433563, + "learning_rate": 1.1823324247486739e-05, + "loss": 0.0003, + "step": 17748 + }, + { + "epoch": 4.682231895528295, + "grad_norm": 0.6397253274917603, + "learning_rate": 1.1822151880184063e-05, + "loss": 0.0031, + "step": 17750 + }, + { + "epoch": 4.68275953040496, + "grad_norm": 0.009562092833220959, + "learning_rate": 1.1820979512881387e-05, + "loss": 0.0001, + "step": 17752 + }, + { + "epoch": 4.683287165281625, + "grad_norm": 0.2240561693906784, + "learning_rate": 1.181980714557871e-05, + "loss": 0.0058, + "step": 17754 + }, + { + "epoch": 4.683814800158291, + "grad_norm": 0.24699769914150238, + "learning_rate": 1.1818634778276034e-05, + "loss": 0.0022, + "step": 17756 + }, + { + "epoch": 4.684342435034956, + "grad_norm": 0.01413020770996809, + "learning_rate": 1.181746241097336e-05, + "loss": 0.0001, + "step": 17758 + }, + { + "epoch": 4.6848700699116215, + "grad_norm": 0.011911839246749878, + "learning_rate": 1.1816290043670684e-05, + "loss": 0.0001, + "step": 17760 + }, + { + "epoch": 4.685397704788286, + "grad_norm": 0.09183624386787415, + "learning_rate": 1.1815117676368007e-05, + "loss": 0.0036, + "step": 17762 + }, + { + "epoch": 4.685925339664951, + "grad_norm": 0.00913438480347395, + "learning_rate": 1.1813945309065331e-05, + "loss": 0.0001, + "step": 17764 + }, + { + "epoch": 4.686452974541617, + "grad_norm": 0.013856896199285984, + "learning_rate": 1.1812772941762656e-05, + "loss": 0.0004, + "step": 17766 + }, + { + "epoch": 4.686980609418282, + "grad_norm": 0.15385814011096954, + "learning_rate": 1.1811600574459978e-05, + "loss": 0.0011, + "step": 17768 + }, + { + "epoch": 4.687508244294948, + "grad_norm": 0.003583695273846388, + "learning_rate": 1.1810428207157302e-05, + "loss": 0.0001, + "step": 17770 + }, + { + "epoch": 4.688035879171613, + "grad_norm": 0.13996903598308563, + "learning_rate": 1.1809255839854627e-05, + "loss": 0.0029, + "step": 17772 + }, + { + "epoch": 4.688563514048279, + "grad_norm": 0.04193791747093201, + "learning_rate": 1.1808083472551953e-05, + "loss": 0.0017, + "step": 17774 + }, + { + "epoch": 4.689091148924944, + "grad_norm": 0.009405029937624931, + "learning_rate": 1.1806911105249275e-05, + "loss": 0.0001, + "step": 17776 + }, + { + "epoch": 4.68961878380161, + "grad_norm": 0.17813678085803986, + "learning_rate": 1.18057387379466e-05, + "loss": 0.0006, + "step": 17778 + }, + { + "epoch": 4.690146418678275, + "grad_norm": 0.009931202977895737, + "learning_rate": 1.1804566370643924e-05, + "loss": 0.0008, + "step": 17780 + }, + { + "epoch": 4.69067405355494, + "grad_norm": 0.010043474845588207, + "learning_rate": 1.1803394003341248e-05, + "loss": 0.0001, + "step": 17782 + }, + { + "epoch": 4.6912016884316055, + "grad_norm": 0.010032378137111664, + "learning_rate": 1.1802221636038571e-05, + "loss": 0.0001, + "step": 17784 + }, + { + "epoch": 4.69172932330827, + "grad_norm": 0.0024207255337387323, + "learning_rate": 1.1801049268735895e-05, + "loss": 0.0001, + "step": 17786 + }, + { + "epoch": 4.692256958184936, + "grad_norm": 0.015297671779990196, + "learning_rate": 1.1799876901433221e-05, + "loss": 0.0002, + "step": 17788 + }, + { + "epoch": 4.692784593061601, + "grad_norm": 0.0012973567936569452, + "learning_rate": 1.1798704534130545e-05, + "loss": 0.0, + "step": 17790 + }, + { + "epoch": 4.693312227938267, + "grad_norm": 0.013438630849123001, + "learning_rate": 1.1797532166827868e-05, + "loss": 0.0002, + "step": 17792 + }, + { + "epoch": 4.693839862814932, + "grad_norm": 0.004418237134814262, + "learning_rate": 1.1796359799525192e-05, + "loss": 0.0001, + "step": 17794 + }, + { + "epoch": 4.694367497691598, + "grad_norm": 0.05953308939933777, + "learning_rate": 1.1795187432222517e-05, + "loss": 0.0005, + "step": 17796 + }, + { + "epoch": 4.694895132568263, + "grad_norm": 0.004519242327660322, + "learning_rate": 1.179401506491984e-05, + "loss": 0.0001, + "step": 17798 + }, + { + "epoch": 4.695422767444928, + "grad_norm": 0.0067565301433205605, + "learning_rate": 1.1792842697617164e-05, + "loss": 0.0001, + "step": 17800 + }, + { + "epoch": 4.695950402321594, + "grad_norm": 0.12813962996006012, + "learning_rate": 1.1791670330314488e-05, + "loss": 0.0003, + "step": 17802 + }, + { + "epoch": 4.696478037198259, + "grad_norm": 0.005476847756654024, + "learning_rate": 1.1790497963011814e-05, + "loss": 0.0001, + "step": 17804 + }, + { + "epoch": 4.6970056720749245, + "grad_norm": 0.026322104036808014, + "learning_rate": 1.1789325595709137e-05, + "loss": 0.0004, + "step": 17806 + }, + { + "epoch": 4.697533306951589, + "grad_norm": 0.039035722613334656, + "learning_rate": 1.178815322840646e-05, + "loss": 0.0001, + "step": 17808 + }, + { + "epoch": 4.698060941828254, + "grad_norm": 0.18554207682609558, + "learning_rate": 1.1786980861103785e-05, + "loss": 0.0076, + "step": 17810 + }, + { + "epoch": 4.69858857670492, + "grad_norm": 0.03263377025723457, + "learning_rate": 1.178580849380111e-05, + "loss": 0.0036, + "step": 17812 + }, + { + "epoch": 4.699116211581585, + "grad_norm": 0.0016835236456245184, + "learning_rate": 1.1784636126498432e-05, + "loss": 0.0001, + "step": 17814 + }, + { + "epoch": 4.699643846458251, + "grad_norm": 0.034890301525592804, + "learning_rate": 1.1783463759195756e-05, + "loss": 0.0003, + "step": 17816 + }, + { + "epoch": 4.700171481334916, + "grad_norm": 0.0013453017454594374, + "learning_rate": 1.178229139189308e-05, + "loss": 0.0002, + "step": 17818 + }, + { + "epoch": 4.700699116211582, + "grad_norm": 0.019981298595666885, + "learning_rate": 1.1781119024590407e-05, + "loss": 0.0001, + "step": 17820 + }, + { + "epoch": 4.701226751088247, + "grad_norm": 0.009109802544116974, + "learning_rate": 1.177994665728773e-05, + "loss": 0.0001, + "step": 17822 + }, + { + "epoch": 4.701754385964913, + "grad_norm": 0.004298048093914986, + "learning_rate": 1.1778774289985054e-05, + "loss": 0.0001, + "step": 17824 + }, + { + "epoch": 4.702282020841578, + "grad_norm": 0.019586196169257164, + "learning_rate": 1.1777601922682378e-05, + "loss": 0.0002, + "step": 17826 + }, + { + "epoch": 4.702809655718243, + "grad_norm": 0.005513543728739023, + "learning_rate": 1.17764295553797e-05, + "loss": 0.0001, + "step": 17828 + }, + { + "epoch": 4.7033372905949085, + "grad_norm": 0.05203039571642876, + "learning_rate": 1.1775257188077025e-05, + "loss": 0.0008, + "step": 17830 + }, + { + "epoch": 4.703864925471573, + "grad_norm": 0.003276845207437873, + "learning_rate": 1.1774084820774349e-05, + "loss": 0.0001, + "step": 17832 + }, + { + "epoch": 4.704392560348239, + "grad_norm": 0.07876742631196976, + "learning_rate": 1.1772912453471675e-05, + "loss": 0.0004, + "step": 17834 + }, + { + "epoch": 4.704920195224904, + "grad_norm": 0.003429098054766655, + "learning_rate": 1.1771740086168996e-05, + "loss": 0.0001, + "step": 17836 + }, + { + "epoch": 4.70544783010157, + "grad_norm": 0.21489186584949493, + "learning_rate": 1.1770567718866322e-05, + "loss": 0.0057, + "step": 17838 + }, + { + "epoch": 4.705975464978235, + "grad_norm": 0.021777387708425522, + "learning_rate": 1.1769395351563646e-05, + "loss": 0.0002, + "step": 17840 + }, + { + "epoch": 4.706503099854901, + "grad_norm": 0.007544282358139753, + "learning_rate": 1.176822298426097e-05, + "loss": 0.0054, + "step": 17842 + }, + { + "epoch": 4.707030734731566, + "grad_norm": 0.008330273441970348, + "learning_rate": 1.1767050616958293e-05, + "loss": 0.0001, + "step": 17844 + }, + { + "epoch": 4.707558369608231, + "grad_norm": 0.018271103501319885, + "learning_rate": 1.1765878249655618e-05, + "loss": 0.0002, + "step": 17846 + }, + { + "epoch": 4.708086004484897, + "grad_norm": 0.0031101026106625795, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.0001, + "step": 17848 + }, + { + "epoch": 4.708613639361562, + "grad_norm": 0.015249382704496384, + "learning_rate": 1.1763533515050268e-05, + "loss": 0.0028, + "step": 17850 + }, + { + "epoch": 4.7091412742382275, + "grad_norm": 0.0038849329575896263, + "learning_rate": 1.176236114774759e-05, + "loss": 0.0004, + "step": 17852 + }, + { + "epoch": 4.709668909114892, + "grad_norm": 0.011564873158931732, + "learning_rate": 1.1761188780444915e-05, + "loss": 0.0002, + "step": 17854 + }, + { + "epoch": 4.710196543991557, + "grad_norm": 0.010514555498957634, + "learning_rate": 1.1760016413142239e-05, + "loss": 0.0001, + "step": 17856 + }, + { + "epoch": 4.710724178868223, + "grad_norm": 0.0070268940180540085, + "learning_rate": 1.1758844045839562e-05, + "loss": 0.0001, + "step": 17858 + }, + { + "epoch": 4.711251813744888, + "grad_norm": 0.03205162659287453, + "learning_rate": 1.1757671678536886e-05, + "loss": 0.0003, + "step": 17860 + }, + { + "epoch": 4.711779448621554, + "grad_norm": 0.09706932306289673, + "learning_rate": 1.175649931123421e-05, + "loss": 0.0003, + "step": 17862 + }, + { + "epoch": 4.712307083498219, + "grad_norm": 0.013229667209088802, + "learning_rate": 1.1755326943931535e-05, + "loss": 0.0022, + "step": 17864 + }, + { + "epoch": 4.712834718374885, + "grad_norm": 0.002850034274160862, + "learning_rate": 1.1754154576628857e-05, + "loss": 0.0001, + "step": 17866 + }, + { + "epoch": 4.71336235325155, + "grad_norm": 0.0393708162009716, + "learning_rate": 1.1752982209326183e-05, + "loss": 0.0034, + "step": 17868 + }, + { + "epoch": 4.713889988128216, + "grad_norm": 0.0028256415389478207, + "learning_rate": 1.1751809842023507e-05, + "loss": 0.0001, + "step": 17870 + }, + { + "epoch": 4.714417623004881, + "grad_norm": 0.003299711737781763, + "learning_rate": 1.1750637474720832e-05, + "loss": 0.0001, + "step": 17872 + }, + { + "epoch": 4.714945257881546, + "grad_norm": 0.00451298663392663, + "learning_rate": 1.1749465107418154e-05, + "loss": 0.0029, + "step": 17874 + }, + { + "epoch": 4.7154728927582115, + "grad_norm": 0.0008320064516738057, + "learning_rate": 1.1748292740115479e-05, + "loss": 0.0, + "step": 17876 + }, + { + "epoch": 4.716000527634876, + "grad_norm": 0.175450399518013, + "learning_rate": 1.1747120372812803e-05, + "loss": 0.0013, + "step": 17878 + }, + { + "epoch": 4.716528162511542, + "grad_norm": 0.0026603140868246555, + "learning_rate": 1.1745948005510129e-05, + "loss": 0.0001, + "step": 17880 + }, + { + "epoch": 4.717055797388207, + "grad_norm": 0.0017124013975262642, + "learning_rate": 1.174477563820745e-05, + "loss": 0.0, + "step": 17882 + }, + { + "epoch": 4.717583432264873, + "grad_norm": 0.051899734884500504, + "learning_rate": 1.1743603270904776e-05, + "loss": 0.0018, + "step": 17884 + }, + { + "epoch": 4.718111067141538, + "grad_norm": 0.0062712072394788265, + "learning_rate": 1.17424309036021e-05, + "loss": 0.0001, + "step": 17886 + }, + { + "epoch": 4.718638702018204, + "grad_norm": 0.0031797289848327637, + "learning_rate": 1.1741258536299423e-05, + "loss": 0.0005, + "step": 17888 + }, + { + "epoch": 4.719166336894869, + "grad_norm": 0.0023341900669038296, + "learning_rate": 1.1740086168996747e-05, + "loss": 0.0, + "step": 17890 + }, + { + "epoch": 4.719693971771534, + "grad_norm": 0.0027996678836643696, + "learning_rate": 1.1738913801694071e-05, + "loss": 0.0, + "step": 17892 + }, + { + "epoch": 4.7202216066482, + "grad_norm": 0.004202332813292742, + "learning_rate": 1.1737741434391396e-05, + "loss": 0.0001, + "step": 17894 + }, + { + "epoch": 4.720749241524865, + "grad_norm": 0.0013641462428495288, + "learning_rate": 1.1736569067088718e-05, + "loss": 0.0001, + "step": 17896 + }, + { + "epoch": 4.7212768764015305, + "grad_norm": 0.0016771330265328288, + "learning_rate": 1.1735396699786044e-05, + "loss": 0.0001, + "step": 17898 + }, + { + "epoch": 4.7218045112781954, + "grad_norm": 0.0060205343179404736, + "learning_rate": 1.1734224332483369e-05, + "loss": 0.0003, + "step": 17900 + }, + { + "epoch": 4.72233214615486, + "grad_norm": 0.0029537035152316093, + "learning_rate": 1.1733051965180693e-05, + "loss": 0.0, + "step": 17902 + }, + { + "epoch": 4.722859781031526, + "grad_norm": 0.14214308559894562, + "learning_rate": 1.1731879597878016e-05, + "loss": 0.0002, + "step": 17904 + }, + { + "epoch": 4.723387415908191, + "grad_norm": 0.07675842195749283, + "learning_rate": 1.173070723057534e-05, + "loss": 0.0003, + "step": 17906 + }, + { + "epoch": 4.723915050784857, + "grad_norm": 0.0007770888041704893, + "learning_rate": 1.1729534863272664e-05, + "loss": 0.0, + "step": 17908 + }, + { + "epoch": 4.724442685661522, + "grad_norm": 0.0006714158225804567, + "learning_rate": 1.1728362495969988e-05, + "loss": 0.0, + "step": 17910 + }, + { + "epoch": 4.724970320538188, + "grad_norm": 0.0015531240496784449, + "learning_rate": 1.1727190128667311e-05, + "loss": 0.0, + "step": 17912 + }, + { + "epoch": 4.725497955414853, + "grad_norm": 0.0006398982950486243, + "learning_rate": 1.1726017761364637e-05, + "loss": 0.0001, + "step": 17914 + }, + { + "epoch": 4.726025590291519, + "grad_norm": 0.0008295616135001183, + "learning_rate": 1.1724845394061961e-05, + "loss": 0.0, + "step": 17916 + }, + { + "epoch": 4.726553225168184, + "grad_norm": 0.0011093163629993796, + "learning_rate": 1.1723673026759284e-05, + "loss": 0.0001, + "step": 17918 + }, + { + "epoch": 4.727080860044849, + "grad_norm": 0.0030612775590270758, + "learning_rate": 1.1722500659456608e-05, + "loss": 0.0001, + "step": 17920 + }, + { + "epoch": 4.7276084949215145, + "grad_norm": 0.3956137001514435, + "learning_rate": 1.1721328292153933e-05, + "loss": 0.0071, + "step": 17922 + }, + { + "epoch": 4.728136129798179, + "grad_norm": 0.0008948935428634286, + "learning_rate": 1.1720155924851257e-05, + "loss": 0.0, + "step": 17924 + }, + { + "epoch": 4.728663764674845, + "grad_norm": 0.0007061992073431611, + "learning_rate": 1.171898355754858e-05, + "loss": 0.0, + "step": 17926 + }, + { + "epoch": 4.72919139955151, + "grad_norm": 0.0048861089162528515, + "learning_rate": 1.1717811190245904e-05, + "loss": 0.0001, + "step": 17928 + }, + { + "epoch": 4.729719034428175, + "grad_norm": 0.35129013657569885, + "learning_rate": 1.171663882294323e-05, + "loss": 0.0025, + "step": 17930 + }, + { + "epoch": 4.730246669304841, + "grad_norm": 0.0026316603180021048, + "learning_rate": 1.1715466455640554e-05, + "loss": 0.0001, + "step": 17932 + }, + { + "epoch": 4.730774304181506, + "grad_norm": 0.03334641829133034, + "learning_rate": 1.1714294088337877e-05, + "loss": 0.002, + "step": 17934 + }, + { + "epoch": 4.731301939058172, + "grad_norm": 0.040111638605594635, + "learning_rate": 1.1713121721035201e-05, + "loss": 0.0003, + "step": 17936 + }, + { + "epoch": 4.731829573934837, + "grad_norm": 0.00933187734335661, + "learning_rate": 1.1711949353732525e-05, + "loss": 0.0036, + "step": 17938 + }, + { + "epoch": 4.732357208811503, + "grad_norm": 0.012324212118983269, + "learning_rate": 1.171077698642985e-05, + "loss": 0.0061, + "step": 17940 + }, + { + "epoch": 4.732884843688168, + "grad_norm": 0.005885401740670204, + "learning_rate": 1.1709604619127172e-05, + "loss": 0.0019, + "step": 17942 + }, + { + "epoch": 4.7334124785648335, + "grad_norm": 0.0038026978727430105, + "learning_rate": 1.1708432251824498e-05, + "loss": 0.0002, + "step": 17944 + }, + { + "epoch": 4.7339401134414985, + "grad_norm": 0.0024006564635783434, + "learning_rate": 1.1707259884521823e-05, + "loss": 0.0002, + "step": 17946 + }, + { + "epoch": 4.734467748318163, + "grad_norm": 0.0022090200800448656, + "learning_rate": 1.1706087517219147e-05, + "loss": 0.0001, + "step": 17948 + }, + { + "epoch": 4.734995383194829, + "grad_norm": 0.1268916130065918, + "learning_rate": 1.170491514991647e-05, + "loss": 0.0003, + "step": 17950 + }, + { + "epoch": 4.735523018071494, + "grad_norm": 0.0018402190180495381, + "learning_rate": 1.1703742782613794e-05, + "loss": 0.0, + "step": 17952 + }, + { + "epoch": 4.73605065294816, + "grad_norm": 0.029309000819921494, + "learning_rate": 1.1702570415311118e-05, + "loss": 0.0019, + "step": 17954 + }, + { + "epoch": 4.736578287824825, + "grad_norm": 0.0009737558430060744, + "learning_rate": 1.170139804800844e-05, + "loss": 0.0, + "step": 17956 + }, + { + "epoch": 4.737105922701491, + "grad_norm": 0.003533783834427595, + "learning_rate": 1.1700225680705765e-05, + "loss": 0.0002, + "step": 17958 + }, + { + "epoch": 4.737633557578156, + "grad_norm": 0.01023514848202467, + "learning_rate": 1.1699053313403091e-05, + "loss": 0.0002, + "step": 17960 + }, + { + "epoch": 4.738161192454822, + "grad_norm": 0.000757816422265023, + "learning_rate": 1.1697880946100415e-05, + "loss": 0.0045, + "step": 17962 + }, + { + "epoch": 4.738688827331487, + "grad_norm": 0.0025113397277891636, + "learning_rate": 1.1696708578797738e-05, + "loss": 0.0001, + "step": 17964 + }, + { + "epoch": 4.739216462208152, + "grad_norm": 0.005145610310137272, + "learning_rate": 1.1695536211495062e-05, + "loss": 0.0001, + "step": 17966 + }, + { + "epoch": 4.7397440970848175, + "grad_norm": 0.19727960228919983, + "learning_rate": 1.1694363844192387e-05, + "loss": 0.0032, + "step": 17968 + }, + { + "epoch": 4.740271731961482, + "grad_norm": 0.02674773894250393, + "learning_rate": 1.169319147688971e-05, + "loss": 0.002, + "step": 17970 + }, + { + "epoch": 4.740799366838148, + "grad_norm": 0.0007247881731018424, + "learning_rate": 1.1692019109587033e-05, + "loss": 0.0026, + "step": 17972 + }, + { + "epoch": 4.741327001714813, + "grad_norm": 0.010535967536270618, + "learning_rate": 1.1690846742284358e-05, + "loss": 0.0001, + "step": 17974 + }, + { + "epoch": 4.741854636591478, + "grad_norm": 0.005599545314908028, + "learning_rate": 1.1689674374981684e-05, + "loss": 0.0001, + "step": 17976 + }, + { + "epoch": 4.742382271468144, + "grad_norm": 0.029898913577198982, + "learning_rate": 1.1688502007679008e-05, + "loss": 0.0018, + "step": 17978 + }, + { + "epoch": 4.742909906344809, + "grad_norm": 0.0016575149493291974, + "learning_rate": 1.168732964037633e-05, + "loss": 0.0001, + "step": 17980 + }, + { + "epoch": 4.743437541221475, + "grad_norm": 0.013292700052261353, + "learning_rate": 1.1686157273073655e-05, + "loss": 0.0001, + "step": 17982 + }, + { + "epoch": 4.74396517609814, + "grad_norm": 0.0016562979435548186, + "learning_rate": 1.168498490577098e-05, + "loss": 0.0, + "step": 17984 + }, + { + "epoch": 4.744492810974806, + "grad_norm": 0.19880783557891846, + "learning_rate": 1.1683812538468302e-05, + "loss": 0.0025, + "step": 17986 + }, + { + "epoch": 4.745020445851471, + "grad_norm": 0.004646742716431618, + "learning_rate": 1.1682640171165626e-05, + "loss": 0.0033, + "step": 17988 + }, + { + "epoch": 4.7455480807281365, + "grad_norm": 0.030042750760912895, + "learning_rate": 1.1681467803862952e-05, + "loss": 0.0004, + "step": 17990 + }, + { + "epoch": 4.7460757156048015, + "grad_norm": 0.14291206002235413, + "learning_rate": 1.1680295436560276e-05, + "loss": 0.002, + "step": 17992 + }, + { + "epoch": 4.746603350481466, + "grad_norm": 0.007251972798258066, + "learning_rate": 1.1679123069257599e-05, + "loss": 0.0079, + "step": 17994 + }, + { + "epoch": 4.747130985358132, + "grad_norm": 0.002422675956040621, + "learning_rate": 1.1677950701954923e-05, + "loss": 0.0022, + "step": 17996 + }, + { + "epoch": 4.747658620234797, + "grad_norm": 0.0013060750206932425, + "learning_rate": 1.1676778334652248e-05, + "loss": 0.0001, + "step": 17998 + }, + { + "epoch": 4.748186255111463, + "grad_norm": 0.10244200378656387, + "learning_rate": 1.1675605967349572e-05, + "loss": 0.0002, + "step": 18000 + }, + { + "epoch": 4.748186255111463, + "eval_loss": 0.001613546279259026, + "eval_runtime": 304.3538, + "eval_samples_per_second": 708.521, + "eval_steps_per_second": 88.568, + "step": 18000 + } + ], + "logging_steps": 2, + "max_steps": 37910, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.4250409066052977e+18, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}